{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1000, "global_step": 12776, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015654351909830932, "grad_norm": null, "learning_rate": 0.0, "loss": 10.2049, "step": 1 }, { "epoch": 0.00031308703819661864, "grad_norm": null, "learning_rate": 0.0, "loss": 10.1719, "step": 2 }, { "epoch": 0.000469630557294928, "grad_norm": 10.289196014404297, "learning_rate": 1.3999999999999998e-07, "loss": 11.8275, "step": 3 }, { "epoch": 0.0006261740763932373, "grad_norm": 8.432656288146973, "learning_rate": 2.7999999999999997e-07, "loss": 9.669, "step": 4 }, { "epoch": 0.0007827175954915466, "grad_norm": 10.235750198364258, "learning_rate": 4.1999999999999995e-07, "loss": 11.888, "step": 5 }, { "epoch": 0.000939261114589856, "grad_norm": 9.022594451904297, "learning_rate": 5.599999999999999e-07, "loss": 11.0998, "step": 6 }, { "epoch": 0.0010958046336881652, "grad_norm": 9.15576457977295, "learning_rate": 7e-07, "loss": 11.0314, "step": 7 }, { "epoch": 0.0012523481527864746, "grad_norm": 15.189912796020508, "learning_rate": 8.399999999999999e-07, "loss": 18.5522, "step": 8 }, { "epoch": 0.001408891671884784, "grad_norm": 8.24608039855957, "learning_rate": 9.8e-07, "loss": 9.8142, "step": 9 }, { "epoch": 0.0015654351909830933, "grad_norm": 8.48282527923584, "learning_rate": 1.1199999999999999e-06, "loss": 10.5824, "step": 10 }, { "epoch": 0.0017219787100814026, "grad_norm": 8.063555717468262, "learning_rate": 1.2599999999999998e-06, "loss": 9.6978, "step": 11 }, { "epoch": 0.001878522229179712, "grad_norm": 12.570623397827148, "learning_rate": 1.4e-06, "loss": 15.562, "step": 12 }, { "epoch": 0.002035065748278021, "grad_norm": 14.773932456970215, "learning_rate": 1.5399999999999999e-06, "loss": 21.2006, "step": 13 }, { "epoch": 0.0021916092673763305, "grad_norm": 14.604262351989746, "learning_rate": 
1.6799999999999998e-06, "loss": 20.1573, "step": 14 }, { "epoch": 0.00234815278647464, "grad_norm": null, "learning_rate": 1.6799999999999998e-06, "loss": 25.9461, "step": 15 }, { "epoch": 0.002504696305572949, "grad_norm": 7.285233020782471, "learning_rate": 1.8199999999999997e-06, "loss": 9.076, "step": 16 }, { "epoch": 0.0026612398246712585, "grad_norm": 14.448596954345703, "learning_rate": 1.96e-06, "loss": 19.6473, "step": 17 }, { "epoch": 0.002817783343769568, "grad_norm": 24.030418395996094, "learning_rate": 2.1e-06, "loss": 24.9163, "step": 18 }, { "epoch": 0.002974326862867877, "grad_norm": 10.472304344177246, "learning_rate": 2.2399999999999997e-06, "loss": 14.4247, "step": 19 }, { "epoch": 0.0031308703819661866, "grad_norm": 8.932097434997559, "learning_rate": 2.38e-06, "loss": 11.251, "step": 20 }, { "epoch": 0.003287413901064496, "grad_norm": 8.210410118103027, "learning_rate": 2.5199999999999996e-06, "loss": 10.1346, "step": 21 }, { "epoch": 0.0034439574201628053, "grad_norm": 9.726058959960938, "learning_rate": 2.6599999999999995e-06, "loss": 12.5889, "step": 22 }, { "epoch": 0.0036005009392611146, "grad_norm": 10.66357135772705, "learning_rate": 2.8e-06, "loss": 15.2058, "step": 23 }, { "epoch": 0.003757044458359424, "grad_norm": 8.272431373596191, "learning_rate": 2.94e-06, "loss": 11.9428, "step": 24 }, { "epoch": 0.003913587977457733, "grad_norm": 11.079901695251465, "learning_rate": 3.0799999999999997e-06, "loss": 16.8849, "step": 25 }, { "epoch": 0.004070131496556042, "grad_norm": 12.501070022583008, "learning_rate": 3.2199999999999997e-06, "loss": 15.4899, "step": 26 }, { "epoch": 0.004226675015654352, "grad_norm": 13.564478874206543, "learning_rate": 3.3599999999999996e-06, "loss": 19.6491, "step": 27 }, { "epoch": 0.004383218534752661, "grad_norm": 8.572904586791992, "learning_rate": 3.5e-06, "loss": 13.1778, "step": 28 }, { "epoch": 0.004539762053850971, "grad_norm": 9.82887077331543, "learning_rate": 3.6399999999999995e-06, "loss": 
14.8809, "step": 29 }, { "epoch": 0.00469630557294928, "grad_norm": 9.405379295349121, "learning_rate": 3.78e-06, "loss": 13.7245, "step": 30 }, { "epoch": 0.004852849092047589, "grad_norm": 10.518194198608398, "learning_rate": 3.92e-06, "loss": 16.8943, "step": 31 }, { "epoch": 0.005009392611145898, "grad_norm": 13.418915748596191, "learning_rate": 4.06e-06, "loss": 19.5626, "step": 32 }, { "epoch": 0.005165936130244208, "grad_norm": 11.481789588928223, "learning_rate": 4.2e-06, "loss": 17.701, "step": 33 }, { "epoch": 0.005322479649342517, "grad_norm": 9.263602256774902, "learning_rate": 4.34e-06, "loss": 15.0621, "step": 34 }, { "epoch": 0.005479023168440827, "grad_norm": 9.839520454406738, "learning_rate": 4.4799999999999995e-06, "loss": 15.8261, "step": 35 }, { "epoch": 0.005635566687539136, "grad_norm": 7.6345624923706055, "learning_rate": 4.62e-06, "loss": 13.4795, "step": 36 }, { "epoch": 0.0057921102066374455, "grad_norm": 9.936891555786133, "learning_rate": 4.76e-06, "loss": 15.7649, "step": 37 }, { "epoch": 0.005948653725735754, "grad_norm": 10.08678913116455, "learning_rate": 4.9e-06, "loss": 16.8913, "step": 38 }, { "epoch": 0.006105197244834064, "grad_norm": 7.729179382324219, "learning_rate": 5.039999999999999e-06, "loss": 12.891, "step": 39 }, { "epoch": 0.006261740763932373, "grad_norm": 8.714329719543457, "learning_rate": 5.1799999999999995e-06, "loss": 13.9582, "step": 40 }, { "epoch": 0.006418284283030683, "grad_norm": 6.456995964050293, "learning_rate": 5.319999999999999e-06, "loss": 12.2333, "step": 41 }, { "epoch": 0.006574827802128992, "grad_norm": 7.425848007202148, "learning_rate": 5.459999999999999e-06, "loss": 13.7365, "step": 42 }, { "epoch": 0.006731371321227302, "grad_norm": 6.977450847625732, "learning_rate": 5.6e-06, "loss": 12.658, "step": 43 }, { "epoch": 0.0068879148403256105, "grad_norm": 6.183155536651611, "learning_rate": 5.74e-06, "loss": 11.2999, "step": 44 }, { "epoch": 0.007044458359423919, "grad_norm": 5.967344760894775, 
"learning_rate": 5.88e-06, "loss": 10.4122, "step": 45 }, { "epoch": 0.007201001878522229, "grad_norm": 6.455341815948486, "learning_rate": 6.019999999999999e-06, "loss": 10.4953, "step": 46 }, { "epoch": 0.007357545397620538, "grad_norm": 5.5207200050354, "learning_rate": 6.1599999999999995e-06, "loss": 9.4733, "step": 47 }, { "epoch": 0.007514088916718848, "grad_norm": 5.1380743980407715, "learning_rate": 6.299999999999999e-06, "loss": 9.0898, "step": 48 }, { "epoch": 0.007670632435817157, "grad_norm": 4.1103386878967285, "learning_rate": 6.439999999999999e-06, "loss": 7.8549, "step": 49 }, { "epoch": 0.007827175954915467, "grad_norm": 3.4918389320373535, "learning_rate": 6.58e-06, "loss": 6.8378, "step": 50 }, { "epoch": 0.007983719474013776, "grad_norm": 11.115751266479492, "learning_rate": 6.719999999999999e-06, "loss": 12.295, "step": 51 }, { "epoch": 0.008140262993112084, "grad_norm": 10.414963722229004, "learning_rate": 6.8599999999999995e-06, "loss": 11.2789, "step": 52 }, { "epoch": 0.008296806512210394, "grad_norm": 15.939248085021973, "learning_rate": 7e-06, "loss": 15.9905, "step": 53 }, { "epoch": 0.008453350031308704, "grad_norm": 8.401090621948242, "learning_rate": 7.1399999999999986e-06, "loss": 9.1546, "step": 54 }, { "epoch": 0.008609893550407014, "grad_norm": 8.91043472290039, "learning_rate": 7.279999999999999e-06, "loss": 9.9619, "step": 55 }, { "epoch": 0.008766437069505322, "grad_norm": 8.919154167175293, "learning_rate": 7.419999999999999e-06, "loss": 10.4525, "step": 56 }, { "epoch": 0.008922980588603632, "grad_norm": 9.408407211303711, "learning_rate": 7.56e-06, "loss": 10.4294, "step": 57 }, { "epoch": 0.009079524107701941, "grad_norm": 11.395033836364746, "learning_rate": 7.699999999999999e-06, "loss": 13.1339, "step": 58 }, { "epoch": 0.009236067626800251, "grad_norm": 9.916975021362305, "learning_rate": 7.84e-06, "loss": 11.9539, "step": 59 }, { "epoch": 0.00939261114589856, "grad_norm": 9.535187721252441, "learning_rate": 7.98e-06, 
"loss": 10.7906, "step": 60 }, { "epoch": 0.009549154664996869, "grad_norm": 11.22845458984375, "learning_rate": 8.12e-06, "loss": 13.111, "step": 61 }, { "epoch": 0.009705698184095179, "grad_norm": 7.907777786254883, "learning_rate": 8.259999999999999e-06, "loss": 8.6685, "step": 62 }, { "epoch": 0.009862241703193489, "grad_norm": 10.00581169128418, "learning_rate": 8.4e-06, "loss": 12.4828, "step": 63 }, { "epoch": 0.010018785222291797, "grad_norm": 9.821576118469238, "learning_rate": 8.54e-06, "loss": 11.5506, "step": 64 }, { "epoch": 0.010175328741390106, "grad_norm": 7.86096715927124, "learning_rate": 8.68e-06, "loss": 8.8088, "step": 65 }, { "epoch": 0.010331872260488416, "grad_norm": 15.156925201416016, "learning_rate": 8.819999999999999e-06, "loss": 18.2734, "step": 66 }, { "epoch": 0.010488415779586726, "grad_norm": 8.843999862670898, "learning_rate": 8.959999999999999e-06, "loss": 10.03, "step": 67 }, { "epoch": 0.010644959298685034, "grad_norm": 15.220178604125977, "learning_rate": 9.1e-06, "loss": 20.7043, "step": 68 }, { "epoch": 0.010801502817783344, "grad_norm": 10.631319046020508, "learning_rate": 9.24e-06, "loss": 12.3995, "step": 69 }, { "epoch": 0.010958046336881654, "grad_norm": 7.633686542510986, "learning_rate": 9.38e-06, "loss": 8.8994, "step": 70 }, { "epoch": 0.011114589855979962, "grad_norm": 11.978484153747559, "learning_rate": 9.52e-06, "loss": 16.1353, "step": 71 }, { "epoch": 0.011271133375078271, "grad_norm": 14.891623497009277, "learning_rate": 9.66e-06, "loss": 20.0125, "step": 72 }, { "epoch": 0.011427676894176581, "grad_norm": 9.659276008605957, "learning_rate": 9.8e-06, "loss": 12.4733, "step": 73 }, { "epoch": 0.011584220413274891, "grad_norm": 11.308255195617676, "learning_rate": 9.939999999999998e-06, "loss": 12.5293, "step": 74 }, { "epoch": 0.011740763932373199, "grad_norm": 16.638826370239258, "learning_rate": 1.0079999999999998e-05, "loss": 22.3841, "step": 75 }, { "epoch": 0.011897307451471509, "grad_norm": 
8.875855445861816, "learning_rate": 1.0219999999999999e-05, "loss": 11.794, "step": 76 }, { "epoch": 0.012053850970569819, "grad_norm": 15.547615051269531, "learning_rate": 1.0359999999999999e-05, "loss": 19.8593, "step": 77 }, { "epoch": 0.012210394489668128, "grad_norm": 18.36896514892578, "learning_rate": 1.05e-05, "loss": 24.0634, "step": 78 }, { "epoch": 0.012366938008766436, "grad_norm": 10.774385452270508, "learning_rate": 1.0639999999999998e-05, "loss": 15.1734, "step": 79 }, { "epoch": 0.012523481527864746, "grad_norm": 14.204286575317383, "learning_rate": 1.0779999999999998e-05, "loss": 18.8449, "step": 80 }, { "epoch": 0.012680025046963056, "grad_norm": 19.521907806396484, "learning_rate": 1.0919999999999999e-05, "loss": 26.0053, "step": 81 }, { "epoch": 0.012836568566061366, "grad_norm": 14.511372566223145, "learning_rate": 1.1059999999999999e-05, "loss": 19.8003, "step": 82 }, { "epoch": 0.012993112085159674, "grad_norm": 10.998359680175781, "learning_rate": 1.12e-05, "loss": 14.3632, "step": 83 }, { "epoch": 0.013149655604257984, "grad_norm": 13.207918167114258, "learning_rate": 1.134e-05, "loss": 17.7181, "step": 84 }, { "epoch": 0.013306199123356293, "grad_norm": 7.995650768280029, "learning_rate": 1.148e-05, "loss": 11.6433, "step": 85 }, { "epoch": 0.013462742642454603, "grad_norm": 11.832477569580078, "learning_rate": 1.1619999999999999e-05, "loss": 15.3234, "step": 86 }, { "epoch": 0.013619286161552911, "grad_norm": 9.073206901550293, "learning_rate": 1.176e-05, "loss": 13.8795, "step": 87 }, { "epoch": 0.013775829680651221, "grad_norm": 10.651725769042969, "learning_rate": 1.19e-05, "loss": 15.4059, "step": 88 }, { "epoch": 0.01393237319974953, "grad_norm": 8.981675148010254, "learning_rate": 1.2039999999999998e-05, "loss": 13.7809, "step": 89 }, { "epoch": 0.014088916718847839, "grad_norm": 13.276698112487793, "learning_rate": 1.2179999999999999e-05, "loss": 18.3986, "step": 90 }, { "epoch": 0.014245460237946149, "grad_norm": 
10.566091537475586, "learning_rate": 1.2319999999999999e-05, "loss": 15.9462, "step": 91 }, { "epoch": 0.014402003757044458, "grad_norm": 9.792724609375, "learning_rate": 1.2459999999999998e-05, "loss": 12.0441, "step": 92 }, { "epoch": 0.014558547276142768, "grad_norm": 7.6955060958862305, "learning_rate": 1.2599999999999998e-05, "loss": 12.2863, "step": 93 }, { "epoch": 0.014715090795241076, "grad_norm": 7.509605407714844, "learning_rate": 1.2739999999999998e-05, "loss": 11.4705, "step": 94 }, { "epoch": 0.014871634314339386, "grad_norm": 9.54214096069336, "learning_rate": 1.2879999999999999e-05, "loss": 11.241, "step": 95 }, { "epoch": 0.015028177833437696, "grad_norm": 8.414856910705566, "learning_rate": 1.3019999999999999e-05, "loss": 9.2241, "step": 96 }, { "epoch": 0.015184721352536006, "grad_norm": 5.504304885864258, "learning_rate": 1.316e-05, "loss": 8.5842, "step": 97 }, { "epoch": 0.015341264871634314, "grad_norm": 4.9704670906066895, "learning_rate": 1.33e-05, "loss": 7.7601, "step": 98 }, { "epoch": 0.015497808390732623, "grad_norm": 4.762340545654297, "learning_rate": 1.3439999999999998e-05, "loss": 7.334, "step": 99 }, { "epoch": 0.015654351909830933, "grad_norm": 4.1859450340271, "learning_rate": 1.3579999999999999e-05, "loss": 6.2665, "step": 100 }, { "epoch": 0.01581089542892924, "grad_norm": 10.55567741394043, "learning_rate": 1.3719999999999999e-05, "loss": 10.0999, "step": 101 }, { "epoch": 0.015967438948027553, "grad_norm": 10.293669700622559, "learning_rate": 1.386e-05, "loss": 9.794, "step": 102 }, { "epoch": 0.01612398246712586, "grad_norm": 9.62580680847168, "learning_rate": 1.4e-05, "loss": 9.6167, "step": 103 }, { "epoch": 0.01628052598622417, "grad_norm": 27.018157958984375, "learning_rate": 1.414e-05, "loss": 22.2476, "step": 104 }, { "epoch": 0.01643706950532248, "grad_norm": 13.03243637084961, "learning_rate": 1.4279999999999997e-05, "loss": 14.411, "step": 105 }, { "epoch": 0.01659361302442079, "grad_norm": 10.15827465057373, 
"learning_rate": 1.4419999999999997e-05, "loss": 10.7, "step": 106 }, { "epoch": 0.0167501565435191, "grad_norm": 7.921515941619873, "learning_rate": 1.4559999999999998e-05, "loss": 9.1695, "step": 107 }, { "epoch": 0.016906700062617408, "grad_norm": 8.662211418151855, "learning_rate": 1.4699999999999998e-05, "loss": 11.0655, "step": 108 }, { "epoch": 0.017063243581715716, "grad_norm": 8.731613159179688, "learning_rate": 1.4839999999999999e-05, "loss": 9.9972, "step": 109 }, { "epoch": 0.017219787100814028, "grad_norm": 9.626958847045898, "learning_rate": 1.4979999999999999e-05, "loss": 11.2467, "step": 110 }, { "epoch": 0.017376330619912336, "grad_norm": 18.82622528076172, "learning_rate": 1.512e-05, "loss": 19.48, "step": 111 }, { "epoch": 0.017532874139010644, "grad_norm": 11.851399421691895, "learning_rate": 1.526e-05, "loss": 13.5211, "step": 112 }, { "epoch": 0.017689417658108955, "grad_norm": 11.499161720275879, "learning_rate": 1.5399999999999998e-05, "loss": 12.2219, "step": 113 }, { "epoch": 0.017845961177207263, "grad_norm": 9.371468544006348, "learning_rate": 1.554e-05, "loss": 11.1876, "step": 114 }, { "epoch": 0.01800250469630557, "grad_norm": 10.221906661987305, "learning_rate": 1.568e-05, "loss": 10.8776, "step": 115 }, { "epoch": 0.018159048215403883, "grad_norm": 12.396175384521484, "learning_rate": 1.5819999999999998e-05, "loss": 9.3664, "step": 116 }, { "epoch": 0.01831559173450219, "grad_norm": 20.554296493530273, "learning_rate": 1.596e-05, "loss": 15.196, "step": 117 }, { "epoch": 0.018472135253600502, "grad_norm": 6.890181541442871, "learning_rate": 1.61e-05, "loss": 7.8601, "step": 118 }, { "epoch": 0.01862867877269881, "grad_norm": 17.824600219726562, "learning_rate": 1.624e-05, "loss": 12.4901, "step": 119 }, { "epoch": 0.01878522229179712, "grad_norm": 15.72974681854248, "learning_rate": 1.638e-05, "loss": 10.874, "step": 120 }, { "epoch": 0.01894176581089543, "grad_norm": 10.616267204284668, "learning_rate": 1.6519999999999998e-05, 
"loss": 7.8705, "step": 121 }, { "epoch": 0.019098309329993738, "grad_norm": 15.241937637329102, "learning_rate": 1.6659999999999996e-05, "loss": 9.4864, "step": 122 }, { "epoch": 0.019254852849092046, "grad_norm": 21.63045310974121, "learning_rate": 1.68e-05, "loss": 11.5923, "step": 123 }, { "epoch": 0.019411396368190358, "grad_norm": 16.33150291442871, "learning_rate": 1.6939999999999997e-05, "loss": 8.9668, "step": 124 }, { "epoch": 0.019567939887288666, "grad_norm": 9.63389778137207, "learning_rate": 1.708e-05, "loss": 6.0932, "step": 125 }, { "epoch": 0.019724483406386977, "grad_norm": 30.359506607055664, "learning_rate": 1.7219999999999998e-05, "loss": 13.3015, "step": 126 }, { "epoch": 0.019881026925485285, "grad_norm": 32.979618072509766, "learning_rate": 1.736e-05, "loss": 13.6056, "step": 127 }, { "epoch": 0.020037570444583593, "grad_norm": 20.37031364440918, "learning_rate": 1.75e-05, "loss": 9.163, "step": 128 }, { "epoch": 0.020194113963681905, "grad_norm": 34.732444763183594, "learning_rate": 1.7639999999999997e-05, "loss": 13.5604, "step": 129 }, { "epoch": 0.020350657482780213, "grad_norm": 27.936887741088867, "learning_rate": 1.778e-05, "loss": 10.7831, "step": 130 }, { "epoch": 0.02050720100187852, "grad_norm": null, "learning_rate": 1.778e-05, "loss": 12.8647, "step": 131 }, { "epoch": 0.020663744520976832, "grad_norm": 17.346717834472656, "learning_rate": 1.7919999999999998e-05, "loss": 7.6013, "step": 132 }, { "epoch": 0.02082028804007514, "grad_norm": 27.207372665405273, "learning_rate": 1.806e-05, "loss": 10.0163, "step": 133 }, { "epoch": 0.020976831559173452, "grad_norm": 24.486968994140625, "learning_rate": 1.82e-05, "loss": 9.0627, "step": 134 }, { "epoch": 0.02113337507827176, "grad_norm": 24.086740493774414, "learning_rate": 1.834e-05, "loss": 8.7799, "step": 135 }, { "epoch": 0.021289918597370068, "grad_norm": 23.159748077392578, "learning_rate": 1.848e-05, "loss": 8.2339, "step": 136 }, { "epoch": 0.02144646211646838, 
"grad_norm": 22.982559204101562, "learning_rate": 1.8619999999999998e-05, "loss": 7.9192, "step": 137 }, { "epoch": 0.021603005635566688, "grad_norm": 21.509071350097656, "learning_rate": 1.876e-05, "loss": 7.3969, "step": 138 }, { "epoch": 0.021759549154664996, "grad_norm": 29.417621612548828, "learning_rate": 1.89e-05, "loss": 9.0474, "step": 139 }, { "epoch": 0.021916092673763307, "grad_norm": 30.96747398376465, "learning_rate": 1.904e-05, "loss": 9.2711, "step": 140 }, { "epoch": 0.022072636192861615, "grad_norm": 28.67618179321289, "learning_rate": 1.918e-05, "loss": 8.4637, "step": 141 }, { "epoch": 0.022229179711959923, "grad_norm": 18.679611206054688, "learning_rate": 1.932e-05, "loss": 6.3187, "step": 142 }, { "epoch": 0.022385723231058235, "grad_norm": 15.19019889831543, "learning_rate": 1.946e-05, "loss": 5.6063, "step": 143 }, { "epoch": 0.022542266750156543, "grad_norm": 14.54664134979248, "learning_rate": 1.96e-05, "loss": 5.38, "step": 144 }, { "epoch": 0.022698810269254854, "grad_norm": 16.95233154296875, "learning_rate": 1.9739999999999997e-05, "loss": 5.737, "step": 145 }, { "epoch": 0.022855353788353162, "grad_norm": 16.732851028442383, "learning_rate": 1.9879999999999996e-05, "loss": 5.7085, "step": 146 }, { "epoch": 0.02301189730745147, "grad_norm": 16.44563865661621, "learning_rate": 2.0019999999999998e-05, "loss": 5.5645, "step": 147 }, { "epoch": 0.023168440826549782, "grad_norm": 11.397040367126465, "learning_rate": 2.0159999999999997e-05, "loss": 4.73, "step": 148 }, { "epoch": 0.02332498434564809, "grad_norm": 15.383413314819336, "learning_rate": 2.0299999999999995e-05, "loss": 5.2963, "step": 149 }, { "epoch": 0.023481527864746398, "grad_norm": 9.529000282287598, "learning_rate": 2.0439999999999997e-05, "loss": 4.3481, "step": 150 }, { "epoch": 0.02363807138384471, "grad_norm": 23.651248931884766, "learning_rate": 2.0579999999999996e-05, "loss": 6.8527, "step": 151 }, { "epoch": 0.023794614902943018, "grad_norm": 22.205150604248047, 
"learning_rate": 2.0719999999999998e-05, "loss": 6.2659, "step": 152 }, { "epoch": 0.02395115842204133, "grad_norm": 17.41342544555664, "learning_rate": 2.0859999999999997e-05, "loss": 5.413, "step": 153 }, { "epoch": 0.024107701941139637, "grad_norm": 25.43531036376953, "learning_rate": 2.1e-05, "loss": 6.6637, "step": 154 }, { "epoch": 0.024264245460237945, "grad_norm": 15.5702486038208, "learning_rate": 2.1139999999999997e-05, "loss": 4.9678, "step": 155 }, { "epoch": 0.024420788979336257, "grad_norm": 16.781112670898438, "learning_rate": 2.1279999999999996e-05, "loss": 5.0975, "step": 156 }, { "epoch": 0.024577332498434565, "grad_norm": 11.915963172912598, "learning_rate": 2.1419999999999998e-05, "loss": 4.3975, "step": 157 }, { "epoch": 0.024733876017532873, "grad_norm": 19.76316261291504, "learning_rate": 2.1559999999999997e-05, "loss": 5.517, "step": 158 }, { "epoch": 0.024890419536631184, "grad_norm": 17.933143615722656, "learning_rate": 2.17e-05, "loss": 5.1097, "step": 159 }, { "epoch": 0.025046963055729492, "grad_norm": 18.194990158081055, "learning_rate": 2.1839999999999998e-05, "loss": 5.1802, "step": 160 }, { "epoch": 0.0252035065748278, "grad_norm": 10.82386302947998, "learning_rate": 2.198e-05, "loss": 4.2095, "step": 161 }, { "epoch": 0.025360050093926112, "grad_norm": 15.061201095581055, "learning_rate": 2.2119999999999998e-05, "loss": 4.6955, "step": 162 }, { "epoch": 0.02551659361302442, "grad_norm": 59.57259750366211, "learning_rate": 2.2259999999999997e-05, "loss": 11.2915, "step": 163 }, { "epoch": 0.02567313713212273, "grad_norm": 11.474431991577148, "learning_rate": 2.24e-05, "loss": 4.2134, "step": 164 }, { "epoch": 0.02582968065122104, "grad_norm": 10.118967056274414, "learning_rate": 2.2539999999999998e-05, "loss": 4.0756, "step": 165 }, { "epoch": 0.025986224170319348, "grad_norm": 24.943572998046875, "learning_rate": 2.268e-05, "loss": 5.8853, "step": 166 }, { "epoch": 0.02614276768941766, "grad_norm": 24.3799991607666, 
"learning_rate": 2.282e-05, "loss": 5.923, "step": 167 }, { "epoch": 0.026299311208515967, "grad_norm": 49.28422546386719, "learning_rate": 2.296e-05, "loss": 9.2552, "step": 168 }, { "epoch": 0.026455854727614275, "grad_norm": 15.009178161621094, "learning_rate": 2.31e-05, "loss": 4.4853, "step": 169 }, { "epoch": 0.026612398246712587, "grad_norm": 18.419755935668945, "learning_rate": 2.3239999999999998e-05, "loss": 4.9687, "step": 170 }, { "epoch": 0.026768941765810895, "grad_norm": 20.04305076599121, "learning_rate": 2.338e-05, "loss": 5.1579, "step": 171 }, { "epoch": 0.026925485284909206, "grad_norm": 39.30318069458008, "learning_rate": 2.352e-05, "loss": 7.5104, "step": 172 }, { "epoch": 0.027082028804007514, "grad_norm": 15.618794441223145, "learning_rate": 2.366e-05, "loss": 4.5688, "step": 173 }, { "epoch": 0.027238572323105822, "grad_norm": 20.17620849609375, "learning_rate": 2.38e-05, "loss": 5.1475, "step": 174 }, { "epoch": 0.027395115842204134, "grad_norm": 37.770782470703125, "learning_rate": 2.394e-05, "loss": 7.1767, "step": 175 }, { "epoch": 0.027551659361302442, "grad_norm": 23.19755744934082, "learning_rate": 2.4079999999999996e-05, "loss": 5.4527, "step": 176 }, { "epoch": 0.02770820288040075, "grad_norm": 20.07772445678711, "learning_rate": 2.4219999999999995e-05, "loss": 4.9865, "step": 177 }, { "epoch": 0.02786474639949906, "grad_norm": 21.439393997192383, "learning_rate": 2.4359999999999997e-05, "loss": 5.2262, "step": 178 }, { "epoch": 0.02802128991859737, "grad_norm": 32.39883041381836, "learning_rate": 2.4499999999999996e-05, "loss": 6.4547, "step": 179 }, { "epoch": 0.028177833437695678, "grad_norm": 39.017112731933594, "learning_rate": 2.4639999999999998e-05, "loss": 7.2514, "step": 180 }, { "epoch": 0.02833437695679399, "grad_norm": 22.467863082885742, "learning_rate": 2.4779999999999997e-05, "loss": 5.2637, "step": 181 }, { "epoch": 0.028490920475892297, "grad_norm": 26.35807991027832, "learning_rate": 2.4919999999999995e-05, "loss": 
5.6374, "step": 182 }, { "epoch": 0.02864746399499061, "grad_norm": 25.761241912841797, "learning_rate": 2.5059999999999997e-05, "loss": 5.7251, "step": 183 }, { "epoch": 0.028804007514088917, "grad_norm": 20.168643951416016, "learning_rate": 2.5199999999999996e-05, "loss": 5.2023, "step": 184 }, { "epoch": 0.028960551033187225, "grad_norm": 29.173480987548828, "learning_rate": 2.5339999999999998e-05, "loss": 6.0411, "step": 185 }, { "epoch": 0.029117094552285536, "grad_norm": 21.242950439453125, "learning_rate": 2.5479999999999997e-05, "loss": 5.0001, "step": 186 }, { "epoch": 0.029273638071383844, "grad_norm": 21.838102340698242, "learning_rate": 2.562e-05, "loss": 5.0762, "step": 187 }, { "epoch": 0.029430181590482152, "grad_norm": 22.79350471496582, "learning_rate": 2.5759999999999997e-05, "loss": 5.2379, "step": 188 }, { "epoch": 0.029586725109580464, "grad_norm": 19.880826950073242, "learning_rate": 2.5899999999999996e-05, "loss": 4.9423, "step": 189 }, { "epoch": 0.029743268628678772, "grad_norm": 15.980173110961914, "learning_rate": 2.6039999999999998e-05, "loss": 4.6093, "step": 190 }, { "epoch": 0.029899812147777084, "grad_norm": 19.216796875, "learning_rate": 2.6179999999999997e-05, "loss": 4.8236, "step": 191 }, { "epoch": 0.03005635566687539, "grad_norm": 19.159286499023438, "learning_rate": 2.632e-05, "loss": 4.8351, "step": 192 }, { "epoch": 0.0302128991859737, "grad_norm": 12.04419231414795, "learning_rate": 2.6459999999999997e-05, "loss": 4.1803, "step": 193 }, { "epoch": 0.03036944270507201, "grad_norm": 14.6306791305542, "learning_rate": 2.66e-05, "loss": 4.4252, "step": 194 }, { "epoch": 0.03052598622417032, "grad_norm": 11.30678653717041, "learning_rate": 2.6739999999999998e-05, "loss": 4.1199, "step": 195 }, { "epoch": 0.030682529743268627, "grad_norm": 14.148990631103516, "learning_rate": 2.6879999999999997e-05, "loss": 4.3081, "step": 196 }, { "epoch": 0.03083907326236694, "grad_norm": 11.136841773986816, "learning_rate": 2.702e-05, "loss": 
4.087, "step": 197 }, { "epoch": 0.030995616781465247, "grad_norm": 10.438679695129395, "learning_rate": 2.7159999999999997e-05, "loss": 3.9999, "step": 198 }, { "epoch": 0.03115216030056356, "grad_norm": 7.094152927398682, "learning_rate": 2.73e-05, "loss": 3.7676, "step": 199 }, { "epoch": 0.031308703819661866, "grad_norm": 4.508761405944824, "learning_rate": 2.7439999999999998e-05, "loss": 3.6227, "step": 200 }, { "epoch": 0.031465247338760174, "grad_norm": null, "learning_rate": 2.7439999999999998e-05, "loss": 12.4366, "step": 201 }, { "epoch": 0.03162179085785848, "grad_norm": 28.77062225341797, "learning_rate": 2.758e-05, "loss": 5.6762, "step": 202 }, { "epoch": 0.03177833437695679, "grad_norm": 9.3394775390625, "learning_rate": 2.772e-05, "loss": 3.7004, "step": 203 }, { "epoch": 0.031934877896055106, "grad_norm": 9.166064262390137, "learning_rate": 2.7859999999999998e-05, "loss": 3.6603, "step": 204 }, { "epoch": 0.032091421415153414, "grad_norm": 16.75288200378418, "learning_rate": 2.8e-05, "loss": 4.2684, "step": 205 }, { "epoch": 0.03224796493425172, "grad_norm": 14.719222068786621, "learning_rate": 2.8139999999999998e-05, "loss": 4.1728, "step": 206 }, { "epoch": 0.03240450845335003, "grad_norm": 14.848493576049805, "learning_rate": 2.828e-05, "loss": 4.2272, "step": 207 }, { "epoch": 0.03256105197244834, "grad_norm": 6.229966163635254, "learning_rate": 2.842e-05, "loss": 3.4759, "step": 208 }, { "epoch": 0.03271759549154665, "grad_norm": 12.533015251159668, "learning_rate": 2.8559999999999994e-05, "loss": 3.961, "step": 209 }, { "epoch": 0.03287413901064496, "grad_norm": 9.942062377929688, "learning_rate": 2.8699999999999996e-05, "loss": 3.7648, "step": 210 }, { "epoch": 0.03303068252974327, "grad_norm": 7.972365856170654, "learning_rate": 2.8839999999999995e-05, "loss": 3.6189, "step": 211 }, { "epoch": 0.03318722604884158, "grad_norm": 13.902708053588867, "learning_rate": 2.8979999999999997e-05, "loss": 4.0954, "step": 212 }, { "epoch": 
0.033343769567939885, "grad_norm": 14.266663551330566, "learning_rate": 2.9119999999999996e-05, "loss": 4.0796, "step": 213 }, { "epoch": 0.0335003130870382, "grad_norm": 11.64871883392334, "learning_rate": 2.9259999999999998e-05, "loss": 3.8556, "step": 214 }, { "epoch": 0.03365685660613651, "grad_norm": 5.260553359985352, "learning_rate": 2.9399999999999996e-05, "loss": 3.4416, "step": 215 }, { "epoch": 0.033813400125234816, "grad_norm": 7.941871166229248, "learning_rate": 2.9539999999999995e-05, "loss": 3.5842, "step": 216 }, { "epoch": 0.033969943644333124, "grad_norm": 43.5201416015625, "learning_rate": 2.9679999999999997e-05, "loss": 6.9265, "step": 217 }, { "epoch": 0.03412648716343143, "grad_norm": 18.56870460510254, "learning_rate": 2.9819999999999996e-05, "loss": 4.4608, "step": 218 }, { "epoch": 0.03428303068252974, "grad_norm": 18.25544548034668, "learning_rate": 2.9959999999999998e-05, "loss": 4.5381, "step": 219 }, { "epoch": 0.034439574201628055, "grad_norm": 11.908080101013184, "learning_rate": 3.0099999999999996e-05, "loss": 3.948, "step": 220 }, { "epoch": 0.03459611772072636, "grad_norm": 27.604923248291016, "learning_rate": 3.024e-05, "loss": 5.435, "step": 221 }, { "epoch": 0.03475266123982467, "grad_norm": 13.88245677947998, "learning_rate": 3.0379999999999997e-05, "loss": 4.1922, "step": 222 }, { "epoch": 0.03490920475892298, "grad_norm": 10.664935111999512, "learning_rate": 3.052e-05, "loss": 3.891, "step": 223 }, { "epoch": 0.03506574827802129, "grad_norm": 9.414133071899414, "learning_rate": 3.0659999999999994e-05, "loss": 3.7648, "step": 224 }, { "epoch": 0.0352222917971196, "grad_norm": 22.3635311126709, "learning_rate": 3.0799999999999996e-05, "loss": 4.9448, "step": 225 }, { "epoch": 0.03537883531621791, "grad_norm": 25.144500732421875, "learning_rate": 3.094e-05, "loss": 5.2136, "step": 226 }, { "epoch": 0.03553537883531622, "grad_norm": 7.499974250793457, "learning_rate": 3.108e-05, "loss": 3.6061, "step": 227 }, { "epoch": 
0.035691922354414526, "grad_norm": 15.297978401184082, "learning_rate": 3.1219999999999996e-05, "loss": 4.3248, "step": 228 }, { "epoch": 0.035848465873512834, "grad_norm": 9.243204116821289, "learning_rate": 3.136e-05, "loss": 3.8059, "step": 229 }, { "epoch": 0.03600500939261114, "grad_norm": 21.502397537231445, "learning_rate": 3.15e-05, "loss": 4.7616, "step": 230 }, { "epoch": 0.03616155291170946, "grad_norm": 18.98232650756836, "learning_rate": 3.1639999999999995e-05, "loss": 4.5398, "step": 231 }, { "epoch": 0.036318096430807766, "grad_norm": 12.74831771850586, "learning_rate": 3.178e-05, "loss": 4.0645, "step": 232 }, { "epoch": 0.036474639949906074, "grad_norm": 13.722548484802246, "learning_rate": 3.192e-05, "loss": 4.2198, "step": 233 }, { "epoch": 0.03663118346900438, "grad_norm": 11.861377716064453, "learning_rate": 3.206e-05, "loss": 4.0398, "step": 234 }, { "epoch": 0.03678772698810269, "grad_norm": 21.15317153930664, "learning_rate": 3.22e-05, "loss": 4.7558, "step": 235 }, { "epoch": 0.036944270507201005, "grad_norm": 12.645672798156738, "learning_rate": 3.234e-05, "loss": 4.1158, "step": 236 }, { "epoch": 0.03710081402629931, "grad_norm": 11.454920768737793, "learning_rate": 3.248e-05, "loss": 4.0685, "step": 237 }, { "epoch": 0.03725735754539762, "grad_norm": 11.464568138122559, "learning_rate": 3.2619999999999996e-05, "loss": 3.9596, "step": 238 }, { "epoch": 0.03741390106449593, "grad_norm": 15.744423866271973, "learning_rate": 3.276e-05, "loss": 4.327, "step": 239 }, { "epoch": 0.03757044458359424, "grad_norm": 13.98704719543457, "learning_rate": 3.289999999999999e-05, "loss": 4.1267, "step": 240 }, { "epoch": 0.03772698810269255, "grad_norm": 13.626340866088867, "learning_rate": 3.3039999999999995e-05, "loss": 4.0943, "step": 241 }, { "epoch": 0.03788353162179086, "grad_norm": 12.370940208435059, "learning_rate": 3.318e-05, "loss": 3.9801, "step": 242 }, { "epoch": 0.03804007514088917, "grad_norm": 10.311941146850586, "learning_rate": 
3.331999999999999e-05, "loss": 3.9071, "step": 243 }, { "epoch": 0.038196618659987476, "grad_norm": 6.937376499176025, "learning_rate": 3.3459999999999995e-05, "loss": 3.6784, "step": 244 }, { "epoch": 0.038353162179085784, "grad_norm": 12.20211410522461, "learning_rate": 3.36e-05, "loss": 3.9865, "step": 245 }, { "epoch": 0.03850970569818409, "grad_norm": 4.820307731628418, "learning_rate": 3.374e-05, "loss": 3.5143, "step": 246 }, { "epoch": 0.03866624921728241, "grad_norm": 4.323901653289795, "learning_rate": 3.3879999999999994e-05, "loss": 3.4918, "step": 247 }, { "epoch": 0.038822792736380715, "grad_norm": 4.810269355773926, "learning_rate": 3.4019999999999996e-05, "loss": 3.4706, "step": 248 }, { "epoch": 0.03897933625547902, "grad_norm": 3.738006114959717, "learning_rate": 3.416e-05, "loss": 3.3656, "step": 249 }, { "epoch": 0.03913587977457733, "grad_norm": 3.72810435295105, "learning_rate": 3.4299999999999993e-05, "loss": 3.2776, "step": 250 }, { "epoch": 0.03929242329367564, "grad_norm": 6.585350513458252, "learning_rate": 3.4439999999999996e-05, "loss": 3.3667, "step": 251 }, { "epoch": 0.039448966812773954, "grad_norm": 5.654883861541748, "learning_rate": 3.458e-05, "loss": 3.343, "step": 252 }, { "epoch": 0.03960551033187226, "grad_norm": 11.666877746582031, "learning_rate": 3.472e-05, "loss": 3.7088, "step": 253 }, { "epoch": 0.03976205385097057, "grad_norm": 6.485385894775391, "learning_rate": 3.4859999999999995e-05, "loss": 3.3576, "step": 254 }, { "epoch": 0.03991859737006888, "grad_norm": 6.121241569519043, "learning_rate": 3.5e-05, "loss": 3.3626, "step": 255 }, { "epoch": 0.040075140889167186, "grad_norm": 5.4339141845703125, "learning_rate": 3.514e-05, "loss": 3.2933, "step": 256 }, { "epoch": 0.040231684408265495, "grad_norm": 4.199186325073242, "learning_rate": 3.5279999999999994e-05, "loss": 3.2537, "step": 257 }, { "epoch": 0.04038822792736381, "grad_norm": 4.203208923339844, "learning_rate": 3.5419999999999996e-05, "loss": 3.2617, "step": 
258 }, { "epoch": 0.04054477144646212, "grad_norm": 18.386308670043945, "learning_rate": 3.556e-05, "loss": 4.5338, "step": 259 }, { "epoch": 0.040701314965560426, "grad_norm": 6.200634002685547, "learning_rate": 3.57e-05, "loss": 3.4138, "step": 260 }, { "epoch": 0.040857858484658734, "grad_norm": 5.144335746765137, "learning_rate": 3.5839999999999996e-05, "loss": 3.3172, "step": 261 }, { "epoch": 0.04101440200375704, "grad_norm": 3.195321798324585, "learning_rate": 3.598e-05, "loss": 3.1971, "step": 262 }, { "epoch": 0.04117094552285536, "grad_norm": 4.821245193481445, "learning_rate": 3.612e-05, "loss": 3.3516, "step": 263 }, { "epoch": 0.041327489041953665, "grad_norm": 3.4121010303497314, "learning_rate": 3.6259999999999995e-05, "loss": 3.2751, "step": 264 }, { "epoch": 0.04148403256105197, "grad_norm": 4.462987899780273, "learning_rate": 3.64e-05, "loss": 3.2847, "step": 265 }, { "epoch": 0.04164057608015028, "grad_norm": 5.4640116691589355, "learning_rate": 3.654e-05, "loss": 3.3456, "step": 266 }, { "epoch": 0.04179711959924859, "grad_norm": 17.4986515045166, "learning_rate": 3.668e-05, "loss": 4.4731, "step": 267 }, { "epoch": 0.041953663118346904, "grad_norm": 6.881798267364502, "learning_rate": 3.6819999999999996e-05, "loss": 3.4553, "step": 268 }, { "epoch": 0.04211020663744521, "grad_norm": 5.341724395751953, "learning_rate": 3.696e-05, "loss": 3.3121, "step": 269 }, { "epoch": 0.04226675015654352, "grad_norm": 8.90987777709961, "learning_rate": 3.71e-05, "loss": 3.565, "step": 270 }, { "epoch": 0.04242329367564183, "grad_norm": 24.777053833007812, "learning_rate": 3.7239999999999996e-05, "loss": 4.9198, "step": 271 }, { "epoch": 0.042579837194740136, "grad_norm": 14.460553169250488, "learning_rate": 3.738e-05, "loss": 4.1122, "step": 272 }, { "epoch": 0.042736380713838444, "grad_norm": 6.58654260635376, "learning_rate": 3.752e-05, "loss": 3.4372, "step": 273 }, { "epoch": 0.04289292423293676, "grad_norm": 17.076704025268555, "learning_rate": 
3.766e-05, "loss": 4.1661, "step": 274 }, { "epoch": 0.04304946775203507, "grad_norm": 4.002966403961182, "learning_rate": 3.78e-05, "loss": 3.4077, "step": 275 }, { "epoch": 0.043206011271133375, "grad_norm": 2.726980686187744, "learning_rate": 3.794e-05, "loss": 3.2774, "step": 276 }, { "epoch": 0.04336255479023168, "grad_norm": 4.7901411056518555, "learning_rate": 3.808e-05, "loss": 3.4158, "step": 277 }, { "epoch": 0.04351909830932999, "grad_norm": 18.389915466308594, "learning_rate": 3.8219999999999997e-05, "loss": 4.3671, "step": 278 }, { "epoch": 0.043675641828428306, "grad_norm": 14.567598342895508, "learning_rate": 3.836e-05, "loss": 4.1776, "step": 279 }, { "epoch": 0.043832185347526614, "grad_norm": 8.96108341217041, "learning_rate": 3.85e-05, "loss": 3.5875, "step": 280 }, { "epoch": 0.04398872886662492, "grad_norm": 5.448246955871582, "learning_rate": 3.864e-05, "loss": 3.5198, "step": 281 }, { "epoch": 0.04414527238572323, "grad_norm": 9.138839721679688, "learning_rate": 3.878e-05, "loss": 3.6999, "step": 282 }, { "epoch": 0.04430181590482154, "grad_norm": 10.229423522949219, "learning_rate": 3.892e-05, "loss": 3.7375, "step": 283 }, { "epoch": 0.044458359423919847, "grad_norm": 11.68979549407959, "learning_rate": 3.906e-05, "loss": 3.9089, "step": 284 }, { "epoch": 0.04461490294301816, "grad_norm": 12.639559745788574, "learning_rate": 3.92e-05, "loss": 3.9994, "step": 285 }, { "epoch": 0.04477144646211647, "grad_norm": 4.521181583404541, "learning_rate": 3.934e-05, "loss": 3.3929, "step": 286 }, { "epoch": 0.04492798998121478, "grad_norm": 2.1542844772338867, "learning_rate": 3.9479999999999995e-05, "loss": 3.4402, "step": 287 }, { "epoch": 0.045084533500313086, "grad_norm": 14.633146286010742, "learning_rate": 3.961999999999999e-05, "loss": 3.9756, "step": 288 }, { "epoch": 0.045241077019411394, "grad_norm": 10.249550819396973, "learning_rate": 3.975999999999999e-05, "loss": 3.8468, "step": 289 }, { "epoch": 0.04539762053850971, "grad_norm": 
6.249022483825684, "learning_rate": 3.9899999999999994e-05, "loss": 3.5803, "step": 290 }, { "epoch": 0.04555416405760802, "grad_norm": 8.393763542175293, "learning_rate": 4.0039999999999996e-05, "loss": 3.6792, "step": 291 }, { "epoch": 0.045710707576706325, "grad_norm": 5.026933193206787, "learning_rate": 4.017999999999999e-05, "loss": 3.356, "step": 292 }, { "epoch": 0.04586725109580463, "grad_norm": 4.246041297912598, "learning_rate": 4.0319999999999993e-05, "loss": 3.4519, "step": 293 }, { "epoch": 0.04602379461490294, "grad_norm": 4.978700637817383, "learning_rate": 4.0459999999999995e-05, "loss": 3.3964, "step": 294 }, { "epoch": 0.04618033813400125, "grad_norm": 2.1876394748687744, "learning_rate": 4.059999999999999e-05, "loss": 3.3925, "step": 295 }, { "epoch": 0.046336881653099564, "grad_norm": 2.8558971881866455, "learning_rate": 4.073999999999999e-05, "loss": 3.3408, "step": 296 }, { "epoch": 0.04649342517219787, "grad_norm": 3.4417145252227783, "learning_rate": 4.0879999999999995e-05, "loss": 3.2544, "step": 297 }, { "epoch": 0.04664996869129618, "grad_norm": 3.1559412479400635, "learning_rate": 4.102e-05, "loss": 3.1797, "step": 298 }, { "epoch": 0.04680651221039449, "grad_norm": 4.128979682922363, "learning_rate": 4.115999999999999e-05, "loss": 3.1832, "step": 299 }, { "epoch": 0.046963055729492796, "grad_norm": 2.514963388442993, "learning_rate": 4.1299999999999994e-05, "loss": 3.1399, "step": 300 }, { "epoch": 0.04711959924859111, "grad_norm": 53.75910949707031, "learning_rate": 4.1439999999999996e-05, "loss": 7.4382, "step": 301 }, { "epoch": 0.04727614276768942, "grad_norm": 3.7919201850891113, "learning_rate": 4.157999999999999e-05, "loss": 3.1439, "step": 302 }, { "epoch": 0.04743268628678773, "grad_norm": 28.56719207763672, "learning_rate": 4.1719999999999994e-05, "loss": 5.2481, "step": 303 }, { "epoch": 0.047589229805886035, "grad_norm": 2.9752249717712402, "learning_rate": 4.1859999999999996e-05, "loss": 3.113, "step": 304 }, { "epoch": 
0.04774577332498434, "grad_norm": 2.378140926361084, "learning_rate": 4.2e-05, "loss": 3.1044, "step": 305 }, { "epoch": 0.04790231684408266, "grad_norm": 3.3146214485168457, "learning_rate": 4.213999999999999e-05, "loss": 3.1184, "step": 306 }, { "epoch": 0.048058860363180966, "grad_norm": 1.5616509914398193, "learning_rate": 4.2279999999999995e-05, "loss": 3.09, "step": 307 }, { "epoch": 0.048215403882279274, "grad_norm": 2.7862565517425537, "learning_rate": 4.242e-05, "loss": 3.1452, "step": 308 }, { "epoch": 0.04837194740137758, "grad_norm": 1.9219461679458618, "learning_rate": 4.255999999999999e-05, "loss": 3.0716, "step": 309 }, { "epoch": 0.04852849092047589, "grad_norm": 4.604033470153809, "learning_rate": 4.2699999999999994e-05, "loss": 3.1988, "step": 310 }, { "epoch": 0.0486850344395742, "grad_norm": 5.247476577758789, "learning_rate": 4.2839999999999996e-05, "loss": 3.243, "step": 311 }, { "epoch": 0.048841577958672514, "grad_norm": 1.311023235321045, "learning_rate": 4.298e-05, "loss": 3.0379, "step": 312 }, { "epoch": 0.04899812147777082, "grad_norm": 8.354813575744629, "learning_rate": 4.3119999999999994e-05, "loss": 3.4952, "step": 313 }, { "epoch": 0.04915466499686913, "grad_norm": 6.086847305297852, "learning_rate": 4.3259999999999996e-05, "loss": 3.2849, "step": 314 }, { "epoch": 0.04931120851596744, "grad_norm": 7.599371910095215, "learning_rate": 4.34e-05, "loss": 3.4249, "step": 315 }, { "epoch": 0.049467752035065746, "grad_norm": 4.477779865264893, "learning_rate": 4.353999999999999e-05, "loss": 3.3336, "step": 316 }, { "epoch": 0.04962429555416406, "grad_norm": 13.604911804199219, "learning_rate": 4.3679999999999995e-05, "loss": 3.8961, "step": 317 }, { "epoch": 0.04978083907326237, "grad_norm": 4.719150066375732, "learning_rate": 4.382e-05, "loss": 3.2595, "step": 318 }, { "epoch": 0.04993738259236068, "grad_norm": 1.904133677482605, "learning_rate": 4.396e-05, "loss": 3.2546, "step": 319 }, { "epoch": 0.050093926111458985, "grad_norm": 
5.124321937561035, "learning_rate": 4.4099999999999995e-05, "loss": 3.4018, "step": 320 }, { "epoch": 0.05025046963055729, "grad_norm": 2.8773961067199707, "learning_rate": 4.4239999999999997e-05, "loss": 3.2185, "step": 321 }, { "epoch": 0.0504070131496556, "grad_norm": 9.473835945129395, "learning_rate": 4.438e-05, "loss": 3.5829, "step": 322 }, { "epoch": 0.050563556668753916, "grad_norm": 7.768177032470703, "learning_rate": 4.4519999999999994e-05, "loss": 3.5328, "step": 323 }, { "epoch": 0.050720100187852224, "grad_norm": 1.9281656742095947, "learning_rate": 4.4659999999999996e-05, "loss": 3.2461, "step": 324 }, { "epoch": 0.05087664370695053, "grad_norm": 1.4415054321289062, "learning_rate": 4.48e-05, "loss": 3.0669, "step": 325 }, { "epoch": 0.05103318722604884, "grad_norm": 8.724621772766113, "learning_rate": 4.494e-05, "loss": 3.4876, "step": 326 }, { "epoch": 0.05118973074514715, "grad_norm": 11.763263702392578, "learning_rate": 4.5079999999999995e-05, "loss": 3.5966, "step": 327 }, { "epoch": 0.05134627426424546, "grad_norm": 8.195770263671875, "learning_rate": 4.522e-05, "loss": 3.5513, "step": 328 }, { "epoch": 0.05150281778334377, "grad_norm": 10.06527042388916, "learning_rate": 4.536e-05, "loss": 3.5993, "step": 329 }, { "epoch": 0.05165936130244208, "grad_norm": 7.179144382476807, "learning_rate": 4.5499999999999995e-05, "loss": 3.4604, "step": 330 }, { "epoch": 0.05181590482154039, "grad_norm": 9.546896934509277, "learning_rate": 4.564e-05, "loss": 3.5112, "step": 331 }, { "epoch": 0.051972448340638695, "grad_norm": 3.6635303497314453, "learning_rate": 4.578e-05, "loss": 3.2679, "step": 332 }, { "epoch": 0.05212899185973701, "grad_norm": 11.099640846252441, "learning_rate": 4.592e-05, "loss": 3.5997, "step": 333 }, { "epoch": 0.05228553537883532, "grad_norm": 9.198573112487793, "learning_rate": 4.6059999999999996e-05, "loss": 3.4513, "step": 334 }, { "epoch": 0.052442078897933626, "grad_norm": 9.320886611938477, "learning_rate": 4.62e-05, "loss": 
3.5541, "step": 335 }, { "epoch": 0.052598622417031934, "grad_norm": 3.1771163940429688, "learning_rate": 4.634e-05, "loss": 3.4021, "step": 336 }, { "epoch": 0.05275516593613024, "grad_norm": 6.499922752380371, "learning_rate": 4.6479999999999995e-05, "loss": 3.5162, "step": 337 }, { "epoch": 0.05291170945522855, "grad_norm": 4.661427021026611, "learning_rate": 4.662e-05, "loss": 3.4079, "step": 338 }, { "epoch": 0.053068252974326866, "grad_norm": 4.89291524887085, "learning_rate": 4.676e-05, "loss": 3.4006, "step": 339 }, { "epoch": 0.053224796493425174, "grad_norm": 3.411698818206787, "learning_rate": 4.69e-05, "loss": 3.2728, "step": 340 }, { "epoch": 0.05338134001252348, "grad_norm": 3.584462881088257, "learning_rate": 4.704e-05, "loss": 3.3751, "step": 341 }, { "epoch": 0.05353788353162179, "grad_norm": 2.863245725631714, "learning_rate": 4.718e-05, "loss": 3.2824, "step": 342 }, { "epoch": 0.0536944270507201, "grad_norm": 2.3288283348083496, "learning_rate": 4.732e-05, "loss": 3.2279, "step": 343 }, { "epoch": 0.05385097056981841, "grad_norm": 1.9179085493087769, "learning_rate": 4.7459999999999996e-05, "loss": 3.2359, "step": 344 }, { "epoch": 0.05400751408891672, "grad_norm": 2.076956272125244, "learning_rate": 4.76e-05, "loss": 3.2371, "step": 345 }, { "epoch": 0.05416405760801503, "grad_norm": 2.8752267360687256, "learning_rate": 4.774e-05, "loss": 3.1826, "step": 346 }, { "epoch": 0.05432060112711334, "grad_norm": 3.050999641418457, "learning_rate": 4.788e-05, "loss": 3.1194, "step": 347 }, { "epoch": 0.054477144646211645, "grad_norm": 3.2072484493255615, "learning_rate": 4.802e-05, "loss": 3.0669, "step": 348 }, { "epoch": 0.05463368816530995, "grad_norm": 2.464811325073242, "learning_rate": 4.815999999999999e-05, "loss": 3.1379, "step": 349 }, { "epoch": 0.05479023168440827, "grad_norm": 2.9768426418304443, "learning_rate": 4.8299999999999995e-05, "loss": 3.0859, "step": 350 }, { "epoch": 0.054946775203506576, "grad_norm": 6.276824951171875, 
"learning_rate": 4.843999999999999e-05, "loss": 3.0969, "step": 351 }, { "epoch": 0.055103318722604884, "grad_norm": 10.770678520202637, "learning_rate": 4.857999999999999e-05, "loss": 3.5724, "step": 352 }, { "epoch": 0.05525986224170319, "grad_norm": 15.48678970336914, "learning_rate": 4.8719999999999994e-05, "loss": 4.0608, "step": 353 }, { "epoch": 0.0554164057608015, "grad_norm": 1.473160743713379, "learning_rate": 4.885999999999999e-05, "loss": 2.9992, "step": 354 }, { "epoch": 0.055572949279899815, "grad_norm": 3.5868937969207764, "learning_rate": 4.899999999999999e-05, "loss": 3.1124, "step": 355 }, { "epoch": 0.05572949279899812, "grad_norm": 1.761987328529358, "learning_rate": 4.9139999999999994e-05, "loss": 3.0523, "step": 356 }, { "epoch": 0.05588603631809643, "grad_norm": 1.666123628616333, "learning_rate": 4.9279999999999996e-05, "loss": 2.9849, "step": 357 }, { "epoch": 0.05604257983719474, "grad_norm": 0.9745649099349976, "learning_rate": 4.941999999999999e-05, "loss": 2.9642, "step": 358 }, { "epoch": 0.05619912335629305, "grad_norm": 19.621166229248047, "learning_rate": 4.955999999999999e-05, "loss": 4.3261, "step": 359 }, { "epoch": 0.056355666875391355, "grad_norm": 4.479330539703369, "learning_rate": 4.9699999999999995e-05, "loss": 3.0874, "step": 360 }, { "epoch": 0.05651221039448967, "grad_norm": 25.318208694458008, "learning_rate": 4.983999999999999e-05, "loss": 4.5146, "step": 361 }, { "epoch": 0.05666875391358798, "grad_norm": 8.722978591918945, "learning_rate": 4.997999999999999e-05, "loss": 3.4201, "step": 362 }, { "epoch": 0.056825297432686286, "grad_norm": 3.8922910690307617, "learning_rate": 5.0119999999999994e-05, "loss": 3.0446, "step": 363 }, { "epoch": 0.056981840951784594, "grad_norm": 13.044817924499512, "learning_rate": 5.0259999999999997e-05, "loss": 3.8147, "step": 364 }, { "epoch": 0.0571383844708829, "grad_norm": 3.826681613922119, "learning_rate": 5.039999999999999e-05, "loss": 3.0716, "step": 365 }, { "epoch": 
0.05729492798998122, "grad_norm": 3.5654048919677734, "learning_rate": 5.0539999999999994e-05, "loss": 3.3467, "step": 366 }, { "epoch": 0.057451471509079526, "grad_norm": 3.6011250019073486, "learning_rate": 5.0679999999999996e-05, "loss": 3.3084, "step": 367 }, { "epoch": 0.057608015028177834, "grad_norm": 1.1993982791900635, "learning_rate": 5.081999999999999e-05, "loss": 3.1187, "step": 368 }, { "epoch": 0.05776455854727614, "grad_norm": 1.2334762811660767, "learning_rate": 5.095999999999999e-05, "loss": 2.9954, "step": 369 }, { "epoch": 0.05792110206637445, "grad_norm": 0.696552574634552, "learning_rate": 5.1099999999999995e-05, "loss": 2.9845, "step": 370 }, { "epoch": 0.058077645585472765, "grad_norm": 7.719667434692383, "learning_rate": 5.124e-05, "loss": 3.2444, "step": 371 }, { "epoch": 0.05823418910457107, "grad_norm": 5.611908912658691, "learning_rate": 5.137999999999999e-05, "loss": 3.1913, "step": 372 }, { "epoch": 0.05839073262366938, "grad_norm": 7.84903621673584, "learning_rate": 5.1519999999999995e-05, "loss": 3.37, "step": 373 }, { "epoch": 0.05854727614276769, "grad_norm": 15.40585708618164, "learning_rate": 5.166e-05, "loss": 3.9866, "step": 374 }, { "epoch": 0.058703819661866, "grad_norm": 2.798308849334717, "learning_rate": 5.179999999999999e-05, "loss": 3.0555, "step": 375 }, { "epoch": 0.058860363180964305, "grad_norm": 9.838094711303711, "learning_rate": 5.1939999999999994e-05, "loss": 3.5761, "step": 376 }, { "epoch": 0.05901690670006262, "grad_norm": 5.334707260131836, "learning_rate": 5.2079999999999996e-05, "loss": 3.4454, "step": 377 }, { "epoch": 0.05917345021916093, "grad_norm": 1.580238938331604, "learning_rate": 5.222e-05, "loss": 3.2961, "step": 378 }, { "epoch": 0.059329993738259236, "grad_norm": 1.4036500453948975, "learning_rate": 5.235999999999999e-05, "loss": 3.213, "step": 379 }, { "epoch": 0.059486537257357544, "grad_norm": 7.042830467224121, "learning_rate": 5.2499999999999995e-05, "loss": 3.4195, "step": 380 }, { 
"epoch": 0.05964308077645585, "grad_norm": 8.693353652954102, "learning_rate": 5.264e-05, "loss": 3.4376, "step": 381 }, { "epoch": 0.05979962429555417, "grad_norm": 4.2105536460876465, "learning_rate": 5.277999999999999e-05, "loss": 3.2541, "step": 382 }, { "epoch": 0.059956167814652475, "grad_norm": 5.239287853240967, "learning_rate": 5.2919999999999995e-05, "loss": 3.083, "step": 383 }, { "epoch": 0.06011271133375078, "grad_norm": 2.8124728202819824, "learning_rate": 5.306e-05, "loss": 3.1264, "step": 384 }, { "epoch": 0.06026925485284909, "grad_norm": 7.982107639312744, "learning_rate": 5.32e-05, "loss": 3.3637, "step": 385 }, { "epoch": 0.0604257983719474, "grad_norm": 1.5782603025436401, "learning_rate": 5.3339999999999994e-05, "loss": 3.1169, "step": 386 }, { "epoch": 0.06058234189104571, "grad_norm": 5.1386399269104, "learning_rate": 5.3479999999999996e-05, "loss": 3.1831, "step": 387 }, { "epoch": 0.06073888541014402, "grad_norm": 1.747945785522461, "learning_rate": 5.362e-05, "loss": 3.1566, "step": 388 }, { "epoch": 0.06089542892924233, "grad_norm": 1.6316550970077515, "learning_rate": 5.3759999999999994e-05, "loss": 3.1493, "step": 389 }, { "epoch": 0.06105197244834064, "grad_norm": 4.590147495269775, "learning_rate": 5.3899999999999996e-05, "loss": 3.184, "step": 390 }, { "epoch": 0.061208515967438946, "grad_norm": 3.499342203140259, "learning_rate": 5.404e-05, "loss": 3.2257, "step": 391 }, { "epoch": 0.061365059486537255, "grad_norm": 2.639616012573242, "learning_rate": 5.418e-05, "loss": 3.2173, "step": 392 }, { "epoch": 0.06152160300563557, "grad_norm": 2.9570326805114746, "learning_rate": 5.4319999999999995e-05, "loss": 3.2191, "step": 393 }, { "epoch": 0.06167814652473388, "grad_norm": 4.223509311676025, "learning_rate": 5.446e-05, "loss": 3.1118, "step": 394 }, { "epoch": 0.061834690043832186, "grad_norm": 2.508502244949341, "learning_rate": 5.46e-05, "loss": 3.1308, "step": 395 }, { "epoch": 0.061991233562930494, "grad_norm": 
4.6453166007995605, "learning_rate": 5.4739999999999994e-05, "loss": 3.1553, "step": 396 }, { "epoch": 0.0621477770820288, "grad_norm": 4.023975372314453, "learning_rate": 5.4879999999999996e-05, "loss": 3.0423, "step": 397 }, { "epoch": 0.06230432060112712, "grad_norm": 2.9428560733795166, "learning_rate": 5.502e-05, "loss": 3.21, "step": 398 }, { "epoch": 0.062460864120225425, "grad_norm": 3.505303382873535, "learning_rate": 5.516e-05, "loss": 3.0834, "step": 399 }, { "epoch": 0.06261740763932373, "grad_norm": 5.094984531402588, "learning_rate": 5.5299999999999996e-05, "loss": 3.0155, "step": 400 }, { "epoch": 0.06277395115842205, "grad_norm": 25.98197364807129, "learning_rate": 5.544e-05, "loss": 4.9358, "step": 401 }, { "epoch": 0.06293049467752035, "grad_norm": 3.287330150604248, "learning_rate": 5.558e-05, "loss": 2.9763, "step": 402 }, { "epoch": 0.06308703819661866, "grad_norm": 5.438999652862549, "learning_rate": 5.5719999999999995e-05, "loss": 3.1853, "step": 403 }, { "epoch": 0.06324358171571696, "grad_norm": 1.6344505548477173, "learning_rate": 5.586e-05, "loss": 2.9701, "step": 404 }, { "epoch": 0.06340012523481528, "grad_norm": 0.9229876399040222, "learning_rate": 5.6e-05, "loss": 2.9332, "step": 405 }, { "epoch": 0.06355666875391358, "grad_norm": 1.3133913278579712, "learning_rate": 5.614e-05, "loss": 2.9835, "step": 406 }, { "epoch": 0.0637132122730119, "grad_norm": 1.1192504167556763, "learning_rate": 5.6279999999999996e-05, "loss": 2.92, "step": 407 }, { "epoch": 0.06386975579211021, "grad_norm": 2.4241080284118652, "learning_rate": 5.642e-05, "loss": 3.0014, "step": 408 }, { "epoch": 0.06402629931120851, "grad_norm": 1.330134630203247, "learning_rate": 5.656e-05, "loss": 2.9185, "step": 409 }, { "epoch": 0.06418284283030683, "grad_norm": 1.5071699619293213, "learning_rate": 5.6699999999999996e-05, "loss": 2.9006, "step": 410 }, { "epoch": 0.06433938634940513, "grad_norm": 1.3628085851669312, "learning_rate": 5.684e-05, "loss": 3.0322, "step": 411 
}, { "epoch": 0.06449592986850344, "grad_norm": 10.857100486755371, "learning_rate": 5.697999999999999e-05, "loss": 3.6546, "step": 412 }, { "epoch": 0.06465247338760176, "grad_norm": 1.8160020112991333, "learning_rate": 5.711999999999999e-05, "loss": 3.0462, "step": 413 }, { "epoch": 0.06480901690670006, "grad_norm": 0.875519335269928, "learning_rate": 5.725999999999999e-05, "loss": 2.926, "step": 414 }, { "epoch": 0.06496556042579837, "grad_norm": 1.4309730529785156, "learning_rate": 5.739999999999999e-05, "loss": 2.9425, "step": 415 }, { "epoch": 0.06512210394489668, "grad_norm": 1.572853684425354, "learning_rate": 5.7539999999999995e-05, "loss": 2.9289, "step": 416 }, { "epoch": 0.06527864746399499, "grad_norm": 1.674454927444458, "learning_rate": 5.767999999999999e-05, "loss": 2.942, "step": 417 }, { "epoch": 0.0654351909830933, "grad_norm": 2.338503122329712, "learning_rate": 5.781999999999999e-05, "loss": 2.9645, "step": 418 }, { "epoch": 0.0655917345021916, "grad_norm": 4.553142547607422, "learning_rate": 5.7959999999999994e-05, "loss": 3.1198, "step": 419 }, { "epoch": 0.06574827802128992, "grad_norm": 0.6445873379707336, "learning_rate": 5.809999999999999e-05, "loss": 2.927, "step": 420 }, { "epoch": 0.06590482154038822, "grad_norm": 1.2368569374084473, "learning_rate": 5.823999999999999e-05, "loss": 3.022, "step": 421 }, { "epoch": 0.06606136505948654, "grad_norm": 2.9631285667419434, "learning_rate": 5.837999999999999e-05, "loss": 3.1498, "step": 422 }, { "epoch": 0.06621790857858485, "grad_norm": 1.8125340938568115, "learning_rate": 5.8519999999999995e-05, "loss": 2.9333, "step": 423 }, { "epoch": 0.06637445209768315, "grad_norm": 2.6690914630889893, "learning_rate": 5.865999999999999e-05, "loss": 3.0792, "step": 424 }, { "epoch": 0.06653099561678147, "grad_norm": 5.652665615081787, "learning_rate": 5.879999999999999e-05, "loss": 3.2257, "step": 425 }, { "epoch": 0.06668753913587977, "grad_norm": 2.480700731277466, "learning_rate": 
5.8939999999999995e-05, "loss": 2.9905, "step": 426 }, { "epoch": 0.06684408265497808, "grad_norm": 1.751585841178894, "learning_rate": 5.907999999999999e-05, "loss": 2.9634, "step": 427 }, { "epoch": 0.0670006261740764, "grad_norm": 4.167579174041748, "learning_rate": 5.921999999999999e-05, "loss": 3.1511, "step": 428 }, { "epoch": 0.0671571696931747, "grad_norm": 3.322667360305786, "learning_rate": 5.9359999999999994e-05, "loss": 3.1265, "step": 429 }, { "epoch": 0.06731371321227302, "grad_norm": 2.215233087539673, "learning_rate": 5.9499999999999996e-05, "loss": 3.0473, "step": 430 }, { "epoch": 0.06747025673137132, "grad_norm": 0.7397876977920532, "learning_rate": 5.963999999999999e-05, "loss": 2.9634, "step": 431 }, { "epoch": 0.06762680025046963, "grad_norm": 6.137924671173096, "learning_rate": 5.9779999999999993e-05, "loss": 3.255, "step": 432 }, { "epoch": 0.06778334376956793, "grad_norm": 3.397953510284424, "learning_rate": 5.9919999999999996e-05, "loss": 3.1648, "step": 433 }, { "epoch": 0.06793988728866625, "grad_norm": 0.8925051689147949, "learning_rate": 6.005999999999999e-05, "loss": 3.0007, "step": 434 }, { "epoch": 0.06809643080776456, "grad_norm": 1.32594895362854, "learning_rate": 6.019999999999999e-05, "loss": 3.0816, "step": 435 }, { "epoch": 0.06825297432686286, "grad_norm": 2.1833043098449707, "learning_rate": 6.0339999999999995e-05, "loss": 3.069, "step": 436 }, { "epoch": 0.06840951784596118, "grad_norm": 1.2829161882400513, "learning_rate": 6.048e-05, "loss": 3.1638, "step": 437 }, { "epoch": 0.06856606136505948, "grad_norm": 1.8730409145355225, "learning_rate": 6.061999999999999e-05, "loss": 3.0345, "step": 438 }, { "epoch": 0.0687226048841578, "grad_norm": 1.6662507057189941, "learning_rate": 6.0759999999999994e-05, "loss": 3.0274, "step": 439 }, { "epoch": 0.06887914840325611, "grad_norm": NaN, "learning_rate": 6.0759999999999994e-05, "loss": 0.0, "step": 440 }, { "epoch": 0.06903569192235441, "grad_norm": 2.0696589946746826, 
"learning_rate": 6.0899999999999996e-05, "loss": 3.1195, "step": 441 }, { "epoch": 0.06919223544145273, "grad_norm": 1.7176661491394043, "learning_rate": 6.104e-05, "loss": 3.1157, "step": 442 }, { "epoch": 0.06934877896055103, "grad_norm": 3.3021953105926514, "learning_rate": 6.118e-05, "loss": 3.2324, "step": 443 }, { "epoch": 0.06950532247964934, "grad_norm": 3.1533589363098145, "learning_rate": 6.131999999999999e-05, "loss": 3.0174, "step": 444 }, { "epoch": 0.06966186599874766, "grad_norm": 3.815134286880493, "learning_rate": 6.146e-05, "loss": 3.1389, "step": 445 }, { "epoch": 0.06981840951784596, "grad_norm": 1.8318499326705933, "learning_rate": 6.159999999999999e-05, "loss": 2.983, "step": 446 }, { "epoch": 0.06997495303694427, "grad_norm": 1.8545029163360596, "learning_rate": 6.174e-05, "loss": 2.9118, "step": 447 }, { "epoch": 0.07013149655604257, "grad_norm": 2.28959584236145, "learning_rate": 6.188e-05, "loss": 2.9511, "step": 448 }, { "epoch": 0.07028804007514089, "grad_norm": 2.481537342071533, "learning_rate": 6.201999999999999e-05, "loss": 2.9072, "step": 449 }, { "epoch": 0.0704445835942392, "grad_norm": 3.6115753650665283, "learning_rate": 6.216e-05, "loss": 2.8926, "step": 450 }, { "epoch": 0.0706011271133375, "grad_norm": 3.4907355308532715, "learning_rate": 6.23e-05, "loss": 2.9571, "step": 451 }, { "epoch": 0.07075767063243582, "grad_norm": 13.391167640686035, "learning_rate": 6.243999999999999e-05, "loss": 3.7347, "step": 452 }, { "epoch": 0.07091421415153412, "grad_norm": 2.6494460105895996, "learning_rate": 6.258e-05, "loss": 2.9855, "step": 453 }, { "epoch": 0.07107075767063244, "grad_norm": 0.9494996070861816, "learning_rate": 6.272e-05, "loss": 2.9514, "step": 454 }, { "epoch": 0.07122730118973075, "grad_norm": 1.1306931972503662, "learning_rate": 6.285999999999999e-05, "loss": 2.9697, "step": 455 }, { "epoch": 0.07138384470882905, "grad_norm": 0.8614223003387451, "learning_rate": 6.3e-05, "loss": 2.8942, "step": 456 }, { "epoch": 
0.07154038822792737, "grad_norm": 2.1893296241760254, "learning_rate": 6.314e-05, "loss": 2.871, "step": 457 }, { "epoch": 0.07169693174702567, "grad_norm": 1.9369640350341797, "learning_rate": 6.327999999999999e-05, "loss": 2.8941, "step": 458 }, { "epoch": 0.07185347526612398, "grad_norm": 1.896386981010437, "learning_rate": 6.342e-05, "loss": 2.9751, "step": 459 }, { "epoch": 0.07201001878522229, "grad_norm": 0.7127363681793213, "learning_rate": 6.356e-05, "loss": 2.8717, "step": 460 }, { "epoch": 0.0721665623043206, "grad_norm": 1.3870997428894043, "learning_rate": 6.37e-05, "loss": 2.9001, "step": 461 }, { "epoch": 0.07232310582341892, "grad_norm": 0.7134589552879333, "learning_rate": 6.384e-05, "loss": 2.9198, "step": 462 }, { "epoch": 0.07247964934251722, "grad_norm": 6.471171855926514, "learning_rate": 6.398e-05, "loss": 3.1625, "step": 463 }, { "epoch": 0.07263619286161553, "grad_norm": 5.356045722961426, "learning_rate": 6.412e-05, "loss": 3.0925, "step": 464 }, { "epoch": 0.07279273638071383, "grad_norm": 2.269282579421997, "learning_rate": 6.426e-05, "loss": 2.8846, "step": 465 }, { "epoch": 0.07294927989981215, "grad_norm": 3.7076480388641357, "learning_rate": 6.44e-05, "loss": 3.0131, "step": 466 }, { "epoch": 0.07310582341891046, "grad_norm": 1.2271075248718262, "learning_rate": 6.454e-05, "loss": 2.9078, "step": 467 }, { "epoch": 0.07326236693800876, "grad_norm": 4.326624870300293, "learning_rate": 6.468e-05, "loss": 3.1584, "step": 468 }, { "epoch": 0.07341891045710708, "grad_norm": 4.925378799438477, "learning_rate": 6.481999999999999e-05, "loss": 3.1539, "step": 469 }, { "epoch": 0.07357545397620538, "grad_norm": 1.0414175987243652, "learning_rate": 6.496e-05, "loss": 2.9689, "step": 470 }, { "epoch": 0.0737319974953037, "grad_norm": 5.466559886932373, "learning_rate": 6.51e-05, "loss": 3.1443, "step": 471 }, { "epoch": 0.07388854101440201, "grad_norm": 3.0781617164611816, "learning_rate": 6.523999999999999e-05, "loss": 3.0921, "step": 472 }, { 
"epoch": 0.07404508453350031, "grad_norm": 2.2696845531463623, "learning_rate": 6.538e-05, "loss": 3.0663, "step": 473 }, { "epoch": 0.07420162805259863, "grad_norm": 2.8215489387512207, "learning_rate": 6.552e-05, "loss": 2.9396, "step": 474 }, { "epoch": 0.07435817157169693, "grad_norm": 2.02180814743042, "learning_rate": 6.565999999999999e-05, "loss": 3.0117, "step": 475 }, { "epoch": 0.07451471509079524, "grad_norm": 2.335123300552368, "learning_rate": 6.579999999999999e-05, "loss": 3.018, "step": 476 }, { "epoch": 0.07467125860989356, "grad_norm": 2.6606829166412354, "learning_rate": 6.594e-05, "loss": 2.9977, "step": 477 }, { "epoch": 0.07482780212899186, "grad_norm": 3.1835718154907227, "learning_rate": 6.607999999999999e-05, "loss": 3.0181, "step": 478 }, { "epoch": 0.07498434564809017, "grad_norm": 3.8521945476531982, "learning_rate": 6.621999999999999e-05, "loss": 2.9892, "step": 479 }, { "epoch": 0.07514088916718847, "grad_norm": 2.283893585205078, "learning_rate": 6.636e-05, "loss": 2.9682, "step": 480 }, { "epoch": 0.07529743268628679, "grad_norm": 9.207347869873047, "learning_rate": 6.649999999999999e-05, "loss": 3.39, "step": 481 }, { "epoch": 0.0754539762053851, "grad_norm": 1.712112545967102, "learning_rate": 6.663999999999999e-05, "loss": 3.0407, "step": 482 }, { "epoch": 0.0756105197244834, "grad_norm": 1.8745099306106567, "learning_rate": 6.678e-05, "loss": 3.0954, "step": 483 }, { "epoch": 0.07576706324358172, "grad_norm": 1.8073405027389526, "learning_rate": 6.691999999999999e-05, "loss": 3.072, "step": 484 }, { "epoch": 0.07592360676268002, "grad_norm": 1.3374913930892944, "learning_rate": 6.705999999999998e-05, "loss": 3.0623, "step": 485 }, { "epoch": 0.07608015028177834, "grad_norm": 2.1693854331970215, "learning_rate": 6.72e-05, "loss": 3.0345, "step": 486 }, { "epoch": 0.07623669380087664, "grad_norm": 4.127687454223633, "learning_rate": 6.733999999999999e-05, "loss": 3.1899, "step": 487 }, { "epoch": 0.07639323731997495, "grad_norm": 
3.6713342666625977, "learning_rate": 6.748e-05, "loss": 3.1221, "step": 488 }, { "epoch": 0.07654978083907327, "grad_norm": 2.465630292892456, "learning_rate": 6.761999999999999e-05, "loss": 3.0684, "step": 489 }, { "epoch": 0.07670632435817157, "grad_norm": 1.8548108339309692, "learning_rate": 6.775999999999999e-05, "loss": 2.9735, "step": 490 }, { "epoch": 0.07686286787726988, "grad_norm": 2.20125675201416, "learning_rate": 6.79e-05, "loss": 3.0506, "step": 491 }, { "epoch": 0.07701941139636818, "grad_norm": 5.245711326599121, "learning_rate": 6.803999999999999e-05, "loss": 3.0033, "step": 492 }, { "epoch": 0.0771759549154665, "grad_norm": 1.4434552192687988, "learning_rate": 6.817999999999999e-05, "loss": 3.0777, "step": 493 }, { "epoch": 0.07733249843456481, "grad_norm": 2.1795578002929688, "learning_rate": 6.832e-05, "loss": 3.0205, "step": 494 }, { "epoch": 0.07748904195366312, "grad_norm": 2.3812692165374756, "learning_rate": 6.845999999999999e-05, "loss": 2.833, "step": 495 }, { "epoch": 0.07764558547276143, "grad_norm": 1.7308846712112427, "learning_rate": 6.859999999999999e-05, "loss": 2.8444, "step": 496 }, { "epoch": 0.07780212899185973, "grad_norm": 1.6226398944854736, "learning_rate": 6.874e-05, "loss": 2.8134, "step": 497 }, { "epoch": 0.07795867251095805, "grad_norm": 2.1018800735473633, "learning_rate": 6.887999999999999e-05, "loss": 2.7407, "step": 498 }, { "epoch": 0.07811521603005636, "grad_norm": 2.066850423812866, "learning_rate": 6.901999999999999e-05, "loss": 2.6417, "step": 499 }, { "epoch": 0.07827175954915466, "grad_norm": 3.613501787185669, "learning_rate": 6.916e-05, "loss": 2.7097, "step": 500 }, { "epoch": 0.07842830306825298, "grad_norm": 2.3364012241363525, "learning_rate": 6.929999999999999e-05, "loss": 2.8966, "step": 501 }, { "epoch": 0.07858484658735128, "grad_norm": 1.1388334035873413, "learning_rate": 6.944e-05, "loss": 2.8814, "step": 502 }, { "epoch": 0.0787413901064496, "grad_norm": 1.03941810131073, "learning_rate": 
6.958e-05, "loss": 2.8706, "step": 503 }, { "epoch": 0.07889793362554791, "grad_norm": 1.4305840730667114, "learning_rate": 6.971999999999999e-05, "loss": 2.9769, "step": 504 }, { "epoch": 0.07905447714464621, "grad_norm": 0.9836013913154602, "learning_rate": 6.986e-05, "loss": 2.8783, "step": 505 }, { "epoch": 0.07921102066374452, "grad_norm": 0.7068406939506531, "learning_rate": 7e-05, "loss": 2.8777, "step": 506 }, { "epoch": 0.07936756418284283, "grad_norm": 1.998130440711975, "learning_rate": 7.013999999999999e-05, "loss": 2.8613, "step": 507 }, { "epoch": 0.07952410770194114, "grad_norm": 1.6989305019378662, "learning_rate": 7.028e-05, "loss": 2.8581, "step": 508 }, { "epoch": 0.07968065122103946, "grad_norm": 2.511554002761841, "learning_rate": 7.042e-05, "loss": 2.8916, "step": 509 }, { "epoch": 0.07983719474013776, "grad_norm": 5.153507709503174, "learning_rate": 7.055999999999999e-05, "loss": 3.0328, "step": 510 }, { "epoch": 0.07999373825923607, "grad_norm": 1.0285046100616455, "learning_rate": 7.07e-05, "loss": 2.8676, "step": 511 }, { "epoch": 0.08015028177833437, "grad_norm": 0.5948107242584229, "learning_rate": 7.083999999999999e-05, "loss": 2.8943, "step": 512 }, { "epoch": 0.08030682529743269, "grad_norm": 0.5872496366500854, "learning_rate": 7.098e-05, "loss": 2.8788, "step": 513 }, { "epoch": 0.08046336881653099, "grad_norm": 1.1362733840942383, "learning_rate": 7.112e-05, "loss": 2.8846, "step": 514 }, { "epoch": 0.0806199123356293, "grad_norm": 0.42162302136421204, "learning_rate": 7.125999999999999e-05, "loss": 2.8623, "step": 515 }, { "epoch": 0.08077645585472762, "grad_norm": 2.4953296184539795, "learning_rate": 7.14e-05, "loss": 3.0009, "step": 516 }, { "epoch": 0.08093299937382592, "grad_norm": 2.0124881267547607, "learning_rate": 7.154e-05, "loss": 2.9044, "step": 517 }, { "epoch": 0.08108954289292424, "grad_norm": 2.574488639831543, "learning_rate": 7.167999999999999e-05, "loss": 3.0357, "step": 518 }, { "epoch": 0.08124608641202254, 
"grad_norm": 1.6243562698364258, "learning_rate": 7.182e-05, "loss": 2.9614, "step": 519 }, { "epoch": 0.08140262993112085, "grad_norm": 1.3826205730438232, "learning_rate": 7.196e-05, "loss": 2.9003, "step": 520 }, { "epoch": 0.08155917345021917, "grad_norm": 0.680793821811676, "learning_rate": 7.209999999999999e-05, "loss": 2.9884, "step": 521 }, { "epoch": 0.08171571696931747, "grad_norm": 1.3191125392913818, "learning_rate": 7.224e-05, "loss": 2.931, "step": 522 }, { "epoch": 0.08187226048841578, "grad_norm": 0.9748547673225403, "learning_rate": 7.238e-05, "loss": 2.8969, "step": 523 }, { "epoch": 0.08202880400751408, "grad_norm": 1.0261504650115967, "learning_rate": 7.251999999999999e-05, "loss": 2.9627, "step": 524 }, { "epoch": 0.0821853475266124, "grad_norm": 4.3971333503723145, "learning_rate": 7.266e-05, "loss": 3.0272, "step": 525 }, { "epoch": 0.08234189104571071, "grad_norm": 2.1641757488250732, "learning_rate": 7.28e-05, "loss": 2.8305, "step": 526 }, { "epoch": 0.08249843456480901, "grad_norm": 7.192819118499756, "learning_rate": 7.294e-05, "loss": 3.2899, "step": 527 }, { "epoch": 0.08265497808390733, "grad_norm": 1.042494297027588, "learning_rate": 7.308e-05, "loss": 2.9381, "step": 528 }, { "epoch": 0.08281152160300563, "grad_norm": 1.9490046501159668, "learning_rate": 7.322e-05, "loss": 2.9896, "step": 529 }, { "epoch": 0.08296806512210395, "grad_norm": 2.549988269805908, "learning_rate": 7.336e-05, "loss": 2.9751, "step": 530 }, { "epoch": 0.08312460864120226, "grad_norm": 2.1152634620666504, "learning_rate": 7.35e-05, "loss": 2.9422, "step": 531 }, { "epoch": 0.08328115216030056, "grad_norm": 1.540295124053955, "learning_rate": 7.363999999999999e-05, "loss": 2.8751, "step": 532 }, { "epoch": 0.08343769567939888, "grad_norm": 1.410292387008667, "learning_rate": 7.378e-05, "loss": 3.0182, "step": 533 }, { "epoch": 0.08359423919849718, "grad_norm": 2.054586887359619, "learning_rate": 7.392e-05, "loss": 2.8315, "step": 534 }, { "epoch": 
0.08375078271759549, "grad_norm": 1.32257878780365, "learning_rate": 7.405999999999999e-05, "loss": 2.9228, "step": 535 }, { "epoch": 0.08390732623669381, "grad_norm": 1.6682566404342651, "learning_rate": 7.42e-05, "loss": 2.8909, "step": 536 }, { "epoch": 0.08406386975579211, "grad_norm": 3.105055570602417, "learning_rate": 7.434e-05, "loss": 2.8866, "step": 537 }, { "epoch": 0.08422041327489042, "grad_norm": 4.16715669631958, "learning_rate": 7.447999999999999e-05, "loss": 2.9528, "step": 538 }, { "epoch": 0.08437695679398872, "grad_norm": 2.468618154525757, "learning_rate": 7.462e-05, "loss": 2.8113, "step": 539 }, { "epoch": 0.08453350031308704, "grad_norm": 2.3760077953338623, "learning_rate": 7.476e-05, "loss": 2.8861, "step": 540 }, { "epoch": 0.08469004383218534, "grad_norm": 3.5385241508483887, "learning_rate": 7.49e-05, "loss": 2.8922, "step": 541 }, { "epoch": 0.08484658735128366, "grad_norm": 2.3770461082458496, "learning_rate": 7.504e-05, "loss": 2.8256, "step": 542 }, { "epoch": 0.08500313087038197, "grad_norm": 1.4000109434127808, "learning_rate": 7.518e-05, "loss": 2.7394, "step": 543 }, { "epoch": 0.08515967438948027, "grad_norm": 2.695282459259033, "learning_rate": 7.532e-05, "loss": 2.7196, "step": 544 }, { "epoch": 0.08531621790857859, "grad_norm": 1.7002220153808594, "learning_rate": 7.546e-05, "loss": 2.7179, "step": 545 }, { "epoch": 0.08547276142767689, "grad_norm": 3.1049537658691406, "learning_rate": 7.56e-05, "loss": 2.442, "step": 546 }, { "epoch": 0.0856293049467752, "grad_norm": 2.3834686279296875, "learning_rate": 7.574e-05, "loss": 2.6144, "step": 547 }, { "epoch": 0.08578584846587352, "grad_norm": 2.76003360748291, "learning_rate": 7.588e-05, "loss": 2.6096, "step": 548 }, { "epoch": 0.08594239198497182, "grad_norm": 1.84690260887146, "learning_rate": 7.602e-05, "loss": 2.4598, "step": 549 }, { "epoch": 0.08609893550407013, "grad_norm": 1.6521265506744385, "learning_rate": 7.616e-05, "loss": 2.52, "step": 550 }, { "epoch": 
0.08625547902316844, "grad_norm": 11.100810050964355, "learning_rate": 7.63e-05, "loss": 3.2894, "step": 551 }, { "epoch": 0.08641202254226675, "grad_norm": 8.84830379486084, "learning_rate": 7.643999999999999e-05, "loss": 3.1724, "step": 552 }, { "epoch": 0.08656856606136507, "grad_norm": 5.679826736450195, "learning_rate": 7.658e-05, "loss": 2.994, "step": 553 }, { "epoch": 0.08672510958046337, "grad_norm": 2.468665838241577, "learning_rate": 7.672e-05, "loss": 2.9113, "step": 554 }, { "epoch": 0.08688165309956168, "grad_norm": 1.5779838562011719, "learning_rate": 7.686e-05, "loss": 2.8815, "step": 555 }, { "epoch": 0.08703819661865998, "grad_norm": 3.218441963195801, "learning_rate": 7.7e-05, "loss": 2.9306, "step": 556 }, { "epoch": 0.0871947401377583, "grad_norm": 3.097341537475586, "learning_rate": 7.714e-05, "loss": 2.9045, "step": 557 }, { "epoch": 0.08735128365685661, "grad_norm": 6.378298282623291, "learning_rate": 7.728e-05, "loss": 3.3148, "step": 558 }, { "epoch": 0.08750782717595491, "grad_norm": 1.3787956237792969, "learning_rate": 7.742e-05, "loss": 2.9096, "step": 559 }, { "epoch": 0.08766437069505323, "grad_norm": 0.5859063863754272, "learning_rate": 7.756e-05, "loss": 2.8908, "step": 560 }, { "epoch": 0.08782091421415153, "grad_norm": 2.936734437942505, "learning_rate": 7.77e-05, "loss": 2.9232, "step": 561 }, { "epoch": 0.08797745773324984, "grad_norm": 4.0955891609191895, "learning_rate": 7.784e-05, "loss": 2.9191, "step": 562 }, { "epoch": 0.08813400125234815, "grad_norm": 4.440272808074951, "learning_rate": 7.798e-05, "loss": 2.9375, "step": 563 }, { "epoch": 0.08829054477144646, "grad_norm": 3.984534740447998, "learning_rate": 7.812e-05, "loss": 3.02, "step": 564 }, { "epoch": 0.08844708829054478, "grad_norm": 1.9874281883239746, "learning_rate": 7.826e-05, "loss": 2.8447, "step": 565 }, { "epoch": 0.08860363180964308, "grad_norm": 0.7987030744552612, "learning_rate": 7.84e-05, "loss": 2.9011, "step": 566 }, { "epoch": 0.08876017532874139, 
"grad_norm": 0.9665445685386658, "learning_rate": 7.854e-05, "loss": 2.9383, "step": 567 }, { "epoch": 0.08891671884783969, "grad_norm": 1.8189830780029297, "learning_rate": 7.868e-05, "loss": 2.9895, "step": 568 }, { "epoch": 0.08907326236693801, "grad_norm": 1.3790318965911865, "learning_rate": 7.881999999999998e-05, "loss": 2.8828, "step": 569 }, { "epoch": 0.08922980588603632, "grad_norm": 2.744035005569458, "learning_rate": 7.895999999999999e-05, "loss": 2.959, "step": 570 }, { "epoch": 0.08938634940513462, "grad_norm": 1.0563842058181763, "learning_rate": 7.909999999999998e-05, "loss": 2.9077, "step": 571 }, { "epoch": 0.08954289292423294, "grad_norm": 2.293290138244629, "learning_rate": 7.923999999999998e-05, "loss": 2.9149, "step": 572 }, { "epoch": 0.08969943644333124, "grad_norm": 3.3430798053741455, "learning_rate": 7.937999999999999e-05, "loss": 3.0014, "step": 573 }, { "epoch": 0.08985597996242956, "grad_norm": 2.3264386653900146, "learning_rate": 7.951999999999998e-05, "loss": 2.8606, "step": 574 }, { "epoch": 0.09001252348152787, "grad_norm": 0.7302554249763489, "learning_rate": 7.965999999999998e-05, "loss": 2.8798, "step": 575 }, { "epoch": 0.09016906700062617, "grad_norm": 2.4640374183654785, "learning_rate": 7.979999999999999e-05, "loss": 2.8171, "step": 576 }, { "epoch": 0.09032561051972449, "grad_norm": 2.573153257369995, "learning_rate": 7.993999999999998e-05, "loss": 2.9294, "step": 577 }, { "epoch": 0.09048215403882279, "grad_norm": 1.442392349243164, "learning_rate": 8.007999999999999e-05, "loss": 2.9444, "step": 578 }, { "epoch": 0.0906386975579211, "grad_norm": 1.0828009843826294, "learning_rate": 8.021999999999999e-05, "loss": 2.8334, "step": 579 }, { "epoch": 0.09079524107701942, "grad_norm": 4.217101097106934, "learning_rate": 8.035999999999998e-05, "loss": 2.9442, "step": 580 }, { "epoch": 0.09095178459611772, "grad_norm": 0.8283713459968567, "learning_rate": 8.049999999999999e-05, "loss": 2.7398, "step": 581 }, { "epoch": 
0.09110832811521603, "grad_norm": 0.8829928636550903, "learning_rate": 8.063999999999999e-05, "loss": 2.7967, "step": 582 }, { "epoch": 0.09126487163431433, "grad_norm": 1.2256791591644287, "learning_rate": 8.077999999999998e-05, "loss": 2.8872, "step": 583 }, { "epoch": 0.09142141515341265, "grad_norm": 1.8439127206802368, "learning_rate": 8.091999999999999e-05, "loss": 2.6761, "step": 584 }, { "epoch": 0.09157795867251096, "grad_norm": 1.3791660070419312, "learning_rate": 8.105999999999999e-05, "loss": 2.8657, "step": 585 }, { "epoch": 0.09173450219160927, "grad_norm": 2.70691180229187, "learning_rate": 8.119999999999998e-05, "loss": 2.8441, "step": 586 }, { "epoch": 0.09189104571070758, "grad_norm": 1.5067205429077148, "learning_rate": 8.133999999999999e-05, "loss": 2.7653, "step": 587 }, { "epoch": 0.09204758922980588, "grad_norm": 2.6871140003204346, "learning_rate": 8.147999999999999e-05, "loss": 2.616, "step": 588 }, { "epoch": 0.0922041327489042, "grad_norm": 3.1449267864227295, "learning_rate": 8.161999999999998e-05, "loss": 2.6634, "step": 589 }, { "epoch": 0.0923606762680025, "grad_norm": 1.8038227558135986, "learning_rate": 8.175999999999999e-05, "loss": 2.719, "step": 590 }, { "epoch": 0.09251721978710081, "grad_norm": 1.8315274715423584, "learning_rate": 8.189999999999998e-05, "loss": 2.5572, "step": 591 }, { "epoch": 0.09267376330619913, "grad_norm": 2.2419536113739014, "learning_rate": 8.204e-05, "loss": 2.7329, "step": 592 }, { "epoch": 0.09283030682529743, "grad_norm": 1.3130000829696655, "learning_rate": 8.217999999999999e-05, "loss": 2.4834, "step": 593 }, { "epoch": 0.09298685034439574, "grad_norm": 1.3675116300582886, "learning_rate": 8.231999999999998e-05, "loss": 2.5511, "step": 594 }, { "epoch": 0.09314339386349405, "grad_norm": 1.8123753070831299, "learning_rate": 8.245999999999999e-05, "loss": 2.5266, "step": 595 }, { "epoch": 0.09329993738259236, "grad_norm": 3.733072280883789, "learning_rate": 8.259999999999999e-05, "loss": 2.3111, 
"step": 596 }, { "epoch": 0.09345648090169068, "grad_norm": 2.02605938911438, "learning_rate": 8.273999999999998e-05, "loss": 2.4535, "step": 597 }, { "epoch": 0.09361302442078898, "grad_norm": 1.969171166419983, "learning_rate": 8.287999999999999e-05, "loss": 2.4936, "step": 598 }, { "epoch": 0.09376956793988729, "grad_norm": 3.000863552093506, "learning_rate": 8.301999999999999e-05, "loss": 2.2823, "step": 599 }, { "epoch": 0.09392611145898559, "grad_norm": 1.7408140897750854, "learning_rate": 8.315999999999998e-05, "loss": 2.1555, "step": 600 }, { "epoch": 0.09408265497808391, "grad_norm": 4.489687919616699, "learning_rate": 8.329999999999999e-05, "loss": 3.0176, "step": 601 }, { "epoch": 0.09423919849718222, "grad_norm": 3.8201985359191895, "learning_rate": 8.343999999999999e-05, "loss": 2.9745, "step": 602 }, { "epoch": 0.09439574201628052, "grad_norm": 2.3922502994537354, "learning_rate": 8.357999999999998e-05, "loss": 2.8904, "step": 603 }, { "epoch": 0.09455228553537884, "grad_norm": 0.8516799211502075, "learning_rate": 8.371999999999999e-05, "loss": 2.8647, "step": 604 }, { "epoch": 0.09470882905447714, "grad_norm": 2.5623295307159424, "learning_rate": 8.385999999999999e-05, "loss": 2.8542, "step": 605 }, { "epoch": 0.09486537257357545, "grad_norm": 4.226286888122559, "learning_rate": 8.4e-05, "loss": 2.9238, "step": 606 }, { "epoch": 0.09502191609267377, "grad_norm": 2.9560070037841797, "learning_rate": 8.413999999999999e-05, "loss": 2.8869, "step": 607 }, { "epoch": 0.09517845961177207, "grad_norm": 3.6041259765625, "learning_rate": 8.427999999999999e-05, "loss": 2.9067, "step": 608 }, { "epoch": 0.09533500313087039, "grad_norm": 2.1277339458465576, "learning_rate": 8.442e-05, "loss": 2.8573, "step": 609 }, { "epoch": 0.09549154664996869, "grad_norm": 1.1484607458114624, "learning_rate": 8.455999999999999e-05, "loss": 2.8809, "step": 610 }, { "epoch": 0.095648090169067, "grad_norm": 1.037593960762024, "learning_rate": 8.469999999999999e-05, "loss": 
2.8537, "step": 611 }, { "epoch": 0.09580463368816532, "grad_norm": 1.8294048309326172, "learning_rate": 8.484e-05, "loss": 2.8484, "step": 612 }, { "epoch": 0.09596117720726362, "grad_norm": 8.515142440795898, "learning_rate": 8.497999999999999e-05, "loss": 3.4446, "step": 613 }, { "epoch": 0.09611772072636193, "grad_norm": 1.435266137123108, "learning_rate": 8.511999999999998e-05, "loss": 2.9317, "step": 614 }, { "epoch": 0.09627426424546023, "grad_norm": 2.6695430278778076, "learning_rate": 8.526e-05, "loss": 2.8678, "step": 615 }, { "epoch": 0.09643080776455855, "grad_norm": 0.5360819697380066, "learning_rate": 8.539999999999999e-05, "loss": 2.8697, "step": 616 }, { "epoch": 0.09658735128365685, "grad_norm": 1.6742193698883057, "learning_rate": 8.553999999999998e-05, "loss": 2.8585, "step": 617 }, { "epoch": 0.09674389480275516, "grad_norm": 2.3096132278442383, "learning_rate": 8.567999999999999e-05, "loss": 2.8592, "step": 618 }, { "epoch": 0.09690043832185348, "grad_norm": 2.621411085128784, "learning_rate": 8.581999999999999e-05, "loss": 2.9058, "step": 619 }, { "epoch": 0.09705698184095178, "grad_norm": 0.7777635455131531, "learning_rate": 8.596e-05, "loss": 2.8421, "step": 620 }, { "epoch": 0.0972135253600501, "grad_norm": 1.642793893814087, "learning_rate": 8.609999999999999e-05, "loss": 2.8438, "step": 621 }, { "epoch": 0.0973700688791484, "grad_norm": 1.0681241750717163, "learning_rate": 8.623999999999999e-05, "loss": 2.9454, "step": 622 }, { "epoch": 0.09752661239824671, "grad_norm": 0.9751691222190857, "learning_rate": 8.638e-05, "loss": 2.8857, "step": 623 }, { "epoch": 0.09768315591734503, "grad_norm": 2.1378631591796875, "learning_rate": 8.651999999999999e-05, "loss": 2.8965, "step": 624 }, { "epoch": 0.09783969943644333, "grad_norm": 1.288797378540039, "learning_rate": 8.665999999999999e-05, "loss": 2.8132, "step": 625 }, { "epoch": 0.09799624295554164, "grad_norm": 1.5462113618850708, "learning_rate": 8.68e-05, "loss": 2.8404, "step": 626 }, { 
"epoch": 0.09815278647463994, "grad_norm": 1.1035099029541016, "learning_rate": 8.693999999999999e-05, "loss": 2.8998, "step": 627 }, { "epoch": 0.09830932999373826, "grad_norm": 1.574508547782898, "learning_rate": 8.707999999999999e-05, "loss": 2.8313, "step": 628 }, { "epoch": 0.09846587351283657, "grad_norm": 1.4513823986053467, "learning_rate": 8.722e-05, "loss": 2.7766, "step": 629 }, { "epoch": 0.09862241703193488, "grad_norm": 1.5229530334472656, "learning_rate": 8.735999999999999e-05, "loss": 2.7159, "step": 630 }, { "epoch": 0.09877896055103319, "grad_norm": 1.1908684968948364, "learning_rate": 8.749999999999999e-05, "loss": 2.7411, "step": 631 }, { "epoch": 0.09893550407013149, "grad_norm": 1.6419241428375244, "learning_rate": 8.764e-05, "loss": 2.761, "step": 632 }, { "epoch": 0.0990920475892298, "grad_norm": 1.42733895778656, "learning_rate": 8.777999999999999e-05, "loss": 2.7846, "step": 633 }, { "epoch": 0.09924859110832812, "grad_norm": 2.1779654026031494, "learning_rate": 8.792e-05, "loss": 2.6949, "step": 634 }, { "epoch": 0.09940513462742642, "grad_norm": 2.86039662361145, "learning_rate": 8.806e-05, "loss": 2.8665, "step": 635 }, { "epoch": 0.09956167814652474, "grad_norm": 3.148642063140869, "learning_rate": 8.819999999999999e-05, "loss": 2.7125, "step": 636 }, { "epoch": 0.09971822166562304, "grad_norm": 1.4110866785049438, "learning_rate": 8.834e-05, "loss": 2.5152, "step": 637 }, { "epoch": 0.09987476518472135, "grad_norm": 3.0192577838897705, "learning_rate": 8.847999999999999e-05, "loss": 2.6036, "step": 638 }, { "epoch": 0.10003130870381967, "grad_norm": 2.068753480911255, "learning_rate": 8.861999999999999e-05, "loss": 2.7059, "step": 639 }, { "epoch": 0.10018785222291797, "grad_norm": 3.629587173461914, "learning_rate": 8.876e-05, "loss": 2.8164, "step": 640 }, { "epoch": 0.10034439574201628, "grad_norm": 3.1281771659851074, "learning_rate": 8.889999999999999e-05, "loss": 2.7567, "step": 641 }, { "epoch": 0.10050093926111459, 
"grad_norm": 2.863898754119873, "learning_rate": 8.903999999999999e-05, "loss": 2.645, "step": 642 }, { "epoch": 0.1006574827802129, "grad_norm": 2.017267942428589, "learning_rate": 8.918e-05, "loss": 2.4937, "step": 643 }, { "epoch": 0.1008140262993112, "grad_norm": 1.6639717817306519, "learning_rate": 8.931999999999999e-05, "loss": 2.5942, "step": 644 }, { "epoch": 0.10097056981840952, "grad_norm": 2.2766458988189697, "learning_rate": 8.946e-05, "loss": 2.4, "step": 645 }, { "epoch": 0.10112711333750783, "grad_norm": 6.2266716957092285, "learning_rate": 8.96e-05, "loss": 2.5775, "step": 646 }, { "epoch": 0.10128365685660613, "grad_norm": 5.088527679443359, "learning_rate": 8.973999999999999e-05, "loss": 2.6901, "step": 647 }, { "epoch": 0.10144020037570445, "grad_norm": 1.4510891437530518, "learning_rate": 8.988e-05, "loss": 2.138, "step": 648 }, { "epoch": 0.10159674389480275, "grad_norm": 1.373038411140442, "learning_rate": 9.002e-05, "loss": 2.2692, "step": 649 }, { "epoch": 0.10175328741390106, "grad_norm": 3.7602195739746094, "learning_rate": 9.015999999999999e-05, "loss": 2.1993, "step": 650 }, { "epoch": 0.10190983093299938, "grad_norm": 1.4847732782363892, "learning_rate": 9.03e-05, "loss": 2.9333, "step": 651 }, { "epoch": 0.10206637445209768, "grad_norm": 1.532929539680481, "learning_rate": 9.044e-05, "loss": 2.8841, "step": 652 }, { "epoch": 0.102222917971196, "grad_norm": 1.7859914302825928, "learning_rate": 9.057999999999999e-05, "loss": 2.9098, "step": 653 }, { "epoch": 0.1023794614902943, "grad_norm": 1.9093459844589233, "learning_rate": 9.072e-05, "loss": 2.8683, "step": 654 }, { "epoch": 0.10253600500939261, "grad_norm": 2.4284961223602295, "learning_rate": 9.086e-05, "loss": 2.8872, "step": 655 }, { "epoch": 0.10269254852849093, "grad_norm": 2.410187244415283, "learning_rate": 9.099999999999999e-05, "loss": 2.9094, "step": 656 }, { "epoch": 0.10284909204758923, "grad_norm": 0.8081480860710144, "learning_rate": 9.114e-05, "loss": 2.8466, "step": 
657 }, { "epoch": 0.10300563556668754, "grad_norm": 1.265276312828064, "learning_rate": 9.128e-05, "loss": 2.9126, "step": 658 }, { "epoch": 0.10316217908578584, "grad_norm": 0.8512921333312988, "learning_rate": 9.142e-05, "loss": 2.8192, "step": 659 }, { "epoch": 0.10331872260488416, "grad_norm": 0.758476972579956, "learning_rate": 9.156e-05, "loss": 2.8346, "step": 660 }, { "epoch": 0.10347526612398247, "grad_norm": 2.6204535961151123, "learning_rate": 9.169999999999999e-05, "loss": 2.8972, "step": 661 }, { "epoch": 0.10363180964308077, "grad_norm": 2.4315996170043945, "learning_rate": 9.184e-05, "loss": 2.8794, "step": 662 }, { "epoch": 0.10378835316217909, "grad_norm": 2.650017023086548, "learning_rate": 9.198e-05, "loss": 2.8723, "step": 663 }, { "epoch": 0.10394489668127739, "grad_norm": 0.8188495635986328, "learning_rate": 9.211999999999999e-05, "loss": 2.8609, "step": 664 }, { "epoch": 0.1041014402003757, "grad_norm": 1.487927794456482, "learning_rate": 9.226e-05, "loss": 2.874, "step": 665 }, { "epoch": 0.10425798371947402, "grad_norm": 1.0953418016433716, "learning_rate": 9.24e-05, "loss": 2.8506, "step": 666 }, { "epoch": 0.10441452723857232, "grad_norm": 3.1868810653686523, "learning_rate": 9.253999999999999e-05, "loss": 3.0821, "step": 667 }, { "epoch": 0.10457107075767064, "grad_norm": 0.9702733755111694, "learning_rate": 9.268e-05, "loss": 2.8428, "step": 668 }, { "epoch": 0.10472761427676894, "grad_norm": 0.9401553273200989, "learning_rate": 9.282e-05, "loss": 2.8111, "step": 669 }, { "epoch": 0.10488415779586725, "grad_norm": 2.7934765815734863, "learning_rate": 9.295999999999999e-05, "loss": 2.8008, "step": 670 }, { "epoch": 0.10504070131496555, "grad_norm": 1.5276439189910889, "learning_rate": 9.31e-05, "loss": 2.8088, "step": 671 }, { "epoch": 0.10519724483406387, "grad_norm": 1.0339336395263672, "learning_rate": 9.324e-05, "loss": 2.7978, "step": 672 }, { "epoch": 0.10535378835316218, "grad_norm": 2.7799739837646484, "learning_rate": 9.338e-05, 
"loss": 2.8838, "step": 673 }, { "epoch": 0.10551033187226048, "grad_norm": 1.9179078340530396, "learning_rate": 9.352e-05, "loss": 2.8577, "step": 674 }, { "epoch": 0.1056668753913588, "grad_norm": 1.5971529483795166, "learning_rate": 9.366e-05, "loss": 2.6386, "step": 675 }, { "epoch": 0.1058234189104571, "grad_norm": 1.2030808925628662, "learning_rate": 9.38e-05, "loss": 2.8363, "step": 676 }, { "epoch": 0.10597996242955542, "grad_norm": 0.8602957725524902, "learning_rate": 9.394e-05, "loss": 2.7644, "step": 677 }, { "epoch": 0.10613650594865373, "grad_norm": 2.3469550609588623, "learning_rate": 9.408e-05, "loss": 2.7364, "step": 678 }, { "epoch": 0.10629304946775203, "grad_norm": 1.8024535179138184, "learning_rate": 9.422e-05, "loss": 2.7554, "step": 679 }, { "epoch": 0.10644959298685035, "grad_norm": 1.4375693798065186, "learning_rate": 9.436e-05, "loss": 2.8329, "step": 680 }, { "epoch": 0.10660613650594865, "grad_norm": 1.7252322435379028, "learning_rate": 9.449999999999999e-05, "loss": 2.9088, "step": 681 }, { "epoch": 0.10676268002504696, "grad_norm": 1.2475411891937256, "learning_rate": 9.464e-05, "loss": 2.6744, "step": 682 }, { "epoch": 0.10691922354414528, "grad_norm": 0.9993375539779663, "learning_rate": 9.478e-05, "loss": 2.6178, "step": 683 }, { "epoch": 0.10707576706324358, "grad_norm": 1.7166149616241455, "learning_rate": 9.491999999999999e-05, "loss": 2.6437, "step": 684 }, { "epoch": 0.1072323105823419, "grad_norm": 4.021966934204102, "learning_rate": 9.506e-05, "loss": 2.749, "step": 685 }, { "epoch": 0.1073888541014402, "grad_norm": 2.7211408615112305, "learning_rate": 9.52e-05, "loss": 2.678, "step": 686 }, { "epoch": 0.10754539762053851, "grad_norm": 1.1073458194732666, "learning_rate": 9.534e-05, "loss": 2.6733, "step": 687 }, { "epoch": 0.10770194113963683, "grad_norm": 1.3809196949005127, "learning_rate": 9.548e-05, "loss": 2.8193, "step": 688 }, { "epoch": 0.10785848465873513, "grad_norm": 2.2120792865753174, "learning_rate": 9.562e-05, 
"loss": 2.6639, "step": 689 }, { "epoch": 0.10801502817783344, "grad_norm": 1.7682451009750366, "learning_rate": 9.576e-05, "loss": 2.6724, "step": 690 }, { "epoch": 0.10817157169693174, "grad_norm": 1.890321969985962, "learning_rate": 9.59e-05, "loss": 2.5904, "step": 691 }, { "epoch": 0.10832811521603006, "grad_norm": 2.1124002933502197, "learning_rate": 9.604e-05, "loss": 2.7319, "step": 692 }, { "epoch": 0.10848465873512837, "grad_norm": 2.8238260746002197, "learning_rate": 9.618e-05, "loss": 2.5775, "step": 693 }, { "epoch": 0.10864120225422667, "grad_norm": 2.0197784900665283, "learning_rate": 9.631999999999999e-05, "loss": 2.3455, "step": 694 }, { "epoch": 0.10879774577332499, "grad_norm": 1.6647446155548096, "learning_rate": 9.645999999999998e-05, "loss": 2.5223, "step": 695 }, { "epoch": 0.10895428929242329, "grad_norm": 1.962484359741211, "learning_rate": 9.659999999999999e-05, "loss": 2.1781, "step": 696 }, { "epoch": 0.1091108328115216, "grad_norm": 2.5072314739227295, "learning_rate": 9.673999999999999e-05, "loss": 2.2811, "step": 697 }, { "epoch": 0.1092673763306199, "grad_norm": 2.431469202041626, "learning_rate": 9.687999999999998e-05, "loss": 2.2547, "step": 698 }, { "epoch": 0.10942391984971822, "grad_norm": 1.4257805347442627, "learning_rate": 9.701999999999999e-05, "loss": 2.1571, "step": 699 }, { "epoch": 0.10958046336881654, "grad_norm": 2.262010335922241, "learning_rate": 9.715999999999998e-05, "loss": 2.0916, "step": 700 }, { "epoch": 0.10973700688791484, "grad_norm": 1.3178746700286865, "learning_rate": 9.729999999999998e-05, "loss": 2.9068, "step": 701 }, { "epoch": 0.10989355040701315, "grad_norm": 0.9112570881843567, "learning_rate": 9.743999999999999e-05, "loss": 2.8922, "step": 702 }, { "epoch": 0.11005009392611145, "grad_norm": 1.1442265510559082, "learning_rate": 9.757999999999998e-05, "loss": 2.8575, "step": 703 }, { "epoch": 0.11020663744520977, "grad_norm": 2.08422589302063, "learning_rate": 9.771999999999998e-05, "loss": 2.8724, 
"step": 704 }, { "epoch": 0.11036318096430808, "grad_norm": 2.7726058959960938, "learning_rate": 9.785999999999999e-05, "loss": 2.871, "step": 705 }, { "epoch": 0.11051972448340638, "grad_norm": 2.077904462814331, "learning_rate": 9.799999999999998e-05, "loss": 2.8446, "step": 706 }, { "epoch": 0.1106762680025047, "grad_norm": 0.760727047920227, "learning_rate": 9.813999999999998e-05, "loss": 2.8183, "step": 707 }, { "epoch": 0.110832811521603, "grad_norm": 1.2310662269592285, "learning_rate": 9.827999999999999e-05, "loss": 2.8084, "step": 708 }, { "epoch": 0.11098935504070132, "grad_norm": 0.8499639630317688, "learning_rate": 9.841999999999998e-05, "loss": 2.8289, "step": 709 }, { "epoch": 0.11114589855979963, "grad_norm": 0.7850086688995361, "learning_rate": 9.855999999999999e-05, "loss": 2.8231, "step": 710 }, { "epoch": 0.11130244207889793, "grad_norm": 2.7962753772735596, "learning_rate": 9.869999999999999e-05, "loss": 2.9292, "step": 711 }, { "epoch": 0.11145898559799625, "grad_norm": 2.2632522583007812, "learning_rate": 9.883999999999998e-05, "loss": 2.8112, "step": 712 }, { "epoch": 0.11161552911709455, "grad_norm": 3.973036527633667, "learning_rate": 9.897999999999999e-05, "loss": 2.9688, "step": 713 }, { "epoch": 0.11177207263619286, "grad_norm": 1.4864583015441895, "learning_rate": 9.911999999999999e-05, "loss": 2.8939, "step": 714 }, { "epoch": 0.11192861615529118, "grad_norm": 1.578096866607666, "learning_rate": 9.925999999999998e-05, "loss": 2.7988, "step": 715 }, { "epoch": 0.11208515967438948, "grad_norm": 0.40915024280548096, "learning_rate": 9.939999999999999e-05, "loss": 2.828, "step": 716 }, { "epoch": 0.1122417031934878, "grad_norm": 1.3297410011291504, "learning_rate": 9.953999999999999e-05, "loss": 2.8632, "step": 717 }, { "epoch": 0.1123982467125861, "grad_norm": 1.3306127786636353, "learning_rate": 9.967999999999998e-05, "loss": 2.8194, "step": 718 }, { "epoch": 0.11255479023168441, "grad_norm": 1.8279871940612793, "learning_rate": 
9.981999999999999e-05, "loss": 2.8314, "step": 719 }, { "epoch": 0.11271133375078271, "grad_norm": 0.6762781143188477, "learning_rate": 9.995999999999998e-05, "loss": 2.7888, "step": 720 }, { "epoch": 0.11286787726988103, "grad_norm": 0.9003087878227234, "learning_rate": 0.00010009999999999998, "loss": 2.807, "step": 721 }, { "epoch": 0.11302442078897934, "grad_norm": 1.076756477355957, "learning_rate": 0.00010023999999999999, "loss": 2.8194, "step": 722 }, { "epoch": 0.11318096430807764, "grad_norm": 0.9080193638801575, "learning_rate": 0.00010037999999999998, "loss": 2.8307, "step": 723 }, { "epoch": 0.11333750782717596, "grad_norm": 0.9916881322860718, "learning_rate": 0.00010051999999999999, "loss": 2.8013, "step": 724 }, { "epoch": 0.11349405134627426, "grad_norm": 1.0087413787841797, "learning_rate": 0.00010065999999999999, "loss": 2.8192, "step": 725 }, { "epoch": 0.11365059486537257, "grad_norm": 1.6485861539840698, "learning_rate": 0.00010079999999999998, "loss": 2.814, "step": 726 }, { "epoch": 0.11380713838447089, "grad_norm": 2.1220555305480957, "learning_rate": 0.00010093999999999999, "loss": 2.9067, "step": 727 }, { "epoch": 0.11396368190356919, "grad_norm": 0.9233254194259644, "learning_rate": 0.00010107999999999999, "loss": 2.7573, "step": 728 }, { "epoch": 0.1141202254226675, "grad_norm": 0.7290918231010437, "learning_rate": 0.00010121999999999998, "loss": 2.7854, "step": 729 }, { "epoch": 0.1142767689417658, "grad_norm": 2.6598281860351562, "learning_rate": 0.00010135999999999999, "loss": 2.6142, "step": 730 }, { "epoch": 0.11443331246086412, "grad_norm": 1.2976399660110474, "learning_rate": 0.00010149999999999999, "loss": 2.6832, "step": 731 }, { "epoch": 0.11458985597996243, "grad_norm": 1.2561722993850708, "learning_rate": 0.00010163999999999998, "loss": 2.7044, "step": 732 }, { "epoch": 0.11474639949906074, "grad_norm": 1.116884708404541, "learning_rate": 0.00010177999999999999, "loss": 2.6394, "step": 733 }, { "epoch": 0.11490294301815905, 
"grad_norm": 1.446645736694336, "learning_rate": 0.00010191999999999999, "loss": 2.5789, "step": 734 }, { "epoch": 0.11505948653725735, "grad_norm": 2.160897731781006, "learning_rate": 0.00010205999999999998, "loss": 2.5513, "step": 735 }, { "epoch": 0.11521603005635567, "grad_norm": 0.988793671131134, "learning_rate": 0.00010219999999999999, "loss": 2.64, "step": 736 }, { "epoch": 0.11537257357545398, "grad_norm": 1.931432843208313, "learning_rate": 0.00010233999999999999, "loss": 2.7412, "step": 737 }, { "epoch": 0.11552911709455228, "grad_norm": 1.6486951112747192, "learning_rate": 0.00010248, "loss": 2.6024, "step": 738 }, { "epoch": 0.1156856606136506, "grad_norm": 1.5773563385009766, "learning_rate": 0.00010261999999999999, "loss": 2.5944, "step": 739 }, { "epoch": 0.1158422041327489, "grad_norm": 1.9509053230285645, "learning_rate": 0.00010275999999999999, "loss": 2.5517, "step": 740 }, { "epoch": 0.11599874765184721, "grad_norm": 2.108076333999634, "learning_rate": 0.0001029, "loss": 2.4797, "step": 741 }, { "epoch": 0.11615529117094553, "grad_norm": 1.8481203317642212, "learning_rate": 0.00010303999999999999, "loss": 2.4519, "step": 742 }, { "epoch": 0.11631183469004383, "grad_norm": 1.6996781826019287, "learning_rate": 0.00010317999999999998, "loss": 2.2561, "step": 743 }, { "epoch": 0.11646837820914215, "grad_norm": 1.9146181344985962, "learning_rate": 0.00010332, "loss": 2.3379, "step": 744 }, { "epoch": 0.11662492172824045, "grad_norm": 2.795153856277466, "learning_rate": 0.00010345999999999999, "loss": 2.0774, "step": 745 }, { "epoch": 0.11678146524733876, "grad_norm": 1.735450267791748, "learning_rate": 0.00010359999999999998, "loss": 2.3589, "step": 746 }, { "epoch": 0.11693800876643706, "grad_norm": 3.740893840789795, "learning_rate": 0.00010373999999999999, "loss": 2.2129, "step": 747 }, { "epoch": 0.11709455228553538, "grad_norm": 2.0985841751098633, "learning_rate": 0.00010387999999999999, "loss": 2.4758, "step": 748 }, { "epoch": 
0.11725109580463369, "grad_norm": 1.8879237174987793, "learning_rate": 0.00010401999999999998, "loss": 2.0567, "step": 749 }, { "epoch": 0.117407639323732, "grad_norm": 1.401533603668213, "learning_rate": 0.00010415999999999999, "loss": 2.1373, "step": 750 }, { "epoch": 0.11756418284283031, "grad_norm": 4.911151885986328, "learning_rate": 0.00010429999999999999, "loss": 2.9637, "step": 751 }, { "epoch": 0.11772072636192861, "grad_norm": 3.5999467372894287, "learning_rate": 0.00010444, "loss": 2.8693, "step": 752 }, { "epoch": 0.11787726988102692, "grad_norm": 0.8501458764076233, "learning_rate": 0.00010457999999999999, "loss": 2.8108, "step": 753 }, { "epoch": 0.11803381340012524, "grad_norm": 5.197429180145264, "learning_rate": 0.00010471999999999999, "loss": 3.1202, "step": 754 }, { "epoch": 0.11819035691922354, "grad_norm": 3.0273237228393555, "learning_rate": 0.00010486, "loss": 2.8435, "step": 755 }, { "epoch": 0.11834690043832186, "grad_norm": 2.168957233428955, "learning_rate": 0.00010499999999999999, "loss": 2.8011, "step": 756 }, { "epoch": 0.11850344395742016, "grad_norm": 1.7167737483978271, "learning_rate": 0.00010513999999999999, "loss": 2.7909, "step": 757 }, { "epoch": 0.11865998747651847, "grad_norm": 0.9550461173057556, "learning_rate": 0.00010528, "loss": 2.7704, "step": 758 }, { "epoch": 0.11881653099561679, "grad_norm": 1.4692827463150024, "learning_rate": 0.00010541999999999999, "loss": 2.7736, "step": 759 }, { "epoch": 0.11897307451471509, "grad_norm": 3.0094316005706787, "learning_rate": 0.00010555999999999999, "loss": 2.8565, "step": 760 }, { "epoch": 0.1191296180338134, "grad_norm": 1.7324938774108887, "learning_rate": 0.0001057, "loss": 2.8138, "step": 761 }, { "epoch": 0.1192861615529117, "grad_norm": 0.6479263305664062, "learning_rate": 0.00010583999999999999, "loss": 2.7833, "step": 762 }, { "epoch": 0.11944270507201002, "grad_norm": 0.8179497122764587, "learning_rate": 0.00010598, "loss": 2.764, "step": 763 }, { "epoch": 
0.11959924859110833, "grad_norm": 1.214524507522583, "learning_rate": 0.00010612, "loss": 2.7187, "step": 764 }, { "epoch": 0.11975579211020664, "grad_norm": 0.8078150749206543, "learning_rate": 0.00010625999999999999, "loss": 2.7217, "step": 765 }, { "epoch": 0.11991233562930495, "grad_norm": 0.5933513045310974, "learning_rate": 0.0001064, "loss": 2.8056, "step": 766 }, { "epoch": 0.12006887914840325, "grad_norm": 3.4325051307678223, "learning_rate": 0.00010653999999999999, "loss": 2.8174, "step": 767 }, { "epoch": 0.12022542266750157, "grad_norm": 2.178292751312256, "learning_rate": 0.00010667999999999999, "loss": 2.7634, "step": 768 }, { "epoch": 0.12038196618659988, "grad_norm": 1.1788640022277832, "learning_rate": 0.00010682, "loss": 2.7628, "step": 769 }, { "epoch": 0.12053850970569818, "grad_norm": 0.6470080614089966, "learning_rate": 0.00010695999999999999, "loss": 2.6896, "step": 770 }, { "epoch": 0.1206950532247965, "grad_norm": 0.8920907974243164, "learning_rate": 0.00010709999999999999, "loss": 2.6884, "step": 771 }, { "epoch": 0.1208515967438948, "grad_norm": 0.7915209531784058, "learning_rate": 0.00010724, "loss": 2.66, "step": 772 }, { "epoch": 0.12100814026299311, "grad_norm": 0.7725313305854797, "learning_rate": 0.00010737999999999999, "loss": 2.6398, "step": 773 }, { "epoch": 0.12116468378209141, "grad_norm": 0.7909332513809204, "learning_rate": 0.00010751999999999999, "loss": 2.6488, "step": 774 }, { "epoch": 0.12132122730118973, "grad_norm": 2.7973432540893555, "learning_rate": 0.00010766, "loss": 2.7846, "step": 775 }, { "epoch": 0.12147777082028804, "grad_norm": 4.6005072593688965, "learning_rate": 0.00010779999999999999, "loss": 2.8383, "step": 776 }, { "epoch": 0.12163431433938635, "grad_norm": 2.302690267562866, "learning_rate": 0.00010794, "loss": 2.8112, "step": 777 }, { "epoch": 0.12179085785848466, "grad_norm": 1.183908462524414, "learning_rate": 0.00010808, "loss": 2.5984, "step": 778 }, { "epoch": 0.12194740137758296, "grad_norm": 
1.5682957172393799, "learning_rate": 0.00010821999999999999, "loss": 2.6565, "step": 779 }, { "epoch": 0.12210394489668128, "grad_norm": 1.2014832496643066, "learning_rate": 0.00010836, "loss": 2.7704, "step": 780 }, { "epoch": 0.12226048841577959, "grad_norm": 1.6014989614486694, "learning_rate": 0.0001085, "loss": 2.6136, "step": 781 }, { "epoch": 0.12241703193487789, "grad_norm": 1.7116495370864868, "learning_rate": 0.00010863999999999999, "loss": 2.7371, "step": 782 }, { "epoch": 0.12257357545397621, "grad_norm": 2.7797698974609375, "learning_rate": 0.00010878, "loss": 2.5162, "step": 783 }, { "epoch": 0.12273011897307451, "grad_norm": 1.0761003494262695, "learning_rate": 0.00010892, "loss": 2.7245, "step": 784 }, { "epoch": 0.12288666249217282, "grad_norm": 3.096675395965576, "learning_rate": 0.00010905999999999999, "loss": 2.6077, "step": 785 }, { "epoch": 0.12304320601127114, "grad_norm": 2.3801164627075195, "learning_rate": 0.0001092, "loss": 2.5498, "step": 786 }, { "epoch": 0.12319974953036944, "grad_norm": 1.6000444889068604, "learning_rate": 0.00010934, "loss": 2.619, "step": 787 }, { "epoch": 0.12335629304946776, "grad_norm": 3.186272621154785, "learning_rate": 0.00010947999999999999, "loss": 2.3956, "step": 788 }, { "epoch": 0.12351283656856606, "grad_norm": 2.3642852306365967, "learning_rate": 0.00010962, "loss": 2.1786, "step": 789 }, { "epoch": 0.12366938008766437, "grad_norm": 4.243374347686768, "learning_rate": 0.00010975999999999999, "loss": 2.3578, "step": 790 }, { "epoch": 0.12382592360676269, "grad_norm": 3.846938133239746, "learning_rate": 0.0001099, "loss": 2.5135, "step": 791 }, { "epoch": 0.12398246712586099, "grad_norm": 1.5724093914031982, "learning_rate": 0.00011004, "loss": 2.2236, "step": 792 }, { "epoch": 0.1241390106449593, "grad_norm": 3.214336395263672, "learning_rate": 0.00011017999999999999, "loss": 2.3937, "step": 793 }, { "epoch": 0.1242955541640576, "grad_norm": 1.8162360191345215, "learning_rate": 0.00011032, "loss": 
2.1069, "step": 794 }, { "epoch": 0.12445209768315592, "grad_norm": 2.335206985473633, "learning_rate": 0.00011046, "loss": 2.2843, "step": 795 }, { "epoch": 0.12460864120225423, "grad_norm": 1.9656800031661987, "learning_rate": 0.00011059999999999999, "loss": 1.8953, "step": 796 }, { "epoch": 0.12476518472135253, "grad_norm": 3.1267311573028564, "learning_rate": 0.00011074, "loss": 1.9038, "step": 797 }, { "epoch": 0.12492172824045085, "grad_norm": 1.8955881595611572, "learning_rate": 0.00011088, "loss": 1.7882, "step": 798 }, { "epoch": 0.12507827175954916, "grad_norm": 4.234310626983643, "learning_rate": 0.00011101999999999999, "loss": 1.9273, "step": 799 }, { "epoch": 0.12523481527864747, "grad_norm": 1.78632390499115, "learning_rate": 0.00011116, "loss": 1.7571, "step": 800 }, { "epoch": 0.12539135879774577, "grad_norm": 1.041991114616394, "learning_rate": 0.0001113, "loss": 2.6974, "step": 801 }, { "epoch": 0.1255479023168441, "grad_norm": 0.8799676299095154, "learning_rate": 0.00011143999999999999, "loss": 2.683, "step": 802 }, { "epoch": 0.1257044458359424, "grad_norm": 0.7313692569732666, "learning_rate": 0.00011158, "loss": 2.6082, "step": 803 }, { "epoch": 0.1258609893550407, "grad_norm": 0.5345801115036011, "learning_rate": 0.00011172, "loss": 2.5935, "step": 804 }, { "epoch": 0.126017532874139, "grad_norm": 0.7087952494621277, "learning_rate": 0.00011186, "loss": 2.5459, "step": 805 }, { "epoch": 0.12617407639323733, "grad_norm": 0.9282543659210205, "learning_rate": 0.000112, "loss": 2.5396, "step": 806 }, { "epoch": 0.12633061991233563, "grad_norm": 0.9615651965141296, "learning_rate": 0.00011214, "loss": 2.5188, "step": 807 }, { "epoch": 0.12648716343143393, "grad_norm": 0.7608094215393066, "learning_rate": 0.00011228, "loss": 2.5365, "step": 808 }, { "epoch": 0.12664370695053226, "grad_norm": 0.5784610509872437, "learning_rate": 0.00011242, "loss": 2.5474, "step": 809 }, { "epoch": 0.12680025046963056, "grad_norm": 1.0504860877990723, 
"learning_rate": 0.00011255999999999999, "loss": 2.4385, "step": 810 }, { "epoch": 0.12695679398872886, "grad_norm": 1.4830121994018555, "learning_rate": 0.0001127, "loss": 2.4918, "step": 811 }, { "epoch": 0.12711333750782716, "grad_norm": 0.6658587455749512, "learning_rate": 0.00011284, "loss": 2.4198, "step": 812 }, { "epoch": 0.1272698810269255, "grad_norm": 0.8800923824310303, "learning_rate": 0.00011297999999999999, "loss": 2.407, "step": 813 }, { "epoch": 0.1274264245460238, "grad_norm": 0.8589938879013062, "learning_rate": 0.00011312, "loss": 2.413, "step": 814 }, { "epoch": 0.1275829680651221, "grad_norm": 1.3858628273010254, "learning_rate": 0.00011326, "loss": 2.4897, "step": 815 }, { "epoch": 0.12773951158422042, "grad_norm": 0.8424991369247437, "learning_rate": 0.00011339999999999999, "loss": 2.3499, "step": 816 }, { "epoch": 0.12789605510331872, "grad_norm": 1.1144732236862183, "learning_rate": 0.00011354, "loss": 2.368, "step": 817 }, { "epoch": 0.12805259862241702, "grad_norm": 1.466309666633606, "learning_rate": 0.00011368, "loss": 2.3391, "step": 818 }, { "epoch": 0.12820914214151535, "grad_norm": 1.1266729831695557, "learning_rate": 0.00011381999999999998, "loss": 2.3094, "step": 819 }, { "epoch": 0.12836568566061365, "grad_norm": 1.1242408752441406, "learning_rate": 0.00011395999999999999, "loss": 2.3819, "step": 820 }, { "epoch": 0.12852222917971196, "grad_norm": 1.5696038007736206, "learning_rate": 0.00011409999999999998, "loss": 2.2922, "step": 821 }, { "epoch": 0.12867877269881026, "grad_norm": 1.2671489715576172, "learning_rate": 0.00011423999999999998, "loss": 2.2125, "step": 822 }, { "epoch": 0.12883531621790859, "grad_norm": 2.0497825145721436, "learning_rate": 0.00011437999999999999, "loss": 2.3212, "step": 823 }, { "epoch": 0.1289918597370069, "grad_norm": 5.47742223739624, "learning_rate": 0.00011451999999999998, "loss": 2.7203, "step": 824 }, { "epoch": 0.1291484032561052, "grad_norm": 1.3360627889633179, "learning_rate": 
0.00011465999999999998, "loss": 2.3193, "step": 825 }, { "epoch": 0.12930494677520352, "grad_norm": 1.6451613903045654, "learning_rate": 0.00011479999999999999, "loss": 2.383, "step": 826 }, { "epoch": 0.12946149029430182, "grad_norm": 1.3546360731124878, "learning_rate": 0.00011493999999999998, "loss": 2.3422, "step": 827 }, { "epoch": 0.12961803381340012, "grad_norm": 1.4826606512069702, "learning_rate": 0.00011507999999999999, "loss": 2.112, "step": 828 }, { "epoch": 0.12977457733249845, "grad_norm": 1.4059332609176636, "learning_rate": 0.00011521999999999998, "loss": 2.2946, "step": 829 }, { "epoch": 0.12993112085159675, "grad_norm": 3.641587734222412, "learning_rate": 0.00011535999999999998, "loss": 2.5302, "step": 830 }, { "epoch": 0.13008766437069505, "grad_norm": 1.7872023582458496, "learning_rate": 0.00011549999999999999, "loss": 2.1036, "step": 831 }, { "epoch": 0.13024420788979335, "grad_norm": 2.6338582038879395, "learning_rate": 0.00011563999999999998, "loss": 2.2891, "step": 832 }, { "epoch": 0.13040075140889168, "grad_norm": 2.3186681270599365, "learning_rate": 0.00011577999999999998, "loss": 2.2873, "step": 833 }, { "epoch": 0.13055729492798998, "grad_norm": 1.9568008184432983, "learning_rate": 0.00011591999999999999, "loss": 2.54, "step": 834 }, { "epoch": 0.13071383844708828, "grad_norm": 3.5063302516937256, "learning_rate": 0.00011605999999999998, "loss": 2.4192, "step": 835 }, { "epoch": 0.1308703819661866, "grad_norm": 1.621429443359375, "learning_rate": 0.00011619999999999998, "loss": 2.1953, "step": 836 }, { "epoch": 0.1310269254852849, "grad_norm": 1.953916072845459, "learning_rate": 0.00011633999999999999, "loss": 2.2671, "step": 837 }, { "epoch": 0.1311834690043832, "grad_norm": 1.6331963539123535, "learning_rate": 0.00011647999999999998, "loss": 2.5091, "step": 838 }, { "epoch": 0.13134001252348151, "grad_norm": 4.376822471618652, "learning_rate": 0.00011661999999999998, "loss": 2.6658, "step": 839 }, { "epoch": 0.13149655604257984, 
"grad_norm": 5.488125324249268, "learning_rate": 0.00011675999999999999, "loss": 2.3779, "step": 840 }, { "epoch": 0.13165309956167814, "grad_norm": 2.454759359359741, "learning_rate": 0.00011689999999999998, "loss": 2.2902, "step": 841 }, { "epoch": 0.13180964308077645, "grad_norm": 2.4398772716522217, "learning_rate": 0.00011703999999999999, "loss": 2.1758, "step": 842 }, { "epoch": 0.13196618659987477, "grad_norm": 3.648120880126953, "learning_rate": 0.00011717999999999999, "loss": 2.4808, "step": 843 }, { "epoch": 0.13212273011897308, "grad_norm": 2.9085566997528076, "learning_rate": 0.00011731999999999998, "loss": 2.0256, "step": 844 }, { "epoch": 0.13227927363807138, "grad_norm": 3.7388901710510254, "learning_rate": 0.00011745999999999999, "loss": 2.8691, "step": 845 }, { "epoch": 0.1324358171571697, "grad_norm": 2.394740343093872, "learning_rate": 0.00011759999999999999, "loss": 1.7118, "step": 846 }, { "epoch": 0.132592360676268, "grad_norm": 2.265409469604492, "learning_rate": 0.00011773999999999998, "loss": 1.4606, "step": 847 }, { "epoch": 0.1327489041953663, "grad_norm": 2.2392497062683105, "learning_rate": 0.00011787999999999999, "loss": 1.904, "step": 848 }, { "epoch": 0.1329054477144646, "grad_norm": 2.1380107402801514, "learning_rate": 0.00011801999999999998, "loss": 1.8813, "step": 849 }, { "epoch": 0.13306199123356294, "grad_norm": 2.731238603591919, "learning_rate": 0.00011815999999999998, "loss": 1.6695, "step": 850 }, { "epoch": 0.13321853475266124, "grad_norm": 2.5883421897888184, "learning_rate": 0.00011829999999999999, "loss": 2.3368, "step": 851 }, { "epoch": 0.13337507827175954, "grad_norm": 1.3229113817214966, "learning_rate": 0.00011843999999999998, "loss": 2.1475, "step": 852 }, { "epoch": 0.13353162179085787, "grad_norm": 1.177409291267395, "learning_rate": 0.00011857999999999998, "loss": 2.0759, "step": 853 }, { "epoch": 0.13368816530995617, "grad_norm": 0.9478422999382019, "learning_rate": 0.00011871999999999999, "loss": 2.0391, 
"step": 854 }, { "epoch": 0.13384470882905447, "grad_norm": 0.8899485468864441, "learning_rate": 0.00011885999999999998, "loss": 1.9936, "step": 855 }, { "epoch": 0.1340012523481528, "grad_norm": 1.1349371671676636, "learning_rate": 0.00011899999999999999, "loss": 1.9075, "step": 856 }, { "epoch": 0.1341577958672511, "grad_norm": 1.4690141677856445, "learning_rate": 0.00011913999999999999, "loss": 2.0102, "step": 857 }, { "epoch": 0.1343143393863494, "grad_norm": 1.5211303234100342, "learning_rate": 0.00011927999999999998, "loss": 2.0069, "step": 858 }, { "epoch": 0.1344708829054477, "grad_norm": 1.2659467458724976, "learning_rate": 0.00011941999999999999, "loss": 1.8842, "step": 859 }, { "epoch": 0.13462742642454603, "grad_norm": 0.8751201033592224, "learning_rate": 0.00011955999999999999, "loss": 1.8908, "step": 860 }, { "epoch": 0.13478396994364433, "grad_norm": 1.0208441019058228, "learning_rate": 0.00011969999999999998, "loss": 1.9122, "step": 861 }, { "epoch": 0.13494051346274263, "grad_norm": 0.9369400143623352, "learning_rate": 0.00011983999999999999, "loss": 1.938, "step": 862 }, { "epoch": 0.13509705698184096, "grad_norm": 1.110004186630249, "learning_rate": 0.00011997999999999999, "loss": 1.9331, "step": 863 }, { "epoch": 0.13525360050093926, "grad_norm": 2.832749366760254, "learning_rate": 0.00012011999999999998, "loss": 2.0428, "step": 864 }, { "epoch": 0.13541014402003757, "grad_norm": 1.0357258319854736, "learning_rate": 0.00012025999999999999, "loss": 1.8712, "step": 865 }, { "epoch": 0.13556668753913587, "grad_norm": 1.4955414533615112, "learning_rate": 0.00012039999999999999, "loss": 1.8601, "step": 866 }, { "epoch": 0.1357232310582342, "grad_norm": 1.5315372943878174, "learning_rate": 0.00012053999999999998, "loss": 2.0476, "step": 867 }, { "epoch": 0.1358797745773325, "grad_norm": 1.0272997617721558, "learning_rate": 0.00012067999999999999, "loss": 1.8413, "step": 868 }, { "epoch": 0.1360363180964308, "grad_norm": 2.2522802352905273, 
"learning_rate": 0.00012081999999999999, "loss": 1.8177, "step": 869 }, { "epoch": 0.13619286161552913, "grad_norm": 1.2140864133834839, "learning_rate": 0.00012096, "loss": 1.9239, "step": 870 }, { "epoch": 0.13634940513462743, "grad_norm": 1.2827775478363037, "learning_rate": 0.00012109999999999999, "loss": 1.8726, "step": 871 }, { "epoch": 0.13650594865372573, "grad_norm": 1.1074508428573608, "learning_rate": 0.00012123999999999998, "loss": 1.7546, "step": 872 }, { "epoch": 0.13666249217282406, "grad_norm": 2.1737685203552246, "learning_rate": 0.00012137999999999999, "loss": 1.8304, "step": 873 }, { "epoch": 0.13681903569192236, "grad_norm": 2.1538543701171875, "learning_rate": 0.00012151999999999999, "loss": 2.0279, "step": 874 }, { "epoch": 0.13697557921102066, "grad_norm": 2.4480011463165283, "learning_rate": 0.00012165999999999998, "loss": 1.9624, "step": 875 }, { "epoch": 0.13713212273011896, "grad_norm": 1.9370622634887695, "learning_rate": 0.00012179999999999999, "loss": 2.0022, "step": 876 }, { "epoch": 0.1372886662492173, "grad_norm": 4.1577043533325195, "learning_rate": 0.00012193999999999999, "loss": 2.1585, "step": 877 }, { "epoch": 0.1374452097683156, "grad_norm": 3.028465747833252, "learning_rate": 0.00012208, "loss": 2.1816, "step": 878 }, { "epoch": 0.1376017532874139, "grad_norm": 2.452176332473755, "learning_rate": 0.00012221999999999998, "loss": 2.2446, "step": 879 }, { "epoch": 0.13775829680651222, "grad_norm": 1.2644497156143188, "learning_rate": 0.00012236, "loss": 1.8058, "step": 880 }, { "epoch": 0.13791484032561052, "grad_norm": 2.126946449279785, "learning_rate": 0.0001225, "loss": 1.8175, "step": 881 }, { "epoch": 0.13807138384470882, "grad_norm": 3.8555541038513184, "learning_rate": 0.00012263999999999998, "loss": 1.9156, "step": 882 }, { "epoch": 0.13822792736380715, "grad_norm": 2.6028785705566406, "learning_rate": 0.00012278, "loss": 2.2242, "step": 883 }, { "epoch": 0.13838447088290545, "grad_norm": 4.564478874206543, 
"learning_rate": 0.00012292, "loss": 2.0552, "step": 884 }, { "epoch": 0.13854101440200375, "grad_norm": 4.910191535949707, "learning_rate": 0.00012305999999999998, "loss": 2.2197, "step": 885 }, { "epoch": 0.13869755792110205, "grad_norm": 3.7620277404785156, "learning_rate": 0.00012319999999999999, "loss": 2.5145, "step": 886 }, { "epoch": 0.13885410144020038, "grad_norm": 2.7889788150787354, "learning_rate": 0.00012334, "loss": 2.0011, "step": 887 }, { "epoch": 0.13901064495929868, "grad_norm": 3.595449924468994, "learning_rate": 0.00012348, "loss": 2.0472, "step": 888 }, { "epoch": 0.13916718847839699, "grad_norm": 2.879890203475952, "learning_rate": 0.00012361999999999999, "loss": 2.0581, "step": 889 }, { "epoch": 0.13932373199749531, "grad_norm": 2.7350289821624756, "learning_rate": 0.00012376, "loss": 2.1626, "step": 890 }, { "epoch": 0.13948027551659362, "grad_norm": 1.9148786067962646, "learning_rate": 0.0001239, "loss": 2.5487, "step": 891 }, { "epoch": 0.13963681903569192, "grad_norm": 2.3695597648620605, "learning_rate": 0.00012403999999999998, "loss": 1.8693, "step": 892 }, { "epoch": 0.13979336255479022, "grad_norm": 3.0898189544677734, "learning_rate": 0.00012418, "loss": 1.8561, "step": 893 }, { "epoch": 0.13994990607388855, "grad_norm": 2.2449326515197754, "learning_rate": 0.00012432, "loss": 1.9849, "step": 894 }, { "epoch": 0.14010644959298685, "grad_norm": 1.7513461112976074, "learning_rate": 0.00012445999999999998, "loss": 1.9753, "step": 895 }, { "epoch": 0.14026299311208515, "grad_norm": 1.6577256917953491, "learning_rate": 0.0001246, "loss": 1.8249, "step": 896 }, { "epoch": 0.14041953663118348, "grad_norm": 3.007333993911743, "learning_rate": 0.00012474, "loss": 1.9358, "step": 897 }, { "epoch": 0.14057608015028178, "grad_norm": 2.4221296310424805, "learning_rate": 0.00012487999999999998, "loss": 1.4833, "step": 898 }, { "epoch": 0.14073262366938008, "grad_norm": 2.173570394515991, "learning_rate": 0.00012502, "loss": 1.5678, "step": 899 }, 
{ "epoch": 0.1408891671884784, "grad_norm": 2.5946638584136963, "learning_rate": 0.00012516, "loss": 1.9529, "step": 900 }, { "epoch": 0.1410457107075767, "grad_norm": 2.0543506145477295, "learning_rate": 0.00012529999999999998, "loss": 1.8275, "step": 901 }, { "epoch": 0.141202254226675, "grad_norm": 1.4679551124572754, "learning_rate": 0.00012544, "loss": 1.6818, "step": 902 }, { "epoch": 0.1413587977457733, "grad_norm": 0.9975478053092957, "learning_rate": 0.00012558, "loss": 1.5911, "step": 903 }, { "epoch": 0.14151534126487164, "grad_norm": 1.031247854232788, "learning_rate": 0.00012571999999999998, "loss": 1.4852, "step": 904 }, { "epoch": 0.14167188478396994, "grad_norm": 1.058544397354126, "learning_rate": 0.00012586, "loss": 1.5584, "step": 905 }, { "epoch": 0.14182842830306824, "grad_norm": 0.8970162272453308, "learning_rate": 0.000126, "loss": 1.5329, "step": 906 }, { "epoch": 0.14198497182216657, "grad_norm": 1.0310865640640259, "learning_rate": 0.00012613999999999998, "loss": 1.4247, "step": 907 }, { "epoch": 0.14214151534126487, "grad_norm": 1.0322051048278809, "learning_rate": 0.00012628, "loss": 1.4834, "step": 908 }, { "epoch": 0.14229805886036317, "grad_norm": 1.6974303722381592, "learning_rate": 0.00012642, "loss": 1.5051, "step": 909 }, { "epoch": 0.1424546023794615, "grad_norm": 1.39719820022583, "learning_rate": 0.00012655999999999998, "loss": 1.6462, "step": 910 }, { "epoch": 0.1426111458985598, "grad_norm": 1.317083716392517, "learning_rate": 0.0001267, "loss": 1.3929, "step": 911 }, { "epoch": 0.1427676894176581, "grad_norm": 1.0311046838760376, "learning_rate": 0.00012684, "loss": 1.3397, "step": 912 }, { "epoch": 0.1429242329367564, "grad_norm": 1.339275598526001, "learning_rate": 0.00012697999999999998, "loss": 1.5375, "step": 913 }, { "epoch": 0.14308077645585474, "grad_norm": 1.3102202415466309, "learning_rate": 0.00012712, "loss": 1.3948, "step": 914 }, { "epoch": 0.14323731997495304, "grad_norm": 1.320786714553833, "learning_rate": 
0.00012726, "loss": 1.4153, "step": 915 }, { "epoch": 0.14339386349405134, "grad_norm": 1.2929140329360962, "learning_rate": 0.0001274, "loss": 1.4404, "step": 916 }, { "epoch": 0.14355040701314967, "grad_norm": 10.204378128051758, "learning_rate": 0.00012754, "loss": 2.5, "step": 917 }, { "epoch": 0.14370695053224797, "grad_norm": 1.370578646659851, "learning_rate": 0.00012768, "loss": 1.3495, "step": 918 }, { "epoch": 0.14386349405134627, "grad_norm": 1.7200727462768555, "learning_rate": 0.00012782, "loss": 1.515, "step": 919 }, { "epoch": 0.14402003757044457, "grad_norm": 1.4725751876831055, "learning_rate": 0.00012796, "loss": 1.536, "step": 920 }, { "epoch": 0.1441765810895429, "grad_norm": 2.600036382675171, "learning_rate": 0.0001281, "loss": 1.4426, "step": 921 }, { "epoch": 0.1443331246086412, "grad_norm": 1.7257699966430664, "learning_rate": 0.00012824, "loss": 1.3777, "step": 922 }, { "epoch": 0.1444896681277395, "grad_norm": 1.9464218616485596, "learning_rate": 0.00012838, "loss": 1.5901, "step": 923 }, { "epoch": 0.14464621164683783, "grad_norm": 3.0015709400177, "learning_rate": 0.00012852, "loss": 1.6244, "step": 924 }, { "epoch": 0.14480275516593613, "grad_norm": 5.0071868896484375, "learning_rate": 0.00012866, "loss": 1.5601, "step": 925 }, { "epoch": 0.14495929868503443, "grad_norm": 3.3104441165924072, "learning_rate": 0.0001288, "loss": 1.8699, "step": 926 }, { "epoch": 0.14511584220413276, "grad_norm": 4.033897399902344, "learning_rate": 0.00012894, "loss": 1.8322, "step": 927 }, { "epoch": 0.14527238572323106, "grad_norm": 3.1448891162872314, "learning_rate": 0.00012908, "loss": 2.1846, "step": 928 }, { "epoch": 0.14542892924232936, "grad_norm": 2.86073637008667, "learning_rate": 0.00012921999999999999, "loss": 1.7981, "step": 929 }, { "epoch": 0.14558547276142766, "grad_norm": 2.2653987407684326, "learning_rate": 0.00012936, "loss": 1.7882, "step": 930 }, { "epoch": 0.145742016280526, "grad_norm": 4.206417560577393, "learning_rate": 
0.0001295, "loss": 1.8616, "step": 931 }, { "epoch": 0.1458985597996243, "grad_norm": 2.1221795082092285, "learning_rate": 0.00012963999999999999, "loss": 2.0097, "step": 932 }, { "epoch": 0.1460551033187226, "grad_norm": 2.066370964050293, "learning_rate": 0.00012978, "loss": 1.3491, "step": 933 }, { "epoch": 0.14621164683782092, "grad_norm": 8.296426773071289, "learning_rate": 0.00012992, "loss": 2.0536, "step": 934 }, { "epoch": 0.14636819035691923, "grad_norm": 4.587265968322754, "learning_rate": 0.00013005999999999998, "loss": 2.0738, "step": 935 }, { "epoch": 0.14652473387601753, "grad_norm": 8.826981544494629, "learning_rate": 0.0001302, "loss": 2.1215, "step": 936 }, { "epoch": 0.14668127739511586, "grad_norm": 2.9622299671173096, "learning_rate": 0.00013034, "loss": 1.8332, "step": 937 }, { "epoch": 0.14683782091421416, "grad_norm": 3.43253755569458, "learning_rate": 0.00013047999999999998, "loss": 2.0999, "step": 938 }, { "epoch": 0.14699436443331246, "grad_norm": 2.6353037357330322, "learning_rate": 0.00013062, "loss": 2.0374, "step": 939 }, { "epoch": 0.14715090795241076, "grad_norm": 2.8590893745422363, "learning_rate": 0.00013076, "loss": 1.6404, "step": 940 }, { "epoch": 0.1473074514715091, "grad_norm": 2.3156516551971436, "learning_rate": 0.00013089999999999998, "loss": 1.8064, "step": 941 }, { "epoch": 0.1474639949906074, "grad_norm": 3.146327018737793, "learning_rate": 0.00013104, "loss": 2.0147, "step": 942 }, { "epoch": 0.1476205385097057, "grad_norm": 2.656663417816162, "learning_rate": 0.00013118, "loss": 1.9578, "step": 943 }, { "epoch": 0.14777708202880402, "grad_norm": 11.462111473083496, "learning_rate": 0.00013131999999999998, "loss": 1.7045, "step": 944 }, { "epoch": 0.14793362554790232, "grad_norm": 3.1804471015930176, "learning_rate": 0.00013146, "loss": 1.7002, "step": 945 }, { "epoch": 0.14809016906700062, "grad_norm": 7.238349437713623, "learning_rate": 0.00013159999999999997, "loss": 1.5249, "step": 946 }, { "epoch": 
0.14824671258609892, "grad_norm": 2.1947789192199707, "learning_rate": 0.00013173999999999998, "loss": 1.5727, "step": 947 }, { "epoch": 0.14840325610519725, "grad_norm": 2.1323177814483643, "learning_rate": 0.00013188, "loss": 1.8822, "step": 948 }, { "epoch": 0.14855979962429555, "grad_norm": 4.475412845611572, "learning_rate": 0.00013201999999999997, "loss": 1.7124, "step": 949 }, { "epoch": 0.14871634314339385, "grad_norm": 2.7015368938446045, "learning_rate": 0.00013215999999999998, "loss": 1.9315, "step": 950 }, { "epoch": 0.14887288666249218, "grad_norm": 1.46183443069458, "learning_rate": 0.0001323, "loss": 1.4221, "step": 951 }, { "epoch": 0.14902943018159048, "grad_norm": 1.3697696924209595, "learning_rate": 0.00013243999999999997, "loss": 1.2967, "step": 952 }, { "epoch": 0.14918597370068878, "grad_norm": 1.602593183517456, "learning_rate": 0.00013257999999999998, "loss": 1.5175, "step": 953 }, { "epoch": 0.1493425172197871, "grad_norm": 1.363165259361267, "learning_rate": 0.00013272, "loss": 1.1675, "step": 954 }, { "epoch": 0.14949906073888541, "grad_norm": 1.1354371309280396, "learning_rate": 0.00013285999999999997, "loss": 1.3275, "step": 955 }, { "epoch": 0.14965560425798372, "grad_norm": 1.0567865371704102, "learning_rate": 0.00013299999999999998, "loss": 1.2511, "step": 956 }, { "epoch": 0.14981214777708202, "grad_norm": 0.8563328385353088, "learning_rate": 0.00013314, "loss": 1.2332, "step": 957 }, { "epoch": 0.14996869129618035, "grad_norm": 1.031952977180481, "learning_rate": 0.00013327999999999997, "loss": 1.1388, "step": 958 }, { "epoch": 0.15012523481527865, "grad_norm": 0.9164784550666809, "learning_rate": 0.00013341999999999998, "loss": 1.0765, "step": 959 }, { "epoch": 0.15028177833437695, "grad_norm": 1.1913617849349976, "learning_rate": 0.00013356, "loss": 1.121, "step": 960 }, { "epoch": 0.15043832185347528, "grad_norm": 1.4612181186676025, "learning_rate": 0.00013369999999999997, "loss": 1.3126, "step": 961 }, { "epoch": 
0.15059486537257358, "grad_norm": 1.0157369375228882, "learning_rate": 0.00013383999999999998, "loss": 1.0754, "step": 962 }, { "epoch": 0.15075140889167188, "grad_norm": 1.6872997283935547, "learning_rate": 0.00013398, "loss": 1.397, "step": 963 }, { "epoch": 0.1509079524107702, "grad_norm": 1.0621217489242554, "learning_rate": 0.00013411999999999997, "loss": 1.1537, "step": 964 }, { "epoch": 0.1510644959298685, "grad_norm": 1.0543960332870483, "learning_rate": 0.00013425999999999998, "loss": 1.0374, "step": 965 }, { "epoch": 0.1512210394489668, "grad_norm": 1.186785101890564, "learning_rate": 0.0001344, "loss": 1.1204, "step": 966 }, { "epoch": 0.1513775829680651, "grad_norm": 1.5316989421844482, "learning_rate": 0.00013454, "loss": 1.2028, "step": 967 }, { "epoch": 0.15153412648716344, "grad_norm": 1.6794068813323975, "learning_rate": 0.00013467999999999998, "loss": 1.2752, "step": 968 }, { "epoch": 0.15169067000626174, "grad_norm": 2.2077221870422363, "learning_rate": 0.00013482, "loss": 1.3483, "step": 969 }, { "epoch": 0.15184721352536004, "grad_norm": 2.032829523086548, "learning_rate": 0.00013496, "loss": 1.076, "step": 970 }, { "epoch": 0.15200375704445837, "grad_norm": 2.4871792793273926, "learning_rate": 0.00013509999999999998, "loss": 1.6775, "step": 971 }, { "epoch": 0.15216030056355667, "grad_norm": 2.935312032699585, "learning_rate": 0.00013523999999999999, "loss": 1.2158, "step": 972 }, { "epoch": 0.15231684408265497, "grad_norm": 2.2789218425750732, "learning_rate": 0.00013538, "loss": 1.3085, "step": 973 }, { "epoch": 0.15247338760175327, "grad_norm": 3.3459551334381104, "learning_rate": 0.00013551999999999998, "loss": 1.7355, "step": 974 }, { "epoch": 0.1526299311208516, "grad_norm": 6.613864898681641, "learning_rate": 0.00013565999999999999, "loss": 2.2709, "step": 975 }, { "epoch": 0.1527864746399499, "grad_norm": 3.2819619178771973, "learning_rate": 0.0001358, "loss": 1.3874, "step": 976 }, { "epoch": 0.1529430181590482, "grad_norm": 
1.376680612564087, "learning_rate": 0.00013593999999999998, "loss": 1.5314, "step": 977 }, { "epoch": 0.15309956167814653, "grad_norm": 2.5585310459136963, "learning_rate": 0.00013607999999999998, "loss": 1.4208, "step": 978 }, { "epoch": 0.15325610519724484, "grad_norm": 2.9100072383880615, "learning_rate": 0.00013622, "loss": 1.6857, "step": 979 }, { "epoch": 0.15341264871634314, "grad_norm": 5.595428943634033, "learning_rate": 0.00013635999999999998, "loss": 1.6151, "step": 980 }, { "epoch": 0.15356919223544147, "grad_norm": 3.5693323612213135, "learning_rate": 0.00013649999999999998, "loss": 1.5767, "step": 981 }, { "epoch": 0.15372573575453977, "grad_norm": 1.8305296897888184, "learning_rate": 0.00013664, "loss": 1.5578, "step": 982 }, { "epoch": 0.15388227927363807, "grad_norm": 2.735269069671631, "learning_rate": 0.00013677999999999997, "loss": 1.4802, "step": 983 }, { "epoch": 0.15403882279273637, "grad_norm": 2.3742611408233643, "learning_rate": 0.00013691999999999998, "loss": 1.8017, "step": 984 }, { "epoch": 0.1541953663118347, "grad_norm": 2.7400946617126465, "learning_rate": 0.00013706, "loss": 1.5721, "step": 985 }, { "epoch": 0.154351909830933, "grad_norm": 3.509246349334717, "learning_rate": 0.00013719999999999997, "loss": 1.5171, "step": 986 }, { "epoch": 0.1545084533500313, "grad_norm": 3.410489320755005, "learning_rate": 0.00013733999999999998, "loss": 1.9255, "step": 987 }, { "epoch": 0.15466499686912963, "grad_norm": 3.7892508506774902, "learning_rate": 0.00013748, "loss": 1.9453, "step": 988 }, { "epoch": 0.15482154038822793, "grad_norm": 3.4583120346069336, "learning_rate": 0.00013761999999999997, "loss": 1.9914, "step": 989 }, { "epoch": 0.15497808390732623, "grad_norm": 3.0838608741760254, "learning_rate": 0.00013775999999999998, "loss": 2.4562, "step": 990 }, { "epoch": 0.15513462742642456, "grad_norm": 4.484135627746582, "learning_rate": 0.0001379, "loss": 1.8179, "step": 991 }, { "epoch": 0.15529117094552286, "grad_norm": 
3.5873305797576904, "learning_rate": 0.00013803999999999997, "loss": 1.65, "step": 992 }, { "epoch": 0.15544771446462116, "grad_norm": 3.8719258308410645, "learning_rate": 0.00013817999999999998, "loss": 1.9819, "step": 993 }, { "epoch": 0.15560425798371946, "grad_norm": 2.5805089473724365, "learning_rate": 0.00013832, "loss": 1.5095, "step": 994 }, { "epoch": 0.1557608015028178, "grad_norm": 2.7916109561920166, "learning_rate": 0.00013846, "loss": 1.6662, "step": 995 }, { "epoch": 0.1559173450219161, "grad_norm": 3.5318758487701416, "learning_rate": 0.00013859999999999998, "loss": 0.9445, "step": 996 }, { "epoch": 0.1560738885410144, "grad_norm": 3.052644968032837, "learning_rate": 0.00013874, "loss": 1.4048, "step": 997 }, { "epoch": 0.15623043206011272, "grad_norm": 2.6523542404174805, "learning_rate": 0.00013888, "loss": 1.6899, "step": 998 }, { "epoch": 0.15638697557921102, "grad_norm": 3.5929372310638428, "learning_rate": 0.00013901999999999998, "loss": 1.2796, "step": 999 }, { "epoch": 0.15654351909830932, "grad_norm": 4.047633647918701, "learning_rate": 0.00013916, "loss": 1.5455, "step": 1000 }, { "epoch": 0.15654351909830932, "eval_loss": 1.3698441982269287, "eval_runtime": 228.1817, "eval_samples_per_second": 54.268, "eval_steps_per_second": 3.392, "eval_wer": 0.8373143209299361, "step": 1000 }, { "epoch": 0.15670006261740763, "grad_norm": 1.5415165424346924, "learning_rate": 0.0001393, "loss": 1.2233, "step": 1001 }, { "epoch": 0.15685660613650595, "grad_norm": 1.061021089553833, "learning_rate": 0.00013943999999999998, "loss": 1.1203, "step": 1002 }, { "epoch": 0.15701314965560426, "grad_norm": 0.8321822285652161, "learning_rate": 0.00013958, "loss": 0.8772, "step": 1003 }, { "epoch": 0.15716969317470256, "grad_norm": 0.8096780180931091, "learning_rate": 0.00013972, "loss": 0.9069, "step": 1004 }, { "epoch": 0.15732623669380089, "grad_norm": 1.34067702293396, "learning_rate": 0.00013985999999999998, "loss": 1.0085, "step": 1005 }, { "epoch": 
0.1574827802128992, "grad_norm": 1.727418303489685, "learning_rate": 0.00014, "loss": 1.0505, "step": 1006 }, { "epoch": 0.1576393237319975, "grad_norm": 1.6297084093093872, "learning_rate": 0.00013998811141304347, "loss": 0.9991, "step": 1007 }, { "epoch": 0.15779586725109582, "grad_norm": 3.0399985313415527, "learning_rate": 0.00013997622282608694, "loss": 1.3636, "step": 1008 }, { "epoch": 0.15795241077019412, "grad_norm": 1.1833268404006958, "learning_rate": 0.00013996433423913042, "loss": 0.9547, "step": 1009 }, { "epoch": 0.15810895428929242, "grad_norm": 1.2304561138153076, "learning_rate": 0.0001399524456521739, "loss": 0.9626, "step": 1010 }, { "epoch": 0.15826549780839072, "grad_norm": 1.311085820198059, "learning_rate": 0.00013994055706521738, "loss": 0.98, "step": 1011 }, { "epoch": 0.15842204132748905, "grad_norm": 1.9254063367843628, "learning_rate": 0.00013992866847826086, "loss": 1.2136, "step": 1012 }, { "epoch": 0.15857858484658735, "grad_norm": 2.0011343955993652, "learning_rate": 0.00013991677989130433, "loss": 1.3695, "step": 1013 }, { "epoch": 0.15873512836568565, "grad_norm": 0.9859755039215088, "learning_rate": 0.0001399048913043478, "loss": 0.9877, "step": 1014 }, { "epoch": 0.15889167188478398, "grad_norm": 1.4412119388580322, "learning_rate": 0.0001398930027173913, "loss": 1.0775, "step": 1015 }, { "epoch": 0.15904821540388228, "grad_norm": 2.518597364425659, "learning_rate": 0.00013988111413043477, "loss": 1.1596, "step": 1016 }, { "epoch": 0.15920475892298058, "grad_norm": 1.7696287631988525, "learning_rate": 0.00013986922554347825, "loss": 0.9042, "step": 1017 }, { "epoch": 0.1593613024420789, "grad_norm": 2.221297264099121, "learning_rate": 0.00013985733695652173, "loss": 1.1497, "step": 1018 }, { "epoch": 0.1595178459611772, "grad_norm": 1.7115248441696167, "learning_rate": 0.0001398454483695652, "loss": 1.2922, "step": 1019 }, { "epoch": 0.1596743894802755, "grad_norm": 3.0485925674438477, "learning_rate": 0.00013983355978260868, 
"loss": 1.6158, "step": 1020 }, { "epoch": 0.15983093299937381, "grad_norm": 1.7239434719085693, "learning_rate": 0.00013982167119565216, "loss": 1.0365, "step": 1021 }, { "epoch": 0.15998747651847214, "grad_norm": 3.0233840942382812, "learning_rate": 0.00013980978260869564, "loss": 1.1305, "step": 1022 }, { "epoch": 0.16014402003757044, "grad_norm": 1.8098385334014893, "learning_rate": 0.00013979789402173912, "loss": 1.3633, "step": 1023 }, { "epoch": 0.16030056355666875, "grad_norm": 1.6836488246917725, "learning_rate": 0.0001397860054347826, "loss": 1.0722, "step": 1024 }, { "epoch": 0.16045710707576707, "grad_norm": 1.8708122968673706, "learning_rate": 0.00013977411684782607, "loss": 1.3807, "step": 1025 }, { "epoch": 0.16061365059486538, "grad_norm": 2.275538206100464, "learning_rate": 0.00013976222826086955, "loss": 1.5252, "step": 1026 }, { "epoch": 0.16077019411396368, "grad_norm": 2.313462495803833, "learning_rate": 0.00013975033967391303, "loss": 1.3437, "step": 1027 }, { "epoch": 0.16092673763306198, "grad_norm": 2.514657735824585, "learning_rate": 0.0001397384510869565, "loss": 1.4435, "step": 1028 }, { "epoch": 0.1610832811521603, "grad_norm": 2.4268715381622314, "learning_rate": 0.00013972656249999999, "loss": 1.7843, "step": 1029 }, { "epoch": 0.1612398246712586, "grad_norm": 3.1912734508514404, "learning_rate": 0.00013971467391304346, "loss": 1.0239, "step": 1030 }, { "epoch": 0.1613963681903569, "grad_norm": 4.297966957092285, "learning_rate": 0.00013970278532608694, "loss": 1.6437, "step": 1031 }, { "epoch": 0.16155291170945524, "grad_norm": 6.017107963562012, "learning_rate": 0.00013969089673913042, "loss": 2.0804, "step": 1032 }, { "epoch": 0.16170945522855354, "grad_norm": 2.5551257133483887, "learning_rate": 0.0001396790081521739, "loss": 1.0907, "step": 1033 }, { "epoch": 0.16186599874765184, "grad_norm": 2.698896884918213, "learning_rate": 0.00013966711956521738, "loss": 1.7325, "step": 1034 }, { "epoch": 0.16202254226675017, "grad_norm": 
2.739001989364624, "learning_rate": 0.00013965523097826085, "loss": 1.6463, "step": 1035 }, { "epoch": 0.16217908578584847, "grad_norm": 5.473845481872559, "learning_rate": 0.00013964334239130433, "loss": 2.03, "step": 1036 }, { "epoch": 0.16233562930494677, "grad_norm": 3.909123420715332, "learning_rate": 0.0001396314538043478, "loss": 1.6519, "step": 1037 }, { "epoch": 0.16249217282404507, "grad_norm": 2.649529218673706, "learning_rate": 0.0001396195652173913, "loss": 2.2789, "step": 1038 }, { "epoch": 0.1626487163431434, "grad_norm": 5.0792460441589355, "learning_rate": 0.00013960767663043477, "loss": 1.2988, "step": 1039 }, { "epoch": 0.1628052598622417, "grad_norm": 4.057116985321045, "learning_rate": 0.00013959578804347825, "loss": 2.0138, "step": 1040 }, { "epoch": 0.16296180338134, "grad_norm": 6.3826141357421875, "learning_rate": 0.00013958389945652172, "loss": 1.848, "step": 1041 }, { "epoch": 0.16311834690043833, "grad_norm": 2.772425889968872, "learning_rate": 0.0001395720108695652, "loss": 1.0958, "step": 1042 }, { "epoch": 0.16327489041953663, "grad_norm": 2.9766244888305664, "learning_rate": 0.00013956012228260868, "loss": 1.645, "step": 1043 }, { "epoch": 0.16343143393863493, "grad_norm": 3.4380030632019043, "learning_rate": 0.00013954823369565216, "loss": 1.7502, "step": 1044 }, { "epoch": 0.16358797745773326, "grad_norm": 5.7332353591918945, "learning_rate": 0.00013953634510869564, "loss": 2.034, "step": 1045 }, { "epoch": 0.16374452097683156, "grad_norm": 3.710676431655884, "learning_rate": 0.00013952445652173911, "loss": 1.5026, "step": 1046 }, { "epoch": 0.16390106449592987, "grad_norm": 4.587976932525635, "learning_rate": 0.0001395125679347826, "loss": 1.5378, "step": 1047 }, { "epoch": 0.16405760801502817, "grad_norm": 3.7711026668548584, "learning_rate": 0.00013950067934782607, "loss": 1.092, "step": 1048 }, { "epoch": 0.1642141515341265, "grad_norm": 6.231589317321777, "learning_rate": 0.00013948879076086955, "loss": 1.2652, "step": 1049 }, 
{ "epoch": 0.1643706950532248, "grad_norm": 4.928447246551514, "learning_rate": 0.00013947690217391303, "loss": 1.1766, "step": 1050 }, { "epoch": 0.1645272385723231, "grad_norm": 1.8291659355163574, "learning_rate": 0.0001394650135869565, "loss": 1.0315, "step": 1051 }, { "epoch": 0.16468378209142143, "grad_norm": 1.0126136541366577, "learning_rate": 0.00013945312499999998, "loss": 0.9022, "step": 1052 }, { "epoch": 0.16484032561051973, "grad_norm": 0.8456835150718689, "learning_rate": 0.00013944123641304346, "loss": 0.7849, "step": 1053 }, { "epoch": 0.16499686912961803, "grad_norm": 1.1472587585449219, "learning_rate": 0.00013942934782608694, "loss": 0.9459, "step": 1054 }, { "epoch": 0.16515341264871633, "grad_norm": 1.1470049619674683, "learning_rate": 0.00013941745923913042, "loss": 0.858, "step": 1055 }, { "epoch": 0.16530995616781466, "grad_norm": 1.9723553657531738, "learning_rate": 0.0001394055706521739, "loss": 1.059, "step": 1056 }, { "epoch": 0.16546649968691296, "grad_norm": 1.1330560445785522, "learning_rate": 0.00013939368206521737, "loss": 0.6842, "step": 1057 }, { "epoch": 0.16562304320601126, "grad_norm": 0.9715387225151062, "learning_rate": 0.00013938179347826085, "loss": 0.93, "step": 1058 }, { "epoch": 0.1657795867251096, "grad_norm": 1.1715155839920044, "learning_rate": 0.00013936990489130433, "loss": 0.7605, "step": 1059 }, { "epoch": 0.1659361302442079, "grad_norm": 1.1585019826889038, "learning_rate": 0.0001393580163043478, "loss": 0.7356, "step": 1060 }, { "epoch": 0.1660926737633062, "grad_norm": 1.050093412399292, "learning_rate": 0.00013934612771739129, "loss": 0.8181, "step": 1061 }, { "epoch": 0.16624921728240452, "grad_norm": 1.5957146883010864, "learning_rate": 0.00013933423913043476, "loss": 0.9484, "step": 1062 }, { "epoch": 0.16640576080150282, "grad_norm": 2.15415620803833, "learning_rate": 0.00013932235054347824, "loss": 0.9226, "step": 1063 }, { "epoch": 0.16656230432060112, "grad_norm": 1.1880176067352295, "learning_rate": 
0.00013931046195652172, "loss": 0.8282, "step": 1064 }, { "epoch": 0.16671884783969942, "grad_norm": 1.481093168258667, "learning_rate": 0.0001392985733695652, "loss": 1.1132, "step": 1065 }, { "epoch": 0.16687539135879775, "grad_norm": 2.149517059326172, "learning_rate": 0.00013928668478260868, "loss": 1.1208, "step": 1066 }, { "epoch": 0.16703193487789605, "grad_norm": 1.713031530380249, "learning_rate": 0.00013927479619565216, "loss": 0.9256, "step": 1067 }, { "epoch": 0.16718847839699436, "grad_norm": 1.3453065156936646, "learning_rate": 0.00013926290760869563, "loss": 0.9416, "step": 1068 }, { "epoch": 0.16734502191609268, "grad_norm": 2.570960521697998, "learning_rate": 0.0001392510190217391, "loss": 1.2097, "step": 1069 }, { "epoch": 0.16750156543519099, "grad_norm": 1.8837968111038208, "learning_rate": 0.0001392391304347826, "loss": 0.9828, "step": 1070 }, { "epoch": 0.1676581089542893, "grad_norm": 2.3390352725982666, "learning_rate": 0.00013922724184782607, "loss": 1.0359, "step": 1071 }, { "epoch": 0.16781465247338762, "grad_norm": 2.3108623027801514, "learning_rate": 0.00013921535326086955, "loss": 1.3082, "step": 1072 }, { "epoch": 0.16797119599248592, "grad_norm": 1.5533983707427979, "learning_rate": 0.00013920346467391302, "loss": 0.9308, "step": 1073 }, { "epoch": 0.16812773951158422, "grad_norm": 2.3024415969848633, "learning_rate": 0.0001391915760869565, "loss": 1.0432, "step": 1074 }, { "epoch": 0.16828428303068252, "grad_norm": 1.336690902709961, "learning_rate": 0.00013917968749999998, "loss": 1.2015, "step": 1075 }, { "epoch": 0.16844082654978085, "grad_norm": 2.9477813243865967, "learning_rate": 0.00013916779891304346, "loss": 1.127, "step": 1076 }, { "epoch": 0.16859737006887915, "grad_norm": 3.189311981201172, "learning_rate": 0.00013915591032608694, "loss": 1.5686, "step": 1077 }, { "epoch": 0.16875391358797745, "grad_norm": 3.5925378799438477, "learning_rate": 0.00013914402173913041, "loss": 1.077, "step": 1078 }, { "epoch": 
0.16891045710707578, "grad_norm": 2.5564584732055664, "learning_rate": 0.0001391321331521739, "loss": 1.1998, "step": 1079 }, { "epoch": 0.16906700062617408, "grad_norm": 3.2485923767089844, "learning_rate": 0.00013912024456521737, "loss": 1.1664, "step": 1080 }, { "epoch": 0.16922354414527238, "grad_norm": 6.488343715667725, "learning_rate": 0.00013910835597826085, "loss": 1.9496, "step": 1081 }, { "epoch": 0.16938008766437068, "grad_norm": 3.5911662578582764, "learning_rate": 0.00013909646739130433, "loss": 1.5137, "step": 1082 }, { "epoch": 0.169536631183469, "grad_norm": 2.6497204303741455, "learning_rate": 0.00013908457880434783, "loss": 1.3199, "step": 1083 }, { "epoch": 0.1696931747025673, "grad_norm": 3.6623032093048096, "learning_rate": 0.0001390726902173913, "loss": 1.5944, "step": 1084 }, { "epoch": 0.1698497182216656, "grad_norm": 3.5559940338134766, "learning_rate": 0.0001390608016304348, "loss": 1.5203, "step": 1085 }, { "epoch": 0.17000626174076394, "grad_norm": 3.5410549640655518, "learning_rate": 0.00013904891304347827, "loss": 1.952, "step": 1086 }, { "epoch": 0.17016280525986224, "grad_norm": 3.4739341735839844, "learning_rate": 0.00013903702445652172, "loss": 1.4786, "step": 1087 }, { "epoch": 0.17031934877896054, "grad_norm": 3.4184319972991943, "learning_rate": 0.0001390251358695652, "loss": 1.323, "step": 1088 }, { "epoch": 0.17047589229805887, "grad_norm": 3.2705204486846924, "learning_rate": 0.00013901324728260867, "loss": 1.6239, "step": 1089 }, { "epoch": 0.17063243581715717, "grad_norm": 3.210252285003662, "learning_rate": 0.00013900135869565215, "loss": 1.5789, "step": 1090 }, { "epoch": 0.17078897933625548, "grad_norm": 3.7260563373565674, "learning_rate": 0.00013898947010869563, "loss": 1.5236, "step": 1091 }, { "epoch": 0.17094552285535378, "grad_norm": 7.177730560302734, "learning_rate": 0.0001389775815217391, "loss": 2.3199, "step": 1092 }, { "epoch": 0.1711020663744521, "grad_norm": 5.096472263336182, "learning_rate": 
0.0001389656929347826, "loss": 2.5233, "step": 1093 }, { "epoch": 0.1712586098935504, "grad_norm": 6.038705348968506, "learning_rate": 0.00013895380434782607, "loss": 1.8506, "step": 1094 }, { "epoch": 0.1714151534126487, "grad_norm": 3.0837748050689697, "learning_rate": 0.00013894191576086954, "loss": 1.545, "step": 1095 }, { "epoch": 0.17157169693174704, "grad_norm": 3.579521894454956, "learning_rate": 0.00013893002717391302, "loss": 1.5336, "step": 1096 }, { "epoch": 0.17172824045084534, "grad_norm": 6.743979454040527, "learning_rate": 0.0001389181385869565, "loss": 0.8934, "step": 1097 }, { "epoch": 0.17188478396994364, "grad_norm": 4.997381687164307, "learning_rate": 0.00013890624999999998, "loss": 1.2611, "step": 1098 }, { "epoch": 0.17204132748904197, "grad_norm": 4.715262413024902, "learning_rate": 0.00013889436141304346, "loss": 0.847, "step": 1099 }, { "epoch": 0.17219787100814027, "grad_norm": 6.635977745056152, "learning_rate": 0.00013888247282608693, "loss": 1.0562, "step": 1100 }, { "epoch": 0.17235441452723857, "grad_norm": 0.8800053000450134, "learning_rate": 0.0001388705842391304, "loss": 0.772, "step": 1101 }, { "epoch": 0.17251095804633687, "grad_norm": 0.7850254774093628, "learning_rate": 0.0001388586956521739, "loss": 0.7905, "step": 1102 }, { "epoch": 0.1726675015654352, "grad_norm": 0.821031928062439, "learning_rate": 0.0001388468070652174, "loss": 0.6522, "step": 1103 }, { "epoch": 0.1728240450845335, "grad_norm": 1.1483231782913208, "learning_rate": 0.00013883491847826087, "loss": 0.7869, "step": 1104 }, { "epoch": 0.1729805886036318, "grad_norm": 0.9111392498016357, "learning_rate": 0.00013882302989130435, "loss": 0.7277, "step": 1105 }, { "epoch": 0.17313713212273013, "grad_norm": 0.9502939581871033, "learning_rate": 0.00013881114130434783, "loss": 0.6384, "step": 1106 }, { "epoch": 0.17329367564182843, "grad_norm": 0.9236769080162048, "learning_rate": 0.0001387992527173913, "loss": 0.8573, "step": 1107 }, { "epoch": 0.17345021916092673, 
"grad_norm": 1.0548843145370483, "learning_rate": 0.0001387873641304348, "loss": 0.7505, "step": 1108 }, { "epoch": 0.17360676268002503, "grad_norm": 1.1872124671936035, "learning_rate": 0.00013877547554347826, "loss": 0.7895, "step": 1109 }, { "epoch": 0.17376330619912336, "grad_norm": 1.6686358451843262, "learning_rate": 0.00013876358695652172, "loss": 0.7377, "step": 1110 }, { "epoch": 0.17391984971822166, "grad_norm": 1.0482289791107178, "learning_rate": 0.0001387516983695652, "loss": 0.963, "step": 1111 }, { "epoch": 0.17407639323731997, "grad_norm": 2.695770025253296, "learning_rate": 0.00013873980978260867, "loss": 0.8714, "step": 1112 }, { "epoch": 0.1742329367564183, "grad_norm": 1.785277009010315, "learning_rate": 0.00013872792119565215, "loss": 0.9773, "step": 1113 }, { "epoch": 0.1743894802755166, "grad_norm": 1.0214200019836426, "learning_rate": 0.00013871603260869563, "loss": 0.7528, "step": 1114 }, { "epoch": 0.1745460237946149, "grad_norm": 2.8060266971588135, "learning_rate": 0.0001387041440217391, "loss": 1.1788, "step": 1115 }, { "epoch": 0.17470256731371323, "grad_norm": 3.707406759262085, "learning_rate": 0.00013869225543478258, "loss": 1.3324, "step": 1116 }, { "epoch": 0.17485911083281153, "grad_norm": 1.943273901939392, "learning_rate": 0.00013868036684782606, "loss": 0.812, "step": 1117 }, { "epoch": 0.17501565435190983, "grad_norm": 1.5674643516540527, "learning_rate": 0.00013866847826086954, "loss": 0.7308, "step": 1118 }, { "epoch": 0.17517219787100813, "grad_norm": 3.069028377532959, "learning_rate": 0.00013865658967391302, "loss": 1.1539, "step": 1119 }, { "epoch": 0.17532874139010646, "grad_norm": 6.27308988571167, "learning_rate": 0.0001386447010869565, "loss": 1.2243, "step": 1120 }, { "epoch": 0.17548528490920476, "grad_norm": 3.458484649658203, "learning_rate": 0.00013863281249999998, "loss": 1.3303, "step": 1121 }, { "epoch": 0.17564182842830306, "grad_norm": 1.5605264902114868, "learning_rate": 0.00013862092391304345, "loss": 
0.8897, "step": 1122 }, { "epoch": 0.1757983719474014, "grad_norm": 3.9702603816986084, "learning_rate": 0.00013860903532608696, "loss": 1.1395, "step": 1123 }, { "epoch": 0.1759549154664997, "grad_norm": 2.432300329208374, "learning_rate": 0.00013859714673913044, "loss": 1.3528, "step": 1124 }, { "epoch": 0.176111458985598, "grad_norm": 2.906536102294922, "learning_rate": 0.00013858525815217392, "loss": 1.3549, "step": 1125 }, { "epoch": 0.1762680025046963, "grad_norm": 1.5664812326431274, "learning_rate": 0.0001385733695652174, "loss": 1.1747, "step": 1126 }, { "epoch": 0.17642454602379462, "grad_norm": 1.730055332183838, "learning_rate": 0.00013856148097826087, "loss": 1.0783, "step": 1127 }, { "epoch": 0.17658108954289292, "grad_norm": 1.8372676372528076, "learning_rate": 0.00013854959239130435, "loss": 1.0209, "step": 1128 }, { "epoch": 0.17673763306199122, "grad_norm": 5.294225215911865, "learning_rate": 0.00013853770380434783, "loss": 1.5207, "step": 1129 }, { "epoch": 0.17689417658108955, "grad_norm": 3.2137813568115234, "learning_rate": 0.0001385258152173913, "loss": 1.3878, "step": 1130 }, { "epoch": 0.17705072010018785, "grad_norm": 3.9759559631347656, "learning_rate": 0.00013851392663043478, "loss": 1.2395, "step": 1131 }, { "epoch": 0.17720726361928615, "grad_norm": 2.4173622131347656, "learning_rate": 0.00013850203804347826, "loss": 1.5307, "step": 1132 }, { "epoch": 0.17736380713838448, "grad_norm": 4.845193862915039, "learning_rate": 0.0001384901494565217, "loss": 1.2753, "step": 1133 }, { "epoch": 0.17752035065748278, "grad_norm": 3.2899038791656494, "learning_rate": 0.0001384782608695652, "loss": 1.6317, "step": 1134 }, { "epoch": 0.17767689417658108, "grad_norm": 3.013888120651245, "learning_rate": 0.00013846637228260867, "loss": 1.5242, "step": 1135 }, { "epoch": 0.17783343769567939, "grad_norm": 9.928550720214844, "learning_rate": 0.00013845448369565215, "loss": 1.4023, "step": 1136 }, { "epoch": 0.17798998121477771, "grad_norm": 
3.181631326675415, "learning_rate": 0.00013844259510869563, "loss": 1.7007, "step": 1137 }, { "epoch": 0.17814652473387602, "grad_norm": 3.9870283603668213, "learning_rate": 0.0001384307065217391, "loss": 1.8608, "step": 1138 }, { "epoch": 0.17830306825297432, "grad_norm": 4.658505916595459, "learning_rate": 0.00013841881793478258, "loss": 1.4644, "step": 1139 }, { "epoch": 0.17845961177207265, "grad_norm": 2.5877692699432373, "learning_rate": 0.00013840692934782606, "loss": 1.596, "step": 1140 }, { "epoch": 0.17861615529117095, "grad_norm": 3.3328001499176025, "learning_rate": 0.00013839504076086954, "loss": 2.5196, "step": 1141 }, { "epoch": 0.17877269881026925, "grad_norm": 5.279830455780029, "learning_rate": 0.00013838315217391302, "loss": 1.7119, "step": 1142 }, { "epoch": 0.17892924232936758, "grad_norm": 1.8498363494873047, "learning_rate": 0.00013837126358695652, "loss": 1.527, "step": 1143 }, { "epoch": 0.17908578584846588, "grad_norm": 3.692067861557007, "learning_rate": 0.000138359375, "loss": 1.8178, "step": 1144 }, { "epoch": 0.17924232936756418, "grad_norm": 3.209484100341797, "learning_rate": 0.00013834748641304348, "loss": 1.4279, "step": 1145 }, { "epoch": 0.17939887288666248, "grad_norm": 2.2020740509033203, "learning_rate": 0.00013833559782608696, "loss": 1.1906, "step": 1146 }, { "epoch": 0.1795554164057608, "grad_norm": 2.9131336212158203, "learning_rate": 0.00013832370923913043, "loss": 1.2807, "step": 1147 }, { "epoch": 0.1797119599248591, "grad_norm": 1.2524566650390625, "learning_rate": 0.0001383118206521739, "loss": 1.0437, "step": 1148 }, { "epoch": 0.1798685034439574, "grad_norm": 5.672247886657715, "learning_rate": 0.0001382999320652174, "loss": 1.9938, "step": 1149 }, { "epoch": 0.18002504696305574, "grad_norm": 3.5022571086883545, "learning_rate": 0.00013828804347826087, "loss": 1.9666, "step": 1150 }, { "epoch": 0.18018159048215404, "grad_norm": 1.2709381580352783, "learning_rate": 0.00013827615489130435, "loss": 0.7809, "step": 1151 
}, { "epoch": 0.18033813400125234, "grad_norm": 0.973462700843811, "learning_rate": 0.00013826426630434783, "loss": 0.7235, "step": 1152 }, { "epoch": 0.18049467752035064, "grad_norm": 0.8847609162330627, "learning_rate": 0.0001382523777173913, "loss": 0.6428, "step": 1153 }, { "epoch": 0.18065122103944897, "grad_norm": 0.8437796235084534, "learning_rate": 0.00013824048913043478, "loss": 0.6111, "step": 1154 }, { "epoch": 0.18080776455854727, "grad_norm": 1.0947692394256592, "learning_rate": 0.00013822860054347826, "loss": 0.9158, "step": 1155 }, { "epoch": 0.18096430807764557, "grad_norm": 0.9939146041870117, "learning_rate": 0.0001382167119565217, "loss": 0.6815, "step": 1156 }, { "epoch": 0.1811208515967439, "grad_norm": 1.1150538921356201, "learning_rate": 0.0001382048233695652, "loss": 0.6385, "step": 1157 }, { "epoch": 0.1812773951158422, "grad_norm": 0.8164706230163574, "learning_rate": 0.00013819293478260867, "loss": 0.7362, "step": 1158 }, { "epoch": 0.1814339386349405, "grad_norm": 1.062494158744812, "learning_rate": 0.00013818104619565215, "loss": 0.827, "step": 1159 }, { "epoch": 0.18159048215403883, "grad_norm": 2.4140355587005615, "learning_rate": 0.00013816915760869562, "loss": 0.8054, "step": 1160 }, { "epoch": 0.18174702567313714, "grad_norm": 1.4156569242477417, "learning_rate": 0.0001381572690217391, "loss": 0.6933, "step": 1161 }, { "epoch": 0.18190356919223544, "grad_norm": 1.4422311782836914, "learning_rate": 0.00013814538043478258, "loss": 0.7601, "step": 1162 }, { "epoch": 0.18206011271133374, "grad_norm": 1.5656534433364868, "learning_rate": 0.00013813349184782609, "loss": 0.7732, "step": 1163 }, { "epoch": 0.18221665623043207, "grad_norm": 1.879874348640442, "learning_rate": 0.00013812160326086956, "loss": 0.7946, "step": 1164 }, { "epoch": 0.18237319974953037, "grad_norm": 1.63367760181427, "learning_rate": 0.00013810971467391304, "loss": 1.1251, "step": 1165 }, { "epoch": 0.18252974326862867, "grad_norm": 1.4586844444274902, 
"learning_rate": 0.00013809782608695652, "loss": 0.7866, "step": 1166 }, { "epoch": 0.182686286787727, "grad_norm": 1.4551407098770142, "learning_rate": 0.0001380859375, "loss": 0.9986, "step": 1167 }, { "epoch": 0.1828428303068253, "grad_norm": 1.5485121011734009, "learning_rate": 0.00013807404891304348, "loss": 1.0334, "step": 1168 }, { "epoch": 0.1829993738259236, "grad_norm": 2.9052939414978027, "learning_rate": 0.00013806216032608695, "loss": 1.4625, "step": 1169 }, { "epoch": 0.18315591734502193, "grad_norm": 2.3746449947357178, "learning_rate": 0.00013805027173913043, "loss": 1.0981, "step": 1170 }, { "epoch": 0.18331246086412023, "grad_norm": 1.9850457906723022, "learning_rate": 0.0001380383831521739, "loss": 1.0633, "step": 1171 }, { "epoch": 0.18346900438321853, "grad_norm": 1.4339938163757324, "learning_rate": 0.0001380264945652174, "loss": 1.1682, "step": 1172 }, { "epoch": 0.18362554790231683, "grad_norm": 2.747748851776123, "learning_rate": 0.00013801460597826087, "loss": 1.2376, "step": 1173 }, { "epoch": 0.18378209142141516, "grad_norm": 1.8627957105636597, "learning_rate": 0.00013800271739130435, "loss": 0.9458, "step": 1174 }, { "epoch": 0.18393863494051346, "grad_norm": 1.4342074394226074, "learning_rate": 0.00013799082880434782, "loss": 1.1401, "step": 1175 }, { "epoch": 0.18409517845961176, "grad_norm": 1.7153773307800293, "learning_rate": 0.0001379789402173913, "loss": 0.8418, "step": 1176 }, { "epoch": 0.1842517219787101, "grad_norm": 2.2045717239379883, "learning_rate": 0.00013796705163043478, "loss": 1.2595, "step": 1177 }, { "epoch": 0.1844082654978084, "grad_norm": 2.0822932720184326, "learning_rate": 0.00013795516304347826, "loss": 1.1365, "step": 1178 }, { "epoch": 0.1845648090169067, "grad_norm": 2.6329472064971924, "learning_rate": 0.0001379432744565217, "loss": 1.0405, "step": 1179 }, { "epoch": 0.184721352536005, "grad_norm": 2.2440245151519775, "learning_rate": 0.0001379313858695652, "loss": 1.1682, "step": 1180 }, { "epoch": 
0.18487789605510332, "grad_norm": 2.572028398513794, "learning_rate": 0.00013791949728260867, "loss": 1.1464, "step": 1181 }, { "epoch": 0.18503443957420163, "grad_norm": 4.593929290771484, "learning_rate": 0.00013790760869565214, "loss": 1.6957, "step": 1182 }, { "epoch": 0.18519098309329993, "grad_norm": 5.325618267059326, "learning_rate": 0.00013789572010869565, "loss": 1.6818, "step": 1183 }, { "epoch": 0.18534752661239826, "grad_norm": 2.949028730392456, "learning_rate": 0.00013788383152173913, "loss": 1.5896, "step": 1184 }, { "epoch": 0.18550407013149656, "grad_norm": 3.0618746280670166, "learning_rate": 0.0001378719429347826, "loss": 1.7882, "step": 1185 }, { "epoch": 0.18566061365059486, "grad_norm": 1.857090711593628, "learning_rate": 0.00013786005434782608, "loss": 1.2573, "step": 1186 }, { "epoch": 0.1858171571696932, "grad_norm": 3.9341039657592773, "learning_rate": 0.00013784816576086956, "loss": 1.4821, "step": 1187 }, { "epoch": 0.1859737006887915, "grad_norm": 3.8264975547790527, "learning_rate": 0.00013783627717391304, "loss": 1.8515, "step": 1188 }, { "epoch": 0.1861302442078898, "grad_norm": 3.483884811401367, "learning_rate": 0.00013782438858695652, "loss": 1.7353, "step": 1189 }, { "epoch": 0.1862867877269881, "grad_norm": 4.973394393920898, "learning_rate": 0.0001378125, "loss": 1.8278, "step": 1190 }, { "epoch": 0.18644333124608642, "grad_norm": 3.3971948623657227, "learning_rate": 0.00013780061141304347, "loss": 1.1264, "step": 1191 }, { "epoch": 0.18659987476518472, "grad_norm": 2.572930097579956, "learning_rate": 0.00013778872282608695, "loss": 1.9191, "step": 1192 }, { "epoch": 0.18675641828428302, "grad_norm": 5.544455051422119, "learning_rate": 0.00013777683423913043, "loss": 1.5688, "step": 1193 }, { "epoch": 0.18691296180338135, "grad_norm": 1.900350570678711, "learning_rate": 0.0001377649456521739, "loss": 1.4231, "step": 1194 }, { "epoch": 0.18706950532247965, "grad_norm": 4.271148681640625, "learning_rate": 0.00013775305706521739, 
"loss": 1.1562, "step": 1195 }, { "epoch": 0.18722604884157795, "grad_norm": 2.8429393768310547, "learning_rate": 0.00013774116847826086, "loss": 1.3179, "step": 1196 }, { "epoch": 0.18738259236067628, "grad_norm": 4.486495018005371, "learning_rate": 0.00013772927989130434, "loss": 1.3337, "step": 1197 }, { "epoch": 0.18753913587977458, "grad_norm": 4.442295551300049, "learning_rate": 0.00013771739130434782, "loss": 1.2293, "step": 1198 }, { "epoch": 0.18769567939887288, "grad_norm": 2.645477294921875, "learning_rate": 0.0001377055027173913, "loss": 1.5472, "step": 1199 }, { "epoch": 0.18785222291797118, "grad_norm": 1.652131199836731, "learning_rate": 0.00013769361413043478, "loss": 1.3974, "step": 1200 }, { "epoch": 0.1880087664370695, "grad_norm": 0.9231998920440674, "learning_rate": 0.00013768172554347826, "loss": 0.7124, "step": 1201 }, { "epoch": 0.18816530995616781, "grad_norm": 0.8119204640388489, "learning_rate": 0.0001376698369565217, "loss": 0.5684, "step": 1202 }, { "epoch": 0.18832185347526612, "grad_norm": 1.085740566253662, "learning_rate": 0.0001376579483695652, "loss": 0.8786, "step": 1203 }, { "epoch": 0.18847839699436444, "grad_norm": 1.020589828491211, "learning_rate": 0.0001376460597826087, "loss": 0.6744, "step": 1204 }, { "epoch": 0.18863494051346275, "grad_norm": 0.9869468808174133, "learning_rate": 0.00013763417119565217, "loss": 0.702, "step": 1205 }, { "epoch": 0.18879148403256105, "grad_norm": 1.1510159969329834, "learning_rate": 0.00013762228260869565, "loss": 0.7092, "step": 1206 }, { "epoch": 0.18894802755165935, "grad_norm": 1.2734460830688477, "learning_rate": 0.00013761039402173912, "loss": 0.7913, "step": 1207 }, { "epoch": 0.18910457107075768, "grad_norm": 1.1174362897872925, "learning_rate": 0.0001375985054347826, "loss": 0.6208, "step": 1208 }, { "epoch": 0.18926111458985598, "grad_norm": 1.6800272464752197, "learning_rate": 0.00013758661684782608, "loss": 0.698, "step": 1209 }, { "epoch": 0.18941765810895428, "grad_norm": 
1.1643890142440796, "learning_rate": 0.00013757472826086956, "loss": 0.6331, "step": 1210 }, { "epoch": 0.1895742016280526, "grad_norm": 2.343560218811035, "learning_rate": 0.00013756283967391304, "loss": 0.632, "step": 1211 }, { "epoch": 0.1897307451471509, "grad_norm": 2.3179941177368164, "learning_rate": 0.00013755095108695651, "loss": 1.2171, "step": 1212 }, { "epoch": 0.1898872886662492, "grad_norm": 1.8673335313796997, "learning_rate": 0.0001375390625, "loss": 0.9145, "step": 1213 }, { "epoch": 0.19004383218534754, "grad_norm": 1.1038061380386353, "learning_rate": 0.00013752717391304347, "loss": 0.6739, "step": 1214 }, { "epoch": 0.19020037570444584, "grad_norm": 1.870535135269165, "learning_rate": 0.00013751528532608695, "loss": 0.8345, "step": 1215 }, { "epoch": 0.19035691922354414, "grad_norm": 1.7600821256637573, "learning_rate": 0.00013750339673913043, "loss": 0.7176, "step": 1216 }, { "epoch": 0.19051346274264244, "grad_norm": 2.6680655479431152, "learning_rate": 0.0001374915081521739, "loss": 0.9157, "step": 1217 }, { "epoch": 0.19067000626174077, "grad_norm": 3.1541664600372314, "learning_rate": 0.00013747961956521738, "loss": 1.1592, "step": 1218 }, { "epoch": 0.19082654978083907, "grad_norm": 1.8248345851898193, "learning_rate": 0.00013746773097826086, "loss": 1.0384, "step": 1219 }, { "epoch": 0.19098309329993737, "grad_norm": 3.0776758193969727, "learning_rate": 0.00013745584239130434, "loss": 0.8207, "step": 1220 }, { "epoch": 0.1911396368190357, "grad_norm": 2.2010486125946045, "learning_rate": 0.00013744395380434782, "loss": 0.8533, "step": 1221 }, { "epoch": 0.191296180338134, "grad_norm": 2.5094001293182373, "learning_rate": 0.0001374320652173913, "loss": 1.2414, "step": 1222 }, { "epoch": 0.1914527238572323, "grad_norm": 2.023785352706909, "learning_rate": 0.00013742017663043477, "loss": 0.8363, "step": 1223 }, { "epoch": 0.19160926737633063, "grad_norm": 1.927286148071289, "learning_rate": 0.00013740828804347825, "loss": 0.7612, "step": 
1224 }, { "epoch": 0.19176581089542893, "grad_norm": 1.703173041343689, "learning_rate": 0.00013739639945652173, "loss": 0.8668, "step": 1225 }, { "epoch": 0.19192235441452724, "grad_norm": 2.5133018493652344, "learning_rate": 0.0001373845108695652, "loss": 1.6215, "step": 1226 }, { "epoch": 0.19207889793362554, "grad_norm": 1.9380569458007812, "learning_rate": 0.0001373726222826087, "loss": 1.0504, "step": 1227 }, { "epoch": 0.19223544145272387, "grad_norm": 2.7113821506500244, "learning_rate": 0.00013736073369565217, "loss": 1.1281, "step": 1228 }, { "epoch": 0.19239198497182217, "grad_norm": 3.867192029953003, "learning_rate": 0.00013734884510869564, "loss": 1.17, "step": 1229 }, { "epoch": 0.19254852849092047, "grad_norm": 1.9879852533340454, "learning_rate": 0.00013733695652173912, "loss": 1.0166, "step": 1230 }, { "epoch": 0.1927050720100188, "grad_norm": 2.452955961227417, "learning_rate": 0.0001373250679347826, "loss": 1.4574, "step": 1231 }, { "epoch": 0.1928616155291171, "grad_norm": 2.8354873657226562, "learning_rate": 0.00013731317934782608, "loss": 1.3339, "step": 1232 }, { "epoch": 0.1930181590482154, "grad_norm": 3.159738302230835, "learning_rate": 0.00013730129076086956, "loss": 1.2206, "step": 1233 }, { "epoch": 0.1931747025673137, "grad_norm": 1.803996205329895, "learning_rate": 0.00013728940217391303, "loss": 1.6111, "step": 1234 }, { "epoch": 0.19333124608641203, "grad_norm": 2.934810161590576, "learning_rate": 0.0001372775135869565, "loss": 1.2954, "step": 1235 }, { "epoch": 0.19348778960551033, "grad_norm": 3.1679277420043945, "learning_rate": 0.000137265625, "loss": 1.4686, "step": 1236 }, { "epoch": 0.19364433312460863, "grad_norm": 2.2336363792419434, "learning_rate": 0.00013725373641304347, "loss": 1.4578, "step": 1237 }, { "epoch": 0.19380087664370696, "grad_norm": 2.7858645915985107, "learning_rate": 0.00013724184782608695, "loss": 1.395, "step": 1238 }, { "epoch": 0.19395742016280526, "grad_norm": 3.48199725151062, "learning_rate": 
0.00013722995923913043, "loss": 1.2312, "step": 1239 }, { "epoch": 0.19411396368190356, "grad_norm": 5.004460334777832, "learning_rate": 0.0001372180706521739, "loss": 1.2157, "step": 1240 }, { "epoch": 0.1942705072010019, "grad_norm": 2.8416566848754883, "learning_rate": 0.00013720618206521738, "loss": 1.8599, "step": 1241 }, { "epoch": 0.1944270507201002, "grad_norm": 3.4693965911865234, "learning_rate": 0.00013719429347826086, "loss": 1.6708, "step": 1242 }, { "epoch": 0.1945835942391985, "grad_norm": 7.818636894226074, "learning_rate": 0.00013718240489130434, "loss": 1.8119, "step": 1243 }, { "epoch": 0.1947401377582968, "grad_norm": 4.0367021560668945, "learning_rate": 0.00013717051630434782, "loss": 1.1751, "step": 1244 }, { "epoch": 0.19489668127739512, "grad_norm": 2.6872036457061768, "learning_rate": 0.0001371586277173913, "loss": 0.7392, "step": 1245 }, { "epoch": 0.19505322479649342, "grad_norm": 4.466470718383789, "learning_rate": 0.00013714673913043477, "loss": 0.7308, "step": 1246 }, { "epoch": 0.19520976831559173, "grad_norm": 5.105170249938965, "learning_rate": 0.00013713485054347825, "loss": 1.0691, "step": 1247 }, { "epoch": 0.19536631183469005, "grad_norm": 5.2139129638671875, "learning_rate": 0.00013712296195652173, "loss": 1.5228, "step": 1248 }, { "epoch": 0.19552285535378836, "grad_norm": 4.047123908996582, "learning_rate": 0.0001371110733695652, "loss": 1.6647, "step": 1249 }, { "epoch": 0.19567939887288666, "grad_norm": 5.306774616241455, "learning_rate": 0.00013709918478260868, "loss": 1.6387, "step": 1250 }, { "epoch": 0.19583594239198499, "grad_norm": 1.2049058675765991, "learning_rate": 0.00013708729619565216, "loss": 0.6708, "step": 1251 }, { "epoch": 0.1959924859110833, "grad_norm": 1.0765498876571655, "learning_rate": 0.00013707540760869564, "loss": 0.8027, "step": 1252 }, { "epoch": 0.1961490294301816, "grad_norm": 1.4122573137283325, "learning_rate": 0.00013706351902173912, "loss": 0.8061, "step": 1253 }, { "epoch": 
0.1963055729492799, "grad_norm": 0.9493897557258606, "learning_rate": 0.0001370516304347826, "loss": 0.6295, "step": 1254 }, { "epoch": 0.19646211646837822, "grad_norm": 0.8869343996047974, "learning_rate": 0.00013703974184782608, "loss": 0.5388, "step": 1255 }, { "epoch": 0.19661865998747652, "grad_norm": 0.8252435922622681, "learning_rate": 0.00013702785326086955, "loss": 0.5341, "step": 1256 }, { "epoch": 0.19677520350657482, "grad_norm": 1.2648963928222656, "learning_rate": 0.00013701596467391303, "loss": 0.732, "step": 1257 }, { "epoch": 0.19693174702567315, "grad_norm": 0.8592041730880737, "learning_rate": 0.0001370040760869565, "loss": 0.5193, "step": 1258 }, { "epoch": 0.19708829054477145, "grad_norm": 0.9176813364028931, "learning_rate": 0.0001369921875, "loss": 0.6611, "step": 1259 }, { "epoch": 0.19724483406386975, "grad_norm": 0.8286765217781067, "learning_rate": 0.00013698029891304347, "loss": 0.52, "step": 1260 }, { "epoch": 0.19740137758296805, "grad_norm": 0.5870116949081421, "learning_rate": 0.00013696841032608694, "loss": 0.5105, "step": 1261 }, { "epoch": 0.19755792110206638, "grad_norm": 2.3690237998962402, "learning_rate": 0.00013695652173913042, "loss": 0.7847, "step": 1262 }, { "epoch": 0.19771446462116468, "grad_norm": 1.5183807611465454, "learning_rate": 0.0001369446331521739, "loss": 0.6798, "step": 1263 }, { "epoch": 0.19787100814026298, "grad_norm": 1.2444536685943604, "learning_rate": 0.00013693274456521738, "loss": 0.7014, "step": 1264 }, { "epoch": 0.1980275516593613, "grad_norm": 0.9331785440444946, "learning_rate": 0.00013692085597826086, "loss": 0.602, "step": 1265 }, { "epoch": 0.1981840951784596, "grad_norm": 1.6407747268676758, "learning_rate": 0.00013690896739130434, "loss": 0.6851, "step": 1266 }, { "epoch": 0.1983406386975579, "grad_norm": 2.1674468517303467, "learning_rate": 0.00013689707880434781, "loss": 0.7333, "step": 1267 }, { "epoch": 0.19849718221665624, "grad_norm": 2.083425760269165, "learning_rate": 
0.0001368851902173913, "loss": 0.9378, "step": 1268 }, { "epoch": 0.19865372573575454, "grad_norm": 2.125591993331909, "learning_rate": 0.00013687330163043477, "loss": 0.889, "step": 1269 }, { "epoch": 0.19881026925485284, "grad_norm": 1.3988146781921387, "learning_rate": 0.00013686141304347825, "loss": 0.86, "step": 1270 }, { "epoch": 0.19896681277395115, "grad_norm": 2.1700074672698975, "learning_rate": 0.00013684952445652173, "loss": 0.7937, "step": 1271 }, { "epoch": 0.19912335629304947, "grad_norm": 4.374208450317383, "learning_rate": 0.0001368376358695652, "loss": 1.6128, "step": 1272 }, { "epoch": 0.19927989981214778, "grad_norm": 1.8651944398880005, "learning_rate": 0.00013682574728260868, "loss": 0.9622, "step": 1273 }, { "epoch": 0.19943644333124608, "grad_norm": 4.565978050231934, "learning_rate": 0.00013681385869565216, "loss": 1.3643, "step": 1274 }, { "epoch": 0.1995929868503444, "grad_norm": 1.787757158279419, "learning_rate": 0.00013680197010869564, "loss": 0.9697, "step": 1275 }, { "epoch": 0.1997495303694427, "grad_norm": 1.76967191696167, "learning_rate": 0.00013679008152173912, "loss": 1.113, "step": 1276 }, { "epoch": 0.199906073888541, "grad_norm": 2.1098241806030273, "learning_rate": 0.0001367781929347826, "loss": 1.0825, "step": 1277 }, { "epoch": 0.20006261740763934, "grad_norm": 1.765671730041504, "learning_rate": 0.00013676630434782607, "loss": 1.2671, "step": 1278 }, { "epoch": 0.20021916092673764, "grad_norm": 2.0657575130462646, "learning_rate": 0.00013675441576086955, "loss": 1.2702, "step": 1279 }, { "epoch": 0.20037570444583594, "grad_norm": 2.3238236904144287, "learning_rate": 0.00013674252717391303, "loss": 1.1761, "step": 1280 }, { "epoch": 0.20053224796493424, "grad_norm": 3.0345828533172607, "learning_rate": 0.0001367306385869565, "loss": 1.0745, "step": 1281 }, { "epoch": 0.20068879148403257, "grad_norm": 4.732701301574707, "learning_rate": 0.00013671874999999999, "loss": 1.3726, "step": 1282 }, { "epoch": 0.20084533500313087, 
"grad_norm": 4.7932634353637695, "learning_rate": 0.00013670686141304346, "loss": 1.5601, "step": 1283 }, { "epoch": 0.20100187852222917, "grad_norm": 3.920515537261963, "learning_rate": 0.00013669497282608694, "loss": 1.2732, "step": 1284 }, { "epoch": 0.2011584220413275, "grad_norm": 2.5828287601470947, "learning_rate": 0.00013668308423913042, "loss": 1.4237, "step": 1285 }, { "epoch": 0.2013149655604258, "grad_norm": 2.432622194290161, "learning_rate": 0.0001366711956521739, "loss": 1.3875, "step": 1286 }, { "epoch": 0.2014715090795241, "grad_norm": 2.9554829597473145, "learning_rate": 0.00013665930706521738, "loss": 1.2399, "step": 1287 }, { "epoch": 0.2016280525986224, "grad_norm": 1.7205358743667603, "learning_rate": 0.00013664741847826085, "loss": 1.17, "step": 1288 }, { "epoch": 0.20178459611772073, "grad_norm": 3.4813389778137207, "learning_rate": 0.00013663552989130433, "loss": 1.7085, "step": 1289 }, { "epoch": 0.20194113963681903, "grad_norm": 3.305692672729492, "learning_rate": 0.0001366236413043478, "loss": 1.3292, "step": 1290 }, { "epoch": 0.20209768315591733, "grad_norm": 3.015902280807495, "learning_rate": 0.0001366117527173913, "loss": 1.8773, "step": 1291 }, { "epoch": 0.20225422667501566, "grad_norm": 2.4495646953582764, "learning_rate": 0.00013659986413043477, "loss": 1.7199, "step": 1292 }, { "epoch": 0.20241077019411396, "grad_norm": 2.471389055252075, "learning_rate": 0.00013658797554347825, "loss": 1.4128, "step": 1293 }, { "epoch": 0.20256731371321227, "grad_norm": 3.1450483798980713, "learning_rate": 0.00013657608695652172, "loss": 0.9542, "step": 1294 }, { "epoch": 0.2027238572323106, "grad_norm": 4.366516590118408, "learning_rate": 0.0001365641983695652, "loss": 1.5098, "step": 1295 }, { "epoch": 0.2028804007514089, "grad_norm": 2.471235990524292, "learning_rate": 0.00013655230978260868, "loss": 0.8516, "step": 1296 }, { "epoch": 0.2030369442705072, "grad_norm": 2.169938325881958, "learning_rate": 0.00013654042119565216, "loss": 
1.1177, "step": 1297 }, { "epoch": 0.2031934877896055, "grad_norm": 3.0887398719787598, "learning_rate": 0.00013652853260869564, "loss": 1.0151, "step": 1298 }, { "epoch": 0.20335003130870383, "grad_norm": 3.234626054763794, "learning_rate": 0.00013651664402173911, "loss": 1.1341, "step": 1299 }, { "epoch": 0.20350657482780213, "grad_norm": 3.7436914443969727, "learning_rate": 0.0001365047554347826, "loss": 1.2991, "step": 1300 }, { "epoch": 0.20366311834690043, "grad_norm": 0.6964896321296692, "learning_rate": 0.00013649286684782607, "loss": 0.7114, "step": 1301 }, { "epoch": 0.20381966186599876, "grad_norm": 0.8928024172782898, "learning_rate": 0.00013648097826086955, "loss": 0.6598, "step": 1302 }, { "epoch": 0.20397620538509706, "grad_norm": 1.1481163501739502, "learning_rate": 0.00013646908967391303, "loss": 0.7928, "step": 1303 }, { "epoch": 0.20413274890419536, "grad_norm": 3.157334566116333, "learning_rate": 0.0001364572010869565, "loss": 0.7732, "step": 1304 }, { "epoch": 0.2042892924232937, "grad_norm": 0.8631582260131836, "learning_rate": 0.00013644531249999998, "loss": 0.7103, "step": 1305 }, { "epoch": 0.204445835942392, "grad_norm": 1.3971761465072632, "learning_rate": 0.00013643342391304346, "loss": 0.5918, "step": 1306 }, { "epoch": 0.2046023794614903, "grad_norm": 1.0277576446533203, "learning_rate": 0.00013642153532608694, "loss": 0.6694, "step": 1307 }, { "epoch": 0.2047589229805886, "grad_norm": 3.3421449661254883, "learning_rate": 0.00013640964673913042, "loss": 0.6826, "step": 1308 }, { "epoch": 0.20491546649968692, "grad_norm": 0.9466357231140137, "learning_rate": 0.0001363977581521739, "loss": 0.5868, "step": 1309 }, { "epoch": 0.20507201001878522, "grad_norm": 1.0873324871063232, "learning_rate": 0.00013638586956521737, "loss": 0.5999, "step": 1310 }, { "epoch": 0.20522855353788352, "grad_norm": 7.587026596069336, "learning_rate": 0.00013637398097826085, "loss": 2.1765, "step": 1311 }, { "epoch": 0.20538509705698185, "grad_norm": 
1.252220869064331, "learning_rate": 0.00013636209239130433, "loss": 0.4672, "step": 1312 }, { "epoch": 0.20554164057608015, "grad_norm": 3.4496586322784424, "learning_rate": 0.0001363502038043478, "loss": 0.8255, "step": 1313 }, { "epoch": 0.20569818409517845, "grad_norm": 1.2029796838760376, "learning_rate": 0.0001363383152173913, "loss": 0.63, "step": 1314 }, { "epoch": 0.20585472761427676, "grad_norm": 2.281799554824829, "learning_rate": 0.00013632642663043477, "loss": 1.0663, "step": 1315 }, { "epoch": 0.20601127113337508, "grad_norm": 1.193843126296997, "learning_rate": 0.00013631453804347824, "loss": 0.6846, "step": 1316 }, { "epoch": 0.20616781465247339, "grad_norm": 1.2917855978012085, "learning_rate": 0.00013630264945652172, "loss": 0.556, "step": 1317 }, { "epoch": 0.2063243581715717, "grad_norm": 2.324747323989868, "learning_rate": 0.0001362907608695652, "loss": 0.9876, "step": 1318 }, { "epoch": 0.20648090169067002, "grad_norm": 1.666579246520996, "learning_rate": 0.00013627887228260868, "loss": 0.8341, "step": 1319 }, { "epoch": 0.20663744520976832, "grad_norm": 1.402708888053894, "learning_rate": 0.00013626698369565216, "loss": 0.8508, "step": 1320 }, { "epoch": 0.20679398872886662, "grad_norm": 4.7486796379089355, "learning_rate": 0.00013625509510869563, "loss": 0.8358, "step": 1321 }, { "epoch": 0.20695053224796495, "grad_norm": 1.8604363203048706, "learning_rate": 0.0001362432065217391, "loss": 0.7419, "step": 1322 }, { "epoch": 0.20710707576706325, "grad_norm": 2.045491933822632, "learning_rate": 0.0001362313179347826, "loss": 1.0658, "step": 1323 }, { "epoch": 0.20726361928616155, "grad_norm": 3.869706153869629, "learning_rate": 0.00013621942934782607, "loss": 0.7819, "step": 1324 }, { "epoch": 0.20742016280525985, "grad_norm": 1.958597183227539, "learning_rate": 0.00013620754076086955, "loss": 0.8922, "step": 1325 }, { "epoch": 0.20757670632435818, "grad_norm": 2.524799108505249, "learning_rate": 0.00013619565217391302, "loss": 0.9113, "step": 
1326 }, { "epoch": 0.20773324984345648, "grad_norm": 2.7690303325653076, "learning_rate": 0.0001361837635869565, "loss": 1.3878, "step": 1327 }, { "epoch": 0.20788979336255478, "grad_norm": 2.4917757511138916, "learning_rate": 0.00013617187499999998, "loss": 0.9661, "step": 1328 }, { "epoch": 0.2080463368816531, "grad_norm": 2.7036077976226807, "learning_rate": 0.00013615998641304346, "loss": 0.8318, "step": 1329 }, { "epoch": 0.2082028804007514, "grad_norm": 2.3120861053466797, "learning_rate": 0.00013614809782608694, "loss": 1.1235, "step": 1330 }, { "epoch": 0.2083594239198497, "grad_norm": 2.1221487522125244, "learning_rate": 0.00013613620923913042, "loss": 0.9128, "step": 1331 }, { "epoch": 0.20851596743894804, "grad_norm": 2.34417462348938, "learning_rate": 0.0001361243206521739, "loss": 1.0666, "step": 1332 }, { "epoch": 0.20867251095804634, "grad_norm": 5.182578086853027, "learning_rate": 0.00013611243206521737, "loss": 1.238, "step": 1333 }, { "epoch": 0.20882905447714464, "grad_norm": 2.449225664138794, "learning_rate": 0.00013610054347826085, "loss": 1.1774, "step": 1334 }, { "epoch": 0.20898559799624294, "grad_norm": 4.277975559234619, "learning_rate": 0.00013608865489130433, "loss": 1.7331, "step": 1335 }, { "epoch": 0.20914214151534127, "grad_norm": 3.441492795944214, "learning_rate": 0.0001360767663043478, "loss": 1.2985, "step": 1336 }, { "epoch": 0.20929868503443957, "grad_norm": 3.4803104400634766, "learning_rate": 0.0001360648777173913, "loss": 1.5546, "step": 1337 }, { "epoch": 0.20945522855353788, "grad_norm": 4.009057998657227, "learning_rate": 0.0001360529891304348, "loss": 1.5973, "step": 1338 }, { "epoch": 0.2096117720726362, "grad_norm": 3.0272057056427, "learning_rate": 0.00013604110054347827, "loss": 1.6188, "step": 1339 }, { "epoch": 0.2097683155917345, "grad_norm": 4.394728183746338, "learning_rate": 0.00013602921195652172, "loss": 1.1763, "step": 1340 }, { "epoch": 0.2099248591108328, "grad_norm": 3.192636728286743, "learning_rate": 
0.0001360173233695652, "loss": 1.4876, "step": 1341 }, { "epoch": 0.2100814026299311, "grad_norm": 4.074566841125488, "learning_rate": 0.00013600543478260868, "loss": 1.5581, "step": 1342 }, { "epoch": 0.21023794614902944, "grad_norm": 7.710876941680908, "learning_rate": 0.00013599354619565215, "loss": 1.3942, "step": 1343 }, { "epoch": 0.21039448966812774, "grad_norm": 2.7888412475585938, "learning_rate": 0.00013598165760869563, "loss": 1.3867, "step": 1344 }, { "epoch": 0.21055103318722604, "grad_norm": 3.9246511459350586, "learning_rate": 0.0001359697690217391, "loss": 1.3722, "step": 1345 }, { "epoch": 0.21070757670632437, "grad_norm": 4.752890586853027, "learning_rate": 0.0001359578804347826, "loss": 1.1477, "step": 1346 }, { "epoch": 0.21086412022542267, "grad_norm": 2.6898443698883057, "learning_rate": 0.00013594599184782607, "loss": 1.1858, "step": 1347 }, { "epoch": 0.21102066374452097, "grad_norm": 2.8602135181427, "learning_rate": 0.00013593410326086954, "loss": 0.8912, "step": 1348 }, { "epoch": 0.2111772072636193, "grad_norm": 3.1345503330230713, "learning_rate": 0.00013592221467391302, "loss": 0.9177, "step": 1349 }, { "epoch": 0.2113337507827176, "grad_norm": 3.8274331092834473, "learning_rate": 0.0001359103260869565, "loss": 1.1516, "step": 1350 }, { "epoch": 0.2114902943018159, "grad_norm": 0.691088080406189, "learning_rate": 0.00013589843749999998, "loss": 0.5279, "step": 1351 }, { "epoch": 0.2116468378209142, "grad_norm": 1.0149836540222168, "learning_rate": 0.00013588654891304346, "loss": 0.7109, "step": 1352 }, { "epoch": 0.21180338134001253, "grad_norm": 0.7154799103736877, "learning_rate": 0.00013587466032608693, "loss": 0.505, "step": 1353 }, { "epoch": 0.21195992485911083, "grad_norm": 0.7268035411834717, "learning_rate": 0.0001358627717391304, "loss": 0.6024, "step": 1354 }, { "epoch": 0.21211646837820913, "grad_norm": 0.6841917634010315, "learning_rate": 0.0001358508831521739, "loss": 0.4405, "step": 1355 }, { "epoch": 
0.21227301189730746, "grad_norm": 1.086714267730713, "learning_rate": 0.00013583899456521737, "loss": 0.6582, "step": 1356 }, { "epoch": 0.21242955541640576, "grad_norm": 1.375450849533081, "learning_rate": 0.00013582710597826087, "loss": 0.7446, "step": 1357 }, { "epoch": 0.21258609893550406, "grad_norm": 1.0958795547485352, "learning_rate": 0.00013581521739130435, "loss": 0.7268, "step": 1358 }, { "epoch": 0.2127426424546024, "grad_norm": 1.175258755683899, "learning_rate": 0.00013580332880434783, "loss": 0.6397, "step": 1359 }, { "epoch": 0.2128991859737007, "grad_norm": 3.679452657699585, "learning_rate": 0.0001357914402173913, "loss": 0.529, "step": 1360 }, { "epoch": 0.213055729492799, "grad_norm": 0.9180824160575867, "learning_rate": 0.0001357795516304348, "loss": 0.7345, "step": 1361 }, { "epoch": 0.2132122730118973, "grad_norm": 1.2122639417648315, "learning_rate": 0.00013576766304347827, "loss": 0.7949, "step": 1362 }, { "epoch": 0.21336881653099563, "grad_norm": 1.1910570859909058, "learning_rate": 0.00013575577445652172, "loss": 0.814, "step": 1363 }, { "epoch": 0.21352536005009393, "grad_norm": 0.9914449453353882, "learning_rate": 0.0001357438858695652, "loss": 0.5074, "step": 1364 }, { "epoch": 0.21368190356919223, "grad_norm": 1.2239723205566406, "learning_rate": 0.00013573199728260867, "loss": 0.5744, "step": 1365 }, { "epoch": 0.21383844708829056, "grad_norm": 1.2134511470794678, "learning_rate": 0.00013572010869565215, "loss": 0.8116, "step": 1366 }, { "epoch": 0.21399499060738886, "grad_norm": 2.1358299255371094, "learning_rate": 0.00013570822010869563, "loss": 1.1872, "step": 1367 }, { "epoch": 0.21415153412648716, "grad_norm": 1.1983097791671753, "learning_rate": 0.0001356963315217391, "loss": 0.5939, "step": 1368 }, { "epoch": 0.21430807764558546, "grad_norm": 2.933415174484253, "learning_rate": 0.00013568444293478259, "loss": 1.2927, "step": 1369 }, { "epoch": 0.2144646211646838, "grad_norm": 2.4206483364105225, "learning_rate": 
0.00013567255434782606, "loss": 1.1305, "step": 1370 }, { "epoch": 0.2146211646837821, "grad_norm": 2.4028477668762207, "learning_rate": 0.00013566066576086954, "loss": 0.7431, "step": 1371 }, { "epoch": 0.2147777082028804, "grad_norm": 1.4380924701690674, "learning_rate": 0.00013564877717391302, "loss": 0.6418, "step": 1372 }, { "epoch": 0.21493425172197872, "grad_norm": 2.1044626235961914, "learning_rate": 0.0001356368885869565, "loss": 0.8495, "step": 1373 }, { "epoch": 0.21509079524107702, "grad_norm": 2.211191177368164, "learning_rate": 0.00013562499999999998, "loss": 1.1076, "step": 1374 }, { "epoch": 0.21524733876017532, "grad_norm": 1.5419483184814453, "learning_rate": 0.00013561311141304345, "loss": 0.9589, "step": 1375 }, { "epoch": 0.21540388227927365, "grad_norm": 2.0756912231445312, "learning_rate": 0.00013560122282608693, "loss": 0.9012, "step": 1376 }, { "epoch": 0.21556042579837195, "grad_norm": 1.8064388036727905, "learning_rate": 0.00013558933423913044, "loss": 0.9586, "step": 1377 }, { "epoch": 0.21571696931747025, "grad_norm": 2.476318120956421, "learning_rate": 0.00013557744565217392, "loss": 0.8951, "step": 1378 }, { "epoch": 0.21587351283656855, "grad_norm": 2.474285125732422, "learning_rate": 0.0001355655570652174, "loss": 0.998, "step": 1379 }, { "epoch": 0.21603005635566688, "grad_norm": 1.8395627737045288, "learning_rate": 0.00013555366847826087, "loss": 1.2297, "step": 1380 }, { "epoch": 0.21618659987476518, "grad_norm": 3.7457022666931152, "learning_rate": 0.00013554177989130435, "loss": 0.9893, "step": 1381 }, { "epoch": 0.21634314339386349, "grad_norm": 3.2942731380462646, "learning_rate": 0.00013552989130434783, "loss": 1.3742, "step": 1382 }, { "epoch": 0.21649968691296181, "grad_norm": 8.876026153564453, "learning_rate": 0.0001355180027173913, "loss": 1.3692, "step": 1383 }, { "epoch": 0.21665623043206012, "grad_norm": 2.4972915649414062, "learning_rate": 0.00013550611413043478, "loss": 1.4578, "step": 1384 }, { "epoch": 
0.21681277395115842, "grad_norm": 3.449849843978882, "learning_rate": 0.00013549422554347826, "loss": 1.4563, "step": 1385 }, { "epoch": 0.21696931747025675, "grad_norm": 3.288839340209961, "learning_rate": 0.00013548233695652171, "loss": 1.5164, "step": 1386 }, { "epoch": 0.21712586098935505, "grad_norm": 3.0756020545959473, "learning_rate": 0.0001354704483695652, "loss": 1.5026, "step": 1387 }, { "epoch": 0.21728240450845335, "grad_norm": 5.86512565612793, "learning_rate": 0.00013545855978260867, "loss": 1.5658, "step": 1388 }, { "epoch": 0.21743894802755165, "grad_norm": 5.369242191314697, "learning_rate": 0.00013544667119565215, "loss": 1.6463, "step": 1389 }, { "epoch": 0.21759549154664998, "grad_norm": 4.809154510498047, "learning_rate": 0.00013543478260869563, "loss": 1.4531, "step": 1390 }, { "epoch": 0.21775203506574828, "grad_norm": 9.91645336151123, "learning_rate": 0.0001354228940217391, "loss": 1.8717, "step": 1391 }, { "epoch": 0.21790857858484658, "grad_norm": 7.736587047576904, "learning_rate": 0.00013541100543478258, "loss": 1.4747, "step": 1392 }, { "epoch": 0.2180651221039449, "grad_norm": 2.7608590126037598, "learning_rate": 0.00013539911684782606, "loss": 1.224, "step": 1393 }, { "epoch": 0.2182216656230432, "grad_norm": 3.6386053562164307, "learning_rate": 0.00013538722826086954, "loss": 1.3357, "step": 1394 }, { "epoch": 0.2183782091421415, "grad_norm": 3.895171642303467, "learning_rate": 0.00013537533967391302, "loss": 2.4788, "step": 1395 }, { "epoch": 0.2185347526612398, "grad_norm": 3.077681541442871, "learning_rate": 0.0001353634510869565, "loss": 1.1919, "step": 1396 }, { "epoch": 0.21869129618033814, "grad_norm": 9.889974594116211, "learning_rate": 0.0001353515625, "loss": 1.3552, "step": 1397 }, { "epoch": 0.21884783969943644, "grad_norm": 7.9142303466796875, "learning_rate": 0.00013533967391304348, "loss": 1.4417, "step": 1398 }, { "epoch": 0.21900438321853474, "grad_norm": 7.897679328918457, "learning_rate": 0.00013532778532608696, 
"loss": 1.7768, "step": 1399 }, { "epoch": 0.21916092673763307, "grad_norm": 6.522871017456055, "learning_rate": 0.00013531589673913044, "loss": 1.6873, "step": 1400 }, { "epoch": 0.21931747025673137, "grad_norm": 0.871116578578949, "learning_rate": 0.00013530400815217391, "loss": 0.5358, "step": 1401 }, { "epoch": 0.21947401377582967, "grad_norm": 0.6473991870880127, "learning_rate": 0.0001352921195652174, "loss": 0.544, "step": 1402 }, { "epoch": 0.219630557294928, "grad_norm": 0.8219231963157654, "learning_rate": 0.00013528023097826087, "loss": 0.5233, "step": 1403 }, { "epoch": 0.2197871008140263, "grad_norm": 1.029171109199524, "learning_rate": 0.00013526834239130435, "loss": 0.6052, "step": 1404 }, { "epoch": 0.2199436443331246, "grad_norm": 1.2191280126571655, "learning_rate": 0.00013525645380434783, "loss": 0.5233, "step": 1405 }, { "epoch": 0.2201001878522229, "grad_norm": 1.596596121788025, "learning_rate": 0.0001352445652173913, "loss": 0.6227, "step": 1406 }, { "epoch": 0.22025673137132123, "grad_norm": 1.0154201984405518, "learning_rate": 0.00013523267663043478, "loss": 0.5243, "step": 1407 }, { "epoch": 0.22041327489041954, "grad_norm": 0.8673685789108276, "learning_rate": 0.00013522078804347826, "loss": 0.4734, "step": 1408 }, { "epoch": 0.22056981840951784, "grad_norm": 2.377361536026001, "learning_rate": 0.0001352088994565217, "loss": 0.4279, "step": 1409 }, { "epoch": 0.22072636192861617, "grad_norm": 1.2111585140228271, "learning_rate": 0.0001351970108695652, "loss": 0.5279, "step": 1410 }, { "epoch": 0.22088290544771447, "grad_norm": 1.1884641647338867, "learning_rate": 0.00013518512228260867, "loss": 0.676, "step": 1411 }, { "epoch": 0.22103944896681277, "grad_norm": 1.6150407791137695, "learning_rate": 0.00013517323369565215, "loss": 0.7605, "step": 1412 }, { "epoch": 0.2211959924859111, "grad_norm": 1.1590385437011719, "learning_rate": 0.00013516134510869562, "loss": 0.6586, "step": 1413 }, { "epoch": 0.2213525360050094, "grad_norm": 
1.1446272134780884, "learning_rate": 0.0001351494565217391, "loss": 0.4799, "step": 1414 }, { "epoch": 0.2215090795241077, "grad_norm": 0.815991222858429, "learning_rate": 0.00013513756793478258, "loss": 0.5668, "step": 1415 }, { "epoch": 0.221665623043206, "grad_norm": 1.4234789609909058, "learning_rate": 0.00013512567934782609, "loss": 0.9671, "step": 1416 }, { "epoch": 0.22182216656230433, "grad_norm": 3.0482447147369385, "learning_rate": 0.00013511379076086956, "loss": 1.1418, "step": 1417 }, { "epoch": 0.22197871008140263, "grad_norm": 1.5258711576461792, "learning_rate": 0.00013510190217391304, "loss": 0.8233, "step": 1418 }, { "epoch": 0.22213525360050093, "grad_norm": 1.4398740530014038, "learning_rate": 0.00013509001358695652, "loss": 0.7782, "step": 1419 }, { "epoch": 0.22229179711959926, "grad_norm": 2.67677640914917, "learning_rate": 0.000135078125, "loss": 0.6704, "step": 1420 }, { "epoch": 0.22244834063869756, "grad_norm": 1.6371910572052002, "learning_rate": 0.00013506623641304348, "loss": 0.7819, "step": 1421 }, { "epoch": 0.22260488415779586, "grad_norm": 4.9950103759765625, "learning_rate": 0.00013505434782608695, "loss": 0.8776, "step": 1422 }, { "epoch": 0.22276142767689416, "grad_norm": 1.2510411739349365, "learning_rate": 0.00013504245923913043, "loss": 0.7177, "step": 1423 }, { "epoch": 0.2229179711959925, "grad_norm": 3.493178367614746, "learning_rate": 0.0001350305706521739, "loss": 0.9402, "step": 1424 }, { "epoch": 0.2230745147150908, "grad_norm": 6.05262565612793, "learning_rate": 0.0001350186820652174, "loss": 1.0804, "step": 1425 }, { "epoch": 0.2232310582341891, "grad_norm": 1.3866798877716064, "learning_rate": 0.00013500679347826087, "loss": 0.7838, "step": 1426 }, { "epoch": 0.22338760175328742, "grad_norm": 2.6874451637268066, "learning_rate": 0.00013499490489130435, "loss": 0.999, "step": 1427 }, { "epoch": 0.22354414527238572, "grad_norm": 2.5552711486816406, "learning_rate": 0.00013498301630434782, "loss": 0.9415, "step": 1428 
}, { "epoch": 0.22370068879148403, "grad_norm": 2.6012322902679443, "learning_rate": 0.0001349711277173913, "loss": 1.2046, "step": 1429 }, { "epoch": 0.22385723231058235, "grad_norm": 1.9993668794631958, "learning_rate": 0.00013495923913043478, "loss": 1.1991, "step": 1430 }, { "epoch": 0.22401377582968066, "grad_norm": 4.275752067565918, "learning_rate": 0.00013494735054347826, "loss": 1.2137, "step": 1431 }, { "epoch": 0.22417031934877896, "grad_norm": 4.183918476104736, "learning_rate": 0.0001349354619565217, "loss": 1.2241, "step": 1432 }, { "epoch": 0.22432686286787726, "grad_norm": 2.8593287467956543, "learning_rate": 0.0001349235733695652, "loss": 1.4086, "step": 1433 }, { "epoch": 0.2244834063869756, "grad_norm": 2.2803890705108643, "learning_rate": 0.00013491168478260867, "loss": 1.3216, "step": 1434 }, { "epoch": 0.2246399499060739, "grad_norm": 1.943198561668396, "learning_rate": 0.00013489979619565214, "loss": 1.3605, "step": 1435 }, { "epoch": 0.2247964934251722, "grad_norm": 2.1481311321258545, "learning_rate": 0.00013488790760869565, "loss": 1.1866, "step": 1436 }, { "epoch": 0.22495303694427052, "grad_norm": 4.041958332061768, "learning_rate": 0.00013487601902173913, "loss": 1.7402, "step": 1437 }, { "epoch": 0.22510958046336882, "grad_norm": 2.759448289871216, "learning_rate": 0.0001348641304347826, "loss": 1.1539, "step": 1438 }, { "epoch": 0.22526612398246712, "grad_norm": 1.7979768514633179, "learning_rate": 0.00013485224184782608, "loss": 1.602, "step": 1439 }, { "epoch": 0.22542266750156542, "grad_norm": 3.415788412094116, "learning_rate": 0.00013484035326086956, "loss": 1.488, "step": 1440 }, { "epoch": 0.22557921102066375, "grad_norm": 3.9472219944000244, "learning_rate": 0.00013482846467391304, "loss": 1.0181, "step": 1441 }, { "epoch": 0.22573575453976205, "grad_norm": 3.206657648086548, "learning_rate": 0.00013481657608695652, "loss": 1.5187, "step": 1442 }, { "epoch": 0.22589229805886035, "grad_norm": 2.1529242992401123, 
"learning_rate": 0.0001348046875, "loss": 1.3615, "step": 1443 }, { "epoch": 0.22604884157795868, "grad_norm": 2.633723497390747, "learning_rate": 0.00013479279891304347, "loss": 1.9852, "step": 1444 }, { "epoch": 0.22620538509705698, "grad_norm": 2.3937814235687256, "learning_rate": 0.00013478091032608695, "loss": 1.5871, "step": 1445 }, { "epoch": 0.22636192861615528, "grad_norm": 1.7919312715530396, "learning_rate": 0.00013476902173913043, "loss": 0.8795, "step": 1446 }, { "epoch": 0.2265184721352536, "grad_norm": 1.7783126831054688, "learning_rate": 0.0001347571331521739, "loss": 0.9494, "step": 1447 }, { "epoch": 0.2266750156543519, "grad_norm": 3.247185468673706, "learning_rate": 0.0001347452445652174, "loss": 1.3108, "step": 1448 }, { "epoch": 0.22683155917345021, "grad_norm": 3.648019313812256, "learning_rate": 0.00013473335597826087, "loss": 1.1117, "step": 1449 }, { "epoch": 0.22698810269254852, "grad_norm": 3.3539998531341553, "learning_rate": 0.00013472146739130434, "loss": 1.2961, "step": 1450 }, { "epoch": 0.22714464621164684, "grad_norm": 0.7904792428016663, "learning_rate": 0.00013470957880434782, "loss": 0.4632, "step": 1451 }, { "epoch": 0.22730118973074515, "grad_norm": 0.8753790855407715, "learning_rate": 0.0001346976902173913, "loss": 0.5486, "step": 1452 }, { "epoch": 0.22745773324984345, "grad_norm": 1.2585132122039795, "learning_rate": 0.00013468580163043478, "loss": 0.6501, "step": 1453 }, { "epoch": 0.22761427676894178, "grad_norm": 0.7282984852790833, "learning_rate": 0.00013467391304347826, "loss": 0.6315, "step": 1454 }, { "epoch": 0.22777082028804008, "grad_norm": 1.0508239269256592, "learning_rate": 0.0001346620244565217, "loss": 0.4306, "step": 1455 }, { "epoch": 0.22792736380713838, "grad_norm": 1.7002073526382446, "learning_rate": 0.0001346501358695652, "loss": 0.6832, "step": 1456 }, { "epoch": 0.2280839073262367, "grad_norm": 0.7528588175773621, "learning_rate": 0.0001346382472826087, "loss": 0.7428, "step": 1457 }, { "epoch": 
0.228240450845335, "grad_norm": 1.2585810422897339, "learning_rate": 0.00013462635869565217, "loss": 0.4972, "step": 1458 }, { "epoch": 0.2283969943644333, "grad_norm": 1.2358628511428833, "learning_rate": 0.00013461447010869565, "loss": 0.5232, "step": 1459 }, { "epoch": 0.2285535378835316, "grad_norm": 1.132817268371582, "learning_rate": 0.00013460258152173912, "loss": 0.4863, "step": 1460 }, { "epoch": 0.22871008140262994, "grad_norm": 1.6958914995193481, "learning_rate": 0.0001345906929347826, "loss": 0.5535, "step": 1461 }, { "epoch": 0.22886662492172824, "grad_norm": 1.1937307119369507, "learning_rate": 0.00013457880434782608, "loss": 0.7272, "step": 1462 }, { "epoch": 0.22902316844082654, "grad_norm": 1.3135823011398315, "learning_rate": 0.00013456691576086956, "loss": 0.8316, "step": 1463 }, { "epoch": 0.22917971195992487, "grad_norm": 1.097524642944336, "learning_rate": 0.00013455502717391304, "loss": 0.5053, "step": 1464 }, { "epoch": 0.22933625547902317, "grad_norm": 2.5244555473327637, "learning_rate": 0.00013454313858695652, "loss": 0.8621, "step": 1465 }, { "epoch": 0.22949279899812147, "grad_norm": 1.4429869651794434, "learning_rate": 0.00013453125, "loss": 0.663, "step": 1466 }, { "epoch": 0.22964934251721977, "grad_norm": 1.7694810628890991, "learning_rate": 0.00013451936141304347, "loss": 0.6897, "step": 1467 }, { "epoch": 0.2298058860363181, "grad_norm": 1.547844648361206, "learning_rate": 0.00013450747282608695, "loss": 0.7621, "step": 1468 }, { "epoch": 0.2299624295554164, "grad_norm": 2.112004041671753, "learning_rate": 0.00013449558423913043, "loss": 0.8467, "step": 1469 }, { "epoch": 0.2301189730745147, "grad_norm": 2.2373204231262207, "learning_rate": 0.0001344836956521739, "loss": 0.7434, "step": 1470 }, { "epoch": 0.23027551659361303, "grad_norm": 1.5916227102279663, "learning_rate": 0.00013447180706521738, "loss": 0.5413, "step": 1471 }, { "epoch": 0.23043206011271133, "grad_norm": 2.8337807655334473, "learning_rate": 
0.00013445991847826086, "loss": 0.7708, "step": 1472 }, { "epoch": 0.23058860363180964, "grad_norm": 1.6144152879714966, "learning_rate": 0.00013444802989130434, "loss": 0.7699, "step": 1473 }, { "epoch": 0.23074514715090796, "grad_norm": 3.916670799255371, "learning_rate": 0.00013443614130434782, "loss": 0.6363, "step": 1474 }, { "epoch": 0.23090169067000627, "grad_norm": 3.056612014770508, "learning_rate": 0.0001344242527173913, "loss": 1.235, "step": 1475 }, { "epoch": 0.23105823418910457, "grad_norm": 2.636298418045044, "learning_rate": 0.00013441236413043478, "loss": 0.6467, "step": 1476 }, { "epoch": 0.23121477770820287, "grad_norm": 1.6724399328231812, "learning_rate": 0.00013440047554347825, "loss": 0.6238, "step": 1477 }, { "epoch": 0.2313713212273012, "grad_norm": 2.268935203552246, "learning_rate": 0.00013438858695652173, "loss": 0.9056, "step": 1478 }, { "epoch": 0.2315278647463995, "grad_norm": 2.1689982414245605, "learning_rate": 0.0001343766983695652, "loss": 1.3432, "step": 1479 }, { "epoch": 0.2316844082654978, "grad_norm": 2.970827579498291, "learning_rate": 0.0001343648097826087, "loss": 0.9936, "step": 1480 }, { "epoch": 0.23184095178459613, "grad_norm": 3.0228865146636963, "learning_rate": 0.00013435292119565217, "loss": 1.0153, "step": 1481 }, { "epoch": 0.23199749530369443, "grad_norm": 3.3610877990722656, "learning_rate": 0.00013434103260869564, "loss": 1.2257, "step": 1482 }, { "epoch": 0.23215403882279273, "grad_norm": 3.2228806018829346, "learning_rate": 0.00013432914402173912, "loss": 1.1938, "step": 1483 }, { "epoch": 0.23231058234189106, "grad_norm": 1.5909347534179688, "learning_rate": 0.0001343172554347826, "loss": 0.7596, "step": 1484 }, { "epoch": 0.23246712586098936, "grad_norm": 4.628849029541016, "learning_rate": 0.00013430536684782608, "loss": 1.4253, "step": 1485 }, { "epoch": 0.23262366938008766, "grad_norm": 6.622790336608887, "learning_rate": 0.00013429347826086956, "loss": 1.3766, "step": 1486 }, { "epoch": 
0.23278021289918596, "grad_norm": 4.750699520111084, "learning_rate": 0.00013428158967391303, "loss": 1.2474, "step": 1487 }, { "epoch": 0.2329367564182843, "grad_norm": 4.514869213104248, "learning_rate": 0.0001342697010869565, "loss": 1.2399, "step": 1488 }, { "epoch": 0.2330932999373826, "grad_norm": 3.6941189765930176, "learning_rate": 0.0001342578125, "loss": 1.9947, "step": 1489 }, { "epoch": 0.2332498434564809, "grad_norm": 3.217207670211792, "learning_rate": 0.00013424592391304347, "loss": 1.6044, "step": 1490 }, { "epoch": 0.23340638697557922, "grad_norm": 5.296628475189209, "learning_rate": 0.00013423403532608695, "loss": 1.7678, "step": 1491 }, { "epoch": 0.23356293049467752, "grad_norm": 4.404066562652588, "learning_rate": 0.00013422214673913043, "loss": 0.8744, "step": 1492 }, { "epoch": 0.23371947401377582, "grad_norm": 3.2494266033172607, "learning_rate": 0.0001342102581521739, "loss": 1.2919, "step": 1493 }, { "epoch": 0.23387601753287413, "grad_norm": 3.642239809036255, "learning_rate": 0.00013419836956521738, "loss": 1.7588, "step": 1494 }, { "epoch": 0.23403256105197245, "grad_norm": 3.5199389457702637, "learning_rate": 0.00013418648097826086, "loss": 2.0484, "step": 1495 }, { "epoch": 0.23418910457107076, "grad_norm": 3.701643705368042, "learning_rate": 0.00013417459239130434, "loss": 1.5313, "step": 1496 }, { "epoch": 0.23434564809016906, "grad_norm": 2.595499038696289, "learning_rate": 0.00013416270380434782, "loss": 0.773, "step": 1497 }, { "epoch": 0.23450219160926739, "grad_norm": 2.247371196746826, "learning_rate": 0.0001341508152173913, "loss": 0.9921, "step": 1498 }, { "epoch": 0.2346587351283657, "grad_norm": 3.550616502761841, "learning_rate": 0.00013413892663043477, "loss": 0.8682, "step": 1499 }, { "epoch": 0.234815278647464, "grad_norm": 2.3132824897766113, "learning_rate": 0.00013412703804347825, "loss": 1.2897, "step": 1500 }, { "epoch": 0.23497182216656232, "grad_norm": 0.8679249286651611, "learning_rate": 0.00013411514945652173, 
"loss": 0.5378, "step": 1501 }, { "epoch": 0.23512836568566062, "grad_norm": 0.7591186165809631, "learning_rate": 0.0001341032608695652, "loss": 0.5274, "step": 1502 }, { "epoch": 0.23528490920475892, "grad_norm": 0.5724111199378967, "learning_rate": 0.00013409137228260869, "loss": 0.4093, "step": 1503 }, { "epoch": 0.23544145272385722, "grad_norm": 0.7575477361679077, "learning_rate": 0.00013407948369565216, "loss": 0.4915, "step": 1504 }, { "epoch": 0.23559799624295555, "grad_norm": 0.785041332244873, "learning_rate": 0.00013406759510869564, "loss": 0.4166, "step": 1505 }, { "epoch": 0.23575453976205385, "grad_norm": 0.7798678278923035, "learning_rate": 0.00013405570652173912, "loss": 0.4144, "step": 1506 }, { "epoch": 0.23591108328115215, "grad_norm": 1.1331361532211304, "learning_rate": 0.0001340438179347826, "loss": 0.5402, "step": 1507 }, { "epoch": 0.23606762680025048, "grad_norm": 0.8889287114143372, "learning_rate": 0.00013403192934782608, "loss": 0.5316, "step": 1508 }, { "epoch": 0.23622417031934878, "grad_norm": 6.923069000244141, "learning_rate": 0.00013402004076086955, "loss": 1.5035, "step": 1509 }, { "epoch": 0.23638071383844708, "grad_norm": 1.386387586593628, "learning_rate": 0.00013400815217391303, "loss": 0.5256, "step": 1510 }, { "epoch": 0.2365372573575454, "grad_norm": 1.8758655786514282, "learning_rate": 0.0001339962635869565, "loss": 0.7722, "step": 1511 }, { "epoch": 0.2366938008766437, "grad_norm": 1.0302928686141968, "learning_rate": 0.000133984375, "loss": 0.4901, "step": 1512 }, { "epoch": 0.236850344395742, "grad_norm": 2.7306668758392334, "learning_rate": 0.00013397248641304347, "loss": 0.8504, "step": 1513 }, { "epoch": 0.23700688791484031, "grad_norm": 1.8752633333206177, "learning_rate": 0.00013396059782608695, "loss": 0.6103, "step": 1514 }, { "epoch": 0.23716343143393864, "grad_norm": 1.2988919019699097, "learning_rate": 0.00013394870923913042, "loss": 0.5658, "step": 1515 }, { "epoch": 0.23731997495303694, "grad_norm": 
1.2477346658706665, "learning_rate": 0.0001339368206521739, "loss": 0.6102, "step": 1516 }, { "epoch": 0.23747651847213525, "grad_norm": 2.34330415725708, "learning_rate": 0.00013392493206521738, "loss": 0.7753, "step": 1517 }, { "epoch": 0.23763306199123357, "grad_norm": 2.6282455921173096, "learning_rate": 0.00013391304347826086, "loss": 0.5847, "step": 1518 }, { "epoch": 0.23778960551033188, "grad_norm": 1.5470976829528809, "learning_rate": 0.00013390115489130434, "loss": 0.7202, "step": 1519 }, { "epoch": 0.23794614902943018, "grad_norm": 2.7066538333892822, "learning_rate": 0.00013388926630434781, "loss": 0.9813, "step": 1520 }, { "epoch": 0.23810269254852848, "grad_norm": 2.9889869689941406, "learning_rate": 0.0001338773777173913, "loss": 0.8257, "step": 1521 }, { "epoch": 0.2382592360676268, "grad_norm": 3.0295348167419434, "learning_rate": 0.00013386548913043477, "loss": 1.1907, "step": 1522 }, { "epoch": 0.2384157795867251, "grad_norm": 2.640406370162964, "learning_rate": 0.00013385360054347825, "loss": 0.9404, "step": 1523 }, { "epoch": 0.2385723231058234, "grad_norm": 1.8137588500976562, "learning_rate": 0.00013384171195652173, "loss": 0.7863, "step": 1524 }, { "epoch": 0.23872886662492174, "grad_norm": 1.8479584455490112, "learning_rate": 0.0001338298233695652, "loss": 1.0707, "step": 1525 }, { "epoch": 0.23888541014402004, "grad_norm": 2.2326648235321045, "learning_rate": 0.00013381793478260868, "loss": 1.1315, "step": 1526 }, { "epoch": 0.23904195366311834, "grad_norm": 2.9276936054229736, "learning_rate": 0.00013380604619565216, "loss": 1.1018, "step": 1527 }, { "epoch": 0.23919849718221667, "grad_norm": 1.7345106601715088, "learning_rate": 0.00013379415760869564, "loss": 0.6288, "step": 1528 }, { "epoch": 0.23935504070131497, "grad_norm": 1.5573650598526, "learning_rate": 0.00013378226902173912, "loss": 1.0065, "step": 1529 }, { "epoch": 0.23951158422041327, "grad_norm": 3.3014566898345947, "learning_rate": 0.0001337703804347826, "loss": 1.1783, 
"step": 1530 }, { "epoch": 0.23966812773951157, "grad_norm": 1.8784143924713135, "learning_rate": 0.00013375849184782607, "loss": 1.3251, "step": 1531 }, { "epoch": 0.2398246712586099, "grad_norm": 1.2015159130096436, "learning_rate": 0.00013374660326086955, "loss": 1.2432, "step": 1532 }, { "epoch": 0.2399812147777082, "grad_norm": 2.168745517730713, "learning_rate": 0.00013373471467391303, "loss": 1.6119, "step": 1533 }, { "epoch": 0.2401377582968065, "grad_norm": 1.7328457832336426, "learning_rate": 0.0001337228260869565, "loss": 0.875, "step": 1534 }, { "epoch": 0.24029430181590483, "grad_norm": 2.1353564262390137, "learning_rate": 0.0001337109375, "loss": 1.0807, "step": 1535 }, { "epoch": 0.24045084533500313, "grad_norm": 4.003932952880859, "learning_rate": 0.00013369904891304346, "loss": 1.0837, "step": 1536 }, { "epoch": 0.24060738885410143, "grad_norm": 1.9647778272628784, "learning_rate": 0.00013368716032608694, "loss": 1.52, "step": 1537 }, { "epoch": 0.24076393237319976, "grad_norm": 3.325221300125122, "learning_rate": 0.00013367527173913042, "loss": 0.9012, "step": 1538 }, { "epoch": 0.24092047589229806, "grad_norm": 8.016311645507812, "learning_rate": 0.0001336633831521739, "loss": 1.6185, "step": 1539 }, { "epoch": 0.24107701941139636, "grad_norm": 3.170025110244751, "learning_rate": 0.00013365149456521738, "loss": 1.2498, "step": 1540 }, { "epoch": 0.24123356293049467, "grad_norm": 3.285511016845703, "learning_rate": 0.00013363960597826086, "loss": 1.8279, "step": 1541 }, { "epoch": 0.241390106449593, "grad_norm": 2.7962310314178467, "learning_rate": 0.00013362771739130433, "loss": 1.3426, "step": 1542 }, { "epoch": 0.2415466499686913, "grad_norm": 6.000093936920166, "learning_rate": 0.0001336158288043478, "loss": 1.4063, "step": 1543 }, { "epoch": 0.2417031934877896, "grad_norm": 4.446857452392578, "learning_rate": 0.0001336039402173913, "loss": 1.6587, "step": 1544 }, { "epoch": 0.24185973700688793, "grad_norm": 3.521705150604248, "learning_rate": 
0.00013359205163043477, "loss": 1.9288, "step": 1545 }, { "epoch": 0.24201628052598623, "grad_norm": 2.3071281909942627, "learning_rate": 0.00013358016304347825, "loss": 1.2174, "step": 1546 }, { "epoch": 0.24217282404508453, "grad_norm": 4.375818729400635, "learning_rate": 0.00013356827445652172, "loss": 1.2346, "step": 1547 }, { "epoch": 0.24232936756418283, "grad_norm": 1.5740619897842407, "learning_rate": 0.0001335563858695652, "loss": 0.9227, "step": 1548 }, { "epoch": 0.24248591108328116, "grad_norm": 2.206937074661255, "learning_rate": 0.00013354449728260868, "loss": 1.5496, "step": 1549 }, { "epoch": 0.24264245460237946, "grad_norm": 2.6909444332122803, "learning_rate": 0.00013353260869565216, "loss": 0.9358, "step": 1550 }, { "epoch": 0.24279899812147776, "grad_norm": 1.0903772115707397, "learning_rate": 0.00013352072010869564, "loss": 0.4004, "step": 1551 }, { "epoch": 0.2429555416405761, "grad_norm": 0.6675677299499512, "learning_rate": 0.00013350883152173912, "loss": 0.3934, "step": 1552 }, { "epoch": 0.2431120851596744, "grad_norm": 0.7618956565856934, "learning_rate": 0.0001334969429347826, "loss": 0.4917, "step": 1553 }, { "epoch": 0.2432686286787727, "grad_norm": 1.0635961294174194, "learning_rate": 0.00013348505434782607, "loss": 0.4724, "step": 1554 }, { "epoch": 0.24342517219787102, "grad_norm": 0.84605473279953, "learning_rate": 0.00013347316576086955, "loss": 0.5184, "step": 1555 }, { "epoch": 0.24358171571696932, "grad_norm": 1.0764834880828857, "learning_rate": 0.00013346127717391303, "loss": 0.6523, "step": 1556 }, { "epoch": 0.24373825923606762, "grad_norm": 1.3209165334701538, "learning_rate": 0.0001334493885869565, "loss": 0.5089, "step": 1557 }, { "epoch": 0.24389480275516592, "grad_norm": 1.0174200534820557, "learning_rate": 0.00013343749999999998, "loss": 0.6986, "step": 1558 }, { "epoch": 0.24405134627426425, "grad_norm": 0.8678247928619385, "learning_rate": 0.00013342561141304346, "loss": 0.4461, "step": 1559 }, { "epoch": 
0.24420788979336255, "grad_norm": 1.049925684928894, "learning_rate": 0.00013341372282608694, "loss": 0.5771, "step": 1560 }, { "epoch": 0.24436443331246085, "grad_norm": 1.0940614938735962, "learning_rate": 0.00013340183423913042, "loss": 0.5525, "step": 1561 }, { "epoch": 0.24452097683155918, "grad_norm": 1.6677793264389038, "learning_rate": 0.0001333899456521739, "loss": 0.6069, "step": 1562 }, { "epoch": 0.24467752035065748, "grad_norm": 1.134541630744934, "learning_rate": 0.00013337805706521737, "loss": 0.583, "step": 1563 }, { "epoch": 0.24483406386975579, "grad_norm": 0.9047479629516602, "learning_rate": 0.00013336616847826085, "loss": 0.587, "step": 1564 }, { "epoch": 0.24499060738885411, "grad_norm": 1.231367588043213, "learning_rate": 0.00013335427989130433, "loss": 0.6363, "step": 1565 }, { "epoch": 0.24514715090795242, "grad_norm": 1.1310420036315918, "learning_rate": 0.0001333423913043478, "loss": 0.6302, "step": 1566 }, { "epoch": 0.24530369442705072, "grad_norm": 1.8226572275161743, "learning_rate": 0.0001333305027173913, "loss": 0.9778, "step": 1567 }, { "epoch": 0.24546023794614902, "grad_norm": 2.258199453353882, "learning_rate": 0.00013331861413043477, "loss": 0.8427, "step": 1568 }, { "epoch": 0.24561678146524735, "grad_norm": 2.8594961166381836, "learning_rate": 0.00013330672554347824, "loss": 1.0197, "step": 1569 }, { "epoch": 0.24577332498434565, "grad_norm": 1.4345529079437256, "learning_rate": 0.00013329483695652172, "loss": 0.9222, "step": 1570 }, { "epoch": 0.24592986850344395, "grad_norm": 1.592672348022461, "learning_rate": 0.0001332829483695652, "loss": 0.5116, "step": 1571 }, { "epoch": 0.24608641202254228, "grad_norm": 3.2878530025482178, "learning_rate": 0.00013327105978260868, "loss": 0.9674, "step": 1572 }, { "epoch": 0.24624295554164058, "grad_norm": 2.6214356422424316, "learning_rate": 0.00013325917119565216, "loss": 1.0667, "step": 1573 }, { "epoch": 0.24639949906073888, "grad_norm": 1.6351972818374634, "learning_rate": 
0.00013324728260869563, "loss": 0.7111, "step": 1574 }, { "epoch": 0.24655604257983718, "grad_norm": 3.9043800830841064, "learning_rate": 0.0001332353940217391, "loss": 1.1804, "step": 1575 }, { "epoch": 0.2467125860989355, "grad_norm": 1.6355661153793335, "learning_rate": 0.0001332235054347826, "loss": 0.8952, "step": 1576 }, { "epoch": 0.2468691296180338, "grad_norm": 2.8054418563842773, "learning_rate": 0.00013321161684782607, "loss": 1.2305, "step": 1577 }, { "epoch": 0.2470256731371321, "grad_norm": 1.8013755083084106, "learning_rate": 0.00013319972826086955, "loss": 0.7151, "step": 1578 }, { "epoch": 0.24718221665623044, "grad_norm": 1.7306121587753296, "learning_rate": 0.00013318783967391303, "loss": 1.0737, "step": 1579 }, { "epoch": 0.24733876017532874, "grad_norm": 2.737297296524048, "learning_rate": 0.0001331759510869565, "loss": 0.9917, "step": 1580 }, { "epoch": 0.24749530369442704, "grad_norm": 4.141730785369873, "learning_rate": 0.00013316406249999998, "loss": 0.8518, "step": 1581 }, { "epoch": 0.24765184721352537, "grad_norm": 3.151050090789795, "learning_rate": 0.00013315217391304346, "loss": 1.487, "step": 1582 }, { "epoch": 0.24780839073262367, "grad_norm": 2.109626054763794, "learning_rate": 0.00013314028532608694, "loss": 1.3193, "step": 1583 }, { "epoch": 0.24796493425172197, "grad_norm": 3.38362717628479, "learning_rate": 0.00013312839673913042, "loss": 1.3084, "step": 1584 }, { "epoch": 0.24812147777082028, "grad_norm": 2.0524396896362305, "learning_rate": 0.0001331165081521739, "loss": 1.204, "step": 1585 }, { "epoch": 0.2482780212899186, "grad_norm": 2.6889712810516357, "learning_rate": 0.00013310461956521737, "loss": 1.7223, "step": 1586 }, { "epoch": 0.2484345648090169, "grad_norm": 3.9944424629211426, "learning_rate": 0.00013309273097826085, "loss": 1.0373, "step": 1587 }, { "epoch": 0.2485911083281152, "grad_norm": 3.2164306640625, "learning_rate": 0.00013308084239130433, "loss": 2.3317, "step": 1588 }, { "epoch": 0.24874765184721354, 
"grad_norm": 4.4747700691223145, "learning_rate": 0.0001330689538043478, "loss": 1.8348, "step": 1589 }, { "epoch": 0.24890419536631184, "grad_norm": 3.3150413036346436, "learning_rate": 0.0001330570652173913, "loss": 1.2712, "step": 1590 }, { "epoch": 0.24906073888541014, "grad_norm": 3.416360855102539, "learning_rate": 0.0001330451766304348, "loss": 1.9975, "step": 1591 }, { "epoch": 0.24921728240450847, "grad_norm": 5.742218017578125, "learning_rate": 0.00013303328804347827, "loss": 1.7958, "step": 1592 }, { "epoch": 0.24937382592360677, "grad_norm": 3.679699659347534, "learning_rate": 0.00013302139945652172, "loss": 1.6919, "step": 1593 }, { "epoch": 0.24953036944270507, "grad_norm": 1.6401225328445435, "learning_rate": 0.0001330095108695652, "loss": 1.9761, "step": 1594 }, { "epoch": 0.24968691296180337, "grad_norm": 3.162715435028076, "learning_rate": 0.00013299762228260868, "loss": 1.313, "step": 1595 }, { "epoch": 0.2498434564809017, "grad_norm": 1.9941905736923218, "learning_rate": 0.00013298573369565215, "loss": 1.7282, "step": 1596 }, { "epoch": 0.25, "grad_norm": 2.233170986175537, "learning_rate": 0.00013297384510869563, "loss": 0.8205, "step": 1597 }, { "epoch": 0.25015654351909833, "grad_norm": 1.736202597618103, "learning_rate": 0.0001329619565217391, "loss": 1.1269, "step": 1598 }, { "epoch": 0.2503130870381966, "grad_norm": 1.6513491868972778, "learning_rate": 0.0001329500679347826, "loss": 0.5567, "step": 1599 }, { "epoch": 0.25046963055729493, "grad_norm": 4.561160087585449, "learning_rate": 0.00013293817934782607, "loss": 1.3017, "step": 1600 }, { "epoch": 0.25062617407639326, "grad_norm": 0.741373598575592, "learning_rate": 0.00013292629076086954, "loss": 0.4545, "step": 1601 }, { "epoch": 0.25078271759549153, "grad_norm": 0.9385247230529785, "learning_rate": 0.00013291440217391302, "loss": 0.4194, "step": 1602 }, { "epoch": 0.25093926111458986, "grad_norm": 0.728826105594635, "learning_rate": 0.0001329025135869565, "loss": 0.4632, "step": 
1603 }, { "epoch": 0.2510958046336882, "grad_norm": 0.707032322883606, "learning_rate": 0.00013289062499999998, "loss": 0.4362, "step": 1604 }, { "epoch": 0.25125234815278646, "grad_norm": 1.4269180297851562, "learning_rate": 0.00013287873641304346, "loss": 1.1212, "step": 1605 }, { "epoch": 0.2514088916718848, "grad_norm": 0.7600623369216919, "learning_rate": 0.00013286684782608694, "loss": 0.6083, "step": 1606 }, { "epoch": 0.25156543519098307, "grad_norm": 1.4600136280059814, "learning_rate": 0.00013285495923913041, "loss": 0.5514, "step": 1607 }, { "epoch": 0.2517219787100814, "grad_norm": 0.8098030090332031, "learning_rate": 0.0001328430706521739, "loss": 0.445, "step": 1608 }, { "epoch": 0.2518785222291797, "grad_norm": 1.8138145208358765, "learning_rate": 0.00013283118206521737, "loss": 0.7018, "step": 1609 }, { "epoch": 0.252035065748278, "grad_norm": 1.1059894561767578, "learning_rate": 0.00013281929347826088, "loss": 0.5369, "step": 1610 }, { "epoch": 0.2521916092673763, "grad_norm": 0.9829384088516235, "learning_rate": 0.00013280740489130435, "loss": 0.4834, "step": 1611 }, { "epoch": 0.25234815278647466, "grad_norm": 1.9244307279586792, "learning_rate": 0.00013279551630434783, "loss": 0.6386, "step": 1612 }, { "epoch": 0.25250469630557293, "grad_norm": 1.721109390258789, "learning_rate": 0.0001327836277173913, "loss": 0.4655, "step": 1613 }, { "epoch": 0.25266123982467126, "grad_norm": 1.1373052597045898, "learning_rate": 0.0001327717391304348, "loss": 0.5077, "step": 1614 }, { "epoch": 0.2528177833437696, "grad_norm": 1.8511611223220825, "learning_rate": 0.00013275985054347827, "loss": 0.6425, "step": 1615 }, { "epoch": 0.25297432686286786, "grad_norm": 0.959922194480896, "learning_rate": 0.00013274796195652172, "loss": 0.4612, "step": 1616 }, { "epoch": 0.2531308703819662, "grad_norm": 2.739140033721924, "learning_rate": 0.0001327360733695652, "loss": 0.8883, "step": 1617 }, { "epoch": 0.2532874139010645, "grad_norm": 1.0505585670471191, 
"learning_rate": 0.00013272418478260867, "loss": 0.4925, "step": 1618 }, { "epoch": 0.2534439574201628, "grad_norm": 1.5403441190719604, "learning_rate": 0.00013271229619565215, "loss": 0.467, "step": 1619 }, { "epoch": 0.2536005009392611, "grad_norm": 1.4733951091766357, "learning_rate": 0.00013270040760869563, "loss": 0.9243, "step": 1620 }, { "epoch": 0.25375704445835945, "grad_norm": 2.294351816177368, "learning_rate": 0.0001326885190217391, "loss": 0.7377, "step": 1621 }, { "epoch": 0.2539135879774577, "grad_norm": 2.379896879196167, "learning_rate": 0.00013267663043478259, "loss": 0.9671, "step": 1622 }, { "epoch": 0.25407013149655605, "grad_norm": 2.311002731323242, "learning_rate": 0.00013266474184782606, "loss": 0.8299, "step": 1623 }, { "epoch": 0.2542266750156543, "grad_norm": 2.7572197914123535, "learning_rate": 0.00013265285326086954, "loss": 0.7967, "step": 1624 }, { "epoch": 0.25438321853475265, "grad_norm": 2.94844913482666, "learning_rate": 0.00013264096467391302, "loss": 0.9135, "step": 1625 }, { "epoch": 0.254539762053851, "grad_norm": 1.8184032440185547, "learning_rate": 0.0001326290760869565, "loss": 0.8594, "step": 1626 }, { "epoch": 0.25469630557294926, "grad_norm": 4.0777587890625, "learning_rate": 0.00013261718749999998, "loss": 0.7726, "step": 1627 }, { "epoch": 0.2548528490920476, "grad_norm": 1.7395285367965698, "learning_rate": 0.00013260529891304345, "loss": 0.8962, "step": 1628 }, { "epoch": 0.2550093926111459, "grad_norm": 2.0731241703033447, "learning_rate": 0.00013259341032608693, "loss": 1.1093, "step": 1629 }, { "epoch": 0.2551659361302442, "grad_norm": 1.9725360870361328, "learning_rate": 0.00013258152173913044, "loss": 0.9304, "step": 1630 }, { "epoch": 0.2553224796493425, "grad_norm": 1.998984932899475, "learning_rate": 0.00013256963315217392, "loss": 0.9139, "step": 1631 }, { "epoch": 0.25547902316844084, "grad_norm": 5.8523736000061035, "learning_rate": 0.0001325577445652174, "loss": 1.5021, "step": 1632 }, { "epoch": 
0.2556355666875391, "grad_norm": 1.7847434282302856, "learning_rate": 0.00013254585597826087, "loss": 0.8551, "step": 1633 }, { "epoch": 0.25579211020663745, "grad_norm": 2.9655184745788574, "learning_rate": 0.00013253396739130435, "loss": 1.552, "step": 1634 }, { "epoch": 0.2559486537257358, "grad_norm": 3.2326982021331787, "learning_rate": 0.00013252207880434783, "loss": 1.0616, "step": 1635 }, { "epoch": 0.25610519724483405, "grad_norm": 3.0509462356567383, "learning_rate": 0.0001325101902173913, "loss": 1.328, "step": 1636 }, { "epoch": 0.2562617407639324, "grad_norm": 2.210646152496338, "learning_rate": 0.00013249830163043479, "loss": 1.7504, "step": 1637 }, { "epoch": 0.2564182842830307, "grad_norm": 3.203110694885254, "learning_rate": 0.00013248641304347826, "loss": 1.0261, "step": 1638 }, { "epoch": 0.256574827802129, "grad_norm": 3.303529977798462, "learning_rate": 0.00013247452445652171, "loss": 1.8458, "step": 1639 }, { "epoch": 0.2567313713212273, "grad_norm": 2.712512493133545, "learning_rate": 0.0001324626358695652, "loss": 1.1433, "step": 1640 }, { "epoch": 0.25688791484032564, "grad_norm": 4.905345439910889, "learning_rate": 0.00013245074728260867, "loss": 1.3231, "step": 1641 }, { "epoch": 0.2570444583594239, "grad_norm": 5.792779922485352, "learning_rate": 0.00013243885869565215, "loss": 1.4329, "step": 1642 }, { "epoch": 0.25720100187852224, "grad_norm": 4.869967460632324, "learning_rate": 0.00013242697010869563, "loss": 1.092, "step": 1643 }, { "epoch": 0.2573575453976205, "grad_norm": 4.13190221786499, "learning_rate": 0.0001324150815217391, "loss": 0.8726, "step": 1644 }, { "epoch": 0.25751408891671884, "grad_norm": 4.0399322509765625, "learning_rate": 0.00013240319293478258, "loss": 2.2183, "step": 1645 }, { "epoch": 0.25767063243581717, "grad_norm": 4.044432163238525, "learning_rate": 0.00013239130434782606, "loss": 1.4265, "step": 1646 }, { "epoch": 0.25782717595491544, "grad_norm": 4.041375160217285, "learning_rate": 
0.00013237941576086954, "loss": 0.7829, "step": 1647 }, { "epoch": 0.2579837194740138, "grad_norm": 2.780714273452759, "learning_rate": 0.00013236752717391302, "loss": 0.7976, "step": 1648 }, { "epoch": 0.2581402629931121, "grad_norm": 5.197601795196533, "learning_rate": 0.0001323556385869565, "loss": 0.8064, "step": 1649 }, { "epoch": 0.2582968065122104, "grad_norm": 3.868778705596924, "learning_rate": 0.00013234375, "loss": 1.706, "step": 1650 }, { "epoch": 0.2584533500313087, "grad_norm": 0.6355668306350708, "learning_rate": 0.00013233186141304348, "loss": 0.4456, "step": 1651 }, { "epoch": 0.25860989355040703, "grad_norm": 0.8465474843978882, "learning_rate": 0.00013231997282608696, "loss": 0.4664, "step": 1652 }, { "epoch": 0.2587664370695053, "grad_norm": 1.0502899885177612, "learning_rate": 0.00013230808423913044, "loss": 0.6226, "step": 1653 }, { "epoch": 0.25892298058860364, "grad_norm": 0.6016854643821716, "learning_rate": 0.00013229619565217391, "loss": 0.4053, "step": 1654 }, { "epoch": 0.25907952410770196, "grad_norm": 0.8290314674377441, "learning_rate": 0.0001322843070652174, "loss": 0.414, "step": 1655 }, { "epoch": 0.25923606762680024, "grad_norm": 1.0017277002334595, "learning_rate": 0.00013227241847826087, "loss": 0.3635, "step": 1656 }, { "epoch": 0.25939261114589857, "grad_norm": 1.2997344732284546, "learning_rate": 0.00013226052989130435, "loss": 0.4827, "step": 1657 }, { "epoch": 0.2595491546649969, "grad_norm": 1.57986319065094, "learning_rate": 0.00013224864130434783, "loss": 0.7307, "step": 1658 }, { "epoch": 0.25970569818409517, "grad_norm": 1.1290791034698486, "learning_rate": 0.0001322367527173913, "loss": 0.6119, "step": 1659 }, { "epoch": 0.2598622417031935, "grad_norm": 0.8408543467521667, "learning_rate": 0.00013222486413043478, "loss": 0.3863, "step": 1660 }, { "epoch": 0.26001878522229177, "grad_norm": 1.655318260192871, "learning_rate": 0.00013221297554347826, "loss": 0.5683, "step": 1661 }, { "epoch": 0.2601753287413901, 
"grad_norm": 1.300347924232483, "learning_rate": 0.0001322010869565217, "loss": 0.4831, "step": 1662 }, { "epoch": 0.26033187226048843, "grad_norm": 1.3160808086395264, "learning_rate": 0.0001321891983695652, "loss": 0.449, "step": 1663 }, { "epoch": 0.2604884157795867, "grad_norm": 1.3511909246444702, "learning_rate": 0.00013217730978260867, "loss": 0.5431, "step": 1664 }, { "epoch": 0.26064495929868503, "grad_norm": 3.6300532817840576, "learning_rate": 0.00013216542119565215, "loss": 0.7246, "step": 1665 }, { "epoch": 0.26080150281778336, "grad_norm": 1.4117393493652344, "learning_rate": 0.00013215353260869562, "loss": 0.7313, "step": 1666 }, { "epoch": 0.26095804633688163, "grad_norm": 3.013296604156494, "learning_rate": 0.0001321416440217391, "loss": 0.9243, "step": 1667 }, { "epoch": 0.26111458985597996, "grad_norm": 1.609023094177246, "learning_rate": 0.00013212975543478258, "loss": 0.7414, "step": 1668 }, { "epoch": 0.2612711333750783, "grad_norm": 1.6254165172576904, "learning_rate": 0.00013211786684782606, "loss": 0.5074, "step": 1669 }, { "epoch": 0.26142767689417656, "grad_norm": 2.1881580352783203, "learning_rate": 0.00013210597826086956, "loss": 0.9145, "step": 1670 }, { "epoch": 0.2615842204132749, "grad_norm": 1.7380225658416748, "learning_rate": 0.00013209408967391304, "loss": 0.7649, "step": 1671 }, { "epoch": 0.2617407639323732, "grad_norm": 1.8285080194473267, "learning_rate": 0.00013208220108695652, "loss": 0.5892, "step": 1672 }, { "epoch": 0.2618973074514715, "grad_norm": 1.986586570739746, "learning_rate": 0.0001320703125, "loss": 0.8571, "step": 1673 }, { "epoch": 0.2620538509705698, "grad_norm": 1.7807897329330444, "learning_rate": 0.00013205842391304348, "loss": 1.0165, "step": 1674 }, { "epoch": 0.26221039448966815, "grad_norm": 1.6004165410995483, "learning_rate": 0.00013204653532608696, "loss": 0.8666, "step": 1675 }, { "epoch": 0.2623669380087664, "grad_norm": 2.263204574584961, "learning_rate": 0.00013203464673913043, "loss": 0.8936, 
"step": 1676 }, { "epoch": 0.26252348152786475, "grad_norm": 1.2410616874694824, "learning_rate": 0.0001320227581521739, "loss": 0.7184, "step": 1677 }, { "epoch": 0.26268002504696303, "grad_norm": 1.8866908550262451, "learning_rate": 0.0001320108695652174, "loss": 0.9425, "step": 1678 }, { "epoch": 0.26283656856606136, "grad_norm": 3.2602944374084473, "learning_rate": 0.00013199898097826087, "loss": 1.1892, "step": 1679 }, { "epoch": 0.2629931120851597, "grad_norm": 2.5280826091766357, "learning_rate": 0.00013198709239130435, "loss": 1.1615, "step": 1680 }, { "epoch": 0.26314965560425796, "grad_norm": 5.20416784286499, "learning_rate": 0.00013197520380434782, "loss": 1.2032, "step": 1681 }, { "epoch": 0.2633061991233563, "grad_norm": 2.5858280658721924, "learning_rate": 0.0001319633152173913, "loss": 1.4503, "step": 1682 }, { "epoch": 0.2634627426424546, "grad_norm": 3.119642734527588, "learning_rate": 0.00013195142663043478, "loss": 1.2663, "step": 1683 }, { "epoch": 0.2636192861615529, "grad_norm": 2.9548559188842773, "learning_rate": 0.00013193953804347826, "loss": 0.9119, "step": 1684 }, { "epoch": 0.2637758296806512, "grad_norm": 3.7546298503875732, "learning_rate": 0.0001319276494565217, "loss": 1.5528, "step": 1685 }, { "epoch": 0.26393237319974955, "grad_norm": 2.279881238937378, "learning_rate": 0.0001319157608695652, "loss": 1.0407, "step": 1686 }, { "epoch": 0.2640889167188478, "grad_norm": 2.403942346572876, "learning_rate": 0.00013190387228260867, "loss": 0.8683, "step": 1687 }, { "epoch": 0.26424546023794615, "grad_norm": 3.0011584758758545, "learning_rate": 0.00013189198369565214, "loss": 1.3231, "step": 1688 }, { "epoch": 0.2644020037570445, "grad_norm": 2.3529255390167236, "learning_rate": 0.00013188009510869562, "loss": 1.1853, "step": 1689 }, { "epoch": 0.26455854727614275, "grad_norm": 3.166783332824707, "learning_rate": 0.00013186820652173913, "loss": 1.3084, "step": 1690 }, { "epoch": 0.2647150907952411, "grad_norm": 3.0203654766082764, 
"learning_rate": 0.0001318563179347826, "loss": 1.4258, "step": 1691 }, { "epoch": 0.2648716343143394, "grad_norm": 3.711552858352661, "learning_rate": 0.00013184442934782608, "loss": 1.7424, "step": 1692 }, { "epoch": 0.2650281778334377, "grad_norm": 2.6570987701416016, "learning_rate": 0.00013183254076086956, "loss": 1.6999, "step": 1693 }, { "epoch": 0.265184721352536, "grad_norm": 2.479626178741455, "learning_rate": 0.00013182065217391304, "loss": 1.0663, "step": 1694 }, { "epoch": 0.26534126487163434, "grad_norm": 2.4725427627563477, "learning_rate": 0.00013180876358695652, "loss": 1.3276, "step": 1695 }, { "epoch": 0.2654978083907326, "grad_norm": 1.2109977006912231, "learning_rate": 0.000131796875, "loss": 0.9343, "step": 1696 }, { "epoch": 0.26565435190983094, "grad_norm": 5.526186466217041, "learning_rate": 0.00013178498641304347, "loss": 1.2922, "step": 1697 }, { "epoch": 0.2658108954289292, "grad_norm": 3.337153196334839, "learning_rate": 0.00013177309782608695, "loss": 0.9426, "step": 1698 }, { "epoch": 0.26596743894802755, "grad_norm": 4.307013988494873, "learning_rate": 0.00013176120923913043, "loss": 1.0431, "step": 1699 }, { "epoch": 0.2661239824671259, "grad_norm": 4.412743091583252, "learning_rate": 0.0001317493206521739, "loss": 1.7938, "step": 1700 }, { "epoch": 0.26628052598622415, "grad_norm": 0.848796010017395, "learning_rate": 0.0001317374320652174, "loss": 0.425, "step": 1701 }, { "epoch": 0.2664370695053225, "grad_norm": 1.0841331481933594, "learning_rate": 0.00013172554347826087, "loss": 0.5429, "step": 1702 }, { "epoch": 0.2665936130244208, "grad_norm": 1.78682279586792, "learning_rate": 0.00013171365489130434, "loss": 0.6097, "step": 1703 }, { "epoch": 0.2667501565435191, "grad_norm": 1.007727861404419, "learning_rate": 0.00013170176630434782, "loss": 0.3966, "step": 1704 }, { "epoch": 0.2669067000626174, "grad_norm": 1.0612438917160034, "learning_rate": 0.0001316898777173913, "loss": 0.606, "step": 1705 }, { "epoch": 
0.26706324358171574, "grad_norm": 1.2123675346374512, "learning_rate": 0.00013167798913043478, "loss": 0.8882, "step": 1706 }, { "epoch": 0.267219787100814, "grad_norm": 3.7745492458343506, "learning_rate": 0.00013166610054347826, "loss": 0.8153, "step": 1707 }, { "epoch": 0.26737633061991234, "grad_norm": 0.7866659760475159, "learning_rate": 0.0001316542119565217, "loss": 0.4387, "step": 1708 }, { "epoch": 0.26753287413901067, "grad_norm": 0.9814565181732178, "learning_rate": 0.00013164232336956519, "loss": 0.5191, "step": 1709 }, { "epoch": 0.26768941765810894, "grad_norm": 0.788360059261322, "learning_rate": 0.0001316304347826087, "loss": 0.3809, "step": 1710 }, { "epoch": 0.26784596117720727, "grad_norm": 1.5764691829681396, "learning_rate": 0.00013161854619565217, "loss": 0.7514, "step": 1711 }, { "epoch": 0.2680025046963056, "grad_norm": 1.0822551250457764, "learning_rate": 0.00013160665760869565, "loss": 0.5228, "step": 1712 }, { "epoch": 0.2681590482154039, "grad_norm": 4.454845428466797, "learning_rate": 0.00013159476902173913, "loss": 1.0512, "step": 1713 }, { "epoch": 0.2683155917345022, "grad_norm": 1.2444859743118286, "learning_rate": 0.0001315828804347826, "loss": 0.6215, "step": 1714 }, { "epoch": 0.2684721352536005, "grad_norm": 1.473549723625183, "learning_rate": 0.00013157099184782608, "loss": 0.6843, "step": 1715 }, { "epoch": 0.2686286787726988, "grad_norm": 1.815303921699524, "learning_rate": 0.00013155910326086956, "loss": 0.7618, "step": 1716 }, { "epoch": 0.26878522229179713, "grad_norm": 1.2764370441436768, "learning_rate": 0.00013154721467391304, "loss": 0.9438, "step": 1717 }, { "epoch": 0.2689417658108954, "grad_norm": 1.175432801246643, "learning_rate": 0.00013153532608695652, "loss": 0.4818, "step": 1718 }, { "epoch": 0.26909830932999373, "grad_norm": 1.388062596321106, "learning_rate": 0.0001315234375, "loss": 0.5612, "step": 1719 }, { "epoch": 0.26925485284909206, "grad_norm": 2.889636278152466, "learning_rate": 
0.00013151154891304347, "loss": 0.5464, "step": 1720 }, { "epoch": 0.26941139636819034, "grad_norm": 1.9603755474090576, "learning_rate": 0.00013149966032608695, "loss": 0.6225, "step": 1721 }, { "epoch": 0.26956793988728867, "grad_norm": 1.7247517108917236, "learning_rate": 0.00013148777173913043, "loss": 0.6304, "step": 1722 }, { "epoch": 0.269724483406387, "grad_norm": 5.156110763549805, "learning_rate": 0.0001314758831521739, "loss": 0.6827, "step": 1723 }, { "epoch": 0.26988102692548527, "grad_norm": 2.129100799560547, "learning_rate": 0.00013146399456521739, "loss": 0.7994, "step": 1724 }, { "epoch": 0.2700375704445836, "grad_norm": 3.391395330429077, "learning_rate": 0.00013145210597826086, "loss": 0.8592, "step": 1725 }, { "epoch": 0.2701941139636819, "grad_norm": 2.054480791091919, "learning_rate": 0.00013144021739130434, "loss": 0.9876, "step": 1726 }, { "epoch": 0.2703506574827802, "grad_norm": 2.249589443206787, "learning_rate": 0.00013142832880434782, "loss": 0.8222, "step": 1727 }, { "epoch": 0.27050720100187853, "grad_norm": 1.7239291667938232, "learning_rate": 0.0001314164402173913, "loss": 0.7429, "step": 1728 }, { "epoch": 0.27066374452097686, "grad_norm": 1.9185943603515625, "learning_rate": 0.00013140455163043478, "loss": 0.9973, "step": 1729 }, { "epoch": 0.27082028804007513, "grad_norm": 2.8949387073516846, "learning_rate": 0.00013139266304347825, "loss": 0.7595, "step": 1730 }, { "epoch": 0.27097683155917346, "grad_norm": 2.101834297180176, "learning_rate": 0.00013138077445652173, "loss": 1.0956, "step": 1731 }, { "epoch": 0.27113337507827173, "grad_norm": 4.981101989746094, "learning_rate": 0.0001313688858695652, "loss": 1.3228, "step": 1732 }, { "epoch": 0.27128991859737006, "grad_norm": 4.2141594886779785, "learning_rate": 0.0001313569972826087, "loss": 1.0727, "step": 1733 }, { "epoch": 0.2714464621164684, "grad_norm": 3.9989116191864014, "learning_rate": 0.00013134510869565217, "loss": 1.3718, "step": 1734 }, { "epoch": 
0.27160300563556666, "grad_norm": 2.5876946449279785, "learning_rate": 0.00013133322010869564, "loss": 1.0538, "step": 1735 }, { "epoch": 0.271759549154665, "grad_norm": 3.1893906593322754, "learning_rate": 0.00013132133152173912, "loss": 1.1404, "step": 1736 }, { "epoch": 0.2719160926737633, "grad_norm": 4.167642593383789, "learning_rate": 0.0001313094429347826, "loss": 1.8006, "step": 1737 }, { "epoch": 0.2720726361928616, "grad_norm": 2.9479360580444336, "learning_rate": 0.00013129755434782608, "loss": 1.2748, "step": 1738 }, { "epoch": 0.2722291797119599, "grad_norm": 2.9585535526275635, "learning_rate": 0.00013128566576086956, "loss": 1.4371, "step": 1739 }, { "epoch": 0.27238572323105825, "grad_norm": 3.858891487121582, "learning_rate": 0.00013127377717391304, "loss": 2.1697, "step": 1740 }, { "epoch": 0.2725422667501565, "grad_norm": 1.9289121627807617, "learning_rate": 0.00013126188858695651, "loss": 1.1945, "step": 1741 }, { "epoch": 0.27269881026925485, "grad_norm": 5.27614688873291, "learning_rate": 0.00013125, "loss": 1.4668, "step": 1742 }, { "epoch": 0.2728553537883532, "grad_norm": 3.634310483932495, "learning_rate": 0.00013123811141304347, "loss": 1.3177, "step": 1743 }, { "epoch": 0.27301189730745146, "grad_norm": 5.224123477935791, "learning_rate": 0.00013122622282608695, "loss": 1.5269, "step": 1744 }, { "epoch": 0.2731684408265498, "grad_norm": 4.057072639465332, "learning_rate": 0.00013121433423913043, "loss": 1.6025, "step": 1745 }, { "epoch": 0.2733249843456481, "grad_norm": 2.488065004348755, "learning_rate": 0.0001312024456521739, "loss": 1.1624, "step": 1746 }, { "epoch": 0.2734815278647464, "grad_norm": 2.9103729724884033, "learning_rate": 0.00013119055706521738, "loss": 1.3193, "step": 1747 }, { "epoch": 0.2736380713838447, "grad_norm": 2.6319925785064697, "learning_rate": 0.00013117866847826086, "loss": 0.971, "step": 1748 }, { "epoch": 0.27379461490294305, "grad_norm": 2.015558958053589, "learning_rate": 0.00013116677989130434, "loss": 
1.3121, "step": 1749 }, { "epoch": 0.2739511584220413, "grad_norm": 1.4835504293441772, "learning_rate": 0.00013115489130434782, "loss": 1.1212, "step": 1750 }, { "epoch": 0.27410770194113965, "grad_norm": 0.6878445148468018, "learning_rate": 0.0001311430027173913, "loss": 0.4123, "step": 1751 }, { "epoch": 0.2742642454602379, "grad_norm": 0.7793351411819458, "learning_rate": 0.00013113111413043477, "loss": 0.3865, "step": 1752 }, { "epoch": 0.27442078897933625, "grad_norm": 1.7292464971542358, "learning_rate": 0.00013111922554347825, "loss": 0.4617, "step": 1753 }, { "epoch": 0.2745773324984346, "grad_norm": 1.0223158597946167, "learning_rate": 0.00013110733695652173, "loss": 0.367, "step": 1754 }, { "epoch": 0.27473387601753285, "grad_norm": 0.6096580624580383, "learning_rate": 0.0001310954483695652, "loss": 0.3167, "step": 1755 }, { "epoch": 0.2748904195366312, "grad_norm": 1.073090672492981, "learning_rate": 0.00013108355978260869, "loss": 0.5213, "step": 1756 }, { "epoch": 0.2750469630557295, "grad_norm": 1.4813119173049927, "learning_rate": 0.00013107167119565216, "loss": 0.4714, "step": 1757 }, { "epoch": 0.2752035065748278, "grad_norm": 1.1490185260772705, "learning_rate": 0.00013105978260869564, "loss": 0.5013, "step": 1758 }, { "epoch": 0.2753600500939261, "grad_norm": 0.9912382364273071, "learning_rate": 0.00013104789402173912, "loss": 0.4544, "step": 1759 }, { "epoch": 0.27551659361302444, "grad_norm": 3.0413525104522705, "learning_rate": 0.0001310360054347826, "loss": 0.5104, "step": 1760 }, { "epoch": 0.2756731371321227, "grad_norm": 1.1774282455444336, "learning_rate": 0.00013102411684782608, "loss": 0.4444, "step": 1761 }, { "epoch": 0.27582968065122104, "grad_norm": 0.9276003241539001, "learning_rate": 0.00013101222826086956, "loss": 0.3799, "step": 1762 }, { "epoch": 0.27598622417031937, "grad_norm": 1.706849217414856, "learning_rate": 0.00013100033967391303, "loss": 0.5282, "step": 1763 }, { "epoch": 0.27614276768941765, "grad_norm": 
2.3933637142181396, "learning_rate": 0.0001309884510869565, "loss": 0.6306, "step": 1764 }, { "epoch": 0.276299311208516, "grad_norm": 1.9308264255523682, "learning_rate": 0.0001309765625, "loss": 0.5215, "step": 1765 }, { "epoch": 0.2764558547276143, "grad_norm": 2.8655917644500732, "learning_rate": 0.00013096467391304347, "loss": 0.8707, "step": 1766 }, { "epoch": 0.2766123982467126, "grad_norm": 1.6037839651107788, "learning_rate": 0.00013095278532608695, "loss": 0.6724, "step": 1767 }, { "epoch": 0.2767689417658109, "grad_norm": 1.3326549530029297, "learning_rate": 0.00013094089673913042, "loss": 0.7929, "step": 1768 }, { "epoch": 0.2769254852849092, "grad_norm": 1.4208683967590332, "learning_rate": 0.0001309290081521739, "loss": 0.6046, "step": 1769 }, { "epoch": 0.2770820288040075, "grad_norm": 1.0622613430023193, "learning_rate": 0.00013091711956521738, "loss": 0.5457, "step": 1770 }, { "epoch": 0.27723857232310584, "grad_norm": 2.1251227855682373, "learning_rate": 0.00013090523097826086, "loss": 0.784, "step": 1771 }, { "epoch": 0.2773951158422041, "grad_norm": 5.092341899871826, "learning_rate": 0.00013089334239130434, "loss": 0.8708, "step": 1772 }, { "epoch": 0.27755165936130244, "grad_norm": 2.455476760864258, "learning_rate": 0.00013088145380434781, "loss": 0.6405, "step": 1773 }, { "epoch": 0.27770820288040077, "grad_norm": 1.4771926403045654, "learning_rate": 0.0001308695652173913, "loss": 0.7479, "step": 1774 }, { "epoch": 0.27786474639949904, "grad_norm": 2.720136880874634, "learning_rate": 0.00013085767663043477, "loss": 1.0738, "step": 1775 }, { "epoch": 0.27802128991859737, "grad_norm": 2.424391746520996, "learning_rate": 0.00013084578804347825, "loss": 1.1301, "step": 1776 }, { "epoch": 0.2781778334376957, "grad_norm": 1.795849084854126, "learning_rate": 0.00013083389945652173, "loss": 0.6873, "step": 1777 }, { "epoch": 0.27833437695679397, "grad_norm": 1.6143054962158203, "learning_rate": 0.0001308220108695652, "loss": 0.8984, "step": 1778 }, 
{ "epoch": 0.2784909204758923, "grad_norm": 1.8885301351547241, "learning_rate": 0.00013081012228260868, "loss": 0.7609, "step": 1779 }, { "epoch": 0.27864746399499063, "grad_norm": 3.9236578941345215, "learning_rate": 0.00013079823369565216, "loss": 0.9296, "step": 1780 }, { "epoch": 0.2788040075140889, "grad_norm": 2.481388568878174, "learning_rate": 0.00013078634510869564, "loss": 0.944, "step": 1781 }, { "epoch": 0.27896055103318723, "grad_norm": 2.337458372116089, "learning_rate": 0.00013077445652173912, "loss": 0.6465, "step": 1782 }, { "epoch": 0.27911709455228556, "grad_norm": 4.378272533416748, "learning_rate": 0.0001307625679347826, "loss": 1.6135, "step": 1783 }, { "epoch": 0.27927363807138383, "grad_norm": 2.255790948867798, "learning_rate": 0.00013075067934782607, "loss": 1.4189, "step": 1784 }, { "epoch": 0.27943018159048216, "grad_norm": 2.671877861022949, "learning_rate": 0.00013073879076086955, "loss": 0.94, "step": 1785 }, { "epoch": 0.27958672510958044, "grad_norm": 2.615446090698242, "learning_rate": 0.00013072690217391303, "loss": 1.4287, "step": 1786 }, { "epoch": 0.27974326862867877, "grad_norm": 3.436405658721924, "learning_rate": 0.0001307150135869565, "loss": 1.445, "step": 1787 }, { "epoch": 0.2798998121477771, "grad_norm": 3.1122164726257324, "learning_rate": 0.000130703125, "loss": 1.3743, "step": 1788 }, { "epoch": 0.28005635566687537, "grad_norm": 6.861082553863525, "learning_rate": 0.00013069123641304347, "loss": 1.27, "step": 1789 }, { "epoch": 0.2802128991859737, "grad_norm": 3.9584569931030273, "learning_rate": 0.00013067934782608694, "loss": 1.4008, "step": 1790 }, { "epoch": 0.280369442705072, "grad_norm": 5.784465312957764, "learning_rate": 0.00013066745923913042, "loss": 1.5061, "step": 1791 }, { "epoch": 0.2805259862241703, "grad_norm": 3.3332810401916504, "learning_rate": 0.0001306555706521739, "loss": 1.2909, "step": 1792 }, { "epoch": 0.2806825297432686, "grad_norm": 5.123645305633545, "learning_rate": 
0.00013064368206521738, "loss": 1.3969, "step": 1793 }, { "epoch": 0.28083907326236696, "grad_norm": 2.8497986793518066, "learning_rate": 0.00013063179347826086, "loss": 1.0398, "step": 1794 }, { "epoch": 0.28099561678146523, "grad_norm": 3.0624849796295166, "learning_rate": 0.00013061990489130433, "loss": 2.0607, "step": 1795 }, { "epoch": 0.28115216030056356, "grad_norm": 2.6404590606689453, "learning_rate": 0.0001306080163043478, "loss": 1.1569, "step": 1796 }, { "epoch": 0.2813087038196619, "grad_norm": 3.3102316856384277, "learning_rate": 0.0001305961277173913, "loss": 0.819, "step": 1797 }, { "epoch": 0.28146524733876016, "grad_norm": 2.5928115844726562, "learning_rate": 0.00013058423913043477, "loss": 0.7946, "step": 1798 }, { "epoch": 0.2816217908578585, "grad_norm": 1.6684446334838867, "learning_rate": 0.00013057235054347825, "loss": 1.5216, "step": 1799 }, { "epoch": 0.2817783343769568, "grad_norm": 2.151193618774414, "learning_rate": 0.00013056046195652172, "loss": 1.3652, "step": 1800 }, { "epoch": 0.2819348778960551, "grad_norm": 0.7368059754371643, "learning_rate": 0.0001305485733695652, "loss": 0.408, "step": 1801 }, { "epoch": 0.2820914214151534, "grad_norm": 0.6979202628135681, "learning_rate": 0.00013053668478260868, "loss": 0.3859, "step": 1802 }, { "epoch": 0.28224796493425175, "grad_norm": 0.7594988346099854, "learning_rate": 0.00013052479619565216, "loss": 0.452, "step": 1803 }, { "epoch": 0.28240450845335, "grad_norm": 1.0655336380004883, "learning_rate": 0.00013051290760869564, "loss": 0.3727, "step": 1804 }, { "epoch": 0.28256105197244835, "grad_norm": 0.7894644737243652, "learning_rate": 0.00013050101902173912, "loss": 0.429, "step": 1805 }, { "epoch": 0.2827175954915466, "grad_norm": 1.001785159111023, "learning_rate": 0.0001304891304347826, "loss": 0.4995, "step": 1806 }, { "epoch": 0.28287413901064495, "grad_norm": 0.7456864714622498, "learning_rate": 0.00013047724184782607, "loss": 0.4501, "step": 1807 }, { "epoch": 0.2830306825297433, 
"grad_norm": 1.0624433755874634, "learning_rate": 0.00013046535326086955, "loss": 0.5081, "step": 1808 }, { "epoch": 0.28318722604884156, "grad_norm": 1.7353073358535767, "learning_rate": 0.00013045346467391303, "loss": 0.5324, "step": 1809 }, { "epoch": 0.2833437695679399, "grad_norm": 1.4701814651489258, "learning_rate": 0.0001304415760869565, "loss": 0.5572, "step": 1810 }, { "epoch": 0.2835003130870382, "grad_norm": 1.8427650928497314, "learning_rate": 0.00013042968749999998, "loss": 0.5755, "step": 1811 }, { "epoch": 0.2836568566061365, "grad_norm": 1.3868951797485352, "learning_rate": 0.00013041779891304346, "loss": 0.5776, "step": 1812 }, { "epoch": 0.2838134001252348, "grad_norm": 1.1609983444213867, "learning_rate": 0.00013040591032608694, "loss": 0.512, "step": 1813 }, { "epoch": 0.28396994364433314, "grad_norm": 2.921144962310791, "learning_rate": 0.00013039402173913042, "loss": 0.842, "step": 1814 }, { "epoch": 0.2841264871634314, "grad_norm": 1.2857376337051392, "learning_rate": 0.0001303821331521739, "loss": 0.5858, "step": 1815 }, { "epoch": 0.28428303068252975, "grad_norm": 1.3938236236572266, "learning_rate": 0.00013037024456521738, "loss": 0.5372, "step": 1816 }, { "epoch": 0.2844395742016281, "grad_norm": 1.316893458366394, "learning_rate": 0.00013035835597826085, "loss": 0.6858, "step": 1817 }, { "epoch": 0.28459611772072635, "grad_norm": 1.180131435394287, "learning_rate": 0.00013034646739130433, "loss": 0.5886, "step": 1818 }, { "epoch": 0.2847526612398247, "grad_norm": 2.4792895317077637, "learning_rate": 0.0001303345788043478, "loss": 0.7681, "step": 1819 }, { "epoch": 0.284909204758923, "grad_norm": 1.6231844425201416, "learning_rate": 0.0001303226902173913, "loss": 0.5425, "step": 1820 }, { "epoch": 0.2850657482780213, "grad_norm": 2.0413460731506348, "learning_rate": 0.00013031080163043477, "loss": 0.6662, "step": 1821 }, { "epoch": 0.2852222917971196, "grad_norm": 1.3662265539169312, "learning_rate": 0.00013029891304347824, "loss": 
0.7108, "step": 1822 }, { "epoch": 0.2853788353162179, "grad_norm": 1.7527470588684082, "learning_rate": 0.00013028702445652172, "loss": 0.6628, "step": 1823 }, { "epoch": 0.2855353788353162, "grad_norm": 3.958101987838745, "learning_rate": 0.0001302751358695652, "loss": 1.0185, "step": 1824 }, { "epoch": 0.28569192235441454, "grad_norm": 2.4435555934906006, "learning_rate": 0.00013026324728260868, "loss": 1.0174, "step": 1825 }, { "epoch": 0.2858484658735128, "grad_norm": 1.7471866607666016, "learning_rate": 0.00013025135869565216, "loss": 0.6986, "step": 1826 }, { "epoch": 0.28600500939261114, "grad_norm": 2.5032992362976074, "learning_rate": 0.00013023947010869564, "loss": 0.8474, "step": 1827 }, { "epoch": 0.28616155291170947, "grad_norm": 2.0306198596954346, "learning_rate": 0.0001302275815217391, "loss": 0.8933, "step": 1828 }, { "epoch": 0.28631809643080774, "grad_norm": 2.328265428543091, "learning_rate": 0.0001302156929347826, "loss": 1.0538, "step": 1829 }, { "epoch": 0.2864746399499061, "grad_norm": 2.8509438037872314, "learning_rate": 0.00013020380434782607, "loss": 0.996, "step": 1830 }, { "epoch": 0.2866311834690044, "grad_norm": 1.6224405765533447, "learning_rate": 0.00013019191576086955, "loss": 0.6185, "step": 1831 }, { "epoch": 0.2867877269881027, "grad_norm": 2.904491901397705, "learning_rate": 0.00013018002717391303, "loss": 1.1333, "step": 1832 }, { "epoch": 0.286944270507201, "grad_norm": 3.994330883026123, "learning_rate": 0.0001301681385869565, "loss": 0.9702, "step": 1833 }, { "epoch": 0.28710081402629933, "grad_norm": 2.9747259616851807, "learning_rate": 0.00013015624999999998, "loss": 1.0918, "step": 1834 }, { "epoch": 0.2872573575453976, "grad_norm": 2.4779093265533447, "learning_rate": 0.00013014436141304346, "loss": 1.0387, "step": 1835 }, { "epoch": 0.28741390106449594, "grad_norm": 1.7374556064605713, "learning_rate": 0.00013013247282608694, "loss": 0.8885, "step": 1836 }, { "epoch": 0.28757044458359426, "grad_norm": 
3.847935438156128, "learning_rate": 0.00013012058423913042, "loss": 1.2228, "step": 1837 }, { "epoch": 0.28772698810269254, "grad_norm": 3.059866189956665, "learning_rate": 0.0001301086956521739, "loss": 0.9307, "step": 1838 }, { "epoch": 0.28788353162179087, "grad_norm": 2.9234864711761475, "learning_rate": 0.00013009680706521737, "loss": 1.1051, "step": 1839 }, { "epoch": 0.28804007514088914, "grad_norm": 2.672654628753662, "learning_rate": 0.00013008491847826085, "loss": 1.3859, "step": 1840 }, { "epoch": 0.28819661865998747, "grad_norm": 3.7434275150299072, "learning_rate": 0.00013007302989130433, "loss": 1.3454, "step": 1841 }, { "epoch": 0.2883531621790858, "grad_norm": 4.585793972015381, "learning_rate": 0.0001300611413043478, "loss": 1.5537, "step": 1842 }, { "epoch": 0.28850970569818407, "grad_norm": 2.3395538330078125, "learning_rate": 0.00013004925271739129, "loss": 1.7292, "step": 1843 }, { "epoch": 0.2886662492172824, "grad_norm": 4.517784118652344, "learning_rate": 0.0001300373641304348, "loss": 1.7832, "step": 1844 }, { "epoch": 0.28882279273638073, "grad_norm": 4.467792510986328, "learning_rate": 0.00013002547554347827, "loss": 2.0586, "step": 1845 }, { "epoch": 0.288979336255479, "grad_norm": 1.9136050939559937, "learning_rate": 0.00013001358695652172, "loss": 1.0399, "step": 1846 }, { "epoch": 0.28913587977457733, "grad_norm": 3.462797164916992, "learning_rate": 0.0001300016983695652, "loss": 1.7909, "step": 1847 }, { "epoch": 0.28929242329367566, "grad_norm": 4.35165548324585, "learning_rate": 0.00012998980978260868, "loss": 1.3321, "step": 1848 }, { "epoch": 0.28944896681277393, "grad_norm": 2.8734819889068604, "learning_rate": 0.00012997792119565215, "loss": 1.3305, "step": 1849 }, { "epoch": 0.28960551033187226, "grad_norm": 2.8559577465057373, "learning_rate": 0.00012996603260869563, "loss": 1.5002, "step": 1850 }, { "epoch": 0.2897620538509706, "grad_norm": 0.7629559636116028, "learning_rate": 0.0001299541440217391, "loss": 0.5029, "step": 
1851 }, { "epoch": 0.28991859737006886, "grad_norm": 0.7242524027824402, "learning_rate": 0.0001299422554347826, "loss": 0.2794, "step": 1852 }, { "epoch": 0.2900751408891672, "grad_norm": 0.8066074252128601, "learning_rate": 0.00012993036684782607, "loss": 0.3765, "step": 1853 }, { "epoch": 0.2902316844082655, "grad_norm": 0.8926177024841309, "learning_rate": 0.00012991847826086955, "loss": 0.4635, "step": 1854 }, { "epoch": 0.2903882279273638, "grad_norm": 0.817337691783905, "learning_rate": 0.00012990658967391302, "loss": 0.4342, "step": 1855 }, { "epoch": 0.2905447714464621, "grad_norm": 2.752033233642578, "learning_rate": 0.0001298947010869565, "loss": 0.8995, "step": 1856 }, { "epoch": 0.29070131496556045, "grad_norm": 0.8265368938446045, "learning_rate": 0.00012988281249999998, "loss": 0.5469, "step": 1857 }, { "epoch": 0.2908578584846587, "grad_norm": 0.7180613875389099, "learning_rate": 0.00012987092391304346, "loss": 0.2655, "step": 1858 }, { "epoch": 0.29101440200375706, "grad_norm": 0.9130600094795227, "learning_rate": 0.00012985903532608694, "loss": 0.3797, "step": 1859 }, { "epoch": 0.29117094552285533, "grad_norm": 0.9848387837409973, "learning_rate": 0.00012984714673913041, "loss": 0.3319, "step": 1860 }, { "epoch": 0.29132748904195366, "grad_norm": 1.2787880897521973, "learning_rate": 0.0001298352581521739, "loss": 0.5011, "step": 1861 }, { "epoch": 0.291484032561052, "grad_norm": 2.0281686782836914, "learning_rate": 0.00012982336956521737, "loss": 0.4503, "step": 1862 }, { "epoch": 0.29164057608015026, "grad_norm": 1.9242205619812012, "learning_rate": 0.00012981148097826085, "loss": 0.644, "step": 1863 }, { "epoch": 0.2917971195992486, "grad_norm": 1.546212077140808, "learning_rate": 0.00012979959239130435, "loss": 0.4559, "step": 1864 }, { "epoch": 0.2919536631183469, "grad_norm": 1.4780254364013672, "learning_rate": 0.00012978770380434783, "loss": 0.5431, "step": 1865 }, { "epoch": 0.2921102066374452, "grad_norm": 1.4987674951553345, 
"learning_rate": 0.0001297758152173913, "loss": 0.7082, "step": 1866 }, { "epoch": 0.2922667501565435, "grad_norm": 1.8761472702026367, "learning_rate": 0.0001297639266304348, "loss": 0.6771, "step": 1867 }, { "epoch": 0.29242329367564185, "grad_norm": 1.692105770111084, "learning_rate": 0.00012975203804347827, "loss": 0.7661, "step": 1868 }, { "epoch": 0.2925798371947401, "grad_norm": 2.538506031036377, "learning_rate": 0.00012974014945652172, "loss": 0.8948, "step": 1869 }, { "epoch": 0.29273638071383845, "grad_norm": 2.5544934272766113, "learning_rate": 0.0001297282608695652, "loss": 0.8622, "step": 1870 }, { "epoch": 0.2928929242329368, "grad_norm": 1.7024216651916504, "learning_rate": 0.00012971637228260867, "loss": 0.9824, "step": 1871 }, { "epoch": 0.29304946775203505, "grad_norm": 1.4630886316299438, "learning_rate": 0.00012970448369565215, "loss": 0.5957, "step": 1872 }, { "epoch": 0.2932060112711334, "grad_norm": 1.4507321119308472, "learning_rate": 0.00012969259510869563, "loss": 0.6726, "step": 1873 }, { "epoch": 0.2933625547902317, "grad_norm": 2.015676259994507, "learning_rate": 0.0001296807065217391, "loss": 0.6512, "step": 1874 }, { "epoch": 0.29351909830933, "grad_norm": 2.7799665927886963, "learning_rate": 0.0001296688179347826, "loss": 0.8909, "step": 1875 }, { "epoch": 0.2936756418284283, "grad_norm": 2.70676326751709, "learning_rate": 0.00012965692934782606, "loss": 0.8287, "step": 1876 }, { "epoch": 0.2938321853475266, "grad_norm": 2.152337074279785, "learning_rate": 0.00012964504076086954, "loss": 0.9313, "step": 1877 }, { "epoch": 0.2939887288666249, "grad_norm": 2.7132723331451416, "learning_rate": 0.00012963315217391302, "loss": 0.7231, "step": 1878 }, { "epoch": 0.29414527238572324, "grad_norm": 2.3062238693237305, "learning_rate": 0.0001296212635869565, "loss": 1.1973, "step": 1879 }, { "epoch": 0.2943018159048215, "grad_norm": 3.902287721633911, "learning_rate": 0.00012960937499999998, "loss": 1.1206, "step": 1880 }, { "epoch": 
0.29445835942391985, "grad_norm": 2.6069366931915283, "learning_rate": 0.00012959748641304346, "loss": 1.1603, "step": 1881 }, { "epoch": 0.2946149029430182, "grad_norm": 2.8978781700134277, "learning_rate": 0.00012958559782608693, "loss": 1.2202, "step": 1882 }, { "epoch": 0.29477144646211645, "grad_norm": 2.627462148666382, "learning_rate": 0.00012957370923913044, "loss": 1.0909, "step": 1883 }, { "epoch": 0.2949279899812148, "grad_norm": 5.56995153427124, "learning_rate": 0.00012956182065217392, "loss": 1.4722, "step": 1884 }, { "epoch": 0.2950845335003131, "grad_norm": 4.747687816619873, "learning_rate": 0.0001295499320652174, "loss": 1.3674, "step": 1885 }, { "epoch": 0.2952410770194114, "grad_norm": 3.677747964859009, "learning_rate": 0.00012953804347826087, "loss": 1.7019, "step": 1886 }, { "epoch": 0.2953976205385097, "grad_norm": 5.182453632354736, "learning_rate": 0.00012952615489130435, "loss": 0.9613, "step": 1887 }, { "epoch": 0.29555416405760804, "grad_norm": 4.827957630157471, "learning_rate": 0.00012951426630434783, "loss": 1.6248, "step": 1888 }, { "epoch": 0.2957107075767063, "grad_norm": 2.750530958175659, "learning_rate": 0.0001295023777173913, "loss": 1.4756, "step": 1889 }, { "epoch": 0.29586725109580464, "grad_norm": 3.4063305854797363, "learning_rate": 0.00012949048913043479, "loss": 0.9446, "step": 1890 }, { "epoch": 0.29602379461490297, "grad_norm": 3.702359437942505, "learning_rate": 0.00012947860054347826, "loss": 1.3061, "step": 1891 }, { "epoch": 0.29618033813400124, "grad_norm": 7.412850379943848, "learning_rate": 0.00012946671195652172, "loss": 1.3999, "step": 1892 }, { "epoch": 0.29633688165309957, "grad_norm": 5.539935111999512, "learning_rate": 0.0001294548233695652, "loss": 1.735, "step": 1893 }, { "epoch": 0.29649342517219784, "grad_norm": 3.3781418800354004, "learning_rate": 0.00012944293478260867, "loss": 2.0082, "step": 1894 }, { "epoch": 0.2966499686912962, "grad_norm": 5.937712669372559, "learning_rate": 
0.00012943104619565215, "loss": 1.1923, "step": 1895 }, { "epoch": 0.2968065122103945, "grad_norm": NaN, "learning_rate": 0.00012943104619565215, "loss": 0.0, "step": 1896 }, { "epoch": 0.2969630557294928, "grad_norm": 2.6432459354400635, "learning_rate": 0.00012941915760869563, "loss": 1.0927, "step": 1897 }, { "epoch": 0.2971195992485911, "grad_norm": 5.214190483093262, "learning_rate": 0.0001294072690217391, "loss": 0.7804, "step": 1898 }, { "epoch": 0.29727614276768943, "grad_norm": 2.9769136905670166, "learning_rate": 0.00012939538043478258, "loss": 1.0075, "step": 1899 }, { "epoch": 0.2974326862867877, "grad_norm": 3.5729973316192627, "learning_rate": 0.00012938349184782606, "loss": 1.2963, "step": 1900 }, { "epoch": 0.29758922980588604, "grad_norm": 0.6903343200683594, "learning_rate": 0.00012937160326086954, "loss": 0.3936, "step": 1901 }, { "epoch": 0.29774577332498436, "grad_norm": 0.6262027621269226, "learning_rate": 0.00012935971467391302, "loss": 0.4333, "step": 1902 }, { "epoch": 0.29790231684408264, "grad_norm": 0.7837537527084351, "learning_rate": 0.0001293478260869565, "loss": 0.3443, "step": 1903 }, { "epoch": 0.29805886036318097, "grad_norm": 1.1239062547683716, "learning_rate": 0.0001293359375, "loss": 0.3214, "step": 1904 }, { "epoch": 0.2982154038822793, "grad_norm": 0.8480125069618225, "learning_rate": 0.00012932404891304348, "loss": 0.3442, "step": 1905 }, { "epoch": 0.29837194740137757, "grad_norm": 1.0205204486846924, "learning_rate": 0.00012931216032608696, "loss": 0.3958, "step": 1906 }, { "epoch": 0.2985284909204759, "grad_norm": 1.1068170070648193, "learning_rate": 0.00012930027173913044, "loss": 0.4388, "step": 1907 }, { "epoch": 0.2986850344395742, "grad_norm": 1.186209797859192, "learning_rate": 0.00012928838315217391, "loss": 0.4648, "step": 1908 }, { "epoch": 0.2988415779586725, "grad_norm": 0.9373310208320618, "learning_rate": 0.0001292764945652174, "loss": 0.3966, "step": 1909 }, { "epoch": 0.29899812147777083, "grad_norm": 
1.6830101013183594, "learning_rate": 0.00012926460597826087, "loss": 0.9516, "step": 1910 }, { "epoch": 0.29915466499686916, "grad_norm": 1.4225823879241943, "learning_rate": 0.00012925271739130435, "loss": 0.5147, "step": 1911 }, { "epoch": 0.29931120851596743, "grad_norm": 1.3201262950897217, "learning_rate": 0.00012924082880434783, "loss": 0.5022, "step": 1912 }, { "epoch": 0.29946775203506576, "grad_norm": 0.9612787961959839, "learning_rate": 0.0001292289402173913, "loss": 0.5163, "step": 1913 }, { "epoch": 0.29962429555416403, "grad_norm": 1.1251895427703857, "learning_rate": 0.00012921705163043478, "loss": 0.629, "step": 1914 }, { "epoch": 0.29978083907326236, "grad_norm": 2.488968849182129, "learning_rate": 0.00012920516304347826, "loss": 0.7532, "step": 1915 }, { "epoch": 0.2999373825923607, "grad_norm": 1.6762433052062988, "learning_rate": 0.0001291932744565217, "loss": 0.5006, "step": 1916 }, { "epoch": 0.30009392611145896, "grad_norm": 2.3939027786254883, "learning_rate": 0.0001291813858695652, "loss": 0.4302, "step": 1917 }, { "epoch": 0.3002504696305573, "grad_norm": 1.432478666305542, "learning_rate": 0.00012916949728260867, "loss": 0.5893, "step": 1918 }, { "epoch": 0.3004070131496556, "grad_norm": 3.5417850017547607, "learning_rate": 0.00012915760869565215, "loss": 0.5905, "step": 1919 }, { "epoch": 0.3005635566687539, "grad_norm": 2.2391293048858643, "learning_rate": 0.00012914572010869563, "loss": 1.1089, "step": 1920 }, { "epoch": 0.3007201001878522, "grad_norm": 1.4063631296157837, "learning_rate": 0.0001291338315217391, "loss": 0.771, "step": 1921 }, { "epoch": 0.30087664370695055, "grad_norm": 2.146338701248169, "learning_rate": 0.00012912194293478258, "loss": 0.7771, "step": 1922 }, { "epoch": 0.3010331872260488, "grad_norm": 3.4744086265563965, "learning_rate": 0.00012911005434782606, "loss": 0.7068, "step": 1923 }, { "epoch": 0.30118973074514716, "grad_norm": 1.6833046674728394, "learning_rate": 0.00012909816576086957, "loss": 0.4393, 
"step": 1924 }, { "epoch": 0.3013462742642455, "grad_norm": 6.197470188140869, "learning_rate": 0.00012908627717391304, "loss": 1.4676, "step": 1925 }, { "epoch": 0.30150281778334376, "grad_norm": 2.446017265319824, "learning_rate": 0.00012907438858695652, "loss": 0.95, "step": 1926 }, { "epoch": 0.3016593613024421, "grad_norm": 2.272367477416992, "learning_rate": 0.0001290625, "loss": 0.9095, "step": 1927 }, { "epoch": 0.3018159048215404, "grad_norm": 5.511720180511475, "learning_rate": 0.00012905061141304348, "loss": 1.6161, "step": 1928 }, { "epoch": 0.3019724483406387, "grad_norm": 1.9593899250030518, "learning_rate": 0.00012903872282608696, "loss": 0.8704, "step": 1929 }, { "epoch": 0.302128991859737, "grad_norm": 2.4725069999694824, "learning_rate": 0.00012902683423913043, "loss": 0.8219, "step": 1930 }, { "epoch": 0.3022855353788353, "grad_norm": 4.2200798988342285, "learning_rate": 0.0001290149456521739, "loss": 1.0075, "step": 1931 }, { "epoch": 0.3024420788979336, "grad_norm": 1.9041141271591187, "learning_rate": 0.0001290030570652174, "loss": 0.8388, "step": 1932 }, { "epoch": 0.30259862241703195, "grad_norm": 2.6128487586975098, "learning_rate": 0.00012899116847826087, "loss": 1.2094, "step": 1933 }, { "epoch": 0.3027551659361302, "grad_norm": 5.955155849456787, "learning_rate": 0.00012897927989130435, "loss": 0.9809, "step": 1934 }, { "epoch": 0.30291170945522855, "grad_norm": 2.143261432647705, "learning_rate": 0.00012896739130434782, "loss": 1.3753, "step": 1935 }, { "epoch": 0.3030682529743269, "grad_norm": 2.149247169494629, "learning_rate": 0.0001289555027173913, "loss": 1.5327, "step": 1936 }, { "epoch": 0.30322479649342515, "grad_norm": 6.526374340057373, "learning_rate": 0.00012894361413043478, "loss": 1.6077, "step": 1937 }, { "epoch": 0.3033813400125235, "grad_norm": 1.899046540260315, "learning_rate": 0.00012893172554347826, "loss": 1.2696, "step": 1938 }, { "epoch": 0.3035378835316218, "grad_norm": 2.8614537715911865, "learning_rate": 
0.0001289198369565217, "loss": 1.3443, "step": 1939 }, { "epoch": 0.3036944270507201, "grad_norm": 4.185934543609619, "learning_rate": 0.0001289079483695652, "loss": 1.2619, "step": 1940 }, { "epoch": 0.3038509705698184, "grad_norm": 3.298905849456787, "learning_rate": 0.00012889605978260867, "loss": 1.7674, "step": 1941 }, { "epoch": 0.30400751408891674, "grad_norm": 8.126358985900879, "learning_rate": 0.00012888417119565214, "loss": 1.3492, "step": 1942 }, { "epoch": 0.304164057608015, "grad_norm": 4.395397663116455, "learning_rate": 0.00012887228260869562, "loss": 1.4496, "step": 1943 }, { "epoch": 0.30432060112711334, "grad_norm": 3.9913604259490967, "learning_rate": 0.00012886039402173913, "loss": 1.5517, "step": 1944 }, { "epoch": 0.3044771446462117, "grad_norm": 3.4465482234954834, "learning_rate": 0.0001288485054347826, "loss": 1.5222, "step": 1945 }, { "epoch": 0.30463368816530995, "grad_norm": 2.3812568187713623, "learning_rate": 0.00012883661684782608, "loss": 1.3632, "step": 1946 }, { "epoch": 0.3047902316844083, "grad_norm": 2.6299450397491455, "learning_rate": 0.00012882472826086956, "loss": 0.9959, "step": 1947 }, { "epoch": 0.30494677520350655, "grad_norm": 4.751199245452881, "learning_rate": 0.00012881283967391304, "loss": 1.3225, "step": 1948 }, { "epoch": 0.3051033187226049, "grad_norm": 3.4646661281585693, "learning_rate": 0.00012880095108695652, "loss": 1.1039, "step": 1949 }, { "epoch": 0.3052598622417032, "grad_norm": 2.408487558364868, "learning_rate": 0.0001287890625, "loss": 1.3578, "step": 1950 }, { "epoch": 0.3054164057608015, "grad_norm": 0.9276137948036194, "learning_rate": 0.00012877717391304348, "loss": 0.4122, "step": 1951 }, { "epoch": 0.3055729492798998, "grad_norm": 1.0094804763793945, "learning_rate": 0.00012876528532608695, "loss": 0.4436, "step": 1952 }, { "epoch": 0.30572949279899814, "grad_norm": 0.7859315276145935, "learning_rate": 0.00012875339673913043, "loss": 0.367, "step": 1953 }, { "epoch": 0.3058860363180964, 
"grad_norm": 0.6343227624893188, "learning_rate": 0.0001287415081521739, "loss": 0.3751, "step": 1954 }, { "epoch": 0.30604257983719474, "grad_norm": 1.3266167640686035, "learning_rate": 0.0001287296195652174, "loss": 0.4672, "step": 1955 }, { "epoch": 0.30619912335629307, "grad_norm": 1.2268856763839722, "learning_rate": 0.00012871773097826087, "loss": 0.4386, "step": 1956 }, { "epoch": 0.30635566687539134, "grad_norm": 0.6165496110916138, "learning_rate": 0.00012870584239130434, "loss": 0.3163, "step": 1957 }, { "epoch": 0.30651221039448967, "grad_norm": 0.9123106002807617, "learning_rate": 0.00012869395380434782, "loss": 0.4279, "step": 1958 }, { "epoch": 0.306668753913588, "grad_norm": 1.6710667610168457, "learning_rate": 0.0001286820652173913, "loss": 0.6563, "step": 1959 }, { "epoch": 0.3068252974326863, "grad_norm": 0.8497854471206665, "learning_rate": 0.00012867017663043478, "loss": 0.3409, "step": 1960 }, { "epoch": 0.3069818409517846, "grad_norm": 1.2684895992279053, "learning_rate": 0.00012865828804347826, "loss": 0.5042, "step": 1961 }, { "epoch": 0.30713838447088293, "grad_norm": 1.0742262601852417, "learning_rate": 0.0001286463994565217, "loss": 0.46, "step": 1962 }, { "epoch": 0.3072949279899812, "grad_norm": 2.347844123840332, "learning_rate": 0.00012863451086956519, "loss": 0.4709, "step": 1963 }, { "epoch": 0.30745147150907953, "grad_norm": 2.058762550354004, "learning_rate": 0.0001286226222826087, "loss": 0.7188, "step": 1964 }, { "epoch": 0.3076080150281778, "grad_norm": 2.2945525646209717, "learning_rate": 0.00012861073369565217, "loss": 0.7345, "step": 1965 }, { "epoch": 0.30776455854727613, "grad_norm": 1.4691871404647827, "learning_rate": 0.00012859884510869565, "loss": 0.8672, "step": 1966 }, { "epoch": 0.30792110206637446, "grad_norm": 1.3400033712387085, "learning_rate": 0.00012858695652173913, "loss": 0.5486, "step": 1967 }, { "epoch": 0.30807764558547274, "grad_norm": 1.8109097480773926, "learning_rate": 0.0001285750679347826, "loss": 
0.6923, "step": 1968 }, { "epoch": 0.30823418910457107, "grad_norm": 1.947255253791809, "learning_rate": 0.00012856317934782608, "loss": 0.5967, "step": 1969 }, { "epoch": 0.3083907326236694, "grad_norm": 1.8979631662368774, "learning_rate": 0.00012855129076086956, "loss": 0.7573, "step": 1970 }, { "epoch": 0.30854727614276767, "grad_norm": 3.378042221069336, "learning_rate": 0.00012853940217391304, "loss": 0.7173, "step": 1971 }, { "epoch": 0.308703819661866, "grad_norm": 2.7298340797424316, "learning_rate": 0.00012852751358695652, "loss": 1.0047, "step": 1972 }, { "epoch": 0.3088603631809643, "grad_norm": 3.457345485687256, "learning_rate": 0.000128515625, "loss": 1.1157, "step": 1973 }, { "epoch": 0.3090169067000626, "grad_norm": 1.8782943487167358, "learning_rate": 0.00012850373641304347, "loss": 0.7409, "step": 1974 }, { "epoch": 0.30917345021916093, "grad_norm": 4.0386576652526855, "learning_rate": 0.00012849184782608695, "loss": 0.7333, "step": 1975 }, { "epoch": 0.30932999373825926, "grad_norm": 3.3666348457336426, "learning_rate": 0.00012847995923913043, "loss": 0.7425, "step": 1976 }, { "epoch": 0.30948653725735753, "grad_norm": 5.3515238761901855, "learning_rate": 0.0001284680706521739, "loss": 0.8805, "step": 1977 }, { "epoch": 0.30964308077645586, "grad_norm": 2.6234452724456787, "learning_rate": 0.00012845618206521739, "loss": 0.9477, "step": 1978 }, { "epoch": 0.3097996242955542, "grad_norm": 2.5419058799743652, "learning_rate": 0.00012844429347826086, "loss": 0.9396, "step": 1979 }, { "epoch": 0.30995616781465246, "grad_norm": 1.92665433883667, "learning_rate": 0.00012843240489130434, "loss": 0.8596, "step": 1980 }, { "epoch": 0.3101127113337508, "grad_norm": 1.7717418670654297, "learning_rate": 0.00012842051630434782, "loss": 0.762, "step": 1981 }, { "epoch": 0.3102692548528491, "grad_norm": 2.702404737472534, "learning_rate": 0.0001284086277173913, "loss": 1.2426, "step": 1982 }, { "epoch": 0.3104257983719474, "grad_norm": 1.6071867942810059, 
"learning_rate": 0.00012839673913043478, "loss": 0.5726, "step": 1983 }, { "epoch": 0.3105823418910457, "grad_norm": 2.3567862510681152, "learning_rate": 0.00012838485054347825, "loss": 1.1946, "step": 1984 }, { "epoch": 0.310738885410144, "grad_norm": 3.479128837585449, "learning_rate": 0.00012837296195652173, "loss": 0.9345, "step": 1985 }, { "epoch": 0.3108954289292423, "grad_norm": 2.9900925159454346, "learning_rate": 0.0001283610733695652, "loss": 1.3635, "step": 1986 }, { "epoch": 0.31105197244834065, "grad_norm": 2.2305846214294434, "learning_rate": 0.0001283491847826087, "loss": 1.1756, "step": 1987 }, { "epoch": 0.3112085159674389, "grad_norm": 12.483503341674805, "learning_rate": 0.00012833729619565217, "loss": 1.5628, "step": 1988 }, { "epoch": 0.31136505948653725, "grad_norm": 4.1620192527771, "learning_rate": 0.00012832540760869565, "loss": 1.644, "step": 1989 }, { "epoch": 0.3115216030056356, "grad_norm": 5.6619791984558105, "learning_rate": 0.00012831351902173912, "loss": 1.2053, "step": 1990 }, { "epoch": 0.31167814652473386, "grad_norm": 2.012946367263794, "learning_rate": 0.0001283016304347826, "loss": 1.3935, "step": 1991 }, { "epoch": 0.3118346900438322, "grad_norm": 2.0405664443969727, "learning_rate": 0.00012828974184782608, "loss": 1.3439, "step": 1992 }, { "epoch": 0.3119912335629305, "grad_norm": 3.453801155090332, "learning_rate": 0.00012827785326086956, "loss": 1.7407, "step": 1993 }, { "epoch": 0.3121477770820288, "grad_norm": 4.172926425933838, "learning_rate": 0.00012826596467391304, "loss": 1.1149, "step": 1994 }, { "epoch": 0.3123043206011271, "grad_norm": 3.1522281169891357, "learning_rate": 0.00012825407608695651, "loss": 1.0901, "step": 1995 }, { "epoch": 0.31246086412022545, "grad_norm": 3.3010659217834473, "learning_rate": 0.0001282421875, "loss": 0.7855, "step": 1996 }, { "epoch": 0.3126174076393237, "grad_norm": 4.838016510009766, "learning_rate": 0.00012823029891304347, "loss": 1.1945, "step": 1997 }, { "epoch": 
0.31277395115842205, "grad_norm": 4.610550403594971, "learning_rate": 0.00012821841032608695, "loss": 1.556, "step": 1998 }, { "epoch": 0.3129304946775204, "grad_norm": 5.128667831420898, "learning_rate": 0.00012820652173913043, "loss": 0.9952, "step": 1999 }, { "epoch": 0.31308703819661865, "grad_norm": 3.086717128753662, "learning_rate": 0.0001281946331521739, "loss": 1.3019, "step": 2000 }, { "epoch": 0.31308703819661865, "eval_loss": 0.7274868488311768, "eval_runtime": 204.205, "eval_samples_per_second": 60.64, "eval_steps_per_second": 3.79, "eval_wer": 0.414646067535117, "step": 2000 }, { "epoch": 0.313243581715717, "grad_norm": 1.1281012296676636, "learning_rate": 0.00012818274456521738, "loss": 0.5284, "step": 2001 }, { "epoch": 0.31340012523481525, "grad_norm": 0.8917173147201538, "learning_rate": 0.00012817085597826086, "loss": 0.4143, "step": 2002 }, { "epoch": 0.3135566687539136, "grad_norm": 0.8058364391326904, "learning_rate": 0.00012815896739130434, "loss": 0.4511, "step": 2003 }, { "epoch": 0.3137132122730119, "grad_norm": 0.9206722974777222, "learning_rate": 0.00012814707880434782, "loss": 0.3486, "step": 2004 }, { "epoch": 0.3138697557921102, "grad_norm": 0.6768581867218018, "learning_rate": 0.0001281351902173913, "loss": 0.4852, "step": 2005 }, { "epoch": 0.3140262993112085, "grad_norm": 1.0464776754379272, "learning_rate": 0.00012812330163043477, "loss": 0.4564, "step": 2006 }, { "epoch": 0.31418284283030684, "grad_norm": 1.1046277284622192, "learning_rate": 0.00012811141304347825, "loss": 0.3682, "step": 2007 }, { "epoch": 0.3143393863494051, "grad_norm": 1.0193133354187012, "learning_rate": 0.00012809952445652173, "loss": 0.6066, "step": 2008 }, { "epoch": 0.31449592986850344, "grad_norm": 0.9877195358276367, "learning_rate": 0.0001280876358695652, "loss": 0.5261, "step": 2009 }, { "epoch": 0.31465247338760177, "grad_norm": 1.0596412420272827, "learning_rate": 0.0001280757472826087, "loss": 0.4768, "step": 2010 }, { "epoch": 
0.31480901690670005, "grad_norm": 0.8972132802009583, "learning_rate": 0.00012806385869565216, "loss": 0.5844, "step": 2011 }, { "epoch": 0.3149655604257984, "grad_norm": 1.0469647645950317, "learning_rate": 0.00012805197010869564, "loss": 0.4988, "step": 2012 }, { "epoch": 0.3151221039448967, "grad_norm": 1.4878398180007935, "learning_rate": 0.00012804008152173912, "loss": 0.5619, "step": 2013 }, { "epoch": 0.315278647463995, "grad_norm": 1.5213273763656616, "learning_rate": 0.0001280281929347826, "loss": 0.577, "step": 2014 }, { "epoch": 0.3154351909830933, "grad_norm": 1.760290265083313, "learning_rate": 0.00012801630434782608, "loss": 0.5426, "step": 2015 }, { "epoch": 0.31559173450219163, "grad_norm": 1.541623830795288, "learning_rate": 0.00012800441576086956, "loss": 0.7694, "step": 2016 }, { "epoch": 0.3157482780212899, "grad_norm": 1.4588918685913086, "learning_rate": 0.00012799252717391303, "loss": 0.5831, "step": 2017 }, { "epoch": 0.31590482154038824, "grad_norm": 1.6236376762390137, "learning_rate": 0.0001279806385869565, "loss": 0.8699, "step": 2018 }, { "epoch": 0.3160613650594865, "grad_norm": 3.149576425552368, "learning_rate": 0.00012796875, "loss": 1.048, "step": 2019 }, { "epoch": 0.31621790857858484, "grad_norm": 1.8454132080078125, "learning_rate": 0.00012795686141304347, "loss": 0.8029, "step": 2020 }, { "epoch": 0.31637445209768317, "grad_norm": 1.272111415863037, "learning_rate": 0.00012794497282608695, "loss": 0.7694, "step": 2021 }, { "epoch": 0.31653099561678144, "grad_norm": 1.8354430198669434, "learning_rate": 0.00012793308423913042, "loss": 1.2, "step": 2022 }, { "epoch": 0.31668753913587977, "grad_norm": 2.4283607006073, "learning_rate": 0.0001279211956521739, "loss": 1.1314, "step": 2023 }, { "epoch": 0.3168440826549781, "grad_norm": 1.1895619630813599, "learning_rate": 0.00012790930706521738, "loss": 0.5722, "step": 2024 }, { "epoch": 0.31700062617407637, "grad_norm": 2.874372959136963, "learning_rate": 0.00012789741847826086, 
"loss": 0.7933, "step": 2025 }, { "epoch": 0.3171571696931747, "grad_norm": 2.9173803329467773, "learning_rate": 0.00012788552989130434, "loss": 0.8758, "step": 2026 }, { "epoch": 0.31731371321227303, "grad_norm": 1.5069630146026611, "learning_rate": 0.00012787364130434782, "loss": 0.5237, "step": 2027 }, { "epoch": 0.3174702567313713, "grad_norm": 1.9356404542922974, "learning_rate": 0.0001278617527173913, "loss": 0.6612, "step": 2028 }, { "epoch": 0.31762680025046963, "grad_norm": 2.4982175827026367, "learning_rate": 0.00012784986413043477, "loss": 1.1985, "step": 2029 }, { "epoch": 0.31778334376956796, "grad_norm": 2.6187028884887695, "learning_rate": 0.00012783797554347825, "loss": 0.9885, "step": 2030 }, { "epoch": 0.31793988728866623, "grad_norm": 2.68552827835083, "learning_rate": 0.00012782608695652173, "loss": 1.0055, "step": 2031 }, { "epoch": 0.31809643080776456, "grad_norm": 4.573462009429932, "learning_rate": 0.0001278141983695652, "loss": 0.87, "step": 2032 }, { "epoch": 0.3182529743268629, "grad_norm": 2.740635633468628, "learning_rate": 0.00012780230978260868, "loss": 1.1438, "step": 2033 }, { "epoch": 0.31840951784596117, "grad_norm": 2.329270362854004, "learning_rate": 0.00012779042119565216, "loss": 1.027, "step": 2034 }, { "epoch": 0.3185660613650595, "grad_norm": 3.17224383354187, "learning_rate": 0.00012777853260869564, "loss": 1.104, "step": 2035 }, { "epoch": 0.3187226048841578, "grad_norm": 2.715611696243286, "learning_rate": 0.00012776664402173912, "loss": 0.9335, "step": 2036 }, { "epoch": 0.3188791484032561, "grad_norm": 6.211226463317871, "learning_rate": 0.0001277547554347826, "loss": 0.9579, "step": 2037 }, { "epoch": 0.3190356919223544, "grad_norm": 3.4495043754577637, "learning_rate": 0.00012774286684782608, "loss": 1.2878, "step": 2038 }, { "epoch": 0.3191922354414527, "grad_norm": 2.26336407661438, "learning_rate": 0.00012773097826086955, "loss": 1.0581, "step": 2039 }, { "epoch": 0.319348778960551, "grad_norm": 5.088692665100098, 
"learning_rate": 0.00012771908967391303, "loss": 1.8268, "step": 2040 }, { "epoch": 0.31950532247964936, "grad_norm": 1.7484240531921387, "learning_rate": 0.0001277072010869565, "loss": 1.5243, "step": 2041 }, { "epoch": 0.31966186599874763, "grad_norm": 3.02624249458313, "learning_rate": 0.0001276953125, "loss": 1.4687, "step": 2042 }, { "epoch": 0.31981840951784596, "grad_norm": 3.6646227836608887, "learning_rate": 0.00012768342391304347, "loss": 1.3303, "step": 2043 }, { "epoch": 0.3199749530369443, "grad_norm": 3.25817608833313, "learning_rate": 0.00012767153532608694, "loss": 1.3615, "step": 2044 }, { "epoch": 0.32013149655604256, "grad_norm": 2.2707977294921875, "learning_rate": 0.00012765964673913042, "loss": 0.9138, "step": 2045 }, { "epoch": 0.3202880400751409, "grad_norm": 2.653170347213745, "learning_rate": 0.0001276477581521739, "loss": 1.0442, "step": 2046 }, { "epoch": 0.3204445835942392, "grad_norm": 4.048224449157715, "learning_rate": 0.00012763586956521738, "loss": 1.2047, "step": 2047 }, { "epoch": 0.3206011271133375, "grad_norm": 2.723649501800537, "learning_rate": 0.00012762398097826086, "loss": 1.3828, "step": 2048 }, { "epoch": 0.3207576706324358, "grad_norm": 1.663755178451538, "learning_rate": 0.00012761209239130433, "loss": 0.8736, "step": 2049 }, { "epoch": 0.32091421415153415, "grad_norm": 2.0240681171417236, "learning_rate": 0.0001276002038043478, "loss": 1.5025, "step": 2050 }, { "epoch": 0.3210707576706324, "grad_norm": 0.4613945782184601, "learning_rate": 0.0001275883152173913, "loss": 0.3282, "step": 2051 }, { "epoch": 0.32122730118973075, "grad_norm": 0.5784393548965454, "learning_rate": 0.00012757642663043477, "loss": 0.3192, "step": 2052 }, { "epoch": 0.3213838447088291, "grad_norm": 0.7284885048866272, "learning_rate": 0.00012756453804347825, "loss": 0.4505, "step": 2053 }, { "epoch": 0.32154038822792735, "grad_norm": 1.0775094032287598, "learning_rate": 0.00012755264945652173, "loss": 0.4925, "step": 2054 }, { "epoch": 
0.3216969317470257, "grad_norm": 0.7570642232894897, "learning_rate": 0.0001275407608695652, "loss": 0.431, "step": 2055 }, { "epoch": 0.32185347526612396, "grad_norm": 1.0520694255828857, "learning_rate": 0.00012752887228260868, "loss": 0.4022, "step": 2056 }, { "epoch": 0.3220100187852223, "grad_norm": 0.8558493852615356, "learning_rate": 0.00012751698369565216, "loss": 0.3897, "step": 2057 }, { "epoch": 0.3221665623043206, "grad_norm": 0.9139475226402283, "learning_rate": 0.00012750509510869564, "loss": 0.4055, "step": 2058 }, { "epoch": 0.3223231058234189, "grad_norm": 0.9128738045692444, "learning_rate": 0.00012749320652173912, "loss": 0.6009, "step": 2059 }, { "epoch": 0.3224796493425172, "grad_norm": 1.4610978364944458, "learning_rate": 0.0001274813179347826, "loss": 0.5733, "step": 2060 }, { "epoch": 0.32263619286161555, "grad_norm": 2.3546719551086426, "learning_rate": 0.00012746942934782607, "loss": 0.5168, "step": 2061 }, { "epoch": 0.3227927363807138, "grad_norm": 0.9505919814109802, "learning_rate": 0.00012745754076086955, "loss": 0.6013, "step": 2062 }, { "epoch": 0.32294927989981215, "grad_norm": 1.427451252937317, "learning_rate": 0.00012744565217391303, "loss": 0.5187, "step": 2063 }, { "epoch": 0.3231058234189105, "grad_norm": 1.3946638107299805, "learning_rate": 0.0001274337635869565, "loss": 0.6363, "step": 2064 }, { "epoch": 0.32326236693800875, "grad_norm": 1.5606716871261597, "learning_rate": 0.00012742187499999999, "loss": 0.9534, "step": 2065 }, { "epoch": 0.3234189104571071, "grad_norm": 1.1318150758743286, "learning_rate": 0.00012740998641304346, "loss": 0.3872, "step": 2066 }, { "epoch": 0.3235754539762054, "grad_norm": 1.2735399007797241, "learning_rate": 0.00012739809782608694, "loss": 0.6756, "step": 2067 }, { "epoch": 0.3237319974953037, "grad_norm": 1.8341726064682007, "learning_rate": 0.00012738620923913042, "loss": 0.8308, "step": 2068 }, { "epoch": 0.323888541014402, "grad_norm": 0.8212501406669617, "learning_rate": 
0.0001273743206521739, "loss": 0.4577, "step": 2069 }, { "epoch": 0.32404508453350034, "grad_norm": 1.4134632349014282, "learning_rate": 0.00012736243206521738, "loss": 0.6533, "step": 2070 }, { "epoch": 0.3242016280525986, "grad_norm": 1.4738585948944092, "learning_rate": 0.00012735054347826085, "loss": 0.8359, "step": 2071 }, { "epoch": 0.32435817157169694, "grad_norm": 1.6055957078933716, "learning_rate": 0.00012733865489130433, "loss": 0.7502, "step": 2072 }, { "epoch": 0.3245147150907952, "grad_norm": 1.8502129316329956, "learning_rate": 0.0001273267663043478, "loss": 0.6975, "step": 2073 }, { "epoch": 0.32467125860989354, "grad_norm": 6.762905120849609, "learning_rate": 0.0001273148777173913, "loss": 0.671, "step": 2074 }, { "epoch": 0.32482780212899187, "grad_norm": 3.4390506744384766, "learning_rate": 0.00012730298913043477, "loss": 1.2759, "step": 2075 }, { "epoch": 0.32498434564809014, "grad_norm": 2.6699492931365967, "learning_rate": 0.00012729110054347824, "loss": 0.5477, "step": 2076 }, { "epoch": 0.3251408891671885, "grad_norm": 1.8547773361206055, "learning_rate": 0.00012727921195652172, "loss": 0.7222, "step": 2077 }, { "epoch": 0.3252974326862868, "grad_norm": 2.9768614768981934, "learning_rate": 0.0001272673233695652, "loss": 0.7239, "step": 2078 }, { "epoch": 0.3254539762053851, "grad_norm": 4.415472030639648, "learning_rate": 0.00012725543478260868, "loss": 0.8202, "step": 2079 }, { "epoch": 0.3256105197244834, "grad_norm": 3.1210122108459473, "learning_rate": 0.00012724354619565216, "loss": 1.0054, "step": 2080 }, { "epoch": 0.32576706324358173, "grad_norm": 1.23275887966156, "learning_rate": 0.00012723165760869564, "loss": 0.5687, "step": 2081 }, { "epoch": 0.32592360676268, "grad_norm": 4.217541694641113, "learning_rate": 0.00012721976902173911, "loss": 1.5155, "step": 2082 }, { "epoch": 0.32608015028177834, "grad_norm": 2.5780742168426514, "learning_rate": 0.0001272078804347826, "loss": 1.313, "step": 2083 }, { "epoch": 0.32623669380087666, 
"grad_norm": 4.87647008895874, "learning_rate": 0.00012719599184782607, "loss": 1.5834, "step": 2084 }, { "epoch": 0.32639323731997494, "grad_norm": 1.4462968111038208, "learning_rate": 0.00012718410326086955, "loss": 0.5445, "step": 2085 }, { "epoch": 0.32654978083907327, "grad_norm": 3.3860857486724854, "learning_rate": 0.00012717221467391303, "loss": 1.6184, "step": 2086 }, { "epoch": 0.3267063243581716, "grad_norm": 2.9732470512390137, "learning_rate": 0.0001271603260869565, "loss": 0.7919, "step": 2087 }, { "epoch": 0.32686286787726987, "grad_norm": 3.7999870777130127, "learning_rate": 0.00012714843749999998, "loss": 1.3238, "step": 2088 }, { "epoch": 0.3270194113963682, "grad_norm": 3.3030741214752197, "learning_rate": 0.00012713654891304346, "loss": 0.8167, "step": 2089 }, { "epoch": 0.3271759549154665, "grad_norm": 2.3827152252197266, "learning_rate": 0.00012712466032608694, "loss": 1.4271, "step": 2090 }, { "epoch": 0.3273324984345648, "grad_norm": 4.5567498207092285, "learning_rate": 0.00012711277173913042, "loss": 1.0502, "step": 2091 }, { "epoch": 0.32748904195366313, "grad_norm": 2.8947346210479736, "learning_rate": 0.0001271008831521739, "loss": 0.8612, "step": 2092 }, { "epoch": 0.3276455854727614, "grad_norm": 2.599928140640259, "learning_rate": 0.00012708899456521737, "loss": 1.9398, "step": 2093 }, { "epoch": 0.32780212899185973, "grad_norm": 3.296515941619873, "learning_rate": 0.00012707710597826085, "loss": 1.3639, "step": 2094 }, { "epoch": 0.32795867251095806, "grad_norm": 2.403913974761963, "learning_rate": 0.00012706521739130433, "loss": 1.1575, "step": 2095 }, { "epoch": 0.32811521603005633, "grad_norm": 2.5071821212768555, "learning_rate": 0.0001270533288043478, "loss": 0.7343, "step": 2096 }, { "epoch": 0.32827175954915466, "grad_norm": 3.071596622467041, "learning_rate": 0.00012704144021739129, "loss": 0.4925, "step": 2097 }, { "epoch": 0.328428303068253, "grad_norm": 2.297931432723999, "learning_rate": 0.0001270295516304348, "loss": 
1.2768, "step": 2098 }, { "epoch": 0.32858484658735126, "grad_norm": 3.3990745544433594, "learning_rate": 0.00012701766304347827, "loss": 1.109, "step": 2099 }, { "epoch": 0.3287413901064496, "grad_norm": 2.9067583084106445, "learning_rate": 0.00012700577445652172, "loss": 1.1791, "step": 2100 }, { "epoch": 0.3288979336255479, "grad_norm": 1.1281002759933472, "learning_rate": 0.0001269938858695652, "loss": 0.3839, "step": 2101 }, { "epoch": 0.3290544771446462, "grad_norm": 1.0031917095184326, "learning_rate": 0.00012698199728260868, "loss": 0.2977, "step": 2102 }, { "epoch": 0.3292110206637445, "grad_norm": 0.7730886936187744, "learning_rate": 0.00012697010869565216, "loss": 0.4292, "step": 2103 }, { "epoch": 0.32936756418284285, "grad_norm": 0.7962760329246521, "learning_rate": 0.00012695822010869563, "loss": 0.4449, "step": 2104 }, { "epoch": 0.3295241077019411, "grad_norm": 0.8197981715202332, "learning_rate": 0.0001269463315217391, "loss": 0.4454, "step": 2105 }, { "epoch": 0.32968065122103946, "grad_norm": 0.8438326120376587, "learning_rate": 0.0001269344429347826, "loss": 0.5498, "step": 2106 }, { "epoch": 0.3298371947401378, "grad_norm": 0.8887110352516174, "learning_rate": 0.00012692255434782607, "loss": 0.428, "step": 2107 }, { "epoch": 0.32999373825923606, "grad_norm": 0.9601888060569763, "learning_rate": 0.00012691066576086955, "loss": 0.4359, "step": 2108 }, { "epoch": 0.3301502817783344, "grad_norm": 0.976886510848999, "learning_rate": 0.00012689877717391302, "loss": 0.4258, "step": 2109 }, { "epoch": 0.33030682529743266, "grad_norm": 1.286248803138733, "learning_rate": 0.0001268868885869565, "loss": 0.522, "step": 2110 }, { "epoch": 0.330463368816531, "grad_norm": 1.5811436176300049, "learning_rate": 0.00012687499999999998, "loss": 0.4681, "step": 2111 }, { "epoch": 0.3306199123356293, "grad_norm": 1.1789262294769287, "learning_rate": 0.00012686311141304346, "loss": 0.4267, "step": 2112 }, { "epoch": 0.3307764558547276, "grad_norm": 
1.0988821983337402, "learning_rate": 0.00012685122282608694, "loss": 0.6289, "step": 2113 }, { "epoch": 0.3309329993738259, "grad_norm": 2.061668872833252, "learning_rate": 0.00012683933423913041, "loss": 0.7354, "step": 2114 }, { "epoch": 0.33108954289292425, "grad_norm": 4.640627861022949, "learning_rate": 0.0001268274456521739, "loss": 0.8347, "step": 2115 }, { "epoch": 0.3312460864120225, "grad_norm": 3.2614641189575195, "learning_rate": 0.00012681555706521737, "loss": 0.5416, "step": 2116 }, { "epoch": 0.33140262993112085, "grad_norm": 6.707008361816406, "learning_rate": 0.00012680366847826085, "loss": 0.7295, "step": 2117 }, { "epoch": 0.3315591734502192, "grad_norm": 1.8124698400497437, "learning_rate": 0.00012679177989130435, "loss": 0.6188, "step": 2118 }, { "epoch": 0.33171571696931745, "grad_norm": 1.8256293535232544, "learning_rate": 0.00012677989130434783, "loss": 0.6012, "step": 2119 }, { "epoch": 0.3318722604884158, "grad_norm": 1.8738330602645874, "learning_rate": 0.0001267680027173913, "loss": 0.5605, "step": 2120 }, { "epoch": 0.3320288040075141, "grad_norm": 1.4613176584243774, "learning_rate": 0.0001267561141304348, "loss": 0.4485, "step": 2121 }, { "epoch": 0.3321853475266124, "grad_norm": 2.423973798751831, "learning_rate": 0.00012674422554347827, "loss": 0.8472, "step": 2122 }, { "epoch": 0.3323418910457107, "grad_norm": 1.646233081817627, "learning_rate": 0.00012673233695652172, "loss": 0.533, "step": 2123 }, { "epoch": 0.33249843456480904, "grad_norm": 3.0914671421051025, "learning_rate": 0.0001267204483695652, "loss": 0.7151, "step": 2124 }, { "epoch": 0.3326549780839073, "grad_norm": 3.65962553024292, "learning_rate": 0.00012670855978260867, "loss": 0.8971, "step": 2125 }, { "epoch": 0.33281152160300564, "grad_norm": 2.529097557067871, "learning_rate": 0.00012669667119565215, "loss": 1.0616, "step": 2126 }, { "epoch": 0.3329680651221039, "grad_norm": 1.8951325416564941, "learning_rate": 0.00012668478260869563, "loss": 0.7114, "step": 2127 
}, { "epoch": 0.33312460864120225, "grad_norm": 1.6600371599197388, "learning_rate": 0.0001266728940217391, "loss": 0.973, "step": 2128 }, { "epoch": 0.3332811521603006, "grad_norm": 2.867217540740967, "learning_rate": 0.0001266610054347826, "loss": 1.1145, "step": 2129 }, { "epoch": 0.33343769567939885, "grad_norm": 2.2849912643432617, "learning_rate": 0.00012664911684782607, "loss": 0.9426, "step": 2130 }, { "epoch": 0.3335942391984972, "grad_norm": 3.080897569656372, "learning_rate": 0.00012663722826086954, "loss": 0.9263, "step": 2131 }, { "epoch": 0.3337507827175955, "grad_norm": 1.9380543231964111, "learning_rate": 0.00012662533967391302, "loss": 0.7396, "step": 2132 }, { "epoch": 0.3339073262366938, "grad_norm": 2.152670383453369, "learning_rate": 0.0001266134510869565, "loss": 0.9079, "step": 2133 }, { "epoch": 0.3340638697557921, "grad_norm": 7.16734504699707, "learning_rate": 0.00012660156249999998, "loss": 0.7958, "step": 2134 }, { "epoch": 0.33422041327489044, "grad_norm": 5.773569107055664, "learning_rate": 0.00012658967391304346, "loss": 1.2114, "step": 2135 }, { "epoch": 0.3343769567939887, "grad_norm": 5.230566024780273, "learning_rate": 0.00012657778532608693, "loss": 1.3437, "step": 2136 }, { "epoch": 0.33453350031308704, "grad_norm": 3.2707464694976807, "learning_rate": 0.0001265658967391304, "loss": 1.3271, "step": 2137 }, { "epoch": 0.33469004383218537, "grad_norm": 3.173471450805664, "learning_rate": 0.00012655400815217392, "loss": 1.293, "step": 2138 }, { "epoch": 0.33484658735128364, "grad_norm": 2.834434986114502, "learning_rate": 0.0001265421195652174, "loss": 0.9963, "step": 2139 }, { "epoch": 0.33500313087038197, "grad_norm": 2.2935187816619873, "learning_rate": 0.00012653023097826087, "loss": 0.926, "step": 2140 }, { "epoch": 0.3351596743894803, "grad_norm": 2.0420169830322266, "learning_rate": 0.00012651834239130435, "loss": 1.8464, "step": 2141 }, { "epoch": 0.3353162179085786, "grad_norm": 2.460814952850342, "learning_rate": 
0.00012650645380434783, "loss": 1.1305, "step": 2142 }, { "epoch": 0.3354727614276769, "grad_norm": 2.783442735671997, "learning_rate": 0.0001264945652173913, "loss": 1.3901, "step": 2143 }, { "epoch": 0.33562930494677523, "grad_norm": 3.408578872680664, "learning_rate": 0.0001264826766304348, "loss": 1.255, "step": 2144 }, { "epoch": 0.3357858484658735, "grad_norm": 3.6116654872894287, "learning_rate": 0.00012647078804347826, "loss": 1.2464, "step": 2145 }, { "epoch": 0.33594239198497183, "grad_norm": 2.8809239864349365, "learning_rate": 0.00012645889945652172, "loss": 0.9867, "step": 2146 }, { "epoch": 0.3360989355040701, "grad_norm": 2.0919134616851807, "learning_rate": 0.0001264470108695652, "loss": 0.7673, "step": 2147 }, { "epoch": 0.33625547902316844, "grad_norm": 2.3491108417510986, "learning_rate": 0.00012643512228260867, "loss": 0.8601, "step": 2148 }, { "epoch": 0.33641202254226676, "grad_norm": 4.47485876083374, "learning_rate": 0.00012642323369565215, "loss": 1.0941, "step": 2149 }, { "epoch": 0.33656856606136504, "grad_norm": 5.546586513519287, "learning_rate": 0.00012641134510869563, "loss": 1.204, "step": 2150 }, { "epoch": 0.33672510958046337, "grad_norm": 0.8124656677246094, "learning_rate": 0.0001263994565217391, "loss": 0.4029, "step": 2151 }, { "epoch": 0.3368816530995617, "grad_norm": 0.7942912578582764, "learning_rate": 0.00012638756793478258, "loss": 0.3955, "step": 2152 }, { "epoch": 0.33703819661865997, "grad_norm": 0.7162039875984192, "learning_rate": 0.00012637567934782606, "loss": 0.3255, "step": 2153 }, { "epoch": 0.3371947401377583, "grad_norm": 1.0614885091781616, "learning_rate": 0.00012636379076086954, "loss": 0.3996, "step": 2154 }, { "epoch": 0.3373512836568566, "grad_norm": 0.8457649946212769, "learning_rate": 0.00012635190217391302, "loss": 0.4387, "step": 2155 }, { "epoch": 0.3375078271759549, "grad_norm": 1.2543790340423584, "learning_rate": 0.0001263400135869565, "loss": 0.4533, "step": 2156 }, { "epoch": 
0.33766437069505323, "grad_norm": 1.0427265167236328, "learning_rate": 0.00012632812499999998, "loss": 0.5582, "step": 2157 }, { "epoch": 0.33782091421415156, "grad_norm": 0.8301647901535034, "learning_rate": 0.00012631623641304348, "loss": 0.4266, "step": 2158 }, { "epoch": 0.33797745773324983, "grad_norm": 1.2939120531082153, "learning_rate": 0.00012630434782608696, "loss": 0.5401, "step": 2159 }, { "epoch": 0.33813400125234816, "grad_norm": 1.8037058115005493, "learning_rate": 0.00012629245923913044, "loss": 0.5591, "step": 2160 }, { "epoch": 0.3382905447714465, "grad_norm": 0.8379096388816833, "learning_rate": 0.00012628057065217392, "loss": 0.3777, "step": 2161 }, { "epoch": 0.33844708829054476, "grad_norm": 1.126826286315918, "learning_rate": 0.0001262686820652174, "loss": 0.7762, "step": 2162 }, { "epoch": 0.3386036318096431, "grad_norm": 2.0113210678100586, "learning_rate": 0.00012625679347826087, "loss": 0.4329, "step": 2163 }, { "epoch": 0.33876017532874136, "grad_norm": 0.9261629581451416, "learning_rate": 0.00012624490489130435, "loss": 0.3983, "step": 2164 }, { "epoch": 0.3389167188478397, "grad_norm": 0.8966819047927856, "learning_rate": 0.00012623301630434783, "loss": 0.419, "step": 2165 }, { "epoch": 0.339073262366938, "grad_norm": 1.4736039638519287, "learning_rate": 0.0001262211277173913, "loss": 0.7537, "step": 2166 }, { "epoch": 0.3392298058860363, "grad_norm": 1.5574650764465332, "learning_rate": 0.00012620923913043478, "loss": 0.5822, "step": 2167 }, { "epoch": 0.3393863494051346, "grad_norm": 1.7829583883285522, "learning_rate": 0.00012619735054347826, "loss": 0.6917, "step": 2168 }, { "epoch": 0.33954289292423295, "grad_norm": 1.7684218883514404, "learning_rate": 0.0001261854619565217, "loss": 0.7268, "step": 2169 }, { "epoch": 0.3396994364433312, "grad_norm": 2.189419984817505, "learning_rate": 0.0001261735733695652, "loss": 0.7865, "step": 2170 }, { "epoch": 0.33985597996242956, "grad_norm": 1.7211482524871826, "learning_rate": 
0.00012616168478260867, "loss": 0.6494, "step": 2171 }, { "epoch": 0.3400125234815279, "grad_norm": 3.992259979248047, "learning_rate": 0.00012614979619565215, "loss": 0.7542, "step": 2172 }, { "epoch": 0.34016906700062616, "grad_norm": 1.4193315505981445, "learning_rate": 0.00012613790760869563, "loss": 0.6677, "step": 2173 }, { "epoch": 0.3403256105197245, "grad_norm": 3.587916612625122, "learning_rate": 0.0001261260190217391, "loss": 0.9292, "step": 2174 }, { "epoch": 0.3404821540388228, "grad_norm": 2.766146421432495, "learning_rate": 0.00012611413043478258, "loss": 0.7697, "step": 2175 }, { "epoch": 0.3406386975579211, "grad_norm": 2.9992923736572266, "learning_rate": 0.00012610224184782606, "loss": 1.085, "step": 2176 }, { "epoch": 0.3407952410770194, "grad_norm": 2.824118137359619, "learning_rate": 0.00012609035326086954, "loss": 1.1942, "step": 2177 }, { "epoch": 0.34095178459611775, "grad_norm": 2.893942356109619, "learning_rate": 0.00012607846467391304, "loss": 0.769, "step": 2178 }, { "epoch": 0.341108328115216, "grad_norm": 2.985380172729492, "learning_rate": 0.00012606657608695652, "loss": 0.818, "step": 2179 }, { "epoch": 0.34126487163431435, "grad_norm": 4.611202716827393, "learning_rate": 0.0001260546875, "loss": 1.0646, "step": 2180 }, { "epoch": 0.3414214151534126, "grad_norm": 5.70366096496582, "learning_rate": 0.00012604279891304348, "loss": 0.8106, "step": 2181 }, { "epoch": 0.34157795867251095, "grad_norm": 3.2267680168151855, "learning_rate": 0.00012603091032608696, "loss": 1.0364, "step": 2182 }, { "epoch": 0.3417345021916093, "grad_norm": 2.8997790813446045, "learning_rate": 0.00012601902173913043, "loss": 1.1305, "step": 2183 }, { "epoch": 0.34189104571070755, "grad_norm": 4.346092700958252, "learning_rate": 0.0001260071331521739, "loss": 1.2126, "step": 2184 }, { "epoch": 0.3420475892298059, "grad_norm": 3.945911407470703, "learning_rate": 0.0001259952445652174, "loss": 1.1105, "step": 2185 }, { "epoch": 0.3422041327489042, "grad_norm": 
2.853818655014038, "learning_rate": 0.00012598335597826087, "loss": 1.3774, "step": 2186 }, { "epoch": 0.3423606762680025, "grad_norm": 3.022303342819214, "learning_rate": 0.00012597146739130435, "loss": 1.3085, "step": 2187 }, { "epoch": 0.3425172197871008, "grad_norm": 2.761207342147827, "learning_rate": 0.00012595957880434783, "loss": 1.374, "step": 2188 }, { "epoch": 0.34267376330619914, "grad_norm": 2.815232515335083, "learning_rate": 0.0001259476902173913, "loss": 1.2261, "step": 2189 }, { "epoch": 0.3428303068252974, "grad_norm": 5.909341335296631, "learning_rate": 0.00012593580163043478, "loss": 1.465, "step": 2190 }, { "epoch": 0.34298685034439574, "grad_norm": 2.784719228744507, "learning_rate": 0.00012592391304347826, "loss": 0.7416, "step": 2191 }, { "epoch": 0.3431433938634941, "grad_norm": 2.1267483234405518, "learning_rate": 0.0001259120244565217, "loss": 1.0426, "step": 2192 }, { "epoch": 0.34329993738259235, "grad_norm": 4.897862434387207, "learning_rate": 0.0001259001358695652, "loss": 1.5357, "step": 2193 }, { "epoch": 0.3434564809016907, "grad_norm": 5.760490894317627, "learning_rate": 0.00012588824728260867, "loss": 1.1784, "step": 2194 }, { "epoch": 0.343613024420789, "grad_norm": 2.751293182373047, "learning_rate": 0.00012587635869565215, "loss": 0.8997, "step": 2195 }, { "epoch": 0.3437695679398873, "grad_norm": 3.3567137718200684, "learning_rate": 0.00012586447010869562, "loss": 1.4412, "step": 2196 }, { "epoch": 0.3439261114589856, "grad_norm": 2.8758676052093506, "learning_rate": 0.0001258525815217391, "loss": 1.0196, "step": 2197 }, { "epoch": 0.34408265497808393, "grad_norm": 3.466468334197998, "learning_rate": 0.0001258406929347826, "loss": 0.893, "step": 2198 }, { "epoch": 0.3442391984971822, "grad_norm": 3.0711543560028076, "learning_rate": 0.00012582880434782609, "loss": 0.8078, "step": 2199 }, { "epoch": 0.34439574201628054, "grad_norm": 2.079040288925171, "learning_rate": 0.00012581691576086956, "loss": 1.452, "step": 2200 }, { 
"epoch": 0.3445522855353788, "grad_norm": 0.6017398834228516, "learning_rate": 0.00012580502717391304, "loss": 0.3961, "step": 2201 }, { "epoch": 0.34470882905447714, "grad_norm": 0.9816662669181824, "learning_rate": 0.00012579313858695652, "loss": 0.5355, "step": 2202 }, { "epoch": 0.34486537257357547, "grad_norm": 0.8994389176368713, "learning_rate": 0.00012578125, "loss": 0.5364, "step": 2203 }, { "epoch": 0.34502191609267374, "grad_norm": 1.6095701456069946, "learning_rate": 0.00012576936141304348, "loss": 0.4117, "step": 2204 }, { "epoch": 0.34517845961177207, "grad_norm": 0.714461624622345, "learning_rate": 0.00012575747282608695, "loss": 0.4356, "step": 2205 }, { "epoch": 0.3453350031308704, "grad_norm": 0.9784564971923828, "learning_rate": 0.00012574558423913043, "loss": 0.3089, "step": 2206 }, { "epoch": 0.3454915466499687, "grad_norm": 1.0498417615890503, "learning_rate": 0.0001257336956521739, "loss": 0.5161, "step": 2207 }, { "epoch": 0.345648090169067, "grad_norm": 1.122424840927124, "learning_rate": 0.0001257218070652174, "loss": 0.4839, "step": 2208 }, { "epoch": 0.34580463368816533, "grad_norm": 1.1541037559509277, "learning_rate": 0.00012570991847826087, "loss": 0.5129, "step": 2209 }, { "epoch": 0.3459611772072636, "grad_norm": 0.9966637492179871, "learning_rate": 0.00012569802989130434, "loss": 0.5804, "step": 2210 }, { "epoch": 0.34611772072636193, "grad_norm": 1.200415015220642, "learning_rate": 0.00012568614130434782, "loss": 0.4202, "step": 2211 }, { "epoch": 0.34627426424546026, "grad_norm": 0.8537223935127258, "learning_rate": 0.0001256742527173913, "loss": 0.3677, "step": 2212 }, { "epoch": 0.34643080776455853, "grad_norm": 1.0772727727890015, "learning_rate": 0.00012566236413043478, "loss": 0.3695, "step": 2213 }, { "epoch": 0.34658735128365686, "grad_norm": 1.3382177352905273, "learning_rate": 0.00012565047554347826, "loss": 0.5362, "step": 2214 }, { "epoch": 0.3467438948027552, "grad_norm": 4.4127631187438965, "learning_rate": 
0.0001256385869565217, "loss": 0.861, "step": 2215 }, { "epoch": 0.34690043832185347, "grad_norm": 1.3227994441986084, "learning_rate": 0.0001256266983695652, "loss": 0.4464, "step": 2216 }, { "epoch": 0.3470569818409518, "grad_norm": 1.6503797769546509, "learning_rate": 0.0001256148097826087, "loss": 0.8262, "step": 2217 }, { "epoch": 0.34721352536005007, "grad_norm": 1.5541952848434448, "learning_rate": 0.00012560292119565217, "loss": 0.7715, "step": 2218 }, { "epoch": 0.3473700688791484, "grad_norm": 0.9548878073692322, "learning_rate": 0.00012559103260869565, "loss": 0.4359, "step": 2219 }, { "epoch": 0.3475266123982467, "grad_norm": 1.51823890209198, "learning_rate": 0.00012557914402173913, "loss": 0.6745, "step": 2220 }, { "epoch": 0.347683155917345, "grad_norm": 2.065911054611206, "learning_rate": 0.0001255672554347826, "loss": 0.7433, "step": 2221 }, { "epoch": 0.34783969943644333, "grad_norm": 2.264336109161377, "learning_rate": 0.00012555536684782608, "loss": 0.6229, "step": 2222 }, { "epoch": 0.34799624295554166, "grad_norm": 1.7056647539138794, "learning_rate": 0.00012554347826086956, "loss": 0.6934, "step": 2223 }, { "epoch": 0.34815278647463993, "grad_norm": 1.1092779636383057, "learning_rate": 0.00012553158967391304, "loss": 0.8211, "step": 2224 }, { "epoch": 0.34830932999373826, "grad_norm": 5.08414888381958, "learning_rate": 0.00012551970108695652, "loss": 0.9078, "step": 2225 }, { "epoch": 0.3484658735128366, "grad_norm": 1.3602114915847778, "learning_rate": 0.0001255078125, "loss": 0.6234, "step": 2226 }, { "epoch": 0.34862241703193486, "grad_norm": 2.3018980026245117, "learning_rate": 0.00012549592391304347, "loss": 0.7814, "step": 2227 }, { "epoch": 0.3487789605510332, "grad_norm": 1.5054535865783691, "learning_rate": 0.00012548403532608695, "loss": 0.5617, "step": 2228 }, { "epoch": 0.3489355040701315, "grad_norm": 2.393860101699829, "learning_rate": 0.00012547214673913043, "loss": 0.9951, "step": 2229 }, { "epoch": 0.3490920475892298, 
"grad_norm": 2.7539191246032715, "learning_rate": 0.0001254602581521739, "loss": 1.0484, "step": 2230 }, { "epoch": 0.3492485911083281, "grad_norm": 3.524348735809326, "learning_rate": 0.00012544836956521739, "loss": 1.1552, "step": 2231 }, { "epoch": 0.34940513462742645, "grad_norm": 3.0820209980010986, "learning_rate": 0.00012543648097826086, "loss": 0.7473, "step": 2232 }, { "epoch": 0.3495616781465247, "grad_norm": 4.747170448303223, "learning_rate": 0.00012542459239130434, "loss": 1.2069, "step": 2233 }, { "epoch": 0.34971822166562305, "grad_norm": 2.520557403564453, "learning_rate": 0.00012541270380434782, "loss": 0.7397, "step": 2234 }, { "epoch": 0.3498747651847213, "grad_norm": 2.188842535018921, "learning_rate": 0.0001254008152173913, "loss": 0.9727, "step": 2235 }, { "epoch": 0.35003130870381965, "grad_norm": 4.56074857711792, "learning_rate": 0.00012538892663043478, "loss": 1.4721, "step": 2236 }, { "epoch": 0.350187852222918, "grad_norm": 2.514192581176758, "learning_rate": 0.00012537703804347826, "loss": 1.0779, "step": 2237 }, { "epoch": 0.35034439574201626, "grad_norm": 4.893073558807373, "learning_rate": 0.00012536514945652173, "loss": 1.0326, "step": 2238 }, { "epoch": 0.3505009392611146, "grad_norm": 3.340116262435913, "learning_rate": 0.0001253532608695652, "loss": 1.3443, "step": 2239 }, { "epoch": 0.3506574827802129, "grad_norm": 2.859121561050415, "learning_rate": 0.0001253413722826087, "loss": 1.0415, "step": 2240 }, { "epoch": 0.3508140262993112, "grad_norm": 3.408392906188965, "learning_rate": 0.00012532948369565217, "loss": 1.1798, "step": 2241 }, { "epoch": 0.3509705698184095, "grad_norm": 6.768486022949219, "learning_rate": 0.00012531759510869565, "loss": 1.5761, "step": 2242 }, { "epoch": 0.35112711333750785, "grad_norm": 3.3833112716674805, "learning_rate": 0.00012530570652173912, "loss": 0.9792, "step": 2243 }, { "epoch": 0.3512836568566061, "grad_norm": 3.183173894882202, "learning_rate": 0.0001252938179347826, "loss": 1.5007, 
"step": 2244 }, { "epoch": 0.35144020037570445, "grad_norm": 3.046445608139038, "learning_rate": 0.00012528192934782608, "loss": 1.1806, "step": 2245 }, { "epoch": 0.3515967438948028, "grad_norm": 2.441122531890869, "learning_rate": 0.00012527004076086956, "loss": 1.5278, "step": 2246 }, { "epoch": 0.35175328741390105, "grad_norm": 4.563473701477051, "learning_rate": 0.00012525815217391304, "loss": 0.9748, "step": 2247 }, { "epoch": 0.3519098309329994, "grad_norm": 2.186586380004883, "learning_rate": 0.00012524626358695651, "loss": 0.9187, "step": 2248 }, { "epoch": 0.3520663744520977, "grad_norm": 1.7997037172317505, "learning_rate": 0.000125234375, "loss": 0.8756, "step": 2249 }, { "epoch": 0.352222917971196, "grad_norm": 2.3448503017425537, "learning_rate": 0.00012522248641304347, "loss": 1.1736, "step": 2250 }, { "epoch": 0.3523794614902943, "grad_norm": 0.770124077796936, "learning_rate": 0.00012521059782608695, "loss": 0.3863, "step": 2251 }, { "epoch": 0.3525360050093926, "grad_norm": 0.5010112524032593, "learning_rate": 0.00012519870923913043, "loss": 0.3054, "step": 2252 }, { "epoch": 0.3526925485284909, "grad_norm": 0.956013560295105, "learning_rate": 0.0001251868206521739, "loss": 0.4631, "step": 2253 }, { "epoch": 0.35284909204758924, "grad_norm": 0.6629157662391663, "learning_rate": 0.00012517493206521738, "loss": 0.3359, "step": 2254 }, { "epoch": 0.3530056355666875, "grad_norm": 0.6969934701919556, "learning_rate": 0.00012516304347826086, "loss": 0.3912, "step": 2255 }, { "epoch": 0.35316217908578584, "grad_norm": 0.5790801644325256, "learning_rate": 0.00012515115489130434, "loss": 0.3326, "step": 2256 }, { "epoch": 0.3533187226048842, "grad_norm": 1.4302763938903809, "learning_rate": 0.00012513926630434782, "loss": 0.341, "step": 2257 }, { "epoch": 0.35347526612398245, "grad_norm": 1.334959626197815, "learning_rate": 0.0001251273777173913, "loss": 0.4281, "step": 2258 }, { "epoch": 0.3536318096430808, "grad_norm": 1.1449229717254639, 
"learning_rate": 0.00012511548913043477, "loss": 0.3376, "step": 2259 }, { "epoch": 0.3537883531621791, "grad_norm": 0.9361873865127563, "learning_rate": 0.00012510360054347825, "loss": 0.4883, "step": 2260 }, { "epoch": 0.3539448966812774, "grad_norm": 1.020612359046936, "learning_rate": 0.00012509171195652173, "loss": 0.4166, "step": 2261 }, { "epoch": 0.3541014402003757, "grad_norm": 0.904575765132904, "learning_rate": 0.0001250798233695652, "loss": 0.5444, "step": 2262 }, { "epoch": 0.35425798371947403, "grad_norm": 1.0230172872543335, "learning_rate": 0.0001250679347826087, "loss": 0.3482, "step": 2263 }, { "epoch": 0.3544145272385723, "grad_norm": 1.2869044542312622, "learning_rate": 0.00012505604619565217, "loss": 0.3855, "step": 2264 }, { "epoch": 0.35457107075767064, "grad_norm": 1.2110122442245483, "learning_rate": 0.00012504415760869564, "loss": 0.4911, "step": 2265 }, { "epoch": 0.35472761427676897, "grad_norm": 1.220137119293213, "learning_rate": 0.00012503226902173912, "loss": 0.6627, "step": 2266 }, { "epoch": 0.35488415779586724, "grad_norm": 1.544517993927002, "learning_rate": 0.0001250203804347826, "loss": 0.8396, "step": 2267 }, { "epoch": 0.35504070131496557, "grad_norm": 2.3632657527923584, "learning_rate": 0.00012500849184782608, "loss": 0.4839, "step": 2268 }, { "epoch": 0.3551972448340639, "grad_norm": 1.4084978103637695, "learning_rate": 0.00012499660326086956, "loss": 0.7134, "step": 2269 }, { "epoch": 0.35535378835316217, "grad_norm": 2.4869885444641113, "learning_rate": 0.00012498471467391303, "loss": 0.5891, "step": 2270 }, { "epoch": 0.3555103318722605, "grad_norm": 1.9490966796875, "learning_rate": 0.0001249728260869565, "loss": 0.7974, "step": 2271 }, { "epoch": 0.35566687539135877, "grad_norm": 2.1890714168548584, "learning_rate": 0.0001249609375, "loss": 0.7465, "step": 2272 }, { "epoch": 0.3558234189104571, "grad_norm": 2.373098373413086, "learning_rate": 0.00012494904891304347, "loss": 0.591, "step": 2273 }, { "epoch": 
0.35597996242955543, "grad_norm": 3.75408935546875, "learning_rate": 0.00012493716032608695, "loss": 1.2105, "step": 2274 }, { "epoch": 0.3561365059486537, "grad_norm": 3.102027177810669, "learning_rate": 0.00012492527173913043, "loss": 1.2034, "step": 2275 }, { "epoch": 0.35629304946775203, "grad_norm": 2.683661937713623, "learning_rate": 0.0001249133831521739, "loss": 1.207, "step": 2276 }, { "epoch": 0.35644959298685036, "grad_norm": 2.581218719482422, "learning_rate": 0.00012490149456521738, "loss": 1.1981, "step": 2277 }, { "epoch": 0.35660613650594863, "grad_norm": 1.1779757738113403, "learning_rate": 0.00012488960597826086, "loss": 0.5282, "step": 2278 }, { "epoch": 0.35676268002504696, "grad_norm": 3.013615846633911, "learning_rate": 0.00012487771739130434, "loss": 1.2073, "step": 2279 }, { "epoch": 0.3569192235441453, "grad_norm": 1.7694698572158813, "learning_rate": 0.00012486582880434782, "loss": 0.9802, "step": 2280 }, { "epoch": 0.35707576706324357, "grad_norm": 4.771090984344482, "learning_rate": 0.0001248539402173913, "loss": 1.4887, "step": 2281 }, { "epoch": 0.3572323105823419, "grad_norm": 2.5205774307250977, "learning_rate": 0.00012484205163043477, "loss": 0.8076, "step": 2282 }, { "epoch": 0.3573888541014402, "grad_norm": 1.6154232025146484, "learning_rate": 0.00012483016304347825, "loss": 0.9419, "step": 2283 }, { "epoch": 0.3575453976205385, "grad_norm": 4.308727741241455, "learning_rate": 0.00012481827445652173, "loss": 1.3147, "step": 2284 }, { "epoch": 0.3577019411396368, "grad_norm": 4.727560997009277, "learning_rate": 0.0001248063858695652, "loss": 1.2614, "step": 2285 }, { "epoch": 0.35785848465873515, "grad_norm": 3.281662702560425, "learning_rate": 0.00012479449728260868, "loss": 1.5092, "step": 2286 }, { "epoch": 0.3580150281778334, "grad_norm": 2.1371243000030518, "learning_rate": 0.00012478260869565216, "loss": 1.4421, "step": 2287 }, { "epoch": 0.35817157169693176, "grad_norm": 2.4553184509277344, "learning_rate": 
0.00012477072010869564, "loss": 1.1999, "step": 2288 }, { "epoch": 0.35832811521603003, "grad_norm": 2.2943806648254395, "learning_rate": 0.00012475883152173912, "loss": 1.4009, "step": 2289 }, { "epoch": 0.35848465873512836, "grad_norm": 6.167749404907227, "learning_rate": 0.0001247469429347826, "loss": 0.9988, "step": 2290 }, { "epoch": 0.3586412022542267, "grad_norm": 3.7226905822753906, "learning_rate": 0.00012473505434782608, "loss": 1.8773, "step": 2291 }, { "epoch": 0.35879774577332496, "grad_norm": 3.150379180908203, "learning_rate": 0.00012472316576086955, "loss": 1.0615, "step": 2292 }, { "epoch": 0.3589542892924233, "grad_norm": 2.194098949432373, "learning_rate": 0.00012471127717391303, "loss": 1.5124, "step": 2293 }, { "epoch": 0.3591108328115216, "grad_norm": 5.306440353393555, "learning_rate": 0.0001246993885869565, "loss": 1.6304, "step": 2294 }, { "epoch": 0.3592673763306199, "grad_norm": 2.4836504459381104, "learning_rate": 0.0001246875, "loss": 1.6661, "step": 2295 }, { "epoch": 0.3594239198497182, "grad_norm": 1.1367416381835938, "learning_rate": 0.00012467561141304347, "loss": 0.6907, "step": 2296 }, { "epoch": 0.35958046336881655, "grad_norm": 2.945011615753174, "learning_rate": 0.00012466372282608694, "loss": 1.0769, "step": 2297 }, { "epoch": 0.3597370068879148, "grad_norm": 2.369023084640503, "learning_rate": 0.00012465183423913042, "loss": 1.015, "step": 2298 }, { "epoch": 0.35989355040701315, "grad_norm": 2.4492568969726562, "learning_rate": 0.0001246399456521739, "loss": 1.5329, "step": 2299 }, { "epoch": 0.3600500939261115, "grad_norm": 2.9945552349090576, "learning_rate": 0.00012462805706521738, "loss": 1.3433, "step": 2300 }, { "epoch": 0.36020663744520975, "grad_norm": 0.4942561089992523, "learning_rate": 0.00012461616847826086, "loss": 0.3148, "step": 2301 }, { "epoch": 0.3603631809643081, "grad_norm": 0.8054972290992737, "learning_rate": 0.00012460427989130434, "loss": 0.4659, "step": 2302 }, { "epoch": 0.3605197244834064, 
"grad_norm": 0.9301678538322449, "learning_rate": 0.0001245923913043478, "loss": 0.4267, "step": 2303 }, { "epoch": 0.3606762680025047, "grad_norm": 1.148542881011963, "learning_rate": 0.0001245805027173913, "loss": 0.4279, "step": 2304 }, { "epoch": 0.360832811521603, "grad_norm": 0.7093629240989685, "learning_rate": 0.00012456861413043477, "loss": 0.3743, "step": 2305 }, { "epoch": 0.3609893550407013, "grad_norm": 0.9219041466712952, "learning_rate": 0.00012455672554347825, "loss": 0.2774, "step": 2306 }, { "epoch": 0.3611458985597996, "grad_norm": 1.0405831336975098, "learning_rate": 0.00012454483695652173, "loss": 0.3648, "step": 2307 }, { "epoch": 0.36130244207889795, "grad_norm": 1.1154102087020874, "learning_rate": 0.0001245329483695652, "loss": 0.4166, "step": 2308 }, { "epoch": 0.3614589855979962, "grad_norm": 0.6714631915092468, "learning_rate": 0.00012452105978260868, "loss": 0.3839, "step": 2309 }, { "epoch": 0.36161552911709455, "grad_norm": 16.046674728393555, "learning_rate": 0.00012450917119565216, "loss": 1.9826, "step": 2310 }, { "epoch": 0.3617720726361929, "grad_norm": 1.1279475688934326, "learning_rate": 0.00012449728260869564, "loss": 0.4834, "step": 2311 }, { "epoch": 0.36192861615529115, "grad_norm": 0.9293304681777954, "learning_rate": 0.00012448539402173912, "loss": 0.4405, "step": 2312 }, { "epoch": 0.3620851596743895, "grad_norm": 1.3273478746414185, "learning_rate": 0.0001244735054347826, "loss": 0.5756, "step": 2313 }, { "epoch": 0.3622417031934878, "grad_norm": 1.1575911045074463, "learning_rate": 0.00012446161684782607, "loss": 0.4136, "step": 2314 }, { "epoch": 0.3623982467125861, "grad_norm": 1.9252724647521973, "learning_rate": 0.00012444972826086955, "loss": 0.7275, "step": 2315 }, { "epoch": 0.3625547902316844, "grad_norm": 0.9440420269966125, "learning_rate": 0.00012443783967391303, "loss": 0.6775, "step": 2316 }, { "epoch": 0.36271133375078274, "grad_norm": 1.42560613155365, "learning_rate": 0.0001244259510869565, "loss": 
0.4696, "step": 2317 }, { "epoch": 0.362867877269881, "grad_norm": 1.1781710386276245, "learning_rate": 0.00012441406249999999, "loss": 0.6498, "step": 2318 }, { "epoch": 0.36302442078897934, "grad_norm": 1.8116865158081055, "learning_rate": 0.00012440217391304346, "loss": 0.5661, "step": 2319 }, { "epoch": 0.36318096430807767, "grad_norm": 1.429457664489746, "learning_rate": 0.00012439028532608694, "loss": 0.69, "step": 2320 }, { "epoch": 0.36333750782717594, "grad_norm": 1.2182400226593018, "learning_rate": 0.00012437839673913042, "loss": 0.4411, "step": 2321 }, { "epoch": 0.36349405134627427, "grad_norm": 2.1038870811462402, "learning_rate": 0.0001243665081521739, "loss": 0.6139, "step": 2322 }, { "epoch": 0.3636505948653726, "grad_norm": 2.2629289627075195, "learning_rate": 0.00012435461956521738, "loss": 0.5671, "step": 2323 }, { "epoch": 0.3638071383844709, "grad_norm": 1.6777275800704956, "learning_rate": 0.00012434273097826085, "loss": 0.6257, "step": 2324 }, { "epoch": 0.3639636819035692, "grad_norm": 1.6103070974349976, "learning_rate": 0.00012433084239130433, "loss": 0.7729, "step": 2325 }, { "epoch": 0.3641202254226675, "grad_norm": 3.8531270027160645, "learning_rate": 0.0001243189538043478, "loss": 0.725, "step": 2326 }, { "epoch": 0.3642767689417658, "grad_norm": 3.4792587757110596, "learning_rate": 0.0001243070652173913, "loss": 0.6923, "step": 2327 }, { "epoch": 0.36443331246086413, "grad_norm": 2.821716785430908, "learning_rate": 0.00012429517663043477, "loss": 0.7323, "step": 2328 }, { "epoch": 0.3645898559799624, "grad_norm": 2.288912057876587, "learning_rate": 0.00012428328804347825, "loss": 0.7916, "step": 2329 }, { "epoch": 0.36474639949906074, "grad_norm": 2.449446678161621, "learning_rate": 0.00012427139945652172, "loss": 0.8491, "step": 2330 }, { "epoch": 0.36490294301815906, "grad_norm": 1.6004130840301514, "learning_rate": 0.0001242595108695652, "loss": 0.5082, "step": 2331 }, { "epoch": 0.36505948653725734, "grad_norm": 
3.067465305328369, "learning_rate": 0.00012424762228260868, "loss": 1.3655, "step": 2332 }, { "epoch": 0.36521603005635567, "grad_norm": 4.45902156829834, "learning_rate": 0.00012423573369565216, "loss": 1.4774, "step": 2333 }, { "epoch": 0.365372573575454, "grad_norm": 2.775860548019409, "learning_rate": 0.00012422384510869564, "loss": 1.4401, "step": 2334 }, { "epoch": 0.36552911709455227, "grad_norm": 4.790217876434326, "learning_rate": 0.00012421195652173911, "loss": 0.9714, "step": 2335 }, { "epoch": 0.3656856606136506, "grad_norm": 2.987180233001709, "learning_rate": 0.0001242000679347826, "loss": 1.3099, "step": 2336 }, { "epoch": 0.3658422041327489, "grad_norm": 5.654740810394287, "learning_rate": 0.00012418817934782607, "loss": 1.5818, "step": 2337 }, { "epoch": 0.3659987476518472, "grad_norm": 3.417714834213257, "learning_rate": 0.00012417629076086955, "loss": 1.267, "step": 2338 }, { "epoch": 0.36615529117094553, "grad_norm": 1.774653673171997, "learning_rate": 0.00012416440217391303, "loss": 0.9698, "step": 2339 }, { "epoch": 0.36631183469004386, "grad_norm": 3.666904926300049, "learning_rate": 0.0001241525135869565, "loss": 1.0616, "step": 2340 }, { "epoch": 0.36646837820914213, "grad_norm": 4.576032638549805, "learning_rate": 0.00012414062499999998, "loss": 1.6319, "step": 2341 }, { "epoch": 0.36662492172824046, "grad_norm": 2.0509932041168213, "learning_rate": 0.00012412873641304346, "loss": 1.2315, "step": 2342 }, { "epoch": 0.36678146524733873, "grad_norm": 2.997654914855957, "learning_rate": 0.00012411684782608694, "loss": 1.4414, "step": 2343 }, { "epoch": 0.36693800876643706, "grad_norm": 1.8473763465881348, "learning_rate": 0.00012410495923913042, "loss": 1.1696, "step": 2344 }, { "epoch": 0.3670945522855354, "grad_norm": 3.300407648086548, "learning_rate": 0.0001240930706521739, "loss": 1.6252, "step": 2345 }, { "epoch": 0.36725109580463366, "grad_norm": 3.447380781173706, "learning_rate": 0.00012408118206521737, "loss": 1.3135, "step": 2346 
}, { "epoch": 0.367407639323732, "grad_norm": 1.7984750270843506, "learning_rate": 0.00012406929347826085, "loss": 0.8276, "step": 2347 }, { "epoch": 0.3675641828428303, "grad_norm": 1.6383579969406128, "learning_rate": 0.00012405740489130433, "loss": 0.7665, "step": 2348 }, { "epoch": 0.3677207263619286, "grad_norm": 2.8749639987945557, "learning_rate": 0.0001240455163043478, "loss": 1.396, "step": 2349 }, { "epoch": 0.3678772698810269, "grad_norm": 2.22799015045166, "learning_rate": 0.0001240336277173913, "loss": 0.9845, "step": 2350 }, { "epoch": 0.36803381340012525, "grad_norm": 0.612299919128418, "learning_rate": 0.00012402173913043476, "loss": 0.3236, "step": 2351 }, { "epoch": 0.3681903569192235, "grad_norm": 0.8682419061660767, "learning_rate": 0.00012400985054347827, "loss": 0.3551, "step": 2352 }, { "epoch": 0.36834690043832186, "grad_norm": 0.6294960379600525, "learning_rate": 0.00012399796195652172, "loss": 0.2651, "step": 2353 }, { "epoch": 0.3685034439574202, "grad_norm": 0.6257913112640381, "learning_rate": 0.0001239860733695652, "loss": 0.2584, "step": 2354 }, { "epoch": 0.36865998747651846, "grad_norm": 0.5992159247398376, "learning_rate": 0.00012397418478260868, "loss": 0.3332, "step": 2355 }, { "epoch": 0.3688165309956168, "grad_norm": 0.8407636284828186, "learning_rate": 0.00012396229619565216, "loss": 0.3838, "step": 2356 }, { "epoch": 0.3689730745147151, "grad_norm": 0.9869884252548218, "learning_rate": 0.00012395040760869563, "loss": 0.561, "step": 2357 }, { "epoch": 0.3691296180338134, "grad_norm": 0.9067991375923157, "learning_rate": 0.0001239385190217391, "loss": 0.3845, "step": 2358 }, { "epoch": 0.3692861615529117, "grad_norm": 0.9432851672172546, "learning_rate": 0.0001239266304347826, "loss": 0.6224, "step": 2359 }, { "epoch": 0.36944270507201, "grad_norm": 0.9739526510238647, "learning_rate": 0.00012391474184782607, "loss": 0.4506, "step": 2360 }, { "epoch": 0.3695992485911083, "grad_norm": 1.597746729850769, "learning_rate": 
0.00012390285326086955, "loss": 0.5953, "step": 2361 }, { "epoch": 0.36975579211020665, "grad_norm": 0.8575089573860168, "learning_rate": 0.00012389096467391302, "loss": 0.3847, "step": 2362 }, { "epoch": 0.3699123356293049, "grad_norm": 1.192870020866394, "learning_rate": 0.0001238790760869565, "loss": 0.5936, "step": 2363 }, { "epoch": 0.37006887914840325, "grad_norm": 1.21517813205719, "learning_rate": 0.00012386718749999998, "loss": 0.4195, "step": 2364 }, { "epoch": 0.3702254226675016, "grad_norm": 1.3836802244186401, "learning_rate": 0.00012385529891304346, "loss": 0.746, "step": 2365 }, { "epoch": 0.37038196618659985, "grad_norm": 1.2586456537246704, "learning_rate": 0.00012384341032608694, "loss": 0.394, "step": 2366 }, { "epoch": 0.3705385097056982, "grad_norm": 1.5035698413848877, "learning_rate": 0.00012383152173913042, "loss": 0.7355, "step": 2367 }, { "epoch": 0.3706950532247965, "grad_norm": 1.6647913455963135, "learning_rate": 0.0001238196331521739, "loss": 0.4864, "step": 2368 }, { "epoch": 0.3708515967438948, "grad_norm": 1.7913579940795898, "learning_rate": 0.00012380774456521737, "loss": 0.6226, "step": 2369 }, { "epoch": 0.3710081402629931, "grad_norm": 3.2139782905578613, "learning_rate": 0.00012379585597826085, "loss": 1.0397, "step": 2370 }, { "epoch": 0.37116468378209144, "grad_norm": 3.8848347663879395, "learning_rate": 0.00012378396739130436, "loss": 0.5075, "step": 2371 }, { "epoch": 0.3713212273011897, "grad_norm": 1.395669937133789, "learning_rate": 0.00012377207880434783, "loss": 0.8471, "step": 2372 }, { "epoch": 0.37147777082028804, "grad_norm": 2.357369899749756, "learning_rate": 0.0001237601902173913, "loss": 0.5904, "step": 2373 }, { "epoch": 0.3716343143393864, "grad_norm": 1.5921155214309692, "learning_rate": 0.0001237483016304348, "loss": 0.7092, "step": 2374 }, { "epoch": 0.37179085785848465, "grad_norm": 1.845360279083252, "learning_rate": 0.00012373641304347827, "loss": 0.7815, "step": 2375 }, { "epoch": 0.371947401377583, 
"grad_norm": 2.227464437484741, "learning_rate": 0.00012372452445652172, "loss": 0.6687, "step": 2376 }, { "epoch": 0.3721039448966813, "grad_norm": 2.277585029602051, "learning_rate": 0.0001237126358695652, "loss": 0.6758, "step": 2377 }, { "epoch": 0.3722604884157796, "grad_norm": 2.4786057472229004, "learning_rate": 0.00012370074728260868, "loss": 0.8588, "step": 2378 }, { "epoch": 0.3724170319348779, "grad_norm": 5.1083269119262695, "learning_rate": 0.00012368885869565215, "loss": 0.7378, "step": 2379 }, { "epoch": 0.3725735754539762, "grad_norm": 4.045231342315674, "learning_rate": 0.00012367697010869563, "loss": 0.9599, "step": 2380 }, { "epoch": 0.3727301189730745, "grad_norm": 1.9183619022369385, "learning_rate": 0.0001236650815217391, "loss": 0.9661, "step": 2381 }, { "epoch": 0.37288666249217284, "grad_norm": 2.9183361530303955, "learning_rate": 0.0001236531929347826, "loss": 0.9451, "step": 2382 }, { "epoch": 0.3730432060112711, "grad_norm": 2.932462692260742, "learning_rate": 0.00012364130434782607, "loss": 0.6064, "step": 2383 }, { "epoch": 0.37319974953036944, "grad_norm": 2.549577236175537, "learning_rate": 0.00012362941576086954, "loss": 0.8368, "step": 2384 }, { "epoch": 0.37335629304946777, "grad_norm": 4.700106143951416, "learning_rate": 0.00012361752717391302, "loss": 1.5522, "step": 2385 }, { "epoch": 0.37351283656856604, "grad_norm": 3.9243674278259277, "learning_rate": 0.0001236056385869565, "loss": 0.93, "step": 2386 }, { "epoch": 0.37366938008766437, "grad_norm": 4.2067036628723145, "learning_rate": 0.00012359374999999998, "loss": 1.2275, "step": 2387 }, { "epoch": 0.3738259236067627, "grad_norm": 6.562367916107178, "learning_rate": 0.00012358186141304346, "loss": 1.3238, "step": 2388 }, { "epoch": 0.373982467125861, "grad_norm": 3.3032047748565674, "learning_rate": 0.00012356997282608693, "loss": 1.5199, "step": 2389 }, { "epoch": 0.3741390106449593, "grad_norm": 1.7024391889572144, "learning_rate": 0.0001235580842391304, "loss": 0.8014, 
"step": 2390 }, { "epoch": 0.37429555416405763, "grad_norm": 2.9324328899383545, "learning_rate": 0.00012354619565217392, "loss": 0.9499, "step": 2391 }, { "epoch": 0.3744520976831559, "grad_norm": 7.310845851898193, "learning_rate": 0.0001235343070652174, "loss": 1.8158, "step": 2392 }, { "epoch": 0.37460864120225423, "grad_norm": 3.0126001834869385, "learning_rate": 0.00012352241847826087, "loss": 1.275, "step": 2393 }, { "epoch": 0.37476518472135256, "grad_norm": 2.559518814086914, "learning_rate": 0.00012351052989130435, "loss": 1.4076, "step": 2394 }, { "epoch": 0.37492172824045084, "grad_norm": 2.9501383304595947, "learning_rate": 0.00012349864130434783, "loss": 1.1298, "step": 2395 }, { "epoch": 0.37507827175954916, "grad_norm": 5.70507287979126, "learning_rate": 0.0001234867527173913, "loss": 1.4247, "step": 2396 }, { "epoch": 0.37523481527864744, "grad_norm": 1.8401626348495483, "learning_rate": 0.0001234748641304348, "loss": 0.6123, "step": 2397 }, { "epoch": 0.37539135879774577, "grad_norm": 2.7817304134368896, "learning_rate": 0.00012346297554347827, "loss": 0.5876, "step": 2398 }, { "epoch": 0.3755479023168441, "grad_norm": 4.052642345428467, "learning_rate": 0.00012345108695652172, "loss": 1.2844, "step": 2399 }, { "epoch": 0.37570444583594237, "grad_norm": 2.819882869720459, "learning_rate": 0.0001234391983695652, "loss": 1.2992, "step": 2400 }, { "epoch": 0.3758609893550407, "grad_norm": 0.7759575843811035, "learning_rate": 0.00012342730978260867, "loss": 0.368, "step": 2401 }, { "epoch": 0.376017532874139, "grad_norm": 0.6064957976341248, "learning_rate": 0.00012341542119565215, "loss": 0.3746, "step": 2402 }, { "epoch": 0.3761740763932373, "grad_norm": 0.8010700941085815, "learning_rate": 0.00012340353260869563, "loss": 0.425, "step": 2403 }, { "epoch": 0.37633061991233563, "grad_norm": 1.0714805126190186, "learning_rate": 0.0001233916440217391, "loss": 0.6438, "step": 2404 }, { "epoch": 0.37648716343143396, "grad_norm": 0.6898729205131531, 
"learning_rate": 0.00012337975543478259, "loss": 0.4089, "step": 2405 }, { "epoch": 0.37664370695053223, "grad_norm": 0.7778715491294861, "learning_rate": 0.00012336786684782606, "loss": 0.409, "step": 2406 }, { "epoch": 0.37680025046963056, "grad_norm": 0.4963880181312561, "learning_rate": 0.00012335597826086954, "loss": 0.346, "step": 2407 }, { "epoch": 0.3769567939887289, "grad_norm": 1.5007030963897705, "learning_rate": 0.00012334408967391302, "loss": 0.554, "step": 2408 }, { "epoch": 0.37711333750782716, "grad_norm": 0.8978845477104187, "learning_rate": 0.0001233322010869565, "loss": 0.3674, "step": 2409 }, { "epoch": 0.3772698810269255, "grad_norm": 1.119147539138794, "learning_rate": 0.00012332031249999998, "loss": 0.4839, "step": 2410 }, { "epoch": 0.3774264245460238, "grad_norm": 1.352634072303772, "learning_rate": 0.00012330842391304348, "loss": 0.5762, "step": 2411 }, { "epoch": 0.3775829680651221, "grad_norm": 1.225042462348938, "learning_rate": 0.00012329653532608696, "loss": 0.6294, "step": 2412 }, { "epoch": 0.3777395115842204, "grad_norm": 1.2806023359298706, "learning_rate": 0.00012328464673913044, "loss": 0.552, "step": 2413 }, { "epoch": 0.3778960551033187, "grad_norm": 1.5758956670761108, "learning_rate": 0.00012327275815217392, "loss": 0.4508, "step": 2414 }, { "epoch": 0.378052598622417, "grad_norm": 1.8809442520141602, "learning_rate": 0.0001232608695652174, "loss": 0.7332, "step": 2415 }, { "epoch": 0.37820914214151535, "grad_norm": 2.3350000381469727, "learning_rate": 0.00012324898097826087, "loss": 0.5772, "step": 2416 }, { "epoch": 0.3783656856606136, "grad_norm": 1.8165593147277832, "learning_rate": 0.00012323709239130435, "loss": 0.5526, "step": 2417 }, { "epoch": 0.37852222917971196, "grad_norm": 3.7924726009368896, "learning_rate": 0.00012322520380434783, "loss": 0.7088, "step": 2418 }, { "epoch": 0.3786787726988103, "grad_norm": 1.0744736194610596, "learning_rate": 0.0001232133152173913, "loss": 0.4955, "step": 2419 }, { "epoch": 
0.37883531621790856, "grad_norm": 1.8881947994232178, "learning_rate": 0.00012320142663043478, "loss": 0.6081, "step": 2420 }, { "epoch": 0.3789918597370069, "grad_norm": 2.848379611968994, "learning_rate": 0.00012318953804347826, "loss": 0.9173, "step": 2421 }, { "epoch": 0.3791484032561052, "grad_norm": 3.4534378051757812, "learning_rate": 0.00012317764945652171, "loss": 0.6725, "step": 2422 }, { "epoch": 0.3793049467752035, "grad_norm": 1.9793630838394165, "learning_rate": 0.0001231657608695652, "loss": 0.6615, "step": 2423 }, { "epoch": 0.3794614902943018, "grad_norm": 13.770583152770996, "learning_rate": 0.00012315387228260867, "loss": 1.2355, "step": 2424 }, { "epoch": 0.37961803381340015, "grad_norm": 1.95522141456604, "learning_rate": 0.00012314198369565215, "loss": 0.6873, "step": 2425 }, { "epoch": 0.3797745773324984, "grad_norm": 2.248499631881714, "learning_rate": 0.00012313009510869563, "loss": 0.852, "step": 2426 }, { "epoch": 0.37993112085159675, "grad_norm": 1.8435453176498413, "learning_rate": 0.0001231182065217391, "loss": 0.8591, "step": 2427 }, { "epoch": 0.3800876643706951, "grad_norm": 2.120399236679077, "learning_rate": 0.00012310631793478258, "loss": 0.5398, "step": 2428 }, { "epoch": 0.38024420788979335, "grad_norm": 2.137493848800659, "learning_rate": 0.00012309442934782606, "loss": 0.9252, "step": 2429 }, { "epoch": 0.3804007514088917, "grad_norm": 3.676638603210449, "learning_rate": 0.00012308254076086954, "loss": 1.1063, "step": 2430 }, { "epoch": 0.38055729492799, "grad_norm": 1.6085705757141113, "learning_rate": 0.00012307065217391304, "loss": 0.9882, "step": 2431 }, { "epoch": 0.3807138384470883, "grad_norm": 3.33835768699646, "learning_rate": 0.00012305876358695652, "loss": 0.9472, "step": 2432 }, { "epoch": 0.3808703819661866, "grad_norm": 2.625366449356079, "learning_rate": 0.000123046875, "loss": 1.2136, "step": 2433 }, { "epoch": 0.3810269254852849, "grad_norm": 2.1222760677337646, "learning_rate": 0.00012303498641304348, 
"loss": 0.6859, "step": 2434 }, { "epoch": 0.3811834690043832, "grad_norm": 2.2259886264801025, "learning_rate": 0.00012302309782608696, "loss": 0.9921, "step": 2435 }, { "epoch": 0.38134001252348154, "grad_norm": 2.3545053005218506, "learning_rate": 0.00012301120923913044, "loss": 0.7525, "step": 2436 }, { "epoch": 0.3814965560425798, "grad_norm": 3.660973072052002, "learning_rate": 0.0001229993206521739, "loss": 1.1253, "step": 2437 }, { "epoch": 0.38165309956167814, "grad_norm": 3.5081028938293457, "learning_rate": 0.0001229874320652174, "loss": 1.322, "step": 2438 }, { "epoch": 0.3818096430807765, "grad_norm": 1.8863067626953125, "learning_rate": 0.00012297554347826087, "loss": 1.3001, "step": 2439 }, { "epoch": 0.38196618659987475, "grad_norm": 3.4045238494873047, "learning_rate": 0.00012296365489130435, "loss": 1.4979, "step": 2440 }, { "epoch": 0.3821227301189731, "grad_norm": 2.693185806274414, "learning_rate": 0.00012295176630434783, "loss": 1.1538, "step": 2441 }, { "epoch": 0.3822792736380714, "grad_norm": 1.9664855003356934, "learning_rate": 0.0001229398777173913, "loss": 0.9917, "step": 2442 }, { "epoch": 0.3824358171571697, "grad_norm": 2.866515874862671, "learning_rate": 0.00012292798913043478, "loss": 1.3214, "step": 2443 }, { "epoch": 0.382592360676268, "grad_norm": 3.4112961292266846, "learning_rate": 0.00012291610054347826, "loss": 1.4174, "step": 2444 }, { "epoch": 0.38274890419536634, "grad_norm": 4.345746994018555, "learning_rate": 0.0001229042119565217, "loss": 1.3181, "step": 2445 }, { "epoch": 0.3829054477144646, "grad_norm": 2.491366386413574, "learning_rate": 0.0001228923233695652, "loss": 0.8938, "step": 2446 }, { "epoch": 0.38306199123356294, "grad_norm": 1.9747676849365234, "learning_rate": 0.00012288043478260867, "loss": 0.9568, "step": 2447 }, { "epoch": 0.38321853475266127, "grad_norm": 1.5641342401504517, "learning_rate": 0.00012286854619565215, "loss": 0.5093, "step": 2448 }, { "epoch": 0.38337507827175954, "grad_norm": 
1.2731655836105347, "learning_rate": 0.00012285665760869562, "loss": 0.9726, "step": 2449 }, { "epoch": 0.38353162179085787, "grad_norm": 2.872917652130127, "learning_rate": 0.0001228447690217391, "loss": 1.3768, "step": 2450 }, { "epoch": 0.38368816530995614, "grad_norm": 0.5826495289802551, "learning_rate": 0.0001228328804347826, "loss": 0.3789, "step": 2451 }, { "epoch": 0.38384470882905447, "grad_norm": 0.6612105965614319, "learning_rate": 0.00012282099184782609, "loss": 0.4403, "step": 2452 }, { "epoch": 0.3840012523481528, "grad_norm": 0.6991161704063416, "learning_rate": 0.00012280910326086956, "loss": 0.3127, "step": 2453 }, { "epoch": 0.3841577958672511, "grad_norm": 0.7863770723342896, "learning_rate": 0.00012279721467391304, "loss": 0.5124, "step": 2454 }, { "epoch": 0.3843143393863494, "grad_norm": 0.6990340352058411, "learning_rate": 0.00012278532608695652, "loss": 0.4057, "step": 2455 }, { "epoch": 0.38447088290544773, "grad_norm": 1.1530709266662598, "learning_rate": 0.0001227734375, "loss": 0.3901, "step": 2456 }, { "epoch": 0.384627426424546, "grad_norm": 0.9170289635658264, "learning_rate": 0.00012276154891304348, "loss": 0.5597, "step": 2457 }, { "epoch": 0.38478396994364433, "grad_norm": 1.2961112260818481, "learning_rate": 0.00012274966032608695, "loss": 0.4916, "step": 2458 }, { "epoch": 0.38494051346274266, "grad_norm": 0.9988864660263062, "learning_rate": 0.00012273777173913043, "loss": 0.3355, "step": 2459 }, { "epoch": 0.38509705698184094, "grad_norm": 2.393350124359131, "learning_rate": 0.0001227258831521739, "loss": 0.3432, "step": 2460 }, { "epoch": 0.38525360050093926, "grad_norm": 0.8604625463485718, "learning_rate": 0.0001227139945652174, "loss": 0.4269, "step": 2461 }, { "epoch": 0.3854101440200376, "grad_norm": 0.7254533171653748, "learning_rate": 0.00012270210597826087, "loss": 0.311, "step": 2462 }, { "epoch": 0.38556668753913587, "grad_norm": 1.0425138473510742, "learning_rate": 0.00012269021739130435, "loss": 0.5138, "step": 
2463 }, { "epoch": 0.3857232310582342, "grad_norm": 2.1206748485565186, "learning_rate": 0.00012267832880434782, "loss": 0.5375, "step": 2464 }, { "epoch": 0.3858797745773325, "grad_norm": 1.6082167625427246, "learning_rate": 0.0001226664402173913, "loss": 0.6145, "step": 2465 }, { "epoch": 0.3860363180964308, "grad_norm": 3.8264963626861572, "learning_rate": 0.00012265455163043478, "loss": 0.569, "step": 2466 }, { "epoch": 0.3861928616155291, "grad_norm": 1.3399851322174072, "learning_rate": 0.00012264266304347826, "loss": 0.3903, "step": 2467 }, { "epoch": 0.3863494051346274, "grad_norm": 1.9135864973068237, "learning_rate": 0.0001226307744565217, "loss": 0.4629, "step": 2468 }, { "epoch": 0.38650594865372573, "grad_norm": 1.8885048627853394, "learning_rate": 0.0001226188858695652, "loss": 0.7199, "step": 2469 }, { "epoch": 0.38666249217282406, "grad_norm": 1.8466732501983643, "learning_rate": 0.00012260699728260867, "loss": 0.4612, "step": 2470 }, { "epoch": 0.38681903569192233, "grad_norm": 2.2335102558135986, "learning_rate": 0.00012259510869565217, "loss": 0.7026, "step": 2471 }, { "epoch": 0.38697557921102066, "grad_norm": 3.756434679031372, "learning_rate": 0.00012258322010869565, "loss": 0.7283, "step": 2472 }, { "epoch": 0.387132122730119, "grad_norm": 2.1889426708221436, "learning_rate": 0.00012257133152173913, "loss": 0.4699, "step": 2473 }, { "epoch": 0.38728866624921726, "grad_norm": 2.1952478885650635, "learning_rate": 0.0001225594429347826, "loss": 0.7712, "step": 2474 }, { "epoch": 0.3874452097683156, "grad_norm": 2.616609811782837, "learning_rate": 0.00012254755434782608, "loss": 0.8864, "step": 2475 }, { "epoch": 0.3876017532874139, "grad_norm": 2.8751447200775146, "learning_rate": 0.00012253566576086956, "loss": 0.7913, "step": 2476 }, { "epoch": 0.3877582968065122, "grad_norm": 2.4153692722320557, "learning_rate": 0.00012252377717391304, "loss": 0.5289, "step": 2477 }, { "epoch": 0.3879148403256105, "grad_norm": 2.4725420475006104, 
"learning_rate": 0.00012251188858695652, "loss": 0.8711, "step": 2478 }, { "epoch": 0.38807138384470885, "grad_norm": 2.9182472229003906, "learning_rate": 0.0001225, "loss": 0.9106, "step": 2479 }, { "epoch": 0.3882279273638071, "grad_norm": 1.753714919090271, "learning_rate": 0.00012248811141304347, "loss": 0.5313, "step": 2480 }, { "epoch": 0.38838447088290545, "grad_norm": 1.7443362474441528, "learning_rate": 0.00012247622282608695, "loss": 0.8757, "step": 2481 }, { "epoch": 0.3885410144020038, "grad_norm": 1.2549642324447632, "learning_rate": 0.00012246433423913043, "loss": 0.6657, "step": 2482 }, { "epoch": 0.38869755792110205, "grad_norm": 3.3449065685272217, "learning_rate": 0.0001224524456521739, "loss": 1.0382, "step": 2483 }, { "epoch": 0.3888541014402004, "grad_norm": 4.623504638671875, "learning_rate": 0.0001224405570652174, "loss": 1.4126, "step": 2484 }, { "epoch": 0.3890106449592987, "grad_norm": 2.1473443508148193, "learning_rate": 0.00012242866847826086, "loss": 0.6982, "step": 2485 }, { "epoch": 0.389167188478397, "grad_norm": 4.044695854187012, "learning_rate": 0.00012241677989130434, "loss": 1.0686, "step": 2486 }, { "epoch": 0.3893237319974953, "grad_norm": 5.165140628814697, "learning_rate": 0.00012240489130434782, "loss": 1.1146, "step": 2487 }, { "epoch": 0.3894802755165936, "grad_norm": 6.540341854095459, "learning_rate": 0.0001223930027173913, "loss": 1.1135, "step": 2488 }, { "epoch": 0.3896368190356919, "grad_norm": 6.601261615753174, "learning_rate": 0.00012238111413043478, "loss": 1.6304, "step": 2489 }, { "epoch": 0.38979336255479025, "grad_norm": 4.507144927978516, "learning_rate": 0.00012236922554347826, "loss": 1.1657, "step": 2490 }, { "epoch": 0.3899499060738885, "grad_norm": 6.097291946411133, "learning_rate": 0.00012235733695652173, "loss": 1.0525, "step": 2491 }, { "epoch": 0.39010644959298685, "grad_norm": 4.097568511962891, "learning_rate": 0.0001223454483695652, "loss": 1.3202, "step": 2492 }, { "epoch": 0.3902629931120852, 
"grad_norm": 6.173952102661133, "learning_rate": 0.0001223335597826087, "loss": 1.6457, "step": 2493 }, { "epoch": 0.39041953663118345, "grad_norm": 3.106642007827759, "learning_rate": 0.00012232167119565217, "loss": 1.3672, "step": 2494 }, { "epoch": 0.3905760801502818, "grad_norm": 5.812266826629639, "learning_rate": 0.00012230978260869565, "loss": 2.3999, "step": 2495 }, { "epoch": 0.3907326236693801, "grad_norm": 2.194594144821167, "learning_rate": 0.00012229789402173912, "loss": 0.6825, "step": 2496 }, { "epoch": 0.3908891671884784, "grad_norm": 2.7776644229888916, "learning_rate": 0.0001222860054347826, "loss": 1.2833, "step": 2497 }, { "epoch": 0.3910457107075767, "grad_norm": 3.133209228515625, "learning_rate": 0.00012227411684782608, "loss": 0.9091, "step": 2498 }, { "epoch": 0.39120225422667504, "grad_norm": 2.526089906692505, "learning_rate": 0.00012226222826086956, "loss": 1.3552, "step": 2499 }, { "epoch": 0.3913587977457733, "grad_norm": 4.264857292175293, "learning_rate": 0.00012225033967391304, "loss": 1.4406, "step": 2500 }, { "epoch": 0.39151534126487164, "grad_norm": 0.5238121747970581, "learning_rate": 0.00012223845108695652, "loss": 0.3932, "step": 2501 }, { "epoch": 0.39167188478396997, "grad_norm": 0.5175689458847046, "learning_rate": 0.0001222265625, "loss": 0.27, "step": 2502 }, { "epoch": 0.39182842830306824, "grad_norm": 0.5337406992912292, "learning_rate": 0.00012221467391304347, "loss": 0.3479, "step": 2503 }, { "epoch": 0.3919849718221666, "grad_norm": 0.553479790687561, "learning_rate": 0.00012220278532608695, "loss": 0.3262, "step": 2504 }, { "epoch": 0.39214151534126485, "grad_norm": 0.6474884152412415, "learning_rate": 0.00012219089673913043, "loss": 0.2844, "step": 2505 }, { "epoch": 0.3922980588603632, "grad_norm": 0.6730402112007141, "learning_rate": 0.0001221790081521739, "loss": 0.3144, "step": 2506 }, { "epoch": 0.3924546023794615, "grad_norm": 0.8529598712921143, "learning_rate": 0.00012216711956521738, "loss": 0.3173, 
"step": 2507 }, { "epoch": 0.3926111458985598, "grad_norm": 0.7380754351615906, "learning_rate": 0.00012215523097826086, "loss": 0.3037, "step": 2508 }, { "epoch": 0.3927676894176581, "grad_norm": 5.02721643447876, "learning_rate": 0.00012214334239130434, "loss": 1.0857, "step": 2509 }, { "epoch": 0.39292423293675643, "grad_norm": 1.4800679683685303, "learning_rate": 0.00012213145380434782, "loss": 0.4454, "step": 2510 }, { "epoch": 0.3930807764558547, "grad_norm": 1.2033591270446777, "learning_rate": 0.0001221195652173913, "loss": 0.2079, "step": 2511 }, { "epoch": 0.39323731997495304, "grad_norm": 1.2518796920776367, "learning_rate": 0.00012210767663043478, "loss": 0.5229, "step": 2512 }, { "epoch": 0.39339386349405137, "grad_norm": 2.3093745708465576, "learning_rate": 0.00012209578804347825, "loss": 0.6401, "step": 2513 }, { "epoch": 0.39355040701314964, "grad_norm": 2.6228365898132324, "learning_rate": 0.00012208389945652173, "loss": 0.5423, "step": 2514 }, { "epoch": 0.39370695053224797, "grad_norm": 0.8076986074447632, "learning_rate": 0.0001220720108695652, "loss": 0.3375, "step": 2515 }, { "epoch": 0.3938634940513463, "grad_norm": 1.1801353693008423, "learning_rate": 0.00012206012228260867, "loss": 0.3932, "step": 2516 }, { "epoch": 0.39402003757044457, "grad_norm": 2.269956111907959, "learning_rate": 0.00012204823369565215, "loss": 0.5877, "step": 2517 }, { "epoch": 0.3941765810895429, "grad_norm": 1.7723361253738403, "learning_rate": 0.00012203634510869563, "loss": 0.5703, "step": 2518 }, { "epoch": 0.39433312460864123, "grad_norm": 2.1057846546173096, "learning_rate": 0.00012202445652173912, "loss": 0.7793, "step": 2519 }, { "epoch": 0.3944896681277395, "grad_norm": 1.666768193244934, "learning_rate": 0.0001220125679347826, "loss": 0.7967, "step": 2520 }, { "epoch": 0.39464621164683783, "grad_norm": 1.4529966115951538, "learning_rate": 0.00012200067934782608, "loss": 0.7456, "step": 2521 }, { "epoch": 0.3948027551659361, "grad_norm": 1.7698068618774414, 
"learning_rate": 0.00012198879076086956, "loss": 0.7509, "step": 2522 }, { "epoch": 0.39495929868503443, "grad_norm": 2.8326730728149414, "learning_rate": 0.00012197690217391303, "loss": 0.7761, "step": 2523 }, { "epoch": 0.39511584220413276, "grad_norm": 2.382026433944702, "learning_rate": 0.00012196501358695651, "loss": 0.7987, "step": 2524 }, { "epoch": 0.39527238572323103, "grad_norm": 2.5399117469787598, "learning_rate": 0.00012195312499999999, "loss": 1.0055, "step": 2525 }, { "epoch": 0.39542892924232936, "grad_norm": 3.11348819732666, "learning_rate": 0.00012194123641304347, "loss": 0.8862, "step": 2526 }, { "epoch": 0.3955854727614277, "grad_norm": 3.446614980697632, "learning_rate": 0.00012192934782608695, "loss": 1.231, "step": 2527 }, { "epoch": 0.39574201628052597, "grad_norm": 1.4411289691925049, "learning_rate": 0.00012191745923913043, "loss": 0.6865, "step": 2528 }, { "epoch": 0.3958985597996243, "grad_norm": 2.249289035797119, "learning_rate": 0.0001219055706521739, "loss": 0.4584, "step": 2529 }, { "epoch": 0.3960551033187226, "grad_norm": 2.0353381633758545, "learning_rate": 0.00012189368206521738, "loss": 0.8449, "step": 2530 }, { "epoch": 0.3962116468378209, "grad_norm": 1.637546181678772, "learning_rate": 0.00012188179347826086, "loss": 0.8009, "step": 2531 }, { "epoch": 0.3963681903569192, "grad_norm": 3.3161466121673584, "learning_rate": 0.00012186990489130434, "loss": 1.1366, "step": 2532 }, { "epoch": 0.39652473387601755, "grad_norm": 2.5834524631500244, "learning_rate": 0.00012185801630434782, "loss": 1.212, "step": 2533 }, { "epoch": 0.3966812773951158, "grad_norm": 2.350759983062744, "learning_rate": 0.0001218461277173913, "loss": 1.482, "step": 2534 }, { "epoch": 0.39683782091421416, "grad_norm": 2.5927116870880127, "learning_rate": 0.00012183423913043477, "loss": 0.8759, "step": 2535 }, { "epoch": 0.3969943644333125, "grad_norm": 2.0874369144439697, "learning_rate": 0.00012182235054347825, "loss": 1.4651, "step": 2536 }, { "epoch": 
0.39715090795241076, "grad_norm": 2.326216220855713, "learning_rate": 0.00012181046195652172, "loss": 0.8147, "step": 2537 }, { "epoch": 0.3973074514715091, "grad_norm": 2.25915789604187, "learning_rate": 0.0001217985733695652, "loss": 0.86, "step": 2538 }, { "epoch": 0.3974639949906074, "grad_norm": 3.76519513130188, "learning_rate": 0.00012178668478260869, "loss": 1.142, "step": 2539 }, { "epoch": 0.3976205385097057, "grad_norm": 2.842725992202759, "learning_rate": 0.00012177479619565216, "loss": 1.3185, "step": 2540 }, { "epoch": 0.397777082028804, "grad_norm": 3.1393585205078125, "learning_rate": 0.00012176290760869564, "loss": 1.3109, "step": 2541 }, { "epoch": 0.3979336255479023, "grad_norm": 5.868533134460449, "learning_rate": 0.00012175101902173912, "loss": 1.1667, "step": 2542 }, { "epoch": 0.3980901690670006, "grad_norm": 1.3143138885498047, "learning_rate": 0.0001217391304347826, "loss": 0.7761, "step": 2543 }, { "epoch": 0.39824671258609895, "grad_norm": 3.443429946899414, "learning_rate": 0.00012172724184782608, "loss": 2.0284, "step": 2544 }, { "epoch": 0.3984032561051972, "grad_norm": 2.7832374572753906, "learning_rate": 0.00012171535326086955, "loss": 1.6488, "step": 2545 }, { "epoch": 0.39855979962429555, "grad_norm": 1.946377158164978, "learning_rate": 0.00012170346467391303, "loss": 0.6628, "step": 2546 }, { "epoch": 0.3987163431433939, "grad_norm": 3.453458309173584, "learning_rate": 0.00012169157608695651, "loss": 1.9424, "step": 2547 }, { "epoch": 0.39887288666249215, "grad_norm": 3.7765722274780273, "learning_rate": 0.00012167968749999999, "loss": 1.0526, "step": 2548 }, { "epoch": 0.3990294301815905, "grad_norm": 4.131337642669678, "learning_rate": 0.00012166779891304347, "loss": 0.7567, "step": 2549 }, { "epoch": 0.3991859737006888, "grad_norm": 2.879791021347046, "learning_rate": 0.00012165591032608695, "loss": 1.3918, "step": 2550 }, { "epoch": 0.3993425172197871, "grad_norm": 0.6843430399894714, "learning_rate": 0.00012164402173913042, 
"loss": 0.3288, "step": 2551 }, { "epoch": 0.3994990607388854, "grad_norm": 0.737568199634552, "learning_rate": 0.0001216321331521739, "loss": 0.3987, "step": 2552 }, { "epoch": 0.39965560425798374, "grad_norm": 0.6193193197250366, "learning_rate": 0.00012162024456521738, "loss": 0.3534, "step": 2553 }, { "epoch": 0.399812147777082, "grad_norm": 1.2402939796447754, "learning_rate": 0.00012160835597826086, "loss": 0.294, "step": 2554 }, { "epoch": 0.39996869129618035, "grad_norm": 0.6182275414466858, "learning_rate": 0.00012159646739130434, "loss": 0.271, "step": 2555 }, { "epoch": 0.4001252348152787, "grad_norm": 0.8951207995414734, "learning_rate": 0.00012158457880434783, "loss": 0.3574, "step": 2556 }, { "epoch": 0.40028177833437695, "grad_norm": 1.1185402870178223, "learning_rate": 0.0001215726902173913, "loss": 0.3871, "step": 2557 }, { "epoch": 0.4004383218534753, "grad_norm": 1.0252203941345215, "learning_rate": 0.00012156080163043478, "loss": 0.5917, "step": 2558 }, { "epoch": 0.40059486537257355, "grad_norm": 1.268689751625061, "learning_rate": 0.00012154891304347826, "loss": 0.452, "step": 2559 }, { "epoch": 0.4007514088916719, "grad_norm": 1.040252685546875, "learning_rate": 0.00012153702445652173, "loss": 0.498, "step": 2560 }, { "epoch": 0.4009079524107702, "grad_norm": 0.9879304766654968, "learning_rate": 0.0001215251358695652, "loss": 0.4315, "step": 2561 }, { "epoch": 0.4010644959298685, "grad_norm": 1.1798256635665894, "learning_rate": 0.00012151324728260868, "loss": 0.3153, "step": 2562 }, { "epoch": 0.4012210394489668, "grad_norm": 1.1884937286376953, "learning_rate": 0.00012150135869565216, "loss": 0.5301, "step": 2563 }, { "epoch": 0.40137758296806514, "grad_norm": 1.0273900032043457, "learning_rate": 0.00012148947010869564, "loss": 0.5142, "step": 2564 }, { "epoch": 0.4015341264871634, "grad_norm": 1.395438313484192, "learning_rate": 0.00012147758152173912, "loss": 0.4282, "step": 2565 }, { "epoch": 0.40169067000626174, "grad_norm": 
1.2487528324127197, "learning_rate": 0.0001214656929347826, "loss": 0.5888, "step": 2566 }, { "epoch": 0.40184721352536007, "grad_norm": 1.2256935834884644, "learning_rate": 0.00012145380434782607, "loss": 0.4022, "step": 2567 }, { "epoch": 0.40200375704445834, "grad_norm": 1.293699860572815, "learning_rate": 0.00012144191576086955, "loss": 0.5641, "step": 2568 }, { "epoch": 0.40216030056355667, "grad_norm": 1.0810221433639526, "learning_rate": 0.00012143002717391303, "loss": 0.6616, "step": 2569 }, { "epoch": 0.402316844082655, "grad_norm": 1.433933973312378, "learning_rate": 0.00012141813858695651, "loss": 0.5374, "step": 2570 }, { "epoch": 0.4024733876017533, "grad_norm": 1.5107245445251465, "learning_rate": 0.00012140624999999999, "loss": 0.732, "step": 2571 }, { "epoch": 0.4026299311208516, "grad_norm": 1.348952293395996, "learning_rate": 0.00012139436141304346, "loss": 0.4406, "step": 2572 }, { "epoch": 0.40278647463994993, "grad_norm": 2.0091729164123535, "learning_rate": 0.00012138247282608694, "loss": 0.6531, "step": 2573 }, { "epoch": 0.4029430181590482, "grad_norm": 1.687111496925354, "learning_rate": 0.00012137058423913042, "loss": 0.6332, "step": 2574 }, { "epoch": 0.40309956167814653, "grad_norm": 1.6713591814041138, "learning_rate": 0.0001213586956521739, "loss": 0.5196, "step": 2575 }, { "epoch": 0.4032561051972448, "grad_norm": 2.353668689727783, "learning_rate": 0.00012134680706521739, "loss": 1.0662, "step": 2576 }, { "epoch": 0.40341264871634314, "grad_norm": 3.9432876110076904, "learning_rate": 0.00012133491847826087, "loss": 0.7233, "step": 2577 }, { "epoch": 0.40356919223544147, "grad_norm": 2.861628293991089, "learning_rate": 0.00012132302989130435, "loss": 0.9355, "step": 2578 }, { "epoch": 0.40372573575453974, "grad_norm": 5.780348300933838, "learning_rate": 0.00012131114130434783, "loss": 1.0241, "step": 2579 }, { "epoch": 0.40388227927363807, "grad_norm": 2.444028377532959, "learning_rate": 0.0001212992527173913, "loss": 1.0138, "step": 
2580 }, { "epoch": 0.4040388227927364, "grad_norm": 2.2366597652435303, "learning_rate": 0.00012128736413043478, "loss": 0.6735, "step": 2581 }, { "epoch": 0.40419536631183467, "grad_norm": 4.73460054397583, "learning_rate": 0.00012127547554347826, "loss": 1.2356, "step": 2582 }, { "epoch": 0.404351909830933, "grad_norm": 2.231412410736084, "learning_rate": 0.00012126358695652172, "loss": 1.0918, "step": 2583 }, { "epoch": 0.4045084533500313, "grad_norm": 4.1993536949157715, "learning_rate": 0.0001212516983695652, "loss": 1.2654, "step": 2584 }, { "epoch": 0.4046649968691296, "grad_norm": 3.39054536819458, "learning_rate": 0.00012123980978260868, "loss": 1.0354, "step": 2585 }, { "epoch": 0.40482154038822793, "grad_norm": 2.816228151321411, "learning_rate": 0.00012122792119565216, "loss": 1.4741, "step": 2586 }, { "epoch": 0.40497808390732626, "grad_norm": 1.602815866470337, "learning_rate": 0.00012121603260869564, "loss": 0.9331, "step": 2587 }, { "epoch": 0.40513462742642453, "grad_norm": 2.060817241668701, "learning_rate": 0.00012120414402173912, "loss": 1.0665, "step": 2588 }, { "epoch": 0.40529117094552286, "grad_norm": 2.5213370323181152, "learning_rate": 0.00012119225543478259, "loss": 0.8244, "step": 2589 }, { "epoch": 0.4054477144646212, "grad_norm": 2.4653825759887695, "learning_rate": 0.00012118036684782607, "loss": 0.9658, "step": 2590 }, { "epoch": 0.40560425798371946, "grad_norm": 2.9625437259674072, "learning_rate": 0.00012116847826086955, "loss": 1.6768, "step": 2591 }, { "epoch": 0.4057608015028178, "grad_norm": 2.807004690170288, "learning_rate": 0.00012115658967391303, "loss": 1.1953, "step": 2592 }, { "epoch": 0.40591734502191607, "grad_norm": 4.605734825134277, "learning_rate": 0.0001211447010869565, "loss": 1.4856, "step": 2593 }, { "epoch": 0.4060738885410144, "grad_norm": 3.169226884841919, "learning_rate": 0.00012113281249999998, "loss": 1.5128, "step": 2594 }, { "epoch": 0.4062304320601127, "grad_norm": 3.3522324562072754, "learning_rate": 
0.00012112092391304346, "loss": 1.1814, "step": 2595 }, { "epoch": 0.406386975579211, "grad_norm": 5.017669677734375, "learning_rate": 0.00012110903532608695, "loss": 0.9278, "step": 2596 }, { "epoch": 0.4065435190983093, "grad_norm": 2.167792797088623, "learning_rate": 0.00012109714673913043, "loss": 1.1176, "step": 2597 }, { "epoch": 0.40670006261740765, "grad_norm": 2.8446156978607178, "learning_rate": 0.00012108525815217391, "loss": 0.8985, "step": 2598 }, { "epoch": 0.4068566061365059, "grad_norm": 3.679098606109619, "learning_rate": 0.00012107336956521739, "loss": 0.7826, "step": 2599 }, { "epoch": 0.40701314965560426, "grad_norm": 1.8164395093917847, "learning_rate": 0.00012106148097826087, "loss": 1.0337, "step": 2600 }, { "epoch": 0.4071696931747026, "grad_norm": 0.6113756895065308, "learning_rate": 0.00012104959239130434, "loss": 0.3748, "step": 2601 }, { "epoch": 0.40732623669380086, "grad_norm": 0.5684760808944702, "learning_rate": 0.00012103770380434782, "loss": 0.3259, "step": 2602 }, { "epoch": 0.4074827802128992, "grad_norm": 0.7240707278251648, "learning_rate": 0.0001210258152173913, "loss": 0.3702, "step": 2603 }, { "epoch": 0.4076393237319975, "grad_norm": 0.9759418368339539, "learning_rate": 0.00012101392663043478, "loss": 0.4386, "step": 2604 }, { "epoch": 0.4077958672510958, "grad_norm": 0.6301271915435791, "learning_rate": 0.00012100203804347826, "loss": 0.4698, "step": 2605 }, { "epoch": 0.4079524107701941, "grad_norm": 0.6351445913314819, "learning_rate": 0.00012099014945652172, "loss": 0.2736, "step": 2606 }, { "epoch": 0.40810895428929245, "grad_norm": 1.0460169315338135, "learning_rate": 0.0001209782608695652, "loss": 0.3969, "step": 2607 }, { "epoch": 0.4082654978083907, "grad_norm": 1.3247261047363281, "learning_rate": 0.00012096637228260868, "loss": 0.3322, "step": 2608 }, { "epoch": 0.40842204132748905, "grad_norm": 0.9346051812171936, "learning_rate": 0.00012095448369565216, "loss": 0.3928, "step": 2609 }, { "epoch": 
0.4085785848465874, "grad_norm": 1.6921943426132202, "learning_rate": 0.00012094259510869563, "loss": 0.7017, "step": 2610 }, { "epoch": 0.40873512836568565, "grad_norm": 0.9847664833068848, "learning_rate": 0.00012093070652173911, "loss": 0.3451, "step": 2611 }, { "epoch": 0.408891671884784, "grad_norm": 1.568885326385498, "learning_rate": 0.00012091881793478259, "loss": 0.6285, "step": 2612 }, { "epoch": 0.40904821540388225, "grad_norm": 1.217980146408081, "learning_rate": 0.00012090692934782607, "loss": 0.3842, "step": 2613 }, { "epoch": 0.4092047589229806, "grad_norm": 3.8595974445343018, "learning_rate": 0.00012089504076086955, "loss": 1.0096, "step": 2614 }, { "epoch": 0.4093613024420789, "grad_norm": 1.4492768049240112, "learning_rate": 0.00012088315217391303, "loss": 0.4729, "step": 2615 }, { "epoch": 0.4095178459611772, "grad_norm": 2.061356544494629, "learning_rate": 0.00012087126358695652, "loss": 0.6378, "step": 2616 }, { "epoch": 0.4096743894802755, "grad_norm": 1.193534016609192, "learning_rate": 0.000120859375, "loss": 0.5822, "step": 2617 }, { "epoch": 0.40983093299937384, "grad_norm": 1.1382167339324951, "learning_rate": 0.00012084748641304347, "loss": 0.6386, "step": 2618 }, { "epoch": 0.4099874765184721, "grad_norm": 1.7614052295684814, "learning_rate": 0.00012083559782608695, "loss": 0.5932, "step": 2619 }, { "epoch": 0.41014402003757044, "grad_norm": 2.5790398120880127, "learning_rate": 0.00012082370923913043, "loss": 0.7836, "step": 2620 }, { "epoch": 0.4103005635566688, "grad_norm": 2.2906134128570557, "learning_rate": 0.00012081182065217391, "loss": 0.8864, "step": 2621 }, { "epoch": 0.41045710707576705, "grad_norm": 3.415750503540039, "learning_rate": 0.00012079993206521739, "loss": 0.9349, "step": 2622 }, { "epoch": 0.4106136505948654, "grad_norm": 2.072577476501465, "learning_rate": 0.00012078804347826086, "loss": 0.7049, "step": 2623 }, { "epoch": 0.4107701941139637, "grad_norm": 1.6630712747573853, "learning_rate": 
0.00012077615489130434, "loss": 0.8536, "step": 2624 }, { "epoch": 0.410926737633062, "grad_norm": 3.1487619876861572, "learning_rate": 0.00012076426630434782, "loss": 1.0806, "step": 2625 }, { "epoch": 0.4110832811521603, "grad_norm": 2.295105218887329, "learning_rate": 0.0001207523777173913, "loss": 1.0126, "step": 2626 }, { "epoch": 0.41123982467125864, "grad_norm": 4.992067337036133, "learning_rate": 0.00012074048913043478, "loss": 1.0068, "step": 2627 }, { "epoch": 0.4113963681903569, "grad_norm": 2.5883820056915283, "learning_rate": 0.00012072860054347825, "loss": 0.8223, "step": 2628 }, { "epoch": 0.41155291170945524, "grad_norm": 2.477476119995117, "learning_rate": 0.00012071671195652172, "loss": 0.8727, "step": 2629 }, { "epoch": 0.4117094552285535, "grad_norm": 3.4067130088806152, "learning_rate": 0.0001207048233695652, "loss": 1.2259, "step": 2630 }, { "epoch": 0.41186599874765184, "grad_norm": 2.0503129959106445, "learning_rate": 0.00012069293478260868, "loss": 0.6314, "step": 2631 }, { "epoch": 0.41202254226675017, "grad_norm": 2.2714431285858154, "learning_rate": 0.00012068104619565215, "loss": 1.0202, "step": 2632 }, { "epoch": 0.41217908578584844, "grad_norm": 1.9566336870193481, "learning_rate": 0.00012066915760869563, "loss": 0.4363, "step": 2633 }, { "epoch": 0.41233562930494677, "grad_norm": 1.7467445135116577, "learning_rate": 0.00012065726902173911, "loss": 0.7758, "step": 2634 }, { "epoch": 0.4124921728240451, "grad_norm": 2.631634473800659, "learning_rate": 0.00012064538043478259, "loss": 1.675, "step": 2635 }, { "epoch": 0.4126487163431434, "grad_norm": 5.259006500244141, "learning_rate": 0.00012063349184782608, "loss": 1.4144, "step": 2636 }, { "epoch": 0.4128052598622417, "grad_norm": 6.630587100982666, "learning_rate": 0.00012062160326086956, "loss": 1.506, "step": 2637 }, { "epoch": 0.41296180338134003, "grad_norm": 2.766071081161499, "learning_rate": 0.00012060971467391304, "loss": 1.4373, "step": 2638 }, { "epoch": 0.4131183469004383, 
"grad_norm": 4.5618815422058105, "learning_rate": 0.00012059782608695651, "loss": 1.7193, "step": 2639 }, { "epoch": 0.41327489041953663, "grad_norm": 4.681007385253906, "learning_rate": 0.00012058593749999999, "loss": 1.3042, "step": 2640 }, { "epoch": 0.41343143393863496, "grad_norm": 2.12680983543396, "learning_rate": 0.00012057404891304347, "loss": 1.1086, "step": 2641 }, { "epoch": 0.41358797745773324, "grad_norm": 2.4128050804138184, "learning_rate": 0.00012056216032608695, "loss": 1.0601, "step": 2642 }, { "epoch": 0.41374452097683156, "grad_norm": 2.255906820297241, "learning_rate": 0.00012055027173913043, "loss": 1.3766, "step": 2643 }, { "epoch": 0.4139010644959299, "grad_norm": 2.672701120376587, "learning_rate": 0.0001205383831521739, "loss": 1.4318, "step": 2644 }, { "epoch": 0.41405760801502817, "grad_norm": 2.505720853805542, "learning_rate": 0.00012052649456521738, "loss": 1.3499, "step": 2645 }, { "epoch": 0.4142141515341265, "grad_norm": 3.672961950302124, "learning_rate": 0.00012051460597826086, "loss": 1.328, "step": 2646 }, { "epoch": 0.41437069505322477, "grad_norm": 4.751410007476807, "learning_rate": 0.00012050271739130434, "loss": 1.4617, "step": 2647 }, { "epoch": 0.4145272385723231, "grad_norm": 4.49873161315918, "learning_rate": 0.00012049082880434782, "loss": 0.9532, "step": 2648 }, { "epoch": 0.4146837820914214, "grad_norm": 4.3581085205078125, "learning_rate": 0.0001204789402173913, "loss": 1.4864, "step": 2649 }, { "epoch": 0.4148403256105197, "grad_norm": 3.096511125564575, "learning_rate": 0.00012046705163043477, "loss": 2.0096, "step": 2650 }, { "epoch": 0.41499686912961803, "grad_norm": 0.7950087189674377, "learning_rate": 0.00012045516304347825, "loss": 0.3696, "step": 2651 }, { "epoch": 0.41515341264871636, "grad_norm": 0.6666126251220703, "learning_rate": 0.00012044327445652172, "loss": 0.3892, "step": 2652 }, { "epoch": 0.41530995616781463, "grad_norm": 0.94556725025177, "learning_rate": 0.0001204313858695652, "loss": 0.3702, 
"step": 2653 }, { "epoch": 0.41546649968691296, "grad_norm": 0.6939385533332825, "learning_rate": 0.00012041949728260867, "loss": 0.4631, "step": 2654 }, { "epoch": 0.4156230432060113, "grad_norm": 0.9601438641548157, "learning_rate": 0.00012040760869565215, "loss": 0.3624, "step": 2655 }, { "epoch": 0.41577958672510956, "grad_norm": 0.7301419377326965, "learning_rate": 0.00012039572010869564, "loss": 0.3518, "step": 2656 }, { "epoch": 0.4159361302442079, "grad_norm": 0.978477418422699, "learning_rate": 0.00012038383152173912, "loss": 0.3767, "step": 2657 }, { "epoch": 0.4160926737633062, "grad_norm": 0.9243035316467285, "learning_rate": 0.0001203719429347826, "loss": 0.3036, "step": 2658 }, { "epoch": 0.4162492172824045, "grad_norm": 0.5911157727241516, "learning_rate": 0.00012036005434782608, "loss": 0.3814, "step": 2659 }, { "epoch": 0.4164057608015028, "grad_norm": 1.5018486976623535, "learning_rate": 0.00012034816576086956, "loss": 0.5838, "step": 2660 }, { "epoch": 0.41656230432060115, "grad_norm": 1.1221776008605957, "learning_rate": 0.00012033627717391303, "loss": 0.5737, "step": 2661 }, { "epoch": 0.4167188478396994, "grad_norm": 1.4704062938690186, "learning_rate": 0.00012032438858695651, "loss": 0.4441, "step": 2662 }, { "epoch": 0.41687539135879775, "grad_norm": 4.719549179077148, "learning_rate": 0.00012031249999999999, "loss": 0.7044, "step": 2663 }, { "epoch": 0.4170319348778961, "grad_norm": 1.6576167345046997, "learning_rate": 0.00012030061141304347, "loss": 0.6543, "step": 2664 }, { "epoch": 0.41718847839699436, "grad_norm": 1.098601222038269, "learning_rate": 0.00012028872282608695, "loss": 0.4553, "step": 2665 }, { "epoch": 0.4173450219160927, "grad_norm": 1.7287403345108032, "learning_rate": 0.00012027683423913042, "loss": 0.6217, "step": 2666 }, { "epoch": 0.41750156543519096, "grad_norm": 1.3844611644744873, "learning_rate": 0.0001202649456521739, "loss": 0.5899, "step": 2667 }, { "epoch": 0.4176581089542893, "grad_norm": 1.5446525812149048, 
"learning_rate": 0.00012025305706521738, "loss": 0.7695, "step": 2668 }, { "epoch": 0.4178146524733876, "grad_norm": 1.2691655158996582, "learning_rate": 0.00012024116847826086, "loss": 0.5721, "step": 2669 }, { "epoch": 0.4179711959924859, "grad_norm": 1.3794989585876465, "learning_rate": 0.00012022927989130434, "loss": 0.6225, "step": 2670 }, { "epoch": 0.4181277395115842, "grad_norm": 1.9189690351486206, "learning_rate": 0.00012021739130434782, "loss": 0.7992, "step": 2671 }, { "epoch": 0.41828428303068255, "grad_norm": 1.653315544128418, "learning_rate": 0.0001202055027173913, "loss": 0.4368, "step": 2672 }, { "epoch": 0.4184408265497808, "grad_norm": 2.4601852893829346, "learning_rate": 0.00012019361413043479, "loss": 0.8308, "step": 2673 }, { "epoch": 0.41859737006887915, "grad_norm": 2.822730541229248, "learning_rate": 0.00012018172554347826, "loss": 0.9482, "step": 2674 }, { "epoch": 0.4187539135879775, "grad_norm": 2.0566039085388184, "learning_rate": 0.00012016983695652171, "loss": 0.756, "step": 2675 }, { "epoch": 0.41891045710707575, "grad_norm": 2.156198024749756, "learning_rate": 0.0001201579483695652, "loss": 0.991, "step": 2676 }, { "epoch": 0.4190670006261741, "grad_norm": 2.9669742584228516, "learning_rate": 0.00012014605978260868, "loss": 0.6109, "step": 2677 }, { "epoch": 0.4192235441452724, "grad_norm": 1.361220359802246, "learning_rate": 0.00012013417119565216, "loss": 0.6, "step": 2678 }, { "epoch": 0.4193800876643707, "grad_norm": 3.3304171562194824, "learning_rate": 0.00012012228260869564, "loss": 1.1111, "step": 2679 }, { "epoch": 0.419536631183469, "grad_norm": 3.8657286167144775, "learning_rate": 0.00012011039402173912, "loss": 0.8411, "step": 2680 }, { "epoch": 0.41969317470256734, "grad_norm": 2.1914920806884766, "learning_rate": 0.0001200985054347826, "loss": 0.681, "step": 2681 }, { "epoch": 0.4198497182216656, "grad_norm": 2.308708906173706, "learning_rate": 0.00012008661684782608, "loss": 0.8235, "step": 2682 }, { "epoch": 
0.42000626174076394, "grad_norm": 3.1811623573303223, "learning_rate": 0.00012007472826086955, "loss": 1.1622, "step": 2683 }, { "epoch": 0.4201628052598622, "grad_norm": 2.9301698207855225, "learning_rate": 0.00012006283967391303, "loss": 0.8668, "step": 2684 }, { "epoch": 0.42031934877896054, "grad_norm": 4.274133682250977, "learning_rate": 0.00012005095108695651, "loss": 1.6769, "step": 2685 }, { "epoch": 0.4204758922980589, "grad_norm": 3.471942663192749, "learning_rate": 0.00012003906249999999, "loss": 1.444, "step": 2686 }, { "epoch": 0.42063243581715715, "grad_norm": 5.533565998077393, "learning_rate": 0.00012002717391304347, "loss": 0.9382, "step": 2687 }, { "epoch": 0.4207889793362555, "grad_norm": 2.5464694499969482, "learning_rate": 0.00012001528532608694, "loss": 0.7821, "step": 2688 }, { "epoch": 0.4209455228553538, "grad_norm": 1.9431028366088867, "learning_rate": 0.00012000339673913042, "loss": 0.6826, "step": 2689 }, { "epoch": 0.4211020663744521, "grad_norm": 3.7855887413024902, "learning_rate": 0.0001199915081521739, "loss": 1.5877, "step": 2690 }, { "epoch": 0.4212586098935504, "grad_norm": 3.026482582092285, "learning_rate": 0.00011997961956521738, "loss": 1.7165, "step": 2691 }, { "epoch": 0.42141515341264874, "grad_norm": 2.518068552017212, "learning_rate": 0.00011996773097826086, "loss": 1.896, "step": 2692 }, { "epoch": 0.421571696931747, "grad_norm": 3.2349047660827637, "learning_rate": 0.00011995584239130435, "loss": 1.3788, "step": 2693 }, { "epoch": 0.42172824045084534, "grad_norm": 2.917187452316284, "learning_rate": 0.00011994395380434783, "loss": 1.8002, "step": 2694 }, { "epoch": 0.42188478396994367, "grad_norm": 3.086336851119995, "learning_rate": 0.0001199320652173913, "loss": 1.3163, "step": 2695 }, { "epoch": 0.42204132748904194, "grad_norm": 3.972723960876465, "learning_rate": 0.00011992017663043478, "loss": 1.0253, "step": 2696 }, { "epoch": 0.42219787100814027, "grad_norm": 1.9225069284439087, "learning_rate": 
0.00011990828804347826, "loss": 0.9225, "step": 2697 }, { "epoch": 0.4223544145272386, "grad_norm": 2.5532755851745605, "learning_rate": 0.00011989639945652173, "loss": 0.6075, "step": 2698 }, { "epoch": 0.42251095804633687, "grad_norm": 1.7455850839614868, "learning_rate": 0.0001198845108695652, "loss": 1.0703, "step": 2699 }, { "epoch": 0.4226675015654352, "grad_norm": 2.417510509490967, "learning_rate": 0.00011987262228260868, "loss": 1.3408, "step": 2700 }, { "epoch": 0.4228240450845335, "grad_norm": 0.7847635746002197, "learning_rate": 0.00011986073369565216, "loss": 0.3329, "step": 2701 }, { "epoch": 0.4229805886036318, "grad_norm": 0.9687214493751526, "learning_rate": 0.00011984884510869564, "loss": 0.4971, "step": 2702 }, { "epoch": 0.42313713212273013, "grad_norm": 0.47365134954452515, "learning_rate": 0.00011983695652173912, "loss": 0.237, "step": 2703 }, { "epoch": 0.4232936756418284, "grad_norm": 1.0728774070739746, "learning_rate": 0.0001198250679347826, "loss": 0.3388, "step": 2704 }, { "epoch": 0.42345021916092673, "grad_norm": 0.5943195223808289, "learning_rate": 0.00011981317934782607, "loss": 0.3212, "step": 2705 }, { "epoch": 0.42360676268002506, "grad_norm": 0.7538720965385437, "learning_rate": 0.00011980129076086955, "loss": 0.398, "step": 2706 }, { "epoch": 0.42376330619912334, "grad_norm": 0.8637356758117676, "learning_rate": 0.00011978940217391303, "loss": 0.3741, "step": 2707 }, { "epoch": 0.42391984971822166, "grad_norm": 1.0518661737442017, "learning_rate": 0.00011977751358695651, "loss": 0.3931, "step": 2708 }, { "epoch": 0.42407639323732, "grad_norm": 1.0089255571365356, "learning_rate": 0.00011976562499999999, "loss": 0.3396, "step": 2709 }, { "epoch": 0.42423293675641827, "grad_norm": 1.1645863056182861, "learning_rate": 0.00011975373641304346, "loss": 0.3591, "step": 2710 }, { "epoch": 0.4243894802755166, "grad_norm": 0.7552050948143005, "learning_rate": 0.00011974184782608694, "loss": 0.3204, "step": 2711 }, { "epoch": 
0.4245460237946149, "grad_norm": 1.758046269416809, "learning_rate": 0.00011972995923913042, "loss": 0.5245, "step": 2712 }, { "epoch": 0.4247025673137132, "grad_norm": 6.559011459350586, "learning_rate": 0.00011971807065217391, "loss": 0.4664, "step": 2713 }, { "epoch": 0.4248591108328115, "grad_norm": 0.9603021144866943, "learning_rate": 0.00011970618206521739, "loss": 0.397, "step": 2714 }, { "epoch": 0.42501565435190986, "grad_norm": 1.1370185613632202, "learning_rate": 0.00011969429347826087, "loss": 0.5429, "step": 2715 }, { "epoch": 0.42517219787100813, "grad_norm": 1.628521203994751, "learning_rate": 0.00011968240489130435, "loss": 0.4801, "step": 2716 }, { "epoch": 0.42532874139010646, "grad_norm": 1.0861529111862183, "learning_rate": 0.00011967051630434782, "loss": 0.4401, "step": 2717 }, { "epoch": 0.4254852849092048, "grad_norm": 2.422715902328491, "learning_rate": 0.0001196586277173913, "loss": 0.4817, "step": 2718 }, { "epoch": 0.42564182842830306, "grad_norm": 1.1873364448547363, "learning_rate": 0.00011964673913043478, "loss": 0.5504, "step": 2719 }, { "epoch": 0.4257983719474014, "grad_norm": 1.8809908628463745, "learning_rate": 0.00011963485054347826, "loss": 0.4683, "step": 2720 }, { "epoch": 0.42595491546649966, "grad_norm": 1.4986248016357422, "learning_rate": 0.00011962296195652172, "loss": 0.5326, "step": 2721 }, { "epoch": 0.426111458985598, "grad_norm": 1.357971429824829, "learning_rate": 0.0001196110733695652, "loss": 0.5892, "step": 2722 }, { "epoch": 0.4262680025046963, "grad_norm": 2.1278610229492188, "learning_rate": 0.00011959918478260868, "loss": 0.6176, "step": 2723 }, { "epoch": 0.4264245460237946, "grad_norm": 2.1388776302337646, "learning_rate": 0.00011958729619565216, "loss": 0.665, "step": 2724 }, { "epoch": 0.4265810895428929, "grad_norm": 2.1311681270599365, "learning_rate": 0.00011957540760869564, "loss": 1.0884, "step": 2725 }, { "epoch": 0.42673763306199125, "grad_norm": 1.3920578956604004, "learning_rate": 
0.00011956351902173911, "loss": 0.8222, "step": 2726 }, { "epoch": 0.4268941765810895, "grad_norm": 1.7735397815704346, "learning_rate": 0.00011955163043478259, "loss": 0.4428, "step": 2727 }, { "epoch": 0.42705072010018785, "grad_norm": 2.0223824977874756, "learning_rate": 0.00011953974184782607, "loss": 0.5838, "step": 2728 }, { "epoch": 0.4272072636192862, "grad_norm": 1.4602162837982178, "learning_rate": 0.00011952785326086955, "loss": 0.8844, "step": 2729 }, { "epoch": 0.42736380713838445, "grad_norm": 2.6950294971466064, "learning_rate": 0.00011951596467391303, "loss": 1.3236, "step": 2730 }, { "epoch": 0.4275203506574828, "grad_norm": 1.753066062927246, "learning_rate": 0.0001195040760869565, "loss": 1.2039, "step": 2731 }, { "epoch": 0.4276768941765811, "grad_norm": 2.1846113204956055, "learning_rate": 0.00011949218749999998, "loss": 0.8958, "step": 2732 }, { "epoch": 0.4278334376956794, "grad_norm": 1.934695839881897, "learning_rate": 0.00011948029891304347, "loss": 1.359, "step": 2733 }, { "epoch": 0.4279899812147777, "grad_norm": 4.934608459472656, "learning_rate": 0.00011946841032608695, "loss": 1.2165, "step": 2734 }, { "epoch": 0.42814652473387604, "grad_norm": 2.8699393272399902, "learning_rate": 0.00011945652173913043, "loss": 0.854, "step": 2735 }, { "epoch": 0.4283030682529743, "grad_norm": 1.8003040552139282, "learning_rate": 0.00011944463315217391, "loss": 0.8562, "step": 2736 }, { "epoch": 0.42845961177207265, "grad_norm": 2.0494468212127686, "learning_rate": 0.00011943274456521739, "loss": 0.9111, "step": 2737 }, { "epoch": 0.4286161552911709, "grad_norm": 3.445319175720215, "learning_rate": 0.00011942085597826087, "loss": 1.5171, "step": 2738 }, { "epoch": 0.42877269881026925, "grad_norm": 3.1267483234405518, "learning_rate": 0.00011940896739130434, "loss": 1.2198, "step": 2739 }, { "epoch": 0.4289292423293676, "grad_norm": 2.6813037395477295, "learning_rate": 0.00011939707880434782, "loss": 1.1437, "step": 2740 }, { "epoch": 
0.42908578584846585, "grad_norm": 2.377642869949341, "learning_rate": 0.0001193851902173913, "loss": 1.6329, "step": 2741 }, { "epoch": 0.4292423293675642, "grad_norm": 3.761469841003418, "learning_rate": 0.00011937330163043478, "loss": 0.8687, "step": 2742 }, { "epoch": 0.4293988728866625, "grad_norm": 3.4511895179748535, "learning_rate": 0.00011936141304347826, "loss": 1.6552, "step": 2743 }, { "epoch": 0.4295554164057608, "grad_norm": 2.277958631515503, "learning_rate": 0.00011934952445652172, "loss": 0.9624, "step": 2744 }, { "epoch": 0.4297119599248591, "grad_norm": 2.9129812717437744, "learning_rate": 0.0001193376358695652, "loss": 1.6101, "step": 2745 }, { "epoch": 0.42986850344395744, "grad_norm": 3.3123767375946045, "learning_rate": 0.00011932574728260868, "loss": 1.4987, "step": 2746 }, { "epoch": 0.4300250469630557, "grad_norm": 1.2242462635040283, "learning_rate": 0.00011931385869565216, "loss": 0.6082, "step": 2747 }, { "epoch": 0.43018159048215404, "grad_norm": 3.4506800174713135, "learning_rate": 0.00011930197010869563, "loss": 0.7005, "step": 2748 }, { "epoch": 0.43033813400125237, "grad_norm": 3.818756341934204, "learning_rate": 0.00011929008152173911, "loss": 0.8468, "step": 2749 }, { "epoch": 0.43049467752035064, "grad_norm": 3.431147336959839, "learning_rate": 0.00011927819293478259, "loss": 1.0886, "step": 2750 }, { "epoch": 0.430651221039449, "grad_norm": 1.0269166231155396, "learning_rate": 0.00011926630434782607, "loss": 0.405, "step": 2751 }, { "epoch": 0.4308077645585473, "grad_norm": 0.8605489134788513, "learning_rate": 0.00011925441576086955, "loss": 0.453, "step": 2752 }, { "epoch": 0.4309643080776456, "grad_norm": 0.5959200859069824, "learning_rate": 0.00011924252717391304, "loss": 0.3127, "step": 2753 }, { "epoch": 0.4311208515967439, "grad_norm": 0.8196786046028137, "learning_rate": 0.00011923063858695652, "loss": 0.2571, "step": 2754 }, { "epoch": 0.4312773951158422, "grad_norm": 0.9281113743782043, "learning_rate": 0.00011921875, 
"loss": 0.3978, "step": 2755 }, { "epoch": 0.4314339386349405, "grad_norm": 0.49586185812950134, "learning_rate": 0.00011920686141304347, "loss": 0.2616, "step": 2756 }, { "epoch": 0.43159048215403883, "grad_norm": 1.2268654108047485, "learning_rate": 0.00011919497282608695, "loss": 0.5271, "step": 2757 }, { "epoch": 0.4317470256731371, "grad_norm": 0.9317075610160828, "learning_rate": 0.00011918308423913043, "loss": 0.4624, "step": 2758 }, { "epoch": 0.43190356919223544, "grad_norm": 0.5854963064193726, "learning_rate": 0.00011917119565217391, "loss": 0.2833, "step": 2759 }, { "epoch": 0.43206011271133377, "grad_norm": 1.3253191709518433, "learning_rate": 0.00011915930706521738, "loss": 0.5647, "step": 2760 }, { "epoch": 0.43221665623043204, "grad_norm": 0.8859946131706238, "learning_rate": 0.00011914741847826086, "loss": 0.4341, "step": 2761 }, { "epoch": 0.43237319974953037, "grad_norm": 1.0950340032577515, "learning_rate": 0.00011913552989130434, "loss": 0.4025, "step": 2762 }, { "epoch": 0.4325297432686287, "grad_norm": 1.513818383216858, "learning_rate": 0.00011912364130434782, "loss": 0.3693, "step": 2763 }, { "epoch": 0.43268628678772697, "grad_norm": 2.243410348892212, "learning_rate": 0.0001191117527173913, "loss": 0.7419, "step": 2764 }, { "epoch": 0.4328428303068253, "grad_norm": 1.5918797254562378, "learning_rate": 0.00011909986413043478, "loss": 0.4261, "step": 2765 }, { "epoch": 0.43299937382592363, "grad_norm": 2.3063459396362305, "learning_rate": 0.00011908797554347825, "loss": 0.7233, "step": 2766 }, { "epoch": 0.4331559173450219, "grad_norm": 1.1621209383010864, "learning_rate": 0.00011907608695652172, "loss": 0.5908, "step": 2767 }, { "epoch": 0.43331246086412023, "grad_norm": 1.2831814289093018, "learning_rate": 0.0001190641983695652, "loss": 0.4806, "step": 2768 }, { "epoch": 0.43346900438321856, "grad_norm": 2.7145683765411377, "learning_rate": 0.00011905230978260867, "loss": 0.9149, "step": 2769 }, { "epoch": 0.43362554790231683, 
"grad_norm": 2.2560527324676514, "learning_rate": 0.00011904042119565215, "loss": 0.5587, "step": 2770 }, { "epoch": 0.43378209142141516, "grad_norm": 3.3750569820404053, "learning_rate": 0.00011902853260869563, "loss": 1.1647, "step": 2771 }, { "epoch": 0.4339386349405135, "grad_norm": 1.2521886825561523, "learning_rate": 0.00011901664402173911, "loss": 0.419, "step": 2772 }, { "epoch": 0.43409517845961176, "grad_norm": 1.9437496662139893, "learning_rate": 0.0001190047554347826, "loss": 0.7214, "step": 2773 }, { "epoch": 0.4342517219787101, "grad_norm": 1.3029905557632446, "learning_rate": 0.00011899286684782608, "loss": 0.7093, "step": 2774 }, { "epoch": 0.43440826549780837, "grad_norm": 1.4327139854431152, "learning_rate": 0.00011898097826086956, "loss": 0.5877, "step": 2775 }, { "epoch": 0.4345648090169067, "grad_norm": 1.7973238229751587, "learning_rate": 0.00011896908967391304, "loss": 0.616, "step": 2776 }, { "epoch": 0.434721352536005, "grad_norm": 2.801872491836548, "learning_rate": 0.00011895720108695651, "loss": 0.5517, "step": 2777 }, { "epoch": 0.4348778960551033, "grad_norm": 2.906214952468872, "learning_rate": 0.00011894531249999999, "loss": 0.8854, "step": 2778 }, { "epoch": 0.4350344395742016, "grad_norm": 1.7814884185791016, "learning_rate": 0.00011893342391304347, "loss": 0.5098, "step": 2779 }, { "epoch": 0.43519098309329995, "grad_norm": 2.5462357997894287, "learning_rate": 0.00011892153532608695, "loss": 0.6114, "step": 2780 }, { "epoch": 0.43534752661239823, "grad_norm": 2.1136202812194824, "learning_rate": 0.00011890964673913043, "loss": 0.9147, "step": 2781 }, { "epoch": 0.43550407013149656, "grad_norm": 2.5413708686828613, "learning_rate": 0.0001188977581521739, "loss": 0.7922, "step": 2782 }, { "epoch": 0.4356606136505949, "grad_norm": 2.5781848430633545, "learning_rate": 0.00011888586956521738, "loss": 0.8197, "step": 2783 }, { "epoch": 0.43581715716969316, "grad_norm": 4.075167179107666, "learning_rate": 0.00011887398097826086, "loss": 
1.4182, "step": 2784 }, { "epoch": 0.4359737006887915, "grad_norm": 1.622704029083252, "learning_rate": 0.00011886209239130434, "loss": 0.7403, "step": 2785 }, { "epoch": 0.4361302442078898, "grad_norm": 2.0153493881225586, "learning_rate": 0.00011885020380434782, "loss": 0.8498, "step": 2786 }, { "epoch": 0.4362867877269881, "grad_norm": 2.751173734664917, "learning_rate": 0.0001188383152173913, "loss": 1.1021, "step": 2787 }, { "epoch": 0.4364433312460864, "grad_norm": 6.307168960571289, "learning_rate": 0.00011882642663043477, "loss": 1.356, "step": 2788 }, { "epoch": 0.43659987476518475, "grad_norm": 2.068727970123291, "learning_rate": 0.00011881453804347825, "loss": 1.6632, "step": 2789 }, { "epoch": 0.436756418284283, "grad_norm": 2.2961113452911377, "learning_rate": 0.00011880264945652172, "loss": 1.3074, "step": 2790 }, { "epoch": 0.43691296180338135, "grad_norm": 2.958080291748047, "learning_rate": 0.0001187907608695652, "loss": 0.9778, "step": 2791 }, { "epoch": 0.4370695053224796, "grad_norm": 1.6572376489639282, "learning_rate": 0.00011877887228260867, "loss": 1.0408, "step": 2792 }, { "epoch": 0.43722604884157795, "grad_norm": 3.4145641326904297, "learning_rate": 0.00011876698369565216, "loss": 1.6186, "step": 2793 }, { "epoch": 0.4373825923606763, "grad_norm": 3.1067819595336914, "learning_rate": 0.00011875509510869564, "loss": 1.342, "step": 2794 }, { "epoch": 0.43753913587977455, "grad_norm": 1.9902410507202148, "learning_rate": 0.00011874320652173912, "loss": 1.1257, "step": 2795 }, { "epoch": 0.4376956793988729, "grad_norm": 3.058708906173706, "learning_rate": 0.0001187313179347826, "loss": 0.6668, "step": 2796 }, { "epoch": 0.4378522229179712, "grad_norm": 1.706297516822815, "learning_rate": 0.00011871942934782608, "loss": 0.6709, "step": 2797 }, { "epoch": 0.4380087664370695, "grad_norm": 1.91255784034729, "learning_rate": 0.00011870754076086955, "loss": 0.6171, "step": 2798 }, { "epoch": 0.4381653099561678, "grad_norm": 3.7008438110351562, 
"learning_rate": 0.00011869565217391303, "loss": 1.4825, "step": 2799 }, { "epoch": 0.43832185347526614, "grad_norm": 2.958805561065674, "learning_rate": 0.00011868376358695651, "loss": 1.1179, "step": 2800 }, { "epoch": 0.4384783969943644, "grad_norm": 1.430053949356079, "learning_rate": 0.00011867187499999999, "loss": 0.5493, "step": 2801 }, { "epoch": 0.43863494051346275, "grad_norm": 0.5838083624839783, "learning_rate": 0.00011865998641304347, "loss": 0.2323, "step": 2802 }, { "epoch": 0.4387914840325611, "grad_norm": 0.9111196994781494, "learning_rate": 0.00011864809782608695, "loss": 0.3151, "step": 2803 }, { "epoch": 0.43894802755165935, "grad_norm": 4.047616958618164, "learning_rate": 0.00011863620923913042, "loss": 0.9446, "step": 2804 }, { "epoch": 0.4391045710707577, "grad_norm": 0.7957575917243958, "learning_rate": 0.0001186243206521739, "loss": 0.3586, "step": 2805 }, { "epoch": 0.439261114589856, "grad_norm": 0.6905149221420288, "learning_rate": 0.00011861243206521738, "loss": 0.435, "step": 2806 }, { "epoch": 0.4394176581089543, "grad_norm": 1.2967065572738647, "learning_rate": 0.00011860054347826086, "loss": 0.3442, "step": 2807 }, { "epoch": 0.4395742016280526, "grad_norm": 0.8736449480056763, "learning_rate": 0.00011858865489130434, "loss": 0.4894, "step": 2808 }, { "epoch": 0.4397307451471509, "grad_norm": 0.8889498114585876, "learning_rate": 0.00011857676630434781, "loss": 0.3746, "step": 2809 }, { "epoch": 0.4398872886662492, "grad_norm": 0.9119848608970642, "learning_rate": 0.0001185648777173913, "loss": 0.3885, "step": 2810 }, { "epoch": 0.44004383218534754, "grad_norm": 1.04476797580719, "learning_rate": 0.00011855298913043478, "loss": 0.3452, "step": 2811 }, { "epoch": 0.4402003757044458, "grad_norm": 1.2246156930923462, "learning_rate": 0.00011854110054347826, "loss": 0.4843, "step": 2812 }, { "epoch": 0.44035691922354414, "grad_norm": 1.9814249277114868, "learning_rate": 0.00011852921195652173, "loss": 0.5404, "step": 2813 }, { "epoch": 
0.44051346274264247, "grad_norm": 2.886953592300415, "learning_rate": 0.0001185173233695652, "loss": 0.7155, "step": 2814 }, { "epoch": 0.44067000626174074, "grad_norm": 1.0956217050552368, "learning_rate": 0.00011850543478260868, "loss": 0.4527, "step": 2815 }, { "epoch": 0.44082654978083907, "grad_norm": 1.7574673891067505, "learning_rate": 0.00011849354619565216, "loss": 0.5307, "step": 2816 }, { "epoch": 0.4409830932999374, "grad_norm": 3.0308120250701904, "learning_rate": 0.00011848165760869564, "loss": 0.7692, "step": 2817 }, { "epoch": 0.4411396368190357, "grad_norm": 3.4085443019866943, "learning_rate": 0.00011846976902173912, "loss": 0.9086, "step": 2818 }, { "epoch": 0.441296180338134, "grad_norm": 1.5825872421264648, "learning_rate": 0.0001184578804347826, "loss": 0.4877, "step": 2819 }, { "epoch": 0.44145272385723233, "grad_norm": 2.1300570964813232, "learning_rate": 0.00011844599184782607, "loss": 0.6828, "step": 2820 }, { "epoch": 0.4416092673763306, "grad_norm": 2.2025997638702393, "learning_rate": 0.00011843410326086955, "loss": 0.4371, "step": 2821 }, { "epoch": 0.44176581089542893, "grad_norm": 3.5614941120147705, "learning_rate": 0.00011842221467391303, "loss": 0.5471, "step": 2822 }, { "epoch": 0.44192235441452726, "grad_norm": 1.4089703559875488, "learning_rate": 0.00011841032608695651, "loss": 0.5467, "step": 2823 }, { "epoch": 0.44207889793362554, "grad_norm": 4.0391998291015625, "learning_rate": 0.00011839843749999999, "loss": 0.9549, "step": 2824 }, { "epoch": 0.44223544145272387, "grad_norm": 1.540336012840271, "learning_rate": 0.00011838654891304347, "loss": 0.7522, "step": 2825 }, { "epoch": 0.4423919849718222, "grad_norm": 1.7382581233978271, "learning_rate": 0.00011837466032608694, "loss": 0.6398, "step": 2826 }, { "epoch": 0.44254852849092047, "grad_norm": 2.5779566764831543, "learning_rate": 0.00011836277173913042, "loss": 0.5323, "step": 2827 }, { "epoch": 0.4427050720100188, "grad_norm": 1.7034765481948853, "learning_rate": 
0.0001183508831521739, "loss": 0.5453, "step": 2828 }, { "epoch": 0.44286161552911707, "grad_norm": 2.0922088623046875, "learning_rate": 0.00011833899456521738, "loss": 0.6501, "step": 2829 }, { "epoch": 0.4430181590482154, "grad_norm": 3.5052828788757324, "learning_rate": 0.00011832710597826087, "loss": 0.8618, "step": 2830 }, { "epoch": 0.4431747025673137, "grad_norm": 2.7906999588012695, "learning_rate": 0.00011831521739130435, "loss": 1.6258, "step": 2831 }, { "epoch": 0.443331246086412, "grad_norm": 2.5461206436157227, "learning_rate": 0.00011830332880434783, "loss": 1.274, "step": 2832 }, { "epoch": 0.44348778960551033, "grad_norm": 2.050483465194702, "learning_rate": 0.0001182914402173913, "loss": 0.497, "step": 2833 }, { "epoch": 0.44364433312460866, "grad_norm": 4.103992462158203, "learning_rate": 0.00011827955163043478, "loss": 1.3092, "step": 2834 }, { "epoch": 0.44380087664370693, "grad_norm": 3.229093551635742, "learning_rate": 0.00011826766304347826, "loss": 0.8887, "step": 2835 }, { "epoch": 0.44395742016280526, "grad_norm": 2.4672348499298096, "learning_rate": 0.00011825577445652172, "loss": 1.166, "step": 2836 }, { "epoch": 0.4441139636819036, "grad_norm": 2.68979811668396, "learning_rate": 0.0001182438858695652, "loss": 1.3912, "step": 2837 }, { "epoch": 0.44427050720100186, "grad_norm": 1.4644300937652588, "learning_rate": 0.00011823199728260868, "loss": 0.9196, "step": 2838 }, { "epoch": 0.4444270507201002, "grad_norm": 4.532619476318359, "learning_rate": 0.00011822010869565216, "loss": 1.0204, "step": 2839 }, { "epoch": 0.4445835942391985, "grad_norm": 2.133330821990967, "learning_rate": 0.00011820822010869564, "loss": 1.6006, "step": 2840 }, { "epoch": 0.4447401377582968, "grad_norm": 3.1727030277252197, "learning_rate": 0.00011819633152173912, "loss": 0.9629, "step": 2841 }, { "epoch": 0.4448966812773951, "grad_norm": 3.0505363941192627, "learning_rate": 0.0001181844429347826, "loss": 0.8964, "step": 2842 }, { "epoch": 0.44505322479649345, 
"grad_norm": 4.719102382659912, "learning_rate": 0.00011817255434782607, "loss": 1.3369, "step": 2843 }, { "epoch": 0.4452097683155917, "grad_norm": 2.300682783126831, "learning_rate": 0.00011816066576086955, "loss": 1.0759, "step": 2844 }, { "epoch": 0.44536631183469005, "grad_norm": 4.02455472946167, "learning_rate": 0.00011814877717391303, "loss": 1.2652, "step": 2845 }, { "epoch": 0.4455228553537883, "grad_norm": 2.34755802154541, "learning_rate": 0.0001181368885869565, "loss": 0.7363, "step": 2846 }, { "epoch": 0.44567939887288666, "grad_norm": 1.7929432392120361, "learning_rate": 0.00011812499999999998, "loss": 0.4892, "step": 2847 }, { "epoch": 0.445835942391985, "grad_norm": 4.00241756439209, "learning_rate": 0.00011811311141304346, "loss": 1.0362, "step": 2848 }, { "epoch": 0.44599248591108326, "grad_norm": 2.899726152420044, "learning_rate": 0.00011810122282608694, "loss": 1.0642, "step": 2849 }, { "epoch": 0.4461490294301816, "grad_norm": 3.6462204456329346, "learning_rate": 0.00011808933423913043, "loss": 0.8571, "step": 2850 }, { "epoch": 0.4463055729492799, "grad_norm": 0.5139651298522949, "learning_rate": 0.00011807744565217391, "loss": 0.3517, "step": 2851 }, { "epoch": 0.4464621164683782, "grad_norm": 0.4749407470226288, "learning_rate": 0.00011806555706521739, "loss": 0.2727, "step": 2852 }, { "epoch": 0.4466186599874765, "grad_norm": 0.6569601893424988, "learning_rate": 0.00011805366847826087, "loss": 0.2671, "step": 2853 }, { "epoch": 0.44677520350657485, "grad_norm": 0.8176965117454529, "learning_rate": 0.00011804177989130435, "loss": 0.329, "step": 2854 }, { "epoch": 0.4469317470256731, "grad_norm": 0.8310616612434387, "learning_rate": 0.00011802989130434782, "loss": 0.325, "step": 2855 }, { "epoch": 0.44708829054477145, "grad_norm": 0.50809645652771, "learning_rate": 0.0001180180027173913, "loss": 0.2842, "step": 2856 }, { "epoch": 0.4472448340638698, "grad_norm": 1.252798318862915, "learning_rate": 0.00011800611413043478, "loss": 0.3715, 
"step": 2857 }, { "epoch": 0.44740137758296805, "grad_norm": 0.6238808035850525, "learning_rate": 0.00011799422554347826, "loss": 0.301, "step": 2858 }, { "epoch": 0.4475579211020664, "grad_norm": 1.2412691116333008, "learning_rate": 0.00011798233695652172, "loss": 0.4754, "step": 2859 }, { "epoch": 0.4477144646211647, "grad_norm": 1.0265389680862427, "learning_rate": 0.0001179704483695652, "loss": 0.5272, "step": 2860 }, { "epoch": 0.447871008140263, "grad_norm": 1.321620225906372, "learning_rate": 0.00011795855978260868, "loss": 0.5562, "step": 2861 }, { "epoch": 0.4480275516593613, "grad_norm": 0.8188022375106812, "learning_rate": 0.00011794667119565216, "loss": 0.3841, "step": 2862 }, { "epoch": 0.4481840951784596, "grad_norm": 2.991973400115967, "learning_rate": 0.00011793478260869564, "loss": 0.6215, "step": 2863 }, { "epoch": 0.4483406386975579, "grad_norm": 1.0603901147842407, "learning_rate": 0.00011792289402173911, "loss": 0.6332, "step": 2864 }, { "epoch": 0.44849718221665624, "grad_norm": 1.0893632173538208, "learning_rate": 0.00011791100543478259, "loss": 0.4197, "step": 2865 }, { "epoch": 0.4486537257357545, "grad_norm": 1.875597357749939, "learning_rate": 0.00011789911684782607, "loss": 0.5953, "step": 2866 }, { "epoch": 0.44881026925485284, "grad_norm": 1.211543321609497, "learning_rate": 0.00011788722826086955, "loss": 0.536, "step": 2867 }, { "epoch": 0.4489668127739512, "grad_norm": 1.112198829650879, "learning_rate": 0.00011787533967391303, "loss": 0.4409, "step": 2868 }, { "epoch": 0.44912335629304945, "grad_norm": 2.086336612701416, "learning_rate": 0.0001178634510869565, "loss": 0.498, "step": 2869 }, { "epoch": 0.4492798998121478, "grad_norm": 1.054787278175354, "learning_rate": 0.0001178515625, "loss": 0.4829, "step": 2870 }, { "epoch": 0.4494364433312461, "grad_norm": 3.5074751377105713, "learning_rate": 0.00011783967391304347, "loss": 0.8687, "step": 2871 }, { "epoch": 0.4495929868503444, "grad_norm": 1.6412101984024048, "learning_rate": 
0.00011782778532608695, "loss": 0.5959, "step": 2872 }, { "epoch": 0.4497495303694427, "grad_norm": 1.0887348651885986, "learning_rate": 0.00011781589673913043, "loss": 0.48, "step": 2873 }, { "epoch": 0.44990607388854104, "grad_norm": 2.1610517501831055, "learning_rate": 0.00011780400815217391, "loss": 0.5745, "step": 2874 }, { "epoch": 0.4500626174076393, "grad_norm": 1.6872764825820923, "learning_rate": 0.00011779211956521739, "loss": 0.6703, "step": 2875 }, { "epoch": 0.45021916092673764, "grad_norm": 2.1915605068206787, "learning_rate": 0.00011778023097826086, "loss": 0.8716, "step": 2876 }, { "epoch": 0.45037570444583597, "grad_norm": 2.0586256980895996, "learning_rate": 0.00011776834239130434, "loss": 0.5795, "step": 2877 }, { "epoch": 0.45053224796493424, "grad_norm": 1.697564959526062, "learning_rate": 0.00011775645380434782, "loss": 0.6631, "step": 2878 }, { "epoch": 0.45068879148403257, "grad_norm": 2.54819917678833, "learning_rate": 0.0001177445652173913, "loss": 0.5556, "step": 2879 }, { "epoch": 0.45084533500313084, "grad_norm": 2.376826286315918, "learning_rate": 0.00011773267663043478, "loss": 0.7379, "step": 2880 }, { "epoch": 0.45100187852222917, "grad_norm": 2.2576401233673096, "learning_rate": 0.00011772078804347826, "loss": 0.5038, "step": 2881 }, { "epoch": 0.4511584220413275, "grad_norm": 3.138314962387085, "learning_rate": 0.00011770889945652172, "loss": 0.7518, "step": 2882 }, { "epoch": 0.4513149655604258, "grad_norm": 2.655306577682495, "learning_rate": 0.0001176970108695652, "loss": 1.4162, "step": 2883 }, { "epoch": 0.4514715090795241, "grad_norm": 2.4931647777557373, "learning_rate": 0.00011768512228260868, "loss": 0.8036, "step": 2884 }, { "epoch": 0.45162805259862243, "grad_norm": 1.3004099130630493, "learning_rate": 0.00011767323369565215, "loss": 0.6838, "step": 2885 }, { "epoch": 0.4517845961177207, "grad_norm": 2.8225693702697754, "learning_rate": 0.00011766134510869563, "loss": 1.1744, "step": 2886 }, { "epoch": 
0.45194113963681903, "grad_norm": 3.8002490997314453, "learning_rate": 0.00011764945652173911, "loss": 0.8759, "step": 2887 }, { "epoch": 0.45209768315591736, "grad_norm": 2.5469765663146973, "learning_rate": 0.00011763756793478259, "loss": 0.8248, "step": 2888 }, { "epoch": 0.45225422667501564, "grad_norm": 3.614959955215454, "learning_rate": 0.00011762567934782608, "loss": 1.2339, "step": 2889 }, { "epoch": 0.45241077019411396, "grad_norm": 2.5202553272247314, "learning_rate": 0.00011761379076086956, "loss": 0.8963, "step": 2890 }, { "epoch": 0.4525673137132123, "grad_norm": 3.637542724609375, "learning_rate": 0.00011760190217391304, "loss": 1.6043, "step": 2891 }, { "epoch": 0.45272385723231057, "grad_norm": 1.853327989578247, "learning_rate": 0.00011759001358695652, "loss": 1.3626, "step": 2892 }, { "epoch": 0.4528804007514089, "grad_norm": 2.5517992973327637, "learning_rate": 0.000117578125, "loss": 1.0114, "step": 2893 }, { "epoch": 0.4530369442705072, "grad_norm": 1.97283136844635, "learning_rate": 0.00011756623641304347, "loss": 1.3208, "step": 2894 }, { "epoch": 0.4531934877896055, "grad_norm": 2.824045181274414, "learning_rate": 0.00011755434782608695, "loss": 1.4217, "step": 2895 }, { "epoch": 0.4533500313087038, "grad_norm": 2.1729578971862793, "learning_rate": 0.00011754245923913043, "loss": 1.2088, "step": 2896 }, { "epoch": 0.45350657482780216, "grad_norm": 2.2887349128723145, "learning_rate": 0.0001175305706521739, "loss": 0.6954, "step": 2897 }, { "epoch": 0.45366311834690043, "grad_norm": 2.299851894378662, "learning_rate": 0.00011751868206521738, "loss": 0.9167, "step": 2898 }, { "epoch": 0.45381966186599876, "grad_norm": 3.770685911178589, "learning_rate": 0.00011750679347826086, "loss": 1.3955, "step": 2899 }, { "epoch": 0.45397620538509703, "grad_norm": 3.1097662448883057, "learning_rate": 0.00011749490489130434, "loss": 1.1411, "step": 2900 }, { "epoch": 0.45413274890419536, "grad_norm": 0.9261181950569153, "learning_rate": 
0.00011748301630434782, "loss": 0.4688, "step": 2901 }, { "epoch": 0.4542892924232937, "grad_norm": 0.7953492999076843, "learning_rate": 0.0001174711277173913, "loss": 0.3721, "step": 2902 }, { "epoch": 0.45444583594239196, "grad_norm": 0.8486496210098267, "learning_rate": 0.00011745923913043477, "loss": 0.4119, "step": 2903 }, { "epoch": 0.4546023794614903, "grad_norm": 0.7711522579193115, "learning_rate": 0.00011744735054347825, "loss": 0.3717, "step": 2904 }, { "epoch": 0.4547589229805886, "grad_norm": 0.6170092225074768, "learning_rate": 0.00011743546195652172, "loss": 0.3125, "step": 2905 }, { "epoch": 0.4549154664996869, "grad_norm": 0.9886530637741089, "learning_rate": 0.0001174235733695652, "loss": 0.2388, "step": 2906 }, { "epoch": 0.4550720100187852, "grad_norm": 0.7934997081756592, "learning_rate": 0.00011741168478260867, "loss": 0.4637, "step": 2907 }, { "epoch": 0.45522855353788355, "grad_norm": 0.8261517286300659, "learning_rate": 0.00011739979619565215, "loss": 0.487, "step": 2908 }, { "epoch": 0.4553850970569818, "grad_norm": 0.9170175194740295, "learning_rate": 0.00011738790760869564, "loss": 0.3745, "step": 2909 }, { "epoch": 0.45554164057608015, "grad_norm": 1.2580679655075073, "learning_rate": 0.00011737601902173912, "loss": 0.3973, "step": 2910 }, { "epoch": 0.4556981840951785, "grad_norm": 0.9119375348091125, "learning_rate": 0.0001173641304347826, "loss": 0.3923, "step": 2911 }, { "epoch": 0.45585472761427676, "grad_norm": 0.956879198551178, "learning_rate": 0.00011735224184782608, "loss": 0.3384, "step": 2912 }, { "epoch": 0.4560112711333751, "grad_norm": 0.7462704181671143, "learning_rate": 0.00011734035326086956, "loss": 0.4033, "step": 2913 }, { "epoch": 0.4561678146524734, "grad_norm": 0.9212595224380493, "learning_rate": 0.00011732846467391303, "loss": 0.301, "step": 2914 }, { "epoch": 0.4563243581715717, "grad_norm": 1.8281265497207642, "learning_rate": 0.00011731657608695651, "loss": 0.4461, "step": 2915 }, { "epoch": 
0.45648090169067, "grad_norm": 0.8645868301391602, "learning_rate": 0.00011730468749999999, "loss": 0.3205, "step": 2916 }, { "epoch": 0.4566374452097683, "grad_norm": 1.4984132051467896, "learning_rate": 0.00011729279891304347, "loss": 0.375, "step": 2917 }, { "epoch": 0.4567939887288666, "grad_norm": 1.5641120672225952, "learning_rate": 0.00011728091032608695, "loss": 0.6192, "step": 2918 }, { "epoch": 0.45695053224796495, "grad_norm": 1.9126235246658325, "learning_rate": 0.00011726902173913043, "loss": 0.5558, "step": 2919 }, { "epoch": 0.4571070757670632, "grad_norm": 0.9506475925445557, "learning_rate": 0.0001172571331521739, "loss": 0.3816, "step": 2920 }, { "epoch": 0.45726361928616155, "grad_norm": 1.721663236618042, "learning_rate": 0.00011724524456521738, "loss": 0.3862, "step": 2921 }, { "epoch": 0.4574201628052599, "grad_norm": 3.086521863937378, "learning_rate": 0.00011723335597826086, "loss": 0.4967, "step": 2922 }, { "epoch": 0.45757670632435815, "grad_norm": 1.6216446161270142, "learning_rate": 0.00011722146739130434, "loss": 0.6321, "step": 2923 }, { "epoch": 0.4577332498434565, "grad_norm": 1.629676103591919, "learning_rate": 0.00011720957880434782, "loss": 0.8485, "step": 2924 }, { "epoch": 0.4578897933625548, "grad_norm": 4.322177410125732, "learning_rate": 0.0001171976902173913, "loss": 1.0166, "step": 2925 }, { "epoch": 0.4580463368816531, "grad_norm": 2.4228410720825195, "learning_rate": 0.00011718580163043477, "loss": 0.506, "step": 2926 }, { "epoch": 0.4582028804007514, "grad_norm": 1.1961411237716675, "learning_rate": 0.00011717391304347826, "loss": 0.7779, "step": 2927 }, { "epoch": 0.45835942391984974, "grad_norm": 1.928945541381836, "learning_rate": 0.00011716202445652172, "loss": 0.6776, "step": 2928 }, { "epoch": 0.458515967438948, "grad_norm": 2.12562894821167, "learning_rate": 0.00011715013586956521, "loss": 0.7337, "step": 2929 }, { "epoch": 0.45867251095804634, "grad_norm": 3.4720780849456787, "learning_rate": 
0.00011713824728260869, "loss": 0.7266, "step": 2930 }, { "epoch": 0.45882905447714467, "grad_norm": 1.8333406448364258, "learning_rate": 0.00011712635869565216, "loss": 0.6168, "step": 2931 }, { "epoch": 0.45898559799624294, "grad_norm": 2.7856972217559814, "learning_rate": 0.00011711447010869564, "loss": 0.706, "step": 2932 }, { "epoch": 0.4591421415153413, "grad_norm": 2.7505545616149902, "learning_rate": 0.00011710258152173912, "loss": 0.7901, "step": 2933 }, { "epoch": 0.45929868503443955, "grad_norm": 2.0848498344421387, "learning_rate": 0.0001170906929347826, "loss": 0.7795, "step": 2934 }, { "epoch": 0.4594552285535379, "grad_norm": 2.7549569606781006, "learning_rate": 0.00011707880434782608, "loss": 0.8613, "step": 2935 }, { "epoch": 0.4596117720726362, "grad_norm": 4.466119766235352, "learning_rate": 0.00011706691576086955, "loss": 1.1873, "step": 2936 }, { "epoch": 0.4597683155917345, "grad_norm": 2.856257915496826, "learning_rate": 0.00011705502717391303, "loss": 1.7095, "step": 2937 }, { "epoch": 0.4599248591108328, "grad_norm": 6.407963752746582, "learning_rate": 0.00011704313858695651, "loss": 0.8718, "step": 2938 }, { "epoch": 0.46008140262993114, "grad_norm": 3.4138357639312744, "learning_rate": 0.00011703124999999999, "loss": 1.2293, "step": 2939 }, { "epoch": 0.4602379461490294, "grad_norm": 3.3489084243774414, "learning_rate": 0.00011701936141304347, "loss": 1.4073, "step": 2940 }, { "epoch": 0.46039448966812774, "grad_norm": 2.3879947662353516, "learning_rate": 0.00011700747282608694, "loss": 0.8818, "step": 2941 }, { "epoch": 0.46055103318722607, "grad_norm": 2.2544147968292236, "learning_rate": 0.00011699558423913042, "loss": 0.9193, "step": 2942 }, { "epoch": 0.46070757670632434, "grad_norm": 3.3511080741882324, "learning_rate": 0.0001169836956521739, "loss": 1.8204, "step": 2943 }, { "epoch": 0.46086412022542267, "grad_norm": 4.960054874420166, "learning_rate": 0.00011697180706521738, "loss": 1.3556, "step": 2944 }, { "epoch": 
0.461020663744521, "grad_norm": 2.998173952102661, "learning_rate": 0.00011695991847826086, "loss": 1.4917, "step": 2945 }, { "epoch": 0.46117720726361927, "grad_norm": 3.295503616333008, "learning_rate": 0.00011694802989130434, "loss": 1.3481, "step": 2946 }, { "epoch": 0.4613337507827176, "grad_norm": 9.30244255065918, "learning_rate": 0.00011693614130434783, "loss": 1.1287, "step": 2947 }, { "epoch": 0.46149029430181593, "grad_norm": 4.704041004180908, "learning_rate": 0.0001169242527173913, "loss": 1.1334, "step": 2948 }, { "epoch": 0.4616468378209142, "grad_norm": 3.449502468109131, "learning_rate": 0.00011691236413043478, "loss": 1.2397, "step": 2949 }, { "epoch": 0.46180338134001253, "grad_norm": 3.3932483196258545, "learning_rate": 0.00011690047554347826, "loss": 1.392, "step": 2950 }, { "epoch": 0.46195992485911086, "grad_norm": 0.4678114354610443, "learning_rate": 0.00011688858695652173, "loss": 0.296, "step": 2951 }, { "epoch": 0.46211646837820913, "grad_norm": 0.8265451192855835, "learning_rate": 0.0001168766983695652, "loss": 0.4044, "step": 2952 }, { "epoch": 0.46227301189730746, "grad_norm": 0.6797813773155212, "learning_rate": 0.00011686480978260868, "loss": 0.3934, "step": 2953 }, { "epoch": 0.46242955541640574, "grad_norm": 0.7167798280715942, "learning_rate": 0.00011685292119565216, "loss": 0.468, "step": 2954 }, { "epoch": 0.46258609893550406, "grad_norm": 0.7148298025131226, "learning_rate": 0.00011684103260869564, "loss": 0.4072, "step": 2955 }, { "epoch": 0.4627426424546024, "grad_norm": 0.7426893711090088, "learning_rate": 0.00011682914402173912, "loss": 0.3936, "step": 2956 }, { "epoch": 0.46289918597370067, "grad_norm": 0.8843663334846497, "learning_rate": 0.0001168172554347826, "loss": 0.3269, "step": 2957 }, { "epoch": 0.463055729492799, "grad_norm": 0.7918097376823425, "learning_rate": 0.00011680536684782607, "loss": 0.4392, "step": 2958 }, { "epoch": 0.4632122730118973, "grad_norm": 0.641027569770813, "learning_rate": 
0.00011679347826086955, "loss": 0.3419, "step": 2959 }, { "epoch": 0.4633688165309956, "grad_norm": 1.3960933685302734, "learning_rate": 0.00011678158967391303, "loss": 0.3633, "step": 2960 }, { "epoch": 0.4635253600500939, "grad_norm": 0.7780926823616028, "learning_rate": 0.00011676970108695651, "loss": 0.3209, "step": 2961 }, { "epoch": 0.46368190356919226, "grad_norm": 0.9025008082389832, "learning_rate": 0.00011675781249999999, "loss": 0.443, "step": 2962 }, { "epoch": 0.46383844708829053, "grad_norm": 1.7290984392166138, "learning_rate": 0.00011674592391304346, "loss": 0.5227, "step": 2963 }, { "epoch": 0.46399499060738886, "grad_norm": 1.0105255842208862, "learning_rate": 0.00011673403532608694, "loss": 0.4556, "step": 2964 }, { "epoch": 0.4641515341264872, "grad_norm": 1.3150509595870972, "learning_rate": 0.00011672214673913042, "loss": 0.5578, "step": 2965 }, { "epoch": 0.46430807764558546, "grad_norm": 1.3037266731262207, "learning_rate": 0.00011671025815217391, "loss": 0.5435, "step": 2966 }, { "epoch": 0.4644646211646838, "grad_norm": 3.256736993789673, "learning_rate": 0.00011669836956521739, "loss": 0.582, "step": 2967 }, { "epoch": 0.4646211646837821, "grad_norm": 1.6995822191238403, "learning_rate": 0.00011668648097826087, "loss": 0.6356, "step": 2968 }, { "epoch": 0.4647777082028804, "grad_norm": 2.63299822807312, "learning_rate": 0.00011667459239130435, "loss": 0.6785, "step": 2969 }, { "epoch": 0.4649342517219787, "grad_norm": 4.092477798461914, "learning_rate": 0.00011666270380434782, "loss": 1.1466, "step": 2970 }, { "epoch": 0.465090795241077, "grad_norm": 1.3012350797653198, "learning_rate": 0.0001166508152173913, "loss": 0.4581, "step": 2971 }, { "epoch": 0.4652473387601753, "grad_norm": 2.0222556591033936, "learning_rate": 0.00011663892663043478, "loss": 0.6925, "step": 2972 }, { "epoch": 0.46540388227927365, "grad_norm": 2.0666074752807617, "learning_rate": 0.00011662703804347826, "loss": 0.4579, "step": 2973 }, { "epoch": 
0.4655604257983719, "grad_norm": 2.8129732608795166, "learning_rate": 0.00011661514945652172, "loss": 0.6562, "step": 2974 }, { "epoch": 0.46571696931747025, "grad_norm": 1.6945956945419312, "learning_rate": 0.0001166032608695652, "loss": 0.4433, "step": 2975 }, { "epoch": 0.4658735128365686, "grad_norm": 4.177333831787109, "learning_rate": 0.00011659137228260868, "loss": 1.0726, "step": 2976 }, { "epoch": 0.46603005635566686, "grad_norm": 1.6780121326446533, "learning_rate": 0.00011657948369565216, "loss": 0.5457, "step": 2977 }, { "epoch": 0.4661865998747652, "grad_norm": 1.4728416204452515, "learning_rate": 0.00011656759510869564, "loss": 0.9848, "step": 2978 }, { "epoch": 0.4663431433938635, "grad_norm": 3.2233259677886963, "learning_rate": 0.00011655570652173911, "loss": 1.1147, "step": 2979 }, { "epoch": 0.4664996869129618, "grad_norm": 2.37790846824646, "learning_rate": 0.00011654381793478259, "loss": 0.9646, "step": 2980 }, { "epoch": 0.4666562304320601, "grad_norm": 2.2972042560577393, "learning_rate": 0.00011653192934782607, "loss": 0.648, "step": 2981 }, { "epoch": 0.46681277395115844, "grad_norm": 2.372081756591797, "learning_rate": 0.00011652004076086955, "loss": 0.7764, "step": 2982 }, { "epoch": 0.4669693174702567, "grad_norm": 2.218454122543335, "learning_rate": 0.00011650815217391303, "loss": 0.7818, "step": 2983 }, { "epoch": 0.46712586098935505, "grad_norm": 2.7565038204193115, "learning_rate": 0.0001164962635869565, "loss": 0.9092, "step": 2984 }, { "epoch": 0.4672824045084534, "grad_norm": 4.893089294433594, "learning_rate": 0.00011648437499999998, "loss": 1.2965, "step": 2985 }, { "epoch": 0.46743894802755165, "grad_norm": 2.4852454662323, "learning_rate": 0.00011647248641304348, "loss": 0.745, "step": 2986 }, { "epoch": 0.46759549154665, "grad_norm": 2.1352572441101074, "learning_rate": 0.00011646059782608695, "loss": 1.0236, "step": 2987 }, { "epoch": 0.46775203506574825, "grad_norm": 5.34435510635376, "learning_rate": 
0.00011644870923913043, "loss": 1.8887, "step": 2988 }, { "epoch": 0.4679085785848466, "grad_norm": 8.595197677612305, "learning_rate": 0.00011643682065217391, "loss": 1.3759, "step": 2989 }, { "epoch": 0.4680651221039449, "grad_norm": 8.77237606048584, "learning_rate": 0.00011642493206521739, "loss": 1.6906, "step": 2990 }, { "epoch": 0.4682216656230432, "grad_norm": 4.3176469802856445, "learning_rate": 0.00011641304347826087, "loss": 1.2679, "step": 2991 }, { "epoch": 0.4683782091421415, "grad_norm": 3.183436155319214, "learning_rate": 0.00011640115489130434, "loss": 1.6955, "step": 2992 }, { "epoch": 0.46853475266123984, "grad_norm": 2.351607084274292, "learning_rate": 0.00011638926630434782, "loss": 1.4798, "step": 2993 }, { "epoch": 0.4686912961803381, "grad_norm": 4.044800281524658, "learning_rate": 0.0001163773777173913, "loss": 1.2846, "step": 2994 }, { "epoch": 0.46884783969943644, "grad_norm": 2.4923434257507324, "learning_rate": 0.00011636548913043478, "loss": 1.7092, "step": 2995 }, { "epoch": 0.46900438321853477, "grad_norm": 1.966068148612976, "learning_rate": 0.00011635360054347826, "loss": 1.1312, "step": 2996 }, { "epoch": 0.46916092673763304, "grad_norm": 3.4395751953125, "learning_rate": 0.00011634171195652172, "loss": 0.767, "step": 2997 }, { "epoch": 0.4693174702567314, "grad_norm": 2.31632137298584, "learning_rate": 0.0001163298233695652, "loss": 1.1815, "step": 2998 }, { "epoch": 0.4694740137758297, "grad_norm": 2.649517774581909, "learning_rate": 0.00011631793478260868, "loss": 1.0986, "step": 2999 }, { "epoch": 0.469630557294928, "grad_norm": 1.8176891803741455, "learning_rate": 0.00011630604619565216, "loss": 0.9922, "step": 3000 }, { "epoch": 0.469630557294928, "eval_loss": 0.6046863198280334, "eval_runtime": 204.1945, "eval_samples_per_second": 60.643, "eval_steps_per_second": 3.791, "eval_wer": 0.3662898187304758, "step": 3000 }, { "epoch": 0.4697871008140263, "grad_norm": 0.44361093640327454, "learning_rate": 0.00011629415760869563, 
"loss": 0.2374, "step": 3001 }, { "epoch": 0.46994364433312463, "grad_norm": 0.692966878414154, "learning_rate": 0.00011628226902173911, "loss": 0.288, "step": 3002 }, { "epoch": 0.4701001878522229, "grad_norm": 0.5186521410942078, "learning_rate": 0.00011627038043478259, "loss": 0.287, "step": 3003 }, { "epoch": 0.47025673137132123, "grad_norm": 1.1101317405700684, "learning_rate": 0.00011625849184782607, "loss": 0.3316, "step": 3004 }, { "epoch": 0.47041327489041956, "grad_norm": 0.9362457394599915, "learning_rate": 0.00011624660326086955, "loss": 0.3782, "step": 3005 }, { "epoch": 0.47056981840951784, "grad_norm": 0.7504150867462158, "learning_rate": 0.00011623471467391304, "loss": 0.2511, "step": 3006 }, { "epoch": 0.47072636192861617, "grad_norm": 0.8385571837425232, "learning_rate": 0.00011622282608695652, "loss": 0.3525, "step": 3007 }, { "epoch": 0.47088290544771444, "grad_norm": 2.401582717895508, "learning_rate": 0.0001162109375, "loss": 0.6329, "step": 3008 }, { "epoch": 0.47103944896681277, "grad_norm": 1.011551022529602, "learning_rate": 0.00011619904891304347, "loss": 0.4106, "step": 3009 }, { "epoch": 0.4711959924859111, "grad_norm": 2.5987987518310547, "learning_rate": 0.00011618716032608695, "loss": 0.5343, "step": 3010 }, { "epoch": 0.47135253600500937, "grad_norm": 1.737550973892212, "learning_rate": 0.00011617527173913043, "loss": 0.6727, "step": 3011 }, { "epoch": 0.4715090795241077, "grad_norm": 0.7144822478294373, "learning_rate": 0.00011616338315217391, "loss": 0.3001, "step": 3012 }, { "epoch": 0.47166562304320603, "grad_norm": 1.3237375020980835, "learning_rate": 0.00011615149456521739, "loss": 0.4708, "step": 3013 }, { "epoch": 0.4718221665623043, "grad_norm": 0.7266702055931091, "learning_rate": 0.00011613960597826086, "loss": 0.4186, "step": 3014 }, { "epoch": 0.47197871008140263, "grad_norm": 0.934363067150116, "learning_rate": 0.00011612771739130434, "loss": 0.4837, "step": 3015 }, { "epoch": 0.47213525360050096, "grad_norm": 
1.3956588506698608, "learning_rate": 0.00011611582880434782, "loss": 0.4243, "step": 3016 }, { "epoch": 0.47229179711959923, "grad_norm": 1.1425904035568237, "learning_rate": 0.0001161039402173913, "loss": 0.5655, "step": 3017 }, { "epoch": 0.47244834063869756, "grad_norm": 2.727883815765381, "learning_rate": 0.00011609205163043478, "loss": 0.7536, "step": 3018 }, { "epoch": 0.4726048841577959, "grad_norm": 0.9569172263145447, "learning_rate": 0.00011608016304347825, "loss": 0.5157, "step": 3019 }, { "epoch": 0.47276142767689416, "grad_norm": 1.3983803987503052, "learning_rate": 0.00011606827445652172, "loss": 0.5658, "step": 3020 }, { "epoch": 0.4729179711959925, "grad_norm": 1.5950350761413574, "learning_rate": 0.0001160563858695652, "loss": 0.7069, "step": 3021 }, { "epoch": 0.4730745147150908, "grad_norm": 2.59912371635437, "learning_rate": 0.00011604449728260868, "loss": 1.0505, "step": 3022 }, { "epoch": 0.4732310582341891, "grad_norm": 2.162581205368042, "learning_rate": 0.00011603260869565215, "loss": 0.6965, "step": 3023 }, { "epoch": 0.4733876017532874, "grad_norm": 5.501928329467773, "learning_rate": 0.00011602072010869563, "loss": 1.267, "step": 3024 }, { "epoch": 0.4735441452723857, "grad_norm": 2.8267219066619873, "learning_rate": 0.00011600883152173911, "loss": 0.7751, "step": 3025 }, { "epoch": 0.473700688791484, "grad_norm": 2.507749557495117, "learning_rate": 0.0001159969429347826, "loss": 0.825, "step": 3026 }, { "epoch": 0.47385723231058235, "grad_norm": 2.113342046737671, "learning_rate": 0.00011598505434782608, "loss": 0.6172, "step": 3027 }, { "epoch": 0.47401377582968063, "grad_norm": 3.352511405944824, "learning_rate": 0.00011597316576086956, "loss": 0.7568, "step": 3028 }, { "epoch": 0.47417031934877896, "grad_norm": 4.41267204284668, "learning_rate": 0.00011596127717391304, "loss": 1.0041, "step": 3029 }, { "epoch": 0.4743268628678773, "grad_norm": 2.9606730937957764, "learning_rate": 0.00011594938858695651, "loss": 0.5, "step": 3030 }, { 
"epoch": 0.47448340638697556, "grad_norm": 2.515286684036255, "learning_rate": 0.00011593749999999999, "loss": 0.6212, "step": 3031 }, { "epoch": 0.4746399499060739, "grad_norm": 2.7675294876098633, "learning_rate": 0.00011592561141304347, "loss": 0.6962, "step": 3032 }, { "epoch": 0.4747964934251722, "grad_norm": 3.1411895751953125, "learning_rate": 0.00011591372282608695, "loss": 0.981, "step": 3033 }, { "epoch": 0.4749530369442705, "grad_norm": 3.7392842769622803, "learning_rate": 0.00011590183423913043, "loss": 0.9467, "step": 3034 }, { "epoch": 0.4751095804633688, "grad_norm": 3.737863779067993, "learning_rate": 0.0001158899456521739, "loss": 1.0643, "step": 3035 }, { "epoch": 0.47526612398246715, "grad_norm": 2.35760498046875, "learning_rate": 0.00011587805706521738, "loss": 1.0372, "step": 3036 }, { "epoch": 0.4754226675015654, "grad_norm": 3.404975652694702, "learning_rate": 0.00011586616847826086, "loss": 1.1044, "step": 3037 }, { "epoch": 0.47557921102066375, "grad_norm": 3.0189404487609863, "learning_rate": 0.00011585427989130434, "loss": 1.0333, "step": 3038 }, { "epoch": 0.4757357545397621, "grad_norm": 4.005167484283447, "learning_rate": 0.00011584239130434782, "loss": 1.4272, "step": 3039 }, { "epoch": 0.47589229805886035, "grad_norm": 2.3638622760772705, "learning_rate": 0.0001158305027173913, "loss": 1.0146, "step": 3040 }, { "epoch": 0.4760488415779587, "grad_norm": 3.6472175121307373, "learning_rate": 0.00011581861413043477, "loss": 1.2068, "step": 3041 }, { "epoch": 0.47620538509705695, "grad_norm": 3.3812386989593506, "learning_rate": 0.00011580672554347825, "loss": 2.0472, "step": 3042 }, { "epoch": 0.4763619286161553, "grad_norm": 3.154134511947632, "learning_rate": 0.00011579483695652172, "loss": 0.8259, "step": 3043 }, { "epoch": 0.4765184721352536, "grad_norm": 3.8733673095703125, "learning_rate": 0.0001157829483695652, "loss": 1.6406, "step": 3044 }, { "epoch": 0.4766750156543519, "grad_norm": 3.940770387649536, "learning_rate": 
0.00011577105978260867, "loss": 1.6632, "step": 3045 }, { "epoch": 0.4768315591734502, "grad_norm": 2.4799177646636963, "learning_rate": 0.00011575917119565216, "loss": 0.9628, "step": 3046 }, { "epoch": 0.47698810269254854, "grad_norm": 1.4423363208770752, "learning_rate": 0.00011574728260869564, "loss": 0.5365, "step": 3047 }, { "epoch": 0.4771446462116468, "grad_norm": 4.42836332321167, "learning_rate": 0.00011573539402173912, "loss": 0.9692, "step": 3048 }, { "epoch": 0.47730118973074515, "grad_norm": 2.3443071842193604, "learning_rate": 0.0001157235054347826, "loss": 1.0829, "step": 3049 }, { "epoch": 0.4774577332498435, "grad_norm": 2.69545841217041, "learning_rate": 0.00011571161684782608, "loss": 1.1142, "step": 3050 }, { "epoch": 0.47761427676894175, "grad_norm": 0.6946386694908142, "learning_rate": 0.00011569972826086956, "loss": 0.3508, "step": 3051 }, { "epoch": 0.4777708202880401, "grad_norm": 0.6795670390129089, "learning_rate": 0.00011568783967391303, "loss": 0.285, "step": 3052 }, { "epoch": 0.4779273638071384, "grad_norm": 0.5717368125915527, "learning_rate": 0.00011567595108695651, "loss": 0.3018, "step": 3053 }, { "epoch": 0.4780839073262367, "grad_norm": 1.002393364906311, "learning_rate": 0.00011566406249999999, "loss": 0.3787, "step": 3054 }, { "epoch": 0.478240450845335, "grad_norm": 0.799136757850647, "learning_rate": 0.00011565217391304347, "loss": 0.3236, "step": 3055 }, { "epoch": 0.47839699436443334, "grad_norm": 1.7953540086746216, "learning_rate": 0.00011564028532608695, "loss": 0.2753, "step": 3056 }, { "epoch": 0.4785535378835316, "grad_norm": 0.7098072171211243, "learning_rate": 0.00011562839673913042, "loss": 0.2555, "step": 3057 }, { "epoch": 0.47871008140262994, "grad_norm": 1.5776171684265137, "learning_rate": 0.0001156165081521739, "loss": 0.5355, "step": 3058 }, { "epoch": 0.47886662492172827, "grad_norm": 1.0951203107833862, "learning_rate": 0.00011560461956521738, "loss": 0.2979, "step": 3059 }, { "epoch": 
0.47902316844082654, "grad_norm": 0.8206717371940613, "learning_rate": 0.00011559273097826086, "loss": 0.3225, "step": 3060 }, { "epoch": 0.47917971195992487, "grad_norm": 1.8100754022598267, "learning_rate": 0.00011558084239130434, "loss": 0.4277, "step": 3061 }, { "epoch": 0.47933625547902314, "grad_norm": 0.9530273675918579, "learning_rate": 0.00011556895380434782, "loss": 0.3358, "step": 3062 }, { "epoch": 0.47949279899812147, "grad_norm": 1.3974215984344482, "learning_rate": 0.00011555706521739131, "loss": 0.4799, "step": 3063 }, { "epoch": 0.4796493425172198, "grad_norm": 1.6866368055343628, "learning_rate": 0.00011554517663043479, "loss": 0.5412, "step": 3064 }, { "epoch": 0.4798058860363181, "grad_norm": 0.8114184141159058, "learning_rate": 0.00011553328804347826, "loss": 0.5694, "step": 3065 }, { "epoch": 0.4799624295554164, "grad_norm": 1.582995891571045, "learning_rate": 0.00011552139945652173, "loss": 0.4652, "step": 3066 }, { "epoch": 0.48011897307451473, "grad_norm": 1.8564964532852173, "learning_rate": 0.0001155095108695652, "loss": 0.6504, "step": 3067 }, { "epoch": 0.480275516593613, "grad_norm": 1.8705960512161255, "learning_rate": 0.00011549762228260868, "loss": 0.6461, "step": 3068 }, { "epoch": 0.48043206011271133, "grad_norm": 3.2124664783477783, "learning_rate": 0.00011548573369565216, "loss": 0.834, "step": 3069 }, { "epoch": 0.48058860363180966, "grad_norm": 3.4829061031341553, "learning_rate": 0.00011547384510869564, "loss": 0.7443, "step": 3070 }, { "epoch": 0.48074514715090794, "grad_norm": 2.1975138187408447, "learning_rate": 0.00011546195652173912, "loss": 0.4612, "step": 3071 }, { "epoch": 0.48090169067000627, "grad_norm": 1.6253330707550049, "learning_rate": 0.0001154500679347826, "loss": 0.5176, "step": 3072 }, { "epoch": 0.4810582341891046, "grad_norm": 1.9487390518188477, "learning_rate": 0.00011543817934782607, "loss": 0.7578, "step": 3073 }, { "epoch": 0.48121477770820287, "grad_norm": 2.7315866947174072, "learning_rate": 
0.00011542629076086955, "loss": 0.8053, "step": 3074 }, { "epoch": 0.4813713212273012, "grad_norm": 6.302629470825195, "learning_rate": 0.00011541440217391303, "loss": 0.8781, "step": 3075 }, { "epoch": 0.4815278647463995, "grad_norm": 3.4214847087860107, "learning_rate": 0.00011540251358695651, "loss": 0.8036, "step": 3076 }, { "epoch": 0.4816844082654978, "grad_norm": 1.6136085987091064, "learning_rate": 0.00011539062499999999, "loss": 0.6254, "step": 3077 }, { "epoch": 0.4818409517845961, "grad_norm": 2.2907681465148926, "learning_rate": 0.00011537873641304347, "loss": 1.0473, "step": 3078 }, { "epoch": 0.4819974953036944, "grad_norm": 2.7893059253692627, "learning_rate": 0.00011536684782608694, "loss": 0.8123, "step": 3079 }, { "epoch": 0.48215403882279273, "grad_norm": 2.0105090141296387, "learning_rate": 0.00011535495923913042, "loss": 0.796, "step": 3080 }, { "epoch": 0.48231058234189106, "grad_norm": 2.4699647426605225, "learning_rate": 0.0001153430706521739, "loss": 1.1752, "step": 3081 }, { "epoch": 0.48246712586098933, "grad_norm": 2.1579205989837646, "learning_rate": 0.00011533118206521738, "loss": 0.6968, "step": 3082 }, { "epoch": 0.48262366938008766, "grad_norm": 2.047567367553711, "learning_rate": 0.00011531929347826087, "loss": 1.2284, "step": 3083 }, { "epoch": 0.482780212899186, "grad_norm": 2.4728331565856934, "learning_rate": 0.00011530740489130435, "loss": 0.7557, "step": 3084 }, { "epoch": 0.48293675641828426, "grad_norm": 2.4179792404174805, "learning_rate": 0.00011529551630434783, "loss": 0.6961, "step": 3085 }, { "epoch": 0.4830932999373826, "grad_norm": 4.0756120681762695, "learning_rate": 0.0001152836277173913, "loss": 1.0224, "step": 3086 }, { "epoch": 0.4832498434564809, "grad_norm": 1.8902158737182617, "learning_rate": 0.00011527173913043478, "loss": 0.8239, "step": 3087 }, { "epoch": 0.4834063869755792, "grad_norm": 4.0916266441345215, "learning_rate": 0.00011525985054347826, "loss": 1.5018, "step": 3088 }, { "epoch": 
0.4835629304946775, "grad_norm": 5.539779186248779, "learning_rate": 0.00011524796195652173, "loss": 1.029, "step": 3089 }, { "epoch": 0.48371947401377585, "grad_norm": 3.763655424118042, "learning_rate": 0.0001152360733695652, "loss": 1.0466, "step": 3090 }, { "epoch": 0.4838760175328741, "grad_norm": 4.653838634490967, "learning_rate": 0.00011522418478260868, "loss": 2.3914, "step": 3091 }, { "epoch": 0.48403256105197245, "grad_norm": 2.6356582641601562, "learning_rate": 0.00011521229619565216, "loss": 1.1758, "step": 3092 }, { "epoch": 0.4841891045710708, "grad_norm": 2.2819085121154785, "learning_rate": 0.00011520040760869564, "loss": 1.1586, "step": 3093 }, { "epoch": 0.48434564809016906, "grad_norm": NaN, "learning_rate": 0.00011520040760869564, "loss": 0.0, "step": 3094 }, { "epoch": 0.4845021916092674, "grad_norm": 3.7227530479431152, "learning_rate": 0.00011518851902173912, "loss": 1.3858, "step": 3095 }, { "epoch": 0.48465873512836566, "grad_norm": 2.6453323364257812, "learning_rate": 0.0001151766304347826, "loss": 0.9297, "step": 3096 }, { "epoch": 0.484815278647464, "grad_norm": 2.636465549468994, "learning_rate": 0.00011516474184782607, "loss": 0.9747, "step": 3097 }, { "epoch": 0.4849718221665623, "grad_norm": 3.399754524230957, "learning_rate": 0.00011515285326086955, "loss": 1.2404, "step": 3098 }, { "epoch": 0.4851283656856606, "grad_norm": 5.908987045288086, "learning_rate": 0.00011514096467391303, "loss": 0.9168, "step": 3099 }, { "epoch": 0.4852849092047589, "grad_norm": 4.009003639221191, "learning_rate": 0.00011512907608695651, "loss": 1.3608, "step": 3100 }, { "epoch": 0.48544145272385725, "grad_norm": 0.8089908361434937, "learning_rate": 0.00011511718749999999, "loss": 0.3172, "step": 3101 }, { "epoch": 0.4855979962429555, "grad_norm": 0.9123069047927856, "learning_rate": 0.00011510529891304346, "loss": 0.4037, "step": 3102 }, { "epoch": 0.48575453976205385, "grad_norm": 0.8825123906135559, "learning_rate": 0.00011509341032608694, "loss": 
0.2559, "step": 3103 }, { "epoch": 0.4859110832811522, "grad_norm": 0.5127149224281311, "learning_rate": 0.00011508152173913043, "loss": 0.3085, "step": 3104 }, { "epoch": 0.48606762680025045, "grad_norm": 1.3916382789611816, "learning_rate": 0.00011506963315217391, "loss": 0.3548, "step": 3105 }, { "epoch": 0.4862241703193488, "grad_norm": 0.6457573175430298, "learning_rate": 0.00011505774456521739, "loss": 0.3408, "step": 3106 }, { "epoch": 0.4863807138384471, "grad_norm": 1.2301112413406372, "learning_rate": 0.00011504585597826087, "loss": 0.348, "step": 3107 }, { "epoch": 0.4865372573575454, "grad_norm": 1.3921070098876953, "learning_rate": 0.00011503396739130435, "loss": 0.4246, "step": 3108 }, { "epoch": 0.4866938008766437, "grad_norm": 1.0452390909194946, "learning_rate": 0.00011502207880434782, "loss": 0.4704, "step": 3109 }, { "epoch": 0.48685034439574204, "grad_norm": 2.4116580486297607, "learning_rate": 0.0001150101902173913, "loss": 0.281, "step": 3110 }, { "epoch": 0.4870068879148403, "grad_norm": 1.1766630411148071, "learning_rate": 0.00011499830163043478, "loss": 0.3998, "step": 3111 }, { "epoch": 0.48716343143393864, "grad_norm": 1.0441794395446777, "learning_rate": 0.00011498641304347826, "loss": 0.4516, "step": 3112 }, { "epoch": 0.48731997495303697, "grad_norm": 1.4161518812179565, "learning_rate": 0.00011497452445652172, "loss": 0.4635, "step": 3113 }, { "epoch": 0.48747651847213525, "grad_norm": 1.6451808214187622, "learning_rate": 0.0001149626358695652, "loss": 0.4806, "step": 3114 }, { "epoch": 0.4876330619912336, "grad_norm": 1.9022589921951294, "learning_rate": 0.00011495074728260868, "loss": 0.6819, "step": 3115 }, { "epoch": 0.48778960551033185, "grad_norm": 1.6618225574493408, "learning_rate": 0.00011493885869565216, "loss": 0.7888, "step": 3116 }, { "epoch": 0.4879461490294302, "grad_norm": 0.7980567812919617, "learning_rate": 0.00011492697010869564, "loss": 0.4267, "step": 3117 }, { "epoch": 0.4881026925485285, "grad_norm": 
2.5109238624572754, "learning_rate": 0.00011491508152173911, "loss": 0.7993, "step": 3118 }, { "epoch": 0.4882592360676268, "grad_norm": 1.361090064048767, "learning_rate": 0.00011490319293478259, "loss": 0.654, "step": 3119 }, { "epoch": 0.4884157795867251, "grad_norm": 2.0764195919036865, "learning_rate": 0.00011489130434782607, "loss": 0.7336, "step": 3120 }, { "epoch": 0.48857232310582344, "grad_norm": 2.432222604751587, "learning_rate": 0.00011487941576086955, "loss": 0.8214, "step": 3121 }, { "epoch": 0.4887288666249217, "grad_norm": 2.332545280456543, "learning_rate": 0.00011486752717391303, "loss": 0.7275, "step": 3122 }, { "epoch": 0.48888541014402004, "grad_norm": 2.3331990242004395, "learning_rate": 0.0001148556385869565, "loss": 0.8932, "step": 3123 }, { "epoch": 0.48904195366311837, "grad_norm": 2.057756185531616, "learning_rate": 0.00011484375, "loss": 0.811, "step": 3124 }, { "epoch": 0.48919849718221664, "grad_norm": 1.3923561573028564, "learning_rate": 0.00011483186141304347, "loss": 0.6425, "step": 3125 }, { "epoch": 0.48935504070131497, "grad_norm": 1.5605244636535645, "learning_rate": 0.00011481997282608695, "loss": 0.9075, "step": 3126 }, { "epoch": 0.4895115842204133, "grad_norm": 6.287894248962402, "learning_rate": 0.00011480808423913043, "loss": 1.1497, "step": 3127 }, { "epoch": 0.48966812773951157, "grad_norm": 2.213205099105835, "learning_rate": 0.00011479619565217391, "loss": 1.1561, "step": 3128 }, { "epoch": 0.4898246712586099, "grad_norm": 2.45697283744812, "learning_rate": 0.00011478430706521739, "loss": 0.7015, "step": 3129 }, { "epoch": 0.48998121477770823, "grad_norm": 2.40280818939209, "learning_rate": 0.00011477241847826087, "loss": 0.8219, "step": 3130 }, { "epoch": 0.4901377582968065, "grad_norm": 3.050903797149658, "learning_rate": 0.00011476052989130434, "loss": 1.1388, "step": 3131 }, { "epoch": 0.49029430181590483, "grad_norm": 2.6100192070007324, "learning_rate": 0.00011474864130434782, "loss": 0.6243, "step": 3132 }, { 
"epoch": 0.4904508453350031, "grad_norm": 1.9134602546691895, "learning_rate": 0.0001147367527173913, "loss": 1.023, "step": 3133 }, { "epoch": 0.49060738885410143, "grad_norm": 2.1718993186950684, "learning_rate": 0.00011472486413043478, "loss": 0.9742, "step": 3134 }, { "epoch": 0.49076393237319976, "grad_norm": 2.9972193241119385, "learning_rate": 0.00011471297554347826, "loss": 0.9018, "step": 3135 }, { "epoch": 0.49092047589229804, "grad_norm": 2.0232410430908203, "learning_rate": 0.00011470108695652172, "loss": 0.7515, "step": 3136 }, { "epoch": 0.49107701941139636, "grad_norm": 3.7845919132232666, "learning_rate": 0.0001146891983695652, "loss": 0.9449, "step": 3137 }, { "epoch": 0.4912335629304947, "grad_norm": 2.835109233856201, "learning_rate": 0.00011467730978260868, "loss": 1.2612, "step": 3138 }, { "epoch": 0.49139010644959297, "grad_norm": 3.1239535808563232, "learning_rate": 0.00011466542119565216, "loss": 0.9858, "step": 3139 }, { "epoch": 0.4915466499686913, "grad_norm": 3.931164264678955, "learning_rate": 0.00011465353260869563, "loss": 1.5825, "step": 3140 }, { "epoch": 0.4917031934877896, "grad_norm": 2.2049059867858887, "learning_rate": 0.00011464164402173911, "loss": 0.8991, "step": 3141 }, { "epoch": 0.4918597370068879, "grad_norm": 2.252042770385742, "learning_rate": 0.00011462975543478259, "loss": 1.5497, "step": 3142 }, { "epoch": 0.4920162805259862, "grad_norm": 2.661829948425293, "learning_rate": 0.00011461786684782607, "loss": 1.1815, "step": 3143 }, { "epoch": 0.49217282404508456, "grad_norm": 2.2882144451141357, "learning_rate": 0.00011460597826086956, "loss": 1.1545, "step": 3144 }, { "epoch": 0.49232936756418283, "grad_norm": 2.3774328231811523, "learning_rate": 0.00011459408967391304, "loss": 1.4335, "step": 3145 }, { "epoch": 0.49248591108328116, "grad_norm": 2.2506277561187744, "learning_rate": 0.00011458220108695652, "loss": 0.948, "step": 3146 }, { "epoch": 0.4926424546023795, "grad_norm": 3.9895050525665283, "learning_rate": 
0.0001145703125, "loss": 1.2378, "step": 3147 }, { "epoch": 0.49279899812147776, "grad_norm": 3.4761366844177246, "learning_rate": 0.00011455842391304347, "loss": 0.8557, "step": 3148 }, { "epoch": 0.4929555416405761, "grad_norm": 1.0362615585327148, "learning_rate": 0.00011454653532608695, "loss": 0.7098, "step": 3149 }, { "epoch": 0.49311208515967436, "grad_norm": 6.007410049438477, "learning_rate": 0.00011453464673913043, "loss": 0.8687, "step": 3150 }, { "epoch": 0.4932686286787727, "grad_norm": 0.8632155060768127, "learning_rate": 0.0001145227581521739, "loss": 0.3578, "step": 3151 }, { "epoch": 0.493425172197871, "grad_norm": 1.1344983577728271, "learning_rate": 0.00011451086956521738, "loss": 0.3515, "step": 3152 }, { "epoch": 0.4935817157169693, "grad_norm": 0.6022083759307861, "learning_rate": 0.00011449898097826086, "loss": 0.2681, "step": 3153 }, { "epoch": 0.4937382592360676, "grad_norm": 0.8309706449508667, "learning_rate": 0.00011448709239130434, "loss": 0.2627, "step": 3154 }, { "epoch": 0.49389480275516595, "grad_norm": 1.6729079484939575, "learning_rate": 0.00011447520380434782, "loss": 0.2932, "step": 3155 }, { "epoch": 0.4940513462742642, "grad_norm": 0.7928094267845154, "learning_rate": 0.0001144633152173913, "loss": 0.2296, "step": 3156 }, { "epoch": 0.49420788979336255, "grad_norm": 0.9802852869033813, "learning_rate": 0.00011445142663043478, "loss": 0.4284, "step": 3157 }, { "epoch": 0.4943644333124609, "grad_norm": 1.6847466230392456, "learning_rate": 0.00011443953804347825, "loss": 0.293, "step": 3158 }, { "epoch": 0.49452097683155916, "grad_norm": 2.670808792114258, "learning_rate": 0.00011442764945652172, "loss": 0.4218, "step": 3159 }, { "epoch": 0.4946775203506575, "grad_norm": 1.0731489658355713, "learning_rate": 0.0001144157608695652, "loss": 0.5328, "step": 3160 }, { "epoch": 0.4948340638697558, "grad_norm": 2.0256481170654297, "learning_rate": 0.00011440387228260867, "loss": 0.457, "step": 3161 }, { "epoch": 0.4949906073888541, 
"grad_norm": 1.0189393758773804, "learning_rate": 0.00011439198369565215, "loss": 0.3576, "step": 3162 }, { "epoch": 0.4951471509079524, "grad_norm": 1.4449996948242188, "learning_rate": 0.00011438009510869563, "loss": 0.5934, "step": 3163 }, { "epoch": 0.49530369442705074, "grad_norm": 1.7612146139144897, "learning_rate": 0.00011436820652173912, "loss": 0.7191, "step": 3164 }, { "epoch": 0.495460237946149, "grad_norm": 1.6001476049423218, "learning_rate": 0.0001143563179347826, "loss": 0.5194, "step": 3165 }, { "epoch": 0.49561678146524735, "grad_norm": 1.1913166046142578, "learning_rate": 0.00011434442934782608, "loss": 0.539, "step": 3166 }, { "epoch": 0.4957733249843457, "grad_norm": 1.5129746198654175, "learning_rate": 0.00011433254076086956, "loss": 0.5599, "step": 3167 }, { "epoch": 0.49592986850344395, "grad_norm": 1.3409605026245117, "learning_rate": 0.00011432065217391304, "loss": 0.595, "step": 3168 }, { "epoch": 0.4960864120225423, "grad_norm": 0.8371987342834473, "learning_rate": 0.00011430876358695651, "loss": 0.3108, "step": 3169 }, { "epoch": 0.49624295554164055, "grad_norm": 1.4178225994110107, "learning_rate": 0.00011429687499999999, "loss": 0.5118, "step": 3170 }, { "epoch": 0.4963994990607389, "grad_norm": 2.9523565769195557, "learning_rate": 0.00011428498641304347, "loss": 0.8432, "step": 3171 }, { "epoch": 0.4965560425798372, "grad_norm": 1.2381799221038818, "learning_rate": 0.00011427309782608695, "loss": 0.3993, "step": 3172 }, { "epoch": 0.4967125860989355, "grad_norm": 1.208204984664917, "learning_rate": 0.00011426120923913043, "loss": 0.5848, "step": 3173 }, { "epoch": 0.4968691296180338, "grad_norm": 2.5409939289093018, "learning_rate": 0.0001142493206521739, "loss": 0.4723, "step": 3174 }, { "epoch": 0.49702567313713214, "grad_norm": 2.5270321369171143, "learning_rate": 0.00011423743206521738, "loss": 0.746, "step": 3175 }, { "epoch": 0.4971822166562304, "grad_norm": 4.4485578536987305, "learning_rate": 0.00011422554347826086, "loss": 
0.7506, "step": 3176 }, { "epoch": 0.49733876017532874, "grad_norm": 2.533590316772461, "learning_rate": 0.00011421365489130434, "loss": 0.8205, "step": 3177 }, { "epoch": 0.49749530369442707, "grad_norm": 2.835082769393921, "learning_rate": 0.00011420176630434782, "loss": 1.1184, "step": 3178 }, { "epoch": 0.49765184721352534, "grad_norm": 2.6077747344970703, "learning_rate": 0.0001141898777173913, "loss": 0.9071, "step": 3179 }, { "epoch": 0.4978083907326237, "grad_norm": 1.5391504764556885, "learning_rate": 0.00011417798913043477, "loss": 0.6851, "step": 3180 }, { "epoch": 0.497964934251722, "grad_norm": 2.0913045406341553, "learning_rate": 0.00011416610054347826, "loss": 0.474, "step": 3181 }, { "epoch": 0.4981214777708203, "grad_norm": 3.630483388900757, "learning_rate": 0.00011415421195652172, "loss": 1.0824, "step": 3182 }, { "epoch": 0.4982780212899186, "grad_norm": 3.1027588844299316, "learning_rate": 0.0001141423233695652, "loss": 0.9844, "step": 3183 }, { "epoch": 0.49843456480901693, "grad_norm": 3.271501064300537, "learning_rate": 0.00011413043478260869, "loss": 1.0513, "step": 3184 }, { "epoch": 0.4985911083281152, "grad_norm": 4.9461565017700195, "learning_rate": 0.00011411854619565216, "loss": 0.7605, "step": 3185 }, { "epoch": 0.49874765184721354, "grad_norm": 1.774553894996643, "learning_rate": 0.00011410665760869564, "loss": 0.7644, "step": 3186 }, { "epoch": 0.4989041953663118, "grad_norm": 2.267664909362793, "learning_rate": 0.00011409476902173912, "loss": 1.0694, "step": 3187 }, { "epoch": 0.49906073888541014, "grad_norm": 2.29099440574646, "learning_rate": 0.0001140828804347826, "loss": 0.928, "step": 3188 }, { "epoch": 0.49921728240450847, "grad_norm": 2.604968309402466, "learning_rate": 0.00011407099184782608, "loss": 1.1504, "step": 3189 }, { "epoch": 0.49937382592360674, "grad_norm": 2.4398140907287598, "learning_rate": 0.00011405910326086955, "loss": 1.0195, "step": 3190 }, { "epoch": 0.49953036944270507, "grad_norm": 3.483396291732788, 
"learning_rate": 0.00011404721467391303, "loss": 1.1652, "step": 3191 }, { "epoch": 0.4996869129618034, "grad_norm": 4.33449649810791, "learning_rate": 0.00011403532608695651, "loss": 1.8529, "step": 3192 }, { "epoch": 0.49984345648090167, "grad_norm": 6.52424430847168, "learning_rate": 0.00011402343749999999, "loss": 1.4227, "step": 3193 }, { "epoch": 0.5, "grad_norm": 9.553756713867188, "learning_rate": 0.00011401154891304347, "loss": 1.5129, "step": 3194 }, { "epoch": 0.5001565435190983, "grad_norm": 5.555142402648926, "learning_rate": 0.00011399966032608695, "loss": 1.1118, "step": 3195 }, { "epoch": 0.5003130870381967, "grad_norm": 4.870430946350098, "learning_rate": 0.00011398777173913042, "loss": 0.9421, "step": 3196 }, { "epoch": 0.5004696305572949, "grad_norm": 5.852546691894531, "learning_rate": 0.0001139758831521739, "loss": 1.2617, "step": 3197 }, { "epoch": 0.5006261740763932, "grad_norm": 5.960766315460205, "learning_rate": 0.00011396399456521738, "loss": 0.9918, "step": 3198 }, { "epoch": 0.5007827175954915, "grad_norm": 2.3266239166259766, "learning_rate": 0.00011395210597826086, "loss": 0.8469, "step": 3199 }, { "epoch": 0.5009392611145899, "grad_norm": 2.275381088256836, "learning_rate": 0.00011394021739130434, "loss": 0.9318, "step": 3200 }, { "epoch": 0.5010958046336882, "grad_norm": 0.5467706322669983, "learning_rate": 0.00011392832880434783, "loss": 0.2947, "step": 3201 }, { "epoch": 0.5012523481527865, "grad_norm": 2.2040793895721436, "learning_rate": 0.0001139164402173913, "loss": 0.4692, "step": 3202 }, { "epoch": 0.5014088916718847, "grad_norm": 0.5163888931274414, "learning_rate": 0.00011390455163043478, "loss": 0.3257, "step": 3203 }, { "epoch": 0.5015654351909831, "grad_norm": 0.819380521774292, "learning_rate": 0.00011389266304347826, "loss": 0.2814, "step": 3204 }, { "epoch": 0.5017219787100814, "grad_norm": 0.4860137701034546, "learning_rate": 0.00011388077445652173, "loss": 0.2403, "step": 3205 }, { "epoch": 0.5018785222291797, 
"grad_norm": 1.131955862045288, "learning_rate": 0.0001138688858695652, "loss": 0.2977, "step": 3206 }, { "epoch": 0.502035065748278, "grad_norm": 0.6189460754394531, "learning_rate": 0.00011385699728260868, "loss": 0.2687, "step": 3207 }, { "epoch": 0.5021916092673764, "grad_norm": 0.6557197570800781, "learning_rate": 0.00011384510869565216, "loss": 0.3672, "step": 3208 }, { "epoch": 0.5023481527864746, "grad_norm": 1.2981706857681274, "learning_rate": 0.00011383322010869564, "loss": 0.4947, "step": 3209 }, { "epoch": 0.5025046963055729, "grad_norm": 1.307680368423462, "learning_rate": 0.00011382133152173912, "loss": 0.9273, "step": 3210 }, { "epoch": 0.5026612398246713, "grad_norm": 0.686278223991394, "learning_rate": 0.0001138094429347826, "loss": 0.3373, "step": 3211 }, { "epoch": 0.5028177833437696, "grad_norm": 1.3786416053771973, "learning_rate": 0.00011379755434782607, "loss": 0.592, "step": 3212 }, { "epoch": 0.5029743268628679, "grad_norm": 2.6350762844085693, "learning_rate": 0.00011378566576086955, "loss": 0.5423, "step": 3213 }, { "epoch": 0.5031308703819661, "grad_norm": 1.2983195781707764, "learning_rate": 0.00011377377717391303, "loss": 0.5296, "step": 3214 }, { "epoch": 0.5032874139010645, "grad_norm": 1.8936702013015747, "learning_rate": 0.00011376188858695651, "loss": 0.6084, "step": 3215 }, { "epoch": 0.5034439574201628, "grad_norm": 2.6016745567321777, "learning_rate": 0.00011374999999999999, "loss": 0.4298, "step": 3216 }, { "epoch": 0.5036005009392611, "grad_norm": 1.7090603113174438, "learning_rate": 0.00011373811141304346, "loss": 0.414, "step": 3217 }, { "epoch": 0.5037570444583594, "grad_norm": 1.578865885734558, "learning_rate": 0.00011372622282608694, "loss": 0.6374, "step": 3218 }, { "epoch": 0.5039135879774578, "grad_norm": 1.6763290166854858, "learning_rate": 0.00011371433423913042, "loss": 0.4929, "step": 3219 }, { "epoch": 0.504070131496556, "grad_norm": 1.9304485321044922, "learning_rate": 0.0001137024456521739, "loss": 0.5274, 
"step": 3220 }, { "epoch": 0.5042266750156543, "grad_norm": 1.8000478744506836, "learning_rate": 0.00011369055706521739, "loss": 0.44, "step": 3221 }, { "epoch": 0.5043832185347527, "grad_norm": 1.2024234533309937, "learning_rate": 0.00011367866847826087, "loss": 0.5413, "step": 3222 }, { "epoch": 0.504539762053851, "grad_norm": 1.4786911010742188, "learning_rate": 0.00011366677989130435, "loss": 0.6353, "step": 3223 }, { "epoch": 0.5046963055729493, "grad_norm": 1.8831760883331299, "learning_rate": 0.00011365489130434783, "loss": 1.0054, "step": 3224 }, { "epoch": 0.5048528490920476, "grad_norm": 3.8601737022399902, "learning_rate": 0.0001136430027173913, "loss": 1.2517, "step": 3225 }, { "epoch": 0.5050093926111459, "grad_norm": 2.588557481765747, "learning_rate": 0.00011363111413043478, "loss": 1.2329, "step": 3226 }, { "epoch": 0.5051659361302442, "grad_norm": 1.459268569946289, "learning_rate": 0.00011361922554347826, "loss": 0.4539, "step": 3227 }, { "epoch": 0.5053224796493425, "grad_norm": 2.600733757019043, "learning_rate": 0.00011360733695652172, "loss": 1.0056, "step": 3228 }, { "epoch": 0.5054790231684408, "grad_norm": 1.9696216583251953, "learning_rate": 0.0001135954483695652, "loss": 0.5915, "step": 3229 }, { "epoch": 0.5056355666875392, "grad_norm": 2.593480110168457, "learning_rate": 0.00011358355978260868, "loss": 0.9972, "step": 3230 }, { "epoch": 0.5057921102066374, "grad_norm": 3.069195032119751, "learning_rate": 0.00011357167119565216, "loss": 1.0211, "step": 3231 }, { "epoch": 0.5059486537257357, "grad_norm": 3.7026925086975098, "learning_rate": 0.00011355978260869564, "loss": 0.9764, "step": 3232 }, { "epoch": 0.506105197244834, "grad_norm": 3.377251625061035, "learning_rate": 0.00011354789402173912, "loss": 1.0545, "step": 3233 }, { "epoch": 0.5062617407639324, "grad_norm": 2.220733404159546, "learning_rate": 0.0001135360054347826, "loss": 0.8044, "step": 3234 }, { "epoch": 0.5064182842830307, "grad_norm": 4.220874786376953, "learning_rate": 
0.00011352411684782607, "loss": 1.2236, "step": 3235 }, { "epoch": 0.506574827802129, "grad_norm": 1.6839908361434937, "learning_rate": 0.00011351222826086955, "loss": 1.1276, "step": 3236 }, { "epoch": 0.5067313713212273, "grad_norm": 5.753138065338135, "learning_rate": 0.00011350033967391303, "loss": 1.4925, "step": 3237 }, { "epoch": 0.5068879148403256, "grad_norm": 2.555654287338257, "learning_rate": 0.0001134884510869565, "loss": 1.1225, "step": 3238 }, { "epoch": 0.5070444583594239, "grad_norm": 1.9356811046600342, "learning_rate": 0.00011347656249999998, "loss": 1.5873, "step": 3239 }, { "epoch": 0.5072010018785222, "grad_norm": 3.420184373855591, "learning_rate": 0.00011346467391304346, "loss": 1.0631, "step": 3240 }, { "epoch": 0.5073575453976206, "grad_norm": 1.6056885719299316, "learning_rate": 0.00011345278532608695, "loss": 1.1093, "step": 3241 }, { "epoch": 0.5075140889167189, "grad_norm": 4.892837047576904, "learning_rate": 0.00011344089673913043, "loss": 1.5124, "step": 3242 }, { "epoch": 0.5076706324358171, "grad_norm": 2.902073860168457, "learning_rate": 0.00011342900815217391, "loss": 1.6534, "step": 3243 }, { "epoch": 0.5078271759549154, "grad_norm": 3.1783697605133057, "learning_rate": 0.00011341711956521739, "loss": 0.9006, "step": 3244 }, { "epoch": 0.5079837194740138, "grad_norm": 4.1951398849487305, "learning_rate": 0.00011340523097826087, "loss": 1.5623, "step": 3245 }, { "epoch": 0.5081402629931121, "grad_norm": 1.9096518754959106, "learning_rate": 0.00011339334239130434, "loss": 0.5721, "step": 3246 }, { "epoch": 0.5082968065122104, "grad_norm": 2.0758280754089355, "learning_rate": 0.00011338145380434782, "loss": 0.5085, "step": 3247 }, { "epoch": 0.5084533500313086, "grad_norm": 3.7097935676574707, "learning_rate": 0.0001133695652173913, "loss": 1.1098, "step": 3248 }, { "epoch": 0.508609893550407, "grad_norm": 1.7260905504226685, "learning_rate": 0.00011335767663043478, "loss": 0.7854, "step": 3249 }, { "epoch": 0.5087664370695053, 
"grad_norm": 2.4862236976623535, "learning_rate": 0.00011334578804347826, "loss": 1.5121, "step": 3250 }, { "epoch": 0.5089229805886036, "grad_norm": 0.7406353950500488, "learning_rate": 0.00011333389945652172, "loss": 0.4135, "step": 3251 }, { "epoch": 0.509079524107702, "grad_norm": 0.4725307822227478, "learning_rate": 0.0001133220108695652, "loss": 0.3035, "step": 3252 }, { "epoch": 0.5092360676268003, "grad_norm": 0.5265825390815735, "learning_rate": 0.00011331012228260868, "loss": 0.2676, "step": 3253 }, { "epoch": 0.5093926111458985, "grad_norm": 0.6390096545219421, "learning_rate": 0.00011329823369565216, "loss": 0.3265, "step": 3254 }, { "epoch": 0.5095491546649968, "grad_norm": 0.6926789879798889, "learning_rate": 0.00011328634510869563, "loss": 0.3034, "step": 3255 }, { "epoch": 0.5097056981840952, "grad_norm": 0.6442525386810303, "learning_rate": 0.00011327445652173911, "loss": 0.4489, "step": 3256 }, { "epoch": 0.5098622417031935, "grad_norm": 1.1631696224212646, "learning_rate": 0.00011326256793478259, "loss": 0.4595, "step": 3257 }, { "epoch": 0.5100187852222918, "grad_norm": 1.0446901321411133, "learning_rate": 0.00011325067934782607, "loss": 0.4933, "step": 3258 }, { "epoch": 0.5101753287413902, "grad_norm": 1.0826081037521362, "learning_rate": 0.00011323879076086955, "loss": 0.368, "step": 3259 }, { "epoch": 0.5103318722604884, "grad_norm": 1.1472591161727905, "learning_rate": 0.00011322690217391303, "loss": 0.4376, "step": 3260 }, { "epoch": 0.5104884157795867, "grad_norm": 1.3993219137191772, "learning_rate": 0.00011321501358695652, "loss": 0.6062, "step": 3261 }, { "epoch": 0.510644959298685, "grad_norm": 0.8530794978141785, "learning_rate": 0.000113203125, "loss": 0.2799, "step": 3262 }, { "epoch": 0.5108015028177834, "grad_norm": 0.8654371500015259, "learning_rate": 0.00011319123641304347, "loss": 0.4342, "step": 3263 }, { "epoch": 0.5109580463368817, "grad_norm": 1.3323676586151123, "learning_rate": 0.00011317934782608695, "loss": 0.4645, 
"step": 3264 }, { "epoch": 0.51111458985598, "grad_norm": 1.2478748559951782, "learning_rate": 0.00011316745923913043, "loss": 0.3744, "step": 3265 }, { "epoch": 0.5112711333750782, "grad_norm": 1.1204488277435303, "learning_rate": 0.00011315557065217391, "loss": 0.4208, "step": 3266 }, { "epoch": 0.5114276768941766, "grad_norm": 1.2278555631637573, "learning_rate": 0.00011314368206521739, "loss": 0.4889, "step": 3267 }, { "epoch": 0.5115842204132749, "grad_norm": 1.8348767757415771, "learning_rate": 0.00011313179347826086, "loss": 0.3268, "step": 3268 }, { "epoch": 0.5117407639323732, "grad_norm": 1.7428613901138306, "learning_rate": 0.00011311990489130434, "loss": 0.5186, "step": 3269 }, { "epoch": 0.5118973074514716, "grad_norm": 1.5740140676498413, "learning_rate": 0.00011310801630434782, "loss": 0.6612, "step": 3270 }, { "epoch": 0.5120538509705698, "grad_norm": 1.3479955196380615, "learning_rate": 0.0001130961277173913, "loss": 0.6288, "step": 3271 }, { "epoch": 0.5122103944896681, "grad_norm": 1.491121768951416, "learning_rate": 0.00011308423913043478, "loss": 0.9085, "step": 3272 }, { "epoch": 0.5123669380087664, "grad_norm": 1.4121989011764526, "learning_rate": 0.00011307235054347826, "loss": 0.6707, "step": 3273 }, { "epoch": 0.5125234815278648, "grad_norm": 2.01304030418396, "learning_rate": 0.00011306046195652172, "loss": 1.0679, "step": 3274 }, { "epoch": 0.5126800250469631, "grad_norm": 2.6822500228881836, "learning_rate": 0.0001130485733695652, "loss": 0.606, "step": 3275 }, { "epoch": 0.5128365685660614, "grad_norm": 1.951314926147461, "learning_rate": 0.00011303668478260868, "loss": 0.6766, "step": 3276 }, { "epoch": 0.5129931120851596, "grad_norm": 2.193668842315674, "learning_rate": 0.00011302479619565215, "loss": 0.6993, "step": 3277 }, { "epoch": 0.513149655604258, "grad_norm": 1.8586572408676147, "learning_rate": 0.00011301290760869563, "loss": 0.8385, "step": 3278 }, { "epoch": 0.5133061991233563, "grad_norm": 1.7612591981887817, 
"learning_rate": 0.00011300101902173911, "loss": 0.4752, "step": 3279 }, { "epoch": 0.5134627426424546, "grad_norm": 1.7117680311203003, "learning_rate": 0.0001129891304347826, "loss": 0.5574, "step": 3280 }, { "epoch": 0.513619286161553, "grad_norm": 3.604433059692383, "learning_rate": 0.00011297724184782608, "loss": 0.8794, "step": 3281 }, { "epoch": 0.5137758296806513, "grad_norm": 3.2087631225585938, "learning_rate": 0.00011296535326086956, "loss": 0.6452, "step": 3282 }, { "epoch": 0.5139323731997495, "grad_norm": 2.999070882797241, "learning_rate": 0.00011295346467391304, "loss": 0.7934, "step": 3283 }, { "epoch": 0.5140889167188478, "grad_norm": 2.0397908687591553, "learning_rate": 0.00011294157608695651, "loss": 0.994, "step": 3284 }, { "epoch": 0.5142454602379462, "grad_norm": NaN, "learning_rate": 0.00011294157608695651, "loss": 0.0, "step": 3285 }, { "epoch": 0.5144020037570445, "grad_norm": 3.277897357940674, "learning_rate": 0.00011292968749999999, "loss": 0.9381, "step": 3286 }, { "epoch": 0.5145585472761428, "grad_norm": 2.3456990718841553, "learning_rate": 0.00011291779891304347, "loss": 0.9733, "step": 3287 }, { "epoch": 0.514715090795241, "grad_norm": 2.0013461112976074, "learning_rate": 0.00011290591032608695, "loss": 1.4567, "step": 3288 }, { "epoch": 0.5148716343143394, "grad_norm": 3.3697702884674072, "learning_rate": 0.00011289402173913043, "loss": 1.0652, "step": 3289 }, { "epoch": 0.5150281778334377, "grad_norm": 2.9003469944000244, "learning_rate": 0.0001128821331521739, "loss": 1.2956, "step": 3290 }, { "epoch": 0.515184721352536, "grad_norm": 2.9003005027770996, "learning_rate": 0.00011287024456521738, "loss": 1.1428, "step": 3291 }, { "epoch": 0.5153412648716343, "grad_norm": 3.1923482418060303, "learning_rate": 0.00011285835597826086, "loss": 1.2375, "step": 3292 }, { "epoch": 0.5154978083907327, "grad_norm": 1.706063151359558, "learning_rate": 0.00011284646739130434, "loss": 1.2401, "step": 3293 }, { "epoch": 0.5156543519098309, 
"grad_norm": 2.9703023433685303, "learning_rate": 0.00011283457880434782, "loss": 1.6664, "step": 3294 }, { "epoch": 0.5158108954289292, "grad_norm": 2.888195514678955, "learning_rate": 0.0001128226902173913, "loss": 1.5466, "step": 3295 }, { "epoch": 0.5159674389480275, "grad_norm": 1.9334441423416138, "learning_rate": 0.00011281080163043477, "loss": 0.9633, "step": 3296 }, { "epoch": 0.5161239824671259, "grad_norm": 2.6576924324035645, "learning_rate": 0.00011279891304347825, "loss": 0.8821, "step": 3297 }, { "epoch": 0.5162805259862242, "grad_norm": 2.9541609287261963, "learning_rate": 0.00011278702445652172, "loss": 0.5971, "step": 3298 }, { "epoch": 0.5164370695053225, "grad_norm": 2.1944942474365234, "learning_rate": 0.0001127751358695652, "loss": 0.9306, "step": 3299 }, { "epoch": 0.5165936130244208, "grad_norm": 1.8665993213653564, "learning_rate": 0.00011276324728260867, "loss": 1.1411, "step": 3300 }, { "epoch": 0.5167501565435191, "grad_norm": 0.8364828824996948, "learning_rate": 0.00011275135869565217, "loss": 0.3568, "step": 3301 }, { "epoch": 0.5169067000626174, "grad_norm": 0.930505096912384, "learning_rate": 0.00011273947010869564, "loss": 0.3004, "step": 3302 }, { "epoch": 0.5170632435817157, "grad_norm": 0.803800642490387, "learning_rate": 0.00011272758152173912, "loss": 0.3215, "step": 3303 }, { "epoch": 0.5172197871008141, "grad_norm": 2.7054405212402344, "learning_rate": 0.0001127156929347826, "loss": 0.5556, "step": 3304 }, { "epoch": 0.5173763306199123, "grad_norm": 1.1184911727905273, "learning_rate": 0.00011270380434782608, "loss": 0.3674, "step": 3305 }, { "epoch": 0.5175328741390106, "grad_norm": 1.4789512157440186, "learning_rate": 0.00011269191576086956, "loss": 0.5363, "step": 3306 }, { "epoch": 0.5176894176581089, "grad_norm": 0.9778873324394226, "learning_rate": 0.00011268002717391303, "loss": 0.553, "step": 3307 }, { "epoch": 0.5178459611772073, "grad_norm": 1.5970479249954224, "learning_rate": 0.00011266813858695651, "loss": 
0.4248, "step": 3308 }, { "epoch": 0.5180025046963056, "grad_norm": 1.1663488149642944, "learning_rate": 0.00011265624999999999, "loss": 0.3962, "step": 3309 }, { "epoch": 0.5181590482154039, "grad_norm": 2.150560140609741, "learning_rate": 0.00011264436141304347, "loss": 0.6826, "step": 3310 }, { "epoch": 0.5183155917345021, "grad_norm": 1.4061486721038818, "learning_rate": 0.00011263247282608695, "loss": 0.3511, "step": 3311 }, { "epoch": 0.5184721352536005, "grad_norm": 1.2626513242721558, "learning_rate": 0.00011262058423913043, "loss": 0.4337, "step": 3312 }, { "epoch": 0.5186286787726988, "grad_norm": 0.7658659219741821, "learning_rate": 0.0001126086956521739, "loss": 0.3942, "step": 3313 }, { "epoch": 0.5187852222917971, "grad_norm": 0.9743196368217468, "learning_rate": 0.00011259680706521738, "loss": 0.3241, "step": 3314 }, { "epoch": 0.5189417658108955, "grad_norm": 1.161643624305725, "learning_rate": 0.00011258491847826086, "loss": 0.5305, "step": 3315 }, { "epoch": 0.5190983093299938, "grad_norm": 1.1131004095077515, "learning_rate": 0.00011257302989130434, "loss": 0.5184, "step": 3316 }, { "epoch": 0.519254852849092, "grad_norm": 1.725988507270813, "learning_rate": 0.00011256114130434782, "loss": 0.793, "step": 3317 }, { "epoch": 0.5194113963681903, "grad_norm": 0.9101620316505432, "learning_rate": 0.0001125492527173913, "loss": 0.4079, "step": 3318 }, { "epoch": 0.5195679398872887, "grad_norm": 1.7146433591842651, "learning_rate": 0.00011253736413043479, "loss": 0.3504, "step": 3319 }, { "epoch": 0.519724483406387, "grad_norm": 1.5554237365722656, "learning_rate": 0.00011252547554347826, "loss": 0.5347, "step": 3320 }, { "epoch": 0.5198810269254853, "grad_norm": 1.6072149276733398, "learning_rate": 0.00011251358695652173, "loss": 0.5421, "step": 3321 }, { "epoch": 0.5200375704445835, "grad_norm": 1.7767386436462402, "learning_rate": 0.0001125016983695652, "loss": 0.6134, "step": 3322 }, { "epoch": 0.5201941139636819, "grad_norm": 2.342153787612915, 
"learning_rate": 0.00011248980978260868, "loss": 0.5909, "step": 3323 }, { "epoch": 0.5203506574827802, "grad_norm": 1.8662400245666504, "learning_rate": 0.00011247792119565216, "loss": 0.6428, "step": 3324 }, { "epoch": 0.5205072010018785, "grad_norm": 1.924338698387146, "learning_rate": 0.00011246603260869564, "loss": 0.8876, "step": 3325 }, { "epoch": 0.5206637445209769, "grad_norm": 2.138592481613159, "learning_rate": 0.00011245414402173912, "loss": 0.6736, "step": 3326 }, { "epoch": 0.5208202880400752, "grad_norm": 1.214433193206787, "learning_rate": 0.0001124422554347826, "loss": 0.6133, "step": 3327 }, { "epoch": 0.5209768315591734, "grad_norm": 1.6266801357269287, "learning_rate": 0.00011243036684782608, "loss": 0.4874, "step": 3328 }, { "epoch": 0.5211333750782717, "grad_norm": 2.8735084533691406, "learning_rate": 0.00011241847826086955, "loss": 1.0066, "step": 3329 }, { "epoch": 0.5212899185973701, "grad_norm": 1.6031203269958496, "learning_rate": 0.00011240658967391303, "loss": 0.5788, "step": 3330 }, { "epoch": 0.5214464621164684, "grad_norm": 1.4447262287139893, "learning_rate": 0.00011239470108695651, "loss": 0.5092, "step": 3331 }, { "epoch": 0.5216030056355667, "grad_norm": 2.531625509262085, "learning_rate": 0.00011238281249999999, "loss": 0.6131, "step": 3332 }, { "epoch": 0.521759549154665, "grad_norm": 2.95619797706604, "learning_rate": 0.00011237092391304347, "loss": 0.8358, "step": 3333 }, { "epoch": 0.5219160926737633, "grad_norm": 2.5800063610076904, "learning_rate": 0.00011235903532608694, "loss": 0.7704, "step": 3334 }, { "epoch": 0.5220726361928616, "grad_norm": 5.160123348236084, "learning_rate": 0.00011234714673913042, "loss": 1.2622, "step": 3335 }, { "epoch": 0.5222291797119599, "grad_norm": 2.319087505340576, "learning_rate": 0.0001123352581521739, "loss": 0.8734, "step": 3336 }, { "epoch": 0.5223857232310583, "grad_norm": 3.4536752700805664, "learning_rate": 0.00011232336956521738, "loss": 0.7292, "step": 3337 }, { "epoch": 
0.5225422667501566, "grad_norm": 3.0074119567871094, "learning_rate": 0.00011231148097826086, "loss": 1.178, "step": 3338 }, { "epoch": 0.5226988102692548, "grad_norm": 2.885584592819214, "learning_rate": 0.00011229959239130435, "loss": 1.2641, "step": 3339 }, { "epoch": 0.5228553537883531, "grad_norm": 4.068000793457031, "learning_rate": 0.00011228770380434783, "loss": 1.2678, "step": 3340 }, { "epoch": 0.5230118973074515, "grad_norm": 4.241951942443848, "learning_rate": 0.0001122758152173913, "loss": 1.7755, "step": 3341 }, { "epoch": 0.5231684408265498, "grad_norm": 3.28718638420105, "learning_rate": 0.00011226392663043478, "loss": 1.0605, "step": 3342 }, { "epoch": 0.5233249843456481, "grad_norm": 8.025979995727539, "learning_rate": 0.00011225203804347826, "loss": 1.2488, "step": 3343 }, { "epoch": 0.5234815278647464, "grad_norm": 5.656589508056641, "learning_rate": 0.00011224014945652173, "loss": 0.9758, "step": 3344 }, { "epoch": 0.5236380713838447, "grad_norm": 4.637467384338379, "learning_rate": 0.0001122282608695652, "loss": 1.5362, "step": 3345 }, { "epoch": 0.523794614902943, "grad_norm": 3.9303641319274902, "learning_rate": 0.00011221637228260868, "loss": 1.1447, "step": 3346 }, { "epoch": 0.5239511584220413, "grad_norm": 3.8709306716918945, "learning_rate": 0.00011220448369565216, "loss": 0.7072, "step": 3347 }, { "epoch": 0.5241077019411396, "grad_norm": 1.6775429248809814, "learning_rate": 0.00011219259510869564, "loss": 0.86, "step": 3348 }, { "epoch": 0.524264245460238, "grad_norm": 2.3210561275482178, "learning_rate": 0.00011218070652173912, "loss": 0.8344, "step": 3349 }, { "epoch": 0.5244207889793363, "grad_norm": 1.6691206693649292, "learning_rate": 0.0001121688179347826, "loss": 0.9953, "step": 3350 }, { "epoch": 0.5245773324984345, "grad_norm": 0.4628916084766388, "learning_rate": 0.00011215692934782607, "loss": 0.3583, "step": 3351 }, { "epoch": 0.5247338760175329, "grad_norm": 0.7104076147079468, "learning_rate": 0.00011214504076086955, 
"loss": 0.289, "step": 3352 }, { "epoch": 0.5248904195366312, "grad_norm": 0.8564532995223999, "learning_rate": 0.00011213315217391303, "loss": 0.4143, "step": 3353 }, { "epoch": 0.5250469630557295, "grad_norm": 0.7229275107383728, "learning_rate": 0.00011212126358695651, "loss": 0.2943, "step": 3354 }, { "epoch": 0.5252035065748278, "grad_norm": 0.7988685965538025, "learning_rate": 0.00011210937499999999, "loss": 0.3821, "step": 3355 }, { "epoch": 0.5253600500939261, "grad_norm": 0.8212572932243347, "learning_rate": 0.00011209748641304346, "loss": 0.3322, "step": 3356 }, { "epoch": 0.5255165936130244, "grad_norm": 4.967774391174316, "learning_rate": 0.00011208559782608694, "loss": 0.4252, "step": 3357 }, { "epoch": 0.5256731371321227, "grad_norm": 0.80504310131073, "learning_rate": 0.00011207370923913043, "loss": 0.4384, "step": 3358 }, { "epoch": 0.525829680651221, "grad_norm": 1.1391942501068115, "learning_rate": 0.00011206182065217391, "loss": 0.4599, "step": 3359 }, { "epoch": 0.5259862241703194, "grad_norm": 1.3096336126327515, "learning_rate": 0.00011204993206521739, "loss": 0.4296, "step": 3360 }, { "epoch": 0.5261427676894177, "grad_norm": 0.9137067794799805, "learning_rate": 0.00011203804347826087, "loss": 0.4445, "step": 3361 }, { "epoch": 0.5262993112085159, "grad_norm": 14.93716812133789, "learning_rate": 0.00011202615489130435, "loss": 3.6514, "step": 3362 }, { "epoch": 0.5264558547276142, "grad_norm": 0.9395525455474854, "learning_rate": 0.00011201426630434782, "loss": 0.3485, "step": 3363 }, { "epoch": 0.5266123982467126, "grad_norm": 0.6992456316947937, "learning_rate": 0.0001120023777173913, "loss": 0.4144, "step": 3364 }, { "epoch": 0.5267689417658109, "grad_norm": 2.2863237857818604, "learning_rate": 0.00011199048913043478, "loss": 0.5653, "step": 3365 }, { "epoch": 0.5269254852849092, "grad_norm": 2.0885026454925537, "learning_rate": 0.00011197860054347826, "loss": 0.455, "step": 3366 }, { "epoch": 0.5270820288040076, "grad_norm": 
3.211329460144043, "learning_rate": 0.00011196671195652172, "loss": 0.8506, "step": 3367 }, { "epoch": 0.5272385723231058, "grad_norm": 1.8506795167922974, "learning_rate": 0.0001119548233695652, "loss": 0.5912, "step": 3368 }, { "epoch": 0.5273951158422041, "grad_norm": 1.0545448064804077, "learning_rate": 0.00011194293478260868, "loss": 0.4753, "step": 3369 }, { "epoch": 0.5275516593613024, "grad_norm": 1.4904415607452393, "learning_rate": 0.00011193104619565216, "loss": 0.6126, "step": 3370 }, { "epoch": 0.5277082028804008, "grad_norm": 1.2915406227111816, "learning_rate": 0.00011191915760869564, "loss": 0.3764, "step": 3371 }, { "epoch": 0.5278647463994991, "grad_norm": 1.7138348817825317, "learning_rate": 0.00011190726902173911, "loss": 0.8348, "step": 3372 }, { "epoch": 0.5280212899185974, "grad_norm": 1.3370615243911743, "learning_rate": 0.00011189538043478259, "loss": 0.7119, "step": 3373 }, { "epoch": 0.5281778334376956, "grad_norm": 1.1037729978561401, "learning_rate": 0.00011188349184782607, "loss": 0.4941, "step": 3374 }, { "epoch": 0.528334376956794, "grad_norm": 2.1124205589294434, "learning_rate": 0.00011187160326086955, "loss": 0.6168, "step": 3375 }, { "epoch": 0.5284909204758923, "grad_norm": 1.507457971572876, "learning_rate": 0.00011185971467391303, "loss": 0.7744, "step": 3376 }, { "epoch": 0.5286474639949906, "grad_norm": 2.7326130867004395, "learning_rate": 0.0001118478260869565, "loss": 0.8235, "step": 3377 }, { "epoch": 0.528804007514089, "grad_norm": 2.8210039138793945, "learning_rate": 0.0001118359375, "loss": 0.7044, "step": 3378 }, { "epoch": 0.5289605510331872, "grad_norm": 4.957598686218262, "learning_rate": 0.00011182404891304348, "loss": 1.4001, "step": 3379 }, { "epoch": 0.5291170945522855, "grad_norm": 2.2010719776153564, "learning_rate": 0.00011181216032608695, "loss": 0.7249, "step": 3380 }, { "epoch": 0.5292736380713838, "grad_norm": 3.67569637298584, "learning_rate": 0.00011180027173913043, "loss": 0.8132, "step": 3381 }, { 
"epoch": 0.5294301815904822, "grad_norm": 2.917200803756714, "learning_rate": 0.00011178838315217391, "loss": 0.891, "step": 3382 }, { "epoch": 0.5295867251095805, "grad_norm": 1.5264979600906372, "learning_rate": 0.00011177649456521739, "loss": 0.3772, "step": 3383 }, { "epoch": 0.5297432686286788, "grad_norm": 1.9645525217056274, "learning_rate": 0.00011176460597826087, "loss": 0.8021, "step": 3384 }, { "epoch": 0.529899812147777, "grad_norm": 1.7574735879898071, "learning_rate": 0.00011175271739130434, "loss": 0.6696, "step": 3385 }, { "epoch": 0.5300563556668754, "grad_norm": 5.492279052734375, "learning_rate": 0.00011174082880434782, "loss": 1.3134, "step": 3386 }, { "epoch": 0.5302128991859737, "grad_norm": 3.7675137519836426, "learning_rate": 0.0001117289402173913, "loss": 0.9908, "step": 3387 }, { "epoch": 0.530369442705072, "grad_norm": 5.049015045166016, "learning_rate": 0.00011171705163043478, "loss": 1.4213, "step": 3388 }, { "epoch": 0.5305259862241704, "grad_norm": 2.375999927520752, "learning_rate": 0.00011170516304347826, "loss": 0.8639, "step": 3389 }, { "epoch": 0.5306825297432687, "grad_norm": 4.651785373687744, "learning_rate": 0.00011169327445652172, "loss": 0.8849, "step": 3390 }, { "epoch": 0.5308390732623669, "grad_norm": 3.3194515705108643, "learning_rate": 0.0001116813858695652, "loss": 1.7901, "step": 3391 }, { "epoch": 0.5309956167814652, "grad_norm": 3.6694235801696777, "learning_rate": 0.00011166949728260868, "loss": 1.4483, "step": 3392 }, { "epoch": 0.5311521603005636, "grad_norm": 3.3108303546905518, "learning_rate": 0.00011165760869565216, "loss": 1.5089, "step": 3393 }, { "epoch": 0.5313087038196619, "grad_norm": 1.9968076944351196, "learning_rate": 0.00011164572010869563, "loss": 0.6728, "step": 3394 }, { "epoch": 0.5314652473387602, "grad_norm": 3.9596071243286133, "learning_rate": 0.00011163383152173911, "loss": 1.0375, "step": 3395 }, { "epoch": 0.5316217908578584, "grad_norm": 2.6021058559417725, "learning_rate": 
0.00011162194293478259, "loss": 1.4244, "step": 3396 }, { "epoch": 0.5317783343769568, "grad_norm": 4.119985103607178, "learning_rate": 0.00011161005434782607, "loss": 0.9367, "step": 3397 }, { "epoch": 0.5319348778960551, "grad_norm": 3.151984930038452, "learning_rate": 0.00011159816576086956, "loss": 0.8171, "step": 3398 }, { "epoch": 0.5320914214151534, "grad_norm": 4.2190961837768555, "learning_rate": 0.00011158627717391304, "loss": 1.5488, "step": 3399 }, { "epoch": 0.5322479649342517, "grad_norm": 2.8796863555908203, "learning_rate": 0.00011157438858695652, "loss": 1.2969, "step": 3400 }, { "epoch": 0.5324045084533501, "grad_norm": 0.7016223073005676, "learning_rate": 0.0001115625, "loss": 0.339, "step": 3401 }, { "epoch": 0.5325610519724483, "grad_norm": 0.5836032032966614, "learning_rate": 0.00011155061141304347, "loss": 0.2832, "step": 3402 }, { "epoch": 0.5327175954915466, "grad_norm": 0.5470978617668152, "learning_rate": 0.00011153872282608695, "loss": 0.3216, "step": 3403 }, { "epoch": 0.532874139010645, "grad_norm": 1.1922338008880615, "learning_rate": 0.00011152683423913043, "loss": 0.4776, "step": 3404 }, { "epoch": 0.5330306825297433, "grad_norm": 0.4305607080459595, "learning_rate": 0.00011151494565217391, "loss": 0.2022, "step": 3405 }, { "epoch": 0.5331872260488416, "grad_norm": 0.8476937413215637, "learning_rate": 0.00011150305706521739, "loss": 0.3651, "step": 3406 }, { "epoch": 0.5333437695679399, "grad_norm": 0.7953325510025024, "learning_rate": 0.00011149116847826086, "loss": 0.3282, "step": 3407 }, { "epoch": 0.5335003130870382, "grad_norm": 0.9949222207069397, "learning_rate": 0.00011147927989130434, "loss": 0.4358, "step": 3408 }, { "epoch": 0.5336568566061365, "grad_norm": 0.7778651118278503, "learning_rate": 0.00011146739130434782, "loss": 0.3256, "step": 3409 }, { "epoch": 0.5338134001252348, "grad_norm": 0.8406221270561218, "learning_rate": 0.0001114555027173913, "loss": 0.5122, "step": 3410 }, { "epoch": 0.5339699436443331, 
"grad_norm": 1.0037559270858765, "learning_rate": 0.00011144361413043478, "loss": 0.5358, "step": 3411 }, { "epoch": 0.5341264871634315, "grad_norm": 0.8850845694541931, "learning_rate": 0.00011143172554347825, "loss": 0.4601, "step": 3412 }, { "epoch": 0.5342830306825297, "grad_norm": 1.1619433164596558, "learning_rate": 0.00011141983695652172, "loss": 0.4343, "step": 3413 }, { "epoch": 0.534439574201628, "grad_norm": 1.798865795135498, "learning_rate": 0.0001114079483695652, "loss": 0.6544, "step": 3414 }, { "epoch": 0.5345961177207263, "grad_norm": 2.124359607696533, "learning_rate": 0.00011139605978260868, "loss": 0.4481, "step": 3415 }, { "epoch": 0.5347526612398247, "grad_norm": 1.8622241020202637, "learning_rate": 0.00011138417119565215, "loss": 0.4465, "step": 3416 }, { "epoch": 0.534909204758923, "grad_norm": 1.7045314311981201, "learning_rate": 0.00011137228260869563, "loss": 0.5412, "step": 3417 }, { "epoch": 0.5350657482780213, "grad_norm": 1.5642012357711792, "learning_rate": 0.00011136039402173912, "loss": 0.6456, "step": 3418 }, { "epoch": 0.5352222917971196, "grad_norm": 0.9745003581047058, "learning_rate": 0.0001113485054347826, "loss": 0.3097, "step": 3419 }, { "epoch": 0.5353788353162179, "grad_norm": 1.8561583757400513, "learning_rate": 0.00011133661684782608, "loss": 0.3268, "step": 3420 }, { "epoch": 0.5355353788353162, "grad_norm": 1.1429892778396606, "learning_rate": 0.00011132472826086956, "loss": 0.5202, "step": 3421 }, { "epoch": 0.5356919223544145, "grad_norm": 2.592538595199585, "learning_rate": 0.00011131283967391304, "loss": 0.7099, "step": 3422 }, { "epoch": 0.5358484658735129, "grad_norm": 1.464481234550476, "learning_rate": 0.00011130095108695651, "loss": 0.5684, "step": 3423 }, { "epoch": 0.5360050093926112, "grad_norm": 1.664000153541565, "learning_rate": 0.00011128906249999999, "loss": 0.7289, "step": 3424 }, { "epoch": 0.5361615529117094, "grad_norm": 1.8290214538574219, "learning_rate": 0.00011127717391304347, "loss": 0.6766, 
"step": 3425 }, { "epoch": 0.5363180964308077, "grad_norm": 3.4447247982025146, "learning_rate": 0.00011126528532608695, "loss": 1.0781, "step": 3426 }, { "epoch": 0.5364746399499061, "grad_norm": 3.275972366333008, "learning_rate": 0.00011125339673913043, "loss": 0.8248, "step": 3427 }, { "epoch": 0.5366311834690044, "grad_norm": 3.1975746154785156, "learning_rate": 0.0001112415081521739, "loss": 0.9636, "step": 3428 }, { "epoch": 0.5367877269881027, "grad_norm": 2.719964027404785, "learning_rate": 0.00011122961956521738, "loss": 1.0408, "step": 3429 }, { "epoch": 0.536944270507201, "grad_norm": 2.9791324138641357, "learning_rate": 0.00011121773097826086, "loss": 0.7397, "step": 3430 }, { "epoch": 0.5371008140262993, "grad_norm": 3.958561658859253, "learning_rate": 0.00011120584239130434, "loss": 1.4585, "step": 3431 }, { "epoch": 0.5372573575453976, "grad_norm": 2.429102659225464, "learning_rate": 0.00011119395380434782, "loss": 0.9803, "step": 3432 }, { "epoch": 0.5374139010644959, "grad_norm": 2.8197100162506104, "learning_rate": 0.0001111820652173913, "loss": 0.9633, "step": 3433 }, { "epoch": 0.5375704445835943, "grad_norm": 2.3831098079681396, "learning_rate": 0.00011117017663043477, "loss": 0.7283, "step": 3434 }, { "epoch": 0.5377269881026926, "grad_norm": 2.9680798053741455, "learning_rate": 0.00011115828804347827, "loss": 1.0699, "step": 3435 }, { "epoch": 0.5378835316217908, "grad_norm": 2.8820903301239014, "learning_rate": 0.00011114639945652172, "loss": 0.8332, "step": 3436 }, { "epoch": 0.5380400751408891, "grad_norm": 2.272130250930786, "learning_rate": 0.0001111345108695652, "loss": 0.9264, "step": 3437 }, { "epoch": 0.5381966186599875, "grad_norm": 1.858933448791504, "learning_rate": 0.00011112262228260869, "loss": 1.0031, "step": 3438 }, { "epoch": 0.5383531621790858, "grad_norm": 2.0221424102783203, "learning_rate": 0.00011111073369565216, "loss": 0.8399, "step": 3439 }, { "epoch": 0.5385097056981841, "grad_norm": 1.944527268409729, 
"learning_rate": 0.00011109884510869564, "loss": 1.1033, "step": 3440 }, { "epoch": 0.5386662492172825, "grad_norm": 6.095865726470947, "learning_rate": 0.00011108695652173912, "loss": 1.2028, "step": 3441 }, { "epoch": 0.5388227927363807, "grad_norm": 5.01286506652832, "learning_rate": 0.0001110750679347826, "loss": 1.5025, "step": 3442 }, { "epoch": 0.538979336255479, "grad_norm": 2.613521099090576, "learning_rate": 0.00011106317934782608, "loss": 1.0294, "step": 3443 }, { "epoch": 0.5391358797745773, "grad_norm": 4.618668556213379, "learning_rate": 0.00011105129076086956, "loss": 1.5084, "step": 3444 }, { "epoch": 0.5392924232936757, "grad_norm": 3.7636311054229736, "learning_rate": 0.00011103940217391303, "loss": 2.0002, "step": 3445 }, { "epoch": 0.539448966812774, "grad_norm": 3.130589723587036, "learning_rate": 0.00011102751358695651, "loss": 0.6777, "step": 3446 }, { "epoch": 0.5396055103318722, "grad_norm": 1.3224811553955078, "learning_rate": 0.00011101562499999999, "loss": 0.5159, "step": 3447 }, { "epoch": 0.5397620538509705, "grad_norm": 2.868272542953491, "learning_rate": 0.00011100373641304347, "loss": 1.3099, "step": 3448 }, { "epoch": 0.5399185973700689, "grad_norm": 2.4921178817749023, "learning_rate": 0.00011099184782608695, "loss": 0.8699, "step": 3449 }, { "epoch": 0.5400751408891672, "grad_norm": 2.682108163833618, "learning_rate": 0.00011097995923913042, "loss": 1.2957, "step": 3450 }, { "epoch": 0.5402316844082655, "grad_norm": 0.7467138171195984, "learning_rate": 0.0001109680706521739, "loss": 0.3045, "step": 3451 }, { "epoch": 0.5403882279273639, "grad_norm": 1.0937817096710205, "learning_rate": 0.00011095618206521738, "loss": 0.4467, "step": 3452 }, { "epoch": 0.5405447714464621, "grad_norm": 0.6195390224456787, "learning_rate": 0.00011094429347826086, "loss": 0.2774, "step": 3453 }, { "epoch": 0.5407013149655604, "grad_norm": 0.6483671069145203, "learning_rate": 0.00011093240489130434, "loss": 0.3139, "step": 3454 }, { "epoch": 
0.5408578584846587, "grad_norm": 0.5435556769371033, "learning_rate": 0.00011092051630434783, "loss": 0.3124, "step": 3455 }, { "epoch": 0.5410144020037571, "grad_norm": 4.127707481384277, "learning_rate": 0.0001109086277173913, "loss": 0.6369, "step": 3456 }, { "epoch": 0.5411709455228554, "grad_norm": 0.8546549677848816, "learning_rate": 0.00011089673913043478, "loss": 0.3245, "step": 3457 }, { "epoch": 0.5413274890419537, "grad_norm": 1.0216944217681885, "learning_rate": 0.00011088485054347826, "loss": 0.3659, "step": 3458 }, { "epoch": 0.5414840325610519, "grad_norm": 0.5538146495819092, "learning_rate": 0.00011087296195652173, "loss": 0.2632, "step": 3459 }, { "epoch": 0.5416405760801503, "grad_norm": 0.9092782139778137, "learning_rate": 0.0001108610733695652, "loss": 0.383, "step": 3460 }, { "epoch": 0.5417971195992486, "grad_norm": 0.7627533078193665, "learning_rate": 0.00011084918478260868, "loss": 0.3048, "step": 3461 }, { "epoch": 0.5419536631183469, "grad_norm": 0.9529313445091248, "learning_rate": 0.00011083729619565216, "loss": 0.3819, "step": 3462 }, { "epoch": 0.5421102066374452, "grad_norm": 1.3082811832427979, "learning_rate": 0.00011082540760869564, "loss": 0.4316, "step": 3463 }, { "epoch": 0.5422667501565435, "grad_norm": 1.80794358253479, "learning_rate": 0.00011081351902173912, "loss": 0.7195, "step": 3464 }, { "epoch": 0.5424232936756418, "grad_norm": 1.188079833984375, "learning_rate": 0.0001108016304347826, "loss": 0.3532, "step": 3465 }, { "epoch": 0.5425798371947401, "grad_norm": 1.3598816394805908, "learning_rate": 0.00011078974184782607, "loss": 0.5173, "step": 3466 }, { "epoch": 0.5427363807138385, "grad_norm": 1.215948462486267, "learning_rate": 0.00011077785326086955, "loss": 0.4274, "step": 3467 }, { "epoch": 0.5428929242329368, "grad_norm": 2.7520697116851807, "learning_rate": 0.00011076596467391303, "loss": 0.9072, "step": 3468 }, { "epoch": 0.5430494677520351, "grad_norm": 2.407381772994995, "learning_rate": 
0.00011075407608695651, "loss": 0.55, "step": 3469 }, { "epoch": 0.5432060112711333, "grad_norm": 1.3498611450195312, "learning_rate": 0.00011074218749999999, "loss": 0.7565, "step": 3470 }, { "epoch": 0.5433625547902317, "grad_norm": 1.6073955297470093, "learning_rate": 0.00011073029891304347, "loss": 0.5865, "step": 3471 }, { "epoch": 0.54351909830933, "grad_norm": 2.073822021484375, "learning_rate": 0.00011071841032608694, "loss": 0.8622, "step": 3472 }, { "epoch": 0.5436756418284283, "grad_norm": 2.0434021949768066, "learning_rate": 0.00011070652173913042, "loss": 0.7373, "step": 3473 }, { "epoch": 0.5438321853475266, "grad_norm": 3.521308183670044, "learning_rate": 0.0001106946331521739, "loss": 0.5922, "step": 3474 }, { "epoch": 0.543988728866625, "grad_norm": 1.7725569009780884, "learning_rate": 0.00011068274456521739, "loss": 0.6328, "step": 3475 }, { "epoch": 0.5441452723857232, "grad_norm": 2.6232948303222656, "learning_rate": 0.00011067085597826087, "loss": 0.6632, "step": 3476 }, { "epoch": 0.5443018159048215, "grad_norm": 2.1565167903900146, "learning_rate": 0.00011065896739130435, "loss": 0.9212, "step": 3477 }, { "epoch": 0.5444583594239198, "grad_norm": 2.475931406021118, "learning_rate": 0.00011064707880434783, "loss": 1.0727, "step": 3478 }, { "epoch": 0.5446149029430182, "grad_norm": 1.2700499296188354, "learning_rate": 0.0001106351902173913, "loss": 0.5014, "step": 3479 }, { "epoch": 0.5447714464621165, "grad_norm": 2.176129102706909, "learning_rate": 0.00011062330163043478, "loss": 0.6805, "step": 3480 }, { "epoch": 0.5449279899812148, "grad_norm": 5.424550533294678, "learning_rate": 0.00011061141304347826, "loss": 1.0049, "step": 3481 }, { "epoch": 0.545084533500313, "grad_norm": 4.09802770614624, "learning_rate": 0.00011059952445652173, "loss": 0.9863, "step": 3482 }, { "epoch": 0.5452410770194114, "grad_norm": 2.937049627304077, "learning_rate": 0.0001105876358695652, "loss": 0.7051, "step": 3483 }, { "epoch": 0.5453976205385097, 
"grad_norm": 2.3214454650878906, "learning_rate": 0.00011057574728260868, "loss": 0.8151, "step": 3484 }, { "epoch": 0.545554164057608, "grad_norm": 1.38384211063385, "learning_rate": 0.00011056385869565216, "loss": 0.7341, "step": 3485 }, { "epoch": 0.5457107075767064, "grad_norm": 1.8677239418029785, "learning_rate": 0.00011055197010869564, "loss": 0.8374, "step": 3486 }, { "epoch": 0.5458672510958046, "grad_norm": 2.25844407081604, "learning_rate": 0.00011054008152173912, "loss": 1.1039, "step": 3487 }, { "epoch": 0.5460237946149029, "grad_norm": 2.560241222381592, "learning_rate": 0.0001105281929347826, "loss": 0.9002, "step": 3488 }, { "epoch": 0.5461803381340012, "grad_norm": 5.53599739074707, "learning_rate": 0.00011051630434782607, "loss": 0.8766, "step": 3489 }, { "epoch": 0.5463368816530996, "grad_norm": 6.788541793823242, "learning_rate": 0.00011050441576086955, "loss": 0.8553, "step": 3490 }, { "epoch": 0.5464934251721979, "grad_norm": 2.503903865814209, "learning_rate": 0.00011049252717391303, "loss": 1.6841, "step": 3491 }, { "epoch": 0.5466499686912962, "grad_norm": 6.413868427276611, "learning_rate": 0.0001104806385869565, "loss": 1.6638, "step": 3492 }, { "epoch": 0.5468065122103944, "grad_norm": 3.306854009628296, "learning_rate": 0.00011046874999999998, "loss": 1.3411, "step": 3493 }, { "epoch": 0.5469630557294928, "grad_norm": 3.589545249938965, "learning_rate": 0.00011045686141304346, "loss": 1.7772, "step": 3494 }, { "epoch": 0.5471195992485911, "grad_norm": 5.197908401489258, "learning_rate": 0.00011044497282608695, "loss": 1.8198, "step": 3495 }, { "epoch": 0.5472761427676894, "grad_norm": NaN, "learning_rate": 0.00011044497282608695, "loss": 0.0, "step": 3496 }, { "epoch": 0.5474326862867878, "grad_norm": 3.196678638458252, "learning_rate": 0.00011043308423913043, "loss": 1.0794, "step": 3497 }, { "epoch": 0.5475892298058861, "grad_norm": 3.384082794189453, "learning_rate": 0.00011042119565217391, "loss": 0.7966, "step": 3498 }, { "epoch": 
0.5477457733249843, "grad_norm": 1.9554734230041504, "learning_rate": 0.00011040930706521739, "loss": 0.5171, "step": 3499 }, { "epoch": 0.5479023168440826, "grad_norm": 3.138946294784546, "learning_rate": 0.00011039741847826087, "loss": 1.4983, "step": 3500 }, { "epoch": 0.548058860363181, "grad_norm": 0.43462076783180237, "learning_rate": 0.00011038552989130435, "loss": 0.2014, "step": 3501 }, { "epoch": 0.5482154038822793, "grad_norm": 0.6490321755409241, "learning_rate": 0.00011037364130434782, "loss": 0.3832, "step": 3502 }, { "epoch": 0.5483719474013776, "grad_norm": 0.7384947538375854, "learning_rate": 0.0001103617527173913, "loss": 0.2373, "step": 3503 }, { "epoch": 0.5485284909204758, "grad_norm": 1.018007755279541, "learning_rate": 0.00011034986413043478, "loss": 0.3038, "step": 3504 }, { "epoch": 0.5486850344395742, "grad_norm": 0.8815956115722656, "learning_rate": 0.00011033797554347826, "loss": 0.2619, "step": 3505 }, { "epoch": 0.5488415779586725, "grad_norm": 1.2462259531021118, "learning_rate": 0.00011032608695652172, "loss": 0.3898, "step": 3506 }, { "epoch": 0.5489981214777708, "grad_norm": 0.6918308734893799, "learning_rate": 0.0001103141983695652, "loss": 0.3005, "step": 3507 }, { "epoch": 0.5491546649968692, "grad_norm": 0.849932074546814, "learning_rate": 0.00011030230978260868, "loss": 0.2619, "step": 3508 }, { "epoch": 0.5493112085159675, "grad_norm": 0.8610263466835022, "learning_rate": 0.00011029042119565216, "loss": 0.2533, "step": 3509 }, { "epoch": 0.5494677520350657, "grad_norm": 0.8926490545272827, "learning_rate": 0.00011027853260869564, "loss": 0.4094, "step": 3510 }, { "epoch": 0.549624295554164, "grad_norm": 1.083909273147583, "learning_rate": 0.00011026664402173911, "loss": 0.3891, "step": 3511 }, { "epoch": 0.5497808390732624, "grad_norm": 1.6358364820480347, "learning_rate": 0.00011025475543478259, "loss": 0.3447, "step": 3512 }, { "epoch": 0.5499373825923607, "grad_norm": 1.1487104892730713, "learning_rate": 
0.00011024286684782607, "loss": 0.2729, "step": 3513 }, { "epoch": 0.550093926111459, "grad_norm": 1.6536211967468262, "learning_rate": 0.00011023097826086955, "loss": 0.8396, "step": 3514 }, { "epoch": 0.5502504696305573, "grad_norm": 1.258245587348938, "learning_rate": 0.00011021908967391303, "loss": 0.4733, "step": 3515 }, { "epoch": 0.5504070131496556, "grad_norm": 0.8897906541824341, "learning_rate": 0.00011020720108695652, "loss": 0.3667, "step": 3516 }, { "epoch": 0.5505635566687539, "grad_norm": 0.9303690195083618, "learning_rate": 0.0001101953125, "loss": 0.53, "step": 3517 }, { "epoch": 0.5507201001878522, "grad_norm": 1.9776372909545898, "learning_rate": 0.00011018342391304347, "loss": 0.5043, "step": 3518 }, { "epoch": 0.5508766437069506, "grad_norm": 1.3152869939804077, "learning_rate": 0.00011017153532608695, "loss": 0.3948, "step": 3519 }, { "epoch": 0.5510331872260489, "grad_norm": 1.5295816659927368, "learning_rate": 0.00011015964673913043, "loss": 0.59, "step": 3520 }, { "epoch": 0.5511897307451471, "grad_norm": 1.6912004947662354, "learning_rate": 0.00011014775815217391, "loss": 0.5829, "step": 3521 }, { "epoch": 0.5513462742642454, "grad_norm": 2.7575604915618896, "learning_rate": 0.00011013586956521739, "loss": 0.6618, "step": 3522 }, { "epoch": 0.5515028177833438, "grad_norm": 2.3394219875335693, "learning_rate": 0.00011012398097826086, "loss": 0.8986, "step": 3523 }, { "epoch": 0.5516593613024421, "grad_norm": 2.3434157371520996, "learning_rate": 0.00011011209239130434, "loss": 0.9908, "step": 3524 }, { "epoch": 0.5518159048215404, "grad_norm": 2.011910915374756, "learning_rate": 0.00011010020380434782, "loss": 0.7987, "step": 3525 }, { "epoch": 0.5519724483406387, "grad_norm": 2.8754212856292725, "learning_rate": 0.0001100883152173913, "loss": 1.0095, "step": 3526 }, { "epoch": 0.552128991859737, "grad_norm": 4.040971279144287, "learning_rate": 0.00011007642663043478, "loss": 1.2794, "step": 3527 }, { "epoch": 0.5522855353788353, 
"grad_norm": 2.8165037631988525, "learning_rate": 0.00011006453804347826, "loss": 0.7704, "step": 3528 }, { "epoch": 0.5524420788979336, "grad_norm": 3.0529489517211914, "learning_rate": 0.00011005264945652172, "loss": 0.9655, "step": 3529 }, { "epoch": 0.552598622417032, "grad_norm": 3.129857301712036, "learning_rate": 0.0001100407608695652, "loss": 1.3813, "step": 3530 }, { "epoch": 0.5527551659361303, "grad_norm": 3.027365207672119, "learning_rate": 0.00011002887228260868, "loss": 0.9569, "step": 3531 }, { "epoch": 0.5529117094552286, "grad_norm": 1.4493299722671509, "learning_rate": 0.00011001698369565215, "loss": 0.9138, "step": 3532 }, { "epoch": 0.5530682529743268, "grad_norm": 1.4689825773239136, "learning_rate": 0.00011000509510869563, "loss": 0.7224, "step": 3533 }, { "epoch": 0.5532247964934252, "grad_norm": 3.0895557403564453, "learning_rate": 0.00010999320652173911, "loss": 0.9747, "step": 3534 }, { "epoch": 0.5533813400125235, "grad_norm": 1.75118088722229, "learning_rate": 0.00010998131793478259, "loss": 0.7986, "step": 3535 }, { "epoch": 0.5535378835316218, "grad_norm": 2.9842000007629395, "learning_rate": 0.00010996942934782608, "loss": 1.1836, "step": 3536 }, { "epoch": 0.5536944270507201, "grad_norm": 2.635169744491577, "learning_rate": 0.00010995754076086956, "loss": 1.0964, "step": 3537 }, { "epoch": 0.5538509705698184, "grad_norm": 2.6638717651367188, "learning_rate": 0.00010994565217391304, "loss": 1.1213, "step": 3538 }, { "epoch": 0.5540075140889167, "grad_norm": 3.4747042655944824, "learning_rate": 0.00010993376358695652, "loss": 0.8967, "step": 3539 }, { "epoch": 0.554164057608015, "grad_norm": 5.025138854980469, "learning_rate": 0.000109921875, "loss": 0.9008, "step": 3540 }, { "epoch": 0.5543206011271133, "grad_norm": 3.7665510177612305, "learning_rate": 0.00010990998641304347, "loss": 1.124, "step": 3541 }, { "epoch": 0.5544771446462117, "grad_norm": 1.9393270015716553, "learning_rate": 0.00010989809782608695, "loss": 1.4021, "step": 
3542 }, { "epoch": 0.55463368816531, "grad_norm": 3.0149013996124268, "learning_rate": 0.00010988620923913043, "loss": 1.3759, "step": 3543 }, { "epoch": 0.5547902316844082, "grad_norm": 2.646127462387085, "learning_rate": 0.0001098743206521739, "loss": 1.1562, "step": 3544 }, { "epoch": 0.5549467752035065, "grad_norm": 2.376739025115967, "learning_rate": 0.00010986243206521738, "loss": 0.7936, "step": 3545 }, { "epoch": 0.5551033187226049, "grad_norm": 2.577127456665039, "learning_rate": 0.00010985054347826086, "loss": 0.6335, "step": 3546 }, { "epoch": 0.5552598622417032, "grad_norm": 4.54538631439209, "learning_rate": 0.00010983865489130434, "loss": 1.0413, "step": 3547 }, { "epoch": 0.5554164057608015, "grad_norm": 6.393764972686768, "learning_rate": 0.00010982676630434782, "loss": 1.0262, "step": 3548 }, { "epoch": 0.5555729492798999, "grad_norm": 2.6708343029022217, "learning_rate": 0.0001098148777173913, "loss": 0.8612, "step": 3549 }, { "epoch": 0.5557294927989981, "grad_norm": 3.876636028289795, "learning_rate": 0.00010980298913043478, "loss": 1.6765, "step": 3550 }, { "epoch": 0.5558860363180964, "grad_norm": 0.8184608817100525, "learning_rate": 0.00010979110054347825, "loss": 0.2928, "step": 3551 }, { "epoch": 0.5560425798371947, "grad_norm": 0.5266475081443787, "learning_rate": 0.00010977921195652172, "loss": 0.2968, "step": 3552 }, { "epoch": 0.5561991233562931, "grad_norm": 0.8559831976890564, "learning_rate": 0.0001097673233695652, "loss": 0.2977, "step": 3553 }, { "epoch": 0.5563556668753914, "grad_norm": 1.2017126083374023, "learning_rate": 0.00010975543478260867, "loss": 0.3357, "step": 3554 }, { "epoch": 0.5565122103944896, "grad_norm": 0.8536816835403442, "learning_rate": 0.00010974354619565215, "loss": 0.3424, "step": 3555 }, { "epoch": 0.5566687539135879, "grad_norm": 0.9533822536468506, "learning_rate": 0.00010973165760869564, "loss": 0.3509, "step": 3556 }, { "epoch": 0.5568252974326863, "grad_norm": 0.5368643999099731, "learning_rate": 
0.00010971976902173912, "loss": 0.1854, "step": 3557 }, { "epoch": 0.5569818409517846, "grad_norm": 0.6242415308952332, "learning_rate": 0.0001097078804347826, "loss": 0.3463, "step": 3558 }, { "epoch": 0.5571383844708829, "grad_norm": 1.3431727886199951, "learning_rate": 0.00010969599184782608, "loss": 0.4885, "step": 3559 }, { "epoch": 0.5572949279899813, "grad_norm": 1.2369906902313232, "learning_rate": 0.00010968410326086956, "loss": 0.41, "step": 3560 }, { "epoch": 0.5574514715090795, "grad_norm": 1.1734521389007568, "learning_rate": 0.00010967221467391303, "loss": 0.2763, "step": 3561 }, { "epoch": 0.5576080150281778, "grad_norm": 0.6655096411705017, "learning_rate": 0.00010966032608695651, "loss": 0.3547, "step": 3562 }, { "epoch": 0.5577645585472761, "grad_norm": 0.8951511383056641, "learning_rate": 0.00010964843749999999, "loss": 0.3409, "step": 3563 }, { "epoch": 0.5579211020663745, "grad_norm": 1.0563545227050781, "learning_rate": 0.00010963654891304347, "loss": 0.4119, "step": 3564 }, { "epoch": 0.5580776455854728, "grad_norm": null, "learning_rate": 0.00010963654891304347, "loss": 0.0, "step": 3565 }, { "epoch": 0.5582341891045711, "grad_norm": 1.5232510566711426, "learning_rate": 0.00010962466032608695, "loss": 0.5333, "step": 3566 }, { "epoch": 0.5583907326236693, "grad_norm": 1.587132453918457, "learning_rate": 0.00010961277173913043, "loss": 0.6154, "step": 3567 }, { "epoch": 0.5585472761427677, "grad_norm": 1.82208251953125, "learning_rate": 0.0001096008831521739, "loss": 0.5729, "step": 3568 }, { "epoch": 0.558703819661866, "grad_norm": 1.1485824584960938, "learning_rate": 0.00010958899456521738, "loss": 0.6682, "step": 3569 }, { "epoch": 0.5588603631809643, "grad_norm": 1.6878355741500854, "learning_rate": 0.00010957710597826086, "loss": 0.7345, "step": 3570 }, { "epoch": 0.5590169067000627, "grad_norm": 1.814380168914795, "learning_rate": 0.00010956521739130434, "loss": 0.8602, "step": 3571 }, { "epoch": 0.5591734502191609, "grad_norm": 
1.6915546655654907, "learning_rate": 0.00010955332880434782, "loss": 0.4772, "step": 3572 }, { "epoch": 0.5593299937382592, "grad_norm": 2.5840437412261963, "learning_rate": 0.0001095414402173913, "loss": 1.213, "step": 3573 }, { "epoch": 0.5594865372573575, "grad_norm": 1.3970454931259155, "learning_rate": 0.00010952955163043479, "loss": 0.5129, "step": 3574 }, { "epoch": 0.5596430807764559, "grad_norm": 3.813704013824463, "learning_rate": 0.00010951766304347826, "loss": 0.6049, "step": 3575 }, { "epoch": 0.5597996242955542, "grad_norm": 3.3036446571350098, "learning_rate": 0.00010950577445652172, "loss": 1.0921, "step": 3576 }, { "epoch": 0.5599561678146525, "grad_norm": 1.136748194694519, "learning_rate": 0.00010949388586956521, "loss": 0.4018, "step": 3577 }, { "epoch": 0.5601127113337507, "grad_norm": 1.8297280073165894, "learning_rate": 0.00010948199728260869, "loss": 0.3522, "step": 3578 }, { "epoch": 0.5602692548528491, "grad_norm": 3.0237388610839844, "learning_rate": 0.00010947010869565216, "loss": 1.0319, "step": 3579 }, { "epoch": 0.5604257983719474, "grad_norm": 3.588622570037842, "learning_rate": 0.00010945822010869564, "loss": 1.0556, "step": 3580 }, { "epoch": 0.5605823418910457, "grad_norm": 2.381256341934204, "learning_rate": 0.00010944633152173912, "loss": 0.5797, "step": 3581 }, { "epoch": 0.560738885410144, "grad_norm": 2.8259384632110596, "learning_rate": 0.0001094344429347826, "loss": 0.8609, "step": 3582 }, { "epoch": 0.5608954289292424, "grad_norm": 2.0703482627868652, "learning_rate": 0.00010942255434782608, "loss": 0.6809, "step": 3583 }, { "epoch": 0.5610519724483406, "grad_norm": 2.8220319747924805, "learning_rate": 0.00010941066576086955, "loss": 0.7398, "step": 3584 }, { "epoch": 0.5612085159674389, "grad_norm": 3.0024633407592773, "learning_rate": 0.00010939877717391303, "loss": 1.1865, "step": 3585 }, { "epoch": 0.5613650594865373, "grad_norm": 1.940361499786377, "learning_rate": 0.00010938688858695651, "loss": 0.74, "step": 3586 }, 
{ "epoch": 0.5615216030056356, "grad_norm": 2.2479465007781982, "learning_rate": 0.00010937499999999999, "loss": 0.9479, "step": 3587 }, { "epoch": 0.5616781465247339, "grad_norm": 4.6724138259887695, "learning_rate": 0.00010936311141304347, "loss": 1.1546, "step": 3588 }, { "epoch": 0.5618346900438321, "grad_norm": 6.2551984786987305, "learning_rate": 0.00010935122282608695, "loss": 1.9299, "step": 3589 }, { "epoch": 0.5619912335629305, "grad_norm": 2.7485203742980957, "learning_rate": 0.00010933933423913042, "loss": 0.8795, "step": 3590 }, { "epoch": 0.5621477770820288, "grad_norm": 2.4721004962921143, "learning_rate": 0.0001093274456521739, "loss": 1.1086, "step": 3591 }, { "epoch": 0.5623043206011271, "grad_norm": 2.5063998699188232, "learning_rate": 0.00010931555706521738, "loss": 1.3727, "step": 3592 }, { "epoch": 0.5624608641202254, "grad_norm": 3.9205901622772217, "learning_rate": 0.00010930366847826086, "loss": 1.4736, "step": 3593 }, { "epoch": 0.5626174076393238, "grad_norm": 4.170874118804932, "learning_rate": 0.00010929177989130435, "loss": 1.7676, "step": 3594 }, { "epoch": 0.562773951158422, "grad_norm": 3.0156571865081787, "learning_rate": 0.00010927989130434783, "loss": 1.2218, "step": 3595 }, { "epoch": 0.5629304946775203, "grad_norm": 1.8519896268844604, "learning_rate": 0.0001092680027173913, "loss": 0.5407, "step": 3596 }, { "epoch": 0.5630870381966186, "grad_norm": 2.605187177658081, "learning_rate": 0.00010925611413043478, "loss": 0.7893, "step": 3597 }, { "epoch": 0.563243581715717, "grad_norm": 3.910198926925659, "learning_rate": 0.00010924422554347826, "loss": 1.0699, "step": 3598 }, { "epoch": 0.5634001252348153, "grad_norm": 3.242035150527954, "learning_rate": 0.00010923233695652173, "loss": 1.1479, "step": 3599 }, { "epoch": 0.5635566687539136, "grad_norm": 2.1618731021881104, "learning_rate": 0.0001092204483695652, "loss": 1.1272, "step": 3600 }, { "epoch": 0.5637132122730119, "grad_norm": 0.6593093872070312, "learning_rate": 
0.00010920855978260868, "loss": 0.3597, "step": 3601 }, { "epoch": 0.5638697557921102, "grad_norm": 0.44913193583488464, "learning_rate": 0.00010919667119565216, "loss": 0.2208, "step": 3602 }, { "epoch": 0.5640262993112085, "grad_norm": 0.686331570148468, "learning_rate": 0.00010918478260869564, "loss": 0.3592, "step": 3603 }, { "epoch": 0.5641828428303068, "grad_norm": 0.7575231194496155, "learning_rate": 0.00010917289402173912, "loss": 0.4772, "step": 3604 }, { "epoch": 0.5643393863494052, "grad_norm": 1.229622483253479, "learning_rate": 0.0001091610054347826, "loss": 0.3675, "step": 3605 }, { "epoch": 0.5644959298685035, "grad_norm": 0.8193878531455994, "learning_rate": 0.00010914911684782607, "loss": 0.3594, "step": 3606 }, { "epoch": 0.5646524733876017, "grad_norm": 0.8675310015678406, "learning_rate": 0.00010913722826086955, "loss": 0.3397, "step": 3607 }, { "epoch": 0.5648090169067, "grad_norm": 0.9793321490287781, "learning_rate": 0.00010912533967391303, "loss": 0.2987, "step": 3608 }, { "epoch": 0.5649655604257984, "grad_norm": 1.1407331228256226, "learning_rate": 0.00010911345108695651, "loss": 0.4864, "step": 3609 }, { "epoch": 0.5651221039448967, "grad_norm": 1.015324592590332, "learning_rate": 0.00010910156249999999, "loss": 0.2722, "step": 3610 }, { "epoch": 0.565278647463995, "grad_norm": 1.5619347095489502, "learning_rate": 0.00010908967391304346, "loss": 0.536, "step": 3611 }, { "epoch": 0.5654351909830932, "grad_norm": 4.293629169464111, "learning_rate": 0.00010907778532608694, "loss": 0.9414, "step": 3612 }, { "epoch": 0.5655917345021916, "grad_norm": 0.8079026341438293, "learning_rate": 0.00010906589673913042, "loss": 0.2756, "step": 3613 }, { "epoch": 0.5657482780212899, "grad_norm": 1.6520304679870605, "learning_rate": 0.00010905400815217391, "loss": 0.5073, "step": 3614 }, { "epoch": 0.5659048215403882, "grad_norm": 0.9841470122337341, "learning_rate": 0.00010904211956521739, "loss": 0.3869, "step": 3615 }, { "epoch": 0.5660613650594866, 
"grad_norm": 1.5788778066635132, "learning_rate": 0.00010903023097826087, "loss": 0.5411, "step": 3616 }, { "epoch": 0.5662179085785849, "grad_norm": 1.9592663049697876, "learning_rate": 0.00010901834239130435, "loss": 0.6583, "step": 3617 }, { "epoch": 0.5663744520976831, "grad_norm": 2.097867727279663, "learning_rate": 0.00010900645380434783, "loss": 0.5364, "step": 3618 }, { "epoch": 0.5665309956167814, "grad_norm": 1.9247876405715942, "learning_rate": 0.0001089945652173913, "loss": 0.7009, "step": 3619 }, { "epoch": 0.5666875391358798, "grad_norm": 1.770000696182251, "learning_rate": 0.00010898267663043478, "loss": 0.6234, "step": 3620 }, { "epoch": 0.5668440826549781, "grad_norm": 2.4808125495910645, "learning_rate": 0.00010897078804347826, "loss": 0.7931, "step": 3621 }, { "epoch": 0.5670006261740764, "grad_norm": 1.6155678033828735, "learning_rate": 0.00010895889945652172, "loss": 0.6729, "step": 3622 }, { "epoch": 0.5671571696931748, "grad_norm": 1.658316969871521, "learning_rate": 0.0001089470108695652, "loss": 0.6872, "step": 3623 }, { "epoch": 0.567313713212273, "grad_norm": 2.195277214050293, "learning_rate": 0.00010893512228260868, "loss": 0.598, "step": 3624 }, { "epoch": 0.5674702567313713, "grad_norm": 1.5798367261886597, "learning_rate": 0.00010892323369565216, "loss": 0.5523, "step": 3625 }, { "epoch": 0.5676268002504696, "grad_norm": 4.624732494354248, "learning_rate": 0.00010891134510869564, "loss": 1.2515, "step": 3626 }, { "epoch": 0.567783343769568, "grad_norm": 1.6468130350112915, "learning_rate": 0.00010889945652173911, "loss": 0.6789, "step": 3627 }, { "epoch": 0.5679398872886663, "grad_norm": 2.5445430278778076, "learning_rate": 0.00010888756793478259, "loss": 1.0051, "step": 3628 }, { "epoch": 0.5680964308077645, "grad_norm": 2.788323402404785, "learning_rate": 0.00010887567934782607, "loss": 1.0156, "step": 3629 }, { "epoch": 0.5682529743268628, "grad_norm": 3.8453712463378906, "learning_rate": 0.00010886379076086955, "loss": 0.7843, 
"step": 3630 }, { "epoch": 0.5684095178459612, "grad_norm": 3.3953325748443604, "learning_rate": 0.00010885190217391303, "loss": 0.9266, "step": 3631 }, { "epoch": 0.5685660613650595, "grad_norm": 2.8403854370117188, "learning_rate": 0.0001088400135869565, "loss": 1.1566, "step": 3632 }, { "epoch": 0.5687226048841578, "grad_norm": 2.7144296169281006, "learning_rate": 0.00010882812499999998, "loss": 1.3101, "step": 3633 }, { "epoch": 0.5688791484032562, "grad_norm": 2.9398272037506104, "learning_rate": 0.00010881623641304348, "loss": 1.209, "step": 3634 }, { "epoch": 0.5690356919223544, "grad_norm": 3.2524032592773438, "learning_rate": 0.00010880434782608695, "loss": 1.0283, "step": 3635 }, { "epoch": 0.5691922354414527, "grad_norm": 3.768702745437622, "learning_rate": 0.00010879245923913043, "loss": 1.0781, "step": 3636 }, { "epoch": 0.569348778960551, "grad_norm": 1.5707626342773438, "learning_rate": 0.00010878057065217391, "loss": 0.5625, "step": 3637 }, { "epoch": 0.5695053224796494, "grad_norm": 2.6336238384246826, "learning_rate": 0.00010876868206521739, "loss": 1.2404, "step": 3638 }, { "epoch": 0.5696618659987477, "grad_norm": 4.211549758911133, "learning_rate": 0.00010875679347826087, "loss": 1.3364, "step": 3639 }, { "epoch": 0.569818409517846, "grad_norm": 3.9228575229644775, "learning_rate": 0.00010874490489130434, "loss": 2.1385, "step": 3640 }, { "epoch": 0.5699749530369442, "grad_norm": 2.937666893005371, "learning_rate": 0.00010873301630434782, "loss": 1.1924, "step": 3641 }, { "epoch": 0.5701314965560426, "grad_norm": 2.36543869972229, "learning_rate": 0.0001087211277173913, "loss": 1.0264, "step": 3642 }, { "epoch": 0.5702880400751409, "grad_norm": 3.189908742904663, "learning_rate": 0.00010870923913043478, "loss": 1.2215, "step": 3643 }, { "epoch": 0.5704445835942392, "grad_norm": 2.394692897796631, "learning_rate": 0.00010869735054347826, "loss": 1.2891, "step": 3644 }, { "epoch": 0.5706011271133375, "grad_norm": 1.5275219678878784, 
"learning_rate": 0.00010868546195652172, "loss": 0.4485, "step": 3645 }, { "epoch": 0.5707576706324358, "grad_norm": 4.146634101867676, "learning_rate": 0.0001086735733695652, "loss": 1.2015, "step": 3646 }, { "epoch": 0.5709142141515341, "grad_norm": 1.953955054283142, "learning_rate": 0.00010866168478260868, "loss": 0.7174, "step": 3647 }, { "epoch": 0.5710707576706324, "grad_norm": 3.5816566944122314, "learning_rate": 0.00010864979619565216, "loss": 0.433, "step": 3648 }, { "epoch": 0.5712273011897308, "grad_norm": 1.9393279552459717, "learning_rate": 0.00010863790760869563, "loss": 0.5027, "step": 3649 }, { "epoch": 0.5713838447088291, "grad_norm": 4.551398754119873, "learning_rate": 0.00010862601902173911, "loss": 1.1152, "step": 3650 }, { "epoch": 0.5715403882279274, "grad_norm": 0.7220586538314819, "learning_rate": 0.00010861413043478259, "loss": 0.2828, "step": 3651 }, { "epoch": 0.5716969317470256, "grad_norm": 0.7288694977760315, "learning_rate": 0.00010860224184782607, "loss": 0.3469, "step": 3652 }, { "epoch": 0.571853475266124, "grad_norm": 0.7031374573707581, "learning_rate": 0.00010859035326086955, "loss": 0.3159, "step": 3653 }, { "epoch": 0.5720100187852223, "grad_norm": 0.8593475222587585, "learning_rate": 0.00010857846467391304, "loss": 0.3409, "step": 3654 }, { "epoch": 0.5721665623043206, "grad_norm": 1.5594534873962402, "learning_rate": 0.00010856657608695652, "loss": 0.4163, "step": 3655 }, { "epoch": 0.5723231058234189, "grad_norm": 0.9399692416191101, "learning_rate": 0.0001085546875, "loss": 0.3206, "step": 3656 }, { "epoch": 0.5724796493425173, "grad_norm": 0.7405820488929749, "learning_rate": 0.00010854279891304347, "loss": 0.2109, "step": 3657 }, { "epoch": 0.5726361928616155, "grad_norm": 0.9408073425292969, "learning_rate": 0.00010853091032608695, "loss": 0.4161, "step": 3658 }, { "epoch": 0.5727927363807138, "grad_norm": 1.220417857170105, "learning_rate": 0.00010851902173913043, "loss": 0.5291, "step": 3659 }, { "epoch": 
0.5729492798998121, "grad_norm": 1.381637692451477, "learning_rate": 0.00010850713315217391, "loss": 0.5147, "step": 3660 }, { "epoch": 0.5731058234189105, "grad_norm": 1.155821681022644, "learning_rate": 0.00010849524456521739, "loss": 0.3345, "step": 3661 }, { "epoch": 0.5732623669380088, "grad_norm": 1.44887113571167, "learning_rate": 0.00010848335597826086, "loss": 0.3566, "step": 3662 }, { "epoch": 0.573418910457107, "grad_norm": 0.852017343044281, "learning_rate": 0.00010847146739130434, "loss": 0.3877, "step": 3663 }, { "epoch": 0.5735754539762054, "grad_norm": 2.776008367538452, "learning_rate": 0.00010845957880434782, "loss": 0.6294, "step": 3664 }, { "epoch": 0.5737319974953037, "grad_norm": 0.9705535769462585, "learning_rate": 0.0001084476902173913, "loss": 0.411, "step": 3665 }, { "epoch": 0.573888541014402, "grad_norm": 0.7825808525085449, "learning_rate": 0.00010843580163043478, "loss": 0.4569, "step": 3666 }, { "epoch": 0.5740450845335003, "grad_norm": 1.1461485624313354, "learning_rate": 0.00010842391304347825, "loss": 0.5506, "step": 3667 }, { "epoch": 0.5742016280525987, "grad_norm": 1.5732611417770386, "learning_rate": 0.00010841202445652172, "loss": 0.4467, "step": 3668 }, { "epoch": 0.5743581715716969, "grad_norm": 1.2339757680892944, "learning_rate": 0.0001084001358695652, "loss": 0.5033, "step": 3669 }, { "epoch": 0.5745147150907952, "grad_norm": 1.1127384901046753, "learning_rate": 0.00010838824728260868, "loss": 0.3982, "step": 3670 }, { "epoch": 0.5746712586098935, "grad_norm": 3.5493721961975098, "learning_rate": 0.00010837635869565215, "loss": 0.7902, "step": 3671 }, { "epoch": 0.5748278021289919, "grad_norm": 1.1872025728225708, "learning_rate": 0.00010836447010869563, "loss": 0.5674, "step": 3672 }, { "epoch": 0.5749843456480902, "grad_norm": 1.8014198541641235, "learning_rate": 0.00010835258152173911, "loss": 0.5945, "step": 3673 }, { "epoch": 0.5751408891671885, "grad_norm": 1.9140770435333252, "learning_rate": 0.0001083406929347826, 
"loss": 0.5095, "step": 3674 }, { "epoch": 0.5752974326862867, "grad_norm": 1.6112767457962036, "learning_rate": 0.00010832880434782608, "loss": 0.6082, "step": 3675 }, { "epoch": 0.5754539762053851, "grad_norm": 2.0819289684295654, "learning_rate": 0.00010831691576086956, "loss": 0.5159, "step": 3676 }, { "epoch": 0.5756105197244834, "grad_norm": 3.1906344890594482, "learning_rate": 0.00010830502717391304, "loss": 0.752, "step": 3677 }, { "epoch": 0.5757670632435817, "grad_norm": 4.227867126464844, "learning_rate": 0.00010829313858695651, "loss": 1.083, "step": 3678 }, { "epoch": 0.5759236067626801, "grad_norm": 3.1978940963745117, "learning_rate": 0.00010828124999999999, "loss": 0.6578, "step": 3679 }, { "epoch": 0.5760801502817783, "grad_norm": 5.178042888641357, "learning_rate": 0.00010826936141304347, "loss": 1.3822, "step": 3680 }, { "epoch": 0.5762366938008766, "grad_norm": 1.7489368915557861, "learning_rate": 0.00010825747282608695, "loss": 0.7331, "step": 3681 }, { "epoch": 0.5763932373199749, "grad_norm": 2.995007276535034, "learning_rate": 0.00010824558423913043, "loss": 0.8525, "step": 3682 }, { "epoch": 0.5765497808390733, "grad_norm": 2.223456382751465, "learning_rate": 0.0001082336956521739, "loss": 0.9293, "step": 3683 }, { "epoch": 0.5767063243581716, "grad_norm": 1.893291711807251, "learning_rate": 0.00010822180706521738, "loss": 0.8159, "step": 3684 }, { "epoch": 0.5768628678772699, "grad_norm": 2.223822593688965, "learning_rate": 0.00010820991847826086, "loss": 0.949, "step": 3685 }, { "epoch": 0.5770194113963681, "grad_norm": 4.9765801429748535, "learning_rate": 0.00010819802989130434, "loss": 1.2576, "step": 3686 }, { "epoch": 0.5771759549154665, "grad_norm": 2.3331453800201416, "learning_rate": 0.00010818614130434782, "loss": 0.9985, "step": 3687 }, { "epoch": 0.5773324984345648, "grad_norm": 2.3143470287323, "learning_rate": 0.0001081742527173913, "loss": 0.9169, "step": 3688 }, { "epoch": 0.5774890419536631, "grad_norm": 3.1698381900787354, 
"learning_rate": 0.00010816236413043477, "loss": 1.3438, "step": 3689 }, { "epoch": 0.5776455854727615, "grad_norm": 2.0402960777282715, "learning_rate": 0.00010815047554347825, "loss": 0.7631, "step": 3690 }, { "epoch": 0.5778021289918598, "grad_norm": 3.1873247623443604, "learning_rate": 0.00010813858695652172, "loss": 1.1257, "step": 3691 }, { "epoch": 0.577958672510958, "grad_norm": 4.544883728027344, "learning_rate": 0.0001081266983695652, "loss": 1.3166, "step": 3692 }, { "epoch": 0.5781152160300563, "grad_norm": 4.7779154777526855, "learning_rate": 0.00010811480978260869, "loss": 1.438, "step": 3693 }, { "epoch": 0.5782717595491547, "grad_norm": 1.781118631362915, "learning_rate": 0.00010810292119565216, "loss": 0.7582, "step": 3694 }, { "epoch": 0.578428303068253, "grad_norm": 5.493315696716309, "learning_rate": 0.00010809103260869564, "loss": 1.3104, "step": 3695 }, { "epoch": 0.5785848465873513, "grad_norm": 0.8669617772102356, "learning_rate": 0.00010807914402173912, "loss": 0.2638, "step": 3696 }, { "epoch": 0.5787413901064495, "grad_norm": 2.310742139816284, "learning_rate": 0.0001080672554347826, "loss": 0.5982, "step": 3697 }, { "epoch": 0.5788979336255479, "grad_norm": 2.520021677017212, "learning_rate": 0.00010805536684782608, "loss": 0.8094, "step": 3698 }, { "epoch": 0.5790544771446462, "grad_norm": 2.9768612384796143, "learning_rate": 0.00010804347826086956, "loss": 0.9273, "step": 3699 }, { "epoch": 0.5792110206637445, "grad_norm": 2.6576929092407227, "learning_rate": 0.00010803158967391303, "loss": 0.92, "step": 3700 }, { "epoch": 0.5793675641828429, "grad_norm": 0.5418853759765625, "learning_rate": 0.00010801970108695651, "loss": 0.3821, "step": 3701 }, { "epoch": 0.5795241077019412, "grad_norm": 0.7097650170326233, "learning_rate": 0.00010800781249999999, "loss": 0.3358, "step": 3702 }, { "epoch": 0.5796806512210394, "grad_norm": 0.7229509353637695, "learning_rate": 0.00010799592391304347, "loss": 0.5081, "step": 3703 }, { "epoch": 
0.5798371947401377, "grad_norm": 1.1250455379486084, "learning_rate": 0.00010798403532608695, "loss": 0.4607, "step": 3704 }, { "epoch": 0.5799937382592361, "grad_norm": 1.1280608177185059, "learning_rate": 0.00010797214673913042, "loss": 0.4369, "step": 3705 }, { "epoch": 0.5801502817783344, "grad_norm": 0.7761743068695068, "learning_rate": 0.0001079602581521739, "loss": 0.3202, "step": 3706 }, { "epoch": 0.5803068252974327, "grad_norm": 0.8888364434242249, "learning_rate": 0.00010794836956521738, "loss": 0.3762, "step": 3707 }, { "epoch": 0.580463368816531, "grad_norm": 0.9126970171928406, "learning_rate": 0.00010793648097826086, "loss": 0.3258, "step": 3708 }, { "epoch": 0.5806199123356293, "grad_norm": 0.6278502345085144, "learning_rate": 0.00010792459239130434, "loss": 0.2805, "step": 3709 }, { "epoch": 0.5807764558547276, "grad_norm": 1.5540169477462769, "learning_rate": 0.00010791270380434782, "loss": 0.3903, "step": 3710 }, { "epoch": 0.5809329993738259, "grad_norm": 0.9952980875968933, "learning_rate": 0.00010790081521739131, "loss": 0.4304, "step": 3711 }, { "epoch": 0.5810895428929242, "grad_norm": 1.2816461324691772, "learning_rate": 0.00010788892663043479, "loss": 0.4154, "step": 3712 }, { "epoch": 0.5812460864120226, "grad_norm": 0.910155177116394, "learning_rate": 0.00010787703804347826, "loss": 0.4076, "step": 3713 }, { "epoch": 0.5814026299311209, "grad_norm": 1.451486587524414, "learning_rate": 0.00010786514945652173, "loss": 0.6157, "step": 3714 }, { "epoch": 0.5815591734502191, "grad_norm": 1.6452242136001587, "learning_rate": 0.0001078532608695652, "loss": 0.677, "step": 3715 }, { "epoch": 0.5817157169693175, "grad_norm": 1.5034422874450684, "learning_rate": 0.00010784137228260868, "loss": 0.5175, "step": 3716 }, { "epoch": 0.5818722604884158, "grad_norm": 1.5957036018371582, "learning_rate": 0.00010782948369565216, "loss": 0.4731, "step": 3717 }, { "epoch": 0.5820288040075141, "grad_norm": 1.6214855909347534, "learning_rate": 
0.00010781759510869564, "loss": 0.5733, "step": 3718 }, { "epoch": 0.5821853475266124, "grad_norm": 2.339266538619995, "learning_rate": 0.00010780570652173912, "loss": 0.8531, "step": 3719 }, { "epoch": 0.5823418910457107, "grad_norm": 1.7470428943634033, "learning_rate": 0.0001077938179347826, "loss": 0.7054, "step": 3720 }, { "epoch": 0.582498434564809, "grad_norm": 2.766298770904541, "learning_rate": 0.00010778192934782608, "loss": 0.6577, "step": 3721 }, { "epoch": 0.5826549780839073, "grad_norm": 1.6953901052474976, "learning_rate": 0.00010777004076086955, "loss": 0.5075, "step": 3722 }, { "epoch": 0.5828115216030056, "grad_norm": 1.7674201726913452, "learning_rate": 0.00010775815217391303, "loss": 0.4882, "step": 3723 }, { "epoch": 0.582968065122104, "grad_norm": 2.929274320602417, "learning_rate": 0.00010774626358695651, "loss": 0.9027, "step": 3724 }, { "epoch": 0.5831246086412023, "grad_norm": 2.1252686977386475, "learning_rate": 0.00010773437499999999, "loss": 0.5739, "step": 3725 }, { "epoch": 0.5832811521603005, "grad_norm": 3.213656425476074, "learning_rate": 0.00010772248641304347, "loss": 1.0635, "step": 3726 }, { "epoch": 0.5834376956793988, "grad_norm": 1.6815288066864014, "learning_rate": 0.00010771059782608694, "loss": 0.5281, "step": 3727 }, { "epoch": 0.5835942391984972, "grad_norm": 2.4188082218170166, "learning_rate": 0.00010769870923913042, "loss": 0.4547, "step": 3728 }, { "epoch": 0.5837507827175955, "grad_norm": 1.5152485370635986, "learning_rate": 0.0001076868206521739, "loss": 0.7661, "step": 3729 }, { "epoch": 0.5839073262366938, "grad_norm": 2.067028045654297, "learning_rate": 0.00010767493206521738, "loss": 1.2385, "step": 3730 }, { "epoch": 0.5840638697557922, "grad_norm": 3.0686159133911133, "learning_rate": 0.00010766304347826087, "loss": 0.5263, "step": 3731 }, { "epoch": 0.5842204132748904, "grad_norm": 2.2678613662719727, "learning_rate": 0.00010765115489130435, "loss": 0.6137, "step": 3732 }, { "epoch": 0.5843769567939887, 
"grad_norm": 2.9555768966674805, "learning_rate": 0.00010763926630434783, "loss": 1.289, "step": 3733 }, { "epoch": 0.584533500313087, "grad_norm": 2.904304027557373, "learning_rate": 0.0001076273777173913, "loss": 0.7653, "step": 3734 }, { "epoch": 0.5846900438321854, "grad_norm": 2.8357343673706055, "learning_rate": 0.00010761548913043478, "loss": 1.1108, "step": 3735 }, { "epoch": 0.5848465873512837, "grad_norm": 1.5449702739715576, "learning_rate": 0.00010760360054347826, "loss": 0.8997, "step": 3736 }, { "epoch": 0.5850031308703819, "grad_norm": 2.875903844833374, "learning_rate": 0.00010759171195652173, "loss": 1.0057, "step": 3737 }, { "epoch": 0.5851596743894802, "grad_norm": 2.7486588954925537, "learning_rate": 0.0001075798233695652, "loss": 1.032, "step": 3738 }, { "epoch": 0.5853162179085786, "grad_norm": 2.4628145694732666, "learning_rate": 0.00010756793478260868, "loss": 0.8694, "step": 3739 }, { "epoch": 0.5854727614276769, "grad_norm": 4.921718120574951, "learning_rate": 0.00010755604619565216, "loss": 1.2979, "step": 3740 }, { "epoch": 0.5856293049467752, "grad_norm": 2.7755963802337646, "learning_rate": 0.00010754415760869564, "loss": 1.2, "step": 3741 }, { "epoch": 0.5857858484658736, "grad_norm": 3.8409717082977295, "learning_rate": 0.00010753226902173912, "loss": 1.0896, "step": 3742 }, { "epoch": 0.5859423919849718, "grad_norm": 2.368907928466797, "learning_rate": 0.0001075203804347826, "loss": 1.7539, "step": 3743 }, { "epoch": 0.5860989355040701, "grad_norm": 3.432253122329712, "learning_rate": 0.00010750849184782607, "loss": 0.8776, "step": 3744 }, { "epoch": 0.5862554790231684, "grad_norm": 2.1992316246032715, "learning_rate": 0.00010749660326086955, "loss": 1.1074, "step": 3745 }, { "epoch": 0.5864120225422668, "grad_norm": 2.2419140338897705, "learning_rate": 0.00010748471467391303, "loss": 1.5195, "step": 3746 }, { "epoch": 0.5865685660613651, "grad_norm": 1.8999665975570679, "learning_rate": 0.00010747282608695651, "loss": 0.6837, 
"step": 3747 }, { "epoch": 0.5867251095804634, "grad_norm": 5.416408538818359, "learning_rate": 0.00010746093749999999, "loss": 2.0091, "step": 3748 }, { "epoch": 0.5868816530995616, "grad_norm": 3.296325445175171, "learning_rate": 0.00010744904891304346, "loss": 0.9238, "step": 3749 }, { "epoch": 0.58703819661866, "grad_norm": 2.6537368297576904, "learning_rate": 0.00010743716032608694, "loss": 1.1161, "step": 3750 }, { "epoch": 0.5871947401377583, "grad_norm": 0.6871047019958496, "learning_rate": 0.00010742527173913043, "loss": 0.3557, "step": 3751 }, { "epoch": 0.5873512836568566, "grad_norm": 0.5269290804862976, "learning_rate": 0.00010741338315217391, "loss": 0.2562, "step": 3752 }, { "epoch": 0.587507827175955, "grad_norm": 2.1127371788024902, "learning_rate": 0.00010740149456521739, "loss": 1.2572, "step": 3753 }, { "epoch": 0.5876643706950532, "grad_norm": 0.892360508441925, "learning_rate": 0.00010738960597826087, "loss": 0.3173, "step": 3754 }, { "epoch": 0.5878209142141515, "grad_norm": 0.5568608641624451, "learning_rate": 0.00010737771739130435, "loss": 0.3307, "step": 3755 }, { "epoch": 0.5879774577332498, "grad_norm": 0.8082489371299744, "learning_rate": 0.00010736582880434782, "loss": 0.394, "step": 3756 }, { "epoch": 0.5881340012523482, "grad_norm": 0.9102075695991516, "learning_rate": 0.0001073539402173913, "loss": 0.4034, "step": 3757 }, { "epoch": 0.5882905447714465, "grad_norm": 1.1039905548095703, "learning_rate": 0.00010734205163043478, "loss": 0.3349, "step": 3758 }, { "epoch": 0.5884470882905448, "grad_norm": 0.7442502975463867, "learning_rate": 0.00010733016304347826, "loss": 0.3759, "step": 3759 }, { "epoch": 0.588603631809643, "grad_norm": 1.0571779012680054, "learning_rate": 0.00010731827445652172, "loss": 0.5457, "step": 3760 }, { "epoch": 0.5887601753287414, "grad_norm": 1.2267342805862427, "learning_rate": 0.0001073063858695652, "loss": 0.3824, "step": 3761 }, { "epoch": 0.5889167188478397, "grad_norm": 0.7323843240737915, 
"learning_rate": 0.00010729449728260868, "loss": 0.2642, "step": 3762 }, { "epoch": 0.589073262366938, "grad_norm": 0.8958896994590759, "learning_rate": 0.00010728260869565216, "loss": 0.2946, "step": 3763 }, { "epoch": 0.5892298058860364, "grad_norm": 1.2227075099945068, "learning_rate": 0.00010727072010869564, "loss": 0.3921, "step": 3764 }, { "epoch": 0.5893863494051347, "grad_norm": 1.3738597631454468, "learning_rate": 0.00010725883152173911, "loss": 0.3603, "step": 3765 }, { "epoch": 0.5895428929242329, "grad_norm": 1.2295888662338257, "learning_rate": 0.00010724694293478259, "loss": 0.4455, "step": 3766 }, { "epoch": 0.5896994364433312, "grad_norm": 1.0013219118118286, "learning_rate": 0.00010723505434782607, "loss": 0.4074, "step": 3767 }, { "epoch": 0.5898559799624296, "grad_norm": 1.8807568550109863, "learning_rate": 0.00010722316576086955, "loss": 0.6856, "step": 3768 }, { "epoch": 0.5900125234815279, "grad_norm": 1.3776133060455322, "learning_rate": 0.00010721127717391303, "loss": 0.2584, "step": 3769 }, { "epoch": 0.5901690670006262, "grad_norm": 1.280185341835022, "learning_rate": 0.00010719938858695652, "loss": 0.5818, "step": 3770 }, { "epoch": 0.5903256105197244, "grad_norm": 1.2145872116088867, "learning_rate": 0.0001071875, "loss": 0.5588, "step": 3771 }, { "epoch": 0.5904821540388228, "grad_norm": 3.9799046516418457, "learning_rate": 0.00010717561141304347, "loss": 1.0864, "step": 3772 }, { "epoch": 0.5906386975579211, "grad_norm": 1.480193853378296, "learning_rate": 0.00010716372282608695, "loss": 0.6933, "step": 3773 }, { "epoch": 0.5907952410770194, "grad_norm": 3.3650622367858887, "learning_rate": 0.00010715183423913043, "loss": 0.6418, "step": 3774 }, { "epoch": 0.5909517845961177, "grad_norm": 1.8126425743103027, "learning_rate": 0.00010713994565217391, "loss": 0.8918, "step": 3775 }, { "epoch": 0.5911083281152161, "grad_norm": 1.136564016342163, "learning_rate": 0.00010712805706521739, "loss": 0.5393, "step": 3776 }, { "epoch": 
0.5912648716343143, "grad_norm": 1.7965614795684814, "learning_rate": 0.00010711616847826087, "loss": 0.3963, "step": 3777 }, { "epoch": 0.5914214151534126, "grad_norm": 3.52066969871521, "learning_rate": 0.00010710427989130434, "loss": 0.9325, "step": 3778 }, { "epoch": 0.591577958672511, "grad_norm": 3.205779552459717, "learning_rate": 0.00010709239130434782, "loss": 0.7262, "step": 3779 }, { "epoch": 0.5917345021916093, "grad_norm": 2.849879741668701, "learning_rate": 0.0001070805027173913, "loss": 0.5877, "step": 3780 }, { "epoch": 0.5918910457107076, "grad_norm": 1.7741955518722534, "learning_rate": 0.00010706861413043478, "loss": 0.4834, "step": 3781 }, { "epoch": 0.5920475892298059, "grad_norm": 2.245349407196045, "learning_rate": 0.00010705672554347826, "loss": 0.6211, "step": 3782 }, { "epoch": 0.5922041327489042, "grad_norm": 2.5652883052825928, "learning_rate": 0.00010704483695652172, "loss": 0.8064, "step": 3783 }, { "epoch": 0.5923606762680025, "grad_norm": 4.257562637329102, "learning_rate": 0.0001070329483695652, "loss": 1.1249, "step": 3784 }, { "epoch": 0.5925172197871008, "grad_norm": 2.8859758377075195, "learning_rate": 0.00010702105978260868, "loss": 0.9779, "step": 3785 }, { "epoch": 0.5926737633061991, "grad_norm": 2.640791893005371, "learning_rate": 0.00010700917119565216, "loss": 0.9805, "step": 3786 }, { "epoch": 0.5928303068252975, "grad_norm": 2.9654104709625244, "learning_rate": 0.00010699728260869563, "loss": 0.7767, "step": 3787 }, { "epoch": 0.5929868503443957, "grad_norm": 2.4620118141174316, "learning_rate": 0.00010698539402173911, "loss": 1.1104, "step": 3788 }, { "epoch": 0.593143393863494, "grad_norm": 2.2617673873901367, "learning_rate": 0.00010697350543478259, "loss": 0.9304, "step": 3789 }, { "epoch": 0.5932999373825923, "grad_norm": 2.7657861709594727, "learning_rate": 0.00010696161684782608, "loss": 1.6148, "step": 3790 }, { "epoch": 0.5934564809016907, "grad_norm": 4.850158214569092, "learning_rate": 0.00010694972826086956, 
"loss": 1.6436, "step": 3791 }, { "epoch": 0.593613024420789, "grad_norm": 3.168074607849121, "learning_rate": 0.00010693783967391304, "loss": 1.2643, "step": 3792 }, { "epoch": 0.5937695679398873, "grad_norm": 1.495097041130066, "learning_rate": 0.00010692595108695652, "loss": 1.0498, "step": 3793 }, { "epoch": 0.5939261114589856, "grad_norm": 2.7815985679626465, "learning_rate": 0.0001069140625, "loss": 1.7415, "step": 3794 }, { "epoch": 0.5940826549780839, "grad_norm": 2.3889918327331543, "learning_rate": 0.00010690217391304347, "loss": 1.194, "step": 3795 }, { "epoch": 0.5942391984971822, "grad_norm": 1.5526939630508423, "learning_rate": 0.00010689028532608695, "loss": 0.5942, "step": 3796 }, { "epoch": 0.5943957420162805, "grad_norm": 1.6814937591552734, "learning_rate": 0.00010687839673913043, "loss": 0.7073, "step": 3797 }, { "epoch": 0.5945522855353789, "grad_norm": 1.502492070198059, "learning_rate": 0.0001068665081521739, "loss": 0.7115, "step": 3798 }, { "epoch": 0.5947088290544772, "grad_norm": 1.9815309047698975, "learning_rate": 0.00010685461956521738, "loss": 0.9751, "step": 3799 }, { "epoch": 0.5948653725735754, "grad_norm": 2.1998322010040283, "learning_rate": 0.00010684273097826086, "loss": 1.1981, "step": 3800 }, { "epoch": 0.5950219160926737, "grad_norm": 0.8829543590545654, "learning_rate": 0.00010683084239130434, "loss": 0.3782, "step": 3801 }, { "epoch": 0.5951784596117721, "grad_norm": 1.3998905420303345, "learning_rate": 0.00010681895380434782, "loss": 0.3113, "step": 3802 }, { "epoch": 0.5953350031308704, "grad_norm": 0.9637031555175781, "learning_rate": 0.0001068070652173913, "loss": 0.3017, "step": 3803 }, { "epoch": 0.5954915466499687, "grad_norm": 1.5433114767074585, "learning_rate": 0.00010679517663043478, "loss": 0.2993, "step": 3804 }, { "epoch": 0.595648090169067, "grad_norm": 0.8290585875511169, "learning_rate": 0.00010678328804347825, "loss": 0.2978, "step": 3805 }, { "epoch": 0.5958046336881653, "grad_norm": 1.8299587965011597, 
"learning_rate": 0.00010677139945652172, "loss": 0.2658, "step": 3806 }, { "epoch": 0.5959611772072636, "grad_norm": 0.8946764469146729, "learning_rate": 0.0001067595108695652, "loss": 0.346, "step": 3807 }, { "epoch": 0.5961177207263619, "grad_norm": 0.8937351703643799, "learning_rate": 0.00010674762228260867, "loss": 0.4673, "step": 3808 }, { "epoch": 0.5962742642454603, "grad_norm": 1.3481258153915405, "learning_rate": 0.00010673573369565215, "loss": 0.3943, "step": 3809 }, { "epoch": 0.5964308077645586, "grad_norm": 1.750969409942627, "learning_rate": 0.00010672384510869564, "loss": 0.6494, "step": 3810 }, { "epoch": 0.5965873512836568, "grad_norm": 1.9181060791015625, "learning_rate": 0.00010671195652173912, "loss": 0.4218, "step": 3811 }, { "epoch": 0.5967438948027551, "grad_norm": 1.3101545572280884, "learning_rate": 0.0001067000679347826, "loss": 0.3621, "step": 3812 }, { "epoch": 0.5969004383218535, "grad_norm": 1.3157144784927368, "learning_rate": 0.00010668817934782608, "loss": 0.547, "step": 3813 }, { "epoch": 0.5970569818409518, "grad_norm": 1.2218587398529053, "learning_rate": 0.00010667629076086956, "loss": 0.3869, "step": 3814 }, { "epoch": 0.5972135253600501, "grad_norm": 0.9501075744628906, "learning_rate": 0.00010666440217391304, "loss": 0.3601, "step": 3815 }, { "epoch": 0.5973700688791485, "grad_norm": 0.8756763935089111, "learning_rate": 0.00010665251358695651, "loss": 0.2368, "step": 3816 }, { "epoch": 0.5975266123982467, "grad_norm": 0.8717665076255798, "learning_rate": 0.00010664062499999999, "loss": 0.4869, "step": 3817 }, { "epoch": 0.597683155917345, "grad_norm": 2.0403175354003906, "learning_rate": 0.00010662873641304347, "loss": 0.8816, "step": 3818 }, { "epoch": 0.5978396994364433, "grad_norm": 2.151135206222534, "learning_rate": 0.00010661684782608695, "loss": 0.5523, "step": 3819 }, { "epoch": 0.5979962429555417, "grad_norm": 0.8784058094024658, "learning_rate": 0.00010660495923913043, "loss": 0.31, "step": 3820 }, { "epoch": 
0.59815278647464, "grad_norm": 2.1384453773498535, "learning_rate": 0.0001065930706521739, "loss": 0.7764, "step": 3821 }, { "epoch": 0.5983093299937383, "grad_norm": 1.5402135848999023, "learning_rate": 0.00010658118206521738, "loss": 0.5792, "step": 3822 }, { "epoch": 0.5984658735128365, "grad_norm": 2.752272129058838, "learning_rate": 0.00010656929347826086, "loss": 0.8061, "step": 3823 }, { "epoch": 0.5986224170319349, "grad_norm": 4.682284355163574, "learning_rate": 0.00010655740489130434, "loss": 1.103, "step": 3824 }, { "epoch": 0.5987789605510332, "grad_norm": 3.4299683570861816, "learning_rate": 0.00010654551630434782, "loss": 0.8155, "step": 3825 }, { "epoch": 0.5989355040701315, "grad_norm": 2.2212400436401367, "learning_rate": 0.0001065336277173913, "loss": 0.9116, "step": 3826 }, { "epoch": 0.5990920475892298, "grad_norm": 1.9403469562530518, "learning_rate": 0.00010652173913043477, "loss": 0.6411, "step": 3827 }, { "epoch": 0.5992485911083281, "grad_norm": 4.767629146575928, "learning_rate": 0.00010650985054347826, "loss": 0.9087, "step": 3828 }, { "epoch": 0.5994051346274264, "grad_norm": 2.0158355236053467, "learning_rate": 0.00010649796195652172, "loss": 0.62, "step": 3829 }, { "epoch": 0.5995616781465247, "grad_norm": 3.2873449325561523, "learning_rate": 0.00010648607336956521, "loss": 0.7884, "step": 3830 }, { "epoch": 0.599718221665623, "grad_norm": 2.1717588901519775, "learning_rate": 0.00010647418478260869, "loss": 0.6681, "step": 3831 }, { "epoch": 0.5998747651847214, "grad_norm": 5.286881923675537, "learning_rate": 0.00010646229619565216, "loss": 0.7463, "step": 3832 }, { "epoch": 0.6000313087038197, "grad_norm": 2.3036048412323, "learning_rate": 0.00010645040760869564, "loss": 1.0214, "step": 3833 }, { "epoch": 0.6001878522229179, "grad_norm": 2.9481945037841797, "learning_rate": 0.00010643851902173912, "loss": 0.7508, "step": 3834 }, { "epoch": 0.6003443957420163, "grad_norm": 2.809746742248535, "learning_rate": 0.0001064266304347826, 
"loss": 1.1095, "step": 3835 }, { "epoch": 0.6005009392611146, "grad_norm": 2.200028419494629, "learning_rate": 0.00010641474184782608, "loss": 1.1068, "step": 3836 }, { "epoch": 0.6006574827802129, "grad_norm": 2.8553411960601807, "learning_rate": 0.00010640285326086955, "loss": 0.4281, "step": 3837 }, { "epoch": 0.6008140262993112, "grad_norm": 3.141061544418335, "learning_rate": 0.00010639096467391303, "loss": 1.2379, "step": 3838 }, { "epoch": 0.6009705698184096, "grad_norm": 3.0599584579467773, "learning_rate": 0.00010637907608695651, "loss": 1.0534, "step": 3839 }, { "epoch": 0.6011271133375078, "grad_norm": 3.4885733127593994, "learning_rate": 0.00010636718749999999, "loss": 1.1075, "step": 3840 }, { "epoch": 0.6012836568566061, "grad_norm": 2.621548891067505, "learning_rate": 0.00010635529891304347, "loss": 1.2646, "step": 3841 }, { "epoch": 0.6014402003757044, "grad_norm": 8.210897445678711, "learning_rate": 0.00010634341032608695, "loss": 1.831, "step": 3842 }, { "epoch": 0.6015967438948028, "grad_norm": 4.982256889343262, "learning_rate": 0.00010633152173913042, "loss": 1.6537, "step": 3843 }, { "epoch": 0.6017532874139011, "grad_norm": 3.4506077766418457, "learning_rate": 0.0001063196331521739, "loss": 1.111, "step": 3844 }, { "epoch": 0.6019098309329993, "grad_norm": 3.34985613822937, "learning_rate": 0.00010630774456521738, "loss": 1.0665, "step": 3845 }, { "epoch": 0.6020663744520977, "grad_norm": 4.336419582366943, "learning_rate": 0.00010629585597826086, "loss": 1.5539, "step": 3846 }, { "epoch": 0.602222917971196, "grad_norm": 1.158301591873169, "learning_rate": 0.00010628396739130435, "loss": 0.7818, "step": 3847 }, { "epoch": 0.6023794614902943, "grad_norm": 2.6690986156463623, "learning_rate": 0.00010627207880434783, "loss": 1.0101, "step": 3848 }, { "epoch": 0.6025360050093926, "grad_norm": 2.3023133277893066, "learning_rate": 0.0001062601902173913, "loss": 0.9477, "step": 3849 }, { "epoch": 0.602692548528491, "grad_norm": 3.9358882904052734, 
"learning_rate": 0.00010624830163043478, "loss": 0.7915, "step": 3850 }, { "epoch": 0.6028490920475892, "grad_norm": 0.7566927671432495, "learning_rate": 0.00010623641304347826, "loss": 0.3354, "step": 3851 }, { "epoch": 0.6030056355666875, "grad_norm": 0.47428858280181885, "learning_rate": 0.00010622452445652173, "loss": 0.2749, "step": 3852 }, { "epoch": 0.6031621790857858, "grad_norm": 0.645195722579956, "learning_rate": 0.0001062126358695652, "loss": 0.389, "step": 3853 }, { "epoch": 0.6033187226048842, "grad_norm": 0.630821704864502, "learning_rate": 0.00010620074728260868, "loss": 0.2387, "step": 3854 }, { "epoch": 0.6034752661239825, "grad_norm": 0.371026873588562, "learning_rate": 0.00010618885869565216, "loss": 0.1623, "step": 3855 }, { "epoch": 0.6036318096430808, "grad_norm": 0.6148006916046143, "learning_rate": 0.00010617697010869564, "loss": 0.3395, "step": 3856 }, { "epoch": 0.603788353162179, "grad_norm": 0.5248280167579651, "learning_rate": 0.00010616508152173912, "loss": 0.2295, "step": 3857 }, { "epoch": 0.6039448966812774, "grad_norm": 1.0640558004379272, "learning_rate": 0.0001061531929347826, "loss": 0.3412, "step": 3858 }, { "epoch": 0.6041014402003757, "grad_norm": 0.659545361995697, "learning_rate": 0.00010614130434782607, "loss": 0.3187, "step": 3859 }, { "epoch": 0.604257983719474, "grad_norm": 0.8242048025131226, "learning_rate": 0.00010612941576086955, "loss": 0.2822, "step": 3860 }, { "epoch": 0.6044145272385724, "grad_norm": 1.6869125366210938, "learning_rate": 0.00010611752717391303, "loss": 0.5745, "step": 3861 }, { "epoch": 0.6045710707576706, "grad_norm": 1.117938756942749, "learning_rate": 0.00010610563858695651, "loss": 0.3776, "step": 3862 }, { "epoch": 0.6047276142767689, "grad_norm": 0.6582819223403931, "learning_rate": 0.00010609374999999999, "loss": 0.2642, "step": 3863 }, { "epoch": 0.6048841577958672, "grad_norm": 1.0945160388946533, "learning_rate": 0.00010608186141304347, "loss": 0.3203, "step": 3864 }, { "epoch": 
0.6050407013149656, "grad_norm": 1.270145297050476, "learning_rate": 0.00010606997282608694, "loss": 0.5072, "step": 3865 }, { "epoch": 0.6051972448340639, "grad_norm": 1.841850757598877, "learning_rate": 0.00010605808423913042, "loss": 0.6779, "step": 3866 }, { "epoch": 0.6053537883531622, "grad_norm": 0.8727262020111084, "learning_rate": 0.00010604619565217391, "loss": 0.503, "step": 3867 }, { "epoch": 0.6055103318722604, "grad_norm": 2.208461284637451, "learning_rate": 0.00010603430706521739, "loss": 0.554, "step": 3868 }, { "epoch": 0.6056668753913588, "grad_norm": 2.4995923042297363, "learning_rate": 0.00010602241847826087, "loss": 0.632, "step": 3869 }, { "epoch": 0.6058234189104571, "grad_norm": 1.2315558195114136, "learning_rate": 0.00010601052989130435, "loss": 0.4267, "step": 3870 }, { "epoch": 0.6059799624295554, "grad_norm": 1.8663618564605713, "learning_rate": 0.00010599864130434783, "loss": 0.5738, "step": 3871 }, { "epoch": 0.6061365059486538, "grad_norm": 1.6554889678955078, "learning_rate": 0.0001059867527173913, "loss": 0.5435, "step": 3872 }, { "epoch": 0.6062930494677521, "grad_norm": 1.801230788230896, "learning_rate": 0.00010597486413043478, "loss": 0.6486, "step": 3873 }, { "epoch": 0.6064495929868503, "grad_norm": 1.6065112352371216, "learning_rate": 0.00010596297554347826, "loss": 0.821, "step": 3874 }, { "epoch": 0.6066061365059486, "grad_norm": 2.7553110122680664, "learning_rate": 0.00010595108695652172, "loss": 0.8125, "step": 3875 }, { "epoch": 0.606762680025047, "grad_norm": 1.2869905233383179, "learning_rate": 0.0001059391983695652, "loss": 0.547, "step": 3876 }, { "epoch": 0.6069192235441453, "grad_norm": 1.73623788356781, "learning_rate": 0.00010592730978260868, "loss": 0.8936, "step": 3877 }, { "epoch": 0.6070757670632436, "grad_norm": 1.8490263223648071, "learning_rate": 0.00010591542119565216, "loss": 0.9476, "step": 3878 }, { "epoch": 0.6072323105823418, "grad_norm": 1.0288007259368896, "learning_rate": 0.00010590353260869564, 
"loss": 0.4792, "step": 3879 }, { "epoch": 0.6073888541014402, "grad_norm": 2.442324161529541, "learning_rate": 0.00010589164402173912, "loss": 0.6637, "step": 3880 }, { "epoch": 0.6075453976205385, "grad_norm": 1.1494264602661133, "learning_rate": 0.0001058797554347826, "loss": 0.2721, "step": 3881 }, { "epoch": 0.6077019411396368, "grad_norm": 1.5502212047576904, "learning_rate": 0.00010586786684782607, "loss": 0.4818, "step": 3882 }, { "epoch": 0.6078584846587352, "grad_norm": 2.272243022918701, "learning_rate": 0.00010585597826086955, "loss": 0.7413, "step": 3883 }, { "epoch": 0.6080150281778335, "grad_norm": 4.173930644989014, "learning_rate": 0.00010584408967391303, "loss": 1.1889, "step": 3884 }, { "epoch": 0.6081715716969317, "grad_norm": 1.6223623752593994, "learning_rate": 0.0001058322010869565, "loss": 0.6516, "step": 3885 }, { "epoch": 0.60832811521603, "grad_norm": 2.918154716491699, "learning_rate": 0.00010582031249999998, "loss": 1.4172, "step": 3886 }, { "epoch": 0.6084846587351284, "grad_norm": 2.1456072330474854, "learning_rate": 0.00010580842391304348, "loss": 0.5239, "step": 3887 }, { "epoch": 0.6086412022542267, "grad_norm": 2.7290213108062744, "learning_rate": 0.00010579653532608695, "loss": 1.1336, "step": 3888 }, { "epoch": 0.608797745773325, "grad_norm": 3.9845528602600098, "learning_rate": 0.00010578464673913043, "loss": 1.4301, "step": 3889 }, { "epoch": 0.6089542892924233, "grad_norm": 6.077491760253906, "learning_rate": 0.00010577275815217391, "loss": 0.885, "step": 3890 }, { "epoch": 0.6091108328115216, "grad_norm": 4.090402126312256, "learning_rate": 0.00010576086956521739, "loss": 1.5473, "step": 3891 }, { "epoch": 0.6092673763306199, "grad_norm": 1.6476160287857056, "learning_rate": 0.00010574898097826087, "loss": 1.1867, "step": 3892 }, { "epoch": 0.6094239198497182, "grad_norm": 3.966313362121582, "learning_rate": 0.00010573709239130435, "loss": 1.2551, "step": 3893 }, { "epoch": 0.6095804633688165, "grad_norm": 
2.4396960735321045, "learning_rate": 0.00010572520380434782, "loss": 1.1643, "step": 3894 }, { "epoch": 0.6097370068879149, "grad_norm": 3.8909707069396973, "learning_rate": 0.0001057133152173913, "loss": 0.853, "step": 3895 }, { "epoch": 0.6098935504070131, "grad_norm": 4.06716251373291, "learning_rate": 0.00010570142663043478, "loss": 0.5497, "step": 3896 }, { "epoch": 0.6100500939261114, "grad_norm": 3.097525119781494, "learning_rate": 0.00010568953804347826, "loss": 1.1687, "step": 3897 }, { "epoch": 0.6102066374452098, "grad_norm": 2.0527987480163574, "learning_rate": 0.00010567764945652172, "loss": 0.7541, "step": 3898 }, { "epoch": 0.6103631809643081, "grad_norm": 3.919420003890991, "learning_rate": 0.0001056657608695652, "loss": 0.5798, "step": 3899 }, { "epoch": 0.6105197244834064, "grad_norm": 2.3917932510375977, "learning_rate": 0.00010565387228260868, "loss": 1.0594, "step": 3900 }, { "epoch": 0.6106762680025047, "grad_norm": 0.6328274011611938, "learning_rate": 0.00010564198369565216, "loss": 0.3398, "step": 3901 }, { "epoch": 0.610832811521603, "grad_norm": 0.5632893443107605, "learning_rate": 0.00010563009510869563, "loss": 0.2368, "step": 3902 }, { "epoch": 0.6109893550407013, "grad_norm": 0.914347767829895, "learning_rate": 0.00010561820652173911, "loss": 0.3416, "step": 3903 }, { "epoch": 0.6111458985597996, "grad_norm": 0.6815041899681091, "learning_rate": 0.00010560631793478259, "loss": 0.4527, "step": 3904 }, { "epoch": 0.611302442078898, "grad_norm": 0.581895112991333, "learning_rate": 0.00010559442934782607, "loss": 0.2546, "step": 3905 }, { "epoch": 0.6114589855979963, "grad_norm": 0.6965485215187073, "learning_rate": 0.00010558254076086955, "loss": 0.2914, "step": 3906 }, { "epoch": 0.6116155291170946, "grad_norm": 0.8494430184364319, "learning_rate": 0.00010557065217391304, "loss": 0.4059, "step": 3907 }, { "epoch": 0.6117720726361928, "grad_norm": 1.559639811515808, "learning_rate": 0.00010555876358695652, "loss": 0.5215, "step": 3908 }, 
{ "epoch": 0.6119286161552911, "grad_norm": 1.102810025215149, "learning_rate": 0.000105546875, "loss": 0.4804, "step": 3909 }, { "epoch": 0.6120851596743895, "grad_norm": 0.7031951546669006, "learning_rate": 0.00010553498641304347, "loss": 0.3031, "step": 3910 }, { "epoch": 0.6122417031934878, "grad_norm": 0.8192260265350342, "learning_rate": 0.00010552309782608695, "loss": 0.23, "step": 3911 }, { "epoch": 0.6123982467125861, "grad_norm": 1.706674337387085, "learning_rate": 0.00010551120923913043, "loss": 0.5049, "step": 3912 }, { "epoch": 0.6125547902316844, "grad_norm": 1.1605480909347534, "learning_rate": 0.00010549932065217391, "loss": 0.4168, "step": 3913 }, { "epoch": 0.6127113337507827, "grad_norm": 1.0519968271255493, "learning_rate": 0.00010548743206521739, "loss": 0.4904, "step": 3914 }, { "epoch": 0.612867877269881, "grad_norm": 1.3376681804656982, "learning_rate": 0.00010547554347826086, "loss": 0.3182, "step": 3915 }, { "epoch": 0.6130244207889793, "grad_norm": 1.6382347345352173, "learning_rate": 0.00010546365489130434, "loss": 0.3299, "step": 3916 }, { "epoch": 0.6131809643080777, "grad_norm": 2.0815157890319824, "learning_rate": 0.00010545176630434782, "loss": 0.9523, "step": 3917 }, { "epoch": 0.613337507827176, "grad_norm": 1.655992865562439, "learning_rate": 0.0001054398777173913, "loss": 0.4767, "step": 3918 }, { "epoch": 0.6134940513462742, "grad_norm": 1.5186612606048584, "learning_rate": 0.00010542798913043478, "loss": 0.4949, "step": 3919 }, { "epoch": 0.6136505948653725, "grad_norm": 3.3689160346984863, "learning_rate": 0.00010541610054347826, "loss": 0.8697, "step": 3920 }, { "epoch": 0.6138071383844709, "grad_norm": 0.8339585065841675, "learning_rate": 0.00010540421195652172, "loss": 0.3485, "step": 3921 }, { "epoch": 0.6139636819035692, "grad_norm": 1.9409875869750977, "learning_rate": 0.0001053923233695652, "loss": 0.4836, "step": 3922 }, { "epoch": 0.6141202254226675, "grad_norm": 1.7795597314834595, "learning_rate": 
0.00010538043478260868, "loss": 0.4575, "step": 3923 }, { "epoch": 0.6142767689417659, "grad_norm": 2.4119184017181396, "learning_rate": 0.00010536854619565215, "loss": 0.6472, "step": 3924 }, { "epoch": 0.6144333124608641, "grad_norm": 4.758424758911133, "learning_rate": 0.00010535665760869563, "loss": 0.9763, "step": 3925 }, { "epoch": 0.6145898559799624, "grad_norm": 2.1010801792144775, "learning_rate": 0.00010534476902173911, "loss": 0.5971, "step": 3926 }, { "epoch": 0.6147463994990607, "grad_norm": 3.1299712657928467, "learning_rate": 0.0001053328804347826, "loss": 0.6698, "step": 3927 }, { "epoch": 0.6149029430181591, "grad_norm": 2.040713310241699, "learning_rate": 0.00010532099184782608, "loss": 0.9354, "step": 3928 }, { "epoch": 0.6150594865372574, "grad_norm": 1.7890876531600952, "learning_rate": 0.00010530910326086956, "loss": 0.6292, "step": 3929 }, { "epoch": 0.6152160300563556, "grad_norm": 1.5200982093811035, "learning_rate": 0.00010529721467391304, "loss": 0.4425, "step": 3930 }, { "epoch": 0.6153725735754539, "grad_norm": 2.0981948375701904, "learning_rate": 0.00010528532608695652, "loss": 0.8111, "step": 3931 }, { "epoch": 0.6155291170945523, "grad_norm": 2.281752586364746, "learning_rate": 0.00010527343749999999, "loss": 0.8494, "step": 3932 }, { "epoch": 0.6156856606136506, "grad_norm": 2.8116817474365234, "learning_rate": 0.00010526154891304347, "loss": 0.814, "step": 3933 }, { "epoch": 0.6158422041327489, "grad_norm": 3.144151210784912, "learning_rate": 0.00010524966032608695, "loss": 0.8957, "step": 3934 }, { "epoch": 0.6159987476518473, "grad_norm": 3.352497100830078, "learning_rate": 0.00010523777173913043, "loss": 0.9598, "step": 3935 }, { "epoch": 0.6161552911709455, "grad_norm": 3.363102436065674, "learning_rate": 0.0001052258831521739, "loss": 0.9139, "step": 3936 }, { "epoch": 0.6163118346900438, "grad_norm": 4.275569438934326, "learning_rate": 0.00010521399456521738, "loss": 1.811, "step": 3937 }, { "epoch": 0.6164683782091421, 
"grad_norm": 2.7663331031799316, "learning_rate": 0.00010520210597826086, "loss": 0.9835, "step": 3938 }, { "epoch": 0.6166249217282405, "grad_norm": 1.9852709770202637, "learning_rate": 0.00010519021739130434, "loss": 0.8789, "step": 3939 }, { "epoch": 0.6167814652473388, "grad_norm": 3.5936591625213623, "learning_rate": 0.00010517832880434782, "loss": 1.2557, "step": 3940 }, { "epoch": 0.6169380087664371, "grad_norm": 3.899261474609375, "learning_rate": 0.0001051664402173913, "loss": 1.5786, "step": 3941 }, { "epoch": 0.6170945522855353, "grad_norm": 3.7433266639709473, "learning_rate": 0.00010515455163043477, "loss": 1.4756, "step": 3942 }, { "epoch": 0.6172510958046337, "grad_norm": 5.790081977844238, "learning_rate": 0.00010514266304347825, "loss": 1.265, "step": 3943 }, { "epoch": 0.617407639323732, "grad_norm": 2.4613213539123535, "learning_rate": 0.00010513077445652172, "loss": 1.8606, "step": 3944 }, { "epoch": 0.6175641828428303, "grad_norm": 2.3583734035491943, "learning_rate": 0.0001051188858695652, "loss": 1.2573, "step": 3945 }, { "epoch": 0.6177207263619287, "grad_norm": 2.1380624771118164, "learning_rate": 0.00010510699728260867, "loss": 1.0696, "step": 3946 }, { "epoch": 0.617877269881027, "grad_norm": 1.7173173427581787, "learning_rate": 0.00010509510869565217, "loss": 0.8284, "step": 3947 }, { "epoch": 0.6180338134001252, "grad_norm": 4.817514896392822, "learning_rate": 0.00010508322010869564, "loss": 0.7141, "step": 3948 }, { "epoch": 0.6181903569192235, "grad_norm": 2.796125888824463, "learning_rate": 0.00010507133152173912, "loss": 1.0275, "step": 3949 }, { "epoch": 0.6183469004383219, "grad_norm": 1.977777361869812, "learning_rate": 0.0001050594429347826, "loss": 0.8487, "step": 3950 }, { "epoch": 0.6185034439574202, "grad_norm": 0.43347790837287903, "learning_rate": 0.00010504755434782608, "loss": 0.2617, "step": 3951 }, { "epoch": 0.6186599874765185, "grad_norm": 0.4446995258331299, "learning_rate": 0.00010503566576086956, "loss": 0.2957, 
"step": 3952 }, { "epoch": 0.6188165309956167, "grad_norm": 1.0638105869293213, "learning_rate": 0.00010502377717391303, "loss": 0.3573, "step": 3953 }, { "epoch": 0.6189730745147151, "grad_norm": 0.5864948034286499, "learning_rate": 0.00010501188858695651, "loss": 0.3318, "step": 3954 }, { "epoch": 0.6191296180338134, "grad_norm": 0.9187406897544861, "learning_rate": 0.00010499999999999999, "loss": 0.3174, "step": 3955 }, { "epoch": 0.6192861615529117, "grad_norm": 0.7227632403373718, "learning_rate": 0.00010498811141304347, "loss": 0.4379, "step": 3956 }, { "epoch": 0.61944270507201, "grad_norm": 0.6838106513023376, "learning_rate": 0.00010497622282608695, "loss": 0.3754, "step": 3957 }, { "epoch": 0.6195992485911084, "grad_norm": 0.8625887632369995, "learning_rate": 0.00010496433423913043, "loss": 0.3879, "step": 3958 }, { "epoch": 0.6197557921102066, "grad_norm": 0.9823177456855774, "learning_rate": 0.0001049524456521739, "loss": 0.3999, "step": 3959 }, { "epoch": 0.6199123356293049, "grad_norm": 0.7134853601455688, "learning_rate": 0.00010494055706521738, "loss": 0.2866, "step": 3960 }, { "epoch": 0.6200688791484033, "grad_norm": 0.9020194411277771, "learning_rate": 0.00010492866847826086, "loss": 0.2529, "step": 3961 }, { "epoch": 0.6202254226675016, "grad_norm": 0.6266462206840515, "learning_rate": 0.00010491677989130434, "loss": 0.2559, "step": 3962 }, { "epoch": 0.6203819661865999, "grad_norm": 0.7530433535575867, "learning_rate": 0.00010490489130434782, "loss": 0.3096, "step": 3963 }, { "epoch": 0.6205385097056982, "grad_norm": 1.410359263420105, "learning_rate": 0.00010489300271739131, "loss": 0.4652, "step": 3964 }, { "epoch": 0.6206950532247965, "grad_norm": 1.7763659954071045, "learning_rate": 0.00010488111413043479, "loss": 0.5241, "step": 3965 }, { "epoch": 0.6208515967438948, "grad_norm": 1.0573714971542358, "learning_rate": 0.00010486922554347826, "loss": 0.2001, "step": 3966 }, { "epoch": 0.6210081402629931, "grad_norm": 3.2106401920318604, 
"learning_rate": 0.00010485733695652173, "loss": 0.4232, "step": 3967 }, { "epoch": 0.6211646837820914, "grad_norm": 1.9928174018859863, "learning_rate": 0.00010484544836956521, "loss": 0.6006, "step": 3968 }, { "epoch": 0.6213212273011898, "grad_norm": 2.0402376651763916, "learning_rate": 0.00010483355978260868, "loss": 0.6769, "step": 3969 }, { "epoch": 0.621477770820288, "grad_norm": 1.238856554031372, "learning_rate": 0.00010482167119565216, "loss": 0.6831, "step": 3970 }, { "epoch": 0.6216343143393863, "grad_norm": 1.2060602903366089, "learning_rate": 0.00010480978260869564, "loss": 0.3268, "step": 3971 }, { "epoch": 0.6217908578584846, "grad_norm": 2.249809980392456, "learning_rate": 0.00010479789402173912, "loss": 0.645, "step": 3972 }, { "epoch": 0.621947401377583, "grad_norm": 1.661942958831787, "learning_rate": 0.0001047860054347826, "loss": 0.3542, "step": 3973 }, { "epoch": 0.6221039448966813, "grad_norm": 1.7823187112808228, "learning_rate": 0.00010477411684782608, "loss": 0.6193, "step": 3974 }, { "epoch": 0.6222604884157796, "grad_norm": 1.4900668859481812, "learning_rate": 0.00010476222826086955, "loss": 0.6331, "step": 3975 }, { "epoch": 0.6224170319348779, "grad_norm": 1.4783353805541992, "learning_rate": 0.00010475033967391303, "loss": 0.4844, "step": 3976 }, { "epoch": 0.6225735754539762, "grad_norm": 1.6575108766555786, "learning_rate": 0.00010473845108695651, "loss": 0.581, "step": 3977 }, { "epoch": 0.6227301189730745, "grad_norm": 1.9975718259811401, "learning_rate": 0.00010472656249999999, "loss": 0.5673, "step": 3978 }, { "epoch": 0.6228866624921728, "grad_norm": 3.1856536865234375, "learning_rate": 0.00010471467391304347, "loss": 0.9823, "step": 3979 }, { "epoch": 0.6230432060112712, "grad_norm": 2.253793954849243, "learning_rate": 0.00010470278532608694, "loss": 0.9934, "step": 3980 }, { "epoch": 0.6231997495303695, "grad_norm": 2.964022159576416, "learning_rate": 0.00010469089673913042, "loss": 0.9817, "step": 3981 }, { "epoch": 
0.6233562930494677, "grad_norm": 3.0037264823913574, "learning_rate": 0.0001046790081521739, "loss": 0.819, "step": 3982 }, { "epoch": 0.623512836568566, "grad_norm": 2.0290884971618652, "learning_rate": 0.00010466711956521738, "loss": 0.7123, "step": 3983 }, { "epoch": 0.6236693800876644, "grad_norm": 2.066370964050293, "learning_rate": 0.00010465523097826087, "loss": 0.9428, "step": 3984 }, { "epoch": 0.6238259236067627, "grad_norm": 1.4438239336013794, "learning_rate": 0.00010464334239130435, "loss": 0.4996, "step": 3985 }, { "epoch": 0.623982467125861, "grad_norm": 3.8833160400390625, "learning_rate": 0.00010463145380434783, "loss": 1.1203, "step": 3986 }, { "epoch": 0.6241390106449592, "grad_norm": 2.913878917694092, "learning_rate": 0.0001046195652173913, "loss": 0.9039, "step": 3987 }, { "epoch": 0.6242955541640576, "grad_norm": 3.2756423950195312, "learning_rate": 0.00010460767663043478, "loss": 0.7236, "step": 3988 }, { "epoch": 0.6244520976831559, "grad_norm": 1.410951852798462, "learning_rate": 0.00010459578804347826, "loss": 0.376, "step": 3989 }, { "epoch": 0.6246086412022542, "grad_norm": 4.086179733276367, "learning_rate": 0.00010458389945652173, "loss": 1.4196, "step": 3990 }, { "epoch": 0.6247651847213526, "grad_norm": 5.5366926193237305, "learning_rate": 0.0001045720108695652, "loss": 1.5388, "step": 3991 }, { "epoch": 0.6249217282404509, "grad_norm": 3.381153106689453, "learning_rate": 0.00010456012228260868, "loss": 1.2609, "step": 3992 }, { "epoch": 0.6250782717595491, "grad_norm": 3.1878719329833984, "learning_rate": 0.00010454823369565216, "loss": 1.8202, "step": 3993 }, { "epoch": 0.6252348152786474, "grad_norm": 4.378516674041748, "learning_rate": 0.00010453634510869564, "loss": 1.4864, "step": 3994 }, { "epoch": 0.6253913587977458, "grad_norm": 2.408742904663086, "learning_rate": 0.00010452445652173912, "loss": 0.829, "step": 3995 }, { "epoch": 0.6255479023168441, "grad_norm": 1.094522476196289, "learning_rate": 0.0001045125679347826, 
"loss": 0.4179, "step": 3996 }, { "epoch": 0.6257044458359424, "grad_norm": 2.335756301879883, "learning_rate": 0.00010450067934782607, "loss": 0.3892, "step": 3997 }, { "epoch": 0.6258609893550408, "grad_norm": 3.4787280559539795, "learning_rate": 0.00010448879076086955, "loss": 0.8277, "step": 3998 }, { "epoch": 0.626017532874139, "grad_norm": 2.3362085819244385, "learning_rate": 0.00010447690217391303, "loss": 0.7101, "step": 3999 }, { "epoch": 0.6261740763932373, "grad_norm": 1.3911454677581787, "learning_rate": 0.00010446501358695651, "loss": 0.5129, "step": 4000 }, { "epoch": 0.6261740763932373, "eval_loss": 0.5772714018821716, "eval_runtime": 203.5459, "eval_samples_per_second": 60.836, "eval_steps_per_second": 3.803, "eval_wer": 0.3657904243789447, "step": 4000 }, { "epoch": 0.6263306199123356, "grad_norm": 0.6978228688240051, "learning_rate": 0.00010445312499999999, "loss": 0.298, "step": 4001 }, { "epoch": 0.626487163431434, "grad_norm": 0.5260750651359558, "learning_rate": 0.00010444123641304346, "loss": 0.2226, "step": 4002 }, { "epoch": 0.6266437069505323, "grad_norm": 0.3482053279876709, "learning_rate": 0.00010442934782608694, "loss": 0.1326, "step": 4003 }, { "epoch": 0.6268002504696305, "grad_norm": 0.827627956867218, "learning_rate": 0.00010441745923913043, "loss": 0.312, "step": 4004 }, { "epoch": 0.6269567939887288, "grad_norm": 0.581010103225708, "learning_rate": 0.00010440557065217391, "loss": 0.2627, "step": 4005 }, { "epoch": 0.6271133375078272, "grad_norm": 0.6377661824226379, "learning_rate": 0.00010439368206521739, "loss": 0.3021, "step": 4006 }, { "epoch": 0.6272698810269255, "grad_norm": 0.8933166265487671, "learning_rate": 0.00010438179347826087, "loss": 0.2723, "step": 4007 }, { "epoch": 0.6274264245460238, "grad_norm": 0.7774326205253601, "learning_rate": 0.00010436990489130435, "loss": 0.3056, "step": 4008 }, { "epoch": 0.6275829680651221, "grad_norm": 0.7836750745773315, "learning_rate": 0.00010435801630434782, "loss": 0.2444, 
"step": 4009 }, { "epoch": 0.6277395115842204, "grad_norm": 1.091309905052185, "learning_rate": 0.0001043461277173913, "loss": 0.3245, "step": 4010 }, { "epoch": 0.6278960551033187, "grad_norm": 1.654447317123413, "learning_rate": 0.00010433423913043478, "loss": 0.7128, "step": 4011 }, { "epoch": 0.628052598622417, "grad_norm": 0.7571307420730591, "learning_rate": 0.00010432235054347826, "loss": 0.2909, "step": 4012 }, { "epoch": 0.6282091421415154, "grad_norm": 1.0385518074035645, "learning_rate": 0.00010431046195652172, "loss": 0.2183, "step": 4013 }, { "epoch": 0.6283656856606137, "grad_norm": 1.4710664749145508, "learning_rate": 0.0001042985733695652, "loss": 0.3217, "step": 4014 }, { "epoch": 0.628522229179712, "grad_norm": 1.3069616556167603, "learning_rate": 0.00010428668478260868, "loss": 0.4053, "step": 4015 }, { "epoch": 0.6286787726988102, "grad_norm": 1.2037640810012817, "learning_rate": 0.00010427479619565216, "loss": 0.3417, "step": 4016 }, { "epoch": 0.6288353162179086, "grad_norm": 1.3363749980926514, "learning_rate": 0.00010426290760869564, "loss": 0.6021, "step": 4017 }, { "epoch": 0.6289918597370069, "grad_norm": 1.2290174961090088, "learning_rate": 0.00010425101902173911, "loss": 0.5178, "step": 4018 }, { "epoch": 0.6291484032561052, "grad_norm": 0.8243271112442017, "learning_rate": 0.00010423913043478259, "loss": 0.3679, "step": 4019 }, { "epoch": 0.6293049467752035, "grad_norm": 1.031148076057434, "learning_rate": 0.00010422724184782607, "loss": 0.3059, "step": 4020 }, { "epoch": 0.6294614902943018, "grad_norm": 1.8514176607131958, "learning_rate": 0.00010421535326086955, "loss": 0.6852, "step": 4021 }, { "epoch": 0.6296180338134001, "grad_norm": 2.974085807800293, "learning_rate": 0.00010420346467391303, "loss": 0.8811, "step": 4022 }, { "epoch": 0.6297745773324984, "grad_norm": 1.4989734888076782, "learning_rate": 0.0001041915760869565, "loss": 0.6977, "step": 4023 }, { "epoch": 0.6299311208515967, "grad_norm": 1.8424277305603027, 
"learning_rate": 0.0001041796875, "loss": 0.3281, "step": 4024 }, { "epoch": 0.6300876643706951, "grad_norm": 1.8009552955627441, "learning_rate": 0.00010416779891304348, "loss": 0.6614, "step": 4025 }, { "epoch": 0.6302442078897934, "grad_norm": 1.0094013214111328, "learning_rate": 0.00010415591032608695, "loss": 0.4184, "step": 4026 }, { "epoch": 0.6304007514088916, "grad_norm": 3.122659683227539, "learning_rate": 0.00010414402173913043, "loss": 0.4806, "step": 4027 }, { "epoch": 0.63055729492799, "grad_norm": 5.16139554977417, "learning_rate": 0.00010413213315217391, "loss": 0.7214, "step": 4028 }, { "epoch": 0.6307138384470883, "grad_norm": 3.217289924621582, "learning_rate": 0.00010412024456521739, "loss": 0.7113, "step": 4029 }, { "epoch": 0.6308703819661866, "grad_norm": 1.9092555046081543, "learning_rate": 0.00010410835597826087, "loss": 0.9429, "step": 4030 }, { "epoch": 0.6310269254852849, "grad_norm": 2.1655004024505615, "learning_rate": 0.00010409646739130434, "loss": 0.6868, "step": 4031 }, { "epoch": 0.6311834690043833, "grad_norm": 4.495182991027832, "learning_rate": 0.00010408457880434782, "loss": 0.7381, "step": 4032 }, { "epoch": 0.6313400125234815, "grad_norm": 1.7808223962783813, "learning_rate": 0.0001040726902173913, "loss": 0.4878, "step": 4033 }, { "epoch": 0.6314965560425798, "grad_norm": 3.6679999828338623, "learning_rate": 0.00010406080163043478, "loss": 0.7865, "step": 4034 }, { "epoch": 0.6316530995616781, "grad_norm": 2.491879463195801, "learning_rate": 0.00010404891304347826, "loss": 0.9386, "step": 4035 }, { "epoch": 0.6318096430807765, "grad_norm": 4.499117851257324, "learning_rate": 0.00010403702445652172, "loss": 1.5761, "step": 4036 }, { "epoch": 0.6319661865998748, "grad_norm": 2.5232341289520264, "learning_rate": 0.0001040251358695652, "loss": 0.6912, "step": 4037 }, { "epoch": 0.632122730118973, "grad_norm": 3.5835039615631104, "learning_rate": 0.00010401324728260868, "loss": 0.8897, "step": 4038 }, { "epoch": 
0.6322792736380713, "grad_norm": 2.9215965270996094, "learning_rate": 0.00010400135869565216, "loss": 0.8802, "step": 4039 }, { "epoch": 0.6324358171571697, "grad_norm": 2.4509475231170654, "learning_rate": 0.00010398947010869563, "loss": 1.4003, "step": 4040 }, { "epoch": 0.632592360676268, "grad_norm": 4.1349005699157715, "learning_rate": 0.00010397758152173911, "loss": 1.5385, "step": 4041 }, { "epoch": 0.6327489041953663, "grad_norm": 2.9227840900421143, "learning_rate": 0.00010396569293478259, "loss": 0.9354, "step": 4042 }, { "epoch": 0.6329054477144647, "grad_norm": 2.7755608558654785, "learning_rate": 0.00010395380434782607, "loss": 1.4833, "step": 4043 }, { "epoch": 0.6330619912335629, "grad_norm": 3.1057541370391846, "learning_rate": 0.00010394191576086956, "loss": 1.6103, "step": 4044 }, { "epoch": 0.6332185347526612, "grad_norm": 3.3839879035949707, "learning_rate": 0.00010393002717391304, "loss": 1.0411, "step": 4045 }, { "epoch": 0.6333750782717595, "grad_norm": 3.183776378631592, "learning_rate": 0.00010391813858695652, "loss": 1.0271, "step": 4046 }, { "epoch": 0.6335316217908579, "grad_norm": 6.319929122924805, "learning_rate": 0.00010390625, "loss": 0.7778, "step": 4047 }, { "epoch": 0.6336881653099562, "grad_norm": 3.903947353363037, "learning_rate": 0.00010389436141304347, "loss": 1.41, "step": 4048 }, { "epoch": 0.6338447088290545, "grad_norm": 2.1196155548095703, "learning_rate": 0.00010388247282608695, "loss": 1.0683, "step": 4049 }, { "epoch": 0.6340012523481527, "grad_norm": 3.4568004608154297, "learning_rate": 0.00010387058423913043, "loss": 1.6134, "step": 4050 }, { "epoch": 0.6341577958672511, "grad_norm": 0.4585835039615631, "learning_rate": 0.00010385869565217391, "loss": 0.2759, "step": 4051 }, { "epoch": 0.6343143393863494, "grad_norm": 0.5921961069107056, "learning_rate": 0.00010384680706521739, "loss": 0.2677, "step": 4052 }, { "epoch": 0.6344708829054477, "grad_norm": 0.5343387126922607, "learning_rate": 0.00010383491847826086, 
"loss": 0.3265, "step": 4053 }, { "epoch": 0.6346274264245461, "grad_norm": 0.5477249622344971, "learning_rate": 0.00010382302989130434, "loss": 0.2046, "step": 4054 }, { "epoch": 0.6347839699436444, "grad_norm": 0.5539407730102539, "learning_rate": 0.00010381114130434782, "loss": 0.2746, "step": 4055 }, { "epoch": 0.6349405134627426, "grad_norm": 0.6220961213111877, "learning_rate": 0.0001037992527173913, "loss": 0.3914, "step": 4056 }, { "epoch": 0.6350970569818409, "grad_norm": 0.6065669655799866, "learning_rate": 0.00010378736413043478, "loss": 0.3194, "step": 4057 }, { "epoch": 0.6352536005009393, "grad_norm": 0.7210943102836609, "learning_rate": 0.00010377547554347825, "loss": 0.3632, "step": 4058 }, { "epoch": 0.6354101440200376, "grad_norm": 1.4486981630325317, "learning_rate": 0.00010376358695652172, "loss": 0.3343, "step": 4059 }, { "epoch": 0.6355666875391359, "grad_norm": 1.020706057548523, "learning_rate": 0.0001037516983695652, "loss": 0.2903, "step": 4060 }, { "epoch": 0.6357232310582341, "grad_norm": 1.299141764640808, "learning_rate": 0.00010373980978260868, "loss": 0.6675, "step": 4061 }, { "epoch": 0.6358797745773325, "grad_norm": 0.7903620004653931, "learning_rate": 0.00010372792119565215, "loss": 0.3021, "step": 4062 }, { "epoch": 0.6360363180964308, "grad_norm": 2.2974159717559814, "learning_rate": 0.00010371603260869563, "loss": 0.4991, "step": 4063 }, { "epoch": 0.6361928616155291, "grad_norm": 1.054249882698059, "learning_rate": 0.00010370414402173912, "loss": 0.3998, "step": 4064 }, { "epoch": 0.6363494051346275, "grad_norm": 1.160606861114502, "learning_rate": 0.0001036922554347826, "loss": 0.4337, "step": 4065 }, { "epoch": 0.6365059486537258, "grad_norm": 0.941593587398529, "learning_rate": 0.00010368036684782608, "loss": 0.4164, "step": 4066 }, { "epoch": 0.636662492172824, "grad_norm": 1.4010554552078247, "learning_rate": 0.00010366847826086956, "loss": 0.5314, "step": 4067 }, { "epoch": 0.6368190356919223, "grad_norm": 
1.251501441001892, "learning_rate": 0.00010365658967391304, "loss": 0.285, "step": 4068 }, { "epoch": 0.6369755792110207, "grad_norm": 1.306986927986145, "learning_rate": 0.00010364470108695651, "loss": 0.4729, "step": 4069 }, { "epoch": 0.637132122730119, "grad_norm": 1.633126139640808, "learning_rate": 0.00010363281249999999, "loss": 0.8296, "step": 4070 }, { "epoch": 0.6372886662492173, "grad_norm": 2.225796937942505, "learning_rate": 0.00010362092391304347, "loss": 0.5607, "step": 4071 }, { "epoch": 0.6374452097683156, "grad_norm": 2.1511332988739014, "learning_rate": 0.00010360903532608695, "loss": 0.5974, "step": 4072 }, { "epoch": 0.6376017532874139, "grad_norm": 4.184909343719482, "learning_rate": 0.00010359714673913043, "loss": 0.7845, "step": 4073 }, { "epoch": 0.6377582968065122, "grad_norm": 1.7790095806121826, "learning_rate": 0.0001035852581521739, "loss": 0.5926, "step": 4074 }, { "epoch": 0.6379148403256105, "grad_norm": 1.9596532583236694, "learning_rate": 0.00010357336956521738, "loss": 0.6445, "step": 4075 }, { "epoch": 0.6380713838447089, "grad_norm": 2.6455159187316895, "learning_rate": 0.00010356148097826086, "loss": 0.9494, "step": 4076 }, { "epoch": 0.6382279273638072, "grad_norm": 2.376279354095459, "learning_rate": 0.00010354959239130434, "loss": 0.6673, "step": 4077 }, { "epoch": 0.6383844708829054, "grad_norm": 1.7074650526046753, "learning_rate": 0.00010353770380434782, "loss": 0.3404, "step": 4078 }, { "epoch": 0.6385410144020037, "grad_norm": 1.6392316818237305, "learning_rate": 0.0001035258152173913, "loss": 0.7197, "step": 4079 }, { "epoch": 0.638697557921102, "grad_norm": 3.772491693496704, "learning_rate": 0.00010351392663043477, "loss": 1.0216, "step": 4080 }, { "epoch": 0.6388541014402004, "grad_norm": 2.3005876541137695, "learning_rate": 0.00010350203804347827, "loss": 0.8894, "step": 4081 }, { "epoch": 0.6390106449592987, "grad_norm": 2.974719285964966, "learning_rate": 0.00010349014945652172, "loss": 0.6195, "step": 4082 }, { 
"epoch": 0.639167188478397, "grad_norm": 3.0893924236297607, "learning_rate": 0.00010347826086956521, "loss": 1.1732, "step": 4083 }, { "epoch": 0.6393237319974953, "grad_norm": 1.2101927995681763, "learning_rate": 0.00010346637228260869, "loss": 0.5295, "step": 4084 }, { "epoch": 0.6394802755165936, "grad_norm": 1.7010469436645508, "learning_rate": 0.00010345448369565216, "loss": 1.0197, "step": 4085 }, { "epoch": 0.6396368190356919, "grad_norm": 2.8491408824920654, "learning_rate": 0.00010344259510869564, "loss": 1.4094, "step": 4086 }, { "epoch": 0.6397933625547902, "grad_norm": 2.6932125091552734, "learning_rate": 0.00010343070652173912, "loss": 0.8381, "step": 4087 }, { "epoch": 0.6399499060738886, "grad_norm": 3.556104898452759, "learning_rate": 0.0001034188179347826, "loss": 1.1687, "step": 4088 }, { "epoch": 0.6401064495929869, "grad_norm": 4.639225959777832, "learning_rate": 0.00010340692934782608, "loss": 1.1235, "step": 4089 }, { "epoch": 0.6402629931120851, "grad_norm": 4.422469139099121, "learning_rate": 0.00010339504076086956, "loss": 1.6915, "step": 4090 }, { "epoch": 0.6404195366311835, "grad_norm": 1.7094950675964355, "learning_rate": 0.00010338315217391303, "loss": 0.8344, "step": 4091 }, { "epoch": 0.6405760801502818, "grad_norm": 2.2492215633392334, "learning_rate": 0.00010337126358695651, "loss": 0.915, "step": 4092 }, { "epoch": 0.6407326236693801, "grad_norm": 1.7790583372116089, "learning_rate": 0.00010335937499999999, "loss": 1.1052, "step": 4093 }, { "epoch": 0.6408891671884784, "grad_norm": 4.020028591156006, "learning_rate": 0.00010334748641304347, "loss": 1.4823, "step": 4094 }, { "epoch": 0.6410457107075767, "grad_norm": 5.459923267364502, "learning_rate": 0.00010333559782608695, "loss": 0.6403, "step": 4095 }, { "epoch": 0.641202254226675, "grad_norm": 1.770797848701477, "learning_rate": 0.00010332370923913042, "loss": 0.6226, "step": 4096 }, { "epoch": 0.6413587977457733, "grad_norm": 4.1617536544799805, "learning_rate": 
0.0001033118206521739, "loss": 1.1179, "step": 4097 }, { "epoch": 0.6415153412648716, "grad_norm": 2.390986680984497, "learning_rate": 0.00010329993206521738, "loss": 1.0695, "step": 4098 }, { "epoch": 0.64167188478397, "grad_norm": 1.9928092956542969, "learning_rate": 0.00010328804347826086, "loss": 1.2217, "step": 4099 }, { "epoch": 0.6418284283030683, "grad_norm": 3.2374558448791504, "learning_rate": 0.00010327615489130434, "loss": 1.2815, "step": 4100 }, { "epoch": 0.6419849718221665, "grad_norm": 0.7591182589530945, "learning_rate": 0.00010326426630434783, "loss": 0.447, "step": 4101 }, { "epoch": 0.6421415153412648, "grad_norm": 0.5413169860839844, "learning_rate": 0.00010325237771739131, "loss": 0.2865, "step": 4102 }, { "epoch": 0.6422980588603632, "grad_norm": 0.5783164501190186, "learning_rate": 0.00010324048913043478, "loss": 0.228, "step": 4103 }, { "epoch": 0.6424546023794615, "grad_norm": 0.6916652917861938, "learning_rate": 0.00010322860054347826, "loss": 0.2981, "step": 4104 }, { "epoch": 0.6426111458985598, "grad_norm": 1.1056618690490723, "learning_rate": 0.00010321671195652173, "loss": 0.3587, "step": 4105 }, { "epoch": 0.6427676894176582, "grad_norm": 0.4993519186973572, "learning_rate": 0.0001032048233695652, "loss": 0.2501, "step": 4106 }, { "epoch": 0.6429242329367564, "grad_norm": 0.8355669975280762, "learning_rate": 0.00010319293478260868, "loss": 0.366, "step": 4107 }, { "epoch": 0.6430807764558547, "grad_norm": 1.125137209892273, "learning_rate": 0.00010318104619565216, "loss": 0.3888, "step": 4108 }, { "epoch": 0.643237319974953, "grad_norm": 0.9485219120979309, "learning_rate": 0.00010316915760869564, "loss": 0.2888, "step": 4109 }, { "epoch": 0.6433938634940514, "grad_norm": 0.8328590393066406, "learning_rate": 0.00010315726902173912, "loss": 0.3128, "step": 4110 }, { "epoch": 0.6435504070131497, "grad_norm": 0.9747029542922974, "learning_rate": 0.0001031453804347826, "loss": 0.41, "step": 4111 }, { "epoch": 0.6437069505322479, 
"grad_norm": 1.4699167013168335, "learning_rate": 0.00010313349184782607, "loss": 0.3877, "step": 4112 }, { "epoch": 0.6438634940513462, "grad_norm": 0.7101068496704102, "learning_rate": 0.00010312160326086955, "loss": 0.2868, "step": 4113 }, { "epoch": 0.6440200375704446, "grad_norm": 0.8660994172096252, "learning_rate": 0.00010310971467391303, "loss": 0.4217, "step": 4114 }, { "epoch": 0.6441765810895429, "grad_norm": 1.5901340246200562, "learning_rate": 0.00010309782608695651, "loss": 0.381, "step": 4115 }, { "epoch": 0.6443331246086412, "grad_norm": 2.3090481758117676, "learning_rate": 0.00010308593749999999, "loss": 0.7915, "step": 4116 }, { "epoch": 0.6444896681277396, "grad_norm": 1.9702280759811401, "learning_rate": 0.00010307404891304347, "loss": 0.4281, "step": 4117 }, { "epoch": 0.6446462116468378, "grad_norm": 1.523441195487976, "learning_rate": 0.00010306216032608694, "loss": 0.7454, "step": 4118 }, { "epoch": 0.6448027551659361, "grad_norm": 1.429452896118164, "learning_rate": 0.00010305027173913042, "loss": 0.3925, "step": 4119 }, { "epoch": 0.6449592986850344, "grad_norm": 2.1524550914764404, "learning_rate": 0.0001030383831521739, "loss": 0.5927, "step": 4120 }, { "epoch": 0.6451158422041328, "grad_norm": 1.5227117538452148, "learning_rate": 0.00010302649456521739, "loss": 0.6505, "step": 4121 }, { "epoch": 0.6452723857232311, "grad_norm": 1.455626368522644, "learning_rate": 0.00010301460597826087, "loss": 0.4702, "step": 4122 }, { "epoch": 0.6454289292423294, "grad_norm": 1.1152808666229248, "learning_rate": 0.00010300271739130435, "loss": 0.4937, "step": 4123 }, { "epoch": 0.6455854727614276, "grad_norm": 2.3649699687957764, "learning_rate": 0.00010299082880434783, "loss": 0.5262, "step": 4124 }, { "epoch": 0.645742016280526, "grad_norm": 1.6312533617019653, "learning_rate": 0.0001029789402173913, "loss": 0.7233, "step": 4125 }, { "epoch": 0.6458985597996243, "grad_norm": 2.767646551132202, "learning_rate": 0.00010296705163043478, "loss": 0.8281, 
"step": 4126 }, { "epoch": 0.6460551033187226, "grad_norm": 2.120460033416748, "learning_rate": 0.00010295516304347826, "loss": 0.7566, "step": 4127 }, { "epoch": 0.646211646837821, "grad_norm": 2.3103082180023193, "learning_rate": 0.00010294327445652173, "loss": 0.8699, "step": 4128 }, { "epoch": 0.6463681903569192, "grad_norm": 1.8997623920440674, "learning_rate": 0.0001029313858695652, "loss": 0.8036, "step": 4129 }, { "epoch": 0.6465247338760175, "grad_norm": 2.224393844604492, "learning_rate": 0.00010291949728260868, "loss": 0.9677, "step": 4130 }, { "epoch": 0.6466812773951158, "grad_norm": 2.258553981781006, "learning_rate": 0.00010290760869565216, "loss": 0.8416, "step": 4131 }, { "epoch": 0.6468378209142142, "grad_norm": 1.9712226390838623, "learning_rate": 0.00010289572010869564, "loss": 0.5686, "step": 4132 }, { "epoch": 0.6469943644333125, "grad_norm": 2.525386333465576, "learning_rate": 0.00010288383152173912, "loss": 1.153, "step": 4133 }, { "epoch": 0.6471509079524108, "grad_norm": 2.0125203132629395, "learning_rate": 0.0001028719429347826, "loss": 0.4562, "step": 4134 }, { "epoch": 0.647307451471509, "grad_norm": 1.3632500171661377, "learning_rate": 0.00010286005434782607, "loss": 0.8889, "step": 4135 }, { "epoch": 0.6474639949906074, "grad_norm": 4.24240779876709, "learning_rate": 0.00010284816576086955, "loss": 1.7773, "step": 4136 }, { "epoch": 0.6476205385097057, "grad_norm": 3.4192938804626465, "learning_rate": 0.00010283627717391303, "loss": 0.9526, "step": 4137 }, { "epoch": 0.647777082028804, "grad_norm": 2.883601665496826, "learning_rate": 0.00010282438858695651, "loss": 0.7496, "step": 4138 }, { "epoch": 0.6479336255479023, "grad_norm": 2.531860113143921, "learning_rate": 0.00010281249999999999, "loss": 0.766, "step": 4139 }, { "epoch": 0.6480901690670007, "grad_norm": 1.926196575164795, "learning_rate": 0.00010280061141304346, "loss": 1.2389, "step": 4140 }, { "epoch": 0.6482467125860989, "grad_norm": 3.817086696624756, "learning_rate": 
0.00010278872282608695, "loss": 1.8259, "step": 4141 }, { "epoch": 0.6484032561051972, "grad_norm": 3.6205596923828125, "learning_rate": 0.00010277683423913043, "loss": 1.52, "step": 4142 }, { "epoch": 0.6485597996242956, "grad_norm": 3.1745455265045166, "learning_rate": 0.00010276494565217391, "loss": 1.3014, "step": 4143 }, { "epoch": 0.6487163431433939, "grad_norm": 2.5420145988464355, "learning_rate": 0.00010275305706521739, "loss": 1.6306, "step": 4144 }, { "epoch": 0.6488728866624922, "grad_norm": 2.5598254203796387, "learning_rate": 0.00010274116847826087, "loss": 1.0348, "step": 4145 }, { "epoch": 0.6490294301815904, "grad_norm": 2.678588390350342, "learning_rate": 0.00010272927989130435, "loss": 1.0922, "step": 4146 }, { "epoch": 0.6491859737006888, "grad_norm": 1.9039177894592285, "learning_rate": 0.00010271739130434782, "loss": 0.6895, "step": 4147 }, { "epoch": 0.6493425172197871, "grad_norm": 1.8550409078598022, "learning_rate": 0.0001027055027173913, "loss": 0.9207, "step": 4148 }, { "epoch": 0.6494990607388854, "grad_norm": 2.1836347579956055, "learning_rate": 0.00010269361413043478, "loss": 1.0407, "step": 4149 }, { "epoch": 0.6496556042579837, "grad_norm": 2.36080002784729, "learning_rate": 0.00010268172554347826, "loss": 1.5266, "step": 4150 }, { "epoch": 0.6498121477770821, "grad_norm": 0.4822608530521393, "learning_rate": 0.00010266983695652172, "loss": 0.2605, "step": 4151 }, { "epoch": 0.6499686912961803, "grad_norm": 0.5204498171806335, "learning_rate": 0.0001026579483695652, "loss": 0.2398, "step": 4152 }, { "epoch": 0.6501252348152786, "grad_norm": 0.5622870922088623, "learning_rate": 0.00010264605978260868, "loss": 0.281, "step": 4153 }, { "epoch": 0.650281778334377, "grad_norm": 0.7722529768943787, "learning_rate": 0.00010263417119565216, "loss": 0.3361, "step": 4154 }, { "epoch": 0.6504383218534753, "grad_norm": 0.8939089179039001, "learning_rate": 0.00010262228260869564, "loss": 0.2871, "step": 4155 }, { "epoch": 0.6505948653725736, 
"grad_norm": 1.1866737604141235, "learning_rate": 0.00010261039402173911, "loss": 0.3489, "step": 4156 }, { "epoch": 0.6507514088916719, "grad_norm": 1.1030261516571045, "learning_rate": 0.00010259850543478259, "loss": 0.2415, "step": 4157 }, { "epoch": 0.6509079524107702, "grad_norm": 1.4707717895507812, "learning_rate": 0.00010258661684782607, "loss": 0.3505, "step": 4158 }, { "epoch": 0.6510644959298685, "grad_norm": 0.86351078748703, "learning_rate": 0.00010257472826086955, "loss": 0.3455, "step": 4159 }, { "epoch": 0.6512210394489668, "grad_norm": 3.543945789337158, "learning_rate": 0.00010256283967391304, "loss": 0.5662, "step": 4160 }, { "epoch": 0.6513775829680651, "grad_norm": 1.0591367483139038, "learning_rate": 0.00010255095108695652, "loss": 0.2258, "step": 4161 }, { "epoch": 0.6515341264871635, "grad_norm": 2.9239308834075928, "learning_rate": 0.0001025390625, "loss": 0.7183, "step": 4162 }, { "epoch": 0.6516906700062617, "grad_norm": 0.5783994793891907, "learning_rate": 0.00010252717391304347, "loss": 0.2891, "step": 4163 }, { "epoch": 0.65184721352536, "grad_norm": 1.2667509317398071, "learning_rate": 0.00010251528532608695, "loss": 0.4401, "step": 4164 }, { "epoch": 0.6520037570444583, "grad_norm": 1.418181300163269, "learning_rate": 0.00010250339673913043, "loss": 0.5597, "step": 4165 }, { "epoch": 0.6521603005635567, "grad_norm": 1.141985535621643, "learning_rate": 0.00010249150815217391, "loss": 0.4411, "step": 4166 }, { "epoch": 0.652316844082655, "grad_norm": 1.1245514154434204, "learning_rate": 0.00010247961956521739, "loss": 0.3716, "step": 4167 }, { "epoch": 0.6524733876017533, "grad_norm": 1.5991551876068115, "learning_rate": 0.00010246773097826087, "loss": 0.4296, "step": 4168 }, { "epoch": 0.6526299311208515, "grad_norm": 1.570334553718567, "learning_rate": 0.00010245584239130434, "loss": 0.5159, "step": 4169 }, { "epoch": 0.6527864746399499, "grad_norm": 1.3773804903030396, "learning_rate": 0.00010244395380434782, "loss": 0.3359, "step": 
4170 }, { "epoch": 0.6529430181590482, "grad_norm": 1.3291363716125488, "learning_rate": 0.0001024320652173913, "loss": 0.4606, "step": 4171 }, { "epoch": 0.6530995616781465, "grad_norm": 1.028124451637268, "learning_rate": 0.00010242017663043478, "loss": 0.3932, "step": 4172 }, { "epoch": 0.6532561051972449, "grad_norm": 1.9738014936447144, "learning_rate": 0.00010240828804347826, "loss": 0.5721, "step": 4173 }, { "epoch": 0.6534126487163432, "grad_norm": 3.2061328887939453, "learning_rate": 0.00010239639945652172, "loss": 0.7387, "step": 4174 }, { "epoch": 0.6535691922354414, "grad_norm": 2.0698328018188477, "learning_rate": 0.0001023845108695652, "loss": 0.6407, "step": 4175 }, { "epoch": 0.6537257357545397, "grad_norm": 1.177880883216858, "learning_rate": 0.00010237262228260868, "loss": 0.3499, "step": 4176 }, { "epoch": 0.6538822792736381, "grad_norm": 2.0393683910369873, "learning_rate": 0.00010236073369565215, "loss": 0.5814, "step": 4177 }, { "epoch": 0.6540388227927364, "grad_norm": 2.0707309246063232, "learning_rate": 0.00010234884510869563, "loss": 0.7245, "step": 4178 }, { "epoch": 0.6541953663118347, "grad_norm": 2.003929853439331, "learning_rate": 0.00010233695652173911, "loss": 0.6778, "step": 4179 }, { "epoch": 0.654351909830933, "grad_norm": 4.076833724975586, "learning_rate": 0.0001023250679347826, "loss": 0.7984, "step": 4180 }, { "epoch": 0.6545084533500313, "grad_norm": 1.593590259552002, "learning_rate": 0.00010231317934782608, "loss": 1.1511, "step": 4181 }, { "epoch": 0.6546649968691296, "grad_norm": 3.793682098388672, "learning_rate": 0.00010230129076086956, "loss": 0.878, "step": 4182 }, { "epoch": 0.6548215403882279, "grad_norm": 3.4322893619537354, "learning_rate": 0.00010228940217391304, "loss": 0.7515, "step": 4183 }, { "epoch": 0.6549780839073263, "grad_norm": 1.5116567611694336, "learning_rate": 0.00010227751358695652, "loss": 0.5655, "step": 4184 }, { "epoch": 0.6551346274264246, "grad_norm": 2.7753686904907227, "learning_rate": 
0.000102265625, "loss": 0.9285, "step": 4185 }, { "epoch": 0.6552911709455228, "grad_norm": 2.2391088008880615, "learning_rate": 0.00010225373641304347, "loss": 0.5235, "step": 4186 }, { "epoch": 0.6554477144646211, "grad_norm": 3.4614176750183105, "learning_rate": 0.00010224184782608695, "loss": 1.0102, "step": 4187 }, { "epoch": 0.6556042579837195, "grad_norm": 3.2584307193756104, "learning_rate": 0.00010222995923913043, "loss": 1.0124, "step": 4188 }, { "epoch": 0.6557608015028178, "grad_norm": 2.1870687007904053, "learning_rate": 0.0001022180706521739, "loss": 1.0364, "step": 4189 }, { "epoch": 0.6559173450219161, "grad_norm": 2.7879443168640137, "learning_rate": 0.00010220618206521738, "loss": 1.0986, "step": 4190 }, { "epoch": 0.6560738885410144, "grad_norm": 3.1401877403259277, "learning_rate": 0.00010219429347826086, "loss": 1.3621, "step": 4191 }, { "epoch": 0.6562304320601127, "grad_norm": 2.785287618637085, "learning_rate": 0.00010218240489130434, "loss": 1.0314, "step": 4192 }, { "epoch": 0.656386975579211, "grad_norm": 2.514516830444336, "learning_rate": 0.00010217051630434782, "loss": 1.0492, "step": 4193 }, { "epoch": 0.6565435190983093, "grad_norm": 3.0282514095306396, "learning_rate": 0.0001021586277173913, "loss": 0.9498, "step": 4194 }, { "epoch": 0.6567000626174077, "grad_norm": 2.205033779144287, "learning_rate": 0.00010214673913043478, "loss": 1.372, "step": 4195 }, { "epoch": 0.656856606136506, "grad_norm": 1.4840466976165771, "learning_rate": 0.00010213485054347825, "loss": 0.5097, "step": 4196 }, { "epoch": 0.6570131496556043, "grad_norm": 3.2332770824432373, "learning_rate": 0.00010212296195652172, "loss": 0.7366, "step": 4197 }, { "epoch": 0.6571696931747025, "grad_norm": 1.5061862468719482, "learning_rate": 0.0001021110733695652, "loss": 0.6595, "step": 4198 }, { "epoch": 0.6573262366938009, "grad_norm": 2.254748582839966, "learning_rate": 0.00010209918478260867, "loss": 0.5057, "step": 4199 }, { "epoch": 0.6574827802128992, "grad_norm": 
4.771387577056885, "learning_rate": 0.00010208729619565217, "loss": 1.3745, "step": 4200 }, { "epoch": 0.6576393237319975, "grad_norm": 0.6593992710113525, "learning_rate": 0.00010207540760869564, "loss": 0.2033, "step": 4201 }, { "epoch": 0.6577958672510958, "grad_norm": 0.7704123258590698, "learning_rate": 0.00010206351902173912, "loss": 0.2935, "step": 4202 }, { "epoch": 0.6579524107701941, "grad_norm": 0.9974800944328308, "learning_rate": 0.0001020516304347826, "loss": 0.2354, "step": 4203 }, { "epoch": 0.6581089542892924, "grad_norm": 0.49255990982055664, "learning_rate": 0.00010203974184782608, "loss": 0.2237, "step": 4204 }, { "epoch": 0.6582654978083907, "grad_norm": 1.4800585508346558, "learning_rate": 0.00010202785326086956, "loss": 0.3293, "step": 4205 }, { "epoch": 0.658422041327489, "grad_norm": 0.8603168725967407, "learning_rate": 0.00010201596467391304, "loss": 0.3488, "step": 4206 }, { "epoch": 0.6585785848465874, "grad_norm": 1.0220345258712769, "learning_rate": 0.00010200407608695651, "loss": 0.3765, "step": 4207 }, { "epoch": 0.6587351283656857, "grad_norm": 0.8767854571342468, "learning_rate": 0.00010199218749999999, "loss": 0.2995, "step": 4208 }, { "epoch": 0.6588916718847839, "grad_norm": 0.7438730597496033, "learning_rate": 0.00010198029891304347, "loss": 0.3552, "step": 4209 }, { "epoch": 0.6590482154038823, "grad_norm": 11.577252388000488, "learning_rate": 0.00010196841032608695, "loss": 0.9644, "step": 4210 }, { "epoch": 0.6592047589229806, "grad_norm": 8.06590747833252, "learning_rate": 0.00010195652173913043, "loss": 1.8144, "step": 4211 }, { "epoch": 0.6593613024420789, "grad_norm": 2.531743288040161, "learning_rate": 0.0001019446331521739, "loss": 0.5026, "step": 4212 }, { "epoch": 0.6595178459611772, "grad_norm": 1.6023063659667969, "learning_rate": 0.00010193274456521738, "loss": 0.7085, "step": 4213 }, { "epoch": 0.6596743894802756, "grad_norm": 4.220205783843994, "learning_rate": 0.00010192085597826086, "loss": 0.5263, "step": 
4214 }, { "epoch": 0.6598309329993738, "grad_norm": 2.140004873275757, "learning_rate": 0.00010190896739130434, "loss": 0.4032, "step": 4215 }, { "epoch": 0.6599874765184721, "grad_norm": 1.301822304725647, "learning_rate": 0.00010189707880434782, "loss": 0.6505, "step": 4216 }, { "epoch": 0.6601440200375704, "grad_norm": 1.5176812410354614, "learning_rate": 0.0001018851902173913, "loss": 0.4, "step": 4217 }, { "epoch": 0.6603005635566688, "grad_norm": 1.516833782196045, "learning_rate": 0.00010187330163043479, "loss": 0.3704, "step": 4218 }, { "epoch": 0.6604571070757671, "grad_norm": 2.046092987060547, "learning_rate": 0.00010186141304347826, "loss": 0.833, "step": 4219 }, { "epoch": 0.6606136505948653, "grad_norm": 1.4354286193847656, "learning_rate": 0.00010184952445652173, "loss": 0.4833, "step": 4220 }, { "epoch": 0.6607701941139636, "grad_norm": 4.915871620178223, "learning_rate": 0.00010183763586956521, "loss": 1.0631, "step": 4221 }, { "epoch": 0.660926737633062, "grad_norm": 1.5807666778564453, "learning_rate": 0.00010182574728260869, "loss": 0.6223, "step": 4222 }, { "epoch": 0.6610832811521603, "grad_norm": 1.6048120260238647, "learning_rate": 0.00010181385869565216, "loss": 0.5977, "step": 4223 }, { "epoch": 0.6612398246712586, "grad_norm": 1.9477416276931763, "learning_rate": 0.00010180197010869564, "loss": 0.6353, "step": 4224 }, { "epoch": 0.661396368190357, "grad_norm": 1.9098070859909058, "learning_rate": 0.00010179008152173912, "loss": 0.7533, "step": 4225 }, { "epoch": 0.6615529117094552, "grad_norm": 2.691256046295166, "learning_rate": 0.0001017781929347826, "loss": 0.6719, "step": 4226 }, { "epoch": 0.6617094552285535, "grad_norm": 2.1427195072174072, "learning_rate": 0.00010176630434782608, "loss": 0.8115, "step": 4227 }, { "epoch": 0.6618659987476518, "grad_norm": 2.8612864017486572, "learning_rate": 0.00010175441576086955, "loss": 0.5873, "step": 4228 }, { "epoch": 0.6620225422667502, "grad_norm": 1.4517146348953247, "learning_rate": 
0.00010174252717391303, "loss": 0.5453, "step": 4229 }, { "epoch": 0.6621790857858485, "grad_norm": 1.6928911209106445, "learning_rate": 0.00010173063858695651, "loss": 0.7463, "step": 4230 }, { "epoch": 0.6623356293049468, "grad_norm": 3.621450662612915, "learning_rate": 0.00010171874999999999, "loss": 0.8513, "step": 4231 }, { "epoch": 0.662492172824045, "grad_norm": 2.654491901397705, "learning_rate": 0.00010170686141304347, "loss": 0.8476, "step": 4232 }, { "epoch": 0.6626487163431434, "grad_norm": 2.2833385467529297, "learning_rate": 0.00010169497282608695, "loss": 0.6186, "step": 4233 }, { "epoch": 0.6628052598622417, "grad_norm": 2.836026906967163, "learning_rate": 0.00010168308423913042, "loss": 0.5342, "step": 4234 }, { "epoch": 0.66296180338134, "grad_norm": 2.914484739303589, "learning_rate": 0.0001016711956521739, "loss": 0.9879, "step": 4235 }, { "epoch": 0.6631183469004384, "grad_norm": 2.9716427326202393, "learning_rate": 0.00010165930706521738, "loss": 1.471, "step": 4236 }, { "epoch": 0.6632748904195366, "grad_norm": 3.874879837036133, "learning_rate": 0.00010164741847826087, "loss": 1.2474, "step": 4237 }, { "epoch": 0.6634314339386349, "grad_norm": 4.7228779792785645, "learning_rate": 0.00010163552989130435, "loss": 1.0017, "step": 4238 }, { "epoch": 0.6635879774577332, "grad_norm": 3.5189247131347656, "learning_rate": 0.00010162364130434783, "loss": 1.1276, "step": 4239 }, { "epoch": 0.6637445209768316, "grad_norm": 4.749283313751221, "learning_rate": 0.0001016117527173913, "loss": 1.5934, "step": 4240 }, { "epoch": 0.6639010644959299, "grad_norm": 3.601686477661133, "learning_rate": 0.00010159986413043478, "loss": 1.5033, "step": 4241 }, { "epoch": 0.6640576080150282, "grad_norm": 4.380939960479736, "learning_rate": 0.00010158797554347826, "loss": 1.348, "step": 4242 }, { "epoch": 0.6642141515341264, "grad_norm": 3.9142322540283203, "learning_rate": 0.00010157608695652173, "loss": 1.0288, "step": 4243 }, { "epoch": 0.6643706950532248, 
"grad_norm": 3.30307936668396, "learning_rate": 0.0001015641983695652, "loss": 1.435, "step": 4244 }, { "epoch": 0.6645272385723231, "grad_norm": 4.084417343139648, "learning_rate": 0.00010155230978260868, "loss": 1.3992, "step": 4245 }, { "epoch": 0.6646837820914214, "grad_norm": 2.1718642711639404, "learning_rate": 0.00010154042119565216, "loss": 0.6942, "step": 4246 }, { "epoch": 0.6648403256105198, "grad_norm": 1.5551209449768066, "learning_rate": 0.00010152853260869564, "loss": 0.6376, "step": 4247 }, { "epoch": 0.6649968691296181, "grad_norm": 4.316013813018799, "learning_rate": 0.00010151664402173912, "loss": 1.0754, "step": 4248 }, { "epoch": 0.6651534126487163, "grad_norm": 2.1561245918273926, "learning_rate": 0.0001015047554347826, "loss": 0.4133, "step": 4249 }, { "epoch": 0.6653099561678146, "grad_norm": 4.181296348571777, "learning_rate": 0.00010149286684782607, "loss": 1.0135, "step": 4250 }, { "epoch": 0.665466499686913, "grad_norm": 0.6004887223243713, "learning_rate": 0.00010148097826086955, "loss": 0.2814, "step": 4251 }, { "epoch": 0.6656230432060113, "grad_norm": 0.5103965997695923, "learning_rate": 0.00010146908967391303, "loss": 0.2969, "step": 4252 }, { "epoch": 0.6657795867251096, "grad_norm": 0.4943591356277466, "learning_rate": 0.00010145720108695651, "loss": 0.2637, "step": 4253 }, { "epoch": 0.6659361302442078, "grad_norm": 0.4737556576728821, "learning_rate": 0.00010144531249999999, "loss": 0.2309, "step": 4254 }, { "epoch": 0.6660926737633062, "grad_norm": 0.4835273325443268, "learning_rate": 0.00010143342391304346, "loss": 0.281, "step": 4255 }, { "epoch": 0.6662492172824045, "grad_norm": 0.7521408200263977, "learning_rate": 0.00010142153532608694, "loss": 0.4319, "step": 4256 }, { "epoch": 0.6664057608015028, "grad_norm": 0.9452478885650635, "learning_rate": 0.00010140964673913043, "loss": 0.4818, "step": 4257 }, { "epoch": 0.6665623043206012, "grad_norm": 0.7315664887428284, "learning_rate": 0.00010139775815217391, "loss": 0.3244, 
"step": 4258 }, { "epoch": 0.6667188478396995, "grad_norm": 1.12488853931427, "learning_rate": 0.00010138586956521739, "loss": 0.2858, "step": 4259 }, { "epoch": 0.6668753913587977, "grad_norm": 0.5876471400260925, "learning_rate": 0.00010137398097826087, "loss": 0.258, "step": 4260 }, { "epoch": 0.667031934877896, "grad_norm": 0.7857499718666077, "learning_rate": 0.00010136209239130435, "loss": 0.3458, "step": 4261 }, { "epoch": 0.6671884783969944, "grad_norm": 0.7814789414405823, "learning_rate": 0.00010135020380434783, "loss": 0.3597, "step": 4262 }, { "epoch": 0.6673450219160927, "grad_norm": 1.1170368194580078, "learning_rate": 0.0001013383152173913, "loss": 0.4776, "step": 4263 }, { "epoch": 0.667501565435191, "grad_norm": 0.8558453321456909, "learning_rate": 0.00010132642663043478, "loss": 0.2874, "step": 4264 }, { "epoch": 0.6676581089542893, "grad_norm": 1.298049807548523, "learning_rate": 0.00010131453804347826, "loss": 0.432, "step": 4265 }, { "epoch": 0.6678146524733876, "grad_norm": 1.8326239585876465, "learning_rate": 0.00010130264945652172, "loss": 0.5387, "step": 4266 }, { "epoch": 0.6679711959924859, "grad_norm": 1.1257387399673462, "learning_rate": 0.0001012907608695652, "loss": 0.3947, "step": 4267 }, { "epoch": 0.6681277395115842, "grad_norm": 1.435884952545166, "learning_rate": 0.00010127887228260868, "loss": 0.5536, "step": 4268 }, { "epoch": 0.6682842830306825, "grad_norm": 2.0724804401397705, "learning_rate": 0.00010126698369565216, "loss": 0.4539, "step": 4269 }, { "epoch": 0.6684408265497809, "grad_norm": 1.0072883367538452, "learning_rate": 0.00010125509510869564, "loss": 0.3843, "step": 4270 }, { "epoch": 0.6685973700688791, "grad_norm": 1.591432809829712, "learning_rate": 0.00010124320652173912, "loss": 0.5936, "step": 4271 }, { "epoch": 0.6687539135879774, "grad_norm": 1.7080459594726562, "learning_rate": 0.0001012313179347826, "loss": 0.3912, "step": 4272 }, { "epoch": 0.6689104571070758, "grad_norm": 1.9229294061660767, 
"learning_rate": 0.00010121942934782607, "loss": 0.7911, "step": 4273 }, { "epoch": 0.6690670006261741, "grad_norm": 2.024768829345703, "learning_rate": 0.00010120754076086955, "loss": 0.485, "step": 4274 }, { "epoch": 0.6692235441452724, "grad_norm": 2.4698126316070557, "learning_rate": 0.00010119565217391303, "loss": 0.7331, "step": 4275 }, { "epoch": 0.6693800876643707, "grad_norm": 1.4480772018432617, "learning_rate": 0.0001011837635869565, "loss": 0.5701, "step": 4276 }, { "epoch": 0.669536631183469, "grad_norm": 2.1094462871551514, "learning_rate": 0.000101171875, "loss": 0.672, "step": 4277 }, { "epoch": 0.6696931747025673, "grad_norm": 1.5886313915252686, "learning_rate": 0.00010115998641304348, "loss": 0.7831, "step": 4278 }, { "epoch": 0.6698497182216656, "grad_norm": 2.211982011795044, "learning_rate": 0.00010114809782608695, "loss": 0.6997, "step": 4279 }, { "epoch": 0.6700062617407639, "grad_norm": 2.8656527996063232, "learning_rate": 0.00010113620923913043, "loss": 0.7549, "step": 4280 }, { "epoch": 0.6701628052598623, "grad_norm": 1.5182746648788452, "learning_rate": 0.00010112432065217391, "loss": 0.7386, "step": 4281 }, { "epoch": 0.6703193487789606, "grad_norm": 3.8582139015197754, "learning_rate": 0.00010111243206521739, "loss": 0.8915, "step": 4282 }, { "epoch": 0.6704758922980588, "grad_norm": 2.1936566829681396, "learning_rate": 0.00010110054347826087, "loss": 1.0321, "step": 4283 }, { "epoch": 0.6706324358171571, "grad_norm": 11.265280723571777, "learning_rate": 0.00010108865489130434, "loss": 1.1428, "step": 4284 }, { "epoch": 0.6707889793362555, "grad_norm": 2.072824239730835, "learning_rate": 0.00010107676630434782, "loss": 0.8987, "step": 4285 }, { "epoch": 0.6709455228553538, "grad_norm": 1.824108600616455, "learning_rate": 0.0001010648777173913, "loss": 0.6405, "step": 4286 }, { "epoch": 0.6711020663744521, "grad_norm": 4.802488803863525, "learning_rate": 0.00010105298913043478, "loss": 0.9921, "step": 4287 }, { "epoch": 
0.6712586098935505, "grad_norm": 1.7986516952514648, "learning_rate": 0.00010104110054347826, "loss": 1.0624, "step": 4288 }, { "epoch": 0.6714151534126487, "grad_norm": 3.3219714164733887, "learning_rate": 0.00010102921195652172, "loss": 0.9191, "step": 4289 }, { "epoch": 0.671571696931747, "grad_norm": 3.2097368240356445, "learning_rate": 0.0001010173233695652, "loss": 0.7893, "step": 4290 }, { "epoch": 0.6717282404508453, "grad_norm": 5.52312707901001, "learning_rate": 0.00010100543478260868, "loss": 1.3489, "step": 4291 }, { "epoch": 0.6718847839699437, "grad_norm": 6.544090270996094, "learning_rate": 0.00010099354619565216, "loss": 1.317, "step": 4292 }, { "epoch": 0.672041327489042, "grad_norm": 3.959998846054077, "learning_rate": 0.00010098165760869563, "loss": 1.6515, "step": 4293 }, { "epoch": 0.6721978710081402, "grad_norm": 3.915320634841919, "learning_rate": 0.00010096976902173911, "loss": 1.4696, "step": 4294 }, { "epoch": 0.6723544145272385, "grad_norm": 3.2768478393554688, "learning_rate": 0.00010095788043478259, "loss": 1.4117, "step": 4295 }, { "epoch": 0.6725109580463369, "grad_norm": 2.2501237392425537, "learning_rate": 0.00010094599184782607, "loss": 1.0882, "step": 4296 }, { "epoch": 0.6726675015654352, "grad_norm": 3.655709981918335, "learning_rate": 0.00010093410326086956, "loss": 1.2169, "step": 4297 }, { "epoch": 0.6728240450845335, "grad_norm": 2.5137290954589844, "learning_rate": 0.00010092221467391304, "loss": 0.708, "step": 4298 }, { "epoch": 0.6729805886036319, "grad_norm": 2.811969757080078, "learning_rate": 0.00010091032608695652, "loss": 1.1132, "step": 4299 }, { "epoch": 0.6731371321227301, "grad_norm": 3.5562970638275146, "learning_rate": 0.0001008984375, "loss": 0.8131, "step": 4300 }, { "epoch": 0.6732936756418284, "grad_norm": 1.0938668251037598, "learning_rate": 0.00010088654891304347, "loss": 0.395, "step": 4301 }, { "epoch": 0.6734502191609267, "grad_norm": 0.4825965166091919, "learning_rate": 0.00010087466032608695, "loss": 
0.2544, "step": 4302 }, { "epoch": 0.6736067626800251, "grad_norm": 1.0952693223953247, "learning_rate": 0.00010086277173913043, "loss": 0.4099, "step": 4303 }, { "epoch": 0.6737633061991234, "grad_norm": 0.8391582369804382, "learning_rate": 0.00010085088315217391, "loss": 0.3382, "step": 4304 }, { "epoch": 0.6739198497182217, "grad_norm": 0.6425570249557495, "learning_rate": 0.00010083899456521739, "loss": 0.4596, "step": 4305 }, { "epoch": 0.6740763932373199, "grad_norm": 0.8869735598564148, "learning_rate": 0.00010082710597826086, "loss": 0.3189, "step": 4306 }, { "epoch": 0.6742329367564183, "grad_norm": 0.9811785221099854, "learning_rate": 0.00010081521739130434, "loss": 0.5247, "step": 4307 }, { "epoch": 0.6743894802755166, "grad_norm": 0.6160823106765747, "learning_rate": 0.00010080332880434782, "loss": 0.3062, "step": 4308 }, { "epoch": 0.6745460237946149, "grad_norm": 0.81550133228302, "learning_rate": 0.0001007914402173913, "loss": 0.324, "step": 4309 }, { "epoch": 0.6747025673137133, "grad_norm": 0.9266514778137207, "learning_rate": 0.00010077955163043478, "loss": 0.438, "step": 4310 }, { "epoch": 0.6748591108328115, "grad_norm": 1.1824613809585571, "learning_rate": 0.00010076766304347826, "loss": 0.3266, "step": 4311 }, { "epoch": 0.6750156543519098, "grad_norm": 0.9998323917388916, "learning_rate": 0.00010075577445652172, "loss": 0.4269, "step": 4312 }, { "epoch": 0.6751721978710081, "grad_norm": 1.1617019176483154, "learning_rate": 0.0001007438858695652, "loss": 0.3766, "step": 4313 }, { "epoch": 0.6753287413901065, "grad_norm": 1.702675461769104, "learning_rate": 0.00010073199728260868, "loss": 0.4599, "step": 4314 }, { "epoch": 0.6754852849092048, "grad_norm": 1.5458407402038574, "learning_rate": 0.00010072010869565215, "loss": 0.5521, "step": 4315 }, { "epoch": 0.6756418284283031, "grad_norm": 2.5902888774871826, "learning_rate": 0.00010070822010869563, "loss": 0.7884, "step": 4316 }, { "epoch": 0.6757983719474013, "grad_norm": 1.6311050653457642, 
"learning_rate": 0.00010069633152173912, "loss": 0.4385, "step": 4317 }, { "epoch": 0.6759549154664997, "grad_norm": 1.8552831411361694, "learning_rate": 0.0001006844429347826, "loss": 0.4185, "step": 4318 }, { "epoch": 0.676111458985598, "grad_norm": 1.646878719329834, "learning_rate": 0.00010067255434782608, "loss": 0.5432, "step": 4319 }, { "epoch": 0.6762680025046963, "grad_norm": 1.4758496284484863, "learning_rate": 0.00010066066576086956, "loss": 0.5823, "step": 4320 }, { "epoch": 0.6764245460237946, "grad_norm": 1.8455051183700562, "learning_rate": 0.00010064877717391304, "loss": 0.8182, "step": 4321 }, { "epoch": 0.676581089542893, "grad_norm": 1.9887446165084839, "learning_rate": 0.00010063688858695651, "loss": 0.5701, "step": 4322 }, { "epoch": 0.6767376330619912, "grad_norm": 1.6858872175216675, "learning_rate": 0.00010062499999999999, "loss": 0.546, "step": 4323 }, { "epoch": 0.6768941765810895, "grad_norm": 2.600813865661621, "learning_rate": 0.00010061311141304347, "loss": 0.8806, "step": 4324 }, { "epoch": 0.6770507201001879, "grad_norm": 1.989012360572815, "learning_rate": 0.00010060122282608695, "loss": 0.8151, "step": 4325 }, { "epoch": 0.6772072636192862, "grad_norm": 2.1735410690307617, "learning_rate": 0.00010058933423913043, "loss": 0.563, "step": 4326 }, { "epoch": 0.6773638071383845, "grad_norm": 3.0469369888305664, "learning_rate": 0.0001005774456521739, "loss": 0.7712, "step": 4327 }, { "epoch": 0.6775203506574827, "grad_norm": 1.1242660284042358, "learning_rate": 0.00010056555706521738, "loss": 0.5332, "step": 4328 }, { "epoch": 0.6776768941765811, "grad_norm": 3.0852997303009033, "learning_rate": 0.00010055366847826086, "loss": 0.6027, "step": 4329 }, { "epoch": 0.6778334376956794, "grad_norm": 3.1933093070983887, "learning_rate": 0.00010054177989130434, "loss": 0.61, "step": 4330 }, { "epoch": 0.6779899812147777, "grad_norm": 2.741356134414673, "learning_rate": 0.00010052989130434782, "loss": 0.7609, "step": 4331 }, { "epoch": 
0.678146524733876, "grad_norm": 1.909173846244812, "learning_rate": 0.0001005180027173913, "loss": 0.8195, "step": 4332 }, { "epoch": 0.6783030682529744, "grad_norm": 4.567829132080078, "learning_rate": 0.00010050611413043477, "loss": 1.2149, "step": 4333 }, { "epoch": 0.6784596117720726, "grad_norm": 2.5052363872528076, "learning_rate": 0.00010049422554347827, "loss": 0.7616, "step": 4334 }, { "epoch": 0.6786161552911709, "grad_norm": 3.4371678829193115, "learning_rate": 0.00010048233695652172, "loss": 1.0369, "step": 4335 }, { "epoch": 0.6787726988102692, "grad_norm": 3.023883819580078, "learning_rate": 0.0001004704483695652, "loss": 1.0625, "step": 4336 }, { "epoch": 0.6789292423293676, "grad_norm": 3.775174617767334, "learning_rate": 0.00010045855978260869, "loss": 1.4147, "step": 4337 }, { "epoch": 0.6790857858484659, "grad_norm": 4.192546844482422, "learning_rate": 0.00010044667119565217, "loss": 1.6406, "step": 4338 }, { "epoch": 0.6792423293675642, "grad_norm": 2.5800366401672363, "learning_rate": 0.00010043478260869564, "loss": 1.127, "step": 4339 }, { "epoch": 0.6793988728866625, "grad_norm": 3.5980615615844727, "learning_rate": 0.00010042289402173912, "loss": 1.5735, "step": 4340 }, { "epoch": 0.6795554164057608, "grad_norm": 2.3579885959625244, "learning_rate": 0.0001004110054347826, "loss": 0.6896, "step": 4341 }, { "epoch": 0.6797119599248591, "grad_norm": 5.482131004333496, "learning_rate": 0.00010039911684782608, "loss": 1.1094, "step": 4342 }, { "epoch": 0.6798685034439574, "grad_norm": 2.0575878620147705, "learning_rate": 0.00010038722826086956, "loss": 1.0379, "step": 4343 }, { "epoch": 0.6800250469630558, "grad_norm": 2.3809876441955566, "learning_rate": 0.00010037533967391303, "loss": 1.2391, "step": 4344 }, { "epoch": 0.680181590482154, "grad_norm": 3.3228299617767334, "learning_rate": 0.00010036345108695651, "loss": 1.0506, "step": 4345 }, { "epoch": 0.6803381340012523, "grad_norm": 1.1489919424057007, "learning_rate": 0.00010035156249999999, 
"loss": 0.4288, "step": 4346 }, { "epoch": 0.6804946775203506, "grad_norm": 3.4122703075408936, "learning_rate": 0.00010033967391304347, "loss": 0.8821, "step": 4347 }, { "epoch": 0.680651221039449, "grad_norm": 3.018977642059326, "learning_rate": 0.00010032778532608695, "loss": 0.4416, "step": 4348 }, { "epoch": 0.6808077645585473, "grad_norm": 2.6489129066467285, "learning_rate": 0.00010031589673913042, "loss": 0.9176, "step": 4349 }, { "epoch": 0.6809643080776456, "grad_norm": 2.834998607635498, "learning_rate": 0.0001003040081521739, "loss": 1.2088, "step": 4350 }, { "epoch": 0.6811208515967438, "grad_norm": 0.5988664627075195, "learning_rate": 0.00010029211956521738, "loss": 0.308, "step": 4351 }, { "epoch": 0.6812773951158422, "grad_norm": 0.7443435192108154, "learning_rate": 0.00010028023097826086, "loss": 0.3281, "step": 4352 }, { "epoch": 0.6814339386349405, "grad_norm": 0.7624204158782959, "learning_rate": 0.00010026834239130434, "loss": 0.3599, "step": 4353 }, { "epoch": 0.6815904821540388, "grad_norm": 1.2347975969314575, "learning_rate": 0.00010025645380434783, "loss": 0.3831, "step": 4354 }, { "epoch": 0.6817470256731372, "grad_norm": 0.6346348524093628, "learning_rate": 0.00010024456521739131, "loss": 0.2688, "step": 4355 }, { "epoch": 0.6819035691922355, "grad_norm": 0.8639354705810547, "learning_rate": 0.00010023267663043479, "loss": 0.373, "step": 4356 }, { "epoch": 0.6820601127113337, "grad_norm": 0.9461267590522766, "learning_rate": 0.00010022078804347826, "loss": 0.509, "step": 4357 }, { "epoch": 0.682216656230432, "grad_norm": 1.377577304840088, "learning_rate": 0.00010020889945652173, "loss": 0.4116, "step": 4358 }, { "epoch": 0.6823731997495304, "grad_norm": 0.8991801142692566, "learning_rate": 0.0001001970108695652, "loss": 0.421, "step": 4359 }, { "epoch": 0.6825297432686287, "grad_norm": 1.285883903503418, "learning_rate": 0.00010018512228260868, "loss": 0.5022, "step": 4360 }, { "epoch": 0.682686286787727, "grad_norm": 
0.9156930446624756, "learning_rate": 0.00010017323369565216, "loss": 0.2642, "step": 4361 }, { "epoch": 0.6828428303068252, "grad_norm": 1.0816419124603271, "learning_rate": 0.00010016134510869564, "loss": 0.2817, "step": 4362 }, { "epoch": 0.6829993738259236, "grad_norm": 1.8566310405731201, "learning_rate": 0.00010014945652173912, "loss": 0.739, "step": 4363 }, { "epoch": 0.6831559173450219, "grad_norm": 1.2524845600128174, "learning_rate": 0.0001001375679347826, "loss": 0.5162, "step": 4364 }, { "epoch": 0.6833124608641202, "grad_norm": 0.951764702796936, "learning_rate": 0.00010012567934782608, "loss": 0.3261, "step": 4365 }, { "epoch": 0.6834690043832186, "grad_norm": 0.9953811764717102, "learning_rate": 0.00010011379076086955, "loss": 0.3971, "step": 4366 }, { "epoch": 0.6836255479023169, "grad_norm": 1.3510175943374634, "learning_rate": 0.00010010190217391303, "loss": 0.5121, "step": 4367 }, { "epoch": 0.6837820914214151, "grad_norm": 1.8387508392333984, "learning_rate": 0.00010009001358695651, "loss": 0.3831, "step": 4368 }, { "epoch": 0.6839386349405134, "grad_norm": 1.9706523418426514, "learning_rate": 0.00010007812499999999, "loss": 0.576, "step": 4369 }, { "epoch": 0.6840951784596118, "grad_norm": 2.1403725147247314, "learning_rate": 0.00010006623641304347, "loss": 0.5415, "step": 4370 }, { "epoch": 0.6842517219787101, "grad_norm": 1.8154163360595703, "learning_rate": 0.00010005434782608694, "loss": 0.4989, "step": 4371 }, { "epoch": 0.6844082654978084, "grad_norm": 1.9493091106414795, "learning_rate": 0.00010004245923913042, "loss": 0.7995, "step": 4372 }, { "epoch": 0.6845648090169068, "grad_norm": 1.4493317604064941, "learning_rate": 0.0001000305706521739, "loss": 0.4617, "step": 4373 }, { "epoch": 0.684721352536005, "grad_norm": 1.3593460321426392, "learning_rate": 0.00010001868206521739, "loss": 0.541, "step": 4374 }, { "epoch": 0.6848778960551033, "grad_norm": 1.7882930040359497, "learning_rate": 0.00010000679347826087, "loss": 0.7429, "step": 
4375 }, { "epoch": 0.6850344395742016, "grad_norm": 1.312345266342163, "learning_rate": 9.999490489130435e-05, "loss": 0.549, "step": 4376 }, { "epoch": 0.6851909830933, "grad_norm": 6.289135456085205, "learning_rate": 9.998301630434783e-05, "loss": 0.6922, "step": 4377 }, { "epoch": 0.6853475266123983, "grad_norm": 2.7427756786346436, "learning_rate": 9.99711277173913e-05, "loss": 0.8901, "step": 4378 }, { "epoch": 0.6855040701314965, "grad_norm": 1.9230971336364746, "learning_rate": 9.995923913043478e-05, "loss": 0.7833, "step": 4379 }, { "epoch": 0.6856606136505948, "grad_norm": 3.8026487827301025, "learning_rate": 9.994735054347826e-05, "loss": 0.7645, "step": 4380 }, { "epoch": 0.6858171571696932, "grad_norm": 2.514539957046509, "learning_rate": 9.993546195652173e-05, "loss": 0.6103, "step": 4381 }, { "epoch": 0.6859737006887915, "grad_norm": 2.6352412700653076, "learning_rate": 9.99235733695652e-05, "loss": 0.5901, "step": 4382 }, { "epoch": 0.6861302442078898, "grad_norm": 3.2987895011901855, "learning_rate": 9.991168478260868e-05, "loss": 0.8546, "step": 4383 }, { "epoch": 0.6862867877269881, "grad_norm": 3.0919618606567383, "learning_rate": 9.989979619565216e-05, "loss": 1.4027, "step": 4384 }, { "epoch": 0.6864433312460864, "grad_norm": 2.1543827056884766, "learning_rate": 9.988790760869564e-05, "loss": 0.9585, "step": 4385 }, { "epoch": 0.6865998747651847, "grad_norm": 3.9680240154266357, "learning_rate": 9.987601902173912e-05, "loss": 1.4308, "step": 4386 }, { "epoch": 0.686756418284283, "grad_norm": 2.1025686264038086, "learning_rate": 9.98641304347826e-05, "loss": 1.0541, "step": 4387 }, { "epoch": 0.6869129618033814, "grad_norm": 2.9918055534362793, "learning_rate": 9.985224184782607e-05, "loss": 1.0194, "step": 4388 }, { "epoch": 0.6870695053224797, "grad_norm": 3.382951259613037, "learning_rate": 9.984035326086955e-05, "loss": 1.0145, "step": 4389 }, { "epoch": 0.687226048841578, "grad_norm": 2.841057538986206, "learning_rate": 
9.982846467391303e-05, "loss": 0.7753, "step": 4390 }, { "epoch": 0.6873825923606762, "grad_norm": 4.426357269287109, "learning_rate": 9.981657608695651e-05, "loss": 1.2983, "step": 4391 }, { "epoch": 0.6875391358797746, "grad_norm": 2.436708450317383, "learning_rate": 9.980468749999999e-05, "loss": 1.1402, "step": 4392 }, { "epoch": 0.6876956793988729, "grad_norm": 3.7287349700927734, "learning_rate": 9.979279891304346e-05, "loss": 1.9057, "step": 4393 }, { "epoch": 0.6878522229179712, "grad_norm": 4.84196138381958, "learning_rate": 9.978091032608696e-05, "loss": 1.4566, "step": 4394 }, { "epoch": 0.6880087664370695, "grad_norm": 4.003749847412109, "learning_rate": 9.976902173913043e-05, "loss": 1.6219, "step": 4395 }, { "epoch": 0.6881653099561679, "grad_norm": 5.028621673583984, "learning_rate": 9.975713315217391e-05, "loss": 1.4812, "step": 4396 }, { "epoch": 0.6883218534752661, "grad_norm": 2.6154534816741943, "learning_rate": 9.974524456521739e-05, "loss": 1.2168, "step": 4397 }, { "epoch": 0.6884783969943644, "grad_norm": 4.652947425842285, "learning_rate": 9.973335597826087e-05, "loss": 1.3638, "step": 4398 }, { "epoch": 0.6886349405134627, "grad_norm": 2.7821924686431885, "learning_rate": 9.972146739130435e-05, "loss": 1.0817, "step": 4399 }, { "epoch": 0.6887914840325611, "grad_norm": 3.387585401535034, "learning_rate": 9.970957880434782e-05, "loss": 1.1317, "step": 4400 }, { "epoch": 0.6889480275516594, "grad_norm": 0.9512802362442017, "learning_rate": 9.96976902173913e-05, "loss": 0.8032, "step": 4401 }, { "epoch": 0.6891045710707576, "grad_norm": 0.5512304902076721, "learning_rate": 9.968580163043478e-05, "loss": 0.247, "step": 4402 }, { "epoch": 0.689261114589856, "grad_norm": 0.6910927891731262, "learning_rate": 9.967391304347826e-05, "loss": 0.33, "step": 4403 }, { "epoch": 0.6894176581089543, "grad_norm": 0.5225275158882141, "learning_rate": 9.966202445652172e-05, "loss": 0.2847, "step": 4404 }, { "epoch": 0.6895742016280526, "grad_norm": 
0.6632809638977051, "learning_rate": 9.96501358695652e-05, "loss": 0.3569, "step": 4405 }, { "epoch": 0.6897307451471509, "grad_norm": 0.7125046849250793, "learning_rate": 9.963824728260868e-05, "loss": 0.2982, "step": 4406 }, { "epoch": 0.6898872886662493, "grad_norm": 0.6289424896240234, "learning_rate": 9.962635869565216e-05, "loss": 0.2884, "step": 4407 }, { "epoch": 0.6900438321853475, "grad_norm": 0.8851990103721619, "learning_rate": 9.961447010869564e-05, "loss": 0.361, "step": 4408 }, { "epoch": 0.6902003757044458, "grad_norm": 2.231055974960327, "learning_rate": 9.960258152173911e-05, "loss": 0.4356, "step": 4409 }, { "epoch": 0.6903569192235441, "grad_norm": 0.8509057760238647, "learning_rate": 9.959069293478259e-05, "loss": 0.3017, "step": 4410 }, { "epoch": 0.6905134627426425, "grad_norm": 0.82469642162323, "learning_rate": 9.957880434782607e-05, "loss": 0.3025, "step": 4411 }, { "epoch": 0.6906700062617408, "grad_norm": 1.0696184635162354, "learning_rate": 9.956691576086955e-05, "loss": 0.418, "step": 4412 }, { "epoch": 0.6908265497808391, "grad_norm": 1.2558475732803345, "learning_rate": 9.955502717391303e-05, "loss": 0.4, "step": 4413 }, { "epoch": 0.6909830932999373, "grad_norm": 1.0123722553253174, "learning_rate": 9.954313858695652e-05, "loss": 0.5348, "step": 4414 }, { "epoch": 0.6911396368190357, "grad_norm": 1.4655754566192627, "learning_rate": 9.953125e-05, "loss": 0.3455, "step": 4415 }, { "epoch": 0.691296180338134, "grad_norm": 0.885408878326416, "learning_rate": 9.951936141304347e-05, "loss": 0.3477, "step": 4416 }, { "epoch": 0.6914527238572323, "grad_norm": 2.5597846508026123, "learning_rate": 9.950747282608695e-05, "loss": 0.6389, "step": 4417 }, { "epoch": 0.6916092673763307, "grad_norm": 1.184470534324646, "learning_rate": 9.949558423913043e-05, "loss": 0.4649, "step": 4418 }, { "epoch": 0.6917658108954289, "grad_norm": 2.5143356323242188, "learning_rate": 9.948369565217391e-05, "loss": 0.5667, "step": 4419 }, { "epoch": 
0.6919223544145272, "grad_norm": 2.872513771057129, "learning_rate": 9.947180706521739e-05, "loss": 0.8843, "step": 4420 }, { "epoch": 0.6920788979336255, "grad_norm": 1.6400690078735352, "learning_rate": 9.945991847826087e-05, "loss": 0.5198, "step": 4421 }, { "epoch": 0.6922354414527239, "grad_norm": 1.2501685619354248, "learning_rate": 9.944802989130434e-05, "loss": 0.6243, "step": 4422 }, { "epoch": 0.6923919849718222, "grad_norm": 3.3147220611572266, "learning_rate": 9.943614130434782e-05, "loss": 0.5537, "step": 4423 }, { "epoch": 0.6925485284909205, "grad_norm": 2.0223898887634277, "learning_rate": 9.94242527173913e-05, "loss": 0.7239, "step": 4424 }, { "epoch": 0.6927050720100187, "grad_norm": 1.2683730125427246, "learning_rate": 9.941236413043478e-05, "loss": 0.5735, "step": 4425 }, { "epoch": 0.6928616155291171, "grad_norm": 2.2776870727539062, "learning_rate": 9.940047554347826e-05, "loss": 0.8064, "step": 4426 }, { "epoch": 0.6930181590482154, "grad_norm": 1.8946512937545776, "learning_rate": 9.938858695652172e-05, "loss": 0.57, "step": 4427 }, { "epoch": 0.6931747025673137, "grad_norm": 2.4030535221099854, "learning_rate": 9.93766983695652e-05, "loss": 0.9162, "step": 4428 }, { "epoch": 0.6933312460864121, "grad_norm": 1.8622697591781616, "learning_rate": 9.936480978260868e-05, "loss": 0.7033, "step": 4429 }, { "epoch": 0.6934877896055104, "grad_norm": 2.677424192428589, "learning_rate": 9.935292119565216e-05, "loss": 1.1493, "step": 4430 }, { "epoch": 0.6936443331246086, "grad_norm": 1.8469141721725464, "learning_rate": 9.934103260869563e-05, "loss": 1.0302, "step": 4431 }, { "epoch": 0.6938008766437069, "grad_norm": 2.177751302719116, "learning_rate": 9.932914402173911e-05, "loss": 0.9587, "step": 4432 }, { "epoch": 0.6939574201628053, "grad_norm": 2.552072048187256, "learning_rate": 9.931725543478259e-05, "loss": 0.7986, "step": 4433 }, { "epoch": 0.6941139636819036, "grad_norm": 3.6106319427490234, "learning_rate": 9.930536684782608e-05, "loss": 
1.2489, "step": 4434 }, { "epoch": 0.6942705072010019, "grad_norm": 3.796252965927124, "learning_rate": 9.929347826086956e-05, "loss": 0.9082, "step": 4435 }, { "epoch": 0.6944270507201001, "grad_norm": 2.5933096408843994, "learning_rate": 9.928158967391304e-05, "loss": 1.2087, "step": 4436 }, { "epoch": 0.6945835942391985, "grad_norm": 2.1912710666656494, "learning_rate": 9.926970108695652e-05, "loss": 1.1423, "step": 4437 }, { "epoch": 0.6947401377582968, "grad_norm": 3.956108331680298, "learning_rate": 9.92578125e-05, "loss": 1.0783, "step": 4438 }, { "epoch": 0.6948966812773951, "grad_norm": 2.709578037261963, "learning_rate": 9.924592391304347e-05, "loss": 0.6828, "step": 4439 }, { "epoch": 0.6950532247964935, "grad_norm": 3.7104883193969727, "learning_rate": 9.923403532608695e-05, "loss": 0.9035, "step": 4440 }, { "epoch": 0.6952097683155918, "grad_norm": 6.496031761169434, "learning_rate": 9.922214673913043e-05, "loss": 1.2438, "step": 4441 }, { "epoch": 0.69536631183469, "grad_norm": 2.21712064743042, "learning_rate": 9.921025815217391e-05, "loss": 1.0145, "step": 4442 }, { "epoch": 0.6955228553537883, "grad_norm": 2.3093605041503906, "learning_rate": 9.919836956521739e-05, "loss": 1.6454, "step": 4443 }, { "epoch": 0.6956793988728867, "grad_norm": 2.465881824493408, "learning_rate": 9.918648097826086e-05, "loss": 1.7108, "step": 4444 }, { "epoch": 0.695835942391985, "grad_norm": 3.6095521450042725, "learning_rate": 9.917459239130434e-05, "loss": 0.9712, "step": 4445 }, { "epoch": 0.6959924859110833, "grad_norm": 2.040978193283081, "learning_rate": 9.916270380434782e-05, "loss": 0.7853, "step": 4446 }, { "epoch": 0.6961490294301816, "grad_norm": 1.6060633659362793, "learning_rate": 9.91508152173913e-05, "loss": 0.7098, "step": 4447 }, { "epoch": 0.6963055729492799, "grad_norm": 3.9329674243927, "learning_rate": 9.913892663043478e-05, "loss": 0.8212, "step": 4448 }, { "epoch": 0.6964621164683782, "grad_norm": 2.2458059787750244, "learning_rate": 
9.912703804347825e-05, "loss": 0.7766, "step": 4449 }, { "epoch": 0.6966186599874765, "grad_norm": 2.8083086013793945, "learning_rate": 9.911514945652172e-05, "loss": 1.2474, "step": 4450 }, { "epoch": 0.6967752035065748, "grad_norm": 0.7622883319854736, "learning_rate": 9.91032608695652e-05, "loss": 0.3838, "step": 4451 }, { "epoch": 0.6969317470256732, "grad_norm": 0.9308403134346008, "learning_rate": 9.909137228260868e-05, "loss": 0.2251, "step": 4452 }, { "epoch": 0.6970882905447714, "grad_norm": 0.642752468585968, "learning_rate": 9.907948369565215e-05, "loss": 0.2604, "step": 4453 }, { "epoch": 0.6972448340638697, "grad_norm": 0.8565986752510071, "learning_rate": 9.906759510869564e-05, "loss": 0.4108, "step": 4454 }, { "epoch": 0.697401377582968, "grad_norm": 1.0320876836776733, "learning_rate": 9.905570652173912e-05, "loss": 0.3123, "step": 4455 }, { "epoch": 0.6975579211020664, "grad_norm": 0.8842648267745972, "learning_rate": 9.90438179347826e-05, "loss": 0.3007, "step": 4456 }, { "epoch": 0.6977144646211647, "grad_norm": 0.9582691192626953, "learning_rate": 9.903192934782608e-05, "loss": 0.4886, "step": 4457 }, { "epoch": 0.697871008140263, "grad_norm": 0.9258428812026978, "learning_rate": 9.902004076086956e-05, "loss": 0.2272, "step": 4458 }, { "epoch": 0.6980275516593613, "grad_norm": 0.9868488907814026, "learning_rate": 9.900815217391304e-05, "loss": 0.4479, "step": 4459 }, { "epoch": 0.6981840951784596, "grad_norm": 0.7826197147369385, "learning_rate": 9.899626358695651e-05, "loss": 0.3851, "step": 4460 }, { "epoch": 0.6983406386975579, "grad_norm": 1.1949875354766846, "learning_rate": 9.898437499999999e-05, "loss": 0.3225, "step": 4461 }, { "epoch": 0.6984971822166562, "grad_norm": 1.5966954231262207, "learning_rate": 9.897248641304347e-05, "loss": 0.306, "step": 4462 }, { "epoch": 0.6986537257357546, "grad_norm": 1.2998764514923096, "learning_rate": 9.896059782608695e-05, "loss": 0.2981, "step": 4463 }, { "epoch": 0.6988102692548529, "grad_norm": 
1.2707816362380981, "learning_rate": 9.894870923913043e-05, "loss": 0.4577, "step": 4464 }, { "epoch": 0.6989668127739511, "grad_norm": 0.8326631188392639, "learning_rate": 9.89368206521739e-05, "loss": 0.2874, "step": 4465 }, { "epoch": 0.6991233562930494, "grad_norm": 1.2825976610183716, "learning_rate": 9.892493206521738e-05, "loss": 0.5716, "step": 4466 }, { "epoch": 0.6992798998121478, "grad_norm": 1.3456063270568848, "learning_rate": 9.891304347826086e-05, "loss": 0.4833, "step": 4467 }, { "epoch": 0.6994364433312461, "grad_norm": 1.424704670906067, "learning_rate": 9.890115489130434e-05, "loss": 0.4797, "step": 4468 }, { "epoch": 0.6995929868503444, "grad_norm": 2.2309422492980957, "learning_rate": 9.888926630434782e-05, "loss": 0.9694, "step": 4469 }, { "epoch": 0.6997495303694427, "grad_norm": 2.4052908420562744, "learning_rate": 9.88773777173913e-05, "loss": 0.5425, "step": 4470 }, { "epoch": 0.699906073888541, "grad_norm": 4.000556468963623, "learning_rate": 9.886548913043479e-05, "loss": 0.6847, "step": 4471 }, { "epoch": 0.7000626174076393, "grad_norm": 1.6493200063705444, "learning_rate": 9.885360054347827e-05, "loss": 0.8894, "step": 4472 }, { "epoch": 0.7002191609267376, "grad_norm": 3.143533706665039, "learning_rate": 9.884171195652172e-05, "loss": 0.4887, "step": 4473 }, { "epoch": 0.700375704445836, "grad_norm": 1.381941795349121, "learning_rate": 9.882982336956521e-05, "loss": 0.3662, "step": 4474 }, { "epoch": 0.7005322479649343, "grad_norm": 2.6353306770324707, "learning_rate": 9.881793478260869e-05, "loss": 0.7537, "step": 4475 }, { "epoch": 0.7006887914840325, "grad_norm": 1.832458257675171, "learning_rate": 9.880604619565216e-05, "loss": 1.1365, "step": 4476 }, { "epoch": 0.7008453350031308, "grad_norm": 2.999711275100708, "learning_rate": 9.879415760869564e-05, "loss": 1.2519, "step": 4477 }, { "epoch": 0.7010018785222292, "grad_norm": 2.437774181365967, "learning_rate": 9.878226902173912e-05, "loss": 0.6373, "step": 4478 }, { "epoch": 
0.7011584220413275, "grad_norm": 2.2450287342071533, "learning_rate": 9.87703804347826e-05, "loss": 0.5706, "step": 4479 }, { "epoch": 0.7013149655604258, "grad_norm": 2.0681262016296387, "learning_rate": 9.875849184782608e-05, "loss": 0.7618, "step": 4480 }, { "epoch": 0.7014715090795242, "grad_norm": 4.230677604675293, "learning_rate": 9.874660326086956e-05, "loss": 0.8214, "step": 4481 }, { "epoch": 0.7016280525986224, "grad_norm": 2.5353498458862305, "learning_rate": 9.873471467391303e-05, "loss": 1.1104, "step": 4482 }, { "epoch": 0.7017845961177207, "grad_norm": 3.380772829055786, "learning_rate": 9.872282608695651e-05, "loss": 1.3174, "step": 4483 }, { "epoch": 0.701941139636819, "grad_norm": 1.663814902305603, "learning_rate": 9.871093749999999e-05, "loss": 0.6088, "step": 4484 }, { "epoch": 0.7020976831559174, "grad_norm": 3.1651248931884766, "learning_rate": 9.869904891304347e-05, "loss": 1.0758, "step": 4485 }, { "epoch": 0.7022542266750157, "grad_norm": 2.3384368419647217, "learning_rate": 9.868716032608695e-05, "loss": 1.1637, "step": 4486 }, { "epoch": 0.7024107701941139, "grad_norm": 3.4036343097686768, "learning_rate": 9.867527173913042e-05, "loss": 0.7514, "step": 4487 }, { "epoch": 0.7025673137132122, "grad_norm": 3.5142600536346436, "learning_rate": 9.86633831521739e-05, "loss": 0.8931, "step": 4488 }, { "epoch": 0.7027238572323106, "grad_norm": 2.923440933227539, "learning_rate": 9.865149456521738e-05, "loss": 0.5095, "step": 4489 }, { "epoch": 0.7028804007514089, "grad_norm": 2.0722482204437256, "learning_rate": 9.863960597826086e-05, "loss": 1.0427, "step": 4490 }, { "epoch": 0.7030369442705072, "grad_norm": 2.0248806476593018, "learning_rate": 9.862771739130435e-05, "loss": 1.338, "step": 4491 }, { "epoch": 0.7031934877896056, "grad_norm": 3.2983057498931885, "learning_rate": 9.861582880434783e-05, "loss": 1.2273, "step": 4492 }, { "epoch": 0.7033500313087038, "grad_norm": 2.466486692428589, "learning_rate": 9.86039402173913e-05, "loss": 
0.8612, "step": 4493 }, { "epoch": 0.7035065748278021, "grad_norm": 2.5930309295654297, "learning_rate": 9.859205163043478e-05, "loss": 1.1094, "step": 4494 }, { "epoch": 0.7036631183469004, "grad_norm": 5.906215667724609, "learning_rate": 9.858016304347826e-05, "loss": 0.7641, "step": 4495 }, { "epoch": 0.7038196618659988, "grad_norm": 2.96946120262146, "learning_rate": 9.856827445652173e-05, "loss": 1.0266, "step": 4496 }, { "epoch": 0.7039762053850971, "grad_norm": 2.0124101638793945, "learning_rate": 9.85563858695652e-05, "loss": 1.0122, "step": 4497 }, { "epoch": 0.7041327489041954, "grad_norm": 2.586094617843628, "learning_rate": 9.854449728260868e-05, "loss": 0.8653, "step": 4498 }, { "epoch": 0.7042892924232936, "grad_norm": 1.836812138557434, "learning_rate": 9.853260869565216e-05, "loss": 1.075, "step": 4499 }, { "epoch": 0.704445835942392, "grad_norm": 2.102390766143799, "learning_rate": 9.852072010869564e-05, "loss": 0.7667, "step": 4500 }, { "epoch": 0.7046023794614903, "grad_norm": 0.5810683369636536, "learning_rate": 9.850883152173912e-05, "loss": 0.2116, "step": 4501 }, { "epoch": 0.7047589229805886, "grad_norm": 0.703874945640564, "learning_rate": 9.84969429347826e-05, "loss": 0.3193, "step": 4502 }, { "epoch": 0.704915466499687, "grad_norm": 0.6528502106666565, "learning_rate": 9.848505434782607e-05, "loss": 0.3239, "step": 4503 }, { "epoch": 0.7050720100187852, "grad_norm": 0.7256782054901123, "learning_rate": 9.847316576086955e-05, "loss": 0.2615, "step": 4504 }, { "epoch": 0.7052285535378835, "grad_norm": 0.6598588228225708, "learning_rate": 9.846127717391303e-05, "loss": 0.3215, "step": 4505 }, { "epoch": 0.7053850970569818, "grad_norm": 0.744325578212738, "learning_rate": 9.844938858695651e-05, "loss": 0.3104, "step": 4506 }, { "epoch": 0.7055416405760802, "grad_norm": 0.9657143950462341, "learning_rate": 9.843749999999999e-05, "loss": 0.3323, "step": 4507 }, { "epoch": 0.7056981840951785, "grad_norm": 0.9187541007995605, "learning_rate": 
9.842561141304347e-05, "loss": 0.434, "step": 4508 }, { "epoch": 0.7058547276142768, "grad_norm": 2.6070563793182373, "learning_rate": 9.841372282608694e-05, "loss": 0.3943, "step": 4509 }, { "epoch": 0.706011271133375, "grad_norm": 0.7187080383300781, "learning_rate": 9.840183423913042e-05, "loss": 0.3567, "step": 4510 }, { "epoch": 0.7061678146524734, "grad_norm": 0.9720143675804138, "learning_rate": 9.838994565217391e-05, "loss": 0.3952, "step": 4511 }, { "epoch": 0.7063243581715717, "grad_norm": 0.8199292421340942, "learning_rate": 9.837805706521739e-05, "loss": 0.3375, "step": 4512 }, { "epoch": 0.70648090169067, "grad_norm": 0.5709250569343567, "learning_rate": 9.836616847826087e-05, "loss": 0.3288, "step": 4513 }, { "epoch": 0.7066374452097683, "grad_norm": 1.560362458229065, "learning_rate": 9.835427989130435e-05, "loss": 0.6452, "step": 4514 }, { "epoch": 0.7067939887288667, "grad_norm": 1.3299816846847534, "learning_rate": 9.834239130434783e-05, "loss": 0.4671, "step": 4515 }, { "epoch": 0.7069505322479649, "grad_norm": 2.3403170108795166, "learning_rate": 9.83305027173913e-05, "loss": 0.5213, "step": 4516 }, { "epoch": 0.7071070757670632, "grad_norm": 1.0421926975250244, "learning_rate": 9.831861413043478e-05, "loss": 0.5177, "step": 4517 }, { "epoch": 0.7072636192861615, "grad_norm": 1.0902512073516846, "learning_rate": 9.830672554347826e-05, "loss": 0.4042, "step": 4518 }, { "epoch": 0.7074201628052599, "grad_norm": 1.5383235216140747, "learning_rate": 9.829483695652173e-05, "loss": 0.5264, "step": 4519 }, { "epoch": 0.7075767063243582, "grad_norm": 3.8564977645874023, "learning_rate": 9.82829483695652e-05, "loss": 0.8137, "step": 4520 }, { "epoch": 0.7077332498434565, "grad_norm": 1.729108452796936, "learning_rate": 9.827105978260868e-05, "loss": 0.9685, "step": 4521 }, { "epoch": 0.7078897933625548, "grad_norm": 2.284099817276001, "learning_rate": 9.825917119565216e-05, "loss": 0.6307, "step": 4522 }, { "epoch": 0.7080463368816531, "grad_norm": 
1.8373669385910034, "learning_rate": 9.824728260869564e-05, "loss": 0.4407, "step": 4523 }, { "epoch": 0.7082028804007514, "grad_norm": 1.9688785076141357, "learning_rate": 9.823539402173912e-05, "loss": 0.6933, "step": 4524 }, { "epoch": 0.7083594239198497, "grad_norm": 6.961009979248047, "learning_rate": 9.82235054347826e-05, "loss": 0.8058, "step": 4525 }, { "epoch": 0.7085159674389481, "grad_norm": 1.1565146446228027, "learning_rate": 9.821161684782607e-05, "loss": 0.5403, "step": 4526 }, { "epoch": 0.7086725109580463, "grad_norm": 3.4319751262664795, "learning_rate": 9.819972826086955e-05, "loss": 1.1, "step": 4527 }, { "epoch": 0.7088290544771446, "grad_norm": 1.58231782913208, "learning_rate": 9.818783967391303e-05, "loss": 0.6445, "step": 4528 }, { "epoch": 0.7089855979962429, "grad_norm": 1.6332128047943115, "learning_rate": 9.81759510869565e-05, "loss": 0.6083, "step": 4529 }, { "epoch": 0.7091421415153413, "grad_norm": 1.8914631605148315, "learning_rate": 9.816406249999998e-05, "loss": 0.7025, "step": 4530 }, { "epoch": 0.7092986850344396, "grad_norm": 1.3999665975570679, "learning_rate": 9.815217391304348e-05, "loss": 0.537, "step": 4531 }, { "epoch": 0.7094552285535379, "grad_norm": 2.9034018516540527, "learning_rate": 9.814028532608695e-05, "loss": 1.1835, "step": 4532 }, { "epoch": 0.7096117720726361, "grad_norm": 2.9422671794891357, "learning_rate": 9.812839673913043e-05, "loss": 0.9772, "step": 4533 }, { "epoch": 0.7097683155917345, "grad_norm": 2.862938642501831, "learning_rate": 9.811650815217391e-05, "loss": 0.9638, "step": 4534 }, { "epoch": 0.7099248591108328, "grad_norm": 6.34733772277832, "learning_rate": 9.810461956521739e-05, "loss": 1.0401, "step": 4535 }, { "epoch": 0.7100814026299311, "grad_norm": 2.0503294467926025, "learning_rate": 9.809273097826087e-05, "loss": 1.1317, "step": 4536 }, { "epoch": 0.7102379461490295, "grad_norm": 2.2220096588134766, "learning_rate": 9.808084239130435e-05, "loss": 0.9921, "step": 4537 }, { "epoch": 
0.7103944896681278, "grad_norm": 2.083740711212158, "learning_rate": 9.806895380434782e-05, "loss": 0.9431, "step": 4538 }, { "epoch": 0.710551033187226, "grad_norm": 4.177829265594482, "learning_rate": 9.80570652173913e-05, "loss": 1.4635, "step": 4539 }, { "epoch": 0.7107075767063243, "grad_norm": 2.3851370811462402, "learning_rate": 9.804517663043478e-05, "loss": 0.8421, "step": 4540 }, { "epoch": 0.7108641202254227, "grad_norm": 2.289033889770508, "learning_rate": 9.803328804347826e-05, "loss": 1.276, "step": 4541 }, { "epoch": 0.711020663744521, "grad_norm": 3.1281564235687256, "learning_rate": 9.802139945652172e-05, "loss": 1.6047, "step": 4542 }, { "epoch": 0.7111772072636193, "grad_norm": 2.889497995376587, "learning_rate": 9.80095108695652e-05, "loss": 1.2785, "step": 4543 }, { "epoch": 0.7113337507827175, "grad_norm": 4.448215484619141, "learning_rate": 9.799762228260868e-05, "loss": 1.515, "step": 4544 }, { "epoch": 0.7114902943018159, "grad_norm": 3.3718996047973633, "learning_rate": 9.798573369565216e-05, "loss": 1.1969, "step": 4545 }, { "epoch": 0.7116468378209142, "grad_norm": 0.929951012134552, "learning_rate": 9.797384510869564e-05, "loss": 0.6792, "step": 4546 }, { "epoch": 0.7118033813400125, "grad_norm": 1.9395979642868042, "learning_rate": 9.796195652173911e-05, "loss": 0.722, "step": 4547 }, { "epoch": 0.7119599248591109, "grad_norm": 4.071780204772949, "learning_rate": 9.795006793478259e-05, "loss": 1.3032, "step": 4548 }, { "epoch": 0.7121164683782092, "grad_norm": 2.300243616104126, "learning_rate": 9.793817934782607e-05, "loss": 0.4965, "step": 4549 }, { "epoch": 0.7122730118973074, "grad_norm": 2.127016067504883, "learning_rate": 9.792629076086955e-05, "loss": 1.3813, "step": 4550 }, { "epoch": 0.7124295554164057, "grad_norm": 2.0612592697143555, "learning_rate": 9.791440217391304e-05, "loss": 0.8125, "step": 4551 }, { "epoch": 0.7125860989355041, "grad_norm": 0.4824146330356598, "learning_rate": 9.790251358695652e-05, "loss": 0.2209, 
"step": 4552 }, { "epoch": 0.7127426424546024, "grad_norm": 0.5090584754943848, "learning_rate": 9.7890625e-05, "loss": 0.2529, "step": 4553 }, { "epoch": 0.7128991859737007, "grad_norm": 0.5486109852790833, "learning_rate": 9.787873641304347e-05, "loss": 0.3317, "step": 4554 }, { "epoch": 0.713055729492799, "grad_norm": 0.899691641330719, "learning_rate": 9.786684782608695e-05, "loss": 0.3785, "step": 4555 }, { "epoch": 0.7132122730118973, "grad_norm": 1.930052399635315, "learning_rate": 9.785495923913043e-05, "loss": 0.2991, "step": 4556 }, { "epoch": 0.7133688165309956, "grad_norm": 1.0138503313064575, "learning_rate": 9.784307065217391e-05, "loss": 0.3575, "step": 4557 }, { "epoch": 0.7135253600500939, "grad_norm": 1.128669261932373, "learning_rate": 9.783118206521739e-05, "loss": 0.3012, "step": 4558 }, { "epoch": 0.7136819035691923, "grad_norm": 0.6940640211105347, "learning_rate": 9.781929347826086e-05, "loss": 0.2999, "step": 4559 }, { "epoch": 0.7138384470882906, "grad_norm": 0.8381147980690002, "learning_rate": 9.780740489130434e-05, "loss": 0.3803, "step": 4560 }, { "epoch": 0.7139949906073888, "grad_norm": 0.981237530708313, "learning_rate": 9.779551630434782e-05, "loss": 0.4012, "step": 4561 }, { "epoch": 0.7141515341264871, "grad_norm": 1.6725552082061768, "learning_rate": 9.77836277173913e-05, "loss": 0.6485, "step": 4562 }, { "epoch": 0.7143080776455855, "grad_norm": 1.7875100374221802, "learning_rate": 9.777173913043478e-05, "loss": 0.56, "step": 4563 }, { "epoch": 0.7144646211646838, "grad_norm": 1.2759300470352173, "learning_rate": 9.775985054347826e-05, "loss": 0.4404, "step": 4564 }, { "epoch": 0.7146211646837821, "grad_norm": 1.0585551261901855, "learning_rate": 9.774796195652172e-05, "loss": 0.2641, "step": 4565 }, { "epoch": 0.7147777082028804, "grad_norm": 1.4532231092453003, "learning_rate": 9.77360733695652e-05, "loss": 0.7796, "step": 4566 }, { "epoch": 0.7149342517219787, "grad_norm": 1.5008692741394043, "learning_rate": 
9.772418478260868e-05, "loss": 0.5105, "step": 4567 }, { "epoch": 0.715090795241077, "grad_norm": 1.2292274236679077, "learning_rate": 9.771229619565215e-05, "loss": 0.5592, "step": 4568 }, { "epoch": 0.7152473387601753, "grad_norm": 2.715550184249878, "learning_rate": 9.770040760869563e-05, "loss": 0.754, "step": 4569 }, { "epoch": 0.7154038822792737, "grad_norm": 1.8343254327774048, "learning_rate": 9.768851902173912e-05, "loss": 0.3502, "step": 4570 }, { "epoch": 0.715560425798372, "grad_norm": 1.8266228437423706, "learning_rate": 9.76766304347826e-05, "loss": 0.6609, "step": 4571 }, { "epoch": 0.7157169693174703, "grad_norm": 1.7999610900878906, "learning_rate": 9.766474184782608e-05, "loss": 0.6392, "step": 4572 }, { "epoch": 0.7158735128365685, "grad_norm": 1.6436513662338257, "learning_rate": 9.765285326086956e-05, "loss": 0.4186, "step": 4573 }, { "epoch": 0.7160300563556669, "grad_norm": 4.116649150848389, "learning_rate": 9.764096467391304e-05, "loss": 1.1117, "step": 4574 }, { "epoch": 0.7161865998747652, "grad_norm": 1.5703610181808472, "learning_rate": 9.762907608695652e-05, "loss": 0.5162, "step": 4575 }, { "epoch": 0.7163431433938635, "grad_norm": 2.038010358810425, "learning_rate": 9.76171875e-05, "loss": 0.5124, "step": 4576 }, { "epoch": 0.7164996869129618, "grad_norm": 2.1448919773101807, "learning_rate": 9.760529891304347e-05, "loss": 0.6621, "step": 4577 }, { "epoch": 0.7166562304320601, "grad_norm": 2.6684587001800537, "learning_rate": 9.759341032608695e-05, "loss": 0.8272, "step": 4578 }, { "epoch": 0.7168127739511584, "grad_norm": 2.981003761291504, "learning_rate": 9.758152173913043e-05, "loss": 0.8432, "step": 4579 }, { "epoch": 0.7169693174702567, "grad_norm": 1.864626169204712, "learning_rate": 9.75696331521739e-05, "loss": 0.7368, "step": 4580 }, { "epoch": 0.717125860989355, "grad_norm": 2.0472166538238525, "learning_rate": 9.755774456521738e-05, "loss": 0.5773, "step": 4581 }, { "epoch": 0.7172824045084534, "grad_norm": 
1.9272555112838745, "learning_rate": 9.754585597826086e-05, "loss": 0.7794, "step": 4582 }, { "epoch": 0.7174389480275517, "grad_norm": 3.599456548690796, "learning_rate": 9.753396739130434e-05, "loss": 1.0839, "step": 4583 }, { "epoch": 0.7175954915466499, "grad_norm": 2.570726156234741, "learning_rate": 9.752207880434782e-05, "loss": 1.2032, "step": 4584 }, { "epoch": 0.7177520350657483, "grad_norm": 3.49001407623291, "learning_rate": 9.75101902173913e-05, "loss": 1.6806, "step": 4585 }, { "epoch": 0.7179085785848466, "grad_norm": 4.707046031951904, "learning_rate": 9.749830163043478e-05, "loss": 0.8744, "step": 4586 }, { "epoch": 0.7180651221039449, "grad_norm": 2.0290629863739014, "learning_rate": 9.748641304347825e-05, "loss": 0.6136, "step": 4587 }, { "epoch": 0.7182216656230432, "grad_norm": 4.061986923217773, "learning_rate": 9.747452445652172e-05, "loss": 1.1442, "step": 4588 }, { "epoch": 0.7183782091421416, "grad_norm": 4.0650506019592285, "learning_rate": 9.74626358695652e-05, "loss": 1.1992, "step": 4589 }, { "epoch": 0.7185347526612398, "grad_norm": 2.2935659885406494, "learning_rate": 9.745074728260869e-05, "loss": 1.3736, "step": 4590 }, { "epoch": 0.7186912961803381, "grad_norm": 2.075472116470337, "learning_rate": 9.743885869565217e-05, "loss": 1.0534, "step": 4591 }, { "epoch": 0.7188478396994364, "grad_norm": 4.019814491271973, "learning_rate": 9.742697010869564e-05, "loss": 1.3149, "step": 4592 }, { "epoch": 0.7190043832185348, "grad_norm": 2.93245530128479, "learning_rate": 9.741508152173912e-05, "loss": 1.2592, "step": 4593 }, { "epoch": 0.7191609267376331, "grad_norm": 1.9722386598587036, "learning_rate": 9.74031929347826e-05, "loss": 1.003, "step": 4594 }, { "epoch": 0.7193174702567313, "grad_norm": 2.1157119274139404, "learning_rate": 9.739130434782608e-05, "loss": 1.3262, "step": 4595 }, { "epoch": 0.7194740137758296, "grad_norm": 2.391645669937134, "learning_rate": 9.737941576086956e-05, "loss": 1.0801, "step": 4596 }, { "epoch": 
0.719630557294928, "grad_norm": 1.05311119556427, "learning_rate": 9.736752717391303e-05, "loss": 0.5611, "step": 4597 }, { "epoch": 0.7197871008140263, "grad_norm": 2.742539405822754, "learning_rate": 9.735563858695651e-05, "loss": 1.1244, "step": 4598 }, { "epoch": 0.7199436443331246, "grad_norm": 2.7674336433410645, "learning_rate": 9.734374999999999e-05, "loss": 0.8861, "step": 4599 }, { "epoch": 0.720100187852223, "grad_norm": 2.5479369163513184, "learning_rate": 9.733186141304347e-05, "loss": 0.9553, "step": 4600 }, { "epoch": 0.7202567313713212, "grad_norm": 0.5672402381896973, "learning_rate": 9.731997282608695e-05, "loss": 0.2965, "step": 4601 }, { "epoch": 0.7204132748904195, "grad_norm": 0.9051365256309509, "learning_rate": 9.730808423913043e-05, "loss": 0.2567, "step": 4602 }, { "epoch": 0.7205698184095178, "grad_norm": 0.7147610187530518, "learning_rate": 9.72961956521739e-05, "loss": 0.3091, "step": 4603 }, { "epoch": 0.7207263619286162, "grad_norm": 0.8043972253799438, "learning_rate": 9.728430706521738e-05, "loss": 0.2283, "step": 4604 }, { "epoch": 0.7208829054477145, "grad_norm": 0.8598465323448181, "learning_rate": 9.727241847826086e-05, "loss": 0.2628, "step": 4605 }, { "epoch": 0.7210394489668128, "grad_norm": 0.7135474681854248, "learning_rate": 9.726052989130434e-05, "loss": 0.2936, "step": 4606 }, { "epoch": 0.721195992485911, "grad_norm": 0.8633602857589722, "learning_rate": 9.724864130434782e-05, "loss": 0.3259, "step": 4607 }, { "epoch": 0.7213525360050094, "grad_norm": 0.5899518132209778, "learning_rate": 9.723675271739131e-05, "loss": 0.3105, "step": 4608 }, { "epoch": 0.7215090795241077, "grad_norm": 0.8945555686950684, "learning_rate": 9.722486413043479e-05, "loss": 0.3073, "step": 4609 }, { "epoch": 0.721665623043206, "grad_norm": 0.8323129415512085, "learning_rate": 9.721297554347826e-05, "loss": 0.2965, "step": 4610 }, { "epoch": 0.7218221665623044, "grad_norm": 1.9724828004837036, "learning_rate": 9.720108695652173e-05, "loss": 
0.714, "step": 4611 }, { "epoch": 0.7219787100814026, "grad_norm": 0.9920156002044678, "learning_rate": 9.718919836956521e-05, "loss": 0.3831, "step": 4612 }, { "epoch": 0.7221352536005009, "grad_norm": 2.8673179149627686, "learning_rate": 9.717730978260869e-05, "loss": 0.4105, "step": 4613 }, { "epoch": 0.7222917971195992, "grad_norm": 2.2413456439971924, "learning_rate": 9.716542119565216e-05, "loss": 0.478, "step": 4614 }, { "epoch": 0.7224483406386976, "grad_norm": 1.4359052181243896, "learning_rate": 9.715353260869564e-05, "loss": 0.4763, "step": 4615 }, { "epoch": 0.7226048841577959, "grad_norm": 1.7364391088485718, "learning_rate": 9.714164402173912e-05, "loss": 0.5248, "step": 4616 }, { "epoch": 0.7227614276768942, "grad_norm": 0.9565919041633606, "learning_rate": 9.71297554347826e-05, "loss": 0.351, "step": 4617 }, { "epoch": 0.7229179711959924, "grad_norm": 2.6367897987365723, "learning_rate": 9.711786684782608e-05, "loss": 0.7658, "step": 4618 }, { "epoch": 0.7230745147150908, "grad_norm": 1.2902474403381348, "learning_rate": 9.710597826086955e-05, "loss": 0.4512, "step": 4619 }, { "epoch": 0.7232310582341891, "grad_norm": 1.7196956872940063, "learning_rate": 9.709408967391303e-05, "loss": 0.4842, "step": 4620 }, { "epoch": 0.7233876017532874, "grad_norm": 2.4778826236724854, "learning_rate": 9.708220108695651e-05, "loss": 0.6216, "step": 4621 }, { "epoch": 0.7235441452723858, "grad_norm": 1.8141534328460693, "learning_rate": 9.707031249999999e-05, "loss": 0.6116, "step": 4622 }, { "epoch": 0.7237006887914841, "grad_norm": 1.414859414100647, "learning_rate": 9.705842391304347e-05, "loss": 0.253, "step": 4623 }, { "epoch": 0.7238572323105823, "grad_norm": 2.546786308288574, "learning_rate": 9.704653532608694e-05, "loss": 0.5515, "step": 4624 }, { "epoch": 0.7240137758296806, "grad_norm": 2.0236713886260986, "learning_rate": 9.703464673913042e-05, "loss": 0.4526, "step": 4625 }, { "epoch": 0.724170319348779, "grad_norm": 3.478756904602051, "learning_rate": 
9.70227581521739e-05, "loss": 0.4982, "step": 4626 }, { "epoch": 0.7243268628678773, "grad_norm": 2.0406577587127686, "learning_rate": 9.701086956521738e-05, "loss": 0.6074, "step": 4627 }, { "epoch": 0.7244834063869756, "grad_norm": 2.534243583679199, "learning_rate": 9.699898097826087e-05, "loss": 0.5075, "step": 4628 }, { "epoch": 0.7246399499060739, "grad_norm": 3.6490719318389893, "learning_rate": 9.698709239130435e-05, "loss": 0.8324, "step": 4629 }, { "epoch": 0.7247964934251722, "grad_norm": 3.480515480041504, "learning_rate": 9.697520380434783e-05, "loss": 0.6732, "step": 4630 }, { "epoch": 0.7249530369442705, "grad_norm": 4.109898090362549, "learning_rate": 9.69633152173913e-05, "loss": 0.9374, "step": 4631 }, { "epoch": 0.7251095804633688, "grad_norm": 3.9714324474334717, "learning_rate": 9.695142663043478e-05, "loss": 1.1732, "step": 4632 }, { "epoch": 0.7252661239824671, "grad_norm": 2.8652408123016357, "learning_rate": 9.693953804347826e-05, "loss": 0.9188, "step": 4633 }, { "epoch": 0.7254226675015655, "grad_norm": 2.8688390254974365, "learning_rate": 9.692764945652173e-05, "loss": 1.2771, "step": 4634 }, { "epoch": 0.7255792110206637, "grad_norm": 2.830960273742676, "learning_rate": 9.69157608695652e-05, "loss": 0.8754, "step": 4635 }, { "epoch": 0.725735754539762, "grad_norm": 1.9983919858932495, "learning_rate": 9.690387228260868e-05, "loss": 0.761, "step": 4636 }, { "epoch": 0.7258922980588604, "grad_norm": 2.3073606491088867, "learning_rate": 9.689198369565216e-05, "loss": 1.0796, "step": 4637 }, { "epoch": 0.7260488415779587, "grad_norm": 3.993588924407959, "learning_rate": 9.688009510869564e-05, "loss": 0.7959, "step": 4638 }, { "epoch": 0.726205385097057, "grad_norm": 3.1257011890411377, "learning_rate": 9.686820652173912e-05, "loss": 1.0186, "step": 4639 }, { "epoch": 0.7263619286161553, "grad_norm": 4.390204429626465, "learning_rate": 9.68563179347826e-05, "loss": 1.145, "step": 4640 }, { "epoch": 0.7265184721352536, "grad_norm": 
2.0536561012268066, "learning_rate": 9.684442934782607e-05, "loss": 1.1352, "step": 4641 }, { "epoch": 0.7266750156543519, "grad_norm": 4.0154571533203125, "learning_rate": 9.683254076086955e-05, "loss": 1.0661, "step": 4642 }, { "epoch": 0.7268315591734502, "grad_norm": 8.578271865844727, "learning_rate": 9.682065217391303e-05, "loss": 0.8575, "step": 4643 }, { "epoch": 0.7269881026925485, "grad_norm": 2.3489909172058105, "learning_rate": 9.680876358695651e-05, "loss": 1.2232, "step": 4644 }, { "epoch": 0.7271446462116469, "grad_norm": 4.5153093338012695, "learning_rate": 9.679687499999999e-05, "loss": 0.6531, "step": 4645 }, { "epoch": 0.7273011897307452, "grad_norm": 7.1713738441467285, "learning_rate": 9.678498641304346e-05, "loss": 1.2253, "step": 4646 }, { "epoch": 0.7274577332498434, "grad_norm": 2.803091287612915, "learning_rate": 9.677309782608696e-05, "loss": 0.8687, "step": 4647 }, { "epoch": 0.7276142767689417, "grad_norm": 4.257869243621826, "learning_rate": 9.676120923913043e-05, "loss": 0.9142, "step": 4648 }, { "epoch": 0.7277708202880401, "grad_norm": 2.9807214736938477, "learning_rate": 9.674932065217391e-05, "loss": 0.8145, "step": 4649 }, { "epoch": 0.7279273638071384, "grad_norm": 2.4273903369903564, "learning_rate": 9.673743206521739e-05, "loss": 1.4284, "step": 4650 }, { "epoch": 0.7280839073262367, "grad_norm": 0.7988284826278687, "learning_rate": 9.672554347826087e-05, "loss": 0.3456, "step": 4651 }, { "epoch": 0.728240450845335, "grad_norm": 0.6992729902267456, "learning_rate": 9.671365489130435e-05, "loss": 0.3065, "step": 4652 }, { "epoch": 0.7283969943644333, "grad_norm": 1.055504322052002, "learning_rate": 9.670176630434783e-05, "loss": 0.3088, "step": 4653 }, { "epoch": 0.7285535378835316, "grad_norm": 0.615355908870697, "learning_rate": 9.66898777173913e-05, "loss": 0.2826, "step": 4654 }, { "epoch": 0.7287100814026299, "grad_norm": 0.5976371765136719, "learning_rate": 9.667798913043478e-05, "loss": 0.2984, "step": 4655 }, { "epoch": 
0.7288666249217283, "grad_norm": 0.9710000157356262, "learning_rate": 9.666610054347826e-05, "loss": 0.4697, "step": 4656 }, { "epoch": 0.7290231684408266, "grad_norm": 0.8848055005073547, "learning_rate": 9.665421195652172e-05, "loss": 0.2874, "step": 4657 }, { "epoch": 0.7291797119599248, "grad_norm": 0.6396862268447876, "learning_rate": 9.66423233695652e-05, "loss": 0.2437, "step": 4658 }, { "epoch": 0.7293362554790231, "grad_norm": 1.1116563081741333, "learning_rate": 9.663043478260868e-05, "loss": 0.3257, "step": 4659 }, { "epoch": 0.7294927989981215, "grad_norm": 1.1618049144744873, "learning_rate": 9.661854619565216e-05, "loss": 0.3236, "step": 4660 }, { "epoch": 0.7296493425172198, "grad_norm": 0.7657711505889893, "learning_rate": 9.660665760869564e-05, "loss": 0.2979, "step": 4661 }, { "epoch": 0.7298058860363181, "grad_norm": 1.1340723037719727, "learning_rate": 9.659476902173911e-05, "loss": 0.4134, "step": 4662 }, { "epoch": 0.7299624295554165, "grad_norm": 1.3047329187393188, "learning_rate": 9.658288043478259e-05, "loss": 0.3787, "step": 4663 }, { "epoch": 0.7301189730745147, "grad_norm": 1.5476713180541992, "learning_rate": 9.657099184782607e-05, "loss": 0.5547, "step": 4664 }, { "epoch": 0.730275516593613, "grad_norm": 1.5405519008636475, "learning_rate": 9.655910326086955e-05, "loss": 0.4097, "step": 4665 }, { "epoch": 0.7304320601127113, "grad_norm": 1.1537443399429321, "learning_rate": 9.654721467391303e-05, "loss": 0.3532, "step": 4666 }, { "epoch": 0.7305886036318097, "grad_norm": 5.727363109588623, "learning_rate": 9.653532608695652e-05, "loss": 2.9847, "step": 4667 }, { "epoch": 0.730745147150908, "grad_norm": 4.765348434448242, "learning_rate": 9.65234375e-05, "loss": 0.9375, "step": 4668 }, { "epoch": 0.7309016906700062, "grad_norm": 1.6438020467758179, "learning_rate": 9.651154891304348e-05, "loss": 0.4778, "step": 4669 }, { "epoch": 0.7310582341891045, "grad_norm": 3.1229074001312256, "learning_rate": 9.649966032608695e-05, "loss": 
0.5629, "step": 4670 }, { "epoch": 0.7312147777082029, "grad_norm": 2.3925795555114746, "learning_rate": 9.648777173913043e-05, "loss": 0.9293, "step": 4671 }, { "epoch": 0.7313713212273012, "grad_norm": 2.0994749069213867, "learning_rate": 9.647588315217391e-05, "loss": 0.5218, "step": 4672 }, { "epoch": 0.7315278647463995, "grad_norm": 3.362201690673828, "learning_rate": 9.646399456521739e-05, "loss": 0.6847, "step": 4673 }, { "epoch": 0.7316844082654979, "grad_norm": 1.287867784500122, "learning_rate": 9.645210597826087e-05, "loss": 0.5767, "step": 4674 }, { "epoch": 0.7318409517845961, "grad_norm": 3.4943673610687256, "learning_rate": 9.644021739130434e-05, "loss": 0.8225, "step": 4675 }, { "epoch": 0.7319974953036944, "grad_norm": 1.8795511722564697, "learning_rate": 9.642832880434782e-05, "loss": 0.6454, "step": 4676 }, { "epoch": 0.7321540388227927, "grad_norm": 4.894820690155029, "learning_rate": 9.64164402173913e-05, "loss": 1.1414, "step": 4677 }, { "epoch": 0.7323105823418911, "grad_norm": 2.090751886367798, "learning_rate": 9.640455163043478e-05, "loss": 0.8599, "step": 4678 }, { "epoch": 0.7324671258609894, "grad_norm": 1.5502605438232422, "learning_rate": 9.639266304347826e-05, "loss": 0.5661, "step": 4679 }, { "epoch": 0.7326236693800877, "grad_norm": 3.158719062805176, "learning_rate": 9.638077445652172e-05, "loss": 0.9444, "step": 4680 }, { "epoch": 0.7327802128991859, "grad_norm": 2.4538416862487793, "learning_rate": 9.63688858695652e-05, "loss": 0.8634, "step": 4681 }, { "epoch": 0.7329367564182843, "grad_norm": 1.5698764324188232, "learning_rate": 9.635699728260868e-05, "loss": 0.6418, "step": 4682 }, { "epoch": 0.7330932999373826, "grad_norm": 1.5031574964523315, "learning_rate": 9.634510869565216e-05, "loss": 0.5181, "step": 4683 }, { "epoch": 0.7332498434564809, "grad_norm": 2.8994638919830322, "learning_rate": 9.633322010869563e-05, "loss": 0.9082, "step": 4684 }, { "epoch": 0.7334063869755792, "grad_norm": 4.377370357513428, 
"learning_rate": 9.632133152173911e-05, "loss": 0.8905, "step": 4685 }, { "epoch": 0.7335629304946775, "grad_norm": 6.1531805992126465, "learning_rate": 9.630944293478259e-05, "loss": 1.034, "step": 4686 }, { "epoch": 0.7337194740137758, "grad_norm": 3.4596285820007324, "learning_rate": 9.629755434782608e-05, "loss": 0.5596, "step": 4687 }, { "epoch": 0.7338760175328741, "grad_norm": 3.468567371368408, "learning_rate": 9.628566576086956e-05, "loss": 0.9611, "step": 4688 }, { "epoch": 0.7340325610519725, "grad_norm": 3.3375537395477295, "learning_rate": 9.627377717391304e-05, "loss": 1.6937, "step": 4689 }, { "epoch": 0.7341891045710708, "grad_norm": 2.7900583744049072, "learning_rate": 9.626188858695652e-05, "loss": 1.2855, "step": 4690 }, { "epoch": 0.7343456480901691, "grad_norm": 1.9942313432693481, "learning_rate": 9.625e-05, "loss": 0.783, "step": 4691 }, { "epoch": 0.7345021916092673, "grad_norm": 4.920779705047607, "learning_rate": 9.623811141304347e-05, "loss": 1.4185, "step": 4692 }, { "epoch": 0.7346587351283657, "grad_norm": 1.7060154676437378, "learning_rate": 9.622622282608695e-05, "loss": 0.7993, "step": 4693 }, { "epoch": 0.734815278647464, "grad_norm": 2.6004233360290527, "learning_rate": 9.621433423913043e-05, "loss": 1.7464, "step": 4694 }, { "epoch": 0.7349718221665623, "grad_norm": 1.4559508562088013, "learning_rate": 9.620244565217391e-05, "loss": 0.7008, "step": 4695 }, { "epoch": 0.7351283656856606, "grad_norm": 2.194650411605835, "learning_rate": 9.619055706521739e-05, "loss": 0.7342, "step": 4696 }, { "epoch": 0.735284909204759, "grad_norm": 1.5546398162841797, "learning_rate": 9.617866847826086e-05, "loss": 0.472, "step": 4697 }, { "epoch": 0.7354414527238572, "grad_norm": 2.325026512145996, "learning_rate": 9.616677989130434e-05, "loss": 1.3154, "step": 4698 }, { "epoch": 0.7355979962429555, "grad_norm": 3.5443687438964844, "learning_rate": 9.615489130434782e-05, "loss": 0.9449, "step": 4699 }, { "epoch": 0.7357545397620538, "grad_norm": 
2.4076144695281982, "learning_rate": 9.61430027173913e-05, "loss": 1.4059, "step": 4700 }, { "epoch": 0.7359110832811522, "grad_norm": 0.4032496511936188, "learning_rate": 9.613111413043478e-05, "loss": 0.1882, "step": 4701 }, { "epoch": 0.7360676268002505, "grad_norm": 0.4813295006752014, "learning_rate": 9.611922554347825e-05, "loss": 0.2496, "step": 4702 }, { "epoch": 0.7362241703193487, "grad_norm": 0.97858065366745, "learning_rate": 9.610733695652172e-05, "loss": 0.2596, "step": 4703 }, { "epoch": 0.736380713838447, "grad_norm": 0.845024585723877, "learning_rate": 9.60954483695652e-05, "loss": 0.4322, "step": 4704 }, { "epoch": 0.7365372573575454, "grad_norm": 1.0401923656463623, "learning_rate": 9.608355978260868e-05, "loss": 0.3164, "step": 4705 }, { "epoch": 0.7366938008766437, "grad_norm": 1.370955228805542, "learning_rate": 9.607167119565215e-05, "loss": 0.3548, "step": 4706 }, { "epoch": 0.736850344395742, "grad_norm": 0.6335982084274292, "learning_rate": 9.605978260869565e-05, "loss": 0.3142, "step": 4707 }, { "epoch": 0.7370068879148404, "grad_norm": 0.790152370929718, "learning_rate": 9.604789402173912e-05, "loss": 0.2447, "step": 4708 }, { "epoch": 0.7371634314339386, "grad_norm": 1.305909514427185, "learning_rate": 9.60360054347826e-05, "loss": 0.3727, "step": 4709 }, { "epoch": 0.7373199749530369, "grad_norm": 1.2561976909637451, "learning_rate": 9.602411684782608e-05, "loss": 0.3732, "step": 4710 }, { "epoch": 0.7374765184721352, "grad_norm": 1.2020632028579712, "learning_rate": 9.601222826086956e-05, "loss": 0.4318, "step": 4711 }, { "epoch": 0.7376330619912336, "grad_norm": 1.2742091417312622, "learning_rate": 9.600033967391304e-05, "loss": 0.4873, "step": 4712 }, { "epoch": 0.7377896055103319, "grad_norm": 2.185486078262329, "learning_rate": 9.598845108695651e-05, "loss": 0.7972, "step": 4713 }, { "epoch": 0.7379461490294302, "grad_norm": 1.7685611248016357, "learning_rate": 9.597656249999999e-05, "loss": 0.6286, "step": 4714 }, { "epoch": 
0.7381026925485284, "grad_norm": 1.7627841234207153, "learning_rate": 9.596467391304347e-05, "loss": 0.4771, "step": 4715 }, { "epoch": 0.7382592360676268, "grad_norm": 2.112159252166748, "learning_rate": 9.595278532608695e-05, "loss": 0.3416, "step": 4716 }, { "epoch": 0.7384157795867251, "grad_norm": 1.283502459526062, "learning_rate": 9.594089673913043e-05, "loss": 0.5318, "step": 4717 }, { "epoch": 0.7385723231058234, "grad_norm": 2.4753010272979736, "learning_rate": 9.59290081521739e-05, "loss": 0.5964, "step": 4718 }, { "epoch": 0.7387288666249218, "grad_norm": 1.5373972654342651, "learning_rate": 9.591711956521738e-05, "loss": 0.5746, "step": 4719 }, { "epoch": 0.73888541014402, "grad_norm": 1.1910043954849243, "learning_rate": 9.590523097826086e-05, "loss": 0.4848, "step": 4720 }, { "epoch": 0.7390419536631183, "grad_norm": 2.436483860015869, "learning_rate": 9.589334239130434e-05, "loss": 0.6548, "step": 4721 }, { "epoch": 0.7391984971822166, "grad_norm": 0.8683515191078186, "learning_rate": 9.588145380434782e-05, "loss": 0.3076, "step": 4722 }, { "epoch": 0.739355040701315, "grad_norm": 1.0505564212799072, "learning_rate": 9.58695652173913e-05, "loss": 0.2302, "step": 4723 }, { "epoch": 0.7395115842204133, "grad_norm": 2.6238274574279785, "learning_rate": 9.585767663043479e-05, "loss": 0.8773, "step": 4724 }, { "epoch": 0.7396681277395116, "grad_norm": 1.88201904296875, "learning_rate": 9.584578804347827e-05, "loss": 0.7542, "step": 4725 }, { "epoch": 0.7398246712586098, "grad_norm": 1.488835334777832, "learning_rate": 9.583389945652172e-05, "loss": 0.6552, "step": 4726 }, { "epoch": 0.7399812147777082, "grad_norm": 2.1274304389953613, "learning_rate": 9.582201086956521e-05, "loss": 0.7501, "step": 4727 }, { "epoch": 0.7401377582968065, "grad_norm": 3.4050116539001465, "learning_rate": 9.581012228260869e-05, "loss": 0.8315, "step": 4728 }, { "epoch": 0.7402943018159048, "grad_norm": 3.779296398162842, "learning_rate": 9.579823369565216e-05, "loss": 
0.8553, "step": 4729 }, { "epoch": 0.7404508453350032, "grad_norm": 3.5307717323303223, "learning_rate": 9.578634510869564e-05, "loss": 1.2456, "step": 4730 }, { "epoch": 0.7406073888541015, "grad_norm": 1.7206707000732422, "learning_rate": 9.577445652173912e-05, "loss": 0.8493, "step": 4731 }, { "epoch": 0.7407639323731997, "grad_norm": 4.106436729431152, "learning_rate": 9.57625679347826e-05, "loss": 1.1477, "step": 4732 }, { "epoch": 0.740920475892298, "grad_norm": 3.5245492458343506, "learning_rate": 9.575067934782608e-05, "loss": 1.0363, "step": 4733 }, { "epoch": 0.7410770194113964, "grad_norm": 6.668485164642334, "learning_rate": 9.573879076086956e-05, "loss": 0.6661, "step": 4734 }, { "epoch": 0.7412335629304947, "grad_norm": 2.4661147594451904, "learning_rate": 9.572690217391303e-05, "loss": 1.0744, "step": 4735 }, { "epoch": 0.741390106449593, "grad_norm": 2.599827527999878, "learning_rate": 9.571501358695651e-05, "loss": 1.1098, "step": 4736 }, { "epoch": 0.7415466499686914, "grad_norm": 1.8842095136642456, "learning_rate": 9.570312499999999e-05, "loss": 1.0415, "step": 4737 }, { "epoch": 0.7417031934877896, "grad_norm": 3.552600145339966, "learning_rate": 9.569123641304347e-05, "loss": 0.8821, "step": 4738 }, { "epoch": 0.7418597370068879, "grad_norm": 4.984460830688477, "learning_rate": 9.567934782608695e-05, "loss": 1.1121, "step": 4739 }, { "epoch": 0.7420162805259862, "grad_norm": 3.0623972415924072, "learning_rate": 9.566745923913042e-05, "loss": 1.0809, "step": 4740 }, { "epoch": 0.7421728240450846, "grad_norm": 1.9566965103149414, "learning_rate": 9.56555706521739e-05, "loss": 1.0259, "step": 4741 }, { "epoch": 0.7423293675641829, "grad_norm": 2.0789287090301514, "learning_rate": 9.564368206521738e-05, "loss": 1.3331, "step": 4742 }, { "epoch": 0.7424859110832811, "grad_norm": 1.6715154647827148, "learning_rate": 9.563179347826086e-05, "loss": 0.8815, "step": 4743 }, { "epoch": 0.7426424546023794, "grad_norm": 2.697754144668579, "learning_rate": 
9.561990489130435e-05, "loss": 0.8943, "step": 4744 }, { "epoch": 0.7427989981214778, "grad_norm": 1.7861288785934448, "learning_rate": 9.560801630434783e-05, "loss": 1.1354, "step": 4745 }, { "epoch": 0.7429555416405761, "grad_norm": 1.6735752820968628, "learning_rate": 9.559612771739131e-05, "loss": 0.6491, "step": 4746 }, { "epoch": 0.7431120851596744, "grad_norm": 1.72998046875, "learning_rate": 9.558423913043479e-05, "loss": 0.7052, "step": 4747 }, { "epoch": 0.7432686286787727, "grad_norm": 2.8748669624328613, "learning_rate": 9.557235054347826e-05, "loss": 1.1973, "step": 4748 }, { "epoch": 0.743425172197871, "grad_norm": 2.2512776851654053, "learning_rate": 9.556046195652173e-05, "loss": 1.2031, "step": 4749 }, { "epoch": 0.7435817157169693, "grad_norm": 2.2197084426879883, "learning_rate": 9.55485733695652e-05, "loss": 0.8574, "step": 4750 }, { "epoch": 0.7437382592360676, "grad_norm": 0.6056637167930603, "learning_rate": 9.553668478260868e-05, "loss": 0.313, "step": 4751 }, { "epoch": 0.743894802755166, "grad_norm": 0.5206002593040466, "learning_rate": 9.552479619565216e-05, "loss": 0.2733, "step": 4752 }, { "epoch": 0.7440513462742643, "grad_norm": 0.5983753800392151, "learning_rate": 9.551290760869564e-05, "loss": 0.29, "step": 4753 }, { "epoch": 0.7442078897933626, "grad_norm": 0.5603713989257812, "learning_rate": 9.550101902173912e-05, "loss": 0.3288, "step": 4754 }, { "epoch": 0.7443644333124608, "grad_norm": 0.7142877578735352, "learning_rate": 9.54891304347826e-05, "loss": 0.1736, "step": 4755 }, { "epoch": 0.7445209768315592, "grad_norm": 0.8155891299247742, "learning_rate": 9.547724184782608e-05, "loss": 0.3201, "step": 4756 }, { "epoch": 0.7446775203506575, "grad_norm": 0.8380536437034607, "learning_rate": 9.546535326086955e-05, "loss": 0.3132, "step": 4757 }, { "epoch": 0.7448340638697558, "grad_norm": 1.3261717557907104, "learning_rate": 9.545346467391303e-05, "loss": 0.3266, "step": 4758 }, { "epoch": 0.7449906073888541, "grad_norm": 
0.7950668931007385, "learning_rate": 9.544157608695651e-05, "loss": 0.3283, "step": 4759 }, { "epoch": 0.7451471509079524, "grad_norm": 1.0159289836883545, "learning_rate": 9.542968749999999e-05, "loss": 0.3913, "step": 4760 }, { "epoch": 0.7453036944270507, "grad_norm": 1.1346447467803955, "learning_rate": 9.541779891304347e-05, "loss": 0.4009, "step": 4761 }, { "epoch": 0.745460237946149, "grad_norm": 0.7333198189735413, "learning_rate": 9.540591032608694e-05, "loss": 0.3402, "step": 4762 }, { "epoch": 0.7456167814652473, "grad_norm": 1.5276025533676147, "learning_rate": 9.539402173913042e-05, "loss": 0.5908, "step": 4763 }, { "epoch": 0.7457733249843457, "grad_norm": 1.561141848564148, "learning_rate": 9.538213315217391e-05, "loss": 0.4703, "step": 4764 }, { "epoch": 0.745929868503444, "grad_norm": 0.9059814810752869, "learning_rate": 9.537024456521739e-05, "loss": 0.2963, "step": 4765 }, { "epoch": 0.7460864120225422, "grad_norm": 2.070159912109375, "learning_rate": 9.535835597826087e-05, "loss": 1.2214, "step": 4766 }, { "epoch": 0.7462429555416406, "grad_norm": 2.8663361072540283, "learning_rate": 9.534646739130435e-05, "loss": 0.5781, "step": 4767 }, { "epoch": 0.7463994990607389, "grad_norm": 1.4459375143051147, "learning_rate": 9.533457880434783e-05, "loss": 0.5381, "step": 4768 }, { "epoch": 0.7465560425798372, "grad_norm": 1.4252934455871582, "learning_rate": 9.53226902173913e-05, "loss": 0.7539, "step": 4769 }, { "epoch": 0.7467125860989355, "grad_norm": 1.4846265316009521, "learning_rate": 9.531080163043478e-05, "loss": 0.5113, "step": 4770 }, { "epoch": 0.7468691296180339, "grad_norm": 1.4053518772125244, "learning_rate": 9.529891304347826e-05, "loss": 0.6024, "step": 4771 }, { "epoch": 0.7470256731371321, "grad_norm": 1.602142095565796, "learning_rate": 9.528702445652173e-05, "loss": 0.5599, "step": 4772 }, { "epoch": 0.7471822166562304, "grad_norm": 1.2513864040374756, "learning_rate": 9.52751358695652e-05, "loss": 0.5479, "step": 4773 }, { "epoch": 
0.7473387601753287, "grad_norm": 2.7303340435028076, "learning_rate": 9.526324728260868e-05, "loss": 0.4896, "step": 4774 }, { "epoch": 0.7474953036944271, "grad_norm": 2.3124642372131348, "learning_rate": 9.525135869565216e-05, "loss": 0.6983, "step": 4775 }, { "epoch": 0.7476518472135254, "grad_norm": 1.2120572328567505, "learning_rate": 9.523947010869564e-05, "loss": 0.5559, "step": 4776 }, { "epoch": 0.7478083907326236, "grad_norm": 4.0880279541015625, "learning_rate": 9.522758152173912e-05, "loss": 0.9246, "step": 4777 }, { "epoch": 0.747964934251722, "grad_norm": 2.1208438873291016, "learning_rate": 9.52156929347826e-05, "loss": 0.4567, "step": 4778 }, { "epoch": 0.7481214777708203, "grad_norm": 2.4273791313171387, "learning_rate": 9.520380434782607e-05, "loss": 0.5764, "step": 4779 }, { "epoch": 0.7482780212899186, "grad_norm": 3.1031994819641113, "learning_rate": 9.519191576086955e-05, "loss": 0.7984, "step": 4780 }, { "epoch": 0.7484345648090169, "grad_norm": 1.7339438199996948, "learning_rate": 9.518002717391303e-05, "loss": 1.0727, "step": 4781 }, { "epoch": 0.7485911083281153, "grad_norm": 2.459740400314331, "learning_rate": 9.516813858695651e-05, "loss": 0.7628, "step": 4782 }, { "epoch": 0.7487476518472135, "grad_norm": 2.826054334640503, "learning_rate": 9.515624999999999e-05, "loss": 0.959, "step": 4783 }, { "epoch": 0.7489041953663118, "grad_norm": 2.4641060829162598, "learning_rate": 9.514436141304348e-05, "loss": 1.2276, "step": 4784 }, { "epoch": 0.7490607388854101, "grad_norm": 2.3868408203125, "learning_rate": 9.513247282608696e-05, "loss": 1.1942, "step": 4785 }, { "epoch": 0.7492172824045085, "grad_norm": 3.0998692512512207, "learning_rate": 9.512058423913043e-05, "loss": 1.322, "step": 4786 }, { "epoch": 0.7493738259236068, "grad_norm": 3.1579031944274902, "learning_rate": 9.510869565217391e-05, "loss": 1.1911, "step": 4787 }, { "epoch": 0.7495303694427051, "grad_norm": 1.9726893901824951, "learning_rate": 9.509680706521739e-05, "loss": 
0.9184, "step": 4788 }, { "epoch": 0.7496869129618033, "grad_norm": 4.9134202003479, "learning_rate": 9.508491847826087e-05, "loss": 0.9702, "step": 4789 }, { "epoch": 0.7498434564809017, "grad_norm": 2.382641077041626, "learning_rate": 9.507302989130435e-05, "loss": 1.4841, "step": 4790 }, { "epoch": 0.75, "grad_norm": 5.924990653991699, "learning_rate": 9.506114130434782e-05, "loss": 1.3906, "step": 4791 }, { "epoch": 0.7501565435190983, "grad_norm": 3.582961320877075, "learning_rate": 9.50492527173913e-05, "loss": 1.3234, "step": 4792 }, { "epoch": 0.7503130870381967, "grad_norm": 3.741305351257324, "learning_rate": 9.503736413043478e-05, "loss": 1.6778, "step": 4793 }, { "epoch": 0.7504696305572949, "grad_norm": 3.513657569885254, "learning_rate": 9.502547554347826e-05, "loss": 1.5883, "step": 4794 }, { "epoch": 0.7506261740763932, "grad_norm": 2.2735915184020996, "learning_rate": 9.501358695652172e-05, "loss": 1.3378, "step": 4795 }, { "epoch": 0.7507827175954915, "grad_norm": 2.5143234729766846, "learning_rate": 9.50016983695652e-05, "loss": 0.5364, "step": 4796 }, { "epoch": 0.7509392611145899, "grad_norm": 1.4831533432006836, "learning_rate": 9.498980978260868e-05, "loss": 0.5212, "step": 4797 }, { "epoch": 0.7510958046336882, "grad_norm": 2.5491783618927, "learning_rate": 9.497792119565216e-05, "loss": 1.0498, "step": 4798 }, { "epoch": 0.7512523481527865, "grad_norm": 2.315971851348877, "learning_rate": 9.496603260869564e-05, "loss": 1.0358, "step": 4799 }, { "epoch": 0.7514088916718847, "grad_norm": 2.648317813873291, "learning_rate": 9.495414402173911e-05, "loss": 1.1499, "step": 4800 }, { "epoch": 0.7515654351909831, "grad_norm": 0.5628464818000793, "learning_rate": 9.494225543478259e-05, "loss": 0.3271, "step": 4801 }, { "epoch": 0.7517219787100814, "grad_norm": 0.4414207935333252, "learning_rate": 9.493036684782607e-05, "loss": 0.2554, "step": 4802 }, { "epoch": 0.7518785222291797, "grad_norm": 0.6465880274772644, "learning_rate": 
9.491847826086955e-05, "loss": 0.329, "step": 4803 }, { "epoch": 0.752035065748278, "grad_norm": 0.8311790823936462, "learning_rate": 9.490658967391304e-05, "loss": 0.2537, "step": 4804 }, { "epoch": 0.7521916092673764, "grad_norm": 0.4551326334476471, "learning_rate": 9.489470108695652e-05, "loss": 0.2861, "step": 4805 }, { "epoch": 0.7523481527864746, "grad_norm": 0.8220376372337341, "learning_rate": 9.48828125e-05, "loss": 0.3418, "step": 4806 }, { "epoch": 0.7525046963055729, "grad_norm": 1.224426507949829, "learning_rate": 9.487092391304347e-05, "loss": 0.3676, "step": 4807 }, { "epoch": 0.7526612398246713, "grad_norm": 0.7546990513801575, "learning_rate": 9.485903532608695e-05, "loss": 0.374, "step": 4808 }, { "epoch": 0.7528177833437696, "grad_norm": 0.6211254000663757, "learning_rate": 9.484714673913043e-05, "loss": 0.351, "step": 4809 }, { "epoch": 0.7529743268628679, "grad_norm": 0.6387569308280945, "learning_rate": 9.483525815217391e-05, "loss": 0.2189, "step": 4810 }, { "epoch": 0.7531308703819661, "grad_norm": 1.404960036277771, "learning_rate": 9.482336956521739e-05, "loss": 0.4992, "step": 4811 }, { "epoch": 0.7532874139010645, "grad_norm": 1.3530001640319824, "learning_rate": 9.481148097826087e-05, "loss": 0.3234, "step": 4812 }, { "epoch": 0.7534439574201628, "grad_norm": 0.6103454232215881, "learning_rate": 9.479959239130434e-05, "loss": 0.2672, "step": 4813 }, { "epoch": 0.7536005009392611, "grad_norm": 1.8715227842330933, "learning_rate": 9.478770380434782e-05, "loss": 0.2896, "step": 4814 }, { "epoch": 0.7537570444583594, "grad_norm": 1.316591739654541, "learning_rate": 9.47758152173913e-05, "loss": 0.434, "step": 4815 }, { "epoch": 0.7539135879774578, "grad_norm": 1.4193220138549805, "learning_rate": 9.476392663043478e-05, "loss": 0.5536, "step": 4816 }, { "epoch": 0.754070131496556, "grad_norm": 1.2599214315414429, "learning_rate": 9.475203804347826e-05, "loss": 0.4232, "step": 4817 }, { "epoch": 0.7542266750156543, "grad_norm": 
2.3969743251800537, "learning_rate": 9.474014945652172e-05, "loss": 0.5816, "step": 4818 }, { "epoch": 0.7543832185347527, "grad_norm": 0.8212363123893738, "learning_rate": 9.47282608695652e-05, "loss": 0.3044, "step": 4819 }, { "epoch": 0.754539762053851, "grad_norm": 1.4407825469970703, "learning_rate": 9.471637228260868e-05, "loss": 0.3258, "step": 4820 }, { "epoch": 0.7546963055729493, "grad_norm": 1.43205988407135, "learning_rate": 9.470448369565216e-05, "loss": 0.3956, "step": 4821 }, { "epoch": 0.7548528490920476, "grad_norm": 1.245641827583313, "learning_rate": 9.469259510869563e-05, "loss": 0.6232, "step": 4822 }, { "epoch": 0.7550093926111459, "grad_norm": 1.542025089263916, "learning_rate": 9.468070652173911e-05, "loss": 0.5113, "step": 4823 }, { "epoch": 0.7551659361302442, "grad_norm": 1.881481647491455, "learning_rate": 9.46688179347826e-05, "loss": 0.5712, "step": 4824 }, { "epoch": 0.7553224796493425, "grad_norm": 3.9492197036743164, "learning_rate": 9.465692934782608e-05, "loss": 0.6017, "step": 4825 }, { "epoch": 0.7554790231684408, "grad_norm": 2.896026372909546, "learning_rate": 9.464504076086956e-05, "loss": 0.6682, "step": 4826 }, { "epoch": 0.7556355666875392, "grad_norm": 1.1921778917312622, "learning_rate": 9.463315217391304e-05, "loss": 0.58, "step": 4827 }, { "epoch": 0.7557921102066374, "grad_norm": 6.091293811798096, "learning_rate": 9.462126358695652e-05, "loss": 0.6782, "step": 4828 }, { "epoch": 0.7559486537257357, "grad_norm": 2.51035475730896, "learning_rate": 9.4609375e-05, "loss": 0.4901, "step": 4829 }, { "epoch": 0.756105197244834, "grad_norm": 2.2754321098327637, "learning_rate": 9.459748641304347e-05, "loss": 0.8614, "step": 4830 }, { "epoch": 0.7562617407639324, "grad_norm": 2.14615797996521, "learning_rate": 9.458559782608695e-05, "loss": 0.8948, "step": 4831 }, { "epoch": 0.7564182842830307, "grad_norm": 2.4224283695220947, "learning_rate": 9.457370923913043e-05, "loss": 1.1003, "step": 4832 }, { "epoch": 
0.756574827802129, "grad_norm": 2.835547685623169, "learning_rate": 9.45618206521739e-05, "loss": 1.1248, "step": 4833 }, { "epoch": 0.7567313713212273, "grad_norm": 3.542811632156372, "learning_rate": 9.454993206521738e-05, "loss": 0.5641, "step": 4834 }, { "epoch": 0.7568879148403256, "grad_norm": 2.5606157779693604, "learning_rate": 9.453804347826086e-05, "loss": 0.9408, "step": 4835 }, { "epoch": 0.7570444583594239, "grad_norm": 2.654442548751831, "learning_rate": 9.452615489130434e-05, "loss": 0.9191, "step": 4836 }, { "epoch": 0.7572010018785222, "grad_norm": 3.117924451828003, "learning_rate": 9.451426630434782e-05, "loss": 1.0532, "step": 4837 }, { "epoch": 0.7573575453976206, "grad_norm": 2.7334351539611816, "learning_rate": 9.45023777173913e-05, "loss": 1.0597, "step": 4838 }, { "epoch": 0.7575140889167189, "grad_norm": 2.5398058891296387, "learning_rate": 9.449048913043478e-05, "loss": 0.6711, "step": 4839 }, { "epoch": 0.7576706324358171, "grad_norm": 2.4723563194274902, "learning_rate": 9.447860054347825e-05, "loss": 1.1417, "step": 4840 }, { "epoch": 0.7578271759549154, "grad_norm": 5.379875659942627, "learning_rate": 9.446671195652172e-05, "loss": 1.3684, "step": 4841 }, { "epoch": 0.7579837194740138, "grad_norm": 2.515951156616211, "learning_rate": 9.44548233695652e-05, "loss": 1.3833, "step": 4842 }, { "epoch": 0.7581402629931121, "grad_norm": 3.2667489051818848, "learning_rate": 9.444293478260867e-05, "loss": 1.7795, "step": 4843 }, { "epoch": 0.7582968065122104, "grad_norm": 2.5143392086029053, "learning_rate": 9.443104619565217e-05, "loss": 1.3814, "step": 4844 }, { "epoch": 0.7584533500313086, "grad_norm": 2.376826763153076, "learning_rate": 9.441915760869564e-05, "loss": 1.3495, "step": 4845 }, { "epoch": 0.758609893550407, "grad_norm": 1.247537612915039, "learning_rate": 9.440726902173912e-05, "loss": 0.352, "step": 4846 }, { "epoch": 0.7587664370695053, "grad_norm": 1.761738657951355, "learning_rate": 9.43953804347826e-05, "loss": 0.819, 
"step": 4847 }, { "epoch": 0.7589229805886036, "grad_norm": 1.758331060409546, "learning_rate": 9.438349184782608e-05, "loss": 0.3681, "step": 4848 }, { "epoch": 0.759079524107702, "grad_norm": 2.8479857444763184, "learning_rate": 9.437160326086956e-05, "loss": 0.86, "step": 4849 }, { "epoch": 0.7592360676268003, "grad_norm": 3.0261714458465576, "learning_rate": 9.435971467391304e-05, "loss": 1.0657, "step": 4850 }, { "epoch": 0.7593926111458985, "grad_norm": 0.4226309061050415, "learning_rate": 9.434782608695651e-05, "loss": 0.2173, "step": 4851 }, { "epoch": 0.7595491546649968, "grad_norm": 0.4465206265449524, "learning_rate": 9.433593749999999e-05, "loss": 0.25, "step": 4852 }, { "epoch": 0.7597056981840952, "grad_norm": 0.5547592043876648, "learning_rate": 9.432404891304347e-05, "loss": 0.2522, "step": 4853 }, { "epoch": 0.7598622417031935, "grad_norm": 0.5369309186935425, "learning_rate": 9.431216032608695e-05, "loss": 0.1853, "step": 4854 }, { "epoch": 0.7600187852222918, "grad_norm": 0.904988706111908, "learning_rate": 9.430027173913043e-05, "loss": 0.4253, "step": 4855 }, { "epoch": 0.7601753287413902, "grad_norm": 0.5872569680213928, "learning_rate": 9.42883831521739e-05, "loss": 0.2792, "step": 4856 }, { "epoch": 0.7603318722604884, "grad_norm": 0.8798103928565979, "learning_rate": 9.427649456521738e-05, "loss": 0.2859, "step": 4857 }, { "epoch": 0.7604884157795867, "grad_norm": 0.7089309692382812, "learning_rate": 9.426460597826086e-05, "loss": 0.2545, "step": 4858 }, { "epoch": 0.760644959298685, "grad_norm": 1.3868992328643799, "learning_rate": 9.425271739130434e-05, "loss": 0.3864, "step": 4859 }, { "epoch": 0.7608015028177834, "grad_norm": 1.194176197052002, "learning_rate": 9.424082880434782e-05, "loss": 0.4315, "step": 4860 }, { "epoch": 0.7609580463368817, "grad_norm": 0.796387255191803, "learning_rate": 9.422894021739131e-05, "loss": 0.4184, "step": 4861 }, { "epoch": 0.76111458985598, "grad_norm": 1.2200770378112793, "learning_rate": 
9.421705163043479e-05, "loss": 0.5097, "step": 4862 }, { "epoch": 0.7612711333750782, "grad_norm": 1.4070953130722046, "learning_rate": 9.420516304347826e-05, "loss": 0.4115, "step": 4863 }, { "epoch": 0.7614276768941766, "grad_norm": 1.216076135635376, "learning_rate": 9.419327445652173e-05, "loss": 0.4448, "step": 4864 }, { "epoch": 0.7615842204132749, "grad_norm": 1.9642174243927002, "learning_rate": 9.418138586956521e-05, "loss": 0.6377, "step": 4865 }, { "epoch": 0.7617407639323732, "grad_norm": 0.7803585529327393, "learning_rate": 9.416949728260869e-05, "loss": 0.2699, "step": 4866 }, { "epoch": 0.7618973074514716, "grad_norm": 2.4552650451660156, "learning_rate": 9.415760869565216e-05, "loss": 0.5385, "step": 4867 }, { "epoch": 0.7620538509705698, "grad_norm": 1.604275107383728, "learning_rate": 9.414572010869564e-05, "loss": 0.7432, "step": 4868 }, { "epoch": 0.7622103944896681, "grad_norm": 2.4252769947052, "learning_rate": 9.413383152173912e-05, "loss": 0.7012, "step": 4869 }, { "epoch": 0.7623669380087664, "grad_norm": 1.856266975402832, "learning_rate": 9.41219429347826e-05, "loss": 0.5623, "step": 4870 }, { "epoch": 0.7625234815278648, "grad_norm": 3.1551449298858643, "learning_rate": 9.411005434782608e-05, "loss": 0.4643, "step": 4871 }, { "epoch": 0.7626800250469631, "grad_norm": 1.3406792879104614, "learning_rate": 9.409816576086955e-05, "loss": 0.4186, "step": 4872 }, { "epoch": 0.7628365685660614, "grad_norm": 1.6723238229751587, "learning_rate": 9.408627717391303e-05, "loss": 0.4512, "step": 4873 }, { "epoch": 0.7629931120851596, "grad_norm": 3.1178135871887207, "learning_rate": 9.407438858695651e-05, "loss": 0.9487, "step": 4874 }, { "epoch": 0.763149655604258, "grad_norm": 2.7199130058288574, "learning_rate": 9.406249999999999e-05, "loss": 0.6258, "step": 4875 }, { "epoch": 0.7633061991233563, "grad_norm": 2.2668631076812744, "learning_rate": 9.405061141304347e-05, "loss": 0.3745, "step": 4876 }, { "epoch": 0.7634627426424546, "grad_norm": 
2.275179624557495, "learning_rate": 9.403872282608695e-05, "loss": 0.5923, "step": 4877 }, { "epoch": 0.763619286161553, "grad_norm": 1.8642381429672241, "learning_rate": 9.402683423913042e-05, "loss": 0.5269, "step": 4878 }, { "epoch": 0.7637758296806513, "grad_norm": 3.0518617630004883, "learning_rate": 9.40149456521739e-05, "loss": 0.7895, "step": 4879 }, { "epoch": 0.7639323731997495, "grad_norm": 2.4241714477539062, "learning_rate": 9.400305706521738e-05, "loss": 0.9758, "step": 4880 }, { "epoch": 0.7640889167188478, "grad_norm": 5.117717742919922, "learning_rate": 9.399116847826087e-05, "loss": 0.7801, "step": 4881 }, { "epoch": 0.7642454602379462, "grad_norm": 3.4158992767333984, "learning_rate": 9.397927989130435e-05, "loss": 0.9398, "step": 4882 }, { "epoch": 0.7644020037570445, "grad_norm": 4.0195231437683105, "learning_rate": 9.396739130434783e-05, "loss": 1.0646, "step": 4883 }, { "epoch": 0.7645585472761428, "grad_norm": 3.007345676422119, "learning_rate": 9.39555027173913e-05, "loss": 0.9707, "step": 4884 }, { "epoch": 0.764715090795241, "grad_norm": 4.07368278503418, "learning_rate": 9.394361413043478e-05, "loss": 1.1362, "step": 4885 }, { "epoch": 0.7648716343143394, "grad_norm": 4.089749813079834, "learning_rate": 9.393172554347826e-05, "loss": 0.7763, "step": 4886 }, { "epoch": 0.7650281778334377, "grad_norm": 2.2085750102996826, "learning_rate": 9.391983695652173e-05, "loss": 0.928, "step": 4887 }, { "epoch": 0.765184721352536, "grad_norm": 2.2536983489990234, "learning_rate": 9.39079483695652e-05, "loss": 1.0305, "step": 4888 }, { "epoch": 0.7653412648716343, "grad_norm": 2.40940260887146, "learning_rate": 9.389605978260868e-05, "loss": 1.1171, "step": 4889 }, { "epoch": 0.7654978083907327, "grad_norm": 3.5781447887420654, "learning_rate": 9.388417119565216e-05, "loss": 0.9567, "step": 4890 }, { "epoch": 0.7656543519098309, "grad_norm": 3.1497230529785156, "learning_rate": 9.387228260869564e-05, "loss": 1.4171, "step": 4891 }, { "epoch": 
0.7658108954289292, "grad_norm": 3.8695733547210693, "learning_rate": 9.386039402173912e-05, "loss": 0.8516, "step": 4892 }, { "epoch": 0.7659674389480275, "grad_norm": 2.4286696910858154, "learning_rate": 9.38485054347826e-05, "loss": 0.9552, "step": 4893 }, { "epoch": 0.7661239824671259, "grad_norm": 2.557781934738159, "learning_rate": 9.383661684782607e-05, "loss": 1.4645, "step": 4894 }, { "epoch": 0.7662805259862242, "grad_norm": 2.7860097885131836, "learning_rate": 9.382472826086955e-05, "loss": 2.2733, "step": 4895 }, { "epoch": 0.7664370695053225, "grad_norm": 2.3535985946655273, "learning_rate": 9.381283967391303e-05, "loss": 0.8871, "step": 4896 }, { "epoch": 0.7665936130244208, "grad_norm": NaN, "learning_rate": 9.381283967391303e-05, "loss": 0.0, "step": 4897 }, { "epoch": 0.7667501565435191, "grad_norm": 4.227570056915283, "learning_rate": 9.380095108695651e-05, "loss": 0.8405, "step": 4898 }, { "epoch": 0.7669067000626174, "grad_norm": 3.2194440364837646, "learning_rate": 9.378906249999999e-05, "loss": 0.9697, "step": 4899 }, { "epoch": 0.7670632435817157, "grad_norm": 2.0597171783447266, "learning_rate": 9.377717391304346e-05, "loss": 0.6616, "step": 4900 }, { "epoch": 0.7672197871008141, "grad_norm": 0.6304075121879578, "learning_rate": 9.376528532608694e-05, "loss": 0.3096, "step": 4901 }, { "epoch": 0.7673763306199123, "grad_norm": 0.6210527420043945, "learning_rate": 9.375339673913043e-05, "loss": 0.3054, "step": 4902 }, { "epoch": 0.7675328741390106, "grad_norm": 0.7041450142860413, "learning_rate": 9.374150815217391e-05, "loss": 0.2896, "step": 4903 }, { "epoch": 0.7676894176581089, "grad_norm": 2.1157753467559814, "learning_rate": 9.372961956521739e-05, "loss": 0.466, "step": 4904 }, { "epoch": 0.7678459611772073, "grad_norm": 0.7002118229866028, "learning_rate": 9.371773097826087e-05, "loss": 0.2728, "step": 4905 }, { "epoch": 0.7680025046963056, "grad_norm": 1.0115981101989746, "learning_rate": 9.370584239130435e-05, "loss": 0.3841, "step": 
4906 }, { "epoch": 0.7681590482154039, "grad_norm": 0.5723740458488464, "learning_rate": 9.369395380434783e-05, "loss": 0.2355, "step": 4907 }, { "epoch": 0.7683155917345021, "grad_norm": 0.9875540733337402, "learning_rate": 9.36820652173913e-05, "loss": 0.3165, "step": 4908 }, { "epoch": 0.7684721352536005, "grad_norm": 0.8502452969551086, "learning_rate": 9.367017663043478e-05, "loss": 0.2688, "step": 4909 }, { "epoch": 0.7686286787726988, "grad_norm": 0.9966556429862976, "learning_rate": 9.365828804347826e-05, "loss": 0.4991, "step": 4910 }, { "epoch": 0.7687852222917971, "grad_norm": 0.893009603023529, "learning_rate": 9.364639945652172e-05, "loss": 0.3812, "step": 4911 }, { "epoch": 0.7689417658108955, "grad_norm": 0.9345742464065552, "learning_rate": 9.36345108695652e-05, "loss": 0.5183, "step": 4912 }, { "epoch": 0.7690983093299938, "grad_norm": 1.080954909324646, "learning_rate": 9.362262228260868e-05, "loss": 0.411, "step": 4913 }, { "epoch": 0.769254852849092, "grad_norm": 0.8507555723190308, "learning_rate": 9.361073369565216e-05, "loss": 0.2828, "step": 4914 }, { "epoch": 0.7694113963681903, "grad_norm": 2.7921273708343506, "learning_rate": 9.359884510869564e-05, "loss": 0.3842, "step": 4915 }, { "epoch": 0.7695679398872887, "grad_norm": 1.1294119358062744, "learning_rate": 9.358695652173912e-05, "loss": 0.2903, "step": 4916 }, { "epoch": 0.769724483406387, "grad_norm": 2.160783529281616, "learning_rate": 9.35750679347826e-05, "loss": 0.477, "step": 4917 }, { "epoch": 0.7698810269254853, "grad_norm": 2.4826340675354004, "learning_rate": 9.356317934782607e-05, "loss": 0.6963, "step": 4918 }, { "epoch": 0.7700375704445835, "grad_norm": 1.2169830799102783, "learning_rate": 9.355129076086955e-05, "loss": 0.4076, "step": 4919 }, { "epoch": 0.7701941139636819, "grad_norm": 1.7800770998001099, "learning_rate": 9.353940217391303e-05, "loss": 0.5067, "step": 4920 }, { "epoch": 0.7703506574827802, "grad_norm": 1.989539623260498, "learning_rate": 
9.35275135869565e-05, "loss": 0.5213, "step": 4921 }, { "epoch": 0.7705072010018785, "grad_norm": 2.140895128250122, "learning_rate": 9.3515625e-05, "loss": 0.619, "step": 4922 }, { "epoch": 0.7706637445209769, "grad_norm": 2.178298234939575, "learning_rate": 9.350373641304348e-05, "loss": 0.6624, "step": 4923 }, { "epoch": 0.7708202880400752, "grad_norm": 1.3059680461883545, "learning_rate": 9.349184782608695e-05, "loss": 0.4987, "step": 4924 }, { "epoch": 0.7709768315591734, "grad_norm": 1.6404166221618652, "learning_rate": 9.347995923913043e-05, "loss": 0.4047, "step": 4925 }, { "epoch": 0.7711333750782717, "grad_norm": 1.6002541780471802, "learning_rate": 9.346807065217391e-05, "loss": 0.7402, "step": 4926 }, { "epoch": 0.7712899185973701, "grad_norm": 8.006962776184082, "learning_rate": 9.345618206521739e-05, "loss": 0.9933, "step": 4927 }, { "epoch": 0.7714464621164684, "grad_norm": 2.7961108684539795, "learning_rate": 9.344429347826087e-05, "loss": 1.3477, "step": 4928 }, { "epoch": 0.7716030056355667, "grad_norm": 1.4518694877624512, "learning_rate": 9.343240489130435e-05, "loss": 0.6181, "step": 4929 }, { "epoch": 0.771759549154665, "grad_norm": 1.8338056802749634, "learning_rate": 9.342051630434782e-05, "loss": 0.5472, "step": 4930 }, { "epoch": 0.7719160926737633, "grad_norm": 2.5989444255828857, "learning_rate": 9.34086277173913e-05, "loss": 0.7801, "step": 4931 }, { "epoch": 0.7720726361928616, "grad_norm": 2.0683727264404297, "learning_rate": 9.339673913043478e-05, "loss": 0.7698, "step": 4932 }, { "epoch": 0.7722291797119599, "grad_norm": 1.1811422109603882, "learning_rate": 9.338485054347826e-05, "loss": 0.4572, "step": 4933 }, { "epoch": 0.7723857232310583, "grad_norm": 4.0500335693359375, "learning_rate": 9.337296195652172e-05, "loss": 1.0926, "step": 4934 }, { "epoch": 0.7725422667501566, "grad_norm": 2.766352653503418, "learning_rate": 9.33610733695652e-05, "loss": 0.8087, "step": 4935 }, { "epoch": 0.7726988102692548, "grad_norm": 
1.8342289924621582, "learning_rate": 9.334918478260868e-05, "loss": 0.706, "step": 4936 }, { "epoch": 0.7728553537883531, "grad_norm": 2.3889713287353516, "learning_rate": 9.333729619565216e-05, "loss": 1.354, "step": 4937 }, { "epoch": 0.7730118973074515, "grad_norm": 3.2101640701293945, "learning_rate": 9.332540760869563e-05, "loss": 1.2056, "step": 4938 }, { "epoch": 0.7731684408265498, "grad_norm": 1.452364206314087, "learning_rate": 9.331351902173911e-05, "loss": 0.6885, "step": 4939 }, { "epoch": 0.7733249843456481, "grad_norm": 1.9743738174438477, "learning_rate": 9.330163043478259e-05, "loss": 0.974, "step": 4940 }, { "epoch": 0.7734815278647464, "grad_norm": 3.299337863922119, "learning_rate": 9.328974184782607e-05, "loss": 1.2464, "step": 4941 }, { "epoch": 0.7736380713838447, "grad_norm": 3.452582359313965, "learning_rate": 9.327785326086956e-05, "loss": 1.1441, "step": 4942 }, { "epoch": 0.773794614902943, "grad_norm": 1.909281611442566, "learning_rate": 9.326596467391304e-05, "loss": 0.8937, "step": 4943 }, { "epoch": 0.7739511584220413, "grad_norm": 2.718191146850586, "learning_rate": 9.325407608695652e-05, "loss": 1.2463, "step": 4944 }, { "epoch": 0.7741077019411396, "grad_norm": 2.436645746231079, "learning_rate": 9.32421875e-05, "loss": 1.5744, "step": 4945 }, { "epoch": 0.774264245460238, "grad_norm": 2.3774540424346924, "learning_rate": 9.323029891304347e-05, "loss": 0.6416, "step": 4946 }, { "epoch": 0.7744207889793363, "grad_norm": 4.493185043334961, "learning_rate": 9.321841032608695e-05, "loss": 0.9919, "step": 4947 }, { "epoch": 0.7745773324984345, "grad_norm": 4.44124174118042, "learning_rate": 9.320652173913043e-05, "loss": 0.7098, "step": 4948 }, { "epoch": 0.7747338760175329, "grad_norm": 4.378411769866943, "learning_rate": 9.319463315217391e-05, "loss": 1.1196, "step": 4949 }, { "epoch": 0.7748904195366312, "grad_norm": 1.7359799146652222, "learning_rate": 9.318274456521739e-05, "loss": 0.5702, "step": 4950 }, { "epoch": 
0.7750469630557295, "grad_norm": 0.9319252371788025, "learning_rate": 9.317085597826086e-05, "loss": 0.2751, "step": 4951 }, { "epoch": 0.7752035065748278, "grad_norm": 1.0348061323165894, "learning_rate": 9.315896739130434e-05, "loss": 0.7265, "step": 4952 }, { "epoch": 0.7753600500939261, "grad_norm": 0.7444149851799011, "learning_rate": 9.314707880434782e-05, "loss": 0.3697, "step": 4953 }, { "epoch": 0.7755165936130244, "grad_norm": 0.47092360258102417, "learning_rate": 9.31351902173913e-05, "loss": 0.2173, "step": 4954 }, { "epoch": 0.7756731371321227, "grad_norm": 1.2624183893203735, "learning_rate": 9.312330163043478e-05, "loss": 0.4809, "step": 4955 }, { "epoch": 0.775829680651221, "grad_norm": 1.1168748140335083, "learning_rate": 9.311141304347826e-05, "loss": 0.4062, "step": 4956 }, { "epoch": 0.7759862241703194, "grad_norm": 1.1442910432815552, "learning_rate": 9.309952445652172e-05, "loss": 0.3511, "step": 4957 }, { "epoch": 0.7761427676894177, "grad_norm": 0.8298973441123962, "learning_rate": 9.30876358695652e-05, "loss": 0.3687, "step": 4958 }, { "epoch": 0.7762993112085159, "grad_norm": 0.9854577779769897, "learning_rate": 9.307574728260868e-05, "loss": 0.4418, "step": 4959 }, { "epoch": 0.7764558547276142, "grad_norm": 0.7722923755645752, "learning_rate": 9.306385869565215e-05, "loss": 0.2454, "step": 4960 }, { "epoch": 0.7766123982467126, "grad_norm": 0.848616361618042, "learning_rate": 9.305197010869565e-05, "loss": 0.3026, "step": 4961 }, { "epoch": 0.7767689417658109, "grad_norm": 1.0151417255401611, "learning_rate": 9.304008152173912e-05, "loss": 0.3651, "step": 4962 }, { "epoch": 0.7769254852849092, "grad_norm": 0.9920995831489563, "learning_rate": 9.30281929347826e-05, "loss": 0.447, "step": 4963 }, { "epoch": 0.7770820288040076, "grad_norm": 8.958861351013184, "learning_rate": 9.301630434782608e-05, "loss": 1.0115, "step": 4964 }, { "epoch": 0.7772385723231058, "grad_norm": 1.8432679176330566, "learning_rate": 9.300441576086956e-05, "loss": 
0.5407, "step": 4965 }, { "epoch": 0.7773951158422041, "grad_norm": 1.1635173559188843, "learning_rate": 9.299252717391304e-05, "loss": 0.4318, "step": 4966 }, { "epoch": 0.7775516593613024, "grad_norm": 2.178474187850952, "learning_rate": 9.298063858695651e-05, "loss": 0.5449, "step": 4967 }, { "epoch": 0.7777082028804008, "grad_norm": 1.6616100072860718, "learning_rate": 9.296874999999999e-05, "loss": 0.3832, "step": 4968 }, { "epoch": 0.7778647463994991, "grad_norm": 1.161520004272461, "learning_rate": 9.295686141304347e-05, "loss": 0.4951, "step": 4969 }, { "epoch": 0.7780212899185974, "grad_norm": 1.192434549331665, "learning_rate": 9.294497282608695e-05, "loss": 0.3792, "step": 4970 }, { "epoch": 0.7781778334376956, "grad_norm": 3.2194557189941406, "learning_rate": 9.293308423913043e-05, "loss": 0.685, "step": 4971 }, { "epoch": 0.778334376956794, "grad_norm": 0.9109926819801331, "learning_rate": 9.29211956521739e-05, "loss": 0.4705, "step": 4972 }, { "epoch": 0.7784909204758923, "grad_norm": 2.2255444526672363, "learning_rate": 9.290930706521738e-05, "loss": 0.4789, "step": 4973 }, { "epoch": 0.7786474639949906, "grad_norm": 1.7087409496307373, "learning_rate": 9.289741847826086e-05, "loss": 0.4404, "step": 4974 }, { "epoch": 0.778804007514089, "grad_norm": 1.4297690391540527, "learning_rate": 9.288552989130434e-05, "loss": 0.4779, "step": 4975 }, { "epoch": 0.7789605510331872, "grad_norm": 3.6916770935058594, "learning_rate": 9.287364130434782e-05, "loss": 1.1262, "step": 4976 }, { "epoch": 0.7791170945522855, "grad_norm": 2.3606367111206055, "learning_rate": 9.28617527173913e-05, "loss": 0.7712, "step": 4977 }, { "epoch": 0.7792736380713838, "grad_norm": 2.634479284286499, "learning_rate": 9.284986413043477e-05, "loss": 0.7842, "step": 4978 }, { "epoch": 0.7794301815904822, "grad_norm": 2.819321632385254, "learning_rate": 9.283797554347827e-05, "loss": 1.2567, "step": 4979 }, { "epoch": 0.7795867251095805, "grad_norm": 1.5821731090545654, "learning_rate": 
9.282608695652172e-05, "loss": 0.6329, "step": 4980 }, { "epoch": 0.7797432686286788, "grad_norm": 2.2171661853790283, "learning_rate": 9.281419836956521e-05, "loss": 0.8573, "step": 4981 }, { "epoch": 0.779899812147777, "grad_norm": 1.930423617362976, "learning_rate": 9.280230978260869e-05, "loss": 0.9734, "step": 4982 }, { "epoch": 0.7800563556668754, "grad_norm": 1.7816969156265259, "learning_rate": 9.279042119565217e-05, "loss": 0.5485, "step": 4983 }, { "epoch": 0.7802128991859737, "grad_norm": 14.197225570678711, "learning_rate": 9.277853260869564e-05, "loss": 1.4978, "step": 4984 }, { "epoch": 0.780369442705072, "grad_norm": 2.545103073120117, "learning_rate": 9.276664402173912e-05, "loss": 0.9852, "step": 4985 }, { "epoch": 0.7805259862241704, "grad_norm": 2.6654446125030518, "learning_rate": 9.27547554347826e-05, "loss": 0.9737, "step": 4986 }, { "epoch": 0.7806825297432687, "grad_norm": 3.761345863342285, "learning_rate": 9.274286684782608e-05, "loss": 1.2093, "step": 4987 }, { "epoch": 0.7808390732623669, "grad_norm": 2.443068027496338, "learning_rate": 9.273097826086956e-05, "loss": 1.2332, "step": 4988 }, { "epoch": 0.7809956167814652, "grad_norm": 4.837691307067871, "learning_rate": 9.271908967391303e-05, "loss": 1.281, "step": 4989 }, { "epoch": 0.7811521603005636, "grad_norm": 7.481253147125244, "learning_rate": 9.270720108695651e-05, "loss": 0.965, "step": 4990 }, { "epoch": 0.7813087038196619, "grad_norm": 2.461653470993042, "learning_rate": 9.269531249999999e-05, "loss": 1.2553, "step": 4991 }, { "epoch": 0.7814652473387602, "grad_norm": 2.9148612022399902, "learning_rate": 9.268342391304347e-05, "loss": 1.5393, "step": 4992 }, { "epoch": 0.7816217908578584, "grad_norm": 4.53103494644165, "learning_rate": 9.267153532608695e-05, "loss": 1.0091, "step": 4993 }, { "epoch": 0.7817783343769568, "grad_norm": 1.961355447769165, "learning_rate": 9.265964673913043e-05, "loss": 1.2144, "step": 4994 }, { "epoch": 0.7819348778960551, "grad_norm": 
3.9124298095703125, "learning_rate": 9.26477581521739e-05, "loss": 1.2595, "step": 4995 }, { "epoch": 0.7820914214151534, "grad_norm": 5.025557041168213, "learning_rate": 9.263586956521738e-05, "loss": 0.7114, "step": 4996 }, { "epoch": 0.7822479649342517, "grad_norm": 3.1193740367889404, "learning_rate": 9.262398097826086e-05, "loss": 0.5283, "step": 4997 }, { "epoch": 0.7824045084533501, "grad_norm": 3.2161612510681152, "learning_rate": 9.261209239130434e-05, "loss": 1.008, "step": 4998 }, { "epoch": 0.7825610519724483, "grad_norm": 2.684314727783203, "learning_rate": 9.260020380434783e-05, "loss": 0.806, "step": 4999 }, { "epoch": 0.7827175954915466, "grad_norm": 1.9209939241409302, "learning_rate": 9.258831521739131e-05, "loss": 0.85, "step": 5000 }, { "epoch": 0.7827175954915466, "eval_loss": 0.5387270450592041, "eval_runtime": 205.7353, "eval_samples_per_second": 60.189, "eval_steps_per_second": 3.762, "eval_wer": 0.3537730837069936, "step": 5000 }, { "epoch": 0.782874139010645, "grad_norm": 0.9356840252876282, "learning_rate": 9.257642663043479e-05, "loss": 0.5084, "step": 5001 }, { "epoch": 0.7830306825297433, "grad_norm": 0.5864033699035645, "learning_rate": 9.256453804347826e-05, "loss": 0.256, "step": 5002 }, { "epoch": 0.7831872260488416, "grad_norm": 0.6143030524253845, "learning_rate": 9.255264945652173e-05, "loss": 0.2315, "step": 5003 }, { "epoch": 0.7833437695679399, "grad_norm": 0.7045772671699524, "learning_rate": 9.25407608695652e-05, "loss": 0.2364, "step": 5004 }, { "epoch": 0.7835003130870382, "grad_norm": 0.7820684313774109, "learning_rate": 9.252887228260868e-05, "loss": 0.3416, "step": 5005 }, { "epoch": 0.7836568566061365, "grad_norm": 0.9582191109657288, "learning_rate": 9.251698369565216e-05, "loss": 0.221, "step": 5006 }, { "epoch": 0.7838134001252348, "grad_norm": 0.7094539403915405, "learning_rate": 9.250509510869564e-05, "loss": 0.2551, "step": 5007 }, { "epoch": 0.7839699436443331, "grad_norm": 0.6800323724746704, "learning_rate": 
9.249320652173912e-05, "loss": 0.297, "step": 5008 }, { "epoch": 0.7841264871634315, "grad_norm": 0.9111486077308655, "learning_rate": 9.24813179347826e-05, "loss": 0.3683, "step": 5009 }, { "epoch": 0.7842830306825297, "grad_norm": 1.0616520643234253, "learning_rate": 9.246942934782608e-05, "loss": 0.2958, "step": 5010 }, { "epoch": 0.784439574201628, "grad_norm": 0.8273745775222778, "learning_rate": 9.245754076086955e-05, "loss": 0.3305, "step": 5011 }, { "epoch": 0.7845961177207263, "grad_norm": 1.6020891666412354, "learning_rate": 9.244565217391303e-05, "loss": 0.6555, "step": 5012 }, { "epoch": 0.7847526612398247, "grad_norm": 0.6864603757858276, "learning_rate": 9.243376358695651e-05, "loss": 0.5425, "step": 5013 }, { "epoch": 0.784909204758923, "grad_norm": 1.4303507804870605, "learning_rate": 9.242187499999999e-05, "loss": 0.4789, "step": 5014 }, { "epoch": 0.7850657482780213, "grad_norm": 1.591371774673462, "learning_rate": 9.240998641304347e-05, "loss": 0.4698, "step": 5015 }, { "epoch": 0.7852222917971196, "grad_norm": 1.686511516571045, "learning_rate": 9.239809782608694e-05, "loss": 0.663, "step": 5016 }, { "epoch": 0.7853788353162179, "grad_norm": 1.2187076807022095, "learning_rate": 9.238620923913042e-05, "loss": 0.2544, "step": 5017 }, { "epoch": 0.7855353788353162, "grad_norm": 2.07966947555542, "learning_rate": 9.23743206521739e-05, "loss": 0.4983, "step": 5018 }, { "epoch": 0.7856919223544145, "grad_norm": 1.0877245664596558, "learning_rate": 9.236243206521739e-05, "loss": 0.5234, "step": 5019 }, { "epoch": 0.7858484658735129, "grad_norm": 1.7769485712051392, "learning_rate": 9.235054347826087e-05, "loss": 0.6695, "step": 5020 }, { "epoch": 0.7860050093926112, "grad_norm": 1.288838267326355, "learning_rate": 9.233865489130435e-05, "loss": 0.5504, "step": 5021 }, { "epoch": 0.7861615529117094, "grad_norm": 2.216315507888794, "learning_rate": 9.232676630434783e-05, "loss": 0.4411, "step": 5022 }, { "epoch": 0.7863180964308077, "grad_norm": 
1.8092610836029053, "learning_rate": 9.23148777173913e-05, "loss": 0.6009, "step": 5023 }, { "epoch": 0.7864746399499061, "grad_norm": 2.2961783409118652, "learning_rate": 9.230298913043478e-05, "loss": 0.4824, "step": 5024 }, { "epoch": 0.7866311834690044, "grad_norm": 1.7604140043258667, "learning_rate": 9.229110054347826e-05, "loss": 0.5631, "step": 5025 }, { "epoch": 0.7867877269881027, "grad_norm": 2.6699719429016113, "learning_rate": 9.227921195652173e-05, "loss": 0.876, "step": 5026 }, { "epoch": 0.786944270507201, "grad_norm": 3.037482976913452, "learning_rate": 9.22673233695652e-05, "loss": 0.6283, "step": 5027 }, { "epoch": 0.7871008140262993, "grad_norm": 2.2697391510009766, "learning_rate": 9.225543478260868e-05, "loss": 0.9685, "step": 5028 }, { "epoch": 0.7872573575453976, "grad_norm": 2.8843438625335693, "learning_rate": 9.224354619565216e-05, "loss": 0.697, "step": 5029 }, { "epoch": 0.7874139010644959, "grad_norm": 1.9005508422851562, "learning_rate": 9.223165760869564e-05, "loss": 0.3636, "step": 5030 }, { "epoch": 0.7875704445835943, "grad_norm": 1.233635663986206, "learning_rate": 9.221976902173912e-05, "loss": 0.5899, "step": 5031 }, { "epoch": 0.7877269881026926, "grad_norm": 1.9422658681869507, "learning_rate": 9.22078804347826e-05, "loss": 0.7832, "step": 5032 }, { "epoch": 0.7878835316217908, "grad_norm": 2.0286898612976074, "learning_rate": 9.219599184782607e-05, "loss": 0.9073, "step": 5033 }, { "epoch": 0.7880400751408891, "grad_norm": 4.13677453994751, "learning_rate": 9.218410326086955e-05, "loss": 1.1905, "step": 5034 }, { "epoch": 0.7881966186599875, "grad_norm": 2.3263325691223145, "learning_rate": 9.217221467391303e-05, "loss": 0.7153, "step": 5035 }, { "epoch": 0.7883531621790858, "grad_norm": 3.430190086364746, "learning_rate": 9.216032608695651e-05, "loss": 1.1697, "step": 5036 }, { "epoch": 0.7885097056981841, "grad_norm": 4.063048839569092, "learning_rate": 9.214843749999999e-05, "loss": 1.5453, "step": 5037 }, { "epoch": 
0.7886662492172825, "grad_norm": 2.095430374145508, "learning_rate": 9.213654891304348e-05, "loss": 0.7764, "step": 5038 }, { "epoch": 0.7888227927363807, "grad_norm": 4.12368631362915, "learning_rate": 9.212466032608696e-05, "loss": 0.8099, "step": 5039 }, { "epoch": 0.788979336255479, "grad_norm": 2.225512981414795, "learning_rate": 9.211277173913043e-05, "loss": 0.9933, "step": 5040 }, { "epoch": 0.7891358797745773, "grad_norm": 3.6626815795898438, "learning_rate": 9.210088315217391e-05, "loss": 0.9955, "step": 5041 }, { "epoch": 0.7892924232936757, "grad_norm": 2.2790184020996094, "learning_rate": 9.208899456521739e-05, "loss": 1.1699, "step": 5042 }, { "epoch": 0.789448966812774, "grad_norm": 3.551569938659668, "learning_rate": 9.207710597826087e-05, "loss": 1.668, "step": 5043 }, { "epoch": 0.7896055103318722, "grad_norm": 2.5663068294525146, "learning_rate": 9.206521739130435e-05, "loss": 0.9298, "step": 5044 }, { "epoch": 0.7897620538509705, "grad_norm": 4.728140354156494, "learning_rate": 9.205332880434782e-05, "loss": 1.4311, "step": 5045 }, { "epoch": 0.7899185973700689, "grad_norm": 1.70958411693573, "learning_rate": 9.20414402173913e-05, "loss": 0.9462, "step": 5046 }, { "epoch": 0.7900751408891672, "grad_norm": 1.9240398406982422, "learning_rate": 9.202955163043478e-05, "loss": 0.9146, "step": 5047 }, { "epoch": 0.7902316844082655, "grad_norm": 1.6049437522888184, "learning_rate": 9.201766304347826e-05, "loss": 0.7656, "step": 5048 }, { "epoch": 0.7903882279273639, "grad_norm": 2.8326220512390137, "learning_rate": 9.200577445652172e-05, "loss": 1.0789, "step": 5049 }, { "epoch": 0.7905447714464621, "grad_norm": 2.2410073280334473, "learning_rate": 9.19938858695652e-05, "loss": 0.9118, "step": 5050 }, { "epoch": 0.7907013149655604, "grad_norm": 0.535487174987793, "learning_rate": 9.198199728260868e-05, "loss": 0.277, "step": 5051 }, { "epoch": 0.7908578584846587, "grad_norm": 0.4781067371368408, "learning_rate": 9.197010869565216e-05, "loss": 0.2626, 
"step": 5052 }, { "epoch": 0.7910144020037571, "grad_norm": 0.8535751104354858, "learning_rate": 9.195822010869564e-05, "loss": 0.2565, "step": 5053 }, { "epoch": 0.7911709455228554, "grad_norm": 0.59380042552948, "learning_rate": 9.194633152173911e-05, "loss": 0.3354, "step": 5054 }, { "epoch": 0.7913274890419537, "grad_norm": 0.8147178888320923, "learning_rate": 9.193444293478259e-05, "loss": 0.2779, "step": 5055 }, { "epoch": 0.7914840325610519, "grad_norm": 0.8417216539382935, "learning_rate": 9.192255434782607e-05, "loss": 0.3785, "step": 5056 }, { "epoch": 0.7916405760801503, "grad_norm": 0.8246333003044128, "learning_rate": 9.191066576086955e-05, "loss": 0.3115, "step": 5057 }, { "epoch": 0.7917971195992486, "grad_norm": 0.7021090984344482, "learning_rate": 9.189877717391304e-05, "loss": 0.3778, "step": 5058 }, { "epoch": 0.7919536631183469, "grad_norm": 7.471440315246582, "learning_rate": 9.188688858695652e-05, "loss": 0.6963, "step": 5059 }, { "epoch": 0.7921102066374452, "grad_norm": 1.02217698097229, "learning_rate": 9.1875e-05, "loss": 0.3759, "step": 5060 }, { "epoch": 0.7922667501565435, "grad_norm": 1.3012560606002808, "learning_rate": 9.186311141304348e-05, "loss": 0.3073, "step": 5061 }, { "epoch": 0.7924232936756418, "grad_norm": 1.176374077796936, "learning_rate": 9.185122282608695e-05, "loss": 0.3249, "step": 5062 }, { "epoch": 0.7925798371947401, "grad_norm": 1.6997063159942627, "learning_rate": 9.183933423913043e-05, "loss": 0.6107, "step": 5063 }, { "epoch": 0.7927363807138385, "grad_norm": 1.549422025680542, "learning_rate": 9.182744565217391e-05, "loss": 0.644, "step": 5064 }, { "epoch": 0.7928929242329368, "grad_norm": 1.7165303230285645, "learning_rate": 9.181555706521739e-05, "loss": 0.5332, "step": 5065 }, { "epoch": 0.7930494677520351, "grad_norm": 1.4964061975479126, "learning_rate": 9.180366847826087e-05, "loss": 0.3613, "step": 5066 }, { "epoch": 0.7932060112711333, "grad_norm": 1.0738518238067627, "learning_rate": 
9.179177989130434e-05, "loss": 0.4037, "step": 5067 }, { "epoch": 0.7933625547902317, "grad_norm": 1.5395009517669678, "learning_rate": 9.177989130434782e-05, "loss": 0.5663, "step": 5068 }, { "epoch": 0.79351909830933, "grad_norm": 2.450265884399414, "learning_rate": 9.17680027173913e-05, "loss": 0.515, "step": 5069 }, { "epoch": 0.7936756418284283, "grad_norm": 1.4264479875564575, "learning_rate": 9.175611413043478e-05, "loss": 0.7608, "step": 5070 }, { "epoch": 0.7938321853475266, "grad_norm": 1.85027277469635, "learning_rate": 9.174422554347826e-05, "loss": 1.0071, "step": 5071 }, { "epoch": 0.793988728866625, "grad_norm": 1.6609456539154053, "learning_rate": 9.173233695652172e-05, "loss": 0.4953, "step": 5072 }, { "epoch": 0.7941452723857232, "grad_norm": 2.925245761871338, "learning_rate": 9.17204483695652e-05, "loss": 0.6101, "step": 5073 }, { "epoch": 0.7943018159048215, "grad_norm": 4.2921905517578125, "learning_rate": 9.170855978260868e-05, "loss": 0.6274, "step": 5074 }, { "epoch": 0.7944583594239198, "grad_norm": 1.8039535284042358, "learning_rate": 9.169667119565216e-05, "loss": 0.502, "step": 5075 }, { "epoch": 0.7946149029430182, "grad_norm": 3.1664130687713623, "learning_rate": 9.168478260869563e-05, "loss": 1.1116, "step": 5076 }, { "epoch": 0.7947714464621165, "grad_norm": 2.594465732574463, "learning_rate": 9.167289402173911e-05, "loss": 0.9371, "step": 5077 }, { "epoch": 0.7949279899812148, "grad_norm": 2.71630597114563, "learning_rate": 9.16610054347826e-05, "loss": 1.0837, "step": 5078 }, { "epoch": 0.795084533500313, "grad_norm": 1.6090675592422485, "learning_rate": 9.164911684782608e-05, "loss": 0.6293, "step": 5079 }, { "epoch": 0.7952410770194114, "grad_norm": 1.8240386247634888, "learning_rate": 9.163722826086956e-05, "loss": 0.6314, "step": 5080 }, { "epoch": 0.7953976205385097, "grad_norm": 2.1499764919281006, "learning_rate": 9.162533967391304e-05, "loss": 0.7157, "step": 5081 }, { "epoch": 0.795554164057608, "grad_norm": 
2.7661333084106445, "learning_rate": 9.161345108695652e-05, "loss": 0.9396, "step": 5082 }, { "epoch": 0.7957107075767064, "grad_norm": 1.850858449935913, "learning_rate": 9.16015625e-05, "loss": 0.6147, "step": 5083 }, { "epoch": 0.7958672510958046, "grad_norm": 3.710118055343628, "learning_rate": 9.158967391304347e-05, "loss": 0.9725, "step": 5084 }, { "epoch": 0.7960237946149029, "grad_norm": 2.098453998565674, "learning_rate": 9.157778532608695e-05, "loss": 0.6743, "step": 5085 }, { "epoch": 0.7961803381340012, "grad_norm": 2.022991180419922, "learning_rate": 9.156589673913043e-05, "loss": 0.8464, "step": 5086 }, { "epoch": 0.7963368816530996, "grad_norm": 1.9592376947402954, "learning_rate": 9.155400815217391e-05, "loss": 0.971, "step": 5087 }, { "epoch": 0.7964934251721979, "grad_norm": 3.091723918914795, "learning_rate": 9.154211956521739e-05, "loss": 0.8484, "step": 5088 }, { "epoch": 0.7966499686912962, "grad_norm": 3.164120674133301, "learning_rate": 9.153023097826086e-05, "loss": 1.4545, "step": 5089 }, { "epoch": 0.7968065122103944, "grad_norm": 2.724959373474121, "learning_rate": 9.151834239130434e-05, "loss": 1.6283, "step": 5090 }, { "epoch": 0.7969630557294928, "grad_norm": 3.223039388656616, "learning_rate": 9.150645380434782e-05, "loss": 1.4133, "step": 5091 }, { "epoch": 0.7971195992485911, "grad_norm": 3.6361100673675537, "learning_rate": 9.14945652173913e-05, "loss": 0.9677, "step": 5092 }, { "epoch": 0.7972761427676894, "grad_norm": 2.1256818771362305, "learning_rate": 9.148267663043478e-05, "loss": 1.31, "step": 5093 }, { "epoch": 0.7974326862867878, "grad_norm": 1.9931708574295044, "learning_rate": 9.147078804347825e-05, "loss": 0.9585, "step": 5094 }, { "epoch": 0.7975892298058861, "grad_norm": 4.971303462982178, "learning_rate": 9.145889945652172e-05, "loss": 1.1102, "step": 5095 }, { "epoch": 0.7977457733249843, "grad_norm": 6.743244171142578, "learning_rate": 9.14470108695652e-05, "loss": 0.781, "step": 5096 }, { "epoch": 
0.7979023168440826, "grad_norm": 1.9358583688735962, "learning_rate": 9.143512228260868e-05, "loss": 0.5234, "step": 5097 }, { "epoch": 0.798058860363181, "grad_norm": 2.058945655822754, "learning_rate": 9.142323369565217e-05, "loss": 0.7303, "step": 5098 }, { "epoch": 0.7982154038822793, "grad_norm": 2.224940299987793, "learning_rate": 9.141134510869565e-05, "loss": 0.5953, "step": 5099 }, { "epoch": 0.7983719474013776, "grad_norm": 2.9579946994781494, "learning_rate": 9.139945652173912e-05, "loss": 0.6187, "step": 5100 }, { "epoch": 0.7985284909204758, "grad_norm": 3.2123584747314453, "learning_rate": 9.13875679347826e-05, "loss": 0.6479, "step": 5101 }, { "epoch": 0.7986850344395742, "grad_norm": 0.8349288702011108, "learning_rate": 9.137567934782608e-05, "loss": 0.3889, "step": 5102 }, { "epoch": 0.7988415779586725, "grad_norm": 0.8739946484565735, "learning_rate": 9.136379076086956e-05, "loss": 0.3168, "step": 5103 }, { "epoch": 0.7989981214777708, "grad_norm": 0.7845439314842224, "learning_rate": 9.135190217391304e-05, "loss": 0.2473, "step": 5104 }, { "epoch": 0.7991546649968692, "grad_norm": 0.7957096695899963, "learning_rate": 9.134001358695651e-05, "loss": 0.2313, "step": 5105 }, { "epoch": 0.7993112085159675, "grad_norm": 0.7732931971549988, "learning_rate": 9.132812499999999e-05, "loss": 0.251, "step": 5106 }, { "epoch": 0.7994677520350657, "grad_norm": 0.6843956112861633, "learning_rate": 9.131623641304347e-05, "loss": 0.1941, "step": 5107 }, { "epoch": 0.799624295554164, "grad_norm": 1.0814142227172852, "learning_rate": 9.130434782608695e-05, "loss": 0.2796, "step": 5108 }, { "epoch": 0.7997808390732624, "grad_norm": 1.5580850839614868, "learning_rate": 9.129245923913043e-05, "loss": 0.4564, "step": 5109 }, { "epoch": 0.7999373825923607, "grad_norm": 1.0704268217086792, "learning_rate": 9.12805706521739e-05, "loss": 0.36, "step": 5110 }, { "epoch": 0.800093926111459, "grad_norm": 0.7008376717567444, "learning_rate": 9.126868206521738e-05, "loss": 
0.4046, "step": 5111 }, { "epoch": 0.8002504696305573, "grad_norm": 7.4942626953125, "learning_rate": 9.125679347826086e-05, "loss": 0.7442, "step": 5112 }, { "epoch": 0.8004070131496556, "grad_norm": 0.8437686562538147, "learning_rate": 9.124490489130434e-05, "loss": 0.2642, "step": 5113 }, { "epoch": 0.8005635566687539, "grad_norm": 0.980031430721283, "learning_rate": 9.123301630434782e-05, "loss": 0.3568, "step": 5114 }, { "epoch": 0.8007201001878522, "grad_norm": 1.138289451599121, "learning_rate": 9.122112771739131e-05, "loss": 0.4018, "step": 5115 }, { "epoch": 0.8008766437069506, "grad_norm": 1.2823410034179688, "learning_rate": 9.120923913043479e-05, "loss": 0.4183, "step": 5116 }, { "epoch": 0.8010331872260489, "grad_norm": 1.0042510032653809, "learning_rate": 9.119735054347827e-05, "loss": 0.4676, "step": 5117 }, { "epoch": 0.8011897307451471, "grad_norm": 1.4981905221939087, "learning_rate": 9.118546195652173e-05, "loss": 0.6472, "step": 5118 }, { "epoch": 0.8013462742642454, "grad_norm": 1.1510316133499146, "learning_rate": 9.117357336956521e-05, "loss": 0.3829, "step": 5119 }, { "epoch": 0.8015028177833438, "grad_norm": 3.069683074951172, "learning_rate": 9.116168478260869e-05, "loss": 0.7525, "step": 5120 }, { "epoch": 0.8016593613024421, "grad_norm": 1.4094674587249756, "learning_rate": 9.114979619565216e-05, "loss": 0.6884, "step": 5121 }, { "epoch": 0.8018159048215404, "grad_norm": 3.866238832473755, "learning_rate": 9.113790760869564e-05, "loss": 0.6329, "step": 5122 }, { "epoch": 0.8019724483406387, "grad_norm": 1.6411136388778687, "learning_rate": 9.112601902173912e-05, "loss": 0.7298, "step": 5123 }, { "epoch": 0.802128991859737, "grad_norm": 3.86564302444458, "learning_rate": 9.11141304347826e-05, "loss": 0.7411, "step": 5124 }, { "epoch": 0.8022855353788353, "grad_norm": 2.210313081741333, "learning_rate": 9.110224184782608e-05, "loss": 0.8776, "step": 5125 }, { "epoch": 0.8024420788979336, "grad_norm": 1.8598724603652954, "learning_rate": 
9.109035326086956e-05, "loss": 0.5889, "step": 5126 }, { "epoch": 0.802598622417032, "grad_norm": 1.815516710281372, "learning_rate": 9.107846467391303e-05, "loss": 0.6033, "step": 5127 }, { "epoch": 0.8027551659361303, "grad_norm": 1.8517919778823853, "learning_rate": 9.106657608695651e-05, "loss": 0.8441, "step": 5128 }, { "epoch": 0.8029117094552286, "grad_norm": 1.5312602519989014, "learning_rate": 9.105468749999999e-05, "loss": 0.903, "step": 5129 }, { "epoch": 0.8030682529743268, "grad_norm": 2.651784896850586, "learning_rate": 9.104279891304347e-05, "loss": 0.4051, "step": 5130 }, { "epoch": 0.8032247964934252, "grad_norm": 2.2098960876464844, "learning_rate": 9.103091032608695e-05, "loss": 0.9866, "step": 5131 }, { "epoch": 0.8033813400125235, "grad_norm": 2.783113718032837, "learning_rate": 9.101902173913042e-05, "loss": 0.6312, "step": 5132 }, { "epoch": 0.8035378835316218, "grad_norm": 1.4234397411346436, "learning_rate": 9.10071331521739e-05, "loss": 0.522, "step": 5133 }, { "epoch": 0.8036944270507201, "grad_norm": 2.594248056411743, "learning_rate": 9.099524456521738e-05, "loss": 0.8824, "step": 5134 }, { "epoch": 0.8038509705698184, "grad_norm": 4.33629035949707, "learning_rate": 9.098335597826087e-05, "loss": 1.0054, "step": 5135 }, { "epoch": 0.8040075140889167, "grad_norm": 2.9931421279907227, "learning_rate": 9.097146739130435e-05, "loss": 0.9422, "step": 5136 }, { "epoch": 0.804164057608015, "grad_norm": 2.5796477794647217, "learning_rate": 9.095957880434783e-05, "loss": 1.5539, "step": 5137 }, { "epoch": 0.8043206011271133, "grad_norm": 2.201625347137451, "learning_rate": 9.09476902173913e-05, "loss": 0.55, "step": 5138 }, { "epoch": 0.8044771446462117, "grad_norm": 4.931890487670898, "learning_rate": 9.093580163043478e-05, "loss": 1.5433, "step": 5139 }, { "epoch": 0.80463368816531, "grad_norm": 2.0606000423431396, "learning_rate": 9.092391304347826e-05, "loss": 1.1666, "step": 5140 }, { "epoch": 0.8047902316844082, "grad_norm": 
2.729480504989624, "learning_rate": 9.091202445652173e-05, "loss": 0.878, "step": 5141 }, { "epoch": 0.8049467752035065, "grad_norm": 2.1092474460601807, "learning_rate": 9.09001358695652e-05, "loss": 0.5018, "step": 5142 }, { "epoch": 0.8051033187226049, "grad_norm": 4.04967737197876, "learning_rate": 9.088824728260868e-05, "loss": 1.5124, "step": 5143 }, { "epoch": 0.8052598622417032, "grad_norm": 2.9261670112609863, "learning_rate": 9.087635869565216e-05, "loss": 1.4229, "step": 5144 }, { "epoch": 0.8054164057608015, "grad_norm": 16.086502075195312, "learning_rate": 9.086447010869564e-05, "loss": 0.8364, "step": 5145 }, { "epoch": 0.8055729492798999, "grad_norm": 2.2099711894989014, "learning_rate": 9.085258152173912e-05, "loss": 1.1597, "step": 5146 }, { "epoch": 0.8057294927989981, "grad_norm": 2.745413303375244, "learning_rate": 9.08406929347826e-05, "loss": 1.148, "step": 5147 }, { "epoch": 0.8058860363180964, "grad_norm": 2.5650506019592285, "learning_rate": 9.082880434782607e-05, "loss": 0.7847, "step": 5148 }, { "epoch": 0.8060425798371947, "grad_norm": 4.871495723724365, "learning_rate": 9.081691576086955e-05, "loss": 0.6588, "step": 5149 }, { "epoch": 0.8061991233562931, "grad_norm": 2.9876716136932373, "learning_rate": 9.080502717391303e-05, "loss": 0.8813, "step": 5150 }, { "epoch": 0.8063556668753914, "grad_norm": 0.5811265110969543, "learning_rate": 9.079313858695651e-05, "loss": 0.2137, "step": 5151 }, { "epoch": 0.8065122103944896, "grad_norm": 1.3678025007247925, "learning_rate": 9.078124999999999e-05, "loss": 0.3372, "step": 5152 }, { "epoch": 0.8066687539135879, "grad_norm": 0.6283859610557556, "learning_rate": 9.076936141304347e-05, "loss": 0.3093, "step": 5153 }, { "epoch": 0.8068252974326863, "grad_norm": 0.5436118245124817, "learning_rate": 9.075747282608694e-05, "loss": 0.3026, "step": 5154 }, { "epoch": 0.8069818409517846, "grad_norm": 0.7187196016311646, "learning_rate": 9.074558423913044e-05, "loss": 0.3424, "step": 5155 }, { "epoch": 
0.8071383844708829, "grad_norm": 1.3662220239639282, "learning_rate": 9.073369565217391e-05, "loss": 0.4846, "step": 5156 }, { "epoch": 0.8072949279899813, "grad_norm": 1.3756555318832397, "learning_rate": 9.072180706521739e-05, "loss": 0.3823, "step": 5157 }, { "epoch": 0.8074514715090795, "grad_norm": 0.6776832342147827, "learning_rate": 9.070991847826087e-05, "loss": 0.2109, "step": 5158 }, { "epoch": 0.8076080150281778, "grad_norm": 0.7444601058959961, "learning_rate": 9.069802989130435e-05, "loss": 0.331, "step": 5159 }, { "epoch": 0.8077645585472761, "grad_norm": 2.584075450897217, "learning_rate": 9.068614130434783e-05, "loss": 0.401, "step": 5160 }, { "epoch": 0.8079211020663745, "grad_norm": 1.3976757526397705, "learning_rate": 9.06742527173913e-05, "loss": 0.4338, "step": 5161 }, { "epoch": 0.8080776455854728, "grad_norm": 0.9835537075996399, "learning_rate": 9.066236413043478e-05, "loss": 0.3016, "step": 5162 }, { "epoch": 0.8082341891045711, "grad_norm": 0.9067820310592651, "learning_rate": 9.065047554347826e-05, "loss": 0.2037, "step": 5163 }, { "epoch": 0.8083907326236693, "grad_norm": 14.53964900970459, "learning_rate": 9.063858695652173e-05, "loss": 1.7727, "step": 5164 }, { "epoch": 0.8085472761427677, "grad_norm": 1.971322774887085, "learning_rate": 9.06266983695652e-05, "loss": 0.5883, "step": 5165 }, { "epoch": 0.808703819661866, "grad_norm": 2.0334925651550293, "learning_rate": 9.061480978260868e-05, "loss": 0.5124, "step": 5166 }, { "epoch": 0.8088603631809643, "grad_norm": 1.4684797525405884, "learning_rate": 9.060292119565216e-05, "loss": 0.4038, "step": 5167 }, { "epoch": 0.8090169067000627, "grad_norm": 3.4837534427642822, "learning_rate": 9.059103260869564e-05, "loss": 1.015, "step": 5168 }, { "epoch": 0.8091734502191609, "grad_norm": 1.5744003057479858, "learning_rate": 9.057914402173912e-05, "loss": 0.5161, "step": 5169 }, { "epoch": 0.8093299937382592, "grad_norm": 2.6315269470214844, "learning_rate": 9.05672554347826e-05, "loss": 
0.8493, "step": 5170 }, { "epoch": 0.8094865372573575, "grad_norm": 1.6973198652267456, "learning_rate": 9.055536684782607e-05, "loss": 0.3762, "step": 5171 }, { "epoch": 0.8096430807764559, "grad_norm": 1.6313327550888062, "learning_rate": 9.054347826086955e-05, "loss": 0.5282, "step": 5172 }, { "epoch": 0.8097996242955542, "grad_norm": 1.903200387954712, "learning_rate": 9.053158967391303e-05, "loss": 0.3345, "step": 5173 }, { "epoch": 0.8099561678146525, "grad_norm": 3.651978015899658, "learning_rate": 9.051970108695651e-05, "loss": 0.9033, "step": 5174 }, { "epoch": 0.8101127113337507, "grad_norm": 2.8398547172546387, "learning_rate": 9.05078125e-05, "loss": 1.0102, "step": 5175 }, { "epoch": 0.8102692548528491, "grad_norm": 3.6892893314361572, "learning_rate": 9.049592391304348e-05, "loss": 0.6804, "step": 5176 }, { "epoch": 0.8104257983719474, "grad_norm": 4.008319854736328, "learning_rate": 9.048403532608695e-05, "loss": 0.7939, "step": 5177 }, { "epoch": 0.8105823418910457, "grad_norm": 3.396547317504883, "learning_rate": 9.047214673913043e-05, "loss": 0.5385, "step": 5178 }, { "epoch": 0.810738885410144, "grad_norm": 3.480149269104004, "learning_rate": 9.046025815217391e-05, "loss": 0.6342, "step": 5179 }, { "epoch": 0.8108954289292424, "grad_norm": 2.2630836963653564, "learning_rate": 9.044836956521739e-05, "loss": 0.7959, "step": 5180 }, { "epoch": 0.8110519724483406, "grad_norm": 1.7971911430358887, "learning_rate": 9.043648097826087e-05, "loss": 0.6475, "step": 5181 }, { "epoch": 0.8112085159674389, "grad_norm": 2.9120261669158936, "learning_rate": 9.042459239130435e-05, "loss": 1.1436, "step": 5182 }, { "epoch": 0.8113650594865373, "grad_norm": 1.841413140296936, "learning_rate": 9.041270380434782e-05, "loss": 0.9023, "step": 5183 }, { "epoch": 0.8115216030056356, "grad_norm": 1.9039865732192993, "learning_rate": 9.04008152173913e-05, "loss": 0.7385, "step": 5184 }, { "epoch": 0.8116781465247339, "grad_norm": 3.0569567680358887, "learning_rate": 
9.038892663043478e-05, "loss": 1.0655, "step": 5185 }, { "epoch": 0.8118346900438321, "grad_norm": 1.840646505355835, "learning_rate": 9.037703804347826e-05, "loss": 0.5975, "step": 5186 }, { "epoch": 0.8119912335629305, "grad_norm": 4.439432144165039, "learning_rate": 9.036514945652172e-05, "loss": 1.0222, "step": 5187 }, { "epoch": 0.8121477770820288, "grad_norm": 2.1042206287384033, "learning_rate": 9.03532608695652e-05, "loss": 0.831, "step": 5188 }, { "epoch": 0.8123043206011271, "grad_norm": 5.543478488922119, "learning_rate": 9.034137228260868e-05, "loss": 1.8925, "step": 5189 }, { "epoch": 0.8124608641202254, "grad_norm": 2.252702474594116, "learning_rate": 9.032948369565216e-05, "loss": 1.1355, "step": 5190 }, { "epoch": 0.8126174076393238, "grad_norm": 4.76620626449585, "learning_rate": 9.031759510869564e-05, "loss": 1.0459, "step": 5191 }, { "epoch": 0.812773951158422, "grad_norm": 4.394347667694092, "learning_rate": 9.030570652173911e-05, "loss": 1.2921, "step": 5192 }, { "epoch": 0.8129304946775203, "grad_norm": 3.7465455532073975, "learning_rate": 9.029381793478259e-05, "loss": 1.26, "step": 5193 }, { "epoch": 0.8130870381966186, "grad_norm": 3.441333293914795, "learning_rate": 9.028192934782607e-05, "loss": 0.9597, "step": 5194 }, { "epoch": 0.813243581715717, "grad_norm": 2.1421010494232178, "learning_rate": 9.027004076086956e-05, "loss": 1.0755, "step": 5195 }, { "epoch": 0.8134001252348153, "grad_norm": 2.512258529663086, "learning_rate": 9.025815217391304e-05, "loss": 0.8368, "step": 5196 }, { "epoch": 0.8135566687539136, "grad_norm": 11.474546432495117, "learning_rate": 9.024626358695652e-05, "loss": 0.682, "step": 5197 }, { "epoch": 0.8137132122730119, "grad_norm": 3.1507840156555176, "learning_rate": 9.0234375e-05, "loss": 0.5533, "step": 5198 }, { "epoch": 0.8138697557921102, "grad_norm": 5.622971534729004, "learning_rate": 9.022248641304347e-05, "loss": 0.8327, "step": 5199 }, { "epoch": 0.8140262993112085, "grad_norm": 3.0818259716033936, 
"learning_rate": 9.021059782608695e-05, "loss": 1.3179, "step": 5200 }, { "epoch": 0.8141828428303068, "grad_norm": 0.5449981689453125, "learning_rate": 9.019870923913043e-05, "loss": 0.2762, "step": 5201 }, { "epoch": 0.8143393863494052, "grad_norm": 0.39672890305519104, "learning_rate": 9.018682065217391e-05, "loss": 0.1915, "step": 5202 }, { "epoch": 0.8144959298685035, "grad_norm": 0.6138084530830383, "learning_rate": 9.017493206521739e-05, "loss": 0.2955, "step": 5203 }, { "epoch": 0.8146524733876017, "grad_norm": 0.6013307571411133, "learning_rate": 9.016304347826087e-05, "loss": 0.2433, "step": 5204 }, { "epoch": 0.8148090169067, "grad_norm": 0.7346773743629456, "learning_rate": 9.015115489130434e-05, "loss": 0.3046, "step": 5205 }, { "epoch": 0.8149655604257984, "grad_norm": 0.7437261343002319, "learning_rate": 9.013926630434782e-05, "loss": 0.2481, "step": 5206 }, { "epoch": 0.8151221039448967, "grad_norm": 0.5661160349845886, "learning_rate": 9.01273777173913e-05, "loss": 0.2415, "step": 5207 }, { "epoch": 0.815278647463995, "grad_norm": 0.8748947978019714, "learning_rate": 9.011548913043478e-05, "loss": 0.28, "step": 5208 }, { "epoch": 0.8154351909830932, "grad_norm": 0.604087769985199, "learning_rate": 9.010360054347826e-05, "loss": 0.3066, "step": 5209 }, { "epoch": 0.8155917345021916, "grad_norm": 0.6227664351463318, "learning_rate": 9.009171195652172e-05, "loss": 0.2119, "step": 5210 }, { "epoch": 0.8157482780212899, "grad_norm": 1.6041699647903442, "learning_rate": 9.00798233695652e-05, "loss": 0.4011, "step": 5211 }, { "epoch": 0.8159048215403882, "grad_norm": 1.358917474746704, "learning_rate": 9.006793478260868e-05, "loss": 0.3311, "step": 5212 }, { "epoch": 0.8160613650594866, "grad_norm": 1.4908629655838013, "learning_rate": 9.005604619565215e-05, "loss": 0.4191, "step": 5213 }, { "epoch": 0.8162179085785849, "grad_norm": 1.3448807001113892, "learning_rate": 9.004415760869563e-05, "loss": 0.5709, "step": 5214 }, { "epoch": 0.8163744520976831, 
"grad_norm": 0.7902805805206299, "learning_rate": 9.003226902173912e-05, "loss": 0.2791, "step": 5215 }, { "epoch": 0.8165309956167814, "grad_norm": 0.7835685014724731, "learning_rate": 9.00203804347826e-05, "loss": 0.4157, "step": 5216 }, { "epoch": 0.8166875391358798, "grad_norm": 1.1008400917053223, "learning_rate": 9.000849184782608e-05, "loss": 0.398, "step": 5217 }, { "epoch": 0.8168440826549781, "grad_norm": 1.043525218963623, "learning_rate": 8.999660326086956e-05, "loss": 0.4428, "step": 5218 }, { "epoch": 0.8170006261740764, "grad_norm": 1.421157956123352, "learning_rate": 8.998471467391304e-05, "loss": 0.547, "step": 5219 }, { "epoch": 0.8171571696931748, "grad_norm": 3.2245328426361084, "learning_rate": 8.997282608695652e-05, "loss": 0.6445, "step": 5220 }, { "epoch": 0.817313713212273, "grad_norm": 1.4050172567367554, "learning_rate": 8.99609375e-05, "loss": 0.5749, "step": 5221 }, { "epoch": 0.8174702567313713, "grad_norm": 1.5908122062683105, "learning_rate": 8.994904891304347e-05, "loss": 0.6319, "step": 5222 }, { "epoch": 0.8176268002504696, "grad_norm": 1.7123173475265503, "learning_rate": 8.993716032608695e-05, "loss": 0.6289, "step": 5223 }, { "epoch": 0.817783343769568, "grad_norm": 2.2724430561065674, "learning_rate": 8.992527173913043e-05, "loss": 0.907, "step": 5224 }, { "epoch": 0.8179398872886663, "grad_norm": 1.8656607866287231, "learning_rate": 8.99133831521739e-05, "loss": 0.7396, "step": 5225 }, { "epoch": 0.8180964308077645, "grad_norm": 2.2086830139160156, "learning_rate": 8.990149456521738e-05, "loss": 0.8745, "step": 5226 }, { "epoch": 0.8182529743268628, "grad_norm": 2.470444679260254, "learning_rate": 8.988960597826086e-05, "loss": 0.6421, "step": 5227 }, { "epoch": 0.8184095178459612, "grad_norm": 2.476414680480957, "learning_rate": 8.987771739130434e-05, "loss": 0.9946, "step": 5228 }, { "epoch": 0.8185660613650595, "grad_norm": 2.9045820236206055, "learning_rate": 8.986582880434782e-05, "loss": 0.5577, "step": 5229 }, { 
"epoch": 0.8187226048841578, "grad_norm": 2.580047369003296, "learning_rate": 8.98539402173913e-05, "loss": 0.8211, "step": 5230 }, { "epoch": 0.8188791484032562, "grad_norm": 1.693959355354309, "learning_rate": 8.984205163043478e-05, "loss": 0.5566, "step": 5231 }, { "epoch": 0.8190356919223544, "grad_norm": 2.687079668045044, "learning_rate": 8.983016304347827e-05, "loss": 0.7373, "step": 5232 }, { "epoch": 0.8191922354414527, "grad_norm": 2.1986677646636963, "learning_rate": 8.981827445652172e-05, "loss": 0.7866, "step": 5233 }, { "epoch": 0.819348778960551, "grad_norm": 2.115166187286377, "learning_rate": 8.98063858695652e-05, "loss": 0.9064, "step": 5234 }, { "epoch": 0.8195053224796494, "grad_norm": 2.4676339626312256, "learning_rate": 8.979449728260869e-05, "loss": 0.6148, "step": 5235 }, { "epoch": 0.8196618659987477, "grad_norm": 3.390285015106201, "learning_rate": 8.978260869565217e-05, "loss": 0.8406, "step": 5236 }, { "epoch": 0.819818409517846, "grad_norm": 1.6849570274353027, "learning_rate": 8.977072010869564e-05, "loss": 0.6681, "step": 5237 }, { "epoch": 0.8199749530369442, "grad_norm": 3.2923293113708496, "learning_rate": 8.975883152173912e-05, "loss": 1.2006, "step": 5238 }, { "epoch": 0.8201314965560426, "grad_norm": 3.541882038116455, "learning_rate": 8.97469429347826e-05, "loss": 1.4384, "step": 5239 }, { "epoch": 0.8202880400751409, "grad_norm": 3.132275342941284, "learning_rate": 8.973505434782608e-05, "loss": 1.2695, "step": 5240 }, { "epoch": 0.8204445835942392, "grad_norm": 6.217861652374268, "learning_rate": 8.972316576086956e-05, "loss": 1.3588, "step": 5241 }, { "epoch": 0.8206011271133375, "grad_norm": 2.9279093742370605, "learning_rate": 8.971127717391303e-05, "loss": 1.2458, "step": 5242 }, { "epoch": 0.8207576706324358, "grad_norm": 2.4642515182495117, "learning_rate": 8.969938858695651e-05, "loss": 1.5905, "step": 5243 }, { "epoch": 0.8209142141515341, "grad_norm": 2.5411171913146973, "learning_rate": 8.968749999999999e-05, 
"loss": 1.5079, "step": 5244 }, { "epoch": 0.8210707576706324, "grad_norm": 3.131920099258423, "learning_rate": 8.967561141304347e-05, "loss": 1.9553, "step": 5245 }, { "epoch": 0.8212273011897308, "grad_norm": 2.7852349281311035, "learning_rate": 8.966372282608695e-05, "loss": 1.0083, "step": 5246 }, { "epoch": 0.8213838447088291, "grad_norm": 1.7214953899383545, "learning_rate": 8.965183423913043e-05, "loss": 0.7616, "step": 5247 }, { "epoch": 0.8215403882279274, "grad_norm": 2.84987211227417, "learning_rate": 8.96399456521739e-05, "loss": 0.7461, "step": 5248 }, { "epoch": 0.8216969317470256, "grad_norm": 4.042830944061279, "learning_rate": 8.962805706521738e-05, "loss": 1.2721, "step": 5249 }, { "epoch": 0.821853475266124, "grad_norm": 1.8156780004501343, "learning_rate": 8.961616847826086e-05, "loss": 1.1093, "step": 5250 }, { "epoch": 0.8220100187852223, "grad_norm": 0.5204058885574341, "learning_rate": 8.960427989130434e-05, "loss": 0.3071, "step": 5251 }, { "epoch": 0.8221665623043206, "grad_norm": 0.5995351672172546, "learning_rate": 8.959239130434783e-05, "loss": 0.2884, "step": 5252 }, { "epoch": 0.8223231058234189, "grad_norm": 0.6316986680030823, "learning_rate": 8.958050271739131e-05, "loss": 0.2474, "step": 5253 }, { "epoch": 0.8224796493425173, "grad_norm": 1.440759301185608, "learning_rate": 8.956861413043479e-05, "loss": 0.4071, "step": 5254 }, { "epoch": 0.8226361928616155, "grad_norm": 0.7926093339920044, "learning_rate": 8.955672554347826e-05, "loss": 0.32, "step": 5255 }, { "epoch": 0.8227927363807138, "grad_norm": 0.7304885983467102, "learning_rate": 8.954483695652173e-05, "loss": 0.3798, "step": 5256 }, { "epoch": 0.8229492798998121, "grad_norm": 0.869553804397583, "learning_rate": 8.953294836956521e-05, "loss": 0.5308, "step": 5257 }, { "epoch": 0.8231058234189105, "grad_norm": 0.7494041919708252, "learning_rate": 8.952105978260869e-05, "loss": 0.3451, "step": 5258 }, { "epoch": 0.8232623669380088, "grad_norm": 0.7324798107147217, 
"learning_rate": 8.950917119565216e-05, "loss": 0.3682, "step": 5259 }, { "epoch": 0.823418910457107, "grad_norm": 1.1920076608657837, "learning_rate": 8.949728260869564e-05, "loss": 0.4915, "step": 5260 }, { "epoch": 0.8235754539762054, "grad_norm": 0.9661399126052856, "learning_rate": 8.948539402173912e-05, "loss": 0.3385, "step": 5261 }, { "epoch": 0.8237319974953037, "grad_norm": 1.7544409036636353, "learning_rate": 8.94735054347826e-05, "loss": 0.4466, "step": 5262 }, { "epoch": 0.823888541014402, "grad_norm": 0.8017387986183167, "learning_rate": 8.946161684782608e-05, "loss": 0.2416, "step": 5263 }, { "epoch": 0.8240450845335003, "grad_norm": 0.7948090434074402, "learning_rate": 8.944972826086955e-05, "loss": 0.2183, "step": 5264 }, { "epoch": 0.8242016280525987, "grad_norm": 0.9001139998435974, "learning_rate": 8.943783967391303e-05, "loss": 0.4299, "step": 5265 }, { "epoch": 0.8243581715716969, "grad_norm": 1.0421308279037476, "learning_rate": 8.942595108695651e-05, "loss": 0.3709, "step": 5266 }, { "epoch": 0.8245147150907952, "grad_norm": 0.7900252938270569, "learning_rate": 8.941406249999999e-05, "loss": 0.2756, "step": 5267 }, { "epoch": 0.8246712586098935, "grad_norm": 1.1609721183776855, "learning_rate": 8.940217391304347e-05, "loss": 0.3717, "step": 5268 }, { "epoch": 0.8248278021289919, "grad_norm": 1.6797921657562256, "learning_rate": 8.939028532608695e-05, "loss": 0.5423, "step": 5269 }, { "epoch": 0.8249843456480902, "grad_norm": 1.5651843547821045, "learning_rate": 8.937839673913042e-05, "loss": 0.3926, "step": 5270 }, { "epoch": 0.8251408891671885, "grad_norm": 1.3950986862182617, "learning_rate": 8.93665081521739e-05, "loss": 0.4353, "step": 5271 }, { "epoch": 0.8252974326862867, "grad_norm": 1.2526239156723022, "learning_rate": 8.935461956521739e-05, "loss": 0.3523, "step": 5272 }, { "epoch": 0.8254539762053851, "grad_norm": 2.2909116744995117, "learning_rate": 8.934273097826087e-05, "loss": 0.5659, "step": 5273 }, { "epoch": 
0.8256105197244834, "grad_norm": 2.607268810272217, "learning_rate": 8.933084239130435e-05, "loss": 0.8468, "step": 5274 }, { "epoch": 0.8257670632435817, "grad_norm": 2.4901235103607178, "learning_rate": 8.931895380434783e-05, "loss": 0.66, "step": 5275 }, { "epoch": 0.8259236067626801, "grad_norm": 2.091348648071289, "learning_rate": 8.93070652173913e-05, "loss": 0.9635, "step": 5276 }, { "epoch": 0.8260801502817783, "grad_norm": 2.180196523666382, "learning_rate": 8.929517663043478e-05, "loss": 0.6957, "step": 5277 }, { "epoch": 0.8262366938008766, "grad_norm": 2.4630768299102783, "learning_rate": 8.928328804347826e-05, "loss": 1.1796, "step": 5278 }, { "epoch": 0.8263932373199749, "grad_norm": 2.284344434738159, "learning_rate": 8.927139945652173e-05, "loss": 0.6761, "step": 5279 }, { "epoch": 0.8265497808390733, "grad_norm": 3.7270045280456543, "learning_rate": 8.92595108695652e-05, "loss": 1.1771, "step": 5280 }, { "epoch": 0.8267063243581716, "grad_norm": 1.8908765316009521, "learning_rate": 8.924762228260868e-05, "loss": 0.7194, "step": 5281 }, { "epoch": 0.8268628678772699, "grad_norm": 2.371622323989868, "learning_rate": 8.923573369565216e-05, "loss": 1.361, "step": 5282 }, { "epoch": 0.8270194113963681, "grad_norm": 1.7995071411132812, "learning_rate": 8.922384510869564e-05, "loss": 0.7064, "step": 5283 }, { "epoch": 0.8271759549154665, "grad_norm": 1.641873836517334, "learning_rate": 8.921195652173912e-05, "loss": 0.7598, "step": 5284 }, { "epoch": 0.8273324984345648, "grad_norm": 1.780644178390503, "learning_rate": 8.92000679347826e-05, "loss": 1.2362, "step": 5285 }, { "epoch": 0.8274890419536631, "grad_norm": 1.8862626552581787, "learning_rate": 8.918817934782607e-05, "loss": 0.5319, "step": 5286 }, { "epoch": 0.8276455854727615, "grad_norm": 2.8890132904052734, "learning_rate": 8.917629076086955e-05, "loss": 0.6433, "step": 5287 }, { "epoch": 0.8278021289918598, "grad_norm": 2.940089225769043, "learning_rate": 8.916440217391303e-05, "loss": 1.2296, 
"step": 5288 }, { "epoch": 0.827958672510958, "grad_norm": 3.4027416706085205, "learning_rate": 8.915251358695651e-05, "loss": 1.2001, "step": 5289 }, { "epoch": 0.8281152160300563, "grad_norm": 2.946690559387207, "learning_rate": 8.914062499999999e-05, "loss": 1.0802, "step": 5290 }, { "epoch": 0.8282717595491547, "grad_norm": 2.8609871864318848, "learning_rate": 8.912873641304346e-05, "loss": 1.3492, "step": 5291 }, { "epoch": 0.828428303068253, "grad_norm": 2.3392350673675537, "learning_rate": 8.911684782608696e-05, "loss": 1.4659, "step": 5292 }, { "epoch": 0.8285848465873513, "grad_norm": 2.620279550552368, "learning_rate": 8.910495923913043e-05, "loss": 0.9808, "step": 5293 }, { "epoch": 0.8287413901064495, "grad_norm": 1.6755872964859009, "learning_rate": 8.909307065217391e-05, "loss": 1.2143, "step": 5294 }, { "epoch": 0.8288979336255479, "grad_norm": 2.370971918106079, "learning_rate": 8.908118206521739e-05, "loss": 1.2963, "step": 5295 }, { "epoch": 0.8290544771446462, "grad_norm": 2.544985294342041, "learning_rate": 8.906929347826087e-05, "loss": 1.0297, "step": 5296 }, { "epoch": 0.8292110206637445, "grad_norm": 2.483201265335083, "learning_rate": 8.905740489130435e-05, "loss": 0.5809, "step": 5297 }, { "epoch": 0.8293675641828429, "grad_norm": 1.7798092365264893, "learning_rate": 8.904551630434783e-05, "loss": 0.7525, "step": 5298 }, { "epoch": 0.8295241077019412, "grad_norm": 2.6105306148529053, "learning_rate": 8.90336277173913e-05, "loss": 1.1633, "step": 5299 }, { "epoch": 0.8296806512210394, "grad_norm": 3.186844825744629, "learning_rate": 8.902173913043478e-05, "loss": 1.6183, "step": 5300 }, { "epoch": 0.8298371947401377, "grad_norm": 0.4722957909107208, "learning_rate": 8.900985054347826e-05, "loss": 0.264, "step": 5301 }, { "epoch": 0.8299937382592361, "grad_norm": 0.6712448000907898, "learning_rate": 8.899796195652172e-05, "loss": 0.3235, "step": 5302 }, { "epoch": 0.8301502817783344, "grad_norm": 0.48853543400764465, "learning_rate": 
8.89860733695652e-05, "loss": 0.2865, "step": 5303 }, { "epoch": 0.8303068252974327, "grad_norm": 0.7262674570083618, "learning_rate": 8.897418478260868e-05, "loss": 0.3687, "step": 5304 }, { "epoch": 0.830463368816531, "grad_norm": 0.6689429879188538, "learning_rate": 8.896229619565216e-05, "loss": 0.3094, "step": 5305 }, { "epoch": 0.8306199123356293, "grad_norm": 1.3405731916427612, "learning_rate": 8.895040760869564e-05, "loss": 0.3854, "step": 5306 }, { "epoch": 0.8307764558547276, "grad_norm": 0.6720079779624939, "learning_rate": 8.893851902173912e-05, "loss": 0.3264, "step": 5307 }, { "epoch": 0.8309329993738259, "grad_norm": 0.9881694316864014, "learning_rate": 8.89266304347826e-05, "loss": 0.4043, "step": 5308 }, { "epoch": 0.8310895428929242, "grad_norm": 2.131093978881836, "learning_rate": 8.891474184782607e-05, "loss": 0.7577, "step": 5309 }, { "epoch": 0.8312460864120226, "grad_norm": 0.5986841917037964, "learning_rate": 8.890285326086955e-05, "loss": 0.3009, "step": 5310 }, { "epoch": 0.8314026299311209, "grad_norm": 1.9602125883102417, "learning_rate": 8.889096467391303e-05, "loss": 0.5437, "step": 5311 }, { "epoch": 0.8315591734502191, "grad_norm": 0.7702073454856873, "learning_rate": 8.887907608695652e-05, "loss": 0.4191, "step": 5312 }, { "epoch": 0.8317157169693175, "grad_norm": 0.9896547794342041, "learning_rate": 8.88671875e-05, "loss": 0.3885, "step": 5313 }, { "epoch": 0.8318722604884158, "grad_norm": 1.5765944719314575, "learning_rate": 8.885529891304348e-05, "loss": 0.4729, "step": 5314 }, { "epoch": 0.8320288040075141, "grad_norm": 1.6858851909637451, "learning_rate": 8.884341032608695e-05, "loss": 0.5103, "step": 5315 }, { "epoch": 0.8321853475266124, "grad_norm": 1.5775293111801147, "learning_rate": 8.883152173913043e-05, "loss": 0.5822, "step": 5316 }, { "epoch": 0.8323418910457107, "grad_norm": 0.9942771792411804, "learning_rate": 8.881963315217391e-05, "loss": 0.3548, "step": 5317 }, { "epoch": 0.832498434564809, "grad_norm": 
1.3510366678237915, "learning_rate": 8.880774456521739e-05, "loss": 0.4214, "step": 5318 }, { "epoch": 0.8326549780839073, "grad_norm": 1.6100376844406128, "learning_rate": 8.879585597826087e-05, "loss": 0.8586, "step": 5319 }, { "epoch": 0.8328115216030056, "grad_norm": 0.9903608560562134, "learning_rate": 8.878396739130434e-05, "loss": 0.5319, "step": 5320 }, { "epoch": 0.832968065122104, "grad_norm": 2.235217809677124, "learning_rate": 8.877207880434782e-05, "loss": 0.4827, "step": 5321 }, { "epoch": 0.8331246086412023, "grad_norm": 2.907715320587158, "learning_rate": 8.87601902173913e-05, "loss": 0.8935, "step": 5322 }, { "epoch": 0.8332811521603005, "grad_norm": 2.9199488162994385, "learning_rate": 8.874830163043478e-05, "loss": 0.7604, "step": 5323 }, { "epoch": 0.8334376956793988, "grad_norm": 3.287466526031494, "learning_rate": 8.873641304347826e-05, "loss": 0.7577, "step": 5324 }, { "epoch": 0.8335942391984972, "grad_norm": 2.235964775085449, "learning_rate": 8.872452445652172e-05, "loss": 0.7584, "step": 5325 }, { "epoch": 0.8337507827175955, "grad_norm": 2.4475655555725098, "learning_rate": 8.87126358695652e-05, "loss": 0.8481, "step": 5326 }, { "epoch": 0.8339073262366938, "grad_norm": 2.6088016033172607, "learning_rate": 8.870074728260868e-05, "loss": 0.5786, "step": 5327 }, { "epoch": 0.8340638697557922, "grad_norm": 2.4300477504730225, "learning_rate": 8.868885869565216e-05, "loss": 0.8591, "step": 5328 }, { "epoch": 0.8342204132748904, "grad_norm": 3.439194440841675, "learning_rate": 8.867697010869563e-05, "loss": 0.7479, "step": 5329 }, { "epoch": 0.8343769567939887, "grad_norm": 2.859330892562866, "learning_rate": 8.866508152173911e-05, "loss": 0.5575, "step": 5330 }, { "epoch": 0.834533500313087, "grad_norm": 1.3635895252227783, "learning_rate": 8.865319293478259e-05, "loss": 0.6576, "step": 5331 }, { "epoch": 0.8346900438321854, "grad_norm": 2.030200242996216, "learning_rate": 8.864130434782608e-05, "loss": 0.6233, "step": 5332 }, { "epoch": 
0.8348465873512837, "grad_norm": 4.215113162994385, "learning_rate": 8.862941576086956e-05, "loss": 0.6956, "step": 5333 }, { "epoch": 0.8350031308703819, "grad_norm": 2.2112786769866943, "learning_rate": 8.861752717391304e-05, "loss": 0.8774, "step": 5334 }, { "epoch": 0.8351596743894802, "grad_norm": 4.143497943878174, "learning_rate": 8.860563858695652e-05, "loss": 0.9879, "step": 5335 }, { "epoch": 0.8353162179085786, "grad_norm": 1.8709466457366943, "learning_rate": 8.859375e-05, "loss": 0.6721, "step": 5336 }, { "epoch": 0.8354727614276769, "grad_norm": 2.1915245056152344, "learning_rate": 8.858186141304347e-05, "loss": 1.1088, "step": 5337 }, { "epoch": 0.8356293049467752, "grad_norm": 3.0211217403411865, "learning_rate": 8.856997282608695e-05, "loss": 1.1345, "step": 5338 }, { "epoch": 0.8357858484658736, "grad_norm": 1.9924949407577515, "learning_rate": 8.855808423913043e-05, "loss": 0.9194, "step": 5339 }, { "epoch": 0.8359423919849718, "grad_norm": 2.067997932434082, "learning_rate": 8.854619565217391e-05, "loss": 0.8068, "step": 5340 }, { "epoch": 0.8360989355040701, "grad_norm": 6.409606456756592, "learning_rate": 8.853430706521739e-05, "loss": 0.8915, "step": 5341 }, { "epoch": 0.8362554790231684, "grad_norm": 5.322498321533203, "learning_rate": 8.852241847826086e-05, "loss": 1.2216, "step": 5342 }, { "epoch": 0.8364120225422668, "grad_norm": 3.9579882621765137, "learning_rate": 8.851052989130434e-05, "loss": 0.9754, "step": 5343 }, { "epoch": 0.8365685660613651, "grad_norm": 1.9576719999313354, "learning_rate": 8.849864130434782e-05, "loss": 1.0272, "step": 5344 }, { "epoch": 0.8367251095804634, "grad_norm": 2.2106149196624756, "learning_rate": 8.84867527173913e-05, "loss": 0.7185, "step": 5345 }, { "epoch": 0.8368816530995616, "grad_norm": 1.92005455493927, "learning_rate": 8.847486413043478e-05, "loss": 0.8107, "step": 5346 }, { "epoch": 0.83703819661866, "grad_norm": 2.5457143783569336, "learning_rate": 8.846297554347825e-05, "loss": 1.272, 
"step": 5347 }, { "epoch": 0.8371947401377583, "grad_norm": 0.9756335020065308, "learning_rate": 8.845108695652172e-05, "loss": 0.3926, "step": 5348 }, { "epoch": 0.8373512836568566, "grad_norm": 3.5690853595733643, "learning_rate": 8.84391983695652e-05, "loss": 0.8406, "step": 5349 }, { "epoch": 0.837507827175955, "grad_norm": 3.0543441772460938, "learning_rate": 8.842730978260868e-05, "loss": 0.9469, "step": 5350 }, { "epoch": 0.8376643706950532, "grad_norm": 0.3854352533817291, "learning_rate": 8.841542119565215e-05, "loss": 0.2404, "step": 5351 }, { "epoch": 0.8378209142141515, "grad_norm": 0.4354816675186157, "learning_rate": 8.840353260869565e-05, "loss": 0.3064, "step": 5352 }, { "epoch": 0.8379774577332498, "grad_norm": 1.0940358638763428, "learning_rate": 8.839164402173912e-05, "loss": 0.287, "step": 5353 }, { "epoch": 0.8381340012523482, "grad_norm": 0.7098099589347839, "learning_rate": 8.83797554347826e-05, "loss": 0.3401, "step": 5354 }, { "epoch": 0.8382905447714465, "grad_norm": 0.5488841533660889, "learning_rate": 8.836786684782608e-05, "loss": 0.2397, "step": 5355 }, { "epoch": 0.8384470882905448, "grad_norm": 0.9296640157699585, "learning_rate": 8.835597826086956e-05, "loss": 0.4306, "step": 5356 }, { "epoch": 0.838603631809643, "grad_norm": 0.8003067374229431, "learning_rate": 8.834408967391304e-05, "loss": 0.2113, "step": 5357 }, { "epoch": 0.8387601753287414, "grad_norm": 1.1343063116073608, "learning_rate": 8.833220108695651e-05, "loss": 0.2946, "step": 5358 }, { "epoch": 0.8389167188478397, "grad_norm": 0.6426655054092407, "learning_rate": 8.832031249999999e-05, "loss": 0.3353, "step": 5359 }, { "epoch": 0.839073262366938, "grad_norm": 0.712860643863678, "learning_rate": 8.830842391304347e-05, "loss": 0.2596, "step": 5360 }, { "epoch": 0.8392298058860364, "grad_norm": 0.913581907749176, "learning_rate": 8.829653532608695e-05, "loss": 0.3247, "step": 5361 }, { "epoch": 0.8393863494051347, "grad_norm": 2.0829362869262695, "learning_rate": 
8.828464673913043e-05, "loss": 0.4038, "step": 5362 }, { "epoch": 0.8395428929242329, "grad_norm": 1.4852325916290283, "learning_rate": 8.82727581521739e-05, "loss": 0.5716, "step": 5363 }, { "epoch": 0.8396994364433312, "grad_norm": 0.913151204586029, "learning_rate": 8.826086956521738e-05, "loss": 0.3351, "step": 5364 }, { "epoch": 0.8398559799624296, "grad_norm": 0.979848325252533, "learning_rate": 8.824898097826086e-05, "loss": 0.3262, "step": 5365 }, { "epoch": 0.8400125234815279, "grad_norm": 1.4704645872116089, "learning_rate": 8.823709239130434e-05, "loss": 0.5402, "step": 5366 }, { "epoch": 0.8401690670006262, "grad_norm": 2.111781120300293, "learning_rate": 8.822520380434782e-05, "loss": 0.5006, "step": 5367 }, { "epoch": 0.8403256105197244, "grad_norm": 1.1378949880599976, "learning_rate": 8.82133152173913e-05, "loss": 0.4165, "step": 5368 }, { "epoch": 0.8404821540388228, "grad_norm": 1.1391712427139282, "learning_rate": 8.820142663043479e-05, "loss": 0.3374, "step": 5369 }, { "epoch": 0.8406386975579211, "grad_norm": 1.2761986255645752, "learning_rate": 8.818953804347827e-05, "loss": 0.5062, "step": 5370 }, { "epoch": 0.8407952410770194, "grad_norm": 2.9176485538482666, "learning_rate": 8.817764945652173e-05, "loss": 0.4242, "step": 5371 }, { "epoch": 0.8409517845961177, "grad_norm": 1.9821102619171143, "learning_rate": 8.816576086956521e-05, "loss": 0.6903, "step": 5372 }, { "epoch": 0.8411083281152161, "grad_norm": 6.328606128692627, "learning_rate": 8.815387228260869e-05, "loss": 0.7485, "step": 5373 }, { "epoch": 0.8412648716343143, "grad_norm": 3.2388646602630615, "learning_rate": 8.814198369565217e-05, "loss": 1.1587, "step": 5374 }, { "epoch": 0.8414214151534126, "grad_norm": 1.6539113521575928, "learning_rate": 8.813009510869564e-05, "loss": 0.5597, "step": 5375 }, { "epoch": 0.841577958672511, "grad_norm": 2.182218313217163, "learning_rate": 8.811820652173912e-05, "loss": 0.4129, "step": 5376 }, { "epoch": 0.8417345021916093, "grad_norm": 
2.2672152519226074, "learning_rate": 8.81063179347826e-05, "loss": 0.6416, "step": 5377 }, { "epoch": 0.8418910457107076, "grad_norm": 2.731959581375122, "learning_rate": 8.809442934782608e-05, "loss": 0.8309, "step": 5378 }, { "epoch": 0.8420475892298059, "grad_norm": 1.254808783531189, "learning_rate": 8.808254076086956e-05, "loss": 0.3879, "step": 5379 }, { "epoch": 0.8422041327489042, "grad_norm": 2.081847667694092, "learning_rate": 8.807065217391303e-05, "loss": 0.5263, "step": 5380 }, { "epoch": 0.8423606762680025, "grad_norm": 1.542258620262146, "learning_rate": 8.805876358695651e-05, "loss": 0.9478, "step": 5381 }, { "epoch": 0.8425172197871008, "grad_norm": 2.4175546169281006, "learning_rate": 8.804687499999999e-05, "loss": 0.857, "step": 5382 }, { "epoch": 0.8426737633061991, "grad_norm": 2.33785343170166, "learning_rate": 8.803498641304347e-05, "loss": 0.7995, "step": 5383 }, { "epoch": 0.8428303068252975, "grad_norm": 1.7313936948776245, "learning_rate": 8.802309782608695e-05, "loss": 0.694, "step": 5384 }, { "epoch": 0.8429868503443957, "grad_norm": 4.474482536315918, "learning_rate": 8.801120923913042e-05, "loss": 1.0639, "step": 5385 }, { "epoch": 0.843143393863494, "grad_norm": 7.876155376434326, "learning_rate": 8.79993206521739e-05, "loss": 2.0009, "step": 5386 }, { "epoch": 0.8432999373825923, "grad_norm": 4.573262691497803, "learning_rate": 8.798743206521738e-05, "loss": 0.9889, "step": 5387 }, { "epoch": 0.8434564809016907, "grad_norm": 2.8231537342071533, "learning_rate": 8.797554347826086e-05, "loss": 1.2256, "step": 5388 }, { "epoch": 0.843613024420789, "grad_norm": 1.4422330856323242, "learning_rate": 8.796365489130435e-05, "loss": 0.8941, "step": 5389 }, { "epoch": 0.8437695679398873, "grad_norm": 2.805431365966797, "learning_rate": 8.795176630434783e-05, "loss": 1.7834, "step": 5390 }, { "epoch": 0.8439261114589856, "grad_norm": 3.2926900386810303, "learning_rate": 8.793987771739131e-05, "loss": 1.3006, "step": 5391 }, { "epoch": 
0.8440826549780839, "grad_norm": 4.782486915588379, "learning_rate": 8.792798913043479e-05, "loss": 1.8549, "step": 5392 }, { "epoch": 0.8442391984971822, "grad_norm": 2.453493118286133, "learning_rate": 8.791610054347826e-05, "loss": 1.2019, "step": 5393 }, { "epoch": 0.8443957420162805, "grad_norm": 3.491562843322754, "learning_rate": 8.790421195652173e-05, "loss": 1.0382, "step": 5394 }, { "epoch": 0.8445522855353789, "grad_norm": 2.1968631744384766, "learning_rate": 8.78923233695652e-05, "loss": 0.6296, "step": 5395 }, { "epoch": 0.8447088290544772, "grad_norm": 1.2232187986373901, "learning_rate": 8.788043478260868e-05, "loss": 0.475, "step": 5396 }, { "epoch": 0.8448653725735754, "grad_norm": 2.3276283740997314, "learning_rate": 8.786854619565216e-05, "loss": 0.9694, "step": 5397 }, { "epoch": 0.8450219160926737, "grad_norm": 3.8735203742980957, "learning_rate": 8.785665760869564e-05, "loss": 0.8536, "step": 5398 }, { "epoch": 0.8451784596117721, "grad_norm": 5.586703777313232, "learning_rate": 8.784476902173912e-05, "loss": 1.5263, "step": 5399 }, { "epoch": 0.8453350031308704, "grad_norm": 2.887132406234741, "learning_rate": 8.78328804347826e-05, "loss": 1.0959, "step": 5400 }, { "epoch": 0.8454915466499687, "grad_norm": 0.5541558265686035, "learning_rate": 8.782099184782608e-05, "loss": 0.2778, "step": 5401 }, { "epoch": 0.845648090169067, "grad_norm": 0.5790005922317505, "learning_rate": 8.780910326086955e-05, "loss": 0.2971, "step": 5402 }, { "epoch": 0.8458046336881653, "grad_norm": 0.4915883243083954, "learning_rate": 8.779721467391303e-05, "loss": 0.2258, "step": 5403 }, { "epoch": 0.8459611772072636, "grad_norm": 2.7705881595611572, "learning_rate": 8.778532608695651e-05, "loss": 0.3402, "step": 5404 }, { "epoch": 0.8461177207263619, "grad_norm": 0.46802377700805664, "learning_rate": 8.777343749999999e-05, "loss": 0.1933, "step": 5405 }, { "epoch": 0.8462742642454603, "grad_norm": 0.651267409324646, "learning_rate": 8.776154891304347e-05, "loss": 
0.2855, "step": 5406 }, { "epoch": 0.8464308077645586, "grad_norm": 0.7912462949752808, "learning_rate": 8.774966032608694e-05, "loss": 0.2839, "step": 5407 }, { "epoch": 0.8465873512836568, "grad_norm": 0.6992650032043457, "learning_rate": 8.773777173913042e-05, "loss": 0.2689, "step": 5408 }, { "epoch": 0.8467438948027551, "grad_norm": 0.8923744559288025, "learning_rate": 8.772588315217391e-05, "loss": 0.5415, "step": 5409 }, { "epoch": 0.8469004383218535, "grad_norm": 0.8478952050209045, "learning_rate": 8.771399456521739e-05, "loss": 0.3664, "step": 5410 }, { "epoch": 0.8470569818409518, "grad_norm": 0.972070574760437, "learning_rate": 8.770210597826087e-05, "loss": 0.2863, "step": 5411 }, { "epoch": 0.8472135253600501, "grad_norm": 1.483775019645691, "learning_rate": 8.769021739130435e-05, "loss": 0.3084, "step": 5412 }, { "epoch": 0.8473700688791485, "grad_norm": 1.6633985042572021, "learning_rate": 8.767832880434783e-05, "loss": 0.4881, "step": 5413 }, { "epoch": 0.8475266123982467, "grad_norm": 1.1154967546463013, "learning_rate": 8.76664402173913e-05, "loss": 0.2594, "step": 5414 }, { "epoch": 0.847683155917345, "grad_norm": 2.1713781356811523, "learning_rate": 8.765455163043478e-05, "loss": 0.4527, "step": 5415 }, { "epoch": 0.8478396994364433, "grad_norm": 1.3490898609161377, "learning_rate": 8.764266304347826e-05, "loss": 0.8232, "step": 5416 }, { "epoch": 0.8479962429555417, "grad_norm": 1.4562464952468872, "learning_rate": 8.763077445652173e-05, "loss": 0.5445, "step": 5417 }, { "epoch": 0.84815278647464, "grad_norm": 1.7043849229812622, "learning_rate": 8.76188858695652e-05, "loss": 0.6293, "step": 5418 }, { "epoch": 0.8483093299937383, "grad_norm": 1.720474362373352, "learning_rate": 8.760699728260868e-05, "loss": 0.756, "step": 5419 }, { "epoch": 0.8484658735128365, "grad_norm": 1.2404755353927612, "learning_rate": 8.759510869565216e-05, "loss": 0.4256, "step": 5420 }, { "epoch": 0.8486224170319349, "grad_norm": 1.0619641542434692, "learning_rate": 
8.758322010869564e-05, "loss": 0.5005, "step": 5421 }, { "epoch": 0.8487789605510332, "grad_norm": 1.5431100130081177, "learning_rate": 8.757133152173912e-05, "loss": 0.5555, "step": 5422 }, { "epoch": 0.8489355040701315, "grad_norm": 0.8918030858039856, "learning_rate": 8.75594429347826e-05, "loss": 0.4193, "step": 5423 }, { "epoch": 0.8490920475892298, "grad_norm": 1.7513856887817383, "learning_rate": 8.754755434782607e-05, "loss": 0.6833, "step": 5424 }, { "epoch": 0.8492485911083281, "grad_norm": 2.0813148021698, "learning_rate": 8.753566576086955e-05, "loss": 0.9052, "step": 5425 }, { "epoch": 0.8494051346274264, "grad_norm": 6.79749059677124, "learning_rate": 8.752377717391303e-05, "loss": 1.5576, "step": 5426 }, { "epoch": 0.8495616781465247, "grad_norm": 1.3928219079971313, "learning_rate": 8.751188858695651e-05, "loss": 0.8637, "step": 5427 }, { "epoch": 0.849718221665623, "grad_norm": 1.8753681182861328, "learning_rate": 8.749999999999999e-05, "loss": 0.475, "step": 5428 }, { "epoch": 0.8498747651847214, "grad_norm": 1.4572772979736328, "learning_rate": 8.748811141304348e-05, "loss": 0.6288, "step": 5429 }, { "epoch": 0.8500313087038197, "grad_norm": 2.512629985809326, "learning_rate": 8.747622282608696e-05, "loss": 0.7829, "step": 5430 }, { "epoch": 0.8501878522229179, "grad_norm": 1.797520399093628, "learning_rate": 8.746433423913043e-05, "loss": 0.976, "step": 5431 }, { "epoch": 0.8503443957420163, "grad_norm": 1.3994966745376587, "learning_rate": 8.745244565217391e-05, "loss": 0.5698, "step": 5432 }, { "epoch": 0.8505009392611146, "grad_norm": 2.3969244956970215, "learning_rate": 8.744055706521739e-05, "loss": 0.9816, "step": 5433 }, { "epoch": 0.8506574827802129, "grad_norm": 2.9744532108306885, "learning_rate": 8.742866847826087e-05, "loss": 0.6115, "step": 5434 }, { "epoch": 0.8508140262993112, "grad_norm": 2.3417508602142334, "learning_rate": 8.741677989130435e-05, "loss": 0.7238, "step": 5435 }, { "epoch": 0.8509705698184096, "grad_norm": 
1.5508543252944946, "learning_rate": 8.740489130434782e-05, "loss": 0.4601, "step": 5436 }, { "epoch": 0.8511271133375078, "grad_norm": 1.9206079244613647, "learning_rate": 8.73930027173913e-05, "loss": 0.8719, "step": 5437 }, { "epoch": 0.8512836568566061, "grad_norm": 4.516556739807129, "learning_rate": 8.738111413043478e-05, "loss": 1.3199, "step": 5438 }, { "epoch": 0.8514402003757044, "grad_norm": 4.989134788513184, "learning_rate": 8.736922554347826e-05, "loss": 1.3886, "step": 5439 }, { "epoch": 0.8515967438948028, "grad_norm": 3.3823702335357666, "learning_rate": 8.735733695652172e-05, "loss": 1.1139, "step": 5440 }, { "epoch": 0.8517532874139011, "grad_norm": 2.7713770866394043, "learning_rate": 8.73454483695652e-05, "loss": 0.8019, "step": 5441 }, { "epoch": 0.8519098309329993, "grad_norm": 2.924119472503662, "learning_rate": 8.733355978260868e-05, "loss": 1.1867, "step": 5442 }, { "epoch": 0.8520663744520977, "grad_norm": 4.350882053375244, "learning_rate": 8.732167119565216e-05, "loss": 1.6805, "step": 5443 }, { "epoch": 0.852222917971196, "grad_norm": 3.4019932746887207, "learning_rate": 8.730978260869564e-05, "loss": 1.2891, "step": 5444 }, { "epoch": 0.8523794614902943, "grad_norm": 3.2382094860076904, "learning_rate": 8.729789402173911e-05, "loss": 0.8802, "step": 5445 }, { "epoch": 0.8525360050093926, "grad_norm": 4.198530197143555, "learning_rate": 8.728600543478259e-05, "loss": 1.051, "step": 5446 }, { "epoch": 0.852692548528491, "grad_norm": 1.4369323253631592, "learning_rate": 8.727411684782607e-05, "loss": 0.5306, "step": 5447 }, { "epoch": 0.8528490920475892, "grad_norm": 1.0436128377914429, "learning_rate": 8.726222826086956e-05, "loss": 0.3982, "step": 5448 }, { "epoch": 0.8530056355666875, "grad_norm": 3.2520949840545654, "learning_rate": 8.725033967391304e-05, "loss": 0.9321, "step": 5449 }, { "epoch": 0.8531621790857858, "grad_norm": 3.6256816387176514, "learning_rate": 8.723845108695652e-05, "loss": 1.1842, "step": 5450 }, { "epoch": 
0.8533187226048842, "grad_norm": 0.784745454788208, "learning_rate": 8.72265625e-05, "loss": 0.2511, "step": 5451 }, { "epoch": 0.8534752661239825, "grad_norm": 0.6600539684295654, "learning_rate": 8.721467391304347e-05, "loss": 0.3887, "step": 5452 }, { "epoch": 0.8536318096430808, "grad_norm": 0.5664318799972534, "learning_rate": 8.720278532608695e-05, "loss": 0.245, "step": 5453 }, { "epoch": 0.853788353162179, "grad_norm": 0.601123571395874, "learning_rate": 8.719089673913043e-05, "loss": 0.2498, "step": 5454 }, { "epoch": 0.8539448966812774, "grad_norm": 1.5043984651565552, "learning_rate": 8.717900815217391e-05, "loss": 0.4086, "step": 5455 }, { "epoch": 0.8541014402003757, "grad_norm": 1.2479976415634155, "learning_rate": 8.716711956521739e-05, "loss": 0.497, "step": 5456 }, { "epoch": 0.854257983719474, "grad_norm": 1.0096694231033325, "learning_rate": 8.715523097826087e-05, "loss": 0.5093, "step": 5457 }, { "epoch": 0.8544145272385724, "grad_norm": 0.8372544050216675, "learning_rate": 8.714334239130434e-05, "loss": 0.3848, "step": 5458 }, { "epoch": 0.8545710707576706, "grad_norm": 0.9401189088821411, "learning_rate": 8.713145380434782e-05, "loss": 0.2812, "step": 5459 }, { "epoch": 0.8547276142767689, "grad_norm": 0.9087872505187988, "learning_rate": 8.71195652173913e-05, "loss": 0.4055, "step": 5460 }, { "epoch": 0.8548841577958672, "grad_norm": 1.029895305633545, "learning_rate": 8.710767663043478e-05, "loss": 0.4778, "step": 5461 }, { "epoch": 0.8550407013149656, "grad_norm": 1.4157477617263794, "learning_rate": 8.709578804347826e-05, "loss": 0.3785, "step": 5462 }, { "epoch": 0.8551972448340639, "grad_norm": 1.5097975730895996, "learning_rate": 8.708389945652172e-05, "loss": 0.4808, "step": 5463 }, { "epoch": 0.8553537883531622, "grad_norm": 1.2706317901611328, "learning_rate": 8.70720108695652e-05, "loss": 0.6059, "step": 5464 }, { "epoch": 0.8555103318722604, "grad_norm": 1.0231353044509888, "learning_rate": 8.706012228260868e-05, "loss": 0.4147, 
"step": 5465 }, { "epoch": 0.8556668753913588, "grad_norm": 1.1858527660369873, "learning_rate": 8.704823369565216e-05, "loss": 0.5491, "step": 5466 }, { "epoch": 0.8558234189104571, "grad_norm": 1.8789476156234741, "learning_rate": 8.703634510869563e-05, "loss": 0.5504, "step": 5467 }, { "epoch": 0.8559799624295554, "grad_norm": 1.1016745567321777, "learning_rate": 8.702445652173913e-05, "loss": 0.6061, "step": 5468 }, { "epoch": 0.8561365059486538, "grad_norm": 0.9951156973838806, "learning_rate": 8.70125679347826e-05, "loss": 0.3922, "step": 5469 }, { "epoch": 0.8562930494677521, "grad_norm": 1.0811753273010254, "learning_rate": 8.700067934782608e-05, "loss": 0.5249, "step": 5470 }, { "epoch": 0.8564495929868503, "grad_norm": 2.3542215824127197, "learning_rate": 8.698879076086956e-05, "loss": 0.7895, "step": 5471 }, { "epoch": 0.8566061365059486, "grad_norm": 3.1874825954437256, "learning_rate": 8.697690217391304e-05, "loss": 1.0905, "step": 5472 }, { "epoch": 0.856762680025047, "grad_norm": 1.4346171617507935, "learning_rate": 8.696501358695652e-05, "loss": 0.5587, "step": 5473 }, { "epoch": 0.8569192235441453, "grad_norm": 2.6417720317840576, "learning_rate": 8.6953125e-05, "loss": 0.5772, "step": 5474 }, { "epoch": 0.8570757670632436, "grad_norm": 2.8630905151367188, "learning_rate": 8.694123641304347e-05, "loss": 0.9781, "step": 5475 }, { "epoch": 0.8572323105823418, "grad_norm": 1.3780604600906372, "learning_rate": 8.692934782608695e-05, "loss": 0.9561, "step": 5476 }, { "epoch": 0.8573888541014402, "grad_norm": 2.1020588874816895, "learning_rate": 8.691745923913043e-05, "loss": 0.5639, "step": 5477 }, { "epoch": 0.8575453976205385, "grad_norm": 1.1991020441055298, "learning_rate": 8.690557065217391e-05, "loss": 0.6973, "step": 5478 }, { "epoch": 0.8577019411396368, "grad_norm": 1.5312730073928833, "learning_rate": 8.689368206521739e-05, "loss": 0.2777, "step": 5479 }, { "epoch": 0.8578584846587352, "grad_norm": 1.6915258169174194, "learning_rate": 
8.688179347826086e-05, "loss": 0.8324, "step": 5480 }, { "epoch": 0.8580150281778335, "grad_norm": 2.9591636657714844, "learning_rate": 8.686990489130434e-05, "loss": 0.8627, "step": 5481 }, { "epoch": 0.8581715716969317, "grad_norm": 2.1169724464416504, "learning_rate": 8.685801630434782e-05, "loss": 0.7397, "step": 5482 }, { "epoch": 0.85832811521603, "grad_norm": 7.48992919921875, "learning_rate": 8.68461277173913e-05, "loss": 1.9694, "step": 5483 }, { "epoch": 0.8584846587351284, "grad_norm": 1.8384634256362915, "learning_rate": 8.683423913043478e-05, "loss": 1.0347, "step": 5484 }, { "epoch": 0.8586412022542267, "grad_norm": 5.302188396453857, "learning_rate": 8.682235054347825e-05, "loss": 0.6467, "step": 5485 }, { "epoch": 0.858797745773325, "grad_norm": 2.307727098464966, "learning_rate": 8.681046195652172e-05, "loss": 1.0747, "step": 5486 }, { "epoch": 0.8589542892924233, "grad_norm": 2.329775094985962, "learning_rate": 8.67985733695652e-05, "loss": 1.2413, "step": 5487 }, { "epoch": 0.8591108328115216, "grad_norm": 4.266937732696533, "learning_rate": 8.678668478260869e-05, "loss": 1.2167, "step": 5488 }, { "epoch": 0.8592673763306199, "grad_norm": 4.533767223358154, "learning_rate": 8.677479619565217e-05, "loss": 1.2731, "step": 5489 }, { "epoch": 0.8594239198497182, "grad_norm": 3.9129202365875244, "learning_rate": 8.676290760869564e-05, "loss": 1.5203, "step": 5490 }, { "epoch": 0.8595804633688165, "grad_norm": 1.6305701732635498, "learning_rate": 8.675101902173912e-05, "loss": 0.9807, "step": 5491 }, { "epoch": 0.8597370068879149, "grad_norm": 3.2365901470184326, "learning_rate": 8.67391304347826e-05, "loss": 0.6512, "step": 5492 }, { "epoch": 0.8598935504070131, "grad_norm": 1.8785210847854614, "learning_rate": 8.672724184782608e-05, "loss": 0.4477, "step": 5493 }, { "epoch": 0.8600500939261114, "grad_norm": 2.2417590618133545, "learning_rate": 8.671535326086956e-05, "loss": 1.5176, "step": 5494 }, { "epoch": 0.8602066374452098, "grad_norm": 
2.366938352584839, "learning_rate": 8.670346467391304e-05, "loss": 0.8247, "step": 5495 }, { "epoch": 0.8603631809643081, "grad_norm": 4.872781753540039, "learning_rate": 8.669157608695651e-05, "loss": 1.2017, "step": 5496 }, { "epoch": 0.8605197244834064, "grad_norm": 3.524395227432251, "learning_rate": 8.667968749999999e-05, "loss": 0.4309, "step": 5497 }, { "epoch": 0.8606762680025047, "grad_norm": 3.38177227973938, "learning_rate": 8.666779891304347e-05, "loss": 0.7246, "step": 5498 }, { "epoch": 0.860832811521603, "grad_norm": 2.670896053314209, "learning_rate": 8.665591032608695e-05, "loss": 0.8497, "step": 5499 }, { "epoch": 0.8609893550407013, "grad_norm": 4.118897438049316, "learning_rate": 8.664402173913043e-05, "loss": 1.684, "step": 5500 }, { "epoch": 0.8611458985597996, "grad_norm": 0.5886116027832031, "learning_rate": 8.66321331521739e-05, "loss": 0.2791, "step": 5501 }, { "epoch": 0.861302442078898, "grad_norm": 0.6227549910545349, "learning_rate": 8.662024456521738e-05, "loss": 0.2522, "step": 5502 }, { "epoch": 0.8614589855979963, "grad_norm": 0.6188749670982361, "learning_rate": 8.660835597826086e-05, "loss": 0.2427, "step": 5503 }, { "epoch": 0.8616155291170946, "grad_norm": 1.1619402170181274, "learning_rate": 8.659646739130434e-05, "loss": 0.5164, "step": 5504 }, { "epoch": 0.8617720726361928, "grad_norm": 1.2367018461227417, "learning_rate": 8.658457880434783e-05, "loss": 0.4406, "step": 5505 }, { "epoch": 0.8619286161552911, "grad_norm": 0.8324200510978699, "learning_rate": 8.657269021739131e-05, "loss": 0.3535, "step": 5506 }, { "epoch": 0.8620851596743895, "grad_norm": 0.7902551293373108, "learning_rate": 8.656080163043479e-05, "loss": 0.3339, "step": 5507 }, { "epoch": 0.8622417031934878, "grad_norm": 0.8638984560966492, "learning_rate": 8.654891304347827e-05, "loss": 0.2731, "step": 5508 }, { "epoch": 0.8623982467125861, "grad_norm": 0.5960444808006287, "learning_rate": 8.653702445652173e-05, "loss": 0.1935, "step": 5509 }, { "epoch": 
0.8625547902316844, "grad_norm": 0.7089747786521912, "learning_rate": 8.652513586956521e-05, "loss": 0.3306, "step": 5510 }, { "epoch": 0.8627113337507827, "grad_norm": 0.8517075181007385, "learning_rate": 8.651324728260869e-05, "loss": 0.2748, "step": 5511 }, { "epoch": 0.862867877269881, "grad_norm": 0.6808117032051086, "learning_rate": 8.650135869565216e-05, "loss": 0.3366, "step": 5512 }, { "epoch": 0.8630244207889793, "grad_norm": 0.8240519165992737, "learning_rate": 8.648947010869564e-05, "loss": 0.3883, "step": 5513 }, { "epoch": 0.8631809643080777, "grad_norm": 1.1415549516677856, "learning_rate": 8.647758152173912e-05, "loss": 0.3666, "step": 5514 }, { "epoch": 0.863337507827176, "grad_norm": 1.6499348878860474, "learning_rate": 8.64656929347826e-05, "loss": 0.5248, "step": 5515 }, { "epoch": 0.8634940513462742, "grad_norm": 3.07328724861145, "learning_rate": 8.645380434782608e-05, "loss": 0.6293, "step": 5516 }, { "epoch": 0.8636505948653725, "grad_norm": 1.3510329723358154, "learning_rate": 8.644191576086955e-05, "loss": 0.6983, "step": 5517 }, { "epoch": 0.8638071383844709, "grad_norm": 1.370919942855835, "learning_rate": 8.643002717391303e-05, "loss": 0.5079, "step": 5518 }, { "epoch": 0.8639636819035692, "grad_norm": 1.6073662042617798, "learning_rate": 8.641813858695651e-05, "loss": 0.4177, "step": 5519 }, { "epoch": 0.8641202254226675, "grad_norm": 1.490707278251648, "learning_rate": 8.640624999999999e-05, "loss": 0.5618, "step": 5520 }, { "epoch": 0.8642767689417659, "grad_norm": 1.2659006118774414, "learning_rate": 8.639436141304347e-05, "loss": 0.5766, "step": 5521 }, { "epoch": 0.8644333124608641, "grad_norm": 1.438582420349121, "learning_rate": 8.638247282608695e-05, "loss": 0.6417, "step": 5522 }, { "epoch": 0.8645898559799624, "grad_norm": 1.6417059898376465, "learning_rate": 8.637058423913042e-05, "loss": 0.5826, "step": 5523 }, { "epoch": 0.8647463994990607, "grad_norm": 2.700366497039795, "learning_rate": 8.63586956521739e-05, "loss": 
0.7268, "step": 5524 }, { "epoch": 0.8649029430181591, "grad_norm": 1.0571210384368896, "learning_rate": 8.63468070652174e-05, "loss": 0.5675, "step": 5525 }, { "epoch": 0.8650594865372574, "grad_norm": 1.9994057416915894, "learning_rate": 8.633491847826087e-05, "loss": 0.5473, "step": 5526 }, { "epoch": 0.8652160300563556, "grad_norm": 1.6926462650299072, "learning_rate": 8.632302989130435e-05, "loss": 1.0283, "step": 5527 }, { "epoch": 0.8653725735754539, "grad_norm": 2.2869679927825928, "learning_rate": 8.631114130434783e-05, "loss": 1.0466, "step": 5528 }, { "epoch": 0.8655291170945523, "grad_norm": 2.1803433895111084, "learning_rate": 8.62992527173913e-05, "loss": 0.9587, "step": 5529 }, { "epoch": 0.8656856606136506, "grad_norm": 2.8904969692230225, "learning_rate": 8.628736413043478e-05, "loss": 0.6973, "step": 5530 }, { "epoch": 0.8658422041327489, "grad_norm": 3.755099296569824, "learning_rate": 8.627547554347826e-05, "loss": 0.7483, "step": 5531 }, { "epoch": 0.8659987476518473, "grad_norm": 1.8937904834747314, "learning_rate": 8.626358695652173e-05, "loss": 0.7294, "step": 5532 }, { "epoch": 0.8661552911709455, "grad_norm": 4.799919605255127, "learning_rate": 8.62516983695652e-05, "loss": 0.6913, "step": 5533 }, { "epoch": 0.8663118346900438, "grad_norm": 3.507878541946411, "learning_rate": 8.623980978260868e-05, "loss": 0.8743, "step": 5534 }, { "epoch": 0.8664683782091421, "grad_norm": 3.011235475540161, "learning_rate": 8.622792119565216e-05, "loss": 0.9489, "step": 5535 }, { "epoch": 0.8666249217282405, "grad_norm": 2.876493453979492, "learning_rate": 8.621603260869564e-05, "loss": 0.9968, "step": 5536 }, { "epoch": 0.8667814652473388, "grad_norm": 3.2890398502349854, "learning_rate": 8.620414402173912e-05, "loss": 0.9033, "step": 5537 }, { "epoch": 0.8669380087664371, "grad_norm": 3.1737139225006104, "learning_rate": 8.61922554347826e-05, "loss": 1.41, "step": 5538 }, { "epoch": 0.8670945522855353, "grad_norm": 2.4880547523498535, "learning_rate": 
8.618036684782607e-05, "loss": 1.4273, "step": 5539 }, { "epoch": 0.8672510958046337, "grad_norm": 2.660600185394287, "learning_rate": 8.616847826086955e-05, "loss": 0.892, "step": 5540 }, { "epoch": 0.867407639323732, "grad_norm": 2.990304946899414, "learning_rate": 8.615658967391303e-05, "loss": 1.9043, "step": 5541 }, { "epoch": 0.8675641828428303, "grad_norm": 2.4657864570617676, "learning_rate": 8.614470108695651e-05, "loss": 1.1122, "step": 5542 }, { "epoch": 0.8677207263619287, "grad_norm": 1.5288726091384888, "learning_rate": 8.613281249999999e-05, "loss": 0.9252, "step": 5543 }, { "epoch": 0.867877269881027, "grad_norm": 1.7360360622406006, "learning_rate": 8.612092391304347e-05, "loss": 1.0553, "step": 5544 }, { "epoch": 0.8680338134001252, "grad_norm": 3.447996139526367, "learning_rate": 8.610903532608696e-05, "loss": 1.3029, "step": 5545 }, { "epoch": 0.8681903569192235, "grad_norm": 6.369134902954102, "learning_rate": 8.609714673913044e-05, "loss": 1.4618, "step": 5546 }, { "epoch": 0.8683469004383219, "grad_norm": 2.436089515686035, "learning_rate": 8.608525815217391e-05, "loss": 1.1468, "step": 5547 }, { "epoch": 0.8685034439574202, "grad_norm": 1.508509635925293, "learning_rate": 8.607336956521739e-05, "loss": 0.6771, "step": 5548 }, { "epoch": 0.8686599874765185, "grad_norm": 3.072213888168335, "learning_rate": 8.606148097826087e-05, "loss": 1.0405, "step": 5549 }, { "epoch": 0.8688165309956167, "grad_norm": 1.440189242362976, "learning_rate": 8.604959239130435e-05, "loss": 0.7609, "step": 5550 }, { "epoch": 0.8689730745147151, "grad_norm": 0.718450665473938, "learning_rate": 8.603770380434783e-05, "loss": 0.348, "step": 5551 }, { "epoch": 0.8691296180338134, "grad_norm": 0.5306031107902527, "learning_rate": 8.60258152173913e-05, "loss": 0.3138, "step": 5552 }, { "epoch": 0.8692861615529117, "grad_norm": 0.5810693502426147, "learning_rate": 8.601392663043478e-05, "loss": 0.2266, "step": 5553 }, { "epoch": 0.86944270507201, "grad_norm": 
0.5506734251976013, "learning_rate": 8.600203804347826e-05, "loss": 0.3086, "step": 5554 }, { "epoch": 0.8695992485911084, "grad_norm": 0.8341335654258728, "learning_rate": 8.599014945652172e-05, "loss": 0.3603, "step": 5555 }, { "epoch": 0.8697557921102066, "grad_norm": 0.7220032811164856, "learning_rate": 8.59782608695652e-05, "loss": 0.3213, "step": 5556 }, { "epoch": 0.8699123356293049, "grad_norm": 0.9753401279449463, "learning_rate": 8.596637228260868e-05, "loss": 0.2653, "step": 5557 }, { "epoch": 0.8700688791484033, "grad_norm": 1.0051226615905762, "learning_rate": 8.595448369565216e-05, "loss": 0.3932, "step": 5558 }, { "epoch": 0.8702254226675016, "grad_norm": 0.5699295401573181, "learning_rate": 8.594259510869564e-05, "loss": 0.2034, "step": 5559 }, { "epoch": 0.8703819661865999, "grad_norm": 1.0160009860992432, "learning_rate": 8.593070652173912e-05, "loss": 0.3592, "step": 5560 }, { "epoch": 0.8705385097056982, "grad_norm": 0.8355585932731628, "learning_rate": 8.59188179347826e-05, "loss": 0.2455, "step": 5561 }, { "epoch": 0.8706950532247965, "grad_norm": 0.5303447246551514, "learning_rate": 8.590692934782607e-05, "loss": 0.2527, "step": 5562 }, { "epoch": 0.8708515967438948, "grad_norm": 1.200756311416626, "learning_rate": 8.589504076086955e-05, "loss": 0.3862, "step": 5563 }, { "epoch": 0.8710081402629931, "grad_norm": 1.4274461269378662, "learning_rate": 8.588315217391303e-05, "loss": 0.3724, "step": 5564 }, { "epoch": 0.8711646837820914, "grad_norm": 1.845873236656189, "learning_rate": 8.587126358695652e-05, "loss": 0.6024, "step": 5565 }, { "epoch": 0.8713212273011898, "grad_norm": 0.9118216037750244, "learning_rate": 8.5859375e-05, "loss": 0.3657, "step": 5566 }, { "epoch": 0.871477770820288, "grad_norm": 2.55080246925354, "learning_rate": 8.584748641304348e-05, "loss": 0.4481, "step": 5567 }, { "epoch": 0.8716343143393863, "grad_norm": 2.7012875080108643, "learning_rate": 8.583559782608695e-05, "loss": 0.8201, "step": 5568 }, { "epoch": 
0.8717908578584846, "grad_norm": 2.4417409896850586, "learning_rate": 8.582370923913043e-05, "loss": 0.4634, "step": 5569 }, { "epoch": 0.871947401377583, "grad_norm": 1.131874680519104, "learning_rate": 8.581182065217391e-05, "loss": 0.5402, "step": 5570 }, { "epoch": 0.8721039448966813, "grad_norm": 1.3945516347885132, "learning_rate": 8.579993206521739e-05, "loss": 0.5256, "step": 5571 }, { "epoch": 0.8722604884157796, "grad_norm": 2.8240392208099365, "learning_rate": 8.578804347826087e-05, "loss": 0.5244, "step": 5572 }, { "epoch": 0.8724170319348779, "grad_norm": 2.185271739959717, "learning_rate": 8.577615489130435e-05, "loss": 0.6691, "step": 5573 }, { "epoch": 0.8725735754539762, "grad_norm": 2.473571300506592, "learning_rate": 8.576426630434782e-05, "loss": 0.4033, "step": 5574 }, { "epoch": 0.8727301189730745, "grad_norm": 3.7589945793151855, "learning_rate": 8.57523777173913e-05, "loss": 0.7357, "step": 5575 }, { "epoch": 0.8728866624921728, "grad_norm": 1.869259238243103, "learning_rate": 8.574048913043478e-05, "loss": 0.5656, "step": 5576 }, { "epoch": 0.8730432060112712, "grad_norm": 1.5778647661209106, "learning_rate": 8.572860054347826e-05, "loss": 0.4255, "step": 5577 }, { "epoch": 0.8731997495303695, "grad_norm": 3.484623432159424, "learning_rate": 8.571671195652172e-05, "loss": 0.7168, "step": 5578 }, { "epoch": 0.8733562930494677, "grad_norm": 4.5062713623046875, "learning_rate": 8.57048233695652e-05, "loss": 1.2711, "step": 5579 }, { "epoch": 0.873512836568566, "grad_norm": 1.7533302307128906, "learning_rate": 8.569293478260868e-05, "loss": 0.5995, "step": 5580 }, { "epoch": 0.8736693800876644, "grad_norm": 2.053168296813965, "learning_rate": 8.568104619565216e-05, "loss": 0.71, "step": 5581 }, { "epoch": 0.8738259236067627, "grad_norm": 5.223612308502197, "learning_rate": 8.566915760869564e-05, "loss": 1.1205, "step": 5582 }, { "epoch": 0.873982467125861, "grad_norm": 4.895323753356934, "learning_rate": 8.565726902173911e-05, "loss": 1.0368, 
"step": 5583 }, { "epoch": 0.8741390106449592, "grad_norm": 2.226228952407837, "learning_rate": 8.564538043478259e-05, "loss": 1.0134, "step": 5584 }, { "epoch": 0.8742955541640576, "grad_norm": 2.488269567489624, "learning_rate": 8.563349184782608e-05, "loss": 1.0651, "step": 5585 }, { "epoch": 0.8744520976831559, "grad_norm": 2.762920618057251, "learning_rate": 8.562160326086956e-05, "loss": 0.7315, "step": 5586 }, { "epoch": 0.8746086412022542, "grad_norm": 4.3104023933410645, "learning_rate": 8.560971467391304e-05, "loss": 1.4875, "step": 5587 }, { "epoch": 0.8747651847213526, "grad_norm": 3.1027889251708984, "learning_rate": 8.559782608695652e-05, "loss": 0.8099, "step": 5588 }, { "epoch": 0.8749217282404509, "grad_norm": 2.1330435276031494, "learning_rate": 8.55859375e-05, "loss": 0.9458, "step": 5589 }, { "epoch": 0.8750782717595491, "grad_norm": 2.4546496868133545, "learning_rate": 8.557404891304347e-05, "loss": 0.8181, "step": 5590 }, { "epoch": 0.8752348152786474, "grad_norm": 3.069246292114258, "learning_rate": 8.556216032608695e-05, "loss": 1.4575, "step": 5591 }, { "epoch": 0.8753913587977458, "grad_norm": 5.648372650146484, "learning_rate": 8.555027173913043e-05, "loss": 1.677, "step": 5592 }, { "epoch": 0.8755479023168441, "grad_norm": 2.246485948562622, "learning_rate": 8.553838315217391e-05, "loss": 1.4386, "step": 5593 }, { "epoch": 0.8757044458359424, "grad_norm": 2.1204938888549805, "learning_rate": 8.552649456521739e-05, "loss": 0.8165, "step": 5594 }, { "epoch": 0.8758609893550408, "grad_norm": 2.4663760662078857, "learning_rate": 8.551460597826086e-05, "loss": 1.0349, "step": 5595 }, { "epoch": 0.876017532874139, "grad_norm": 3.0044302940368652, "learning_rate": 8.550271739130434e-05, "loss": 1.2344, "step": 5596 }, { "epoch": 0.8761740763932373, "grad_norm": 6.975174903869629, "learning_rate": 8.549082880434782e-05, "loss": 0.8401, "step": 5597 }, { "epoch": 0.8763306199123356, "grad_norm": 2.2275023460388184, "learning_rate": 
8.54789402173913e-05, "loss": 0.3913, "step": 5598 }, { "epoch": 0.876487163431434, "grad_norm": 4.160890579223633, "learning_rate": 8.546705163043478e-05, "loss": 1.4344, "step": 5599 }, { "epoch": 0.8766437069505323, "grad_norm": 2.926431179046631, "learning_rate": 8.545516304347826e-05, "loss": 1.1854, "step": 5600 }, { "epoch": 0.8768002504696305, "grad_norm": 0.5364245772361755, "learning_rate": 8.544327445652172e-05, "loss": 0.2829, "step": 5601 }, { "epoch": 0.8769567939887288, "grad_norm": 0.6170448064804077, "learning_rate": 8.54313858695652e-05, "loss": 0.3111, "step": 5602 }, { "epoch": 0.8771133375078272, "grad_norm": 0.44011300802230835, "learning_rate": 8.541949728260868e-05, "loss": 0.291, "step": 5603 }, { "epoch": 0.8772698810269255, "grad_norm": 0.43509936332702637, "learning_rate": 8.540760869565215e-05, "loss": 0.1902, "step": 5604 }, { "epoch": 0.8774264245460238, "grad_norm": 0.9144447445869446, "learning_rate": 8.539572010869565e-05, "loss": 0.5166, "step": 5605 }, { "epoch": 0.8775829680651221, "grad_norm": 0.5999534726142883, "learning_rate": 8.538383152173912e-05, "loss": 0.3374, "step": 5606 }, { "epoch": 0.8777395115842204, "grad_norm": 1.1343799829483032, "learning_rate": 8.53719429347826e-05, "loss": 0.4133, "step": 5607 }, { "epoch": 0.8778960551033187, "grad_norm": 0.7616267800331116, "learning_rate": 8.536005434782608e-05, "loss": 0.3724, "step": 5608 }, { "epoch": 0.878052598622417, "grad_norm": 0.9691711664199829, "learning_rate": 8.534816576086956e-05, "loss": 0.5112, "step": 5609 }, { "epoch": 0.8782091421415154, "grad_norm": 0.7008672952651978, "learning_rate": 8.533627717391304e-05, "loss": 0.2419, "step": 5610 }, { "epoch": 0.8783656856606137, "grad_norm": 0.801258385181427, "learning_rate": 8.532438858695652e-05, "loss": 0.3265, "step": 5611 }, { "epoch": 0.878522229179712, "grad_norm": 0.8847915530204773, "learning_rate": 8.53125e-05, "loss": 0.2959, "step": 5612 }, { "epoch": 0.8786787726988102, "grad_norm": 
1.0883069038391113, "learning_rate": 8.530061141304347e-05, "loss": 0.3826, "step": 5613 }, { "epoch": 0.8788353162179086, "grad_norm": 0.7413970232009888, "learning_rate": 8.528872282608695e-05, "loss": 0.2595, "step": 5614 }, { "epoch": 0.8789918597370069, "grad_norm": 1.1260814666748047, "learning_rate": 8.527683423913043e-05, "loss": 0.3538, "step": 5615 }, { "epoch": 0.8791484032561052, "grad_norm": 2.7166354656219482, "learning_rate": 8.52649456521739e-05, "loss": 0.4093, "step": 5616 }, { "epoch": 0.8793049467752035, "grad_norm": 1.742403268814087, "learning_rate": 8.525305706521738e-05, "loss": 0.6475, "step": 5617 }, { "epoch": 0.8794614902943018, "grad_norm": 3.3085103034973145, "learning_rate": 8.524116847826086e-05, "loss": 0.5905, "step": 5618 }, { "epoch": 0.8796180338134001, "grad_norm": 1.1979215145111084, "learning_rate": 8.522927989130434e-05, "loss": 0.5336, "step": 5619 }, { "epoch": 0.8797745773324984, "grad_norm": 1.5335686206817627, "learning_rate": 8.521739130434782e-05, "loss": 0.4007, "step": 5620 }, { "epoch": 0.8799311208515967, "grad_norm": 0.8835433125495911, "learning_rate": 8.52055027173913e-05, "loss": 0.349, "step": 5621 }, { "epoch": 0.8800876643706951, "grad_norm": 1.429181694984436, "learning_rate": 8.519361413043479e-05, "loss": 0.2704, "step": 5622 }, { "epoch": 0.8802442078897934, "grad_norm": 1.2020723819732666, "learning_rate": 8.518172554347827e-05, "loss": 0.4578, "step": 5623 }, { "epoch": 0.8804007514088916, "grad_norm": 2.103182077407837, "learning_rate": 8.516983695652172e-05, "loss": 0.7579, "step": 5624 }, { "epoch": 0.88055729492799, "grad_norm": 1.5911911725997925, "learning_rate": 8.515794836956521e-05, "loss": 0.5613, "step": 5625 }, { "epoch": 0.8807138384470883, "grad_norm": 2.6920995712280273, "learning_rate": 8.514605978260869e-05, "loss": 0.6981, "step": 5626 }, { "epoch": 0.8808703819661866, "grad_norm": 1.2458611726760864, "learning_rate": 8.513417119565217e-05, "loss": 0.5185, "step": 5627 }, { "epoch": 
0.8810269254852849, "grad_norm": 3.2910423278808594, "learning_rate": 8.512228260869564e-05, "loss": 0.889, "step": 5628 }, { "epoch": 0.8811834690043833, "grad_norm": 3.5602588653564453, "learning_rate": 8.511039402173912e-05, "loss": 0.4587, "step": 5629 }, { "epoch": 0.8813400125234815, "grad_norm": 3.0259573459625244, "learning_rate": 8.50985054347826e-05, "loss": 0.6304, "step": 5630 }, { "epoch": 0.8814965560425798, "grad_norm": 3.5086677074432373, "learning_rate": 8.508661684782608e-05, "loss": 0.6472, "step": 5631 }, { "epoch": 0.8816530995616781, "grad_norm": 2.7139394283294678, "learning_rate": 8.507472826086956e-05, "loss": 0.8811, "step": 5632 }, { "epoch": 0.8818096430807765, "grad_norm": 2.547722101211548, "learning_rate": 8.506283967391303e-05, "loss": 0.9349, "step": 5633 }, { "epoch": 0.8819661865998748, "grad_norm": 4.535624980926514, "learning_rate": 8.505095108695651e-05, "loss": 1.0334, "step": 5634 }, { "epoch": 0.882122730118973, "grad_norm": 4.26133394241333, "learning_rate": 8.503906249999999e-05, "loss": 0.6099, "step": 5635 }, { "epoch": 0.8822792736380713, "grad_norm": 2.64609694480896, "learning_rate": 8.502717391304347e-05, "loss": 0.8639, "step": 5636 }, { "epoch": 0.8824358171571697, "grad_norm": 2.5036866664886475, "learning_rate": 8.501528532608695e-05, "loss": 1.0999, "step": 5637 }, { "epoch": 0.882592360676268, "grad_norm": 5.7538676261901855, "learning_rate": 8.500339673913043e-05, "loss": 1.4173, "step": 5638 }, { "epoch": 0.8827489041953663, "grad_norm": 1.5510731935501099, "learning_rate": 8.49915081521739e-05, "loss": 0.8433, "step": 5639 }, { "epoch": 0.8829054477144647, "grad_norm": 1.3866808414459229, "learning_rate": 8.497961956521738e-05, "loss": 0.7673, "step": 5640 }, { "epoch": 0.8830619912335629, "grad_norm": 4.0734381675720215, "learning_rate": 8.496773097826086e-05, "loss": 1.7006, "step": 5641 }, { "epoch": 0.8832185347526612, "grad_norm": 2.119198799133301, "learning_rate": 8.495584239130435e-05, "loss": 
1.3527, "step": 5642 }, { "epoch": 0.8833750782717595, "grad_norm": 2.153388500213623, "learning_rate": 8.494395380434783e-05, "loss": 1.3711, "step": 5643 }, { "epoch": 0.8835316217908579, "grad_norm": 4.6524763107299805, "learning_rate": 8.493206521739131e-05, "loss": 1.3099, "step": 5644 }, { "epoch": 0.8836881653099562, "grad_norm": 5.488666534423828, "learning_rate": 8.492017663043479e-05, "loss": 1.8353, "step": 5645 }, { "epoch": 0.8838447088290545, "grad_norm": 1.9798827171325684, "learning_rate": 8.490828804347826e-05, "loss": 0.5329, "step": 5646 }, { "epoch": 0.8840012523481527, "grad_norm": 2.2661190032958984, "learning_rate": 8.489639945652173e-05, "loss": 0.8219, "step": 5647 }, { "epoch": 0.8841577958672511, "grad_norm": 1.3685269355773926, "learning_rate": 8.488451086956521e-05, "loss": 0.3239, "step": 5648 }, { "epoch": 0.8843143393863494, "grad_norm": 2.0860238075256348, "learning_rate": 8.487262228260869e-05, "loss": 0.7514, "step": 5649 }, { "epoch": 0.8844708829054477, "grad_norm": 2.8044235706329346, "learning_rate": 8.486073369565216e-05, "loss": 0.9532, "step": 5650 }, { "epoch": 0.8846274264245461, "grad_norm": 0.6034975647926331, "learning_rate": 8.484884510869564e-05, "loss": 0.2107, "step": 5651 }, { "epoch": 0.8847839699436444, "grad_norm": 0.3424580991268158, "learning_rate": 8.483695652173912e-05, "loss": 0.1748, "step": 5652 }, { "epoch": 0.8849405134627426, "grad_norm": 0.521429181098938, "learning_rate": 8.48250679347826e-05, "loss": 0.2397, "step": 5653 }, { "epoch": 0.8850970569818409, "grad_norm": 1.8413515090942383, "learning_rate": 8.481317934782608e-05, "loss": 0.2835, "step": 5654 }, { "epoch": 0.8852536005009393, "grad_norm": 0.6744848489761353, "learning_rate": 8.480129076086955e-05, "loss": 0.4257, "step": 5655 }, { "epoch": 0.8854101440200376, "grad_norm": 1.022513508796692, "learning_rate": 8.478940217391303e-05, "loss": 0.4251, "step": 5656 }, { "epoch": 0.8855666875391359, "grad_norm": 0.8862607479095459, 
"learning_rate": 8.477751358695651e-05, "loss": 0.2907, "step": 5657 }, { "epoch": 0.8857232310582341, "grad_norm": 1.0817153453826904, "learning_rate": 8.476562499999999e-05, "loss": 0.2174, "step": 5658 }, { "epoch": 0.8858797745773325, "grad_norm": 1.22175931930542, "learning_rate": 8.475373641304347e-05, "loss": 0.3966, "step": 5659 }, { "epoch": 0.8860363180964308, "grad_norm": 0.7390747666358948, "learning_rate": 8.474184782608694e-05, "loss": 0.1975, "step": 5660 }, { "epoch": 0.8861928616155291, "grad_norm": 2.3600504398345947, "learning_rate": 8.472995923913042e-05, "loss": 0.4328, "step": 5661 }, { "epoch": 0.8863494051346275, "grad_norm": 1.2243072986602783, "learning_rate": 8.471807065217391e-05, "loss": 0.3096, "step": 5662 }, { "epoch": 0.8865059486537258, "grad_norm": 0.9596685171127319, "learning_rate": 8.470618206521739e-05, "loss": 0.4035, "step": 5663 }, { "epoch": 0.886662492172824, "grad_norm": 1.0262401103973389, "learning_rate": 8.469429347826087e-05, "loss": 0.4734, "step": 5664 }, { "epoch": 0.8868190356919223, "grad_norm": 1.785001277923584, "learning_rate": 8.468240489130435e-05, "loss": 0.4238, "step": 5665 }, { "epoch": 0.8869755792110207, "grad_norm": 1.1328532695770264, "learning_rate": 8.467051630434783e-05, "loss": 0.4399, "step": 5666 }, { "epoch": 0.887132122730119, "grad_norm": 1.86044442653656, "learning_rate": 8.46586277173913e-05, "loss": 0.4694, "step": 5667 }, { "epoch": 0.8872886662492173, "grad_norm": 1.6520259380340576, "learning_rate": 8.464673913043478e-05, "loss": 0.5489, "step": 5668 }, { "epoch": 0.8874452097683156, "grad_norm": 1.0653953552246094, "learning_rate": 8.463485054347826e-05, "loss": 0.3641, "step": 5669 }, { "epoch": 0.8876017532874139, "grad_norm": 1.4679546356201172, "learning_rate": 8.462296195652173e-05, "loss": 0.5266, "step": 5670 }, { "epoch": 0.8877582968065122, "grad_norm": 2.0548572540283203, "learning_rate": 8.46110733695652e-05, "loss": 0.4598, "step": 5671 }, { "epoch": 0.8879148403256105, 
"grad_norm": 3.088702440261841, "learning_rate": 8.459918478260868e-05, "loss": 0.8793, "step": 5672 }, { "epoch": 0.8880713838447089, "grad_norm": 4.059612274169922, "learning_rate": 8.458729619565216e-05, "loss": 1.0715, "step": 5673 }, { "epoch": 0.8882279273638072, "grad_norm": 2.0208981037139893, "learning_rate": 8.457540760869564e-05, "loss": 0.6248, "step": 5674 }, { "epoch": 0.8883844708829054, "grad_norm": 2.1756882667541504, "learning_rate": 8.456351902173912e-05, "loss": 0.8878, "step": 5675 }, { "epoch": 0.8885410144020037, "grad_norm": 1.4827241897583008, "learning_rate": 8.45516304347826e-05, "loss": 0.5498, "step": 5676 }, { "epoch": 0.888697557921102, "grad_norm": 2.451460123062134, "learning_rate": 8.453974184782607e-05, "loss": 0.7308, "step": 5677 }, { "epoch": 0.8888541014402004, "grad_norm": 2.7435262203216553, "learning_rate": 8.452785326086955e-05, "loss": 0.5301, "step": 5678 }, { "epoch": 0.8890106449592987, "grad_norm": 4.374394416809082, "learning_rate": 8.451596467391303e-05, "loss": 1.057, "step": 5679 }, { "epoch": 0.889167188478397, "grad_norm": 2.228970766067505, "learning_rate": 8.450407608695651e-05, "loss": 0.468, "step": 5680 }, { "epoch": 0.8893237319974953, "grad_norm": 2.0377864837646484, "learning_rate": 8.449218749999999e-05, "loss": 0.539, "step": 5681 }, { "epoch": 0.8894802755165936, "grad_norm": 2.1815099716186523, "learning_rate": 8.448029891304348e-05, "loss": 1.0967, "step": 5682 }, { "epoch": 0.8896368190356919, "grad_norm": 3.5851120948791504, "learning_rate": 8.446841032608696e-05, "loss": 0.8065, "step": 5683 }, { "epoch": 0.8897933625547902, "grad_norm": 2.3149988651275635, "learning_rate": 8.445652173913043e-05, "loss": 0.763, "step": 5684 }, { "epoch": 0.8899499060738886, "grad_norm": 3.3453121185302734, "learning_rate": 8.444463315217391e-05, "loss": 1.2823, "step": 5685 }, { "epoch": 0.8901064495929869, "grad_norm": 1.474342942237854, "learning_rate": 8.443274456521739e-05, "loss": 0.8039, "step": 5686 }, { 
"epoch": 0.8902629931120851, "grad_norm": 2.3227295875549316, "learning_rate": 8.442085597826087e-05, "loss": 1.186, "step": 5687 }, { "epoch": 0.8904195366311835, "grad_norm": 3.6722195148468018, "learning_rate": 8.440896739130435e-05, "loss": 1.4627, "step": 5688 }, { "epoch": 0.8905760801502818, "grad_norm": 3.27451229095459, "learning_rate": 8.439707880434782e-05, "loss": 1.0795, "step": 5689 }, { "epoch": 0.8907326236693801, "grad_norm": 1.8609652519226074, "learning_rate": 8.43851902173913e-05, "loss": 0.9637, "step": 5690 }, { "epoch": 0.8908891671884784, "grad_norm": 4.330729007720947, "learning_rate": 8.437330163043478e-05, "loss": 1.4978, "step": 5691 }, { "epoch": 0.8910457107075767, "grad_norm": 2.0930607318878174, "learning_rate": 8.436141304347826e-05, "loss": 1.2389, "step": 5692 }, { "epoch": 0.891202254226675, "grad_norm": 1.1972079277038574, "learning_rate": 8.434952445652172e-05, "loss": 0.4619, "step": 5693 }, { "epoch": 0.8913587977457733, "grad_norm": 3.5796210765838623, "learning_rate": 8.43376358695652e-05, "loss": 1.096, "step": 5694 }, { "epoch": 0.8915153412648716, "grad_norm": 1.7520577907562256, "learning_rate": 8.432574728260868e-05, "loss": 0.5417, "step": 5695 }, { "epoch": 0.89167188478397, "grad_norm": 1.8694969415664673, "learning_rate": 8.431385869565216e-05, "loss": 0.8409, "step": 5696 }, { "epoch": 0.8918284283030683, "grad_norm": 3.169936418533325, "learning_rate": 8.430197010869564e-05, "loss": 1.0887, "step": 5697 }, { "epoch": 0.8919849718221665, "grad_norm": 2.9689128398895264, "learning_rate": 8.429008152173911e-05, "loss": 0.5177, "step": 5698 }, { "epoch": 0.8921415153412648, "grad_norm": 1.9250215291976929, "learning_rate": 8.427819293478259e-05, "loss": 0.7981, "step": 5699 }, { "epoch": 0.8922980588603632, "grad_norm": 2.543565273284912, "learning_rate": 8.426630434782607e-05, "loss": 0.9293, "step": 5700 }, { "epoch": 0.8924546023794615, "grad_norm": 0.5426438450813293, "learning_rate": 8.425441576086955e-05, 
"loss": 0.2444, "step": 5701 }, { "epoch": 0.8926111458985598, "grad_norm": 1.1581140756607056, "learning_rate": 8.424252717391304e-05, "loss": 0.38, "step": 5702 }, { "epoch": 0.8927676894176582, "grad_norm": 0.5217311978340149, "learning_rate": 8.423063858695652e-05, "loss": 0.1863, "step": 5703 }, { "epoch": 0.8929242329367564, "grad_norm": 0.6980552077293396, "learning_rate": 8.421875e-05, "loss": 0.3642, "step": 5704 }, { "epoch": 0.8930807764558547, "grad_norm": 0.7826350927352905, "learning_rate": 8.420686141304348e-05, "loss": 0.2905, "step": 5705 }, { "epoch": 0.893237319974953, "grad_norm": 0.5508867502212524, "learning_rate": 8.419497282608695e-05, "loss": 0.2124, "step": 5706 }, { "epoch": 0.8933938634940514, "grad_norm": 0.8048489689826965, "learning_rate": 8.418308423913043e-05, "loss": 0.3459, "step": 5707 }, { "epoch": 0.8935504070131497, "grad_norm": 0.875093936920166, "learning_rate": 8.417119565217391e-05, "loss": 0.3368, "step": 5708 }, { "epoch": 0.8937069505322479, "grad_norm": 3.804374933242798, "learning_rate": 8.415930706521739e-05, "loss": 0.4983, "step": 5709 }, { "epoch": 0.8938634940513462, "grad_norm": 1.4631229639053345, "learning_rate": 8.414741847826087e-05, "loss": 0.388, "step": 5710 }, { "epoch": 0.8940200375704446, "grad_norm": 1.3285586833953857, "learning_rate": 8.413552989130434e-05, "loss": 0.5216, "step": 5711 }, { "epoch": 0.8941765810895429, "grad_norm": 0.7994397878646851, "learning_rate": 8.412364130434782e-05, "loss": 0.367, "step": 5712 }, { "epoch": 0.8943331246086412, "grad_norm": 1.7152365446090698, "learning_rate": 8.41117527173913e-05, "loss": 0.345, "step": 5713 }, { "epoch": 0.8944896681277396, "grad_norm": 1.2902787923812866, "learning_rate": 8.409986413043478e-05, "loss": 0.4038, "step": 5714 }, { "epoch": 0.8946462116468378, "grad_norm": 2.0512630939483643, "learning_rate": 8.408797554347826e-05, "loss": 0.5299, "step": 5715 }, { "epoch": 0.8948027551659361, "grad_norm": 0.9397214651107788, "learning_rate": 
8.407608695652172e-05, "loss": 0.319, "step": 5716 }, { "epoch": 0.8949592986850344, "grad_norm": 1.611824631690979, "learning_rate": 8.40641983695652e-05, "loss": 0.5318, "step": 5717 }, { "epoch": 0.8951158422041328, "grad_norm": 4.094127178192139, "learning_rate": 8.405230978260868e-05, "loss": 0.9, "step": 5718 }, { "epoch": 0.8952723857232311, "grad_norm": 3.037473440170288, "learning_rate": 8.404042119565216e-05, "loss": 0.6309, "step": 5719 }, { "epoch": 0.8954289292423294, "grad_norm": 1.550382137298584, "learning_rate": 8.402853260869563e-05, "loss": 0.4367, "step": 5720 }, { "epoch": 0.8955854727614276, "grad_norm": 4.417881965637207, "learning_rate": 8.401664402173911e-05, "loss": 0.7959, "step": 5721 }, { "epoch": 0.895742016280526, "grad_norm": 1.5170397758483887, "learning_rate": 8.40047554347826e-05, "loss": 0.4889, "step": 5722 }, { "epoch": 0.8958985597996243, "grad_norm": 1.148320198059082, "learning_rate": 8.399286684782608e-05, "loss": 0.4291, "step": 5723 }, { "epoch": 0.8960551033187226, "grad_norm": 1.7855775356292725, "learning_rate": 8.398097826086956e-05, "loss": 0.9209, "step": 5724 }, { "epoch": 0.896211646837821, "grad_norm": 1.6997770071029663, "learning_rate": 8.396908967391304e-05, "loss": 0.53, "step": 5725 }, { "epoch": 0.8963681903569192, "grad_norm": 2.206437110900879, "learning_rate": 8.395720108695652e-05, "loss": 0.3743, "step": 5726 }, { "epoch": 0.8965247338760175, "grad_norm": 2.4575119018554688, "learning_rate": 8.39453125e-05, "loss": 0.7673, "step": 5727 }, { "epoch": 0.8966812773951158, "grad_norm": 1.85260808467865, "learning_rate": 8.393342391304347e-05, "loss": 0.4877, "step": 5728 }, { "epoch": 0.8968378209142142, "grad_norm": 2.2449772357940674, "learning_rate": 8.392153532608695e-05, "loss": 0.97, "step": 5729 }, { "epoch": 0.8969943644333125, "grad_norm": 2.5626494884490967, "learning_rate": 8.390964673913043e-05, "loss": 0.8067, "step": 5730 }, { "epoch": 0.8971509079524108, "grad_norm": 1.8255183696746826, 
"learning_rate": 8.389775815217391e-05, "loss": 0.8116, "step": 5731 }, { "epoch": 0.897307451471509, "grad_norm": 4.334131240844727, "learning_rate": 8.388586956521739e-05, "loss": 0.7212, "step": 5732 }, { "epoch": 0.8974639949906074, "grad_norm": 3.620640516281128, "learning_rate": 8.387398097826086e-05, "loss": 0.9832, "step": 5733 }, { "epoch": 0.8976205385097057, "grad_norm": 2.5611448287963867, "learning_rate": 8.386209239130434e-05, "loss": 0.4909, "step": 5734 }, { "epoch": 0.897777082028804, "grad_norm": 2.3620946407318115, "learning_rate": 8.385020380434782e-05, "loss": 0.8697, "step": 5735 }, { "epoch": 0.8979336255479023, "grad_norm": 2.3125922679901123, "learning_rate": 8.38383152173913e-05, "loss": 1.3243, "step": 5736 }, { "epoch": 0.8980901690670007, "grad_norm": 3.9600303173065186, "learning_rate": 8.382642663043478e-05, "loss": 1.095, "step": 5737 }, { "epoch": 0.8982467125860989, "grad_norm": 3.835322141647339, "learning_rate": 8.381453804347825e-05, "loss": 0.7198, "step": 5738 }, { "epoch": 0.8984032561051972, "grad_norm": 2.318319797515869, "learning_rate": 8.380264945652172e-05, "loss": 1.1581, "step": 5739 }, { "epoch": 0.8985597996242956, "grad_norm": 2.004098415374756, "learning_rate": 8.37907608695652e-05, "loss": 1.2485, "step": 5740 }, { "epoch": 0.8987163431433939, "grad_norm": 3.3624937534332275, "learning_rate": 8.377887228260868e-05, "loss": 1.1296, "step": 5741 }, { "epoch": 0.8988728866624922, "grad_norm": 3.133422374725342, "learning_rate": 8.376698369565217e-05, "loss": 1.0418, "step": 5742 }, { "epoch": 0.8990294301815904, "grad_norm": 2.6518824100494385, "learning_rate": 8.375509510869565e-05, "loss": 1.0801, "step": 5743 }, { "epoch": 0.8991859737006888, "grad_norm": 2.907630443572998, "learning_rate": 8.374320652173912e-05, "loss": 1.3666, "step": 5744 }, { "epoch": 0.8993425172197871, "grad_norm": 3.6174309253692627, "learning_rate": 8.37313179347826e-05, "loss": 1.526, "step": 5745 }, { "epoch": 0.8994990607388854, 
"grad_norm": 2.2109310626983643, "learning_rate": 8.371942934782608e-05, "loss": 0.9193, "step": 5746 }, { "epoch": 0.8996556042579837, "grad_norm": 2.3903801441192627, "learning_rate": 8.370754076086956e-05, "loss": 0.3228, "step": 5747 }, { "epoch": 0.8998121477770821, "grad_norm": 2.1873867511749268, "learning_rate": 8.369565217391304e-05, "loss": 0.5332, "step": 5748 }, { "epoch": 0.8999686912961803, "grad_norm": 3.0562222003936768, "learning_rate": 8.368376358695651e-05, "loss": 1.4112, "step": 5749 }, { "epoch": 0.9001252348152786, "grad_norm": 1.9017657041549683, "learning_rate": 8.367187499999999e-05, "loss": 0.7995, "step": 5750 }, { "epoch": 0.900281778334377, "grad_norm": 0.5253403782844543, "learning_rate": 8.365998641304347e-05, "loss": 0.2922, "step": 5751 }, { "epoch": 0.9004383218534753, "grad_norm": 0.42662474513053894, "learning_rate": 8.364809782608695e-05, "loss": 0.1922, "step": 5752 }, { "epoch": 0.9005948653725736, "grad_norm": 0.3838902711868286, "learning_rate": 8.363620923913043e-05, "loss": 0.1948, "step": 5753 }, { "epoch": 0.9007514088916719, "grad_norm": 0.8775641918182373, "learning_rate": 8.36243206521739e-05, "loss": 0.2884, "step": 5754 }, { "epoch": 0.9009079524107702, "grad_norm": 0.6916599869728088, "learning_rate": 8.361243206521738e-05, "loss": 0.2681, "step": 5755 }, { "epoch": 0.9010644959298685, "grad_norm": 0.6020183563232422, "learning_rate": 8.360054347826086e-05, "loss": 0.3173, "step": 5756 }, { "epoch": 0.9012210394489668, "grad_norm": 0.9643011093139648, "learning_rate": 8.358865489130434e-05, "loss": 0.3904, "step": 5757 }, { "epoch": 0.9013775829680651, "grad_norm": 1.3834688663482666, "learning_rate": 8.357676630434782e-05, "loss": 0.3707, "step": 5758 }, { "epoch": 0.9015341264871635, "grad_norm": 0.6364767551422119, "learning_rate": 8.356487771739131e-05, "loss": 0.357, "step": 5759 }, { "epoch": 0.9016906700062617, "grad_norm": 0.5862623453140259, "learning_rate": 8.355298913043479e-05, "loss": 0.2786, "step": 
5760 }, { "epoch": 0.90184721352536, "grad_norm": 1.0804471969604492, "learning_rate": 8.354110054347827e-05, "loss": 0.4311, "step": 5761 }, { "epoch": 0.9020037570444583, "grad_norm": 0.9951147437095642, "learning_rate": 8.352921195652173e-05, "loss": 0.4384, "step": 5762 }, { "epoch": 0.9021603005635567, "grad_norm": 0.8209784626960754, "learning_rate": 8.351732336956521e-05, "loss": 0.4064, "step": 5763 }, { "epoch": 0.902316844082655, "grad_norm": 1.7704964876174927, "learning_rate": 8.350543478260869e-05, "loss": 0.3926, "step": 5764 }, { "epoch": 0.9024733876017533, "grad_norm": 0.8398094177246094, "learning_rate": 8.349354619565216e-05, "loss": 0.5324, "step": 5765 }, { "epoch": 0.9026299311208515, "grad_norm": 1.3464723825454712, "learning_rate": 8.348165760869564e-05, "loss": 0.7322, "step": 5766 }, { "epoch": 0.9027864746399499, "grad_norm": 1.4759631156921387, "learning_rate": 8.346976902173912e-05, "loss": 0.5365, "step": 5767 }, { "epoch": 0.9029430181590482, "grad_norm": 1.7144701480865479, "learning_rate": 8.34578804347826e-05, "loss": 0.4711, "step": 5768 }, { "epoch": 0.9030995616781465, "grad_norm": 1.7815099954605103, "learning_rate": 8.344599184782608e-05, "loss": 0.5391, "step": 5769 }, { "epoch": 0.9032561051972449, "grad_norm": 1.96354079246521, "learning_rate": 8.343410326086956e-05, "loss": 0.9849, "step": 5770 }, { "epoch": 0.9034126487163432, "grad_norm": 3.4149434566497803, "learning_rate": 8.342221467391303e-05, "loss": 0.8311, "step": 5771 }, { "epoch": 0.9035691922354414, "grad_norm": 2.112342119216919, "learning_rate": 8.341032608695651e-05, "loss": 0.5737, "step": 5772 }, { "epoch": 0.9037257357545397, "grad_norm": 0.7358482480049133, "learning_rate": 8.339843749999999e-05, "loss": 0.4897, "step": 5773 }, { "epoch": 0.9038822792736381, "grad_norm": 1.3970773220062256, "learning_rate": 8.338654891304347e-05, "loss": 0.4507, "step": 5774 }, { "epoch": 0.9040388227927364, "grad_norm": 2.3489909172058105, "learning_rate": 
8.337466032608695e-05, "loss": 0.6751, "step": 5775 }, { "epoch": 0.9041953663118347, "grad_norm": 1.259873390197754, "learning_rate": 8.336277173913042e-05, "loss": 0.7336, "step": 5776 }, { "epoch": 0.904351909830933, "grad_norm": 1.4757461547851562, "learning_rate": 8.33508831521739e-05, "loss": 0.9814, "step": 5777 }, { "epoch": 0.9045084533500313, "grad_norm": 3.185161590576172, "learning_rate": 8.333899456521738e-05, "loss": 0.8403, "step": 5778 }, { "epoch": 0.9046649968691296, "grad_norm": 2.439042329788208, "learning_rate": 8.332710597826087e-05, "loss": 0.4809, "step": 5779 }, { "epoch": 0.9048215403882279, "grad_norm": 1.503978967666626, "learning_rate": 8.331521739130435e-05, "loss": 0.8175, "step": 5780 }, { "epoch": 0.9049780839073263, "grad_norm": 3.33864426612854, "learning_rate": 8.330332880434783e-05, "loss": 0.9187, "step": 5781 }, { "epoch": 0.9051346274264246, "grad_norm": 1.9215167760849, "learning_rate": 8.329144021739131e-05, "loss": 0.8159, "step": 5782 }, { "epoch": 0.9052911709455228, "grad_norm": 2.3878414630889893, "learning_rate": 8.327955163043479e-05, "loss": 0.931, "step": 5783 }, { "epoch": 0.9054477144646211, "grad_norm": 2.427306652069092, "learning_rate": 8.326766304347826e-05, "loss": 0.7934, "step": 5784 }, { "epoch": 0.9056042579837195, "grad_norm": 2.378511428833008, "learning_rate": 8.325577445652173e-05, "loss": 0.7177, "step": 5785 }, { "epoch": 0.9057608015028178, "grad_norm": 2.2480249404907227, "learning_rate": 8.32438858695652e-05, "loss": 1.0275, "step": 5786 }, { "epoch": 0.9059173450219161, "grad_norm": 3.304759979248047, "learning_rate": 8.323199728260868e-05, "loss": 1.2679, "step": 5787 }, { "epoch": 0.9060738885410144, "grad_norm": 2.9758148193359375, "learning_rate": 8.322010869565216e-05, "loss": 1.4043, "step": 5788 }, { "epoch": 0.9062304320601127, "grad_norm": 1.895653486251831, "learning_rate": 8.320822010869564e-05, "loss": 0.8938, "step": 5789 }, { "epoch": 0.906386975579211, "grad_norm": 
3.5650174617767334, "learning_rate": 8.319633152173912e-05, "loss": 1.0375, "step": 5790 }, { "epoch": 0.9065435190983093, "grad_norm": 2.8632473945617676, "learning_rate": 8.31844429347826e-05, "loss": 0.8877, "step": 5791 }, { "epoch": 0.9067000626174077, "grad_norm": 3.5991108417510986, "learning_rate": 8.317255434782608e-05, "loss": 0.8919, "step": 5792 }, { "epoch": 0.906856606136506, "grad_norm": 4.891104221343994, "learning_rate": 8.316066576086955e-05, "loss": 1.4094, "step": 5793 }, { "epoch": 0.9070131496556043, "grad_norm": 2.686347007751465, "learning_rate": 8.314877717391303e-05, "loss": 0.8811, "step": 5794 }, { "epoch": 0.9071696931747025, "grad_norm": 2.8857901096343994, "learning_rate": 8.313688858695651e-05, "loss": 0.9802, "step": 5795 }, { "epoch": 0.9073262366938009, "grad_norm": 2.82417893409729, "learning_rate": 8.312499999999999e-05, "loss": 0.7031, "step": 5796 }, { "epoch": 0.9074827802128992, "grad_norm": 1.8049668073654175, "learning_rate": 8.311311141304347e-05, "loss": 0.4457, "step": 5797 }, { "epoch": 0.9076393237319975, "grad_norm": 2.443573474884033, "learning_rate": 8.310122282608694e-05, "loss": 0.9097, "step": 5798 }, { "epoch": 0.9077958672510958, "grad_norm": 3.2308688163757324, "learning_rate": 8.308933423913044e-05, "loss": 0.7713, "step": 5799 }, { "epoch": 0.9079524107701941, "grad_norm": 16.045928955078125, "learning_rate": 8.307744565217391e-05, "loss": 1.5573, "step": 5800 }, { "epoch": 0.9081089542892924, "grad_norm": 1.0607836246490479, "learning_rate": 8.306555706521739e-05, "loss": 0.3787, "step": 5801 }, { "epoch": 0.9082654978083907, "grad_norm": 1.1177259683609009, "learning_rate": 8.305366847826087e-05, "loss": 0.2369, "step": 5802 }, { "epoch": 0.908422041327489, "grad_norm": 0.6010622382164001, "learning_rate": 8.304177989130435e-05, "loss": 0.2284, "step": 5803 }, { "epoch": 0.9085785848465874, "grad_norm": 0.7978283762931824, "learning_rate": 8.302989130434783e-05, "loss": 0.356, "step": 5804 }, { "epoch": 
0.9087351283656857, "grad_norm": 0.6510826945304871, "learning_rate": 8.30180027173913e-05, "loss": 0.2847, "step": 5805 }, { "epoch": 0.9088916718847839, "grad_norm": 1.1136950254440308, "learning_rate": 8.300611413043478e-05, "loss": 0.4214, "step": 5806 }, { "epoch": 0.9090482154038823, "grad_norm": 0.9748147130012512, "learning_rate": 8.299422554347826e-05, "loss": 0.3518, "step": 5807 }, { "epoch": 0.9092047589229806, "grad_norm": 1.1029340028762817, "learning_rate": 8.298233695652173e-05, "loss": 0.3048, "step": 5808 }, { "epoch": 0.9093613024420789, "grad_norm": 0.9841145277023315, "learning_rate": 8.29704483695652e-05, "loss": 0.3834, "step": 5809 }, { "epoch": 0.9095178459611772, "grad_norm": 1.3478530645370483, "learning_rate": 8.295855978260868e-05, "loss": 0.2094, "step": 5810 }, { "epoch": 0.9096743894802756, "grad_norm": 0.9360737800598145, "learning_rate": 8.294667119565216e-05, "loss": 0.3836, "step": 5811 }, { "epoch": 0.9098309329993738, "grad_norm": 0.7247090935707092, "learning_rate": 8.293478260869564e-05, "loss": 0.2409, "step": 5812 }, { "epoch": 0.9099874765184721, "grad_norm": 0.9642817974090576, "learning_rate": 8.292289402173912e-05, "loss": 0.4463, "step": 5813 }, { "epoch": 0.9101440200375704, "grad_norm": 1.258475422859192, "learning_rate": 8.29110054347826e-05, "loss": 0.4625, "step": 5814 }, { "epoch": 0.9103005635566688, "grad_norm": 1.008986234664917, "learning_rate": 8.289911684782607e-05, "loss": 0.5382, "step": 5815 }, { "epoch": 0.9104571070757671, "grad_norm": 1.7694896459579468, "learning_rate": 8.288722826086955e-05, "loss": 0.6273, "step": 5816 }, { "epoch": 0.9106136505948653, "grad_norm": 1.2385503053665161, "learning_rate": 8.287533967391303e-05, "loss": 0.3004, "step": 5817 }, { "epoch": 0.9107701941139636, "grad_norm": 2.5662918090820312, "learning_rate": 8.286345108695651e-05, "loss": 0.5291, "step": 5818 }, { "epoch": 0.910926737633062, "grad_norm": 1.3427395820617676, "learning_rate": 8.28515625e-05, "loss": 0.5343, 
"step": 5819 }, { "epoch": 0.9110832811521603, "grad_norm": 2.6030118465423584, "learning_rate": 8.283967391304348e-05, "loss": 0.8583, "step": 5820 }, { "epoch": 0.9112398246712586, "grad_norm": 1.1839137077331543, "learning_rate": 8.282778532608696e-05, "loss": 0.6096, "step": 5821 }, { "epoch": 0.911396368190357, "grad_norm": 1.3953943252563477, "learning_rate": 8.281589673913043e-05, "loss": 0.5183, "step": 5822 }, { "epoch": 0.9115529117094552, "grad_norm": 1.3026783466339111, "learning_rate": 8.280400815217391e-05, "loss": 0.4881, "step": 5823 }, { "epoch": 0.9117094552285535, "grad_norm": 2.58965802192688, "learning_rate": 8.279211956521739e-05, "loss": 0.8549, "step": 5824 }, { "epoch": 0.9118659987476518, "grad_norm": 1.4085767269134521, "learning_rate": 8.278023097826087e-05, "loss": 0.3575, "step": 5825 }, { "epoch": 0.9120225422667502, "grad_norm": 2.274871826171875, "learning_rate": 8.276834239130435e-05, "loss": 0.5369, "step": 5826 }, { "epoch": 0.9121790857858485, "grad_norm": 1.7993477582931519, "learning_rate": 8.275645380434782e-05, "loss": 0.9583, "step": 5827 }, { "epoch": 0.9123356293049468, "grad_norm": 1.8564696311950684, "learning_rate": 8.27445652173913e-05, "loss": 0.6559, "step": 5828 }, { "epoch": 0.912492172824045, "grad_norm": 1.6150579452514648, "learning_rate": 8.273267663043478e-05, "loss": 0.5643, "step": 5829 }, { "epoch": 0.9126487163431434, "grad_norm": 2.4778690338134766, "learning_rate": 8.272078804347826e-05, "loss": 0.8549, "step": 5830 }, { "epoch": 0.9128052598622417, "grad_norm": 2.3300933837890625, "learning_rate": 8.270889945652172e-05, "loss": 0.7568, "step": 5831 }, { "epoch": 0.91296180338134, "grad_norm": 2.0047013759613037, "learning_rate": 8.26970108695652e-05, "loss": 0.8197, "step": 5832 }, { "epoch": 0.9131183469004384, "grad_norm": 2.2043344974517822, "learning_rate": 8.268512228260868e-05, "loss": 0.663, "step": 5833 }, { "epoch": 0.9132748904195366, "grad_norm": 2.255497455596924, "learning_rate": 
8.267323369565216e-05, "loss": 0.7705, "step": 5834 }, { "epoch": 0.9134314339386349, "grad_norm": 2.2725846767425537, "learning_rate": 8.266134510869564e-05, "loss": 0.7244, "step": 5835 }, { "epoch": 0.9135879774577332, "grad_norm": 3.871090888977051, "learning_rate": 8.264945652173911e-05, "loss": 1.4599, "step": 5836 }, { "epoch": 0.9137445209768316, "grad_norm": 6.261083126068115, "learning_rate": 8.263756793478259e-05, "loss": 1.1166, "step": 5837 }, { "epoch": 0.9139010644959299, "grad_norm": 3.1226882934570312, "learning_rate": 8.262567934782608e-05, "loss": 1.025, "step": 5838 }, { "epoch": 0.9140576080150282, "grad_norm": 2.993051528930664, "learning_rate": 8.261379076086956e-05, "loss": 0.9042, "step": 5839 }, { "epoch": 0.9142141515341264, "grad_norm": 2.091888427734375, "learning_rate": 8.260190217391304e-05, "loss": 0.9913, "step": 5840 }, { "epoch": 0.9143706950532248, "grad_norm": 1.9208333492279053, "learning_rate": 8.259001358695652e-05, "loss": 1.1423, "step": 5841 }, { "epoch": 0.9145272385723231, "grad_norm": 1.9901241064071655, "learning_rate": 8.2578125e-05, "loss": 0.7965, "step": 5842 }, { "epoch": 0.9146837820914214, "grad_norm": 1.7320561408996582, "learning_rate": 8.256623641304347e-05, "loss": 1.4497, "step": 5843 }, { "epoch": 0.9148403256105198, "grad_norm": 4.089570045471191, "learning_rate": 8.255434782608695e-05, "loss": 1.7108, "step": 5844 }, { "epoch": 0.9149968691296181, "grad_norm": 2.705132246017456, "learning_rate": 8.254245923913043e-05, "loss": 1.4399, "step": 5845 }, { "epoch": 0.9151534126487163, "grad_norm": 2.760812759399414, "learning_rate": 8.253057065217391e-05, "loss": 1.0744, "step": 5846 }, { "epoch": 0.9153099561678146, "grad_norm": 3.336362361907959, "learning_rate": 8.251868206521739e-05, "loss": 0.7682, "step": 5847 }, { "epoch": 0.915466499686913, "grad_norm": 3.5283286571502686, "learning_rate": 8.250679347826087e-05, "loss": 0.8669, "step": 5848 }, { "epoch": 0.9156230432060113, "grad_norm": 
4.878379821777344, "learning_rate": 8.249490489130434e-05, "loss": 0.8134, "step": 5849 }, { "epoch": 0.9157795867251096, "grad_norm": 1.7339633703231812, "learning_rate": 8.248301630434782e-05, "loss": 0.6603, "step": 5850 }, { "epoch": 0.9159361302442078, "grad_norm": 0.45425450801849365, "learning_rate": 8.24711277173913e-05, "loss": 0.3288, "step": 5851 }, { "epoch": 0.9160926737633062, "grad_norm": 0.5437692999839783, "learning_rate": 8.245923913043478e-05, "loss": 0.3053, "step": 5852 }, { "epoch": 0.9162492172824045, "grad_norm": 0.8942420482635498, "learning_rate": 8.244735054347826e-05, "loss": 0.4409, "step": 5853 }, { "epoch": 0.9164057608015028, "grad_norm": 0.7713162302970886, "learning_rate": 8.243546195652172e-05, "loss": 0.3682, "step": 5854 }, { "epoch": 0.9165623043206012, "grad_norm": 0.6309749484062195, "learning_rate": 8.24235733695652e-05, "loss": 0.3277, "step": 5855 }, { "epoch": 0.9167188478396995, "grad_norm": 1.0893193483352661, "learning_rate": 8.241168478260868e-05, "loss": 0.3403, "step": 5856 }, { "epoch": 0.9168753913587977, "grad_norm": 0.7750117778778076, "learning_rate": 8.239979619565216e-05, "loss": 0.264, "step": 5857 }, { "epoch": 0.917031934877896, "grad_norm": 1.0323286056518555, "learning_rate": 8.238790760869565e-05, "loss": 0.4127, "step": 5858 }, { "epoch": 0.9171884783969944, "grad_norm": 0.6902104020118713, "learning_rate": 8.237601902173913e-05, "loss": 0.2712, "step": 5859 }, { "epoch": 0.9173450219160927, "grad_norm": 0.8350241184234619, "learning_rate": 8.23641304347826e-05, "loss": 0.2876, "step": 5860 }, { "epoch": 0.917501565435191, "grad_norm": 0.7352054119110107, "learning_rate": 8.235224184782608e-05, "loss": 0.2533, "step": 5861 }, { "epoch": 0.9176581089542893, "grad_norm": 1.4780681133270264, "learning_rate": 8.234035326086956e-05, "loss": 0.5501, "step": 5862 }, { "epoch": 0.9178146524733876, "grad_norm": 0.7402490973472595, "learning_rate": 8.232846467391304e-05, "loss": 0.2628, "step": 5863 }, { 
"epoch": 0.9179711959924859, "grad_norm": 1.6511338949203491, "learning_rate": 8.231657608695652e-05, "loss": 0.4622, "step": 5864 }, { "epoch": 0.9181277395115842, "grad_norm": 1.4911284446716309, "learning_rate": 8.23046875e-05, "loss": 0.4745, "step": 5865 }, { "epoch": 0.9182842830306825, "grad_norm": 0.9446761012077332, "learning_rate": 8.229279891304347e-05, "loss": 0.4071, "step": 5866 }, { "epoch": 0.9184408265497809, "grad_norm": 1.0139440298080444, "learning_rate": 8.228091032608695e-05, "loss": 0.4063, "step": 5867 }, { "epoch": 0.9185973700688791, "grad_norm": 1.959220051765442, "learning_rate": 8.226902173913043e-05, "loss": 0.762, "step": 5868 }, { "epoch": 0.9187539135879774, "grad_norm": 1.4147017002105713, "learning_rate": 8.22571331521739e-05, "loss": 0.4487, "step": 5869 }, { "epoch": 0.9189104571070758, "grad_norm": 1.6871453523635864, "learning_rate": 8.224524456521738e-05, "loss": 0.7856, "step": 5870 }, { "epoch": 0.9190670006261741, "grad_norm": 1.1437699794769287, "learning_rate": 8.223335597826086e-05, "loss": 0.3082, "step": 5871 }, { "epoch": 0.9192235441452724, "grad_norm": 1.727158546447754, "learning_rate": 8.222146739130434e-05, "loss": 0.4218, "step": 5872 }, { "epoch": 0.9193800876643707, "grad_norm": 1.844420075416565, "learning_rate": 8.220957880434782e-05, "loss": 0.8988, "step": 5873 }, { "epoch": 0.919536631183469, "grad_norm": 1.668535590171814, "learning_rate": 8.21976902173913e-05, "loss": 0.4631, "step": 5874 }, { "epoch": 0.9196931747025673, "grad_norm": 2.699343204498291, "learning_rate": 8.218580163043478e-05, "loss": 0.6379, "step": 5875 }, { "epoch": 0.9198497182216656, "grad_norm": 1.6714773178100586, "learning_rate": 8.217391304347827e-05, "loss": 0.4612, "step": 5876 }, { "epoch": 0.9200062617407639, "grad_norm": 2.1292471885681152, "learning_rate": 8.216202445652172e-05, "loss": 0.6026, "step": 5877 }, { "epoch": 0.9201628052598623, "grad_norm": 2.3564045429229736, "learning_rate": 8.215013586956521e-05, "loss": 
0.8407, "step": 5878 }, { "epoch": 0.9203193487789606, "grad_norm": 1.5088531970977783, "learning_rate": 8.213824728260869e-05, "loss": 0.8013, "step": 5879 }, { "epoch": 0.9204758922980588, "grad_norm": 3.421456813812256, "learning_rate": 8.212635869565217e-05, "loss": 1.1305, "step": 5880 }, { "epoch": 0.9206324358171571, "grad_norm": 1.8198307752609253, "learning_rate": 8.211447010869564e-05, "loss": 0.5553, "step": 5881 }, { "epoch": 0.9207889793362555, "grad_norm": 4.2033610343933105, "learning_rate": 8.210258152173912e-05, "loss": 0.6585, "step": 5882 }, { "epoch": 0.9209455228553538, "grad_norm": 3.03883957862854, "learning_rate": 8.20906929347826e-05, "loss": 0.8266, "step": 5883 }, { "epoch": 0.9211020663744521, "grad_norm": 2.4774816036224365, "learning_rate": 8.207880434782608e-05, "loss": 0.9128, "step": 5884 }, { "epoch": 0.9212586098935505, "grad_norm": 2.108961820602417, "learning_rate": 8.206691576086956e-05, "loss": 0.5248, "step": 5885 }, { "epoch": 0.9214151534126487, "grad_norm": 2.7606303691864014, "learning_rate": 8.205502717391304e-05, "loss": 1.0048, "step": 5886 }, { "epoch": 0.921571696931747, "grad_norm": 2.8885838985443115, "learning_rate": 8.204313858695651e-05, "loss": 1.083, "step": 5887 }, { "epoch": 0.9217282404508453, "grad_norm": 2.0821335315704346, "learning_rate": 8.203124999999999e-05, "loss": 1.0017, "step": 5888 }, { "epoch": 0.9218847839699437, "grad_norm": 2.4067907333374023, "learning_rate": 8.201936141304347e-05, "loss": 1.3561, "step": 5889 }, { "epoch": 0.922041327489042, "grad_norm": 2.7139458656311035, "learning_rate": 8.200747282608695e-05, "loss": 1.1997, "step": 5890 }, { "epoch": 0.9221978710081402, "grad_norm": 2.378068447113037, "learning_rate": 8.199558423913043e-05, "loss": 1.1021, "step": 5891 }, { "epoch": 0.9223544145272385, "grad_norm": 4.0304083824157715, "learning_rate": 8.19836956521739e-05, "loss": 0.9723, "step": 5892 }, { "epoch": 0.9225109580463369, "grad_norm": 2.7098708152770996, "learning_rate": 
8.197180706521738e-05, "loss": 1.3461, "step": 5893 }, { "epoch": 0.9226675015654352, "grad_norm": 2.785339832305908, "learning_rate": 8.195991847826086e-05, "loss": 0.9844, "step": 5894 }, { "epoch": 0.9228240450845335, "grad_norm": 2.4239602088928223, "learning_rate": 8.194802989130434e-05, "loss": 0.513, "step": 5895 }, { "epoch": 0.9229805886036319, "grad_norm": 4.945959091186523, "learning_rate": 8.193614130434783e-05, "loss": 0.4088, "step": 5896 }, { "epoch": 0.9231371321227301, "grad_norm": 4.820830345153809, "learning_rate": 8.192425271739131e-05, "loss": 1.1729, "step": 5897 }, { "epoch": 0.9232936756418284, "grad_norm": 2.53597092628479, "learning_rate": 8.191236413043479e-05, "loss": 0.9303, "step": 5898 }, { "epoch": 0.9234502191609267, "grad_norm": 2.810370445251465, "learning_rate": 8.190047554347826e-05, "loss": 0.769, "step": 5899 }, { "epoch": 0.9236067626800251, "grad_norm": 3.9943039417266846, "learning_rate": 8.188858695652173e-05, "loss": 1.2543, "step": 5900 }, { "epoch": 0.9237633061991234, "grad_norm": 0.5419001579284668, "learning_rate": 8.187669836956521e-05, "loss": 0.3252, "step": 5901 }, { "epoch": 0.9239198497182217, "grad_norm": 0.8974428176879883, "learning_rate": 8.186480978260869e-05, "loss": 0.3451, "step": 5902 }, { "epoch": 0.9240763932373199, "grad_norm": 0.471832811832428, "learning_rate": 8.185292119565216e-05, "loss": 0.2122, "step": 5903 }, { "epoch": 0.9242329367564183, "grad_norm": 0.763027548789978, "learning_rate": 8.184103260869564e-05, "loss": 0.3262, "step": 5904 }, { "epoch": 0.9243894802755166, "grad_norm": 1.1657114028930664, "learning_rate": 8.182914402173912e-05, "loss": 0.3131, "step": 5905 }, { "epoch": 0.9245460237946149, "grad_norm": 0.925940215587616, "learning_rate": 8.18172554347826e-05, "loss": 0.3064, "step": 5906 }, { "epoch": 0.9247025673137133, "grad_norm": 0.6849583983421326, "learning_rate": 8.180536684782608e-05, "loss": 0.2181, "step": 5907 }, { "epoch": 0.9248591108328115, "grad_norm": 
4.117188453674316, "learning_rate": 8.179347826086955e-05, "loss": 0.5151, "step": 5908 }, { "epoch": 0.9250156543519098, "grad_norm": 0.9687039256095886, "learning_rate": 8.178158967391303e-05, "loss": 0.3416, "step": 5909 }, { "epoch": 0.9251721978710081, "grad_norm": 0.7160677313804626, "learning_rate": 8.176970108695651e-05, "loss": 0.3624, "step": 5910 }, { "epoch": 0.9253287413901065, "grad_norm": 0.6614377498626709, "learning_rate": 8.175781249999999e-05, "loss": 0.268, "step": 5911 }, { "epoch": 0.9254852849092048, "grad_norm": 2.4342446327209473, "learning_rate": 8.174592391304347e-05, "loss": 0.4479, "step": 5912 }, { "epoch": 0.9256418284283031, "grad_norm": 1.1237590312957764, "learning_rate": 8.173403532608695e-05, "loss": 0.6647, "step": 5913 }, { "epoch": 0.9257983719474013, "grad_norm": 0.9230726957321167, "learning_rate": 8.172214673913042e-05, "loss": 0.283, "step": 5914 }, { "epoch": 0.9259549154664997, "grad_norm": 0.7556199431419373, "learning_rate": 8.171025815217392e-05, "loss": 0.4891, "step": 5915 }, { "epoch": 0.926111458985598, "grad_norm": 1.1495575904846191, "learning_rate": 8.16983695652174e-05, "loss": 0.5712, "step": 5916 }, { "epoch": 0.9262680025046963, "grad_norm": 1.2531867027282715, "learning_rate": 8.168648097826087e-05, "loss": 0.611, "step": 5917 }, { "epoch": 0.9264245460237946, "grad_norm": 1.6143046617507935, "learning_rate": 8.167459239130435e-05, "loss": 0.3343, "step": 5918 }, { "epoch": 0.926581089542893, "grad_norm": 1.960915207862854, "learning_rate": 8.166270380434783e-05, "loss": 0.3166, "step": 5919 }, { "epoch": 0.9267376330619912, "grad_norm": 1.8726801872253418, "learning_rate": 8.16508152173913e-05, "loss": 0.5364, "step": 5920 }, { "epoch": 0.9268941765810895, "grad_norm": 1.1904747486114502, "learning_rate": 8.163892663043478e-05, "loss": 0.4274, "step": 5921 }, { "epoch": 0.9270507201001879, "grad_norm": 1.5239918231964111, "learning_rate": 8.162703804347826e-05, "loss": 0.4574, "step": 5922 }, { "epoch": 
0.9272072636192862, "grad_norm": 1.4943199157714844, "learning_rate": 8.161514945652173e-05, "loss": 0.571, "step": 5923 }, { "epoch": 0.9273638071383845, "grad_norm": 1.160144567489624, "learning_rate": 8.16032608695652e-05, "loss": 0.4624, "step": 5924 }, { "epoch": 0.9275203506574827, "grad_norm": 1.1706626415252686, "learning_rate": 8.159137228260868e-05, "loss": 0.416, "step": 5925 }, { "epoch": 0.9276768941765811, "grad_norm": 1.837422490119934, "learning_rate": 8.157948369565216e-05, "loss": 0.6287, "step": 5926 }, { "epoch": 0.9278334376956794, "grad_norm": 2.4920859336853027, "learning_rate": 8.156759510869564e-05, "loss": 0.7323, "step": 5927 }, { "epoch": 0.9279899812147777, "grad_norm": 1.9277658462524414, "learning_rate": 8.155570652173912e-05, "loss": 0.6264, "step": 5928 }, { "epoch": 0.928146524733876, "grad_norm": 2.472630262374878, "learning_rate": 8.15438179347826e-05, "loss": 0.5776, "step": 5929 }, { "epoch": 0.9283030682529744, "grad_norm": 1.3773844242095947, "learning_rate": 8.153192934782607e-05, "loss": 0.3457, "step": 5930 }, { "epoch": 0.9284596117720726, "grad_norm": 2.6893858909606934, "learning_rate": 8.152004076086955e-05, "loss": 0.5603, "step": 5931 }, { "epoch": 0.9286161552911709, "grad_norm": 1.59843111038208, "learning_rate": 8.150815217391303e-05, "loss": 0.736, "step": 5932 }, { "epoch": 0.9287726988102692, "grad_norm": 1.5192131996154785, "learning_rate": 8.149626358695651e-05, "loss": 0.9149, "step": 5933 }, { "epoch": 0.9289292423293676, "grad_norm": 2.41623854637146, "learning_rate": 8.148437499999999e-05, "loss": 0.8793, "step": 5934 }, { "epoch": 0.9290857858484659, "grad_norm": 1.916828989982605, "learning_rate": 8.147248641304348e-05, "loss": 0.5225, "step": 5935 }, { "epoch": 0.9292423293675642, "grad_norm": 3.592534065246582, "learning_rate": 8.146059782608696e-05, "loss": 1.2903, "step": 5936 }, { "epoch": 0.9293988728866625, "grad_norm": 2.6587655544281006, "learning_rate": 8.144870923913043e-05, "loss": 0.944, 
"step": 5937 }, { "epoch": 0.9295554164057608, "grad_norm": 3.225898504257202, "learning_rate": 8.143682065217391e-05, "loss": 0.7473, "step": 5938 }, { "epoch": 0.9297119599248591, "grad_norm": 3.398913860321045, "learning_rate": 8.142493206521739e-05, "loss": 1.7658, "step": 5939 }, { "epoch": 0.9298685034439574, "grad_norm": 3.804900646209717, "learning_rate": 8.141304347826087e-05, "loss": 1.126, "step": 5940 }, { "epoch": 0.9300250469630558, "grad_norm": 2.71474027633667, "learning_rate": 8.140115489130435e-05, "loss": 0.6653, "step": 5941 }, { "epoch": 0.930181590482154, "grad_norm": 3.3990025520324707, "learning_rate": 8.138926630434783e-05, "loss": 1.0727, "step": 5942 }, { "epoch": 0.9303381340012523, "grad_norm": 3.7923007011413574, "learning_rate": 8.13773777173913e-05, "loss": 1.3007, "step": 5943 }, { "epoch": 0.9304946775203506, "grad_norm": 1.9489545822143555, "learning_rate": 8.136548913043478e-05, "loss": 1.052, "step": 5944 }, { "epoch": 0.930651221039449, "grad_norm": 2.6349220275878906, "learning_rate": 8.135360054347826e-05, "loss": 1.272, "step": 5945 }, { "epoch": 0.9308077645585473, "grad_norm": 2.7562966346740723, "learning_rate": 8.134171195652172e-05, "loss": 1.0274, "step": 5946 }, { "epoch": 0.9309643080776456, "grad_norm": 1.863601803779602, "learning_rate": 8.13298233695652e-05, "loss": 1.2376, "step": 5947 }, { "epoch": 0.9311208515967438, "grad_norm": 1.8882333040237427, "learning_rate": 8.131793478260868e-05, "loss": 0.545, "step": 5948 }, { "epoch": 0.9312773951158422, "grad_norm": 2.5359857082366943, "learning_rate": 8.130604619565216e-05, "loss": 0.7976, "step": 5949 }, { "epoch": 0.9314339386349405, "grad_norm": 2.7962682247161865, "learning_rate": 8.129415760869564e-05, "loss": 0.8374, "step": 5950 }, { "epoch": 0.9315904821540388, "grad_norm": 0.48833003640174866, "learning_rate": 8.128226902173912e-05, "loss": 0.3174, "step": 5951 }, { "epoch": 0.9317470256731372, "grad_norm": 0.5678693652153015, "learning_rate": 
8.12703804347826e-05, "loss": 0.2548, "step": 5952 }, { "epoch": 0.9319035691922355, "grad_norm": 0.9396930932998657, "learning_rate": 8.125849184782607e-05, "loss": 0.1786, "step": 5953 }, { "epoch": 0.9320601127113337, "grad_norm": 0.8024551868438721, "learning_rate": 8.124660326086955e-05, "loss": 0.3355, "step": 5954 }, { "epoch": 0.932216656230432, "grad_norm": 1.3675965070724487, "learning_rate": 8.123471467391304e-05, "loss": 0.4763, "step": 5955 }, { "epoch": 0.9323731997495304, "grad_norm": 0.5368399024009705, "learning_rate": 8.122282608695652e-05, "loss": 0.3096, "step": 5956 }, { "epoch": 0.9325297432686287, "grad_norm": 3.5642130374908447, "learning_rate": 8.12109375e-05, "loss": 0.6857, "step": 5957 }, { "epoch": 0.932686286787727, "grad_norm": 0.48266950249671936, "learning_rate": 8.119904891304348e-05, "loss": 0.2269, "step": 5958 }, { "epoch": 0.9328428303068252, "grad_norm": 0.6741616129875183, "learning_rate": 8.118716032608695e-05, "loss": 0.2487, "step": 5959 }, { "epoch": 0.9329993738259236, "grad_norm": 1.2659307718276978, "learning_rate": 8.117527173913043e-05, "loss": 0.3821, "step": 5960 }, { "epoch": 0.9331559173450219, "grad_norm": 0.9921276569366455, "learning_rate": 8.116338315217391e-05, "loss": 0.357, "step": 5961 }, { "epoch": 0.9333124608641202, "grad_norm": 1.1036118268966675, "learning_rate": 8.115149456521739e-05, "loss": 0.5257, "step": 5962 }, { "epoch": 0.9334690043832186, "grad_norm": 0.6902830600738525, "learning_rate": 8.113960597826087e-05, "loss": 0.2197, "step": 5963 }, { "epoch": 0.9336255479023169, "grad_norm": 1.4992848634719849, "learning_rate": 8.112771739130434e-05, "loss": 0.5123, "step": 5964 }, { "epoch": 0.9337820914214151, "grad_norm": 5.613006114959717, "learning_rate": 8.111582880434782e-05, "loss": 1.6389, "step": 5965 }, { "epoch": 0.9339386349405134, "grad_norm": 0.9730287790298462, "learning_rate": 8.11039402173913e-05, "loss": 0.2858, "step": 5966 }, { "epoch": 0.9340951784596118, "grad_norm": 
0.9406409859657288, "learning_rate": 8.109205163043478e-05, "loss": 0.5356, "step": 5967 }, { "epoch": 0.9342517219787101, "grad_norm": 1.4176697731018066, "learning_rate": 8.108016304347826e-05, "loss": 0.3332, "step": 5968 }, { "epoch": 0.9344082654978084, "grad_norm": 1.5533779859542847, "learning_rate": 8.106827445652172e-05, "loss": 0.4362, "step": 5969 }, { "epoch": 0.9345648090169068, "grad_norm": 1.2681866884231567, "learning_rate": 8.10563858695652e-05, "loss": 0.4873, "step": 5970 }, { "epoch": 0.934721352536005, "grad_norm": 1.6322954893112183, "learning_rate": 8.104449728260868e-05, "loss": 0.5282, "step": 5971 }, { "epoch": 0.9348778960551033, "grad_norm": 2.1253926753997803, "learning_rate": 8.103260869565216e-05, "loss": 0.4649, "step": 5972 }, { "epoch": 0.9350344395742016, "grad_norm": 2.3322808742523193, "learning_rate": 8.102072010869563e-05, "loss": 0.5134, "step": 5973 }, { "epoch": 0.9351909830933, "grad_norm": 1.994574785232544, "learning_rate": 8.100883152173911e-05, "loss": 0.6106, "step": 5974 }, { "epoch": 0.9353475266123983, "grad_norm": 1.6682393550872803, "learning_rate": 8.09969429347826e-05, "loss": 0.6846, "step": 5975 }, { "epoch": 0.9355040701314965, "grad_norm": 6.040599822998047, "learning_rate": 8.098505434782608e-05, "loss": 0.7793, "step": 5976 }, { "epoch": 0.9356606136505948, "grad_norm": 2.2120423316955566, "learning_rate": 8.097316576086956e-05, "loss": 0.6162, "step": 5977 }, { "epoch": 0.9358171571696932, "grad_norm": 3.311535120010376, "learning_rate": 8.096127717391304e-05, "loss": 0.9038, "step": 5978 }, { "epoch": 0.9359737006887915, "grad_norm": 1.5582735538482666, "learning_rate": 8.094938858695652e-05, "loss": 0.5008, "step": 5979 }, { "epoch": 0.9361302442078898, "grad_norm": 2.8463995456695557, "learning_rate": 8.09375e-05, "loss": 0.8914, "step": 5980 }, { "epoch": 0.9362867877269881, "grad_norm": 1.6707054376602173, "learning_rate": 8.092561141304347e-05, "loss": 0.8192, "step": 5981 }, { "epoch": 
0.9364433312460864, "grad_norm": 3.009993314743042, "learning_rate": 8.091372282608695e-05, "loss": 0.516, "step": 5982 }, { "epoch": 0.9365998747651847, "grad_norm": 3.612006425857544, "learning_rate": 8.090183423913043e-05, "loss": 1.0835, "step": 5983 }, { "epoch": 0.936756418284283, "grad_norm": 2.9400954246520996, "learning_rate": 8.088994565217391e-05, "loss": 0.9436, "step": 5984 }, { "epoch": 0.9369129618033814, "grad_norm": 2.072477340698242, "learning_rate": 8.087805706521739e-05, "loss": 0.9488, "step": 5985 }, { "epoch": 0.9370695053224797, "grad_norm": 1.7016026973724365, "learning_rate": 8.086616847826086e-05, "loss": 0.4201, "step": 5986 }, { "epoch": 0.937226048841578, "grad_norm": 2.7949697971343994, "learning_rate": 8.085427989130434e-05, "loss": 1.292, "step": 5987 }, { "epoch": 0.9373825923606762, "grad_norm": 2.802271604537964, "learning_rate": 8.084239130434782e-05, "loss": 0.9578, "step": 5988 }, { "epoch": 0.9375391358797746, "grad_norm": 4.772951602935791, "learning_rate": 8.08305027173913e-05, "loss": 1.1962, "step": 5989 }, { "epoch": 0.9376956793988729, "grad_norm": 3.8746469020843506, "learning_rate": 8.081861413043478e-05, "loss": 0.7993, "step": 5990 }, { "epoch": 0.9378522229179712, "grad_norm": 2.842878818511963, "learning_rate": 8.080672554347826e-05, "loss": 1.2222, "step": 5991 }, { "epoch": 0.9380087664370695, "grad_norm": 2.3396155834198, "learning_rate": 8.079483695652172e-05, "loss": 0.5586, "step": 5992 }, { "epoch": 0.9381653099561679, "grad_norm": 3.294530153274536, "learning_rate": 8.07829483695652e-05, "loss": 0.7908, "step": 5993 }, { "epoch": 0.9383218534752661, "grad_norm": 3.1602258682250977, "learning_rate": 8.077105978260868e-05, "loss": 0.9632, "step": 5994 }, { "epoch": 0.9384783969943644, "grad_norm": 3.1301023960113525, "learning_rate": 8.075917119565217e-05, "loss": 1.0652, "step": 5995 }, { "epoch": 0.9386349405134627, "grad_norm": 7.138947010040283, "learning_rate": 8.074728260869565e-05, "loss": 0.7946, 
"step": 5996 }, { "epoch": 0.9387914840325611, "grad_norm": 2.0475358963012695, "learning_rate": 8.073539402173912e-05, "loss": 0.8167, "step": 5997 }, { "epoch": 0.9389480275516594, "grad_norm": 5.097840785980225, "learning_rate": 8.07235054347826e-05, "loss": 1.5155, "step": 5998 }, { "epoch": 0.9391045710707576, "grad_norm": 2.5914838314056396, "learning_rate": 8.071161684782608e-05, "loss": 1.3236, "step": 5999 }, { "epoch": 0.939261114589856, "grad_norm": 3.7079977989196777, "learning_rate": 8.069972826086956e-05, "loss": 1.4588, "step": 6000 }, { "epoch": 0.939261114589856, "eval_loss": 0.558110237121582, "eval_runtime": 205.0701, "eval_samples_per_second": 60.384, "eval_steps_per_second": 3.774, "eval_wer": 0.33257538729625774, "step": 6000 }, { "epoch": 0.9394176581089543, "grad_norm": 0.4687590003013611, "learning_rate": 8.068783967391304e-05, "loss": 0.2659, "step": 6001 }, { "epoch": 0.9395742016280526, "grad_norm": 0.6479398012161255, "learning_rate": 8.067595108695651e-05, "loss": 0.2277, "step": 6002 }, { "epoch": 0.9397307451471509, "grad_norm": 1.001130223274231, "learning_rate": 8.066406249999999e-05, "loss": 0.2817, "step": 6003 }, { "epoch": 0.9398872886662493, "grad_norm": 0.6786360144615173, "learning_rate": 8.065217391304347e-05, "loss": 0.267, "step": 6004 }, { "epoch": 0.9400438321853475, "grad_norm": 0.7092812657356262, "learning_rate": 8.064028532608695e-05, "loss": 0.4061, "step": 6005 }, { "epoch": 0.9402003757044458, "grad_norm": 0.7106993794441223, "learning_rate": 8.062839673913043e-05, "loss": 0.2795, "step": 6006 }, { "epoch": 0.9403569192235441, "grad_norm": 0.5311773419380188, "learning_rate": 8.06165081521739e-05, "loss": 0.2724, "step": 6007 }, { "epoch": 0.9405134627426425, "grad_norm": 0.7962337136268616, "learning_rate": 8.060461956521738e-05, "loss": 0.3108, "step": 6008 }, { "epoch": 0.9406700062617408, "grad_norm": 0.43347877264022827, "learning_rate": 8.059273097826086e-05, "loss": 0.2007, "step": 6009 }, { "epoch": 
0.9408265497808391, "grad_norm": 0.8105084300041199, "learning_rate": 8.058084239130434e-05, "loss": 0.3568, "step": 6010 }, { "epoch": 0.9409830932999373, "grad_norm": 0.8157312870025635, "learning_rate": 8.056895380434782e-05, "loss": 0.2515, "step": 6011 }, { "epoch": 0.9411396368190357, "grad_norm": 1.4417880773544312, "learning_rate": 8.055706521739131e-05, "loss": 0.2954, "step": 6012 }, { "epoch": 0.941296180338134, "grad_norm": 1.0217734575271606, "learning_rate": 8.054517663043479e-05, "loss": 0.4396, "step": 6013 }, { "epoch": 0.9414527238572323, "grad_norm": 1.3109517097473145, "learning_rate": 8.053328804347827e-05, "loss": 0.3174, "step": 6014 }, { "epoch": 0.9416092673763307, "grad_norm": 1.3965989351272583, "learning_rate": 8.052139945652173e-05, "loss": 0.3446, "step": 6015 }, { "epoch": 0.9417658108954289, "grad_norm": 1.3073153495788574, "learning_rate": 8.050951086956521e-05, "loss": 0.5749, "step": 6016 }, { "epoch": 0.9419223544145272, "grad_norm": 1.5838618278503418, "learning_rate": 8.049762228260869e-05, "loss": 0.3777, "step": 6017 }, { "epoch": 0.9420788979336255, "grad_norm": 2.0790598392486572, "learning_rate": 8.048573369565217e-05, "loss": 0.6975, "step": 6018 }, { "epoch": 0.9422354414527239, "grad_norm": 2.0597851276397705, "learning_rate": 8.047384510869564e-05, "loss": 0.6155, "step": 6019 }, { "epoch": 0.9423919849718222, "grad_norm": 1.9506394863128662, "learning_rate": 8.046195652173912e-05, "loss": 0.5694, "step": 6020 }, { "epoch": 0.9425485284909205, "grad_norm": 2.539808988571167, "learning_rate": 8.04500679347826e-05, "loss": 0.3104, "step": 6021 }, { "epoch": 0.9427050720100187, "grad_norm": 1.1550416946411133, "learning_rate": 8.043817934782608e-05, "loss": 0.3241, "step": 6022 }, { "epoch": 0.9428616155291171, "grad_norm": 1.501029372215271, "learning_rate": 8.042629076086956e-05, "loss": 0.4262, "step": 6023 }, { "epoch": 0.9430181590482154, "grad_norm": 2.166271686553955, "learning_rate": 8.041440217391303e-05, "loss": 
0.7927, "step": 6024 }, { "epoch": 0.9431747025673137, "grad_norm": 1.897139549255371, "learning_rate": 8.040251358695651e-05, "loss": 0.3534, "step": 6025 }, { "epoch": 0.9433312460864121, "grad_norm": 1.8425812721252441, "learning_rate": 8.039062499999999e-05, "loss": 0.6235, "step": 6026 }, { "epoch": 0.9434877896055104, "grad_norm": 2.379276990890503, "learning_rate": 8.037873641304347e-05, "loss": 0.7595, "step": 6027 }, { "epoch": 0.9436443331246086, "grad_norm": 2.8747622966766357, "learning_rate": 8.036684782608695e-05, "loss": 0.7129, "step": 6028 }, { "epoch": 0.9438008766437069, "grad_norm": 2.1694905757904053, "learning_rate": 8.035495923913043e-05, "loss": 0.8939, "step": 6029 }, { "epoch": 0.9439574201628053, "grad_norm": 2.8825173377990723, "learning_rate": 8.03430706521739e-05, "loss": 0.7715, "step": 6030 }, { "epoch": 0.9441139636819036, "grad_norm": 3.163801670074463, "learning_rate": 8.033118206521738e-05, "loss": 1.0961, "step": 6031 }, { "epoch": 0.9442705072010019, "grad_norm": 2.822566032409668, "learning_rate": 8.031929347826087e-05, "loss": 0.5916, "step": 6032 }, { "epoch": 0.9444270507201001, "grad_norm": 3.223205089569092, "learning_rate": 8.030740489130435e-05, "loss": 1.6075, "step": 6033 }, { "epoch": 0.9445835942391985, "grad_norm": 2.2751057147979736, "learning_rate": 8.029551630434783e-05, "loss": 0.6671, "step": 6034 }, { "epoch": 0.9447401377582968, "grad_norm": 3.7146522998809814, "learning_rate": 8.028362771739131e-05, "loss": 0.77, "step": 6035 }, { "epoch": 0.9448966812773951, "grad_norm": 3.446007251739502, "learning_rate": 8.027173913043479e-05, "loss": 1.2481, "step": 6036 }, { "epoch": 0.9450532247964935, "grad_norm": 2.710857629776001, "learning_rate": 8.025985054347826e-05, "loss": 1.1194, "step": 6037 }, { "epoch": 0.9452097683155918, "grad_norm": 3.6452062129974365, "learning_rate": 8.024796195652173e-05, "loss": 0.9489, "step": 6038 }, { "epoch": 0.94536631183469, "grad_norm": 3.4232230186462402, "learning_rate": 
8.02360733695652e-05, "loss": 1.3918, "step": 6039 }, { "epoch": 0.9455228553537883, "grad_norm": 4.172904014587402, "learning_rate": 8.022418478260868e-05, "loss": 0.9745, "step": 6040 }, { "epoch": 0.9456793988728867, "grad_norm": 6.846565246582031, "learning_rate": 8.021229619565216e-05, "loss": 0.8713, "step": 6041 }, { "epoch": 0.945835942391985, "grad_norm": 2.4423372745513916, "learning_rate": 8.020040760869564e-05, "loss": 0.9466, "step": 6042 }, { "epoch": 0.9459924859110833, "grad_norm": 1.9919493198394775, "learning_rate": 8.018851902173912e-05, "loss": 1.361, "step": 6043 }, { "epoch": 0.9461490294301816, "grad_norm": 1.6006492376327515, "learning_rate": 8.01766304347826e-05, "loss": 1.3025, "step": 6044 }, { "epoch": 0.9463055729492799, "grad_norm": 3.24600887298584, "learning_rate": 8.016474184782608e-05, "loss": 1.3568, "step": 6045 }, { "epoch": 0.9464621164683782, "grad_norm": 2.7289235591888428, "learning_rate": 8.015285326086955e-05, "loss": 1.2043, "step": 6046 }, { "epoch": 0.9466186599874765, "grad_norm": 2.316633701324463, "learning_rate": 8.014096467391303e-05, "loss": 0.5308, "step": 6047 }, { "epoch": 0.9467752035065748, "grad_norm": 4.530444145202637, "learning_rate": 8.012907608695651e-05, "loss": 1.3048, "step": 6048 }, { "epoch": 0.9469317470256732, "grad_norm": 2.7445623874664307, "learning_rate": 8.011718749999999e-05, "loss": 1.0126, "step": 6049 }, { "epoch": 0.9470882905447714, "grad_norm": 2.331571102142334, "learning_rate": 8.010529891304347e-05, "loss": 0.7977, "step": 6050 }, { "epoch": 0.9472448340638697, "grad_norm": 0.5514488220214844, "learning_rate": 8.009341032608694e-05, "loss": 0.3384, "step": 6051 }, { "epoch": 0.947401377582968, "grad_norm": 0.558311402797699, "learning_rate": 8.008152173913044e-05, "loss": 0.3047, "step": 6052 }, { "epoch": 0.9475579211020664, "grad_norm": 0.6496766209602356, "learning_rate": 8.006963315217391e-05, "loss": 0.2992, "step": 6053 }, { "epoch": 0.9477144646211647, "grad_norm": 
0.869314432144165, "learning_rate": 8.005774456521739e-05, "loss": 0.2525, "step": 6054 }, { "epoch": 0.947871008140263, "grad_norm": 0.6826937794685364, "learning_rate": 8.004585597826087e-05, "loss": 0.3387, "step": 6055 }, { "epoch": 0.9480275516593613, "grad_norm": 0.5601934790611267, "learning_rate": 8.003396739130435e-05, "loss": 0.2672, "step": 6056 }, { "epoch": 0.9481840951784596, "grad_norm": 1.4788881540298462, "learning_rate": 8.002207880434783e-05, "loss": 0.3516, "step": 6057 }, { "epoch": 0.9483406386975579, "grad_norm": 0.9023474454879761, "learning_rate": 8.00101902173913e-05, "loss": 0.2989, "step": 6058 }, { "epoch": 0.9484971822166562, "grad_norm": 0.5385057926177979, "learning_rate": 7.999830163043478e-05, "loss": 0.252, "step": 6059 }, { "epoch": 0.9486537257357546, "grad_norm": 0.7548342347145081, "learning_rate": 7.998641304347826e-05, "loss": 0.2623, "step": 6060 }, { "epoch": 0.9488102692548529, "grad_norm": 0.7601940631866455, "learning_rate": 7.997452445652173e-05, "loss": 0.2785, "step": 6061 }, { "epoch": 0.9489668127739511, "grad_norm": 1.1327934265136719, "learning_rate": 7.99626358695652e-05, "loss": 0.4158, "step": 6062 }, { "epoch": 0.9491233562930494, "grad_norm": 1.0963046550750732, "learning_rate": 7.995074728260868e-05, "loss": 0.3544, "step": 6063 }, { "epoch": 0.9492798998121478, "grad_norm": 0.9404799938201904, "learning_rate": 7.993885869565216e-05, "loss": 0.3831, "step": 6064 }, { "epoch": 0.9494364433312461, "grad_norm": 1.109542727470398, "learning_rate": 7.992697010869564e-05, "loss": 0.4311, "step": 6065 }, { "epoch": 0.9495929868503444, "grad_norm": 1.2478801012039185, "learning_rate": 7.991508152173912e-05, "loss": 0.3595, "step": 6066 }, { "epoch": 0.9497495303694427, "grad_norm": 1.581331491470337, "learning_rate": 7.99031929347826e-05, "loss": 0.5328, "step": 6067 }, { "epoch": 0.949906073888541, "grad_norm": 2.1307568550109863, "learning_rate": 7.989130434782607e-05, "loss": 0.4203, "step": 6068 }, { "epoch": 
0.9500626174076393, "grad_norm": 1.4859285354614258, "learning_rate": 7.987941576086955e-05, "loss": 0.4392, "step": 6069 }, { "epoch": 0.9502191609267376, "grad_norm": 1.4669233560562134, "learning_rate": 7.986752717391303e-05, "loss": 0.492, "step": 6070 }, { "epoch": 0.950375704445836, "grad_norm": 1.6100584268569946, "learning_rate": 7.985563858695651e-05, "loss": 0.5851, "step": 6071 }, { "epoch": 0.9505322479649343, "grad_norm": 1.1326746940612793, "learning_rate": 7.984375e-05, "loss": 0.5703, "step": 6072 }, { "epoch": 0.9506887914840325, "grad_norm": 3.110069751739502, "learning_rate": 7.983186141304348e-05, "loss": 0.8107, "step": 6073 }, { "epoch": 0.9508453350031308, "grad_norm": 2.302612066268921, "learning_rate": 7.981997282608696e-05, "loss": 0.5661, "step": 6074 }, { "epoch": 0.9510018785222292, "grad_norm": 2.301774501800537, "learning_rate": 7.980808423913043e-05, "loss": 0.499, "step": 6075 }, { "epoch": 0.9511584220413275, "grad_norm": 2.541309356689453, "learning_rate": 7.979619565217391e-05, "loss": 0.6706, "step": 6076 }, { "epoch": 0.9513149655604258, "grad_norm": 2.3693478107452393, "learning_rate": 7.978430706521739e-05, "loss": 0.5601, "step": 6077 }, { "epoch": 0.9514715090795242, "grad_norm": 2.4937827587127686, "learning_rate": 7.977241847826087e-05, "loss": 0.5195, "step": 6078 }, { "epoch": 0.9516280525986224, "grad_norm": 1.930615782737732, "learning_rate": 7.976052989130435e-05, "loss": 0.7118, "step": 6079 }, { "epoch": 0.9517845961177207, "grad_norm": 2.428485155105591, "learning_rate": 7.974864130434782e-05, "loss": 0.7141, "step": 6080 }, { "epoch": 0.951941139636819, "grad_norm": 2.176396131515503, "learning_rate": 7.97367527173913e-05, "loss": 0.7322, "step": 6081 }, { "epoch": 0.9520976831559174, "grad_norm": 1.8659486770629883, "learning_rate": 7.972486413043478e-05, "loss": 0.8894, "step": 6082 }, { "epoch": 0.9522542266750157, "grad_norm": 5.017393589019775, "learning_rate": 7.971297554347826e-05, "loss": 0.9762, "step": 
6083 }, { "epoch": 0.9524107701941139, "grad_norm": 2.149415969848633, "learning_rate": 7.970108695652172e-05, "loss": 1.1027, "step": 6084 }, { "epoch": 0.9525673137132122, "grad_norm": 5.08442497253418, "learning_rate": 7.96891983695652e-05, "loss": 0.926, "step": 6085 }, { "epoch": 0.9527238572323106, "grad_norm": 3.5474188327789307, "learning_rate": 7.967730978260868e-05, "loss": 1.0661, "step": 6086 }, { "epoch": 0.9528804007514089, "grad_norm": 1.3698737621307373, "learning_rate": 7.966542119565216e-05, "loss": 0.5978, "step": 6087 }, { "epoch": 0.9530369442705072, "grad_norm": 3.596003770828247, "learning_rate": 7.965353260869564e-05, "loss": 0.5511, "step": 6088 }, { "epoch": 0.9531934877896056, "grad_norm": 2.108426094055176, "learning_rate": 7.964164402173911e-05, "loss": 0.9733, "step": 6089 }, { "epoch": 0.9533500313087038, "grad_norm": 4.808554649353027, "learning_rate": 7.962975543478259e-05, "loss": 1.4404, "step": 6090 }, { "epoch": 0.9535065748278021, "grad_norm": 4.908686637878418, "learning_rate": 7.961786684782607e-05, "loss": 1.2876, "step": 6091 }, { "epoch": 0.9536631183469004, "grad_norm": 2.9680705070495605, "learning_rate": 7.960597826086956e-05, "loss": 0.9088, "step": 6092 }, { "epoch": 0.9538196618659988, "grad_norm": 2.5711662769317627, "learning_rate": 7.959408967391304e-05, "loss": 1.0794, "step": 6093 }, { "epoch": 0.9539762053850971, "grad_norm": 2.1405997276306152, "learning_rate": 7.958220108695652e-05, "loss": 0.6652, "step": 6094 }, { "epoch": 0.9541327489041954, "grad_norm": 4.15911865234375, "learning_rate": 7.95703125e-05, "loss": 1.2038, "step": 6095 }, { "epoch": 0.9542892924232936, "grad_norm": 4.1436920166015625, "learning_rate": 7.955842391304348e-05, "loss": 1.0629, "step": 6096 }, { "epoch": 0.954445835942392, "grad_norm": 4.08286714553833, "learning_rate": 7.954653532608695e-05, "loss": 0.4929, "step": 6097 }, { "epoch": 0.9546023794614903, "grad_norm": 3.0798473358154297, "learning_rate": 7.953464673913043e-05, 
"loss": 1.486, "step": 6098 }, { "epoch": 0.9547589229805886, "grad_norm": 2.6724355220794678, "learning_rate": 7.952275815217391e-05, "loss": 0.6521, "step": 6099 }, { "epoch": 0.954915466499687, "grad_norm": 4.492459774017334, "learning_rate": 7.951086956521739e-05, "loss": 1.3345, "step": 6100 }, { "epoch": 0.9550720100187852, "grad_norm": 0.6550471782684326, "learning_rate": 7.949898097826087e-05, "loss": 0.2996, "step": 6101 }, { "epoch": 0.9552285535378835, "grad_norm": 0.6819889545440674, "learning_rate": 7.948709239130434e-05, "loss": 0.3127, "step": 6102 }, { "epoch": 0.9553850970569818, "grad_norm": 0.791537344455719, "learning_rate": 7.947520380434782e-05, "loss": 0.3827, "step": 6103 }, { "epoch": 0.9555416405760802, "grad_norm": 0.7334808707237244, "learning_rate": 7.94633152173913e-05, "loss": 0.2538, "step": 6104 }, { "epoch": 0.9556981840951785, "grad_norm": 0.7328970432281494, "learning_rate": 7.945142663043478e-05, "loss": 0.2541, "step": 6105 }, { "epoch": 0.9558547276142768, "grad_norm": 0.6492490768432617, "learning_rate": 7.943953804347826e-05, "loss": 0.2453, "step": 6106 }, { "epoch": 0.956011271133375, "grad_norm": 0.7318797707557678, "learning_rate": 7.942764945652172e-05, "loss": 0.2227, "step": 6107 }, { "epoch": 0.9561678146524734, "grad_norm": 0.7529894709587097, "learning_rate": 7.94157608695652e-05, "loss": 0.2702, "step": 6108 }, { "epoch": 0.9563243581715717, "grad_norm": 0.7953017354011536, "learning_rate": 7.940387228260868e-05, "loss": 0.3316, "step": 6109 }, { "epoch": 0.95648090169067, "grad_norm": 0.677038848400116, "learning_rate": 7.939198369565216e-05, "loss": 0.2272, "step": 6110 }, { "epoch": 0.9566374452097683, "grad_norm": 0.9476549625396729, "learning_rate": 7.938009510869563e-05, "loss": 0.3618, "step": 6111 }, { "epoch": 0.9567939887288667, "grad_norm": 1.2375916242599487, "learning_rate": 7.936820652173913e-05, "loss": 0.444, "step": 6112 }, { "epoch": 0.9569505322479649, "grad_norm": 1.3669099807739258, 
"learning_rate": 7.93563179347826e-05, "loss": 0.4945, "step": 6113 }, { "epoch": 0.9571070757670632, "grad_norm": 1.0362024307250977, "learning_rate": 7.934442934782608e-05, "loss": 0.4411, "step": 6114 }, { "epoch": 0.9572636192861615, "grad_norm": 0.7257043719291687, "learning_rate": 7.933254076086956e-05, "loss": 0.2749, "step": 6115 }, { "epoch": 0.9574201628052599, "grad_norm": 1.358597993850708, "learning_rate": 7.932065217391304e-05, "loss": 0.2533, "step": 6116 }, { "epoch": 0.9575767063243582, "grad_norm": 4.438745498657227, "learning_rate": 7.930876358695652e-05, "loss": 0.7733, "step": 6117 }, { "epoch": 0.9577332498434565, "grad_norm": 1.2425792217254639, "learning_rate": 7.9296875e-05, "loss": 0.4189, "step": 6118 }, { "epoch": 0.9578897933625548, "grad_norm": 1.8373950719833374, "learning_rate": 7.928498641304347e-05, "loss": 0.5281, "step": 6119 }, { "epoch": 0.9580463368816531, "grad_norm": 4.510956287384033, "learning_rate": 7.927309782608695e-05, "loss": 1.0497, "step": 6120 }, { "epoch": 0.9582028804007514, "grad_norm": 1.6456094980239868, "learning_rate": 7.926120923913043e-05, "loss": 0.5509, "step": 6121 }, { "epoch": 0.9583594239198497, "grad_norm": 1.7094457149505615, "learning_rate": 7.924932065217391e-05, "loss": 0.7562, "step": 6122 }, { "epoch": 0.9585159674389481, "grad_norm": 1.3752944469451904, "learning_rate": 7.923743206521739e-05, "loss": 0.6487, "step": 6123 }, { "epoch": 0.9586725109580463, "grad_norm": 1.1641725301742554, "learning_rate": 7.922554347826086e-05, "loss": 0.3854, "step": 6124 }, { "epoch": 0.9588290544771446, "grad_norm": 1.5674796104431152, "learning_rate": 7.921365489130434e-05, "loss": 0.9298, "step": 6125 }, { "epoch": 0.9589855979962429, "grad_norm": 1.232322096824646, "learning_rate": 7.920176630434782e-05, "loss": 0.448, "step": 6126 }, { "epoch": 0.9591421415153413, "grad_norm": 3.7246060371398926, "learning_rate": 7.91898777173913e-05, "loss": 1.2498, "step": 6127 }, { "epoch": 0.9592986850344396, 
"grad_norm": 1.8834781646728516, "learning_rate": 7.917798913043478e-05, "loss": 0.9312, "step": 6128 }, { "epoch": 0.9594552285535379, "grad_norm": 2.147043228149414, "learning_rate": 7.916610054347827e-05, "loss": 0.7625, "step": 6129 }, { "epoch": 0.9596117720726361, "grad_norm": 1.9769816398620605, "learning_rate": 7.915421195652172e-05, "loss": 0.8706, "step": 6130 }, { "epoch": 0.9597683155917345, "grad_norm": 2.1152896881103516, "learning_rate": 7.91423233695652e-05, "loss": 0.8783, "step": 6131 }, { "epoch": 0.9599248591108328, "grad_norm": 2.341646432876587, "learning_rate": 7.913043478260869e-05, "loss": 0.4949, "step": 6132 }, { "epoch": 0.9600814026299311, "grad_norm": 2.464297294616699, "learning_rate": 7.911854619565217e-05, "loss": 0.937, "step": 6133 }, { "epoch": 0.9602379461490295, "grad_norm": 2.9177486896514893, "learning_rate": 7.910665760869565e-05, "loss": 0.9085, "step": 6134 }, { "epoch": 0.9603944896681278, "grad_norm": 2.5271756649017334, "learning_rate": 7.909476902173912e-05, "loss": 0.9177, "step": 6135 }, { "epoch": 0.960551033187226, "grad_norm": 4.023510932922363, "learning_rate": 7.90828804347826e-05, "loss": 1.2514, "step": 6136 }, { "epoch": 0.9607075767063243, "grad_norm": 1.9781827926635742, "learning_rate": 7.907099184782608e-05, "loss": 0.6382, "step": 6137 }, { "epoch": 0.9608641202254227, "grad_norm": 3.27557110786438, "learning_rate": 7.905910326086956e-05, "loss": 0.5174, "step": 6138 }, { "epoch": 0.961020663744521, "grad_norm": 8.624598503112793, "learning_rate": 7.904721467391304e-05, "loss": 1.2683, "step": 6139 }, { "epoch": 0.9611772072636193, "grad_norm": 2.9472908973693848, "learning_rate": 7.903532608695651e-05, "loss": 1.0886, "step": 6140 }, { "epoch": 0.9613337507827175, "grad_norm": 2.963677167892456, "learning_rate": 7.902343749999999e-05, "loss": 1.4692, "step": 6141 }, { "epoch": 0.9614902943018159, "grad_norm": 2.796529769897461, "learning_rate": 7.901154891304347e-05, "loss": 1.2165, "step": 6142 }, { 
"epoch": 0.9616468378209142, "grad_norm": 2.4648451805114746, "learning_rate": 7.899966032608695e-05, "loss": 1.2779, "step": 6143 }, { "epoch": 0.9618033813400125, "grad_norm": 1.9170864820480347, "learning_rate": 7.898777173913043e-05, "loss": 1.297, "step": 6144 }, { "epoch": 0.9619599248591109, "grad_norm": 3.014103889465332, "learning_rate": 7.89758831521739e-05, "loss": 1.2577, "step": 6145 }, { "epoch": 0.9621164683782092, "grad_norm": 2.415367841720581, "learning_rate": 7.896399456521738e-05, "loss": 1.2759, "step": 6146 }, { "epoch": 0.9622730118973074, "grad_norm": 2.003873586654663, "learning_rate": 7.895210597826086e-05, "loss": 0.5409, "step": 6147 }, { "epoch": 0.9624295554164057, "grad_norm": 1.548331379890442, "learning_rate": 7.894021739130434e-05, "loss": 0.5912, "step": 6148 }, { "epoch": 0.9625860989355041, "grad_norm": 2.8148369789123535, "learning_rate": 7.892832880434783e-05, "loss": 0.6753, "step": 6149 }, { "epoch": 0.9627426424546024, "grad_norm": 1.3178937435150146, "learning_rate": 7.891644021739131e-05, "loss": 1.0497, "step": 6150 }, { "epoch": 0.9628991859737007, "grad_norm": 0.5022231936454773, "learning_rate": 7.890455163043479e-05, "loss": 0.2426, "step": 6151 }, { "epoch": 0.963055729492799, "grad_norm": 0.7901156544685364, "learning_rate": 7.889266304347827e-05, "loss": 0.3308, "step": 6152 }, { "epoch": 0.9632122730118973, "grad_norm": 0.35896509885787964, "learning_rate": 7.888077445652173e-05, "loss": 0.2259, "step": 6153 }, { "epoch": 0.9633688165309956, "grad_norm": 0.9130063056945801, "learning_rate": 7.886888586956521e-05, "loss": 0.3834, "step": 6154 }, { "epoch": 0.9635253600500939, "grad_norm": 0.9630022048950195, "learning_rate": 7.885699728260869e-05, "loss": 0.4304, "step": 6155 }, { "epoch": 0.9636819035691923, "grad_norm": 0.8702998757362366, "learning_rate": 7.884510869565216e-05, "loss": 0.4383, "step": 6156 }, { "epoch": 0.9638384470882906, "grad_norm": 0.5648989677429199, "learning_rate": 7.883322010869564e-05, 
"loss": 0.2469, "step": 6157 }, { "epoch": 0.9639949906073888, "grad_norm": 0.6362563967704773, "learning_rate": 7.882133152173912e-05, "loss": 0.2798, "step": 6158 }, { "epoch": 0.9641515341264871, "grad_norm": 0.5868794918060303, "learning_rate": 7.88094429347826e-05, "loss": 0.2755, "step": 6159 }, { "epoch": 0.9643080776455855, "grad_norm": 1.0891036987304688, "learning_rate": 7.879755434782608e-05, "loss": 0.3364, "step": 6160 }, { "epoch": 0.9644646211646838, "grad_norm": 0.8689764738082886, "learning_rate": 7.878566576086956e-05, "loss": 0.2101, "step": 6161 }, { "epoch": 0.9646211646837821, "grad_norm": 1.4299262762069702, "learning_rate": 7.877377717391303e-05, "loss": 0.4312, "step": 6162 }, { "epoch": 0.9647777082028804, "grad_norm": 2.4540791511535645, "learning_rate": 7.876188858695651e-05, "loss": 0.5273, "step": 6163 }, { "epoch": 0.9649342517219787, "grad_norm": 1.2234249114990234, "learning_rate": 7.874999999999999e-05, "loss": 0.4333, "step": 6164 }, { "epoch": 0.965090795241077, "grad_norm": 0.7687028646469116, "learning_rate": 7.873811141304347e-05, "loss": 0.2697, "step": 6165 }, { "epoch": 0.9652473387601753, "grad_norm": 1.052056074142456, "learning_rate": 7.872622282608695e-05, "loss": 0.3195, "step": 6166 }, { "epoch": 0.9654038822792737, "grad_norm": 3.116245985031128, "learning_rate": 7.871433423913042e-05, "loss": 0.6269, "step": 6167 }, { "epoch": 0.965560425798372, "grad_norm": 1.730777621269226, "learning_rate": 7.87024456521739e-05, "loss": 0.4184, "step": 6168 }, { "epoch": 0.9657169693174703, "grad_norm": 0.8890530467033386, "learning_rate": 7.86905570652174e-05, "loss": 0.3846, "step": 6169 }, { "epoch": 0.9658735128365685, "grad_norm": 1.4419732093811035, "learning_rate": 7.867866847826087e-05, "loss": 0.3373, "step": 6170 }, { "epoch": 0.9660300563556669, "grad_norm": 1.2863332033157349, "learning_rate": 7.866677989130435e-05, "loss": 0.4138, "step": 6171 }, { "epoch": 0.9661865998747652, "grad_norm": 1.6415061950683594, 
"learning_rate": 7.865489130434783e-05, "loss": 0.4348, "step": 6172 }, { "epoch": 0.9663431433938635, "grad_norm": 2.617326021194458, "learning_rate": 7.86430027173913e-05, "loss": 0.5279, "step": 6173 }, { "epoch": 0.9664996869129618, "grad_norm": 2.528158187866211, "learning_rate": 7.863111413043478e-05, "loss": 0.7477, "step": 6174 }, { "epoch": 0.9666562304320601, "grad_norm": 1.6932047605514526, "learning_rate": 7.861922554347826e-05, "loss": 0.9852, "step": 6175 }, { "epoch": 0.9668127739511584, "grad_norm": 3.284409284591675, "learning_rate": 7.860733695652173e-05, "loss": 1.0177, "step": 6176 }, { "epoch": 0.9669693174702567, "grad_norm": 1.775658130645752, "learning_rate": 7.85954483695652e-05, "loss": 0.7089, "step": 6177 }, { "epoch": 0.967125860989355, "grad_norm": 1.8737943172454834, "learning_rate": 7.858355978260868e-05, "loss": 0.5609, "step": 6178 }, { "epoch": 0.9672824045084534, "grad_norm": 1.5686863660812378, "learning_rate": 7.857167119565216e-05, "loss": 0.9119, "step": 6179 }, { "epoch": 0.9674389480275517, "grad_norm": 2.6192920207977295, "learning_rate": 7.855978260869564e-05, "loss": 0.7137, "step": 6180 }, { "epoch": 0.9675954915466499, "grad_norm": 2.4271416664123535, "learning_rate": 7.854789402173912e-05, "loss": 0.7485, "step": 6181 }, { "epoch": 0.9677520350657483, "grad_norm": 3.2027461528778076, "learning_rate": 7.85360054347826e-05, "loss": 0.7423, "step": 6182 }, { "epoch": 0.9679085785848466, "grad_norm": 3.3975908756256104, "learning_rate": 7.852411684782607e-05, "loss": 0.8504, "step": 6183 }, { "epoch": 0.9680651221039449, "grad_norm": 2.3106210231781006, "learning_rate": 7.851222826086955e-05, "loss": 1.4285, "step": 6184 }, { "epoch": 0.9682216656230432, "grad_norm": 2.29410457611084, "learning_rate": 7.850033967391303e-05, "loss": 0.9087, "step": 6185 }, { "epoch": 0.9683782091421416, "grad_norm": 2.7497637271881104, "learning_rate": 7.848845108695651e-05, "loss": 0.6885, "step": 6186 }, { "epoch": 0.9685347526612398, 
"grad_norm": 3.2839131355285645, "learning_rate": 7.847656249999999e-05, "loss": 1.0123, "step": 6187 }, { "epoch": 0.9686912961803381, "grad_norm": 3.0704946517944336, "learning_rate": 7.846467391304347e-05, "loss": 1.1889, "step": 6188 }, { "epoch": 0.9688478396994364, "grad_norm": 3.7175395488739014, "learning_rate": 7.845278532608696e-05, "loss": 1.0306, "step": 6189 }, { "epoch": 0.9690043832185348, "grad_norm": 2.3154947757720947, "learning_rate": 7.844089673913044e-05, "loss": 0.9195, "step": 6190 }, { "epoch": 0.9691609267376331, "grad_norm": 2.365691661834717, "learning_rate": 7.842900815217391e-05, "loss": 0.8376, "step": 6191 }, { "epoch": 0.9693174702567313, "grad_norm": 2.72786283493042, "learning_rate": 7.841711956521739e-05, "loss": 1.3032, "step": 6192 }, { "epoch": 0.9694740137758296, "grad_norm": 4.138810634613037, "learning_rate": 7.840523097826087e-05, "loss": 1.5354, "step": 6193 }, { "epoch": 0.969630557294928, "grad_norm": 2.253678560256958, "learning_rate": 7.839334239130435e-05, "loss": 0.8418, "step": 6194 }, { "epoch": 0.9697871008140263, "grad_norm": 2.7182729244232178, "learning_rate": 7.838145380434783e-05, "loss": 1.1361, "step": 6195 }, { "epoch": 0.9699436443331246, "grad_norm": 1.4330157041549683, "learning_rate": 7.83695652173913e-05, "loss": 0.3804, "step": 6196 }, { "epoch": 0.970100187852223, "grad_norm": 2.520134210586548, "learning_rate": 7.835767663043478e-05, "loss": 0.714, "step": 6197 }, { "epoch": 0.9702567313713212, "grad_norm": 2.7298521995544434, "learning_rate": 7.834578804347826e-05, "loss": 0.727, "step": 6198 }, { "epoch": 0.9704132748904195, "grad_norm": 3.074273109436035, "learning_rate": 7.833389945652173e-05, "loss": 0.7622, "step": 6199 }, { "epoch": 0.9705698184095178, "grad_norm": 2.538619041442871, "learning_rate": 7.83220108695652e-05, "loss": 0.7886, "step": 6200 }, { "epoch": 0.9707263619286162, "grad_norm": 0.5261865258216858, "learning_rate": 7.831012228260868e-05, "loss": 0.3247, "step": 6201 }, { 
"epoch": 0.9708829054477145, "grad_norm": 0.45909595489501953, "learning_rate": 7.829823369565216e-05, "loss": 0.2451, "step": 6202 }, { "epoch": 0.9710394489668128, "grad_norm": 0.6595438718795776, "learning_rate": 7.828634510869564e-05, "loss": 0.3339, "step": 6203 }, { "epoch": 0.971195992485911, "grad_norm": 0.5164029002189636, "learning_rate": 7.827445652173912e-05, "loss": 0.2365, "step": 6204 }, { "epoch": 0.9713525360050094, "grad_norm": 0.609967827796936, "learning_rate": 7.82625679347826e-05, "loss": 0.2712, "step": 6205 }, { "epoch": 0.9715090795241077, "grad_norm": 0.7000541687011719, "learning_rate": 7.825067934782607e-05, "loss": 0.2232, "step": 6206 }, { "epoch": 0.971665623043206, "grad_norm": 0.5687620043754578, "learning_rate": 7.823879076086955e-05, "loss": 0.2454, "step": 6207 }, { "epoch": 0.9718221665623044, "grad_norm": 0.9333391785621643, "learning_rate": 7.822690217391303e-05, "loss": 0.3604, "step": 6208 }, { "epoch": 0.9719787100814026, "grad_norm": 1.5063854455947876, "learning_rate": 7.821501358695652e-05, "loss": 0.2504, "step": 6209 }, { "epoch": 0.9721352536005009, "grad_norm": 2.163081407546997, "learning_rate": 7.8203125e-05, "loss": 0.3582, "step": 6210 }, { "epoch": 0.9722917971195992, "grad_norm": 1.3789689540863037, "learning_rate": 7.819123641304348e-05, "loss": 0.529, "step": 6211 }, { "epoch": 0.9724483406386976, "grad_norm": 1.2947298288345337, "learning_rate": 7.817934782608695e-05, "loss": 0.5602, "step": 6212 }, { "epoch": 0.9726048841577959, "grad_norm": 1.2061067819595337, "learning_rate": 7.816745923913043e-05, "loss": 0.4008, "step": 6213 }, { "epoch": 0.9727614276768942, "grad_norm": 1.5031312704086304, "learning_rate": 7.815557065217391e-05, "loss": 0.3401, "step": 6214 }, { "epoch": 0.9729179711959924, "grad_norm": 0.941938042640686, "learning_rate": 7.814368206521739e-05, "loss": 0.2979, "step": 6215 }, { "epoch": 0.9730745147150908, "grad_norm": 1.9271979331970215, "learning_rate": 7.813179347826087e-05, "loss": 
0.6329, "step": 6216 }, { "epoch": 0.9732310582341891, "grad_norm": 1.3388257026672363, "learning_rate": 7.811990489130435e-05, "loss": 0.6206, "step": 6217 }, { "epoch": 0.9733876017532874, "grad_norm": 2.2460029125213623, "learning_rate": 7.810801630434782e-05, "loss": 0.5943, "step": 6218 }, { "epoch": 0.9735441452723858, "grad_norm": 1.029801607131958, "learning_rate": 7.80961277173913e-05, "loss": 0.445, "step": 6219 }, { "epoch": 0.9737006887914841, "grad_norm": 1.7900571823120117, "learning_rate": 7.808423913043478e-05, "loss": 0.4001, "step": 6220 }, { "epoch": 0.9738572323105823, "grad_norm": 1.5407251119613647, "learning_rate": 7.807235054347826e-05, "loss": 0.3577, "step": 6221 }, { "epoch": 0.9740137758296806, "grad_norm": 1.7530872821807861, "learning_rate": 7.806046195652172e-05, "loss": 0.7873, "step": 6222 }, { "epoch": 0.974170319348779, "grad_norm": 2.171942949295044, "learning_rate": 7.80485733695652e-05, "loss": 0.7554, "step": 6223 }, { "epoch": 0.9743268628678773, "grad_norm": 0.9990300536155701, "learning_rate": 7.803668478260868e-05, "loss": 0.3471, "step": 6224 }, { "epoch": 0.9744834063869756, "grad_norm": 1.4028034210205078, "learning_rate": 7.802479619565216e-05, "loss": 0.4669, "step": 6225 }, { "epoch": 0.9746399499060739, "grad_norm": 2.223710536956787, "learning_rate": 7.801290760869564e-05, "loss": 0.4039, "step": 6226 }, { "epoch": 0.9747964934251722, "grad_norm": 1.6957495212554932, "learning_rate": 7.800101902173911e-05, "loss": 0.7174, "step": 6227 }, { "epoch": 0.9749530369442705, "grad_norm": 3.165759801864624, "learning_rate": 7.79891304347826e-05, "loss": 0.7114, "step": 6228 }, { "epoch": 0.9751095804633688, "grad_norm": 1.6233524084091187, "learning_rate": 7.797724184782608e-05, "loss": 0.4071, "step": 6229 }, { "epoch": 0.9752661239824671, "grad_norm": 2.5625531673431396, "learning_rate": 7.796535326086956e-05, "loss": 0.8234, "step": 6230 }, { "epoch": 0.9754226675015655, "grad_norm": 2.583817481994629, "learning_rate": 
7.795346467391304e-05, "loss": 0.6378, "step": 6231 }, { "epoch": 0.9755792110206637, "grad_norm": 1.532922625541687, "learning_rate": 7.794157608695652e-05, "loss": 0.7079, "step": 6232 }, { "epoch": 0.975735754539762, "grad_norm": 2.5469720363616943, "learning_rate": 7.79296875e-05, "loss": 0.3882, "step": 6233 }, { "epoch": 0.9758922980588604, "grad_norm": 4.0617265701293945, "learning_rate": 7.791779891304347e-05, "loss": 1.2438, "step": 6234 }, { "epoch": 0.9760488415779587, "grad_norm": 1.806945562362671, "learning_rate": 7.790591032608695e-05, "loss": 0.4233, "step": 6235 }, { "epoch": 0.976205385097057, "grad_norm": 2.129638433456421, "learning_rate": 7.789402173913043e-05, "loss": 0.9708, "step": 6236 }, { "epoch": 0.9763619286161553, "grad_norm": 4.004551887512207, "learning_rate": 7.788213315217391e-05, "loss": 0.7619, "step": 6237 }, { "epoch": 0.9765184721352536, "grad_norm": 3.3113749027252197, "learning_rate": 7.787024456521739e-05, "loss": 1.3661, "step": 6238 }, { "epoch": 0.9766750156543519, "grad_norm": 2.7673356533050537, "learning_rate": 7.785835597826086e-05, "loss": 0.6993, "step": 6239 }, { "epoch": 0.9768315591734502, "grad_norm": 2.498518943786621, "learning_rate": 7.784646739130434e-05, "loss": 1.7911, "step": 6240 }, { "epoch": 0.9769881026925485, "grad_norm": 2.5352067947387695, "learning_rate": 7.783457880434782e-05, "loss": 0.9784, "step": 6241 }, { "epoch": 0.9771446462116469, "grad_norm": 1.9731851816177368, "learning_rate": 7.78226902173913e-05, "loss": 0.9081, "step": 6242 }, { "epoch": 0.9773011897307452, "grad_norm": 2.1407358646392822, "learning_rate": 7.781080163043478e-05, "loss": 1.0736, "step": 6243 }, { "epoch": 0.9774577332498434, "grad_norm": 3.1653034687042236, "learning_rate": 7.779891304347826e-05, "loss": 0.92, "step": 6244 }, { "epoch": 0.9776142767689417, "grad_norm": 1.3449409008026123, "learning_rate": 7.778702445652172e-05, "loss": 0.5506, "step": 6245 }, { "epoch": 0.9777708202880401, "grad_norm": 
2.599491596221924, "learning_rate": 7.77751358695652e-05, "loss": 1.0886, "step": 6246 }, { "epoch": 0.9779273638071384, "grad_norm": 4.216004371643066, "learning_rate": 7.776324728260868e-05, "loss": 1.3053, "step": 6247 }, { "epoch": 0.9780839073262367, "grad_norm": 3.7875475883483887, "learning_rate": 7.775135869565217e-05, "loss": 1.2004, "step": 6248 }, { "epoch": 0.978240450845335, "grad_norm": 3.0867724418640137, "learning_rate": 7.773947010869565e-05, "loss": 0.8003, "step": 6249 }, { "epoch": 0.9783969943644333, "grad_norm": 3.6429240703582764, "learning_rate": 7.772758152173912e-05, "loss": 1.302, "step": 6250 }, { "epoch": 0.9785535378835316, "grad_norm": 0.6485228538513184, "learning_rate": 7.77156929347826e-05, "loss": 0.3001, "step": 6251 }, { "epoch": 0.9787100814026299, "grad_norm": 0.5136064291000366, "learning_rate": 7.770380434782608e-05, "loss": 0.2676, "step": 6252 }, { "epoch": 0.9788666249217283, "grad_norm": 0.5993520021438599, "learning_rate": 7.769191576086956e-05, "loss": 0.2569, "step": 6253 }, { "epoch": 0.9790231684408266, "grad_norm": 0.3628522455692291, "learning_rate": 7.768002717391304e-05, "loss": 0.1825, "step": 6254 }, { "epoch": 0.9791797119599248, "grad_norm": 0.6104713082313538, "learning_rate": 7.766813858695652e-05, "loss": 0.2406, "step": 6255 }, { "epoch": 0.9793362554790231, "grad_norm": 0.5523651838302612, "learning_rate": 7.765625e-05, "loss": 0.2331, "step": 6256 }, { "epoch": 0.9794927989981215, "grad_norm": 0.9539133906364441, "learning_rate": 7.764436141304347e-05, "loss": 0.236, "step": 6257 }, { "epoch": 0.9796493425172198, "grad_norm": 0.9070444107055664, "learning_rate": 7.763247282608695e-05, "loss": 0.3911, "step": 6258 }, { "epoch": 0.9798058860363181, "grad_norm": 0.8831202387809753, "learning_rate": 7.762058423913043e-05, "loss": 0.2246, "step": 6259 }, { "epoch": 0.9799624295554165, "grad_norm": 1.2289154529571533, "learning_rate": 7.76086956521739e-05, "loss": 0.4191, "step": 6260 }, { "epoch": 
0.9801189730745147, "grad_norm": 0.8633725643157959, "learning_rate": 7.759680706521738e-05, "loss": 0.3285, "step": 6261 }, { "epoch": 0.980275516593613, "grad_norm": 0.843951404094696, "learning_rate": 7.758491847826086e-05, "loss": 0.2802, "step": 6262 }, { "epoch": 0.9804320601127113, "grad_norm": 1.1072437763214111, "learning_rate": 7.757302989130434e-05, "loss": 0.3745, "step": 6263 }, { "epoch": 0.9805886036318097, "grad_norm": 1.1991252899169922, "learning_rate": 7.756114130434782e-05, "loss": 0.4553, "step": 6264 }, { "epoch": 0.980745147150908, "grad_norm": 0.928307056427002, "learning_rate": 7.75492527173913e-05, "loss": 0.4102, "step": 6265 }, { "epoch": 0.9809016906700062, "grad_norm": 1.6666730642318726, "learning_rate": 7.753736413043479e-05, "loss": 0.394, "step": 6266 }, { "epoch": 0.9810582341891045, "grad_norm": 1.0673190355300903, "learning_rate": 7.752547554347827e-05, "loss": 0.4396, "step": 6267 }, { "epoch": 0.9812147777082029, "grad_norm": 1.5239180326461792, "learning_rate": 7.751358695652173e-05, "loss": 0.468, "step": 6268 }, { "epoch": 0.9813713212273012, "grad_norm": 1.6975862979888916, "learning_rate": 7.750169836956521e-05, "loss": 0.6133, "step": 6269 }, { "epoch": 0.9815278647463995, "grad_norm": 1.4703673124313354, "learning_rate": 7.748980978260869e-05, "loss": 0.5516, "step": 6270 }, { "epoch": 0.9816844082654979, "grad_norm": 1.4117891788482666, "learning_rate": 7.747792119565217e-05, "loss": 0.3468, "step": 6271 }, { "epoch": 0.9818409517845961, "grad_norm": 3.008951187133789, "learning_rate": 7.746603260869564e-05, "loss": 0.793, "step": 6272 }, { "epoch": 0.9819974953036944, "grad_norm": 2.0892045497894287, "learning_rate": 7.745414402173912e-05, "loss": 0.3913, "step": 6273 }, { "epoch": 0.9821540388227927, "grad_norm": 5.540923118591309, "learning_rate": 7.74422554347826e-05, "loss": 0.6095, "step": 6274 }, { "epoch": 0.9823105823418911, "grad_norm": 1.8689546585083008, "learning_rate": 7.743036684782608e-05, "loss": 
0.5582, "step": 6275 }, { "epoch": 0.9824671258609894, "grad_norm": 2.822338104248047, "learning_rate": 7.741847826086956e-05, "loss": 0.6713, "step": 6276 }, { "epoch": 0.9826236693800877, "grad_norm": 2.1027066707611084, "learning_rate": 7.740658967391303e-05, "loss": 0.8201, "step": 6277 }, { "epoch": 0.9827802128991859, "grad_norm": 3.328375816345215, "learning_rate": 7.739470108695651e-05, "loss": 0.8323, "step": 6278 }, { "epoch": 0.9829367564182843, "grad_norm": 4.054839611053467, "learning_rate": 7.738281249999999e-05, "loss": 0.5298, "step": 6279 }, { "epoch": 0.9830932999373826, "grad_norm": 1.4086027145385742, "learning_rate": 7.737092391304347e-05, "loss": 0.5926, "step": 6280 }, { "epoch": 0.9832498434564809, "grad_norm": 2.3915324211120605, "learning_rate": 7.735903532608695e-05, "loss": 1.0081, "step": 6281 }, { "epoch": 0.9834063869755792, "grad_norm": 3.3088927268981934, "learning_rate": 7.734714673913043e-05, "loss": 0.4569, "step": 6282 }, { "epoch": 0.9835629304946775, "grad_norm": 2.524566888809204, "learning_rate": 7.73352581521739e-05, "loss": 0.6755, "step": 6283 }, { "epoch": 0.9837194740137758, "grad_norm": 3.3401291370391846, "learning_rate": 7.732336956521738e-05, "loss": 0.8431, "step": 6284 }, { "epoch": 0.9838760175328741, "grad_norm": 2.52885103225708, "learning_rate": 7.731148097826086e-05, "loss": 0.937, "step": 6285 }, { "epoch": 0.9840325610519725, "grad_norm": 2.560624599456787, "learning_rate": 7.729959239130435e-05, "loss": 0.8152, "step": 6286 }, { "epoch": 0.9841891045710708, "grad_norm": 2.228534698486328, "learning_rate": 7.728770380434783e-05, "loss": 0.8895, "step": 6287 }, { "epoch": 0.9843456480901691, "grad_norm": 3.191978931427002, "learning_rate": 7.727581521739131e-05, "loss": 0.4964, "step": 6288 }, { "epoch": 0.9845021916092673, "grad_norm": 2.7139172554016113, "learning_rate": 7.726392663043479e-05, "loss": 0.9824, "step": 6289 }, { "epoch": 0.9846587351283657, "grad_norm": 2.5267953872680664, "learning_rate": 
7.725203804347826e-05, "loss": 1.281, "step": 6290 }, { "epoch": 0.984815278647464, "grad_norm": 2.5963995456695557, "learning_rate": 7.724014945652173e-05, "loss": 1.1824, "step": 6291 }, { "epoch": 0.9849718221665623, "grad_norm": 2.6494646072387695, "learning_rate": 7.722826086956521e-05, "loss": 0.984, "step": 6292 }, { "epoch": 0.9851283656856606, "grad_norm": 2.7027342319488525, "learning_rate": 7.721637228260869e-05, "loss": 0.6219, "step": 6293 }, { "epoch": 0.985284909204759, "grad_norm": 6.453477382659912, "learning_rate": 7.720448369565216e-05, "loss": 1.1792, "step": 6294 }, { "epoch": 0.9854414527238572, "grad_norm": 4.777997970581055, "learning_rate": 7.719259510869564e-05, "loss": 1.401, "step": 6295 }, { "epoch": 0.9855979962429555, "grad_norm": 6.57703161239624, "learning_rate": 7.718070652173912e-05, "loss": 0.7891, "step": 6296 }, { "epoch": 0.9857545397620538, "grad_norm": 6.22396183013916, "learning_rate": 7.71688179347826e-05, "loss": 0.7095, "step": 6297 }, { "epoch": 0.9859110832811522, "grad_norm": 4.309599876403809, "learning_rate": 7.715692934782608e-05, "loss": 0.9471, "step": 6298 }, { "epoch": 0.9860676268002505, "grad_norm": 1.7303317785263062, "learning_rate": 7.714504076086955e-05, "loss": 0.3453, "step": 6299 }, { "epoch": 0.9862241703193487, "grad_norm": 1.9023922681808472, "learning_rate": 7.713315217391303e-05, "loss": 0.5027, "step": 6300 }, { "epoch": 0.986380713838447, "grad_norm": 0.5261631608009338, "learning_rate": 7.712126358695651e-05, "loss": 0.3373, "step": 6301 }, { "epoch": 0.9865372573575454, "grad_norm": 0.6902672052383423, "learning_rate": 7.710937499999999e-05, "loss": 0.2726, "step": 6302 }, { "epoch": 0.9866938008766437, "grad_norm": 0.607015073299408, "learning_rate": 7.709748641304347e-05, "loss": 0.314, "step": 6303 }, { "epoch": 0.986850344395742, "grad_norm": 0.4140775203704834, "learning_rate": 7.708559782608695e-05, "loss": 0.2427, "step": 6304 }, { "epoch": 0.9870068879148404, "grad_norm": 
0.46970629692077637, "learning_rate": 7.707370923913044e-05, "loss": 0.1816, "step": 6305 }, { "epoch": 0.9871634314339386, "grad_norm": 1.2084412574768066, "learning_rate": 7.706182065217391e-05, "loss": 0.3306, "step": 6306 }, { "epoch": 0.9873199749530369, "grad_norm": 0.795584499835968, "learning_rate": 7.704993206521739e-05, "loss": 0.4177, "step": 6307 }, { "epoch": 0.9874765184721352, "grad_norm": 0.6087934970855713, "learning_rate": 7.703804347826087e-05, "loss": 0.2625, "step": 6308 }, { "epoch": 0.9876330619912336, "grad_norm": 0.8638258576393127, "learning_rate": 7.702615489130435e-05, "loss": 0.3222, "step": 6309 }, { "epoch": 0.9877896055103319, "grad_norm": 1.0475910902023315, "learning_rate": 7.701426630434783e-05, "loss": 0.3088, "step": 6310 }, { "epoch": 0.9879461490294302, "grad_norm": 0.8402711749076843, "learning_rate": 7.70023777173913e-05, "loss": 0.2755, "step": 6311 }, { "epoch": 0.9881026925485284, "grad_norm": 2.9019362926483154, "learning_rate": 7.699048913043478e-05, "loss": 0.5002, "step": 6312 }, { "epoch": 0.9882592360676268, "grad_norm": 0.9871068596839905, "learning_rate": 7.697860054347826e-05, "loss": 0.391, "step": 6313 }, { "epoch": 0.9884157795867251, "grad_norm": 0.9312556385993958, "learning_rate": 7.696671195652173e-05, "loss": 0.4883, "step": 6314 }, { "epoch": 0.9885723231058234, "grad_norm": 1.1027108430862427, "learning_rate": 7.69548233695652e-05, "loss": 0.418, "step": 6315 }, { "epoch": 0.9887288666249218, "grad_norm": 1.2418862581253052, "learning_rate": 7.694293478260868e-05, "loss": 0.5252, "step": 6316 }, { "epoch": 0.98888541014402, "grad_norm": 0.8051701784133911, "learning_rate": 7.693104619565216e-05, "loss": 0.2543, "step": 6317 }, { "epoch": 0.9890419536631183, "grad_norm": 4.758349895477295, "learning_rate": 7.691915760869564e-05, "loss": 1.078, "step": 6318 }, { "epoch": 0.9891984971822166, "grad_norm": 1.2540920972824097, "learning_rate": 7.690726902173912e-05, "loss": 0.5278, "step": 6319 }, { "epoch": 
0.989355040701315, "grad_norm": 1.333819031715393, "learning_rate": 7.68953804347826e-05, "loss": 0.3321, "step": 6320 }, { "epoch": 0.9895115842204133, "grad_norm": 1.354090690612793, "learning_rate": 7.688349184782607e-05, "loss": 0.3308, "step": 6321 }, { "epoch": 0.9896681277395116, "grad_norm": 3.114743947982788, "learning_rate": 7.687160326086955e-05, "loss": 0.8965, "step": 6322 }, { "epoch": 0.9898246712586098, "grad_norm": 1.1855286359786987, "learning_rate": 7.685971467391303e-05, "loss": 0.49, "step": 6323 }, { "epoch": 0.9899812147777082, "grad_norm": 2.4625539779663086, "learning_rate": 7.684782608695651e-05, "loss": 0.561, "step": 6324 }, { "epoch": 0.9901377582968065, "grad_norm": 2.029081106185913, "learning_rate": 7.68359375e-05, "loss": 0.5917, "step": 6325 }, { "epoch": 0.9902943018159048, "grad_norm": 2.4051315784454346, "learning_rate": 7.682404891304348e-05, "loss": 0.4351, "step": 6326 }, { "epoch": 0.9904508453350032, "grad_norm": 1.6094017028808594, "learning_rate": 7.681216032608696e-05, "loss": 0.5486, "step": 6327 }, { "epoch": 0.9906073888541015, "grad_norm": 1.9165980815887451, "learning_rate": 7.680027173913043e-05, "loss": 0.6986, "step": 6328 }, { "epoch": 0.9907639323731997, "grad_norm": 3.3021066188812256, "learning_rate": 7.678838315217391e-05, "loss": 0.6288, "step": 6329 }, { "epoch": 0.990920475892298, "grad_norm": 2.149843692779541, "learning_rate": 7.677649456521739e-05, "loss": 0.7077, "step": 6330 }, { "epoch": 0.9910770194113964, "grad_norm": 1.4441592693328857, "learning_rate": 7.676460597826087e-05, "loss": 0.373, "step": 6331 }, { "epoch": 0.9912335629304947, "grad_norm": 3.323961019515991, "learning_rate": 7.675271739130435e-05, "loss": 0.7695, "step": 6332 }, { "epoch": 0.991390106449593, "grad_norm": 3.136991500854492, "learning_rate": 7.674082880434783e-05, "loss": 0.9579, "step": 6333 }, { "epoch": 0.9915466499686914, "grad_norm": 1.0111290216445923, "learning_rate": 7.67289402173913e-05, "loss": 0.4378, "step": 
6334 }, { "epoch": 0.9917031934877896, "grad_norm": 2.039132595062256, "learning_rate": 7.671705163043478e-05, "loss": 0.7772, "step": 6335 }, { "epoch": 0.9918597370068879, "grad_norm": 2.5691778659820557, "learning_rate": 7.670516304347826e-05, "loss": 0.8589, "step": 6336 }, { "epoch": 0.9920162805259862, "grad_norm": 3.899750232696533, "learning_rate": 7.669327445652172e-05, "loss": 0.806, "step": 6337 }, { "epoch": 0.9921728240450846, "grad_norm": 3.2123732566833496, "learning_rate": 7.66813858695652e-05, "loss": 1.208, "step": 6338 }, { "epoch": 0.9923293675641829, "grad_norm": 2.8668837547302246, "learning_rate": 7.666949728260868e-05, "loss": 1.3482, "step": 6339 }, { "epoch": 0.9924859110832811, "grad_norm": 3.916611671447754, "learning_rate": 7.665760869565216e-05, "loss": 0.8786, "step": 6340 }, { "epoch": 0.9926424546023794, "grad_norm": 3.4708690643310547, "learning_rate": 7.664572010869564e-05, "loss": 1.0324, "step": 6341 }, { "epoch": 0.9927989981214778, "grad_norm": 2.6522860527038574, "learning_rate": 7.663383152173912e-05, "loss": 0.6626, "step": 6342 }, { "epoch": 0.9929555416405761, "grad_norm": 3.278883934020996, "learning_rate": 7.662194293478259e-05, "loss": 1.1472, "step": 6343 }, { "epoch": 0.9931120851596744, "grad_norm": 2.767601490020752, "learning_rate": 7.661005434782607e-05, "loss": 1.354, "step": 6344 }, { "epoch": 0.9932686286787727, "grad_norm": 2.6005399227142334, "learning_rate": 7.659816576086956e-05, "loss": 1.0358, "step": 6345 }, { "epoch": 0.993425172197871, "grad_norm": 2.4536871910095215, "learning_rate": 7.658627717391304e-05, "loss": 0.7261, "step": 6346 }, { "epoch": 0.9935817157169693, "grad_norm": 6.118600845336914, "learning_rate": 7.657438858695652e-05, "loss": 1.1911, "step": 6347 }, { "epoch": 0.9937382592360676, "grad_norm": 4.041357040405273, "learning_rate": 7.65625e-05, "loss": 0.3521, "step": 6348 }, { "epoch": 0.993894802755166, "grad_norm": 4.022017002105713, "learning_rate": 7.655061141304348e-05, "loss": 
1.0088, "step": 6349 }, { "epoch": 0.9940513462742643, "grad_norm": 4.330567359924316, "learning_rate": 7.653872282608695e-05, "loss": 1.5823, "step": 6350 }, { "epoch": 0.9942078897933626, "grad_norm": 0.6632595658302307, "learning_rate": 7.652683423913043e-05, "loss": 0.2478, "step": 6351 }, { "epoch": 0.9943644333124608, "grad_norm": 1.1715456247329712, "learning_rate": 7.651494565217391e-05, "loss": 0.3565, "step": 6352 }, { "epoch": 0.9945209768315592, "grad_norm": 0.6758922934532166, "learning_rate": 7.650305706521739e-05, "loss": 0.2755, "step": 6353 }, { "epoch": 0.9946775203506575, "grad_norm": 0.7245686650276184, "learning_rate": 7.649116847826087e-05, "loss": 0.2789, "step": 6354 }, { "epoch": 0.9948340638697558, "grad_norm": 0.9284975528717041, "learning_rate": 7.647927989130434e-05, "loss": 0.3021, "step": 6355 }, { "epoch": 0.9949906073888541, "grad_norm": 0.6697148680686951, "learning_rate": 7.646739130434782e-05, "loss": 0.3272, "step": 6356 }, { "epoch": 0.9951471509079524, "grad_norm": 1.020462155342102, "learning_rate": 7.64555027173913e-05, "loss": 0.2795, "step": 6357 }, { "epoch": 0.9953036944270507, "grad_norm": 0.6910146474838257, "learning_rate": 7.644361413043478e-05, "loss": 0.3021, "step": 6358 }, { "epoch": 0.995460237946149, "grad_norm": 0.844189465045929, "learning_rate": 7.643172554347826e-05, "loss": 0.3617, "step": 6359 }, { "epoch": 0.9956167814652473, "grad_norm": 1.1689993143081665, "learning_rate": 7.641983695652172e-05, "loss": 0.4303, "step": 6360 }, { "epoch": 0.9957733249843457, "grad_norm": 1.6506588459014893, "learning_rate": 7.64079483695652e-05, "loss": 0.524, "step": 6361 }, { "epoch": 0.995929868503444, "grad_norm": 1.4250555038452148, "learning_rate": 7.639605978260868e-05, "loss": 0.6165, "step": 6362 }, { "epoch": 0.9960864120225422, "grad_norm": 0.8762028217315674, "learning_rate": 7.638417119565216e-05, "loss": 0.4523, "step": 6363 }, { "epoch": 0.9962429555416406, "grad_norm": 1.549350619316101, "learning_rate": 
7.637228260869563e-05, "loss": 0.4435, "step": 6364 }, { "epoch": 0.9963994990607389, "grad_norm": 2.3429107666015625, "learning_rate": 7.636039402173913e-05, "loss": 0.6042, "step": 6365 }, { "epoch": 0.9965560425798372, "grad_norm": 2.14688777923584, "learning_rate": 7.63485054347826e-05, "loss": 0.2998, "step": 6366 }, { "epoch": 0.9967125860989355, "grad_norm": 2.147268533706665, "learning_rate": 7.633661684782608e-05, "loss": 0.4835, "step": 6367 }, { "epoch": 0.9968691296180339, "grad_norm": 2.2680180072784424, "learning_rate": 7.632472826086956e-05, "loss": 0.5081, "step": 6368 }, { "epoch": 0.9970256731371321, "grad_norm": 1.7378318309783936, "learning_rate": 7.631283967391304e-05, "loss": 0.5196, "step": 6369 }, { "epoch": 0.9971822166562304, "grad_norm": 2.8935177326202393, "learning_rate": 7.630095108695652e-05, "loss": 0.6659, "step": 6370 }, { "epoch": 0.9973387601753287, "grad_norm": 2.4998559951782227, "learning_rate": 7.62890625e-05, "loss": 0.5372, "step": 6371 }, { "epoch": 0.9974953036944271, "grad_norm": 2.8013362884521484, "learning_rate": 7.627717391304347e-05, "loss": 0.8928, "step": 6372 }, { "epoch": 0.9976518472135254, "grad_norm": 2.4261038303375244, "learning_rate": 7.626528532608695e-05, "loss": 0.9303, "step": 6373 }, { "epoch": 0.9978083907326236, "grad_norm": 2.0305798053741455, "learning_rate": 7.625339673913043e-05, "loss": 1.0551, "step": 6374 }, { "epoch": 0.997964934251722, "grad_norm": 2.4422719478607178, "learning_rate": 7.624150815217391e-05, "loss": 0.7331, "step": 6375 }, { "epoch": 0.9981214777708203, "grad_norm": 3.933943510055542, "learning_rate": 7.622961956521739e-05, "loss": 0.8755, "step": 6376 }, { "epoch": 0.9982780212899186, "grad_norm": 2.2634928226470947, "learning_rate": 7.621773097826086e-05, "loss": 1.0599, "step": 6377 }, { "epoch": 0.9984345648090169, "grad_norm": 3.2617249488830566, "learning_rate": 7.620584239130434e-05, "loss": 0.6112, "step": 6378 }, { "epoch": 0.9985911083281153, "grad_norm": 
2.343459129333496, "learning_rate": 7.619395380434782e-05, "loss": 0.7943, "step": 6379 }, { "epoch": 0.9987476518472135, "grad_norm": 2.339581251144409, "learning_rate": 7.61820652173913e-05, "loss": 0.9265, "step": 6380 }, { "epoch": 0.9989041953663118, "grad_norm": 4.236205577850342, "learning_rate": 7.617017663043478e-05, "loss": 1.6129, "step": 6381 }, { "epoch": 0.9990607388854101, "grad_norm": 3.141669750213623, "learning_rate": 7.615828804347827e-05, "loss": 0.8349, "step": 6382 }, { "epoch": 0.9992172824045085, "grad_norm": 2.887883424758911, "learning_rate": 7.614639945652172e-05, "loss": 1.361, "step": 6383 }, { "epoch": 0.9993738259236068, "grad_norm": 1.7991834878921509, "learning_rate": 7.61345108695652e-05, "loss": 1.0732, "step": 6384 }, { "epoch": 0.9995303694427051, "grad_norm": 1.895387887954712, "learning_rate": 7.612262228260869e-05, "loss": 0.5491, "step": 6385 }, { "epoch": 0.9996869129618033, "grad_norm": 2.371849775314331, "learning_rate": 7.611073369565217e-05, "loss": 1.0694, "step": 6386 }, { "epoch": 0.9998434564809017, "grad_norm": 3.4722578525543213, "learning_rate": 7.609884510869565e-05, "loss": 1.4887, "step": 6387 }, { "epoch": 1.0, "grad_norm": 4.553646564483643, "learning_rate": 7.608695652173912e-05, "loss": 1.2063, "step": 6388 }, { "epoch": 1.0001565435190982, "grad_norm": 0.6001192927360535, "learning_rate": 7.60750679347826e-05, "loss": 0.3039, "step": 6389 }, { "epoch": 1.0003130870381967, "grad_norm": 0.4178409278392792, "learning_rate": 7.606317934782608e-05, "loss": 0.2128, "step": 6390 }, { "epoch": 1.0004696305572949, "grad_norm": 0.4665043354034424, "learning_rate": 7.605129076086956e-05, "loss": 0.2021, "step": 6391 }, { "epoch": 1.0006261740763933, "grad_norm": 0.6831966638565063, "learning_rate": 7.603940217391304e-05, "loss": 0.2721, "step": 6392 }, { "epoch": 1.0007827175954915, "grad_norm": 0.5500322580337524, "learning_rate": 7.602751358695651e-05, "loss": 0.2361, "step": 6393 }, { "epoch": 1.0009392611145898, 
"grad_norm": 0.6699058413505554, "learning_rate": 7.601562499999999e-05, "loss": 0.2524, "step": 6394 }, { "epoch": 1.0010958046336882, "grad_norm": 0.7705897092819214, "learning_rate": 7.600373641304347e-05, "loss": 0.3373, "step": 6395 }, { "epoch": 1.0012523481527864, "grad_norm": 0.6969059705734253, "learning_rate": 7.599184782608695e-05, "loss": 0.2538, "step": 6396 }, { "epoch": 1.0014088916718848, "grad_norm": 0.672387421131134, "learning_rate": 7.597995923913043e-05, "loss": 0.3028, "step": 6397 }, { "epoch": 1.001565435190983, "grad_norm": 0.4394746422767639, "learning_rate": 7.59680706521739e-05, "loss": 0.1555, "step": 6398 }, { "epoch": 1.0017219787100815, "grad_norm": 0.8397753834724426, "learning_rate": 7.595618206521738e-05, "loss": 0.3809, "step": 6399 }, { "epoch": 1.0018785222291797, "grad_norm": 0.6694694757461548, "learning_rate": 7.594429347826086e-05, "loss": 0.2888, "step": 6400 }, { "epoch": 1.002035065748278, "grad_norm": 1.0034270286560059, "learning_rate": 7.593240489130434e-05, "loss": 0.4026, "step": 6401 }, { "epoch": 1.0021916092673764, "grad_norm": 0.9187131524085999, "learning_rate": 7.592051630434783e-05, "loss": 0.4519, "step": 6402 }, { "epoch": 1.0023481527864746, "grad_norm": 1.2549426555633545, "learning_rate": 7.590862771739131e-05, "loss": 0.3526, "step": 6403 }, { "epoch": 1.002504696305573, "grad_norm": 1.6785026788711548, "learning_rate": 7.589673913043479e-05, "loss": 0.402, "step": 6404 }, { "epoch": 1.0026612398246713, "grad_norm": 2.0456597805023193, "learning_rate": 7.588485054347827e-05, "loss": 0.4025, "step": 6405 }, { "epoch": 1.0028177833437695, "grad_norm": 1.1898757219314575, "learning_rate": 7.587296195652173e-05, "loss": 0.4552, "step": 6406 }, { "epoch": 1.002974326862868, "grad_norm": 1.5748989582061768, "learning_rate": 7.586107336956521e-05, "loss": 0.4277, "step": 6407 }, { "epoch": 1.0031308703819661, "grad_norm": 0.8402990102767944, "learning_rate": 7.584918478260869e-05, "loss": 0.2448, "step": 6408 
}, { "epoch": 1.0032874139010646, "grad_norm": 1.2277874946594238, "learning_rate": 7.583729619565217e-05, "loss": 0.3407, "step": 6409 }, { "epoch": 1.0034439574201628, "grad_norm": 1.482352614402771, "learning_rate": 7.582540760869564e-05, "loss": 0.3994, "step": 6410 }, { "epoch": 1.003600500939261, "grad_norm": 2.4624292850494385, "learning_rate": 7.581351902173912e-05, "loss": 0.342, "step": 6411 }, { "epoch": 1.0037570444583594, "grad_norm": 1.6444847583770752, "learning_rate": 7.58016304347826e-05, "loss": 0.3328, "step": 6412 }, { "epoch": 1.0039135879774577, "grad_norm": 1.6734468936920166, "learning_rate": 7.578974184782608e-05, "loss": 0.6024, "step": 6413 }, { "epoch": 1.004070131496556, "grad_norm": 1.70050847530365, "learning_rate": 7.577785326086956e-05, "loss": 0.44, "step": 6414 }, { "epoch": 1.0042266750156543, "grad_norm": 2.8348610401153564, "learning_rate": 7.576596467391303e-05, "loss": 0.7743, "step": 6415 }, { "epoch": 1.0043832185347528, "grad_norm": 1.8026468753814697, "learning_rate": 7.575407608695651e-05, "loss": 0.518, "step": 6416 }, { "epoch": 1.004539762053851, "grad_norm": 1.7979260683059692, "learning_rate": 7.574218749999999e-05, "loss": 0.58, "step": 6417 }, { "epoch": 1.0046963055729492, "grad_norm": 2.3760693073272705, "learning_rate": 7.573029891304347e-05, "loss": 0.8129, "step": 6418 }, { "epoch": 1.0048528490920476, "grad_norm": 3.341911792755127, "learning_rate": 7.571841032608695e-05, "loss": 0.7442, "step": 6419 }, { "epoch": 1.0050093926111459, "grad_norm": 2.9365475177764893, "learning_rate": 7.570652173913042e-05, "loss": 1.2944, "step": 6420 }, { "epoch": 1.0051659361302443, "grad_norm": 2.231358289718628, "learning_rate": 7.56946331521739e-05, "loss": 0.7099, "step": 6421 }, { "epoch": 1.0053224796493425, "grad_norm": 3.8128771781921387, "learning_rate": 7.56827445652174e-05, "loss": 1.1554, "step": 6422 }, { "epoch": 1.0054790231684407, "grad_norm": 2.4171152114868164, "learning_rate": 7.567085597826087e-05, 
"loss": 0.6582, "step": 6423 }, { "epoch": 1.0056355666875392, "grad_norm": 2.3684074878692627, "learning_rate": 7.565896739130435e-05, "loss": 0.9098, "step": 6424 }, { "epoch": 1.0057921102066374, "grad_norm": 2.75146222114563, "learning_rate": 7.564707880434783e-05, "loss": 0.6007, "step": 6425 }, { "epoch": 1.0059486537257358, "grad_norm": 1.7537280321121216, "learning_rate": 7.563519021739131e-05, "loss": 1.0167, "step": 6426 }, { "epoch": 1.006105197244834, "grad_norm": 4.291356086730957, "learning_rate": 7.562330163043479e-05, "loss": 1.3665, "step": 6427 }, { "epoch": 1.0062617407639323, "grad_norm": 1.9886155128479004, "learning_rate": 7.561141304347826e-05, "loss": 0.9156, "step": 6428 }, { "epoch": 1.0064182842830307, "grad_norm": 4.117796421051025, "learning_rate": 7.559952445652173e-05, "loss": 1.4701, "step": 6429 }, { "epoch": 1.006574827802129, "grad_norm": 3.7817461490631104, "learning_rate": 7.55876358695652e-05, "loss": 1.2271, "step": 6430 }, { "epoch": 1.0067313713212274, "grad_norm": 2.7326035499572754, "learning_rate": 7.557574728260868e-05, "loss": 1.6426, "step": 6431 }, { "epoch": 1.0068879148403256, "grad_norm": 19.162715911865234, "learning_rate": 7.556385869565216e-05, "loss": 1.4853, "step": 6432 }, { "epoch": 1.007044458359424, "grad_norm": 1.8472074270248413, "learning_rate": 7.555197010869564e-05, "loss": 1.0408, "step": 6433 }, { "epoch": 1.0072010018785222, "grad_norm": 3.577080726623535, "learning_rate": 7.554008152173912e-05, "loss": 0.9826, "step": 6434 }, { "epoch": 1.0073575453976205, "grad_norm": 1.4635287523269653, "learning_rate": 7.55281929347826e-05, "loss": 0.4099, "step": 6435 }, { "epoch": 1.007514088916719, "grad_norm": 1.3778190612792969, "learning_rate": 7.551630434782608e-05, "loss": 0.5893, "step": 6436 }, { "epoch": 1.0076706324358171, "grad_norm": 6.350305080413818, "learning_rate": 7.550441576086955e-05, "loss": 0.7736, "step": 6437 }, { "epoch": 1.0078271759549156, "grad_norm": 4.446704387664795, 
"learning_rate": 7.549252717391303e-05, "loss": 1.5658, "step": 6438 }, { "epoch": 1.0079837194740138, "grad_norm": 0.4691751301288605, "learning_rate": 7.548063858695651e-05, "loss": 0.2181, "step": 6439 }, { "epoch": 1.008140262993112, "grad_norm": 0.6122845411300659, "learning_rate": 7.546874999999999e-05, "loss": 0.2158, "step": 6440 }, { "epoch": 1.0082968065122104, "grad_norm": 0.567440390586853, "learning_rate": 7.545686141304347e-05, "loss": 0.1861, "step": 6441 }, { "epoch": 1.0084533500313086, "grad_norm": 0.44533684849739075, "learning_rate": 7.544497282608696e-05, "loss": 0.1717, "step": 6442 }, { "epoch": 1.008609893550407, "grad_norm": 0.3669249713420868, "learning_rate": 7.543308423913044e-05, "loss": 0.151, "step": 6443 }, { "epoch": 1.0087664370695053, "grad_norm": 0.42615944147109985, "learning_rate": 7.542119565217391e-05, "loss": 0.2197, "step": 6444 }, { "epoch": 1.0089229805886035, "grad_norm": 0.6324515342712402, "learning_rate": 7.540930706521739e-05, "loss": 0.1968, "step": 6445 }, { "epoch": 1.009079524107702, "grad_norm": 0.4388452470302582, "learning_rate": 7.539741847826087e-05, "loss": 0.23, "step": 6446 }, { "epoch": 1.0092360676268002, "grad_norm": 0.5916063785552979, "learning_rate": 7.538552989130435e-05, "loss": 0.2174, "step": 6447 }, { "epoch": 1.0093926111458986, "grad_norm": 0.8190235495567322, "learning_rate": 7.537364130434783e-05, "loss": 0.3226, "step": 6448 }, { "epoch": 1.0095491546649968, "grad_norm": 0.7548123002052307, "learning_rate": 7.53617527173913e-05, "loss": 0.2475, "step": 6449 }, { "epoch": 1.0097056981840953, "grad_norm": 1.2669299840927124, "learning_rate": 7.534986413043478e-05, "loss": 0.3083, "step": 6450 }, { "epoch": 1.0098622417031935, "grad_norm": 1.2313741445541382, "learning_rate": 7.533797554347826e-05, "loss": 0.316, "step": 6451 }, { "epoch": 1.0100187852222917, "grad_norm": 1.8489904403686523, "learning_rate": 7.532608695652173e-05, "loss": 0.4896, "step": 6452 }, { "epoch": 1.0101753287413902, 
"grad_norm": 0.6738482117652893, "learning_rate": 7.53141983695652e-05, "loss": 0.2647, "step": 6453 }, { "epoch": 1.0103318722604884, "grad_norm": 1.7635385990142822, "learning_rate": 7.530230978260868e-05, "loss": 0.4676, "step": 6454 }, { "epoch": 1.0104884157795868, "grad_norm": 1.1622222661972046, "learning_rate": 7.529042119565216e-05, "loss": 0.4658, "step": 6455 }, { "epoch": 1.010644959298685, "grad_norm": 1.9762064218521118, "learning_rate": 7.527853260869564e-05, "loss": 0.5199, "step": 6456 }, { "epoch": 1.0108015028177832, "grad_norm": 1.5878314971923828, "learning_rate": 7.526664402173912e-05, "loss": 0.4417, "step": 6457 }, { "epoch": 1.0109580463368817, "grad_norm": 0.9727537035942078, "learning_rate": 7.52547554347826e-05, "loss": 0.3808, "step": 6458 }, { "epoch": 1.01111458985598, "grad_norm": 1.0449767112731934, "learning_rate": 7.524286684782607e-05, "loss": 0.2539, "step": 6459 }, { "epoch": 1.0112711333750783, "grad_norm": 1.878714680671692, "learning_rate": 7.523097826086955e-05, "loss": 0.4982, "step": 6460 }, { "epoch": 1.0114276768941766, "grad_norm": 1.2134568691253662, "learning_rate": 7.521908967391303e-05, "loss": 0.536, "step": 6461 }, { "epoch": 1.0115842204132748, "grad_norm": 1.563976764678955, "learning_rate": 7.520720108695652e-05, "loss": 0.4837, "step": 6462 }, { "epoch": 1.0117407639323732, "grad_norm": 2.2560715675354004, "learning_rate": 7.51953125e-05, "loss": 0.3629, "step": 6463 }, { "epoch": 1.0118973074514714, "grad_norm": 1.1623791456222534, "learning_rate": 7.518342391304348e-05, "loss": 0.4423, "step": 6464 }, { "epoch": 1.0120538509705699, "grad_norm": 1.7022820711135864, "learning_rate": 7.517153532608696e-05, "loss": 0.6643, "step": 6465 }, { "epoch": 1.012210394489668, "grad_norm": 4.928578853607178, "learning_rate": 7.515964673913043e-05, "loss": 0.9989, "step": 6466 }, { "epoch": 1.0123669380087665, "grad_norm": 2.550072193145752, "learning_rate": 7.514775815217391e-05, "loss": 0.6056, "step": 6467 }, { 
"epoch": 1.0125234815278648, "grad_norm": 2.095518112182617, "learning_rate": 7.513586956521739e-05, "loss": 0.9581, "step": 6468 }, { "epoch": 1.012680025046963, "grad_norm": 2.1749167442321777, "learning_rate": 7.512398097826087e-05, "loss": 0.6624, "step": 6469 }, { "epoch": 1.0128365685660614, "grad_norm": 2.161616563796997, "learning_rate": 7.511209239130435e-05, "loss": 0.9275, "step": 6470 }, { "epoch": 1.0129931120851596, "grad_norm": 2.711392402648926, "learning_rate": 7.510020380434782e-05, "loss": 0.8749, "step": 6471 }, { "epoch": 1.013149655604258, "grad_norm": 2.3660900592803955, "learning_rate": 7.50883152173913e-05, "loss": 0.8154, "step": 6472 }, { "epoch": 1.0133061991233563, "grad_norm": 2.2174079418182373, "learning_rate": 7.507642663043478e-05, "loss": 0.7318, "step": 6473 }, { "epoch": 1.0134627426424545, "grad_norm": 2.5282206535339355, "learning_rate": 7.506453804347826e-05, "loss": 1.073, "step": 6474 }, { "epoch": 1.013619286161553, "grad_norm": 3.7436554431915283, "learning_rate": 7.505264945652172e-05, "loss": 1.0534, "step": 6475 }, { "epoch": 1.0137758296806512, "grad_norm": 3.4530110359191895, "learning_rate": 7.50407608695652e-05, "loss": 0.9144, "step": 6476 }, { "epoch": 1.0139323731997496, "grad_norm": 2.4044699668884277, "learning_rate": 7.502887228260868e-05, "loss": 0.7184, "step": 6477 }, { "epoch": 1.0140889167188478, "grad_norm": 2.417465925216675, "learning_rate": 7.501698369565216e-05, "loss": 1.643, "step": 6478 }, { "epoch": 1.014245460237946, "grad_norm": 2.391596794128418, "learning_rate": 7.500509510869564e-05, "loss": 0.7243, "step": 6479 }, { "epoch": 1.0144020037570445, "grad_norm": 2.1569626331329346, "learning_rate": 7.499320652173911e-05, "loss": 1.2422, "step": 6480 }, { "epoch": 1.0145585472761427, "grad_norm": 2.4277703762054443, "learning_rate": 7.498131793478259e-05, "loss": 1.3122, "step": 6481 }, { "epoch": 1.0147150907952411, "grad_norm": 2.2818732261657715, "learning_rate": 7.496942934782608e-05, 
"loss": 1.1466, "step": 6482 }, { "epoch": 1.0148716343143394, "grad_norm": 1.6942583322525024, "learning_rate": 7.495754076086956e-05, "loss": 0.4944, "step": 6483 }, { "epoch": 1.0150281778334378, "grad_norm": 3.959101676940918, "learning_rate": 7.494565217391304e-05, "loss": 0.5181, "step": 6484 }, { "epoch": 1.015184721352536, "grad_norm": 2.390839099884033, "learning_rate": 7.493376358695652e-05, "loss": 0.4375, "step": 6485 }, { "epoch": 1.0153412648716342, "grad_norm": 2.5121917724609375, "learning_rate": 7.4921875e-05, "loss": 0.8835, "step": 6486 }, { "epoch": 1.0154978083907327, "grad_norm": 2.3958444595336914, "learning_rate": 7.490998641304347e-05, "loss": 0.5208, "step": 6487 }, { "epoch": 1.0156543519098309, "grad_norm": 3.532904624938965, "learning_rate": 7.489809782608695e-05, "loss": 1.0215, "step": 6488 }, { "epoch": 1.0158108954289293, "grad_norm": 0.530795693397522, "learning_rate": 7.488620923913043e-05, "loss": 0.2748, "step": 6489 }, { "epoch": 1.0159674389480275, "grad_norm": 0.3939019441604614, "learning_rate": 7.487432065217391e-05, "loss": 0.1879, "step": 6490 }, { "epoch": 1.0161239824671258, "grad_norm": 0.4133587181568146, "learning_rate": 7.486243206521739e-05, "loss": 0.1798, "step": 6491 }, { "epoch": 1.0162805259862242, "grad_norm": 0.8522438406944275, "learning_rate": 7.485054347826087e-05, "loss": 0.3503, "step": 6492 }, { "epoch": 1.0164370695053224, "grad_norm": 0.7988050580024719, "learning_rate": 7.483865489130434e-05, "loss": 0.1788, "step": 6493 }, { "epoch": 1.0165936130244209, "grad_norm": 0.8898146152496338, "learning_rate": 7.482676630434782e-05, "loss": 0.1766, "step": 6494 }, { "epoch": 1.016750156543519, "grad_norm": 1.2646383047103882, "learning_rate": 7.48148777173913e-05, "loss": 0.3403, "step": 6495 }, { "epoch": 1.0169067000626173, "grad_norm": 1.1363064050674438, "learning_rate": 7.480298913043478e-05, "loss": 0.3767, "step": 6496 }, { "epoch": 1.0170632435817157, "grad_norm": 0.4901716709136963, 
"learning_rate": 7.479110054347826e-05, "loss": 0.226, "step": 6497 }, { "epoch": 1.017219787100814, "grad_norm": 0.7146872878074646, "learning_rate": 7.477921195652172e-05, "loss": 0.2613, "step": 6498 }, { "epoch": 1.0173763306199124, "grad_norm": 1.2413454055786133, "learning_rate": 7.47673233695652e-05, "loss": 0.2978, "step": 6499 }, { "epoch": 1.0175328741390106, "grad_norm": 3.5597922801971436, "learning_rate": 7.475543478260868e-05, "loss": 1.0911, "step": 6500 }, { "epoch": 1.017689417658109, "grad_norm": 1.024419903755188, "learning_rate": 7.474354619565216e-05, "loss": 0.2796, "step": 6501 }, { "epoch": 1.0178459611772073, "grad_norm": 1.92246675491333, "learning_rate": 7.473165760869565e-05, "loss": 0.5319, "step": 6502 }, { "epoch": 1.0180025046963055, "grad_norm": 2.332329511642456, "learning_rate": 7.471976902173913e-05, "loss": 0.3632, "step": 6503 }, { "epoch": 1.018159048215404, "grad_norm": 1.2345008850097656, "learning_rate": 7.47078804347826e-05, "loss": 0.4228, "step": 6504 }, { "epoch": 1.0183155917345021, "grad_norm": 3.274994134902954, "learning_rate": 7.469599184782608e-05, "loss": 0.3653, "step": 6505 }, { "epoch": 1.0184721352536006, "grad_norm": 1.1013717651367188, "learning_rate": 7.468410326086956e-05, "loss": 0.434, "step": 6506 }, { "epoch": 1.0186286787726988, "grad_norm": 1.3186479806900024, "learning_rate": 7.467221467391304e-05, "loss": 0.5261, "step": 6507 }, { "epoch": 1.018785222291797, "grad_norm": 0.970658004283905, "learning_rate": 7.466032608695652e-05, "loss": 0.2383, "step": 6508 }, { "epoch": 1.0189417658108955, "grad_norm": 1.5805773735046387, "learning_rate": 7.46484375e-05, "loss": 0.3825, "step": 6509 }, { "epoch": 1.0190983093299937, "grad_norm": 1.3062975406646729, "learning_rate": 7.463654891304347e-05, "loss": 0.4313, "step": 6510 }, { "epoch": 1.0192548528490921, "grad_norm": 3.119933843612671, "learning_rate": 7.462466032608695e-05, "loss": 0.5805, "step": 6511 }, { "epoch": 1.0194113963681903, "grad_norm": 
1.33110511302948, "learning_rate": 7.461277173913043e-05, "loss": 0.4892, "step": 6512 }, { "epoch": 1.0195679398872888, "grad_norm": 1.2369314432144165, "learning_rate": 7.460088315217391e-05, "loss": 0.3699, "step": 6513 }, { "epoch": 1.019724483406387, "grad_norm": 3.8935961723327637, "learning_rate": 7.458899456521738e-05, "loss": 1.5945, "step": 6514 }, { "epoch": 1.0198810269254852, "grad_norm": 1.7183773517608643, "learning_rate": 7.457710597826086e-05, "loss": 0.5625, "step": 6515 }, { "epoch": 1.0200375704445837, "grad_norm": 2.040800094604492, "learning_rate": 7.456521739130434e-05, "loss": 0.6876, "step": 6516 }, { "epoch": 1.0201941139636819, "grad_norm": 1.12918221950531, "learning_rate": 7.455332880434782e-05, "loss": 0.3491, "step": 6517 }, { "epoch": 1.0203506574827803, "grad_norm": 2.508899688720703, "learning_rate": 7.45414402173913e-05, "loss": 0.6808, "step": 6518 }, { "epoch": 1.0205072010018785, "grad_norm": 3.2516815662384033, "learning_rate": 7.452955163043479e-05, "loss": 0.9266, "step": 6519 }, { "epoch": 1.0206637445209767, "grad_norm": 1.9316941499710083, "learning_rate": 7.451766304347827e-05, "loss": 0.586, "step": 6520 }, { "epoch": 1.0208202880400752, "grad_norm": 2.0345582962036133, "learning_rate": 7.450577445652172e-05, "loss": 0.8896, "step": 6521 }, { "epoch": 1.0209768315591734, "grad_norm": 2.193934440612793, "learning_rate": 7.449388586956521e-05, "loss": 0.4009, "step": 6522 }, { "epoch": 1.0211333750782718, "grad_norm": 2.579660415649414, "learning_rate": 7.448199728260869e-05, "loss": 0.6667, "step": 6523 }, { "epoch": 1.02128991859737, "grad_norm": 3.2063286304473877, "learning_rate": 7.447010869565217e-05, "loss": 1.0517, "step": 6524 }, { "epoch": 1.0214464621164683, "grad_norm": 2.113741636276245, "learning_rate": 7.445822010869564e-05, "loss": 0.5844, "step": 6525 }, { "epoch": 1.0216030056355667, "grad_norm": 7.344755172729492, "learning_rate": 7.444633152173912e-05, "loss": 0.9708, "step": 6526 }, { "epoch": 
1.021759549154665, "grad_norm": 3.3634283542633057, "learning_rate": 7.44344429347826e-05, "loss": 1.1624, "step": 6527 }, { "epoch": 1.0219160926737634, "grad_norm": 4.631773471832275, "learning_rate": 7.442255434782608e-05, "loss": 0.953, "step": 6528 }, { "epoch": 1.0220726361928616, "grad_norm": 2.522754669189453, "learning_rate": 7.441066576086956e-05, "loss": 0.451, "step": 6529 }, { "epoch": 1.0222291797119598, "grad_norm": 5.221353530883789, "learning_rate": 7.439877717391304e-05, "loss": 2.0493, "step": 6530 }, { "epoch": 1.0223857232310583, "grad_norm": 5.680267810821533, "learning_rate": 7.438688858695651e-05, "loss": 1.3798, "step": 6531 }, { "epoch": 1.0225422667501565, "grad_norm": 5.642065525054932, "learning_rate": 7.437499999999999e-05, "loss": 2.3934, "step": 6532 }, { "epoch": 1.022698810269255, "grad_norm": 3.597320318222046, "learning_rate": 7.436311141304347e-05, "loss": 1.1679, "step": 6533 }, { "epoch": 1.0228553537883531, "grad_norm": 2.764183521270752, "learning_rate": 7.435122282608695e-05, "loss": 0.4771, "step": 6534 }, { "epoch": 1.0230118973074516, "grad_norm": 2.630017042160034, "learning_rate": 7.433933423913043e-05, "loss": 0.6356, "step": 6535 }, { "epoch": 1.0231684408265498, "grad_norm": 2.3017687797546387, "learning_rate": 7.43274456521739e-05, "loss": 0.3422, "step": 6536 }, { "epoch": 1.023324984345648, "grad_norm": 4.046975135803223, "learning_rate": 7.431555706521738e-05, "loss": 1.0657, "step": 6537 }, { "epoch": 1.0234815278647464, "grad_norm": 3.5612523555755615, "learning_rate": 7.430366847826086e-05, "loss": 1.1974, "step": 6538 }, { "epoch": 1.0236380713838447, "grad_norm": 1.0258066654205322, "learning_rate": 7.429177989130435e-05, "loss": 0.3171, "step": 6539 }, { "epoch": 1.023794614902943, "grad_norm": 0.8265370726585388, "learning_rate": 7.427989130434783e-05, "loss": 0.202, "step": 6540 }, { "epoch": 1.0239511584220413, "grad_norm": 0.7159648537635803, "learning_rate": 7.426800271739131e-05, "loss": 0.3532, 
"step": 6541 }, { "epoch": 1.0241077019411395, "grad_norm": 0.5166998505592346, "learning_rate": 7.425611413043479e-05, "loss": 0.2068, "step": 6542 }, { "epoch": 1.024264245460238, "grad_norm": 0.5321266055107117, "learning_rate": 7.424422554347827e-05, "loss": 0.2066, "step": 6543 }, { "epoch": 1.0244207889793362, "grad_norm": 0.43547409772872925, "learning_rate": 7.423233695652173e-05, "loss": 0.1508, "step": 6544 }, { "epoch": 1.0245773324984346, "grad_norm": 0.7692564129829407, "learning_rate": 7.422044836956521e-05, "loss": 0.2916, "step": 6545 }, { "epoch": 1.0247338760175329, "grad_norm": 0.5208930373191833, "learning_rate": 7.420855978260869e-05, "loss": 0.2205, "step": 6546 }, { "epoch": 1.0248904195366313, "grad_norm": 0.591747522354126, "learning_rate": 7.419667119565216e-05, "loss": 0.1535, "step": 6547 }, { "epoch": 1.0250469630557295, "grad_norm": 0.8271506428718567, "learning_rate": 7.418478260869564e-05, "loss": 0.2839, "step": 6548 }, { "epoch": 1.0252035065748277, "grad_norm": 1.0175583362579346, "learning_rate": 7.417289402173912e-05, "loss": 0.2301, "step": 6549 }, { "epoch": 1.0253600500939262, "grad_norm": 0.8600952625274658, "learning_rate": 7.41610054347826e-05, "loss": 0.2877, "step": 6550 }, { "epoch": 1.0255165936130244, "grad_norm": 0.8018500804901123, "learning_rate": 7.414911684782608e-05, "loss": 0.2903, "step": 6551 }, { "epoch": 1.0256731371321228, "grad_norm": 1.3432765007019043, "learning_rate": 7.413722826086955e-05, "loss": 0.2309, "step": 6552 }, { "epoch": 1.025829680651221, "grad_norm": 1.432913064956665, "learning_rate": 7.412533967391303e-05, "loss": 0.3055, "step": 6553 }, { "epoch": 1.0259862241703193, "grad_norm": 1.4969290494918823, "learning_rate": 7.411345108695651e-05, "loss": 0.2714, "step": 6554 }, { "epoch": 1.0261427676894177, "grad_norm": 2.2575843334198, "learning_rate": 7.410156249999999e-05, "loss": 0.7041, "step": 6555 }, { "epoch": 1.026299311208516, "grad_norm": 1.5013397932052612, "learning_rate": 
7.408967391304347e-05, "loss": 0.4507, "step": 6556 }, { "epoch": 1.0264558547276144, "grad_norm": 1.641891598701477, "learning_rate": 7.407778532608695e-05, "loss": 0.5029, "step": 6557 }, { "epoch": 1.0266123982467126, "grad_norm": 1.627663493156433, "learning_rate": 7.406589673913042e-05, "loss": 0.3212, "step": 6558 }, { "epoch": 1.0267689417658108, "grad_norm": 1.5232723951339722, "learning_rate": 7.405400815217392e-05, "loss": 0.2443, "step": 6559 }, { "epoch": 1.0269254852849092, "grad_norm": 3.626127243041992, "learning_rate": 7.40421195652174e-05, "loss": 0.608, "step": 6560 }, { "epoch": 1.0270820288040075, "grad_norm": 1.4446834325790405, "learning_rate": 7.403023097826087e-05, "loss": 0.5226, "step": 6561 }, { "epoch": 1.027238572323106, "grad_norm": 1.4086202383041382, "learning_rate": 7.401834239130435e-05, "loss": 0.4609, "step": 6562 }, { "epoch": 1.027395115842204, "grad_norm": 1.612732172012329, "learning_rate": 7.400645380434783e-05, "loss": 0.3843, "step": 6563 }, { "epoch": 1.0275516593613025, "grad_norm": 1.288953423500061, "learning_rate": 7.39945652173913e-05, "loss": 0.5596, "step": 6564 }, { "epoch": 1.0277082028804008, "grad_norm": 1.781461477279663, "learning_rate": 7.398267663043478e-05, "loss": 0.4535, "step": 6565 }, { "epoch": 1.027864746399499, "grad_norm": 1.2972556352615356, "learning_rate": 7.397078804347826e-05, "loss": 0.5715, "step": 6566 }, { "epoch": 1.0280212899185974, "grad_norm": 2.7533719539642334, "learning_rate": 7.395889945652173e-05, "loss": 0.9495, "step": 6567 }, { "epoch": 1.0281778334376956, "grad_norm": 2.003894805908203, "learning_rate": 7.39470108695652e-05, "loss": 0.8758, "step": 6568 }, { "epoch": 1.028334376956794, "grad_norm": 1.741528868675232, "learning_rate": 7.393512228260868e-05, "loss": 0.6945, "step": 6569 }, { "epoch": 1.0284909204758923, "grad_norm": 2.608429193496704, "learning_rate": 7.392323369565216e-05, "loss": 1.0679, "step": 6570 }, { "epoch": 1.0286474639949905, "grad_norm": 
2.417332887649536, "learning_rate": 7.391134510869564e-05, "loss": 0.8213, "step": 6571 }, { "epoch": 1.028804007514089, "grad_norm": 2.42399001121521, "learning_rate": 7.389945652173912e-05, "loss": 0.9078, "step": 6572 }, { "epoch": 1.0289605510331872, "grad_norm": 1.8630995750427246, "learning_rate": 7.38875679347826e-05, "loss": 1.0964, "step": 6573 }, { "epoch": 1.0291170945522856, "grad_norm": 2.5436573028564453, "learning_rate": 7.387567934782607e-05, "loss": 0.9192, "step": 6574 }, { "epoch": 1.0292736380713838, "grad_norm": 5.027364730834961, "learning_rate": 7.386379076086955e-05, "loss": 1.6422, "step": 6575 }, { "epoch": 1.029430181590482, "grad_norm": 3.64905047416687, "learning_rate": 7.385190217391303e-05, "loss": 1.3195, "step": 6576 }, { "epoch": 1.0295867251095805, "grad_norm": 1.8479515314102173, "learning_rate": 7.384001358695651e-05, "loss": 0.7818, "step": 6577 }, { "epoch": 1.0297432686286787, "grad_norm": 2.5276098251342773, "learning_rate": 7.382812499999999e-05, "loss": 1.1045, "step": 6578 }, { "epoch": 1.0298998121477771, "grad_norm": 4.901021957397461, "learning_rate": 7.381623641304348e-05, "loss": 1.2489, "step": 6579 }, { "epoch": 1.0300563556668754, "grad_norm": 3.2178430557250977, "learning_rate": 7.380434782608696e-05, "loss": 1.1241, "step": 6580 }, { "epoch": 1.0302128991859738, "grad_norm": 2.4693567752838135, "learning_rate": 7.379245923913043e-05, "loss": 0.8659, "step": 6581 }, { "epoch": 1.030369442705072, "grad_norm": 2.549469470977783, "learning_rate": 7.378057065217391e-05, "loss": 1.2706, "step": 6582 }, { "epoch": 1.0305259862241702, "grad_norm": 2.4976537227630615, "learning_rate": 7.376868206521739e-05, "loss": 1.4526, "step": 6583 }, { "epoch": 1.0306825297432687, "grad_norm": 2.164929151535034, "learning_rate": 7.375679347826087e-05, "loss": 0.5519, "step": 6584 }, { "epoch": 1.030839073262367, "grad_norm": 4.438109874725342, "learning_rate": 7.374490489130435e-05, "loss": 0.8093, "step": 6585 }, { "epoch": 
1.0309956167814653, "grad_norm": 1.5766468048095703, "learning_rate": 7.373301630434783e-05, "loss": 0.8197, "step": 6586 }, { "epoch": 1.0311521603005636, "grad_norm": 2.4296581745147705, "learning_rate": 7.37211277173913e-05, "loss": 0.7125, "step": 6587 }, { "epoch": 1.0313087038196618, "grad_norm": 2.8091585636138916, "learning_rate": 7.370923913043478e-05, "loss": 0.6276, "step": 6588 }, { "epoch": 1.0314652473387602, "grad_norm": 0.6448042988777161, "learning_rate": 7.369735054347826e-05, "loss": 0.2244, "step": 6589 }, { "epoch": 1.0316217908578584, "grad_norm": 0.591480553150177, "learning_rate": 7.368546195652172e-05, "loss": 0.3146, "step": 6590 }, { "epoch": 1.0317783343769569, "grad_norm": 1.4256582260131836, "learning_rate": 7.36735733695652e-05, "loss": 0.274, "step": 6591 }, { "epoch": 1.031934877896055, "grad_norm": 1.8251866102218628, "learning_rate": 7.366168478260868e-05, "loss": 0.2989, "step": 6592 }, { "epoch": 1.0320914214151533, "grad_norm": 0.8277043700218201, "learning_rate": 7.364979619565216e-05, "loss": 0.3648, "step": 6593 }, { "epoch": 1.0322479649342517, "grad_norm": 0.8065482974052429, "learning_rate": 7.363790760869564e-05, "loss": 0.2926, "step": 6594 }, { "epoch": 1.03240450845335, "grad_norm": 0.7568385601043701, "learning_rate": 7.362601902173912e-05, "loss": 0.2635, "step": 6595 }, { "epoch": 1.0325610519724484, "grad_norm": 1.2743661403656006, "learning_rate": 7.36141304347826e-05, "loss": 0.4315, "step": 6596 }, { "epoch": 1.0327175954915466, "grad_norm": 0.7147644758224487, "learning_rate": 7.360224184782607e-05, "loss": 0.2698, "step": 6597 }, { "epoch": 1.032874139010645, "grad_norm": 0.6902076601982117, "learning_rate": 7.359035326086955e-05, "loss": 0.3303, "step": 6598 }, { "epoch": 1.0330306825297433, "grad_norm": 0.7016822695732117, "learning_rate": 7.357846467391304e-05, "loss": 0.2066, "step": 6599 }, { "epoch": 1.0331872260488415, "grad_norm": 1.679163932800293, "learning_rate": 7.356657608695652e-05, "loss": 
0.2163, "step": 6600 }, { "epoch": 1.03334376956794, "grad_norm": 1.1165539026260376, "learning_rate": 7.35546875e-05, "loss": 0.2382, "step": 6601 }, { "epoch": 1.0335003130870382, "grad_norm": 1.3484091758728027, "learning_rate": 7.354279891304348e-05, "loss": 0.2644, "step": 6602 }, { "epoch": 1.0336568566061366, "grad_norm": 1.462308645248413, "learning_rate": 7.353091032608695e-05, "loss": 0.3812, "step": 6603 }, { "epoch": 1.0338134001252348, "grad_norm": 1.383228063583374, "learning_rate": 7.351902173913043e-05, "loss": 0.4029, "step": 6604 }, { "epoch": 1.033969943644333, "grad_norm": 1.5172810554504395, "learning_rate": 7.350713315217391e-05, "loss": 0.3171, "step": 6605 }, { "epoch": 1.0341264871634315, "grad_norm": 2.0369606018066406, "learning_rate": 7.349524456521739e-05, "loss": 0.4376, "step": 6606 }, { "epoch": 1.0342830306825297, "grad_norm": 0.7087504267692566, "learning_rate": 7.348335597826087e-05, "loss": 0.1737, "step": 6607 }, { "epoch": 1.0344395742016281, "grad_norm": 1.903241515159607, "learning_rate": 7.347146739130435e-05, "loss": 0.4287, "step": 6608 }, { "epoch": 1.0345961177207263, "grad_norm": 2.2313735485076904, "learning_rate": 7.345957880434782e-05, "loss": 0.4001, "step": 6609 }, { "epoch": 1.0347526612398246, "grad_norm": 4.286136150360107, "learning_rate": 7.34476902173913e-05, "loss": 0.7221, "step": 6610 }, { "epoch": 1.034909204758923, "grad_norm": 3.323439359664917, "learning_rate": 7.343580163043478e-05, "loss": 0.7724, "step": 6611 }, { "epoch": 1.0350657482780212, "grad_norm": 3.745821475982666, "learning_rate": 7.342391304347826e-05, "loss": 0.5761, "step": 6612 }, { "epoch": 1.0352222917971197, "grad_norm": 1.9806195497512817, "learning_rate": 7.341202445652172e-05, "loss": 0.5736, "step": 6613 }, { "epoch": 1.0353788353162179, "grad_norm": 3.214118719100952, "learning_rate": 7.34001358695652e-05, "loss": 0.4881, "step": 6614 }, { "epoch": 1.0355353788353163, "grad_norm": 1.205475926399231, "learning_rate": 
7.338824728260868e-05, "loss": 0.4395, "step": 6615 }, { "epoch": 1.0356919223544145, "grad_norm": 2.218646287918091, "learning_rate": 7.337635869565216e-05, "loss": 0.7622, "step": 6616 }, { "epoch": 1.0358484658735128, "grad_norm": 2.0042080879211426, "learning_rate": 7.336447010869564e-05, "loss": 0.5194, "step": 6617 }, { "epoch": 1.0360050093926112, "grad_norm": 3.3782403469085693, "learning_rate": 7.335258152173911e-05, "loss": 0.8407, "step": 6618 }, { "epoch": 1.0361615529117094, "grad_norm": 2.8393168449401855, "learning_rate": 7.33406929347826e-05, "loss": 0.6299, "step": 6619 }, { "epoch": 1.0363180964308079, "grad_norm": 7.916807174682617, "learning_rate": 7.332880434782608e-05, "loss": 1.3871, "step": 6620 }, { "epoch": 1.036474639949906, "grad_norm": 5.563041687011719, "learning_rate": 7.331691576086956e-05, "loss": 0.7699, "step": 6621 }, { "epoch": 1.0366311834690043, "grad_norm": 4.00060510635376, "learning_rate": 7.330502717391304e-05, "loss": 0.5302, "step": 6622 }, { "epoch": 1.0367877269881027, "grad_norm": 2.383948564529419, "learning_rate": 7.329313858695652e-05, "loss": 0.7452, "step": 6623 }, { "epoch": 1.036944270507201, "grad_norm": 1.4827508926391602, "learning_rate": 7.328125e-05, "loss": 0.7944, "step": 6624 }, { "epoch": 1.0371008140262994, "grad_norm": 3.4058096408843994, "learning_rate": 7.326936141304347e-05, "loss": 0.6257, "step": 6625 }, { "epoch": 1.0372573575453976, "grad_norm": 2.2584023475646973, "learning_rate": 7.325747282608695e-05, "loss": 0.8417, "step": 6626 }, { "epoch": 1.0374139010644958, "grad_norm": 3.593433380126953, "learning_rate": 7.324558423913043e-05, "loss": 0.5628, "step": 6627 }, { "epoch": 1.0375704445835943, "grad_norm": 2.491833448410034, "learning_rate": 7.323369565217391e-05, "loss": 0.5795, "step": 6628 }, { "epoch": 1.0377269881026925, "grad_norm": 3.130948305130005, "learning_rate": 7.322180706521739e-05, "loss": 0.8587, "step": 6629 }, { "epoch": 1.037883531621791, "grad_norm": 4.65477180480957, 
"learning_rate": 7.320991847826086e-05, "loss": 1.4065, "step": 6630 }, { "epoch": 1.0380400751408891, "grad_norm": 2.916506767272949, "learning_rate": 7.319802989130434e-05, "loss": 1.5237, "step": 6631 }, { "epoch": 1.0381966186599876, "grad_norm": 2.492412567138672, "learning_rate": 7.318614130434782e-05, "loss": 1.1683, "step": 6632 }, { "epoch": 1.0383531621790858, "grad_norm": 3.592910051345825, "learning_rate": 7.31742527173913e-05, "loss": 1.1632, "step": 6633 }, { "epoch": 1.038509705698184, "grad_norm": 3.6367557048797607, "learning_rate": 7.316236413043478e-05, "loss": 0.4513, "step": 6634 }, { "epoch": 1.0386662492172825, "grad_norm": 4.080811977386475, "learning_rate": 7.315047554347826e-05, "loss": 0.6758, "step": 6635 }, { "epoch": 1.0388227927363807, "grad_norm": 2.8109793663024902, "learning_rate": 7.313858695652172e-05, "loss": 0.5352, "step": 6636 }, { "epoch": 1.0389793362554791, "grad_norm": 2.5042009353637695, "learning_rate": 7.31266983695652e-05, "loss": 0.446, "step": 6637 }, { "epoch": 1.0391358797745773, "grad_norm": 1.987352967262268, "learning_rate": 7.311480978260869e-05, "loss": 0.7228, "step": 6638 }, { "epoch": 1.0392924232936755, "grad_norm": 0.740470290184021, "learning_rate": 7.310292119565217e-05, "loss": 0.1966, "step": 6639 }, { "epoch": 1.039448966812774, "grad_norm": 0.4670119881629944, "learning_rate": 7.309103260869565e-05, "loss": 0.1988, "step": 6640 }, { "epoch": 1.0396055103318722, "grad_norm": 0.6226357817649841, "learning_rate": 7.307914402173912e-05, "loss": 0.1486, "step": 6641 }, { "epoch": 1.0397620538509706, "grad_norm": 0.5998409390449524, "learning_rate": 7.30672554347826e-05, "loss": 0.2258, "step": 6642 }, { "epoch": 1.0399185973700689, "grad_norm": 0.7351714372634888, "learning_rate": 7.305536684782608e-05, "loss": 0.2001, "step": 6643 }, { "epoch": 1.040075140889167, "grad_norm": 0.9992901682853699, "learning_rate": 7.304347826086956e-05, "loss": 0.2626, "step": 6644 }, { "epoch": 1.0402316844082655, 
"grad_norm": 0.7380092740058899, "learning_rate": 7.303158967391304e-05, "loss": 0.2352, "step": 6645 }, { "epoch": 1.0403882279273637, "grad_norm": 0.6874791383743286, "learning_rate": 7.301970108695652e-05, "loss": 0.2766, "step": 6646 }, { "epoch": 1.0405447714464622, "grad_norm": 5.664193630218506, "learning_rate": 7.30078125e-05, "loss": 0.9578, "step": 6647 }, { "epoch": 1.0407013149655604, "grad_norm": 0.9669755101203918, "learning_rate": 7.299592391304347e-05, "loss": 0.2616, "step": 6648 }, { "epoch": 1.0408578584846588, "grad_norm": 1.044015884399414, "learning_rate": 7.298403532608695e-05, "loss": 0.3189, "step": 6649 }, { "epoch": 1.041014402003757, "grad_norm": 1.1144205331802368, "learning_rate": 7.297214673913043e-05, "loss": 0.2848, "step": 6650 }, { "epoch": 1.0411709455228553, "grad_norm": 0.7769481539726257, "learning_rate": 7.29602581521739e-05, "loss": 0.2921, "step": 6651 }, { "epoch": 1.0413274890419537, "grad_norm": 1.2420200109481812, "learning_rate": 7.294836956521738e-05, "loss": 0.3648, "step": 6652 }, { "epoch": 1.041484032561052, "grad_norm": 0.848224937915802, "learning_rate": 7.293648097826086e-05, "loss": 0.2893, "step": 6653 }, { "epoch": 1.0416405760801504, "grad_norm": 2.0525639057159424, "learning_rate": 7.292459239130434e-05, "loss": 0.6143, "step": 6654 }, { "epoch": 1.0417971195992486, "grad_norm": 0.8527804017066956, "learning_rate": 7.291270380434782e-05, "loss": 0.3466, "step": 6655 }, { "epoch": 1.0419536631183468, "grad_norm": 1.849114179611206, "learning_rate": 7.290081521739131e-05, "loss": 0.5473, "step": 6656 }, { "epoch": 1.0421102066374452, "grad_norm": 3.173233985900879, "learning_rate": 7.288892663043479e-05, "loss": 0.4924, "step": 6657 }, { "epoch": 1.0422667501565435, "grad_norm": 1.2863091230392456, "learning_rate": 7.287703804347827e-05, "loss": 0.5234, "step": 6658 }, { "epoch": 1.042423293675642, "grad_norm": 0.865753173828125, "learning_rate": 7.286514945652173e-05, "loss": 0.3597, "step": 6659 }, { 
"epoch": 1.0425798371947401, "grad_norm": 1.6976008415222168, "learning_rate": 7.285326086956521e-05, "loss": 0.334, "step": 6660 }, { "epoch": 1.0427363807138383, "grad_norm": 1.3009635210037231, "learning_rate": 7.284137228260869e-05, "loss": 0.3789, "step": 6661 }, { "epoch": 1.0428929242329368, "grad_norm": 1.4918591976165771, "learning_rate": 7.282948369565217e-05, "loss": 0.4461, "step": 6662 }, { "epoch": 1.043049467752035, "grad_norm": 2.5654542446136475, "learning_rate": 7.281759510869564e-05, "loss": 0.438, "step": 6663 }, { "epoch": 1.0432060112711334, "grad_norm": 2.7973122596740723, "learning_rate": 7.280570652173912e-05, "loss": 0.7971, "step": 6664 }, { "epoch": 1.0433625547902317, "grad_norm": 2.3838679790496826, "learning_rate": 7.27938179347826e-05, "loss": 0.6383, "step": 6665 }, { "epoch": 1.04351909830933, "grad_norm": 2.0948374271392822, "learning_rate": 7.278192934782608e-05, "loss": 0.4692, "step": 6666 }, { "epoch": 1.0436756418284283, "grad_norm": 1.9463791847229004, "learning_rate": 7.277004076086956e-05, "loss": 0.559, "step": 6667 }, { "epoch": 1.0438321853475265, "grad_norm": 1.7207005023956299, "learning_rate": 7.275815217391303e-05, "loss": 0.6452, "step": 6668 }, { "epoch": 1.043988728866625, "grad_norm": 2.666168689727783, "learning_rate": 7.274626358695651e-05, "loss": 0.8553, "step": 6669 }, { "epoch": 1.0441452723857232, "grad_norm": 1.561698317527771, "learning_rate": 7.273437499999999e-05, "loss": 0.3626, "step": 6670 }, { "epoch": 1.0443018159048216, "grad_norm": 1.716331124305725, "learning_rate": 7.272248641304347e-05, "loss": 0.5804, "step": 6671 }, { "epoch": 1.0444583594239198, "grad_norm": 3.9577455520629883, "learning_rate": 7.271059782608695e-05, "loss": 0.9082, "step": 6672 }, { "epoch": 1.044614902943018, "grad_norm": 4.280698299407959, "learning_rate": 7.269870923913043e-05, "loss": 1.0076, "step": 6673 }, { "epoch": 1.0447714464621165, "grad_norm": 1.8609548807144165, "learning_rate": 7.26868206521739e-05, "loss": 
0.8146, "step": 6674 }, { "epoch": 1.0449279899812147, "grad_norm": 2.6442410945892334, "learning_rate": 7.267493206521738e-05, "loss": 0.494, "step": 6675 }, { "epoch": 1.0450845335003132, "grad_norm": 3.3722870349884033, "learning_rate": 7.266304347826087e-05, "loss": 1.2655, "step": 6676 }, { "epoch": 1.0452410770194114, "grad_norm": 3.7704365253448486, "learning_rate": 7.265115489130435e-05, "loss": 1.3899, "step": 6677 }, { "epoch": 1.0453976205385098, "grad_norm": 4.12312126159668, "learning_rate": 7.263926630434783e-05, "loss": 1.1436, "step": 6678 }, { "epoch": 1.045554164057608, "grad_norm": 3.2051970958709717, "learning_rate": 7.262737771739131e-05, "loss": 1.0459, "step": 6679 }, { "epoch": 1.0457107075767063, "grad_norm": 3.8072099685668945, "learning_rate": 7.261548913043479e-05, "loss": 0.7763, "step": 6680 }, { "epoch": 1.0458672510958047, "grad_norm": 4.45210599899292, "learning_rate": 7.260360054347826e-05, "loss": 1.0709, "step": 6681 }, { "epoch": 1.046023794614903, "grad_norm": 3.0215954780578613, "learning_rate": 7.259171195652173e-05, "loss": 1.4372, "step": 6682 }, { "epoch": 1.0461803381340014, "grad_norm": 2.699596405029297, "learning_rate": 7.257982336956521e-05, "loss": 1.3508, "step": 6683 }, { "epoch": 1.0463368816530996, "grad_norm": 5.339703559875488, "learning_rate": 7.256793478260869e-05, "loss": 0.4965, "step": 6684 }, { "epoch": 1.0464934251721978, "grad_norm": 2.390233039855957, "learning_rate": 7.255604619565216e-05, "loss": 0.5734, "step": 6685 }, { "epoch": 1.0466499686912962, "grad_norm": 2.8153247833251953, "learning_rate": 7.254415760869564e-05, "loss": 1.1236, "step": 6686 }, { "epoch": 1.0468065122103944, "grad_norm": 4.379684925079346, "learning_rate": 7.253226902173912e-05, "loss": 0.834, "step": 6687 }, { "epoch": 1.0469630557294929, "grad_norm": 2.3191773891448975, "learning_rate": 7.25203804347826e-05, "loss": 0.4117, "step": 6688 }, { "epoch": 1.047119599248591, "grad_norm": 0.44716623425483704, "learning_rate": 
7.250849184782608e-05, "loss": 0.2332, "step": 6689 }, { "epoch": 1.0472761427676893, "grad_norm": 0.696064829826355, "learning_rate": 7.249660326086955e-05, "loss": 0.2177, "step": 6690 }, { "epoch": 1.0474326862867878, "grad_norm": 0.564370334148407, "learning_rate": 7.248471467391303e-05, "loss": 0.1925, "step": 6691 }, { "epoch": 1.047589229805886, "grad_norm": 0.5813634991645813, "learning_rate": 7.247282608695651e-05, "loss": 0.2111, "step": 6692 }, { "epoch": 1.0477457733249844, "grad_norm": 0.5474847555160522, "learning_rate": 7.246093749999999e-05, "loss": 0.2034, "step": 6693 }, { "epoch": 1.0479023168440826, "grad_norm": 0.7939810156822205, "learning_rate": 7.244904891304347e-05, "loss": 0.2787, "step": 6694 }, { "epoch": 1.0480588603631809, "grad_norm": 0.6931917071342468, "learning_rate": 7.243716032608694e-05, "loss": 0.2416, "step": 6695 }, { "epoch": 1.0482154038822793, "grad_norm": 0.5710418820381165, "learning_rate": 7.242527173913044e-05, "loss": 0.2054, "step": 6696 }, { "epoch": 1.0483719474013775, "grad_norm": 0.6783114075660706, "learning_rate": 7.241338315217391e-05, "loss": 0.2135, "step": 6697 }, { "epoch": 1.048528490920476, "grad_norm": 0.5982029438018799, "learning_rate": 7.240149456521739e-05, "loss": 0.2424, "step": 6698 }, { "epoch": 1.0486850344395742, "grad_norm": 0.8738577961921692, "learning_rate": 7.238960597826087e-05, "loss": 0.2755, "step": 6699 }, { "epoch": 1.0488415779586726, "grad_norm": 2.539001703262329, "learning_rate": 7.237771739130435e-05, "loss": 0.4226, "step": 6700 }, { "epoch": 1.0489981214777708, "grad_norm": 0.5330018997192383, "learning_rate": 7.236582880434783e-05, "loss": 0.1585, "step": 6701 }, { "epoch": 1.049154664996869, "grad_norm": 0.9732142090797424, "learning_rate": 7.23539402173913e-05, "loss": 0.3133, "step": 6702 }, { "epoch": 1.0493112085159675, "grad_norm": 0.821988046169281, "learning_rate": 7.234205163043478e-05, "loss": 0.2591, "step": 6703 }, { "epoch": 1.0494677520350657, "grad_norm": 
1.202370047569275, "learning_rate": 7.233016304347826e-05, "loss": 0.3469, "step": 6704 }, { "epoch": 1.0496242955541641, "grad_norm": 1.1501399278640747, "learning_rate": 7.231827445652173e-05, "loss": 0.2724, "step": 6705 }, { "epoch": 1.0497808390732624, "grad_norm": 2.178100824356079, "learning_rate": 7.23063858695652e-05, "loss": 0.5147, "step": 6706 }, { "epoch": 1.0499373825923606, "grad_norm": 1.276677131652832, "learning_rate": 7.229449728260868e-05, "loss": 0.3398, "step": 6707 }, { "epoch": 1.050093926111459, "grad_norm": 1.8798608779907227, "learning_rate": 7.228260869565216e-05, "loss": 0.285, "step": 6708 }, { "epoch": 1.0502504696305572, "grad_norm": 1.7381170988082886, "learning_rate": 7.227072010869564e-05, "loss": 0.445, "step": 6709 }, { "epoch": 1.0504070131496557, "grad_norm": 2.432908296585083, "learning_rate": 7.225883152173912e-05, "loss": 0.6458, "step": 6710 }, { "epoch": 1.050563556668754, "grad_norm": 1.8882864713668823, "learning_rate": 7.22469429347826e-05, "loss": 0.4622, "step": 6711 }, { "epoch": 1.0507201001878523, "grad_norm": 9.337702751159668, "learning_rate": 7.223505434782607e-05, "loss": 0.5234, "step": 6712 }, { "epoch": 1.0508766437069506, "grad_norm": 1.3617793321609497, "learning_rate": 7.222316576086955e-05, "loss": 0.6443, "step": 6713 }, { "epoch": 1.0510331872260488, "grad_norm": 1.636407732963562, "learning_rate": 7.221127717391303e-05, "loss": 0.4505, "step": 6714 }, { "epoch": 1.0511897307451472, "grad_norm": 1.988609790802002, "learning_rate": 7.219938858695652e-05, "loss": 0.7795, "step": 6715 }, { "epoch": 1.0513462742642454, "grad_norm": 2.2436959743499756, "learning_rate": 7.21875e-05, "loss": 0.5912, "step": 6716 }, { "epoch": 1.0515028177833439, "grad_norm": 2.3296713829040527, "learning_rate": 7.217561141304348e-05, "loss": 0.6477, "step": 6717 }, { "epoch": 1.051659361302442, "grad_norm": 2.6347343921661377, "learning_rate": 7.216372282608696e-05, "loss": 0.8207, "step": 6718 }, { "epoch": 
1.0518159048215403, "grad_norm": 4.049637794494629, "learning_rate": 7.215183423913043e-05, "loss": 1.1768, "step": 6719 }, { "epoch": 1.0519724483406387, "grad_norm": 1.7626824378967285, "learning_rate": 7.213994565217391e-05, "loss": 0.7382, "step": 6720 }, { "epoch": 1.052128991859737, "grad_norm": 2.959731340408325, "learning_rate": 7.212805706521739e-05, "loss": 0.9213, "step": 6721 }, { "epoch": 1.0522855353788354, "grad_norm": 2.4273898601531982, "learning_rate": 7.211616847826087e-05, "loss": 0.7838, "step": 6722 }, { "epoch": 1.0524420788979336, "grad_norm": 2.250666618347168, "learning_rate": 7.210427989130435e-05, "loss": 0.9334, "step": 6723 }, { "epoch": 1.0525986224170318, "grad_norm": 2.9733455181121826, "learning_rate": 7.209239130434782e-05, "loss": 0.9781, "step": 6724 }, { "epoch": 1.0527551659361303, "grad_norm": 3.12424898147583, "learning_rate": 7.20805027173913e-05, "loss": 0.8421, "step": 6725 }, { "epoch": 1.0529117094552285, "grad_norm": 2.7065255641937256, "learning_rate": 7.206861413043478e-05, "loss": 0.8348, "step": 6726 }, { "epoch": 1.053068252974327, "grad_norm": 1.908237338066101, "learning_rate": 7.205672554347826e-05, "loss": 0.9513, "step": 6727 }, { "epoch": 1.0532247964934252, "grad_norm": 2.613156318664551, "learning_rate": 7.204483695652172e-05, "loss": 1.2656, "step": 6728 }, { "epoch": 1.0533813400125234, "grad_norm": 3.636770486831665, "learning_rate": 7.20329483695652e-05, "loss": 1.122, "step": 6729 }, { "epoch": 1.0535378835316218, "grad_norm": 3.1322784423828125, "learning_rate": 7.202105978260868e-05, "loss": 1.1574, "step": 6730 }, { "epoch": 1.05369442705072, "grad_norm": 3.0649423599243164, "learning_rate": 7.200917119565216e-05, "loss": 1.1081, "step": 6731 }, { "epoch": 1.0538509705698185, "grad_norm": 2.5836501121520996, "learning_rate": 7.199728260869564e-05, "loss": 0.9668, "step": 6732 }, { "epoch": 1.0540075140889167, "grad_norm": 2.115630865097046, "learning_rate": 7.198539402173911e-05, "loss": 0.9798, 
"step": 6733 }, { "epoch": 1.0541640576080151, "grad_norm": 5.184930324554443, "learning_rate": 7.197350543478259e-05, "loss": 0.7331, "step": 6734 }, { "epoch": 1.0543206011271133, "grad_norm": 2.199101686477661, "learning_rate": 7.196161684782608e-05, "loss": 1.1609, "step": 6735 }, { "epoch": 1.0544771446462116, "grad_norm": 1.8229550123214722, "learning_rate": 7.194972826086956e-05, "loss": 0.5378, "step": 6736 }, { "epoch": 1.05463368816531, "grad_norm": 3.602360725402832, "learning_rate": 7.193783967391304e-05, "loss": 1.3758, "step": 6737 }, { "epoch": 1.0547902316844082, "grad_norm": 1.2580493688583374, "learning_rate": 7.192595108695652e-05, "loss": 0.428, "step": 6738 }, { "epoch": 1.0549467752035067, "grad_norm": 0.7351574301719666, "learning_rate": 7.19140625e-05, "loss": 0.2411, "step": 6739 }, { "epoch": 1.0551033187226049, "grad_norm": 0.8662021160125732, "learning_rate": 7.190217391304348e-05, "loss": 0.2707, "step": 6740 }, { "epoch": 1.055259862241703, "grad_norm": 0.6735130548477173, "learning_rate": 7.189028532608695e-05, "loss": 0.171, "step": 6741 }, { "epoch": 1.0554164057608015, "grad_norm": 0.597131609916687, "learning_rate": 7.187839673913043e-05, "loss": 0.248, "step": 6742 }, { "epoch": 1.0555729492798998, "grad_norm": 0.595777690410614, "learning_rate": 7.186650815217391e-05, "loss": 0.3351, "step": 6743 }, { "epoch": 1.0557294927989982, "grad_norm": 0.9849507808685303, "learning_rate": 7.185461956521739e-05, "loss": 0.4035, "step": 6744 }, { "epoch": 1.0558860363180964, "grad_norm": 0.7776398062705994, "learning_rate": 7.184273097826087e-05, "loss": 0.2151, "step": 6745 }, { "epoch": 1.0560425798371949, "grad_norm": 1.0063278675079346, "learning_rate": 7.183084239130434e-05, "loss": 0.2409, "step": 6746 }, { "epoch": 1.056199123356293, "grad_norm": 1.7327438592910767, "learning_rate": 7.181895380434782e-05, "loss": 0.3747, "step": 6747 }, { "epoch": 1.0563556668753913, "grad_norm": 1.4473403692245483, "learning_rate": 
7.18070652173913e-05, "loss": 0.4215, "step": 6748 }, { "epoch": 1.0565122103944897, "grad_norm": 0.9088017344474792, "learning_rate": 7.179517663043478e-05, "loss": 0.2879, "step": 6749 }, { "epoch": 1.056668753913588, "grad_norm": 0.7560357451438904, "learning_rate": 7.178328804347826e-05, "loss": 0.2008, "step": 6750 }, { "epoch": 1.0568252974326864, "grad_norm": 0.7335060238838196, "learning_rate": 7.177139945652172e-05, "loss": 0.1462, "step": 6751 }, { "epoch": 1.0569818409517846, "grad_norm": 1.1731523275375366, "learning_rate": 7.17595108695652e-05, "loss": 0.4361, "step": 6752 }, { "epoch": 1.0571383844708828, "grad_norm": 2.459390163421631, "learning_rate": 7.174762228260868e-05, "loss": 0.2445, "step": 6753 }, { "epoch": 1.0572949279899813, "grad_norm": 0.8345919251441956, "learning_rate": 7.173573369565216e-05, "loss": 0.3209, "step": 6754 }, { "epoch": 1.0574514715090795, "grad_norm": 2.6425862312316895, "learning_rate": 7.172384510869565e-05, "loss": 0.4858, "step": 6755 }, { "epoch": 1.057608015028178, "grad_norm": 2.55289363861084, "learning_rate": 7.171195652173913e-05, "loss": 0.5415, "step": 6756 }, { "epoch": 1.0577645585472761, "grad_norm": 1.647573471069336, "learning_rate": 7.17000679347826e-05, "loss": 0.5257, "step": 6757 }, { "epoch": 1.0579211020663744, "grad_norm": 3.2940421104431152, "learning_rate": 7.168817934782608e-05, "loss": 0.5014, "step": 6758 }, { "epoch": 1.0580776455854728, "grad_norm": 1.3385570049285889, "learning_rate": 7.167629076086956e-05, "loss": 0.4633, "step": 6759 }, { "epoch": 1.058234189104571, "grad_norm": 2.0079402923583984, "learning_rate": 7.166440217391304e-05, "loss": 0.496, "step": 6760 }, { "epoch": 1.0583907326236695, "grad_norm": 2.9150617122650146, "learning_rate": 7.165251358695652e-05, "loss": 0.7529, "step": 6761 }, { "epoch": 1.0585472761427677, "grad_norm": 2.083343505859375, "learning_rate": 7.1640625e-05, "loss": 0.2043, "step": 6762 }, { "epoch": 1.0587038196618659, "grad_norm": 
3.9466986656188965, "learning_rate": 7.162873641304347e-05, "loss": 1.0028, "step": 6763 }, { "epoch": 1.0588603631809643, "grad_norm": 1.0493789911270142, "learning_rate": 7.161684782608695e-05, "loss": 0.4271, "step": 6764 }, { "epoch": 1.0590169067000625, "grad_norm": 1.2901350259780884, "learning_rate": 7.160495923913043e-05, "loss": 0.4447, "step": 6765 }, { "epoch": 1.059173450219161, "grad_norm": 2.744870662689209, "learning_rate": 7.159307065217391e-05, "loss": 0.5148, "step": 6766 }, { "epoch": 1.0593299937382592, "grad_norm": 1.8166593313217163, "learning_rate": 7.158118206521739e-05, "loss": 0.8182, "step": 6767 }, { "epoch": 1.0594865372573576, "grad_norm": 1.296596884727478, "learning_rate": 7.156929347826086e-05, "loss": 0.63, "step": 6768 }, { "epoch": 1.0596430807764559, "grad_norm": 5.4226393699646, "learning_rate": 7.155740489130434e-05, "loss": 1.5211, "step": 6769 }, { "epoch": 1.059799624295554, "grad_norm": 2.6706974506378174, "learning_rate": 7.154551630434782e-05, "loss": 0.7385, "step": 6770 }, { "epoch": 1.0599561678146525, "grad_norm": 4.222756862640381, "learning_rate": 7.15336277173913e-05, "loss": 1.0152, "step": 6771 }, { "epoch": 1.0601127113337507, "grad_norm": 2.2908401489257812, "learning_rate": 7.152173913043478e-05, "loss": 0.8047, "step": 6772 }, { "epoch": 1.0602692548528492, "grad_norm": 2.890692710876465, "learning_rate": 7.150985054347827e-05, "loss": 1.2244, "step": 6773 }, { "epoch": 1.0604257983719474, "grad_norm": 3.8330726623535156, "learning_rate": 7.149796195652172e-05, "loss": 0.9923, "step": 6774 }, { "epoch": 1.0605823418910456, "grad_norm": 3.1470947265625, "learning_rate": 7.148607336956521e-05, "loss": 1.557, "step": 6775 }, { "epoch": 1.060738885410144, "grad_norm": 5.538039207458496, "learning_rate": 7.147418478260869e-05, "loss": 1.0615, "step": 6776 }, { "epoch": 1.0608954289292423, "grad_norm": 2.564342975616455, "learning_rate": 7.146229619565217e-05, "loss": 0.6698, "step": 6777 }, { "epoch": 
1.0610519724483407, "grad_norm": 3.348891258239746, "learning_rate": 7.145040760869565e-05, "loss": 1.0354, "step": 6778 }, { "epoch": 1.061208515967439, "grad_norm": 2.5788443088531494, "learning_rate": 7.143851902173912e-05, "loss": 1.0858, "step": 6779 }, { "epoch": 1.0613650594865374, "grad_norm": 4.3266921043396, "learning_rate": 7.14266304347826e-05, "loss": 0.8648, "step": 6780 }, { "epoch": 1.0615216030056356, "grad_norm": 2.924952507019043, "learning_rate": 7.141474184782608e-05, "loss": 1.3985, "step": 6781 }, { "epoch": 1.0616781465247338, "grad_norm": 2.4685592651367188, "learning_rate": 7.140285326086956e-05, "loss": 1.0191, "step": 6782 }, { "epoch": 1.0618346900438322, "grad_norm": 2.0769765377044678, "learning_rate": 7.139096467391304e-05, "loss": 0.8877, "step": 6783 }, { "epoch": 1.0619912335629305, "grad_norm": 2.7833032608032227, "learning_rate": 7.137907608695651e-05, "loss": 0.4831, "step": 6784 }, { "epoch": 1.062147777082029, "grad_norm": 2.3439688682556152, "learning_rate": 7.136718749999999e-05, "loss": 1.0021, "step": 6785 }, { "epoch": 1.0623043206011271, "grad_norm": 2.69468092918396, "learning_rate": 7.135529891304347e-05, "loss": 0.6282, "step": 6786 }, { "epoch": 1.0624608641202253, "grad_norm": 2.7514419555664062, "learning_rate": 7.134341032608695e-05, "loss": 1.2434, "step": 6787 }, { "epoch": 1.0626174076393238, "grad_norm": 7.126882553100586, "learning_rate": 7.133152173913043e-05, "loss": 0.7055, "step": 6788 }, { "epoch": 1.062773951158422, "grad_norm": 0.8003222346305847, "learning_rate": 7.13196331521739e-05, "loss": 0.2942, "step": 6789 }, { "epoch": 1.0629304946775204, "grad_norm": 0.6073592901229858, "learning_rate": 7.130774456521738e-05, "loss": 0.2954, "step": 6790 }, { "epoch": 1.0630870381966186, "grad_norm": 0.553068220615387, "learning_rate": 7.129585597826086e-05, "loss": 0.2769, "step": 6791 }, { "epoch": 1.0632435817157169, "grad_norm": 0.6229810118675232, "learning_rate": 7.128396739130435e-05, "loss": 0.2259, 
"step": 6792 }, { "epoch": 1.0634001252348153, "grad_norm": 0.676392138004303, "learning_rate": 7.127207880434783e-05, "loss": 0.2441, "step": 6793 }, { "epoch": 1.0635566687539135, "grad_norm": 0.6112626791000366, "learning_rate": 7.126019021739131e-05, "loss": 0.2802, "step": 6794 }, { "epoch": 1.063713212273012, "grad_norm": 1.469604730606079, "learning_rate": 7.124830163043479e-05, "loss": 0.3318, "step": 6795 }, { "epoch": 1.0638697557921102, "grad_norm": 2.8414363861083984, "learning_rate": 7.123641304347827e-05, "loss": 0.7736, "step": 6796 }, { "epoch": 1.0640262993112084, "grad_norm": 0.7384973168373108, "learning_rate": 7.122452445652173e-05, "loss": 0.276, "step": 6797 }, { "epoch": 1.0641828428303068, "grad_norm": 0.690690279006958, "learning_rate": 7.121263586956521e-05, "loss": 0.2267, "step": 6798 }, { "epoch": 1.064339386349405, "grad_norm": 0.9890545010566711, "learning_rate": 7.120074728260869e-05, "loss": 0.3016, "step": 6799 }, { "epoch": 1.0644959298685035, "grad_norm": 1.7683066129684448, "learning_rate": 7.118885869565216e-05, "loss": 0.4081, "step": 6800 }, { "epoch": 1.0646524733876017, "grad_norm": 0.9371797442436218, "learning_rate": 7.117697010869564e-05, "loss": 0.2101, "step": 6801 }, { "epoch": 1.0648090169067002, "grad_norm": 0.846589207649231, "learning_rate": 7.116508152173912e-05, "loss": 0.2941, "step": 6802 }, { "epoch": 1.0649655604257984, "grad_norm": 1.7193230390548706, "learning_rate": 7.11531929347826e-05, "loss": 0.5924, "step": 6803 }, { "epoch": 1.0651221039448966, "grad_norm": 2.2120800018310547, "learning_rate": 7.114130434782608e-05, "loss": 0.4414, "step": 6804 }, { "epoch": 1.065278647463995, "grad_norm": 1.0317106246948242, "learning_rate": 7.112941576086956e-05, "loss": 0.3396, "step": 6805 }, { "epoch": 1.0654351909830932, "grad_norm": 0.7863645553588867, "learning_rate": 7.111752717391303e-05, "loss": 0.2288, "step": 6806 }, { "epoch": 1.0655917345021917, "grad_norm": 1.332118272781372, "learning_rate": 
7.110563858695651e-05, "loss": 0.5424, "step": 6807 }, { "epoch": 1.06574827802129, "grad_norm": 0.9983425736427307, "learning_rate": 7.109374999999999e-05, "loss": 0.4844, "step": 6808 }, { "epoch": 1.0659048215403881, "grad_norm": 3.638415575027466, "learning_rate": 7.108186141304347e-05, "loss": 0.7175, "step": 6809 }, { "epoch": 1.0660613650594866, "grad_norm": 1.6094450950622559, "learning_rate": 7.106997282608695e-05, "loss": 0.3811, "step": 6810 }, { "epoch": 1.0662179085785848, "grad_norm": 1.0253472328186035, "learning_rate": 7.105808423913042e-05, "loss": 0.3524, "step": 6811 }, { "epoch": 1.0663744520976832, "grad_norm": 1.483228325843811, "learning_rate": 7.104619565217392e-05, "loss": 0.4372, "step": 6812 }, { "epoch": 1.0665309956167814, "grad_norm": 1.9803868532180786, "learning_rate": 7.10343070652174e-05, "loss": 0.4696, "step": 6813 }, { "epoch": 1.0666875391358799, "grad_norm": 2.710296869277954, "learning_rate": 7.102241847826087e-05, "loss": 0.6458, "step": 6814 }, { "epoch": 1.066844082654978, "grad_norm": 2.0737431049346924, "learning_rate": 7.101052989130435e-05, "loss": 0.7297, "step": 6815 }, { "epoch": 1.0670006261740763, "grad_norm": 2.9299068450927734, "learning_rate": 7.099864130434783e-05, "loss": 0.6034, "step": 6816 }, { "epoch": 1.0671571696931748, "grad_norm": 2.1174144744873047, "learning_rate": 7.098675271739131e-05, "loss": 0.3934, "step": 6817 }, { "epoch": 1.067313713212273, "grad_norm": 2.374932050704956, "learning_rate": 7.097486413043479e-05, "loss": 0.6419, "step": 6818 }, { "epoch": 1.0674702567313714, "grad_norm": 2.565227746963501, "learning_rate": 7.096297554347826e-05, "loss": 0.8419, "step": 6819 }, { "epoch": 1.0676268002504696, "grad_norm": 2.6175639629364014, "learning_rate": 7.095108695652173e-05, "loss": 0.7549, "step": 6820 }, { "epoch": 1.0677833437695678, "grad_norm": 2.0213446617126465, "learning_rate": 7.09391983695652e-05, "loss": 0.9118, "step": 6821 }, { "epoch": 1.0679398872886663, "grad_norm": 
2.6343886852264404, "learning_rate": 7.092730978260868e-05, "loss": 0.7737, "step": 6822 }, { "epoch": 1.0680964308077645, "grad_norm": 2.4514646530151367, "learning_rate": 7.091542119565216e-05, "loss": 1.1544, "step": 6823 }, { "epoch": 1.068252974326863, "grad_norm": 2.5534474849700928, "learning_rate": 7.090353260869564e-05, "loss": 0.5348, "step": 6824 }, { "epoch": 1.0684095178459612, "grad_norm": 1.9158401489257812, "learning_rate": 7.089164402173912e-05, "loss": 0.4738, "step": 6825 }, { "epoch": 1.0685660613650594, "grad_norm": 3.030926465988159, "learning_rate": 7.08797554347826e-05, "loss": 0.8747, "step": 6826 }, { "epoch": 1.0687226048841578, "grad_norm": 3.2869303226470947, "learning_rate": 7.086786684782607e-05, "loss": 0.8695, "step": 6827 }, { "epoch": 1.068879148403256, "grad_norm": 3.833065986633301, "learning_rate": 7.085597826086955e-05, "loss": 0.8203, "step": 6828 }, { "epoch": 1.0690356919223545, "grad_norm": 3.2401394844055176, "learning_rate": 7.084408967391303e-05, "loss": 1.428, "step": 6829 }, { "epoch": 1.0691922354414527, "grad_norm": 1.9963586330413818, "learning_rate": 7.083220108695651e-05, "loss": 0.7257, "step": 6830 }, { "epoch": 1.069348778960551, "grad_norm": 4.458832740783691, "learning_rate": 7.082031249999999e-05, "loss": 0.9234, "step": 6831 }, { "epoch": 1.0695053224796494, "grad_norm": 2.7680375576019287, "learning_rate": 7.080842391304348e-05, "loss": 0.7797, "step": 6832 }, { "epoch": 1.0696618659987476, "grad_norm": 2.195693016052246, "learning_rate": 7.079653532608696e-05, "loss": 0.6713, "step": 6833 }, { "epoch": 1.069818409517846, "grad_norm": 2.3401682376861572, "learning_rate": 7.078464673913044e-05, "loss": 0.7364, "step": 6834 }, { "epoch": 1.0699749530369442, "grad_norm": 2.550170660018921, "learning_rate": 7.077275815217391e-05, "loss": 0.3758, "step": 6835 }, { "epoch": 1.0701314965560427, "grad_norm": 2.8093676567077637, "learning_rate": 7.076086956521739e-05, "loss": 1.2617, "step": 6836 }, { "epoch": 
1.070288040075141, "grad_norm": 6.320163249969482, "learning_rate": 7.074898097826087e-05, "loss": 0.4722, "step": 6837 }, { "epoch": 1.070444583594239, "grad_norm": 2.02006459236145, "learning_rate": 7.073709239130435e-05, "loss": 0.6068, "step": 6838 }, { "epoch": 1.0706011271133375, "grad_norm": 0.8496372699737549, "learning_rate": 7.072520380434783e-05, "loss": 0.3052, "step": 6839 }, { "epoch": 1.0707576706324358, "grad_norm": 0.410180926322937, "learning_rate": 7.07133152173913e-05, "loss": 0.1452, "step": 6840 }, { "epoch": 1.0709142141515342, "grad_norm": 0.8518435955047607, "learning_rate": 7.070142663043478e-05, "loss": 0.2703, "step": 6841 }, { "epoch": 1.0710707576706324, "grad_norm": 0.5784423351287842, "learning_rate": 7.068953804347826e-05, "loss": 0.1518, "step": 6842 }, { "epoch": 1.0712273011897309, "grad_norm": 0.8527161478996277, "learning_rate": 7.067764945652173e-05, "loss": 0.4562, "step": 6843 }, { "epoch": 1.071383844708829, "grad_norm": 0.5433830618858337, "learning_rate": 7.06657608695652e-05, "loss": 0.1771, "step": 6844 }, { "epoch": 1.0715403882279273, "grad_norm": 0.6588781476020813, "learning_rate": 7.065387228260868e-05, "loss": 0.2624, "step": 6845 }, { "epoch": 1.0716969317470257, "grad_norm": 0.6739845871925354, "learning_rate": 7.064198369565216e-05, "loss": 0.2517, "step": 6846 }, { "epoch": 1.071853475266124, "grad_norm": 0.8908132314682007, "learning_rate": 7.063009510869564e-05, "loss": 0.2256, "step": 6847 }, { "epoch": 1.0720100187852224, "grad_norm": 0.8792808055877686, "learning_rate": 7.061820652173912e-05, "loss": 0.2675, "step": 6848 }, { "epoch": 1.0721665623043206, "grad_norm": 0.6944761276245117, "learning_rate": 7.06063179347826e-05, "loss": 0.1678, "step": 6849 }, { "epoch": 1.0723231058234188, "grad_norm": 0.9996775984764099, "learning_rate": 7.059442934782607e-05, "loss": 0.3017, "step": 6850 }, { "epoch": 1.0724796493425173, "grad_norm": 1.1539695262908936, "learning_rate": 7.058254076086955e-05, "loss": 
0.3427, "step": 6851 }, { "epoch": 1.0726361928616155, "grad_norm": 0.8776004314422607, "learning_rate": 7.057065217391304e-05, "loss": 0.3911, "step": 6852 }, { "epoch": 1.072792736380714, "grad_norm": 2.304277181625366, "learning_rate": 7.055876358695652e-05, "loss": 0.3321, "step": 6853 }, { "epoch": 1.0729492798998121, "grad_norm": 1.6756871938705444, "learning_rate": 7.0546875e-05, "loss": 0.4462, "step": 6854 }, { "epoch": 1.0731058234189104, "grad_norm": 0.8535544276237488, "learning_rate": 7.053498641304348e-05, "loss": 0.3696, "step": 6855 }, { "epoch": 1.0732623669380088, "grad_norm": 1.570290446281433, "learning_rate": 7.052309782608696e-05, "loss": 0.5757, "step": 6856 }, { "epoch": 1.073418910457107, "grad_norm": 1.2395174503326416, "learning_rate": 7.051120923913043e-05, "loss": 0.376, "step": 6857 }, { "epoch": 1.0735754539762055, "grad_norm": 0.9213850498199463, "learning_rate": 7.049932065217391e-05, "loss": 0.3302, "step": 6858 }, { "epoch": 1.0737319974953037, "grad_norm": 1.0447638034820557, "learning_rate": 7.048743206521739e-05, "loss": 0.2662, "step": 6859 }, { "epoch": 1.073888541014402, "grad_norm": 1.8861572742462158, "learning_rate": 7.047554347826087e-05, "loss": 0.3007, "step": 6860 }, { "epoch": 1.0740450845335003, "grad_norm": 2.3748066425323486, "learning_rate": 7.046365489130435e-05, "loss": 0.4111, "step": 6861 }, { "epoch": 1.0742016280525986, "grad_norm": 2.1521494388580322, "learning_rate": 7.045176630434782e-05, "loss": 0.5365, "step": 6862 }, { "epoch": 1.074358171571697, "grad_norm": 2.2056937217712402, "learning_rate": 7.04398777173913e-05, "loss": 0.6766, "step": 6863 }, { "epoch": 1.0745147150907952, "grad_norm": 1.4351582527160645, "learning_rate": 7.042798913043478e-05, "loss": 0.3738, "step": 6864 }, { "epoch": 1.0746712586098937, "grad_norm": 4.139111042022705, "learning_rate": 7.041610054347826e-05, "loss": 0.7118, "step": 6865 }, { "epoch": 1.0748278021289919, "grad_norm": 1.7452930212020874, "learning_rate": 
7.040421195652172e-05, "loss": 0.3695, "step": 6866 }, { "epoch": 1.07498434564809, "grad_norm": 2.344233751296997, "learning_rate": 7.03923233695652e-05, "loss": 0.6241, "step": 6867 }, { "epoch": 1.0751408891671885, "grad_norm": 3.866716146469116, "learning_rate": 7.038043478260868e-05, "loss": 0.8656, "step": 6868 }, { "epoch": 1.0752974326862867, "grad_norm": 3.5903499126434326, "learning_rate": 7.036854619565216e-05, "loss": 0.5413, "step": 6869 }, { "epoch": 1.0754539762053852, "grad_norm": 3.284325122833252, "learning_rate": 7.035665760869564e-05, "loss": 0.7511, "step": 6870 }, { "epoch": 1.0756105197244834, "grad_norm": 1.5409493446350098, "learning_rate": 7.034476902173911e-05, "loss": 0.5139, "step": 6871 }, { "epoch": 1.0757670632435816, "grad_norm": 4.1582770347595215, "learning_rate": 7.03328804347826e-05, "loss": 0.9077, "step": 6872 }, { "epoch": 1.07592360676268, "grad_norm": 2.8311514854431152, "learning_rate": 7.032099184782608e-05, "loss": 1.1019, "step": 6873 }, { "epoch": 1.0760801502817783, "grad_norm": 2.3498809337615967, "learning_rate": 7.030910326086956e-05, "loss": 0.6201, "step": 6874 }, { "epoch": 1.0762366938008767, "grad_norm": 3.8238120079040527, "learning_rate": 7.029721467391304e-05, "loss": 0.8949, "step": 6875 }, { "epoch": 1.076393237319975, "grad_norm": 2.3376271724700928, "learning_rate": 7.028532608695652e-05, "loss": 0.5445, "step": 6876 }, { "epoch": 1.0765497808390734, "grad_norm": 4.082411766052246, "learning_rate": 7.02734375e-05, "loss": 1.5505, "step": 6877 }, { "epoch": 1.0767063243581716, "grad_norm": 4.3577165603637695, "learning_rate": 7.026154891304347e-05, "loss": 0.6094, "step": 6878 }, { "epoch": 1.0768628678772698, "grad_norm": 2.907994031906128, "learning_rate": 7.024966032608695e-05, "loss": 1.1514, "step": 6879 }, { "epoch": 1.0770194113963683, "grad_norm": 5.605425834655762, "learning_rate": 7.023777173913043e-05, "loss": 1.2486, "step": 6880 }, { "epoch": 1.0771759549154665, "grad_norm": 
6.361100196838379, "learning_rate": 7.022588315217391e-05, "loss": 1.0209, "step": 6881 }, { "epoch": 1.077332498434565, "grad_norm": 2.472179889678955, "learning_rate": 7.021399456521739e-05, "loss": 0.811, "step": 6882 }, { "epoch": 1.0774890419536631, "grad_norm": 3.578303098678589, "learning_rate": 7.020210597826087e-05, "loss": 0.5858, "step": 6883 }, { "epoch": 1.0776455854727613, "grad_norm": 4.364825248718262, "learning_rate": 7.019021739130434e-05, "loss": 1.0075, "step": 6884 }, { "epoch": 1.0778021289918598, "grad_norm": 5.38032865524292, "learning_rate": 7.017832880434782e-05, "loss": 0.3935, "step": 6885 }, { "epoch": 1.077958672510958, "grad_norm": 2.914609909057617, "learning_rate": 7.01664402173913e-05, "loss": 0.6716, "step": 6886 }, { "epoch": 1.0781152160300564, "grad_norm": 2.6489715576171875, "learning_rate": 7.015455163043478e-05, "loss": 0.5736, "step": 6887 }, { "epoch": 1.0782717595491547, "grad_norm": 2.729121446609497, "learning_rate": 7.014266304347826e-05, "loss": 0.7758, "step": 6888 }, { "epoch": 1.0784283030682529, "grad_norm": 0.4604628384113312, "learning_rate": 7.013077445652172e-05, "loss": 0.2246, "step": 6889 }, { "epoch": 1.0785848465873513, "grad_norm": 0.43380358815193176, "learning_rate": 7.01188858695652e-05, "loss": 0.195, "step": 6890 }, { "epoch": 1.0787413901064495, "grad_norm": 0.5019301176071167, "learning_rate": 7.010699728260868e-05, "loss": 0.1738, "step": 6891 }, { "epoch": 1.078897933625548, "grad_norm": 1.2891687154769897, "learning_rate": 7.009510869565217e-05, "loss": 0.2768, "step": 6892 }, { "epoch": 1.0790544771446462, "grad_norm": 0.5829753279685974, "learning_rate": 7.008322010869565e-05, "loss": 0.2012, "step": 6893 }, { "epoch": 1.0792110206637444, "grad_norm": 0.6717230677604675, "learning_rate": 7.007133152173912e-05, "loss": 0.2616, "step": 6894 }, { "epoch": 1.0793675641828429, "grad_norm": 1.2181594371795654, "learning_rate": 7.00594429347826e-05, "loss": 0.346, "step": 6895 }, { "epoch": 
1.079524107701941, "grad_norm": 0.7478859424591064, "learning_rate": 7.004755434782608e-05, "loss": 0.1745, "step": 6896 }, { "epoch": 1.0796806512210395, "grad_norm": 0.7985224723815918, "learning_rate": 7.003566576086956e-05, "loss": 0.2324, "step": 6897 }, { "epoch": 1.0798371947401377, "grad_norm": 0.7171376347541809, "learning_rate": 7.002377717391304e-05, "loss": 0.2624, "step": 6898 }, { "epoch": 1.0799937382592362, "grad_norm": 1.1062828302383423, "learning_rate": 7.001188858695652e-05, "loss": 0.2673, "step": 6899 }, { "epoch": 1.0801502817783344, "grad_norm": 0.8364916443824768, "learning_rate": 7e-05, "loss": 0.273, "step": 6900 }, { "epoch": 1.0803068252974326, "grad_norm": 1.0623760223388672, "learning_rate": 6.998811141304347e-05, "loss": 0.3466, "step": 6901 }, { "epoch": 1.080463368816531, "grad_norm": 1.4367897510528564, "learning_rate": 6.997622282608695e-05, "loss": 0.5902, "step": 6902 }, { "epoch": 1.0806199123356293, "grad_norm": 1.5382661819458008, "learning_rate": 6.996433423913043e-05, "loss": 0.3134, "step": 6903 }, { "epoch": 1.0807764558547277, "grad_norm": 0.6520094275474548, "learning_rate": 6.99524456521739e-05, "loss": 0.221, "step": 6904 }, { "epoch": 1.080932999373826, "grad_norm": 0.9320122003555298, "learning_rate": 6.994055706521738e-05, "loss": 0.2552, "step": 6905 }, { "epoch": 1.0810895428929241, "grad_norm": 0.9431401491165161, "learning_rate": 6.992866847826086e-05, "loss": 0.3188, "step": 6906 }, { "epoch": 1.0812460864120226, "grad_norm": 1.5343830585479736, "learning_rate": 6.991677989130434e-05, "loss": 0.4822, "step": 6907 }, { "epoch": 1.0814026299311208, "grad_norm": 1.2231159210205078, "learning_rate": 6.990489130434782e-05, "loss": 0.2949, "step": 6908 }, { "epoch": 1.0815591734502192, "grad_norm": 1.919303059577942, "learning_rate": 6.98930027173913e-05, "loss": 0.3831, "step": 6909 }, { "epoch": 1.0817157169693175, "grad_norm": 1.8407715559005737, "learning_rate": 6.988111413043478e-05, "loss": 0.6337, "step": 
6910 }, { "epoch": 1.081872260488416, "grad_norm": 2.7307770252227783, "learning_rate": 6.986922554347825e-05, "loss": 0.9585, "step": 6911 }, { "epoch": 1.0820288040075141, "grad_norm": 2.3501815795898438, "learning_rate": 6.985733695652173e-05, "loss": 0.8763, "step": 6912 }, { "epoch": 1.0821853475266123, "grad_norm": 1.2494398355484009, "learning_rate": 6.984544836956521e-05, "loss": 0.6367, "step": 6913 }, { "epoch": 1.0823418910457108, "grad_norm": 1.7012557983398438, "learning_rate": 6.983355978260869e-05, "loss": 0.5319, "step": 6914 }, { "epoch": 1.082498434564809, "grad_norm": 6.114250183105469, "learning_rate": 6.982167119565217e-05, "loss": 0.8392, "step": 6915 }, { "epoch": 1.0826549780839074, "grad_norm": 2.5270793437957764, "learning_rate": 6.980978260869564e-05, "loss": 0.7361, "step": 6916 }, { "epoch": 1.0828115216030056, "grad_norm": 3.9582738876342773, "learning_rate": 6.979789402173912e-05, "loss": 0.9542, "step": 6917 }, { "epoch": 1.0829680651221039, "grad_norm": 1.9782977104187012, "learning_rate": 6.97860054347826e-05, "loss": 1.1699, "step": 6918 }, { "epoch": 1.0831246086412023, "grad_norm": 3.058692216873169, "learning_rate": 6.977411684782608e-05, "loss": 0.6688, "step": 6919 }, { "epoch": 1.0832811521603005, "grad_norm": 2.74263858795166, "learning_rate": 6.976222826086956e-05, "loss": 0.9369, "step": 6920 }, { "epoch": 1.083437695679399, "grad_norm": 2.8544299602508545, "learning_rate": 6.975033967391304e-05, "loss": 0.9864, "step": 6921 }, { "epoch": 1.0835942391984972, "grad_norm": 3.3317909240722656, "learning_rate": 6.973845108695651e-05, "loss": 0.8292, "step": 6922 }, { "epoch": 1.0837507827175954, "grad_norm": 2.494579553604126, "learning_rate": 6.972656249999999e-05, "loss": 0.9883, "step": 6923 }, { "epoch": 1.0839073262366938, "grad_norm": 3.1076154708862305, "learning_rate": 6.971467391304347e-05, "loss": 0.908, "step": 6924 }, { "epoch": 1.084063869755792, "grad_norm": 6.823611736297607, "learning_rate": 
6.970278532608695e-05, "loss": 1.2047, "step": 6925 }, { "epoch": 1.0842204132748905, "grad_norm": 6.038473606109619, "learning_rate": 6.969089673913043e-05, "loss": 0.8685, "step": 6926 }, { "epoch": 1.0843769567939887, "grad_norm": 3.092426300048828, "learning_rate": 6.96790081521739e-05, "loss": 0.9502, "step": 6927 }, { "epoch": 1.084533500313087, "grad_norm": 5.738861083984375, "learning_rate": 6.966711956521738e-05, "loss": 1.0795, "step": 6928 }, { "epoch": 1.0846900438321854, "grad_norm": 3.9953479766845703, "learning_rate": 6.965523097826086e-05, "loss": 1.1771, "step": 6929 }, { "epoch": 1.0848465873512836, "grad_norm": 3.54019832611084, "learning_rate": 6.964334239130434e-05, "loss": 1.0738, "step": 6930 }, { "epoch": 1.085003130870382, "grad_norm": 3.4735052585601807, "learning_rate": 6.963145380434782e-05, "loss": 1.4871, "step": 6931 }, { "epoch": 1.0851596743894802, "grad_norm": 2.9564907550811768, "learning_rate": 6.96195652173913e-05, "loss": 1.4669, "step": 6932 }, { "epoch": 1.0853162179085787, "grad_norm": 3.895662784576416, "learning_rate": 6.960767663043477e-05, "loss": 1.5239, "step": 6933 }, { "epoch": 1.085472761427677, "grad_norm": 1.2577346563339233, "learning_rate": 6.959578804347825e-05, "loss": 0.2232, "step": 6934 }, { "epoch": 1.0856293049467751, "grad_norm": 2.0775082111358643, "learning_rate": 6.958389945652173e-05, "loss": 1.0367, "step": 6935 }, { "epoch": 1.0857858484658736, "grad_norm": 6.599319934844971, "learning_rate": 6.957201086956521e-05, "loss": 0.9333, "step": 6936 }, { "epoch": 1.0859423919849718, "grad_norm": 2.7362372875213623, "learning_rate": 6.956012228260869e-05, "loss": 0.6312, "step": 6937 }, { "epoch": 1.0860989355040702, "grad_norm": 4.639647483825684, "learning_rate": 6.954823369565216e-05, "loss": 1.0843, "step": 6938 }, { "epoch": 1.0862554790231684, "grad_norm": 0.3623231053352356, "learning_rate": 6.953634510869566e-05, "loss": 0.2107, "step": 6939 }, { "epoch": 1.0864120225422667, "grad_norm": 
0.42492246627807617, "learning_rate": 6.952445652173913e-05, "loss": 0.208, "step": 6940 }, { "epoch": 1.086568566061365, "grad_norm": 0.4546562731266022, "learning_rate": 6.95125679347826e-05, "loss": 0.2231, "step": 6941 }, { "epoch": 1.0867251095804633, "grad_norm": 0.9410432577133179, "learning_rate": 6.950067934782608e-05, "loss": 0.1949, "step": 6942 }, { "epoch": 1.0868816530995618, "grad_norm": 0.8752755522727966, "learning_rate": 6.948879076086955e-05, "loss": 0.2807, "step": 6943 }, { "epoch": 1.08703819661866, "grad_norm": 0.7666401863098145, "learning_rate": 6.947690217391303e-05, "loss": 0.2352, "step": 6944 }, { "epoch": 1.0871947401377584, "grad_norm": 0.6589954495429993, "learning_rate": 6.946501358695651e-05, "loss": 0.1904, "step": 6945 }, { "epoch": 1.0873512836568566, "grad_norm": 1.0952553749084473, "learning_rate": 6.945312499999999e-05, "loss": 0.4093, "step": 6946 }, { "epoch": 1.0875078271759548, "grad_norm": 0.676835298538208, "learning_rate": 6.944123641304347e-05, "loss": 0.2506, "step": 6947 }, { "epoch": 1.0876643706950533, "grad_norm": 0.7468671798706055, "learning_rate": 6.942934782608695e-05, "loss": 0.2431, "step": 6948 }, { "epoch": 1.0878209142141515, "grad_norm": 0.8175029754638672, "learning_rate": 6.941745923913044e-05, "loss": 0.2943, "step": 6949 }, { "epoch": 1.08797745773325, "grad_norm": 1.2426763772964478, "learning_rate": 6.940557065217392e-05, "loss": 0.3161, "step": 6950 }, { "epoch": 1.0881340012523482, "grad_norm": 0.9814749360084534, "learning_rate": 6.93936820652174e-05, "loss": 0.3297, "step": 6951 }, { "epoch": 1.0882905447714464, "grad_norm": 0.8811530470848083, "learning_rate": 6.938179347826086e-05, "loss": 0.2555, "step": 6952 }, { "epoch": 1.0884470882905448, "grad_norm": 0.9403009414672852, "learning_rate": 6.936990489130434e-05, "loss": 0.3454, "step": 6953 }, { "epoch": 1.088603631809643, "grad_norm": 1.2492094039916992, "learning_rate": 6.935801630434781e-05, "loss": 0.2651, "step": 6954 }, { "epoch": 
1.0887601753287415, "grad_norm": 1.2951079607009888, "learning_rate": 6.934612771739129e-05, "loss": 0.4003, "step": 6955 }, { "epoch": 1.0889167188478397, "grad_norm": 1.5389074087142944, "learning_rate": 6.933423913043477e-05, "loss": 0.3419, "step": 6956 }, { "epoch": 1.089073262366938, "grad_norm": 1.5660606622695923, "learning_rate": 6.932235054347825e-05, "loss": 0.5979, "step": 6957 }, { "epoch": 1.0892298058860364, "grad_norm": 1.6922165155410767, "learning_rate": 6.931046195652173e-05, "loss": 0.4784, "step": 6958 }, { "epoch": 1.0893863494051346, "grad_norm": 1.215303659439087, "learning_rate": 6.929857336956522e-05, "loss": 0.6302, "step": 6959 }, { "epoch": 1.089542892924233, "grad_norm": 1.8138667345046997, "learning_rate": 6.92866847826087e-05, "loss": 0.6542, "step": 6960 }, { "epoch": 1.0896994364433312, "grad_norm": 1.7557185888290405, "learning_rate": 6.927479619565217e-05, "loss": 0.4036, "step": 6961 }, { "epoch": 1.0898559799624294, "grad_norm": 1.804795265197754, "learning_rate": 6.926290760869565e-05, "loss": 0.4645, "step": 6962 }, { "epoch": 1.0900125234815279, "grad_norm": 6.502444267272949, "learning_rate": 6.925101902173913e-05, "loss": 0.728, "step": 6963 }, { "epoch": 1.090169067000626, "grad_norm": 3.039884328842163, "learning_rate": 6.92391304347826e-05, "loss": 0.5475, "step": 6964 }, { "epoch": 1.0903256105197245, "grad_norm": 1.685293197631836, "learning_rate": 6.922724184782607e-05, "loss": 0.6751, "step": 6965 }, { "epoch": 1.0904821540388228, "grad_norm": 1.8663785457611084, "learning_rate": 6.921535326086955e-05, "loss": 0.4296, "step": 6966 }, { "epoch": 1.0906386975579212, "grad_norm": 1.7505784034729004, "learning_rate": 6.920346467391303e-05, "loss": 0.3148, "step": 6967 }, { "epoch": 1.0907952410770194, "grad_norm": 2.9449806213378906, "learning_rate": 6.919157608695651e-05, "loss": 0.909, "step": 6968 }, { "epoch": 1.0909517845961176, "grad_norm": 1.711647629737854, "learning_rate": 6.91796875e-05, "loss": 0.6998, 
"step": 6969 }, { "epoch": 1.091108328115216, "grad_norm": 2.342890977859497, "learning_rate": 6.916779891304348e-05, "loss": 0.5713, "step": 6970 }, { "epoch": 1.0912648716343143, "grad_norm": 3.9850456714630127, "learning_rate": 6.915591032608696e-05, "loss": 0.8097, "step": 6971 }, { "epoch": 1.0914214151534127, "grad_norm": 4.002450942993164, "learning_rate": 6.914402173913043e-05, "loss": 0.8752, "step": 6972 }, { "epoch": 1.091577958672511, "grad_norm": 5.988186836242676, "learning_rate": 6.913213315217391e-05, "loss": 0.9577, "step": 6973 }, { "epoch": 1.0917345021916092, "grad_norm": 1.863346815109253, "learning_rate": 6.912024456521739e-05, "loss": 0.5705, "step": 6974 }, { "epoch": 1.0918910457107076, "grad_norm": 2.5063953399658203, "learning_rate": 6.910835597826086e-05, "loss": 1.1131, "step": 6975 }, { "epoch": 1.0920475892298058, "grad_norm": 4.264064788818359, "learning_rate": 6.909646739130433e-05, "loss": 1.4755, "step": 6976 }, { "epoch": 1.0922041327489043, "grad_norm": 2.086885690689087, "learning_rate": 6.908457880434781e-05, "loss": 0.6961, "step": 6977 }, { "epoch": 1.0923606762680025, "grad_norm": 5.720505237579346, "learning_rate": 6.907269021739129e-05, "loss": 1.1393, "step": 6978 }, { "epoch": 1.092517219787101, "grad_norm": 2.5626511573791504, "learning_rate": 6.906080163043478e-05, "loss": 0.7133, "step": 6979 }, { "epoch": 1.0926737633061991, "grad_norm": 3.8095879554748535, "learning_rate": 6.904891304347826e-05, "loss": 1.3644, "step": 6980 }, { "epoch": 1.0928303068252974, "grad_norm": 2.322049617767334, "learning_rate": 6.903702445652174e-05, "loss": 1.0174, "step": 6981 }, { "epoch": 1.0929868503443958, "grad_norm": 3.6272060871124268, "learning_rate": 6.902513586956522e-05, "loss": 1.0648, "step": 6982 }, { "epoch": 1.093143393863494, "grad_norm": 2.0485658645629883, "learning_rate": 6.90132472826087e-05, "loss": 1.0269, "step": 6983 }, { "epoch": 1.0932999373825925, "grad_norm": 4.239789962768555, "learning_rate": 
6.900135869565217e-05, "loss": 1.0788, "step": 6984 }, { "epoch": 1.0934564809016907, "grad_norm": 1.8695679903030396, "learning_rate": 6.898947010869565e-05, "loss": 0.8805, "step": 6985 }, { "epoch": 1.093613024420789, "grad_norm": 1.407981514930725, "learning_rate": 6.897758152173913e-05, "loss": 0.51, "step": 6986 }, { "epoch": 1.0937695679398873, "grad_norm": 1.349907398223877, "learning_rate": 6.89656929347826e-05, "loss": 0.4146, "step": 6987 }, { "epoch": 1.0939261114589856, "grad_norm": 3.8788111209869385, "learning_rate": 6.895380434782607e-05, "loss": 0.8199, "step": 6988 }, { "epoch": 1.094082654978084, "grad_norm": 0.4054253399372101, "learning_rate": 6.894191576086956e-05, "loss": 0.2723, "step": 6989 }, { "epoch": 1.0942391984971822, "grad_norm": 0.7108949422836304, "learning_rate": 6.893002717391304e-05, "loss": 0.1674, "step": 6990 }, { "epoch": 1.0943957420162804, "grad_norm": 0.4834432899951935, "learning_rate": 6.891813858695652e-05, "loss": 0.2149, "step": 6991 }, { "epoch": 1.0945522855353789, "grad_norm": 0.6015425324440002, "learning_rate": 6.890625e-05, "loss": 0.272, "step": 6992 }, { "epoch": 1.094708829054477, "grad_norm": 0.7071064710617065, "learning_rate": 6.889436141304348e-05, "loss": 0.2645, "step": 6993 }, { "epoch": 1.0948653725735755, "grad_norm": 1.538178563117981, "learning_rate": 6.888247282608695e-05, "loss": 0.2729, "step": 6994 }, { "epoch": 1.0950219160926737, "grad_norm": 0.9366376996040344, "learning_rate": 6.887058423913043e-05, "loss": 0.1921, "step": 6995 }, { "epoch": 1.095178459611772, "grad_norm": 1.0217678546905518, "learning_rate": 6.885869565217391e-05, "loss": 0.2123, "step": 6996 }, { "epoch": 1.0953350031308704, "grad_norm": 0.8424039483070374, "learning_rate": 6.884680706521739e-05, "loss": 0.2828, "step": 6997 }, { "epoch": 1.0954915466499686, "grad_norm": 0.90024733543396, "learning_rate": 6.883491847826085e-05, "loss": 0.2307, "step": 6998 }, { "epoch": 1.095648090169067, "grad_norm": 0.7072389721870422, 
"learning_rate": 6.882302989130434e-05, "loss": 0.3114, "step": 6999 }, { "epoch": 1.0958046336881653, "grad_norm": 0.803411066532135, "learning_rate": 6.881114130434782e-05, "loss": 0.2646, "step": 7000 }, { "epoch": 1.0958046336881653, "eval_loss": 0.5215813517570496, "eval_runtime": 205.1416, "eval_samples_per_second": 60.363, "eval_steps_per_second": 3.773, "eval_wer": 0.3294090145993157, "step": 7000 }, { "epoch": 1.0959611772072637, "grad_norm": 0.8853934407234192, "learning_rate": 6.87992527173913e-05, "loss": 0.3307, "step": 7001 }, { "epoch": 1.096117720726362, "grad_norm": 1.4964919090270996, "learning_rate": 6.878736413043478e-05, "loss": 0.4241, "step": 7002 }, { "epoch": 1.0962742642454602, "grad_norm": 4.507322788238525, "learning_rate": 6.877547554347826e-05, "loss": 0.7333, "step": 7003 }, { "epoch": 1.0964308077645586, "grad_norm": 1.6973412036895752, "learning_rate": 6.876358695652174e-05, "loss": 0.3092, "step": 7004 }, { "epoch": 1.0965873512836568, "grad_norm": 3.122850179672241, "learning_rate": 6.875169836956521e-05, "loss": 0.639, "step": 7005 }, { "epoch": 1.0967438948027552, "grad_norm": 1.265026330947876, "learning_rate": 6.873980978260869e-05, "loss": 0.3706, "step": 7006 }, { "epoch": 1.0969004383218535, "grad_norm": 1.0781527757644653, "learning_rate": 6.872792119565217e-05, "loss": 0.4565, "step": 7007 }, { "epoch": 1.0970569818409517, "grad_norm": 1.0059528350830078, "learning_rate": 6.871603260869565e-05, "loss": 0.3123, "step": 7008 }, { "epoch": 1.0972135253600501, "grad_norm": 2.6335248947143555, "learning_rate": 6.870414402173913e-05, "loss": 0.4536, "step": 7009 }, { "epoch": 1.0973700688791483, "grad_norm": 3.8736624717712402, "learning_rate": 6.86922554347826e-05, "loss": 0.7389, "step": 7010 }, { "epoch": 1.0975266123982468, "grad_norm": 2.9472780227661133, "learning_rate": 6.868036684782608e-05, "loss": 0.5428, "step": 7011 }, { "epoch": 1.097683155917345, "grad_norm": 4.217750072479248, "learning_rate": 
6.866847826086956e-05, "loss": 0.8684, "step": 7012 }, { "epoch": 1.0978396994364434, "grad_norm": 1.1717655658721924, "learning_rate": 6.865658967391304e-05, "loss": 0.2878, "step": 7013 }, { "epoch": 1.0979962429555417, "grad_norm": 2.0427722930908203, "learning_rate": 6.864470108695652e-05, "loss": 0.5905, "step": 7014 }, { "epoch": 1.0981527864746399, "grad_norm": 2.4837188720703125, "learning_rate": 6.86328125e-05, "loss": 0.7186, "step": 7015 }, { "epoch": 1.0983093299937383, "grad_norm": 3.22302508354187, "learning_rate": 6.862092391304347e-05, "loss": 0.4548, "step": 7016 }, { "epoch": 1.0984658735128365, "grad_norm": 2.1882426738739014, "learning_rate": 6.860903532608695e-05, "loss": 0.3669, "step": 7017 }, { "epoch": 1.098622417031935, "grad_norm": 1.9030394554138184, "learning_rate": 6.859714673913043e-05, "loss": 0.6014, "step": 7018 }, { "epoch": 1.0987789605510332, "grad_norm": 1.831048846244812, "learning_rate": 6.858525815217391e-05, "loss": 0.306, "step": 7019 }, { "epoch": 1.0989355040701314, "grad_norm": 3.8712666034698486, "learning_rate": 6.857336956521739e-05, "loss": 0.746, "step": 7020 }, { "epoch": 1.0990920475892298, "grad_norm": 2.0850110054016113, "learning_rate": 6.856148097826086e-05, "loss": 0.8881, "step": 7021 }, { "epoch": 1.099248591108328, "grad_norm": 2.95853853225708, "learning_rate": 6.854959239130434e-05, "loss": 0.6072, "step": 7022 }, { "epoch": 1.0994051346274265, "grad_norm": 2.419833183288574, "learning_rate": 6.853770380434782e-05, "loss": 0.715, "step": 7023 }, { "epoch": 1.0995616781465247, "grad_norm": 1.3920847177505493, "learning_rate": 6.85258152173913e-05, "loss": 0.3829, "step": 7024 }, { "epoch": 1.099718221665623, "grad_norm": 3.10505747795105, "learning_rate": 6.851392663043478e-05, "loss": 0.7742, "step": 7025 }, { "epoch": 1.0998747651847214, "grad_norm": 2.489821672439575, "learning_rate": 6.850203804347826e-05, "loss": 1.0541, "step": 7026 }, { "epoch": 1.1000313087038196, "grad_norm": 6.303160667419434, 
"learning_rate": 6.849014945652173e-05, "loss": 1.0787, "step": 7027 }, { "epoch": 1.100187852222918, "grad_norm": 2.854536294937134, "learning_rate": 6.847826086956521e-05, "loss": 0.9025, "step": 7028 }, { "epoch": 1.1003443957420163, "grad_norm": 3.5457942485809326, "learning_rate": 6.846637228260869e-05, "loss": 1.1945, "step": 7029 }, { "epoch": 1.1005009392611145, "grad_norm": 4.457611560821533, "learning_rate": 6.845448369565217e-05, "loss": 1.7158, "step": 7030 }, { "epoch": 1.100657482780213, "grad_norm": 4.104443073272705, "learning_rate": 6.844259510869565e-05, "loss": 1.2405, "step": 7031 }, { "epoch": 1.1008140262993111, "grad_norm": 2.9390945434570312, "learning_rate": 6.843070652173912e-05, "loss": 1.486, "step": 7032 }, { "epoch": 1.1009705698184096, "grad_norm": 2.586413621902466, "learning_rate": 6.84188179347826e-05, "loss": 0.92, "step": 7033 }, { "epoch": 1.1011271133375078, "grad_norm": 2.880575656890869, "learning_rate": 6.840692934782608e-05, "loss": 0.5706, "step": 7034 }, { "epoch": 1.1012836568566062, "grad_norm": 2.4178457260131836, "learning_rate": 6.839504076086956e-05, "loss": 0.7681, "step": 7035 }, { "epoch": 1.1014402003757044, "grad_norm": 5.115987300872803, "learning_rate": 6.838315217391304e-05, "loss": 0.6602, "step": 7036 }, { "epoch": 1.1015967438948027, "grad_norm": 6.864778518676758, "learning_rate": 6.837126358695651e-05, "loss": 1.4685, "step": 7037 }, { "epoch": 1.101753287413901, "grad_norm": 2.921862840652466, "learning_rate": 6.835937499999999e-05, "loss": 1.1695, "step": 7038 }, { "epoch": 1.1019098309329993, "grad_norm": 0.5533789396286011, "learning_rate": 6.834748641304347e-05, "loss": 0.2214, "step": 7039 }, { "epoch": 1.1020663744520978, "grad_norm": 0.34108278155326843, "learning_rate": 6.833559782608695e-05, "loss": 0.1644, "step": 7040 }, { "epoch": 1.102222917971196, "grad_norm": 0.45392435789108276, "learning_rate": 6.832370923913043e-05, "loss": 0.1906, "step": 7041 }, { "epoch": 1.1023794614902942, 
"grad_norm": 0.6409109830856323, "learning_rate": 6.83118206521739e-05, "loss": 0.219, "step": 7042 }, { "epoch": 1.1025360050093926, "grad_norm": 1.0520700216293335, "learning_rate": 6.829993206521738e-05, "loss": 0.4656, "step": 7043 }, { "epoch": 1.1026925485284909, "grad_norm": 0.6128873825073242, "learning_rate": 6.828804347826086e-05, "loss": 0.3167, "step": 7044 }, { "epoch": 1.1028490920475893, "grad_norm": 1.1852487325668335, "learning_rate": 6.827615489130434e-05, "loss": 0.3378, "step": 7045 }, { "epoch": 1.1030056355666875, "grad_norm": 0.8927292227745056, "learning_rate": 6.826426630434782e-05, "loss": 0.3282, "step": 7046 }, { "epoch": 1.103162179085786, "grad_norm": 0.8148831129074097, "learning_rate": 6.82523777173913e-05, "loss": 0.1637, "step": 7047 }, { "epoch": 1.1033187226048842, "grad_norm": 1.4134552478790283, "learning_rate": 6.824048913043477e-05, "loss": 0.3622, "step": 7048 }, { "epoch": 1.1034752661239824, "grad_norm": 1.6063189506530762, "learning_rate": 6.822860054347825e-05, "loss": 0.351, "step": 7049 }, { "epoch": 1.1036318096430808, "grad_norm": 2.0655741691589355, "learning_rate": 6.821671195652173e-05, "loss": 0.6489, "step": 7050 }, { "epoch": 1.103788353162179, "grad_norm": 1.2007325887680054, "learning_rate": 6.820482336956521e-05, "loss": 0.3573, "step": 7051 }, { "epoch": 1.1039448966812775, "grad_norm": 0.6590083837509155, "learning_rate": 6.819293478260869e-05, "loss": 0.3138, "step": 7052 }, { "epoch": 1.1041014402003757, "grad_norm": 0.7847480177879333, "learning_rate": 6.818104619565217e-05, "loss": 0.3014, "step": 7053 }, { "epoch": 1.104257983719474, "grad_norm": 1.5840784311294556, "learning_rate": 6.816915760869564e-05, "loss": 0.2114, "step": 7054 }, { "epoch": 1.1044145272385724, "grad_norm": 3.050092935562134, "learning_rate": 6.815726902173912e-05, "loss": 0.8216, "step": 7055 }, { "epoch": 1.1045710707576706, "grad_norm": 0.9905421733856201, "learning_rate": 6.81453804347826e-05, "loss": 0.2632, "step": 7056 }, 
{ "epoch": 1.104727614276769, "grad_norm": 1.5077053308486938, "learning_rate": 6.813349184782608e-05, "loss": 0.4157, "step": 7057 }, { "epoch": 1.1048841577958672, "grad_norm": 1.2804769277572632, "learning_rate": 6.812160326086956e-05, "loss": 0.4395, "step": 7058 }, { "epoch": 1.1050407013149655, "grad_norm": 1.2429428100585938, "learning_rate": 6.810971467391303e-05, "loss": 0.4506, "step": 7059 }, { "epoch": 1.105197244834064, "grad_norm": 0.8043637871742249, "learning_rate": 6.809782608695651e-05, "loss": 0.2874, "step": 7060 }, { "epoch": 1.1053537883531621, "grad_norm": 2.0967705249786377, "learning_rate": 6.808593749999999e-05, "loss": 0.7651, "step": 7061 }, { "epoch": 1.1055103318722606, "grad_norm": 1.6419777870178223, "learning_rate": 6.807404891304347e-05, "loss": 0.3935, "step": 7062 }, { "epoch": 1.1056668753913588, "grad_norm": 1.5563385486602783, "learning_rate": 6.806216032608695e-05, "loss": 0.384, "step": 7063 }, { "epoch": 1.105823418910457, "grad_norm": 3.0951173305511475, "learning_rate": 6.805027173913043e-05, "loss": 0.4278, "step": 7064 }, { "epoch": 1.1059799624295554, "grad_norm": 1.0444039106369019, "learning_rate": 6.80383831521739e-05, "loss": 0.3594, "step": 7065 }, { "epoch": 1.1061365059486536, "grad_norm": 1.2262977361679077, "learning_rate": 6.80264945652174e-05, "loss": 0.4039, "step": 7066 }, { "epoch": 1.106293049467752, "grad_norm": 3.753011703491211, "learning_rate": 6.801460597826086e-05, "loss": 0.3837, "step": 7067 }, { "epoch": 1.1064495929868503, "grad_norm": 2.6269149780273438, "learning_rate": 6.800271739130434e-05, "loss": 0.4926, "step": 7068 }, { "epoch": 1.1066061365059487, "grad_norm": 2.313959836959839, "learning_rate": 6.799082880434782e-05, "loss": 0.6906, "step": 7069 }, { "epoch": 1.106762680025047, "grad_norm": 1.9340176582336426, "learning_rate": 6.79789402173913e-05, "loss": 0.6259, "step": 7070 }, { "epoch": 1.1069192235441452, "grad_norm": 2.5382258892059326, "learning_rate": 6.796705163043477e-05, 
"loss": 0.7239, "step": 7071 }, { "epoch": 1.1070757670632436, "grad_norm": 2.2338552474975586, "learning_rate": 6.795516304347825e-05, "loss": 0.9922, "step": 7072 }, { "epoch": 1.1072323105823418, "grad_norm": 3.501128911972046, "learning_rate": 6.794327445652173e-05, "loss": 0.9232, "step": 7073 }, { "epoch": 1.1073888541014403, "grad_norm": 5.252306938171387, "learning_rate": 6.79313858695652e-05, "loss": 0.9087, "step": 7074 }, { "epoch": 1.1075453976205385, "grad_norm": 3.2309017181396484, "learning_rate": 6.791949728260868e-05, "loss": 1.1288, "step": 7075 }, { "epoch": 1.107701941139637, "grad_norm": 2.503234386444092, "learning_rate": 6.790760869565218e-05, "loss": 0.6736, "step": 7076 }, { "epoch": 1.1078584846587352, "grad_norm": 3.129061460494995, "learning_rate": 6.789572010869565e-05, "loss": 1.0254, "step": 7077 }, { "epoch": 1.1080150281778334, "grad_norm": 7.795956611633301, "learning_rate": 6.788383152173913e-05, "loss": 1.206, "step": 7078 }, { "epoch": 1.1081715716969318, "grad_norm": 1.209658145904541, "learning_rate": 6.78719429347826e-05, "loss": 0.5926, "step": 7079 }, { "epoch": 1.10832811521603, "grad_norm": 3.4029994010925293, "learning_rate": 6.786005434782608e-05, "loss": 0.5214, "step": 7080 }, { "epoch": 1.1084846587351285, "grad_norm": 2.411487102508545, "learning_rate": 6.784816576086955e-05, "loss": 0.9289, "step": 7081 }, { "epoch": 1.1086412022542267, "grad_norm": 3.3024051189422607, "learning_rate": 6.783627717391303e-05, "loss": 1.0343, "step": 7082 }, { "epoch": 1.108797745773325, "grad_norm": 3.0771424770355225, "learning_rate": 6.782438858695651e-05, "loss": 1.3138, "step": 7083 }, { "epoch": 1.1089542892924233, "grad_norm": 3.490532398223877, "learning_rate": 6.781249999999999e-05, "loss": 1.1527, "step": 7084 }, { "epoch": 1.1091108328115216, "grad_norm": 3.1182477474212646, "learning_rate": 6.780061141304347e-05, "loss": 0.684, "step": 7085 }, { "epoch": 1.10926737633062, "grad_norm": 3.782221794128418, "learning_rate": 
6.778872282608696e-05, "loss": 0.6956, "step": 7086 }, { "epoch": 1.1094239198497182, "grad_norm": 2.978675127029419, "learning_rate": 6.777683423913044e-05, "loss": 1.2216, "step": 7087 }, { "epoch": 1.1095804633688164, "grad_norm": 1.8613024950027466, "learning_rate": 6.776494565217391e-05, "loss": 0.6532, "step": 7088 }, { "epoch": 1.1097370068879149, "grad_norm": 0.41223862767219543, "learning_rate": 6.775305706521739e-05, "loss": 0.1686, "step": 7089 }, { "epoch": 1.109893550407013, "grad_norm": 0.40649768710136414, "learning_rate": 6.774116847826086e-05, "loss": 0.1415, "step": 7090 }, { "epoch": 1.1100500939261115, "grad_norm": 0.6153949499130249, "learning_rate": 6.772927989130434e-05, "loss": 0.1934, "step": 7091 }, { "epoch": 1.1102066374452098, "grad_norm": 0.40550896525382996, "learning_rate": 6.771739130434781e-05, "loss": 0.1477, "step": 7092 }, { "epoch": 1.110363180964308, "grad_norm": 0.7406269907951355, "learning_rate": 6.770550271739129e-05, "loss": 0.1792, "step": 7093 }, { "epoch": 1.1105197244834064, "grad_norm": 1.0589624643325806, "learning_rate": 6.769361413043477e-05, "loss": 0.2986, "step": 7094 }, { "epoch": 1.1106762680025046, "grad_norm": 0.7856292128562927, "learning_rate": 6.768172554347825e-05, "loss": 0.326, "step": 7095 }, { "epoch": 1.110832811521603, "grad_norm": 0.6881037354469299, "learning_rate": 6.766983695652174e-05, "loss": 0.2823, "step": 7096 }, { "epoch": 1.1109893550407013, "grad_norm": 0.8114328980445862, "learning_rate": 6.765794836956522e-05, "loss": 0.2582, "step": 7097 }, { "epoch": 1.1111458985597997, "grad_norm": 1.015999436378479, "learning_rate": 6.76460597826087e-05, "loss": 0.3206, "step": 7098 }, { "epoch": 1.111302442078898, "grad_norm": 0.6613655686378479, "learning_rate": 6.763417119565217e-05, "loss": 0.2486, "step": 7099 }, { "epoch": 1.1114589855979962, "grad_norm": 0.672130286693573, "learning_rate": 6.762228260869565e-05, "loss": 0.2327, "step": 7100 }, { "epoch": 1.1116155291170946, "grad_norm": 
0.7996574640274048, "learning_rate": 6.761039402173913e-05, "loss": 0.3086, "step": 7101 }, { "epoch": 1.1117720726361928, "grad_norm": 1.7597891092300415, "learning_rate": 6.75985054347826e-05, "loss": 0.4537, "step": 7102 }, { "epoch": 1.1119286161552913, "grad_norm": 2.8927395343780518, "learning_rate": 6.758661684782607e-05, "loss": 0.4165, "step": 7103 }, { "epoch": 1.1120851596743895, "grad_norm": 1.327236294746399, "learning_rate": 6.757472826086955e-05, "loss": 0.4915, "step": 7104 }, { "epoch": 1.1122417031934877, "grad_norm": 2.1421878337860107, "learning_rate": 6.756283967391304e-05, "loss": 0.4288, "step": 7105 }, { "epoch": 1.1123982467125861, "grad_norm": 1.8084936141967773, "learning_rate": 6.755095108695652e-05, "loss": 0.4924, "step": 7106 }, { "epoch": 1.1125547902316844, "grad_norm": 1.3511940240859985, "learning_rate": 6.75390625e-05, "loss": 0.4159, "step": 7107 }, { "epoch": 1.1127113337507828, "grad_norm": 2.5567023754119873, "learning_rate": 6.752717391304348e-05, "loss": 0.7348, "step": 7108 }, { "epoch": 1.112867877269881, "grad_norm": 1.1728402376174927, "learning_rate": 6.751528532608696e-05, "loss": 0.2945, "step": 7109 }, { "epoch": 1.1130244207889795, "grad_norm": 1.571470022201538, "learning_rate": 6.750339673913043e-05, "loss": 0.3899, "step": 7110 }, { "epoch": 1.1131809643080777, "grad_norm": 2.089855670928955, "learning_rate": 6.749150815217391e-05, "loss": 0.6828, "step": 7111 }, { "epoch": 1.1133375078271759, "grad_norm": 1.8497016429901123, "learning_rate": 6.747961956521739e-05, "loss": 0.4724, "step": 7112 }, { "epoch": 1.1134940513462743, "grad_norm": 1.9942258596420288, "learning_rate": 6.746773097826085e-05, "loss": 0.4891, "step": 7113 }, { "epoch": 1.1136505948653725, "grad_norm": 1.2494137287139893, "learning_rate": 6.745584239130433e-05, "loss": 0.4556, "step": 7114 }, { "epoch": 1.113807138384471, "grad_norm": 1.4381778240203857, "learning_rate": 6.744395380434782e-05, "loss": 0.4614, "step": 7115 }, { "epoch": 
1.1139636819035692, "grad_norm": 2.0031867027282715, "learning_rate": 6.74320652173913e-05, "loss": 0.5172, "step": 7116 }, { "epoch": 1.1141202254226674, "grad_norm": 2.5972137451171875, "learning_rate": 6.742017663043478e-05, "loss": 0.5911, "step": 7117 }, { "epoch": 1.1142767689417659, "grad_norm": 2.3142037391662598, "learning_rate": 6.740828804347826e-05, "loss": 0.7714, "step": 7118 }, { "epoch": 1.114433312460864, "grad_norm": 2.073540210723877, "learning_rate": 6.739639945652174e-05, "loss": 1.0284, "step": 7119 }, { "epoch": 1.1145898559799625, "grad_norm": 1.7937099933624268, "learning_rate": 6.738451086956522e-05, "loss": 0.9321, "step": 7120 }, { "epoch": 1.1147463994990607, "grad_norm": 1.7594245672225952, "learning_rate": 6.73726222826087e-05, "loss": 0.5438, "step": 7121 }, { "epoch": 1.114902943018159, "grad_norm": 2.9183928966522217, "learning_rate": 6.736073369565217e-05, "loss": 0.5604, "step": 7122 }, { "epoch": 1.1150594865372574, "grad_norm": 2.4790399074554443, "learning_rate": 6.734884510869565e-05, "loss": 0.8887, "step": 7123 }, { "epoch": 1.1152160300563556, "grad_norm": 2.959824562072754, "learning_rate": 6.733695652173913e-05, "loss": 0.6385, "step": 7124 }, { "epoch": 1.115372573575454, "grad_norm": 1.4421095848083496, "learning_rate": 6.73250679347826e-05, "loss": 1.1371, "step": 7125 }, { "epoch": 1.1155291170945523, "grad_norm": 2.009667158126831, "learning_rate": 6.731317934782608e-05, "loss": 0.5928, "step": 7126 }, { "epoch": 1.1156856606136505, "grad_norm": 2.006736993789673, "learning_rate": 6.730129076086956e-05, "loss": 0.7895, "step": 7127 }, { "epoch": 1.115842204132749, "grad_norm": 1.8106614351272583, "learning_rate": 6.728940217391304e-05, "loss": 0.755, "step": 7128 }, { "epoch": 1.1159987476518471, "grad_norm": 2.937199115753174, "learning_rate": 6.727751358695652e-05, "loss": 1.1406, "step": 7129 }, { "epoch": 1.1161552911709456, "grad_norm": 1.8880271911621094, "learning_rate": 6.7265625e-05, "loss": 1.0826, "step": 
7130 }, { "epoch": 1.1163118346900438, "grad_norm": 5.111264228820801, "learning_rate": 6.725373641304348e-05, "loss": 1.1068, "step": 7131 }, { "epoch": 1.1164683782091422, "grad_norm": 4.259989261627197, "learning_rate": 6.724184782608695e-05, "loss": 0.8059, "step": 7132 }, { "epoch": 1.1166249217282405, "grad_norm": 2.3799612522125244, "learning_rate": 6.722995923913043e-05, "loss": 0.9, "step": 7133 }, { "epoch": 1.1167814652473387, "grad_norm": 3.3538572788238525, "learning_rate": 6.721807065217391e-05, "loss": 0.9905, "step": 7134 }, { "epoch": 1.1169380087664371, "grad_norm": 0.8264369964599609, "learning_rate": 6.720618206521739e-05, "loss": 0.4046, "step": 7135 }, { "epoch": 1.1170945522855353, "grad_norm": 1.92601478099823, "learning_rate": 6.719429347826087e-05, "loss": 0.5804, "step": 7136 }, { "epoch": 1.1172510958046338, "grad_norm": 1.753648281097412, "learning_rate": 6.718240489130434e-05, "loss": 0.462, "step": 7137 }, { "epoch": 1.117407639323732, "grad_norm": 2.3155481815338135, "learning_rate": 6.717051630434782e-05, "loss": 1.3944, "step": 7138 }, { "epoch": 1.1175641828428302, "grad_norm": 0.899722158908844, "learning_rate": 6.71586277173913e-05, "loss": 0.3807, "step": 7139 }, { "epoch": 1.1177207263619287, "grad_norm": 0.5582410097122192, "learning_rate": 6.714673913043478e-05, "loss": 0.2465, "step": 7140 }, { "epoch": 1.1178772698810269, "grad_norm": 0.6346827745437622, "learning_rate": 6.713485054347826e-05, "loss": 0.2493, "step": 7141 }, { "epoch": 1.1180338134001253, "grad_norm": 0.8923242688179016, "learning_rate": 6.712296195652173e-05, "loss": 0.1906, "step": 7142 }, { "epoch": 1.1181903569192235, "grad_norm": 0.6886404752731323, "learning_rate": 6.711107336956521e-05, "loss": 0.3377, "step": 7143 }, { "epoch": 1.118346900438322, "grad_norm": 0.8652153611183167, "learning_rate": 6.709918478260869e-05, "loss": 0.2677, "step": 7144 }, { "epoch": 1.1185034439574202, "grad_norm": 2.3500208854675293, "learning_rate": 
6.708729619565217e-05, "loss": 0.2874, "step": 7145 }, { "epoch": 1.1186599874765184, "grad_norm": 0.7232169508934021, "learning_rate": 6.707540760869565e-05, "loss": 0.2096, "step": 7146 }, { "epoch": 1.1188165309956168, "grad_norm": 0.8267368078231812, "learning_rate": 6.706351902173913e-05, "loss": 0.2318, "step": 7147 }, { "epoch": 1.118973074514715, "grad_norm": 0.5976278781890869, "learning_rate": 6.70516304347826e-05, "loss": 0.1634, "step": 7148 }, { "epoch": 1.1191296180338135, "grad_norm": 0.80687016248703, "learning_rate": 6.703974184782608e-05, "loss": 0.3051, "step": 7149 }, { "epoch": 1.1192861615529117, "grad_norm": 10.725024223327637, "learning_rate": 6.702785326086956e-05, "loss": 2.0184, "step": 7150 }, { "epoch": 1.11944270507201, "grad_norm": 0.8957884311676025, "learning_rate": 6.701596467391304e-05, "loss": 0.3658, "step": 7151 }, { "epoch": 1.1195992485911084, "grad_norm": 0.6956129670143127, "learning_rate": 6.700407608695652e-05, "loss": 0.2877, "step": 7152 }, { "epoch": 1.1197557921102066, "grad_norm": 0.897357702255249, "learning_rate": 6.69921875e-05, "loss": 0.2823, "step": 7153 }, { "epoch": 1.119912335629305, "grad_norm": 0.9134781360626221, "learning_rate": 6.698029891304347e-05, "loss": 0.3298, "step": 7154 }, { "epoch": 1.1200688791484033, "grad_norm": 1.7607685327529907, "learning_rate": 6.696841032608695e-05, "loss": 0.479, "step": 7155 }, { "epoch": 1.1202254226675015, "grad_norm": 2.9478602409362793, "learning_rate": 6.695652173913043e-05, "loss": 0.5015, "step": 7156 }, { "epoch": 1.1203819661866, "grad_norm": 1.297196626663208, "learning_rate": 6.694463315217391e-05, "loss": 0.3978, "step": 7157 }, { "epoch": 1.1205385097056981, "grad_norm": 1.4974116086959839, "learning_rate": 6.693274456521739e-05, "loss": 0.4958, "step": 7158 }, { "epoch": 1.1206950532247966, "grad_norm": 1.144086480140686, "learning_rate": 6.692085597826086e-05, "loss": 0.3511, "step": 7159 }, { "epoch": 1.1208515967438948, "grad_norm": 
3.9303457736968994, "learning_rate": 6.690896739130434e-05, "loss": 0.2971, "step": 7160 }, { "epoch": 1.121008140262993, "grad_norm": 4.222661972045898, "learning_rate": 6.689707880434782e-05, "loss": 0.4898, "step": 7161 }, { "epoch": 1.1211646837820914, "grad_norm": 1.9720267057418823, "learning_rate": 6.68851902173913e-05, "loss": 0.5552, "step": 7162 }, { "epoch": 1.1213212273011897, "grad_norm": 5.736477851867676, "learning_rate": 6.687330163043478e-05, "loss": 0.4907, "step": 7163 }, { "epoch": 1.121477770820288, "grad_norm": 2.434476613998413, "learning_rate": 6.686141304347825e-05, "loss": 0.3476, "step": 7164 }, { "epoch": 1.1216343143393863, "grad_norm": 2.3495452404022217, "learning_rate": 6.684952445652173e-05, "loss": 0.7288, "step": 7165 }, { "epoch": 1.1217908578584848, "grad_norm": 2.324531078338623, "learning_rate": 6.683763586956521e-05, "loss": 0.5412, "step": 7166 }, { "epoch": 1.121947401377583, "grad_norm": 2.283395767211914, "learning_rate": 6.682574728260869e-05, "loss": 0.3999, "step": 7167 }, { "epoch": 1.1221039448966812, "grad_norm": 2.0466253757476807, "learning_rate": 6.681385869565217e-05, "loss": 0.8843, "step": 7168 }, { "epoch": 1.1222604884157796, "grad_norm": 2.0652129650115967, "learning_rate": 6.680197010869564e-05, "loss": 0.8308, "step": 7169 }, { "epoch": 1.1224170319348779, "grad_norm": 1.9687703847885132, "learning_rate": 6.679008152173912e-05, "loss": 0.6166, "step": 7170 }, { "epoch": 1.1225735754539763, "grad_norm": 1.8024779558181763, "learning_rate": 6.67781929347826e-05, "loss": 0.9186, "step": 7171 }, { "epoch": 1.1227301189730745, "grad_norm": 5.204569339752197, "learning_rate": 6.676630434782608e-05, "loss": 0.9102, "step": 7172 }, { "epoch": 1.1228866624921727, "grad_norm": 3.2372303009033203, "learning_rate": 6.675441576086956e-05, "loss": 0.8732, "step": 7173 }, { "epoch": 1.1230432060112712, "grad_norm": 2.772037982940674, "learning_rate": 6.674252717391304e-05, "loss": 1.0838, "step": 7174 }, { "epoch": 
1.1231997495303694, "grad_norm": 2.0030109882354736, "learning_rate": 6.673063858695651e-05, "loss": 1.2048, "step": 7175 }, { "epoch": 1.1233562930494678, "grad_norm": 4.773928642272949, "learning_rate": 6.671874999999999e-05, "loss": 1.3997, "step": 7176 }, { "epoch": 1.123512836568566, "grad_norm": 2.0424530506134033, "learning_rate": 6.670686141304347e-05, "loss": 0.8154, "step": 7177 }, { "epoch": 1.1236693800876645, "grad_norm": 3.065593719482422, "learning_rate": 6.669497282608695e-05, "loss": 0.7408, "step": 7178 }, { "epoch": 1.1238259236067627, "grad_norm": 4.477370738983154, "learning_rate": 6.668308423913043e-05, "loss": 1.363, "step": 7179 }, { "epoch": 1.123982467125861, "grad_norm": 4.729310989379883, "learning_rate": 6.66711956521739e-05, "loss": 0.7837, "step": 7180 }, { "epoch": 1.1241390106449594, "grad_norm": 2.187133312225342, "learning_rate": 6.665930706521738e-05, "loss": 1.1285, "step": 7181 }, { "epoch": 1.1242955541640576, "grad_norm": 5.13003396987915, "learning_rate": 6.664741847826086e-05, "loss": 0.9973, "step": 7182 }, { "epoch": 1.124452097683156, "grad_norm": 2.4387364387512207, "learning_rate": 6.663552989130434e-05, "loss": 0.6945, "step": 7183 }, { "epoch": 1.1246086412022542, "grad_norm": 3.426036834716797, "learning_rate": 6.662364130434782e-05, "loss": 1.1455, "step": 7184 }, { "epoch": 1.1247651847213525, "grad_norm": 2.1871657371520996, "learning_rate": 6.66117527173913e-05, "loss": 0.8369, "step": 7185 }, { "epoch": 1.124921728240451, "grad_norm": 4.101746082305908, "learning_rate": 6.659986413043477e-05, "loss": 0.81, "step": 7186 }, { "epoch": 1.125078271759549, "grad_norm": 2.3457961082458496, "learning_rate": 6.658797554347825e-05, "loss": 1.0541, "step": 7187 }, { "epoch": 1.1252348152786475, "grad_norm": 2.871886730194092, "learning_rate": 6.657608695652173e-05, "loss": 1.211, "step": 7188 }, { "epoch": 1.1253913587977458, "grad_norm": 0.5157379508018494, "learning_rate": 6.656419836956521e-05, "loss": 0.1798, "step": 
7189 }, { "epoch": 1.125547902316844, "grad_norm": 1.2976057529449463, "learning_rate": 6.655230978260869e-05, "loss": 0.3867, "step": 7190 }, { "epoch": 1.1257044458359424, "grad_norm": 0.44633230566978455, "learning_rate": 6.654042119565216e-05, "loss": 0.1665, "step": 7191 }, { "epoch": 1.1258609893550406, "grad_norm": 0.384564071893692, "learning_rate": 6.652853260869566e-05, "loss": 0.1242, "step": 7192 }, { "epoch": 1.126017532874139, "grad_norm": 0.8477802872657776, "learning_rate": 6.651664402173913e-05, "loss": 0.2438, "step": 7193 }, { "epoch": 1.1261740763932373, "grad_norm": 1.3204951286315918, "learning_rate": 6.65047554347826e-05, "loss": 0.3119, "step": 7194 }, { "epoch": 1.1263306199123355, "grad_norm": 1.0762141942977905, "learning_rate": 6.649286684782608e-05, "loss": 0.6748, "step": 7195 }, { "epoch": 1.126487163431434, "grad_norm": 0.6202916502952576, "learning_rate": 6.648097826086956e-05, "loss": 0.1947, "step": 7196 }, { "epoch": 1.1266437069505322, "grad_norm": 1.3842802047729492, "learning_rate": 6.646908967391303e-05, "loss": 0.112, "step": 7197 }, { "epoch": 1.1268002504696306, "grad_norm": 0.7977646589279175, "learning_rate": 6.645720108695651e-05, "loss": 0.3052, "step": 7198 }, { "epoch": 1.1269567939887288, "grad_norm": 0.9328012466430664, "learning_rate": 6.644531249999999e-05, "loss": 0.3234, "step": 7199 }, { "epoch": 1.127113337507827, "grad_norm": 0.723777711391449, "learning_rate": 6.643342391304347e-05, "loss": 0.3028, "step": 7200 }, { "epoch": 1.1272698810269255, "grad_norm": 1.098273515701294, "learning_rate": 6.642153532608695e-05, "loss": 0.3276, "step": 7201 }, { "epoch": 1.1274264245460237, "grad_norm": 1.009188175201416, "learning_rate": 6.640964673913044e-05, "loss": 0.1999, "step": 7202 }, { "epoch": 1.1275829680651221, "grad_norm": 0.7974739670753479, "learning_rate": 6.639775815217392e-05, "loss": 0.3315, "step": 7203 }, { "epoch": 1.1277395115842204, "grad_norm": 0.7093189358711243, "learning_rate": 
6.63858695652174e-05, "loss": 0.3123, "step": 7204 }, { "epoch": 1.1278960551033188, "grad_norm": 1.445184350013733, "learning_rate": 6.637398097826086e-05, "loss": 0.7069, "step": 7205 }, { "epoch": 1.128052598622417, "grad_norm": 2.2451374530792236, "learning_rate": 6.636209239130434e-05, "loss": 0.3959, "step": 7206 }, { "epoch": 1.1282091421415155, "grad_norm": 1.567031979560852, "learning_rate": 6.635020380434781e-05, "loss": 0.3337, "step": 7207 }, { "epoch": 1.1283656856606137, "grad_norm": 1.4711880683898926, "learning_rate": 6.633831521739129e-05, "loss": 0.4925, "step": 7208 }, { "epoch": 1.128522229179712, "grad_norm": 1.7639625072479248, "learning_rate": 6.632642663043477e-05, "loss": 0.4214, "step": 7209 }, { "epoch": 1.1286787726988103, "grad_norm": 1.1779727935791016, "learning_rate": 6.631453804347825e-05, "loss": 0.3554, "step": 7210 }, { "epoch": 1.1288353162179086, "grad_norm": 2.466904878616333, "learning_rate": 6.630264945652173e-05, "loss": 0.5739, "step": 7211 }, { "epoch": 1.128991859737007, "grad_norm": 1.4827758073806763, "learning_rate": 6.629076086956522e-05, "loss": 0.4805, "step": 7212 }, { "epoch": 1.1291484032561052, "grad_norm": 1.638680338859558, "learning_rate": 6.62788722826087e-05, "loss": 0.4838, "step": 7213 }, { "epoch": 1.1293049467752034, "grad_norm": 2.2787678241729736, "learning_rate": 6.626698369565218e-05, "loss": 0.7832, "step": 7214 }, { "epoch": 1.1294614902943019, "grad_norm": 2.8750882148742676, "learning_rate": 6.625509510869565e-05, "loss": 0.2934, "step": 7215 }, { "epoch": 1.1296180338134, "grad_norm": 6.39399528503418, "learning_rate": 6.624320652173913e-05, "loss": 0.9316, "step": 7216 }, { "epoch": 1.1297745773324985, "grad_norm": 2.6225123405456543, "learning_rate": 6.62313179347826e-05, "loss": 0.8883, "step": 7217 }, { "epoch": 1.1299311208515967, "grad_norm": 3.131990671157837, "learning_rate": 6.621942934782607e-05, "loss": 0.7652, "step": 7218 }, { "epoch": 1.130087664370695, "grad_norm": 
1.8523328304290771, "learning_rate": 6.620754076086955e-05, "loss": 0.7419, "step": 7219 }, { "epoch": 1.1302442078897934, "grad_norm": 2.1053483486175537, "learning_rate": 6.619565217391303e-05, "loss": 0.2964, "step": 7220 }, { "epoch": 1.1304007514088916, "grad_norm": 2.597956657409668, "learning_rate": 6.618376358695651e-05, "loss": 0.6805, "step": 7221 }, { "epoch": 1.13055729492799, "grad_norm": 3.6546289920806885, "learning_rate": 6.6171875e-05, "loss": 1.1161, "step": 7222 }, { "epoch": 1.1307138384470883, "grad_norm": 2.915066719055176, "learning_rate": 6.615998641304348e-05, "loss": 0.7929, "step": 7223 }, { "epoch": 1.1308703819661865, "grad_norm": 3.3552472591400146, "learning_rate": 6.614809782608696e-05, "loss": 0.871, "step": 7224 }, { "epoch": 1.131026925485285, "grad_norm": 1.781880259513855, "learning_rate": 6.613620923913044e-05, "loss": 0.6809, "step": 7225 }, { "epoch": 1.1311834690043832, "grad_norm": 4.1137542724609375, "learning_rate": 6.612432065217391e-05, "loss": 0.5339, "step": 7226 }, { "epoch": 1.1313400125234816, "grad_norm": 4.267662525177002, "learning_rate": 6.611243206521739e-05, "loss": 1.212, "step": 7227 }, { "epoch": 1.1314965560425798, "grad_norm": 4.158174514770508, "learning_rate": 6.610054347826086e-05, "loss": 0.6462, "step": 7228 }, { "epoch": 1.131653099561678, "grad_norm": 3.2616429328918457, "learning_rate": 6.608865489130433e-05, "loss": 0.9939, "step": 7229 }, { "epoch": 1.1318096430807765, "grad_norm": 3.8046867847442627, "learning_rate": 6.607676630434781e-05, "loss": 1.0406, "step": 7230 }, { "epoch": 1.1319661865998747, "grad_norm": 2.6862597465515137, "learning_rate": 6.606487771739129e-05, "loss": 1.175, "step": 7231 }, { "epoch": 1.1321227301189731, "grad_norm": 2.9816908836364746, "learning_rate": 6.605298913043478e-05, "loss": 1.1073, "step": 7232 }, { "epoch": 1.1322792736380713, "grad_norm": 2.1412601470947266, "learning_rate": 6.604110054347826e-05, "loss": 1.789, "step": 7233 }, { "epoch": 
1.1324358171571698, "grad_norm": 3.572944164276123, "learning_rate": 6.602921195652174e-05, "loss": 0.9765, "step": 7234 }, { "epoch": 1.132592360676268, "grad_norm": 1.9203709363937378, "learning_rate": 6.601732336956522e-05, "loss": 0.5193, "step": 7235 }, { "epoch": 1.1327489041953662, "grad_norm": 5.322623252868652, "learning_rate": 6.60054347826087e-05, "loss": 0.918, "step": 7236 }, { "epoch": 1.1329054477144647, "grad_norm": 3.186340093612671, "learning_rate": 6.599354619565217e-05, "loss": 0.8385, "step": 7237 }, { "epoch": 1.1330619912335629, "grad_norm": 2.698596954345703, "learning_rate": 6.598165760869565e-05, "loss": 1.1992, "step": 7238 }, { "epoch": 1.1332185347526613, "grad_norm": 0.5395888090133667, "learning_rate": 6.596976902173913e-05, "loss": 0.2419, "step": 7239 }, { "epoch": 1.1333750782717595, "grad_norm": 0.5855292677879333, "learning_rate": 6.59578804347826e-05, "loss": 0.2055, "step": 7240 }, { "epoch": 1.133531621790858, "grad_norm": 0.863116979598999, "learning_rate": 6.594599184782607e-05, "loss": 0.3556, "step": 7241 }, { "epoch": 1.1336881653099562, "grad_norm": 0.4644695222377777, "learning_rate": 6.593410326086956e-05, "loss": 0.1831, "step": 7242 }, { "epoch": 1.1338447088290544, "grad_norm": 0.5582319498062134, "learning_rate": 6.592221467391304e-05, "loss": 0.1726, "step": 7243 }, { "epoch": 1.1340012523481529, "grad_norm": 0.8745964169502258, "learning_rate": 6.591032608695652e-05, "loss": 0.2707, "step": 7244 }, { "epoch": 1.134157795867251, "grad_norm": 0.824928879737854, "learning_rate": 6.58984375e-05, "loss": 0.2518, "step": 7245 }, { "epoch": 1.1343143393863495, "grad_norm": 0.661558210849762, "learning_rate": 6.588654891304348e-05, "loss": 0.1843, "step": 7246 }, { "epoch": 1.1344708829054477, "grad_norm": 0.542941689491272, "learning_rate": 6.587466032608695e-05, "loss": 0.1641, "step": 7247 }, { "epoch": 1.134627426424546, "grad_norm": 1.0743364095687866, "learning_rate": 6.586277173913043e-05, "loss": 0.2931, "step": 
7248 }, { "epoch": 1.1347839699436444, "grad_norm": 0.9799054861068726, "learning_rate": 6.585088315217391e-05, "loss": 0.3504, "step": 7249 }, { "epoch": 1.1349405134627426, "grad_norm": 0.883016049861908, "learning_rate": 6.583899456521739e-05, "loss": 0.2908, "step": 7250 }, { "epoch": 1.135097056981841, "grad_norm": 1.6358848810195923, "learning_rate": 6.582710597826085e-05, "loss": 0.3374, "step": 7251 }, { "epoch": 1.1352536005009393, "grad_norm": 0.6222871541976929, "learning_rate": 6.581521739130435e-05, "loss": 0.1705, "step": 7252 }, { "epoch": 1.1354101440200375, "grad_norm": 0.905521035194397, "learning_rate": 6.580332880434782e-05, "loss": 0.3434, "step": 7253 }, { "epoch": 1.135566687539136, "grad_norm": 2.544538974761963, "learning_rate": 6.57914402173913e-05, "loss": 0.4346, "step": 7254 }, { "epoch": 1.1357232310582341, "grad_norm": 1.9248782396316528, "learning_rate": 6.577955163043478e-05, "loss": 0.5832, "step": 7255 }, { "epoch": 1.1358797745773326, "grad_norm": 1.6689033508300781, "learning_rate": 6.576766304347826e-05, "loss": 0.3189, "step": 7256 }, { "epoch": 1.1360363180964308, "grad_norm": 1.192184329032898, "learning_rate": 6.575577445652174e-05, "loss": 0.2864, "step": 7257 }, { "epoch": 1.136192861615529, "grad_norm": 1.4858559370040894, "learning_rate": 6.574388586956521e-05, "loss": 0.36, "step": 7258 }, { "epoch": 1.1363494051346275, "grad_norm": 2.499189615249634, "learning_rate": 6.573199728260869e-05, "loss": 0.7048, "step": 7259 }, { "epoch": 1.1365059486537257, "grad_norm": 1.278712272644043, "learning_rate": 6.572010869565217e-05, "loss": 0.3815, "step": 7260 }, { "epoch": 1.1366624921728241, "grad_norm": 1.6788917779922485, "learning_rate": 6.570822010869565e-05, "loss": 0.5211, "step": 7261 }, { "epoch": 1.1368190356919223, "grad_norm": 0.8307589292526245, "learning_rate": 6.569633152173913e-05, "loss": 0.2593, "step": 7262 }, { "epoch": 1.1369755792110205, "grad_norm": 2.259539842605591, "learning_rate": 
6.56844429347826e-05, "loss": 0.4455, "step": 7263 }, { "epoch": 1.137132122730119, "grad_norm": 1.3458757400512695, "learning_rate": 6.567255434782608e-05, "loss": 0.4632, "step": 7264 }, { "epoch": 1.1372886662492172, "grad_norm": 1.5677992105484009, "learning_rate": 6.566066576086956e-05, "loss": 0.5687, "step": 7265 }, { "epoch": 1.1374452097683156, "grad_norm": 2.719622850418091, "learning_rate": 6.564877717391304e-05, "loss": 0.4493, "step": 7266 }, { "epoch": 1.1376017532874139, "grad_norm": 1.953373670578003, "learning_rate": 6.563688858695652e-05, "loss": 0.5108, "step": 7267 }, { "epoch": 1.1377582968065123, "grad_norm": 1.5841574668884277, "learning_rate": 6.5625e-05, "loss": 0.6133, "step": 7268 }, { "epoch": 1.1379148403256105, "grad_norm": 4.900529861450195, "learning_rate": 6.561311141304347e-05, "loss": 0.3795, "step": 7269 }, { "epoch": 1.1380713838447087, "grad_norm": 1.8375279903411865, "learning_rate": 6.560122282608695e-05, "loss": 0.6335, "step": 7270 }, { "epoch": 1.1382279273638072, "grad_norm": 1.7174475193023682, "learning_rate": 6.558933423913043e-05, "loss": 0.5877, "step": 7271 }, { "epoch": 1.1383844708829054, "grad_norm": 3.2704668045043945, "learning_rate": 6.557744565217391e-05, "loss": 0.6404, "step": 7272 }, { "epoch": 1.1385410144020038, "grad_norm": 2.5093932151794434, "learning_rate": 6.556555706521739e-05, "loss": 1.0113, "step": 7273 }, { "epoch": 1.138697557921102, "grad_norm": 3.30535626411438, "learning_rate": 6.555366847826086e-05, "loss": 0.8026, "step": 7274 }, { "epoch": 1.1388541014402005, "grad_norm": 3.0456643104553223, "learning_rate": 6.554177989130434e-05, "loss": 1.5636, "step": 7275 }, { "epoch": 1.1390106449592987, "grad_norm": 1.5780030488967896, "learning_rate": 6.552989130434782e-05, "loss": 0.8322, "step": 7276 }, { "epoch": 1.139167188478397, "grad_norm": 2.8441030979156494, "learning_rate": 6.55180027173913e-05, "loss": 0.649, "step": 7277 }, { "epoch": 1.1393237319974954, "grad_norm": 
3.1044180393218994, "learning_rate": 6.550611413043478e-05, "loss": 0.9883, "step": 7278 }, { "epoch": 1.1394802755165936, "grad_norm": 2.1206631660461426, "learning_rate": 6.549422554347826e-05, "loss": 1.197, "step": 7279 }, { "epoch": 1.139636819035692, "grad_norm": 3.412625789642334, "learning_rate": 6.548233695652173e-05, "loss": 1.5935, "step": 7280 }, { "epoch": 1.1397933625547902, "grad_norm": 3.0310847759246826, "learning_rate": 6.547044836956521e-05, "loss": 1.3411, "step": 7281 }, { "epoch": 1.1399499060738885, "grad_norm": 6.384232044219971, "learning_rate": 6.545855978260869e-05, "loss": 1.6873, "step": 7282 }, { "epoch": 1.140106449592987, "grad_norm": 1.8649979829788208, "learning_rate": 6.544667119565217e-05, "loss": 0.7841, "step": 7283 }, { "epoch": 1.1402629931120851, "grad_norm": 1.6260277032852173, "learning_rate": 6.543478260869565e-05, "loss": 0.7412, "step": 7284 }, { "epoch": 1.1404195366311836, "grad_norm": 2.939988613128662, "learning_rate": 6.542289402173912e-05, "loss": 0.5437, "step": 7285 }, { "epoch": 1.1405760801502818, "grad_norm": 1.1887261867523193, "learning_rate": 6.54110054347826e-05, "loss": 0.4715, "step": 7286 }, { "epoch": 1.14073262366938, "grad_norm": 2.6982009410858154, "learning_rate": 6.539911684782608e-05, "loss": 0.8469, "step": 7287 }, { "epoch": 1.1408891671884784, "grad_norm": 1.229513168334961, "learning_rate": 6.538722826086956e-05, "loss": 0.3161, "step": 7288 }, { "epoch": 1.1410457107075767, "grad_norm": 0.41034597158432007, "learning_rate": 6.537533967391304e-05, "loss": 0.2337, "step": 7289 }, { "epoch": 1.141202254226675, "grad_norm": 0.47240588068962097, "learning_rate": 6.536345108695652e-05, "loss": 0.2469, "step": 7290 }, { "epoch": 1.1413587977457733, "grad_norm": 0.5267093777656555, "learning_rate": 6.53515625e-05, "loss": 0.2376, "step": 7291 }, { "epoch": 1.1415153412648715, "grad_norm": 0.6336192488670349, "learning_rate": 6.533967391304347e-05, "loss": 0.1703, "step": 7292 }, { "epoch": 
1.14167188478397, "grad_norm": 0.6381465196609497, "learning_rate": 6.532778532608695e-05, "loss": 0.286, "step": 7293 }, { "epoch": 1.1418284283030682, "grad_norm": 0.7212666869163513, "learning_rate": 6.531589673913043e-05, "loss": 0.2234, "step": 7294 }, { "epoch": 1.1419849718221666, "grad_norm": 3.5217363834381104, "learning_rate": 6.53040081521739e-05, "loss": 0.4098, "step": 7295 }, { "epoch": 1.1421415153412648, "grad_norm": 0.7518934607505798, "learning_rate": 6.529211956521738e-05, "loss": 0.2262, "step": 7296 }, { "epoch": 1.142298058860363, "grad_norm": 1.0941826105117798, "learning_rate": 6.528023097826086e-05, "loss": 0.2321, "step": 7297 }, { "epoch": 1.1424546023794615, "grad_norm": 0.8732725381851196, "learning_rate": 6.526834239130434e-05, "loss": 0.1816, "step": 7298 }, { "epoch": 1.1426111458985597, "grad_norm": 0.5155500173568726, "learning_rate": 6.525645380434782e-05, "loss": 0.1571, "step": 7299 }, { "epoch": 1.1427676894176582, "grad_norm": 0.9412835240364075, "learning_rate": 6.52445652173913e-05, "loss": 0.3702, "step": 7300 }, { "epoch": 1.1429242329367564, "grad_norm": 1.0758274793624878, "learning_rate": 6.523267663043478e-05, "loss": 0.3882, "step": 7301 }, { "epoch": 1.1430807764558548, "grad_norm": 2.043506383895874, "learning_rate": 6.522078804347825e-05, "loss": 0.2315, "step": 7302 }, { "epoch": 1.143237319974953, "grad_norm": 1.016661524772644, "learning_rate": 6.520889945652173e-05, "loss": 0.3997, "step": 7303 }, { "epoch": 1.1433938634940513, "grad_norm": 0.9645010232925415, "learning_rate": 6.519701086956521e-05, "loss": 0.2663, "step": 7304 }, { "epoch": 1.1435504070131497, "grad_norm": 1.9348256587982178, "learning_rate": 6.518512228260869e-05, "loss": 0.4571, "step": 7305 }, { "epoch": 1.143706950532248, "grad_norm": 1.6275123357772827, "learning_rate": 6.517323369565217e-05, "loss": 0.6473, "step": 7306 }, { "epoch": 1.1438634940513464, "grad_norm": 1.225703239440918, "learning_rate": 6.516134510869564e-05, "loss": 
0.3778, "step": 7307 }, { "epoch": 1.1440200375704446, "grad_norm": 1.9001502990722656, "learning_rate": 6.514945652173912e-05, "loss": 0.6511, "step": 7308 }, { "epoch": 1.144176581089543, "grad_norm": 0.9575929045677185, "learning_rate": 6.51375679347826e-05, "loss": 0.4102, "step": 7309 }, { "epoch": 1.1443331246086412, "grad_norm": 1.7546546459197998, "learning_rate": 6.512567934782608e-05, "loss": 0.5805, "step": 7310 }, { "epoch": 1.1444896681277394, "grad_norm": 2.5071325302124023, "learning_rate": 6.511379076086956e-05, "loss": 0.2699, "step": 7311 }, { "epoch": 1.1446462116468379, "grad_norm": 3.536316394805908, "learning_rate": 6.510190217391303e-05, "loss": 0.6421, "step": 7312 }, { "epoch": 1.144802755165936, "grad_norm": 2.1001999378204346, "learning_rate": 6.509001358695651e-05, "loss": 0.6165, "step": 7313 }, { "epoch": 1.1449592986850345, "grad_norm": 1.1454308032989502, "learning_rate": 6.507812499999999e-05, "loss": 0.4112, "step": 7314 }, { "epoch": 1.1451158422041328, "grad_norm": 1.6488538980484009, "learning_rate": 6.506623641304347e-05, "loss": 0.4807, "step": 7315 }, { "epoch": 1.145272385723231, "grad_norm": 2.346478223800659, "learning_rate": 6.505434782608695e-05, "loss": 0.6584, "step": 7316 }, { "epoch": 1.1454289292423294, "grad_norm": 3.495988368988037, "learning_rate": 6.504245923913043e-05, "loss": 0.5891, "step": 7317 }, { "epoch": 1.1455854727614276, "grad_norm": 1.9708099365234375, "learning_rate": 6.50305706521739e-05, "loss": 0.7554, "step": 7318 }, { "epoch": 1.145742016280526, "grad_norm": 2.0773098468780518, "learning_rate": 6.50186820652174e-05, "loss": 0.5416, "step": 7319 }, { "epoch": 1.1458985597996243, "grad_norm": 2.7990381717681885, "learning_rate": 6.500679347826086e-05, "loss": 0.6916, "step": 7320 }, { "epoch": 1.1460551033187225, "grad_norm": 1.889979362487793, "learning_rate": 6.499490489130434e-05, "loss": 0.918, "step": 7321 }, { "epoch": 1.146211646837821, "grad_norm": 4.403543472290039, "learning_rate": 
6.498301630434782e-05, "loss": 0.5684, "step": 7322 }, { "epoch": 1.1463681903569192, "grad_norm": 3.5408987998962402, "learning_rate": 6.49711277173913e-05, "loss": 0.7935, "step": 7323 }, { "epoch": 1.1465247338760176, "grad_norm": 2.3161160945892334, "learning_rate": 6.495923913043477e-05, "loss": 0.9359, "step": 7324 }, { "epoch": 1.1466812773951158, "grad_norm": 3.1461379528045654, "learning_rate": 6.494735054347825e-05, "loss": 0.7673, "step": 7325 }, { "epoch": 1.146837820914214, "grad_norm": 3.0608153343200684, "learning_rate": 6.493546195652173e-05, "loss": 0.9325, "step": 7326 }, { "epoch": 1.1469943644333125, "grad_norm": 4.847982406616211, "learning_rate": 6.492357336956521e-05, "loss": 0.7764, "step": 7327 }, { "epoch": 1.1471509079524107, "grad_norm": 10.528264999389648, "learning_rate": 6.491168478260869e-05, "loss": 0.801, "step": 7328 }, { "epoch": 1.1473074514715091, "grad_norm": 2.31807279586792, "learning_rate": 6.489979619565218e-05, "loss": 1.0894, "step": 7329 }, { "epoch": 1.1474639949906074, "grad_norm": 5.215738296508789, "learning_rate": 6.488790760869566e-05, "loss": 1.0174, "step": 7330 }, { "epoch": 1.1476205385097056, "grad_norm": 3.061176300048828, "learning_rate": 6.487601902173913e-05, "loss": 1.2388, "step": 7331 }, { "epoch": 1.147777082028804, "grad_norm": 2.801727771759033, "learning_rate": 6.48641304347826e-05, "loss": 1.1572, "step": 7332 }, { "epoch": 1.1479336255479022, "grad_norm": 3.8724498748779297, "learning_rate": 6.485224184782608e-05, "loss": 1.1181, "step": 7333 }, { "epoch": 1.1480901690670007, "grad_norm": 3.1903138160705566, "learning_rate": 6.484035326086955e-05, "loss": 1.2105, "step": 7334 }, { "epoch": 1.148246712586099, "grad_norm": 1.7528518438339233, "learning_rate": 6.482846467391303e-05, "loss": 0.2872, "step": 7335 }, { "epoch": 1.1484032561051973, "grad_norm": 3.175358772277832, "learning_rate": 6.481657608695651e-05, "loss": 0.8378, "step": 7336 }, { "epoch": 1.1485597996242956, "grad_norm": 
1.7859840393066406, "learning_rate": 6.480468749999999e-05, "loss": 0.3288, "step": 7337 }, { "epoch": 1.1487163431433938, "grad_norm": 3.376330614089966, "learning_rate": 6.479279891304347e-05, "loss": 1.1071, "step": 7338 }, { "epoch": 1.1488728866624922, "grad_norm": 0.5149374604225159, "learning_rate": 6.478091032608696e-05, "loss": 0.2384, "step": 7339 }, { "epoch": 1.1490294301815904, "grad_norm": 0.6652015447616577, "learning_rate": 6.476902173913044e-05, "loss": 0.17, "step": 7340 }, { "epoch": 1.1491859737006889, "grad_norm": 0.5575904250144958, "learning_rate": 6.475713315217391e-05, "loss": 0.1973, "step": 7341 }, { "epoch": 1.149342517219787, "grad_norm": 0.6650729179382324, "learning_rate": 6.474524456521739e-05, "loss": 0.1865, "step": 7342 }, { "epoch": 1.1494990607388855, "grad_norm": 0.9185170531272888, "learning_rate": 6.473335597826086e-05, "loss": 0.1477, "step": 7343 }, { "epoch": 1.1496556042579837, "grad_norm": 0.8915801048278809, "learning_rate": 6.472146739130434e-05, "loss": 0.277, "step": 7344 }, { "epoch": 1.149812147777082, "grad_norm": 0.8749103546142578, "learning_rate": 6.470957880434781e-05, "loss": 0.2302, "step": 7345 }, { "epoch": 1.1499686912961804, "grad_norm": 0.68445885181427, "learning_rate": 6.469769021739129e-05, "loss": 0.2677, "step": 7346 }, { "epoch": 1.1501252348152786, "grad_norm": 0.5380271077156067, "learning_rate": 6.468580163043477e-05, "loss": 0.1805, "step": 7347 }, { "epoch": 1.150281778334377, "grad_norm": 1.368733286857605, "learning_rate": 6.467391304347825e-05, "loss": 0.261, "step": 7348 }, { "epoch": 1.1504383218534753, "grad_norm": 0.5163238048553467, "learning_rate": 6.466202445652174e-05, "loss": 0.1568, "step": 7349 }, { "epoch": 1.1505948653725735, "grad_norm": 1.0244938135147095, "learning_rate": 6.465013586956522e-05, "loss": 0.3665, "step": 7350 }, { "epoch": 1.150751408891672, "grad_norm": 0.8415913581848145, "learning_rate": 6.46382472826087e-05, "loss": 0.3849, "step": 7351 }, { "epoch": 
1.1509079524107702, "grad_norm": 1.333866000175476, "learning_rate": 6.462635869565217e-05, "loss": 0.2809, "step": 7352 }, { "epoch": 1.1510644959298686, "grad_norm": 2.3053886890411377, "learning_rate": 6.461447010869565e-05, "loss": 0.2991, "step": 7353 }, { "epoch": 1.1512210394489668, "grad_norm": 1.167094349861145, "learning_rate": 6.460258152173913e-05, "loss": 0.3528, "step": 7354 }, { "epoch": 1.151377582968065, "grad_norm": 1.2448670864105225, "learning_rate": 6.45906929347826e-05, "loss": 0.3239, "step": 7355 }, { "epoch": 1.1515341264871635, "grad_norm": 2.1350491046905518, "learning_rate": 6.457880434782607e-05, "loss": 0.4718, "step": 7356 }, { "epoch": 1.1516906700062617, "grad_norm": 1.1463764905929565, "learning_rate": 6.456691576086955e-05, "loss": 0.4992, "step": 7357 }, { "epoch": 1.1518472135253601, "grad_norm": 2.44110107421875, "learning_rate": 6.455502717391303e-05, "loss": 0.4605, "step": 7358 }, { "epoch": 1.1520037570444583, "grad_norm": 1.2371114492416382, "learning_rate": 6.454313858695652e-05, "loss": 0.2887, "step": 7359 }, { "epoch": 1.1521603005635566, "grad_norm": 2.598203659057617, "learning_rate": 6.453125e-05, "loss": 0.5471, "step": 7360 }, { "epoch": 1.152316844082655, "grad_norm": 1.4896284341812134, "learning_rate": 6.451936141304348e-05, "loss": 0.4933, "step": 7361 }, { "epoch": 1.1524733876017532, "grad_norm": 1.4669653177261353, "learning_rate": 6.450747282608696e-05, "loss": 0.5212, "step": 7362 }, { "epoch": 1.1526299311208517, "grad_norm": 3.655029296875, "learning_rate": 6.449558423913043e-05, "loss": 0.623, "step": 7363 }, { "epoch": 1.1527864746399499, "grad_norm": 1.9891315698623657, "learning_rate": 6.448369565217391e-05, "loss": 0.6586, "step": 7364 }, { "epoch": 1.152943018159048, "grad_norm": 2.4671247005462646, "learning_rate": 6.447180706521739e-05, "loss": 0.4118, "step": 7365 }, { "epoch": 1.1530995616781465, "grad_norm": 2.4649534225463867, "learning_rate": 6.445991847826086e-05, "loss": 0.4885, "step": 
7366 }, { "epoch": 1.1532561051972448, "grad_norm": 3.2287397384643555, "learning_rate": 6.444802989130433e-05, "loss": 0.8411, "step": 7367 }, { "epoch": 1.1534126487163432, "grad_norm": 3.2847251892089844, "learning_rate": 6.443614130434781e-05, "loss": 0.6613, "step": 7368 }, { "epoch": 1.1535691922354414, "grad_norm": 1.7752827405929565, "learning_rate": 6.44242527173913e-05, "loss": 0.4251, "step": 7369 }, { "epoch": 1.1537257357545398, "grad_norm": 4.709701061248779, "learning_rate": 6.441236413043478e-05, "loss": 0.7302, "step": 7370 }, { "epoch": 1.153882279273638, "grad_norm": 3.4790589809417725, "learning_rate": 6.440047554347826e-05, "loss": 0.6338, "step": 7371 }, { "epoch": 1.1540388227927363, "grad_norm": 3.709327220916748, "learning_rate": 6.438858695652174e-05, "loss": 0.5933, "step": 7372 }, { "epoch": 1.1541953663118347, "grad_norm": 2.509720802307129, "learning_rate": 6.437669836956522e-05, "loss": 1.1501, "step": 7373 }, { "epoch": 1.154351909830933, "grad_norm": 2.5406687259674072, "learning_rate": 6.43648097826087e-05, "loss": 0.4903, "step": 7374 }, { "epoch": 1.1545084533500314, "grad_norm": 3.15944766998291, "learning_rate": 6.435292119565217e-05, "loss": 1.0692, "step": 7375 }, { "epoch": 1.1546649968691296, "grad_norm": 3.794546604156494, "learning_rate": 6.434103260869565e-05, "loss": 0.904, "step": 7376 }, { "epoch": 1.154821540388228, "grad_norm": 3.9875683784484863, "learning_rate": 6.432914402173913e-05, "loss": 1.2376, "step": 7377 }, { "epoch": 1.1549780839073263, "grad_norm": 2.715790271759033, "learning_rate": 6.431725543478259e-05, "loss": 0.8434, "step": 7378 }, { "epoch": 1.1551346274264245, "grad_norm": 2.1241092681884766, "learning_rate": 6.430536684782608e-05, "loss": 0.6964, "step": 7379 }, { "epoch": 1.155291170945523, "grad_norm": 3.1614975929260254, "learning_rate": 6.429347826086956e-05, "loss": 1.0329, "step": 7380 }, { "epoch": 1.1554477144646211, "grad_norm": 3.4021432399749756, "learning_rate": 
6.428158967391304e-05, "loss": 1.1542, "step": 7381 }, { "epoch": 1.1556042579837196, "grad_norm": 3.0113308429718018, "learning_rate": 6.426970108695652e-05, "loss": 1.2447, "step": 7382 }, { "epoch": 1.1557608015028178, "grad_norm": 2.9265246391296387, "learning_rate": 6.42578125e-05, "loss": 1.0161, "step": 7383 }, { "epoch": 1.155917345021916, "grad_norm": 2.4634146690368652, "learning_rate": 6.424592391304348e-05, "loss": 0.6714, "step": 7384 }, { "epoch": 1.1560738885410144, "grad_norm": 3.5429112911224365, "learning_rate": 6.423403532608695e-05, "loss": 0.5319, "step": 7385 }, { "epoch": 1.1562304320601127, "grad_norm": 3.116182565689087, "learning_rate": 6.422214673913043e-05, "loss": 0.5415, "step": 7386 }, { "epoch": 1.156386975579211, "grad_norm": 1.0918234586715698, "learning_rate": 6.421025815217391e-05, "loss": 0.2204, "step": 7387 }, { "epoch": 1.1565435190983093, "grad_norm": 2.450413703918457, "learning_rate": 6.419836956521739e-05, "loss": 0.8353, "step": 7388 }, { "epoch": 1.1567000626174075, "grad_norm": 0.5511382222175598, "learning_rate": 6.418648097826087e-05, "loss": 0.2329, "step": 7389 }, { "epoch": 1.156856606136506, "grad_norm": 0.44227489829063416, "learning_rate": 6.417459239130434e-05, "loss": 0.2184, "step": 7390 }, { "epoch": 1.1570131496556042, "grad_norm": 0.48784637451171875, "learning_rate": 6.416270380434782e-05, "loss": 0.2236, "step": 7391 }, { "epoch": 1.1571696931747026, "grad_norm": 1.136298656463623, "learning_rate": 6.41508152173913e-05, "loss": 0.1829, "step": 7392 }, { "epoch": 1.1573262366938009, "grad_norm": 0.6005204319953918, "learning_rate": 6.413892663043478e-05, "loss": 0.2124, "step": 7393 }, { "epoch": 1.157482780212899, "grad_norm": 0.737918496131897, "learning_rate": 6.412703804347826e-05, "loss": 0.3826, "step": 7394 }, { "epoch": 1.1576393237319975, "grad_norm": 0.8930999636650085, "learning_rate": 6.411514945652174e-05, "loss": 0.2524, "step": 7395 }, { "epoch": 1.1577958672510957, "grad_norm": 
0.79004967212677, "learning_rate": 6.410326086956521e-05, "loss": 0.2526, "step": 7396 }, { "epoch": 1.1579524107701942, "grad_norm": 0.8977347612380981, "learning_rate": 6.409137228260869e-05, "loss": 0.2394, "step": 7397 }, { "epoch": 1.1581089542892924, "grad_norm": 0.8508070707321167, "learning_rate": 6.407948369565217e-05, "loss": 0.2016, "step": 7398 }, { "epoch": 1.1582654978083906, "grad_norm": 0.7870635390281677, "learning_rate": 6.406759510869565e-05, "loss": 0.2074, "step": 7399 }, { "epoch": 1.158422041327489, "grad_norm": 1.1743836402893066, "learning_rate": 6.405570652173913e-05, "loss": 0.3797, "step": 7400 }, { "epoch": 1.1585785848465873, "grad_norm": 0.986369788646698, "learning_rate": 6.40438179347826e-05, "loss": 0.3642, "step": 7401 }, { "epoch": 1.1587351283656857, "grad_norm": 0.8799954056739807, "learning_rate": 6.403192934782608e-05, "loss": 0.2148, "step": 7402 }, { "epoch": 1.158891671884784, "grad_norm": 1.1631687879562378, "learning_rate": 6.402004076086956e-05, "loss": 0.2737, "step": 7403 }, { "epoch": 1.1590482154038824, "grad_norm": 1.6418851613998413, "learning_rate": 6.400815217391304e-05, "loss": 0.3587, "step": 7404 }, { "epoch": 1.1592047589229806, "grad_norm": 1.3733184337615967, "learning_rate": 6.399626358695652e-05, "loss": 0.3702, "step": 7405 }, { "epoch": 1.159361302442079, "grad_norm": 0.6831279993057251, "learning_rate": 6.3984375e-05, "loss": 0.2472, "step": 7406 }, { "epoch": 1.1595178459611772, "grad_norm": 0.6355229616165161, "learning_rate": 6.397248641304347e-05, "loss": 0.2275, "step": 7407 }, { "epoch": 1.1596743894802755, "grad_norm": 2.116387128829956, "learning_rate": 6.396059782608695e-05, "loss": 0.3369, "step": 7408 }, { "epoch": 1.159830932999374, "grad_norm": 3.237978219985962, "learning_rate": 6.394870923913043e-05, "loss": 0.9576, "step": 7409 }, { "epoch": 1.1599874765184721, "grad_norm": 1.276313304901123, "learning_rate": 6.393682065217391e-05, "loss": 0.366, "step": 7410 }, { "epoch": 
1.1601440200375706, "grad_norm": 3.1755707263946533, "learning_rate": 6.392493206521739e-05, "loss": 0.4726, "step": 7411 }, { "epoch": 1.1603005635566688, "grad_norm": 1.887509822845459, "learning_rate": 6.391304347826086e-05, "loss": 0.3613, "step": 7412 }, { "epoch": 1.160457107075767, "grad_norm": 1.7947118282318115, "learning_rate": 6.390115489130434e-05, "loss": 0.7497, "step": 7413 }, { "epoch": 1.1606136505948654, "grad_norm": 1.9044517278671265, "learning_rate": 6.388926630434782e-05, "loss": 0.5797, "step": 7414 }, { "epoch": 1.1607701941139636, "grad_norm": 1.7837930917739868, "learning_rate": 6.38773777173913e-05, "loss": 0.777, "step": 7415 }, { "epoch": 1.160926737633062, "grad_norm": 1.8809837102890015, "learning_rate": 6.386548913043478e-05, "loss": 0.588, "step": 7416 }, { "epoch": 1.1610832811521603, "grad_norm": 1.6607818603515625, "learning_rate": 6.385360054347825e-05, "loss": 0.2872, "step": 7417 }, { "epoch": 1.1612398246712585, "grad_norm": 2.0249505043029785, "learning_rate": 6.384171195652173e-05, "loss": 0.5639, "step": 7418 }, { "epoch": 1.161396368190357, "grad_norm": 3.6796414852142334, "learning_rate": 6.382982336956521e-05, "loss": 0.6461, "step": 7419 }, { "epoch": 1.1615529117094552, "grad_norm": 1.219697117805481, "learning_rate": 6.381793478260869e-05, "loss": 0.3306, "step": 7420 }, { "epoch": 1.1617094552285536, "grad_norm": 3.963686227798462, "learning_rate": 6.380604619565217e-05, "loss": 0.5341, "step": 7421 }, { "epoch": 1.1618659987476518, "grad_norm": 2.1467342376708984, "learning_rate": 6.379415760869565e-05, "loss": 0.6367, "step": 7422 }, { "epoch": 1.16202254226675, "grad_norm": 2.1273319721221924, "learning_rate": 6.378226902173912e-05, "loss": 0.8509, "step": 7423 }, { "epoch": 1.1621790857858485, "grad_norm": 2.474515438079834, "learning_rate": 6.37703804347826e-05, "loss": 0.8547, "step": 7424 }, { "epoch": 1.1623356293049467, "grad_norm": 2.890794515609741, "learning_rate": 6.375849184782608e-05, "loss": 1.1732, 
"step": 7425 }, { "epoch": 1.1624921728240452, "grad_norm": 2.708794355392456, "learning_rate": 6.374660326086956e-05, "loss": 1.02, "step": 7426 }, { "epoch": 1.1626487163431434, "grad_norm": 2.20438551902771, "learning_rate": 6.373471467391304e-05, "loss": 0.9792, "step": 7427 }, { "epoch": 1.1628052598622416, "grad_norm": 4.415675163269043, "learning_rate": 6.372282608695651e-05, "loss": 1.2282, "step": 7428 }, { "epoch": 1.16296180338134, "grad_norm": 1.6326543092727661, "learning_rate": 6.371093749999999e-05, "loss": 0.9442, "step": 7429 }, { "epoch": 1.1631183469004382, "grad_norm": 2.485006093978882, "learning_rate": 6.369904891304347e-05, "loss": 1.5425, "step": 7430 }, { "epoch": 1.1632748904195367, "grad_norm": 4.162367820739746, "learning_rate": 6.368716032608695e-05, "loss": 1.6049, "step": 7431 }, { "epoch": 1.163431433938635, "grad_norm": 2.7808446884155273, "learning_rate": 6.367527173913043e-05, "loss": 0.9969, "step": 7432 }, { "epoch": 1.1635879774577333, "grad_norm": 3.5891880989074707, "learning_rate": 6.36633831521739e-05, "loss": 0.9232, "step": 7433 }, { "epoch": 1.1637445209768316, "grad_norm": 1.2087769508361816, "learning_rate": 6.365149456521738e-05, "loss": 0.2983, "step": 7434 }, { "epoch": 1.1639010644959298, "grad_norm": 3.7369461059570312, "learning_rate": 6.363960597826086e-05, "loss": 0.862, "step": 7435 }, { "epoch": 1.1640576080150282, "grad_norm": 4.594311714172363, "learning_rate": 6.362771739130434e-05, "loss": 0.7897, "step": 7436 }, { "epoch": 1.1642141515341264, "grad_norm": 1.995735764503479, "learning_rate": 6.361582880434782e-05, "loss": 0.4316, "step": 7437 }, { "epoch": 1.1643706950532249, "grad_norm": 2.462280035018921, "learning_rate": 6.36039402173913e-05, "loss": 0.7039, "step": 7438 }, { "epoch": 1.164527238572323, "grad_norm": 1.0852800607681274, "learning_rate": 6.359205163043477e-05, "loss": 0.1863, "step": 7439 }, { "epoch": 1.1646837820914215, "grad_norm": 0.47832369804382324, "learning_rate": 
6.358016304347825e-05, "loss": 0.2165, "step": 7440 }, { "epoch": 1.1648403256105198, "grad_norm": 0.5319947600364685, "learning_rate": 6.356827445652173e-05, "loss": 0.1954, "step": 7441 }, { "epoch": 1.164996869129618, "grad_norm": 0.670690655708313, "learning_rate": 6.355638586956521e-05, "loss": 0.1247, "step": 7442 }, { "epoch": 1.1651534126487164, "grad_norm": 0.6765363812446594, "learning_rate": 6.354449728260869e-05, "loss": 0.2245, "step": 7443 }, { "epoch": 1.1653099561678146, "grad_norm": 0.9108954668045044, "learning_rate": 6.353260869565216e-05, "loss": 0.2079, "step": 7444 }, { "epoch": 1.165466499686913, "grad_norm": 1.0929282903671265, "learning_rate": 6.352072010869564e-05, "loss": 0.2882, "step": 7445 }, { "epoch": 1.1656230432060113, "grad_norm": 0.7020341157913208, "learning_rate": 6.350883152173913e-05, "loss": 0.2357, "step": 7446 }, { "epoch": 1.1657795867251095, "grad_norm": 0.663882851600647, "learning_rate": 6.34969429347826e-05, "loss": 0.2331, "step": 7447 }, { "epoch": 1.165936130244208, "grad_norm": 1.4077228307724, "learning_rate": 6.348505434782608e-05, "loss": 0.3254, "step": 7448 }, { "epoch": 1.1660926737633062, "grad_norm": 1.5742331743240356, "learning_rate": 6.347316576086956e-05, "loss": 0.2855, "step": 7449 }, { "epoch": 1.1662492172824046, "grad_norm": 1.1905310153961182, "learning_rate": 6.346127717391303e-05, "loss": 0.3665, "step": 7450 }, { "epoch": 1.1664057608015028, "grad_norm": 2.3971455097198486, "learning_rate": 6.344938858695651e-05, "loss": 0.2387, "step": 7451 }, { "epoch": 1.166562304320601, "grad_norm": 0.48486506938934326, "learning_rate": 6.343749999999999e-05, "loss": 0.2523, "step": 7452 }, { "epoch": 1.1667188478396995, "grad_norm": 1.2731887102127075, "learning_rate": 6.342561141304347e-05, "loss": 0.3758, "step": 7453 }, { "epoch": 1.1668753913587977, "grad_norm": 2.1250791549682617, "learning_rate": 6.341372282608695e-05, "loss": 0.7094, "step": 7454 }, { "epoch": 1.1670319348778961, "grad_norm": 
0.7437793016433716, "learning_rate": 6.340183423913042e-05, "loss": 0.1675, "step": 7455 }, { "epoch": 1.1671884783969944, "grad_norm": 0.9740059971809387, "learning_rate": 6.338994565217392e-05, "loss": 0.3133, "step": 7456 }, { "epoch": 1.1673450219160926, "grad_norm": 0.8284987807273865, "learning_rate": 6.33780570652174e-05, "loss": 0.2485, "step": 7457 }, { "epoch": 1.167501565435191, "grad_norm": 2.365765333175659, "learning_rate": 6.336616847826086e-05, "loss": 0.4849, "step": 7458 }, { "epoch": 1.1676581089542892, "grad_norm": 2.2415690422058105, "learning_rate": 6.335427989130434e-05, "loss": 0.4944, "step": 7459 }, { "epoch": 1.1678146524733877, "grad_norm": 1.9508651494979858, "learning_rate": 6.334239130434782e-05, "loss": 0.8484, "step": 7460 }, { "epoch": 1.1679711959924859, "grad_norm": 1.0493019819259644, "learning_rate": 6.33305027173913e-05, "loss": 0.221, "step": 7461 }, { "epoch": 1.168127739511584, "grad_norm": 2.1637213230133057, "learning_rate": 6.331861413043477e-05, "loss": 0.8971, "step": 7462 }, { "epoch": 1.1682842830306825, "grad_norm": 1.3398998975753784, "learning_rate": 6.330672554347825e-05, "loss": 0.4008, "step": 7463 }, { "epoch": 1.1684408265497808, "grad_norm": 4.191359519958496, "learning_rate": 6.329483695652173e-05, "loss": 0.5967, "step": 7464 }, { "epoch": 1.1685973700688792, "grad_norm": 3.7551000118255615, "learning_rate": 6.32829483695652e-05, "loss": 0.7012, "step": 7465 }, { "epoch": 1.1687539135879774, "grad_norm": 1.9982166290283203, "learning_rate": 6.32710597826087e-05, "loss": 0.5394, "step": 7466 }, { "epoch": 1.1689104571070759, "grad_norm": 3.539524793624878, "learning_rate": 6.325917119565218e-05, "loss": 0.7698, "step": 7467 }, { "epoch": 1.169067000626174, "grad_norm": 4.362522125244141, "learning_rate": 6.324728260869565e-05, "loss": 0.5175, "step": 7468 }, { "epoch": 1.1692235441452723, "grad_norm": 2.1100292205810547, "learning_rate": 6.323539402173913e-05, "loss": 1.1277, "step": 7469 }, { "epoch": 
1.1693800876643707, "grad_norm": 6.449792861938477, "learning_rate": 6.32235054347826e-05, "loss": 0.4025, "step": 7470 }, { "epoch": 1.169536631183469, "grad_norm": 1.8572189807891846, "learning_rate": 6.321161684782608e-05, "loss": 0.5692, "step": 7471 }, { "epoch": 1.1696931747025674, "grad_norm": 2.1819794178009033, "learning_rate": 6.319972826086955e-05, "loss": 0.639, "step": 7472 }, { "epoch": 1.1698497182216656, "grad_norm": 2.573091506958008, "learning_rate": 6.318783967391303e-05, "loss": 0.4569, "step": 7473 }, { "epoch": 1.170006261740764, "grad_norm": 3.98651123046875, "learning_rate": 6.317595108695651e-05, "loss": 0.8259, "step": 7474 }, { "epoch": 1.1701628052598623, "grad_norm": 2.132571220397949, "learning_rate": 6.316406249999999e-05, "loss": 0.5772, "step": 7475 }, { "epoch": 1.1703193487789605, "grad_norm": 2.9597268104553223, "learning_rate": 6.315217391304348e-05, "loss": 0.729, "step": 7476 }, { "epoch": 1.170475892298059, "grad_norm": 2.1593737602233887, "learning_rate": 6.314028532608696e-05, "loss": 0.671, "step": 7477 }, { "epoch": 1.1706324358171571, "grad_norm": 3.9993855953216553, "learning_rate": 6.312839673913044e-05, "loss": 1.2807, "step": 7478 }, { "epoch": 1.1707889793362556, "grad_norm": 2.4214632511138916, "learning_rate": 6.311650815217391e-05, "loss": 0.8767, "step": 7479 }, { "epoch": 1.1709455228553538, "grad_norm": 4.824526309967041, "learning_rate": 6.310461956521739e-05, "loss": 0.9206, "step": 7480 }, { "epoch": 1.171102066374452, "grad_norm": 2.418861150741577, "learning_rate": 6.309273097826086e-05, "loss": 1.1497, "step": 7481 }, { "epoch": 1.1712586098935505, "grad_norm": 2.167546033859253, "learning_rate": 6.308084239130433e-05, "loss": 1.1727, "step": 7482 }, { "epoch": 1.1714151534126487, "grad_norm": 7.26110315322876, "learning_rate": 6.306895380434781e-05, "loss": 1.8, "step": 7483 }, { "epoch": 1.1715716969317471, "grad_norm": 2.3303141593933105, "learning_rate": 6.305706521739129e-05, "loss": 0.9107, "step": 
7484 }, { "epoch": 1.1717282404508453, "grad_norm": 3.025266647338867, "learning_rate": 6.304517663043477e-05, "loss": 0.9668, "step": 7485 }, { "epoch": 1.1718847839699436, "grad_norm": 3.0980987548828125, "learning_rate": 6.303328804347826e-05, "loss": 1.1538, "step": 7486 }, { "epoch": 1.172041327489042, "grad_norm": 2.015627145767212, "learning_rate": 6.302139945652174e-05, "loss": 0.5435, "step": 7487 }, { "epoch": 1.1721978710081402, "grad_norm": 1.9969065189361572, "learning_rate": 6.300951086956522e-05, "loss": 0.4817, "step": 7488 }, { "epoch": 1.1723544145272387, "grad_norm": 0.5814707279205322, "learning_rate": 6.29976222826087e-05, "loss": 0.272, "step": 7489 }, { "epoch": 1.1725109580463369, "grad_norm": 0.4682469666004181, "learning_rate": 6.298573369565217e-05, "loss": 0.1947, "step": 7490 }, { "epoch": 1.172667501565435, "grad_norm": 1.1792140007019043, "learning_rate": 6.297384510869565e-05, "loss": 0.6378, "step": 7491 }, { "epoch": 1.1728240450845335, "grad_norm": 0.688774585723877, "learning_rate": 6.296195652173913e-05, "loss": 0.2711, "step": 7492 }, { "epoch": 1.1729805886036317, "grad_norm": 0.485519677400589, "learning_rate": 6.29500679347826e-05, "loss": 0.2116, "step": 7493 }, { "epoch": 1.1731371321227302, "grad_norm": 1.1582224369049072, "learning_rate": 6.293817934782607e-05, "loss": 0.2588, "step": 7494 }, { "epoch": 1.1732936756418284, "grad_norm": 0.6169425845146179, "learning_rate": 6.292629076086955e-05, "loss": 0.1886, "step": 7495 }, { "epoch": 1.1734502191609266, "grad_norm": 0.6004372239112854, "learning_rate": 6.291440217391304e-05, "loss": 0.3341, "step": 7496 }, { "epoch": 1.173606762680025, "grad_norm": 0.781673014163971, "learning_rate": 6.290251358695652e-05, "loss": 0.5096, "step": 7497 }, { "epoch": 1.1737633061991233, "grad_norm": 1.778995394706726, "learning_rate": 6.2890625e-05, "loss": 0.2775, "step": 7498 }, { "epoch": 1.1739198497182217, "grad_norm": 0.7870672345161438, "learning_rate": 6.287873641304348e-05, 
"loss": 0.2859, "step": 7499 }, { "epoch": 1.17407639323732, "grad_norm": 0.7637919187545776, "learning_rate": 6.286684782608696e-05, "loss": 0.3777, "step": 7500 }, { "epoch": 1.1742329367564184, "grad_norm": 0.8241618871688843, "learning_rate": 6.285495923913043e-05, "loss": 0.2698, "step": 7501 }, { "epoch": 1.1743894802755166, "grad_norm": 1.5872952938079834, "learning_rate": 6.284307065217391e-05, "loss": 0.2464, "step": 7502 }, { "epoch": 1.1745460237946148, "grad_norm": 0.9228699207305908, "learning_rate": 6.283118206521739e-05, "loss": 0.3486, "step": 7503 }, { "epoch": 1.1747025673137133, "grad_norm": 1.3748432397842407, "learning_rate": 6.281929347826085e-05, "loss": 0.2414, "step": 7504 }, { "epoch": 1.1748591108328115, "grad_norm": 0.743278443813324, "learning_rate": 6.280740489130435e-05, "loss": 0.221, "step": 7505 }, { "epoch": 1.17501565435191, "grad_norm": 2.440538167953491, "learning_rate": 6.279551630434782e-05, "loss": 0.3466, "step": 7506 }, { "epoch": 1.1751721978710081, "grad_norm": 1.873499870300293, "learning_rate": 6.27836277173913e-05, "loss": 0.5665, "step": 7507 }, { "epoch": 1.1753287413901066, "grad_norm": 1.886534333229065, "learning_rate": 6.277173913043478e-05, "loss": 0.5874, "step": 7508 }, { "epoch": 1.1754852849092048, "grad_norm": 2.860410213470459, "learning_rate": 6.275985054347826e-05, "loss": 0.5645, "step": 7509 }, { "epoch": 1.175641828428303, "grad_norm": 2.450225830078125, "learning_rate": 6.274796195652174e-05, "loss": 0.5304, "step": 7510 }, { "epoch": 1.1757983719474014, "grad_norm": 0.8417583107948303, "learning_rate": 6.273607336956521e-05, "loss": 0.3547, "step": 7511 }, { "epoch": 1.1759549154664997, "grad_norm": 1.0077292919158936, "learning_rate": 6.272418478260869e-05, "loss": 0.3355, "step": 7512 }, { "epoch": 1.176111458985598, "grad_norm": 1.1695177555084229, "learning_rate": 6.271229619565217e-05, "loss": 0.3169, "step": 7513 }, { "epoch": 1.1762680025046963, "grad_norm": 2.6701595783233643, 
"learning_rate": 6.270040760869565e-05, "loss": 0.7611, "step": 7514 }, { "epoch": 1.1764245460237945, "grad_norm": 1.3142681121826172, "learning_rate": 6.268851902173913e-05, "loss": 0.4559, "step": 7515 }, { "epoch": 1.176581089542893, "grad_norm": 2.8741769790649414, "learning_rate": 6.26766304347826e-05, "loss": 0.5167, "step": 7516 }, { "epoch": 1.1767376330619912, "grad_norm": 2.1146955490112305, "learning_rate": 6.266474184782608e-05, "loss": 0.7182, "step": 7517 }, { "epoch": 1.1768941765810896, "grad_norm": 2.098400115966797, "learning_rate": 6.265285326086956e-05, "loss": 0.6411, "step": 7518 }, { "epoch": 1.1770507201001879, "grad_norm": 4.092765808105469, "learning_rate": 6.264096467391304e-05, "loss": 0.7222, "step": 7519 }, { "epoch": 1.177207263619286, "grad_norm": 1.9952706098556519, "learning_rate": 6.262907608695652e-05, "loss": 0.3242, "step": 7520 }, { "epoch": 1.1773638071383845, "grad_norm": 1.246323823928833, "learning_rate": 6.26171875e-05, "loss": 0.3778, "step": 7521 }, { "epoch": 1.1775203506574827, "grad_norm": 1.8903555870056152, "learning_rate": 6.260529891304347e-05, "loss": 0.6831, "step": 7522 }, { "epoch": 1.1776768941765812, "grad_norm": 3.3818888664245605, "learning_rate": 6.259341032608695e-05, "loss": 0.7952, "step": 7523 }, { "epoch": 1.1778334376956794, "grad_norm": 5.739587783813477, "learning_rate": 6.258152173913043e-05, "loss": 0.788, "step": 7524 }, { "epoch": 1.1779899812147776, "grad_norm": 2.786895751953125, "learning_rate": 6.256963315217391e-05, "loss": 0.9844, "step": 7525 }, { "epoch": 1.178146524733876, "grad_norm": 3.477977991104126, "learning_rate": 6.255774456521739e-05, "loss": 0.7374, "step": 7526 }, { "epoch": 1.1783030682529743, "grad_norm": 3.0741751194000244, "learning_rate": 6.254585597826087e-05, "loss": 1.0446, "step": 7527 }, { "epoch": 1.1784596117720727, "grad_norm": 4.994105339050293, "learning_rate": 6.253396739130434e-05, "loss": 1.0774, "step": 7528 }, { "epoch": 1.178616155291171, "grad_norm": 
1.9073766469955444, "learning_rate": 6.252207880434782e-05, "loss": 0.8548, "step": 7529 }, { "epoch": 1.1787726988102691, "grad_norm": 7.657297134399414, "learning_rate": 6.25101902173913e-05, "loss": 1.1864, "step": 7530 }, { "epoch": 1.1789292423293676, "grad_norm": 3.1733858585357666, "learning_rate": 6.249830163043478e-05, "loss": 1.3629, "step": 7531 }, { "epoch": 1.1790857858484658, "grad_norm": 2.775118589401245, "learning_rate": 6.248641304347826e-05, "loss": 1.3927, "step": 7532 }, { "epoch": 1.1792423293675642, "grad_norm": 4.833832263946533, "learning_rate": 6.247452445652173e-05, "loss": 1.4116, "step": 7533 }, { "epoch": 1.1793988728866625, "grad_norm": 2.9268038272857666, "learning_rate": 6.246263586956521e-05, "loss": 1.0746, "step": 7534 }, { "epoch": 1.179555416405761, "grad_norm": 1.7705191373825073, "learning_rate": 6.245074728260869e-05, "loss": 0.2649, "step": 7535 }, { "epoch": 1.179711959924859, "grad_norm": 4.11981725692749, "learning_rate": 6.243885869565217e-05, "loss": 0.5959, "step": 7536 }, { "epoch": 1.1798685034439573, "grad_norm": 5.220945835113525, "learning_rate": 6.242697010869565e-05, "loss": 0.8954, "step": 7537 }, { "epoch": 1.1800250469630558, "grad_norm": 1.4919085502624512, "learning_rate": 6.241508152173913e-05, "loss": 0.672, "step": 7538 }, { "epoch": 1.180181590482154, "grad_norm": 0.47644251585006714, "learning_rate": 6.24031929347826e-05, "loss": 0.2023, "step": 7539 }, { "epoch": 1.1803381340012524, "grad_norm": 0.5068759918212891, "learning_rate": 6.239130434782608e-05, "loss": 0.1589, "step": 7540 }, { "epoch": 1.1804946775203506, "grad_norm": 0.3527630567550659, "learning_rate": 6.237941576086956e-05, "loss": 0.2007, "step": 7541 }, { "epoch": 1.180651221039449, "grad_norm": 2.203486442565918, "learning_rate": 6.236752717391304e-05, "loss": 0.3517, "step": 7542 }, { "epoch": 1.1808077645585473, "grad_norm": 0.493836373090744, "learning_rate": 6.235563858695652e-05, "loss": 0.1706, "step": 7543 }, { "epoch": 
1.1809643080776455, "grad_norm": 1.2223751544952393, "learning_rate": 6.234375e-05, "loss": 0.2325, "step": 7544 }, { "epoch": 1.181120851596744, "grad_norm": 0.763396680355072, "learning_rate": 6.233186141304347e-05, "loss": 0.2635, "step": 7545 }, { "epoch": 1.1812773951158422, "grad_norm": 0.8322598338127136, "learning_rate": 6.231997282608695e-05, "loss": 0.2538, "step": 7546 }, { "epoch": 1.1814339386349406, "grad_norm": 0.9594230651855469, "learning_rate": 6.230808423913043e-05, "loss": 0.2277, "step": 7547 }, { "epoch": 1.1815904821540388, "grad_norm": 0.8378079533576965, "learning_rate": 6.22961956521739e-05, "loss": 0.258, "step": 7548 }, { "epoch": 1.181747025673137, "grad_norm": 0.633420467376709, "learning_rate": 6.228430706521738e-05, "loss": 0.2468, "step": 7549 }, { "epoch": 1.1819035691922355, "grad_norm": 0.7347333431243896, "learning_rate": 6.227241847826086e-05, "loss": 0.2555, "step": 7550 }, { "epoch": 1.1820601127113337, "grad_norm": 1.3448619842529297, "learning_rate": 6.226052989130434e-05, "loss": 0.5017, "step": 7551 }, { "epoch": 1.1822166562304322, "grad_norm": 1.646381139755249, "learning_rate": 6.224864130434782e-05, "loss": 0.5084, "step": 7552 }, { "epoch": 1.1823731997495304, "grad_norm": 1.9713810682296753, "learning_rate": 6.22367527173913e-05, "loss": 0.3606, "step": 7553 }, { "epoch": 1.1825297432686286, "grad_norm": 0.7265000939369202, "learning_rate": 6.222486413043478e-05, "loss": 0.2839, "step": 7554 }, { "epoch": 1.182686286787727, "grad_norm": 2.022268056869507, "learning_rate": 6.221297554347825e-05, "loss": 0.3599, "step": 7555 }, { "epoch": 1.1828428303068252, "grad_norm": 2.420374631881714, "learning_rate": 6.220108695652173e-05, "loss": 0.6521, "step": 7556 }, { "epoch": 1.1829993738259237, "grad_norm": 1.4028124809265137, "learning_rate": 6.218919836956521e-05, "loss": 0.3538, "step": 7557 }, { "epoch": 1.183155917345022, "grad_norm": 2.072009563446045, "learning_rate": 6.217730978260869e-05, "loss": 0.5215, "step": 
7558 }, { "epoch": 1.1833124608641201, "grad_norm": 1.5961508750915527, "learning_rate": 6.216542119565217e-05, "loss": 0.3622, "step": 7559 }, { "epoch": 1.1834690043832186, "grad_norm": 0.9592030644416809, "learning_rate": 6.215353260869564e-05, "loss": 0.341, "step": 7560 }, { "epoch": 1.1836255479023168, "grad_norm": 1.1676949262619019, "learning_rate": 6.214164402173912e-05, "loss": 0.425, "step": 7561 }, { "epoch": 1.1837820914214152, "grad_norm": 2.4771101474761963, "learning_rate": 6.21297554347826e-05, "loss": 0.4242, "step": 7562 }, { "epoch": 1.1839386349405134, "grad_norm": 1.6763110160827637, "learning_rate": 6.211786684782608e-05, "loss": 0.4108, "step": 7563 }, { "epoch": 1.1840951784596117, "grad_norm": 4.426321029663086, "learning_rate": 6.210597826086956e-05, "loss": 0.8254, "step": 7564 }, { "epoch": 1.18425172197871, "grad_norm": 3.4210009574890137, "learning_rate": 6.209408967391304e-05, "loss": 0.5853, "step": 7565 }, { "epoch": 1.1844082654978083, "grad_norm": 1.1335431337356567, "learning_rate": 6.208220108695651e-05, "loss": 0.4138, "step": 7566 }, { "epoch": 1.1845648090169068, "grad_norm": 1.8663029670715332, "learning_rate": 6.207031249999999e-05, "loss": 0.6285, "step": 7567 }, { "epoch": 1.184721352536005, "grad_norm": 2.2447612285614014, "learning_rate": 6.205842391304347e-05, "loss": 0.3645, "step": 7568 }, { "epoch": 1.1848778960551034, "grad_norm": 4.403597831726074, "learning_rate": 6.204653532608695e-05, "loss": 0.7362, "step": 7569 }, { "epoch": 1.1850344395742016, "grad_norm": 3.8521971702575684, "learning_rate": 6.203464673913043e-05, "loss": 0.8717, "step": 7570 }, { "epoch": 1.1851909830932998, "grad_norm": 2.200761318206787, "learning_rate": 6.20227581521739e-05, "loss": 0.6496, "step": 7571 }, { "epoch": 1.1853475266123983, "grad_norm": 2.8920490741729736, "learning_rate": 6.201086956521738e-05, "loss": 0.9264, "step": 7572 }, { "epoch": 1.1855040701314965, "grad_norm": 3.0893867015838623, "learning_rate": 
6.199898097826086e-05, "loss": 0.7061, "step": 7573 }, { "epoch": 1.185660613650595, "grad_norm": 3.041358709335327, "learning_rate": 6.198709239130434e-05, "loss": 0.6583, "step": 7574 }, { "epoch": 1.1858171571696932, "grad_norm": 3.6952803134918213, "learning_rate": 6.197520380434782e-05, "loss": 0.8147, "step": 7575 }, { "epoch": 1.1859737006887916, "grad_norm": 3.052802801132202, "learning_rate": 6.19633152173913e-05, "loss": 0.7293, "step": 7576 }, { "epoch": 1.1861302442078898, "grad_norm": 2.7489960193634033, "learning_rate": 6.195142663043477e-05, "loss": 1.0375, "step": 7577 }, { "epoch": 1.186286787726988, "grad_norm": 2.6536059379577637, "learning_rate": 6.193953804347825e-05, "loss": 0.6784, "step": 7578 }, { "epoch": 1.1864433312460865, "grad_norm": 3.8463430404663086, "learning_rate": 6.192764945652173e-05, "loss": 0.7498, "step": 7579 }, { "epoch": 1.1865998747651847, "grad_norm": 2.239819288253784, "learning_rate": 6.191576086956521e-05, "loss": 0.9055, "step": 7580 }, { "epoch": 1.1867564182842831, "grad_norm": 3.392461061477661, "learning_rate": 6.190387228260869e-05, "loss": 0.9295, "step": 7581 }, { "epoch": 1.1869129618033814, "grad_norm": 4.237736701965332, "learning_rate": 6.189198369565218e-05, "loss": 1.0415, "step": 7582 }, { "epoch": 1.1870695053224796, "grad_norm": 3.427994728088379, "learning_rate": 6.188009510869566e-05, "loss": 1.2054, "step": 7583 }, { "epoch": 1.187226048841578, "grad_norm": 3.0408518314361572, "learning_rate": 6.186820652173913e-05, "loss": 0.9538, "step": 7584 }, { "epoch": 1.1873825923606762, "grad_norm": 3.775588274002075, "learning_rate": 6.18563179347826e-05, "loss": 0.9651, "step": 7585 }, { "epoch": 1.1875391358797747, "grad_norm": 1.3485240936279297, "learning_rate": 6.184442934782608e-05, "loss": 0.4383, "step": 7586 }, { "epoch": 1.1876956793988729, "grad_norm": 2.971301794052124, "learning_rate": 6.183254076086955e-05, "loss": 0.6211, "step": 7587 }, { "epoch": 1.187852222917971, "grad_norm": 
3.4538991451263428, "learning_rate": 6.182065217391303e-05, "loss": 0.6474, "step": 7588 }, { "epoch": 1.1880087664370695, "grad_norm": 0.4463996887207031, "learning_rate": 6.180876358695651e-05, "loss": 0.2373, "step": 7589 }, { "epoch": 1.1881653099561678, "grad_norm": 0.5145469307899475, "learning_rate": 6.179687499999999e-05, "loss": 0.1834, "step": 7590 }, { "epoch": 1.1883218534752662, "grad_norm": 0.35160133242607117, "learning_rate": 6.178498641304347e-05, "loss": 0.1397, "step": 7591 }, { "epoch": 1.1884783969943644, "grad_norm": 0.5377123951911926, "learning_rate": 6.177309782608696e-05, "loss": 0.1721, "step": 7592 }, { "epoch": 1.1886349405134626, "grad_norm": 0.49151578545570374, "learning_rate": 6.176120923913044e-05, "loss": 0.1751, "step": 7593 }, { "epoch": 1.188791484032561, "grad_norm": 0.4734833836555481, "learning_rate": 6.174932065217392e-05, "loss": 0.2116, "step": 7594 }, { "epoch": 1.1889480275516593, "grad_norm": 0.9341704249382019, "learning_rate": 6.17374320652174e-05, "loss": 0.3427, "step": 7595 }, { "epoch": 1.1891045710707577, "grad_norm": 0.6779454350471497, "learning_rate": 6.172554347826086e-05, "loss": 0.2056, "step": 7596 }, { "epoch": 1.189261114589856, "grad_norm": 1.0069419145584106, "learning_rate": 6.171365489130434e-05, "loss": 0.1998, "step": 7597 }, { "epoch": 1.1894176581089542, "grad_norm": 0.940328061580658, "learning_rate": 6.170176630434781e-05, "loss": 0.3249, "step": 7598 }, { "epoch": 1.1895742016280526, "grad_norm": 1.0799555778503418, "learning_rate": 6.168987771739129e-05, "loss": 0.3578, "step": 7599 }, { "epoch": 1.1897307451471508, "grad_norm": 1.108219861984253, "learning_rate": 6.167798913043477e-05, "loss": 0.2321, "step": 7600 }, { "epoch": 1.1898872886662493, "grad_norm": 0.8791635632514954, "learning_rate": 6.166610054347825e-05, "loss": 0.2397, "step": 7601 }, { "epoch": 1.1900438321853475, "grad_norm": 0.8257560133934021, "learning_rate": 6.165421195652174e-05, "loss": 0.3088, "step": 7602 }, { 
"epoch": 1.190200375704446, "grad_norm": 0.8180612325668335, "learning_rate": 6.164232336956522e-05, "loss": 0.187, "step": 7603 }, { "epoch": 1.1903569192235441, "grad_norm": 0.979602575302124, "learning_rate": 6.16304347826087e-05, "loss": 0.2842, "step": 7604 }, { "epoch": 1.1905134627426424, "grad_norm": 3.676037073135376, "learning_rate": 6.161854619565218e-05, "loss": 0.9774, "step": 7605 }, { "epoch": 1.1906700062617408, "grad_norm": 1.5089037418365479, "learning_rate": 6.160665760869565e-05, "loss": 0.4634, "step": 7606 }, { "epoch": 1.190826549780839, "grad_norm": 1.5573688745498657, "learning_rate": 6.159476902173913e-05, "loss": 0.2906, "step": 7607 }, { "epoch": 1.1909830932999375, "grad_norm": 1.4436564445495605, "learning_rate": 6.15828804347826e-05, "loss": 0.3099, "step": 7608 }, { "epoch": 1.1911396368190357, "grad_norm": 1.1259815692901611, "learning_rate": 6.157099184782607e-05, "loss": 0.325, "step": 7609 }, { "epoch": 1.1912961803381341, "grad_norm": 2.2596137523651123, "learning_rate": 6.155910326086955e-05, "loss": 0.3798, "step": 7610 }, { "epoch": 1.1914527238572323, "grad_norm": 1.3787095546722412, "learning_rate": 6.154721467391303e-05, "loss": 0.7399, "step": 7611 }, { "epoch": 1.1916092673763305, "grad_norm": 2.124176502227783, "learning_rate": 6.153532608695652e-05, "loss": 0.4292, "step": 7612 }, { "epoch": 1.191765810895429, "grad_norm": 1.9678934812545776, "learning_rate": 6.15234375e-05, "loss": 0.8659, "step": 7613 }, { "epoch": 1.1919223544145272, "grad_norm": 1.7352195978164673, "learning_rate": 6.151154891304348e-05, "loss": 0.3954, "step": 7614 }, { "epoch": 1.1920788979336256, "grad_norm": 2.786628246307373, "learning_rate": 6.149966032608696e-05, "loss": 0.9251, "step": 7615 }, { "epoch": 1.1922354414527239, "grad_norm": 3.765507221221924, "learning_rate": 6.148777173913043e-05, "loss": 0.871, "step": 7616 }, { "epoch": 1.192391984971822, "grad_norm": 5.071339130401611, "learning_rate": 6.147588315217391e-05, "loss": 0.535, 
"step": 7617 }, { "epoch": 1.1925485284909205, "grad_norm": 3.75254487991333, "learning_rate": 6.146399456521739e-05, "loss": 0.7451, "step": 7618 }, { "epoch": 1.1927050720100187, "grad_norm": 4.018612384796143, "learning_rate": 6.145210597826086e-05, "loss": 0.595, "step": 7619 }, { "epoch": 1.1928616155291172, "grad_norm": 1.251801609992981, "learning_rate": 6.144021739130433e-05, "loss": 0.4106, "step": 7620 }, { "epoch": 1.1930181590482154, "grad_norm": 5.6656060218811035, "learning_rate": 6.142832880434781e-05, "loss": 0.7663, "step": 7621 }, { "epoch": 1.1931747025673136, "grad_norm": 1.7824429273605347, "learning_rate": 6.14164402173913e-05, "loss": 0.7785, "step": 7622 }, { "epoch": 1.193331246086412, "grad_norm": 1.9864685535430908, "learning_rate": 6.140455163043478e-05, "loss": 0.5391, "step": 7623 }, { "epoch": 1.1934877896055103, "grad_norm": 3.96679425239563, "learning_rate": 6.139266304347826e-05, "loss": 1.1995, "step": 7624 }, { "epoch": 1.1936443331246087, "grad_norm": 3.3757591247558594, "learning_rate": 6.138077445652174e-05, "loss": 0.5857, "step": 7625 }, { "epoch": 1.193800876643707, "grad_norm": 2.7364139556884766, "learning_rate": 6.136888586956522e-05, "loss": 1.46, "step": 7626 }, { "epoch": 1.1939574201628051, "grad_norm": 3.501331329345703, "learning_rate": 6.13569972826087e-05, "loss": 1.2463, "step": 7627 }, { "epoch": 1.1941139636819036, "grad_norm": 2.643327474594116, "learning_rate": 6.134510869565217e-05, "loss": 1.6504, "step": 7628 }, { "epoch": 1.1942705072010018, "grad_norm": 4.2857232093811035, "learning_rate": 6.133322010869565e-05, "loss": 1.0049, "step": 7629 }, { "epoch": 1.1944270507201002, "grad_norm": 2.2139172554016113, "learning_rate": 6.132133152173913e-05, "loss": 1.0571, "step": 7630 }, { "epoch": 1.1945835942391985, "grad_norm": 2.3542017936706543, "learning_rate": 6.13094429347826e-05, "loss": 0.9088, "step": 7631 }, { "epoch": 1.1947401377582967, "grad_norm": 2.247584104537964, "learning_rate": 
6.129755434782609e-05, "loss": 0.8271, "step": 7632 }, { "epoch": 1.1948966812773951, "grad_norm": 2.9968366622924805, "learning_rate": 6.128566576086956e-05, "loss": 1.0734, "step": 7633 }, { "epoch": 1.1950532247964933, "grad_norm": 3.636659860610962, "learning_rate": 6.127377717391304e-05, "loss": 0.9144, "step": 7634 }, { "epoch": 1.1952097683155918, "grad_norm": 3.823564052581787, "learning_rate": 6.126188858695652e-05, "loss": 0.2957, "step": 7635 }, { "epoch": 1.19536631183469, "grad_norm": 5.897661209106445, "learning_rate": 6.125e-05, "loss": 1.0691, "step": 7636 }, { "epoch": 1.1955228553537884, "grad_norm": 2.4356448650360107, "learning_rate": 6.123811141304348e-05, "loss": 0.698, "step": 7637 }, { "epoch": 1.1956793988728867, "grad_norm": 2.2981321811676025, "learning_rate": 6.122622282608695e-05, "loss": 0.6237, "step": 7638 }, { "epoch": 1.195835942391985, "grad_norm": 0.6660681366920471, "learning_rate": 6.121433423913043e-05, "loss": 0.2469, "step": 7639 }, { "epoch": 1.1959924859110833, "grad_norm": 0.9615753293037415, "learning_rate": 6.120244565217391e-05, "loss": 0.2923, "step": 7640 }, { "epoch": 1.1961490294301815, "grad_norm": 3.2661068439483643, "learning_rate": 6.119055706521739e-05, "loss": 0.2314, "step": 7641 }, { "epoch": 1.19630557294928, "grad_norm": 0.5453205108642578, "learning_rate": 6.117866847826087e-05, "loss": 0.2014, "step": 7642 }, { "epoch": 1.1964621164683782, "grad_norm": 0.5954243540763855, "learning_rate": 6.116677989130435e-05, "loss": 0.1691, "step": 7643 }, { "epoch": 1.1966186599874766, "grad_norm": 1.2150520086288452, "learning_rate": 6.115489130434782e-05, "loss": 0.3478, "step": 7644 }, { "epoch": 1.1967752035065748, "grad_norm": 0.6582737565040588, "learning_rate": 6.11430027173913e-05, "loss": 0.2789, "step": 7645 }, { "epoch": 1.196931747025673, "grad_norm": 0.7186632752418518, "learning_rate": 6.113111413043478e-05, "loss": 0.2392, "step": 7646 }, { "epoch": 1.1970882905447715, "grad_norm": 0.7572813034057617, 
"learning_rate": 6.111922554347826e-05, "loss": 0.3215, "step": 7647 }, { "epoch": 1.1972448340638697, "grad_norm": 0.6482993364334106, "learning_rate": 6.110733695652174e-05, "loss": 0.1687, "step": 7648 }, { "epoch": 1.1974013775829682, "grad_norm": 1.3564372062683105, "learning_rate": 6.109544836956521e-05, "loss": 0.2511, "step": 7649 }, { "epoch": 1.1975579211020664, "grad_norm": 1.667864441871643, "learning_rate": 6.108355978260869e-05, "loss": 0.5524, "step": 7650 }, { "epoch": 1.1977144646211646, "grad_norm": 1.0896278619766235, "learning_rate": 6.107167119565217e-05, "loss": 0.2789, "step": 7651 }, { "epoch": 1.197871008140263, "grad_norm": 1.5141583681106567, "learning_rate": 6.105978260869565e-05, "loss": 0.3976, "step": 7652 }, { "epoch": 1.1980275516593613, "grad_norm": 1.441391110420227, "learning_rate": 6.104789402173913e-05, "loss": 0.3327, "step": 7653 }, { "epoch": 1.1981840951784597, "grad_norm": 1.1699985265731812, "learning_rate": 6.10360054347826e-05, "loss": 0.3632, "step": 7654 }, { "epoch": 1.198340638697558, "grad_norm": 1.323407769203186, "learning_rate": 6.1024116847826076e-05, "loss": 0.4269, "step": 7655 }, { "epoch": 1.1984971822166561, "grad_norm": 1.3018369674682617, "learning_rate": 6.101222826086956e-05, "loss": 0.4321, "step": 7656 }, { "epoch": 1.1986537257357546, "grad_norm": 2.5522592067718506, "learning_rate": 6.100033967391304e-05, "loss": 0.6265, "step": 7657 }, { "epoch": 1.1988102692548528, "grad_norm": 2.372467041015625, "learning_rate": 6.098845108695652e-05, "loss": 0.5506, "step": 7658 }, { "epoch": 1.1989668127739512, "grad_norm": 2.055413246154785, "learning_rate": 6.0976562499999996e-05, "loss": 0.5261, "step": 7659 }, { "epoch": 1.1991233562930494, "grad_norm": 1.3447391986846924, "learning_rate": 6.0964673913043474e-05, "loss": 0.3798, "step": 7660 }, { "epoch": 1.1992798998121477, "grad_norm": 2.1812744140625, "learning_rate": 6.095278532608695e-05, "loss": 0.2135, "step": 7661 }, { "epoch": 1.199436443331246, 
"grad_norm": 1.2696913480758667, "learning_rate": 6.094089673913043e-05, "loss": 0.4542, "step": 7662 }, { "epoch": 1.1995929868503443, "grad_norm": 3.2480366230010986, "learning_rate": 6.092900815217391e-05, "loss": 1.005, "step": 7663 }, { "epoch": 1.1997495303694428, "grad_norm": 2.278611660003662, "learning_rate": 6.0917119565217386e-05, "loss": 0.4737, "step": 7664 }, { "epoch": 1.199906073888541, "grad_norm": 1.4691022634506226, "learning_rate": 6.090523097826086e-05, "loss": 0.468, "step": 7665 }, { "epoch": 1.2000626174076394, "grad_norm": 2.303895950317383, "learning_rate": 6.089334239130434e-05, "loss": 0.8956, "step": 7666 }, { "epoch": 1.2002191609267376, "grad_norm": 1.599799394607544, "learning_rate": 6.088145380434782e-05, "loss": 0.4515, "step": 7667 }, { "epoch": 1.2003757044458359, "grad_norm": 3.780550241470337, "learning_rate": 6.08695652173913e-05, "loss": 0.8438, "step": 7668 }, { "epoch": 1.2005322479649343, "grad_norm": 1.8404790163040161, "learning_rate": 6.085767663043478e-05, "loss": 0.5856, "step": 7669 }, { "epoch": 1.2006887914840325, "grad_norm": 2.064284563064575, "learning_rate": 6.0845788043478255e-05, "loss": 0.7235, "step": 7670 }, { "epoch": 1.200845335003131, "grad_norm": 3.0421502590179443, "learning_rate": 6.0833899456521733e-05, "loss": 0.78, "step": 7671 }, { "epoch": 1.2010018785222292, "grad_norm": 2.050741672515869, "learning_rate": 6.082201086956521e-05, "loss": 1.0262, "step": 7672 }, { "epoch": 1.2011584220413276, "grad_norm": 1.9760346412658691, "learning_rate": 6.081012228260869e-05, "loss": 0.9045, "step": 7673 }, { "epoch": 1.2013149655604258, "grad_norm": 2.8690881729125977, "learning_rate": 6.079823369565217e-05, "loss": 0.754, "step": 7674 }, { "epoch": 1.201471509079524, "grad_norm": 3.1121840476989746, "learning_rate": 6.078634510869565e-05, "loss": 0.9022, "step": 7675 }, { "epoch": 1.2016280525986225, "grad_norm": 2.8963470458984375, "learning_rate": 6.077445652173913e-05, "loss": 1.3634, "step": 7676 }, { 
"epoch": 1.2017845961177207, "grad_norm": 1.7100067138671875, "learning_rate": 6.07625679347826e-05, "loss": 0.9021, "step": 7677 }, { "epoch": 1.2019411396368191, "grad_norm": 4.330780506134033, "learning_rate": 6.075067934782608e-05, "loss": 0.9949, "step": 7678 }, { "epoch": 1.2020976831559174, "grad_norm": 1.3116711378097534, "learning_rate": 6.073879076086956e-05, "loss": 0.716, "step": 7679 }, { "epoch": 1.2022542266750156, "grad_norm": 5.517224311828613, "learning_rate": 6.072690217391304e-05, "loss": 1.2292, "step": 7680 }, { "epoch": 1.202410770194114, "grad_norm": 3.0169897079467773, "learning_rate": 6.0715013586956515e-05, "loss": 1.6786, "step": 7681 }, { "epoch": 1.2025673137132122, "grad_norm": 2.946890354156494, "learning_rate": 6.070312499999999e-05, "loss": 1.1465, "step": 7682 }, { "epoch": 1.2027238572323107, "grad_norm": 4.64113187789917, "learning_rate": 6.069123641304347e-05, "loss": 0.5718, "step": 7683 }, { "epoch": 1.202880400751409, "grad_norm": 4.029336452484131, "learning_rate": 6.067934782608695e-05, "loss": 0.9319, "step": 7684 }, { "epoch": 1.2030369442705071, "grad_norm": 2.3594441413879395, "learning_rate": 6.0667459239130434e-05, "loss": 0.6398, "step": 7685 }, { "epoch": 1.2031934877896056, "grad_norm": 4.359577655792236, "learning_rate": 6.065557065217391e-05, "loss": 1.2085, "step": 7686 }, { "epoch": 1.2033500313087038, "grad_norm": 2.303274154663086, "learning_rate": 6.064368206521739e-05, "loss": 1.2446, "step": 7687 }, { "epoch": 1.2035065748278022, "grad_norm": 2.3410794734954834, "learning_rate": 6.063179347826086e-05, "loss": 0.7216, "step": 7688 }, { "epoch": 1.2036631183469004, "grad_norm": 0.41671010851860046, "learning_rate": 6.061990489130434e-05, "loss": 0.233, "step": 7689 }, { "epoch": 1.2038196618659986, "grad_norm": 0.4557720124721527, "learning_rate": 6.060801630434782e-05, "loss": 0.2181, "step": 7690 }, { "epoch": 1.203976205385097, "grad_norm": 0.6381228566169739, "learning_rate": 6.0596127717391297e-05, 
"loss": 0.2053, "step": 7691 }, { "epoch": 1.2041327489041953, "grad_norm": 0.5672786831855774, "learning_rate": 6.0584239130434775e-05, "loss": 0.1755, "step": 7692 }, { "epoch": 1.2042892924232937, "grad_norm": 0.48637664318084717, "learning_rate": 6.057235054347825e-05, "loss": 0.1443, "step": 7693 }, { "epoch": 1.204445835942392, "grad_norm": 0.9691879749298096, "learning_rate": 6.056046195652173e-05, "loss": 0.2944, "step": 7694 }, { "epoch": 1.2046023794614902, "grad_norm": 0.7859202027320862, "learning_rate": 6.0548573369565216e-05, "loss": 0.2845, "step": 7695 }, { "epoch": 1.2047589229805886, "grad_norm": 0.7127594947814941, "learning_rate": 6.0536684782608694e-05, "loss": 0.2209, "step": 7696 }, { "epoch": 1.2049154664996868, "grad_norm": 0.9675967693328857, "learning_rate": 6.052479619565217e-05, "loss": 0.2577, "step": 7697 }, { "epoch": 1.2050720100187853, "grad_norm": 0.9987935423851013, "learning_rate": 6.051290760869565e-05, "loss": 0.2829, "step": 7698 }, { "epoch": 1.2052285535378835, "grad_norm": 2.225867748260498, "learning_rate": 6.050101902173913e-05, "loss": 0.3264, "step": 7699 }, { "epoch": 1.205385097056982, "grad_norm": 0.9164025187492371, "learning_rate": 6.04891304347826e-05, "loss": 0.3267, "step": 7700 }, { "epoch": 1.2055416405760802, "grad_norm": 1.0118316411972046, "learning_rate": 6.047724184782608e-05, "loss": 0.2042, "step": 7701 }, { "epoch": 1.2056981840951784, "grad_norm": 2.16774582862854, "learning_rate": 6.0465353260869556e-05, "loss": 0.4158, "step": 7702 }, { "epoch": 1.2058547276142768, "grad_norm": 2.1927356719970703, "learning_rate": 6.0453464673913034e-05, "loss": 0.4448, "step": 7703 }, { "epoch": 1.206011271133375, "grad_norm": 1.8053853511810303, "learning_rate": 6.044157608695651e-05, "loss": 0.3976, "step": 7704 }, { "epoch": 1.2061678146524735, "grad_norm": 1.698429822921753, "learning_rate": 6.04296875e-05, "loss": 0.6303, "step": 7705 }, { "epoch": 1.2063243581715717, "grad_norm": 0.9755812883377075, 
"learning_rate": 6.0417798913043476e-05, "loss": 0.2656, "step": 7706 }, { "epoch": 1.2064809016906701, "grad_norm": 1.6731294393539429, "learning_rate": 6.0405910326086954e-05, "loss": 0.4104, "step": 7707 }, { "epoch": 1.2066374452097683, "grad_norm": 0.8750206828117371, "learning_rate": 6.039402173913043e-05, "loss": 0.2325, "step": 7708 }, { "epoch": 1.2067939887288666, "grad_norm": 1.6510015726089478, "learning_rate": 6.038213315217391e-05, "loss": 0.5925, "step": 7709 }, { "epoch": 1.206950532247965, "grad_norm": 1.7347586154937744, "learning_rate": 6.037024456521739e-05, "loss": 0.3735, "step": 7710 }, { "epoch": 1.2071070757670632, "grad_norm": 1.7352992296218872, "learning_rate": 6.035835597826086e-05, "loss": 0.4665, "step": 7711 }, { "epoch": 1.2072636192861617, "grad_norm": 1.4119412899017334, "learning_rate": 6.034646739130434e-05, "loss": 0.3907, "step": 7712 }, { "epoch": 1.2074201628052599, "grad_norm": 1.5005511045455933, "learning_rate": 6.0334578804347816e-05, "loss": 0.4559, "step": 7713 }, { "epoch": 1.207576706324358, "grad_norm": 1.9284946918487549, "learning_rate": 6.0322690217391294e-05, "loss": 0.4069, "step": 7714 }, { "epoch": 1.2077332498434565, "grad_norm": 3.08929705619812, "learning_rate": 6.031080163043478e-05, "loss": 0.6341, "step": 7715 }, { "epoch": 1.2078897933625548, "grad_norm": 2.6028597354888916, "learning_rate": 6.029891304347826e-05, "loss": 0.5544, "step": 7716 }, { "epoch": 1.2080463368816532, "grad_norm": 5.33465051651001, "learning_rate": 6.0287024456521735e-05, "loss": 0.5379, "step": 7717 }, { "epoch": 1.2082028804007514, "grad_norm": 1.2788032293319702, "learning_rate": 6.0275135869565214e-05, "loss": 0.5163, "step": 7718 }, { "epoch": 1.2083594239198496, "grad_norm": 2.685776472091675, "learning_rate": 6.026324728260869e-05, "loss": 0.6534, "step": 7719 }, { "epoch": 1.208515967438948, "grad_norm": 1.8465579748153687, "learning_rate": 6.025135869565217e-05, "loss": 0.7777, "step": 7720 }, { "epoch": 
1.2086725109580463, "grad_norm": 2.655111312866211, "learning_rate": 6.023947010869565e-05, "loss": 0.7991, "step": 7721 }, { "epoch": 1.2088290544771447, "grad_norm": 2.781317949295044, "learning_rate": 6.0227581521739126e-05, "loss": 0.6934, "step": 7722 }, { "epoch": 1.208985597996243, "grad_norm": 3.939401149749756, "learning_rate": 6.02156929347826e-05, "loss": 0.7164, "step": 7723 }, { "epoch": 1.2091421415153412, "grad_norm": 4.686135768890381, "learning_rate": 6.0203804347826076e-05, "loss": 1.324, "step": 7724 }, { "epoch": 1.2092986850344396, "grad_norm": 2.8432350158691406, "learning_rate": 6.019191576086956e-05, "loss": 0.956, "step": 7725 }, { "epoch": 1.2094552285535378, "grad_norm": 4.141757011413574, "learning_rate": 6.018002717391304e-05, "loss": 0.9663, "step": 7726 }, { "epoch": 1.2096117720726363, "grad_norm": 2.00771427154541, "learning_rate": 6.016813858695652e-05, "loss": 0.8716, "step": 7727 }, { "epoch": 1.2097683155917345, "grad_norm": 1.8016048669815063, "learning_rate": 6.0156249999999995e-05, "loss": 0.9412, "step": 7728 }, { "epoch": 1.2099248591108327, "grad_norm": 3.0464346408843994, "learning_rate": 6.014436141304347e-05, "loss": 0.7624, "step": 7729 }, { "epoch": 1.2100814026299311, "grad_norm": 3.3473849296569824, "learning_rate": 6.013247282608695e-05, "loss": 0.9056, "step": 7730 }, { "epoch": 1.2102379461490294, "grad_norm": 2.744861602783203, "learning_rate": 6.012058423913043e-05, "loss": 1.0515, "step": 7731 }, { "epoch": 1.2103944896681278, "grad_norm": 2.8447232246398926, "learning_rate": 6.010869565217391e-05, "loss": 1.273, "step": 7732 }, { "epoch": 1.210551033187226, "grad_norm": 1.8671693801879883, "learning_rate": 6.009680706521739e-05, "loss": 1.2663, "step": 7733 }, { "epoch": 1.2107075767063245, "grad_norm": 2.1247904300689697, "learning_rate": 6.008491847826086e-05, "loss": 0.4192, "step": 7734 }, { "epoch": 1.2108641202254227, "grad_norm": 1.305168628692627, "learning_rate": 6.007302989130434e-05, "loss": 
0.5231, "step": 7735 }, { "epoch": 1.2110206637445209, "grad_norm": 0.46988534927368164, "learning_rate": 6.006114130434782e-05, "loss": 0.123, "step": 7736 }, { "epoch": 1.2111772072636193, "grad_norm": 3.1105117797851562, "learning_rate": 6.00492527173913e-05, "loss": 0.5432, "step": 7737 }, { "epoch": 1.2113337507827175, "grad_norm": 2.935148000717163, "learning_rate": 6.003736413043478e-05, "loss": 0.9827, "step": 7738 }, { "epoch": 1.211490294301816, "grad_norm": 0.40671178698539734, "learning_rate": 6.0025475543478255e-05, "loss": 0.2098, "step": 7739 }, { "epoch": 1.2116468378209142, "grad_norm": 0.5210038423538208, "learning_rate": 6.001358695652173e-05, "loss": 0.1993, "step": 7740 }, { "epoch": 1.2118033813400126, "grad_norm": 0.7216587066650391, "learning_rate": 6.000169836956521e-05, "loss": 0.2275, "step": 7741 }, { "epoch": 1.2119599248591109, "grad_norm": 0.5302210450172424, "learning_rate": 5.998980978260869e-05, "loss": 0.2435, "step": 7742 }, { "epoch": 1.212116468378209, "grad_norm": 0.6804484724998474, "learning_rate": 5.9977921195652174e-05, "loss": 0.2767, "step": 7743 }, { "epoch": 1.2122730118973075, "grad_norm": 0.5723395943641663, "learning_rate": 5.996603260869565e-05, "loss": 0.2794, "step": 7744 }, { "epoch": 1.2124295554164057, "grad_norm": 0.6949394345283508, "learning_rate": 5.995414402173913e-05, "loss": 0.2431, "step": 7745 }, { "epoch": 1.2125860989355042, "grad_norm": 2.455723762512207, "learning_rate": 5.99422554347826e-05, "loss": 0.3609, "step": 7746 }, { "epoch": 1.2127426424546024, "grad_norm": 2.0301101207733154, "learning_rate": 5.993036684782608e-05, "loss": 0.3574, "step": 7747 }, { "epoch": 1.2128991859737006, "grad_norm": 1.1794615983963013, "learning_rate": 5.991847826086956e-05, "loss": 0.2841, "step": 7748 }, { "epoch": 1.213055729492799, "grad_norm": 1.3608602285385132, "learning_rate": 5.9906589673913036e-05, "loss": 0.3798, "step": 7749 }, { "epoch": 1.2132122730118973, "grad_norm": 0.8137030601501465, 
"learning_rate": 5.9894701086956515e-05, "loss": 0.296, "step": 7750 }, { "epoch": 1.2133688165309957, "grad_norm": 0.5889323353767395, "learning_rate": 5.988281249999999e-05, "loss": 0.2526, "step": 7751 }, { "epoch": 1.213525360050094, "grad_norm": 0.9666175842285156, "learning_rate": 5.987092391304347e-05, "loss": 0.2806, "step": 7752 }, { "epoch": 1.2136819035691921, "grad_norm": 0.8830265402793884, "learning_rate": 5.9859035326086956e-05, "loss": 0.3124, "step": 7753 }, { "epoch": 1.2138384470882906, "grad_norm": 0.9336267113685608, "learning_rate": 5.9847146739130434e-05, "loss": 0.3491, "step": 7754 }, { "epoch": 1.2139949906073888, "grad_norm": 2.755751848220825, "learning_rate": 5.983525815217391e-05, "loss": 0.5395, "step": 7755 }, { "epoch": 1.2141515341264872, "grad_norm": 7.241015434265137, "learning_rate": 5.982336956521739e-05, "loss": 0.7238, "step": 7756 }, { "epoch": 1.2143080776455855, "grad_norm": 1.282633662223816, "learning_rate": 5.981148097826086e-05, "loss": 0.3062, "step": 7757 }, { "epoch": 1.2144646211646837, "grad_norm": 1.0595684051513672, "learning_rate": 5.979959239130434e-05, "loss": 0.3265, "step": 7758 }, { "epoch": 1.2146211646837821, "grad_norm": 1.4399842023849487, "learning_rate": 5.978770380434782e-05, "loss": 0.3336, "step": 7759 }, { "epoch": 1.2147777082028803, "grad_norm": 2.175100326538086, "learning_rate": 5.9775815217391296e-05, "loss": 0.6623, "step": 7760 }, { "epoch": 1.2149342517219788, "grad_norm": 1.0394331216812134, "learning_rate": 5.9763926630434774e-05, "loss": 0.5621, "step": 7761 }, { "epoch": 1.215090795241077, "grad_norm": 2.156938314437866, "learning_rate": 5.975203804347825e-05, "loss": 0.5456, "step": 7762 }, { "epoch": 1.2152473387601752, "grad_norm": 2.22601318359375, "learning_rate": 5.974014945652174e-05, "loss": 0.8063, "step": 7763 }, { "epoch": 1.2154038822792737, "grad_norm": 1.6422626972198486, "learning_rate": 5.9728260869565216e-05, "loss": 0.3792, "step": 7764 }, { "epoch": 
1.2155604257983719, "grad_norm": 2.4641897678375244, "learning_rate": 5.9716372282608694e-05, "loss": 0.731, "step": 7765 }, { "epoch": 1.2157169693174703, "grad_norm": 1.8069514036178589, "learning_rate": 5.970448369565217e-05, "loss": 0.8144, "step": 7766 }, { "epoch": 1.2158735128365685, "grad_norm": 5.061131477355957, "learning_rate": 5.969259510869565e-05, "loss": 1.1734, "step": 7767 }, { "epoch": 1.216030056355667, "grad_norm": 2.247253179550171, "learning_rate": 5.968070652173913e-05, "loss": 0.9063, "step": 7768 }, { "epoch": 1.2161865998747652, "grad_norm": 2.0417866706848145, "learning_rate": 5.96688179347826e-05, "loss": 0.7755, "step": 7769 }, { "epoch": 1.2163431433938634, "grad_norm": 7.1647138595581055, "learning_rate": 5.965692934782608e-05, "loss": 0.8915, "step": 7770 }, { "epoch": 1.2164996869129618, "grad_norm": 4.215598106384277, "learning_rate": 5.9645040760869556e-05, "loss": 0.8595, "step": 7771 }, { "epoch": 1.21665623043206, "grad_norm": 3.1496801376342773, "learning_rate": 5.9633152173913034e-05, "loss": 0.9467, "step": 7772 }, { "epoch": 1.2168127739511585, "grad_norm": 2.3742589950561523, "learning_rate": 5.962126358695652e-05, "loss": 0.4965, "step": 7773 }, { "epoch": 1.2169693174702567, "grad_norm": 2.779402256011963, "learning_rate": 5.9609375e-05, "loss": 1.0686, "step": 7774 }, { "epoch": 1.2171258609893552, "grad_norm": 2.263599157333374, "learning_rate": 5.9597486413043475e-05, "loss": 0.7473, "step": 7775 }, { "epoch": 1.2172824045084534, "grad_norm": 3.8215415477752686, "learning_rate": 5.9585597826086953e-05, "loss": 0.6371, "step": 7776 }, { "epoch": 1.2174389480275516, "grad_norm": 2.112287998199463, "learning_rate": 5.957370923913043e-05, "loss": 0.7979, "step": 7777 }, { "epoch": 1.21759549154665, "grad_norm": 2.59885311126709, "learning_rate": 5.956182065217391e-05, "loss": 0.6772, "step": 7778 }, { "epoch": 1.2177520350657483, "grad_norm": 2.240819215774536, "learning_rate": 5.954993206521739e-05, "loss": 0.4326, 
"step": 7779 }, { "epoch": 1.2179085785848467, "grad_norm": 1.9155255556106567, "learning_rate": 5.953804347826086e-05, "loss": 0.5699, "step": 7780 }, { "epoch": 1.218065122103945, "grad_norm": 4.923479080200195, "learning_rate": 5.952615489130434e-05, "loss": 1.0437, "step": 7781 }, { "epoch": 1.2182216656230431, "grad_norm": 1.7933422327041626, "learning_rate": 5.9514266304347816e-05, "loss": 0.511, "step": 7782 }, { "epoch": 1.2183782091421416, "grad_norm": 2.8376598358154297, "learning_rate": 5.95023777173913e-05, "loss": 1.0142, "step": 7783 }, { "epoch": 1.2185347526612398, "grad_norm": 4.851485729217529, "learning_rate": 5.949048913043478e-05, "loss": 1.6595, "step": 7784 }, { "epoch": 1.2186912961803382, "grad_norm": 2.6279778480529785, "learning_rate": 5.947860054347826e-05, "loss": 0.9849, "step": 7785 }, { "epoch": 1.2188478396994364, "grad_norm": 1.6519584655761719, "learning_rate": 5.9466711956521735e-05, "loss": 0.4021, "step": 7786 }, { "epoch": 1.2190043832185347, "grad_norm": 3.103289842605591, "learning_rate": 5.945482336956521e-05, "loss": 0.2665, "step": 7787 }, { "epoch": 1.219160926737633, "grad_norm": 2.7348251342773438, "learning_rate": 5.944293478260869e-05, "loss": 0.9433, "step": 7788 }, { "epoch": 1.2193174702567313, "grad_norm": 0.47850945591926575, "learning_rate": 5.943104619565217e-05, "loss": 0.1749, "step": 7789 }, { "epoch": 1.2194740137758298, "grad_norm": 0.5052911043167114, "learning_rate": 5.941915760869565e-05, "loss": 0.1935, "step": 7790 }, { "epoch": 1.219630557294928, "grad_norm": 0.49246999621391296, "learning_rate": 5.9407269021739126e-05, "loss": 0.1774, "step": 7791 }, { "epoch": 1.2197871008140262, "grad_norm": 0.9868510365486145, "learning_rate": 5.93953804347826e-05, "loss": 0.2025, "step": 7792 }, { "epoch": 1.2199436443331246, "grad_norm": 0.7106982469558716, "learning_rate": 5.938349184782608e-05, "loss": 0.2441, "step": 7793 }, { "epoch": 1.2201001878522229, "grad_norm": 0.5360453128814697, "learning_rate": 
5.937160326086956e-05, "loss": 0.2276, "step": 7794 }, { "epoch": 1.2202567313713213, "grad_norm": 0.6182153820991516, "learning_rate": 5.935971467391304e-05, "loss": 0.2463, "step": 7795 }, { "epoch": 1.2204132748904195, "grad_norm": 0.6664374470710754, "learning_rate": 5.9347826086956517e-05, "loss": 0.1776, "step": 7796 }, { "epoch": 1.2205698184095177, "grad_norm": 0.6143420934677124, "learning_rate": 5.9335937499999995e-05, "loss": 0.181, "step": 7797 }, { "epoch": 1.2207263619286162, "grad_norm": 0.5989176630973816, "learning_rate": 5.932404891304347e-05, "loss": 0.2257, "step": 7798 }, { "epoch": 1.2208829054477144, "grad_norm": 0.9217280745506287, "learning_rate": 5.931216032608695e-05, "loss": 0.302, "step": 7799 }, { "epoch": 1.2210394489668128, "grad_norm": 0.8718611001968384, "learning_rate": 5.930027173913043e-05, "loss": 0.3197, "step": 7800 }, { "epoch": 1.221195992485911, "grad_norm": 2.1094470024108887, "learning_rate": 5.928838315217391e-05, "loss": 0.2891, "step": 7801 }, { "epoch": 1.2213525360050095, "grad_norm": 3.0451457500457764, "learning_rate": 5.927649456521739e-05, "loss": 0.6462, "step": 7802 }, { "epoch": 1.2215090795241077, "grad_norm": 0.971757173538208, "learning_rate": 5.9264605978260864e-05, "loss": 0.2119, "step": 7803 }, { "epoch": 1.221665623043206, "grad_norm": 0.6371275186538696, "learning_rate": 5.925271739130434e-05, "loss": 0.2077, "step": 7804 }, { "epoch": 1.2218221665623044, "grad_norm": 1.1626635789871216, "learning_rate": 5.924082880434782e-05, "loss": 0.4846, "step": 7805 }, { "epoch": 1.2219787100814026, "grad_norm": 1.163492202758789, "learning_rate": 5.92289402173913e-05, "loss": 0.3853, "step": 7806 }, { "epoch": 1.222135253600501, "grad_norm": 2.6275532245635986, "learning_rate": 5.9217051630434776e-05, "loss": 0.8676, "step": 7807 }, { "epoch": 1.2222917971195992, "grad_norm": 1.1802529096603394, "learning_rate": 5.9205163043478254e-05, "loss": 0.4181, "step": 7808 }, { "epoch": 1.2224483406386977, "grad_norm": 
1.7144075632095337, "learning_rate": 5.919327445652173e-05, "loss": 0.3579, "step": 7809 }, { "epoch": 1.222604884157796, "grad_norm": 1.7044432163238525, "learning_rate": 5.918138586956521e-05, "loss": 0.4769, "step": 7810 }, { "epoch": 1.222761427676894, "grad_norm": 2.233137607574463, "learning_rate": 5.916949728260869e-05, "loss": 0.6997, "step": 7811 }, { "epoch": 1.2229179711959925, "grad_norm": 2.2633724212646484, "learning_rate": 5.9157608695652174e-05, "loss": 0.4279, "step": 7812 }, { "epoch": 1.2230745147150908, "grad_norm": 2.214808940887451, "learning_rate": 5.914572010869565e-05, "loss": 0.63, "step": 7813 }, { "epoch": 1.2232310582341892, "grad_norm": 2.6818089485168457, "learning_rate": 5.913383152173913e-05, "loss": 0.8155, "step": 7814 }, { "epoch": 1.2233876017532874, "grad_norm": 1.8116323947906494, "learning_rate": 5.91219429347826e-05, "loss": 0.3944, "step": 7815 }, { "epoch": 1.2235441452723856, "grad_norm": 1.4948559999465942, "learning_rate": 5.911005434782608e-05, "loss": 0.3977, "step": 7816 }, { "epoch": 1.223700688791484, "grad_norm": 1.7598689794540405, "learning_rate": 5.909816576086956e-05, "loss": 0.438, "step": 7817 }, { "epoch": 1.2238572323105823, "grad_norm": 1.5461419820785522, "learning_rate": 5.9086277173913036e-05, "loss": 0.3589, "step": 7818 }, { "epoch": 1.2240137758296807, "grad_norm": 2.2334907054901123, "learning_rate": 5.9074388586956514e-05, "loss": 0.9149, "step": 7819 }, { "epoch": 1.224170319348779, "grad_norm": 3.6761059761047363, "learning_rate": 5.906249999999999e-05, "loss": 0.9011, "step": 7820 }, { "epoch": 1.2243268628678772, "grad_norm": 2.4983623027801514, "learning_rate": 5.905061141304347e-05, "loss": 1.0954, "step": 7821 }, { "epoch": 1.2244834063869756, "grad_norm": 4.180243968963623, "learning_rate": 5.9038722826086955e-05, "loss": 0.8088, "step": 7822 }, { "epoch": 1.2246399499060738, "grad_norm": 2.8462140560150146, "learning_rate": 5.9026834239130434e-05, "loss": 0.5004, "step": 7823 }, { 
"epoch": 1.2247964934251723, "grad_norm": 2.8187694549560547, "learning_rate": 5.901494565217391e-05, "loss": 1.4201, "step": 7824 }, { "epoch": 1.2249530369442705, "grad_norm": 4.355632305145264, "learning_rate": 5.900305706521739e-05, "loss": 0.7521, "step": 7825 }, { "epoch": 1.2251095804633687, "grad_norm": 9.604656219482422, "learning_rate": 5.899116847826086e-05, "loss": 0.8952, "step": 7826 }, { "epoch": 1.2252661239824671, "grad_norm": 3.543121337890625, "learning_rate": 5.897927989130434e-05, "loss": 1.278, "step": 7827 }, { "epoch": 1.2254226675015654, "grad_norm": 4.273598670959473, "learning_rate": 5.896739130434782e-05, "loss": 0.9353, "step": 7828 }, { "epoch": 1.2255792110206638, "grad_norm": 2.028184652328491, "learning_rate": 5.8955502717391296e-05, "loss": 0.7994, "step": 7829 }, { "epoch": 1.225735754539762, "grad_norm": 3.7391953468322754, "learning_rate": 5.8943614130434774e-05, "loss": 1.0267, "step": 7830 }, { "epoch": 1.2258922980588602, "grad_norm": 3.8038489818573, "learning_rate": 5.893172554347825e-05, "loss": 1.1935, "step": 7831 }, { "epoch": 1.2260488415779587, "grad_norm": 3.273042917251587, "learning_rate": 5.891983695652174e-05, "loss": 1.4034, "step": 7832 }, { "epoch": 1.226205385097057, "grad_norm": 3.0727360248565674, "learning_rate": 5.8907948369565215e-05, "loss": 1.1988, "step": 7833 }, { "epoch": 1.2263619286161553, "grad_norm": 6.06515645980835, "learning_rate": 5.889605978260869e-05, "loss": 0.422, "step": 7834 }, { "epoch": 1.2265184721352536, "grad_norm": 3.2068846225738525, "learning_rate": 5.888417119565217e-05, "loss": 0.5689, "step": 7835 }, { "epoch": 1.226675015654352, "grad_norm": 5.751680850982666, "learning_rate": 5.887228260869565e-05, "loss": 0.8392, "step": 7836 }, { "epoch": 1.2268315591734502, "grad_norm": 2.8414266109466553, "learning_rate": 5.886039402173913e-05, "loss": 0.8917, "step": 7837 }, { "epoch": 1.2269881026925484, "grad_norm": 3.562791347503662, "learning_rate": 5.88485054347826e-05, "loss": 
0.8207, "step": 7838 }, { "epoch": 1.2271446462116469, "grad_norm": 0.6806472539901733, "learning_rate": 5.883661684782608e-05, "loss": 0.2314, "step": 7839 }, { "epoch": 1.227301189730745, "grad_norm": 0.5083348155021667, "learning_rate": 5.8824728260869555e-05, "loss": 0.2141, "step": 7840 }, { "epoch": 1.2274577332498435, "grad_norm": 0.7125424742698669, "learning_rate": 5.881283967391304e-05, "loss": 0.2215, "step": 7841 }, { "epoch": 1.2276142767689417, "grad_norm": 0.429788202047348, "learning_rate": 5.880095108695652e-05, "loss": 0.1652, "step": 7842 }, { "epoch": 1.2277708202880402, "grad_norm": 0.6066604256629944, "learning_rate": 5.87890625e-05, "loss": 0.2116, "step": 7843 }, { "epoch": 1.2279273638071384, "grad_norm": 0.5601625442504883, "learning_rate": 5.8777173913043475e-05, "loss": 0.2096, "step": 7844 }, { "epoch": 1.2280839073262366, "grad_norm": 0.550135612487793, "learning_rate": 5.876528532608695e-05, "loss": 0.2727, "step": 7845 }, { "epoch": 1.228240450845335, "grad_norm": 0.9998852610588074, "learning_rate": 5.875339673913043e-05, "loss": 0.2659, "step": 7846 }, { "epoch": 1.2283969943644333, "grad_norm": 0.7901473045349121, "learning_rate": 5.874150815217391e-05, "loss": 0.3217, "step": 7847 }, { "epoch": 1.2285535378835317, "grad_norm": 0.6937294006347656, "learning_rate": 5.872961956521739e-05, "loss": 0.2478, "step": 7848 }, { "epoch": 1.22871008140263, "grad_norm": 0.8781102299690247, "learning_rate": 5.871773097826086e-05, "loss": 0.3508, "step": 7849 }, { "epoch": 1.2288666249217282, "grad_norm": 2.927799940109253, "learning_rate": 5.870584239130434e-05, "loss": 0.469, "step": 7850 }, { "epoch": 1.2290231684408266, "grad_norm": 0.8039957284927368, "learning_rate": 5.869395380434782e-05, "loss": 0.3424, "step": 7851 }, { "epoch": 1.2291797119599248, "grad_norm": 0.8291757702827454, "learning_rate": 5.86820652173913e-05, "loss": 0.2105, "step": 7852 }, { "epoch": 1.2293362554790233, "grad_norm": 1.0826303958892822, "learning_rate": 
5.867017663043478e-05, "loss": 0.3658, "step": 7853 }, { "epoch": 1.2294927989981215, "grad_norm": 0.9481671452522278, "learning_rate": 5.8658288043478256e-05, "loss": 0.3271, "step": 7854 }, { "epoch": 1.2296493425172197, "grad_norm": 4.375735282897949, "learning_rate": 5.8646399456521735e-05, "loss": 0.5031, "step": 7855 }, { "epoch": 1.2298058860363181, "grad_norm": 0.7546560764312744, "learning_rate": 5.863451086956521e-05, "loss": 0.2021, "step": 7856 }, { "epoch": 1.2299624295554163, "grad_norm": 2.2652249336242676, "learning_rate": 5.862262228260869e-05, "loss": 0.4267, "step": 7857 }, { "epoch": 1.2301189730745148, "grad_norm": 1.1003577709197998, "learning_rate": 5.861073369565217e-05, "loss": 0.2698, "step": 7858 }, { "epoch": 1.230275516593613, "grad_norm": 2.5551462173461914, "learning_rate": 5.859884510869565e-05, "loss": 0.545, "step": 7859 }, { "epoch": 1.2304320601127112, "grad_norm": 1.8956103324890137, "learning_rate": 5.858695652173913e-05, "loss": 0.3963, "step": 7860 }, { "epoch": 1.2305886036318097, "grad_norm": 1.65129554271698, "learning_rate": 5.8575067934782603e-05, "loss": 0.3473, "step": 7861 }, { "epoch": 1.2307451471509079, "grad_norm": 2.2586300373077393, "learning_rate": 5.856317934782608e-05, "loss": 0.6358, "step": 7862 }, { "epoch": 1.2309016906700063, "grad_norm": 3.601438522338867, "learning_rate": 5.855129076086956e-05, "loss": 0.5927, "step": 7863 }, { "epoch": 1.2310582341891045, "grad_norm": 2.0403006076812744, "learning_rate": 5.853940217391304e-05, "loss": 0.355, "step": 7864 }, { "epoch": 1.2312147777082028, "grad_norm": 5.055902004241943, "learning_rate": 5.8527513586956516e-05, "loss": 0.6141, "step": 7865 }, { "epoch": 1.2313713212273012, "grad_norm": 2.1693198680877686, "learning_rate": 5.8515624999999994e-05, "loss": 0.7963, "step": 7866 }, { "epoch": 1.2315278647463994, "grad_norm": 2.191439628601074, "learning_rate": 5.850373641304347e-05, "loss": 0.5808, "step": 7867 }, { "epoch": 1.2316844082654979, "grad_norm": 
4.142387866973877, "learning_rate": 5.849184782608695e-05, "loss": 1.2588, "step": 7868 }, { "epoch": 1.231840951784596, "grad_norm": 2.074735641479492, "learning_rate": 5.847995923913043e-05, "loss": 0.8104, "step": 7869 }, { "epoch": 1.2319974953036945, "grad_norm": 2.648648977279663, "learning_rate": 5.8468070652173914e-05, "loss": 0.5946, "step": 7870 }, { "epoch": 1.2321540388227927, "grad_norm": 1.8333449363708496, "learning_rate": 5.845618206521739e-05, "loss": 0.7075, "step": 7871 }, { "epoch": 1.2323105823418912, "grad_norm": 3.3032217025756836, "learning_rate": 5.844429347826086e-05, "loss": 0.5949, "step": 7872 }, { "epoch": 1.2324671258609894, "grad_norm": 3.6214566230773926, "learning_rate": 5.843240489130434e-05, "loss": 0.8533, "step": 7873 }, { "epoch": 1.2326236693800876, "grad_norm": 3.6762609481811523, "learning_rate": 5.842051630434782e-05, "loss": 0.9787, "step": 7874 }, { "epoch": 1.232780212899186, "grad_norm": 1.785263180732727, "learning_rate": 5.84086277173913e-05, "loss": 0.586, "step": 7875 }, { "epoch": 1.2329367564182843, "grad_norm": 2.6912481784820557, "learning_rate": 5.8396739130434776e-05, "loss": 1.12, "step": 7876 }, { "epoch": 1.2330932999373827, "grad_norm": 4.730843544006348, "learning_rate": 5.8384850543478254e-05, "loss": 0.9618, "step": 7877 }, { "epoch": 1.233249843456481, "grad_norm": 4.919073581695557, "learning_rate": 5.837296195652173e-05, "loss": 1.0468, "step": 7878 }, { "epoch": 1.2334063869755791, "grad_norm": 2.0311806201934814, "learning_rate": 5.836107336956521e-05, "loss": 0.5237, "step": 7879 }, { "epoch": 1.2335629304946776, "grad_norm": 3.166008472442627, "learning_rate": 5.8349184782608695e-05, "loss": 1.1803, "step": 7880 }, { "epoch": 1.2337194740137758, "grad_norm": 5.4330339431762695, "learning_rate": 5.833729619565217e-05, "loss": 0.8748, "step": 7881 }, { "epoch": 1.2338760175328742, "grad_norm": 2.6970393657684326, "learning_rate": 5.832540760869565e-05, "loss": 1.3093, "step": 7882 }, { "epoch": 
1.2340325610519725, "grad_norm": 3.1121034622192383, "learning_rate": 5.831351902173913e-05, "loss": 0.8629, "step": 7883 }, { "epoch": 1.2341891045710707, "grad_norm": 4.164750576019287, "learning_rate": 5.83016304347826e-05, "loss": 0.7183, "step": 7884 }, { "epoch": 1.2343456480901691, "grad_norm": 5.2210211753845215, "learning_rate": 5.828974184782608e-05, "loss": 0.6035, "step": 7885 }, { "epoch": 1.2345021916092673, "grad_norm": 3.2683427333831787, "learning_rate": 5.827785326086956e-05, "loss": 1.5026, "step": 7886 }, { "epoch": 1.2346587351283658, "grad_norm": 2.7378997802734375, "learning_rate": 5.8265964673913036e-05, "loss": 0.9695, "step": 7887 }, { "epoch": 1.234815278647464, "grad_norm": 3.3107104301452637, "learning_rate": 5.8254076086956514e-05, "loss": 1.4077, "step": 7888 }, { "epoch": 1.2349718221665622, "grad_norm": 0.7354045510292053, "learning_rate": 5.824218749999999e-05, "loss": 0.2877, "step": 7889 }, { "epoch": 1.2351283656856606, "grad_norm": 0.6021600365638733, "learning_rate": 5.823029891304348e-05, "loss": 0.2649, "step": 7890 }, { "epoch": 1.2352849092047589, "grad_norm": 0.3501337766647339, "learning_rate": 5.8218410326086955e-05, "loss": 0.1606, "step": 7891 }, { "epoch": 1.2354414527238573, "grad_norm": 0.5789162516593933, "learning_rate": 5.820652173913043e-05, "loss": 0.2293, "step": 7892 }, { "epoch": 1.2355979962429555, "grad_norm": 0.5309104919433594, "learning_rate": 5.819463315217391e-05, "loss": 0.1455, "step": 7893 }, { "epoch": 1.2357545397620537, "grad_norm": 3.111152410507202, "learning_rate": 5.818274456521739e-05, "loss": 0.1926, "step": 7894 }, { "epoch": 1.2359110832811522, "grad_norm": 0.5814661979675293, "learning_rate": 5.817085597826086e-05, "loss": 0.2165, "step": 7895 }, { "epoch": 1.2360676268002504, "grad_norm": 0.8206741213798523, "learning_rate": 5.815896739130434e-05, "loss": 0.2548, "step": 7896 }, { "epoch": 1.2362241703193488, "grad_norm": 0.5579687356948853, "learning_rate": 5.814707880434782e-05, 
"loss": 0.2417, "step": 7897 }, { "epoch": 1.236380713838447, "grad_norm": 0.45682039856910706, "learning_rate": 5.8135190217391295e-05, "loss": 0.1841, "step": 7898 }, { "epoch": 1.2365372573575455, "grad_norm": 0.6824737191200256, "learning_rate": 5.812330163043477e-05, "loss": 0.3126, "step": 7899 }, { "epoch": 1.2366938008766437, "grad_norm": 1.6137629747390747, "learning_rate": 5.811141304347826e-05, "loss": 0.6334, "step": 7900 }, { "epoch": 1.236850344395742, "grad_norm": 0.9492663741111755, "learning_rate": 5.8099524456521736e-05, "loss": 0.6099, "step": 7901 }, { "epoch": 1.2370068879148404, "grad_norm": 0.7965685725212097, "learning_rate": 5.8087635869565215e-05, "loss": 0.2591, "step": 7902 }, { "epoch": 1.2371634314339386, "grad_norm": 0.9296538233757019, "learning_rate": 5.807574728260869e-05, "loss": 0.2043, "step": 7903 }, { "epoch": 1.237319974953037, "grad_norm": 0.8475710153579712, "learning_rate": 5.806385869565217e-05, "loss": 0.3129, "step": 7904 }, { "epoch": 1.2374765184721352, "grad_norm": 2.979492425918579, "learning_rate": 5.805197010869565e-05, "loss": 0.8457, "step": 7905 }, { "epoch": 1.2376330619912337, "grad_norm": 1.5358309745788574, "learning_rate": 5.804008152173913e-05, "loss": 0.3808, "step": 7906 }, { "epoch": 1.237789605510332, "grad_norm": 1.4862210750579834, "learning_rate": 5.80281929347826e-05, "loss": 0.5244, "step": 7907 }, { "epoch": 1.2379461490294301, "grad_norm": 2.51641583442688, "learning_rate": 5.801630434782608e-05, "loss": 0.5319, "step": 7908 }, { "epoch": 1.2381026925485286, "grad_norm": 1.3329362869262695, "learning_rate": 5.8004415760869555e-05, "loss": 0.3196, "step": 7909 }, { "epoch": 1.2382592360676268, "grad_norm": 3.9056239128112793, "learning_rate": 5.799252717391304e-05, "loss": 0.6654, "step": 7910 }, { "epoch": 1.2384157795867252, "grad_norm": 2.1045005321502686, "learning_rate": 5.798063858695652e-05, "loss": 0.4068, "step": 7911 }, { "epoch": 1.2385723231058234, "grad_norm": 1.480521321296692, 
"learning_rate": 5.7968749999999996e-05, "loss": 0.7199, "step": 7912 }, { "epoch": 1.2387288666249217, "grad_norm": 0.9237334728240967, "learning_rate": 5.7956861413043474e-05, "loss": 0.3043, "step": 7913 }, { "epoch": 1.23888541014402, "grad_norm": 1.7164299488067627, "learning_rate": 5.794497282608695e-05, "loss": 0.3183, "step": 7914 }, { "epoch": 1.2390419536631183, "grad_norm": 1.3473618030548096, "learning_rate": 5.793308423913043e-05, "loss": 0.5873, "step": 7915 }, { "epoch": 1.2391984971822168, "grad_norm": 3.4182615280151367, "learning_rate": 5.792119565217391e-05, "loss": 0.7958, "step": 7916 }, { "epoch": 1.239355040701315, "grad_norm": 2.216196060180664, "learning_rate": 5.790930706521739e-05, "loss": 0.5478, "step": 7917 }, { "epoch": 1.2395115842204132, "grad_norm": 2.411402702331543, "learning_rate": 5.789741847826086e-05, "loss": 0.6235, "step": 7918 }, { "epoch": 1.2396681277395116, "grad_norm": 1.6773711442947388, "learning_rate": 5.7885529891304337e-05, "loss": 0.8154, "step": 7919 }, { "epoch": 1.2398246712586098, "grad_norm": 2.4096922874450684, "learning_rate": 5.787364130434782e-05, "loss": 1.2, "step": 7920 }, { "epoch": 1.2399812147777083, "grad_norm": 3.899930238723755, "learning_rate": 5.78617527173913e-05, "loss": 0.8102, "step": 7921 }, { "epoch": 1.2401377582968065, "grad_norm": 2.9732308387756348, "learning_rate": 5.784986413043478e-05, "loss": 0.9984, "step": 7922 }, { "epoch": 1.2402943018159047, "grad_norm": 2.628436803817749, "learning_rate": 5.7837975543478256e-05, "loss": 1.3342, "step": 7923 }, { "epoch": 1.2404508453350032, "grad_norm": 1.934407114982605, "learning_rate": 5.7826086956521734e-05, "loss": 0.4849, "step": 7924 }, { "epoch": 1.2406073888541014, "grad_norm": 1.8082512617111206, "learning_rate": 5.781419836956521e-05, "loss": 0.7219, "step": 7925 }, { "epoch": 1.2407639323731998, "grad_norm": 4.146202087402344, "learning_rate": 5.780230978260869e-05, "loss": 0.6659, "step": 7926 }, { "epoch": 1.240920475892298, 
"grad_norm": 2.5587244033813477, "learning_rate": 5.779042119565217e-05, "loss": 0.9614, "step": 7927 }, { "epoch": 1.2410770194113963, "grad_norm": 13.5656156539917, "learning_rate": 5.7778532608695653e-05, "loss": 0.7823, "step": 7928 }, { "epoch": 1.2412335629304947, "grad_norm": 7.228659629821777, "learning_rate": 5.776664402173913e-05, "loss": 1.867, "step": 7929 }, { "epoch": 1.241390106449593, "grad_norm": 3.025315761566162, "learning_rate": 5.77547554347826e-05, "loss": 1.2616, "step": 7930 }, { "epoch": 1.2415466499686914, "grad_norm": 2.560810089111328, "learning_rate": 5.774286684782608e-05, "loss": 0.8576, "step": 7931 }, { "epoch": 1.2417031934877896, "grad_norm": 1.610094428062439, "learning_rate": 5.773097826086956e-05, "loss": 0.7128, "step": 7932 }, { "epoch": 1.241859737006888, "grad_norm": 2.354391098022461, "learning_rate": 5.771908967391304e-05, "loss": 0.5326, "step": 7933 }, { "epoch": 1.2420162805259862, "grad_norm": 2.164623498916626, "learning_rate": 5.7707201086956516e-05, "loss": 0.4886, "step": 7934 }, { "epoch": 1.2421728240450844, "grad_norm": 1.9973162412643433, "learning_rate": 5.7695312499999994e-05, "loss": 0.7446, "step": 7935 }, { "epoch": 1.2423293675641829, "grad_norm": 4.897204399108887, "learning_rate": 5.768342391304347e-05, "loss": 0.7067, "step": 7936 }, { "epoch": 1.242485911083281, "grad_norm": 3.427286386489868, "learning_rate": 5.767153532608695e-05, "loss": 0.7015, "step": 7937 }, { "epoch": 1.2426424546023795, "grad_norm": 3.186401128768921, "learning_rate": 5.7659646739130435e-05, "loss": 1.301, "step": 7938 }, { "epoch": 1.2427989981214778, "grad_norm": 0.5296036601066589, "learning_rate": 5.764775815217391e-05, "loss": 0.2473, "step": 7939 }, { "epoch": 1.2429555416405762, "grad_norm": 0.6793874502182007, "learning_rate": 5.763586956521739e-05, "loss": 0.1788, "step": 7940 }, { "epoch": 1.2431120851596744, "grad_norm": 0.5921952128410339, "learning_rate": 5.762398097826086e-05, "loss": 0.1808, "step": 7941 }, { 
"epoch": 1.2432686286787726, "grad_norm": 0.44341060519218445, "learning_rate": 5.761209239130434e-05, "loss": 0.1801, "step": 7942 }, { "epoch": 1.243425172197871, "grad_norm": 0.915291428565979, "learning_rate": 5.760020380434782e-05, "loss": 0.2241, "step": 7943 }, { "epoch": 1.2435817157169693, "grad_norm": 0.5412155985832214, "learning_rate": 5.75883152173913e-05, "loss": 0.1564, "step": 7944 }, { "epoch": 1.2437382592360677, "grad_norm": 0.6679673790931702, "learning_rate": 5.7576426630434775e-05, "loss": 0.1906, "step": 7945 }, { "epoch": 1.243894802755166, "grad_norm": 0.6716683506965637, "learning_rate": 5.7564538043478253e-05, "loss": 0.2052, "step": 7946 }, { "epoch": 1.2440513462742642, "grad_norm": 1.0511788129806519, "learning_rate": 5.755264945652173e-05, "loss": 0.2835, "step": 7947 }, { "epoch": 1.2442078897933626, "grad_norm": 0.925911009311676, "learning_rate": 5.7540760869565217e-05, "loss": 0.3182, "step": 7948 }, { "epoch": 1.2443644333124608, "grad_norm": 1.0963209867477417, "learning_rate": 5.7528872282608695e-05, "loss": 0.1753, "step": 7949 }, { "epoch": 1.2445209768315593, "grad_norm": 1.1110478639602661, "learning_rate": 5.751698369565217e-05, "loss": 0.2466, "step": 7950 }, { "epoch": 1.2446775203506575, "grad_norm": 0.9388072490692139, "learning_rate": 5.750509510869565e-05, "loss": 0.2647, "step": 7951 }, { "epoch": 1.2448340638697557, "grad_norm": 1.5473642349243164, "learning_rate": 5.749320652173913e-05, "loss": 0.2995, "step": 7952 }, { "epoch": 1.2449906073888541, "grad_norm": 1.448696494102478, "learning_rate": 5.74813179347826e-05, "loss": 0.3696, "step": 7953 }, { "epoch": 1.2451471509079524, "grad_norm": 1.2183622121810913, "learning_rate": 5.746942934782608e-05, "loss": 0.4784, "step": 7954 }, { "epoch": 1.2453036944270508, "grad_norm": 0.6133463382720947, "learning_rate": 5.745754076086956e-05, "loss": 0.14, "step": 7955 }, { "epoch": 1.245460237946149, "grad_norm": 1.0647932291030884, "learning_rate": 
5.7445652173913035e-05, "loss": 0.3814, "step": 7956 }, { "epoch": 1.2456167814652472, "grad_norm": 1.9954781532287598, "learning_rate": 5.743376358695651e-05, "loss": 0.4475, "step": 7957 }, { "epoch": 1.2457733249843457, "grad_norm": 1.7333239316940308, "learning_rate": 5.7421875e-05, "loss": 0.4609, "step": 7958 }, { "epoch": 1.245929868503444, "grad_norm": 1.7419235706329346, "learning_rate": 5.7409986413043476e-05, "loss": 0.6365, "step": 7959 }, { "epoch": 1.2460864120225423, "grad_norm": 2.342350721359253, "learning_rate": 5.7398097826086954e-05, "loss": 0.302, "step": 7960 }, { "epoch": 1.2462429555416406, "grad_norm": 1.763672947883606, "learning_rate": 5.738620923913043e-05, "loss": 0.5257, "step": 7961 }, { "epoch": 1.2463994990607388, "grad_norm": 4.051609992980957, "learning_rate": 5.737432065217391e-05, "loss": 0.3217, "step": 7962 }, { "epoch": 1.2465560425798372, "grad_norm": 1.912589192390442, "learning_rate": 5.736243206521739e-05, "loss": 0.4525, "step": 7963 }, { "epoch": 1.2467125860989354, "grad_norm": 1.1552369594573975, "learning_rate": 5.735054347826086e-05, "loss": 0.3795, "step": 7964 }, { "epoch": 1.2468691296180339, "grad_norm": 2.1518023014068604, "learning_rate": 5.733865489130434e-05, "loss": 0.5156, "step": 7965 }, { "epoch": 1.247025673137132, "grad_norm": 1.543982982635498, "learning_rate": 5.732676630434782e-05, "loss": 0.5764, "step": 7966 }, { "epoch": 1.2471822166562305, "grad_norm": 3.049227476119995, "learning_rate": 5.7314877717391295e-05, "loss": 0.7957, "step": 7967 }, { "epoch": 1.2473387601753287, "grad_norm": 4.750926494598389, "learning_rate": 5.730298913043478e-05, "loss": 0.9335, "step": 7968 }, { "epoch": 1.247495303694427, "grad_norm": 2.3792619705200195, "learning_rate": 5.729110054347826e-05, "loss": 0.7636, "step": 7969 }, { "epoch": 1.2476518472135254, "grad_norm": 2.649951457977295, "learning_rate": 5.7279211956521736e-05, "loss": 0.6308, "step": 7970 }, { "epoch": 1.2478083907326236, "grad_norm": 
3.355361223220825, "learning_rate": 5.7267323369565214e-05, "loss": 0.6463, "step": 7971 }, { "epoch": 1.247964934251722, "grad_norm": 1.7960528135299683, "learning_rate": 5.725543478260869e-05, "loss": 0.5225, "step": 7972 }, { "epoch": 1.2481214777708203, "grad_norm": 1.3458425998687744, "learning_rate": 5.724354619565217e-05, "loss": 0.5572, "step": 7973 }, { "epoch": 1.2482780212899187, "grad_norm": 5.9569902420043945, "learning_rate": 5.723165760869565e-05, "loss": 0.7223, "step": 7974 }, { "epoch": 1.248434564809017, "grad_norm": 2.642184019088745, "learning_rate": 5.721976902173913e-05, "loss": 0.7721, "step": 7975 }, { "epoch": 1.2485911083281152, "grad_norm": 2.836259365081787, "learning_rate": 5.72078804347826e-05, "loss": 1.0677, "step": 7976 }, { "epoch": 1.2487476518472136, "grad_norm": 2.4698894023895264, "learning_rate": 5.7195991847826076e-05, "loss": 0.9373, "step": 7977 }, { "epoch": 1.2489041953663118, "grad_norm": 3.0983033180236816, "learning_rate": 5.718410326086956e-05, "loss": 0.7967, "step": 7978 }, { "epoch": 1.2490607388854102, "grad_norm": 3.374767780303955, "learning_rate": 5.717221467391304e-05, "loss": 0.7014, "step": 7979 }, { "epoch": 1.2492172824045085, "grad_norm": 3.3914952278137207, "learning_rate": 5.716032608695652e-05, "loss": 1.0361, "step": 7980 }, { "epoch": 1.2493738259236067, "grad_norm": 4.400058269500732, "learning_rate": 5.7148437499999996e-05, "loss": 1.037, "step": 7981 }, { "epoch": 1.2495303694427051, "grad_norm": 4.183074474334717, "learning_rate": 5.7136548913043474e-05, "loss": 0.6893, "step": 7982 }, { "epoch": 1.2496869129618033, "grad_norm": 4.246591091156006, "learning_rate": 5.712466032608695e-05, "loss": 1.1712, "step": 7983 }, { "epoch": 1.2498434564809018, "grad_norm": 2.5350170135498047, "learning_rate": 5.711277173913043e-05, "loss": 0.9585, "step": 7984 }, { "epoch": 1.25, "grad_norm": 1.7970802783966064, "learning_rate": 5.710088315217391e-05, "loss": 0.346, "step": 7985 }, { "epoch": 
1.2501565435190982, "grad_norm": 1.4378117322921753, "learning_rate": 5.7088994565217387e-05, "loss": 0.7796, "step": 7986 }, { "epoch": 1.2503130870381967, "grad_norm": 2.1162073612213135, "learning_rate": 5.707710597826086e-05, "loss": 0.4585, "step": 7987 }, { "epoch": 1.2504696305572949, "grad_norm": 1.3396320343017578, "learning_rate": 5.706521739130434e-05, "loss": 0.5391, "step": 7988 }, { "epoch": 1.2506261740763933, "grad_norm": 0.34837478399276733, "learning_rate": 5.705332880434782e-05, "loss": 0.1592, "step": 7989 }, { "epoch": 1.2507827175954915, "grad_norm": 0.46198606491088867, "learning_rate": 5.70414402173913e-05, "loss": 0.164, "step": 7990 }, { "epoch": 1.2509392611145898, "grad_norm": 0.8811986446380615, "learning_rate": 5.702955163043478e-05, "loss": 0.2323, "step": 7991 }, { "epoch": 1.2510958046336882, "grad_norm": 0.4969804883003235, "learning_rate": 5.7017663043478255e-05, "loss": 0.3219, "step": 7992 }, { "epoch": 1.2512523481527864, "grad_norm": 1.1468558311462402, "learning_rate": 5.7005774456521734e-05, "loss": 0.21, "step": 7993 }, { "epoch": 1.2514088916718848, "grad_norm": 0.4624009430408478, "learning_rate": 5.699388586956521e-05, "loss": 0.1892, "step": 7994 }, { "epoch": 1.251565435190983, "grad_norm": 0.8623465895652771, "learning_rate": 5.698199728260869e-05, "loss": 0.2158, "step": 7995 }, { "epoch": 1.2517219787100813, "grad_norm": 0.7959850430488586, "learning_rate": 5.697010869565217e-05, "loss": 0.3348, "step": 7996 }, { "epoch": 1.2518785222291797, "grad_norm": 1.0844982862472534, "learning_rate": 5.695822010869565e-05, "loss": 0.5035, "step": 7997 }, { "epoch": 1.252035065748278, "grad_norm": 1.7019600868225098, "learning_rate": 5.694633152173913e-05, "loss": 0.3155, "step": 7998 }, { "epoch": 1.2521916092673764, "grad_norm": 1.1086505651474, "learning_rate": 5.69344429347826e-05, "loss": 0.2267, "step": 7999 }, { "epoch": 1.2523481527864746, "grad_norm": 0.6572399735450745, "learning_rate": 5.692255434782608e-05, "loss": 
0.1923, "step": 8000 }, { "epoch": 1.2523481527864746, "eval_loss": 0.49750587344169617, "eval_runtime": 205.6909, "eval_samples_per_second": 60.202, "eval_steps_per_second": 3.763, "eval_wer": 0.31591474169624073, "step": 8000 }, { "epoch": 1.2525046963055728, "grad_norm": 1.2003049850463867, "learning_rate": 5.691066576086956e-05, "loss": 0.3348, "step": 8001 }, { "epoch": 1.2526612398246713, "grad_norm": 1.9540462493896484, "learning_rate": 5.689877717391304e-05, "loss": 0.479, "step": 8002 }, { "epoch": 1.2528177833437697, "grad_norm": 0.7687429189682007, "learning_rate": 5.6886888586956515e-05, "loss": 0.2401, "step": 8003 }, { "epoch": 1.252974326862868, "grad_norm": 3.680274724960327, "learning_rate": 5.687499999999999e-05, "loss": 0.4937, "step": 8004 }, { "epoch": 1.2531308703819661, "grad_norm": 1.7989509105682373, "learning_rate": 5.686311141304347e-05, "loss": 0.4169, "step": 8005 }, { "epoch": 1.2532874139010646, "grad_norm": 1.8748031854629517, "learning_rate": 5.685122282608695e-05, "loss": 0.4413, "step": 8006 }, { "epoch": 1.2534439574201628, "grad_norm": 1.5181061029434204, "learning_rate": 5.6839334239130435e-05, "loss": 0.2803, "step": 8007 }, { "epoch": 1.2536005009392612, "grad_norm": 1.5817533731460571, "learning_rate": 5.682744565217391e-05, "loss": 0.344, "step": 8008 }, { "epoch": 1.2537570444583594, "grad_norm": 1.962801218032837, "learning_rate": 5.681555706521739e-05, "loss": 0.4535, "step": 8009 }, { "epoch": 1.2539135879774577, "grad_norm": 2.1454687118530273, "learning_rate": 5.680366847826086e-05, "loss": 0.4115, "step": 8010 }, { "epoch": 1.254070131496556, "grad_norm": 4.083019256591797, "learning_rate": 5.679177989130434e-05, "loss": 0.3916, "step": 8011 }, { "epoch": 1.2542266750156543, "grad_norm": 0.9498183131217957, "learning_rate": 5.677989130434782e-05, "loss": 0.5949, "step": 8012 }, { "epoch": 1.2543832185347528, "grad_norm": 1.482208251953125, "learning_rate": 5.67680027173913e-05, "loss": 0.44, "step": 8013 }, { 
"epoch": 1.254539762053851, "grad_norm": 1.4096070528030396, "learning_rate": 5.6756114130434775e-05, "loss": 0.4556, "step": 8014 }, { "epoch": 1.2546963055729492, "grad_norm": 2.011352300643921, "learning_rate": 5.674422554347825e-05, "loss": 0.4686, "step": 8015 }, { "epoch": 1.2548528490920476, "grad_norm": 2.019896984100342, "learning_rate": 5.673233695652173e-05, "loss": 0.3085, "step": 8016 }, { "epoch": 1.2550093926111459, "grad_norm": 1.7909919023513794, "learning_rate": 5.6720448369565216e-05, "loss": 0.506, "step": 8017 }, { "epoch": 1.2551659361302443, "grad_norm": 2.111804962158203, "learning_rate": 5.6708559782608694e-05, "loss": 0.4761, "step": 8018 }, { "epoch": 1.2553224796493425, "grad_norm": 2.5735363960266113, "learning_rate": 5.669667119565217e-05, "loss": 0.3237, "step": 8019 }, { "epoch": 1.2554790231684407, "grad_norm": 1.8891366720199585, "learning_rate": 5.668478260869565e-05, "loss": 0.5932, "step": 8020 }, { "epoch": 1.2556355666875392, "grad_norm": 2.3593084812164307, "learning_rate": 5.667289402173913e-05, "loss": 0.5077, "step": 8021 }, { "epoch": 1.2557921102066374, "grad_norm": 2.2396955490112305, "learning_rate": 5.66610054347826e-05, "loss": 0.623, "step": 8022 }, { "epoch": 1.2559486537257358, "grad_norm": 2.7450859546661377, "learning_rate": 5.664911684782608e-05, "loss": 0.6173, "step": 8023 }, { "epoch": 1.256105197244834, "grad_norm": 1.493815302848816, "learning_rate": 5.6637228260869556e-05, "loss": 0.5988, "step": 8024 }, { "epoch": 1.2562617407639323, "grad_norm": 2.222855567932129, "learning_rate": 5.6625339673913035e-05, "loss": 0.6323, "step": 8025 }, { "epoch": 1.2564182842830307, "grad_norm": 3.506321430206299, "learning_rate": 5.661345108695651e-05, "loss": 1.197, "step": 8026 }, { "epoch": 1.256574827802129, "grad_norm": 3.8622844219207764, "learning_rate": 5.66015625e-05, "loss": 1.1018, "step": 8027 }, { "epoch": 1.2567313713212274, "grad_norm": 8.330638885498047, "learning_rate": 5.6589673913043476e-05, "loss": 
1.1042, "step": 8028 }, { "epoch": 1.2568879148403256, "grad_norm": 4.926156520843506, "learning_rate": 5.6577785326086954e-05, "loss": 0.957, "step": 8029 }, { "epoch": 1.2570444583594238, "grad_norm": 4.419129848480225, "learning_rate": 5.656589673913043e-05, "loss": 1.0034, "step": 8030 }, { "epoch": 1.2572010018785222, "grad_norm": 2.455068349838257, "learning_rate": 5.655400815217391e-05, "loss": 0.9704, "step": 8031 }, { "epoch": 1.2573575453976205, "grad_norm": 3.294809341430664, "learning_rate": 5.654211956521739e-05, "loss": 0.7559, "step": 8032 }, { "epoch": 1.257514088916719, "grad_norm": 2.6215109825134277, "learning_rate": 5.653023097826086e-05, "loss": 0.6412, "step": 8033 }, { "epoch": 1.2576706324358171, "grad_norm": 3.295437812805176, "learning_rate": 5.651834239130434e-05, "loss": 0.5531, "step": 8034 }, { "epoch": 1.2578271759549153, "grad_norm": 1.9576337337493896, "learning_rate": 5.6506453804347816e-05, "loss": 0.3882, "step": 8035 }, { "epoch": 1.2579837194740138, "grad_norm": 2.57425856590271, "learning_rate": 5.64945652173913e-05, "loss": 0.5487, "step": 8036 }, { "epoch": 1.2581402629931122, "grad_norm": 4.025811672210693, "learning_rate": 5.648267663043478e-05, "loss": 0.7788, "step": 8037 }, { "epoch": 1.2582968065122104, "grad_norm": 2.0518925189971924, "learning_rate": 5.647078804347826e-05, "loss": 0.667, "step": 8038 }, { "epoch": 1.2584533500313086, "grad_norm": 0.6660261750221252, "learning_rate": 5.6458899456521736e-05, "loss": 0.2571, "step": 8039 }, { "epoch": 1.258609893550407, "grad_norm": 0.8382539749145508, "learning_rate": 5.6447010869565214e-05, "loss": 0.3256, "step": 8040 }, { "epoch": 1.2587664370695053, "grad_norm": 0.688955545425415, "learning_rate": 5.643512228260869e-05, "loss": 0.2424, "step": 8041 }, { "epoch": 1.2589229805886037, "grad_norm": 2.2213921546936035, "learning_rate": 5.642323369565217e-05, "loss": 0.2172, "step": 8042 }, { "epoch": 1.259079524107702, "grad_norm": 0.4342550039291382, "learning_rate": 
5.641134510869565e-05, "loss": 0.2228, "step": 8043 }, { "epoch": 1.2592360676268002, "grad_norm": 0.6277884840965271, "learning_rate": 5.6399456521739126e-05, "loss": 0.2735, "step": 8044 }, { "epoch": 1.2593926111458986, "grad_norm": 0.8146473169326782, "learning_rate": 5.63875679347826e-05, "loss": 0.2117, "step": 8045 }, { "epoch": 1.2595491546649968, "grad_norm": 0.5079545378684998, "learning_rate": 5.637567934782608e-05, "loss": 0.1985, "step": 8046 }, { "epoch": 1.2597056981840953, "grad_norm": 0.8453031778335571, "learning_rate": 5.636379076086956e-05, "loss": 0.2007, "step": 8047 }, { "epoch": 1.2598622417031935, "grad_norm": 0.6580536961555481, "learning_rate": 5.635190217391304e-05, "loss": 0.2009, "step": 8048 }, { "epoch": 1.2600187852222917, "grad_norm": 1.486207365989685, "learning_rate": 5.634001358695652e-05, "loss": 0.3482, "step": 8049 }, { "epoch": 1.2601753287413902, "grad_norm": 0.8187905550003052, "learning_rate": 5.6328124999999995e-05, "loss": 0.2763, "step": 8050 }, { "epoch": 1.2603318722604884, "grad_norm": 0.8308860063552856, "learning_rate": 5.6316236413043473e-05, "loss": 0.2425, "step": 8051 }, { "epoch": 1.2604884157795868, "grad_norm": 1.046517014503479, "learning_rate": 5.630434782608695e-05, "loss": 0.2603, "step": 8052 }, { "epoch": 1.260644959298685, "grad_norm": 0.7669467926025391, "learning_rate": 5.629245923913043e-05, "loss": 0.1514, "step": 8053 }, { "epoch": 1.2608015028177832, "grad_norm": 2.0902762413024902, "learning_rate": 5.628057065217391e-05, "loss": 0.7234, "step": 8054 }, { "epoch": 1.2609580463368817, "grad_norm": 0.9163323044776917, "learning_rate": 5.626868206521739e-05, "loss": 0.2508, "step": 8055 }, { "epoch": 1.26111458985598, "grad_norm": 0.8164222240447998, "learning_rate": 5.6256793478260864e-05, "loss": 0.2342, "step": 8056 }, { "epoch": 1.2612711333750783, "grad_norm": 1.3847688436508179, "learning_rate": 5.624490489130434e-05, "loss": 0.3339, "step": 8057 }, { "epoch": 1.2614276768941766, 
"grad_norm": 2.5277111530303955, "learning_rate": 5.623301630434782e-05, "loss": 0.5789, "step": 8058 }, { "epoch": 1.2615842204132748, "grad_norm": 4.1441802978515625, "learning_rate": 5.62211277173913e-05, "loss": 0.913, "step": 8059 }, { "epoch": 1.2617407639323732, "grad_norm": 1.4462882280349731, "learning_rate": 5.620923913043478e-05, "loss": 0.6444, "step": 8060 }, { "epoch": 1.2618973074514714, "grad_norm": 1.8323180675506592, "learning_rate": 5.6197350543478255e-05, "loss": 0.2897, "step": 8061 }, { "epoch": 1.2620538509705699, "grad_norm": 6.966251850128174, "learning_rate": 5.618546195652173e-05, "loss": 0.7086, "step": 8062 }, { "epoch": 1.262210394489668, "grad_norm": 2.0271003246307373, "learning_rate": 5.617357336956521e-05, "loss": 0.3063, "step": 8063 }, { "epoch": 1.2623669380087663, "grad_norm": 3.0159406661987305, "learning_rate": 5.616168478260869e-05, "loss": 0.7093, "step": 8064 }, { "epoch": 1.2625234815278648, "grad_norm": 1.4930551052093506, "learning_rate": 5.6149796195652174e-05, "loss": 0.5928, "step": 8065 }, { "epoch": 1.262680025046963, "grad_norm": 1.4623749256134033, "learning_rate": 5.613790760869565e-05, "loss": 0.7631, "step": 8066 }, { "epoch": 1.2628365685660614, "grad_norm": 4.570174217224121, "learning_rate": 5.612601902173913e-05, "loss": 1.2357, "step": 8067 }, { "epoch": 1.2629931120851596, "grad_norm": 2.9872069358825684, "learning_rate": 5.61141304347826e-05, "loss": 0.9184, "step": 8068 }, { "epoch": 1.2631496556042578, "grad_norm": 3.522033452987671, "learning_rate": 5.610224184782608e-05, "loss": 0.7762, "step": 8069 }, { "epoch": 1.2633061991233563, "grad_norm": 1.881877064704895, "learning_rate": 5.609035326086956e-05, "loss": 0.4817, "step": 8070 }, { "epoch": 1.2634627426424547, "grad_norm": 1.412722110748291, "learning_rate": 5.6078464673913037e-05, "loss": 0.3871, "step": 8071 }, { "epoch": 1.263619286161553, "grad_norm": 3.657071590423584, "learning_rate": 5.6066576086956515e-05, "loss": 1.1131, "step": 8072 
}, { "epoch": 1.2637758296806512, "grad_norm": 2.1905548572540283, "learning_rate": 5.605468749999999e-05, "loss": 0.6472, "step": 8073 }, { "epoch": 1.2639323731997496, "grad_norm": 4.653868675231934, "learning_rate": 5.604279891304347e-05, "loss": 0.7403, "step": 8074 }, { "epoch": 1.2640889167188478, "grad_norm": 2.7384443283081055, "learning_rate": 5.6030910326086956e-05, "loss": 0.5913, "step": 8075 }, { "epoch": 1.2642454602379463, "grad_norm": 5.68044900894165, "learning_rate": 5.6019021739130434e-05, "loss": 0.7193, "step": 8076 }, { "epoch": 1.2644020037570445, "grad_norm": 4.132944107055664, "learning_rate": 5.600713315217391e-05, "loss": 1.4519, "step": 8077 }, { "epoch": 1.2645585472761427, "grad_norm": 11.234874725341797, "learning_rate": 5.599524456521739e-05, "loss": 0.7596, "step": 8078 }, { "epoch": 1.2647150907952411, "grad_norm": 4.375277519226074, "learning_rate": 5.598335597826086e-05, "loss": 1.2572, "step": 8079 }, { "epoch": 1.2648716343143394, "grad_norm": 2.6167283058166504, "learning_rate": 5.597146739130434e-05, "loss": 0.9027, "step": 8080 }, { "epoch": 1.2650281778334378, "grad_norm": 3.812523126602173, "learning_rate": 5.595957880434782e-05, "loss": 1.9092, "step": 8081 }, { "epoch": 1.265184721352536, "grad_norm": 2.217710494995117, "learning_rate": 5.5947690217391296e-05, "loss": 0.8605, "step": 8082 }, { "epoch": 1.2653412648716342, "grad_norm": 2.762489080429077, "learning_rate": 5.5935801630434774e-05, "loss": 1.2236, "step": 8083 }, { "epoch": 1.2654978083907327, "grad_norm": 3.9406144618988037, "learning_rate": 5.592391304347825e-05, "loss": 0.8086, "step": 8084 }, { "epoch": 1.2656543519098309, "grad_norm": 10.895279884338379, "learning_rate": 5.591202445652174e-05, "loss": 0.4581, "step": 8085 }, { "epoch": 1.2658108954289293, "grad_norm": 3.6692144870758057, "learning_rate": 5.5900135869565216e-05, "loss": 0.253, "step": 8086 }, { "epoch": 1.2659674389480275, "grad_norm": 4.1693267822265625, "learning_rate": 
5.5888247282608694e-05, "loss": 0.6688, "step": 8087 }, { "epoch": 1.2661239824671258, "grad_norm": 3.0538392066955566, "learning_rate": 5.587635869565217e-05, "loss": 0.6939, "step": 8088 }, { "epoch": 1.2662805259862242, "grad_norm": 0.5128295421600342, "learning_rate": 5.586447010869565e-05, "loss": 0.2651, "step": 8089 }, { "epoch": 1.2664370695053224, "grad_norm": 1.1779139041900635, "learning_rate": 5.585258152173913e-05, "loss": 0.2426, "step": 8090 }, { "epoch": 1.2665936130244209, "grad_norm": 0.6052125096321106, "learning_rate": 5.58406929347826e-05, "loss": 0.2295, "step": 8091 }, { "epoch": 1.266750156543519, "grad_norm": 0.720552384853363, "learning_rate": 5.582880434782608e-05, "loss": 0.2353, "step": 8092 }, { "epoch": 1.2669067000626173, "grad_norm": 0.5334534049034119, "learning_rate": 5.5816915760869556e-05, "loss": 0.23, "step": 8093 }, { "epoch": 1.2670632435817157, "grad_norm": 0.8905168175697327, "learning_rate": 5.5805027173913034e-05, "loss": 0.3203, "step": 8094 }, { "epoch": 1.267219787100814, "grad_norm": 0.6097309589385986, "learning_rate": 5.579313858695652e-05, "loss": 0.3406, "step": 8095 }, { "epoch": 1.2673763306199124, "grad_norm": 0.6208405494689941, "learning_rate": 5.578125e-05, "loss": 0.1758, "step": 8096 }, { "epoch": 1.2675328741390106, "grad_norm": 2.3095650672912598, "learning_rate": 5.5769361413043475e-05, "loss": 0.3588, "step": 8097 }, { "epoch": 1.2676894176581088, "grad_norm": 3.3769805431365967, "learning_rate": 5.5757472826086954e-05, "loss": 0.3357, "step": 8098 }, { "epoch": 1.2678459611772073, "grad_norm": 0.8973973989486694, "learning_rate": 5.574558423913043e-05, "loss": 0.3733, "step": 8099 }, { "epoch": 1.2680025046963057, "grad_norm": 0.5879065990447998, "learning_rate": 5.573369565217391e-05, "loss": 0.2052, "step": 8100 }, { "epoch": 1.268159048215404, "grad_norm": 2.5104291439056396, "learning_rate": 5.572180706521739e-05, "loss": 0.616, "step": 8101 }, { "epoch": 1.2683155917345021, "grad_norm": 
0.7882383465766907, "learning_rate": 5.570991847826086e-05, "loss": 0.2249, "step": 8102 }, { "epoch": 1.2684721352536004, "grad_norm": 0.9797226190567017, "learning_rate": 5.569802989130434e-05, "loss": 0.2179, "step": 8103 }, { "epoch": 1.2686286787726988, "grad_norm": 1.4023452997207642, "learning_rate": 5.5686141304347816e-05, "loss": 0.437, "step": 8104 }, { "epoch": 1.2687852222917972, "grad_norm": 1.2051557302474976, "learning_rate": 5.56742527173913e-05, "loss": 0.5495, "step": 8105 }, { "epoch": 1.2689417658108955, "grad_norm": 1.4070618152618408, "learning_rate": 5.566236413043478e-05, "loss": 0.3169, "step": 8106 }, { "epoch": 1.2690983093299937, "grad_norm": 2.550915002822876, "learning_rate": 5.565047554347826e-05, "loss": 0.5254, "step": 8107 }, { "epoch": 1.2692548528490921, "grad_norm": 1.814480185508728, "learning_rate": 5.5638586956521735e-05, "loss": 0.4261, "step": 8108 }, { "epoch": 1.2694113963681903, "grad_norm": 1.6951565742492676, "learning_rate": 5.562669836956521e-05, "loss": 0.5444, "step": 8109 }, { "epoch": 1.2695679398872888, "grad_norm": 2.747338056564331, "learning_rate": 5.561480978260869e-05, "loss": 0.5356, "step": 8110 }, { "epoch": 1.269724483406387, "grad_norm": 1.069840669631958, "learning_rate": 5.560292119565217e-05, "loss": 0.1981, "step": 8111 }, { "epoch": 1.2698810269254852, "grad_norm": 1.5589491128921509, "learning_rate": 5.559103260869565e-05, "loss": 0.5909, "step": 8112 }, { "epoch": 1.2700375704445837, "grad_norm": 1.502621054649353, "learning_rate": 5.557914402173913e-05, "loss": 0.4523, "step": 8113 }, { "epoch": 1.2701941139636819, "grad_norm": 2.74658465385437, "learning_rate": 5.55672554347826e-05, "loss": 0.6286, "step": 8114 }, { "epoch": 1.2703506574827803, "grad_norm": 1.0030122995376587, "learning_rate": 5.555536684782608e-05, "loss": 0.3671, "step": 8115 }, { "epoch": 1.2705072010018785, "grad_norm": 1.6877386569976807, "learning_rate": 5.554347826086956e-05, "loss": 0.5745, "step": 8116 }, { "epoch": 
1.2706637445209767, "grad_norm": 2.6509947776794434, "learning_rate": 5.553158967391304e-05, "loss": 0.5436, "step": 8117 }, { "epoch": 1.2708202880400752, "grad_norm": 2.5954997539520264, "learning_rate": 5.551970108695652e-05, "loss": 0.8556, "step": 8118 }, { "epoch": 1.2709768315591734, "grad_norm": 4.2577996253967285, "learning_rate": 5.5507812499999995e-05, "loss": 0.8694, "step": 8119 }, { "epoch": 1.2711333750782718, "grad_norm": 4.799262046813965, "learning_rate": 5.549592391304347e-05, "loss": 0.7045, "step": 8120 }, { "epoch": 1.27128991859737, "grad_norm": 5.713566780090332, "learning_rate": 5.548403532608695e-05, "loss": 0.6832, "step": 8121 }, { "epoch": 1.2714464621164683, "grad_norm": 2.2015442848205566, "learning_rate": 5.547214673913043e-05, "loss": 0.7661, "step": 8122 }, { "epoch": 1.2716030056355667, "grad_norm": 2.1784827709198, "learning_rate": 5.5460258152173914e-05, "loss": 0.9148, "step": 8123 }, { "epoch": 1.271759549154665, "grad_norm": 5.034367084503174, "learning_rate": 5.544836956521739e-05, "loss": 1.4098, "step": 8124 }, { "epoch": 1.2719160926737634, "grad_norm": 5.529078006744385, "learning_rate": 5.5436480978260864e-05, "loss": 1.0048, "step": 8125 }, { "epoch": 1.2720726361928616, "grad_norm": 4.677115440368652, "learning_rate": 5.542459239130434e-05, "loss": 1.0468, "step": 8126 }, { "epoch": 1.2722291797119598, "grad_norm": 2.7085840702056885, "learning_rate": 5.541270380434782e-05, "loss": 0.8424, "step": 8127 }, { "epoch": 1.2723857232310583, "grad_norm": 5.474047660827637, "learning_rate": 5.54008152173913e-05, "loss": 1.4326, "step": 8128 }, { "epoch": 1.2725422667501565, "grad_norm": 2.139925241470337, "learning_rate": 5.5388926630434776e-05, "loss": 0.8929, "step": 8129 }, { "epoch": 1.272698810269255, "grad_norm": 2.4235620498657227, "learning_rate": 5.5377038043478255e-05, "loss": 1.0869, "step": 8130 }, { "epoch": 1.2728553537883531, "grad_norm": 5.105234622955322, "learning_rate": 5.536514945652173e-05, "loss": 
2.0595, "step": 8131 }, { "epoch": 1.2730118973074513, "grad_norm": 3.0401620864868164, "learning_rate": 5.535326086956521e-05, "loss": 0.776, "step": 8132 }, { "epoch": 1.2731684408265498, "grad_norm": 3.7158632278442383, "learning_rate": 5.5341372282608696e-05, "loss": 1.0514, "step": 8133 }, { "epoch": 1.2733249843456482, "grad_norm": 9.60766887664795, "learning_rate": 5.5329483695652174e-05, "loss": 0.6771, "step": 8134 }, { "epoch": 1.2734815278647464, "grad_norm": 5.29464864730835, "learning_rate": 5.531759510869565e-05, "loss": 0.974, "step": 8135 }, { "epoch": 1.2736380713838447, "grad_norm": 3.8807711601257324, "learning_rate": 5.530570652173913e-05, "loss": 0.6941, "step": 8136 }, { "epoch": 1.273794614902943, "grad_norm": 2.23105788230896, "learning_rate": 5.52938179347826e-05, "loss": 0.7446, "step": 8137 }, { "epoch": 1.2739511584220413, "grad_norm": 3.5545706748962402, "learning_rate": 5.528192934782608e-05, "loss": 0.6831, "step": 8138 }, { "epoch": 1.2741077019411398, "grad_norm": 0.46991273760795593, "learning_rate": 5.527004076086956e-05, "loss": 0.2218, "step": 8139 }, { "epoch": 1.274264245460238, "grad_norm": 0.6386700868606567, "learning_rate": 5.5258152173913036e-05, "loss": 0.3954, "step": 8140 }, { "epoch": 1.2744207889793362, "grad_norm": 0.46903276443481445, "learning_rate": 5.5246263586956514e-05, "loss": 0.2402, "step": 8141 }, { "epoch": 1.2745773324984346, "grad_norm": 0.7064833641052246, "learning_rate": 5.523437499999999e-05, "loss": 0.2398, "step": 8142 }, { "epoch": 1.2747338760175329, "grad_norm": 0.5251678824424744, "learning_rate": 5.522248641304348e-05, "loss": 0.2085, "step": 8143 }, { "epoch": 1.2748904195366313, "grad_norm": 0.6834549307823181, "learning_rate": 5.5210597826086955e-05, "loss": 0.2813, "step": 8144 }, { "epoch": 1.2750469630557295, "grad_norm": 0.6032211780548096, "learning_rate": 5.5198709239130434e-05, "loss": 0.191, "step": 8145 }, { "epoch": 1.2752035065748277, "grad_norm": 0.7665048837661743, 
"learning_rate": 5.518682065217391e-05, "loss": 0.2466, "step": 8146 }, { "epoch": 1.2753600500939262, "grad_norm": 0.9139910340309143, "learning_rate": 5.517493206521739e-05, "loss": 0.237, "step": 8147 }, { "epoch": 1.2755165936130244, "grad_norm": 0.9730105400085449, "learning_rate": 5.516304347826086e-05, "loss": 0.1928, "step": 8148 }, { "epoch": 1.2756731371321228, "grad_norm": 0.8018919229507446, "learning_rate": 5.515115489130434e-05, "loss": 0.2327, "step": 8149 }, { "epoch": 1.275829680651221, "grad_norm": 1.093112587928772, "learning_rate": 5.513926630434782e-05, "loss": 0.2831, "step": 8150 }, { "epoch": 1.2759862241703193, "grad_norm": 1.2235559225082397, "learning_rate": 5.5127377717391296e-05, "loss": 0.369, "step": 8151 }, { "epoch": 1.2761427676894177, "grad_norm": 0.7178487777709961, "learning_rate": 5.5115489130434774e-05, "loss": 0.2352, "step": 8152 }, { "epoch": 1.276299311208516, "grad_norm": 1.0280925035476685, "learning_rate": 5.510360054347826e-05, "loss": 0.2387, "step": 8153 }, { "epoch": 1.2764558547276144, "grad_norm": 1.4209643602371216, "learning_rate": 5.509171195652174e-05, "loss": 0.4846, "step": 8154 }, { "epoch": 1.2766123982467126, "grad_norm": 4.26008415222168, "learning_rate": 5.5079823369565215e-05, "loss": 0.841, "step": 8155 }, { "epoch": 1.2767689417658108, "grad_norm": 3.768817901611328, "learning_rate": 5.506793478260869e-05, "loss": 0.2828, "step": 8156 }, { "epoch": 1.2769254852849092, "grad_norm": 1.7353625297546387, "learning_rate": 5.505604619565217e-05, "loss": 0.4118, "step": 8157 }, { "epoch": 1.2770820288040075, "grad_norm": 1.7729783058166504, "learning_rate": 5.504415760869565e-05, "loss": 0.4069, "step": 8158 }, { "epoch": 1.277238572323106, "grad_norm": 2.555926561355591, "learning_rate": 5.503226902173913e-05, "loss": 0.5007, "step": 8159 }, { "epoch": 1.277395115842204, "grad_norm": 3.503239870071411, "learning_rate": 5.50203804347826e-05, "loss": 0.7405, "step": 8160 }, { "epoch": 1.2775516593613023, 
"grad_norm": 0.9163112044334412, "learning_rate": 5.500849184782608e-05, "loss": 0.2769, "step": 8161 }, { "epoch": 1.2777082028804008, "grad_norm": 1.9864250421524048, "learning_rate": 5.4996603260869556e-05, "loss": 0.5972, "step": 8162 }, { "epoch": 1.277864746399499, "grad_norm": 1.312592625617981, "learning_rate": 5.498471467391304e-05, "loss": 0.3606, "step": 8163 }, { "epoch": 1.2780212899185974, "grad_norm": 2.7779195308685303, "learning_rate": 5.497282608695652e-05, "loss": 0.7393, "step": 8164 }, { "epoch": 1.2781778334376956, "grad_norm": 2.644036293029785, "learning_rate": 5.49609375e-05, "loss": 0.748, "step": 8165 }, { "epoch": 1.2783343769567939, "grad_norm": 1.799517035484314, "learning_rate": 5.4949048913043475e-05, "loss": 0.3822, "step": 8166 }, { "epoch": 1.2784909204758923, "grad_norm": 2.8280069828033447, "learning_rate": 5.493716032608695e-05, "loss": 0.8061, "step": 8167 }, { "epoch": 1.2786474639949907, "grad_norm": 3.13149094581604, "learning_rate": 5.492527173913043e-05, "loss": 0.601, "step": 8168 }, { "epoch": 1.278804007514089, "grad_norm": 2.2092714309692383, "learning_rate": 5.491338315217391e-05, "loss": 1.1367, "step": 8169 }, { "epoch": 1.2789605510331872, "grad_norm": 3.545867919921875, "learning_rate": 5.490149456521739e-05, "loss": 0.8061, "step": 8170 }, { "epoch": 1.2791170945522856, "grad_norm": 3.0353901386260986, "learning_rate": 5.488960597826086e-05, "loss": 0.7415, "step": 8171 }, { "epoch": 1.2792736380713838, "grad_norm": 1.9405124187469482, "learning_rate": 5.487771739130434e-05, "loss": 0.5933, "step": 8172 }, { "epoch": 1.2794301815904823, "grad_norm": 3.4053356647491455, "learning_rate": 5.486582880434782e-05, "loss": 0.9704, "step": 8173 }, { "epoch": 1.2795867251095805, "grad_norm": 4.183650970458984, "learning_rate": 5.48539402173913e-05, "loss": 1.0219, "step": 8174 }, { "epoch": 1.2797432686286787, "grad_norm": 6.164062976837158, "learning_rate": 5.484205163043478e-05, "loss": 1.1965, "step": 8175 }, { 
"epoch": 1.2798998121477771, "grad_norm": 2.113354444503784, "learning_rate": 5.4830163043478256e-05, "loss": 0.9576, "step": 8176 }, { "epoch": 1.2800563556668754, "grad_norm": 2.326564073562622, "learning_rate": 5.4818274456521735e-05, "loss": 0.6464, "step": 8177 }, { "epoch": 1.2802128991859738, "grad_norm": 5.205068588256836, "learning_rate": 5.480638586956521e-05, "loss": 1.2585, "step": 8178 }, { "epoch": 1.280369442705072, "grad_norm": 4.38555383682251, "learning_rate": 5.479449728260869e-05, "loss": 1.2366, "step": 8179 }, { "epoch": 1.2805259862241702, "grad_norm": 3.336125373840332, "learning_rate": 5.478260869565217e-05, "loss": 0.9188, "step": 8180 }, { "epoch": 1.2806825297432687, "grad_norm": 3.8115556240081787, "learning_rate": 5.477072010869565e-05, "loss": 0.5865, "step": 8181 }, { "epoch": 1.280839073262367, "grad_norm": 5.55440092086792, "learning_rate": 5.475883152173913e-05, "loss": 0.8837, "step": 8182 }, { "epoch": 1.2809956167814653, "grad_norm": 2.579174041748047, "learning_rate": 5.4746942934782604e-05, "loss": 0.6776, "step": 8183 }, { "epoch": 1.2811521603005636, "grad_norm": 2.756596803665161, "learning_rate": 5.473505434782608e-05, "loss": 0.7156, "step": 8184 }, { "epoch": 1.2813087038196618, "grad_norm": 2.0457851886749268, "learning_rate": 5.472316576086956e-05, "loss": 0.4029, "step": 8185 }, { "epoch": 1.2814652473387602, "grad_norm": 1.7636560201644897, "learning_rate": 5.471127717391304e-05, "loss": 0.5654, "step": 8186 }, { "epoch": 1.2816217908578584, "grad_norm": 1.9618474245071411, "learning_rate": 5.4699388586956516e-05, "loss": 0.5669, "step": 8187 }, { "epoch": 1.2817783343769569, "grad_norm": 1.912635326385498, "learning_rate": 5.4687499999999994e-05, "loss": 0.6415, "step": 8188 }, { "epoch": 1.281934877896055, "grad_norm": 0.4248294234275818, "learning_rate": 5.467561141304347e-05, "loss": 0.2185, "step": 8189 }, { "epoch": 1.2820914214151533, "grad_norm": 2.075568914413452, "learning_rate": 5.466372282608695e-05, 
"loss": 0.9408, "step": 8190 }, { "epoch": 1.2822479649342517, "grad_norm": 0.417196124792099, "learning_rate": 5.465183423913043e-05, "loss": 0.1388, "step": 8191 }, { "epoch": 1.28240450845335, "grad_norm": 0.7526351809501648, "learning_rate": 5.4639945652173914e-05, "loss": 0.2036, "step": 8192 }, { "epoch": 1.2825610519724484, "grad_norm": 3.544516086578369, "learning_rate": 5.462805706521739e-05, "loss": 0.3754, "step": 8193 }, { "epoch": 1.2827175954915466, "grad_norm": 1.8559495210647583, "learning_rate": 5.461616847826086e-05, "loss": 0.3173, "step": 8194 }, { "epoch": 1.2828741390106448, "grad_norm": 9.305380821228027, "learning_rate": 5.460427989130434e-05, "loss": 0.3602, "step": 8195 }, { "epoch": 1.2830306825297433, "grad_norm": 0.6801361441612244, "learning_rate": 5.459239130434782e-05, "loss": 0.275, "step": 8196 }, { "epoch": 1.2831872260488415, "grad_norm": 0.9621412754058838, "learning_rate": 5.45805027173913e-05, "loss": 0.2086, "step": 8197 }, { "epoch": 1.28334376956794, "grad_norm": 0.7928347587585449, "learning_rate": 5.4568614130434776e-05, "loss": 0.2193, "step": 8198 }, { "epoch": 1.2835003130870382, "grad_norm": 1.159193515777588, "learning_rate": 5.4556725543478254e-05, "loss": 0.2221, "step": 8199 }, { "epoch": 1.2836568566061364, "grad_norm": 0.7826451063156128, "learning_rate": 5.454483695652173e-05, "loss": 0.2354, "step": 8200 }, { "epoch": 1.2838134001252348, "grad_norm": 1.5159252882003784, "learning_rate": 5.453294836956521e-05, "loss": 0.2575, "step": 8201 }, { "epoch": 1.2839699436443333, "grad_norm": 1.3621042966842651, "learning_rate": 5.4521059782608695e-05, "loss": 0.4066, "step": 8202 }, { "epoch": 1.2841264871634315, "grad_norm": 1.438460350036621, "learning_rate": 5.4509171195652173e-05, "loss": 0.1722, "step": 8203 }, { "epoch": 1.2842830306825297, "grad_norm": 2.3615024089813232, "learning_rate": 5.449728260869565e-05, "loss": 0.3745, "step": 8204 }, { "epoch": 1.2844395742016281, "grad_norm": 1.1614705324172974, 
"learning_rate": 5.448539402173913e-05, "loss": 0.4179, "step": 8205 }, { "epoch": 1.2845961177207263, "grad_norm": 0.9616918563842773, "learning_rate": 5.44735054347826e-05, "loss": 0.2141, "step": 8206 }, { "epoch": 1.2847526612398248, "grad_norm": 1.9357107877731323, "learning_rate": 5.446161684782608e-05, "loss": 0.4367, "step": 8207 }, { "epoch": 1.284909204758923, "grad_norm": 1.567920207977295, "learning_rate": 5.444972826086956e-05, "loss": 0.3699, "step": 8208 }, { "epoch": 1.2850657482780212, "grad_norm": 1.6296812295913696, "learning_rate": 5.4437839673913036e-05, "loss": 0.6055, "step": 8209 }, { "epoch": 1.2852222917971197, "grad_norm": 2.615333080291748, "learning_rate": 5.4425951086956514e-05, "loss": 0.7349, "step": 8210 }, { "epoch": 1.2853788353162179, "grad_norm": 1.6692149639129639, "learning_rate": 5.441406249999999e-05, "loss": 0.5341, "step": 8211 }, { "epoch": 1.2855353788353163, "grad_norm": 1.374345064163208, "learning_rate": 5.440217391304348e-05, "loss": 0.3062, "step": 8212 }, { "epoch": 1.2856919223544145, "grad_norm": 1.2004449367523193, "learning_rate": 5.4390285326086955e-05, "loss": 0.52, "step": 8213 }, { "epoch": 1.2858484658735128, "grad_norm": 1.1535160541534424, "learning_rate": 5.437839673913043e-05, "loss": 0.3331, "step": 8214 }, { "epoch": 1.2860050093926112, "grad_norm": 3.040759801864624, "learning_rate": 5.436650815217391e-05, "loss": 0.565, "step": 8215 }, { "epoch": 1.2861615529117094, "grad_norm": 1.7432810068130493, "learning_rate": 5.435461956521739e-05, "loss": 0.5769, "step": 8216 }, { "epoch": 1.2863180964308079, "grad_norm": 2.522956609725952, "learning_rate": 5.434273097826086e-05, "loss": 0.7052, "step": 8217 }, { "epoch": 1.286474639949906, "grad_norm": 1.9118211269378662, "learning_rate": 5.433084239130434e-05, "loss": 0.5503, "step": 8218 }, { "epoch": 1.2866311834690043, "grad_norm": 2.9273698329925537, "learning_rate": 5.431895380434782e-05, "loss": 0.6323, "step": 8219 }, { "epoch": 1.2867877269881027, 
"grad_norm": 2.5964467525482178, "learning_rate": 5.4307065217391295e-05, "loss": 0.8066, "step": 8220 }, { "epoch": 1.286944270507201, "grad_norm": 3.124629259109497, "learning_rate": 5.4295176630434774e-05, "loss": 0.912, "step": 8221 }, { "epoch": 1.2871008140262994, "grad_norm": 3.546435594558716, "learning_rate": 5.428328804347826e-05, "loss": 0.5597, "step": 8222 }, { "epoch": 1.2872573575453976, "grad_norm": 2.7057504653930664, "learning_rate": 5.4271399456521737e-05, "loss": 0.5756, "step": 8223 }, { "epoch": 1.2874139010644958, "grad_norm": 2.249875783920288, "learning_rate": 5.4259510869565215e-05, "loss": 0.9583, "step": 8224 }, { "epoch": 1.2875704445835943, "grad_norm": 4.11207914352417, "learning_rate": 5.424762228260869e-05, "loss": 1.1769, "step": 8225 }, { "epoch": 1.2877269881026925, "grad_norm": 5.718787670135498, "learning_rate": 5.423573369565217e-05, "loss": 1.0246, "step": 8226 }, { "epoch": 1.287883531621791, "grad_norm": 6.229344844818115, "learning_rate": 5.422384510869565e-05, "loss": 1.0654, "step": 8227 }, { "epoch": 1.2880400751408891, "grad_norm": 4.383051872253418, "learning_rate": 5.421195652173913e-05, "loss": 1.041, "step": 8228 }, { "epoch": 1.2881966186599874, "grad_norm": 1.9475451707839966, "learning_rate": 5.42000679347826e-05, "loss": 0.6286, "step": 8229 }, { "epoch": 1.2883531621790858, "grad_norm": 3.750868797302246, "learning_rate": 5.418817934782608e-05, "loss": 0.8218, "step": 8230 }, { "epoch": 1.288509705698184, "grad_norm": 3.0280845165252686, "learning_rate": 5.4176290760869555e-05, "loss": 1.6771, "step": 8231 }, { "epoch": 1.2886662492172825, "grad_norm": 2.5097997188568115, "learning_rate": 5.416440217391304e-05, "loss": 0.927, "step": 8232 }, { "epoch": 1.2888227927363807, "grad_norm": 2.741420269012451, "learning_rate": 5.415251358695652e-05, "loss": 0.6323, "step": 8233 }, { "epoch": 1.288979336255479, "grad_norm": 3.081015110015869, "learning_rate": 5.4140624999999996e-05, "loss": 0.546, "step": 8234 }, { 
"epoch": 1.2891358797745773, "grad_norm": 2.8458850383758545, "learning_rate": 5.4128736413043474e-05, "loss": 0.4341, "step": 8235 }, { "epoch": 1.2892924232936758, "grad_norm": 6.166201591491699, "learning_rate": 5.411684782608695e-05, "loss": 1.0102, "step": 8236 }, { "epoch": 1.289448966812774, "grad_norm": 7.071556091308594, "learning_rate": 5.410495923913043e-05, "loss": 0.2766, "step": 8237 }, { "epoch": 1.2896055103318722, "grad_norm": 2.6016147136688232, "learning_rate": 5.409307065217391e-05, "loss": 1.1457, "step": 8238 }, { "epoch": 1.2897620538509706, "grad_norm": 0.45543843507766724, "learning_rate": 5.408118206521739e-05, "loss": 0.2202, "step": 8239 }, { "epoch": 1.2899185973700689, "grad_norm": 0.5087339878082275, "learning_rate": 5.406929347826086e-05, "loss": 0.2115, "step": 8240 }, { "epoch": 1.2900751408891673, "grad_norm": 0.5311031341552734, "learning_rate": 5.4057404891304343e-05, "loss": 0.2288, "step": 8241 }, { "epoch": 1.2902316844082655, "grad_norm": 1.019264817237854, "learning_rate": 5.404551630434782e-05, "loss": 0.3473, "step": 8242 }, { "epoch": 1.2903882279273637, "grad_norm": 0.5450243949890137, "learning_rate": 5.40336277173913e-05, "loss": 0.2404, "step": 8243 }, { "epoch": 1.2905447714464622, "grad_norm": 0.9394849538803101, "learning_rate": 5.402173913043478e-05, "loss": 0.2967, "step": 8244 }, { "epoch": 1.2907013149655604, "grad_norm": 0.6212477087974548, "learning_rate": 5.4009850543478256e-05, "loss": 0.293, "step": 8245 }, { "epoch": 1.2908578584846588, "grad_norm": 1.0320639610290527, "learning_rate": 5.3997961956521734e-05, "loss": 0.2825, "step": 8246 }, { "epoch": 1.291014402003757, "grad_norm": 1.4785557985305786, "learning_rate": 5.398607336956521e-05, "loss": 0.3667, "step": 8247 }, { "epoch": 1.2911709455228553, "grad_norm": 0.7726352214813232, "learning_rate": 5.397418478260869e-05, "loss": 0.3052, "step": 8248 }, { "epoch": 1.2913274890419537, "grad_norm": 1.081728219985962, "learning_rate": 
5.396229619565217e-05, "loss": 0.3495, "step": 8249 }, { "epoch": 1.291484032561052, "grad_norm": 1.021256685256958, "learning_rate": 5.3950407608695654e-05, "loss": 0.3461, "step": 8250 }, { "epoch": 1.2916405760801504, "grad_norm": 0.753734827041626, "learning_rate": 5.393851902173913e-05, "loss": 0.3235, "step": 8251 }, { "epoch": 1.2917971195992486, "grad_norm": 1.2961649894714355, "learning_rate": 5.39266304347826e-05, "loss": 0.4219, "step": 8252 }, { "epoch": 1.2919536631183468, "grad_norm": 0.9517664909362793, "learning_rate": 5.391474184782608e-05, "loss": 0.3249, "step": 8253 }, { "epoch": 1.2921102066374452, "grad_norm": 1.0360842943191528, "learning_rate": 5.390285326086956e-05, "loss": 0.4593, "step": 8254 }, { "epoch": 1.2922667501565435, "grad_norm": 1.5166468620300293, "learning_rate": 5.389096467391304e-05, "loss": 0.3536, "step": 8255 }, { "epoch": 1.292423293675642, "grad_norm": 0.5535582304000854, "learning_rate": 5.3879076086956516e-05, "loss": 0.2177, "step": 8256 }, { "epoch": 1.2925798371947401, "grad_norm": 1.3623216152191162, "learning_rate": 5.3867187499999994e-05, "loss": 0.483, "step": 8257 }, { "epoch": 1.2927363807138383, "grad_norm": 6.383028030395508, "learning_rate": 5.385529891304347e-05, "loss": 1.3163, "step": 8258 }, { "epoch": 1.2928929242329368, "grad_norm": 1.9316874742507935, "learning_rate": 5.384341032608695e-05, "loss": 0.5772, "step": 8259 }, { "epoch": 1.293049467752035, "grad_norm": 0.9779478311538696, "learning_rate": 5.3831521739130435e-05, "loss": 0.3294, "step": 8260 }, { "epoch": 1.2932060112711334, "grad_norm": 1.2817915678024292, "learning_rate": 5.381963315217391e-05, "loss": 0.5032, "step": 8261 }, { "epoch": 1.2933625547902317, "grad_norm": 1.5225074291229248, "learning_rate": 5.380774456521739e-05, "loss": 0.4336, "step": 8262 }, { "epoch": 1.2935190983093299, "grad_norm": 1.8338899612426758, "learning_rate": 5.379585597826086e-05, "loss": 0.5021, "step": 8263 }, { "epoch": 1.2936756418284283, "grad_norm": 
1.2387012243270874, "learning_rate": 5.378396739130434e-05, "loss": 0.4121, "step": 8264 }, { "epoch": 1.2938321853475265, "grad_norm": 3.9344863891601562, "learning_rate": 5.377207880434782e-05, "loss": 0.7023, "step": 8265 }, { "epoch": 1.293988728866625, "grad_norm": 3.6297361850738525, "learning_rate": 5.37601902173913e-05, "loss": 0.6074, "step": 8266 }, { "epoch": 1.2941452723857232, "grad_norm": 3.302253484725952, "learning_rate": 5.3748301630434775e-05, "loss": 0.945, "step": 8267 }, { "epoch": 1.2943018159048214, "grad_norm": 2.084681987762451, "learning_rate": 5.3736413043478254e-05, "loss": 0.5816, "step": 8268 }, { "epoch": 1.2944583594239198, "grad_norm": 3.1655361652374268, "learning_rate": 5.372452445652173e-05, "loss": 0.8808, "step": 8269 }, { "epoch": 1.2946149029430183, "grad_norm": 2.820420503616333, "learning_rate": 5.371263586956522e-05, "loss": 0.5779, "step": 8270 }, { "epoch": 1.2947714464621165, "grad_norm": 1.7412488460540771, "learning_rate": 5.3700747282608695e-05, "loss": 0.7892, "step": 8271 }, { "epoch": 1.2949279899812147, "grad_norm": 6.835015773773193, "learning_rate": 5.368885869565217e-05, "loss": 1.2585, "step": 8272 }, { "epoch": 1.2950845335003132, "grad_norm": 1.3040486574172974, "learning_rate": 5.367697010869565e-05, "loss": 0.5137, "step": 8273 }, { "epoch": 1.2952410770194114, "grad_norm": 3.2197301387786865, "learning_rate": 5.366508152173913e-05, "loss": 0.8861, "step": 8274 }, { "epoch": 1.2953976205385098, "grad_norm": 6.442622661590576, "learning_rate": 5.36531929347826e-05, "loss": 0.7281, "step": 8275 }, { "epoch": 1.295554164057608, "grad_norm": 5.222671031951904, "learning_rate": 5.364130434782608e-05, "loss": 1.1756, "step": 8276 }, { "epoch": 1.2957107075767063, "grad_norm": 4.0839104652404785, "learning_rate": 5.362941576086956e-05, "loss": 0.9845, "step": 8277 }, { "epoch": 1.2958672510958047, "grad_norm": 2.4839982986450195, "learning_rate": 5.3617527173913035e-05, "loss": 0.9406, "step": 8278 }, { "epoch": 
1.296023794614903, "grad_norm": 1.958970308303833, "learning_rate": 5.360563858695651e-05, "loss": 0.9155, "step": 8279 }, { "epoch": 1.2961803381340014, "grad_norm": 2.9617085456848145, "learning_rate": 5.359375e-05, "loss": 1.2803, "step": 8280 }, { "epoch": 1.2963368816530996, "grad_norm": 6.072402000427246, "learning_rate": 5.3581861413043476e-05, "loss": 1.4126, "step": 8281 }, { "epoch": 1.2964934251721978, "grad_norm": 3.583500623703003, "learning_rate": 5.3569972826086955e-05, "loss": 1.6176, "step": 8282 }, { "epoch": 1.2966499686912962, "grad_norm": 2.502046585083008, "learning_rate": 5.355808423913043e-05, "loss": 0.7034, "step": 8283 }, { "epoch": 1.2968065122103944, "grad_norm": 2.6359806060791016, "learning_rate": 5.354619565217391e-05, "loss": 0.677, "step": 8284 }, { "epoch": 1.2969630557294929, "grad_norm": 1.5124813318252563, "learning_rate": 5.353430706521739e-05, "loss": 0.6971, "step": 8285 }, { "epoch": 1.297119599248591, "grad_norm": 4.748797416687012, "learning_rate": 5.352241847826086e-05, "loss": 0.764, "step": 8286 }, { "epoch": 1.2972761427676893, "grad_norm": 2.2516956329345703, "learning_rate": 5.351052989130434e-05, "loss": 0.782, "step": 8287 }, { "epoch": 1.2974326862867878, "grad_norm": 2.3101794719696045, "learning_rate": 5.349864130434782e-05, "loss": 0.7099, "step": 8288 }, { "epoch": 1.297589229805886, "grad_norm": 0.7854422330856323, "learning_rate": 5.3486752717391295e-05, "loss": 0.2121, "step": 8289 }, { "epoch": 1.2977457733249844, "grad_norm": 0.9300307035446167, "learning_rate": 5.347486413043478e-05, "loss": 0.1679, "step": 8290 }, { "epoch": 1.2979023168440826, "grad_norm": 0.8324413895606995, "learning_rate": 5.346297554347826e-05, "loss": 0.2371, "step": 8291 }, { "epoch": 1.2980588603631809, "grad_norm": 0.774506688117981, "learning_rate": 5.3451086956521736e-05, "loss": 0.2762, "step": 8292 }, { "epoch": 1.2982154038822793, "grad_norm": 0.6566260457038879, "learning_rate": 5.3439198369565214e-05, "loss": 0.1865, 
"step": 8293 }, { "epoch": 1.2983719474013775, "grad_norm": 6.009603977203369, "learning_rate": 5.342730978260869e-05, "loss": 0.6651, "step": 8294 }, { "epoch": 1.298528490920476, "grad_norm": 1.1345053911209106, "learning_rate": 5.341542119565217e-05, "loss": 0.2586, "step": 8295 }, { "epoch": 1.2986850344395742, "grad_norm": 0.7079518437385559, "learning_rate": 5.340353260869565e-05, "loss": 0.3429, "step": 8296 }, { "epoch": 1.2988415779586724, "grad_norm": 0.8804667592048645, "learning_rate": 5.339164402173913e-05, "loss": 0.2851, "step": 8297 }, { "epoch": 1.2989981214777708, "grad_norm": 1.8310266733169556, "learning_rate": 5.33797554347826e-05, "loss": 0.356, "step": 8298 }, { "epoch": 1.2991546649968693, "grad_norm": 1.584134578704834, "learning_rate": 5.3367866847826076e-05, "loss": 0.3231, "step": 8299 }, { "epoch": 1.2993112085159675, "grad_norm": 0.9457914233207703, "learning_rate": 5.335597826086956e-05, "loss": 0.2022, "step": 8300 }, { "epoch": 1.2994677520350657, "grad_norm": 0.6110353469848633, "learning_rate": 5.334408967391304e-05, "loss": 0.2311, "step": 8301 }, { "epoch": 1.299624295554164, "grad_norm": 1.437056303024292, "learning_rate": 5.333220108695652e-05, "loss": 0.282, "step": 8302 }, { "epoch": 1.2997808390732624, "grad_norm": 1.199056625366211, "learning_rate": 5.3320312499999996e-05, "loss": 0.3396, "step": 8303 }, { "epoch": 1.2999373825923608, "grad_norm": 1.3382184505462646, "learning_rate": 5.3308423913043474e-05, "loss": 0.2987, "step": 8304 }, { "epoch": 1.300093926111459, "grad_norm": 1.195002555847168, "learning_rate": 5.329653532608695e-05, "loss": 0.1865, "step": 8305 }, { "epoch": 1.3002504696305572, "grad_norm": 1.5356537103652954, "learning_rate": 5.328464673913043e-05, "loss": 0.327, "step": 8306 }, { "epoch": 1.3004070131496557, "grad_norm": 1.4878263473510742, "learning_rate": 5.327275815217391e-05, "loss": 0.6394, "step": 8307 }, { "epoch": 1.300563556668754, "grad_norm": 2.3719630241394043, "learning_rate": 
5.326086956521739e-05, "loss": 0.659, "step": 8308 }, { "epoch": 1.3007201001878523, "grad_norm": 1.4760074615478516, "learning_rate": 5.324898097826086e-05, "loss": 0.4266, "step": 8309 }, { "epoch": 1.3008766437069506, "grad_norm": 1.5796904563903809, "learning_rate": 5.323709239130434e-05, "loss": 0.553, "step": 8310 }, { "epoch": 1.3010331872260488, "grad_norm": 1.331342339515686, "learning_rate": 5.322520380434782e-05, "loss": 0.4393, "step": 8311 }, { "epoch": 1.3011897307451472, "grad_norm": 2.622260808944702, "learning_rate": 5.32133152173913e-05, "loss": 0.5917, "step": 8312 }, { "epoch": 1.3013462742642454, "grad_norm": 0.8667289018630981, "learning_rate": 5.320142663043478e-05, "loss": 0.3163, "step": 8313 }, { "epoch": 1.3015028177833439, "grad_norm": 3.0876519680023193, "learning_rate": 5.3189538043478256e-05, "loss": 0.6068, "step": 8314 }, { "epoch": 1.301659361302442, "grad_norm": 7.713058948516846, "learning_rate": 5.3177649456521734e-05, "loss": 1.0384, "step": 8315 }, { "epoch": 1.3018159048215403, "grad_norm": 1.3857024908065796, "learning_rate": 5.316576086956521e-05, "loss": 0.546, "step": 8316 }, { "epoch": 1.3019724483406387, "grad_norm": 2.166189193725586, "learning_rate": 5.315387228260869e-05, "loss": 0.3767, "step": 8317 }, { "epoch": 1.302128991859737, "grad_norm": 1.3877288103103638, "learning_rate": 5.3141983695652175e-05, "loss": 0.4878, "step": 8318 }, { "epoch": 1.3022855353788354, "grad_norm": 2.675654172897339, "learning_rate": 5.313009510869565e-05, "loss": 0.3287, "step": 8319 }, { "epoch": 1.3024420788979336, "grad_norm": 2.093019962310791, "learning_rate": 5.311820652173913e-05, "loss": 0.4273, "step": 8320 }, { "epoch": 1.3025986224170318, "grad_norm": 3.9073526859283447, "learning_rate": 5.31063179347826e-05, "loss": 0.5275, "step": 8321 }, { "epoch": 1.3027551659361303, "grad_norm": 4.168030738830566, "learning_rate": 5.309442934782608e-05, "loss": 0.9642, "step": 8322 }, { "epoch": 1.3029117094552285, "grad_norm": 
2.766453504562378, "learning_rate": 5.308254076086956e-05, "loss": 0.298, "step": 8323 }, { "epoch": 1.303068252974327, "grad_norm": 6.780119895935059, "learning_rate": 5.307065217391304e-05, "loss": 0.7647, "step": 8324 }, { "epoch": 1.3032247964934252, "grad_norm": 2.0018763542175293, "learning_rate": 5.3058763586956515e-05, "loss": 0.5543, "step": 8325 }, { "epoch": 1.3033813400125234, "grad_norm": 3.545948028564453, "learning_rate": 5.3046874999999993e-05, "loss": 1.0935, "step": 8326 }, { "epoch": 1.3035378835316218, "grad_norm": 3.511335849761963, "learning_rate": 5.303498641304347e-05, "loss": 1.2635, "step": 8327 }, { "epoch": 1.30369442705072, "grad_norm": 2.8741257190704346, "learning_rate": 5.3023097826086957e-05, "loss": 1.0441, "step": 8328 }, { "epoch": 1.3038509705698185, "grad_norm": 3.451406717300415, "learning_rate": 5.3011209239130435e-05, "loss": 0.5851, "step": 8329 }, { "epoch": 1.3040075140889167, "grad_norm": 3.08248233795166, "learning_rate": 5.299932065217391e-05, "loss": 0.7738, "step": 8330 }, { "epoch": 1.304164057608015, "grad_norm": 9.233357429504395, "learning_rate": 5.298743206521739e-05, "loss": 1.8815, "step": 8331 }, { "epoch": 1.3043206011271133, "grad_norm": 4.399572372436523, "learning_rate": 5.297554347826086e-05, "loss": 1.6311, "step": 8332 }, { "epoch": 1.3044771446462118, "grad_norm": 3.7070884704589844, "learning_rate": 5.296365489130434e-05, "loss": 0.9639, "step": 8333 }, { "epoch": 1.30463368816531, "grad_norm": 1.399835228919983, "learning_rate": 5.295176630434782e-05, "loss": 0.4037, "step": 8334 }, { "epoch": 1.3047902316844082, "grad_norm": 4.744528293609619, "learning_rate": 5.29398777173913e-05, "loss": 1.0045, "step": 8335 }, { "epoch": 1.3049467752035064, "grad_norm": 8.910494804382324, "learning_rate": 5.2927989130434775e-05, "loss": 0.9518, "step": 8336 }, { "epoch": 1.3051033187226049, "grad_norm": 1.3109524250030518, "learning_rate": 5.291610054347825e-05, "loss": 0.4579, "step": 8337 }, { "epoch": 
1.3052598622417033, "grad_norm": 5.081573486328125, "learning_rate": 5.290421195652174e-05, "loss": 1.317, "step": 8338 }, { "epoch": 1.3054164057608015, "grad_norm": 0.6908486485481262, "learning_rate": 5.2892323369565216e-05, "loss": 0.2589, "step": 8339 }, { "epoch": 1.3055729492798998, "grad_norm": 0.8090315461158752, "learning_rate": 5.2880434782608694e-05, "loss": 0.3086, "step": 8340 }, { "epoch": 1.3057294927989982, "grad_norm": 0.5444275140762329, "learning_rate": 5.286854619565217e-05, "loss": 0.1924, "step": 8341 }, { "epoch": 1.3058860363180964, "grad_norm": 0.7012562155723572, "learning_rate": 5.285665760869565e-05, "loss": 0.218, "step": 8342 }, { "epoch": 1.3060425798371949, "grad_norm": 0.9490010142326355, "learning_rate": 5.284476902173913e-05, "loss": 0.2527, "step": 8343 }, { "epoch": 1.306199123356293, "grad_norm": 0.5001701712608337, "learning_rate": 5.28328804347826e-05, "loss": 0.1714, "step": 8344 }, { "epoch": 1.3063556668753913, "grad_norm": 0.9424663186073303, "learning_rate": 5.282099184782608e-05, "loss": 0.1973, "step": 8345 }, { "epoch": 1.3065122103944897, "grad_norm": 1.0325279235839844, "learning_rate": 5.2809103260869557e-05, "loss": 0.2438, "step": 8346 }, { "epoch": 1.306668753913588, "grad_norm": 1.046747088432312, "learning_rate": 5.2797214673913035e-05, "loss": 0.2341, "step": 8347 }, { "epoch": 1.3068252974326864, "grad_norm": 1.0462477207183838, "learning_rate": 5.278532608695652e-05, "loss": 0.3035, "step": 8348 }, { "epoch": 1.3069818409517846, "grad_norm": 1.0023382902145386, "learning_rate": 5.27734375e-05, "loss": 0.3785, "step": 8349 }, { "epoch": 1.3071383844708828, "grad_norm": 1.049440622329712, "learning_rate": 5.2761548913043476e-05, "loss": 0.3671, "step": 8350 }, { "epoch": 1.3072949279899813, "grad_norm": 1.268355369567871, "learning_rate": 5.2749660326086954e-05, "loss": 0.2699, "step": 8351 }, { "epoch": 1.3074514715090795, "grad_norm": 1.5778613090515137, "learning_rate": 5.273777173913043e-05, "loss": 
0.3251, "step": 8352 }, { "epoch": 1.307608015028178, "grad_norm": 0.7559840679168701, "learning_rate": 5.272588315217391e-05, "loss": 0.2585, "step": 8353 }, { "epoch": 1.3077645585472761, "grad_norm": 1.1084295511245728, "learning_rate": 5.271399456521739e-05, "loss": 0.3484, "step": 8354 }, { "epoch": 1.3079211020663744, "grad_norm": 1.2348517179489136, "learning_rate": 5.270210597826086e-05, "loss": 0.3413, "step": 8355 }, { "epoch": 1.3080776455854728, "grad_norm": 1.133396863937378, "learning_rate": 5.269021739130434e-05, "loss": 0.4142, "step": 8356 }, { "epoch": 1.308234189104571, "grad_norm": 0.65140700340271, "learning_rate": 5.2678328804347816e-05, "loss": 0.2706, "step": 8357 }, { "epoch": 1.3083907326236695, "grad_norm": 1.9472826719284058, "learning_rate": 5.26664402173913e-05, "loss": 0.8767, "step": 8358 }, { "epoch": 1.3085472761427677, "grad_norm": 1.179810881614685, "learning_rate": 5.265455163043478e-05, "loss": 0.2325, "step": 8359 }, { "epoch": 1.3087038196618659, "grad_norm": 2.0080502033233643, "learning_rate": 5.264266304347826e-05, "loss": 0.7609, "step": 8360 }, { "epoch": 1.3088603631809643, "grad_norm": 1.7594425678253174, "learning_rate": 5.2630774456521736e-05, "loss": 0.5489, "step": 8361 }, { "epoch": 1.3090169067000625, "grad_norm": 1.4796937704086304, "learning_rate": 5.2618885869565214e-05, "loss": 0.4565, "step": 8362 }, { "epoch": 1.309173450219161, "grad_norm": 0.9447329640388489, "learning_rate": 5.260699728260869e-05, "loss": 0.3779, "step": 8363 }, { "epoch": 1.3093299937382592, "grad_norm": 1.4929882287979126, "learning_rate": 5.259510869565217e-05, "loss": 0.5719, "step": 8364 }, { "epoch": 1.3094865372573574, "grad_norm": 3.0055899620056152, "learning_rate": 5.258322010869565e-05, "loss": 0.574, "step": 8365 }, { "epoch": 1.3096430807764559, "grad_norm": 3.6419625282287598, "learning_rate": 5.2571331521739126e-05, "loss": 0.9376, "step": 8366 }, { "epoch": 1.3097996242955543, "grad_norm": 2.760058879852295, 
"learning_rate": 5.25594429347826e-05, "loss": 0.6591, "step": 8367 }, { "epoch": 1.3099561678146525, "grad_norm": 2.614819288253784, "learning_rate": 5.254755434782608e-05, "loss": 0.682, "step": 8368 }, { "epoch": 1.3101127113337507, "grad_norm": 1.7676465511322021, "learning_rate": 5.253566576086956e-05, "loss": 0.3684, "step": 8369 }, { "epoch": 1.3102692548528492, "grad_norm": 2.2485854625701904, "learning_rate": 5.252377717391304e-05, "loss": 0.4481, "step": 8370 }, { "epoch": 1.3104257983719474, "grad_norm": 3.220795154571533, "learning_rate": 5.251188858695652e-05, "loss": 0.7105, "step": 8371 }, { "epoch": 1.3105823418910458, "grad_norm": 2.894798517227173, "learning_rate": 5.2499999999999995e-05, "loss": 0.6381, "step": 8372 }, { "epoch": 1.310738885410144, "grad_norm": 2.316070318222046, "learning_rate": 5.2488111413043474e-05, "loss": 0.7472, "step": 8373 }, { "epoch": 1.3108954289292423, "grad_norm": 3.7953567504882812, "learning_rate": 5.247622282608695e-05, "loss": 1.0642, "step": 8374 }, { "epoch": 1.3110519724483407, "grad_norm": 2.023735284805298, "learning_rate": 5.246433423913043e-05, "loss": 0.8179, "step": 8375 }, { "epoch": 1.311208515967439, "grad_norm": 2.587855339050293, "learning_rate": 5.245244565217391e-05, "loss": 0.544, "step": 8376 }, { "epoch": 1.3113650594865374, "grad_norm": 4.728296756744385, "learning_rate": 5.244055706521739e-05, "loss": 1.0383, "step": 8377 }, { "epoch": 1.3115216030056356, "grad_norm": 2.5495779514312744, "learning_rate": 5.2428668478260864e-05, "loss": 1.2258, "step": 8378 }, { "epoch": 1.3116781465247338, "grad_norm": 2.2396721839904785, "learning_rate": 5.241677989130434e-05, "loss": 0.8493, "step": 8379 }, { "epoch": 1.3118346900438322, "grad_norm": 2.4293551445007324, "learning_rate": 5.240489130434782e-05, "loss": 0.4083, "step": 8380 }, { "epoch": 1.3119912335629305, "grad_norm": 2.5144031047821045, "learning_rate": 5.23930027173913e-05, "loss": 0.5309, "step": 8381 }, { "epoch": 1.312147777082029, 
"grad_norm": 3.8382463455200195, "learning_rate": 5.238111413043478e-05, "loss": 1.8548, "step": 8382 }, { "epoch": 1.3123043206011271, "grad_norm": 3.3930563926696777, "learning_rate": 5.2369225543478255e-05, "loss": 1.1907, "step": 8383 }, { "epoch": 1.3124608641202253, "grad_norm": 7.073361396789551, "learning_rate": 5.235733695652173e-05, "loss": 0.4637, "step": 8384 }, { "epoch": 1.3126174076393238, "grad_norm": 1.8681260347366333, "learning_rate": 5.234544836956521e-05, "loss": 0.5119, "step": 8385 }, { "epoch": 1.312773951158422, "grad_norm": 2.723349094390869, "learning_rate": 5.233355978260869e-05, "loss": 0.6014, "step": 8386 }, { "epoch": 1.3129304946775204, "grad_norm": 4.6735076904296875, "learning_rate": 5.2321671195652175e-05, "loss": 0.9185, "step": 8387 }, { "epoch": 1.3130870381966186, "grad_norm": 4.305910587310791, "learning_rate": 5.230978260869565e-05, "loss": 0.8024, "step": 8388 }, { "epoch": 1.3132435817157169, "grad_norm": 0.37932804226875305, "learning_rate": 5.229789402173913e-05, "loss": 0.1568, "step": 8389 }, { "epoch": 1.3134001252348153, "grad_norm": 0.5027434229850769, "learning_rate": 5.22860054347826e-05, "loss": 0.1564, "step": 8390 }, { "epoch": 1.3135566687539135, "grad_norm": 0.9891074895858765, "learning_rate": 5.227411684782608e-05, "loss": 0.2136, "step": 8391 }, { "epoch": 1.313713212273012, "grad_norm": 0.47871559858322144, "learning_rate": 5.226222826086956e-05, "loss": 0.1603, "step": 8392 }, { "epoch": 1.3138697557921102, "grad_norm": 0.607693612575531, "learning_rate": 5.225033967391304e-05, "loss": 0.2776, "step": 8393 }, { "epoch": 1.3140262993112084, "grad_norm": 0.8391265869140625, "learning_rate": 5.2238451086956515e-05, "loss": 0.1866, "step": 8394 }, { "epoch": 1.3141828428303068, "grad_norm": 0.7756728529930115, "learning_rate": 5.222656249999999e-05, "loss": 0.2426, "step": 8395 }, { "epoch": 1.314339386349405, "grad_norm": 0.6292739510536194, "learning_rate": 5.221467391304347e-05, "loss": 0.2236, "step": 
8396 }, { "epoch": 1.3144959298685035, "grad_norm": 0.6623067855834961, "learning_rate": 5.2202785326086956e-05, "loss": 0.2736, "step": 8397 }, { "epoch": 1.3146524733876017, "grad_norm": 0.5660138130187988, "learning_rate": 5.2190896739130434e-05, "loss": 0.2465, "step": 8398 }, { "epoch": 1.3148090169067, "grad_norm": 2.0631353855133057, "learning_rate": 5.217900815217391e-05, "loss": 0.3251, "step": 8399 }, { "epoch": 1.3149655604257984, "grad_norm": 2.143477439880371, "learning_rate": 5.216711956521739e-05, "loss": 0.1893, "step": 8400 }, { "epoch": 1.3151221039448968, "grad_norm": 0.8707379698753357, "learning_rate": 5.215523097826086e-05, "loss": 0.3576, "step": 8401 }, { "epoch": 1.315278647463995, "grad_norm": 1.5132368803024292, "learning_rate": 5.214334239130434e-05, "loss": 0.3936, "step": 8402 }, { "epoch": 1.3154351909830932, "grad_norm": 0.9492995738983154, "learning_rate": 5.213145380434782e-05, "loss": 0.338, "step": 8403 }, { "epoch": 1.3155917345021917, "grad_norm": 1.3466097116470337, "learning_rate": 5.2119565217391296e-05, "loss": 0.3386, "step": 8404 }, { "epoch": 1.31574827802129, "grad_norm": 1.1284641027450562, "learning_rate": 5.2107676630434775e-05, "loss": 0.3603, "step": 8405 }, { "epoch": 1.3159048215403883, "grad_norm": 1.2036994695663452, "learning_rate": 5.209578804347825e-05, "loss": 0.2449, "step": 8406 }, { "epoch": 1.3160613650594866, "grad_norm": 1.2153252363204956, "learning_rate": 5.208389945652174e-05, "loss": 0.3278, "step": 8407 }, { "epoch": 1.3162179085785848, "grad_norm": 1.1886534690856934, "learning_rate": 5.2072010869565216e-05, "loss": 0.2514, "step": 8408 }, { "epoch": 1.3163744520976832, "grad_norm": 0.756091296672821, "learning_rate": 5.2060122282608694e-05, "loss": 0.3039, "step": 8409 }, { "epoch": 1.3165309956167814, "grad_norm": 2.0329647064208984, "learning_rate": 5.204823369565217e-05, "loss": 0.4667, "step": 8410 }, { "epoch": 1.3166875391358799, "grad_norm": 0.9530768394470215, "learning_rate": 
5.203634510869565e-05, "loss": 0.2992, "step": 8411 }, { "epoch": 1.316844082654978, "grad_norm": 2.512890100479126, "learning_rate": 5.202445652173913e-05, "loss": 0.5728, "step": 8412 }, { "epoch": 1.3170006261740763, "grad_norm": 2.0459511280059814, "learning_rate": 5.20125679347826e-05, "loss": 0.4525, "step": 8413 }, { "epoch": 1.3171571696931748, "grad_norm": 2.063495635986328, "learning_rate": 5.200067934782608e-05, "loss": 0.4745, "step": 8414 }, { "epoch": 1.317313713212273, "grad_norm": 2.244481325149536, "learning_rate": 5.1988790760869556e-05, "loss": 0.5223, "step": 8415 }, { "epoch": 1.3174702567313714, "grad_norm": 1.4799875020980835, "learning_rate": 5.1976902173913034e-05, "loss": 0.3487, "step": 8416 }, { "epoch": 1.3176268002504696, "grad_norm": 1.5630888938903809, "learning_rate": 5.196501358695652e-05, "loss": 0.3142, "step": 8417 }, { "epoch": 1.3177833437695678, "grad_norm": 2.8142499923706055, "learning_rate": 5.1953125e-05, "loss": 0.4667, "step": 8418 }, { "epoch": 1.3179398872886663, "grad_norm": 3.811680316925049, "learning_rate": 5.1941236413043476e-05, "loss": 0.6615, "step": 8419 }, { "epoch": 1.3180964308077645, "grad_norm": 5.87204122543335, "learning_rate": 5.1929347826086954e-05, "loss": 0.6469, "step": 8420 }, { "epoch": 1.318252974326863, "grad_norm": 3.4117133617401123, "learning_rate": 5.191745923913043e-05, "loss": 0.6955, "step": 8421 }, { "epoch": 1.3184095178459612, "grad_norm": 3.136354446411133, "learning_rate": 5.190557065217391e-05, "loss": 0.8903, "step": 8422 }, { "epoch": 1.3185660613650594, "grad_norm": 2.7469513416290283, "learning_rate": 5.189368206521739e-05, "loss": 1.1337, "step": 8423 }, { "epoch": 1.3187226048841578, "grad_norm": 4.178616523742676, "learning_rate": 5.188179347826086e-05, "loss": 1.0707, "step": 8424 }, { "epoch": 1.318879148403256, "grad_norm": 2.445476531982422, "learning_rate": 5.186990489130434e-05, "loss": 1.2255, "step": 8425 }, { "epoch": 1.3190356919223545, "grad_norm": 
2.81412935256958, "learning_rate": 5.1858016304347816e-05, "loss": 0.9257, "step": 8426 }, { "epoch": 1.3191922354414527, "grad_norm": 2.7403290271759033, "learning_rate": 5.18461277173913e-05, "loss": 0.3242, "step": 8427 }, { "epoch": 1.319348778960551, "grad_norm": 3.9082162380218506, "learning_rate": 5.183423913043478e-05, "loss": 1.1407, "step": 8428 }, { "epoch": 1.3195053224796494, "grad_norm": 2.688504934310913, "learning_rate": 5.182235054347826e-05, "loss": 0.9015, "step": 8429 }, { "epoch": 1.3196618659987476, "grad_norm": 5.097596168518066, "learning_rate": 5.1810461956521735e-05, "loss": 0.9699, "step": 8430 }, { "epoch": 1.319818409517846, "grad_norm": 4.0816731452941895, "learning_rate": 5.179857336956521e-05, "loss": 1.5146, "step": 8431 }, { "epoch": 1.3199749530369442, "grad_norm": 2.2203550338745117, "learning_rate": 5.178668478260869e-05, "loss": 0.7342, "step": 8432 }, { "epoch": 1.3201314965560424, "grad_norm": 2.4319615364074707, "learning_rate": 5.177479619565217e-05, "loss": 1.1301, "step": 8433 }, { "epoch": 1.320288040075141, "grad_norm": 3.3345730304718018, "learning_rate": 5.176290760869565e-05, "loss": 0.4617, "step": 8434 }, { "epoch": 1.3204445835942393, "grad_norm": 1.4923886060714722, "learning_rate": 5.175101902173913e-05, "loss": 0.5335, "step": 8435 }, { "epoch": 1.3206011271133375, "grad_norm": 2.495443344116211, "learning_rate": 5.1739130434782604e-05, "loss": 0.9222, "step": 8436 }, { "epoch": 1.3207576706324358, "grad_norm": 4.786012172698975, "learning_rate": 5.172724184782608e-05, "loss": 1.5748, "step": 8437 }, { "epoch": 1.3209142141515342, "grad_norm": 1.2211668491363525, "learning_rate": 5.171535326086956e-05, "loss": 0.8169, "step": 8438 }, { "epoch": 1.3210707576706324, "grad_norm": 0.7880717515945435, "learning_rate": 5.170346467391304e-05, "loss": 0.2245, "step": 8439 }, { "epoch": 1.3212273011897309, "grad_norm": 0.6066724061965942, "learning_rate": 5.169157608695652e-05, "loss": 0.214, "step": 8440 }, { "epoch": 
1.321383844708829, "grad_norm": 1.2073801755905151, "learning_rate": 5.1679687499999995e-05, "loss": 0.3075, "step": 8441 }, { "epoch": 1.3215403882279273, "grad_norm": 0.7894335389137268, "learning_rate": 5.166779891304347e-05, "loss": 0.2284, "step": 8442 }, { "epoch": 1.3216969317470257, "grad_norm": 0.4874792695045471, "learning_rate": 5.165591032608695e-05, "loss": 0.1934, "step": 8443 }, { "epoch": 1.321853475266124, "grad_norm": 0.6368094682693481, "learning_rate": 5.164402173913043e-05, "loss": 0.19, "step": 8444 }, { "epoch": 1.3220100187852224, "grad_norm": 0.9097103476524353, "learning_rate": 5.1632133152173914e-05, "loss": 0.2329, "step": 8445 }, { "epoch": 1.3221665623043206, "grad_norm": 0.5808327794075012, "learning_rate": 5.162024456521739e-05, "loss": 0.2252, "step": 8446 }, { "epoch": 1.3223231058234188, "grad_norm": 0.72557133436203, "learning_rate": 5.1608355978260864e-05, "loss": 0.2811, "step": 8447 }, { "epoch": 1.3224796493425173, "grad_norm": 0.8239424824714661, "learning_rate": 5.159646739130434e-05, "loss": 0.1691, "step": 8448 }, { "epoch": 1.3226361928616155, "grad_norm": 1.5094788074493408, "learning_rate": 5.158457880434782e-05, "loss": 0.2262, "step": 8449 }, { "epoch": 1.322792736380714, "grad_norm": 1.5701494216918945, "learning_rate": 5.15726902173913e-05, "loss": 0.4048, "step": 8450 }, { "epoch": 1.3229492798998121, "grad_norm": 3.634791612625122, "learning_rate": 5.1560801630434777e-05, "loss": 0.5028, "step": 8451 }, { "epoch": 1.3231058234189104, "grad_norm": 1.1830214262008667, "learning_rate": 5.1548913043478255e-05, "loss": 0.3206, "step": 8452 }, { "epoch": 1.3232623669380088, "grad_norm": 1.0275551080703735, "learning_rate": 5.153702445652173e-05, "loss": 0.2981, "step": 8453 }, { "epoch": 1.323418910457107, "grad_norm": 3.9983081817626953, "learning_rate": 5.152513586956521e-05, "loss": 0.3833, "step": 8454 }, { "epoch": 1.3235754539762055, "grad_norm": 1.5043946504592896, "learning_rate": 5.1513247282608696e-05, 
"loss": 0.4177, "step": 8455 }, { "epoch": 1.3237319974953037, "grad_norm": 2.8745806217193604, "learning_rate": 5.1501358695652174e-05, "loss": 0.436, "step": 8456 }, { "epoch": 1.323888541014402, "grad_norm": 1.1000033617019653, "learning_rate": 5.148947010869565e-05, "loss": 0.2503, "step": 8457 }, { "epoch": 1.3240450845335003, "grad_norm": 1.8040111064910889, "learning_rate": 5.147758152173913e-05, "loss": 0.4835, "step": 8458 }, { "epoch": 1.3242016280525986, "grad_norm": 1.5550901889801025, "learning_rate": 5.14656929347826e-05, "loss": 0.2705, "step": 8459 }, { "epoch": 1.324358171571697, "grad_norm": 1.8524531126022339, "learning_rate": 5.145380434782608e-05, "loss": 0.2704, "step": 8460 }, { "epoch": 1.3245147150907952, "grad_norm": 2.114435911178589, "learning_rate": 5.144191576086956e-05, "loss": 0.3039, "step": 8461 }, { "epoch": 1.3246712586098934, "grad_norm": 1.2381343841552734, "learning_rate": 5.1430027173913036e-05, "loss": 0.3195, "step": 8462 }, { "epoch": 1.3248278021289919, "grad_norm": 2.619919776916504, "learning_rate": 5.1418138586956514e-05, "loss": 0.3423, "step": 8463 }, { "epoch": 1.32498434564809, "grad_norm": 2.085926055908203, "learning_rate": 5.140624999999999e-05, "loss": 0.3506, "step": 8464 }, { "epoch": 1.3251408891671885, "grad_norm": 1.685734510421753, "learning_rate": 5.139436141304348e-05, "loss": 0.4044, "step": 8465 }, { "epoch": 1.3252974326862867, "grad_norm": 3.428079843521118, "learning_rate": 5.1382472826086956e-05, "loss": 0.648, "step": 8466 }, { "epoch": 1.325453976205385, "grad_norm": 3.694976329803467, "learning_rate": 5.1370584239130434e-05, "loss": 0.5856, "step": 8467 }, { "epoch": 1.3256105197244834, "grad_norm": 2.1173360347747803, "learning_rate": 5.135869565217391e-05, "loss": 0.803, "step": 8468 }, { "epoch": 1.3257670632435818, "grad_norm": 2.5285098552703857, "learning_rate": 5.134680706521739e-05, "loss": 0.4696, "step": 8469 }, { "epoch": 1.32592360676268, "grad_norm": 4.930835723876953, 
"learning_rate": 5.133491847826086e-05, "loss": 0.9231, "step": 8470 }, { "epoch": 1.3260801502817783, "grad_norm": 3.4827611446380615, "learning_rate": 5.132302989130434e-05, "loss": 0.7163, "step": 8471 }, { "epoch": 1.3262366938008767, "grad_norm": 2.855592727661133, "learning_rate": 5.131114130434782e-05, "loss": 1.0666, "step": 8472 }, { "epoch": 1.326393237319975, "grad_norm": 4.8024420738220215, "learning_rate": 5.1299252717391296e-05, "loss": 1.2342, "step": 8473 }, { "epoch": 1.3265497808390734, "grad_norm": 2.0883939266204834, "learning_rate": 5.1287364130434774e-05, "loss": 0.9604, "step": 8474 }, { "epoch": 1.3267063243581716, "grad_norm": 4.808028221130371, "learning_rate": 5.127547554347826e-05, "loss": 1.2388, "step": 8475 }, { "epoch": 1.3268628678772698, "grad_norm": 4.012434959411621, "learning_rate": 5.126358695652174e-05, "loss": 1.1837, "step": 8476 }, { "epoch": 1.3270194113963683, "grad_norm": 3.4379234313964844, "learning_rate": 5.1251698369565215e-05, "loss": 1.2336, "step": 8477 }, { "epoch": 1.3271759549154665, "grad_norm": 5.577293872833252, "learning_rate": 5.1239809782608693e-05, "loss": 1.028, "step": 8478 }, { "epoch": 1.327332498434565, "grad_norm": 4.008512020111084, "learning_rate": 5.122792119565217e-05, "loss": 1.3801, "step": 8479 }, { "epoch": 1.3274890419536631, "grad_norm": 2.895390033721924, "learning_rate": 5.121603260869565e-05, "loss": 1.2721, "step": 8480 }, { "epoch": 1.3276455854727613, "grad_norm": 5.866698741912842, "learning_rate": 5.120414402173913e-05, "loss": 1.004, "step": 8481 }, { "epoch": 1.3278021289918598, "grad_norm": 1.9465625286102295, "learning_rate": 5.11922554347826e-05, "loss": 1.0647, "step": 8482 }, { "epoch": 1.327958672510958, "grad_norm": 4.844081878662109, "learning_rate": 5.118036684782608e-05, "loss": 1.1531, "step": 8483 }, { "epoch": 1.3281152160300564, "grad_norm": 2.9256327152252197, "learning_rate": 5.1168478260869556e-05, "loss": 1.1049, "step": 8484 }, { "epoch": 1.3282717595491547, 
"grad_norm": 7.48610782623291, "learning_rate": 5.115658967391304e-05, "loss": 0.312, "step": 8485 }, { "epoch": 1.3284283030682529, "grad_norm": 4.590615749359131, "learning_rate": 5.114470108695652e-05, "loss": 1.0762, "step": 8486 }, { "epoch": 1.3285848465873513, "grad_norm": 4.2911763191223145, "learning_rate": 5.11328125e-05, "loss": 1.1128, "step": 8487 }, { "epoch": 1.3287413901064495, "grad_norm": 3.1543116569519043, "learning_rate": 5.1120923913043475e-05, "loss": 1.0389, "step": 8488 }, { "epoch": 1.328897933625548, "grad_norm": 1.4005086421966553, "learning_rate": 5.110903532608695e-05, "loss": 0.2126, "step": 8489 }, { "epoch": 1.3290544771446462, "grad_norm": 0.6979767084121704, "learning_rate": 5.109714673913043e-05, "loss": 0.3836, "step": 8490 }, { "epoch": 1.3292110206637444, "grad_norm": 0.7248562574386597, "learning_rate": 5.108525815217391e-05, "loss": 0.1975, "step": 8491 }, { "epoch": 1.3293675641828429, "grad_norm": 0.5030659437179565, "learning_rate": 5.107336956521739e-05, "loss": 0.2145, "step": 8492 }, { "epoch": 1.329524107701941, "grad_norm": 0.5081518888473511, "learning_rate": 5.106148097826086e-05, "loss": 0.1849, "step": 8493 }, { "epoch": 1.3296806512210395, "grad_norm": 0.8877454400062561, "learning_rate": 5.104959239130434e-05, "loss": 0.2173, "step": 8494 }, { "epoch": 1.3298371947401377, "grad_norm": 0.446064829826355, "learning_rate": 5.103770380434782e-05, "loss": 0.1321, "step": 8495 }, { "epoch": 1.329993738259236, "grad_norm": 0.6532548069953918, "learning_rate": 5.10258152173913e-05, "loss": 0.2777, "step": 8496 }, { "epoch": 1.3301502817783344, "grad_norm": 0.7085617780685425, "learning_rate": 5.101392663043478e-05, "loss": 0.2751, "step": 8497 }, { "epoch": 1.3303068252974326, "grad_norm": 0.8689234852790833, "learning_rate": 5.100203804347826e-05, "loss": 0.2586, "step": 8498 }, { "epoch": 1.330463368816531, "grad_norm": 0.5233132839202881, "learning_rate": 5.0990149456521735e-05, "loss": 0.1476, "step": 8499 }, { 
"epoch": 1.3306199123356293, "grad_norm": 0.803695023059845, "learning_rate": 5.097826086956521e-05, "loss": 0.3228, "step": 8500 }, { "epoch": 1.3307764558547275, "grad_norm": 0.4885929226875305, "learning_rate": 5.096637228260869e-05, "loss": 0.2262, "step": 8501 }, { "epoch": 1.330932999373826, "grad_norm": 2.275285005569458, "learning_rate": 5.095448369565217e-05, "loss": 0.5165, "step": 8502 }, { "epoch": 1.3310895428929244, "grad_norm": 1.201228141784668, "learning_rate": 5.094259510869565e-05, "loss": 0.4873, "step": 8503 }, { "epoch": 1.3312460864120226, "grad_norm": 0.713198721408844, "learning_rate": 5.093070652173913e-05, "loss": 0.2127, "step": 8504 }, { "epoch": 1.3314026299311208, "grad_norm": 0.9209476709365845, "learning_rate": 5.0918817934782604e-05, "loss": 0.3541, "step": 8505 }, { "epoch": 1.3315591734502192, "grad_norm": 2.495059013366699, "learning_rate": 5.090692934782608e-05, "loss": 0.4499, "step": 8506 }, { "epoch": 1.3317157169693175, "grad_norm": 0.892863929271698, "learning_rate": 5.089504076086956e-05, "loss": 0.2832, "step": 8507 }, { "epoch": 1.331872260488416, "grad_norm": 1.7761387825012207, "learning_rate": 5.088315217391304e-05, "loss": 0.271, "step": 8508 }, { "epoch": 1.3320288040075141, "grad_norm": 1.237683653831482, "learning_rate": 5.0871263586956516e-05, "loss": 0.4912, "step": 8509 }, { "epoch": 1.3321853475266123, "grad_norm": 1.1521103382110596, "learning_rate": 5.0859374999999994e-05, "loss": 0.3536, "step": 8510 }, { "epoch": 1.3323418910457108, "grad_norm": 2.0907390117645264, "learning_rate": 5.084748641304347e-05, "loss": 0.6738, "step": 8511 }, { "epoch": 1.332498434564809, "grad_norm": 2.881988048553467, "learning_rate": 5.083559782608695e-05, "loss": 0.6539, "step": 8512 }, { "epoch": 1.3326549780839074, "grad_norm": 2.889228582382202, "learning_rate": 5.0823709239130436e-05, "loss": 0.7212, "step": 8513 }, { "epoch": 1.3328115216030056, "grad_norm": 2.6500439643859863, "learning_rate": 5.0811820652173914e-05, 
"loss": 0.6417, "step": 8514 }, { "epoch": 1.3329680651221039, "grad_norm": 2.5009140968322754, "learning_rate": 5.079993206521739e-05, "loss": 0.4949, "step": 8515 }, { "epoch": 1.3331246086412023, "grad_norm": 2.1524317264556885, "learning_rate": 5.0788043478260863e-05, "loss": 0.8091, "step": 8516 }, { "epoch": 1.3332811521603005, "grad_norm": 2.0097813606262207, "learning_rate": 5.077615489130434e-05, "loss": 0.542, "step": 8517 }, { "epoch": 1.333437695679399, "grad_norm": 2.1988441944122314, "learning_rate": 5.076426630434782e-05, "loss": 0.9707, "step": 8518 }, { "epoch": 1.3335942391984972, "grad_norm": 1.5773497819900513, "learning_rate": 5.07523777173913e-05, "loss": 0.8367, "step": 8519 }, { "epoch": 1.3337507827175954, "grad_norm": 2.450021743774414, "learning_rate": 5.0740489130434776e-05, "loss": 0.46, "step": 8520 }, { "epoch": 1.3339073262366938, "grad_norm": 1.3922719955444336, "learning_rate": 5.0728600543478254e-05, "loss": 0.4351, "step": 8521 }, { "epoch": 1.334063869755792, "grad_norm": 2.8999083042144775, "learning_rate": 5.071671195652173e-05, "loss": 0.8135, "step": 8522 }, { "epoch": 1.3342204132748905, "grad_norm": 2.396137237548828, "learning_rate": 5.070482336956522e-05, "loss": 0.7647, "step": 8523 }, { "epoch": 1.3343769567939887, "grad_norm": 4.017902374267578, "learning_rate": 5.0692934782608695e-05, "loss": 0.703, "step": 8524 }, { "epoch": 1.334533500313087, "grad_norm": 3.743021011352539, "learning_rate": 5.0681046195652174e-05, "loss": 0.9051, "step": 8525 }, { "epoch": 1.3346900438321854, "grad_norm": 5.910476207733154, "learning_rate": 5.066915760869565e-05, "loss": 0.7742, "step": 8526 }, { "epoch": 1.3348465873512836, "grad_norm": 3.4154512882232666, "learning_rate": 5.065726902173913e-05, "loss": 1.0092, "step": 8527 }, { "epoch": 1.335003130870382, "grad_norm": 4.041189193725586, "learning_rate": 5.06453804347826e-05, "loss": 1.5568, "step": 8528 }, { "epoch": 1.3351596743894802, "grad_norm": 2.5793724060058594, 
"learning_rate": 5.063349184782608e-05, "loss": 1.1453, "step": 8529 }, { "epoch": 1.3353162179085785, "grad_norm": 2.305061101913452, "learning_rate": 5.062160326086956e-05, "loss": 1.1284, "step": 8530 }, { "epoch": 1.335472761427677, "grad_norm": 2.6737518310546875, "learning_rate": 5.0609714673913036e-05, "loss": 0.9718, "step": 8531 }, { "epoch": 1.3356293049467753, "grad_norm": 2.08156156539917, "learning_rate": 5.0597826086956514e-05, "loss": 0.6255, "step": 8532 }, { "epoch": 1.3357858484658736, "grad_norm": 4.5479631423950195, "learning_rate": 5.05859375e-05, "loss": 0.6504, "step": 8533 }, { "epoch": 1.3359423919849718, "grad_norm": 2.9158835411071777, "learning_rate": 5.057404891304348e-05, "loss": 0.6329, "step": 8534 }, { "epoch": 1.33609893550407, "grad_norm": 2.8755927085876465, "learning_rate": 5.0562160326086955e-05, "loss": 0.4281, "step": 8535 }, { "epoch": 1.3362554790231684, "grad_norm": 2.406050205230713, "learning_rate": 5.055027173913043e-05, "loss": 0.6149, "step": 8536 }, { "epoch": 1.3364120225422669, "grad_norm": 1.8460521697998047, "learning_rate": 5.053838315217391e-05, "loss": 0.5869, "step": 8537 }, { "epoch": 1.336568566061365, "grad_norm": 4.162355899810791, "learning_rate": 5.052649456521739e-05, "loss": 0.9407, "step": 8538 }, { "epoch": 1.3367251095804633, "grad_norm": 0.4457916021347046, "learning_rate": 5.051460597826086e-05, "loss": 0.2162, "step": 8539 }, { "epoch": 1.3368816530995618, "grad_norm": 1.044219732284546, "learning_rate": 5.050271739130434e-05, "loss": 0.2075, "step": 8540 }, { "epoch": 1.33703819661866, "grad_norm": 0.44392460584640503, "learning_rate": 5.049082880434782e-05, "loss": 0.2513, "step": 8541 }, { "epoch": 1.3371947401377584, "grad_norm": 0.6552778482437134, "learning_rate": 5.0478940217391295e-05, "loss": 0.2926, "step": 8542 }, { "epoch": 1.3373512836568566, "grad_norm": 0.8136093020439148, "learning_rate": 5.046705163043478e-05, "loss": 0.236, "step": 8543 }, { "epoch": 1.3375078271759548, 
"grad_norm": 1.5574944019317627, "learning_rate": 5.045516304347826e-05, "loss": 0.274, "step": 8544 }, { "epoch": 1.3376643706950533, "grad_norm": 0.5940535068511963, "learning_rate": 5.044327445652174e-05, "loss": 0.1645, "step": 8545 }, { "epoch": 1.3378209142141515, "grad_norm": 1.0001596212387085, "learning_rate": 5.0431385869565215e-05, "loss": 0.2963, "step": 8546 }, { "epoch": 1.33797745773325, "grad_norm": 0.973078191280365, "learning_rate": 5.041949728260869e-05, "loss": 0.272, "step": 8547 }, { "epoch": 1.3381340012523482, "grad_norm": 0.6372198462486267, "learning_rate": 5.040760869565217e-05, "loss": 0.2269, "step": 8548 }, { "epoch": 1.3382905447714464, "grad_norm": 1.3687288761138916, "learning_rate": 5.039572010869565e-05, "loss": 0.2508, "step": 8549 }, { "epoch": 1.3384470882905448, "grad_norm": 1.3535200357437134, "learning_rate": 5.038383152173913e-05, "loss": 0.3179, "step": 8550 }, { "epoch": 1.338603631809643, "grad_norm": 1.2610455751419067, "learning_rate": 5.03719429347826e-05, "loss": 0.2772, "step": 8551 }, { "epoch": 1.3387601753287415, "grad_norm": 3.0122909545898438, "learning_rate": 5.036005434782608e-05, "loss": 0.3854, "step": 8552 }, { "epoch": 1.3389167188478397, "grad_norm": 0.6163998246192932, "learning_rate": 5.034816576086956e-05, "loss": 0.2155, "step": 8553 }, { "epoch": 1.339073262366938, "grad_norm": 1.3928186893463135, "learning_rate": 5.033627717391304e-05, "loss": 0.424, "step": 8554 }, { "epoch": 1.3392298058860364, "grad_norm": 1.034520149230957, "learning_rate": 5.032438858695652e-05, "loss": 0.4514, "step": 8555 }, { "epoch": 1.3393863494051346, "grad_norm": 1.6322546005249023, "learning_rate": 5.0312499999999996e-05, "loss": 0.4103, "step": 8556 }, { "epoch": 1.339542892924233, "grad_norm": 2.652021646499634, "learning_rate": 5.0300611413043475e-05, "loss": 0.4299, "step": 8557 }, { "epoch": 1.3396994364433312, "grad_norm": 0.8599926829338074, "learning_rate": 5.028872282608695e-05, "loss": 0.2124, "step": 8558 }, 
{ "epoch": 1.3398559799624294, "grad_norm": 1.2797852754592896, "learning_rate": 5.027683423913043e-05, "loss": 0.3076, "step": 8559 }, { "epoch": 1.3400125234815279, "grad_norm": 1.6205990314483643, "learning_rate": 5.026494565217391e-05, "loss": 0.1884, "step": 8560 }, { "epoch": 1.340169067000626, "grad_norm": 2.351538896560669, "learning_rate": 5.025305706521739e-05, "loss": 0.3411, "step": 8561 }, { "epoch": 1.3403256105197245, "grad_norm": 2.8403732776641846, "learning_rate": 5.024116847826086e-05, "loss": 0.7293, "step": 8562 }, { "epoch": 1.3404821540388228, "grad_norm": 11.527314186096191, "learning_rate": 5.0229279891304344e-05, "loss": 0.7823, "step": 8563 }, { "epoch": 1.340638697557921, "grad_norm": 1.7066071033477783, "learning_rate": 5.021739130434782e-05, "loss": 0.4597, "step": 8564 }, { "epoch": 1.3407952410770194, "grad_norm": 4.2130045890808105, "learning_rate": 5.02055027173913e-05, "loss": 0.3081, "step": 8565 }, { "epoch": 1.3409517845961179, "grad_norm": 2.5352771282196045, "learning_rate": 5.019361413043478e-05, "loss": 0.686, "step": 8566 }, { "epoch": 1.341108328115216, "grad_norm": 2.288006067276001, "learning_rate": 5.0181725543478256e-05, "loss": 0.7765, "step": 8567 }, { "epoch": 1.3412648716343143, "grad_norm": 1.2372286319732666, "learning_rate": 5.0169836956521734e-05, "loss": 0.4667, "step": 8568 }, { "epoch": 1.3414214151534125, "grad_norm": 2.1002681255340576, "learning_rate": 5.015794836956521e-05, "loss": 0.7908, "step": 8569 }, { "epoch": 1.341577958672511, "grad_norm": 13.491796493530273, "learning_rate": 5.014605978260869e-05, "loss": 0.7164, "step": 8570 }, { "epoch": 1.3417345021916094, "grad_norm": 2.1826512813568115, "learning_rate": 5.013417119565217e-05, "loss": 0.7749, "step": 8571 }, { "epoch": 1.3418910457107076, "grad_norm": 2.9365851879119873, "learning_rate": 5.0122282608695654e-05, "loss": 0.9479, "step": 8572 }, { "epoch": 1.3420475892298058, "grad_norm": 5.118083953857422, "learning_rate": 
5.011039402173913e-05, "loss": 0.5557, "step": 8573 }, { "epoch": 1.3422041327489043, "grad_norm": 6.893304347991943, "learning_rate": 5.00985054347826e-05, "loss": 0.972, "step": 8574 }, { "epoch": 1.3423606762680025, "grad_norm": 2.878908157348633, "learning_rate": 5.008661684782608e-05, "loss": 0.7854, "step": 8575 }, { "epoch": 1.342517219787101, "grad_norm": 2.75372576713562, "learning_rate": 5.007472826086956e-05, "loss": 0.8393, "step": 8576 }, { "epoch": 1.3426737633061991, "grad_norm": 2.684856653213501, "learning_rate": 5.006283967391304e-05, "loss": 0.6799, "step": 8577 }, { "epoch": 1.3428303068252974, "grad_norm": 2.407285690307617, "learning_rate": 5.0050951086956516e-05, "loss": 1.0702, "step": 8578 }, { "epoch": 1.3429868503443958, "grad_norm": 4.282900333404541, "learning_rate": 5.0039062499999994e-05, "loss": 1.2721, "step": 8579 }, { "epoch": 1.343143393863494, "grad_norm": 2.4258921146392822, "learning_rate": 5.002717391304347e-05, "loss": 1.049, "step": 8580 }, { "epoch": 1.3432999373825925, "grad_norm": 1.4255551099777222, "learning_rate": 5.001528532608695e-05, "loss": 0.444, "step": 8581 }, { "epoch": 1.3434564809016907, "grad_norm": 3.460552453994751, "learning_rate": 5.0003396739130435e-05, "loss": 0.4856, "step": 8582 }, { "epoch": 1.343613024420789, "grad_norm": 3.121530532836914, "learning_rate": 4.9991508152173913e-05, "loss": 0.7048, "step": 8583 }, { "epoch": 1.3437695679398873, "grad_norm": 1.8434067964553833, "learning_rate": 4.997961956521739e-05, "loss": 0.7541, "step": 8584 }, { "epoch": 1.3439261114589856, "grad_norm": 2.728498697280884, "learning_rate": 4.996773097826086e-05, "loss": 0.7702, "step": 8585 }, { "epoch": 1.344082654978084, "grad_norm": 4.575986862182617, "learning_rate": 4.995584239130434e-05, "loss": 1.0579, "step": 8586 }, { "epoch": 1.3442391984971822, "grad_norm": 4.359337329864502, "learning_rate": 4.994395380434782e-05, "loss": 0.9569, "step": 8587 }, { "epoch": 1.3443957420162804, "grad_norm": 
2.4282641410827637, "learning_rate": 4.99320652173913e-05, "loss": 0.6695, "step": 8588 }, { "epoch": 1.3445522855353789, "grad_norm": 0.6456189751625061, "learning_rate": 4.9920176630434776e-05, "loss": 0.1533, "step": 8589 }, { "epoch": 1.344708829054477, "grad_norm": 0.5246249437332153, "learning_rate": 4.9908288043478254e-05, "loss": 0.2019, "step": 8590 }, { "epoch": 1.3448653725735755, "grad_norm": 0.6297491192817688, "learning_rate": 4.989639945652173e-05, "loss": 0.3253, "step": 8591 }, { "epoch": 1.3450219160926737, "grad_norm": 0.626595139503479, "learning_rate": 4.988451086956522e-05, "loss": 0.1773, "step": 8592 }, { "epoch": 1.345178459611772, "grad_norm": 0.6090409755706787, "learning_rate": 4.9872622282608695e-05, "loss": 0.2833, "step": 8593 }, { "epoch": 1.3453350031308704, "grad_norm": 0.7241976857185364, "learning_rate": 4.986073369565217e-05, "loss": 0.3041, "step": 8594 }, { "epoch": 1.3454915466499686, "grad_norm": 0.6269464492797852, "learning_rate": 4.984884510869565e-05, "loss": 0.2344, "step": 8595 }, { "epoch": 1.345648090169067, "grad_norm": 0.7456495761871338, "learning_rate": 4.983695652173913e-05, "loss": 0.1457, "step": 8596 }, { "epoch": 1.3458046336881653, "grad_norm": 1.5554322004318237, "learning_rate": 4.98250679347826e-05, "loss": 0.2715, "step": 8597 }, { "epoch": 1.3459611772072635, "grad_norm": 0.9812779426574707, "learning_rate": 4.981317934782608e-05, "loss": 0.3717, "step": 8598 }, { "epoch": 1.346117720726362, "grad_norm": 1.1595038175582886, "learning_rate": 4.980129076086956e-05, "loss": 0.3335, "step": 8599 }, { "epoch": 1.3462742642454604, "grad_norm": 0.94212406873703, "learning_rate": 4.9789402173913035e-05, "loss": 0.4045, "step": 8600 }, { "epoch": 1.3464308077645586, "grad_norm": 2.0375545024871826, "learning_rate": 4.9777513586956513e-05, "loss": 0.4936, "step": 8601 }, { "epoch": 1.3465873512836568, "grad_norm": 1.4357929229736328, "learning_rate": 4.9765625e-05, "loss": 0.3496, "step": 8602 }, { "epoch": 
1.3467438948027552, "grad_norm": 0.749093234539032, "learning_rate": 4.9753736413043477e-05, "loss": 0.2137, "step": 8603 }, { "epoch": 1.3469004383218535, "grad_norm": 0.8773446679115295, "learning_rate": 4.9741847826086955e-05, "loss": 0.231, "step": 8604 }, { "epoch": 1.347056981840952, "grad_norm": 1.283036231994629, "learning_rate": 4.972995923913043e-05, "loss": 0.4123, "step": 8605 }, { "epoch": 1.3472135253600501, "grad_norm": 2.184586763381958, "learning_rate": 4.971807065217391e-05, "loss": 0.4888, "step": 8606 }, { "epoch": 1.3473700688791483, "grad_norm": 2.141216516494751, "learning_rate": 4.970618206521739e-05, "loss": 0.5002, "step": 8607 }, { "epoch": 1.3475266123982468, "grad_norm": 2.1967077255249023, "learning_rate": 4.969429347826086e-05, "loss": 0.499, "step": 8608 }, { "epoch": 1.347683155917345, "grad_norm": 2.2561442852020264, "learning_rate": 4.968240489130434e-05, "loss": 0.4401, "step": 8609 }, { "epoch": 1.3478396994364434, "grad_norm": 1.4856367111206055, "learning_rate": 4.967051630434782e-05, "loss": 0.5224, "step": 8610 }, { "epoch": 1.3479962429555417, "grad_norm": 1.0798144340515137, "learning_rate": 4.9658627717391295e-05, "loss": 0.3196, "step": 8611 }, { "epoch": 1.3481527864746399, "grad_norm": 1.7785851955413818, "learning_rate": 4.964673913043478e-05, "loss": 0.5054, "step": 8612 }, { "epoch": 1.3483093299937383, "grad_norm": 1.7254629135131836, "learning_rate": 4.963485054347826e-05, "loss": 0.6675, "step": 8613 }, { "epoch": 1.3484658735128365, "grad_norm": 1.9740592241287231, "learning_rate": 4.9622961956521736e-05, "loss": 0.7882, "step": 8614 }, { "epoch": 1.348622417031935, "grad_norm": 2.3787004947662354, "learning_rate": 4.9611073369565214e-05, "loss": 0.4837, "step": 8615 }, { "epoch": 1.3487789605510332, "grad_norm": 1.9634076356887817, "learning_rate": 4.959918478260869e-05, "loss": 0.6278, "step": 8616 }, { "epoch": 1.3489355040701314, "grad_norm": 1.9210368394851685, "learning_rate": 4.958729619565217e-05, 
"loss": 0.7986, "step": 8617 }, { "epoch": 1.3490920475892298, "grad_norm": 2.035060167312622, "learning_rate": 4.957540760869565e-05, "loss": 0.7955, "step": 8618 }, { "epoch": 1.349248591108328, "grad_norm": 2.335350513458252, "learning_rate": 4.956351902173913e-05, "loss": 0.5041, "step": 8619 }, { "epoch": 1.3494051346274265, "grad_norm": 3.0201127529144287, "learning_rate": 4.95516304347826e-05, "loss": 0.7001, "step": 8620 }, { "epoch": 1.3495616781465247, "grad_norm": 4.052512168884277, "learning_rate": 4.9539741847826077e-05, "loss": 0.7998, "step": 8621 }, { "epoch": 1.349718221665623, "grad_norm": 5.09596061706543, "learning_rate": 4.952785326086956e-05, "loss": 0.6588, "step": 8622 }, { "epoch": 1.3498747651847214, "grad_norm": 4.3294901847839355, "learning_rate": 4.951596467391304e-05, "loss": 1.0207, "step": 8623 }, { "epoch": 1.3500313087038196, "grad_norm": 1.5455697774887085, "learning_rate": 4.950407608695652e-05, "loss": 0.3773, "step": 8624 }, { "epoch": 1.350187852222918, "grad_norm": 1.9684433937072754, "learning_rate": 4.9492187499999996e-05, "loss": 0.7671, "step": 8625 }, { "epoch": 1.3503443957420163, "grad_norm": 2.690061092376709, "learning_rate": 4.9480298913043474e-05, "loss": 1.1832, "step": 8626 }, { "epoch": 1.3505009392611145, "grad_norm": 2.381819725036621, "learning_rate": 4.946841032608695e-05, "loss": 0.8848, "step": 8627 }, { "epoch": 1.350657482780213, "grad_norm": 2.108623504638672, "learning_rate": 4.945652173913043e-05, "loss": 0.8848, "step": 8628 }, { "epoch": 1.3508140262993111, "grad_norm": 4.4010491371154785, "learning_rate": 4.944463315217391e-05, "loss": 1.016, "step": 8629 }, { "epoch": 1.3509705698184096, "grad_norm": 2.3561835289001465, "learning_rate": 4.9432744565217394e-05, "loss": 0.7051, "step": 8630 }, { "epoch": 1.3511271133375078, "grad_norm": 2.0222690105438232, "learning_rate": 4.942085597826086e-05, "loss": 1.0447, "step": 8631 }, { "epoch": 1.351283656856606, "grad_norm": 3.0459201335906982, 
"learning_rate": 4.940896739130434e-05, "loss": 0.3514, "step": 8632 }, { "epoch": 1.3514402003757044, "grad_norm": 4.495997428894043, "learning_rate": 4.939707880434782e-05, "loss": 0.9935, "step": 8633 }, { "epoch": 1.3515967438948029, "grad_norm": 3.1264560222625732, "learning_rate": 4.93851902173913e-05, "loss": 1.2197, "step": 8634 }, { "epoch": 1.351753287413901, "grad_norm": 1.8845082521438599, "learning_rate": 4.937330163043478e-05, "loss": 0.8607, "step": 8635 }, { "epoch": 1.3519098309329993, "grad_norm": 2.785404920578003, "learning_rate": 4.9361413043478256e-05, "loss": 1.1638, "step": 8636 }, { "epoch": 1.3520663744520978, "grad_norm": 2.698533058166504, "learning_rate": 4.9349524456521734e-05, "loss": 0.6935, "step": 8637 }, { "epoch": 1.352222917971196, "grad_norm": 4.161587238311768, "learning_rate": 4.933763586956521e-05, "loss": 1.1686, "step": 8638 }, { "epoch": 1.3523794614902944, "grad_norm": 0.5210708975791931, "learning_rate": 4.932574728260869e-05, "loss": 0.2137, "step": 8639 }, { "epoch": 1.3525360050093926, "grad_norm": 0.37963104248046875, "learning_rate": 4.9313858695652175e-05, "loss": 0.2092, "step": 8640 }, { "epoch": 1.3526925485284909, "grad_norm": 1.0546318292617798, "learning_rate": 4.930197010869565e-05, "loss": 0.2879, "step": 8641 }, { "epoch": 1.3528490920475893, "grad_norm": 0.5694555640220642, "learning_rate": 4.929008152173913e-05, "loss": 0.2743, "step": 8642 }, { "epoch": 1.3530056355666875, "grad_norm": 0.4267643392086029, "learning_rate": 4.92781929347826e-05, "loss": 0.1442, "step": 8643 }, { "epoch": 1.353162179085786, "grad_norm": 0.8502418994903564, "learning_rate": 4.926630434782608e-05, "loss": 0.3263, "step": 8644 }, { "epoch": 1.3533187226048842, "grad_norm": 1.0681360960006714, "learning_rate": 4.925441576086956e-05, "loss": 0.2348, "step": 8645 }, { "epoch": 1.3534752661239824, "grad_norm": 0.5974439382553101, "learning_rate": 4.924252717391304e-05, "loss": 0.1968, "step": 8646 }, { "epoch": 
1.3536318096430808, "grad_norm": 0.7220355272293091, "learning_rate": 4.9230638586956515e-05, "loss": 0.3133, "step": 8647 }, { "epoch": 1.353788353162179, "grad_norm": 1.0907554626464844, "learning_rate": 4.9218749999999994e-05, "loss": 0.2953, "step": 8648 }, { "epoch": 1.3539448966812775, "grad_norm": 0.7381885051727295, "learning_rate": 4.920686141304347e-05, "loss": 0.2287, "step": 8649 }, { "epoch": 1.3541014402003757, "grad_norm": 1.1283825635910034, "learning_rate": 4.919497282608696e-05, "loss": 0.2835, "step": 8650 }, { "epoch": 1.354257983719474, "grad_norm": 0.6118396520614624, "learning_rate": 4.9183084239130435e-05, "loss": 0.1317, "step": 8651 }, { "epoch": 1.3544145272385724, "grad_norm": 1.12844717502594, "learning_rate": 4.917119565217391e-05, "loss": 0.3316, "step": 8652 }, { "epoch": 1.3545710707576706, "grad_norm": 1.149000883102417, "learning_rate": 4.915930706521739e-05, "loss": 0.2801, "step": 8653 }, { "epoch": 1.354727614276769, "grad_norm": 1.6531001329421997, "learning_rate": 4.914741847826086e-05, "loss": 0.2701, "step": 8654 }, { "epoch": 1.3548841577958672, "grad_norm": 2.4448797702789307, "learning_rate": 4.913552989130434e-05, "loss": 0.459, "step": 8655 }, { "epoch": 1.3550407013149655, "grad_norm": 0.9598848819732666, "learning_rate": 4.912364130434782e-05, "loss": 0.4595, "step": 8656 }, { "epoch": 1.355197244834064, "grad_norm": 1.4934172630310059, "learning_rate": 4.91117527173913e-05, "loss": 0.3371, "step": 8657 }, { "epoch": 1.3553537883531621, "grad_norm": 1.4942678213119507, "learning_rate": 4.9099864130434775e-05, "loss": 0.2646, "step": 8658 }, { "epoch": 1.3555103318722606, "grad_norm": 1.3748384714126587, "learning_rate": 4.908797554347825e-05, "loss": 0.4324, "step": 8659 }, { "epoch": 1.3556668753913588, "grad_norm": 2.7945456504821777, "learning_rate": 4.907608695652174e-05, "loss": 0.3209, "step": 8660 }, { "epoch": 1.355823418910457, "grad_norm": 1.6301156282424927, "learning_rate": 4.9064198369565216e-05, "loss": 
0.5839, "step": 8661 }, { "epoch": 1.3559799624295554, "grad_norm": 3.2983603477478027, "learning_rate": 4.9052309782608695e-05, "loss": 0.8457, "step": 8662 }, { "epoch": 1.3561365059486536, "grad_norm": 2.498201370239258, "learning_rate": 4.904042119565217e-05, "loss": 0.3191, "step": 8663 }, { "epoch": 1.356293049467752, "grad_norm": 1.3859524726867676, "learning_rate": 4.902853260869565e-05, "loss": 0.4674, "step": 8664 }, { "epoch": 1.3564495929868503, "grad_norm": 2.400277853012085, "learning_rate": 4.901664402173913e-05, "loss": 0.4533, "step": 8665 }, { "epoch": 1.3566061365059485, "grad_norm": 1.8757410049438477, "learning_rate": 4.90047554347826e-05, "loss": 0.782, "step": 8666 }, { "epoch": 1.356762680025047, "grad_norm": 3.4600799083709717, "learning_rate": 4.899286684782608e-05, "loss": 0.3981, "step": 8667 }, { "epoch": 1.3569192235441454, "grad_norm": 2.6359457969665527, "learning_rate": 4.898097826086956e-05, "loss": 0.524, "step": 8668 }, { "epoch": 1.3570757670632436, "grad_norm": 2.744239091873169, "learning_rate": 4.8969089673913035e-05, "loss": 1.0342, "step": 8669 }, { "epoch": 1.3572323105823418, "grad_norm": 2.569261312484741, "learning_rate": 4.895720108695652e-05, "loss": 0.4895, "step": 8670 }, { "epoch": 1.3573888541014403, "grad_norm": 2.2081995010375977, "learning_rate": 4.89453125e-05, "loss": 0.4297, "step": 8671 }, { "epoch": 1.3575453976205385, "grad_norm": 2.5700435638427734, "learning_rate": 4.8933423913043476e-05, "loss": 0.9911, "step": 8672 }, { "epoch": 1.357701941139637, "grad_norm": 2.462186336517334, "learning_rate": 4.8921535326086954e-05, "loss": 0.9278, "step": 8673 }, { "epoch": 1.3578584846587352, "grad_norm": 3.0337209701538086, "learning_rate": 4.890964673913043e-05, "loss": 0.6607, "step": 8674 }, { "epoch": 1.3580150281778334, "grad_norm": 3.5184214115142822, "learning_rate": 4.889775815217391e-05, "loss": 0.6419, "step": 8675 }, { "epoch": 1.3581715716969318, "grad_norm": 3.734079599380493, "learning_rate": 
4.888586956521739e-05, "loss": 1.8048, "step": 8676 }, { "epoch": 1.35832811521603, "grad_norm": 2.142179012298584, "learning_rate": 4.887398097826086e-05, "loss": 0.8972, "step": 8677 }, { "epoch": 1.3584846587351285, "grad_norm": 4.386713981628418, "learning_rate": 4.886209239130434e-05, "loss": 0.7525, "step": 8678 }, { "epoch": 1.3586412022542267, "grad_norm": 8.351215362548828, "learning_rate": 4.8850203804347816e-05, "loss": 1.4295, "step": 8679 }, { "epoch": 1.358797745773325, "grad_norm": 2.7755188941955566, "learning_rate": 4.88383152173913e-05, "loss": 0.7048, "step": 8680 }, { "epoch": 1.3589542892924233, "grad_norm": 1.8844479322433472, "learning_rate": 4.882642663043478e-05, "loss": 0.6207, "step": 8681 }, { "epoch": 1.3591108328115216, "grad_norm": 4.859302997589111, "learning_rate": 4.881453804347826e-05, "loss": 1.2541, "step": 8682 }, { "epoch": 1.35926737633062, "grad_norm": 1.9193274974822998, "learning_rate": 4.8802649456521736e-05, "loss": 0.6833, "step": 8683 }, { "epoch": 1.3594239198497182, "grad_norm": 2.7645938396453857, "learning_rate": 4.8790760869565214e-05, "loss": 0.6734, "step": 8684 }, { "epoch": 1.3595804633688164, "grad_norm": 1.7352386713027954, "learning_rate": 4.877887228260869e-05, "loss": 0.4098, "step": 8685 }, { "epoch": 1.3597370068879149, "grad_norm": 6.241868019104004, "learning_rate": 4.876698369565217e-05, "loss": 0.666, "step": 8686 }, { "epoch": 1.359893550407013, "grad_norm": 1.9919891357421875, "learning_rate": 4.875509510869565e-05, "loss": 0.5908, "step": 8687 }, { "epoch": 1.3600500939261115, "grad_norm": 6.00626802444458, "learning_rate": 4.8743206521739127e-05, "loss": 1.3677, "step": 8688 }, { "epoch": 1.3602066374452098, "grad_norm": 0.5872572660446167, "learning_rate": 4.87313179347826e-05, "loss": 0.2813, "step": 8689 }, { "epoch": 1.360363180964308, "grad_norm": 0.4304451644420624, "learning_rate": 4.871942934782608e-05, "loss": 0.1646, "step": 8690 }, { "epoch": 1.3605197244834064, "grad_norm": 
0.5935004353523254, "learning_rate": 4.870754076086956e-05, "loss": 0.1977, "step": 8691 }, { "epoch": 1.3606762680025046, "grad_norm": 0.6166180968284607, "learning_rate": 4.869565217391304e-05, "loss": 0.2291, "step": 8692 }, { "epoch": 1.360832811521603, "grad_norm": 0.5003758668899536, "learning_rate": 4.868376358695652e-05, "loss": 0.2163, "step": 8693 }, { "epoch": 1.3609893550407013, "grad_norm": 0.5122238993644714, "learning_rate": 4.8671874999999996e-05, "loss": 0.2152, "step": 8694 }, { "epoch": 1.3611458985597995, "grad_norm": 1.142266869544983, "learning_rate": 4.8659986413043474e-05, "loss": 0.317, "step": 8695 }, { "epoch": 1.361302442078898, "grad_norm": 1.2222654819488525, "learning_rate": 4.864809782608695e-05, "loss": 0.2665, "step": 8696 }, { "epoch": 1.3614589855979962, "grad_norm": 1.2723073959350586, "learning_rate": 4.863620923913043e-05, "loss": 0.2151, "step": 8697 }, { "epoch": 1.3616155291170946, "grad_norm": 0.9143516421318054, "learning_rate": 4.862432065217391e-05, "loss": 0.2735, "step": 8698 }, { "epoch": 1.3617720726361928, "grad_norm": 0.9932056069374084, "learning_rate": 4.861243206521739e-05, "loss": 0.1986, "step": 8699 }, { "epoch": 1.361928616155291, "grad_norm": 0.8369459509849548, "learning_rate": 4.8600543478260864e-05, "loss": 0.3589, "step": 8700 }, { "epoch": 1.3620851596743895, "grad_norm": 0.903293251991272, "learning_rate": 4.858865489130434e-05, "loss": 0.2879, "step": 8701 }, { "epoch": 1.362241703193488, "grad_norm": 1.3786612749099731, "learning_rate": 4.857676630434782e-05, "loss": 0.4878, "step": 8702 }, { "epoch": 1.3623982467125861, "grad_norm": 1.3944262266159058, "learning_rate": 4.85648777173913e-05, "loss": 0.2984, "step": 8703 }, { "epoch": 1.3625547902316844, "grad_norm": 0.8816447257995605, "learning_rate": 4.855298913043478e-05, "loss": 0.274, "step": 8704 }, { "epoch": 1.3627113337507828, "grad_norm": 1.0275369882583618, "learning_rate": 4.8541100543478255e-05, "loss": 0.5238, "step": 8705 }, { 
"epoch": 1.362867877269881, "grad_norm": 1.545226812362671, "learning_rate": 4.8529211956521733e-05, "loss": 0.4178, "step": 8706 }, { "epoch": 1.3630244207889795, "grad_norm": 1.16740083694458, "learning_rate": 4.851732336956521e-05, "loss": 0.3406, "step": 8707 }, { "epoch": 1.3631809643080777, "grad_norm": 2.4351212978363037, "learning_rate": 4.850543478260869e-05, "loss": 0.4338, "step": 8708 }, { "epoch": 1.3633375078271759, "grad_norm": 1.9687472581863403, "learning_rate": 4.8493546195652175e-05, "loss": 0.5161, "step": 8709 }, { "epoch": 1.3634940513462743, "grad_norm": 2.517753839492798, "learning_rate": 4.848165760869565e-05, "loss": 0.7938, "step": 8710 }, { "epoch": 1.3636505948653725, "grad_norm": 1.6738455295562744, "learning_rate": 4.846976902173913e-05, "loss": 0.4026, "step": 8711 }, { "epoch": 1.363807138384471, "grad_norm": 2.2848284244537354, "learning_rate": 4.84578804347826e-05, "loss": 0.4245, "step": 8712 }, { "epoch": 1.3639636819035692, "grad_norm": 1.0400307178497314, "learning_rate": 4.844599184782608e-05, "loss": 0.2444, "step": 8713 }, { "epoch": 1.3641202254226674, "grad_norm": 1.2128838300704956, "learning_rate": 4.843410326086956e-05, "loss": 0.2879, "step": 8714 }, { "epoch": 1.3642767689417659, "grad_norm": 8.327973365783691, "learning_rate": 4.842221467391304e-05, "loss": 0.7146, "step": 8715 }, { "epoch": 1.364433312460864, "grad_norm": 3.5344290733337402, "learning_rate": 4.8410326086956515e-05, "loss": 0.8583, "step": 8716 }, { "epoch": 1.3645898559799625, "grad_norm": 3.2082934379577637, "learning_rate": 4.839843749999999e-05, "loss": 0.9918, "step": 8717 }, { "epoch": 1.3647463994990607, "grad_norm": 3.3535854816436768, "learning_rate": 4.838654891304348e-05, "loss": 0.6923, "step": 8718 }, { "epoch": 1.364902943018159, "grad_norm": 2.2578372955322266, "learning_rate": 4.8374660326086956e-05, "loss": 0.8323, "step": 8719 }, { "epoch": 1.3650594865372574, "grad_norm": 2.294664144515991, "learning_rate": 4.8362771739130434e-05, 
"loss": 0.7592, "step": 8720 }, { "epoch": 1.3652160300563556, "grad_norm": 3.369382381439209, "learning_rate": 4.835088315217391e-05, "loss": 0.837, "step": 8721 }, { "epoch": 1.365372573575454, "grad_norm": 2.0825917720794678, "learning_rate": 4.833899456521739e-05, "loss": 0.5553, "step": 8722 }, { "epoch": 1.3655291170945523, "grad_norm": 2.8305745124816895, "learning_rate": 4.832710597826086e-05, "loss": 0.6206, "step": 8723 }, { "epoch": 1.3656856606136505, "grad_norm": 2.9153544902801514, "learning_rate": 4.831521739130434e-05, "loss": 0.64, "step": 8724 }, { "epoch": 1.365842204132749, "grad_norm": 1.8254534006118774, "learning_rate": 4.830332880434782e-05, "loss": 0.9314, "step": 8725 }, { "epoch": 1.3659987476518471, "grad_norm": 1.8859654664993286, "learning_rate": 4.8291440217391297e-05, "loss": 0.7267, "step": 8726 }, { "epoch": 1.3661552911709456, "grad_norm": 3.3739144802093506, "learning_rate": 4.8279551630434775e-05, "loss": 0.8598, "step": 8727 }, { "epoch": 1.3663118346900438, "grad_norm": 3.309739589691162, "learning_rate": 4.826766304347826e-05, "loss": 0.6414, "step": 8728 }, { "epoch": 1.366468378209142, "grad_norm": 1.6394017934799194, "learning_rate": 4.825577445652174e-05, "loss": 0.4465, "step": 8729 }, { "epoch": 1.3666249217282405, "grad_norm": 4.029904365539551, "learning_rate": 4.8243885869565216e-05, "loss": 0.7959, "step": 8730 }, { "epoch": 1.3667814652473387, "grad_norm": 2.6488397121429443, "learning_rate": 4.8231997282608694e-05, "loss": 0.998, "step": 8731 }, { "epoch": 1.3669380087664371, "grad_norm": 1.7307370901107788, "learning_rate": 4.822010869565217e-05, "loss": 0.9784, "step": 8732 }, { "epoch": 1.3670945522855353, "grad_norm": 2.6860408782958984, "learning_rate": 4.820822010869565e-05, "loss": 0.5497, "step": 8733 }, { "epoch": 1.3672510958046336, "grad_norm": 3.810856580734253, "learning_rate": 4.819633152173913e-05, "loss": 1.1279, "step": 8734 }, { "epoch": 1.367407639323732, "grad_norm": 2.242473602294922, 
"learning_rate": 4.81844429347826e-05, "loss": 0.527, "step": 8735 }, { "epoch": 1.3675641828428304, "grad_norm": 1.6219332218170166, "learning_rate": 4.817255434782608e-05, "loss": 0.5787, "step": 8736 }, { "epoch": 1.3677207263619287, "grad_norm": 2.5285274982452393, "learning_rate": 4.8160665760869556e-05, "loss": 0.6722, "step": 8737 }, { "epoch": 1.3678772698810269, "grad_norm": 1.7520694732666016, "learning_rate": 4.814877717391304e-05, "loss": 0.5506, "step": 8738 }, { "epoch": 1.3680338134001253, "grad_norm": 0.6055160760879517, "learning_rate": 4.813688858695652e-05, "loss": 0.2448, "step": 8739 }, { "epoch": 1.3681903569192235, "grad_norm": 0.9453651905059814, "learning_rate": 4.8125e-05, "loss": 0.1889, "step": 8740 }, { "epoch": 1.368346900438322, "grad_norm": 0.6761276721954346, "learning_rate": 4.8113111413043476e-05, "loss": 0.1825, "step": 8741 }, { "epoch": 1.3685034439574202, "grad_norm": 0.38524511456489563, "learning_rate": 4.8101222826086954e-05, "loss": 0.1308, "step": 8742 }, { "epoch": 1.3686599874765184, "grad_norm": 0.8253977298736572, "learning_rate": 4.808933423913043e-05, "loss": 0.2153, "step": 8743 }, { "epoch": 1.3688165309956168, "grad_norm": 1.4076637029647827, "learning_rate": 4.807744565217391e-05, "loss": 0.4627, "step": 8744 }, { "epoch": 1.368973074514715, "grad_norm": 0.6993924379348755, "learning_rate": 4.806555706521739e-05, "loss": 0.2843, "step": 8745 }, { "epoch": 1.3691296180338135, "grad_norm": 0.8752833604812622, "learning_rate": 4.805366847826086e-05, "loss": 0.1997, "step": 8746 }, { "epoch": 1.3692861615529117, "grad_norm": 0.7385271191596985, "learning_rate": 4.804177989130434e-05, "loss": 0.2105, "step": 8747 }, { "epoch": 1.36944270507201, "grad_norm": 2.661916732788086, "learning_rate": 4.802989130434782e-05, "loss": 0.3312, "step": 8748 }, { "epoch": 1.3695992485911084, "grad_norm": 2.0270755290985107, "learning_rate": 4.80180027173913e-05, "loss": 0.2487, "step": 8749 }, { "epoch": 1.3697557921102066, 
"grad_norm": 0.8722821474075317, "learning_rate": 4.800611413043478e-05, "loss": 0.2966, "step": 8750 }, { "epoch": 1.369912335629305, "grad_norm": 4.276497840881348, "learning_rate": 4.799422554347826e-05, "loss": 0.7486, "step": 8751 }, { "epoch": 1.3700688791484033, "grad_norm": 2.0496444702148438, "learning_rate": 4.7982336956521735e-05, "loss": 0.3229, "step": 8752 }, { "epoch": 1.3702254226675015, "grad_norm": 1.157025933265686, "learning_rate": 4.7970448369565214e-05, "loss": 0.2856, "step": 8753 }, { "epoch": 1.3703819661866, "grad_norm": 0.6627770662307739, "learning_rate": 4.795855978260869e-05, "loss": 0.1153, "step": 8754 }, { "epoch": 1.3705385097056981, "grad_norm": 1.1030654907226562, "learning_rate": 4.794667119565217e-05, "loss": 0.3034, "step": 8755 }, { "epoch": 1.3706950532247966, "grad_norm": 1.213293194770813, "learning_rate": 4.793478260869565e-05, "loss": 0.2638, "step": 8756 }, { "epoch": 1.3708515967438948, "grad_norm": 1.4015768766403198, "learning_rate": 4.792289402173913e-05, "loss": 0.5609, "step": 8757 }, { "epoch": 1.371008140262993, "grad_norm": 1.3662710189819336, "learning_rate": 4.7911005434782604e-05, "loss": 0.3193, "step": 8758 }, { "epoch": 1.3711646837820914, "grad_norm": 1.7370175123214722, "learning_rate": 4.789911684782608e-05, "loss": 0.5036, "step": 8759 }, { "epoch": 1.3713212273011897, "grad_norm": 1.732642650604248, "learning_rate": 4.788722826086956e-05, "loss": 0.3508, "step": 8760 }, { "epoch": 1.371477770820288, "grad_norm": 1.4106186628341675, "learning_rate": 4.787533967391304e-05, "loss": 0.5246, "step": 8761 }, { "epoch": 1.3716343143393863, "grad_norm": 4.645236492156982, "learning_rate": 4.786345108695652e-05, "loss": 0.788, "step": 8762 }, { "epoch": 1.3717908578584845, "grad_norm": 2.6606578826904297, "learning_rate": 4.7851562499999995e-05, "loss": 0.3595, "step": 8763 }, { "epoch": 1.371947401377583, "grad_norm": 3.280275583267212, "learning_rate": 4.783967391304347e-05, "loss": 0.652, "step": 8764 }, { 
"epoch": 1.3721039448966814, "grad_norm": 1.8190118074417114, "learning_rate": 4.782778532608695e-05, "loss": 0.4737, "step": 8765 }, { "epoch": 1.3722604884157796, "grad_norm": 2.06160044670105, "learning_rate": 4.781589673913043e-05, "loss": 0.5167, "step": 8766 }, { "epoch": 1.3724170319348779, "grad_norm": 3.3053104877471924, "learning_rate": 4.7804008152173914e-05, "loss": 0.8407, "step": 8767 }, { "epoch": 1.372573575453976, "grad_norm": 1.2202759981155396, "learning_rate": 4.779211956521739e-05, "loss": 0.4258, "step": 8768 }, { "epoch": 1.3727301189730745, "grad_norm": 2.3172800540924072, "learning_rate": 4.7780230978260864e-05, "loss": 0.6653, "step": 8769 }, { "epoch": 1.372886662492173, "grad_norm": 4.468164443969727, "learning_rate": 4.776834239130434e-05, "loss": 0.9025, "step": 8770 }, { "epoch": 1.3730432060112712, "grad_norm": 1.9074190855026245, "learning_rate": 4.775645380434782e-05, "loss": 0.4159, "step": 8771 }, { "epoch": 1.3731997495303694, "grad_norm": 2.6679656505584717, "learning_rate": 4.77445652173913e-05, "loss": 0.7883, "step": 8772 }, { "epoch": 1.3733562930494678, "grad_norm": 4.146814346313477, "learning_rate": 4.773267663043478e-05, "loss": 0.7009, "step": 8773 }, { "epoch": 1.373512836568566, "grad_norm": 3.1173927783966064, "learning_rate": 4.7720788043478255e-05, "loss": 1.4635, "step": 8774 }, { "epoch": 1.3736693800876645, "grad_norm": 2.3988089561462402, "learning_rate": 4.770889945652173e-05, "loss": 1.0486, "step": 8775 }, { "epoch": 1.3738259236067627, "grad_norm": 5.287840843200684, "learning_rate": 4.769701086956521e-05, "loss": 1.3262, "step": 8776 }, { "epoch": 1.373982467125861, "grad_norm": 3.4796838760375977, "learning_rate": 4.7685122282608696e-05, "loss": 1.2762, "step": 8777 }, { "epoch": 1.3741390106449594, "grad_norm": 8.433894157409668, "learning_rate": 4.7673233695652174e-05, "loss": 0.7934, "step": 8778 }, { "epoch": 1.3742955541640576, "grad_norm": 2.9560091495513916, "learning_rate": 4.766134510869565e-05, 
"loss": 0.9523, "step": 8779 }, { "epoch": 1.374452097683156, "grad_norm": 3.2737491130828857, "learning_rate": 4.764945652173913e-05, "loss": 1.3024, "step": 8780 }, { "epoch": 1.3746086412022542, "grad_norm": 2.594682455062866, "learning_rate": 4.76375679347826e-05, "loss": 0.9975, "step": 8781 }, { "epoch": 1.3747651847213525, "grad_norm": 1.7206547260284424, "learning_rate": 4.762567934782608e-05, "loss": 0.7006, "step": 8782 }, { "epoch": 1.374921728240451, "grad_norm": 1.7586252689361572, "learning_rate": 4.761379076086956e-05, "loss": 0.6199, "step": 8783 }, { "epoch": 1.375078271759549, "grad_norm": 11.85706901550293, "learning_rate": 4.7601902173913036e-05, "loss": 0.8527, "step": 8784 }, { "epoch": 1.3752348152786475, "grad_norm": 1.4902894496917725, "learning_rate": 4.7590013586956514e-05, "loss": 0.3459, "step": 8785 }, { "epoch": 1.3753913587977458, "grad_norm": 2.7001841068267822, "learning_rate": 4.757812499999999e-05, "loss": 0.4534, "step": 8786 }, { "epoch": 1.375547902316844, "grad_norm": 3.8775205612182617, "learning_rate": 4.756623641304348e-05, "loss": 1.0295, "step": 8787 }, { "epoch": 1.3757044458359424, "grad_norm": 3.983433246612549, "learning_rate": 4.7554347826086956e-05, "loss": 0.9723, "step": 8788 }, { "epoch": 1.3758609893550406, "grad_norm": 0.6987612843513489, "learning_rate": 4.7542459239130434e-05, "loss": 0.2856, "step": 8789 }, { "epoch": 1.376017532874139, "grad_norm": 0.5122056007385254, "learning_rate": 4.753057065217391e-05, "loss": 0.2717, "step": 8790 }, { "epoch": 1.3761740763932373, "grad_norm": 0.9952983260154724, "learning_rate": 4.751868206521739e-05, "loss": 0.2602, "step": 8791 }, { "epoch": 1.3763306199123355, "grad_norm": 0.558459997177124, "learning_rate": 4.750679347826086e-05, "loss": 0.1748, "step": 8792 }, { "epoch": 1.376487163431434, "grad_norm": 0.5057030320167542, "learning_rate": 4.749490489130434e-05, "loss": 0.2667, "step": 8793 }, { "epoch": 1.3766437069505322, "grad_norm": 0.9130978584289551, 
"learning_rate": 4.748301630434782e-05, "loss": 0.1892, "step": 8794 }, { "epoch": 1.3768002504696306, "grad_norm": 0.7944409251213074, "learning_rate": 4.7471127717391296e-05, "loss": 0.2265, "step": 8795 }, { "epoch": 1.3769567939887288, "grad_norm": 0.90545254945755, "learning_rate": 4.7459239130434774e-05, "loss": 0.2723, "step": 8796 }, { "epoch": 1.377113337507827, "grad_norm": 4.420419692993164, "learning_rate": 4.744735054347826e-05, "loss": 0.8294, "step": 8797 }, { "epoch": 1.3772698810269255, "grad_norm": 0.923164427280426, "learning_rate": 4.743546195652174e-05, "loss": 0.3376, "step": 8798 }, { "epoch": 1.377426424546024, "grad_norm": 2.7344155311584473, "learning_rate": 4.7423573369565215e-05, "loss": 0.3336, "step": 8799 }, { "epoch": 1.3775829680651221, "grad_norm": 2.279280185699463, "learning_rate": 4.7411684782608694e-05, "loss": 0.3818, "step": 8800 }, { "epoch": 1.3777395115842204, "grad_norm": 0.7616026401519775, "learning_rate": 4.739979619565217e-05, "loss": 0.2432, "step": 8801 }, { "epoch": 1.3778960551033186, "grad_norm": 1.258264422416687, "learning_rate": 4.738790760869565e-05, "loss": 0.4352, "step": 8802 }, { "epoch": 1.378052598622417, "grad_norm": 1.5136786699295044, "learning_rate": 4.737601902173913e-05, "loss": 0.4512, "step": 8803 }, { "epoch": 1.3782091421415155, "grad_norm": 0.9353893399238586, "learning_rate": 4.73641304347826e-05, "loss": 0.2968, "step": 8804 }, { "epoch": 1.3783656856606137, "grad_norm": 2.388278007507324, "learning_rate": 4.735224184782608e-05, "loss": 0.3998, "step": 8805 }, { "epoch": 1.378522229179712, "grad_norm": 1.104280948638916, "learning_rate": 4.7340353260869556e-05, "loss": 0.2924, "step": 8806 }, { "epoch": 1.3786787726988103, "grad_norm": 2.2602720260620117, "learning_rate": 4.732846467391304e-05, "loss": 0.3239, "step": 8807 }, { "epoch": 1.3788353162179086, "grad_norm": 2.148451805114746, "learning_rate": 4.731657608695652e-05, "loss": 0.431, "step": 8808 }, { "epoch": 1.378991859737007, 
"grad_norm": 1.1847995519638062, "learning_rate": 4.73046875e-05, "loss": 0.2778, "step": 8809 }, { "epoch": 1.3791484032561052, "grad_norm": 2.5348432064056396, "learning_rate": 4.7292798913043475e-05, "loss": 0.6912, "step": 8810 }, { "epoch": 1.3793049467752034, "grad_norm": 0.9248403310775757, "learning_rate": 4.728091032608695e-05, "loss": 0.3706, "step": 8811 }, { "epoch": 1.3794614902943019, "grad_norm": 1.6044185161590576, "learning_rate": 4.726902173913043e-05, "loss": 0.2447, "step": 8812 }, { "epoch": 1.3796180338134, "grad_norm": 1.6274502277374268, "learning_rate": 4.725713315217391e-05, "loss": 0.4503, "step": 8813 }, { "epoch": 1.3797745773324985, "grad_norm": 1.5141072273254395, "learning_rate": 4.724524456521739e-05, "loss": 0.4545, "step": 8814 }, { "epoch": 1.3799311208515967, "grad_norm": 3.33455753326416, "learning_rate": 4.723335597826086e-05, "loss": 0.8671, "step": 8815 }, { "epoch": 1.380087664370695, "grad_norm": 2.2365808486938477, "learning_rate": 4.722146739130434e-05, "loss": 0.5974, "step": 8816 }, { "epoch": 1.3802442078897934, "grad_norm": 1.857866644859314, "learning_rate": 4.720957880434782e-05, "loss": 0.8861, "step": 8817 }, { "epoch": 1.3804007514088916, "grad_norm": 4.055851459503174, "learning_rate": 4.71976902173913e-05, "loss": 0.8266, "step": 8818 }, { "epoch": 1.38055729492799, "grad_norm": 3.1050639152526855, "learning_rate": 4.718580163043478e-05, "loss": 0.4674, "step": 8819 }, { "epoch": 1.3807138384470883, "grad_norm": 8.576010704040527, "learning_rate": 4.717391304347826e-05, "loss": 0.9482, "step": 8820 }, { "epoch": 1.3808703819661865, "grad_norm": 2.881955623626709, "learning_rate": 4.7162024456521735e-05, "loss": 0.7999, "step": 8821 }, { "epoch": 1.381026925485285, "grad_norm": 2.6963319778442383, "learning_rate": 4.715013586956521e-05, "loss": 0.5517, "step": 8822 }, { "epoch": 1.3811834690043832, "grad_norm": 2.8883543014526367, "learning_rate": 4.713824728260869e-05, "loss": 0.995, "step": 8823 }, { "epoch": 
1.3813400125234816, "grad_norm": 2.7534289360046387, "learning_rate": 4.712635869565217e-05, "loss": 0.5308, "step": 8824 }, { "epoch": 1.3814965560425798, "grad_norm": 2.3978726863861084, "learning_rate": 4.7114470108695654e-05, "loss": 0.707, "step": 8825 }, { "epoch": 1.381653099561678, "grad_norm": 3.1557815074920654, "learning_rate": 4.710258152173913e-05, "loss": 1.3346, "step": 8826 }, { "epoch": 1.3818096430807765, "grad_norm": 2.500563621520996, "learning_rate": 4.7090692934782604e-05, "loss": 1.1651, "step": 8827 }, { "epoch": 1.3819661865998747, "grad_norm": 3.047166585922241, "learning_rate": 4.707880434782608e-05, "loss": 1.1589, "step": 8828 }, { "epoch": 1.3821227301189731, "grad_norm": 2.411794424057007, "learning_rate": 4.706691576086956e-05, "loss": 0.9432, "step": 8829 }, { "epoch": 1.3822792736380713, "grad_norm": 3.3806235790252686, "learning_rate": 4.705502717391304e-05, "loss": 1.2077, "step": 8830 }, { "epoch": 1.3824358171571696, "grad_norm": 1.6027857065200806, "learning_rate": 4.7043138586956516e-05, "loss": 0.5383, "step": 8831 }, { "epoch": 1.382592360676268, "grad_norm": 1.919935941696167, "learning_rate": 4.7031249999999995e-05, "loss": 1.1423, "step": 8832 }, { "epoch": 1.3827489041953664, "grad_norm": 1.3852185010910034, "learning_rate": 4.701936141304347e-05, "loss": 0.9553, "step": 8833 }, { "epoch": 1.3829054477144647, "grad_norm": 2.6347076892852783, "learning_rate": 4.700747282608695e-05, "loss": 0.3568, "step": 8834 }, { "epoch": 1.3830619912335629, "grad_norm": 3.5322773456573486, "learning_rate": 4.6995584239130436e-05, "loss": 0.9662, "step": 8835 }, { "epoch": 1.3832185347526613, "grad_norm": 3.8267812728881836, "learning_rate": 4.6983695652173914e-05, "loss": 1.3118, "step": 8836 }, { "epoch": 1.3833750782717595, "grad_norm": 12.715620994567871, "learning_rate": 4.697180706521739e-05, "loss": 0.9786, "step": 8837 }, { "epoch": 1.383531621790858, "grad_norm": 2.353909730911255, "learning_rate": 4.6959918478260864e-05, 
"loss": 0.8678, "step": 8838 }, { "epoch": 1.3836881653099562, "grad_norm": 0.44444507360458374, "learning_rate": 4.694802989130434e-05, "loss": 0.1747, "step": 8839 }, { "epoch": 1.3838447088290544, "grad_norm": 0.49438491463661194, "learning_rate": 4.693614130434782e-05, "loss": 0.2771, "step": 8840 }, { "epoch": 1.3840012523481529, "grad_norm": 0.5984125733375549, "learning_rate": 4.69242527173913e-05, "loss": 0.1712, "step": 8841 }, { "epoch": 1.384157795867251, "grad_norm": 0.5289652943611145, "learning_rate": 4.6912364130434776e-05, "loss": 0.2079, "step": 8842 }, { "epoch": 1.3843143393863495, "grad_norm": 0.7883012294769287, "learning_rate": 4.6900475543478254e-05, "loss": 0.2432, "step": 8843 }, { "epoch": 1.3844708829054477, "grad_norm": 0.6299050450325012, "learning_rate": 4.688858695652173e-05, "loss": 0.2357, "step": 8844 }, { "epoch": 1.384627426424546, "grad_norm": 0.7287395000457764, "learning_rate": 4.687669836956522e-05, "loss": 0.2368, "step": 8845 }, { "epoch": 1.3847839699436444, "grad_norm": 0.6482690572738647, "learning_rate": 4.6864809782608696e-05, "loss": 0.2701, "step": 8846 }, { "epoch": 1.3849405134627426, "grad_norm": 0.5398342609405518, "learning_rate": 4.6852921195652174e-05, "loss": 0.2414, "step": 8847 }, { "epoch": 1.385097056981841, "grad_norm": 0.4028090834617615, "learning_rate": 4.684103260869565e-05, "loss": 0.1759, "step": 8848 }, { "epoch": 1.3852536005009393, "grad_norm": 0.9023312926292419, "learning_rate": 4.682914402173913e-05, "loss": 0.2031, "step": 8849 }, { "epoch": 1.3854101440200375, "grad_norm": 1.191253662109375, "learning_rate": 4.68172554347826e-05, "loss": 0.2488, "step": 8850 }, { "epoch": 1.385566687539136, "grad_norm": 1.5138559341430664, "learning_rate": 4.680536684782608e-05, "loss": 0.5808, "step": 8851 }, { "epoch": 1.3857232310582341, "grad_norm": 1.785079002380371, "learning_rate": 4.679347826086956e-05, "loss": 0.2036, "step": 8852 }, { "epoch": 1.3858797745773326, "grad_norm": 0.7895663976669312, 
"learning_rate": 4.6781589673913036e-05, "loss": 0.3291, "step": 8853 }, { "epoch": 1.3860363180964308, "grad_norm": 0.6987854838371277, "learning_rate": 4.6769701086956514e-05, "loss": 0.3544, "step": 8854 }, { "epoch": 1.386192861615529, "grad_norm": 0.9825424551963806, "learning_rate": 4.67578125e-05, "loss": 0.4213, "step": 8855 }, { "epoch": 1.3863494051346275, "grad_norm": 1.5898878574371338, "learning_rate": 4.674592391304348e-05, "loss": 0.4714, "step": 8856 }, { "epoch": 1.3865059486537257, "grad_norm": 1.8925260305404663, "learning_rate": 4.6734035326086955e-05, "loss": 0.4418, "step": 8857 }, { "epoch": 1.3866624921728241, "grad_norm": 1.3576107025146484, "learning_rate": 4.6722146739130433e-05, "loss": 0.3613, "step": 8858 }, { "epoch": 1.3868190356919223, "grad_norm": 3.3030574321746826, "learning_rate": 4.671025815217391e-05, "loss": 0.8392, "step": 8859 }, { "epoch": 1.3869755792110205, "grad_norm": 1.8192387819290161, "learning_rate": 4.669836956521739e-05, "loss": 0.5973, "step": 8860 }, { "epoch": 1.387132122730119, "grad_norm": 2.1421103477478027, "learning_rate": 4.668648097826086e-05, "loss": 0.9376, "step": 8861 }, { "epoch": 1.3872886662492172, "grad_norm": 1.8127254247665405, "learning_rate": 4.667459239130434e-05, "loss": 0.4253, "step": 8862 }, { "epoch": 1.3874452097683156, "grad_norm": 1.8144464492797852, "learning_rate": 4.666270380434782e-05, "loss": 0.4247, "step": 8863 }, { "epoch": 1.3876017532874139, "grad_norm": 1.556758999824524, "learning_rate": 4.6650815217391296e-05, "loss": 0.5461, "step": 8864 }, { "epoch": 1.387758296806512, "grad_norm": 2.1873250007629395, "learning_rate": 4.663892663043478e-05, "loss": 0.5662, "step": 8865 }, { "epoch": 1.3879148403256105, "grad_norm": 1.6827532052993774, "learning_rate": 4.662703804347826e-05, "loss": 0.5921, "step": 8866 }, { "epoch": 1.388071383844709, "grad_norm": 3.975372552871704, "learning_rate": 4.661514945652174e-05, "loss": 0.6511, "step": 8867 }, { "epoch": 1.3882279273638072, 
"grad_norm": 2.0397019386291504, "learning_rate": 4.6603260869565215e-05, "loss": 0.5548, "step": 8868 }, { "epoch": 1.3883844708829054, "grad_norm": 2.436241626739502, "learning_rate": 4.659137228260869e-05, "loss": 0.8876, "step": 8869 }, { "epoch": 1.3885410144020038, "grad_norm": 2.3034746646881104, "learning_rate": 4.657948369565217e-05, "loss": 0.8656, "step": 8870 }, { "epoch": 1.388697557921102, "grad_norm": 2.465315341949463, "learning_rate": 4.656759510869565e-05, "loss": 0.8614, "step": 8871 }, { "epoch": 1.3888541014402005, "grad_norm": 1.495560884475708, "learning_rate": 4.655570652173913e-05, "loss": 0.4279, "step": 8872 }, { "epoch": 1.3890106449592987, "grad_norm": 2.498847484588623, "learning_rate": 4.65438179347826e-05, "loss": 0.8538, "step": 8873 }, { "epoch": 1.389167188478397, "grad_norm": 4.364850044250488, "learning_rate": 4.653192934782608e-05, "loss": 1.0748, "step": 8874 }, { "epoch": 1.3893237319974954, "grad_norm": 2.3984851837158203, "learning_rate": 4.652004076086956e-05, "loss": 0.8152, "step": 8875 }, { "epoch": 1.3894802755165936, "grad_norm": 2.379234552383423, "learning_rate": 4.650815217391304e-05, "loss": 0.9118, "step": 8876 }, { "epoch": 1.389636819035692, "grad_norm": 1.5549315214157104, "learning_rate": 4.649626358695652e-05, "loss": 0.8252, "step": 8877 }, { "epoch": 1.3897933625547902, "grad_norm": 3.8955750465393066, "learning_rate": 4.6484374999999997e-05, "loss": 0.759, "step": 8878 }, { "epoch": 1.3899499060738885, "grad_norm": 2.4461114406585693, "learning_rate": 4.6472486413043475e-05, "loss": 0.6371, "step": 8879 }, { "epoch": 1.390106449592987, "grad_norm": 2.503406286239624, "learning_rate": 4.646059782608695e-05, "loss": 1.1029, "step": 8880 }, { "epoch": 1.3902629931120851, "grad_norm": 2.0052709579467773, "learning_rate": 4.644870923913043e-05, "loss": 0.8532, "step": 8881 }, { "epoch": 1.3904195366311836, "grad_norm": 6.20941686630249, "learning_rate": 4.643682065217391e-05, "loss": 0.8906, "step": 8882 }, { 
"epoch": 1.3905760801502818, "grad_norm": 6.2393646240234375, "learning_rate": 4.642493206521739e-05, "loss": 1.5376, "step": 8883 }, { "epoch": 1.39073262366938, "grad_norm": 1.8350799083709717, "learning_rate": 4.641304347826086e-05, "loss": 0.545, "step": 8884 }, { "epoch": 1.3908891671884784, "grad_norm": 1.340080976486206, "learning_rate": 4.6401154891304344e-05, "loss": 0.6851, "step": 8885 }, { "epoch": 1.3910457107075767, "grad_norm": 2.7567245960235596, "learning_rate": 4.638926630434782e-05, "loss": 0.755, "step": 8886 }, { "epoch": 1.391202254226675, "grad_norm": 4.829906940460205, "learning_rate": 4.63773777173913e-05, "loss": 1.2938, "step": 8887 }, { "epoch": 1.3913587977457733, "grad_norm": 4.204695224761963, "learning_rate": 4.636548913043478e-05, "loss": 1.5418, "step": 8888 }, { "epoch": 1.3915153412648715, "grad_norm": 0.4432068169116974, "learning_rate": 4.6353600543478256e-05, "loss": 0.1917, "step": 8889 }, { "epoch": 1.39167188478397, "grad_norm": 1.0203258991241455, "learning_rate": 4.6341711956521734e-05, "loss": 0.2562, "step": 8890 }, { "epoch": 1.3918284283030682, "grad_norm": 0.6243074536323547, "learning_rate": 4.632982336956521e-05, "loss": 0.1821, "step": 8891 }, { "epoch": 1.3919849718221666, "grad_norm": 0.4368225336074829, "learning_rate": 4.631793478260869e-05, "loss": 0.2021, "step": 8892 }, { "epoch": 1.3921415153412648, "grad_norm": 0.7680044770240784, "learning_rate": 4.630604619565217e-05, "loss": 0.187, "step": 8893 }, { "epoch": 1.392298058860363, "grad_norm": 0.9097186326980591, "learning_rate": 4.6294157608695654e-05, "loss": 0.583, "step": 8894 }, { "epoch": 1.3924546023794615, "grad_norm": 1.0232489109039307, "learning_rate": 4.628226902173913e-05, "loss": 0.3351, "step": 8895 }, { "epoch": 1.3926111458985597, "grad_norm": 0.7052410244941711, "learning_rate": 4.62703804347826e-05, "loss": 0.1726, "step": 8896 }, { "epoch": 1.3927676894176582, "grad_norm": 0.5566078424453735, "learning_rate": 4.625849184782608e-05, 
"loss": 0.182, "step": 8897 }, { "epoch": 1.3929242329367564, "grad_norm": 1.8835728168487549, "learning_rate": 4.624660326086956e-05, "loss": 0.3151, "step": 8898 }, { "epoch": 1.3930807764558546, "grad_norm": 0.9906134009361267, "learning_rate": 4.623471467391304e-05, "loss": 0.4038, "step": 8899 }, { "epoch": 1.393237319974953, "grad_norm": 1.0502111911773682, "learning_rate": 4.6222826086956516e-05, "loss": 0.3436, "step": 8900 }, { "epoch": 1.3933938634940515, "grad_norm": 1.2029130458831787, "learning_rate": 4.6210937499999994e-05, "loss": 0.5402, "step": 8901 }, { "epoch": 1.3935504070131497, "grad_norm": 0.9456429481506348, "learning_rate": 4.619904891304347e-05, "loss": 0.303, "step": 8902 }, { "epoch": 1.393706950532248, "grad_norm": 1.805944800376892, "learning_rate": 4.618716032608695e-05, "loss": 0.4825, "step": 8903 }, { "epoch": 1.3938634940513464, "grad_norm": 1.2222557067871094, "learning_rate": 4.6175271739130435e-05, "loss": 0.3435, "step": 8904 }, { "epoch": 1.3940200375704446, "grad_norm": 2.2029547691345215, "learning_rate": 4.6163383152173914e-05, "loss": 0.4329, "step": 8905 }, { "epoch": 1.394176581089543, "grad_norm": 1.4165655374526978, "learning_rate": 4.615149456521739e-05, "loss": 0.4631, "step": 8906 }, { "epoch": 1.3943331246086412, "grad_norm": 2.5969724655151367, "learning_rate": 4.613960597826086e-05, "loss": 0.4267, "step": 8907 }, { "epoch": 1.3944896681277394, "grad_norm": 1.1110504865646362, "learning_rate": 4.612771739130434e-05, "loss": 0.3085, "step": 8908 }, { "epoch": 1.3946462116468379, "grad_norm": 2.212676525115967, "learning_rate": 4.611582880434782e-05, "loss": 0.345, "step": 8909 }, { "epoch": 1.394802755165936, "grad_norm": 1.5128042697906494, "learning_rate": 4.61039402173913e-05, "loss": 0.4756, "step": 8910 }, { "epoch": 1.3949592986850345, "grad_norm": 2.5801851749420166, "learning_rate": 4.6092051630434776e-05, "loss": 0.429, "step": 8911 }, { "epoch": 1.3951158422041328, "grad_norm": 1.0990955829620361, 
"learning_rate": 4.6080163043478254e-05, "loss": 0.4929, "step": 8912 }, { "epoch": 1.395272385723231, "grad_norm": 1.8999942541122437, "learning_rate": 4.606827445652174e-05, "loss": 0.4845, "step": 8913 }, { "epoch": 1.3954289292423294, "grad_norm": 3.402658462524414, "learning_rate": 4.605638586956522e-05, "loss": 0.4677, "step": 8914 }, { "epoch": 1.3955854727614276, "grad_norm": 2.0569088459014893, "learning_rate": 4.6044497282608695e-05, "loss": 0.5725, "step": 8915 }, { "epoch": 1.395742016280526, "grad_norm": 4.452723503112793, "learning_rate": 4.603260869565217e-05, "loss": 0.5843, "step": 8916 }, { "epoch": 1.3958985597996243, "grad_norm": 2.5079400539398193, "learning_rate": 4.602072010869565e-05, "loss": 0.6174, "step": 8917 }, { "epoch": 1.3960551033187225, "grad_norm": 1.174926996231079, "learning_rate": 4.600883152173913e-05, "loss": 0.4189, "step": 8918 }, { "epoch": 1.396211646837821, "grad_norm": 2.453042984008789, "learning_rate": 4.59969429347826e-05, "loss": 0.5755, "step": 8919 }, { "epoch": 1.3963681903569192, "grad_norm": 3.9377169609069824, "learning_rate": 4.598505434782608e-05, "loss": 0.8397, "step": 8920 }, { "epoch": 1.3965247338760176, "grad_norm": 1.7691380977630615, "learning_rate": 4.597316576086956e-05, "loss": 0.6046, "step": 8921 }, { "epoch": 1.3966812773951158, "grad_norm": 2.6266229152679443, "learning_rate": 4.5961277173913035e-05, "loss": 0.8935, "step": 8922 }, { "epoch": 1.396837820914214, "grad_norm": 2.6443023681640625, "learning_rate": 4.594938858695652e-05, "loss": 0.6981, "step": 8923 }, { "epoch": 1.3969943644333125, "grad_norm": 2.678394079208374, "learning_rate": 4.59375e-05, "loss": 0.7971, "step": 8924 }, { "epoch": 1.3971509079524107, "grad_norm": 3.6977293491363525, "learning_rate": 4.592561141304348e-05, "loss": 0.4965, "step": 8925 }, { "epoch": 1.3973074514715091, "grad_norm": 3.797037363052368, "learning_rate": 4.5913722826086955e-05, "loss": 0.8348, "step": 8926 }, { "epoch": 1.3974639949906074, 
"grad_norm": 2.771094799041748, "learning_rate": 4.590183423913043e-05, "loss": 1.1392, "step": 8927 }, { "epoch": 1.3976205385097056, "grad_norm": 4.010396957397461, "learning_rate": 4.588994565217391e-05, "loss": 1.5901, "step": 8928 }, { "epoch": 1.397777082028804, "grad_norm": 2.5656087398529053, "learning_rate": 4.587805706521739e-05, "loss": 0.9226, "step": 8929 }, { "epoch": 1.3979336255479022, "grad_norm": 2.37449049949646, "learning_rate": 4.586616847826086e-05, "loss": 0.6301, "step": 8930 }, { "epoch": 1.3980901690670007, "grad_norm": 11.355772018432617, "learning_rate": 4.585427989130434e-05, "loss": 1.5599, "step": 8931 }, { "epoch": 1.398246712586099, "grad_norm": 5.20739221572876, "learning_rate": 4.584239130434782e-05, "loss": 0.677, "step": 8932 }, { "epoch": 1.3984032561051971, "grad_norm": 2.958529472351074, "learning_rate": 4.58305027173913e-05, "loss": 1.2729, "step": 8933 }, { "epoch": 1.3985597996242956, "grad_norm": 1.126070261001587, "learning_rate": 4.581861413043478e-05, "loss": 0.4155, "step": 8934 }, { "epoch": 1.398716343143394, "grad_norm": 4.64093542098999, "learning_rate": 4.580672554347826e-05, "loss": 0.4622, "step": 8935 }, { "epoch": 1.3988728866624922, "grad_norm": 3.71183443069458, "learning_rate": 4.5794836956521736e-05, "loss": 0.7033, "step": 8936 }, { "epoch": 1.3990294301815904, "grad_norm": 2.356381893157959, "learning_rate": 4.5782948369565215e-05, "loss": 0.8392, "step": 8937 }, { "epoch": 1.3991859737006889, "grad_norm": 3.9193906784057617, "learning_rate": 4.577105978260869e-05, "loss": 1.5372, "step": 8938 }, { "epoch": 1.399342517219787, "grad_norm": 0.7254648208618164, "learning_rate": 4.575917119565217e-05, "loss": 0.2923, "step": 8939 }, { "epoch": 1.3994990607388855, "grad_norm": 2.2369303703308105, "learning_rate": 4.574728260869565e-05, "loss": 0.1656, "step": 8940 }, { "epoch": 1.3996556042579837, "grad_norm": 0.5574643015861511, "learning_rate": 4.573539402173913e-05, "loss": 0.2042, "step": 8941 }, { 
"epoch": 1.399812147777082, "grad_norm": 1.7477610111236572, "learning_rate": 4.57235054347826e-05, "loss": 0.318, "step": 8942 }, { "epoch": 1.3999686912961804, "grad_norm": 0.8166016936302185, "learning_rate": 4.5711616847826083e-05, "loss": 0.218, "step": 8943 }, { "epoch": 1.4001252348152786, "grad_norm": 1.287175178527832, "learning_rate": 4.569972826086956e-05, "loss": 0.3486, "step": 8944 }, { "epoch": 1.400281778334377, "grad_norm": 0.6728498339653015, "learning_rate": 4.568783967391304e-05, "loss": 0.2262, "step": 8945 }, { "epoch": 1.4004383218534753, "grad_norm": 0.98634272813797, "learning_rate": 4.567595108695652e-05, "loss": 0.2613, "step": 8946 }, { "epoch": 1.4005948653725735, "grad_norm": 1.2819567918777466, "learning_rate": 4.5664062499999996e-05, "loss": 0.2864, "step": 8947 }, { "epoch": 1.400751408891672, "grad_norm": 0.6795997619628906, "learning_rate": 4.5652173913043474e-05, "loss": 0.2242, "step": 8948 }, { "epoch": 1.4009079524107702, "grad_norm": 1.0056177377700806, "learning_rate": 4.564028532608695e-05, "loss": 0.2748, "step": 8949 }, { "epoch": 1.4010644959298686, "grad_norm": 1.2610639333724976, "learning_rate": 4.562839673913043e-05, "loss": 0.3604, "step": 8950 }, { "epoch": 1.4012210394489668, "grad_norm": 0.8680161237716675, "learning_rate": 4.561650815217391e-05, "loss": 0.3552, "step": 8951 }, { "epoch": 1.401377582968065, "grad_norm": 1.5010323524475098, "learning_rate": 4.5604619565217394e-05, "loss": 0.2495, "step": 8952 }, { "epoch": 1.4015341264871635, "grad_norm": 0.9949285984039307, "learning_rate": 4.5592730978260865e-05, "loss": 0.2266, "step": 8953 }, { "epoch": 1.4016906700062617, "grad_norm": 1.2843704223632812, "learning_rate": 4.558084239130434e-05, "loss": 0.4024, "step": 8954 }, { "epoch": 1.4018472135253601, "grad_norm": 2.4320249557495117, "learning_rate": 4.556895380434782e-05, "loss": 1.0214, "step": 8955 }, { "epoch": 1.4020037570444583, "grad_norm": 2.9043047428131104, "learning_rate": 4.55570652173913e-05, 
"loss": 0.5511, "step": 8956 }, { "epoch": 1.4021603005635566, "grad_norm": 2.05167293548584, "learning_rate": 4.554517663043478e-05, "loss": 0.5556, "step": 8957 }, { "epoch": 1.402316844082655, "grad_norm": 2.33481502532959, "learning_rate": 4.5533288043478256e-05, "loss": 0.4197, "step": 8958 }, { "epoch": 1.4024733876017532, "grad_norm": 1.5464767217636108, "learning_rate": 4.5521399456521734e-05, "loss": 0.6407, "step": 8959 }, { "epoch": 1.4026299311208517, "grad_norm": 2.419109582901001, "learning_rate": 4.550951086956521e-05, "loss": 0.4118, "step": 8960 }, { "epoch": 1.4027864746399499, "grad_norm": 3.8100662231445312, "learning_rate": 4.549762228260869e-05, "loss": 0.9788, "step": 8961 }, { "epoch": 1.402943018159048, "grad_norm": 1.9491182565689087, "learning_rate": 4.5485733695652175e-05, "loss": 0.5483, "step": 8962 }, { "epoch": 1.4030995616781465, "grad_norm": 6.341459274291992, "learning_rate": 4.547384510869565e-05, "loss": 0.8854, "step": 8963 }, { "epoch": 1.4032561051972448, "grad_norm": 4.134757995605469, "learning_rate": 4.546195652173913e-05, "loss": 0.7459, "step": 8964 }, { "epoch": 1.4034126487163432, "grad_norm": 1.320175051689148, "learning_rate": 4.54500679347826e-05, "loss": 0.2088, "step": 8965 }, { "epoch": 1.4035691922354414, "grad_norm": 1.5179933309555054, "learning_rate": 4.543817934782608e-05, "loss": 0.7841, "step": 8966 }, { "epoch": 1.4037257357545396, "grad_norm": 2.485819101333618, "learning_rate": 4.542629076086956e-05, "loss": 0.6086, "step": 8967 }, { "epoch": 1.403882279273638, "grad_norm": 1.6496407985687256, "learning_rate": 4.541440217391304e-05, "loss": 0.4428, "step": 8968 }, { "epoch": 1.4040388227927365, "grad_norm": 1.8806239366531372, "learning_rate": 4.5402513586956516e-05, "loss": 0.4071, "step": 8969 }, { "epoch": 1.4041953663118347, "grad_norm": 1.836503267288208, "learning_rate": 4.5390624999999994e-05, "loss": 0.3651, "step": 8970 }, { "epoch": 1.404351909830933, "grad_norm": 1.6506457328796387, 
"learning_rate": 4.537873641304347e-05, "loss": 0.7041, "step": 8971 }, { "epoch": 1.4045084533500314, "grad_norm": 1.7378846406936646, "learning_rate": 4.536684782608696e-05, "loss": 0.3426, "step": 8972 }, { "epoch": 1.4046649968691296, "grad_norm": 1.8746554851531982, "learning_rate": 4.5354959239130435e-05, "loss": 0.6663, "step": 8973 }, { "epoch": 1.404821540388228, "grad_norm": 3.09273624420166, "learning_rate": 4.534307065217391e-05, "loss": 0.7376, "step": 8974 }, { "epoch": 1.4049780839073263, "grad_norm": 2.82524037361145, "learning_rate": 4.533118206521739e-05, "loss": 0.7559, "step": 8975 }, { "epoch": 1.4051346274264245, "grad_norm": 3.069613456726074, "learning_rate": 4.531929347826086e-05, "loss": 1.0525, "step": 8976 }, { "epoch": 1.405291170945523, "grad_norm": 3.0058462619781494, "learning_rate": 4.530740489130434e-05, "loss": 1.2561, "step": 8977 }, { "epoch": 1.4054477144646211, "grad_norm": 2.838533639907837, "learning_rate": 4.529551630434782e-05, "loss": 0.8955, "step": 8978 }, { "epoch": 1.4056042579837196, "grad_norm": 7.278471946716309, "learning_rate": 4.52836277173913e-05, "loss": 1.3262, "step": 8979 }, { "epoch": 1.4057608015028178, "grad_norm": 3.124790668487549, "learning_rate": 4.5271739130434775e-05, "loss": 0.6907, "step": 8980 }, { "epoch": 1.405917345021916, "grad_norm": 2.6099531650543213, "learning_rate": 4.5259850543478253e-05, "loss": 1.4565, "step": 8981 }, { "epoch": 1.4060738885410144, "grad_norm": 2.591843605041504, "learning_rate": 4.524796195652174e-05, "loss": 1.1962, "step": 8982 }, { "epoch": 1.4062304320601127, "grad_norm": 2.451850414276123, "learning_rate": 4.5236073369565216e-05, "loss": 1.0287, "step": 8983 }, { "epoch": 1.406386975579211, "grad_norm": 3.5857510566711426, "learning_rate": 4.5224184782608695e-05, "loss": 0.576, "step": 8984 }, { "epoch": 1.4065435190983093, "grad_norm": 1.3999063968658447, "learning_rate": 4.521229619565217e-05, "loss": 0.1694, "step": 8985 }, { "epoch": 1.4067000626174075, 
"grad_norm": 1.2397531270980835, "learning_rate": 4.520040760869565e-05, "loss": 0.25, "step": 8986 }, { "epoch": 1.406856606136506, "grad_norm": 3.0373127460479736, "learning_rate": 4.518851902173913e-05, "loss": 0.6845, "step": 8987 }, { "epoch": 1.4070131496556042, "grad_norm": 1.6840862035751343, "learning_rate": 4.51766304347826e-05, "loss": 0.5419, "step": 8988 }, { "epoch": 1.4071696931747026, "grad_norm": 0.43111494183540344, "learning_rate": 4.516474184782608e-05, "loss": 0.1908, "step": 8989 }, { "epoch": 1.4073262366938009, "grad_norm": 0.6340141296386719, "learning_rate": 4.515285326086956e-05, "loss": 0.2172, "step": 8990 }, { "epoch": 1.407482780212899, "grad_norm": 0.7759098410606384, "learning_rate": 4.5140964673913035e-05, "loss": 0.2066, "step": 8991 }, { "epoch": 1.4076393237319975, "grad_norm": 1.1922410726547241, "learning_rate": 4.512907608695652e-05, "loss": 0.4065, "step": 8992 }, { "epoch": 1.4077958672510957, "grad_norm": 0.5144864916801453, "learning_rate": 4.51171875e-05, "loss": 0.2463, "step": 8993 }, { "epoch": 1.4079524107701942, "grad_norm": 1.557058334350586, "learning_rate": 4.5105298913043476e-05, "loss": 0.3528, "step": 8994 }, { "epoch": 1.4081089542892924, "grad_norm": 1.3769549131393433, "learning_rate": 4.5093410326086954e-05, "loss": 0.2657, "step": 8995 }, { "epoch": 1.4082654978083906, "grad_norm": 0.9886278510093689, "learning_rate": 4.508152173913043e-05, "loss": 0.3394, "step": 8996 }, { "epoch": 1.408422041327489, "grad_norm": 1.4882901906967163, "learning_rate": 4.506963315217391e-05, "loss": 0.2687, "step": 8997 }, { "epoch": 1.4085785848465875, "grad_norm": 0.5634118914604187, "learning_rate": 4.505774456521739e-05, "loss": 0.2475, "step": 8998 }, { "epoch": 1.4087351283656857, "grad_norm": 0.9857126474380493, "learning_rate": 4.504585597826086e-05, "loss": 0.2034, "step": 8999 }, { "epoch": 1.408891671884784, "grad_norm": 1.0982046127319336, "learning_rate": 4.503396739130434e-05, "loss": 0.2897, "step": 9000 }, { 
"epoch": 1.408891671884784, "eval_loss": 0.4757098853588104, "eval_runtime": 205.2292, "eval_samples_per_second": 60.337, "eval_steps_per_second": 3.771, "eval_wer": 0.30655375395796586, "step": 9000 }, { "epoch": 1.4090482154038821, "grad_norm": 2.350506544113159, "learning_rate": 4.5022078804347817e-05, "loss": 0.3855, "step": 9001 }, { "epoch": 1.4092047589229806, "grad_norm": 2.0763683319091797, "learning_rate": 4.50101902173913e-05, "loss": 0.5174, "step": 9002 }, { "epoch": 1.409361302442079, "grad_norm": 2.4176177978515625, "learning_rate": 4.499830163043478e-05, "loss": 0.2425, "step": 9003 }, { "epoch": 1.4095178459611772, "grad_norm": 1.0219806432724, "learning_rate": 4.498641304347826e-05, "loss": 0.4146, "step": 9004 }, { "epoch": 1.4096743894802755, "grad_norm": 1.1742850542068481, "learning_rate": 4.4974524456521736e-05, "loss": 0.3432, "step": 9005 }, { "epoch": 1.409830932999374, "grad_norm": 1.878766655921936, "learning_rate": 4.4962635869565214e-05, "loss": 0.5066, "step": 9006 }, { "epoch": 1.4099874765184721, "grad_norm": 1.213653326034546, "learning_rate": 4.495074728260869e-05, "loss": 0.4726, "step": 9007 }, { "epoch": 1.4101440200375706, "grad_norm": 1.1360259056091309, "learning_rate": 4.493885869565217e-05, "loss": 0.3299, "step": 9008 }, { "epoch": 1.4103005635566688, "grad_norm": 4.132699012756348, "learning_rate": 4.492697010869565e-05, "loss": 0.498, "step": 9009 }, { "epoch": 1.410457107075767, "grad_norm": 1.8099472522735596, "learning_rate": 4.4915081521739133e-05, "loss": 0.4297, "step": 9010 }, { "epoch": 1.4106136505948654, "grad_norm": 3.257399082183838, "learning_rate": 4.49031929347826e-05, "loss": 0.5371, "step": 9011 }, { "epoch": 1.4107701941139636, "grad_norm": 1.7584121227264404, "learning_rate": 4.489130434782608e-05, "loss": 0.4589, "step": 9012 }, { "epoch": 1.410926737633062, "grad_norm": 1.7920756340026855, "learning_rate": 4.487941576086956e-05, "loss": 0.2044, "step": 9013 }, { "epoch": 1.4110832811521603, 
"grad_norm": 2.7545862197875977, "learning_rate": 4.486752717391304e-05, "loss": 0.7867, "step": 9014 }, { "epoch": 1.4112398246712585, "grad_norm": 3.7947065830230713, "learning_rate": 4.485563858695652e-05, "loss": 0.685, "step": 9015 }, { "epoch": 1.411396368190357, "grad_norm": 1.9897149801254272, "learning_rate": 4.4843749999999996e-05, "loss": 0.6337, "step": 9016 }, { "epoch": 1.4115529117094552, "grad_norm": 1.500117540359497, "learning_rate": 4.4831861413043474e-05, "loss": 0.4987, "step": 9017 }, { "epoch": 1.4117094552285536, "grad_norm": 4.249162673950195, "learning_rate": 4.481997282608695e-05, "loss": 1.1461, "step": 9018 }, { "epoch": 1.4118659987476518, "grad_norm": 4.195754528045654, "learning_rate": 4.480808423913043e-05, "loss": 0.5714, "step": 9019 }, { "epoch": 1.41202254226675, "grad_norm": 2.1112451553344727, "learning_rate": 4.4796195652173915e-05, "loss": 0.432, "step": 9020 }, { "epoch": 1.4121790857858485, "grad_norm": 2.0213401317596436, "learning_rate": 4.478430706521739e-05, "loss": 0.7244, "step": 9021 }, { "epoch": 1.4123356293049467, "grad_norm": 3.8596465587615967, "learning_rate": 4.4772418478260865e-05, "loss": 1.2575, "step": 9022 }, { "epoch": 1.4124921728240452, "grad_norm": 4.772214412689209, "learning_rate": 4.476052989130434e-05, "loss": 0.7098, "step": 9023 }, { "epoch": 1.4126487163431434, "grad_norm": 6.186572074890137, "learning_rate": 4.474864130434782e-05, "loss": 1.296, "step": 9024 }, { "epoch": 1.4128052598622416, "grad_norm": 3.6901633739471436, "learning_rate": 4.47367527173913e-05, "loss": 0.9281, "step": 9025 }, { "epoch": 1.41296180338134, "grad_norm": 2.7720518112182617, "learning_rate": 4.472486413043478e-05, "loss": 0.8882, "step": 9026 }, { "epoch": 1.4131183469004382, "grad_norm": 2.292515516281128, "learning_rate": 4.4712975543478255e-05, "loss": 1.1277, "step": 9027 }, { "epoch": 1.4132748904195367, "grad_norm": 3.0858397483825684, "learning_rate": 4.4701086956521734e-05, "loss": 0.947, "step": 9028 }, 
{ "epoch": 1.413431433938635, "grad_norm": 1.9034117460250854, "learning_rate": 4.468919836956521e-05, "loss": 0.8914, "step": 9029 }, { "epoch": 1.4135879774577331, "grad_norm": 3.055338144302368, "learning_rate": 4.4677309782608697e-05, "loss": 0.9325, "step": 9030 }, { "epoch": 1.4137445209768316, "grad_norm": 4.803001403808594, "learning_rate": 4.4665421195652175e-05, "loss": 1.2333, "step": 9031 }, { "epoch": 1.41390106449593, "grad_norm": 3.1299333572387695, "learning_rate": 4.465353260869565e-05, "loss": 0.858, "step": 9032 }, { "epoch": 1.4140576080150282, "grad_norm": 2.572051763534546, "learning_rate": 4.464164402173913e-05, "loss": 0.6284, "step": 9033 }, { "epoch": 1.4142141515341264, "grad_norm": 1.3071141242980957, "learning_rate": 4.46297554347826e-05, "loss": 0.5269, "step": 9034 }, { "epoch": 1.4143706950532247, "grad_norm": 5.3289971351623535, "learning_rate": 4.461786684782608e-05, "loss": 1.2785, "step": 9035 }, { "epoch": 1.414527238572323, "grad_norm": 4.221307277679443, "learning_rate": 4.460597826086956e-05, "loss": 0.8374, "step": 9036 }, { "epoch": 1.4146837820914215, "grad_norm": 6.712945461273193, "learning_rate": 4.459408967391304e-05, "loss": 1.4614, "step": 9037 }, { "epoch": 1.4148403256105198, "grad_norm": 3.465268135070801, "learning_rate": 4.4582201086956515e-05, "loss": 1.2234, "step": 9038 }, { "epoch": 1.414996869129618, "grad_norm": 0.8516533374786377, "learning_rate": 4.457031249999999e-05, "loss": 0.1936, "step": 9039 }, { "epoch": 1.4151534126487164, "grad_norm": 0.5125131011009216, "learning_rate": 4.455842391304348e-05, "loss": 0.1967, "step": 9040 }, { "epoch": 1.4153099561678146, "grad_norm": 0.8803597092628479, "learning_rate": 4.4546535326086956e-05, "loss": 0.2255, "step": 9041 }, { "epoch": 1.415466499686913, "grad_norm": 2.9904327392578125, "learning_rate": 4.4534646739130434e-05, "loss": 0.202, "step": 9042 }, { "epoch": 1.4156230432060113, "grad_norm": 0.7508803606033325, "learning_rate": 4.452275815217391e-05, 
"loss": 0.2118, "step": 9043 }, { "epoch": 1.4157795867251095, "grad_norm": 0.6819354891777039, "learning_rate": 4.451086956521739e-05, "loss": 0.2154, "step": 9044 }, { "epoch": 1.415936130244208, "grad_norm": 0.6951857805252075, "learning_rate": 4.449898097826086e-05, "loss": 0.2787, "step": 9045 }, { "epoch": 1.4160926737633062, "grad_norm": 0.8404704332351685, "learning_rate": 4.448709239130434e-05, "loss": 0.2038, "step": 9046 }, { "epoch": 1.4162492172824046, "grad_norm": 1.0664139986038208, "learning_rate": 4.447520380434782e-05, "loss": 0.2159, "step": 9047 }, { "epoch": 1.4164057608015028, "grad_norm": 0.8422969579696655, "learning_rate": 4.44633152173913e-05, "loss": 0.2608, "step": 9048 }, { "epoch": 1.416562304320601, "grad_norm": 0.9245281219482422, "learning_rate": 4.4451426630434775e-05, "loss": 0.1397, "step": 9049 }, { "epoch": 1.4167188478396995, "grad_norm": 0.931506872177124, "learning_rate": 4.443953804347826e-05, "loss": 0.1612, "step": 9050 }, { "epoch": 1.4168753913587977, "grad_norm": 0.6655548214912415, "learning_rate": 4.442764945652174e-05, "loss": 0.2493, "step": 9051 }, { "epoch": 1.4170319348778961, "grad_norm": 2.3252317905426025, "learning_rate": 4.4415760869565216e-05, "loss": 0.3978, "step": 9052 }, { "epoch": 1.4171884783969944, "grad_norm": 2.3119356632232666, "learning_rate": 4.4403872282608694e-05, "loss": 0.5241, "step": 9053 }, { "epoch": 1.4173450219160926, "grad_norm": 1.341863751411438, "learning_rate": 4.439198369565217e-05, "loss": 0.4364, "step": 9054 }, { "epoch": 1.417501565435191, "grad_norm": 1.5018471479415894, "learning_rate": 4.438009510869565e-05, "loss": 0.595, "step": 9055 }, { "epoch": 1.4176581089542892, "grad_norm": 1.2583988904953003, "learning_rate": 4.436820652173913e-05, "loss": 0.2569, "step": 9056 }, { "epoch": 1.4178146524733877, "grad_norm": 1.5302389860153198, "learning_rate": 4.43563179347826e-05, "loss": 0.287, "step": 9057 }, { "epoch": 1.4179711959924859, "grad_norm": 2.744860887527466, 
"learning_rate": 4.434442934782608e-05, "loss": 0.4205, "step": 9058 }, { "epoch": 1.418127739511584, "grad_norm": 2.2202348709106445, "learning_rate": 4.4332540760869556e-05, "loss": 0.3563, "step": 9059 }, { "epoch": 1.4182842830306825, "grad_norm": 1.8654605150222778, "learning_rate": 4.432065217391304e-05, "loss": 0.3391, "step": 9060 }, { "epoch": 1.4184408265497808, "grad_norm": 1.9119874238967896, "learning_rate": 4.430876358695652e-05, "loss": 0.3617, "step": 9061 }, { "epoch": 1.4185973700688792, "grad_norm": 1.7815064191818237, "learning_rate": 4.4296875e-05, "loss": 0.4352, "step": 9062 }, { "epoch": 1.4187539135879774, "grad_norm": 2.928542137145996, "learning_rate": 4.4284986413043476e-05, "loss": 0.4398, "step": 9063 }, { "epoch": 1.4189104571070756, "grad_norm": 1.3899388313293457, "learning_rate": 4.4273097826086954e-05, "loss": 0.4341, "step": 9064 }, { "epoch": 1.419067000626174, "grad_norm": 2.042663335800171, "learning_rate": 4.426120923913043e-05, "loss": 0.6917, "step": 9065 }, { "epoch": 1.4192235441452725, "grad_norm": 1.2015230655670166, "learning_rate": 4.424932065217391e-05, "loss": 0.4508, "step": 9066 }, { "epoch": 1.4193800876643707, "grad_norm": 3.8125834465026855, "learning_rate": 4.423743206521739e-05, "loss": 0.6379, "step": 9067 }, { "epoch": 1.419536631183469, "grad_norm": 2.322672128677368, "learning_rate": 4.422554347826086e-05, "loss": 0.5525, "step": 9068 }, { "epoch": 1.4196931747025674, "grad_norm": 2.3892862796783447, "learning_rate": 4.421365489130434e-05, "loss": 0.8055, "step": 9069 }, { "epoch": 1.4198497182216656, "grad_norm": 3.7148284912109375, "learning_rate": 4.420176630434782e-05, "loss": 0.8829, "step": 9070 }, { "epoch": 1.420006261740764, "grad_norm": 3.3419950008392334, "learning_rate": 4.41898777173913e-05, "loss": 0.6916, "step": 9071 }, { "epoch": 1.4201628052598623, "grad_norm": 1.6147576570510864, "learning_rate": 4.417798913043478e-05, "loss": 0.5568, "step": 9072 }, { "epoch": 1.4203193487789605, 
"grad_norm": 2.3820178508758545, "learning_rate": 4.416610054347826e-05, "loss": 0.5422, "step": 9073 }, { "epoch": 1.420475892298059, "grad_norm": 5.17689323425293, "learning_rate": 4.4154211956521735e-05, "loss": 0.5747, "step": 9074 }, { "epoch": 1.4206324358171571, "grad_norm": 2.0195281505584717, "learning_rate": 4.4142323369565214e-05, "loss": 0.6336, "step": 9075 }, { "epoch": 1.4207889793362556, "grad_norm": 1.71804678440094, "learning_rate": 4.413043478260869e-05, "loss": 0.8651, "step": 9076 }, { "epoch": 1.4209455228553538, "grad_norm": 1.772873878479004, "learning_rate": 4.411854619565217e-05, "loss": 0.4116, "step": 9077 }, { "epoch": 1.421102066374452, "grad_norm": 4.913057804107666, "learning_rate": 4.410665760869565e-05, "loss": 0.7657, "step": 9078 }, { "epoch": 1.4212586098935505, "grad_norm": 4.89540958404541, "learning_rate": 4.409476902173913e-05, "loss": 1.1551, "step": 9079 }, { "epoch": 1.4214151534126487, "grad_norm": 3.463578462600708, "learning_rate": 4.4082880434782604e-05, "loss": 1.0362, "step": 9080 }, { "epoch": 1.4215716969317471, "grad_norm": 3.244584798812866, "learning_rate": 4.407099184782608e-05, "loss": 0.7267, "step": 9081 }, { "epoch": 1.4217282404508453, "grad_norm": 3.0325889587402344, "learning_rate": 4.405910326086956e-05, "loss": 1.033, "step": 9082 }, { "epoch": 1.4218847839699436, "grad_norm": 4.0745849609375, "learning_rate": 4.404721467391304e-05, "loss": 1.4463, "step": 9083 }, { "epoch": 1.422041327489042, "grad_norm": 1.1500208377838135, "learning_rate": 4.403532608695652e-05, "loss": 0.3248, "step": 9084 }, { "epoch": 1.4221978710081402, "grad_norm": 1.8585773706436157, "learning_rate": 4.4023437499999995e-05, "loss": 0.6495, "step": 9085 }, { "epoch": 1.4223544145272387, "grad_norm": 3.0134520530700684, "learning_rate": 4.401154891304347e-05, "loss": 0.5317, "step": 9086 }, { "epoch": 1.4225109580463369, "grad_norm": 4.577974319458008, "learning_rate": 4.399966032608695e-05, "loss": 0.46, "step": 9087 }, { 
"epoch": 1.422667501565435, "grad_norm": 1.7149702310562134, "learning_rate": 4.398777173913043e-05, "loss": 0.69, "step": 9088 }, { "epoch": 1.4228240450845335, "grad_norm": 0.43732577562332153, "learning_rate": 4.3975883152173915e-05, "loss": 0.2037, "step": 9089 }, { "epoch": 1.4229805886036317, "grad_norm": 0.3855223059654236, "learning_rate": 4.396399456521739e-05, "loss": 0.2049, "step": 9090 }, { "epoch": 1.4231371321227302, "grad_norm": 0.8807380795478821, "learning_rate": 4.3952105978260864e-05, "loss": 0.3103, "step": 9091 }, { "epoch": 1.4232936756418284, "grad_norm": 0.5151590704917908, "learning_rate": 4.394021739130434e-05, "loss": 0.2617, "step": 9092 }, { "epoch": 1.4234502191609266, "grad_norm": 1.24167799949646, "learning_rate": 4.392832880434782e-05, "loss": 0.3152, "step": 9093 }, { "epoch": 1.423606762680025, "grad_norm": 1.174886703491211, "learning_rate": 4.39164402173913e-05, "loss": 0.2681, "step": 9094 }, { "epoch": 1.4237633061991233, "grad_norm": 0.7693584561347961, "learning_rate": 4.390455163043478e-05, "loss": 0.2047, "step": 9095 }, { "epoch": 1.4239198497182217, "grad_norm": 0.7070851922035217, "learning_rate": 4.3892663043478255e-05, "loss": 0.2334, "step": 9096 }, { "epoch": 1.42407639323732, "grad_norm": 0.8772193193435669, "learning_rate": 4.388077445652173e-05, "loss": 0.2558, "step": 9097 }, { "epoch": 1.4242329367564182, "grad_norm": 0.8261443376541138, "learning_rate": 4.386888586956521e-05, "loss": 0.3219, "step": 9098 }, { "epoch": 1.4243894802755166, "grad_norm": 0.9736018776893616, "learning_rate": 4.3856997282608696e-05, "loss": 0.174, "step": 9099 }, { "epoch": 1.424546023794615, "grad_norm": 0.6930388808250427, "learning_rate": 4.3845108695652174e-05, "loss": 0.1964, "step": 9100 }, { "epoch": 1.4247025673137133, "grad_norm": 1.9541418552398682, "learning_rate": 4.383322010869565e-05, "loss": 0.4646, "step": 9101 }, { "epoch": 1.4248591108328115, "grad_norm": 3.162625551223755, "learning_rate": 4.382133152173913e-05, 
"loss": 0.2935, "step": 9102 }, { "epoch": 1.42501565435191, "grad_norm": 1.0745099782943726, "learning_rate": 4.38094429347826e-05, "loss": 0.2795, "step": 9103 }, { "epoch": 1.4251721978710081, "grad_norm": 1.6744211912155151, "learning_rate": 4.379755434782608e-05, "loss": 0.3707, "step": 9104 }, { "epoch": 1.4253287413901066, "grad_norm": 1.0875681638717651, "learning_rate": 4.378566576086956e-05, "loss": 0.3047, "step": 9105 }, { "epoch": 1.4254852849092048, "grad_norm": 1.3138841390609741, "learning_rate": 4.3773777173913036e-05, "loss": 0.4642, "step": 9106 }, { "epoch": 1.425641828428303, "grad_norm": 1.2389613389968872, "learning_rate": 4.3761888586956515e-05, "loss": 0.3468, "step": 9107 }, { "epoch": 1.4257983719474014, "grad_norm": 3.3274738788604736, "learning_rate": 4.374999999999999e-05, "loss": 0.5937, "step": 9108 }, { "epoch": 1.4259549154664997, "grad_norm": 1.6438499689102173, "learning_rate": 4.373811141304348e-05, "loss": 0.5673, "step": 9109 }, { "epoch": 1.426111458985598, "grad_norm": 0.8999751806259155, "learning_rate": 4.3726222826086956e-05, "loss": 0.3095, "step": 9110 }, { "epoch": 1.4262680025046963, "grad_norm": 3.2359845638275146, "learning_rate": 4.3714334239130434e-05, "loss": 0.7627, "step": 9111 }, { "epoch": 1.4264245460237945, "grad_norm": 2.3228495121002197, "learning_rate": 4.370244565217391e-05, "loss": 0.7939, "step": 9112 }, { "epoch": 1.426581089542893, "grad_norm": 1.9258344173431396, "learning_rate": 4.369055706521739e-05, "loss": 0.588, "step": 9113 }, { "epoch": 1.4267376330619912, "grad_norm": 2.742858409881592, "learning_rate": 4.367866847826086e-05, "loss": 0.5122, "step": 9114 }, { "epoch": 1.4268941765810896, "grad_norm": 3.7729899883270264, "learning_rate": 4.366677989130434e-05, "loss": 0.4242, "step": 9115 }, { "epoch": 1.4270507201001879, "grad_norm": 1.5122369527816772, "learning_rate": 4.365489130434782e-05, "loss": 0.2329, "step": 9116 }, { "epoch": 1.427207263619286, "grad_norm": 3.6418752670288086, 
"learning_rate": 4.3643002717391296e-05, "loss": 0.5662, "step": 9117 }, { "epoch": 1.4273638071383845, "grad_norm": 2.7131831645965576, "learning_rate": 4.363111413043478e-05, "loss": 0.4144, "step": 9118 }, { "epoch": 1.4275203506574827, "grad_norm": 3.648449659347534, "learning_rate": 4.361922554347826e-05, "loss": 1.0136, "step": 9119 }, { "epoch": 1.4276768941765812, "grad_norm": 1.794143557548523, "learning_rate": 4.360733695652174e-05, "loss": 0.426, "step": 9120 }, { "epoch": 1.4278334376956794, "grad_norm": 3.6195809841156006, "learning_rate": 4.3595448369565216e-05, "loss": 0.81, "step": 9121 }, { "epoch": 1.4279899812147776, "grad_norm": 5.649221420288086, "learning_rate": 4.3583559782608694e-05, "loss": 0.8341, "step": 9122 }, { "epoch": 1.428146524733876, "grad_norm": 7.399574279785156, "learning_rate": 4.357167119565217e-05, "loss": 0.8282, "step": 9123 }, { "epoch": 1.4283030682529743, "grad_norm": 3.1968448162078857, "learning_rate": 4.355978260869565e-05, "loss": 0.7241, "step": 9124 }, { "epoch": 1.4284596117720727, "grad_norm": 3.0211713314056396, "learning_rate": 4.354789402173913e-05, "loss": 0.8757, "step": 9125 }, { "epoch": 1.428616155291171, "grad_norm": 3.3855860233306885, "learning_rate": 4.35360054347826e-05, "loss": 1.0933, "step": 9126 }, { "epoch": 1.4287726988102691, "grad_norm": 4.300130844116211, "learning_rate": 4.352411684782608e-05, "loss": 1.0827, "step": 9127 }, { "epoch": 1.4289292423293676, "grad_norm": 2.7729949951171875, "learning_rate": 4.351222826086956e-05, "loss": 0.9512, "step": 9128 }, { "epoch": 1.4290857858484658, "grad_norm": 5.561803817749023, "learning_rate": 4.350033967391304e-05, "loss": 0.6959, "step": 9129 }, { "epoch": 1.4292423293675642, "grad_norm": 5.878911972045898, "learning_rate": 4.348845108695652e-05, "loss": 1.2851, "step": 9130 }, { "epoch": 1.4293988728866625, "grad_norm": 1.693114161491394, "learning_rate": 4.34765625e-05, "loss": 0.5917, "step": 9131 }, { "epoch": 1.4295554164057607, 
"grad_norm": 5.124202251434326, "learning_rate": 4.3464673913043475e-05, "loss": 1.4384, "step": 9132 }, { "epoch": 1.429711959924859, "grad_norm": 2.772063970565796, "learning_rate": 4.3452785326086953e-05, "loss": 0.938, "step": 9133 }, { "epoch": 1.4298685034439576, "grad_norm": 1.8839541673660278, "learning_rate": 4.344089673913043e-05, "loss": 0.3482, "step": 9134 }, { "epoch": 1.4300250469630558, "grad_norm": 5.550987720489502, "learning_rate": 4.342900815217391e-05, "loss": 0.9496, "step": 9135 }, { "epoch": 1.430181590482154, "grad_norm": 2.8807268142700195, "learning_rate": 4.341711956521739e-05, "loss": 0.3262, "step": 9136 }, { "epoch": 1.4303381340012524, "grad_norm": 3.8548381328582764, "learning_rate": 4.340523097826086e-05, "loss": 1.0135, "step": 9137 }, { "epoch": 1.4304946775203506, "grad_norm": 3.213679313659668, "learning_rate": 4.3393342391304344e-05, "loss": 1.0242, "step": 9138 }, { "epoch": 1.430651221039449, "grad_norm": 0.583296537399292, "learning_rate": 4.338145380434782e-05, "loss": 0.1678, "step": 9139 }, { "epoch": 1.4308077645585473, "grad_norm": 0.9803653359413147, "learning_rate": 4.33695652173913e-05, "loss": 0.3071, "step": 9140 }, { "epoch": 1.4309643080776455, "grad_norm": 0.6428372859954834, "learning_rate": 4.335767663043478e-05, "loss": 0.239, "step": 9141 }, { "epoch": 1.431120851596744, "grad_norm": 0.5709884166717529, "learning_rate": 4.334578804347826e-05, "loss": 0.2331, "step": 9142 }, { "epoch": 1.4312773951158422, "grad_norm": 0.4677014648914337, "learning_rate": 4.3333899456521735e-05, "loss": 0.1944, "step": 9143 }, { "epoch": 1.4314339386349406, "grad_norm": 1.5723886489868164, "learning_rate": 4.332201086956521e-05, "loss": 0.3545, "step": 9144 }, { "epoch": 1.4315904821540388, "grad_norm": 2.9181129932403564, "learning_rate": 4.331012228260869e-05, "loss": 0.2786, "step": 9145 }, { "epoch": 1.431747025673137, "grad_norm": 0.9223023653030396, "learning_rate": 4.329823369565217e-05, "loss": 0.2085, "step": 9146 }, 
{ "epoch": 1.4319035691922355, "grad_norm": 0.7209746837615967, "learning_rate": 4.3286345108695654e-05, "loss": 0.2352, "step": 9147 }, { "epoch": 1.4320601127113337, "grad_norm": 2.155860424041748, "learning_rate": 4.327445652173913e-05, "loss": 0.5776, "step": 9148 }, { "epoch": 1.4322166562304322, "grad_norm": 1.940746545791626, "learning_rate": 4.3262567934782604e-05, "loss": 0.1807, "step": 9149 }, { "epoch": 1.4323731997495304, "grad_norm": 0.9924167990684509, "learning_rate": 4.325067934782608e-05, "loss": 0.301, "step": 9150 }, { "epoch": 1.4325297432686286, "grad_norm": 1.1259034872055054, "learning_rate": 4.323879076086956e-05, "loss": 0.454, "step": 9151 }, { "epoch": 1.432686286787727, "grad_norm": 0.8402918577194214, "learning_rate": 4.322690217391304e-05, "loss": 0.4024, "step": 9152 }, { "epoch": 1.4328428303068252, "grad_norm": 2.0007805824279785, "learning_rate": 4.3215013586956517e-05, "loss": 0.4355, "step": 9153 }, { "epoch": 1.4329993738259237, "grad_norm": 0.8632403612136841, "learning_rate": 4.3203124999999995e-05, "loss": 0.2396, "step": 9154 }, { "epoch": 1.433155917345022, "grad_norm": 0.5924530625343323, "learning_rate": 4.319123641304347e-05, "loss": 0.1832, "step": 9155 }, { "epoch": 1.4333124608641201, "grad_norm": 2.6965577602386475, "learning_rate": 4.317934782608695e-05, "loss": 0.3011, "step": 9156 }, { "epoch": 1.4334690043832186, "grad_norm": 1.4779486656188965, "learning_rate": 4.3167459239130436e-05, "loss": 0.2919, "step": 9157 }, { "epoch": 1.4336255479023168, "grad_norm": 1.8008702993392944, "learning_rate": 4.3155570652173914e-05, "loss": 0.4035, "step": 9158 }, { "epoch": 1.4337820914214152, "grad_norm": 1.9443156719207764, "learning_rate": 4.314368206521739e-05, "loss": 0.3343, "step": 9159 }, { "epoch": 1.4339386349405134, "grad_norm": 3.4607620239257812, "learning_rate": 4.3131793478260864e-05, "loss": 0.5122, "step": 9160 }, { "epoch": 1.4340951784596117, "grad_norm": 1.0334382057189941, "learning_rate": 
4.311990489130434e-05, "loss": 0.2339, "step": 9161 }, { "epoch": 1.43425172197871, "grad_norm": 2.2013890743255615, "learning_rate": 4.310801630434782e-05, "loss": 0.7586, "step": 9162 }, { "epoch": 1.4344082654978083, "grad_norm": 1.16911780834198, "learning_rate": 4.30961277173913e-05, "loss": 0.2977, "step": 9163 }, { "epoch": 1.4345648090169068, "grad_norm": 1.3931989669799805, "learning_rate": 4.3084239130434776e-05, "loss": 0.3663, "step": 9164 }, { "epoch": 1.434721352536005, "grad_norm": 2.964674234390259, "learning_rate": 4.3072350543478254e-05, "loss": 0.4107, "step": 9165 }, { "epoch": 1.4348778960551032, "grad_norm": 2.668790102005005, "learning_rate": 4.306046195652173e-05, "loss": 0.6207, "step": 9166 }, { "epoch": 1.4350344395742016, "grad_norm": 1.851648211479187, "learning_rate": 4.304857336956522e-05, "loss": 0.6756, "step": 9167 }, { "epoch": 1.4351909830933, "grad_norm": 2.3424503803253174, "learning_rate": 4.3036684782608696e-05, "loss": 0.9555, "step": 9168 }, { "epoch": 1.4353475266123983, "grad_norm": 2.27223801612854, "learning_rate": 4.3024796195652174e-05, "loss": 0.4193, "step": 9169 }, { "epoch": 1.4355040701314965, "grad_norm": 2.604599952697754, "learning_rate": 4.301290760869565e-05, "loss": 0.4487, "step": 9170 }, { "epoch": 1.435660613650595, "grad_norm": 4.9117326736450195, "learning_rate": 4.300101902173913e-05, "loss": 0.7895, "step": 9171 }, { "epoch": 1.4358171571696932, "grad_norm": 1.466533899307251, "learning_rate": 4.29891304347826e-05, "loss": 0.5634, "step": 9172 }, { "epoch": 1.4359737006887916, "grad_norm": 6.017882347106934, "learning_rate": 4.297724184782608e-05, "loss": 1.3647, "step": 9173 }, { "epoch": 1.4361302442078898, "grad_norm": 2.225731372833252, "learning_rate": 4.296535326086956e-05, "loss": 0.6176, "step": 9174 }, { "epoch": 1.436286787726988, "grad_norm": 3.3428916931152344, "learning_rate": 4.2953464673913036e-05, "loss": 0.8317, "step": 9175 }, { "epoch": 1.4364433312460865, "grad_norm": 
1.998910665512085, "learning_rate": 4.2941576086956514e-05, "loss": 0.5225, "step": 9176 }, { "epoch": 1.4365998747651847, "grad_norm": 2.6772687435150146, "learning_rate": 4.29296875e-05, "loss": 0.5647, "step": 9177 }, { "epoch": 1.4367564182842831, "grad_norm": 2.753298759460449, "learning_rate": 4.291779891304348e-05, "loss": 1.2035, "step": 9178 }, { "epoch": 1.4369129618033814, "grad_norm": 6.8978142738342285, "learning_rate": 4.2905910326086955e-05, "loss": 1.2775, "step": 9179 }, { "epoch": 1.4370695053224796, "grad_norm": 4.308775424957275, "learning_rate": 4.2894021739130434e-05, "loss": 0.9045, "step": 9180 }, { "epoch": 1.437226048841578, "grad_norm": 9.88953685760498, "learning_rate": 4.288213315217391e-05, "loss": 1.1149, "step": 9181 }, { "epoch": 1.4373825923606762, "grad_norm": 3.9147331714630127, "learning_rate": 4.287024456521739e-05, "loss": 1.0394, "step": 9182 }, { "epoch": 1.4375391358797747, "grad_norm": 5.019151210784912, "learning_rate": 4.285835597826086e-05, "loss": 0.9821, "step": 9183 }, { "epoch": 1.4376956793988729, "grad_norm": 3.5256636142730713, "learning_rate": 4.284646739130434e-05, "loss": 0.601, "step": 9184 }, { "epoch": 1.437852222917971, "grad_norm": 1.6303881406784058, "learning_rate": 4.283457880434782e-05, "loss": 0.5245, "step": 9185 }, { "epoch": 1.4380087664370695, "grad_norm": 2.920095682144165, "learning_rate": 4.2822690217391296e-05, "loss": 0.4574, "step": 9186 }, { "epoch": 1.4381653099561678, "grad_norm": 1.4310495853424072, "learning_rate": 4.281080163043478e-05, "loss": 0.3432, "step": 9187 }, { "epoch": 1.4383218534752662, "grad_norm": 4.058804988861084, "learning_rate": 4.279891304347826e-05, "loss": 0.8077, "step": 9188 }, { "epoch": 1.4384783969943644, "grad_norm": 0.5276725888252258, "learning_rate": 4.278702445652174e-05, "loss": 0.1896, "step": 9189 }, { "epoch": 1.4386349405134626, "grad_norm": 0.6342072486877441, "learning_rate": 4.2775135869565215e-05, "loss": 0.2818, "step": 9190 }, { "epoch": 
1.438791484032561, "grad_norm": 1.028830647468567, "learning_rate": 4.276324728260869e-05, "loss": 0.4029, "step": 9191 }, { "epoch": 1.4389480275516593, "grad_norm": 0.7760360836982727, "learning_rate": 4.275135869565217e-05, "loss": 0.2851, "step": 9192 }, { "epoch": 1.4391045710707577, "grad_norm": 0.7799695134162903, "learning_rate": 4.273947010869565e-05, "loss": 0.3221, "step": 9193 }, { "epoch": 1.439261114589856, "grad_norm": 0.43932998180389404, "learning_rate": 4.272758152173913e-05, "loss": 0.1797, "step": 9194 }, { "epoch": 1.4394176581089542, "grad_norm": 1.7283228635787964, "learning_rate": 4.27156929347826e-05, "loss": 0.2447, "step": 9195 }, { "epoch": 1.4395742016280526, "grad_norm": 1.0957245826721191, "learning_rate": 4.270380434782608e-05, "loss": 0.3371, "step": 9196 }, { "epoch": 1.4397307451471508, "grad_norm": 0.6429232358932495, "learning_rate": 4.269191576086956e-05, "loss": 0.1995, "step": 9197 }, { "epoch": 1.4398872886662493, "grad_norm": 0.9569399356842041, "learning_rate": 4.268002717391304e-05, "loss": 0.2154, "step": 9198 }, { "epoch": 1.4400438321853475, "grad_norm": 1.4870209693908691, "learning_rate": 4.266813858695652e-05, "loss": 0.3078, "step": 9199 }, { "epoch": 1.4402003757044457, "grad_norm": 0.8143390417098999, "learning_rate": 4.265625e-05, "loss": 0.2643, "step": 9200 }, { "epoch": 1.4403569192235441, "grad_norm": 1.0112450122833252, "learning_rate": 4.2644361413043475e-05, "loss": 0.3535, "step": 9201 }, { "epoch": 1.4405134627426426, "grad_norm": 1.1448432207107544, "learning_rate": 4.263247282608695e-05, "loss": 0.3349, "step": 9202 }, { "epoch": 1.4406700062617408, "grad_norm": 1.1125699281692505, "learning_rate": 4.262058423913043e-05, "loss": 0.2712, "step": 9203 }, { "epoch": 1.440826549780839, "grad_norm": 0.9311196804046631, "learning_rate": 4.260869565217391e-05, "loss": 0.3574, "step": 9204 }, { "epoch": 1.4409830932999375, "grad_norm": 3.0100176334381104, "learning_rate": 4.2596807065217394e-05, "loss": 
0.4734, "step": 9205 }, { "epoch": 1.4411396368190357, "grad_norm": 1.214535117149353, "learning_rate": 4.258491847826086e-05, "loss": 0.4566, "step": 9206 }, { "epoch": 1.4412961803381341, "grad_norm": 1.2461925745010376, "learning_rate": 4.2573029891304344e-05, "loss": 0.3444, "step": 9207 }, { "epoch": 1.4414527238572323, "grad_norm": 0.9573638439178467, "learning_rate": 4.256114130434782e-05, "loss": 0.3032, "step": 9208 }, { "epoch": 1.4416092673763305, "grad_norm": 3.728708505630493, "learning_rate": 4.25492527173913e-05, "loss": 0.6047, "step": 9209 }, { "epoch": 1.441765810895429, "grad_norm": 3.0461413860321045, "learning_rate": 4.253736413043478e-05, "loss": 0.6837, "step": 9210 }, { "epoch": 1.4419223544145272, "grad_norm": 3.3078835010528564, "learning_rate": 4.2525475543478256e-05, "loss": 0.4227, "step": 9211 }, { "epoch": 1.4420788979336256, "grad_norm": 2.8178367614746094, "learning_rate": 4.2513586956521735e-05, "loss": 0.5742, "step": 9212 }, { "epoch": 1.4422354414527239, "grad_norm": 1.677110195159912, "learning_rate": 4.250169836956521e-05, "loss": 0.3569, "step": 9213 }, { "epoch": 1.442391984971822, "grad_norm": 2.931191921234131, "learning_rate": 4.248980978260869e-05, "loss": 1.0209, "step": 9214 }, { "epoch": 1.4425485284909205, "grad_norm": 3.638118028640747, "learning_rate": 4.2477921195652176e-05, "loss": 0.6172, "step": 9215 }, { "epoch": 1.4427050720100187, "grad_norm": 7.390195369720459, "learning_rate": 4.2466032608695654e-05, "loss": 0.6335, "step": 9216 }, { "epoch": 1.4428616155291172, "grad_norm": 3.0423927307128906, "learning_rate": 4.245414402173913e-05, "loss": 0.7574, "step": 9217 }, { "epoch": 1.4430181590482154, "grad_norm": 3.2088634967803955, "learning_rate": 4.2442255434782603e-05, "loss": 0.7106, "step": 9218 }, { "epoch": 1.4431747025673136, "grad_norm": 2.337602138519287, "learning_rate": 4.243036684782608e-05, "loss": 1.0731, "step": 9219 }, { "epoch": 1.443331246086412, "grad_norm": 1.5149165391921997, 
"learning_rate": 4.241847826086956e-05, "loss": 0.4227, "step": 9220 }, { "epoch": 1.4434877896055103, "grad_norm": 3.03322696685791, "learning_rate": 4.240658967391304e-05, "loss": 0.7081, "step": 9221 }, { "epoch": 1.4436443331246087, "grad_norm": 4.672436237335205, "learning_rate": 4.2394701086956516e-05, "loss": 0.9558, "step": 9222 }, { "epoch": 1.443800876643707, "grad_norm": 5.0354814529418945, "learning_rate": 4.2382812499999994e-05, "loss": 1.1181, "step": 9223 }, { "epoch": 1.4439574201628051, "grad_norm": 2.018890380859375, "learning_rate": 4.237092391304347e-05, "loss": 0.394, "step": 9224 }, { "epoch": 1.4441139636819036, "grad_norm": 2.3699162006378174, "learning_rate": 4.235903532608696e-05, "loss": 0.6803, "step": 9225 }, { "epoch": 1.4442705072010018, "grad_norm": 3.3726589679718018, "learning_rate": 4.2347146739130436e-05, "loss": 1.13, "step": 9226 }, { "epoch": 1.4444270507201002, "grad_norm": 1.8528469800949097, "learning_rate": 4.2335258152173914e-05, "loss": 0.7838, "step": 9227 }, { "epoch": 1.4445835942391985, "grad_norm": 3.0620346069335938, "learning_rate": 4.232336956521739e-05, "loss": 0.8592, "step": 9228 }, { "epoch": 1.4447401377582967, "grad_norm": 4.309928894042969, "learning_rate": 4.231148097826086e-05, "loss": 1.0652, "step": 9229 }, { "epoch": 1.4448966812773951, "grad_norm": 3.7714016437530518, "learning_rate": 4.229959239130434e-05, "loss": 1.8107, "step": 9230 }, { "epoch": 1.4450532247964936, "grad_norm": 2.4996418952941895, "learning_rate": 4.228770380434782e-05, "loss": 0.8447, "step": 9231 }, { "epoch": 1.4452097683155918, "grad_norm": 2.054933786392212, "learning_rate": 4.22758152173913e-05, "loss": 1.1339, "step": 9232 }, { "epoch": 1.44536631183469, "grad_norm": 2.344653844833374, "learning_rate": 4.2263926630434776e-05, "loss": 1.6074, "step": 9233 }, { "epoch": 1.4455228553537882, "grad_norm": 1.8327327966690063, "learning_rate": 4.2252038043478254e-05, "loss": 0.321, "step": 9234 }, { "epoch": 1.4456793988728867, 
"grad_norm": 2.3320887088775635, "learning_rate": 4.224014945652174e-05, "loss": 1.0759, "step": 9235 }, { "epoch": 1.445835942391985, "grad_norm": 3.815873622894287, "learning_rate": 4.222826086956522e-05, "loss": 0.5208, "step": 9236 }, { "epoch": 1.4459924859110833, "grad_norm": 3.284459352493286, "learning_rate": 4.2216372282608695e-05, "loss": 1.2254, "step": 9237 }, { "epoch": 1.4461490294301815, "grad_norm": 2.9030935764312744, "learning_rate": 4.2204483695652173e-05, "loss": 1.0332, "step": 9238 }, { "epoch": 1.44630557294928, "grad_norm": 0.6077360510826111, "learning_rate": 4.219259510869565e-05, "loss": 0.2402, "step": 9239 }, { "epoch": 1.4464621164683782, "grad_norm": 0.6503486037254333, "learning_rate": 4.218070652173913e-05, "loss": 0.1805, "step": 9240 }, { "epoch": 1.4466186599874766, "grad_norm": 0.977691650390625, "learning_rate": 4.21688179347826e-05, "loss": 0.3438, "step": 9241 }, { "epoch": 1.4467752035065748, "grad_norm": 0.4803237020969391, "learning_rate": 4.215692934782608e-05, "loss": 0.1703, "step": 9242 }, { "epoch": 1.446931747025673, "grad_norm": 0.8092516660690308, "learning_rate": 4.214504076086956e-05, "loss": 0.2498, "step": 9243 }, { "epoch": 1.4470882905447715, "grad_norm": 0.8318967819213867, "learning_rate": 4.2133152173913036e-05, "loss": 0.1999, "step": 9244 }, { "epoch": 1.4472448340638697, "grad_norm": 0.5364032983779907, "learning_rate": 4.212126358695652e-05, "loss": 0.1384, "step": 9245 }, { "epoch": 1.4474013775829682, "grad_norm": 0.7455703020095825, "learning_rate": 4.2109375e-05, "loss": 0.1791, "step": 9246 }, { "epoch": 1.4475579211020664, "grad_norm": 0.48112785816192627, "learning_rate": 4.209748641304348e-05, "loss": 0.2144, "step": 9247 }, { "epoch": 1.4477144646211646, "grad_norm": 0.9706409573554993, "learning_rate": 4.2085597826086955e-05, "loss": 0.3441, "step": 9248 }, { "epoch": 1.447871008140263, "grad_norm": 1.5317976474761963, "learning_rate": 4.207370923913043e-05, "loss": 0.2753, "step": 9249 }, { 
"epoch": 1.4480275516593613, "grad_norm": 0.8570466637611389, "learning_rate": 4.206182065217391e-05, "loss": 0.2007, "step": 9250 }, { "epoch": 1.4481840951784597, "grad_norm": 1.2393656969070435, "learning_rate": 4.204993206521739e-05, "loss": 0.3102, "step": 9251 }, { "epoch": 1.448340638697558, "grad_norm": 1.415632963180542, "learning_rate": 4.203804347826086e-05, "loss": 0.4836, "step": 9252 }, { "epoch": 1.4484971822166561, "grad_norm": 1.137580156326294, "learning_rate": 4.202615489130434e-05, "loss": 0.4061, "step": 9253 }, { "epoch": 1.4486537257357546, "grad_norm": 1.1664133071899414, "learning_rate": 4.201426630434782e-05, "loss": 0.3097, "step": 9254 }, { "epoch": 1.4488102692548528, "grad_norm": 0.8917604684829712, "learning_rate": 4.20023777173913e-05, "loss": 0.3041, "step": 9255 }, { "epoch": 1.4489668127739512, "grad_norm": 4.039106845855713, "learning_rate": 4.199048913043478e-05, "loss": 0.2538, "step": 9256 }, { "epoch": 1.4491233562930494, "grad_norm": 1.0779602527618408, "learning_rate": 4.197860054347826e-05, "loss": 0.3589, "step": 9257 }, { "epoch": 1.4492798998121477, "grad_norm": 1.8538914918899536, "learning_rate": 4.1966711956521737e-05, "loss": 0.5242, "step": 9258 }, { "epoch": 1.449436443331246, "grad_norm": 2.5399858951568604, "learning_rate": 4.1954823369565215e-05, "loss": 0.3637, "step": 9259 }, { "epoch": 1.4495929868503443, "grad_norm": 3.2761006355285645, "learning_rate": 4.194293478260869e-05, "loss": 0.4754, "step": 9260 }, { "epoch": 1.4497495303694428, "grad_norm": 3.527557849884033, "learning_rate": 4.193104619565217e-05, "loss": 0.498, "step": 9261 }, { "epoch": 1.449906073888541, "grad_norm": 1.8606067895889282, "learning_rate": 4.191915760869565e-05, "loss": 0.3821, "step": 9262 }, { "epoch": 1.4500626174076392, "grad_norm": 1.2119731903076172, "learning_rate": 4.190726902173913e-05, "loss": 0.3464, "step": 9263 }, { "epoch": 1.4502191609267376, "grad_norm": 2.641918897628784, "learning_rate": 4.18953804347826e-05, 
"loss": 0.6369, "step": 9264 }, { "epoch": 1.450375704445836, "grad_norm": 1.8989008665084839, "learning_rate": 4.1883491847826084e-05, "loss": 0.5973, "step": 9265 }, { "epoch": 1.4505322479649343, "grad_norm": 2.536806583404541, "learning_rate": 4.187160326086956e-05, "loss": 0.6178, "step": 9266 }, { "epoch": 1.4506887914840325, "grad_norm": 2.37931752204895, "learning_rate": 4.185971467391304e-05, "loss": 0.5162, "step": 9267 }, { "epoch": 1.4508453350031307, "grad_norm": 2.1299636363983154, "learning_rate": 4.184782608695652e-05, "loss": 0.6128, "step": 9268 }, { "epoch": 1.4510018785222292, "grad_norm": 2.6895463466644287, "learning_rate": 4.1835937499999996e-05, "loss": 0.5998, "step": 9269 }, { "epoch": 1.4511584220413276, "grad_norm": 2.8067667484283447, "learning_rate": 4.1824048913043474e-05, "loss": 0.6198, "step": 9270 }, { "epoch": 1.4513149655604258, "grad_norm": 2.739535093307495, "learning_rate": 4.181216032608695e-05, "loss": 0.8357, "step": 9271 }, { "epoch": 1.451471509079524, "grad_norm": 3.802837371826172, "learning_rate": 4.180027173913043e-05, "loss": 0.689, "step": 9272 }, { "epoch": 1.4516280525986225, "grad_norm": 8.556916236877441, "learning_rate": 4.178838315217391e-05, "loss": 0.8396, "step": 9273 }, { "epoch": 1.4517845961177207, "grad_norm": 2.3586015701293945, "learning_rate": 4.1776494565217394e-05, "loss": 0.6336, "step": 9274 }, { "epoch": 1.4519411396368191, "grad_norm": 6.363882541656494, "learning_rate": 4.1764605978260865e-05, "loss": 1.1137, "step": 9275 }, { "epoch": 1.4520976831559174, "grad_norm": 2.3410794734954834, "learning_rate": 4.175271739130434e-05, "loss": 1.2086, "step": 9276 }, { "epoch": 1.4522542266750156, "grad_norm": 1.6105388402938843, "learning_rate": 4.174082880434782e-05, "loss": 0.656, "step": 9277 }, { "epoch": 1.452410770194114, "grad_norm": 4.033339023590088, "learning_rate": 4.17289402173913e-05, "loss": 0.81, "step": 9278 }, { "epoch": 1.4525673137132122, "grad_norm": 2.9741451740264893, 
"learning_rate": 4.171705163043478e-05, "loss": 1.3133, "step": 9279 }, { "epoch": 1.4527238572323107, "grad_norm": 3.01710844039917, "learning_rate": 4.1705163043478256e-05, "loss": 0.7292, "step": 9280 }, { "epoch": 1.452880400751409, "grad_norm": 1.9682068824768066, "learning_rate": 4.1693274456521734e-05, "loss": 0.9193, "step": 9281 }, { "epoch": 1.4530369442705071, "grad_norm": 2.559218168258667, "learning_rate": 4.168138586956521e-05, "loss": 1.1918, "step": 9282 }, { "epoch": 1.4531934877896056, "grad_norm": 3.2282938957214355, "learning_rate": 4.166949728260869e-05, "loss": 1.2176, "step": 9283 }, { "epoch": 1.4533500313087038, "grad_norm": 2.223820209503174, "learning_rate": 4.1657608695652175e-05, "loss": 0.4478, "step": 9284 }, { "epoch": 1.4535065748278022, "grad_norm": 3.522545337677002, "learning_rate": 4.1645720108695654e-05, "loss": 0.7336, "step": 9285 }, { "epoch": 1.4536631183469004, "grad_norm": 1.8239401578903198, "learning_rate": 4.163383152173913e-05, "loss": 0.5427, "step": 9286 }, { "epoch": 1.4538196618659986, "grad_norm": 2.2960574626922607, "learning_rate": 4.16219429347826e-05, "loss": 0.6123, "step": 9287 }, { "epoch": 1.453976205385097, "grad_norm": 3.452686309814453, "learning_rate": 4.161005434782608e-05, "loss": 1.2685, "step": 9288 }, { "epoch": 1.4541327489041953, "grad_norm": 0.618181586265564, "learning_rate": 4.159816576086956e-05, "loss": 0.1672, "step": 9289 }, { "epoch": 1.4542892924232937, "grad_norm": 0.40843042731285095, "learning_rate": 4.158627717391304e-05, "loss": 0.1842, "step": 9290 }, { "epoch": 1.454445835942392, "grad_norm": 0.6443662643432617, "learning_rate": 4.1574388586956516e-05, "loss": 0.2539, "step": 9291 }, { "epoch": 1.4546023794614902, "grad_norm": 1.2634527683258057, "learning_rate": 4.1562499999999994e-05, "loss": 0.3054, "step": 9292 }, { "epoch": 1.4547589229805886, "grad_norm": 0.5770145058631897, "learning_rate": 4.155061141304347e-05, "loss": 0.2089, "step": 9293 }, { "epoch": 
1.4549154664996868, "grad_norm": 0.6120436787605286, "learning_rate": 4.153872282608696e-05, "loss": 0.229, "step": 9294 }, { "epoch": 1.4550720100187853, "grad_norm": 0.6481825113296509, "learning_rate": 4.1526834239130435e-05, "loss": 0.1225, "step": 9295 }, { "epoch": 1.4552285535378835, "grad_norm": 0.5140082836151123, "learning_rate": 4.151494565217391e-05, "loss": 0.2061, "step": 9296 }, { "epoch": 1.4553850970569817, "grad_norm": 0.9384800791740417, "learning_rate": 4.150305706521739e-05, "loss": 0.2559, "step": 9297 }, { "epoch": 1.4555416405760802, "grad_norm": 0.8294116854667664, "learning_rate": 4.149116847826086e-05, "loss": 0.1984, "step": 9298 }, { "epoch": 1.4556981840951786, "grad_norm": 0.5285692811012268, "learning_rate": 4.147927989130434e-05, "loss": 0.1809, "step": 9299 }, { "epoch": 1.4558547276142768, "grad_norm": 0.9651150703430176, "learning_rate": 4.146739130434782e-05, "loss": 0.2241, "step": 9300 }, { "epoch": 1.456011271133375, "grad_norm": 1.2486416101455688, "learning_rate": 4.14555027173913e-05, "loss": 0.4165, "step": 9301 }, { "epoch": 1.4561678146524735, "grad_norm": 1.132035493850708, "learning_rate": 4.1443614130434775e-05, "loss": 0.2671, "step": 9302 }, { "epoch": 1.4563243581715717, "grad_norm": 0.8505483269691467, "learning_rate": 4.1431725543478254e-05, "loss": 0.315, "step": 9303 }, { "epoch": 1.4564809016906701, "grad_norm": 1.1295266151428223, "learning_rate": 4.141983695652174e-05, "loss": 0.2847, "step": 9304 }, { "epoch": 1.4566374452097683, "grad_norm": 1.236594796180725, "learning_rate": 4.140794836956522e-05, "loss": 0.5735, "step": 9305 }, { "epoch": 1.4567939887288666, "grad_norm": 1.1938930749893188, "learning_rate": 4.1396059782608695e-05, "loss": 0.1805, "step": 9306 }, { "epoch": 1.456950532247965, "grad_norm": 1.490020513534546, "learning_rate": 4.138417119565217e-05, "loss": 0.5445, "step": 9307 }, { "epoch": 1.4571070757670632, "grad_norm": 2.8513307571411133, "learning_rate": 4.137228260869565e-05, 
"loss": 0.5543, "step": 9308 }, { "epoch": 1.4572636192861617, "grad_norm": 2.229954957962036, "learning_rate": 4.136039402173913e-05, "loss": 0.4465, "step": 9309 }, { "epoch": 1.4574201628052599, "grad_norm": 1.6528555154800415, "learning_rate": 4.13485054347826e-05, "loss": 0.6798, "step": 9310 }, { "epoch": 1.457576706324358, "grad_norm": 1.5599907636642456, "learning_rate": 4.133661684782608e-05, "loss": 0.4016, "step": 9311 }, { "epoch": 1.4577332498434565, "grad_norm": 1.696831226348877, "learning_rate": 4.132472826086956e-05, "loss": 0.6274, "step": 9312 }, { "epoch": 1.4578897933625548, "grad_norm": 2.6584725379943848, "learning_rate": 4.131283967391304e-05, "loss": 0.8115, "step": 9313 }, { "epoch": 1.4580463368816532, "grad_norm": 2.3619449138641357, "learning_rate": 4.130095108695652e-05, "loss": 0.537, "step": 9314 }, { "epoch": 1.4582028804007514, "grad_norm": 1.786773443222046, "learning_rate": 4.12890625e-05, "loss": 0.3521, "step": 9315 }, { "epoch": 1.4583594239198496, "grad_norm": 2.7383222579956055, "learning_rate": 4.1277173913043476e-05, "loss": 0.785, "step": 9316 }, { "epoch": 1.458515967438948, "grad_norm": 2.101667642593384, "learning_rate": 4.1265285326086954e-05, "loss": 0.8425, "step": 9317 }, { "epoch": 1.4586725109580463, "grad_norm": 3.3512697219848633, "learning_rate": 4.125339673913043e-05, "loss": 0.8625, "step": 9318 }, { "epoch": 1.4588290544771447, "grad_norm": 9.053346633911133, "learning_rate": 4.124150815217391e-05, "loss": 0.7364, "step": 9319 }, { "epoch": 1.458985597996243, "grad_norm": 2.2966580390930176, "learning_rate": 4.122961956521739e-05, "loss": 0.8431, "step": 9320 }, { "epoch": 1.4591421415153412, "grad_norm": 3.870370626449585, "learning_rate": 4.121773097826086e-05, "loss": 1.1005, "step": 9321 }, { "epoch": 1.4592986850344396, "grad_norm": 5.569665431976318, "learning_rate": 4.120584239130434e-05, "loss": 1.1258, "step": 9322 }, { "epoch": 1.4594552285535378, "grad_norm": 2.6250834465026855, "learning_rate": 
4.1193953804347823e-05, "loss": 0.8886, "step": 9323 }, { "epoch": 1.4596117720726363, "grad_norm": 3.503044366836548, "learning_rate": 4.11820652173913e-05, "loss": 0.761, "step": 9324 }, { "epoch": 1.4597683155917345, "grad_norm": 3.162754774093628, "learning_rate": 4.117017663043478e-05, "loss": 0.4857, "step": 9325 }, { "epoch": 1.4599248591108327, "grad_norm": 3.612436294555664, "learning_rate": 4.115828804347826e-05, "loss": 1.6981, "step": 9326 }, { "epoch": 1.4600814026299311, "grad_norm": 3.6811065673828125, "learning_rate": 4.1146399456521736e-05, "loss": 1.1021, "step": 9327 }, { "epoch": 1.4602379461490294, "grad_norm": 5.622432708740234, "learning_rate": 4.1134510869565214e-05, "loss": 0.9423, "step": 9328 }, { "epoch": 1.4603944896681278, "grad_norm": 2.2317519187927246, "learning_rate": 4.112262228260869e-05, "loss": 0.7962, "step": 9329 }, { "epoch": 1.460551033187226, "grad_norm": 4.5817060470581055, "learning_rate": 4.111073369565217e-05, "loss": 0.9588, "step": 9330 }, { "epoch": 1.4607075767063242, "grad_norm": 3.675532579421997, "learning_rate": 4.109884510869565e-05, "loss": 1.0655, "step": 9331 }, { "epoch": 1.4608641202254227, "grad_norm": 4.740599155426025, "learning_rate": 4.1086956521739134e-05, "loss": 1.1613, "step": 9332 }, { "epoch": 1.461020663744521, "grad_norm": 3.818514823913574, "learning_rate": 4.1075067934782605e-05, "loss": 1.1513, "step": 9333 }, { "epoch": 1.4611772072636193, "grad_norm": 1.4133578538894653, "learning_rate": 4.106317934782608e-05, "loss": 0.2477, "step": 9334 }, { "epoch": 1.4613337507827175, "grad_norm": 4.418375015258789, "learning_rate": 4.105129076086956e-05, "loss": 0.9319, "step": 9335 }, { "epoch": 1.461490294301816, "grad_norm": 1.8793584108352661, "learning_rate": 4.103940217391304e-05, "loss": 0.6682, "step": 9336 }, { "epoch": 1.4616468378209142, "grad_norm": 1.6928939819335938, "learning_rate": 4.102751358695652e-05, "loss": 0.3225, "step": 9337 }, { "epoch": 1.4618033813400126, "grad_norm": 
2.1861538887023926, "learning_rate": 4.1015624999999996e-05, "loss": 0.5667, "step": 9338 }, { "epoch": 1.4619599248591109, "grad_norm": 0.7148823738098145, "learning_rate": 4.1003736413043474e-05, "loss": 0.2686, "step": 9339 }, { "epoch": 1.462116468378209, "grad_norm": 0.7055800557136536, "learning_rate": 4.099184782608695e-05, "loss": 0.2407, "step": 9340 }, { "epoch": 1.4622730118973075, "grad_norm": 0.5067596435546875, "learning_rate": 4.097995923913043e-05, "loss": 0.1272, "step": 9341 }, { "epoch": 1.4624295554164057, "grad_norm": 0.6217328310012817, "learning_rate": 4.0968070652173915e-05, "loss": 0.2988, "step": 9342 }, { "epoch": 1.4625860989355042, "grad_norm": 0.9216888546943665, "learning_rate": 4.095618206521739e-05, "loss": 0.2471, "step": 9343 }, { "epoch": 1.4627426424546024, "grad_norm": 0.5592172741889954, "learning_rate": 4.0944293478260865e-05, "loss": 0.1511, "step": 9344 }, { "epoch": 1.4628991859737006, "grad_norm": 0.5715106129646301, "learning_rate": 4.093240489130434e-05, "loss": 0.2592, "step": 9345 }, { "epoch": 1.463055729492799, "grad_norm": 0.6069285273551941, "learning_rate": 4.092051630434782e-05, "loss": 0.2273, "step": 9346 }, { "epoch": 1.4632122730118973, "grad_norm": 1.0924493074417114, "learning_rate": 4.09086277173913e-05, "loss": 0.3228, "step": 9347 }, { "epoch": 1.4633688165309957, "grad_norm": 0.6951536536216736, "learning_rate": 4.089673913043478e-05, "loss": 0.2413, "step": 9348 }, { "epoch": 1.463525360050094, "grad_norm": 1.677202582359314, "learning_rate": 4.0884850543478255e-05, "loss": 0.3272, "step": 9349 }, { "epoch": 1.4636819035691921, "grad_norm": 2.394939661026001, "learning_rate": 4.0872961956521734e-05, "loss": 0.6397, "step": 9350 }, { "epoch": 1.4638384470882906, "grad_norm": 0.6866820454597473, "learning_rate": 4.086107336956521e-05, "loss": 0.178, "step": 9351 }, { "epoch": 1.4639949906073888, "grad_norm": 3.6389448642730713, "learning_rate": 4.08491847826087e-05, "loss": 0.2241, "step": 9352 }, { 
"epoch": 1.4641515341264872, "grad_norm": 1.7455090284347534, "learning_rate": 4.0837296195652175e-05, "loss": 0.2682, "step": 9353 }, { "epoch": 1.4643080776455855, "grad_norm": 2.760821580886841, "learning_rate": 4.082540760869565e-05, "loss": 0.4093, "step": 9354 }, { "epoch": 1.4644646211646837, "grad_norm": 1.5279215574264526, "learning_rate": 4.081351902173913e-05, "loss": 0.4677, "step": 9355 }, { "epoch": 1.4646211646837821, "grad_norm": 2.1355438232421875, "learning_rate": 4.08016304347826e-05, "loss": 0.406, "step": 9356 }, { "epoch": 1.4647777082028803, "grad_norm": 2.6984994411468506, "learning_rate": 4.078974184782608e-05, "loss": 0.4989, "step": 9357 }, { "epoch": 1.4649342517219788, "grad_norm": 1.1451126337051392, "learning_rate": 4.077785326086956e-05, "loss": 0.3938, "step": 9358 }, { "epoch": 1.465090795241077, "grad_norm": 2.0320394039154053, "learning_rate": 4.076596467391304e-05, "loss": 0.4199, "step": 9359 }, { "epoch": 1.4652473387601752, "grad_norm": 1.1019046306610107, "learning_rate": 4.0754076086956515e-05, "loss": 0.4027, "step": 9360 }, { "epoch": 1.4654038822792737, "grad_norm": 3.471359968185425, "learning_rate": 4.074218749999999e-05, "loss": 1.0002, "step": 9361 }, { "epoch": 1.4655604257983719, "grad_norm": 1.7441716194152832, "learning_rate": 4.073029891304348e-05, "loss": 0.146, "step": 9362 }, { "epoch": 1.4657169693174703, "grad_norm": 2.5213985443115234, "learning_rate": 4.0718410326086956e-05, "loss": 0.5179, "step": 9363 }, { "epoch": 1.4658735128365685, "grad_norm": 1.490350365638733, "learning_rate": 4.0706521739130435e-05, "loss": 0.3126, "step": 9364 }, { "epoch": 1.4660300563556667, "grad_norm": 2.2905631065368652, "learning_rate": 4.069463315217391e-05, "loss": 0.6469, "step": 9365 }, { "epoch": 1.4661865998747652, "grad_norm": 2.0713374614715576, "learning_rate": 4.068274456521739e-05, "loss": 0.6304, "step": 9366 }, { "epoch": 1.4663431433938636, "grad_norm": 3.7919654846191406, "learning_rate": 
4.067085597826086e-05, "loss": 0.4692, "step": 9367 }, { "epoch": 1.4664996869129618, "grad_norm": 1.638966679573059, "learning_rate": 4.065896739130434e-05, "loss": 0.3005, "step": 9368 }, { "epoch": 1.46665623043206, "grad_norm": 1.4711403846740723, "learning_rate": 4.064707880434782e-05, "loss": 0.4943, "step": 9369 }, { "epoch": 1.4668127739511585, "grad_norm": 4.947766304016113, "learning_rate": 4.06351902173913e-05, "loss": 0.7023, "step": 9370 }, { "epoch": 1.4669693174702567, "grad_norm": 3.28539776802063, "learning_rate": 4.0623301630434775e-05, "loss": 0.9344, "step": 9371 }, { "epoch": 1.4671258609893552, "grad_norm": 2.8735368251800537, "learning_rate": 4.061141304347826e-05, "loss": 0.445, "step": 9372 }, { "epoch": 1.4672824045084534, "grad_norm": 4.057353973388672, "learning_rate": 4.059952445652174e-05, "loss": 0.6397, "step": 9373 }, { "epoch": 1.4674389480275516, "grad_norm": 4.593616008758545, "learning_rate": 4.0587635869565216e-05, "loss": 0.8497, "step": 9374 }, { "epoch": 1.46759549154665, "grad_norm": 5.4910078048706055, "learning_rate": 4.0575747282608694e-05, "loss": 0.6703, "step": 9375 }, { "epoch": 1.4677520350657483, "grad_norm": 34.594276428222656, "learning_rate": 4.056385869565217e-05, "loss": 0.7382, "step": 9376 }, { "epoch": 1.4679085785848467, "grad_norm": 4.031030654907227, "learning_rate": 4.055197010869565e-05, "loss": 1.1941, "step": 9377 }, { "epoch": 1.468065122103945, "grad_norm": 2.0943875312805176, "learning_rate": 4.054008152173913e-05, "loss": 0.8037, "step": 9378 }, { "epoch": 1.4682216656230431, "grad_norm": 2.4169909954071045, "learning_rate": 4.05281929347826e-05, "loss": 0.9728, "step": 9379 }, { "epoch": 1.4683782091421416, "grad_norm": 2.4582362174987793, "learning_rate": 4.051630434782608e-05, "loss": 1.4343, "step": 9380 }, { "epoch": 1.4685347526612398, "grad_norm": 3.602802038192749, "learning_rate": 4.0504415760869556e-05, "loss": 1.0631, "step": 9381 }, { "epoch": 1.4686912961803382, "grad_norm": 
2.989946126937866, "learning_rate": 4.049252717391304e-05, "loss": 1.0501, "step": 9382 }, { "epoch": 1.4688478396994364, "grad_norm": 2.9404006004333496, "learning_rate": 4.048063858695652e-05, "loss": 1.3516, "step": 9383 }, { "epoch": 1.4690043832185347, "grad_norm": 2.718142032623291, "learning_rate": 4.046875e-05, "loss": 1.2199, "step": 9384 }, { "epoch": 1.469160926737633, "grad_norm": 5.932255268096924, "learning_rate": 4.0456861413043476e-05, "loss": 1.3792, "step": 9385 }, { "epoch": 1.4693174702567313, "grad_norm": 1.060914397239685, "learning_rate": 4.0444972826086954e-05, "loss": 0.6098, "step": 9386 }, { "epoch": 1.4694740137758298, "grad_norm": 4.7429633140563965, "learning_rate": 4.043308423913043e-05, "loss": 1.1245, "step": 9387 }, { "epoch": 1.469630557294928, "grad_norm": 1.8404436111450195, "learning_rate": 4.042119565217391e-05, "loss": 0.6173, "step": 9388 }, { "epoch": 1.4697871008140262, "grad_norm": 0.7180817723274231, "learning_rate": 4.040930706521739e-05, "loss": 0.2141, "step": 9389 }, { "epoch": 1.4699436443331246, "grad_norm": 0.40575212240219116, "learning_rate": 4.039741847826086e-05, "loss": 0.1536, "step": 9390 }, { "epoch": 1.4701001878522229, "grad_norm": 1.6212108135223389, "learning_rate": 4.038552989130434e-05, "loss": 0.2285, "step": 9391 }, { "epoch": 1.4702567313713213, "grad_norm": 0.5989220142364502, "learning_rate": 4.037364130434782e-05, "loss": 0.2385, "step": 9392 }, { "epoch": 1.4704132748904195, "grad_norm": 0.6037298440933228, "learning_rate": 4.03617527173913e-05, "loss": 0.194, "step": 9393 }, { "epoch": 1.4705698184095177, "grad_norm": 2.1920006275177, "learning_rate": 4.034986413043478e-05, "loss": 0.2074, "step": 9394 }, { "epoch": 1.4707263619286162, "grad_norm": 0.8084788918495178, "learning_rate": 4.033797554347826e-05, "loss": 0.257, "step": 9395 }, { "epoch": 1.4708829054477144, "grad_norm": 1.1382107734680176, "learning_rate": 4.0326086956521736e-05, "loss": 0.2682, "step": 9396 }, { "epoch": 
1.4710394489668128, "grad_norm": 0.9565906524658203, "learning_rate": 4.0314198369565214e-05, "loss": 0.2182, "step": 9397 }, { "epoch": 1.471195992485911, "grad_norm": 0.6544360518455505, "learning_rate": 4.030230978260869e-05, "loss": 0.1526, "step": 9398 }, { "epoch": 1.4713525360050093, "grad_norm": 0.8631836771965027, "learning_rate": 4.029042119565217e-05, "loss": 0.3657, "step": 9399 }, { "epoch": 1.4715090795241077, "grad_norm": 1.726982831954956, "learning_rate": 4.0278532608695655e-05, "loss": 0.2251, "step": 9400 }, { "epoch": 1.4716656230432061, "grad_norm": 1.15215003490448, "learning_rate": 4.026664402173913e-05, "loss": 0.2836, "step": 9401 }, { "epoch": 1.4718221665623044, "grad_norm": 1.3201156854629517, "learning_rate": 4.0254755434782605e-05, "loss": 0.201, "step": 9402 }, { "epoch": 1.4719787100814026, "grad_norm": 0.7439080476760864, "learning_rate": 4.024286684782608e-05, "loss": 0.1971, "step": 9403 }, { "epoch": 1.472135253600501, "grad_norm": 1.3644425868988037, "learning_rate": 4.023097826086956e-05, "loss": 0.3559, "step": 9404 }, { "epoch": 1.4722917971195992, "grad_norm": 0.9179037809371948, "learning_rate": 4.021908967391304e-05, "loss": 0.3344, "step": 9405 }, { "epoch": 1.4724483406386977, "grad_norm": 1.9183275699615479, "learning_rate": 4.020720108695652e-05, "loss": 0.4171, "step": 9406 }, { "epoch": 1.472604884157796, "grad_norm": 2.077299118041992, "learning_rate": 4.0195312499999995e-05, "loss": 0.4436, "step": 9407 }, { "epoch": 1.472761427676894, "grad_norm": 1.1448837518692017, "learning_rate": 4.0183423913043473e-05, "loss": 0.4185, "step": 9408 }, { "epoch": 1.4729179711959925, "grad_norm": 1.401747465133667, "learning_rate": 4.017153532608695e-05, "loss": 0.3503, "step": 9409 }, { "epoch": 1.4730745147150908, "grad_norm": 2.4912946224212646, "learning_rate": 4.0159646739130437e-05, "loss": 0.4459, "step": 9410 }, { "epoch": 1.4732310582341892, "grad_norm": 1.609484076499939, "learning_rate": 4.0147758152173915e-05, 
"loss": 0.509, "step": 9411 }, { "epoch": 1.4733876017532874, "grad_norm": 1.9134795665740967, "learning_rate": 4.013586956521739e-05, "loss": 0.4679, "step": 9412 }, { "epoch": 1.4735441452723856, "grad_norm": 0.9143208265304565, "learning_rate": 4.0123980978260864e-05, "loss": 0.2503, "step": 9413 }, { "epoch": 1.473700688791484, "grad_norm": 2.0061326026916504, "learning_rate": 4.011209239130434e-05, "loss": 0.7248, "step": 9414 }, { "epoch": 1.4738572323105823, "grad_norm": 0.7063813805580139, "learning_rate": 4.010020380434782e-05, "loss": 0.2534, "step": 9415 }, { "epoch": 1.4740137758296807, "grad_norm": 3.0217642784118652, "learning_rate": 4.00883152173913e-05, "loss": 0.7656, "step": 9416 }, { "epoch": 1.474170319348779, "grad_norm": 1.827446699142456, "learning_rate": 4.007642663043478e-05, "loss": 0.544, "step": 9417 }, { "epoch": 1.4743268628678772, "grad_norm": 1.99081552028656, "learning_rate": 4.0064538043478255e-05, "loss": 0.5879, "step": 9418 }, { "epoch": 1.4744834063869756, "grad_norm": 2.1253626346588135, "learning_rate": 4.005264945652173e-05, "loss": 0.3772, "step": 9419 }, { "epoch": 1.4746399499060738, "grad_norm": 2.4993536472320557, "learning_rate": 4.004076086956522e-05, "loss": 0.4462, "step": 9420 }, { "epoch": 1.4747964934251723, "grad_norm": 2.2094485759735107, "learning_rate": 4.0028872282608696e-05, "loss": 0.7092, "step": 9421 }, { "epoch": 1.4749530369442705, "grad_norm": 3.395089626312256, "learning_rate": 4.0016983695652174e-05, "loss": 0.8416, "step": 9422 }, { "epoch": 1.4751095804633687, "grad_norm": 3.315373659133911, "learning_rate": 4.000509510869565e-05, "loss": 1.204, "step": 9423 }, { "epoch": 1.4752661239824671, "grad_norm": 2.0389113426208496, "learning_rate": 3.999320652173913e-05, "loss": 0.7041, "step": 9424 }, { "epoch": 1.4754226675015654, "grad_norm": 3.9114458560943604, "learning_rate": 3.99813179347826e-05, "loss": 0.8191, "step": 9425 }, { "epoch": 1.4755792110206638, "grad_norm": 3.2570273876190186, 
"learning_rate": 3.996942934782608e-05, "loss": 0.9994, "step": 9426 }, { "epoch": 1.475735754539762, "grad_norm": 5.690246105194092, "learning_rate": 3.995754076086956e-05, "loss": 0.8478, "step": 9427 }, { "epoch": 1.4758922980588602, "grad_norm": 3.9936397075653076, "learning_rate": 3.9945652173913037e-05, "loss": 0.9616, "step": 9428 }, { "epoch": 1.4760488415779587, "grad_norm": 1.9564018249511719, "learning_rate": 3.9933763586956515e-05, "loss": 0.9712, "step": 9429 }, { "epoch": 1.476205385097057, "grad_norm": 2.702277421951294, "learning_rate": 3.9921875e-05, "loss": 1.601, "step": 9430 }, { "epoch": 1.4763619286161553, "grad_norm": 1.8299083709716797, "learning_rate": 3.990998641304348e-05, "loss": 1.212, "step": 9431 }, { "epoch": 1.4765184721352536, "grad_norm": 2.7829902172088623, "learning_rate": 3.9898097826086956e-05, "loss": 1.1946, "step": 9432 }, { "epoch": 1.4766750156543518, "grad_norm": 1.6911497116088867, "learning_rate": 3.9886209239130434e-05, "loss": 0.8398, "step": 9433 }, { "epoch": 1.4768315591734502, "grad_norm": 2.550330400466919, "learning_rate": 3.987432065217391e-05, "loss": 0.5809, "step": 9434 }, { "epoch": 1.4769881026925487, "grad_norm": 2.150177001953125, "learning_rate": 3.986243206521739e-05, "loss": 0.4914, "step": 9435 }, { "epoch": 1.4771446462116469, "grad_norm": 4.958688259124756, "learning_rate": 3.985054347826086e-05, "loss": 0.8943, "step": 9436 }, { "epoch": 1.477301189730745, "grad_norm": 4.1965508460998535, "learning_rate": 3.983865489130434e-05, "loss": 1.3866, "step": 9437 }, { "epoch": 1.4774577332498435, "grad_norm": 2.3914635181427, "learning_rate": 3.982676630434782e-05, "loss": 0.946, "step": 9438 }, { "epoch": 1.4776142767689417, "grad_norm": 0.5662755966186523, "learning_rate": 3.9814877717391296e-05, "loss": 0.2353, "step": 9439 }, { "epoch": 1.4777708202880402, "grad_norm": 4.68065071105957, "learning_rate": 3.980298913043478e-05, "loss": 0.308, "step": 9440 }, { "epoch": 1.4779273638071384, "grad_norm": 
0.5492413640022278, "learning_rate": 3.979110054347826e-05, "loss": 0.1524, "step": 9441 }, { "epoch": 1.4780839073262366, "grad_norm": 0.6284293532371521, "learning_rate": 3.977921195652174e-05, "loss": 0.3089, "step": 9442 }, { "epoch": 1.478240450845335, "grad_norm": 0.5615115761756897, "learning_rate": 3.9767323369565216e-05, "loss": 0.1344, "step": 9443 }, { "epoch": 1.4783969943644333, "grad_norm": 0.6574634909629822, "learning_rate": 3.9755434782608694e-05, "loss": 0.2552, "step": 9444 }, { "epoch": 1.4785535378835317, "grad_norm": 0.8343060612678528, "learning_rate": 3.974354619565217e-05, "loss": 0.3001, "step": 9445 }, { "epoch": 1.47871008140263, "grad_norm": 3.2556328773498535, "learning_rate": 3.973165760869565e-05, "loss": 0.2801, "step": 9446 }, { "epoch": 1.4788666249217282, "grad_norm": 1.0068053007125854, "learning_rate": 3.971976902173913e-05, "loss": 0.4163, "step": 9447 }, { "epoch": 1.4790231684408266, "grad_norm": 0.7881961464881897, "learning_rate": 3.97078804347826e-05, "loss": 0.2465, "step": 9448 }, { "epoch": 1.4791797119599248, "grad_norm": 0.6545819044113159, "learning_rate": 3.969599184782608e-05, "loss": 0.1727, "step": 9449 }, { "epoch": 1.4793362554790233, "grad_norm": 0.933887779712677, "learning_rate": 3.968410326086956e-05, "loss": 0.345, "step": 9450 }, { "epoch": 1.4794927989981215, "grad_norm": 1.3929316997528076, "learning_rate": 3.967221467391304e-05, "loss": 0.3881, "step": 9451 }, { "epoch": 1.4796493425172197, "grad_norm": 0.973512589931488, "learning_rate": 3.966032608695652e-05, "loss": 0.2329, "step": 9452 }, { "epoch": 1.4798058860363181, "grad_norm": 1.2121917009353638, "learning_rate": 3.96484375e-05, "loss": 0.3309, "step": 9453 }, { "epoch": 1.4799624295554163, "grad_norm": 1.1499638557434082, "learning_rate": 3.9636548913043475e-05, "loss": 0.5216, "step": 9454 }, { "epoch": 1.4801189730745148, "grad_norm": 0.8778674006462097, "learning_rate": 3.9624660326086954e-05, "loss": 0.3882, "step": 9455 }, { "epoch": 
1.480275516593613, "grad_norm": 1.9793564081192017, "learning_rate": 3.961277173913043e-05, "loss": 0.4474, "step": 9456 }, { "epoch": 1.4804320601127112, "grad_norm": 2.2396252155303955, "learning_rate": 3.960088315217391e-05, "loss": 0.3982, "step": 9457 }, { "epoch": 1.4805886036318097, "grad_norm": 1.3401989936828613, "learning_rate": 3.958899456521739e-05, "loss": 0.4313, "step": 9458 }, { "epoch": 1.4807451471509079, "grad_norm": 0.7481887936592102, "learning_rate": 3.957710597826086e-05, "loss": 0.237, "step": 9459 }, { "epoch": 1.4809016906700063, "grad_norm": 2.32619309425354, "learning_rate": 3.9565217391304344e-05, "loss": 0.562, "step": 9460 }, { "epoch": 1.4810582341891045, "grad_norm": 1.5739473104476929, "learning_rate": 3.955332880434782e-05, "loss": 0.4584, "step": 9461 }, { "epoch": 1.4812147777082028, "grad_norm": 3.010822057723999, "learning_rate": 3.95414402173913e-05, "loss": 0.5414, "step": 9462 }, { "epoch": 1.4813713212273012, "grad_norm": 1.1214537620544434, "learning_rate": 3.952955163043478e-05, "loss": 0.3232, "step": 9463 }, { "epoch": 1.4815278647463996, "grad_norm": 2.5215132236480713, "learning_rate": 3.951766304347826e-05, "loss": 0.4061, "step": 9464 }, { "epoch": 1.4816844082654979, "grad_norm": 2.4723825454711914, "learning_rate": 3.9505774456521735e-05, "loss": 0.3092, "step": 9465 }, { "epoch": 1.481840951784596, "grad_norm": 2.1081998348236084, "learning_rate": 3.949388586956521e-05, "loss": 0.6226, "step": 9466 }, { "epoch": 1.4819974953036943, "grad_norm": 2.054635524749756, "learning_rate": 3.948199728260869e-05, "loss": 0.6765, "step": 9467 }, { "epoch": 1.4821540388227927, "grad_norm": 1.8563541173934937, "learning_rate": 3.947010869565217e-05, "loss": 0.5824, "step": 9468 }, { "epoch": 1.4823105823418912, "grad_norm": 1.676539659500122, "learning_rate": 3.9458220108695655e-05, "loss": 0.4279, "step": 9469 }, { "epoch": 1.4824671258609894, "grad_norm": 1.672682285308838, "learning_rate": 3.944633152173913e-05, "loss": 
0.5303, "step": 9470 }, { "epoch": 1.4826236693800876, "grad_norm": 6.7541093826293945, "learning_rate": 3.9434442934782604e-05, "loss": 1.2653, "step": 9471 }, { "epoch": 1.482780212899186, "grad_norm": 2.0551276206970215, "learning_rate": 3.942255434782608e-05, "loss": 0.9289, "step": 9472 }, { "epoch": 1.4829367564182843, "grad_norm": 2.6676619052886963, "learning_rate": 3.941066576086956e-05, "loss": 0.7336, "step": 9473 }, { "epoch": 1.4830932999373827, "grad_norm": 2.4071037769317627, "learning_rate": 3.939877717391304e-05, "loss": 0.5444, "step": 9474 }, { "epoch": 1.483249843456481, "grad_norm": 3.406665325164795, "learning_rate": 3.938688858695652e-05, "loss": 1.0701, "step": 9475 }, { "epoch": 1.4834063869755791, "grad_norm": 3.359962224960327, "learning_rate": 3.9374999999999995e-05, "loss": 0.9748, "step": 9476 }, { "epoch": 1.4835629304946776, "grad_norm": 2.9134137630462646, "learning_rate": 3.936311141304347e-05, "loss": 0.847, "step": 9477 }, { "epoch": 1.4837194740137758, "grad_norm": 1.783878207206726, "learning_rate": 3.935122282608695e-05, "loss": 0.6114, "step": 9478 }, { "epoch": 1.4838760175328742, "grad_norm": 2.9464223384857178, "learning_rate": 3.9339334239130436e-05, "loss": 0.9835, "step": 9479 }, { "epoch": 1.4840325610519725, "grad_norm": 2.2161056995391846, "learning_rate": 3.9327445652173914e-05, "loss": 1.131, "step": 9480 }, { "epoch": 1.4841891045710707, "grad_norm": 2.599893569946289, "learning_rate": 3.931555706521739e-05, "loss": 1.0773, "step": 9481 }, { "epoch": 1.4843456480901691, "grad_norm": 1.8975387811660767, "learning_rate": 3.9303668478260864e-05, "loss": 0.7381, "step": 9482 }, { "epoch": 1.4845021916092673, "grad_norm": 6.841968059539795, "learning_rate": 3.929177989130434e-05, "loss": 1.1832, "step": 9483 }, { "epoch": 1.4846587351283658, "grad_norm": 3.032521963119507, "learning_rate": 3.927989130434782e-05, "loss": 1.1721, "step": 9484 }, { "epoch": 1.484815278647464, "grad_norm": 2.151503324508667, 
"learning_rate": 3.92680027173913e-05, "loss": 0.3237, "step": 9485 }, { "epoch": 1.4849718221665622, "grad_norm": 2.6575815677642822, "learning_rate": 3.9256114130434776e-05, "loss": 0.9869, "step": 9486 }, { "epoch": 1.4851283656856606, "grad_norm": 3.9065473079681396, "learning_rate": 3.9244225543478255e-05, "loss": 1.3048, "step": 9487 }, { "epoch": 1.4852849092047589, "grad_norm": 3.682189702987671, "learning_rate": 3.923233695652173e-05, "loss": 0.8804, "step": 9488 }, { "epoch": 1.4854414527238573, "grad_norm": 0.516037106513977, "learning_rate": 3.922044836956522e-05, "loss": 0.1853, "step": 9489 }, { "epoch": 1.4855979962429555, "grad_norm": 0.3704691529273987, "learning_rate": 3.9208559782608696e-05, "loss": 0.1867, "step": 9490 }, { "epoch": 1.4857545397620537, "grad_norm": 0.486287921667099, "learning_rate": 3.9196671195652174e-05, "loss": 0.172, "step": 9491 }, { "epoch": 1.4859110832811522, "grad_norm": 0.9240738749504089, "learning_rate": 3.918478260869565e-05, "loss": 0.2413, "step": 9492 }, { "epoch": 1.4860676268002504, "grad_norm": 1.1277118921279907, "learning_rate": 3.917289402173913e-05, "loss": 0.3273, "step": 9493 }, { "epoch": 1.4862241703193488, "grad_norm": 0.5349644422531128, "learning_rate": 3.91610054347826e-05, "loss": 0.1839, "step": 9494 }, { "epoch": 1.486380713838447, "grad_norm": 1.6806374788284302, "learning_rate": 3.914911684782608e-05, "loss": 0.3119, "step": 9495 }, { "epoch": 1.4865372573575453, "grad_norm": 4.338770866394043, "learning_rate": 3.913722826086956e-05, "loss": 0.7814, "step": 9496 }, { "epoch": 1.4866938008766437, "grad_norm": 0.8507847189903259, "learning_rate": 3.9125339673913036e-05, "loss": 0.2135, "step": 9497 }, { "epoch": 1.4868503443957422, "grad_norm": 2.003952741622925, "learning_rate": 3.9113451086956514e-05, "loss": 0.3623, "step": 9498 }, { "epoch": 1.4870068879148404, "grad_norm": 0.9231337904930115, "learning_rate": 3.91015625e-05, "loss": 0.2323, "step": 9499 }, { "epoch": 1.4871634314339386, 
"grad_norm": 0.7572725415229797, "learning_rate": 3.908967391304348e-05, "loss": 0.1469, "step": 9500 }, { "epoch": 1.487319974953037, "grad_norm": 1.0854719877243042, "learning_rate": 3.9077785326086956e-05, "loss": 0.3559, "step": 9501 }, { "epoch": 1.4874765184721352, "grad_norm": 0.7236939668655396, "learning_rate": 3.9065896739130434e-05, "loss": 0.2562, "step": 9502 }, { "epoch": 1.4876330619912337, "grad_norm": null, "learning_rate": 3.9065896739130434e-05, "loss": 0.0, "step": 9503 }, { "epoch": 1.487789605510332, "grad_norm": 0.7916626334190369, "learning_rate": 3.905400815217391e-05, "loss": 0.3921, "step": 9504 }, { "epoch": 1.4879461490294301, "grad_norm": 1.7659244537353516, "learning_rate": 3.904211956521739e-05, "loss": 0.3589, "step": 9505 }, { "epoch": 1.4881026925485286, "grad_norm": 1.1041806936264038, "learning_rate": 3.903023097826086e-05, "loss": 0.3748, "step": 9506 }, { "epoch": 1.4882592360676268, "grad_norm": 1.2529082298278809, "learning_rate": 3.901834239130434e-05, "loss": 0.3721, "step": 9507 }, { "epoch": 1.4884157795867252, "grad_norm": 2.2092816829681396, "learning_rate": 3.900645380434782e-05, "loss": 0.8355, "step": 9508 }, { "epoch": 1.4885723231058234, "grad_norm": 1.3431447744369507, "learning_rate": 3.89945652173913e-05, "loss": 0.4333, "step": 9509 }, { "epoch": 1.4887288666249217, "grad_norm": 1.1871147155761719, "learning_rate": 3.898267663043478e-05, "loss": 0.3661, "step": 9510 }, { "epoch": 1.48888541014402, "grad_norm": 2.297050952911377, "learning_rate": 3.897078804347826e-05, "loss": 0.428, "step": 9511 }, { "epoch": 1.4890419536631183, "grad_norm": 1.9566843509674072, "learning_rate": 3.895889945652174e-05, "loss": 0.4259, "step": 9512 }, { "epoch": 1.4891984971822168, "grad_norm": 2.781750440597534, "learning_rate": 3.8947010869565215e-05, "loss": 0.8125, "step": 9513 }, { "epoch": 1.489355040701315, "grad_norm": 2.1481595039367676, "learning_rate": 3.8935122282608693e-05, "loss": 0.6376, "step": 9514 }, { "epoch": 
1.4895115842204132, "grad_norm": 3.1732354164123535, "learning_rate": 3.892323369565217e-05, "loss": 0.6714, "step": 9515 }, { "epoch": 1.4896681277395116, "grad_norm": 2.1760833263397217, "learning_rate": 3.891134510869565e-05, "loss": 0.6725, "step": 9516 }, { "epoch": 1.4898246712586098, "grad_norm": 1.254492163658142, "learning_rate": 3.889945652173913e-05, "loss": 0.425, "step": 9517 }, { "epoch": 1.4899812147777083, "grad_norm": 2.0868799686431885, "learning_rate": 3.88875679347826e-05, "loss": 0.5412, "step": 9518 }, { "epoch": 1.4901377582968065, "grad_norm": 1.9811334609985352, "learning_rate": 3.8875679347826084e-05, "loss": 0.7319, "step": 9519 }, { "epoch": 1.4902943018159047, "grad_norm": 2.5351500511169434, "learning_rate": 3.886379076086956e-05, "loss": 1.1141, "step": 9520 }, { "epoch": 1.4904508453350032, "grad_norm": 2.600659132003784, "learning_rate": 3.885190217391304e-05, "loss": 0.9749, "step": 9521 }, { "epoch": 1.4906073888541014, "grad_norm": 3.001845121383667, "learning_rate": 3.884001358695652e-05, "loss": 1.0273, "step": 9522 }, { "epoch": 1.4907639323731998, "grad_norm": 1.9187536239624023, "learning_rate": 3.8828125e-05, "loss": 0.6172, "step": 9523 }, { "epoch": 1.490920475892298, "grad_norm": 1.5613077878952026, "learning_rate": 3.8816236413043475e-05, "loss": 0.5434, "step": 9524 }, { "epoch": 1.4910770194113963, "grad_norm": 3.013624429702759, "learning_rate": 3.880434782608695e-05, "loss": 1.2798, "step": 9525 }, { "epoch": 1.4912335629304947, "grad_norm": 2.3315305709838867, "learning_rate": 3.879245923913043e-05, "loss": 0.3813, "step": 9526 }, { "epoch": 1.491390106449593, "grad_norm": 2.187302350997925, "learning_rate": 3.878057065217391e-05, "loss": 0.8959, "step": 9527 }, { "epoch": 1.4915466499686914, "grad_norm": 2.2327959537506104, "learning_rate": 3.8768682065217394e-05, "loss": 0.8484, "step": 9528 }, { "epoch": 1.4917031934877896, "grad_norm": 3.025864839553833, "learning_rate": 3.8756793478260866e-05, "loss": 0.8613, 
"step": 9529 }, { "epoch": 1.4918597370068878, "grad_norm": 3.019212484359741, "learning_rate": 3.8744904891304344e-05, "loss": 0.7101, "step": 9530 }, { "epoch": 1.4920162805259862, "grad_norm": 3.021777629852295, "learning_rate": 3.873301630434782e-05, "loss": 1.2302, "step": 9531 }, { "epoch": 1.4921728240450847, "grad_norm": 4.007408142089844, "learning_rate": 3.87211277173913e-05, "loss": 1.5111, "step": 9532 }, { "epoch": 1.4923293675641829, "grad_norm": 2.937150239944458, "learning_rate": 3.870923913043478e-05, "loss": 0.947, "step": 9533 }, { "epoch": 1.492485911083281, "grad_norm": 2.662252187728882, "learning_rate": 3.8697350543478257e-05, "loss": 0.9493, "step": 9534 }, { "epoch": 1.4926424546023795, "grad_norm": 1.5096527338027954, "learning_rate": 3.8685461956521735e-05, "loss": 0.1697, "step": 9535 }, { "epoch": 1.4927989981214778, "grad_norm": 2.1372568607330322, "learning_rate": 3.867357336956521e-05, "loss": 0.6742, "step": 9536 }, { "epoch": 1.4929555416405762, "grad_norm": 1.2701501846313477, "learning_rate": 3.866168478260869e-05, "loss": 0.5536, "step": 9537 }, { "epoch": 1.4931120851596744, "grad_norm": 1.3305944204330444, "learning_rate": 3.8649796195652176e-05, "loss": 0.2906, "step": 9538 }, { "epoch": 1.4932686286787726, "grad_norm": 0.6295035481452942, "learning_rate": 3.8637907608695654e-05, "loss": 0.2124, "step": 9539 }, { "epoch": 1.493425172197871, "grad_norm": 0.4126646816730499, "learning_rate": 3.862601902173913e-05, "loss": 0.1981, "step": 9540 }, { "epoch": 1.4935817157169693, "grad_norm": 0.7015737891197205, "learning_rate": 3.8614130434782604e-05, "loss": 0.1994, "step": 9541 }, { "epoch": 1.4937382592360677, "grad_norm": 0.7511310577392578, "learning_rate": 3.860224184782608e-05, "loss": 0.3062, "step": 9542 }, { "epoch": 1.493894802755166, "grad_norm": 0.9559282660484314, "learning_rate": 3.859035326086956e-05, "loss": 0.2285, "step": 9543 }, { "epoch": 1.4940513462742642, "grad_norm": 0.6519678235054016, "learning_rate": 
3.857846467391304e-05, "loss": 0.1984, "step": 9544 }, { "epoch": 1.4942078897933626, "grad_norm": 0.8470036387443542, "learning_rate": 3.8566576086956516e-05, "loss": 0.22, "step": 9545 }, { "epoch": 1.4943644333124608, "grad_norm": 0.7212802171707153, "learning_rate": 3.8554687499999994e-05, "loss": 0.213, "step": 9546 }, { "epoch": 1.4945209768315593, "grad_norm": 1.0826085805892944, "learning_rate": 3.854279891304347e-05, "loss": 0.2481, "step": 9547 }, { "epoch": 1.4946775203506575, "grad_norm": 0.8791028261184692, "learning_rate": 3.853091032608696e-05, "loss": 0.4654, "step": 9548 }, { "epoch": 1.4948340638697557, "grad_norm": 0.9437376260757446, "learning_rate": 3.8519021739130436e-05, "loss": 0.2495, "step": 9549 }, { "epoch": 1.4949906073888541, "grad_norm": 0.7300267219543457, "learning_rate": 3.8507133152173914e-05, "loss": 0.4172, "step": 9550 }, { "epoch": 1.4951471509079524, "grad_norm": 0.9978669285774231, "learning_rate": 3.849524456521739e-05, "loss": 0.3658, "step": 9551 }, { "epoch": 1.4953036944270508, "grad_norm": 2.069585084915161, "learning_rate": 3.848335597826086e-05, "loss": 0.5207, "step": 9552 }, { "epoch": 1.495460237946149, "grad_norm": 0.9403184652328491, "learning_rate": 3.847146739130434e-05, "loss": 0.289, "step": 9553 }, { "epoch": 1.4956167814652472, "grad_norm": 1.1953210830688477, "learning_rate": 3.845957880434782e-05, "loss": 0.3281, "step": 9554 }, { "epoch": 1.4957733249843457, "grad_norm": 1.3541394472122192, "learning_rate": 3.84476902173913e-05, "loss": 0.3841, "step": 9555 }, { "epoch": 1.495929868503444, "grad_norm": 0.9867404699325562, "learning_rate": 3.8435801630434776e-05, "loss": 0.3078, "step": 9556 }, { "epoch": 1.4960864120225423, "grad_norm": 1.3466813564300537, "learning_rate": 3.8423913043478254e-05, "loss": 0.3453, "step": 9557 }, { "epoch": 1.4962429555416406, "grad_norm": 1.0061264038085938, "learning_rate": 3.841202445652174e-05, "loss": 0.2551, "step": 9558 }, { "epoch": 1.4963994990607388, 
"grad_norm": 1.3425935506820679, "learning_rate": 3.840013586956522e-05, "loss": 0.3632, "step": 9559 }, { "epoch": 1.4965560425798372, "grad_norm": 2.158154010772705, "learning_rate": 3.8388247282608695e-05, "loss": 0.4381, "step": 9560 }, { "epoch": 1.4967125860989354, "grad_norm": 2.30450177192688, "learning_rate": 3.8376358695652174e-05, "loss": 0.4676, "step": 9561 }, { "epoch": 1.4968691296180339, "grad_norm": 1.5872068405151367, "learning_rate": 3.836447010869565e-05, "loss": 0.257, "step": 9562 }, { "epoch": 1.497025673137132, "grad_norm": 1.743543267250061, "learning_rate": 3.835258152173913e-05, "loss": 0.4024, "step": 9563 }, { "epoch": 1.4971822166562303, "grad_norm": 2.2705559730529785, "learning_rate": 3.83406929347826e-05, "loss": 0.5895, "step": 9564 }, { "epoch": 1.4973387601753287, "grad_norm": 2.55861234664917, "learning_rate": 3.832880434782608e-05, "loss": 0.8436, "step": 9565 }, { "epoch": 1.4974953036944272, "grad_norm": 2.1535937786102295, "learning_rate": 3.831691576086956e-05, "loss": 0.8502, "step": 9566 }, { "epoch": 1.4976518472135254, "grad_norm": 1.691846489906311, "learning_rate": 3.8305027173913036e-05, "loss": 0.623, "step": 9567 }, { "epoch": 1.4978083907326236, "grad_norm": 2.2849106788635254, "learning_rate": 3.829313858695652e-05, "loss": 0.5574, "step": 9568 }, { "epoch": 1.497964934251722, "grad_norm": 2.2120254039764404, "learning_rate": 3.828125e-05, "loss": 0.552, "step": 9569 }, { "epoch": 1.4981214777708203, "grad_norm": 3.585608959197998, "learning_rate": 3.826936141304348e-05, "loss": 0.8696, "step": 9570 }, { "epoch": 1.4982780212899187, "grad_norm": 2.362807035446167, "learning_rate": 3.8257472826086955e-05, "loss": 0.7015, "step": 9571 }, { "epoch": 1.498434564809017, "grad_norm": 4.069850921630859, "learning_rate": 3.824558423913043e-05, "loss": 0.8872, "step": 9572 }, { "epoch": 1.4985911083281152, "grad_norm": 3.4386847019195557, "learning_rate": 3.823369565217391e-05, "loss": 0.8235, "step": 9573 }, { "epoch": 
1.4987476518472136, "grad_norm": 1.9840214252471924, "learning_rate": 3.822180706521739e-05, "loss": 0.1986, "step": 9574 }, { "epoch": 1.4989041953663118, "grad_norm": 2.3982648849487305, "learning_rate": 3.820991847826086e-05, "loss": 0.8246, "step": 9575 }, { "epoch": 1.4990607388854102, "grad_norm": 2.696007013320923, "learning_rate": 3.819802989130434e-05, "loss": 0.7912, "step": 9576 }, { "epoch": 1.4992172824045085, "grad_norm": 4.017723083496094, "learning_rate": 3.818614130434782e-05, "loss": 1.3757, "step": 9577 }, { "epoch": 1.4993738259236067, "grad_norm": 4.888078212738037, "learning_rate": 3.81742527173913e-05, "loss": 1.297, "step": 9578 }, { "epoch": 1.4995303694427051, "grad_norm": 2.4601662158966064, "learning_rate": 3.816236413043478e-05, "loss": 0.8596, "step": 9579 }, { "epoch": 1.4996869129618033, "grad_norm": 1.5211392641067505, "learning_rate": 3.815047554347826e-05, "loss": 0.4249, "step": 9580 }, { "epoch": 1.4998434564809018, "grad_norm": 8.595690727233887, "learning_rate": 3.813858695652174e-05, "loss": 1.5849, "step": 9581 }, { "epoch": 1.5, "grad_norm": 5.171334266662598, "learning_rate": 3.8126698369565215e-05, "loss": 1.3117, "step": 9582 }, { "epoch": 1.5001565435190982, "grad_norm": 3.7930870056152344, "learning_rate": 3.811480978260869e-05, "loss": 0.6805, "step": 9583 }, { "epoch": 1.5003130870381967, "grad_norm": 2.569251298904419, "learning_rate": 3.810292119565217e-05, "loss": 0.5577, "step": 9584 }, { "epoch": 1.5004696305572949, "grad_norm": 3.604339838027954, "learning_rate": 3.809103260869565e-05, "loss": 0.4682, "step": 9585 }, { "epoch": 1.5006261740763933, "grad_norm": 7.586126327514648, "learning_rate": 3.8079144021739134e-05, "loss": 1.3513, "step": 9586 }, { "epoch": 1.5007827175954915, "grad_norm": 2.4136621952056885, "learning_rate": 3.80672554347826e-05, "loss": 0.7662, "step": 9587 }, { "epoch": 1.5009392611145898, "grad_norm": 3.839714765548706, "learning_rate": 3.8055366847826084e-05, "loss": 1.7728, "step": 
9588 }, { "epoch": 1.5010958046336882, "grad_norm": 1.0800292491912842, "learning_rate": 3.804347826086956e-05, "loss": 0.2299, "step": 9589 }, { "epoch": 1.5012523481527866, "grad_norm": 0.8015256524085999, "learning_rate": 3.803158967391304e-05, "loss": 0.2565, "step": 9590 }, { "epoch": 1.5014088916718848, "grad_norm": 0.8056163191795349, "learning_rate": 3.801970108695652e-05, "loss": 0.2453, "step": 9591 }, { "epoch": 1.501565435190983, "grad_norm": 0.7886549234390259, "learning_rate": 3.8007812499999996e-05, "loss": 0.1723, "step": 9592 }, { "epoch": 1.5017219787100813, "grad_norm": 0.6433013677597046, "learning_rate": 3.7995923913043475e-05, "loss": 0.2101, "step": 9593 }, { "epoch": 1.5018785222291797, "grad_norm": 0.9446423649787903, "learning_rate": 3.798403532608695e-05, "loss": 0.2265, "step": 9594 }, { "epoch": 1.5020350657482782, "grad_norm": 0.7251456379890442, "learning_rate": 3.797214673913043e-05, "loss": 0.2426, "step": 9595 }, { "epoch": 1.5021916092673764, "grad_norm": 0.5351996421813965, "learning_rate": 3.7960258152173916e-05, "loss": 0.1836, "step": 9596 }, { "epoch": 1.5023481527864746, "grad_norm": 3.7213001251220703, "learning_rate": 3.7948369565217394e-05, "loss": 0.2002, "step": 9597 }, { "epoch": 1.5025046963055728, "grad_norm": 0.66976398229599, "learning_rate": 3.7936480978260865e-05, "loss": 0.2866, "step": 9598 }, { "epoch": 1.5026612398246713, "grad_norm": 0.7666813135147095, "learning_rate": 3.7924592391304343e-05, "loss": 0.2751, "step": 9599 }, { "epoch": 1.5028177833437697, "grad_norm": 0.8009845018386841, "learning_rate": 3.791270380434782e-05, "loss": 0.2532, "step": 9600 }, { "epoch": 1.502974326862868, "grad_norm": 1.0763399600982666, "learning_rate": 3.79008152173913e-05, "loss": 0.319, "step": 9601 }, { "epoch": 1.5031308703819661, "grad_norm": 7.471010684967041, "learning_rate": 3.788892663043478e-05, "loss": 0.3952, "step": 9602 }, { "epoch": 1.5032874139010644, "grad_norm": 1.3436068296432495, "learning_rate": 
3.7877038043478256e-05, "loss": 0.4966, "step": 9603 }, { "epoch": 1.5034439574201628, "grad_norm": 1.4469660520553589, "learning_rate": 3.7865149456521734e-05, "loss": 0.3749, "step": 9604 }, { "epoch": 1.5036005009392612, "grad_norm": 0.8449103832244873, "learning_rate": 3.785326086956521e-05, "loss": 0.2655, "step": 9605 }, { "epoch": 1.5037570444583594, "grad_norm": 2.612706422805786, "learning_rate": 3.78413722826087e-05, "loss": 0.2877, "step": 9606 }, { "epoch": 1.5039135879774577, "grad_norm": 1.5374656915664673, "learning_rate": 3.7829483695652175e-05, "loss": 0.7018, "step": 9607 }, { "epoch": 1.5040701314965559, "grad_norm": 1.6321414709091187, "learning_rate": 3.7817595108695654e-05, "loss": 0.4861, "step": 9608 }, { "epoch": 1.5042266750156543, "grad_norm": 1.3744252920150757, "learning_rate": 3.780570652173913e-05, "loss": 0.3859, "step": 9609 }, { "epoch": 1.5043832185347528, "grad_norm": 1.2709693908691406, "learning_rate": 3.77938179347826e-05, "loss": 0.364, "step": 9610 }, { "epoch": 1.504539762053851, "grad_norm": 1.74161696434021, "learning_rate": 3.778192934782608e-05, "loss": 0.4943, "step": 9611 }, { "epoch": 1.5046963055729492, "grad_norm": 1.7833856344223022, "learning_rate": 3.777004076086956e-05, "loss": 0.4172, "step": 9612 }, { "epoch": 1.5048528490920476, "grad_norm": 2.1464765071868896, "learning_rate": 3.775815217391304e-05, "loss": 0.4105, "step": 9613 }, { "epoch": 1.5050093926111459, "grad_norm": 1.676975965499878, "learning_rate": 3.7746263586956516e-05, "loss": 0.5272, "step": 9614 }, { "epoch": 1.5051659361302443, "grad_norm": 2.018251657485962, "learning_rate": 3.7734374999999994e-05, "loss": 0.5139, "step": 9615 }, { "epoch": 1.5053224796493425, "grad_norm": 1.959094524383545, "learning_rate": 3.772248641304348e-05, "loss": 0.4384, "step": 9616 }, { "epoch": 1.5054790231684407, "grad_norm": 2.9238028526306152, "learning_rate": 3.771059782608696e-05, "loss": 0.4241, "step": 9617 }, { "epoch": 1.5056355666875392, "grad_norm": 
4.107649803161621, "learning_rate": 3.7698709239130435e-05, "loss": 0.6055, "step": 9618 }, { "epoch": 1.5057921102066374, "grad_norm": 1.246055006980896, "learning_rate": 3.768682065217391e-05, "loss": 0.2904, "step": 9619 }, { "epoch": 1.5059486537257358, "grad_norm": 2.7649385929107666, "learning_rate": 3.767493206521739e-05, "loss": 1.1382, "step": 9620 }, { "epoch": 1.506105197244834, "grad_norm": 5.341802597045898, "learning_rate": 3.766304347826086e-05, "loss": 0.7648, "step": 9621 }, { "epoch": 1.5062617407639323, "grad_norm": 5.480435848236084, "learning_rate": 3.765115489130434e-05, "loss": 0.7777, "step": 9622 }, { "epoch": 1.5064182842830307, "grad_norm": 2.1096911430358887, "learning_rate": 3.763926630434782e-05, "loss": 0.9864, "step": 9623 }, { "epoch": 1.5065748278021291, "grad_norm": 5.296219348907471, "learning_rate": 3.76273777173913e-05, "loss": 1.0695, "step": 9624 }, { "epoch": 1.5067313713212274, "grad_norm": 4.835213661193848, "learning_rate": 3.7615489130434776e-05, "loss": 1.0852, "step": 9625 }, { "epoch": 1.5068879148403256, "grad_norm": 3.047670364379883, "learning_rate": 3.760360054347826e-05, "loss": 1.5891, "step": 9626 }, { "epoch": 1.5070444583594238, "grad_norm": 1.9582029581069946, "learning_rate": 3.759171195652174e-05, "loss": 0.4885, "step": 9627 }, { "epoch": 1.5072010018785222, "grad_norm": 3.402165412902832, "learning_rate": 3.757982336956522e-05, "loss": 1.0974, "step": 9628 }, { "epoch": 1.5073575453976207, "grad_norm": 5.078648567199707, "learning_rate": 3.7567934782608695e-05, "loss": 1.2954, "step": 9629 }, { "epoch": 1.507514088916719, "grad_norm": 2.5373382568359375, "learning_rate": 3.755604619565217e-05, "loss": 0.5207, "step": 9630 }, { "epoch": 1.5076706324358171, "grad_norm": 6.036319255828857, "learning_rate": 3.754415760869565e-05, "loss": 1.2924, "step": 9631 }, { "epoch": 1.5078271759549153, "grad_norm": 3.8971383571624756, "learning_rate": 3.753226902173913e-05, "loss": 0.8365, "step": 9632 }, { "epoch": 
1.5079837194740138, "grad_norm": 2.017336368560791, "learning_rate": 3.75203804347826e-05, "loss": 0.4642, "step": 9633 }, { "epoch": 1.5081402629931122, "grad_norm": 2.184661626815796, "learning_rate": 3.750849184782608e-05, "loss": 0.5645, "step": 9634 }, { "epoch": 1.5082968065122104, "grad_norm": 3.4672956466674805, "learning_rate": 3.749660326086956e-05, "loss": 0.8118, "step": 9635 }, { "epoch": 1.5084533500313086, "grad_norm": 1.8242701292037964, "learning_rate": 3.748471467391304e-05, "loss": 0.6234, "step": 9636 }, { "epoch": 1.5086098935504069, "grad_norm": 2.281806230545044, "learning_rate": 3.747282608695652e-05, "loss": 0.626, "step": 9637 }, { "epoch": 1.5087664370695053, "grad_norm": 4.145202159881592, "learning_rate": 3.74609375e-05, "loss": 0.6926, "step": 9638 }, { "epoch": 1.5089229805886037, "grad_norm": 0.5851184725761414, "learning_rate": 3.7449048913043476e-05, "loss": 0.2238, "step": 9639 }, { "epoch": 1.509079524107702, "grad_norm": 0.5682710409164429, "learning_rate": 3.7437160326086955e-05, "loss": 0.2228, "step": 9640 }, { "epoch": 1.5092360676268002, "grad_norm": 0.6199625730514526, "learning_rate": 3.742527173913043e-05, "loss": 0.2486, "step": 9641 }, { "epoch": 1.5093926111458984, "grad_norm": 0.4057449996471405, "learning_rate": 3.741338315217391e-05, "loss": 0.1363, "step": 9642 }, { "epoch": 1.5095491546649968, "grad_norm": 0.5047759413719177, "learning_rate": 3.740149456521739e-05, "loss": 0.1462, "step": 9643 }, { "epoch": 1.5097056981840953, "grad_norm": 0.6574609279632568, "learning_rate": 3.738960597826086e-05, "loss": 0.1791, "step": 9644 }, { "epoch": 1.5098622417031935, "grad_norm": 0.6024161577224731, "learning_rate": 3.737771739130434e-05, "loss": 0.263, "step": 9645 }, { "epoch": 1.5100187852222917, "grad_norm": 0.7362421751022339, "learning_rate": 3.7365828804347824e-05, "loss": 0.2995, "step": 9646 }, { "epoch": 1.5101753287413902, "grad_norm": 0.77032470703125, "learning_rate": 3.73539402173913e-05, "loss": 0.3525, 
"step": 9647 }, { "epoch": 1.5103318722604884, "grad_norm": 1.26923406124115, "learning_rate": 3.734205163043478e-05, "loss": 0.2914, "step": 9648 }, { "epoch": 1.5104884157795868, "grad_norm": 0.8408750891685486, "learning_rate": 3.733016304347826e-05, "loss": 0.2856, "step": 9649 }, { "epoch": 1.510644959298685, "grad_norm": 0.5923863053321838, "learning_rate": 3.7318274456521736e-05, "loss": 0.19, "step": 9650 }, { "epoch": 1.5108015028177832, "grad_norm": 0.8828781843185425, "learning_rate": 3.7306385869565214e-05, "loss": 0.356, "step": 9651 }, { "epoch": 1.5109580463368817, "grad_norm": 1.6552892923355103, "learning_rate": 3.729449728260869e-05, "loss": 0.3873, "step": 9652 }, { "epoch": 1.5111145898559801, "grad_norm": 0.8879346251487732, "learning_rate": 3.728260869565217e-05, "loss": 0.478, "step": 9653 }, { "epoch": 1.5112711333750783, "grad_norm": 1.7517913579940796, "learning_rate": 3.727072010869565e-05, "loss": 0.3685, "step": 9654 }, { "epoch": 1.5114276768941766, "grad_norm": 1.7395554780960083, "learning_rate": 3.7258831521739134e-05, "loss": 0.4697, "step": 9655 }, { "epoch": 1.5115842204132748, "grad_norm": 0.990230917930603, "learning_rate": 3.7246942934782605e-05, "loss": 0.4503, "step": 9656 }, { "epoch": 1.5117407639323732, "grad_norm": 1.1103808879852295, "learning_rate": 3.723505434782608e-05, "loss": 0.2688, "step": 9657 }, { "epoch": 1.5118973074514717, "grad_norm": 0.7122592329978943, "learning_rate": 3.722316576086956e-05, "loss": 0.3382, "step": 9658 }, { "epoch": 1.5120538509705699, "grad_norm": 1.2376749515533447, "learning_rate": 3.721127717391304e-05, "loss": 0.4996, "step": 9659 }, { "epoch": 1.512210394489668, "grad_norm": 2.3318021297454834, "learning_rate": 3.719938858695652e-05, "loss": 0.5523, "step": 9660 }, { "epoch": 1.5123669380087663, "grad_norm": 1.4719082117080688, "learning_rate": 3.7187499999999996e-05, "loss": 0.4028, "step": 9661 }, { "epoch": 1.5125234815278648, "grad_norm": 3.07954478263855, "learning_rate": 
3.7175611413043474e-05, "loss": 0.5273, "step": 9662 }, { "epoch": 1.5126800250469632, "grad_norm": 2.213068723678589, "learning_rate": 3.716372282608695e-05, "loss": 0.7627, "step": 9663 }, { "epoch": 1.5128365685660614, "grad_norm": 4.385886192321777, "learning_rate": 3.715183423913043e-05, "loss": 0.6876, "step": 9664 }, { "epoch": 1.5129931120851596, "grad_norm": 3.1189446449279785, "learning_rate": 3.7139945652173915e-05, "loss": 0.6221, "step": 9665 }, { "epoch": 1.5131496556042578, "grad_norm": 1.627333164215088, "learning_rate": 3.7128057065217393e-05, "loss": 0.467, "step": 9666 }, { "epoch": 1.5133061991233563, "grad_norm": 1.8578894138336182, "learning_rate": 3.7116168478260865e-05, "loss": 0.7119, "step": 9667 }, { "epoch": 1.5134627426424547, "grad_norm": 2.536036968231201, "learning_rate": 3.710427989130434e-05, "loss": 0.7192, "step": 9668 }, { "epoch": 1.513619286161553, "grad_norm": 2.6007771492004395, "learning_rate": 3.709239130434782e-05, "loss": 0.7014, "step": 9669 }, { "epoch": 1.5137758296806512, "grad_norm": 1.5747687816619873, "learning_rate": 3.70805027173913e-05, "loss": 0.3636, "step": 9670 }, { "epoch": 1.5139323731997494, "grad_norm": 1.512957215309143, "learning_rate": 3.706861413043478e-05, "loss": 0.6124, "step": 9671 }, { "epoch": 1.5140889167188478, "grad_norm": 3.3310675621032715, "learning_rate": 3.7056725543478256e-05, "loss": 0.8086, "step": 9672 }, { "epoch": 1.5142454602379463, "grad_norm": 2.8908345699310303, "learning_rate": 3.7044836956521734e-05, "loss": 0.47, "step": 9673 }, { "epoch": 1.5144020037570445, "grad_norm": 3.311842679977417, "learning_rate": 3.703294836956521e-05, "loss": 1.0736, "step": 9674 }, { "epoch": 1.5145585472761427, "grad_norm": 2.54732346534729, "learning_rate": 3.70210597826087e-05, "loss": 1.3149, "step": 9675 }, { "epoch": 1.514715090795241, "grad_norm": 5.970694065093994, "learning_rate": 3.7009171195652175e-05, "loss": 0.5448, "step": 9676 }, { "epoch": 1.5148716343143394, "grad_norm": 
3.771117687225342, "learning_rate": 3.699728260869565e-05, "loss": 0.7932, "step": 9677 }, { "epoch": 1.5150281778334378, "grad_norm": 3.101468563079834, "learning_rate": 3.698539402173913e-05, "loss": 1.0055, "step": 9678 }, { "epoch": 1.515184721352536, "grad_norm": 4.264756679534912, "learning_rate": 3.69735054347826e-05, "loss": 1.0606, "step": 9679 }, { "epoch": 1.5153412648716342, "grad_norm": 2.0974700450897217, "learning_rate": 3.696161684782608e-05, "loss": 0.7117, "step": 9680 }, { "epoch": 1.5154978083907327, "grad_norm": 3.9731154441833496, "learning_rate": 3.694972826086956e-05, "loss": 1.3125, "step": 9681 }, { "epoch": 1.5156543519098309, "grad_norm": 8.304401397705078, "learning_rate": 3.693783967391304e-05, "loss": 0.8895, "step": 9682 }, { "epoch": 1.5158108954289293, "grad_norm": 3.4360756874084473, "learning_rate": 3.6925951086956515e-05, "loss": 1.7139, "step": 9683 }, { "epoch": 1.5159674389480275, "grad_norm": 2.3006060123443604, "learning_rate": 3.6914062499999993e-05, "loss": 0.9193, "step": 9684 }, { "epoch": 1.5161239824671258, "grad_norm": 2.3497166633605957, "learning_rate": 3.690217391304348e-05, "loss": 0.365, "step": 9685 }, { "epoch": 1.5162805259862242, "grad_norm": 3.6702041625976562, "learning_rate": 3.6890285326086957e-05, "loss": 0.6467, "step": 9686 }, { "epoch": 1.5164370695053226, "grad_norm": 3.2887656688690186, "learning_rate": 3.6878396739130435e-05, "loss": 0.6472, "step": 9687 }, { "epoch": 1.5165936130244209, "grad_norm": 2.960428476333618, "learning_rate": 3.686650815217391e-05, "loss": 0.8216, "step": 9688 }, { "epoch": 1.516750156543519, "grad_norm": 0.5125085711479187, "learning_rate": 3.685461956521739e-05, "loss": 0.2514, "step": 9689 }, { "epoch": 1.5169067000626173, "grad_norm": 0.516815721988678, "learning_rate": 3.684273097826086e-05, "loss": 0.1812, "step": 9690 }, { "epoch": 1.5170632435817157, "grad_norm": 0.5136469006538391, "learning_rate": 3.683084239130434e-05, "loss": 0.1885, "step": 9691 }, { 
"epoch": 1.5172197871008142, "grad_norm": 1.2480697631835938, "learning_rate": 3.681895380434782e-05, "loss": 0.34, "step": 9692 }, { "epoch": 1.5173763306199124, "grad_norm": 0.3598582148551941, "learning_rate": 3.68070652173913e-05, "loss": 0.2146, "step": 9693 }, { "epoch": 1.5175328741390106, "grad_norm": 0.5915454030036926, "learning_rate": 3.6795176630434775e-05, "loss": 0.1755, "step": 9694 }, { "epoch": 1.5176894176581088, "grad_norm": 0.684097945690155, "learning_rate": 3.678328804347826e-05, "loss": 0.2639, "step": 9695 }, { "epoch": 1.5178459611772073, "grad_norm": 0.6495858430862427, "learning_rate": 3.677139945652174e-05, "loss": 0.2802, "step": 9696 }, { "epoch": 1.5180025046963057, "grad_norm": 2.0476410388946533, "learning_rate": 3.6759510869565216e-05, "loss": 0.2914, "step": 9697 }, { "epoch": 1.518159048215404, "grad_norm": 1.7881464958190918, "learning_rate": 3.6747622282608694e-05, "loss": 0.3655, "step": 9698 }, { "epoch": 1.5183155917345021, "grad_norm": 0.9067604541778564, "learning_rate": 3.673573369565217e-05, "loss": 0.2085, "step": 9699 }, { "epoch": 1.5184721352536004, "grad_norm": 0.7482017278671265, "learning_rate": 3.672384510869565e-05, "loss": 0.2507, "step": 9700 }, { "epoch": 1.5186286787726988, "grad_norm": 1.2923895120620728, "learning_rate": 3.671195652173913e-05, "loss": 0.3729, "step": 9701 }, { "epoch": 1.5187852222917972, "grad_norm": 0.5492497086524963, "learning_rate": 3.67000679347826e-05, "loss": 0.1912, "step": 9702 }, { "epoch": 1.5189417658108955, "grad_norm": 1.084281325340271, "learning_rate": 3.668817934782608e-05, "loss": 0.2722, "step": 9703 }, { "epoch": 1.5190983093299937, "grad_norm": 1.0239067077636719, "learning_rate": 3.6676290760869557e-05, "loss": 0.2575, "step": 9704 }, { "epoch": 1.519254852849092, "grad_norm": 0.8405023813247681, "learning_rate": 3.666440217391304e-05, "loss": 0.238, "step": 9705 }, { "epoch": 1.5194113963681903, "grad_norm": 1.4576950073242188, "learning_rate": 
3.665251358695652e-05, "loss": 0.2499, "step": 9706 }, { "epoch": 1.5195679398872888, "grad_norm": 1.4685540199279785, "learning_rate": 3.6640625e-05, "loss": 0.4703, "step": 9707 }, { "epoch": 1.519724483406387, "grad_norm": 3.0841054916381836, "learning_rate": 3.6628736413043476e-05, "loss": 0.6409, "step": 9708 }, { "epoch": 1.5198810269254852, "grad_norm": 1.756545901298523, "learning_rate": 3.6616847826086954e-05, "loss": 0.4202, "step": 9709 }, { "epoch": 1.5200375704445834, "grad_norm": 1.5638608932495117, "learning_rate": 3.660495923913043e-05, "loss": 0.451, "step": 9710 }, { "epoch": 1.5201941139636819, "grad_norm": 2.371488571166992, "learning_rate": 3.659307065217391e-05, "loss": 0.6759, "step": 9711 }, { "epoch": 1.5203506574827803, "grad_norm": 1.9212990999221802, "learning_rate": 3.658118206521739e-05, "loss": 0.4466, "step": 9712 }, { "epoch": 1.5205072010018785, "grad_norm": 2.4628732204437256, "learning_rate": 3.656929347826086e-05, "loss": 0.5756, "step": 9713 }, { "epoch": 1.5206637445209767, "grad_norm": 7.966614246368408, "learning_rate": 3.6557404891304345e-05, "loss": 0.6273, "step": 9714 }, { "epoch": 1.5208202880400752, "grad_norm": 2.1205310821533203, "learning_rate": 3.654551630434782e-05, "loss": 0.6979, "step": 9715 }, { "epoch": 1.5209768315591734, "grad_norm": 2.4457716941833496, "learning_rate": 3.65336277173913e-05, "loss": 0.5042, "step": 9716 }, { "epoch": 1.5211333750782718, "grad_norm": 1.785168170928955, "learning_rate": 3.652173913043478e-05, "loss": 0.4184, "step": 9717 }, { "epoch": 1.52128991859737, "grad_norm": 2.2963221073150635, "learning_rate": 3.650985054347826e-05, "loss": 0.8453, "step": 9718 }, { "epoch": 1.5214464621164683, "grad_norm": 2.698050022125244, "learning_rate": 3.6497961956521736e-05, "loss": 0.7722, "step": 9719 }, { "epoch": 1.5216030056355667, "grad_norm": 7.9776458740234375, "learning_rate": 3.6486073369565214e-05, "loss": 1.0734, "step": 9720 }, { "epoch": 1.5217595491546652, "grad_norm": 
31.1489200592041, "learning_rate": 3.647418478260869e-05, "loss": 1.1078, "step": 9721 }, { "epoch": 1.5219160926737634, "grad_norm": 2.535017728805542, "learning_rate": 3.646229619565217e-05, "loss": 0.6808, "step": 9722 }, { "epoch": 1.5220726361928616, "grad_norm": 4.746556282043457, "learning_rate": 3.6450407608695655e-05, "loss": 0.6797, "step": 9723 }, { "epoch": 1.5222291797119598, "grad_norm": 2.270190954208374, "learning_rate": 3.643851902173913e-05, "loss": 0.8935, "step": 9724 }, { "epoch": 1.5223857232310583, "grad_norm": 1.8375684022903442, "learning_rate": 3.6426630434782605e-05, "loss": 0.8133, "step": 9725 }, { "epoch": 1.5225422667501567, "grad_norm": 3.669950485229492, "learning_rate": 3.641474184782608e-05, "loss": 0.803, "step": 9726 }, { "epoch": 1.522698810269255, "grad_norm": 2.39097261428833, "learning_rate": 3.640285326086956e-05, "loss": 0.8857, "step": 9727 }, { "epoch": 1.5228553537883531, "grad_norm": 3.3866050243377686, "learning_rate": 3.639096467391304e-05, "loss": 1.4113, "step": 9728 }, { "epoch": 1.5230118973074513, "grad_norm": 2.279557943344116, "learning_rate": 3.637907608695652e-05, "loss": 0.9209, "step": 9729 }, { "epoch": 1.5231684408265498, "grad_norm": 5.289220333099365, "learning_rate": 3.6367187499999995e-05, "loss": 1.5362, "step": 9730 }, { "epoch": 1.5233249843456482, "grad_norm": 2.573629140853882, "learning_rate": 3.6355298913043474e-05, "loss": 0.9438, "step": 9731 }, { "epoch": 1.5234815278647464, "grad_norm": 1.899846076965332, "learning_rate": 3.634341032608695e-05, "loss": 0.9086, "step": 9732 }, { "epoch": 1.5236380713838447, "grad_norm": 2.3027873039245605, "learning_rate": 3.633152173913044e-05, "loss": 0.6289, "step": 9733 }, { "epoch": 1.5237946149029429, "grad_norm": 3.221082925796509, "learning_rate": 3.6319633152173915e-05, "loss": 0.7776, "step": 9734 }, { "epoch": 1.5239511584220413, "grad_norm": 8.75937557220459, "learning_rate": 3.630774456521739e-05, "loss": 0.6571, "step": 9735 }, { "epoch": 
1.5241077019411398, "grad_norm": 3.8455164432525635, "learning_rate": 3.6295855978260864e-05, "loss": 0.6878, "step": 9736 }, { "epoch": 1.524264245460238, "grad_norm": 3.7846970558166504, "learning_rate": 3.628396739130434e-05, "loss": 0.3923, "step": 9737 }, { "epoch": 1.5244207889793362, "grad_norm": 4.241483688354492, "learning_rate": 3.627207880434782e-05, "loss": 0.8359, "step": 9738 }, { "epoch": 1.5245773324984344, "grad_norm": 0.36151307821273804, "learning_rate": 3.62601902173913e-05, "loss": 0.1809, "step": 9739 }, { "epoch": 1.5247338760175329, "grad_norm": 0.8069365620613098, "learning_rate": 3.624830163043478e-05, "loss": 0.2083, "step": 9740 }, { "epoch": 1.5248904195366313, "grad_norm": 0.6129469871520996, "learning_rate": 3.6236413043478255e-05, "loss": 0.1965, "step": 9741 }, { "epoch": 1.5250469630557295, "grad_norm": 0.6179580688476562, "learning_rate": 3.622452445652173e-05, "loss": 0.2097, "step": 9742 }, { "epoch": 1.5252035065748277, "grad_norm": 0.7149573564529419, "learning_rate": 3.621263586956522e-05, "loss": 0.238, "step": 9743 }, { "epoch": 1.525360050093926, "grad_norm": 0.9431974291801453, "learning_rate": 3.6200747282608696e-05, "loss": 0.2911, "step": 9744 }, { "epoch": 1.5255165936130244, "grad_norm": 0.7462194561958313, "learning_rate": 3.6188858695652175e-05, "loss": 0.2759, "step": 9745 }, { "epoch": 1.5256731371321228, "grad_norm": 3.14021372795105, "learning_rate": 3.617697010869565e-05, "loss": 0.8118, "step": 9746 }, { "epoch": 1.525829680651221, "grad_norm": 0.9804496169090271, "learning_rate": 3.616508152173913e-05, "loss": 0.2599, "step": 9747 }, { "epoch": 1.5259862241703193, "grad_norm": 0.46280422806739807, "learning_rate": 3.61531929347826e-05, "loss": 0.0988, "step": 9748 }, { "epoch": 1.5261427676894177, "grad_norm": 0.8051014542579651, "learning_rate": 3.614130434782608e-05, "loss": 0.2804, "step": 9749 }, { "epoch": 1.526299311208516, "grad_norm": 1.6539332866668701, "learning_rate": 3.612941576086956e-05, 
"loss": 0.4378, "step": 9750 }, { "epoch": 1.5264558547276144, "grad_norm": 1.9196847677230835, "learning_rate": 3.611752717391304e-05, "loss": 0.5059, "step": 9751 }, { "epoch": 1.5266123982467126, "grad_norm": 1.2617123126983643, "learning_rate": 3.6105638586956515e-05, "loss": 0.3029, "step": 9752 }, { "epoch": 1.5267689417658108, "grad_norm": 1.3812172412872314, "learning_rate": 3.609375e-05, "loss": 0.397, "step": 9753 }, { "epoch": 1.5269254852849092, "grad_norm": 1.3728859424591064, "learning_rate": 3.608186141304348e-05, "loss": 0.5803, "step": 9754 }, { "epoch": 1.5270820288040077, "grad_norm": 2.846985340118408, "learning_rate": 3.6069972826086956e-05, "loss": 0.4277, "step": 9755 }, { "epoch": 1.527238572323106, "grad_norm": 0.8530025482177734, "learning_rate": 3.6058084239130434e-05, "loss": 0.4232, "step": 9756 }, { "epoch": 1.527395115842204, "grad_norm": 1.7794175148010254, "learning_rate": 3.604619565217391e-05, "loss": 0.5403, "step": 9757 }, { "epoch": 1.5275516593613023, "grad_norm": 1.4248203039169312, "learning_rate": 3.603430706521739e-05, "loss": 0.3346, "step": 9758 }, { "epoch": 1.5277082028804008, "grad_norm": 1.5314399003982544, "learning_rate": 3.602241847826086e-05, "loss": 0.5145, "step": 9759 }, { "epoch": 1.5278647463994992, "grad_norm": 2.8155620098114014, "learning_rate": 3.601052989130434e-05, "loss": 0.6872, "step": 9760 }, { "epoch": 1.5280212899185974, "grad_norm": 1.01211678981781, "learning_rate": 3.599864130434782e-05, "loss": 0.2778, "step": 9761 }, { "epoch": 1.5281778334376956, "grad_norm": 2.078742027282715, "learning_rate": 3.5986752717391296e-05, "loss": 0.886, "step": 9762 }, { "epoch": 1.5283343769567939, "grad_norm": 2.729083776473999, "learning_rate": 3.597486413043478e-05, "loss": 0.7813, "step": 9763 }, { "epoch": 1.5284909204758923, "grad_norm": 1.865966558456421, "learning_rate": 3.596297554347826e-05, "loss": 0.548, "step": 9764 }, { "epoch": 1.5286474639949907, "grad_norm": 1.7854983806610107, 
"learning_rate": 3.595108695652174e-05, "loss": 0.5608, "step": 9765 }, { "epoch": 1.528804007514089, "grad_norm": 3.240400552749634, "learning_rate": 3.5939198369565216e-05, "loss": 0.9109, "step": 9766 }, { "epoch": 1.5289605510331872, "grad_norm": 4.7093329429626465, "learning_rate": 3.5927309782608694e-05, "loss": 0.998, "step": 9767 }, { "epoch": 1.5291170945522854, "grad_norm": 1.7607187032699585, "learning_rate": 3.591542119565217e-05, "loss": 0.3385, "step": 9768 }, { "epoch": 1.5292736380713838, "grad_norm": 2.517700672149658, "learning_rate": 3.590353260869565e-05, "loss": 0.5355, "step": 9769 }, { "epoch": 1.5294301815904823, "grad_norm": 2.1344497203826904, "learning_rate": 3.589164402173913e-05, "loss": 0.7069, "step": 9770 }, { "epoch": 1.5295867251095805, "grad_norm": 1.7459958791732788, "learning_rate": 3.58797554347826e-05, "loss": 0.5552, "step": 9771 }, { "epoch": 1.5297432686286787, "grad_norm": 2.2956607341766357, "learning_rate": 3.586786684782608e-05, "loss": 0.6727, "step": 9772 }, { "epoch": 1.529899812147777, "grad_norm": 2.3729469776153564, "learning_rate": 3.585597826086956e-05, "loss": 0.8102, "step": 9773 }, { "epoch": 1.5300563556668754, "grad_norm": 2.094662666320801, "learning_rate": 3.584408967391304e-05, "loss": 0.432, "step": 9774 }, { "epoch": 1.5302128991859738, "grad_norm": 3.8491249084472656, "learning_rate": 3.583220108695652e-05, "loss": 1.0726, "step": 9775 }, { "epoch": 1.530369442705072, "grad_norm": 2.9706568717956543, "learning_rate": 3.58203125e-05, "loss": 0.8306, "step": 9776 }, { "epoch": 1.5305259862241702, "grad_norm": 1.6858385801315308, "learning_rate": 3.5808423913043476e-05, "loss": 0.8121, "step": 9777 }, { "epoch": 1.5306825297432687, "grad_norm": 3.8205182552337646, "learning_rate": 3.5796535326086954e-05, "loss": 1.3632, "step": 9778 }, { "epoch": 1.530839073262367, "grad_norm": 3.191636085510254, "learning_rate": 3.578464673913043e-05, "loss": 0.7565, "step": 9779 }, { "epoch": 1.5309956167814653, 
"grad_norm": 4.3562164306640625, "learning_rate": 3.577275815217391e-05, "loss": 1.1258, "step": 9780 }, { "epoch": 1.5311521603005636, "grad_norm": 6.006584167480469, "learning_rate": 3.576086956521739e-05, "loss": 1.0692, "step": 9781 }, { "epoch": 1.5313087038196618, "grad_norm": 3.755859375, "learning_rate": 3.574898097826086e-05, "loss": 1.6727, "step": 9782 }, { "epoch": 1.5314652473387602, "grad_norm": 2.1683104038238525, "learning_rate": 3.5737092391304344e-05, "loss": 0.5855, "step": 9783 }, { "epoch": 1.5316217908578584, "grad_norm": 2.317070245742798, "learning_rate": 3.572520380434782e-05, "loss": 0.3211, "step": 9784 }, { "epoch": 1.5317783343769569, "grad_norm": 1.7937824726104736, "learning_rate": 3.57133152173913e-05, "loss": 0.8581, "step": 9785 }, { "epoch": 1.531934877896055, "grad_norm": 2.6610658168792725, "learning_rate": 3.570142663043478e-05, "loss": 0.5606, "step": 9786 }, { "epoch": 1.5320914214151533, "grad_norm": 3.3975601196289062, "learning_rate": 3.568953804347826e-05, "loss": 0.6741, "step": 9787 }, { "epoch": 1.5322479649342517, "grad_norm": 3.000880002975464, "learning_rate": 3.5677649456521735e-05, "loss": 0.4203, "step": 9788 }, { "epoch": 1.5324045084533502, "grad_norm": 0.4552304148674011, "learning_rate": 3.5665760869565213e-05, "loss": 0.2453, "step": 9789 }, { "epoch": 1.5325610519724484, "grad_norm": 0.6576285362243652, "learning_rate": 3.565387228260869e-05, "loss": 0.1701, "step": 9790 }, { "epoch": 1.5327175954915466, "grad_norm": 0.9888237714767456, "learning_rate": 3.5641983695652177e-05, "loss": 0.2869, "step": 9791 }, { "epoch": 1.5328741390106448, "grad_norm": 0.7979891300201416, "learning_rate": 3.5630095108695655e-05, "loss": 0.2385, "step": 9792 }, { "epoch": 1.5330306825297433, "grad_norm": 0.674461305141449, "learning_rate": 3.561820652173913e-05, "loss": 0.1709, "step": 9793 }, { "epoch": 1.5331872260488417, "grad_norm": 0.8065366744995117, "learning_rate": 3.5606317934782604e-05, "loss": 0.2898, "step": 9794 
}, { "epoch": 1.53334376956794, "grad_norm": 0.6483885049819946, "learning_rate": 3.559442934782608e-05, "loss": 0.1837, "step": 9795 }, { "epoch": 1.5335003130870382, "grad_norm": 1.1818794012069702, "learning_rate": 3.558254076086956e-05, "loss": 0.2203, "step": 9796 }, { "epoch": 1.5336568566061364, "grad_norm": 0.7622712850570679, "learning_rate": 3.557065217391304e-05, "loss": 0.2293, "step": 9797 }, { "epoch": 1.5338134001252348, "grad_norm": 0.8274227380752563, "learning_rate": 3.555876358695652e-05, "loss": 0.2649, "step": 9798 }, { "epoch": 1.5339699436443333, "grad_norm": 1.2274298667907715, "learning_rate": 3.5546874999999995e-05, "loss": 0.4068, "step": 9799 }, { "epoch": 1.5341264871634315, "grad_norm": 4.1536712646484375, "learning_rate": 3.553498641304347e-05, "loss": 0.2221, "step": 9800 }, { "epoch": 1.5342830306825297, "grad_norm": 0.8985209465026855, "learning_rate": 3.552309782608696e-05, "loss": 0.52, "step": 9801 }, { "epoch": 1.534439574201628, "grad_norm": 1.4208931922912598, "learning_rate": 3.5511209239130436e-05, "loss": 0.2446, "step": 9802 }, { "epoch": 1.5345961177207263, "grad_norm": 0.8107170462608337, "learning_rate": 3.5499320652173914e-05, "loss": 0.1426, "step": 9803 }, { "epoch": 1.5347526612398248, "grad_norm": 1.3151836395263672, "learning_rate": 3.548743206521739e-05, "loss": 0.2127, "step": 9804 }, { "epoch": 1.534909204758923, "grad_norm": 1.493598461151123, "learning_rate": 3.5475543478260864e-05, "loss": 0.4296, "step": 9805 }, { "epoch": 1.5350657482780212, "grad_norm": 1.1925967931747437, "learning_rate": 3.546365489130434e-05, "loss": 0.1632, "step": 9806 }, { "epoch": 1.5352222917971194, "grad_norm": 1.6230759620666504, "learning_rate": 3.545176630434782e-05, "loss": 0.3962, "step": 9807 }, { "epoch": 1.5353788353162179, "grad_norm": 1.1954597234725952, "learning_rate": 3.54398777173913e-05, "loss": 0.5192, "step": 9808 }, { "epoch": 1.5355353788353163, "grad_norm": 2.0931947231292725, "learning_rate": 
3.5427989130434777e-05, "loss": 0.4433, "step": 9809 }, { "epoch": 1.5356919223544145, "grad_norm": 1.5651090145111084, "learning_rate": 3.5416100543478255e-05, "loss": 0.396, "step": 9810 }, { "epoch": 1.5358484658735128, "grad_norm": 3.2600390911102295, "learning_rate": 3.540421195652174e-05, "loss": 0.6973, "step": 9811 }, { "epoch": 1.5360050093926112, "grad_norm": 2.9735090732574463, "learning_rate": 3.539232336956522e-05, "loss": 0.5012, "step": 9812 }, { "epoch": 1.5361615529117094, "grad_norm": 1.8330577611923218, "learning_rate": 3.5380434782608696e-05, "loss": 0.4984, "step": 9813 }, { "epoch": 1.5363180964308079, "grad_norm": 5.310966491699219, "learning_rate": 3.5368546195652174e-05, "loss": 0.6877, "step": 9814 }, { "epoch": 1.536474639949906, "grad_norm": 9.760058403015137, "learning_rate": 3.535665760869565e-05, "loss": 0.6927, "step": 9815 }, { "epoch": 1.5366311834690043, "grad_norm": 2.085836887359619, "learning_rate": 3.534476902173913e-05, "loss": 0.5437, "step": 9816 }, { "epoch": 1.5367877269881027, "grad_norm": 2.329134464263916, "learning_rate": 3.53328804347826e-05, "loss": 0.448, "step": 9817 }, { "epoch": 1.536944270507201, "grad_norm": 1.2729038000106812, "learning_rate": 3.532099184782608e-05, "loss": 0.3698, "step": 9818 }, { "epoch": 1.5371008140262994, "grad_norm": 2.1646103858947754, "learning_rate": 3.530910326086956e-05, "loss": 0.8662, "step": 9819 }, { "epoch": 1.5372573575453976, "grad_norm": 4.652977466583252, "learning_rate": 3.5297214673913036e-05, "loss": 0.5666, "step": 9820 }, { "epoch": 1.5374139010644958, "grad_norm": 4.731037139892578, "learning_rate": 3.528532608695652e-05, "loss": 1.1456, "step": 9821 }, { "epoch": 1.5375704445835943, "grad_norm": 2.510826587677002, "learning_rate": 3.52734375e-05, "loss": 1.0624, "step": 9822 }, { "epoch": 1.5377269881026927, "grad_norm": 3.537822961807251, "learning_rate": 3.526154891304348e-05, "loss": 0.7052, "step": 9823 }, { "epoch": 1.537883531621791, "grad_norm": 
3.3467464447021484, "learning_rate": 3.5249660326086956e-05, "loss": 0.4999, "step": 9824 }, { "epoch": 1.5380400751408891, "grad_norm": 3.021695137023926, "learning_rate": 3.5237771739130434e-05, "loss": 1.0257, "step": 9825 }, { "epoch": 1.5381966186599874, "grad_norm": 2.443289041519165, "learning_rate": 3.522588315217391e-05, "loss": 1.0619, "step": 9826 }, { "epoch": 1.5383531621790858, "grad_norm": 2.7938647270202637, "learning_rate": 3.521399456521739e-05, "loss": 1.5459, "step": 9827 }, { "epoch": 1.5385097056981842, "grad_norm": 2.6568589210510254, "learning_rate": 3.520210597826086e-05, "loss": 0.4913, "step": 9828 }, { "epoch": 1.5386662492172825, "grad_norm": 2.8180794715881348, "learning_rate": 3.519021739130434e-05, "loss": 0.6871, "step": 9829 }, { "epoch": 1.5388227927363807, "grad_norm": 5.822264194488525, "learning_rate": 3.517832880434782e-05, "loss": 1.1113, "step": 9830 }, { "epoch": 1.538979336255479, "grad_norm": 4.711894512176514, "learning_rate": 3.51664402173913e-05, "loss": 1.1694, "step": 9831 }, { "epoch": 1.5391358797745773, "grad_norm": 4.809213161468506, "learning_rate": 3.515455163043478e-05, "loss": 0.7725, "step": 9832 }, { "epoch": 1.5392924232936758, "grad_norm": 3.179445743560791, "learning_rate": 3.514266304347826e-05, "loss": 1.0952, "step": 9833 }, { "epoch": 1.539448966812774, "grad_norm": 1.6010109186172485, "learning_rate": 3.513077445652174e-05, "loss": 0.1666, "step": 9834 }, { "epoch": 1.5396055103318722, "grad_norm": 2.798609733581543, "learning_rate": 3.5118885869565215e-05, "loss": 0.9547, "step": 9835 }, { "epoch": 1.5397620538509704, "grad_norm": 3.5157408714294434, "learning_rate": 3.5106997282608694e-05, "loss": 1.1322, "step": 9836 }, { "epoch": 1.5399185973700689, "grad_norm": 3.693622589111328, "learning_rate": 3.509510869565217e-05, "loss": 1.2289, "step": 9837 }, { "epoch": 1.5400751408891673, "grad_norm": 2.9294140338897705, "learning_rate": 3.508322010869565e-05, "loss": 0.5943, "step": 9838 }, { "epoch": 
1.5402316844082655, "grad_norm": 0.5069560408592224, "learning_rate": 3.507133152173913e-05, "loss": 0.1776, "step": 9839 }, { "epoch": 1.5403882279273637, "grad_norm": 0.5028475522994995, "learning_rate": 3.50594429347826e-05, "loss": 0.1591, "step": 9840 }, { "epoch": 1.540544771446462, "grad_norm": 0.5707376599311829, "learning_rate": 3.5047554347826084e-05, "loss": 0.1254, "step": 9841 }, { "epoch": 1.5407013149655604, "grad_norm": 0.9953250885009766, "learning_rate": 3.503566576086956e-05, "loss": 0.2192, "step": 9842 }, { "epoch": 1.5408578584846588, "grad_norm": 1.012017011642456, "learning_rate": 3.502377717391304e-05, "loss": 0.2066, "step": 9843 }, { "epoch": 1.541014402003757, "grad_norm": 1.46383798122406, "learning_rate": 3.501188858695652e-05, "loss": 0.2524, "step": 9844 }, { "epoch": 1.5411709455228553, "grad_norm": 1.0094008445739746, "learning_rate": 3.5e-05, "loss": 0.2879, "step": 9845 }, { "epoch": 1.5413274890419537, "grad_norm": 1.2457127571105957, "learning_rate": 3.4988111413043475e-05, "loss": 0.2775, "step": 9846 }, { "epoch": 1.541484032561052, "grad_norm": 3.395383358001709, "learning_rate": 3.497622282608695e-05, "loss": 0.3003, "step": 9847 }, { "epoch": 1.5416405760801504, "grad_norm": 0.42943891882896423, "learning_rate": 3.496433423913043e-05, "loss": 0.1426, "step": 9848 }, { "epoch": 1.5417971195992486, "grad_norm": 0.8504369854927063, "learning_rate": 3.495244565217391e-05, "loss": 0.2202, "step": 9849 }, { "epoch": 1.5419536631183468, "grad_norm": 0.8031790852546692, "learning_rate": 3.494055706521739e-05, "loss": 0.3131, "step": 9850 }, { "epoch": 1.5421102066374452, "grad_norm": 0.9611242413520813, "learning_rate": 3.4928668478260866e-05, "loss": 0.3391, "step": 9851 }, { "epoch": 1.5422667501565435, "grad_norm": 4.143782615661621, "learning_rate": 3.4916779891304344e-05, "loss": 0.3131, "step": 9852 }, { "epoch": 1.542423293675642, "grad_norm": 1.492146611213684, "learning_rate": 3.490489130434782e-05, "loss": 0.4508, 
"step": 9853 }, { "epoch": 1.5425798371947401, "grad_norm": 1.203545093536377, "learning_rate": 3.48930027173913e-05, "loss": 0.2076, "step": 9854 }, { "epoch": 1.5427363807138383, "grad_norm": 1.1700773239135742, "learning_rate": 3.488111413043478e-05, "loss": 0.3299, "step": 9855 }, { "epoch": 1.5428929242329368, "grad_norm": 8.790118217468262, "learning_rate": 3.486922554347826e-05, "loss": 0.7653, "step": 9856 }, { "epoch": 1.5430494677520352, "grad_norm": 1.131675124168396, "learning_rate": 3.4857336956521735e-05, "loss": 0.4812, "step": 9857 }, { "epoch": 1.5432060112711334, "grad_norm": 1.3232656717300415, "learning_rate": 3.484544836956521e-05, "loss": 0.4142, "step": 9858 }, { "epoch": 1.5433625547902317, "grad_norm": 2.20096492767334, "learning_rate": 3.483355978260869e-05, "loss": 0.468, "step": 9859 }, { "epoch": 1.5435190983093299, "grad_norm": 3.7436187267303467, "learning_rate": 3.482167119565217e-05, "loss": 0.4616, "step": 9860 }, { "epoch": 1.5436756418284283, "grad_norm": 1.0341593027114868, "learning_rate": 3.480978260869565e-05, "loss": 0.5058, "step": 9861 }, { "epoch": 1.5438321853475268, "grad_norm": 1.2792843580245972, "learning_rate": 3.4797894021739126e-05, "loss": 0.3288, "step": 9862 }, { "epoch": 1.543988728866625, "grad_norm": 1.7394466400146484, "learning_rate": 3.4786005434782604e-05, "loss": 0.6055, "step": 9863 }, { "epoch": 1.5441452723857232, "grad_norm": 1.2310612201690674, "learning_rate": 3.477411684782608e-05, "loss": 0.5236, "step": 9864 }, { "epoch": 1.5443018159048214, "grad_norm": 6.030813694000244, "learning_rate": 3.476222826086957e-05, "loss": 0.7003, "step": 9865 }, { "epoch": 1.5444583594239198, "grad_norm": 2.040247917175293, "learning_rate": 3.475033967391304e-05, "loss": 0.464, "step": 9866 }, { "epoch": 1.5446149029430183, "grad_norm": 2.0109846591949463, "learning_rate": 3.4738451086956516e-05, "loss": 0.2979, "step": 9867 }, { "epoch": 1.5447714464621165, "grad_norm": 2.395266056060791, "learning_rate": 
3.4726562499999995e-05, "loss": 0.5858, "step": 9868 }, { "epoch": 1.5449279899812147, "grad_norm": 2.28810453414917, "learning_rate": 3.471467391304347e-05, "loss": 0.7035, "step": 9869 }, { "epoch": 1.545084533500313, "grad_norm": 2.2035818099975586, "learning_rate": 3.470278532608696e-05, "loss": 0.652, "step": 9870 }, { "epoch": 1.5452410770194114, "grad_norm": 2.352590560913086, "learning_rate": 3.469089673913043e-05, "loss": 0.5355, "step": 9871 }, { "epoch": 1.5453976205385098, "grad_norm": 3.603425979614258, "learning_rate": 3.467900815217391e-05, "loss": 0.5639, "step": 9872 }, { "epoch": 1.545554164057608, "grad_norm": 3.028432846069336, "learning_rate": 3.4667119565217385e-05, "loss": 0.3829, "step": 9873 }, { "epoch": 1.5457107075767063, "grad_norm": 3.7414801120758057, "learning_rate": 3.4655230978260863e-05, "loss": 0.5876, "step": 9874 }, { "epoch": 1.5458672510958045, "grad_norm": 2.232328414916992, "learning_rate": 3.464334239130435e-05, "loss": 0.9676, "step": 9875 }, { "epoch": 1.546023794614903, "grad_norm": 2.793696403503418, "learning_rate": 3.4631453804347827e-05, "loss": 1.0672, "step": 9876 }, { "epoch": 1.5461803381340014, "grad_norm": 3.011934757232666, "learning_rate": 3.46195652173913e-05, "loss": 0.9692, "step": 9877 }, { "epoch": 1.5463368816530996, "grad_norm": 3.0453221797943115, "learning_rate": 3.4607676630434776e-05, "loss": 0.8062, "step": 9878 }, { "epoch": 1.5464934251721978, "grad_norm": 4.840117931365967, "learning_rate": 3.4595788043478254e-05, "loss": 0.6902, "step": 9879 }, { "epoch": 1.5466499686912962, "grad_norm": 3.909337043762207, "learning_rate": 3.458389945652174e-05, "loss": 1.1017, "step": 9880 }, { "epoch": 1.5468065122103944, "grad_norm": 3.6161868572235107, "learning_rate": 3.457201086956522e-05, "loss": 0.6883, "step": 9881 }, { "epoch": 1.5469630557294929, "grad_norm": 5.475499153137207, "learning_rate": 3.4560122282608695e-05, "loss": 1.1946, "step": 9882 }, { "epoch": 1.547119599248591, "grad_norm": 
3.0572597980499268, "learning_rate": 3.454823369565217e-05, "loss": 1.2876, "step": 9883 }, { "epoch": 1.5472761427676893, "grad_norm": 2.535299777984619, "learning_rate": 3.4536345108695645e-05, "loss": 0.4621, "step": 9884 }, { "epoch": 1.5474326862867878, "grad_norm": 5.926052093505859, "learning_rate": 3.452445652173913e-05, "loss": 0.6144, "step": 9885 }, { "epoch": 1.5475892298058862, "grad_norm": 1.0712608098983765, "learning_rate": 3.451256793478261e-05, "loss": 0.3292, "step": 9886 }, { "epoch": 1.5477457733249844, "grad_norm": 1.9989995956420898, "learning_rate": 3.4500679347826086e-05, "loss": 0.8517, "step": 9887 }, { "epoch": 1.5479023168440826, "grad_norm": 2.8369462490081787, "learning_rate": 3.4488790760869564e-05, "loss": 0.7879, "step": 9888 }, { "epoch": 1.5480588603631809, "grad_norm": 0.7114554643630981, "learning_rate": 3.4476902173913036e-05, "loss": 0.2317, "step": 9889 }, { "epoch": 1.5482154038822793, "grad_norm": 1.5249648094177246, "learning_rate": 3.446501358695652e-05, "loss": 0.2332, "step": 9890 }, { "epoch": 1.5483719474013777, "grad_norm": 0.6559094786643982, "learning_rate": 3.4453125e-05, "loss": 0.1947, "step": 9891 }, { "epoch": 1.548528490920476, "grad_norm": 1.9698351621627808, "learning_rate": 3.444123641304348e-05, "loss": 0.2286, "step": 9892 }, { "epoch": 1.5486850344395742, "grad_norm": 0.9222644567489624, "learning_rate": 3.4429347826086955e-05, "loss": 0.2377, "step": 9893 }, { "epoch": 1.5488415779586724, "grad_norm": 6.10420036315918, "learning_rate": 3.4417459239130427e-05, "loss": 0.2634, "step": 9894 }, { "epoch": 1.5489981214777708, "grad_norm": 0.44023653864860535, "learning_rate": 3.440557065217391e-05, "loss": 0.1404, "step": 9895 }, { "epoch": 1.5491546649968693, "grad_norm": 1.3548821210861206, "learning_rate": 3.439368206521739e-05, "loss": 0.4374, "step": 9896 }, { "epoch": 1.5493112085159675, "grad_norm": 0.8498101234436035, "learning_rate": 3.438179347826087e-05, "loss": 0.2586, "step": 9897 }, { 
"epoch": 1.5494677520350657, "grad_norm": 0.7334262132644653, "learning_rate": 3.4369904891304346e-05, "loss": 0.3199, "step": 9898 }, { "epoch": 1.549624295554164, "grad_norm": 0.9061806201934814, "learning_rate": 3.4358016304347824e-05, "loss": 0.1377, "step": 9899 }, { "epoch": 1.5497808390732624, "grad_norm": 0.87895268201828, "learning_rate": 3.43461277173913e-05, "loss": 0.3024, "step": 9900 }, { "epoch": 1.5499373825923608, "grad_norm": 1.0152597427368164, "learning_rate": 3.433423913043478e-05, "loss": 0.3999, "step": 9901 }, { "epoch": 1.550093926111459, "grad_norm": 1.2754113674163818, "learning_rate": 3.432235054347826e-05, "loss": 0.5012, "step": 9902 }, { "epoch": 1.5502504696305572, "grad_norm": 1.415547251701355, "learning_rate": 3.431046195652174e-05, "loss": 0.4332, "step": 9903 }, { "epoch": 1.5504070131496555, "grad_norm": 1.840811848640442, "learning_rate": 3.4298573369565215e-05, "loss": 0.3458, "step": 9904 }, { "epoch": 1.550563556668754, "grad_norm": 1.3106211423873901, "learning_rate": 3.428668478260869e-05, "loss": 0.335, "step": 9905 }, { "epoch": 1.5507201001878523, "grad_norm": 2.055691719055176, "learning_rate": 3.427479619565217e-05, "loss": 0.2944, "step": 9906 }, { "epoch": 1.5508766437069506, "grad_norm": 2.217951774597168, "learning_rate": 3.426290760869565e-05, "loss": 0.5491, "step": 9907 }, { "epoch": 1.5510331872260488, "grad_norm": 2.7528653144836426, "learning_rate": 3.425101902173913e-05, "loss": 0.4734, "step": 9908 }, { "epoch": 1.551189730745147, "grad_norm": 1.4471588134765625, "learning_rate": 3.4239130434782606e-05, "loss": 0.4013, "step": 9909 }, { "epoch": 1.5513462742642454, "grad_norm": 1.5813329219818115, "learning_rate": 3.4227241847826084e-05, "loss": 0.4764, "step": 9910 }, { "epoch": 1.5515028177833439, "grad_norm": 5.461438179016113, "learning_rate": 3.421535326086956e-05, "loss": 0.544, "step": 9911 }, { "epoch": 1.551659361302442, "grad_norm": 2.145094156265259, "learning_rate": 3.420346467391304e-05, 
"loss": 0.8614, "step": 9912 }, { "epoch": 1.5518159048215403, "grad_norm": 1.9332960844039917, "learning_rate": 3.419157608695652e-05, "loss": 0.6232, "step": 9913 }, { "epoch": 1.5519724483406387, "grad_norm": 2.771103620529175, "learning_rate": 3.4179687499999996e-05, "loss": 0.7841, "step": 9914 }, { "epoch": 1.552128991859737, "grad_norm": 1.4706820249557495, "learning_rate": 3.4167798913043475e-05, "loss": 0.2952, "step": 9915 }, { "epoch": 1.5522855353788354, "grad_norm": 2.9112188816070557, "learning_rate": 3.415591032608695e-05, "loss": 0.4658, "step": 9916 }, { "epoch": 1.5524420788979336, "grad_norm": 1.0996911525726318, "learning_rate": 3.414402173913043e-05, "loss": 0.4934, "step": 9917 }, { "epoch": 1.5525986224170318, "grad_norm": 2.6090710163116455, "learning_rate": 3.413213315217391e-05, "loss": 0.3491, "step": 9918 }, { "epoch": 1.5527551659361303, "grad_norm": 2.424104928970337, "learning_rate": 3.412024456521739e-05, "loss": 0.7203, "step": 9919 }, { "epoch": 1.5529117094552287, "grad_norm": 2.6726503372192383, "learning_rate": 3.4108355978260865e-05, "loss": 0.4466, "step": 9920 }, { "epoch": 1.553068252974327, "grad_norm": 5.130659580230713, "learning_rate": 3.4096467391304344e-05, "loss": 1.0742, "step": 9921 }, { "epoch": 1.5532247964934252, "grad_norm": 2.146470546722412, "learning_rate": 3.408457880434782e-05, "loss": 0.5434, "step": 9922 }, { "epoch": 1.5533813400125234, "grad_norm": 2.65730357170105, "learning_rate": 3.40726902173913e-05, "loss": 0.9173, "step": 9923 }, { "epoch": 1.5535378835316218, "grad_norm": 2.2022931575775146, "learning_rate": 3.406080163043478e-05, "loss": 0.7002, "step": 9924 }, { "epoch": 1.5536944270507203, "grad_norm": 3.2549283504486084, "learning_rate": 3.4048913043478256e-05, "loss": 0.8937, "step": 9925 }, { "epoch": 1.5538509705698185, "grad_norm": 3.0747780799865723, "learning_rate": 3.4037024456521734e-05, "loss": 0.5534, "step": 9926 }, { "epoch": 1.5540075140889167, "grad_norm": 3.3332486152648926, 
"learning_rate": 3.402513586956521e-05, "loss": 0.7009, "step": 9927 }, { "epoch": 1.554164057608015, "grad_norm": 5.860404014587402, "learning_rate": 3.40132472826087e-05, "loss": 0.8935, "step": 9928 }, { "epoch": 1.5543206011271133, "grad_norm": 2.8831799030303955, "learning_rate": 3.400135869565217e-05, "loss": 0.947, "step": 9929 }, { "epoch": 1.5544771446462118, "grad_norm": 3.01436710357666, "learning_rate": 3.398947010869565e-05, "loss": 0.7629, "step": 9930 }, { "epoch": 1.55463368816531, "grad_norm": 3.547330141067505, "learning_rate": 3.3977581521739125e-05, "loss": 1.0224, "step": 9931 }, { "epoch": 1.5547902316844082, "grad_norm": 5.267106533050537, "learning_rate": 3.39656929347826e-05, "loss": 1.6597, "step": 9932 }, { "epoch": 1.5549467752035064, "grad_norm": 1.9838265180587769, "learning_rate": 3.395380434782609e-05, "loss": 0.758, "step": 9933 }, { "epoch": 1.5551033187226049, "grad_norm": 2.7799999713897705, "learning_rate": 3.3941915760869566e-05, "loss": 0.8879, "step": 9934 }, { "epoch": 1.5552598622417033, "grad_norm": 3.02760910987854, "learning_rate": 3.393002717391304e-05, "loss": 0.7191, "step": 9935 }, { "epoch": 1.5554164057608015, "grad_norm": 3.057662010192871, "learning_rate": 3.3918138586956516e-05, "loss": 1.4681, "step": 9936 }, { "epoch": 1.5555729492798998, "grad_norm": 1.7930982112884521, "learning_rate": 3.3906249999999994e-05, "loss": 0.6866, "step": 9937 }, { "epoch": 1.555729492798998, "grad_norm": 2.181812047958374, "learning_rate": 3.389436141304348e-05, "loss": 0.8777, "step": 9938 }, { "epoch": 1.5558860363180964, "grad_norm": 0.5295925140380859, "learning_rate": 3.388247282608696e-05, "loss": 0.1448, "step": 9939 }, { "epoch": 1.5560425798371949, "grad_norm": 0.49820011854171753, "learning_rate": 3.387058423913043e-05, "loss": 0.1266, "step": 9940 }, { "epoch": 1.556199123356293, "grad_norm": 0.838528573513031, "learning_rate": 3.385869565217391e-05, "loss": 0.1799, "step": 9941 }, { "epoch": 1.5563556668753913, 
"grad_norm": 0.5594557523727417, "learning_rate": 3.3846807065217385e-05, "loss": 0.2132, "step": 9942 }, { "epoch": 1.5565122103944895, "grad_norm": 0.3953682482242584, "learning_rate": 3.383491847826087e-05, "loss": 0.2026, "step": 9943 }, { "epoch": 1.556668753913588, "grad_norm": 0.4497411847114563, "learning_rate": 3.382302989130435e-05, "loss": 0.1369, "step": 9944 }, { "epoch": 1.5568252974326864, "grad_norm": 0.8755006194114685, "learning_rate": 3.3811141304347826e-05, "loss": 0.2434, "step": 9945 }, { "epoch": 1.5569818409517846, "grad_norm": 0.947909951210022, "learning_rate": 3.37992527173913e-05, "loss": 0.1952, "step": 9946 }, { "epoch": 1.5571383844708828, "grad_norm": 1.2304229736328125, "learning_rate": 3.3787364130434776e-05, "loss": 0.2772, "step": 9947 }, { "epoch": 1.5572949279899813, "grad_norm": 0.9237186908721924, "learning_rate": 3.377547554347826e-05, "loss": 0.3614, "step": 9948 }, { "epoch": 1.5574514715090795, "grad_norm": 1.1335630416870117, "learning_rate": 3.376358695652174e-05, "loss": 0.3576, "step": 9949 }, { "epoch": 1.557608015028178, "grad_norm": 1.2573052644729614, "learning_rate": 3.375169836956522e-05, "loss": 0.1928, "step": 9950 }, { "epoch": 1.5577645585472761, "grad_norm": 0.9555529356002808, "learning_rate": 3.3739809782608695e-05, "loss": 0.2686, "step": 9951 }, { "epoch": 1.5579211020663744, "grad_norm": 1.1866750717163086, "learning_rate": 3.3727921195652166e-05, "loss": 0.3976, "step": 9952 }, { "epoch": 1.5580776455854728, "grad_norm": 0.7947357892990112, "learning_rate": 3.371603260869565e-05, "loss": 0.2169, "step": 9953 }, { "epoch": 1.5582341891045712, "grad_norm": 1.263967752456665, "learning_rate": 3.370414402173913e-05, "loss": 0.3475, "step": 9954 }, { "epoch": 1.5583907326236695, "grad_norm": 1.2545052766799927, "learning_rate": 3.369225543478261e-05, "loss": 0.3779, "step": 9955 }, { "epoch": 1.5585472761427677, "grad_norm": 3.0372350215911865, "learning_rate": 3.3680366847826086e-05, "loss": 0.7564, 
"step": 9956 }, { "epoch": 1.5587038196618659, "grad_norm": 2.4949429035186768, "learning_rate": 3.3668478260869564e-05, "loss": 0.3251, "step": 9957 }, { "epoch": 1.5588603631809643, "grad_norm": 1.254440188407898, "learning_rate": 3.365658967391304e-05, "loss": 0.2559, "step": 9958 }, { "epoch": 1.5590169067000628, "grad_norm": 1.5905170440673828, "learning_rate": 3.364470108695652e-05, "loss": 0.5256, "step": 9959 }, { "epoch": 1.559173450219161, "grad_norm": 1.60410737991333, "learning_rate": 3.36328125e-05, "loss": 0.4676, "step": 9960 }, { "epoch": 1.5593299937382592, "grad_norm": 4.248783111572266, "learning_rate": 3.3620923913043477e-05, "loss": 0.69, "step": 9961 }, { "epoch": 1.5594865372573574, "grad_norm": 1.4306432008743286, "learning_rate": 3.3609035326086955e-05, "loss": 0.3649, "step": 9962 }, { "epoch": 1.5596430807764559, "grad_norm": 1.9882729053497314, "learning_rate": 3.359714673913043e-05, "loss": 0.6415, "step": 9963 }, { "epoch": 1.5597996242955543, "grad_norm": 8.603158950805664, "learning_rate": 3.358525815217391e-05, "loss": 0.8173, "step": 9964 }, { "epoch": 1.5599561678146525, "grad_norm": 2.154953956604004, "learning_rate": 3.357336956521739e-05, "loss": 0.6758, "step": 9965 }, { "epoch": 1.5601127113337507, "grad_norm": 1.8238340616226196, "learning_rate": 3.356148097826087e-05, "loss": 0.396, "step": 9966 }, { "epoch": 1.560269254852849, "grad_norm": 1.75259530544281, "learning_rate": 3.3549592391304346e-05, "loss": 0.6842, "step": 9967 }, { "epoch": 1.5604257983719474, "grad_norm": 2.3687760829925537, "learning_rate": 3.3537703804347824e-05, "loss": 0.585, "step": 9968 }, { "epoch": 1.5605823418910458, "grad_norm": 1.7490761280059814, "learning_rate": 3.35258152173913e-05, "loss": 0.4348, "step": 9969 }, { "epoch": 1.560738885410144, "grad_norm": 1.5283021926879883, "learning_rate": 3.351392663043478e-05, "loss": 0.6256, "step": 9970 }, { "epoch": 1.5608954289292423, "grad_norm": 1.7685884237289429, "learning_rate": 
3.350203804347826e-05, "loss": 0.5911, "step": 9971 }, { "epoch": 1.5610519724483405, "grad_norm": 1.748946189880371, "learning_rate": 3.3490149456521736e-05, "loss": 0.5695, "step": 9972 }, { "epoch": 1.561208515967439, "grad_norm": 4.499454021453857, "learning_rate": 3.3478260869565214e-05, "loss": 1.0758, "step": 9973 }, { "epoch": 1.5613650594865374, "grad_norm": 4.797616958618164, "learning_rate": 3.346637228260869e-05, "loss": 1.3134, "step": 9974 }, { "epoch": 1.5615216030056356, "grad_norm": 3.0602262020111084, "learning_rate": 3.345448369565217e-05, "loss": 1.1788, "step": 9975 }, { "epoch": 1.5616781465247338, "grad_norm": 3.7913107872009277, "learning_rate": 3.344259510869565e-05, "loss": 0.6325, "step": 9976 }, { "epoch": 1.561834690043832, "grad_norm": 3.080826759338379, "learning_rate": 3.343070652173913e-05, "loss": 0.9457, "step": 9977 }, { "epoch": 1.5619912335629305, "grad_norm": 3.242022752761841, "learning_rate": 3.3418817934782605e-05, "loss": 0.7995, "step": 9978 }, { "epoch": 1.562147777082029, "grad_norm": 5.509000301361084, "learning_rate": 3.3406929347826083e-05, "loss": 1.3462, "step": 9979 }, { "epoch": 1.5623043206011271, "grad_norm": 2.333416700363159, "learning_rate": 3.339504076086956e-05, "loss": 0.9995, "step": 9980 }, { "epoch": 1.5624608641202253, "grad_norm": 4.777994632720947, "learning_rate": 3.338315217391304e-05, "loss": 0.7148, "step": 9981 }, { "epoch": 1.5626174076393238, "grad_norm": 3.361694097518921, "learning_rate": 3.337126358695652e-05, "loss": 0.6635, "step": 9982 }, { "epoch": 1.562773951158422, "grad_norm": 2.5201728343963623, "learning_rate": 3.3359374999999996e-05, "loss": 1.1929, "step": 9983 }, { "epoch": 1.5629304946775204, "grad_norm": 3.2841475009918213, "learning_rate": 3.3347486413043474e-05, "loss": 1.0911, "step": 9984 }, { "epoch": 1.5630870381966186, "grad_norm": 1.1818060874938965, "learning_rate": 3.333559782608695e-05, "loss": 0.4302, "step": 9985 }, { "epoch": 1.5632435817157169, "grad_norm": 
1.2086371183395386, "learning_rate": 3.332370923913043e-05, "loss": 0.5754, "step": 9986 }, { "epoch": 1.5634001252348153, "grad_norm": 1.6073236465454102, "learning_rate": 3.331182065217391e-05, "loss": 0.3394, "step": 9987 }, { "epoch": 1.5635566687539137, "grad_norm": 2.4132962226867676, "learning_rate": 3.329993206521739e-05, "loss": 0.8231, "step": 9988 }, { "epoch": 1.563713212273012, "grad_norm": 0.6444993615150452, "learning_rate": 3.3288043478260865e-05, "loss": 0.1901, "step": 9989 }, { "epoch": 1.5638697557921102, "grad_norm": 0.3397302031517029, "learning_rate": 3.327615489130434e-05, "loss": 0.1173, "step": 9990 }, { "epoch": 1.5640262993112084, "grad_norm": 0.7278451323509216, "learning_rate": 3.326426630434783e-05, "loss": 0.2369, "step": 9991 }, { "epoch": 1.5641828428303068, "grad_norm": 0.829058051109314, "learning_rate": 3.32523777173913e-05, "loss": 0.2169, "step": 9992 }, { "epoch": 1.5643393863494053, "grad_norm": 1.3171637058258057, "learning_rate": 3.324048913043478e-05, "loss": 0.3339, "step": 9993 }, { "epoch": 1.5644959298685035, "grad_norm": 0.47179558873176575, "learning_rate": 3.3228600543478256e-05, "loss": 0.2889, "step": 9994 }, { "epoch": 1.5646524733876017, "grad_norm": 1.1824686527252197, "learning_rate": 3.3216711956521734e-05, "loss": 0.3579, "step": 9995 }, { "epoch": 1.5648090169067, "grad_norm": 0.7399032115936279, "learning_rate": 3.320482336956522e-05, "loss": 0.2184, "step": 9996 }, { "epoch": 1.5649655604257984, "grad_norm": 0.7790495753288269, "learning_rate": 3.31929347826087e-05, "loss": 0.1941, "step": 9997 }, { "epoch": 1.5651221039448968, "grad_norm": 0.9702885150909424, "learning_rate": 3.318104619565217e-05, "loss": 0.3421, "step": 9998 }, { "epoch": 1.565278647463995, "grad_norm": 0.648149311542511, "learning_rate": 3.3169157608695647e-05, "loss": 0.1607, "step": 9999 }, { "epoch": 1.5654351909830932, "grad_norm": 0.816554069519043, "learning_rate": 3.3157269021739125e-05, "loss": 0.1536, "step": 10000 }, { 
"epoch": 1.5654351909830932, "eval_loss": 0.47840648889541626, "eval_runtime": 203.5946, "eval_samples_per_second": 60.822, "eval_steps_per_second": 3.802, "eval_wer": 0.3065643793697006, "step": 10000 }, { "epoch": 1.5655917345021915, "grad_norm": 1.1876505613327026, "learning_rate": 3.314538043478261e-05, "loss": 0.3895, "step": 10001 }, { "epoch": 1.56574827802129, "grad_norm": 1.929848074913025, "learning_rate": 3.313349184782609e-05, "loss": 0.2892, "step": 10002 }, { "epoch": 1.5659048215403883, "grad_norm": 0.9021331667900085, "learning_rate": 3.3121603260869566e-05, "loss": 0.2343, "step": 10003 }, { "epoch": 1.5660613650594866, "grad_norm": 1.2929447889328003, "learning_rate": 3.310971467391304e-05, "loss": 0.3, "step": 10004 }, { "epoch": 1.5662179085785848, "grad_norm": 3.9611763954162598, "learning_rate": 3.3097826086956515e-05, "loss": 0.2691, "step": 10005 }, { "epoch": 1.566374452097683, "grad_norm": 1.2823082208633423, "learning_rate": 3.30859375e-05, "loss": 0.4923, "step": 10006 }, { "epoch": 1.5665309956167814, "grad_norm": 2.7527194023132324, "learning_rate": 3.307404891304348e-05, "loss": 0.3452, "step": 10007 }, { "epoch": 1.5666875391358799, "grad_norm": 1.754594326019287, "learning_rate": 3.306216032608696e-05, "loss": 0.4294, "step": 10008 }, { "epoch": 1.566844082654978, "grad_norm": 1.8212445974349976, "learning_rate": 3.305027173913043e-05, "loss": 0.4195, "step": 10009 }, { "epoch": 1.5670006261740763, "grad_norm": 2.6413702964782715, "learning_rate": 3.3038383152173906e-05, "loss": 0.4982, "step": 10010 }, { "epoch": 1.5671571696931748, "grad_norm": 1.6063095331192017, "learning_rate": 3.302649456521739e-05, "loss": 0.3245, "step": 10011 }, { "epoch": 1.567313713212273, "grad_norm": 0.8769960999488831, "learning_rate": 3.301460597826087e-05, "loss": 0.2838, "step": 10012 }, { "epoch": 1.5674702567313714, "grad_norm": 3.8902928829193115, "learning_rate": 3.300271739130435e-05, "loss": 0.729, "step": 10013 }, { "epoch": 
1.5676268002504696, "grad_norm": 3.1373910903930664, "learning_rate": 3.2990828804347826e-05, "loss": 0.528, "step": 10014 }, { "epoch": 1.5677833437695678, "grad_norm": 2.6620826721191406, "learning_rate": 3.29789402173913e-05, "loss": 0.5121, "step": 10015 }, { "epoch": 1.5679398872886663, "grad_norm": 1.959355354309082, "learning_rate": 3.296705163043478e-05, "loss": 0.5528, "step": 10016 }, { "epoch": 1.5680964308077645, "grad_norm": 2.0414528846740723, "learning_rate": 3.295516304347826e-05, "loss": 0.7324, "step": 10017 }, { "epoch": 1.568252974326863, "grad_norm": 2.5357861518859863, "learning_rate": 3.294327445652174e-05, "loss": 0.7374, "step": 10018 }, { "epoch": 1.5684095178459612, "grad_norm": 3.407268762588501, "learning_rate": 3.2931385869565216e-05, "loss": 0.8259, "step": 10019 }, { "epoch": 1.5685660613650594, "grad_norm": 1.3756870031356812, "learning_rate": 3.2919497282608695e-05, "loss": 0.5009, "step": 10020 }, { "epoch": 1.5687226048841578, "grad_norm": 3.3000845909118652, "learning_rate": 3.290760869565217e-05, "loss": 1.1425, "step": 10021 }, { "epoch": 1.5688791484032563, "grad_norm": 1.8563802242279053, "learning_rate": 3.289572010869565e-05, "loss": 0.6081, "step": 10022 }, { "epoch": 1.5690356919223545, "grad_norm": 1.9495269060134888, "learning_rate": 3.288383152173913e-05, "loss": 0.7484, "step": 10023 }, { "epoch": 1.5691922354414527, "grad_norm": 1.2622429132461548, "learning_rate": 3.287194293478261e-05, "loss": 0.5048, "step": 10024 }, { "epoch": 1.569348778960551, "grad_norm": 3.6872949600219727, "learning_rate": 3.2860054347826085e-05, "loss": 0.9678, "step": 10025 }, { "epoch": 1.5695053224796494, "grad_norm": 2.460826873779297, "learning_rate": 3.2848165760869564e-05, "loss": 0.8032, "step": 10026 }, { "epoch": 1.5696618659987478, "grad_norm": 2.3013107776641846, "learning_rate": 3.283627717391304e-05, "loss": 0.5674, "step": 10027 }, { "epoch": 1.569818409517846, "grad_norm": 2.7555274963378906, "learning_rate": 
3.282438858695652e-05, "loss": 0.8803, "step": 10028 }, { "epoch": 1.5699749530369442, "grad_norm": 6.2030720710754395, "learning_rate": 3.28125e-05, "loss": 1.1828, "step": 10029 }, { "epoch": 1.5701314965560424, "grad_norm": 2.658017635345459, "learning_rate": 3.2800611413043476e-05, "loss": 0.7513, "step": 10030 }, { "epoch": 1.570288040075141, "grad_norm": 4.58135986328125, "learning_rate": 3.2788722826086954e-05, "loss": 1.2877, "step": 10031 }, { "epoch": 1.5704445835942393, "grad_norm": 7.356186389923096, "learning_rate": 3.277683423913043e-05, "loss": 0.7276, "step": 10032 }, { "epoch": 1.5706011271133375, "grad_norm": 4.319295406341553, "learning_rate": 3.276494565217391e-05, "loss": 1.3934, "step": 10033 }, { "epoch": 1.5707576706324358, "grad_norm": 2.3864634037017822, "learning_rate": 3.275305706521739e-05, "loss": 1.0639, "step": 10034 }, { "epoch": 1.570914214151534, "grad_norm": 2.1003081798553467, "learning_rate": 3.274116847826087e-05, "loss": 1.2381, "step": 10035 }, { "epoch": 1.5710707576706324, "grad_norm": 2.9257254600524902, "learning_rate": 3.2729279891304345e-05, "loss": 0.7299, "step": 10036 }, { "epoch": 1.5712273011897309, "grad_norm": 2.994534969329834, "learning_rate": 3.271739130434782e-05, "loss": 0.7361, "step": 10037 }, { "epoch": 1.571383844708829, "grad_norm": 7.022100925445557, "learning_rate": 3.27055027173913e-05, "loss": 1.3107, "step": 10038 }, { "epoch": 1.5715403882279273, "grad_norm": 0.43398040533065796, "learning_rate": 3.269361413043478e-05, "loss": 0.204, "step": 10039 }, { "epoch": 1.5716969317470255, "grad_norm": 0.5054906606674194, "learning_rate": 3.268172554347826e-05, "loss": 0.1608, "step": 10040 }, { "epoch": 1.571853475266124, "grad_norm": 0.49651673436164856, "learning_rate": 3.2669836956521736e-05, "loss": 0.1496, "step": 10041 }, { "epoch": 1.5720100187852224, "grad_norm": 0.591971218585968, "learning_rate": 3.2657948369565214e-05, "loss": 0.1308, "step": 10042 }, { "epoch": 1.5721665623043206, 
"grad_norm": 0.5332335829734802, "learning_rate": 3.264605978260869e-05, "loss": 0.1725, "step": 10043 }, { "epoch": 1.5723231058234188, "grad_norm": 0.9095372557640076, "learning_rate": 3.263417119565217e-05, "loss": 0.261, "step": 10044 }, { "epoch": 1.5724796493425173, "grad_norm": 0.5609951615333557, "learning_rate": 3.262228260869565e-05, "loss": 0.1536, "step": 10045 }, { "epoch": 1.5726361928616155, "grad_norm": 0.5564268827438354, "learning_rate": 3.261039402173913e-05, "loss": 0.2077, "step": 10046 }, { "epoch": 1.572792736380714, "grad_norm": 0.6826467514038086, "learning_rate": 3.2598505434782605e-05, "loss": 0.2202, "step": 10047 }, { "epoch": 1.5729492798998121, "grad_norm": 1.325971007347107, "learning_rate": 3.258661684782608e-05, "loss": 0.1623, "step": 10048 }, { "epoch": 1.5731058234189104, "grad_norm": 0.7880258560180664, "learning_rate": 3.257472826086956e-05, "loss": 0.242, "step": 10049 }, { "epoch": 1.5732623669380088, "grad_norm": 0.8278499245643616, "learning_rate": 3.256283967391304e-05, "loss": 0.3115, "step": 10050 }, { "epoch": 1.573418910457107, "grad_norm": 0.8254081010818481, "learning_rate": 3.255095108695652e-05, "loss": 0.2033, "step": 10051 }, { "epoch": 1.5735754539762055, "grad_norm": 1.0673773288726807, "learning_rate": 3.2539062499999996e-05, "loss": 0.2602, "step": 10052 }, { "epoch": 1.5737319974953037, "grad_norm": 1.9798575639724731, "learning_rate": 3.2527173913043474e-05, "loss": 0.4611, "step": 10053 }, { "epoch": 1.573888541014402, "grad_norm": 1.526545524597168, "learning_rate": 3.251528532608695e-05, "loss": 0.2595, "step": 10054 }, { "epoch": 1.5740450845335003, "grad_norm": 5.066948413848877, "learning_rate": 3.250339673913043e-05, "loss": 0.3488, "step": 10055 }, { "epoch": 1.5742016280525988, "grad_norm": 3.922994375228882, "learning_rate": 3.249150815217391e-05, "loss": 0.5024, "step": 10056 }, { "epoch": 1.574358171571697, "grad_norm": 2.0900282859802246, "learning_rate": 3.2479619565217386e-05, "loss": 
0.4176, "step": 10057 }, { "epoch": 1.5745147150907952, "grad_norm": 1.6446585655212402, "learning_rate": 3.2467730978260864e-05, "loss": 0.4702, "step": 10058 }, { "epoch": 1.5746712586098934, "grad_norm": 3.6721813678741455, "learning_rate": 3.245584239130434e-05, "loss": 0.6326, "step": 10059 }, { "epoch": 1.5748278021289919, "grad_norm": 1.4987750053405762, "learning_rate": 3.244395380434783e-05, "loss": 0.3327, "step": 10060 }, { "epoch": 1.5749843456480903, "grad_norm": 2.1085026264190674, "learning_rate": 3.24320652173913e-05, "loss": 0.3445, "step": 10061 }, { "epoch": 1.5751408891671885, "grad_norm": 1.3398019075393677, "learning_rate": 3.242017663043478e-05, "loss": 0.5393, "step": 10062 }, { "epoch": 1.5752974326862867, "grad_norm": 1.8874784708023071, "learning_rate": 3.2408288043478255e-05, "loss": 0.4474, "step": 10063 }, { "epoch": 1.575453976205385, "grad_norm": 2.246999740600586, "learning_rate": 3.2396399456521733e-05, "loss": 0.6682, "step": 10064 }, { "epoch": 1.5756105197244834, "grad_norm": 3.947788715362549, "learning_rate": 3.238451086956522e-05, "loss": 0.6109, "step": 10065 }, { "epoch": 1.5757670632435818, "grad_norm": 1.240382432937622, "learning_rate": 3.2372622282608697e-05, "loss": 0.2697, "step": 10066 }, { "epoch": 1.57592360676268, "grad_norm": 4.211026191711426, "learning_rate": 3.236073369565217e-05, "loss": 0.8726, "step": 10067 }, { "epoch": 1.5760801502817783, "grad_norm": 2.189406633377075, "learning_rate": 3.2348845108695646e-05, "loss": 0.7671, "step": 10068 }, { "epoch": 1.5762366938008765, "grad_norm": 2.055722951889038, "learning_rate": 3.2336956521739124e-05, "loss": 0.4575, "step": 10069 }, { "epoch": 1.576393237319975, "grad_norm": 4.797212600708008, "learning_rate": 3.232506793478261e-05, "loss": 0.6103, "step": 10070 }, { "epoch": 1.5765497808390734, "grad_norm": 3.0193288326263428, "learning_rate": 3.231317934782609e-05, "loss": 0.5386, "step": 10071 }, { "epoch": 1.5767063243581716, "grad_norm": 
1.4885249137878418, "learning_rate": 3.2301290760869565e-05, "loss": 0.5779, "step": 10072 }, { "epoch": 1.5768628678772698, "grad_norm": 2.985391855239868, "learning_rate": 3.228940217391304e-05, "loss": 0.5398, "step": 10073 }, { "epoch": 1.577019411396368, "grad_norm": 3.092790365219116, "learning_rate": 3.2277513586956515e-05, "loss": 1.0174, "step": 10074 }, { "epoch": 1.5771759549154665, "grad_norm": 6.6943230628967285, "learning_rate": 3.2265625e-05, "loss": 0.8228, "step": 10075 }, { "epoch": 1.577332498434565, "grad_norm": 6.443971157073975, "learning_rate": 3.225373641304348e-05, "loss": 1.2933, "step": 10076 }, { "epoch": 1.5774890419536631, "grad_norm": 3.661564588546753, "learning_rate": 3.2241847826086956e-05, "loss": 1.2159, "step": 10077 }, { "epoch": 1.5776455854727613, "grad_norm": 4.259986400604248, "learning_rate": 3.222995923913043e-05, "loss": 1.4063, "step": 10078 }, { "epoch": 1.5778021289918598, "grad_norm": 5.711047172546387, "learning_rate": 3.2218070652173906e-05, "loss": 0.864, "step": 10079 }, { "epoch": 1.577958672510958, "grad_norm": 5.301638603210449, "learning_rate": 3.220618206521739e-05, "loss": 1.003, "step": 10080 }, { "epoch": 1.5781152160300564, "grad_norm": 3.540651559829712, "learning_rate": 3.219429347826087e-05, "loss": 1.2744, "step": 10081 }, { "epoch": 1.5782717595491547, "grad_norm": 4.282635688781738, "learning_rate": 3.218240489130435e-05, "loss": 1.3455, "step": 10082 }, { "epoch": 1.5784283030682529, "grad_norm": 3.0004637241363525, "learning_rate": 3.2170516304347825e-05, "loss": 0.4898, "step": 10083 }, { "epoch": 1.5785848465873513, "grad_norm": 0.9685983657836914, "learning_rate": 3.2158627717391297e-05, "loss": 0.1505, "step": 10084 }, { "epoch": 1.5787413901064495, "grad_norm": 1.937862515449524, "learning_rate": 3.214673913043478e-05, "loss": 0.5217, "step": 10085 }, { "epoch": 1.578897933625548, "grad_norm": 3.639050006866455, "learning_rate": 3.213485054347826e-05, "loss": 0.5828, "step": 10086 }, { 
"epoch": 1.5790544771446462, "grad_norm": 4.018068313598633, "learning_rate": 3.212296195652174e-05, "loss": 1.4976, "step": 10087 }, { "epoch": 1.5792110206637444, "grad_norm": 4.258273124694824, "learning_rate": 3.2111073369565216e-05, "loss": 1.5887, "step": 10088 }, { "epoch": 1.5793675641828429, "grad_norm": 0.7053163647651672, "learning_rate": 3.2099184782608694e-05, "loss": 0.2143, "step": 10089 }, { "epoch": 1.5795241077019413, "grad_norm": 0.6031246185302734, "learning_rate": 3.208729619565217e-05, "loss": 0.3225, "step": 10090 }, { "epoch": 1.5796806512210395, "grad_norm": 1.2712950706481934, "learning_rate": 3.207540760869565e-05, "loss": 0.3518, "step": 10091 }, { "epoch": 1.5798371947401377, "grad_norm": 1.3443080186843872, "learning_rate": 3.206351902173913e-05, "loss": 0.1555, "step": 10092 }, { "epoch": 1.579993738259236, "grad_norm": 0.892091691493988, "learning_rate": 3.205163043478261e-05, "loss": 0.2298, "step": 10093 }, { "epoch": 1.5801502817783344, "grad_norm": 1.7012250423431396, "learning_rate": 3.2039741847826085e-05, "loss": 0.247, "step": 10094 }, { "epoch": 1.5803068252974328, "grad_norm": 1.6336909532546997, "learning_rate": 3.202785326086956e-05, "loss": 0.2997, "step": 10095 }, { "epoch": 1.580463368816531, "grad_norm": 1.106988787651062, "learning_rate": 3.201596467391304e-05, "loss": 0.188, "step": 10096 }, { "epoch": 1.5806199123356293, "grad_norm": 0.5083320140838623, "learning_rate": 3.200407608695652e-05, "loss": 0.1613, "step": 10097 }, { "epoch": 1.5807764558547275, "grad_norm": 0.8176162838935852, "learning_rate": 3.19921875e-05, "loss": 0.2978, "step": 10098 }, { "epoch": 1.580932999373826, "grad_norm": 1.3497486114501953, "learning_rate": 3.1980298913043476e-05, "loss": 0.2649, "step": 10099 }, { "epoch": 1.5810895428929244, "grad_norm": 3.2387630939483643, "learning_rate": 3.1968410326086954e-05, "loss": 0.257, "step": 10100 }, { "epoch": 1.5812460864120226, "grad_norm": 0.9557090997695923, "learning_rate": 
3.195652173913043e-05, "loss": 0.217, "step": 10101 }, { "epoch": 1.5814026299311208, "grad_norm": 1.8548837900161743, "learning_rate": 3.194463315217391e-05, "loss": 0.3671, "step": 10102 }, { "epoch": 1.581559173450219, "grad_norm": 1.2229750156402588, "learning_rate": 3.193274456521739e-05, "loss": 0.3601, "step": 10103 }, { "epoch": 1.5817157169693175, "grad_norm": 1.4529528617858887, "learning_rate": 3.1920855978260866e-05, "loss": 0.3789, "step": 10104 }, { "epoch": 1.581872260488416, "grad_norm": 3.1878461837768555, "learning_rate": 3.1908967391304345e-05, "loss": 0.3977, "step": 10105 }, { "epoch": 1.5820288040075141, "grad_norm": 2.0986719131469727, "learning_rate": 3.189707880434782e-05, "loss": 0.4961, "step": 10106 }, { "epoch": 1.5821853475266123, "grad_norm": 3.102585554122925, "learning_rate": 3.18851902173913e-05, "loss": 0.359, "step": 10107 }, { "epoch": 1.5823418910457105, "grad_norm": 4.867345333099365, "learning_rate": 3.187330163043478e-05, "loss": 0.4531, "step": 10108 }, { "epoch": 1.582498434564809, "grad_norm": 1.7194571495056152, "learning_rate": 3.186141304347826e-05, "loss": 0.2807, "step": 10109 }, { "epoch": 1.5826549780839074, "grad_norm": 8.440205574035645, "learning_rate": 3.1849524456521735e-05, "loss": 0.7281, "step": 10110 }, { "epoch": 1.5828115216030056, "grad_norm": 2.8930020332336426, "learning_rate": 3.1837635869565214e-05, "loss": 0.541, "step": 10111 }, { "epoch": 1.5829680651221039, "grad_norm": 6.041659832000732, "learning_rate": 3.182574728260869e-05, "loss": 0.5159, "step": 10112 }, { "epoch": 1.5831246086412023, "grad_norm": 2.884404182434082, "learning_rate": 3.181385869565217e-05, "loss": 0.3813, "step": 10113 }, { "epoch": 1.5832811521603005, "grad_norm": 1.6402472257614136, "learning_rate": 3.180197010869565e-05, "loss": 0.4462, "step": 10114 }, { "epoch": 1.583437695679399, "grad_norm": 22.938182830810547, "learning_rate": 3.1790081521739126e-05, "loss": 0.2206, "step": 10115 }, { "epoch": 1.5835942391984972, 
"grad_norm": 1.6546770334243774, "learning_rate": 3.1778192934782604e-05, "loss": 0.6065, "step": 10116 }, { "epoch": 1.5837507827175954, "grad_norm": 1.5633736848831177, "learning_rate": 3.176630434782608e-05, "loss": 0.4053, "step": 10117 }, { "epoch": 1.5839073262366938, "grad_norm": 3.3867077827453613, "learning_rate": 3.175441576086957e-05, "loss": 0.7632, "step": 10118 }, { "epoch": 1.5840638697557923, "grad_norm": 2.464543342590332, "learning_rate": 3.174252717391304e-05, "loss": 0.5437, "step": 10119 }, { "epoch": 1.5842204132748905, "grad_norm": 2.1203386783599854, "learning_rate": 3.173063858695652e-05, "loss": 0.8277, "step": 10120 }, { "epoch": 1.5843769567939887, "grad_norm": 4.227522373199463, "learning_rate": 3.1718749999999995e-05, "loss": 1.2371, "step": 10121 }, { "epoch": 1.584533500313087, "grad_norm": 2.5655643939971924, "learning_rate": 3.170686141304347e-05, "loss": 0.4522, "step": 10122 }, { "epoch": 1.5846900438321854, "grad_norm": 5.227837562561035, "learning_rate": 3.169497282608696e-05, "loss": 0.5841, "step": 10123 }, { "epoch": 1.5848465873512838, "grad_norm": 2.4780960083007812, "learning_rate": 3.168308423913043e-05, "loss": 0.6617, "step": 10124 }, { "epoch": 1.585003130870382, "grad_norm": 1.8330973386764526, "learning_rate": 3.167119565217391e-05, "loss": 0.2496, "step": 10125 }, { "epoch": 1.5851596743894802, "grad_norm": 3.765514373779297, "learning_rate": 3.1659307065217386e-05, "loss": 0.8498, "step": 10126 }, { "epoch": 1.5853162179085785, "grad_norm": 4.961223125457764, "learning_rate": 3.1647418478260864e-05, "loss": 0.7348, "step": 10127 }, { "epoch": 1.585472761427677, "grad_norm": 3.682126760482788, "learning_rate": 3.163552989130435e-05, "loss": 1.0331, "step": 10128 }, { "epoch": 1.5856293049467753, "grad_norm": 2.0437381267547607, "learning_rate": 3.162364130434783e-05, "loss": 0.9237, "step": 10129 }, { "epoch": 1.5857858484658736, "grad_norm": 3.4378457069396973, "learning_rate": 3.16117527173913e-05, "loss": 
1.5697, "step": 10130 }, { "epoch": 1.5859423919849718, "grad_norm": 2.0327553749084473, "learning_rate": 3.159986413043478e-05, "loss": 0.8567, "step": 10131 }, { "epoch": 1.58609893550407, "grad_norm": 2.484196901321411, "learning_rate": 3.1587975543478255e-05, "loss": 1.0279, "step": 10132 }, { "epoch": 1.5862554790231684, "grad_norm": 2.760066270828247, "learning_rate": 3.157608695652174e-05, "loss": 1.1805, "step": 10133 }, { "epoch": 1.5864120225422669, "grad_norm": 3.4179399013519287, "learning_rate": 3.156419836956522e-05, "loss": 0.8023, "step": 10134 }, { "epoch": 1.586568566061365, "grad_norm": 1.3661054372787476, "learning_rate": 3.1552309782608696e-05, "loss": 0.5304, "step": 10135 }, { "epoch": 1.5867251095804633, "grad_norm": 0.5881316661834717, "learning_rate": 3.154042119565217e-05, "loss": 0.1197, "step": 10136 }, { "epoch": 1.5868816530995615, "grad_norm": 2.7798805236816406, "learning_rate": 3.1528532608695646e-05, "loss": 0.6619, "step": 10137 }, { "epoch": 1.58703819661866, "grad_norm": 3.1561098098754883, "learning_rate": 3.151664402173913e-05, "loss": 0.6942, "step": 10138 }, { "epoch": 1.5871947401377584, "grad_norm": 0.5989023447036743, "learning_rate": 3.150475543478261e-05, "loss": 0.2505, "step": 10139 }, { "epoch": 1.5873512836568566, "grad_norm": 0.5883434414863586, "learning_rate": 3.149286684782609e-05, "loss": 0.2203, "step": 10140 }, { "epoch": 1.5875078271759548, "grad_norm": 2.7145180702209473, "learning_rate": 3.1480978260869565e-05, "loss": 0.2375, "step": 10141 }, { "epoch": 1.587664370695053, "grad_norm": 0.6921699047088623, "learning_rate": 3.1469089673913036e-05, "loss": 0.2688, "step": 10142 }, { "epoch": 1.5878209142141515, "grad_norm": 1.2788141965866089, "learning_rate": 3.145720108695652e-05, "loss": 0.2628, "step": 10143 }, { "epoch": 1.58797745773325, "grad_norm": 1.3061723709106445, "learning_rate": 3.14453125e-05, "loss": 0.1691, "step": 10144 }, { "epoch": 1.5881340012523482, "grad_norm": 0.6534034609794617, 
"learning_rate": 3.143342391304348e-05, "loss": 0.2951, "step": 10145 }, { "epoch": 1.5882905447714464, "grad_norm": 1.2728806734085083, "learning_rate": 3.1421535326086956e-05, "loss": 0.3874, "step": 10146 }, { "epoch": 1.5884470882905448, "grad_norm": 0.841110348701477, "learning_rate": 3.140964673913043e-05, "loss": 0.3542, "step": 10147 }, { "epoch": 1.588603631809643, "grad_norm": 0.8517207503318787, "learning_rate": 3.139775815217391e-05, "loss": 0.4641, "step": 10148 }, { "epoch": 1.5887601753287415, "grad_norm": 0.8553672432899475, "learning_rate": 3.138586956521739e-05, "loss": 0.1774, "step": 10149 }, { "epoch": 1.5889167188478397, "grad_norm": 0.8101016879081726, "learning_rate": 3.137398097826087e-05, "loss": 0.3079, "step": 10150 }, { "epoch": 1.589073262366938, "grad_norm": 1.3758803606033325, "learning_rate": 3.1362092391304347e-05, "loss": 0.2344, "step": 10151 }, { "epoch": 1.5892298058860364, "grad_norm": 2.0989391803741455, "learning_rate": 3.1350203804347825e-05, "loss": 0.2577, "step": 10152 }, { "epoch": 1.5893863494051348, "grad_norm": 1.1007925271987915, "learning_rate": 3.13383152173913e-05, "loss": 0.3068, "step": 10153 }, { "epoch": 1.589542892924233, "grad_norm": 2.4761130809783936, "learning_rate": 3.132642663043478e-05, "loss": 0.5823, "step": 10154 }, { "epoch": 1.5896994364433312, "grad_norm": 2.1532680988311768, "learning_rate": 3.131453804347826e-05, "loss": 0.5186, "step": 10155 }, { "epoch": 1.5898559799624294, "grad_norm": 1.453597903251648, "learning_rate": 3.130264945652174e-05, "loss": 0.3529, "step": 10156 }, { "epoch": 1.5900125234815279, "grad_norm": 0.8974078297615051, "learning_rate": 3.1290760869565215e-05, "loss": 0.3372, "step": 10157 }, { "epoch": 1.5901690670006263, "grad_norm": 1.3507921695709229, "learning_rate": 3.1278872282608694e-05, "loss": 0.3584, "step": 10158 }, { "epoch": 1.5903256105197245, "grad_norm": 1.8638805150985718, "learning_rate": 3.126698369565217e-05, "loss": 0.4468, "step": 10159 }, { 
"epoch": 1.5904821540388228, "grad_norm": 1.734885573387146, "learning_rate": 3.125509510869565e-05, "loss": 0.4614, "step": 10160 }, { "epoch": 1.590638697557921, "grad_norm": 2.6774990558624268, "learning_rate": 3.124320652173913e-05, "loss": 0.2858, "step": 10161 }, { "epoch": 1.5907952410770194, "grad_norm": 1.5462384223937988, "learning_rate": 3.1231317934782606e-05, "loss": 0.3417, "step": 10162 }, { "epoch": 1.5909517845961179, "grad_norm": 2.2554268836975098, "learning_rate": 3.1219429347826084e-05, "loss": 0.68, "step": 10163 }, { "epoch": 1.591108328115216, "grad_norm": 1.249536156654358, "learning_rate": 3.120754076086956e-05, "loss": 0.322, "step": 10164 }, { "epoch": 1.5912648716343143, "grad_norm": 1.8992308378219604, "learning_rate": 3.119565217391304e-05, "loss": 0.5141, "step": 10165 }, { "epoch": 1.5914214151534125, "grad_norm": 1.905631184577942, "learning_rate": 3.118376358695652e-05, "loss": 0.7464, "step": 10166 }, { "epoch": 1.591577958672511, "grad_norm": 2.6717185974121094, "learning_rate": 3.1171875e-05, "loss": 0.7894, "step": 10167 }, { "epoch": 1.5917345021916094, "grad_norm": 1.7212543487548828, "learning_rate": 3.1159986413043475e-05, "loss": 0.6179, "step": 10168 }, { "epoch": 1.5918910457107076, "grad_norm": 3.5596466064453125, "learning_rate": 3.114809782608695e-05, "loss": 0.571, "step": 10169 }, { "epoch": 1.5920475892298058, "grad_norm": 2.4733290672302246, "learning_rate": 3.113620923913043e-05, "loss": 0.7045, "step": 10170 }, { "epoch": 1.592204132748904, "grad_norm": 5.876322269439697, "learning_rate": 3.112432065217391e-05, "loss": 0.9951, "step": 10171 }, { "epoch": 1.5923606762680025, "grad_norm": 3.089984178543091, "learning_rate": 3.111243206521739e-05, "loss": 0.7956, "step": 10172 }, { "epoch": 1.592517219787101, "grad_norm": 3.1375677585601807, "learning_rate": 3.1100543478260866e-05, "loss": 0.6422, "step": 10173 }, { "epoch": 1.5926737633061991, "grad_norm": 3.602233648300171, "learning_rate": 
3.1088654891304344e-05, "loss": 0.8385, "step": 10174 }, { "epoch": 1.5928303068252974, "grad_norm": 2.510434865951538, "learning_rate": 3.107676630434782e-05, "loss": 0.7875, "step": 10175 }, { "epoch": 1.5929868503443956, "grad_norm": 3.1532068252563477, "learning_rate": 3.10648777173913e-05, "loss": 0.6044, "step": 10176 }, { "epoch": 1.593143393863494, "grad_norm": 4.08696174621582, "learning_rate": 3.105298913043478e-05, "loss": 0.5349, "step": 10177 }, { "epoch": 1.5932999373825925, "grad_norm": 3.008937120437622, "learning_rate": 3.104110054347826e-05, "loss": 0.8903, "step": 10178 }, { "epoch": 1.5934564809016907, "grad_norm": 4.681085586547852, "learning_rate": 3.1029211956521735e-05, "loss": 0.9669, "step": 10179 }, { "epoch": 1.593613024420789, "grad_norm": 3.36797833442688, "learning_rate": 3.101732336956521e-05, "loss": 1.0881, "step": 10180 }, { "epoch": 1.5937695679398873, "grad_norm": 2.6295688152313232, "learning_rate": 3.100543478260869e-05, "loss": 0.8003, "step": 10181 }, { "epoch": 1.5939261114589856, "grad_norm": 3.540518045425415, "learning_rate": 3.099354619565217e-05, "loss": 1.2299, "step": 10182 }, { "epoch": 1.594082654978084, "grad_norm": 3.6198132038116455, "learning_rate": 3.098165760869565e-05, "loss": 1.0237, "step": 10183 }, { "epoch": 1.5942391984971822, "grad_norm": 2.266040802001953, "learning_rate": 3.0969769021739126e-05, "loss": 0.3521, "step": 10184 }, { "epoch": 1.5943957420162804, "grad_norm": 4.163845062255859, "learning_rate": 3.0957880434782604e-05, "loss": 0.4997, "step": 10185 }, { "epoch": 1.5945522855353789, "grad_norm": 2.4242148399353027, "learning_rate": 3.094599184782609e-05, "loss": 0.5816, "step": 10186 }, { "epoch": 1.5947088290544773, "grad_norm": 2.4104950428009033, "learning_rate": 3.093410326086957e-05, "loss": 0.6568, "step": 10187 }, { "epoch": 1.5948653725735755, "grad_norm": 4.158421516418457, "learning_rate": 3.092221467391304e-05, "loss": 0.9809, "step": 10188 }, { "epoch": 1.5950219160926737, 
"grad_norm": 0.3869769871234894, "learning_rate": 3.0910326086956516e-05, "loss": 0.2046, "step": 10189 }, { "epoch": 1.595178459611772, "grad_norm": 0.6527525782585144, "learning_rate": 3.0898437499999995e-05, "loss": 0.1539, "step": 10190 }, { "epoch": 1.5953350031308704, "grad_norm": 0.6351661682128906, "learning_rate": 3.088654891304348e-05, "loss": 0.2482, "step": 10191 }, { "epoch": 1.5954915466499688, "grad_norm": 1.0275886058807373, "learning_rate": 3.087466032608696e-05, "loss": 0.2713, "step": 10192 }, { "epoch": 1.595648090169067, "grad_norm": 0.9771100878715515, "learning_rate": 3.086277173913043e-05, "loss": 0.2762, "step": 10193 }, { "epoch": 1.5958046336881653, "grad_norm": 0.688248872756958, "learning_rate": 3.085088315217391e-05, "loss": 0.2418, "step": 10194 }, { "epoch": 1.5959611772072635, "grad_norm": 0.8283999562263489, "learning_rate": 3.0838994565217385e-05, "loss": 0.2707, "step": 10195 }, { "epoch": 1.596117720726362, "grad_norm": 0.6695224046707153, "learning_rate": 3.082710597826087e-05, "loss": 0.2361, "step": 10196 }, { "epoch": 1.5962742642454604, "grad_norm": 1.1160733699798584, "learning_rate": 3.081521739130435e-05, "loss": 0.4393, "step": 10197 }, { "epoch": 1.5964308077645586, "grad_norm": 0.9756506085395813, "learning_rate": 3.080332880434783e-05, "loss": 0.2326, "step": 10198 }, { "epoch": 1.5965873512836568, "grad_norm": 2.026515483856201, "learning_rate": 3.07914402173913e-05, "loss": 0.3735, "step": 10199 }, { "epoch": 1.596743894802755, "grad_norm": 1.4397461414337158, "learning_rate": 3.0779551630434776e-05, "loss": 0.3962, "step": 10200 }, { "epoch": 1.5969004383218535, "grad_norm": 1.3804813623428345, "learning_rate": 3.076766304347826e-05, "loss": 0.2374, "step": 10201 }, { "epoch": 1.597056981840952, "grad_norm": 0.9697756171226501, "learning_rate": 3.075577445652174e-05, "loss": 0.395, "step": 10202 }, { "epoch": 1.5972135253600501, "grad_norm": 0.6578664183616638, "learning_rate": 3.074388586956522e-05, "loss": 
0.3321, "step": 10203 }, { "epoch": 1.5973700688791483, "grad_norm": 0.9863660335540771, "learning_rate": 3.0731997282608696e-05, "loss": 0.2899, "step": 10204 }, { "epoch": 1.5975266123982466, "grad_norm": 2.549471616744995, "learning_rate": 3.072010869565217e-05, "loss": 0.2962, "step": 10205 }, { "epoch": 1.597683155917345, "grad_norm": 1.2478861808776855, "learning_rate": 3.070822010869565e-05, "loss": 0.4824, "step": 10206 }, { "epoch": 1.5978396994364434, "grad_norm": 2.3322713375091553, "learning_rate": 3.069633152173913e-05, "loss": 0.1934, "step": 10207 }, { "epoch": 1.5979962429555417, "grad_norm": 1.4034241437911987, "learning_rate": 3.068444293478261e-05, "loss": 0.5167, "step": 10208 }, { "epoch": 1.5981527864746399, "grad_norm": 4.26704740524292, "learning_rate": 3.0672554347826086e-05, "loss": 0.4765, "step": 10209 }, { "epoch": 1.5983093299937383, "grad_norm": 2.8064260482788086, "learning_rate": 3.0660665760869565e-05, "loss": 0.4081, "step": 10210 }, { "epoch": 1.5984658735128365, "grad_norm": 5.72035026550293, "learning_rate": 3.064877717391304e-05, "loss": 0.4761, "step": 10211 }, { "epoch": 1.598622417031935, "grad_norm": 1.7741906642913818, "learning_rate": 3.063688858695652e-05, "loss": 0.5175, "step": 10212 }, { "epoch": 1.5987789605510332, "grad_norm": 2.7313344478607178, "learning_rate": 3.0625e-05, "loss": 0.3897, "step": 10213 }, { "epoch": 1.5989355040701314, "grad_norm": 4.005735397338867, "learning_rate": 3.061311141304348e-05, "loss": 0.5757, "step": 10214 }, { "epoch": 1.5990920475892298, "grad_norm": 2.2238457202911377, "learning_rate": 3.0601222826086955e-05, "loss": 0.4836, "step": 10215 }, { "epoch": 1.599248591108328, "grad_norm": 3.7250475883483887, "learning_rate": 3.0589334239130433e-05, "loss": 0.4323, "step": 10216 }, { "epoch": 1.5994051346274265, "grad_norm": 2.2984890937805176, "learning_rate": 3.057744565217391e-05, "loss": 0.6367, "step": 10217 }, { "epoch": 1.5995616781465247, "grad_norm": 2.5470023155212402, 
"learning_rate": 3.056555706521739e-05, "loss": 0.5698, "step": 10218 }, { "epoch": 1.599718221665623, "grad_norm": 3.017198085784912, "learning_rate": 3.055366847826087e-05, "loss": 1.028, "step": 10219 }, { "epoch": 1.5998747651847214, "grad_norm": 3.0307071208953857, "learning_rate": 3.0541779891304346e-05, "loss": 0.8379, "step": 10220 }, { "epoch": 1.6000313087038198, "grad_norm": 2.985626697540283, "learning_rate": 3.0529891304347824e-05, "loss": 0.8301, "step": 10221 }, { "epoch": 1.600187852222918, "grad_norm": 1.7766942977905273, "learning_rate": 3.05180027173913e-05, "loss": 0.594, "step": 10222 }, { "epoch": 1.6003443957420163, "grad_norm": 4.469400405883789, "learning_rate": 3.050611413043478e-05, "loss": 1.0512, "step": 10223 }, { "epoch": 1.6005009392611145, "grad_norm": 3.4699413776397705, "learning_rate": 3.049422554347826e-05, "loss": 1.0723, "step": 10224 }, { "epoch": 1.600657482780213, "grad_norm": 3.144070863723755, "learning_rate": 3.0482336956521737e-05, "loss": 0.8699, "step": 10225 }, { "epoch": 1.6008140262993114, "grad_norm": 5.805459499359131, "learning_rate": 3.0470448369565215e-05, "loss": 0.9035, "step": 10226 }, { "epoch": 1.6009705698184096, "grad_norm": 4.251742362976074, "learning_rate": 3.0458559782608693e-05, "loss": 1.1752, "step": 10227 }, { "epoch": 1.6011271133375078, "grad_norm": 2.507666826248169, "learning_rate": 3.044667119565217e-05, "loss": 0.9777, "step": 10228 }, { "epoch": 1.601283656856606, "grad_norm": 2.4764163494110107, "learning_rate": 3.043478260869565e-05, "loss": 0.6821, "step": 10229 }, { "epoch": 1.6014402003757044, "grad_norm": 3.258732557296753, "learning_rate": 3.0422894021739128e-05, "loss": 1.4114, "step": 10230 }, { "epoch": 1.6015967438948029, "grad_norm": 5.182737350463867, "learning_rate": 3.0411005434782606e-05, "loss": 0.8, "step": 10231 }, { "epoch": 1.601753287413901, "grad_norm": 15.411431312561035, "learning_rate": 3.0399116847826084e-05, "loss": 0.4138, "step": 10232 }, { "epoch": 
1.6019098309329993, "grad_norm": 2.8629026412963867, "learning_rate": 3.0387228260869566e-05, "loss": 0.7346, "step": 10233 }, { "epoch": 1.6020663744520975, "grad_norm": 2.995253562927246, "learning_rate": 3.037533967391304e-05, "loss": 0.7103, "step": 10234 }, { "epoch": 1.602222917971196, "grad_norm": 1.2879527807235718, "learning_rate": 3.036345108695652e-05, "loss": 0.2212, "step": 10235 }, { "epoch": 1.6023794614902944, "grad_norm": 3.077224016189575, "learning_rate": 3.0351562499999997e-05, "loss": 0.5605, "step": 10236 }, { "epoch": 1.6025360050093926, "grad_norm": 1.3214764595031738, "learning_rate": 3.0339673913043475e-05, "loss": 0.6492, "step": 10237 }, { "epoch": 1.6026925485284909, "grad_norm": 1.9727133512496948, "learning_rate": 3.0327785326086956e-05, "loss": 1.0271, "step": 10238 }, { "epoch": 1.602849092047589, "grad_norm": 0.7318537831306458, "learning_rate": 3.031589673913043e-05, "loss": 0.2202, "step": 10239 }, { "epoch": 1.6030056355666875, "grad_norm": 0.5969008207321167, "learning_rate": 3.030400815217391e-05, "loss": 0.2283, "step": 10240 }, { "epoch": 1.603162179085786, "grad_norm": 0.6052429676055908, "learning_rate": 3.0292119565217387e-05, "loss": 0.1646, "step": 10241 }, { "epoch": 1.6033187226048842, "grad_norm": 0.7161831259727478, "learning_rate": 3.0280230978260866e-05, "loss": 0.2946, "step": 10242 }, { "epoch": 1.6034752661239824, "grad_norm": 3.389965057373047, "learning_rate": 3.0268342391304347e-05, "loss": 0.2344, "step": 10243 }, { "epoch": 1.6036318096430808, "grad_norm": 0.9649909734725952, "learning_rate": 3.0256453804347825e-05, "loss": 0.4085, "step": 10244 }, { "epoch": 1.603788353162179, "grad_norm": 0.9569277763366699, "learning_rate": 3.02445652173913e-05, "loss": 0.3125, "step": 10245 }, { "epoch": 1.6039448966812775, "grad_norm": 1.6353092193603516, "learning_rate": 3.0232676630434778e-05, "loss": 0.1717, "step": 10246 }, { "epoch": 1.6041014402003757, "grad_norm": 0.9941762685775757, "learning_rate": 
3.0220788043478256e-05, "loss": 0.2547, "step": 10247 }, { "epoch": 1.604257983719474, "grad_norm": 2.214343547821045, "learning_rate": 3.0208899456521738e-05, "loss": 0.3378, "step": 10248 }, { "epoch": 1.6044145272385724, "grad_norm": 0.7824607491493225, "learning_rate": 3.0197010869565216e-05, "loss": 0.2179, "step": 10249 }, { "epoch": 1.6045710707576706, "grad_norm": 0.945197582244873, "learning_rate": 3.0185122282608694e-05, "loss": 0.3319, "step": 10250 }, { "epoch": 1.604727614276769, "grad_norm": 2.882577419281006, "learning_rate": 3.017323369565217e-05, "loss": 0.1494, "step": 10251 }, { "epoch": 1.6048841577958672, "grad_norm": 1.231092095375061, "learning_rate": 3.0161345108695647e-05, "loss": 0.365, "step": 10252 }, { "epoch": 1.6050407013149655, "grad_norm": 1.093896746635437, "learning_rate": 3.014945652173913e-05, "loss": 0.344, "step": 10253 }, { "epoch": 1.605197244834064, "grad_norm": 1.1580668687820435, "learning_rate": 3.0137567934782607e-05, "loss": 0.2584, "step": 10254 }, { "epoch": 1.6053537883531623, "grad_norm": 1.7249300479888916, "learning_rate": 3.0125679347826085e-05, "loss": 0.4752, "step": 10255 }, { "epoch": 1.6055103318722606, "grad_norm": 1.9701441526412964, "learning_rate": 3.0113790760869563e-05, "loss": 0.4297, "step": 10256 }, { "epoch": 1.6056668753913588, "grad_norm": 1.4606975317001343, "learning_rate": 3.0101902173913038e-05, "loss": 0.2383, "step": 10257 }, { "epoch": 1.605823418910457, "grad_norm": 1.7235151529312134, "learning_rate": 3.009001358695652e-05, "loss": 0.4298, "step": 10258 }, { "epoch": 1.6059799624295554, "grad_norm": 1.6146445274353027, "learning_rate": 3.0078124999999998e-05, "loss": 0.56, "step": 10259 }, { "epoch": 1.6061365059486539, "grad_norm": 2.065859079360962, "learning_rate": 3.0066236413043476e-05, "loss": 0.3592, "step": 10260 }, { "epoch": 1.606293049467752, "grad_norm": 1.168227195739746, "learning_rate": 3.0054347826086954e-05, "loss": 0.223, "step": 10261 }, { "epoch": 1.6064495929868503, 
"grad_norm": 1.7729275226593018, "learning_rate": 3.004245923913043e-05, "loss": 0.5772, "step": 10262 }, { "epoch": 1.6066061365059485, "grad_norm": 2.5569205284118652, "learning_rate": 3.003057065217391e-05, "loss": 0.4777, "step": 10263 }, { "epoch": 1.606762680025047, "grad_norm": 1.4024940729141235, "learning_rate": 3.001868206521739e-05, "loss": 0.7603, "step": 10264 }, { "epoch": 1.6069192235441454, "grad_norm": 7.2135772705078125, "learning_rate": 3.0006793478260867e-05, "loss": 0.3626, "step": 10265 }, { "epoch": 1.6070757670632436, "grad_norm": 3.0856900215148926, "learning_rate": 2.9994904891304345e-05, "loss": 0.4442, "step": 10266 }, { "epoch": 1.6072323105823418, "grad_norm": 2.049835681915283, "learning_rate": 2.9983016304347826e-05, "loss": 0.59, "step": 10267 }, { "epoch": 1.60738885410144, "grad_norm": 2.19222354888916, "learning_rate": 2.99711277173913e-05, "loss": 0.5079, "step": 10268 }, { "epoch": 1.6075453976205385, "grad_norm": 1.8559759855270386, "learning_rate": 2.995923913043478e-05, "loss": 0.8347, "step": 10269 }, { "epoch": 1.607701941139637, "grad_norm": 2.9951608180999756, "learning_rate": 2.9947350543478257e-05, "loss": 0.4152, "step": 10270 }, { "epoch": 1.6078584846587352, "grad_norm": 2.2438693046569824, "learning_rate": 2.9935461956521735e-05, "loss": 0.7341, "step": 10271 }, { "epoch": 1.6080150281778334, "grad_norm": 2.2838025093078613, "learning_rate": 2.9923573369565217e-05, "loss": 0.8078, "step": 10272 }, { "epoch": 1.6081715716969316, "grad_norm": 4.406555652618408, "learning_rate": 2.9911684782608695e-05, "loss": 0.7872, "step": 10273 }, { "epoch": 1.60832811521603, "grad_norm": 3.1306111812591553, "learning_rate": 2.989979619565217e-05, "loss": 1.1647, "step": 10274 }, { "epoch": 1.6084846587351285, "grad_norm": 4.09343957901001, "learning_rate": 2.9887907608695648e-05, "loss": 1.2114, "step": 10275 }, { "epoch": 1.6086412022542267, "grad_norm": 3.282919406890869, "learning_rate": 2.9876019021739126e-05, "loss": 1.0429, 
"step": 10276 }, { "epoch": 1.608797745773325, "grad_norm": NaN, "learning_rate": 2.9876019021739126e-05, "loss": 0.0, "step": 10277 }, { "epoch": 1.6089542892924233, "grad_norm": 3.6016249656677246, "learning_rate": 2.9864130434782608e-05, "loss": 0.9644, "step": 10278 }, { "epoch": 1.6091108328115216, "grad_norm": 3.139558792114258, "learning_rate": 2.9852241847826086e-05, "loss": 0.6676, "step": 10279 }, { "epoch": 1.60926737633062, "grad_norm": 3.7344987392425537, "learning_rate": 2.9840353260869564e-05, "loss": 1.1214, "step": 10280 }, { "epoch": 1.6094239198497182, "grad_norm": 1.7893955707550049, "learning_rate": 2.982846467391304e-05, "loss": 0.8696, "step": 10281 }, { "epoch": 1.6095804633688164, "grad_norm": 3.9697279930114746, "learning_rate": 2.9816576086956517e-05, "loss": 0.9658, "step": 10282 }, { "epoch": 1.6097370068879149, "grad_norm": 5.395816802978516, "learning_rate": 2.98046875e-05, "loss": 1.2057, "step": 10283 }, { "epoch": 1.609893550407013, "grad_norm": 3.60418701171875, "learning_rate": 2.9792798913043477e-05, "loss": 0.654, "step": 10284 }, { "epoch": 1.6100500939261115, "grad_norm": 2.160118341445923, "learning_rate": 2.9780910326086955e-05, "loss": 0.6566, "step": 10285 }, { "epoch": 1.6102066374452098, "grad_norm": 4.8225603103637695, "learning_rate": 2.976902173913043e-05, "loss": 0.8327, "step": 10286 }, { "epoch": 1.610363180964308, "grad_norm": 4.724800109863281, "learning_rate": 2.9757133152173908e-05, "loss": 0.9598, "step": 10287 }, { "epoch": 1.6105197244834064, "grad_norm": 3.2878258228302, "learning_rate": 2.974524456521739e-05, "loss": 1.2229, "step": 10288 }, { "epoch": 1.6106762680025049, "grad_norm": 0.4695488512516022, "learning_rate": 2.9733355978260867e-05, "loss": 0.1776, "step": 10289 }, { "epoch": 1.610832811521603, "grad_norm": 0.9242264628410339, "learning_rate": 2.9721467391304346e-05, "loss": 0.3137, "step": 10290 }, { "epoch": 1.6109893550407013, "grad_norm": 0.8924686908721924, "learning_rate": 
2.9709578804347824e-05, "loss": 0.2039, "step": 10291 }, { "epoch": 1.6111458985597995, "grad_norm": 1.8504528999328613, "learning_rate": 2.96976902173913e-05, "loss": 0.1375, "step": 10292 }, { "epoch": 1.611302442078898, "grad_norm": 5.341706275939941, "learning_rate": 2.968580163043478e-05, "loss": 0.2621, "step": 10293 }, { "epoch": 1.6114589855979964, "grad_norm": 1.4071956872940063, "learning_rate": 2.9673913043478258e-05, "loss": 0.2148, "step": 10294 }, { "epoch": 1.6116155291170946, "grad_norm": 0.9284337162971497, "learning_rate": 2.9662024456521736e-05, "loss": 0.2751, "step": 10295 }, { "epoch": 1.6117720726361928, "grad_norm": 2.520979166030884, "learning_rate": 2.9650135869565215e-05, "loss": 0.4292, "step": 10296 }, { "epoch": 1.611928616155291, "grad_norm": 0.7606669664382935, "learning_rate": 2.9638247282608696e-05, "loss": 0.1786, "step": 10297 }, { "epoch": 1.6120851596743895, "grad_norm": 0.7574651837348938, "learning_rate": 2.962635869565217e-05, "loss": 0.3151, "step": 10298 }, { "epoch": 1.612241703193488, "grad_norm": 0.9667022228240967, "learning_rate": 2.961447010869565e-05, "loss": 0.2076, "step": 10299 }, { "epoch": 1.6123982467125861, "grad_norm": 0.9361425042152405, "learning_rate": 2.9602581521739127e-05, "loss": 0.2242, "step": 10300 }, { "epoch": 1.6125547902316844, "grad_norm": 1.0137947797775269, "learning_rate": 2.9590692934782605e-05, "loss": 0.2723, "step": 10301 }, { "epoch": 1.6127113337507826, "grad_norm": 1.086254358291626, "learning_rate": 2.9578804347826087e-05, "loss": 0.1916, "step": 10302 }, { "epoch": 1.612867877269881, "grad_norm": 1.3206911087036133, "learning_rate": 2.9566915760869565e-05, "loss": 0.3225, "step": 10303 }, { "epoch": 1.6130244207889795, "grad_norm": 0.8663286566734314, "learning_rate": 2.955502717391304e-05, "loss": 0.3812, "step": 10304 }, { "epoch": 1.6131809643080777, "grad_norm": 1.7558369636535645, "learning_rate": 2.9543138586956518e-05, "loss": 0.4673, "step": 10305 }, { "epoch": 
1.6133375078271759, "grad_norm": 1.3267284631729126, "learning_rate": 2.9531249999999996e-05, "loss": 0.4418, "step": 10306 }, { "epoch": 1.613494051346274, "grad_norm": 2.517211675643921, "learning_rate": 2.9519361413043478e-05, "loss": 0.6059, "step": 10307 }, { "epoch": 1.6136505948653725, "grad_norm": 1.9619404077529907, "learning_rate": 2.9507472826086956e-05, "loss": 0.6903, "step": 10308 }, { "epoch": 1.613807138384471, "grad_norm": 1.6154149770736694, "learning_rate": 2.949558423913043e-05, "loss": 0.2968, "step": 10309 }, { "epoch": 1.6139636819035692, "grad_norm": 2.3082962036132812, "learning_rate": 2.948369565217391e-05, "loss": 0.4188, "step": 10310 }, { "epoch": 1.6141202254226674, "grad_norm": 1.1944999694824219, "learning_rate": 2.9471807065217387e-05, "loss": 0.3328, "step": 10311 }, { "epoch": 1.6142767689417659, "grad_norm": 3.0266480445861816, "learning_rate": 2.945991847826087e-05, "loss": 0.4444, "step": 10312 }, { "epoch": 1.614433312460864, "grad_norm": 3.9204442501068115, "learning_rate": 2.9448029891304347e-05, "loss": 0.7577, "step": 10313 }, { "epoch": 1.6145898559799625, "grad_norm": 2.9758007526397705, "learning_rate": 2.9436141304347825e-05, "loss": 0.4653, "step": 10314 }, { "epoch": 1.6147463994990607, "grad_norm": 2.489227056503296, "learning_rate": 2.94242527173913e-05, "loss": 0.6699, "step": 10315 }, { "epoch": 1.614902943018159, "grad_norm": 2.6989924907684326, "learning_rate": 2.9412364130434778e-05, "loss": 0.7796, "step": 10316 }, { "epoch": 1.6150594865372574, "grad_norm": 2.919267416000366, "learning_rate": 2.940047554347826e-05, "loss": 0.6812, "step": 10317 }, { "epoch": 1.6152160300563556, "grad_norm": 2.654237985610962, "learning_rate": 2.9388586956521737e-05, "loss": 0.6345, "step": 10318 }, { "epoch": 1.615372573575454, "grad_norm": 2.852341413497925, "learning_rate": 2.9376698369565216e-05, "loss": 0.5976, "step": 10319 }, { "epoch": 1.6155291170945523, "grad_norm": 2.610119104385376, "learning_rate": 
2.9364809782608694e-05, "loss": 0.4921, "step": 10320 }, { "epoch": 1.6156856606136505, "grad_norm": 2.0286285877227783, "learning_rate": 2.935292119565217e-05, "loss": 1.0549, "step": 10321 }, { "epoch": 1.615842204132749, "grad_norm": 2.353771448135376, "learning_rate": 2.934103260869565e-05, "loss": 0.9686, "step": 10322 }, { "epoch": 1.6159987476518474, "grad_norm": 4.995689868927002, "learning_rate": 2.9329144021739128e-05, "loss": 0.725, "step": 10323 }, { "epoch": 1.6161552911709456, "grad_norm": 1.4738601446151733, "learning_rate": 2.9317255434782606e-05, "loss": 0.5272, "step": 10324 }, { "epoch": 1.6163118346900438, "grad_norm": 3.6173043251037598, "learning_rate": 2.9305366847826084e-05, "loss": 0.6395, "step": 10325 }, { "epoch": 1.616468378209142, "grad_norm": 2.476443290710449, "learning_rate": 2.9293478260869566e-05, "loss": 0.6994, "step": 10326 }, { "epoch": 1.6166249217282405, "grad_norm": 2.493025064468384, "learning_rate": 2.928158967391304e-05, "loss": 0.8719, "step": 10327 }, { "epoch": 1.616781465247339, "grad_norm": 5.28809118270874, "learning_rate": 2.926970108695652e-05, "loss": 0.701, "step": 10328 }, { "epoch": 1.6169380087664371, "grad_norm": 2.9136276245117188, "learning_rate": 2.9257812499999997e-05, "loss": 1.1566, "step": 10329 }, { "epoch": 1.6170945522855353, "grad_norm": 2.1390366554260254, "learning_rate": 2.9245923913043475e-05, "loss": 0.5483, "step": 10330 }, { "epoch": 1.6172510958046336, "grad_norm": 2.763434648513794, "learning_rate": 2.9234035326086957e-05, "loss": 1.2784, "step": 10331 }, { "epoch": 1.617407639323732, "grad_norm": 3.0632948875427246, "learning_rate": 2.922214673913043e-05, "loss": 0.7615, "step": 10332 }, { "epoch": 1.6175641828428304, "grad_norm": 4.4093852043151855, "learning_rate": 2.921025815217391e-05, "loss": 1.344, "step": 10333 }, { "epoch": 1.6177207263619287, "grad_norm": 2.7890725135803223, "learning_rate": 2.9198369565217388e-05, "loss": 1.2964, "step": 10334 }, { "epoch": 1.6178772698810269, 
"grad_norm": 2.9742047786712646, "learning_rate": 2.9186480978260866e-05, "loss": 0.6982, "step": 10335 }, { "epoch": 1.618033813400125, "grad_norm": 1.7409400939941406, "learning_rate": 2.9174592391304348e-05, "loss": 0.5901, "step": 10336 }, { "epoch": 1.6181903569192235, "grad_norm": 2.3219149112701416, "learning_rate": 2.9162703804347826e-05, "loss": 0.8469, "step": 10337 }, { "epoch": 1.618346900438322, "grad_norm": 2.0555777549743652, "learning_rate": 2.91508152173913e-05, "loss": 0.7618, "step": 10338 }, { "epoch": 1.6185034439574202, "grad_norm": 1.0614368915557861, "learning_rate": 2.913892663043478e-05, "loss": 0.2164, "step": 10339 }, { "epoch": 1.6186599874765184, "grad_norm": 0.5865556597709656, "learning_rate": 2.9127038043478257e-05, "loss": 0.1787, "step": 10340 }, { "epoch": 1.6188165309956166, "grad_norm": 0.577605664730072, "learning_rate": 2.911514945652174e-05, "loss": 0.2364, "step": 10341 }, { "epoch": 1.618973074514715, "grad_norm": 0.43389904499053955, "learning_rate": 2.9103260869565217e-05, "loss": 0.125, "step": 10342 }, { "epoch": 1.6191296180338135, "grad_norm": 4.719044208526611, "learning_rate": 2.9091372282608695e-05, "loss": 0.2328, "step": 10343 }, { "epoch": 1.6192861615529117, "grad_norm": 1.7882741689682007, "learning_rate": 2.907948369565217e-05, "loss": 0.2278, "step": 10344 }, { "epoch": 1.61944270507201, "grad_norm": 0.682331383228302, "learning_rate": 2.9067595108695648e-05, "loss": 0.1616, "step": 10345 }, { "epoch": 1.6195992485911084, "grad_norm": 1.742249846458435, "learning_rate": 2.905570652173913e-05, "loss": 0.2345, "step": 10346 }, { "epoch": 1.6197557921102066, "grad_norm": 1.2892118692398071, "learning_rate": 2.9043817934782607e-05, "loss": 0.1445, "step": 10347 }, { "epoch": 1.619912335629305, "grad_norm": 1.358806848526001, "learning_rate": 2.9031929347826085e-05, "loss": 0.4013, "step": 10348 }, { "epoch": 1.6200688791484033, "grad_norm": 1.8218224048614502, "learning_rate": 2.9020040760869564e-05, "loss": 
0.5114, "step": 10349 }, { "epoch": 1.6202254226675015, "grad_norm": 0.8478822708129883, "learning_rate": 2.900815217391304e-05, "loss": 0.2211, "step": 10350 }, { "epoch": 1.6203819661866, "grad_norm": 1.1199865341186523, "learning_rate": 2.899626358695652e-05, "loss": 0.3577, "step": 10351 }, { "epoch": 1.6205385097056983, "grad_norm": 1.4021568298339844, "learning_rate": 2.8984374999999998e-05, "loss": 0.1952, "step": 10352 }, { "epoch": 1.6206950532247966, "grad_norm": 1.4578477144241333, "learning_rate": 2.8972486413043476e-05, "loss": 0.2794, "step": 10353 }, { "epoch": 1.6208515967438948, "grad_norm": 1.694918155670166, "learning_rate": 2.8960597826086954e-05, "loss": 0.3375, "step": 10354 }, { "epoch": 1.621008140262993, "grad_norm": 1.7194323539733887, "learning_rate": 2.894870923913043e-05, "loss": 0.6596, "step": 10355 }, { "epoch": 1.6211646837820914, "grad_norm": 2.162846803665161, "learning_rate": 2.893682065217391e-05, "loss": 0.4367, "step": 10356 }, { "epoch": 1.6213212273011899, "grad_norm": 4.0198564529418945, "learning_rate": 2.892493206521739e-05, "loss": 0.55, "step": 10357 }, { "epoch": 1.621477770820288, "grad_norm": 2.0567286014556885, "learning_rate": 2.8913043478260867e-05, "loss": 0.364, "step": 10358 }, { "epoch": 1.6216343143393863, "grad_norm": 1.6938735246658325, "learning_rate": 2.8901154891304345e-05, "loss": 0.4875, "step": 10359 }, { "epoch": 1.6217908578584845, "grad_norm": 2.537714719772339, "learning_rate": 2.8889266304347827e-05, "loss": 0.5302, "step": 10360 }, { "epoch": 1.621947401377583, "grad_norm": 1.8570868968963623, "learning_rate": 2.88773777173913e-05, "loss": 0.4474, "step": 10361 }, { "epoch": 1.6221039448966814, "grad_norm": 4.421291351318359, "learning_rate": 2.886548913043478e-05, "loss": 0.7233, "step": 10362 }, { "epoch": 1.6222604884157796, "grad_norm": 2.2233548164367676, "learning_rate": 2.8853600543478258e-05, "loss": 0.6132, "step": 10363 }, { "epoch": 1.6224170319348779, "grad_norm": 2.1295554637908936, 
"learning_rate": 2.8841711956521736e-05, "loss": 0.5798, "step": 10364 }, { "epoch": 1.622573575453976, "grad_norm": 5.122897148132324, "learning_rate": 2.8829823369565218e-05, "loss": 0.6565, "step": 10365 }, { "epoch": 1.6227301189730745, "grad_norm": 23.106176376342773, "learning_rate": 2.8817934782608696e-05, "loss": 0.8056, "step": 10366 }, { "epoch": 1.622886662492173, "grad_norm": 3.5941684246063232, "learning_rate": 2.880604619565217e-05, "loss": 0.6131, "step": 10367 }, { "epoch": 1.6230432060112712, "grad_norm": 5.339930534362793, "learning_rate": 2.879415760869565e-05, "loss": 0.6873, "step": 10368 }, { "epoch": 1.6231997495303694, "grad_norm": 4.509616851806641, "learning_rate": 2.8782269021739127e-05, "loss": 0.752, "step": 10369 }, { "epoch": 1.6233562930494676, "grad_norm": 1.5968221426010132, "learning_rate": 2.8770380434782608e-05, "loss": 0.3632, "step": 10370 }, { "epoch": 1.623512836568566, "grad_norm": 1.7771170139312744, "learning_rate": 2.8758491847826086e-05, "loss": 0.6243, "step": 10371 }, { "epoch": 1.6236693800876645, "grad_norm": 2.3597044944763184, "learning_rate": 2.8746603260869565e-05, "loss": 0.5419, "step": 10372 }, { "epoch": 1.6238259236067627, "grad_norm": 2.0870397090911865, "learning_rate": 2.873471467391304e-05, "loss": 0.6736, "step": 10373 }, { "epoch": 1.623982467125861, "grad_norm": 1.525732159614563, "learning_rate": 2.8722826086956518e-05, "loss": 0.7283, "step": 10374 }, { "epoch": 1.6241390106449591, "grad_norm": 1.239062786102295, "learning_rate": 2.87109375e-05, "loss": 0.349, "step": 10375 }, { "epoch": 1.6242955541640576, "grad_norm": 3.4177193641662598, "learning_rate": 2.8699048913043477e-05, "loss": 0.8754, "step": 10376 }, { "epoch": 1.624452097683156, "grad_norm": 4.291350841522217, "learning_rate": 2.8687160326086955e-05, "loss": 1.1789, "step": 10377 }, { "epoch": 1.6246086412022542, "grad_norm": 6.326263427734375, "learning_rate": 2.867527173913043e-05, "loss": 0.762, "step": 10378 }, { "epoch": 
1.6247651847213525, "grad_norm": 2.982909679412842, "learning_rate": 2.866338315217391e-05, "loss": 1.1473, "step": 10379 }, { "epoch": 1.624921728240451, "grad_norm": 3.5818231105804443, "learning_rate": 2.865149456521739e-05, "loss": 1.3731, "step": 10380 }, { "epoch": 1.625078271759549, "grad_norm": 1.5522193908691406, "learning_rate": 2.8639605978260868e-05, "loss": 0.5121, "step": 10381 }, { "epoch": 1.6252348152786475, "grad_norm": 6.704400062561035, "learning_rate": 2.8627717391304346e-05, "loss": 0.9165, "step": 10382 }, { "epoch": 1.6253913587977458, "grad_norm": 5.410210609436035, "learning_rate": 2.8615828804347824e-05, "loss": 1.4369, "step": 10383 }, { "epoch": 1.625547902316844, "grad_norm": 8.970086097717285, "learning_rate": 2.86039402173913e-05, "loss": 0.2808, "step": 10384 }, { "epoch": 1.6257044458359424, "grad_norm": 1.449784755706787, "learning_rate": 2.859205163043478e-05, "loss": 0.6225, "step": 10385 }, { "epoch": 1.6258609893550409, "grad_norm": 1.6559580564498901, "learning_rate": 2.858016304347826e-05, "loss": 0.6363, "step": 10386 }, { "epoch": 1.626017532874139, "grad_norm": 1.6959755420684814, "learning_rate": 2.8568274456521737e-05, "loss": 0.437, "step": 10387 }, { "epoch": 1.6261740763932373, "grad_norm": 1.6321216821670532, "learning_rate": 2.8556385869565215e-05, "loss": 0.5049, "step": 10388 }, { "epoch": 1.6263306199123355, "grad_norm": 0.6314437389373779, "learning_rate": 2.8544497282608693e-05, "loss": 0.1964, "step": 10389 }, { "epoch": 1.626487163431434, "grad_norm": 0.3609730303287506, "learning_rate": 2.853260869565217e-05, "loss": 0.1516, "step": 10390 }, { "epoch": 1.6266437069505324, "grad_norm": 0.7578545212745667, "learning_rate": 2.852072010869565e-05, "loss": 0.2027, "step": 10391 }, { "epoch": 1.6268002504696306, "grad_norm": 0.780797004699707, "learning_rate": 2.8508831521739128e-05, "loss": 0.1563, "step": 10392 }, { "epoch": 1.6269567939887288, "grad_norm": 0.9627163410186768, "learning_rate": 
2.8496942934782606e-05, "loss": 0.2775, "step": 10393 }, { "epoch": 1.627113337507827, "grad_norm": 0.5588061213493347, "learning_rate": 2.8485054347826084e-05, "loss": 0.1888, "step": 10394 }, { "epoch": 1.6272698810269255, "grad_norm": 0.5558486580848694, "learning_rate": 2.8473165760869566e-05, "loss": 0.1868, "step": 10395 }, { "epoch": 1.627426424546024, "grad_norm": 4.394417762756348, "learning_rate": 2.846127717391304e-05, "loss": 0.2225, "step": 10396 }, { "epoch": 1.6275829680651221, "grad_norm": 0.8351404070854187, "learning_rate": 2.844938858695652e-05, "loss": 0.1413, "step": 10397 }, { "epoch": 1.6277395115842204, "grad_norm": 1.4360054731369019, "learning_rate": 2.8437499999999997e-05, "loss": 0.3091, "step": 10398 }, { "epoch": 1.6278960551033186, "grad_norm": 1.1164181232452393, "learning_rate": 2.8425611413043475e-05, "loss": 0.2348, "step": 10399 }, { "epoch": 1.628052598622417, "grad_norm": 1.9913181066513062, "learning_rate": 2.8413722826086956e-05, "loss": 0.2064, "step": 10400 }, { "epoch": 1.6282091421415155, "grad_norm": 0.9258219003677368, "learning_rate": 2.840183423913043e-05, "loss": 0.2809, "step": 10401 }, { "epoch": 1.6283656856606137, "grad_norm": 1.556120753288269, "learning_rate": 2.838994565217391e-05, "loss": 0.4552, "step": 10402 }, { "epoch": 1.628522229179712, "grad_norm": 1.665364384651184, "learning_rate": 2.8378057065217387e-05, "loss": 0.328, "step": 10403 }, { "epoch": 1.6286787726988101, "grad_norm": 1.3285293579101562, "learning_rate": 2.8366168478260866e-05, "loss": 0.1569, "step": 10404 }, { "epoch": 1.6288353162179086, "grad_norm": 1.5031616687774658, "learning_rate": 2.8354279891304347e-05, "loss": 0.3078, "step": 10405 }, { "epoch": 1.628991859737007, "grad_norm": 1.2590333223342896, "learning_rate": 2.8342391304347825e-05, "loss": 0.3311, "step": 10406 }, { "epoch": 1.6291484032561052, "grad_norm": 0.9490890502929688, "learning_rate": 2.83305027173913e-05, "loss": 0.2482, "step": 10407 }, { "epoch": 
1.6293049467752034, "grad_norm": 1.3699865341186523, "learning_rate": 2.8318614130434778e-05, "loss": 0.3396, "step": 10408 }, { "epoch": 1.6294614902943017, "grad_norm": 7.584133148193359, "learning_rate": 2.8306725543478256e-05, "loss": 0.532, "step": 10409 }, { "epoch": 1.6296180338134, "grad_norm": 1.1398359537124634, "learning_rate": 2.8294836956521738e-05, "loss": 0.2706, "step": 10410 }, { "epoch": 1.6297745773324985, "grad_norm": 1.0430185794830322, "learning_rate": 2.8282948369565216e-05, "loss": 0.3301, "step": 10411 }, { "epoch": 1.6299311208515967, "grad_norm": 5.148941993713379, "learning_rate": 2.8271059782608694e-05, "loss": 0.7805, "step": 10412 }, { "epoch": 1.630087664370695, "grad_norm": 1.0899221897125244, "learning_rate": 2.825917119565217e-05, "loss": 0.2204, "step": 10413 }, { "epoch": 1.6302442078897934, "grad_norm": 2.2531697750091553, "learning_rate": 2.824728260869565e-05, "loss": 0.4891, "step": 10414 }, { "epoch": 1.6304007514088916, "grad_norm": 4.254968643188477, "learning_rate": 2.823539402173913e-05, "loss": 0.752, "step": 10415 }, { "epoch": 1.63055729492799, "grad_norm": 1.9729714393615723, "learning_rate": 2.8223505434782607e-05, "loss": 0.4246, "step": 10416 }, { "epoch": 1.6307138384470883, "grad_norm": 3.0543391704559326, "learning_rate": 2.8211616847826085e-05, "loss": 0.6074, "step": 10417 }, { "epoch": 1.6308703819661865, "grad_norm": 2.3258683681488037, "learning_rate": 2.8199728260869563e-05, "loss": 0.5883, "step": 10418 }, { "epoch": 1.631026925485285, "grad_norm": 1.9583250284194946, "learning_rate": 2.818783967391304e-05, "loss": 0.5512, "step": 10419 }, { "epoch": 1.6311834690043834, "grad_norm": 2.082174062728882, "learning_rate": 2.817595108695652e-05, "loss": 0.8208, "step": 10420 }, { "epoch": 1.6313400125234816, "grad_norm": 5.424646377563477, "learning_rate": 2.8164062499999998e-05, "loss": 0.8286, "step": 10421 }, { "epoch": 1.6314965560425798, "grad_norm": 4.2019782066345215, "learning_rate": 
2.8152173913043476e-05, "loss": 0.8821, "step": 10422 }, { "epoch": 1.631653099561678, "grad_norm": 2.2681100368499756, "learning_rate": 2.8140285326086954e-05, "loss": 0.6227, "step": 10423 }, { "epoch": 1.6318096430807765, "grad_norm": 2.087195873260498, "learning_rate": 2.8128396739130432e-05, "loss": 0.6629, "step": 10424 }, { "epoch": 1.631966186599875, "grad_norm": 4.500279903411865, "learning_rate": 2.811650815217391e-05, "loss": 0.8073, "step": 10425 }, { "epoch": 1.6321227301189731, "grad_norm": 2.6949143409729004, "learning_rate": 2.810461956521739e-05, "loss": 0.7247, "step": 10426 }, { "epoch": 1.6322792736380713, "grad_norm": 3.027569532394409, "learning_rate": 2.8092730978260867e-05, "loss": 0.8378, "step": 10427 }, { "epoch": 1.6324358171571696, "grad_norm": 4.554258346557617, "learning_rate": 2.8080842391304345e-05, "loss": 1.2026, "step": 10428 }, { "epoch": 1.632592360676268, "grad_norm": 3.159970760345459, "learning_rate": 2.8068953804347826e-05, "loss": 0.8232, "step": 10429 }, { "epoch": 1.6327489041953664, "grad_norm": 6.600109577178955, "learning_rate": 2.80570652173913e-05, "loss": 1.1235, "step": 10430 }, { "epoch": 1.6329054477144647, "grad_norm": 4.580519676208496, "learning_rate": 2.804517663043478e-05, "loss": 0.8563, "step": 10431 }, { "epoch": 1.6330619912335629, "grad_norm": 5.463250160217285, "learning_rate": 2.8033288043478257e-05, "loss": 1.385, "step": 10432 }, { "epoch": 1.633218534752661, "grad_norm": 2.3521640300750732, "learning_rate": 2.8021399456521736e-05, "loss": 1.0426, "step": 10433 }, { "epoch": 1.6333750782717595, "grad_norm": 2.306034564971924, "learning_rate": 2.8009510869565217e-05, "loss": 0.559, "step": 10434 }, { "epoch": 1.633531621790858, "grad_norm": 2.35274600982666, "learning_rate": 2.7997622282608695e-05, "loss": 0.2769, "step": 10435 }, { "epoch": 1.6336881653099562, "grad_norm": 4.557651519775391, "learning_rate": 2.798573369565217e-05, "loss": 0.7747, "step": 10436 }, { "epoch": 1.6338447088290544, 
"grad_norm": 2.517944097518921, "learning_rate": 2.7973845108695648e-05, "loss": 1.303, "step": 10437 }, { "epoch": 1.6340012523481526, "grad_norm": 4.057187557220459, "learning_rate": 2.7961956521739126e-05, "loss": 1.1192, "step": 10438 }, { "epoch": 1.634157795867251, "grad_norm": 0.41990235447883606, "learning_rate": 2.7950067934782608e-05, "loss": 0.1881, "step": 10439 }, { "epoch": 1.6343143393863495, "grad_norm": 0.4345623254776001, "learning_rate": 2.7938179347826086e-05, "loss": 0.1894, "step": 10440 }, { "epoch": 1.6344708829054477, "grad_norm": 0.5339062213897705, "learning_rate": 2.7926290760869564e-05, "loss": 0.2182, "step": 10441 }, { "epoch": 1.634627426424546, "grad_norm": 0.7262791395187378, "learning_rate": 2.791440217391304e-05, "loss": 0.2268, "step": 10442 }, { "epoch": 1.6347839699436444, "grad_norm": 0.44576361775398254, "learning_rate": 2.7902513586956517e-05, "loss": 0.196, "step": 10443 }, { "epoch": 1.6349405134627426, "grad_norm": 0.422979474067688, "learning_rate": 2.7890625e-05, "loss": 0.1603, "step": 10444 }, { "epoch": 1.635097056981841, "grad_norm": 1.1748936176300049, "learning_rate": 2.7878736413043477e-05, "loss": 0.1872, "step": 10445 }, { "epoch": 1.6352536005009393, "grad_norm": 0.5560595989227295, "learning_rate": 2.7866847826086955e-05, "loss": 0.187, "step": 10446 }, { "epoch": 1.6354101440200375, "grad_norm": 1.1714569330215454, "learning_rate": 2.785495923913043e-05, "loss": 0.3226, "step": 10447 }, { "epoch": 1.635566687539136, "grad_norm": 1.0816452503204346, "learning_rate": 2.7843070652173908e-05, "loss": 0.2533, "step": 10448 }, { "epoch": 1.6357232310582341, "grad_norm": 1.1058282852172852, "learning_rate": 2.783118206521739e-05, "loss": 0.2552, "step": 10449 }, { "epoch": 1.6358797745773326, "grad_norm": 1.2919721603393555, "learning_rate": 2.7819293478260868e-05, "loss": 0.3125, "step": 10450 }, { "epoch": 1.6360363180964308, "grad_norm": 1.4999724626541138, "learning_rate": 2.7807404891304346e-05, "loss": 
0.4962, "step": 10451 }, { "epoch": 1.636192861615529, "grad_norm": 1.5981014966964722, "learning_rate": 2.7795516304347824e-05, "loss": 0.2766, "step": 10452 }, { "epoch": 1.6363494051346275, "grad_norm": 1.7297931909561157, "learning_rate": 2.77836277173913e-05, "loss": 0.5759, "step": 10453 }, { "epoch": 1.636505948653726, "grad_norm": 2.2990875244140625, "learning_rate": 2.777173913043478e-05, "loss": 0.338, "step": 10454 }, { "epoch": 1.6366624921728241, "grad_norm": 0.9149208068847656, "learning_rate": 2.775985054347826e-05, "loss": 0.3913, "step": 10455 }, { "epoch": 1.6368190356919223, "grad_norm": 0.9060066342353821, "learning_rate": 2.7747961956521736e-05, "loss": 0.2739, "step": 10456 }, { "epoch": 1.6369755792110205, "grad_norm": 1.3574737310409546, "learning_rate": 2.7736073369565215e-05, "loss": 0.1868, "step": 10457 }, { "epoch": 1.637132122730119, "grad_norm": 3.597076892852783, "learning_rate": 2.7724184782608696e-05, "loss": 0.3833, "step": 10458 }, { "epoch": 1.6372886662492174, "grad_norm": 4.564977169036865, "learning_rate": 2.771229619565217e-05, "loss": 0.6033, "step": 10459 }, { "epoch": 1.6374452097683156, "grad_norm": 2.5846376419067383, "learning_rate": 2.770040760869565e-05, "loss": 0.3847, "step": 10460 }, { "epoch": 1.6376017532874139, "grad_norm": 1.7592990398406982, "learning_rate": 2.7688519021739127e-05, "loss": 0.3318, "step": 10461 }, { "epoch": 1.637758296806512, "grad_norm": 3.467470169067383, "learning_rate": 2.7676630434782605e-05, "loss": 0.478, "step": 10462 }, { "epoch": 1.6379148403256105, "grad_norm": 1.5177438259124756, "learning_rate": 2.7664741847826087e-05, "loss": 0.3756, "step": 10463 }, { "epoch": 1.638071383844709, "grad_norm": 2.152186632156372, "learning_rate": 2.7652853260869565e-05, "loss": 0.5007, "step": 10464 }, { "epoch": 1.6382279273638072, "grad_norm": 2.468698263168335, "learning_rate": 2.764096467391304e-05, "loss": 0.5267, "step": 10465 }, { "epoch": 1.6383844708829054, "grad_norm": 
2.4606146812438965, "learning_rate": 2.7629076086956518e-05, "loss": 0.7025, "step": 10466 }, { "epoch": 1.6385410144020036, "grad_norm": 3.110504388809204, "learning_rate": 2.7617187499999996e-05, "loss": 0.5967, "step": 10467 }, { "epoch": 1.638697557921102, "grad_norm": 1.5573440790176392, "learning_rate": 2.7605298913043478e-05, "loss": 0.3701, "step": 10468 }, { "epoch": 1.6388541014402005, "grad_norm": 3.4309098720550537, "learning_rate": 2.7593410326086956e-05, "loss": 0.8791, "step": 10469 }, { "epoch": 1.6390106449592987, "grad_norm": 1.6484349966049194, "learning_rate": 2.758152173913043e-05, "loss": 0.5614, "step": 10470 }, { "epoch": 1.639167188478397, "grad_norm": 4.918293476104736, "learning_rate": 2.756963315217391e-05, "loss": 0.8486, "step": 10471 }, { "epoch": 1.6393237319974951, "grad_norm": 3.353924036026001, "learning_rate": 2.7557744565217387e-05, "loss": 0.8643, "step": 10472 }, { "epoch": 1.6394802755165936, "grad_norm": 2.9689180850982666, "learning_rate": 2.754585597826087e-05, "loss": 0.9826, "step": 10473 }, { "epoch": 1.639636819035692, "grad_norm": 2.889544725418091, "learning_rate": 2.7533967391304347e-05, "loss": 0.8585, "step": 10474 }, { "epoch": 1.6397933625547902, "grad_norm": 3.541414737701416, "learning_rate": 2.7522078804347825e-05, "loss": 0.9467, "step": 10475 }, { "epoch": 1.6399499060738885, "grad_norm": 2.0724456310272217, "learning_rate": 2.75101902173913e-05, "loss": 0.5491, "step": 10476 }, { "epoch": 1.640106449592987, "grad_norm": 1.8786523342132568, "learning_rate": 2.7498301630434778e-05, "loss": 0.7452, "step": 10477 }, { "epoch": 1.6402629931120851, "grad_norm": 2.922927141189575, "learning_rate": 2.748641304347826e-05, "loss": 1.2425, "step": 10478 }, { "epoch": 1.6404195366311836, "grad_norm": 5.077960968017578, "learning_rate": 2.7474524456521737e-05, "loss": 1.4014, "step": 10479 }, { "epoch": 1.6405760801502818, "grad_norm": 4.78774356842041, "learning_rate": 2.7462635869565216e-05, "loss": 1.195, "step": 
10480 }, { "epoch": 1.64073262366938, "grad_norm": 2.4856929779052734, "learning_rate": 2.7450747282608694e-05, "loss": 0.8198, "step": 10481 }, { "epoch": 1.6408891671884784, "grad_norm": 2.5878684520721436, "learning_rate": 2.743885869565217e-05, "loss": 1.2837, "step": 10482 }, { "epoch": 1.6410457107075767, "grad_norm": 3.352283000946045, "learning_rate": 2.742697010869565e-05, "loss": 1.1489, "step": 10483 }, { "epoch": 1.641202254226675, "grad_norm": 4.340141773223877, "learning_rate": 2.7415081521739128e-05, "loss": 0.9721, "step": 10484 }, { "epoch": 1.6413587977457733, "grad_norm": 3.393965005874634, "learning_rate": 2.7403192934782606e-05, "loss": 0.4986, "step": 10485 }, { "epoch": 1.6415153412648715, "grad_norm": 3.412628650665283, "learning_rate": 2.7391304347826085e-05, "loss": 0.3053, "step": 10486 }, { "epoch": 1.64167188478397, "grad_norm": 4.740467548370361, "learning_rate": 2.7379415760869566e-05, "loss": 1.1947, "step": 10487 }, { "epoch": 1.6418284283030684, "grad_norm": 2.815479040145874, "learning_rate": 2.736752717391304e-05, "loss": 0.9223, "step": 10488 }, { "epoch": 1.6419849718221666, "grad_norm": 0.4443182945251465, "learning_rate": 2.735563858695652e-05, "loss": 0.1396, "step": 10489 }, { "epoch": 1.6421415153412648, "grad_norm": 1.117040991783142, "learning_rate": 2.7343749999999997e-05, "loss": 0.2483, "step": 10490 }, { "epoch": 1.642298058860363, "grad_norm": 0.6746817231178284, "learning_rate": 2.7331861413043475e-05, "loss": 0.3279, "step": 10491 }, { "epoch": 1.6424546023794615, "grad_norm": 1.0678001642227173, "learning_rate": 2.7319972826086957e-05, "loss": 0.2806, "step": 10492 }, { "epoch": 1.64261114589856, "grad_norm": 0.6154839396476746, "learning_rate": 2.730808423913043e-05, "loss": 0.194, "step": 10493 }, { "epoch": 1.6427676894176582, "grad_norm": 0.6484103798866272, "learning_rate": 2.729619565217391e-05, "loss": 0.1681, "step": 10494 }, { "epoch": 1.6429242329367564, "grad_norm": 0.41152942180633545, 
"learning_rate": 2.7284307065217388e-05, "loss": 0.1327, "step": 10495 }, { "epoch": 1.6430807764558546, "grad_norm": 0.48847734928131104, "learning_rate": 2.7272418478260866e-05, "loss": 0.1884, "step": 10496 }, { "epoch": 1.643237319974953, "grad_norm": 1.1195650100708008, "learning_rate": 2.7260529891304348e-05, "loss": 0.3786, "step": 10497 }, { "epoch": 1.6433938634940515, "grad_norm": 0.9228609800338745, "learning_rate": 2.7248641304347826e-05, "loss": 0.2701, "step": 10498 }, { "epoch": 1.6435504070131497, "grad_norm": 0.8404269814491272, "learning_rate": 2.72367527173913e-05, "loss": 0.2377, "step": 10499 }, { "epoch": 1.643706950532248, "grad_norm": 1.2289317846298218, "learning_rate": 2.722486413043478e-05, "loss": 0.4371, "step": 10500 }, { "epoch": 1.6438634940513461, "grad_norm": 0.7209617495536804, "learning_rate": 2.7212975543478257e-05, "loss": 0.2404, "step": 10501 }, { "epoch": 1.6440200375704446, "grad_norm": 1.189362645149231, "learning_rate": 2.720108695652174e-05, "loss": 0.3082, "step": 10502 }, { "epoch": 1.644176581089543, "grad_norm": 2.097329616546631, "learning_rate": 2.7189198369565217e-05, "loss": 0.5813, "step": 10503 }, { "epoch": 1.6443331246086412, "grad_norm": 2.2942392826080322, "learning_rate": 2.7177309782608695e-05, "loss": 0.383, "step": 10504 }, { "epoch": 1.6444896681277394, "grad_norm": 1.5905420780181885, "learning_rate": 2.716542119565217e-05, "loss": 0.445, "step": 10505 }, { "epoch": 1.6446462116468377, "grad_norm": 3.612082004547119, "learning_rate": 2.7153532608695648e-05, "loss": 0.5313, "step": 10506 }, { "epoch": 1.644802755165936, "grad_norm": 2.1428427696228027, "learning_rate": 2.714164402173913e-05, "loss": 0.4556, "step": 10507 }, { "epoch": 1.6449592986850345, "grad_norm": 2.100085973739624, "learning_rate": 2.7129755434782607e-05, "loss": 0.2718, "step": 10508 }, { "epoch": 1.6451158422041328, "grad_norm": 3.182785749435425, "learning_rate": 2.7117866847826086e-05, "loss": 0.4594, "step": 10509 }, { 
"epoch": 1.645272385723231, "grad_norm": 2.626007080078125, "learning_rate": 2.7105978260869564e-05, "loss": 0.7085, "step": 10510 }, { "epoch": 1.6454289292423294, "grad_norm": 1.2909075021743774, "learning_rate": 2.709408967391304e-05, "loss": 0.6505, "step": 10511 }, { "epoch": 1.6455854727614276, "grad_norm": 2.457493305206299, "learning_rate": 2.708220108695652e-05, "loss": 0.4731, "step": 10512 }, { "epoch": 1.645742016280526, "grad_norm": 1.7316431999206543, "learning_rate": 2.7070312499999998e-05, "loss": 0.3642, "step": 10513 }, { "epoch": 1.6458985597996243, "grad_norm": 2.475344181060791, "learning_rate": 2.7058423913043476e-05, "loss": 0.6407, "step": 10514 }, { "epoch": 1.6460551033187225, "grad_norm": 2.908580780029297, "learning_rate": 2.7046535326086954e-05, "loss": 0.5084, "step": 10515 }, { "epoch": 1.646211646837821, "grad_norm": 2.356127977371216, "learning_rate": 2.703464673913043e-05, "loss": 0.6697, "step": 10516 }, { "epoch": 1.6463681903569192, "grad_norm": 1.276840090751648, "learning_rate": 2.702275815217391e-05, "loss": 0.3275, "step": 10517 }, { "epoch": 1.6465247338760176, "grad_norm": 1.542188286781311, "learning_rate": 2.701086956521739e-05, "loss": 0.2551, "step": 10518 }, { "epoch": 1.6466812773951158, "grad_norm": 1.8119176626205444, "learning_rate": 2.6998980978260867e-05, "loss": 0.494, "step": 10519 }, { "epoch": 1.646837820914214, "grad_norm": 2.535614013671875, "learning_rate": 2.6987092391304345e-05, "loss": 0.5336, "step": 10520 }, { "epoch": 1.6469943644333125, "grad_norm": 1.7003751993179321, "learning_rate": 2.6975203804347827e-05, "loss": 0.425, "step": 10521 }, { "epoch": 1.647150907952411, "grad_norm": 2.7873663902282715, "learning_rate": 2.69633152173913e-05, "loss": 0.5538, "step": 10522 }, { "epoch": 1.6473074514715091, "grad_norm": 2.420388698577881, "learning_rate": 2.695142663043478e-05, "loss": 0.8439, "step": 10523 }, { "epoch": 1.6474639949906074, "grad_norm": 4.006801128387451, "learning_rate": 
2.6939538043478258e-05, "loss": 1.0611, "step": 10524 }, { "epoch": 1.6476205385097056, "grad_norm": 1.5988335609436035, "learning_rate": 2.6927649456521736e-05, "loss": 0.5925, "step": 10525 }, { "epoch": 1.647777082028804, "grad_norm": 3.2127201557159424, "learning_rate": 2.6915760869565218e-05, "loss": 0.5015, "step": 10526 }, { "epoch": 1.6479336255479025, "grad_norm": 2.49357271194458, "learning_rate": 2.6903872282608696e-05, "loss": 1.0582, "step": 10527 }, { "epoch": 1.6480901690670007, "grad_norm": 4.398533821105957, "learning_rate": 2.689198369565217e-05, "loss": 1.0959, "step": 10528 }, { "epoch": 1.648246712586099, "grad_norm": 8.697532653808594, "learning_rate": 2.688009510869565e-05, "loss": 0.4699, "step": 10529 }, { "epoch": 1.6484032561051971, "grad_norm": 9.720989227294922, "learning_rate": 2.6868206521739127e-05, "loss": 1.3692, "step": 10530 }, { "epoch": 1.6485597996242956, "grad_norm": 4.682546138763428, "learning_rate": 2.685631793478261e-05, "loss": 1.7899, "step": 10531 }, { "epoch": 1.648716343143394, "grad_norm": 2.944974184036255, "learning_rate": 2.6844429347826087e-05, "loss": 1.1632, "step": 10532 }, { "epoch": 1.6488728866624922, "grad_norm": 2.6808536052703857, "learning_rate": 2.6832540760869565e-05, "loss": 1.341, "step": 10533 }, { "epoch": 1.6490294301815904, "grad_norm": 2.5387823581695557, "learning_rate": 2.682065217391304e-05, "loss": 0.8921, "step": 10534 }, { "epoch": 1.6491859737006886, "grad_norm": 15.558074951171875, "learning_rate": 2.6808763586956518e-05, "loss": 0.5326, "step": 10535 }, { "epoch": 1.649342517219787, "grad_norm": 5.539371490478516, "learning_rate": 2.6796875e-05, "loss": 0.7517, "step": 10536 }, { "epoch": 1.6494990607388855, "grad_norm": 3.091179132461548, "learning_rate": 2.6784986413043477e-05, "loss": 0.6964, "step": 10537 }, { "epoch": 1.6496556042579837, "grad_norm": 2.3371200561523438, "learning_rate": 2.6773097826086955e-05, "loss": 0.823, "step": 10538 }, { "epoch": 1.649812147777082, 
"grad_norm": 0.5764394402503967, "learning_rate": 2.676120923913043e-05, "loss": 0.2069, "step": 10539 }, { "epoch": 1.6499686912961802, "grad_norm": 0.5808513760566711, "learning_rate": 2.674932065217391e-05, "loss": 0.2041, "step": 10540 }, { "epoch": 1.6501252348152786, "grad_norm": 1.0977435111999512, "learning_rate": 2.673743206521739e-05, "loss": 0.1483, "step": 10541 }, { "epoch": 1.650281778334377, "grad_norm": 0.6314653754234314, "learning_rate": 2.6725543478260868e-05, "loss": 0.1816, "step": 10542 }, { "epoch": 1.6504383218534753, "grad_norm": 2.2388358116149902, "learning_rate": 2.6713654891304346e-05, "loss": 0.3015, "step": 10543 }, { "epoch": 1.6505948653725735, "grad_norm": 0.9688872694969177, "learning_rate": 2.6701766304347824e-05, "loss": 0.2459, "step": 10544 }, { "epoch": 1.650751408891672, "grad_norm": 0.7484098076820374, "learning_rate": 2.66898777173913e-05, "loss": 0.2585, "step": 10545 }, { "epoch": 1.6509079524107702, "grad_norm": 0.7973011136054993, "learning_rate": 2.667798913043478e-05, "loss": 0.28, "step": 10546 }, { "epoch": 1.6510644959298686, "grad_norm": 1.15187406539917, "learning_rate": 2.666610054347826e-05, "loss": 0.2077, "step": 10547 }, { "epoch": 1.6512210394489668, "grad_norm": 0.9143211245536804, "learning_rate": 2.6654211956521737e-05, "loss": 0.3131, "step": 10548 }, { "epoch": 1.651377582968065, "grad_norm": 0.8381046056747437, "learning_rate": 2.6642323369565215e-05, "loss": 0.2779, "step": 10549 }, { "epoch": 1.6515341264871635, "grad_norm": 1.4978036880493164, "learning_rate": 2.6630434782608693e-05, "loss": 0.3145, "step": 10550 }, { "epoch": 1.6516906700062617, "grad_norm": 1.6027312278747559, "learning_rate": 2.661854619565217e-05, "loss": 0.3926, "step": 10551 }, { "epoch": 1.6518472135253601, "grad_norm": 0.7835795283317566, "learning_rate": 2.660665760869565e-05, "loss": 0.2667, "step": 10552 }, { "epoch": 1.6520037570444583, "grad_norm": 0.8763027191162109, "learning_rate": 2.6594769021739128e-05, "loss": 
0.3255, "step": 10553 }, { "epoch": 1.6521603005635566, "grad_norm": 1.0255693197250366, "learning_rate": 2.6582880434782606e-05, "loss": 0.2594, "step": 10554 }, { "epoch": 1.652316844082655, "grad_norm": 1.3212610483169556, "learning_rate": 2.6570991847826087e-05, "loss": 0.4131, "step": 10555 }, { "epoch": 1.6524733876017534, "grad_norm": 4.750890731811523, "learning_rate": 2.6559103260869566e-05, "loss": 0.3835, "step": 10556 }, { "epoch": 1.6526299311208517, "grad_norm": 1.3662575483322144, "learning_rate": 2.654721467391304e-05, "loss": 0.3707, "step": 10557 }, { "epoch": 1.6527864746399499, "grad_norm": 1.1991314888000488, "learning_rate": 2.653532608695652e-05, "loss": 0.3445, "step": 10558 }, { "epoch": 1.652943018159048, "grad_norm": 1.032923698425293, "learning_rate": 2.6523437499999997e-05, "loss": 0.4349, "step": 10559 }, { "epoch": 1.6530995616781465, "grad_norm": 1.452370285987854, "learning_rate": 2.6511548913043478e-05, "loss": 0.2735, "step": 10560 }, { "epoch": 1.653256105197245, "grad_norm": 1.7496209144592285, "learning_rate": 2.6499660326086956e-05, "loss": 0.4179, "step": 10561 }, { "epoch": 1.6534126487163432, "grad_norm": 1.2228484153747559, "learning_rate": 2.648777173913043e-05, "loss": 0.3751, "step": 10562 }, { "epoch": 1.6535691922354414, "grad_norm": 1.225647211074829, "learning_rate": 2.647588315217391e-05, "loss": 0.3127, "step": 10563 }, { "epoch": 1.6537257357545396, "grad_norm": 1.4085296392440796, "learning_rate": 2.6463994565217388e-05, "loss": 0.5092, "step": 10564 }, { "epoch": 1.653882279273638, "grad_norm": 1.9288595914840698, "learning_rate": 2.645210597826087e-05, "loss": 0.7825, "step": 10565 }, { "epoch": 1.6540388227927365, "grad_norm": 6.261617660522461, "learning_rate": 2.6440217391304347e-05, "loss": 0.502, "step": 10566 }, { "epoch": 1.6541953663118347, "grad_norm": 1.2058967351913452, "learning_rate": 2.6428328804347825e-05, "loss": 0.4047, "step": 10567 }, { "epoch": 1.654351909830933, "grad_norm": 
2.1454272270202637, "learning_rate": 2.64164402173913e-05, "loss": 0.4896, "step": 10568 }, { "epoch": 1.6545084533500312, "grad_norm": 2.323345899581909, "learning_rate": 2.6404551630434778e-05, "loss": 0.7693, "step": 10569 }, { "epoch": 1.6546649968691296, "grad_norm": 1.9238473176956177, "learning_rate": 2.639266304347826e-05, "loss": 0.4346, "step": 10570 }, { "epoch": 1.654821540388228, "grad_norm": 3.2860591411590576, "learning_rate": 2.6380774456521738e-05, "loss": 0.9345, "step": 10571 }, { "epoch": 1.6549780839073263, "grad_norm": 4.312137603759766, "learning_rate": 2.6368885869565216e-05, "loss": 1.1909, "step": 10572 }, { "epoch": 1.6551346274264245, "grad_norm": 1.8529261350631714, "learning_rate": 2.6356997282608694e-05, "loss": 0.4128, "step": 10573 }, { "epoch": 1.6552911709455227, "grad_norm": 3.293816089630127, "learning_rate": 2.634510869565217e-05, "loss": 1.0663, "step": 10574 }, { "epoch": 1.6554477144646211, "grad_norm": 3.0328075885772705, "learning_rate": 2.633322010869565e-05, "loss": 0.9058, "step": 10575 }, { "epoch": 1.6556042579837196, "grad_norm": 2.7342820167541504, "learning_rate": 2.632133152173913e-05, "loss": 0.6899, "step": 10576 }, { "epoch": 1.6557608015028178, "grad_norm": 12.032238006591797, "learning_rate": 2.6309442934782607e-05, "loss": 1.054, "step": 10577 }, { "epoch": 1.655917345021916, "grad_norm": 6.1105828285217285, "learning_rate": 2.6297554347826085e-05, "loss": 0.9473, "step": 10578 }, { "epoch": 1.6560738885410144, "grad_norm": 6.783019065856934, "learning_rate": 2.6285665760869563e-05, "loss": 1.3771, "step": 10579 }, { "epoch": 1.6562304320601127, "grad_norm": 5.334681987762451, "learning_rate": 2.627377717391304e-05, "loss": 1.5483, "step": 10580 }, { "epoch": 1.656386975579211, "grad_norm": 3.6244170665740967, "learning_rate": 2.626188858695652e-05, "loss": 1.6473, "step": 10581 }, { "epoch": 1.6565435190983093, "grad_norm": 2.3104631900787354, "learning_rate": 2.6249999999999998e-05, "loss": 0.7128, "step": 
10582 }, { "epoch": 1.6567000626174075, "grad_norm": 4.258233547210693, "learning_rate": 2.6238111413043476e-05, "loss": 0.8682, "step": 10583 }, { "epoch": 1.656856606136506, "grad_norm": null, "learning_rate": 2.6238111413043476e-05, "loss": 0.0, "step": 10584 }, { "epoch": 1.6570131496556044, "grad_norm": 4.629364967346191, "learning_rate": 2.6226222826086954e-05, "loss": 0.5511, "step": 10585 }, { "epoch": 1.6571696931747026, "grad_norm": 2.193629741668701, "learning_rate": 2.6214334239130432e-05, "loss": 0.6182, "step": 10586 }, { "epoch": 1.6573262366938009, "grad_norm": 3.2242395877838135, "learning_rate": 2.620244565217391e-05, "loss": 1.0866, "step": 10587 }, { "epoch": 1.657482780212899, "grad_norm": 3.8982062339782715, "learning_rate": 2.619055706521739e-05, "loss": 1.0854, "step": 10588 }, { "epoch": 1.6576393237319975, "grad_norm": 0.8086597919464111, "learning_rate": 2.6178668478260867e-05, "loss": 0.2424, "step": 10589 }, { "epoch": 1.657795867251096, "grad_norm": 0.659435510635376, "learning_rate": 2.6166779891304345e-05, "loss": 0.2161, "step": 10590 }, { "epoch": 1.6579524107701942, "grad_norm": 2.5754611492156982, "learning_rate": 2.6154891304347826e-05, "loss": 0.194, "step": 10591 }, { "epoch": 1.6581089542892924, "grad_norm": 0.8108079433441162, "learning_rate": 2.61430027173913e-05, "loss": 0.2004, "step": 10592 }, { "epoch": 1.6582654978083906, "grad_norm": 0.6342044472694397, "learning_rate": 2.613111413043478e-05, "loss": 0.1996, "step": 10593 }, { "epoch": 1.658422041327489, "grad_norm": 0.9975916147232056, "learning_rate": 2.6119225543478257e-05, "loss": 0.2536, "step": 10594 }, { "epoch": 1.6585785848465875, "grad_norm": 0.7103311419487, "learning_rate": 2.6107336956521736e-05, "loss": 0.31, "step": 10595 }, { "epoch": 1.6587351283656857, "grad_norm": 1.2085292339324951, "learning_rate": 2.6095448369565217e-05, "loss": 0.3428, "step": 10596 }, { "epoch": 1.658891671884784, "grad_norm": 1.5847784280776978, "learning_rate": 
2.6083559782608695e-05, "loss": 0.1601, "step": 10597 }, { "epoch": 1.6590482154038821, "grad_norm": 1.3612124919891357, "learning_rate": 2.607167119565217e-05, "loss": 0.3277, "step": 10598 }, { "epoch": 1.6592047589229806, "grad_norm": 1.2000564336776733, "learning_rate": 2.6059782608695648e-05, "loss": 0.4335, "step": 10599 }, { "epoch": 1.659361302442079, "grad_norm": 1.325197458267212, "learning_rate": 2.6047894021739126e-05, "loss": 0.1612, "step": 10600 }, { "epoch": 1.6595178459611772, "grad_norm": 1.1848549842834473, "learning_rate": 2.6036005434782608e-05, "loss": 0.2916, "step": 10601 }, { "epoch": 1.6596743894802755, "grad_norm": 0.9092533588409424, "learning_rate": 2.6024116847826086e-05, "loss": 0.2955, "step": 10602 }, { "epoch": 1.6598309329993737, "grad_norm": 1.108530879020691, "learning_rate": 2.6012228260869564e-05, "loss": 0.2687, "step": 10603 }, { "epoch": 1.6599874765184721, "grad_norm": 1.8683451414108276, "learning_rate": 2.600033967391304e-05, "loss": 0.4156, "step": 10604 }, { "epoch": 1.6601440200375706, "grad_norm": 1.5472538471221924, "learning_rate": 2.5988451086956517e-05, "loss": 0.5081, "step": 10605 }, { "epoch": 1.6603005635566688, "grad_norm": 1.2705541849136353, "learning_rate": 2.59765625e-05, "loss": 0.2601, "step": 10606 }, { "epoch": 1.660457107075767, "grad_norm": 1.212092638015747, "learning_rate": 2.5964673913043477e-05, "loss": 0.1912, "step": 10607 }, { "epoch": 1.6606136505948652, "grad_norm": 1.221488356590271, "learning_rate": 2.5952785326086955e-05, "loss": 0.2446, "step": 10608 }, { "epoch": 1.6607701941139636, "grad_norm": 0.9251350164413452, "learning_rate": 2.594089673913043e-05, "loss": 0.2296, "step": 10609 }, { "epoch": 1.660926737633062, "grad_norm": 2.2132792472839355, "learning_rate": 2.5929008152173908e-05, "loss": 0.3724, "step": 10610 }, { "epoch": 1.6610832811521603, "grad_norm": 3.110887050628662, "learning_rate": 2.591711956521739e-05, "loss": 0.4372, "step": 10611 }, { "epoch": 1.6612398246712585, 
"grad_norm": 1.9094136953353882, "learning_rate": 2.5905230978260868e-05, "loss": 0.5436, "step": 10612 }, { "epoch": 1.661396368190357, "grad_norm": 3.2388670444488525, "learning_rate": 2.5893342391304346e-05, "loss": 0.4324, "step": 10613 }, { "epoch": 1.6615529117094552, "grad_norm": 1.7248731851577759, "learning_rate": 2.5881453804347824e-05, "loss": 0.4214, "step": 10614 }, { "epoch": 1.6617094552285536, "grad_norm": 1.7111068964004517, "learning_rate": 2.5869565217391302e-05, "loss": 0.5755, "step": 10615 }, { "epoch": 1.6618659987476518, "grad_norm": 11.820337295532227, "learning_rate": 2.585767663043478e-05, "loss": 0.7403, "step": 10616 }, { "epoch": 1.66202254226675, "grad_norm": 2.096576690673828, "learning_rate": 2.584578804347826e-05, "loss": 0.5899, "step": 10617 }, { "epoch": 1.6621790857858485, "grad_norm": 1.748471736907959, "learning_rate": 2.5833899456521737e-05, "loss": 0.6663, "step": 10618 }, { "epoch": 1.662335629304947, "grad_norm": 3.9472241401672363, "learning_rate": 2.5822010869565215e-05, "loss": 0.8864, "step": 10619 }, { "epoch": 1.6624921728240452, "grad_norm": 2.244412899017334, "learning_rate": 2.5810122282608696e-05, "loss": 0.9988, "step": 10620 }, { "epoch": 1.6626487163431434, "grad_norm": 1.904506802558899, "learning_rate": 2.579823369565217e-05, "loss": 0.3799, "step": 10621 }, { "epoch": 1.6628052598622416, "grad_norm": 2.2049386501312256, "learning_rate": 2.578634510869565e-05, "loss": 0.5956, "step": 10622 }, { "epoch": 1.66296180338134, "grad_norm": 3.705679178237915, "learning_rate": 2.5774456521739127e-05, "loss": 0.7892, "step": 10623 }, { "epoch": 1.6631183469004385, "grad_norm": 3.68975830078125, "learning_rate": 2.5762567934782605e-05, "loss": 0.8771, "step": 10624 }, { "epoch": 1.6632748904195367, "grad_norm": 2.817445993423462, "learning_rate": 2.5750679347826087e-05, "loss": 0.9852, "step": 10625 }, { "epoch": 1.663431433938635, "grad_norm": 2.856666326522827, "learning_rate": 2.5738790760869565e-05, "loss": 
0.5056, "step": 10626 }, { "epoch": 1.6635879774577331, "grad_norm": 2.5012221336364746, "learning_rate": 2.572690217391304e-05, "loss": 0.8358, "step": 10627 }, { "epoch": 1.6637445209768316, "grad_norm": 5.745575904846191, "learning_rate": 2.5715013586956518e-05, "loss": 0.9756, "step": 10628 }, { "epoch": 1.66390106449593, "grad_norm": 2.5506885051727295, "learning_rate": 2.5703124999999996e-05, "loss": 0.9878, "step": 10629 }, { "epoch": 1.6640576080150282, "grad_norm": 3.7202234268188477, "learning_rate": 2.5691236413043478e-05, "loss": 1.1645, "step": 10630 }, { "epoch": 1.6642141515341264, "grad_norm": 3.9010982513427734, "learning_rate": 2.5679347826086956e-05, "loss": 1.7516, "step": 10631 }, { "epoch": 1.6643706950532247, "grad_norm": 3.7006075382232666, "learning_rate": 2.566745923913043e-05, "loss": 1.1169, "step": 10632 }, { "epoch": 1.664527238572323, "grad_norm": 1.0442136526107788, "learning_rate": 2.565557065217391e-05, "loss": 0.8701, "step": 10633 }, { "epoch": 1.6646837820914215, "grad_norm": 3.7403178215026855, "learning_rate": 2.5643682065217387e-05, "loss": 1.1012, "step": 10634 }, { "epoch": 1.6648403256105198, "grad_norm": 3.83508563041687, "learning_rate": 2.563179347826087e-05, "loss": 0.7593, "step": 10635 }, { "epoch": 1.664996869129618, "grad_norm": 3.6616249084472656, "learning_rate": 2.5619904891304347e-05, "loss": 0.8555, "step": 10636 }, { "epoch": 1.6651534126487162, "grad_norm": 2.6965181827545166, "learning_rate": 2.5608016304347825e-05, "loss": 0.9149, "step": 10637 }, { "epoch": 1.6653099561678146, "grad_norm": 4.038667678833008, "learning_rate": 2.55961277173913e-05, "loss": 1.1411, "step": 10638 }, { "epoch": 1.665466499686913, "grad_norm": 1.5572214126586914, "learning_rate": 2.5584239130434778e-05, "loss": 0.6189, "step": 10639 }, { "epoch": 1.6656230432060113, "grad_norm": 7.843084812164307, "learning_rate": 2.557235054347826e-05, "loss": 0.2061, "step": 10640 }, { "epoch": 1.6657795867251095, "grad_norm": 
0.6900447010993958, "learning_rate": 2.5560461956521738e-05, "loss": 0.2895, "step": 10641 }, { "epoch": 1.6659361302442077, "grad_norm": 0.9190594553947449, "learning_rate": 2.5548573369565216e-05, "loss": 0.2718, "step": 10642 }, { "epoch": 1.6660926737633062, "grad_norm": 1.4758033752441406, "learning_rate": 2.5536684782608694e-05, "loss": 0.3343, "step": 10643 }, { "epoch": 1.6662492172824046, "grad_norm": 1.369628667831421, "learning_rate": 2.552479619565217e-05, "loss": 0.3276, "step": 10644 }, { "epoch": 1.6664057608015028, "grad_norm": 0.8480128645896912, "learning_rate": 2.551290760869565e-05, "loss": 0.1735, "step": 10645 }, { "epoch": 1.666562304320601, "grad_norm": 1.023101806640625, "learning_rate": 2.550101902173913e-05, "loss": 0.2829, "step": 10646 }, { "epoch": 1.6667188478396995, "grad_norm": 0.7629166841506958, "learning_rate": 2.5489130434782606e-05, "loss": 0.2504, "step": 10647 }, { "epoch": 1.6668753913587977, "grad_norm": 2.240100860595703, "learning_rate": 2.5477241847826085e-05, "loss": 0.3577, "step": 10648 }, { "epoch": 1.6670319348778961, "grad_norm": 1.448101282119751, "learning_rate": 2.5465353260869566e-05, "loss": 0.3296, "step": 10649 }, { "epoch": 1.6671884783969944, "grad_norm": 1.6884651184082031, "learning_rate": 2.545346467391304e-05, "loss": 0.3834, "step": 10650 }, { "epoch": 1.6673450219160926, "grad_norm": 2.5480291843414307, "learning_rate": 2.544157608695652e-05, "loss": 0.2794, "step": 10651 }, { "epoch": 1.667501565435191, "grad_norm": 0.9674702882766724, "learning_rate": 2.5429687499999997e-05, "loss": 0.3845, "step": 10652 }, { "epoch": 1.6676581089542895, "grad_norm": 1.6979548931121826, "learning_rate": 2.5417798913043475e-05, "loss": 0.5469, "step": 10653 }, { "epoch": 1.6678146524733877, "grad_norm": 1.6252363920211792, "learning_rate": 2.5405910326086957e-05, "loss": 0.381, "step": 10654 }, { "epoch": 1.6679711959924859, "grad_norm": 1.2029447555541992, "learning_rate": 2.5394021739130432e-05, "loss": 0.3985, 
"step": 10655 }, { "epoch": 1.668127739511584, "grad_norm": 2.3818511962890625, "learning_rate": 2.538213315217391e-05, "loss": 0.4081, "step": 10656 }, { "epoch": 1.6682842830306825, "grad_norm": 1.084537148475647, "learning_rate": 2.5370244565217388e-05, "loss": 0.323, "step": 10657 }, { "epoch": 1.668440826549781, "grad_norm": 1.896524429321289, "learning_rate": 2.5358355978260866e-05, "loss": 0.6483, "step": 10658 }, { "epoch": 1.6685973700688792, "grad_norm": 9.950224876403809, "learning_rate": 2.5346467391304348e-05, "loss": 1.3579, "step": 10659 }, { "epoch": 1.6687539135879774, "grad_norm": 2.0742077827453613, "learning_rate": 2.5334578804347826e-05, "loss": 0.6281, "step": 10660 }, { "epoch": 1.6689104571070756, "grad_norm": 1.7064248323440552, "learning_rate": 2.53226902173913e-05, "loss": 0.4295, "step": 10661 }, { "epoch": 1.669067000626174, "grad_norm": 1.993630290031433, "learning_rate": 2.531080163043478e-05, "loss": 0.3934, "step": 10662 }, { "epoch": 1.6692235441452725, "grad_norm": 2.219187021255493, "learning_rate": 2.5298913043478257e-05, "loss": 0.3374, "step": 10663 }, { "epoch": 1.6693800876643707, "grad_norm": 3.9741082191467285, "learning_rate": 2.528702445652174e-05, "loss": 1.118, "step": 10664 }, { "epoch": 1.669536631183469, "grad_norm": 3.648329973220825, "learning_rate": 2.5275135869565217e-05, "loss": 0.6385, "step": 10665 }, { "epoch": 1.6696931747025672, "grad_norm": 1.8728373050689697, "learning_rate": 2.5263247282608695e-05, "loss": 0.4588, "step": 10666 }, { "epoch": 1.6698497182216656, "grad_norm": 1.9624192714691162, "learning_rate": 2.525135869565217e-05, "loss": 0.3008, "step": 10667 }, { "epoch": 1.670006261740764, "grad_norm": 3.029412031173706, "learning_rate": 2.5239470108695648e-05, "loss": 0.4481, "step": 10668 }, { "epoch": 1.6701628052598623, "grad_norm": 1.4095947742462158, "learning_rate": 2.522758152173913e-05, "loss": 0.3561, "step": 10669 }, { "epoch": 1.6703193487789605, "grad_norm": 3.1851418018341064, 
"learning_rate": 2.5215692934782607e-05, "loss": 0.791, "step": 10670 }, { "epoch": 1.6704758922980587, "grad_norm": 2.6509413719177246, "learning_rate": 2.5203804347826086e-05, "loss": 0.5045, "step": 10671 }, { "epoch": 1.6706324358171571, "grad_norm": 3.452700138092041, "learning_rate": 2.5191915760869564e-05, "loss": 0.5438, "step": 10672 }, { "epoch": 1.6707889793362556, "grad_norm": 4.178023815155029, "learning_rate": 2.518002717391304e-05, "loss": 0.7574, "step": 10673 }, { "epoch": 1.6709455228553538, "grad_norm": 2.8459458351135254, "learning_rate": 2.516813858695652e-05, "loss": 0.9843, "step": 10674 }, { "epoch": 1.671102066374452, "grad_norm": 2.5416831970214844, "learning_rate": 2.5156249999999998e-05, "loss": 0.4601, "step": 10675 }, { "epoch": 1.6712586098935505, "grad_norm": 3.5111000537872314, "learning_rate": 2.5144361413043476e-05, "loss": 0.7372, "step": 10676 }, { "epoch": 1.6714151534126487, "grad_norm": 4.0461015701293945, "learning_rate": 2.5132472826086955e-05, "loss": 0.6769, "step": 10677 }, { "epoch": 1.6715716969317471, "grad_norm": 2.90340518951416, "learning_rate": 2.512058423913043e-05, "loss": 1.3494, "step": 10678 }, { "epoch": 1.6717282404508453, "grad_norm": 2.1869072914123535, "learning_rate": 2.510869565217391e-05, "loss": 0.6334, "step": 10679 }, { "epoch": 1.6718847839699436, "grad_norm": 3.040008068084717, "learning_rate": 2.509680706521739e-05, "loss": 0.9531, "step": 10680 }, { "epoch": 1.672041327489042, "grad_norm": 2.804685115814209, "learning_rate": 2.5084918478260867e-05, "loss": 0.5777, "step": 10681 }, { "epoch": 1.6721978710081402, "grad_norm": 2.5585744380950928, "learning_rate": 2.5073029891304345e-05, "loss": 0.9473, "step": 10682 }, { "epoch": 1.6723544145272387, "grad_norm": 3.646481513977051, "learning_rate": 2.5061141304347827e-05, "loss": 1.2854, "step": 10683 }, { "epoch": 1.6725109580463369, "grad_norm": 3.151638984680176, "learning_rate": 2.50492527173913e-05, "loss": 0.7049, "step": 10684 }, { "epoch": 
1.672667501565435, "grad_norm": 4.119843006134033, "learning_rate": 2.503736413043478e-05, "loss": 0.9096, "step": 10685 }, { "epoch": 1.6728240450845335, "grad_norm": 2.663921594619751, "learning_rate": 2.5025475543478258e-05, "loss": 0.282, "step": 10686 }, { "epoch": 1.672980588603632, "grad_norm": 2.6105241775512695, "learning_rate": 2.5013586956521736e-05, "loss": 0.6618, "step": 10687 }, { "epoch": 1.6731371321227302, "grad_norm": 2.2810683250427246, "learning_rate": 2.5001698369565218e-05, "loss": 0.8866, "step": 10688 }, { "epoch": 1.6732936756418284, "grad_norm": 0.46508610248565674, "learning_rate": 2.4989809782608696e-05, "loss": 0.195, "step": 10689 }, { "epoch": 1.6734502191609266, "grad_norm": 0.7426146864891052, "learning_rate": 2.497792119565217e-05, "loss": 0.1611, "step": 10690 }, { "epoch": 1.673606762680025, "grad_norm": 0.9316383004188538, "learning_rate": 2.496603260869565e-05, "loss": 0.242, "step": 10691 }, { "epoch": 1.6737633061991235, "grad_norm": 0.6193244457244873, "learning_rate": 2.4954144021739127e-05, "loss": 0.2019, "step": 10692 }, { "epoch": 1.6739198497182217, "grad_norm": 0.7786237597465515, "learning_rate": 2.494225543478261e-05, "loss": 0.1922, "step": 10693 }, { "epoch": 1.67407639323732, "grad_norm": 0.9873241186141968, "learning_rate": 2.4930366847826087e-05, "loss": 0.2082, "step": 10694 }, { "epoch": 1.6742329367564182, "grad_norm": 0.9605923295021057, "learning_rate": 2.4918478260869565e-05, "loss": 0.2977, "step": 10695 }, { "epoch": 1.6743894802755166, "grad_norm": 0.6898202300071716, "learning_rate": 2.490658967391304e-05, "loss": 0.2745, "step": 10696 }, { "epoch": 1.674546023794615, "grad_norm": 2.076580286026001, "learning_rate": 2.4894701086956518e-05, "loss": 0.4138, "step": 10697 }, { "epoch": 1.6747025673137133, "grad_norm": 1.1588647365570068, "learning_rate": 2.48828125e-05, "loss": 0.3573, "step": 10698 }, { "epoch": 1.6748591108328115, "grad_norm": 1.1615736484527588, "learning_rate": 
2.4870923913043477e-05, "loss": 0.2287, "step": 10699 }, { "epoch": 1.6750156543519097, "grad_norm": 1.5446629524230957, "learning_rate": 2.4859035326086956e-05, "loss": 0.3598, "step": 10700 }, { "epoch": 1.6751721978710081, "grad_norm": 2.1849608421325684, "learning_rate": 2.484714673913043e-05, "loss": 0.3585, "step": 10701 }, { "epoch": 1.6753287413901066, "grad_norm": 0.6762170195579529, "learning_rate": 2.483525815217391e-05, "loss": 0.2273, "step": 10702 }, { "epoch": 1.6754852849092048, "grad_norm": 1.165895700454712, "learning_rate": 2.482336956521739e-05, "loss": 0.5389, "step": 10703 }, { "epoch": 1.675641828428303, "grad_norm": 1.2346985340118408, "learning_rate": 2.4811480978260868e-05, "loss": 0.3526, "step": 10704 }, { "epoch": 1.6757983719474012, "grad_norm": 4.08147668838501, "learning_rate": 2.4799592391304346e-05, "loss": 0.3803, "step": 10705 }, { "epoch": 1.6759549154664997, "grad_norm": 9.625836372375488, "learning_rate": 2.4787703804347824e-05, "loss": 0.4391, "step": 10706 }, { "epoch": 1.676111458985598, "grad_norm": 0.9085838794708252, "learning_rate": 2.47758152173913e-05, "loss": 0.2196, "step": 10707 }, { "epoch": 1.6762680025046963, "grad_norm": 1.6417495012283325, "learning_rate": 2.476392663043478e-05, "loss": 0.379, "step": 10708 }, { "epoch": 1.6764245460237945, "grad_norm": 1.3206959962844849, "learning_rate": 2.475203804347826e-05, "loss": 0.1769, "step": 10709 }, { "epoch": 1.676581089542893, "grad_norm": 2.472099542617798, "learning_rate": 2.4740149456521737e-05, "loss": 0.3635, "step": 10710 }, { "epoch": 1.6767376330619912, "grad_norm": 1.3867287635803223, "learning_rate": 2.4728260869565215e-05, "loss": 0.3013, "step": 10711 }, { "epoch": 1.6768941765810896, "grad_norm": 2.42145037651062, "learning_rate": 2.4716372282608697e-05, "loss": 0.3393, "step": 10712 }, { "epoch": 1.6770507201001879, "grad_norm": 3.9459080696105957, "learning_rate": 2.470448369565217e-05, "loss": 0.7543, "step": 10713 }, { "epoch": 1.677207263619286, 
"grad_norm": 1.774423599243164, "learning_rate": 2.469259510869565e-05, "loss": 0.3821, "step": 10714 }, { "epoch": 1.6773638071383845, "grad_norm": 1.778343677520752, "learning_rate": 2.4680706521739128e-05, "loss": 0.8475, "step": 10715 }, { "epoch": 1.6775203506574827, "grad_norm": 3.8205137252807617, "learning_rate": 2.4668817934782606e-05, "loss": 0.5962, "step": 10716 }, { "epoch": 1.6776768941765812, "grad_norm": 3.0486762523651123, "learning_rate": 2.4656929347826088e-05, "loss": 0.5371, "step": 10717 }, { "epoch": 1.6778334376956794, "grad_norm": 4.22676420211792, "learning_rate": 2.4645040760869566e-05, "loss": 0.6746, "step": 10718 }, { "epoch": 1.6779899812147776, "grad_norm": 2.385401964187622, "learning_rate": 2.463315217391304e-05, "loss": 0.6982, "step": 10719 }, { "epoch": 1.678146524733876, "grad_norm": 2.0121428966522217, "learning_rate": 2.462126358695652e-05, "loss": 0.6769, "step": 10720 }, { "epoch": 1.6783030682529745, "grad_norm": 5.593378067016602, "learning_rate": 2.4609374999999997e-05, "loss": 1.0315, "step": 10721 }, { "epoch": 1.6784596117720727, "grad_norm": 6.943414688110352, "learning_rate": 2.459748641304348e-05, "loss": 0.5675, "step": 10722 }, { "epoch": 1.678616155291171, "grad_norm": 4.537980556488037, "learning_rate": 2.4585597826086956e-05, "loss": 0.5832, "step": 10723 }, { "epoch": 1.6787726988102691, "grad_norm": 5.894924163818359, "learning_rate": 2.457370923913043e-05, "loss": 0.8348, "step": 10724 }, { "epoch": 1.6789292423293676, "grad_norm": 2.666269302368164, "learning_rate": 2.456182065217391e-05, "loss": 1.231, "step": 10725 }, { "epoch": 1.679085785848466, "grad_norm": 5.458833694458008, "learning_rate": 2.4549932065217388e-05, "loss": 1.2503, "step": 10726 }, { "epoch": 1.6792423293675642, "grad_norm": 2.87387752532959, "learning_rate": 2.453804347826087e-05, "loss": 0.5831, "step": 10727 }, { "epoch": 1.6793988728866625, "grad_norm": 2.495779514312744, "learning_rate": 2.4526154891304347e-05, "loss": 0.6605, 
"step": 10728 }, { "epoch": 1.6795554164057607, "grad_norm": 4.150745391845703, "learning_rate": 2.4514266304347825e-05, "loss": 0.6683, "step": 10729 }, { "epoch": 1.679711959924859, "grad_norm": 2.5458133220672607, "learning_rate": 2.45023777173913e-05, "loss": 0.7146, "step": 10730 }, { "epoch": 1.6798685034439576, "grad_norm": 5.24901819229126, "learning_rate": 2.449048913043478e-05, "loss": 1.5867, "step": 10731 }, { "epoch": 1.6800250469630558, "grad_norm": 3.0826878547668457, "learning_rate": 2.447860054347826e-05, "loss": 0.444, "step": 10732 }, { "epoch": 1.680181590482154, "grad_norm": 3.1218619346618652, "learning_rate": 2.4466711956521738e-05, "loss": 1.2717, "step": 10733 }, { "epoch": 1.6803381340012522, "grad_norm": 7.702990531921387, "learning_rate": 2.4454823369565216e-05, "loss": 0.8183, "step": 10734 }, { "epoch": 1.6804946775203506, "grad_norm": 2.4709761142730713, "learning_rate": 2.4442934782608694e-05, "loss": 0.3452, "step": 10735 }, { "epoch": 1.680651221039449, "grad_norm": 3.1449382305145264, "learning_rate": 2.443104619565217e-05, "loss": 0.9981, "step": 10736 }, { "epoch": 1.6808077645585473, "grad_norm": 2.2492716312408447, "learning_rate": 2.441915760869565e-05, "loss": 0.5922, "step": 10737 }, { "epoch": 1.6809643080776455, "grad_norm": 2.7484242916107178, "learning_rate": 2.440726902173913e-05, "loss": 0.775, "step": 10738 }, { "epoch": 1.6811208515967437, "grad_norm": 1.4362150430679321, "learning_rate": 2.4395380434782607e-05, "loss": 0.2535, "step": 10739 }, { "epoch": 1.6812773951158422, "grad_norm": 0.5896499156951904, "learning_rate": 2.4383491847826085e-05, "loss": 0.2143, "step": 10740 }, { "epoch": 1.6814339386349406, "grad_norm": 0.6959473490715027, "learning_rate": 2.4371603260869563e-05, "loss": 0.1867, "step": 10741 }, { "epoch": 1.6815904821540388, "grad_norm": 0.9570159912109375, "learning_rate": 2.435971467391304e-05, "loss": 0.2157, "step": 10742 }, { "epoch": 1.681747025673137, "grad_norm": 1.478625774383545, 
"learning_rate": 2.434782608695652e-05, "loss": 0.3055, "step": 10743 }, { "epoch": 1.6819035691922355, "grad_norm": 0.7018715143203735, "learning_rate": 2.4335937499999998e-05, "loss": 0.2132, "step": 10744 }, { "epoch": 1.6820601127113337, "grad_norm": 1.0878819227218628, "learning_rate": 2.4324048913043476e-05, "loss": 0.3381, "step": 10745 }, { "epoch": 1.6822166562304322, "grad_norm": 0.9246894121170044, "learning_rate": 2.4312160326086954e-05, "loss": 0.3216, "step": 10746 }, { "epoch": 1.6823731997495304, "grad_norm": 1.4946147203445435, "learning_rate": 2.4300271739130432e-05, "loss": 0.3478, "step": 10747 }, { "epoch": 1.6825297432686286, "grad_norm": 1.1981205940246582, "learning_rate": 2.428838315217391e-05, "loss": 0.2248, "step": 10748 }, { "epoch": 1.682686286787727, "grad_norm": 0.8280710577964783, "learning_rate": 2.427649456521739e-05, "loss": 0.1914, "step": 10749 }, { "epoch": 1.6828428303068252, "grad_norm": 1.0530157089233398, "learning_rate": 2.4264605978260867e-05, "loss": 0.2738, "step": 10750 }, { "epoch": 1.6829993738259237, "grad_norm": 1.1087888479232788, "learning_rate": 2.4252717391304345e-05, "loss": 0.3199, "step": 10751 }, { "epoch": 1.683155917345022, "grad_norm": 2.348078727722168, "learning_rate": 2.4240828804347826e-05, "loss": 0.4189, "step": 10752 }, { "epoch": 1.6833124608641201, "grad_norm": 1.3935095071792603, "learning_rate": 2.42289402173913e-05, "loss": 0.3132, "step": 10753 }, { "epoch": 1.6834690043832186, "grad_norm": 1.260053277015686, "learning_rate": 2.421705163043478e-05, "loss": 0.4455, "step": 10754 }, { "epoch": 1.683625547902317, "grad_norm": 2.062098503112793, "learning_rate": 2.4205163043478257e-05, "loss": 0.5286, "step": 10755 }, { "epoch": 1.6837820914214152, "grad_norm": 1.2811813354492188, "learning_rate": 2.419327445652174e-05, "loss": 0.3007, "step": 10756 }, { "epoch": 1.6839386349405134, "grad_norm": 2.674208164215088, "learning_rate": 2.4181385869565217e-05, "loss": 0.3559, "step": 10757 }, { 
"epoch": 1.6840951784596117, "grad_norm": 1.780931830406189, "learning_rate": 2.4169497282608695e-05, "loss": 0.4493, "step": 10758 }, { "epoch": 1.68425172197871, "grad_norm": 2.2463183403015137, "learning_rate": 2.415760869565217e-05, "loss": 0.766, "step": 10759 }, { "epoch": 1.6844082654978085, "grad_norm": 1.8443766832351685, "learning_rate": 2.4145720108695648e-05, "loss": 0.4562, "step": 10760 }, { "epoch": 1.6845648090169068, "grad_norm": 1.6424840688705444, "learning_rate": 2.413383152173913e-05, "loss": 0.4203, "step": 10761 }, { "epoch": 1.684721352536005, "grad_norm": 2.2627930641174316, "learning_rate": 2.4121942934782608e-05, "loss": 0.4471, "step": 10762 }, { "epoch": 1.6848778960551032, "grad_norm": 1.6345069408416748, "learning_rate": 2.4110054347826086e-05, "loss": 0.3264, "step": 10763 }, { "epoch": 1.6850344395742016, "grad_norm": 2.573805570602417, "learning_rate": 2.4098165760869564e-05, "loss": 0.7635, "step": 10764 }, { "epoch": 1.6851909830933, "grad_norm": 3.1504669189453125, "learning_rate": 2.408627717391304e-05, "loss": 0.5641, "step": 10765 }, { "epoch": 1.6853475266123983, "grad_norm": 3.163479804992676, "learning_rate": 2.407438858695652e-05, "loss": 0.5596, "step": 10766 }, { "epoch": 1.6855040701314965, "grad_norm": 2.370175838470459, "learning_rate": 2.40625e-05, "loss": 0.7092, "step": 10767 }, { "epoch": 1.6856606136505947, "grad_norm": 3.278265953063965, "learning_rate": 2.4050611413043477e-05, "loss": 0.7874, "step": 10768 }, { "epoch": 1.6858171571696932, "grad_norm": 2.9037094116210938, "learning_rate": 2.4038722826086955e-05, "loss": 0.31, "step": 10769 }, { "epoch": 1.6859737006887916, "grad_norm": 2.2983288764953613, "learning_rate": 2.402683423913043e-05, "loss": 0.6386, "step": 10770 }, { "epoch": 1.6861302442078898, "grad_norm": 7.492010593414307, "learning_rate": 2.401494565217391e-05, "loss": 0.9814, "step": 10771 }, { "epoch": 1.686286787726988, "grad_norm": 4.512316703796387, "learning_rate": 2.400305706521739e-05, 
"loss": 1.0199, "step": 10772 }, { "epoch": 1.6864433312460863, "grad_norm": 3.621886730194092, "learning_rate": 2.3991168478260868e-05, "loss": 1.0497, "step": 10773 }, { "epoch": 1.6865998747651847, "grad_norm": 3.2825610637664795, "learning_rate": 2.3979279891304346e-05, "loss": 0.7169, "step": 10774 }, { "epoch": 1.6867564182842831, "grad_norm": 2.574028491973877, "learning_rate": 2.3967391304347824e-05, "loss": 0.9707, "step": 10775 }, { "epoch": 1.6869129618033814, "grad_norm": 3.308969259262085, "learning_rate": 2.3955502717391302e-05, "loss": 1.2453, "step": 10776 }, { "epoch": 1.6870695053224796, "grad_norm": 4.3224687576293945, "learning_rate": 2.394361413043478e-05, "loss": 1.072, "step": 10777 }, { "epoch": 1.687226048841578, "grad_norm": 2.125545024871826, "learning_rate": 2.393172554347826e-05, "loss": 0.9693, "step": 10778 }, { "epoch": 1.6873825923606762, "grad_norm": 4.255035877227783, "learning_rate": 2.3919836956521737e-05, "loss": 0.8587, "step": 10779 }, { "epoch": 1.6875391358797747, "grad_norm": 8.170124053955078, "learning_rate": 2.3907948369565215e-05, "loss": 0.8991, "step": 10780 }, { "epoch": 1.6876956793988729, "grad_norm": 2.965085744857788, "learning_rate": 2.3896059782608696e-05, "loss": 0.6758, "step": 10781 }, { "epoch": 1.687852222917971, "grad_norm": 8.440535545349121, "learning_rate": 2.388417119565217e-05, "loss": 1.1914, "step": 10782 }, { "epoch": 1.6880087664370695, "grad_norm": 2.7989468574523926, "learning_rate": 2.387228260869565e-05, "loss": 0.9149, "step": 10783 }, { "epoch": 1.688165309956168, "grad_norm": 1.1804900169372559, "learning_rate": 2.3860394021739127e-05, "loss": 0.4554, "step": 10784 }, { "epoch": 1.6883218534752662, "grad_norm": 3.115361213684082, "learning_rate": 2.3848505434782606e-05, "loss": 0.8704, "step": 10785 }, { "epoch": 1.6884783969943644, "grad_norm": 1.715522289276123, "learning_rate": 2.3836616847826087e-05, "loss": 0.5497, "step": 10786 }, { "epoch": 1.6886349405134626, "grad_norm": 
2.3457939624786377, "learning_rate": 2.3824728260869565e-05, "loss": 0.5444, "step": 10787 }, { "epoch": 1.688791484032561, "grad_norm": 3.591726303100586, "learning_rate": 2.381283967391304e-05, "loss": 1.3374, "step": 10788 }, { "epoch": 1.6889480275516595, "grad_norm": 1.2686960697174072, "learning_rate": 2.3800951086956518e-05, "loss": 0.264, "step": 10789 }, { "epoch": 1.6891045710707577, "grad_norm": 0.6093018054962158, "learning_rate": 2.3789062499999996e-05, "loss": 0.2633, "step": 10790 }, { "epoch": 1.689261114589856, "grad_norm": 0.5650773644447327, "learning_rate": 2.3777173913043478e-05, "loss": 0.1785, "step": 10791 }, { "epoch": 1.6894176581089542, "grad_norm": 1.0226354598999023, "learning_rate": 2.3765285326086956e-05, "loss": 0.1849, "step": 10792 }, { "epoch": 1.6895742016280526, "grad_norm": 0.7639710307121277, "learning_rate": 2.375339673913043e-05, "loss": 0.2537, "step": 10793 }, { "epoch": 1.689730745147151, "grad_norm": 1.5107357501983643, "learning_rate": 2.374150815217391e-05, "loss": 0.3487, "step": 10794 }, { "epoch": 1.6898872886662493, "grad_norm": 1.2483105659484863, "learning_rate": 2.3729619565217387e-05, "loss": 0.2973, "step": 10795 }, { "epoch": 1.6900438321853475, "grad_norm": 1.1993328332901, "learning_rate": 2.371773097826087e-05, "loss": 0.359, "step": 10796 }, { "epoch": 1.6902003757044457, "grad_norm": 1.2846691608428955, "learning_rate": 2.3705842391304347e-05, "loss": 0.2602, "step": 10797 }, { "epoch": 1.6903569192235441, "grad_norm": 0.5089133381843567, "learning_rate": 2.3693953804347825e-05, "loss": 0.3062, "step": 10798 }, { "epoch": 1.6905134627426426, "grad_norm": 1.9532452821731567, "learning_rate": 2.36820652173913e-05, "loss": 0.3188, "step": 10799 }, { "epoch": 1.6906700062617408, "grad_norm": 2.273158073425293, "learning_rate": 2.3670176630434778e-05, "loss": 0.2704, "step": 10800 }, { "epoch": 1.690826549780839, "grad_norm": 1.1985983848571777, "learning_rate": 2.365828804347826e-05, "loss": 0.4176, "step": 
10801 }, { "epoch": 1.6909830932999372, "grad_norm": 2.1570427417755127, "learning_rate": 2.3646399456521738e-05, "loss": 0.2963, "step": 10802 }, { "epoch": 1.6911396368190357, "grad_norm": 1.287315011024475, "learning_rate": 2.3634510869565216e-05, "loss": 0.3337, "step": 10803 }, { "epoch": 1.6912961803381341, "grad_norm": 1.0306700468063354, "learning_rate": 2.3622622282608694e-05, "loss": 0.3328, "step": 10804 }, { "epoch": 1.6914527238572323, "grad_norm": 2.111396074295044, "learning_rate": 2.361073369565217e-05, "loss": 0.3983, "step": 10805 }, { "epoch": 1.6916092673763305, "grad_norm": 2.261899471282959, "learning_rate": 2.359884510869565e-05, "loss": 0.3102, "step": 10806 }, { "epoch": 1.6917658108954288, "grad_norm": 1.8814866542816162, "learning_rate": 2.358695652173913e-05, "loss": 0.3028, "step": 10807 }, { "epoch": 1.6919223544145272, "grad_norm": 3.601728916168213, "learning_rate": 2.3575067934782607e-05, "loss": 0.6307, "step": 10808 }, { "epoch": 1.6920788979336256, "grad_norm": 1.77829909324646, "learning_rate": 2.3563179347826085e-05, "loss": 0.3923, "step": 10809 }, { "epoch": 1.6922354414527239, "grad_norm": 6.893371105194092, "learning_rate": 2.3551290760869566e-05, "loss": 0.6999, "step": 10810 }, { "epoch": 1.692391984971822, "grad_norm": 4.3846516609191895, "learning_rate": 2.353940217391304e-05, "loss": 0.8029, "step": 10811 }, { "epoch": 1.6925485284909205, "grad_norm": 2.585505962371826, "learning_rate": 2.352751358695652e-05, "loss": 0.7309, "step": 10812 }, { "epoch": 1.6927050720100187, "grad_norm": 2.968914031982422, "learning_rate": 2.3515624999999997e-05, "loss": 0.6189, "step": 10813 }, { "epoch": 1.6928616155291172, "grad_norm": 2.726740598678589, "learning_rate": 2.3503736413043475e-05, "loss": 0.6321, "step": 10814 }, { "epoch": 1.6930181590482154, "grad_norm": 2.451282024383545, "learning_rate": 2.3491847826086957e-05, "loss": 0.5794, "step": 10815 }, { "epoch": 1.6931747025673136, "grad_norm": 2.210481882095337, 
"learning_rate": 2.3479959239130432e-05, "loss": 0.7766, "step": 10816 }, { "epoch": 1.693331246086412, "grad_norm": 3.049813985824585, "learning_rate": 2.346807065217391e-05, "loss": 0.5986, "step": 10817 }, { "epoch": 1.6934877896055105, "grad_norm": 0.9628913402557373, "learning_rate": 2.3456182065217388e-05, "loss": 0.1516, "step": 10818 }, { "epoch": 1.6936443331246087, "grad_norm": 3.573599338531494, "learning_rate": 2.3444293478260866e-05, "loss": 0.8356, "step": 10819 }, { "epoch": 1.693800876643707, "grad_norm": 8.818928718566895, "learning_rate": 2.3432404891304348e-05, "loss": 0.7328, "step": 10820 }, { "epoch": 1.6939574201628051, "grad_norm": 2.473158836364746, "learning_rate": 2.3420516304347826e-05, "loss": 0.7498, "step": 10821 }, { "epoch": 1.6941139636819036, "grad_norm": 3.8829739093780518, "learning_rate": 2.34086277173913e-05, "loss": 0.4566, "step": 10822 }, { "epoch": 1.694270507201002, "grad_norm": 2.1909923553466797, "learning_rate": 2.339673913043478e-05, "loss": 0.5833, "step": 10823 }, { "epoch": 1.6944270507201002, "grad_norm": 3.492260217666626, "learning_rate": 2.3384850543478257e-05, "loss": 0.6553, "step": 10824 }, { "epoch": 1.6945835942391985, "grad_norm": 3.3822567462921143, "learning_rate": 2.337296195652174e-05, "loss": 1.4855, "step": 10825 }, { "epoch": 1.6947401377582967, "grad_norm": 2.7644288539886475, "learning_rate": 2.3361073369565217e-05, "loss": 1.0357, "step": 10826 }, { "epoch": 1.6948966812773951, "grad_norm": 4.917076587677002, "learning_rate": 2.3349184782608695e-05, "loss": 1.0476, "step": 10827 }, { "epoch": 1.6950532247964936, "grad_norm": 4.415504455566406, "learning_rate": 2.333729619565217e-05, "loss": 1.0347, "step": 10828 }, { "epoch": 1.6952097683155918, "grad_norm": 4.729331970214844, "learning_rate": 2.3325407608695648e-05, "loss": 0.6898, "step": 10829 }, { "epoch": 1.69536631183469, "grad_norm": 2.224867105484009, "learning_rate": 2.331351902173913e-05, "loss": 0.5502, "step": 10830 }, { "epoch": 
1.6955228553537882, "grad_norm": 4.24811315536499, "learning_rate": 2.3301630434782608e-05, "loss": 1.0422, "step": 10831 }, { "epoch": 1.6956793988728867, "grad_norm": 2.545135736465454, "learning_rate": 2.3289741847826086e-05, "loss": 1.1123, "step": 10832 }, { "epoch": 1.695835942391985, "grad_norm": 3.0824027061462402, "learning_rate": 2.3277853260869564e-05, "loss": 1.1057, "step": 10833 }, { "epoch": 1.6959924859110833, "grad_norm": 3.8684308528900146, "learning_rate": 2.326596467391304e-05, "loss": 1.1266, "step": 10834 }, { "epoch": 1.6961490294301815, "grad_norm": 2.2309792041778564, "learning_rate": 2.325407608695652e-05, "loss": 0.729, "step": 10835 }, { "epoch": 1.6963055729492797, "grad_norm": 2.868328809738159, "learning_rate": 2.3242187499999998e-05, "loss": 0.5072, "step": 10836 }, { "epoch": 1.6964621164683782, "grad_norm": 2.09637188911438, "learning_rate": 2.3230298913043476e-05, "loss": 0.6531, "step": 10837 }, { "epoch": 1.6966186599874766, "grad_norm": 2.105530023574829, "learning_rate": 2.3218410326086955e-05, "loss": 0.7242, "step": 10838 }, { "epoch": 1.6967752035065748, "grad_norm": 0.7734873294830322, "learning_rate": 2.320652173913043e-05, "loss": 0.2205, "step": 10839 }, { "epoch": 1.696931747025673, "grad_norm": 0.5399865508079529, "learning_rate": 2.319463315217391e-05, "loss": 0.2422, "step": 10840 }, { "epoch": 1.6970882905447713, "grad_norm": 1.8416098356246948, "learning_rate": 2.318274456521739e-05, "loss": 0.2674, "step": 10841 }, { "epoch": 1.6972448340638697, "grad_norm": 0.8559640645980835, "learning_rate": 2.3170855978260867e-05, "loss": 0.2369, "step": 10842 }, { "epoch": 1.6974013775829682, "grad_norm": 0.6832001209259033, "learning_rate": 2.3158967391304345e-05, "loss": 0.1827, "step": 10843 }, { "epoch": 1.6975579211020664, "grad_norm": 1.0042650699615479, "learning_rate": 2.3147078804347827e-05, "loss": 0.231, "step": 10844 }, { "epoch": 1.6977144646211646, "grad_norm": 0.6778267025947571, "learning_rate": 
2.31351902173913e-05, "loss": 0.2094, "step": 10845 }, { "epoch": 1.697871008140263, "grad_norm": 2.452240467071533, "learning_rate": 2.312330163043478e-05, "loss": 0.2114, "step": 10846 }, { "epoch": 1.6980275516593613, "grad_norm": 1.4504586458206177, "learning_rate": 2.3111413043478258e-05, "loss": 0.2893, "step": 10847 }, { "epoch": 1.6981840951784597, "grad_norm": 1.3921871185302734, "learning_rate": 2.3099524456521736e-05, "loss": 0.3724, "step": 10848 }, { "epoch": 1.698340638697558, "grad_norm": 0.901400625705719, "learning_rate": 2.3087635869565218e-05, "loss": 0.1313, "step": 10849 }, { "epoch": 1.6984971822166561, "grad_norm": 1.3710230588912964, "learning_rate": 2.3075747282608696e-05, "loss": 0.2168, "step": 10850 }, { "epoch": 1.6986537257357546, "grad_norm": 2.030353546142578, "learning_rate": 2.306385869565217e-05, "loss": 0.3765, "step": 10851 }, { "epoch": 1.698810269254853, "grad_norm": 4.237790107727051, "learning_rate": 2.305197010869565e-05, "loss": 0.4833, "step": 10852 }, { "epoch": 1.6989668127739512, "grad_norm": 1.2575849294662476, "learning_rate": 2.3040081521739127e-05, "loss": 0.3958, "step": 10853 }, { "epoch": 1.6991233562930494, "grad_norm": 2.1291310787200928, "learning_rate": 2.302819293478261e-05, "loss": 0.2605, "step": 10854 }, { "epoch": 1.6992798998121477, "grad_norm": 1.1534584760665894, "learning_rate": 2.3016304347826087e-05, "loss": 0.3751, "step": 10855 }, { "epoch": 1.699436443331246, "grad_norm": 4.103093147277832, "learning_rate": 2.3004415760869565e-05, "loss": 0.6018, "step": 10856 }, { "epoch": 1.6995929868503445, "grad_norm": 2.32681941986084, "learning_rate": 2.299252717391304e-05, "loss": 0.3487, "step": 10857 }, { "epoch": 1.6997495303694428, "grad_norm": 1.29725182056427, "learning_rate": 2.2980638586956518e-05, "loss": 0.3266, "step": 10858 }, { "epoch": 1.699906073888541, "grad_norm": 1.2457268238067627, "learning_rate": 2.296875e-05, "loss": 0.3156, "step": 10859 }, { "epoch": 1.7000626174076392, 
"grad_norm": 1.89055597782135, "learning_rate": 2.2956861413043477e-05, "loss": 0.3763, "step": 10860 }, { "epoch": 1.7002191609267376, "grad_norm": 1.6400425434112549, "learning_rate": 2.2944972826086956e-05, "loss": 0.3863, "step": 10861 }, { "epoch": 1.700375704445836, "grad_norm": 20.13998794555664, "learning_rate": 2.293308423913043e-05, "loss": 0.4198, "step": 10862 }, { "epoch": 1.7005322479649343, "grad_norm": 3.9182729721069336, "learning_rate": 2.292119565217391e-05, "loss": 0.7749, "step": 10863 }, { "epoch": 1.7006887914840325, "grad_norm": 2.363445520401001, "learning_rate": 2.290930706521739e-05, "loss": 0.5247, "step": 10864 }, { "epoch": 1.7008453350031307, "grad_norm": 1.3386287689208984, "learning_rate": 2.2897418478260868e-05, "loss": 0.2924, "step": 10865 }, { "epoch": 1.7010018785222292, "grad_norm": 3.197828769683838, "learning_rate": 2.2885529891304346e-05, "loss": 0.5443, "step": 10866 }, { "epoch": 1.7011584220413276, "grad_norm": 1.5751068592071533, "learning_rate": 2.2873641304347825e-05, "loss": 0.3914, "step": 10867 }, { "epoch": 1.7013149655604258, "grad_norm": 2.824615001678467, "learning_rate": 2.28617527173913e-05, "loss": 0.7144, "step": 10868 }, { "epoch": 1.701471509079524, "grad_norm": 2.7708849906921387, "learning_rate": 2.284986413043478e-05, "loss": 0.7476, "step": 10869 }, { "epoch": 1.7016280525986223, "grad_norm": 10.164655685424805, "learning_rate": 2.283797554347826e-05, "loss": 0.6474, "step": 10870 }, { "epoch": 1.7017845961177207, "grad_norm": 5.105922698974609, "learning_rate": 2.2826086956521737e-05, "loss": 0.6903, "step": 10871 }, { "epoch": 1.7019411396368191, "grad_norm": 3.600689649581909, "learning_rate": 2.2814198369565215e-05, "loss": 0.3688, "step": 10872 }, { "epoch": 1.7020976831559174, "grad_norm": 2.777726411819458, "learning_rate": 2.2802309782608697e-05, "loss": 0.5923, "step": 10873 }, { "epoch": 1.7022542266750156, "grad_norm": 2.3126089572906494, "learning_rate": 2.279042119565217e-05, "loss": 
1.2279, "step": 10874 }, { "epoch": 1.7024107701941138, "grad_norm": 1.8390530347824097, "learning_rate": 2.277853260869565e-05, "loss": 0.7158, "step": 10875 }, { "epoch": 1.7025673137132122, "grad_norm": 4.306124210357666, "learning_rate": 2.2766644021739128e-05, "loss": 1.2416, "step": 10876 }, { "epoch": 1.7027238572323107, "grad_norm": 3.6386687755584717, "learning_rate": 2.2754755434782606e-05, "loss": 0.8912, "step": 10877 }, { "epoch": 1.702880400751409, "grad_norm": 2.9149842262268066, "learning_rate": 2.2742866847826088e-05, "loss": 0.7166, "step": 10878 }, { "epoch": 1.7030369442705071, "grad_norm": 3.5276412963867188, "learning_rate": 2.2730978260869566e-05, "loss": 1.1525, "step": 10879 }, { "epoch": 1.7031934877896056, "grad_norm": 3.162618398666382, "learning_rate": 2.271908967391304e-05, "loss": 0.7752, "step": 10880 }, { "epoch": 1.7033500313087038, "grad_norm": 2.7941863536834717, "learning_rate": 2.270720108695652e-05, "loss": 0.8308, "step": 10881 }, { "epoch": 1.7035065748278022, "grad_norm": 6.1531147956848145, "learning_rate": 2.2695312499999997e-05, "loss": 1.4293, "step": 10882 }, { "epoch": 1.7036631183469004, "grad_norm": 4.611825466156006, "learning_rate": 2.268342391304348e-05, "loss": 0.9405, "step": 10883 }, { "epoch": 1.7038196618659986, "grad_norm": 3.6336472034454346, "learning_rate": 2.2671535326086957e-05, "loss": 1.063, "step": 10884 }, { "epoch": 1.703976205385097, "grad_norm": 4.221210479736328, "learning_rate": 2.265964673913043e-05, "loss": 0.8, "step": 10885 }, { "epoch": 1.7041327489041955, "grad_norm": 2.114530324935913, "learning_rate": 2.264775815217391e-05, "loss": 0.9146, "step": 10886 }, { "epoch": 1.7042892924232937, "grad_norm": 4.908699035644531, "learning_rate": 2.2635869565217388e-05, "loss": 1.3599, "step": 10887 }, { "epoch": 1.704445835942392, "grad_norm": 1.8204169273376465, "learning_rate": 2.262398097826087e-05, "loss": 0.6168, "step": 10888 }, { "epoch": 1.7046023794614902, "grad_norm": 1.110382318496704, 
"learning_rate": 2.2612092391304347e-05, "loss": 0.3376, "step": 10889 }, { "epoch": 1.7047589229805886, "grad_norm": 0.8984686732292175, "learning_rate": 2.2600203804347825e-05, "loss": 0.2366, "step": 10890 }, { "epoch": 1.704915466499687, "grad_norm": 1.2503798007965088, "learning_rate": 2.25883152173913e-05, "loss": 0.2618, "step": 10891 }, { "epoch": 1.7050720100187853, "grad_norm": 1.0248969793319702, "learning_rate": 2.257642663043478e-05, "loss": 0.3119, "step": 10892 }, { "epoch": 1.7052285535378835, "grad_norm": 1.7354310750961304, "learning_rate": 2.256453804347826e-05, "loss": 0.2307, "step": 10893 }, { "epoch": 1.7053850970569817, "grad_norm": 1.0854952335357666, "learning_rate": 2.2552649456521738e-05, "loss": 0.2369, "step": 10894 }, { "epoch": 1.7055416405760802, "grad_norm": 1.5646404027938843, "learning_rate": 2.2540760869565216e-05, "loss": 0.2467, "step": 10895 }, { "epoch": 1.7056981840951786, "grad_norm": 1.3851131200790405, "learning_rate": 2.2528872282608694e-05, "loss": 0.2511, "step": 10896 }, { "epoch": 1.7058547276142768, "grad_norm": 5.395506858825684, "learning_rate": 2.251698369565217e-05, "loss": 0.2779, "step": 10897 }, { "epoch": 1.706011271133375, "grad_norm": 0.6132009625434875, "learning_rate": 2.250509510869565e-05, "loss": 0.2331, "step": 10898 }, { "epoch": 1.7061678146524732, "grad_norm": 1.0484957695007324, "learning_rate": 2.249320652173913e-05, "loss": 0.2277, "step": 10899 }, { "epoch": 1.7063243581715717, "grad_norm": 0.9627106785774231, "learning_rate": 2.2481317934782607e-05, "loss": 0.2178, "step": 10900 }, { "epoch": 1.7064809016906701, "grad_norm": 0.9697428941726685, "learning_rate": 2.2469429347826085e-05, "loss": 0.3389, "step": 10901 }, { "epoch": 1.7066374452097683, "grad_norm": 2.225573778152466, "learning_rate": 2.2457540760869567e-05, "loss": 0.6454, "step": 10902 }, { "epoch": 1.7067939887288666, "grad_norm": 1.0080353021621704, "learning_rate": 2.244565217391304e-05, "loss": 0.4061, "step": 10903 }, { 
"epoch": 1.7069505322479648, "grad_norm": 2.070333242416382, "learning_rate": 2.243376358695652e-05, "loss": 0.5954, "step": 10904 }, { "epoch": 1.7071070757670632, "grad_norm": 0.8788936138153076, "learning_rate": 2.2421874999999998e-05, "loss": 0.2436, "step": 10905 }, { "epoch": 1.7072636192861617, "grad_norm": 2.3308560848236084, "learning_rate": 2.2409986413043476e-05, "loss": 0.3762, "step": 10906 }, { "epoch": 1.7074201628052599, "grad_norm": 1.565144419670105, "learning_rate": 2.2398097826086958e-05, "loss": 0.4269, "step": 10907 }, { "epoch": 1.707576706324358, "grad_norm": 1.5374361276626587, "learning_rate": 2.2386209239130432e-05, "loss": 0.3243, "step": 10908 }, { "epoch": 1.7077332498434565, "grad_norm": 2.7596075534820557, "learning_rate": 2.237432065217391e-05, "loss": 0.4637, "step": 10909 }, { "epoch": 1.7078897933625548, "grad_norm": 1.0490702390670776, "learning_rate": 2.236243206521739e-05, "loss": 0.4906, "step": 10910 }, { "epoch": 1.7080463368816532, "grad_norm": 0.662324070930481, "learning_rate": 2.2350543478260867e-05, "loss": 0.2636, "step": 10911 }, { "epoch": 1.7082028804007514, "grad_norm": 1.4810850620269775, "learning_rate": 2.2338654891304348e-05, "loss": 0.3845, "step": 10912 }, { "epoch": 1.7083594239198496, "grad_norm": 4.559952735900879, "learning_rate": 2.2326766304347826e-05, "loss": 0.6395, "step": 10913 }, { "epoch": 1.708515967438948, "grad_norm": 2.3411929607391357, "learning_rate": 2.23148777173913e-05, "loss": 0.4692, "step": 10914 }, { "epoch": 1.7086725109580463, "grad_norm": 3.9152982234954834, "learning_rate": 2.230298913043478e-05, "loss": 0.4632, "step": 10915 }, { "epoch": 1.7088290544771447, "grad_norm": 6.087344646453857, "learning_rate": 2.2291100543478258e-05, "loss": 0.7052, "step": 10916 }, { "epoch": 1.708985597996243, "grad_norm": 2.742807626724243, "learning_rate": 2.227921195652174e-05, "loss": 1.2197, "step": 10917 }, { "epoch": 1.7091421415153412, "grad_norm": 3.3528554439544678, "learning_rate": 
2.2267323369565217e-05, "loss": 0.6542, "step": 10918 }, { "epoch": 1.7092986850344396, "grad_norm": 2.0085678100585938, "learning_rate": 2.2255434782608695e-05, "loss": 0.3959, "step": 10919 }, { "epoch": 1.709455228553538, "grad_norm": 4.539134979248047, "learning_rate": 2.224354619565217e-05, "loss": 0.545, "step": 10920 }, { "epoch": 1.7096117720726363, "grad_norm": 3.751234292984009, "learning_rate": 2.223165760869565e-05, "loss": 0.2797, "step": 10921 }, { "epoch": 1.7097683155917345, "grad_norm": 2.396747589111328, "learning_rate": 2.221976902173913e-05, "loss": 0.8854, "step": 10922 }, { "epoch": 1.7099248591108327, "grad_norm": 4.701972961425781, "learning_rate": 2.2207880434782608e-05, "loss": 0.8415, "step": 10923 }, { "epoch": 1.7100814026299311, "grad_norm": 3.6583893299102783, "learning_rate": 2.2195991847826086e-05, "loss": 0.6876, "step": 10924 }, { "epoch": 1.7102379461490296, "grad_norm": 5.119818210601807, "learning_rate": 2.2184103260869564e-05, "loss": 0.9538, "step": 10925 }, { "epoch": 1.7103944896681278, "grad_norm": 2.7652084827423096, "learning_rate": 2.217221467391304e-05, "loss": 0.7833, "step": 10926 }, { "epoch": 1.710551033187226, "grad_norm": 5.224816799163818, "learning_rate": 2.216032608695652e-05, "loss": 0.832, "step": 10927 }, { "epoch": 1.7107075767063242, "grad_norm": 2.90120530128479, "learning_rate": 2.21484375e-05, "loss": 1.0625, "step": 10928 }, { "epoch": 1.7108641202254227, "grad_norm": 4.205680847167969, "learning_rate": 2.2136548913043477e-05, "loss": 1.3404, "step": 10929 }, { "epoch": 1.711020663744521, "grad_norm": 2.2649853229522705, "learning_rate": 2.2124660326086955e-05, "loss": 0.8807, "step": 10930 }, { "epoch": 1.7111772072636193, "grad_norm": 3.5063376426696777, "learning_rate": 2.211277173913043e-05, "loss": 0.7132, "step": 10931 }, { "epoch": 1.7113337507827175, "grad_norm": 2.648847818374634, "learning_rate": 2.210088315217391e-05, "loss": 1.6041, "step": 10932 }, { "epoch": 1.7114902943018158, 
"grad_norm": 3.2677602767944336, "learning_rate": 2.208899456521739e-05, "loss": 1.118, "step": 10933 }, { "epoch": 1.7116468378209142, "grad_norm": 2.5513927936553955, "learning_rate": 2.2077105978260868e-05, "loss": 0.9811, "step": 10934 }, { "epoch": 1.7118033813400126, "grad_norm": 2.274359941482544, "learning_rate": 2.2065217391304346e-05, "loss": 0.2739, "step": 10935 }, { "epoch": 1.7119599248591109, "grad_norm": 6.645880699157715, "learning_rate": 2.2053328804347824e-05, "loss": 0.9355, "step": 10936 }, { "epoch": 1.712116468378209, "grad_norm": 2.6372716426849365, "learning_rate": 2.2041440217391302e-05, "loss": 0.495, "step": 10937 }, { "epoch": 1.7122730118973073, "grad_norm": 2.8489198684692383, "learning_rate": 2.202955163043478e-05, "loss": 1.2084, "step": 10938 }, { "epoch": 1.7124295554164057, "grad_norm": 0.9592059254646301, "learning_rate": 2.201766304347826e-05, "loss": 0.2596, "step": 10939 }, { "epoch": 1.7125860989355042, "grad_norm": 0.7270509600639343, "learning_rate": 2.2005774456521737e-05, "loss": 0.2867, "step": 10940 }, { "epoch": 1.7127426424546024, "grad_norm": 0.7715315222740173, "learning_rate": 2.1993885869565215e-05, "loss": 0.2315, "step": 10941 }, { "epoch": 1.7128991859737006, "grad_norm": 0.728795051574707, "learning_rate": 2.1981997282608696e-05, "loss": 0.2202, "step": 10942 }, { "epoch": 1.713055729492799, "grad_norm": 0.859630286693573, "learning_rate": 2.197010869565217e-05, "loss": 0.265, "step": 10943 }, { "epoch": 1.7132122730118973, "grad_norm": 0.64517742395401, "learning_rate": 2.195822010869565e-05, "loss": 0.1781, "step": 10944 }, { "epoch": 1.7133688165309957, "grad_norm": 0.5996840596199036, "learning_rate": 2.1946331521739127e-05, "loss": 0.3153, "step": 10945 }, { "epoch": 1.713525360050094, "grad_norm": 0.724141001701355, "learning_rate": 2.1934442934782606e-05, "loss": 0.2012, "step": 10946 }, { "epoch": 1.7136819035691921, "grad_norm": 0.5102146863937378, "learning_rate": 2.1922554347826087e-05, "loss": 
0.1839, "step": 10947 }, { "epoch": 1.7138384470882906, "grad_norm": 1.4386687278747559, "learning_rate": 2.1910665760869565e-05, "loss": 0.2893, "step": 10948 }, { "epoch": 1.7139949906073888, "grad_norm": 1.165918231010437, "learning_rate": 2.189877717391304e-05, "loss": 0.3364, "step": 10949 }, { "epoch": 1.7141515341264872, "grad_norm": 0.9257692098617554, "learning_rate": 2.1886888586956518e-05, "loss": 0.3104, "step": 10950 }, { "epoch": 1.7143080776455855, "grad_norm": 0.9946314096450806, "learning_rate": 2.1874999999999996e-05, "loss": 0.2456, "step": 10951 }, { "epoch": 1.7144646211646837, "grad_norm": 1.953068494796753, "learning_rate": 2.1863111413043478e-05, "loss": 0.4087, "step": 10952 }, { "epoch": 1.7146211646837821, "grad_norm": 2.4097824096679688, "learning_rate": 2.1851222826086956e-05, "loss": 0.3685, "step": 10953 }, { "epoch": 1.7147777082028806, "grad_norm": 2.3209309577941895, "learning_rate": 2.183933423913043e-05, "loss": 0.5094, "step": 10954 }, { "epoch": 1.7149342517219788, "grad_norm": 1.15758216381073, "learning_rate": 2.182744565217391e-05, "loss": 0.2887, "step": 10955 }, { "epoch": 1.715090795241077, "grad_norm": 3.2630226612091064, "learning_rate": 2.181555706521739e-05, "loss": 0.5175, "step": 10956 }, { "epoch": 1.7152473387601752, "grad_norm": 4.93676233291626, "learning_rate": 2.180366847826087e-05, "loss": 0.5954, "step": 10957 }, { "epoch": 1.7154038822792737, "grad_norm": 1.6543793678283691, "learning_rate": 2.1791779891304347e-05, "loss": 0.7335, "step": 10958 }, { "epoch": 1.715560425798372, "grad_norm": 3.067082643508911, "learning_rate": 2.1779891304347825e-05, "loss": 0.4923, "step": 10959 }, { "epoch": 1.7157169693174703, "grad_norm": 1.9572697877883911, "learning_rate": 2.17680027173913e-05, "loss": 0.3653, "step": 10960 }, { "epoch": 1.7158735128365685, "grad_norm": 3.5805585384368896, "learning_rate": 2.175611413043478e-05, "loss": 0.585, "step": 10961 }, { "epoch": 1.7160300563556667, "grad_norm": 
2.295182943344116, "learning_rate": 2.174422554347826e-05, "loss": 0.3427, "step": 10962 }, { "epoch": 1.7161865998747652, "grad_norm": 2.513282299041748, "learning_rate": 2.1732336956521738e-05, "loss": 0.5166, "step": 10963 }, { "epoch": 1.7163431433938636, "grad_norm": 4.653680801391602, "learning_rate": 2.1720448369565216e-05, "loss": 0.6706, "step": 10964 }, { "epoch": 1.7164996869129618, "grad_norm": 1.4642263650894165, "learning_rate": 2.1708559782608694e-05, "loss": 0.4174, "step": 10965 }, { "epoch": 1.71665623043206, "grad_norm": 8.227136611938477, "learning_rate": 2.1696671195652172e-05, "loss": 0.7605, "step": 10966 }, { "epoch": 1.7168127739511583, "grad_norm": 2.3521711826324463, "learning_rate": 2.168478260869565e-05, "loss": 0.7125, "step": 10967 }, { "epoch": 1.7169693174702567, "grad_norm": 2.667524576187134, "learning_rate": 2.167289402173913e-05, "loss": 0.596, "step": 10968 }, { "epoch": 1.7171258609893552, "grad_norm": 6.55093240737915, "learning_rate": 2.1661005434782607e-05, "loss": 0.6396, "step": 10969 }, { "epoch": 1.7172824045084534, "grad_norm": 4.544257164001465, "learning_rate": 2.1649116847826085e-05, "loss": 0.8492, "step": 10970 }, { "epoch": 1.7174389480275516, "grad_norm": 4.418801784515381, "learning_rate": 2.1637228260869566e-05, "loss": 0.935, "step": 10971 }, { "epoch": 1.7175954915466498, "grad_norm": 4.239954948425293, "learning_rate": 2.162533967391304e-05, "loss": 0.8751, "step": 10972 }, { "epoch": 1.7177520350657483, "grad_norm": 2.7504470348358154, "learning_rate": 2.161345108695652e-05, "loss": 0.9767, "step": 10973 }, { "epoch": 1.7179085785848467, "grad_norm": 4.41430139541626, "learning_rate": 2.1601562499999997e-05, "loss": 1.0861, "step": 10974 }, { "epoch": 1.718065122103945, "grad_norm": 2.613518476486206, "learning_rate": 2.1589673913043476e-05, "loss": 0.5785, "step": 10975 }, { "epoch": 1.7182216656230431, "grad_norm": 2.0825998783111572, "learning_rate": 2.1577785326086957e-05, "loss": 0.6974, "step": 10976 
}, { "epoch": 1.7183782091421416, "grad_norm": 4.375422477722168, "learning_rate": 2.1565896739130432e-05, "loss": 0.9092, "step": 10977 }, { "epoch": 1.7185347526612398, "grad_norm": 5.09165620803833, "learning_rate": 2.155400815217391e-05, "loss": 1.1083, "step": 10978 }, { "epoch": 1.7186912961803382, "grad_norm": 5.254854202270508, "learning_rate": 2.1542119565217388e-05, "loss": 0.7724, "step": 10979 }, { "epoch": 1.7188478396994364, "grad_norm": 8.067619323730469, "learning_rate": 2.1530230978260866e-05, "loss": 0.9104, "step": 10980 }, { "epoch": 1.7190043832185347, "grad_norm": 3.772038221359253, "learning_rate": 2.1518342391304348e-05, "loss": 1.1242, "step": 10981 }, { "epoch": 1.719160926737633, "grad_norm": 3.0793497562408447, "learning_rate": 2.1506453804347826e-05, "loss": 0.573, "step": 10982 }, { "epoch": 1.7193174702567313, "grad_norm": 2.837602138519287, "learning_rate": 2.14945652173913e-05, "loss": 1.27, "step": 10983 }, { "epoch": 1.7194740137758298, "grad_norm": 4.801596641540527, "learning_rate": 2.148267663043478e-05, "loss": 1.1318, "step": 10984 }, { "epoch": 1.719630557294928, "grad_norm": 3.221708059310913, "learning_rate": 2.1470788043478257e-05, "loss": 0.5384, "step": 10985 }, { "epoch": 1.7197871008140262, "grad_norm": 3.687497854232788, "learning_rate": 2.145889945652174e-05, "loss": 0.6947, "step": 10986 }, { "epoch": 1.7199436443331246, "grad_norm": 3.874307155609131, "learning_rate": 2.1447010869565217e-05, "loss": 0.6225, "step": 10987 }, { "epoch": 1.720100187852223, "grad_norm": 3.9001705646514893, "learning_rate": 2.1435122282608695e-05, "loss": 0.8153, "step": 10988 }, { "epoch": 1.7202567313713213, "grad_norm": 1.40394926071167, "learning_rate": 2.142323369565217e-05, "loss": 0.4318, "step": 10989 }, { "epoch": 1.7204132748904195, "grad_norm": 0.4645773768424988, "learning_rate": 2.1411345108695648e-05, "loss": 0.1803, "step": 10990 }, { "epoch": 1.7205698184095177, "grad_norm": 0.5649290084838867, "learning_rate": 
2.139945652173913e-05, "loss": 0.1941, "step": 10991 }, { "epoch": 1.7207263619286162, "grad_norm": 0.9830725789070129, "learning_rate": 2.1387567934782608e-05, "loss": 0.2064, "step": 10992 }, { "epoch": 1.7208829054477146, "grad_norm": 1.2728556394577026, "learning_rate": 2.1375679347826086e-05, "loss": 0.3837, "step": 10993 }, { "epoch": 1.7210394489668128, "grad_norm": 1.3930323123931885, "learning_rate": 2.1363790760869564e-05, "loss": 0.2229, "step": 10994 }, { "epoch": 1.721195992485911, "grad_norm": 1.6747329235076904, "learning_rate": 2.135190217391304e-05, "loss": 0.3352, "step": 10995 }, { "epoch": 1.7213525360050093, "grad_norm": 3.4114766120910645, "learning_rate": 2.134001358695652e-05, "loss": 0.2858, "step": 10996 }, { "epoch": 1.7215090795241077, "grad_norm": 9.469321250915527, "learning_rate": 2.1328125e-05, "loss": 0.2387, "step": 10997 }, { "epoch": 1.7216656230432061, "grad_norm": 0.6778154373168945, "learning_rate": 2.1316236413043476e-05, "loss": 0.2742, "step": 10998 }, { "epoch": 1.7218221665623044, "grad_norm": 1.5328624248504639, "learning_rate": 2.1304347826086955e-05, "loss": 0.3087, "step": 10999 }, { "epoch": 1.7219787100814026, "grad_norm": 1.3693451881408691, "learning_rate": 2.129245923913043e-05, "loss": 0.3964, "step": 11000 }, { "epoch": 1.7219787100814026, "eval_loss": 0.48989829421043396, "eval_runtime": 204.0195, "eval_samples_per_second": 60.695, "eval_steps_per_second": 3.794, "eval_wer": 0.30968825041970377, "step": 11000 }, { "epoch": 1.7221352536005008, "grad_norm": 0.9444740414619446, "learning_rate": 2.128057065217391e-05, "loss": 0.3567, "step": 11001 }, { "epoch": 1.7222917971195992, "grad_norm": 1.478234887123108, "learning_rate": 2.126868206521739e-05, "loss": 0.3438, "step": 11002 }, { "epoch": 1.7224483406386977, "grad_norm": 1.8189070224761963, "learning_rate": 2.1256793478260867e-05, "loss": 0.4707, "step": 11003 }, { "epoch": 1.722604884157796, "grad_norm": 2.1475369930267334, "learning_rate": 
2.1244904891304345e-05, "loss": 0.4428, "step": 11004 }, { "epoch": 1.722761427676894, "grad_norm": 1.4131685495376587, "learning_rate": 2.1233016304347827e-05, "loss": 0.4771, "step": 11005 }, { "epoch": 1.7229179711959923, "grad_norm": 1.2214477062225342, "learning_rate": 2.1221127717391302e-05, "loss": 0.3935, "step": 11006 }, { "epoch": 1.7230745147150908, "grad_norm": 1.2421962022781372, "learning_rate": 2.120923913043478e-05, "loss": 0.3163, "step": 11007 }, { "epoch": 1.7232310582341892, "grad_norm": 1.135184407234192, "learning_rate": 2.1197350543478258e-05, "loss": 0.3464, "step": 11008 }, { "epoch": 1.7233876017532874, "grad_norm": 2.3962278366088867, "learning_rate": 2.1185461956521736e-05, "loss": 0.6015, "step": 11009 }, { "epoch": 1.7235441452723856, "grad_norm": 2.609158992767334, "learning_rate": 2.1173573369565218e-05, "loss": 0.6392, "step": 11010 }, { "epoch": 1.723700688791484, "grad_norm": 2.5496327877044678, "learning_rate": 2.1161684782608696e-05, "loss": 0.472, "step": 11011 }, { "epoch": 1.7238572323105823, "grad_norm": 2.2880780696868896, "learning_rate": 2.114979619565217e-05, "loss": 0.3225, "step": 11012 }, { "epoch": 1.7240137758296807, "grad_norm": 2.035449981689453, "learning_rate": 2.113790760869565e-05, "loss": 0.6603, "step": 11013 }, { "epoch": 1.724170319348779, "grad_norm": 25.227153778076172, "learning_rate": 2.1126019021739127e-05, "loss": 0.7972, "step": 11014 }, { "epoch": 1.7243268628678772, "grad_norm": 2.267313003540039, "learning_rate": 2.111413043478261e-05, "loss": 0.7352, "step": 11015 }, { "epoch": 1.7244834063869756, "grad_norm": 1.6655514240264893, "learning_rate": 2.1102241847826087e-05, "loss": 0.4232, "step": 11016 }, { "epoch": 1.724639949906074, "grad_norm": 1.9986313581466675, "learning_rate": 2.1090353260869565e-05, "loss": 0.4355, "step": 11017 }, { "epoch": 1.7247964934251723, "grad_norm": 5.854803562164307, "learning_rate": 2.107846467391304e-05, "loss": 0.4965, "step": 11018 }, { "epoch": 
1.7249530369442705, "grad_norm": 3.4049911499023438, "learning_rate": 2.1066576086956518e-05, "loss": 0.7988, "step": 11019 }, { "epoch": 1.7251095804633687, "grad_norm": 4.721397399902344, "learning_rate": 2.10546875e-05, "loss": 0.5771, "step": 11020 }, { "epoch": 1.7252661239824671, "grad_norm": 1.5940344333648682, "learning_rate": 2.1042798913043477e-05, "loss": 0.4846, "step": 11021 }, { "epoch": 1.7254226675015656, "grad_norm": 2.1519081592559814, "learning_rate": 2.1030910326086956e-05, "loss": 0.5834, "step": 11022 }, { "epoch": 1.7255792110206638, "grad_norm": 3.149533271789551, "learning_rate": 2.101902173913043e-05, "loss": 1.1316, "step": 11023 }, { "epoch": 1.725735754539762, "grad_norm": 6.354373455047607, "learning_rate": 2.100713315217391e-05, "loss": 1.3422, "step": 11024 }, { "epoch": 1.7258922980588602, "grad_norm": 5.3175578117370605, "learning_rate": 2.099524456521739e-05, "loss": 1.0261, "step": 11025 }, { "epoch": 1.7260488415779587, "grad_norm": 3.467806100845337, "learning_rate": 2.0983355978260868e-05, "loss": 0.8299, "step": 11026 }, { "epoch": 1.7262053850970571, "grad_norm": 2.592381715774536, "learning_rate": 2.0971467391304346e-05, "loss": 0.5844, "step": 11027 }, { "epoch": 1.7263619286161553, "grad_norm": 3.0649237632751465, "learning_rate": 2.0959578804347825e-05, "loss": 1.209, "step": 11028 }, { "epoch": 1.7265184721352536, "grad_norm": 3.7811641693115234, "learning_rate": 2.09476902173913e-05, "loss": 1.5154, "step": 11029 }, { "epoch": 1.7266750156543518, "grad_norm": 3.0590884685516357, "learning_rate": 2.093580163043478e-05, "loss": 0.8428, "step": 11030 }, { "epoch": 1.7268315591734502, "grad_norm": 4.482877254486084, "learning_rate": 2.092391304347826e-05, "loss": 1.301, "step": 11031 }, { "epoch": 1.7269881026925487, "grad_norm": 2.431084156036377, "learning_rate": 2.0912024456521737e-05, "loss": 0.8302, "step": 11032 }, { "epoch": 1.7271446462116469, "grad_norm": 14.884596824645996, "learning_rate": 
2.0900135869565215e-05, "loss": 1.1214, "step": 11033 }, { "epoch": 1.727301189730745, "grad_norm": 2.163391351699829, "learning_rate": 2.0888247282608697e-05, "loss": 0.5142, "step": 11034 }, { "epoch": 1.7274577332498433, "grad_norm": 2.590573787689209, "learning_rate": 2.087635869565217e-05, "loss": 0.7059, "step": 11035 }, { "epoch": 1.7276142767689417, "grad_norm": 3.191420078277588, "learning_rate": 2.086447010869565e-05, "loss": 0.8408, "step": 11036 }, { "epoch": 1.7277708202880402, "grad_norm": 2.7372212409973145, "learning_rate": 2.0852581521739128e-05, "loss": 0.9596, "step": 11037 }, { "epoch": 1.7279273638071384, "grad_norm": 4.082023620605469, "learning_rate": 2.0840692934782606e-05, "loss": 1.0236, "step": 11038 }, { "epoch": 1.7280839073262366, "grad_norm": 0.7017141580581665, "learning_rate": 2.0828804347826088e-05, "loss": 0.2162, "step": 11039 }, { "epoch": 1.7282404508453348, "grad_norm": 0.5479418039321899, "learning_rate": 2.0816915760869566e-05, "loss": 0.2297, "step": 11040 }, { "epoch": 1.7283969943644333, "grad_norm": 0.9714682698249817, "learning_rate": 2.080502717391304e-05, "loss": 0.3078, "step": 11041 }, { "epoch": 1.7285535378835317, "grad_norm": 0.642406165599823, "learning_rate": 2.079313858695652e-05, "loss": 0.1894, "step": 11042 }, { "epoch": 1.72871008140263, "grad_norm": 1.3286471366882324, "learning_rate": 2.0781249999999997e-05, "loss": 0.2546, "step": 11043 }, { "epoch": 1.7288666249217282, "grad_norm": 1.731328010559082, "learning_rate": 2.076936141304348e-05, "loss": 0.4396, "step": 11044 }, { "epoch": 1.7290231684408266, "grad_norm": 0.7832604050636292, "learning_rate": 2.0757472826086957e-05, "loss": 0.2171, "step": 11045 }, { "epoch": 1.7291797119599248, "grad_norm": 1.0080935955047607, "learning_rate": 2.074558423913043e-05, "loss": 0.2157, "step": 11046 }, { "epoch": 1.7293362554790233, "grad_norm": 1.5293692350387573, "learning_rate": 2.073369565217391e-05, "loss": 0.2978, "step": 11047 }, { "epoch": 
1.7294927989981215, "grad_norm": 1.711125135421753, "learning_rate": 2.0721807065217388e-05, "loss": 0.3409, "step": 11048 }, { "epoch": 1.7296493425172197, "grad_norm": 1.249939203262329, "learning_rate": 2.070991847826087e-05, "loss": 0.2412, "step": 11049 }, { "epoch": 1.7298058860363181, "grad_norm": 2.1627278327941895, "learning_rate": 2.0698029891304347e-05, "loss": 0.312, "step": 11050 }, { "epoch": 1.7299624295554166, "grad_norm": 1.6448572874069214, "learning_rate": 2.0686141304347826e-05, "loss": 0.3382, "step": 11051 }, { "epoch": 1.7301189730745148, "grad_norm": 4.3190083503723145, "learning_rate": 2.06742527173913e-05, "loss": 0.478, "step": 11052 }, { "epoch": 1.730275516593613, "grad_norm": 2.592197895050049, "learning_rate": 2.066236413043478e-05, "loss": 0.3582, "step": 11053 }, { "epoch": 1.7304320601127112, "grad_norm": 1.8064007759094238, "learning_rate": 2.065047554347826e-05, "loss": 0.6749, "step": 11054 }, { "epoch": 1.7305886036318097, "grad_norm": 2.523695945739746, "learning_rate": 2.0638586956521738e-05, "loss": 0.5736, "step": 11055 }, { "epoch": 1.730745147150908, "grad_norm": 1.0386204719543457, "learning_rate": 2.0626698369565216e-05, "loss": 0.2846, "step": 11056 }, { "epoch": 1.7309016906700063, "grad_norm": 0.7362631559371948, "learning_rate": 2.0614809782608694e-05, "loss": 0.2564, "step": 11057 }, { "epoch": 1.7310582341891045, "grad_norm": 2.1041476726531982, "learning_rate": 2.060292119565217e-05, "loss": 0.4221, "step": 11058 }, { "epoch": 1.7312147777082028, "grad_norm": 2.6771750450134277, "learning_rate": 2.059103260869565e-05, "loss": 0.5488, "step": 11059 }, { "epoch": 1.7313713212273012, "grad_norm": 1.1895883083343506, "learning_rate": 2.057914402173913e-05, "loss": 0.3129, "step": 11060 }, { "epoch": 1.7315278647463996, "grad_norm": 2.709317445755005, "learning_rate": 2.0567255434782607e-05, "loss": 0.3996, "step": 11061 }, { "epoch": 1.7316844082654979, "grad_norm": 2.024113416671753, "learning_rate": 
2.0555366847826085e-05, "loss": 0.7242, "step": 11062 }, { "epoch": 1.731840951784596, "grad_norm": 2.0749809741973877, "learning_rate": 2.0543478260869567e-05, "loss": 0.5117, "step": 11063 }, { "epoch": 1.7319974953036943, "grad_norm": 4.4358344078063965, "learning_rate": 2.053158967391304e-05, "loss": 0.9314, "step": 11064 }, { "epoch": 1.7321540388227927, "grad_norm": 25.940671920776367, "learning_rate": 2.051970108695652e-05, "loss": 0.6326, "step": 11065 }, { "epoch": 1.7323105823418912, "grad_norm": 1.5892549753189087, "learning_rate": 2.0507812499999998e-05, "loss": 0.4824, "step": 11066 }, { "epoch": 1.7324671258609894, "grad_norm": 1.7679214477539062, "learning_rate": 2.0495923913043476e-05, "loss": 0.455, "step": 11067 }, { "epoch": 1.7326236693800876, "grad_norm": 2.6718456745147705, "learning_rate": 2.0484035326086958e-05, "loss": 0.7037, "step": 11068 }, { "epoch": 1.7327802128991858, "grad_norm": 3.452190637588501, "learning_rate": 2.0472146739130432e-05, "loss": 0.8668, "step": 11069 }, { "epoch": 1.7329367564182843, "grad_norm": 2.1627256870269775, "learning_rate": 2.046025815217391e-05, "loss": 0.3944, "step": 11070 }, { "epoch": 1.7330932999373827, "grad_norm": 1.7307653427124023, "learning_rate": 2.044836956521739e-05, "loss": 0.3485, "step": 11071 }, { "epoch": 1.733249843456481, "grad_norm": 3.611262321472168, "learning_rate": 2.0436480978260867e-05, "loss": 0.8355, "step": 11072 }, { "epoch": 1.7334063869755791, "grad_norm": 7.040108680725098, "learning_rate": 2.042459239130435e-05, "loss": 0.9261, "step": 11073 }, { "epoch": 1.7335629304946774, "grad_norm": 2.83695125579834, "learning_rate": 2.0412703804347827e-05, "loss": 0.5903, "step": 11074 }, { "epoch": 1.7337194740137758, "grad_norm": 11.528519630432129, "learning_rate": 2.04008152173913e-05, "loss": 1.2562, "step": 11075 }, { "epoch": 1.7338760175328742, "grad_norm": 5.299741268157959, "learning_rate": 2.038892663043478e-05, "loss": 1.1738, "step": 11076 }, { "epoch": 
1.7340325610519725, "grad_norm": 3.693483829498291, "learning_rate": 2.0377038043478258e-05, "loss": 0.765, "step": 11077 }, { "epoch": 1.7341891045710707, "grad_norm": 2.492950677871704, "learning_rate": 2.036514945652174e-05, "loss": 0.8123, "step": 11078 }, { "epoch": 1.7343456480901691, "grad_norm": 3.6850435733795166, "learning_rate": 2.0353260869565217e-05, "loss": 1.1956, "step": 11079 }, { "epoch": 1.7345021916092673, "grad_norm": 4.956048965454102, "learning_rate": 2.0341372282608695e-05, "loss": 1.5858, "step": 11080 }, { "epoch": 1.7346587351283658, "grad_norm": 2.174010753631592, "learning_rate": 2.032948369565217e-05, "loss": 0.8737, "step": 11081 }, { "epoch": 1.734815278647464, "grad_norm": 5.31271505355835, "learning_rate": 2.031759510869565e-05, "loss": 1.6148, "step": 11082 }, { "epoch": 1.7349718221665622, "grad_norm": 1.9179514646530151, "learning_rate": 2.030570652173913e-05, "loss": 0.9277, "step": 11083 }, { "epoch": 1.7351283656856606, "grad_norm": 2.4655404090881348, "learning_rate": 2.0293817934782608e-05, "loss": 0.6486, "step": 11084 }, { "epoch": 1.735284909204759, "grad_norm": 3.1135311126708984, "learning_rate": 2.0281929347826086e-05, "loss": 0.5787, "step": 11085 }, { "epoch": 1.7354414527238573, "grad_norm": 4.076301097869873, "learning_rate": 2.0270040760869564e-05, "loss": 0.5981, "step": 11086 }, { "epoch": 1.7355979962429555, "grad_norm": 3.117687225341797, "learning_rate": 2.025815217391304e-05, "loss": 0.595, "step": 11087 }, { "epoch": 1.7357545397620537, "grad_norm": 2.321453094482422, "learning_rate": 2.024626358695652e-05, "loss": 1.1979, "step": 11088 }, { "epoch": 1.7359110832811522, "grad_norm": 0.7705173492431641, "learning_rate": 2.0234375e-05, "loss": 0.2564, "step": 11089 }, { "epoch": 1.7360676268002506, "grad_norm": 0.5692620873451233, "learning_rate": 2.0222486413043477e-05, "loss": 0.2542, "step": 11090 }, { "epoch": 1.7362241703193488, "grad_norm": 1.6118474006652832, "learning_rate": 2.0210597826086955e-05, 
"loss": 0.2051, "step": 11091 }, { "epoch": 1.736380713838447, "grad_norm": 0.7703117728233337, "learning_rate": 2.019870923913043e-05, "loss": 0.2007, "step": 11092 }, { "epoch": 1.7365372573575453, "grad_norm": 0.9465517997741699, "learning_rate": 2.018682065217391e-05, "loss": 0.3528, "step": 11093 }, { "epoch": 1.7366938008766437, "grad_norm": 0.8271805644035339, "learning_rate": 2.017493206521739e-05, "loss": 0.2049, "step": 11094 }, { "epoch": 1.7368503443957422, "grad_norm": 0.8511632084846497, "learning_rate": 2.0163043478260868e-05, "loss": 0.3166, "step": 11095 }, { "epoch": 1.7370068879148404, "grad_norm": 0.5247347354888916, "learning_rate": 2.0151154891304346e-05, "loss": 0.2299, "step": 11096 }, { "epoch": 1.7371634314339386, "grad_norm": 0.9269756078720093, "learning_rate": 2.0139266304347827e-05, "loss": 0.3765, "step": 11097 }, { "epoch": 1.7373199749530368, "grad_norm": 1.145110011100769, "learning_rate": 2.0127377717391302e-05, "loss": 0.4228, "step": 11098 }, { "epoch": 1.7374765184721352, "grad_norm": 12.360448837280273, "learning_rate": 2.011548913043478e-05, "loss": 0.4428, "step": 11099 }, { "epoch": 1.7376330619912337, "grad_norm": 1.0768539905548096, "learning_rate": 2.010360054347826e-05, "loss": 0.3685, "step": 11100 }, { "epoch": 1.737789605510332, "grad_norm": 1.6442958116531372, "learning_rate": 2.0091711956521737e-05, "loss": 0.2154, "step": 11101 }, { "epoch": 1.7379461490294301, "grad_norm": 2.341592311859131, "learning_rate": 2.0079823369565218e-05, "loss": 0.28, "step": 11102 }, { "epoch": 1.7381026925485283, "grad_norm": 1.2948733568191528, "learning_rate": 2.0067934782608696e-05, "loss": 0.3513, "step": 11103 }, { "epoch": 1.7382592360676268, "grad_norm": 4.846363067626953, "learning_rate": 2.005604619565217e-05, "loss": 0.4178, "step": 11104 }, { "epoch": 1.7384157795867252, "grad_norm": 1.6065536737442017, "learning_rate": 2.004415760869565e-05, "loss": 0.3139, "step": 11105 }, { "epoch": 1.7385723231058234, "grad_norm": 
2.5199005603790283, "learning_rate": 2.0032269021739128e-05, "loss": 0.5004, "step": 11106 }, { "epoch": 1.7387288666249217, "grad_norm": 1.7702378034591675, "learning_rate": 2.002038043478261e-05, "loss": 0.3213, "step": 11107 }, { "epoch": 1.7388854101440199, "grad_norm": 1.9458616971969604, "learning_rate": 2.0008491847826087e-05, "loss": 0.3268, "step": 11108 }, { "epoch": 1.7390419536631183, "grad_norm": 2.1471216678619385, "learning_rate": 1.9996603260869565e-05, "loss": 0.6269, "step": 11109 }, { "epoch": 1.7391984971822168, "grad_norm": 5.706935882568359, "learning_rate": 1.998471467391304e-05, "loss": 0.5311, "step": 11110 }, { "epoch": 1.739355040701315, "grad_norm": 2.024519681930542, "learning_rate": 1.9972826086956518e-05, "loss": 0.5426, "step": 11111 }, { "epoch": 1.7395115842204132, "grad_norm": 2.7581324577331543, "learning_rate": 1.99609375e-05, "loss": 0.6302, "step": 11112 }, { "epoch": 1.7396681277395116, "grad_norm": 1.6789865493774414, "learning_rate": 1.9949048913043478e-05, "loss": 0.3331, "step": 11113 }, { "epoch": 1.7398246712586098, "grad_norm": 1.8090062141418457, "learning_rate": 1.9937160326086956e-05, "loss": 0.268, "step": 11114 }, { "epoch": 1.7399812147777083, "grad_norm": 3.4062681198120117, "learning_rate": 1.992527173913043e-05, "loss": 0.6893, "step": 11115 }, { "epoch": 1.7401377582968065, "grad_norm": 3.2536332607269287, "learning_rate": 1.991338315217391e-05, "loss": 0.6302, "step": 11116 }, { "epoch": 1.7402943018159047, "grad_norm": 2.956906318664551, "learning_rate": 1.990149456521739e-05, "loss": 0.6372, "step": 11117 }, { "epoch": 1.7404508453350032, "grad_norm": 1.6968371868133545, "learning_rate": 1.988960597826087e-05, "loss": 0.5632, "step": 11118 }, { "epoch": 1.7406073888541016, "grad_norm": 1.432328701019287, "learning_rate": 1.9877717391304347e-05, "loss": 0.4355, "step": 11119 }, { "epoch": 1.7407639323731998, "grad_norm": 4.371121406555176, "learning_rate": 1.9865828804347825e-05, "loss": 0.4896, "step": 
11120 }, { "epoch": 1.740920475892298, "grad_norm": 7.325766563415527, "learning_rate": 1.98539402173913e-05, "loss": 0.7154, "step": 11121 }, { "epoch": 1.7410770194113963, "grad_norm": 4.069987773895264, "learning_rate": 1.984205163043478e-05, "loss": 0.726, "step": 11122 }, { "epoch": 1.7412335629304947, "grad_norm": 11.902520179748535, "learning_rate": 1.983016304347826e-05, "loss": 0.337, "step": 11123 }, { "epoch": 1.7413901064495931, "grad_norm": 3.514312267303467, "learning_rate": 1.9818274456521738e-05, "loss": 0.8056, "step": 11124 }, { "epoch": 1.7415466499686914, "grad_norm": 10.438179016113281, "learning_rate": 1.9806385869565216e-05, "loss": 0.6993, "step": 11125 }, { "epoch": 1.7417031934877896, "grad_norm": 3.455845594406128, "learning_rate": 1.9794497282608694e-05, "loss": 0.7317, "step": 11126 }, { "epoch": 1.7418597370068878, "grad_norm": 5.638486385345459, "learning_rate": 1.9782608695652172e-05, "loss": 0.8658, "step": 11127 }, { "epoch": 1.7420162805259862, "grad_norm": 5.535280704498291, "learning_rate": 1.977072010869565e-05, "loss": 1.2442, "step": 11128 }, { "epoch": 1.7421728240450847, "grad_norm": 3.5843911170959473, "learning_rate": 1.975883152173913e-05, "loss": 1.3294, "step": 11129 }, { "epoch": 1.7423293675641829, "grad_norm": 3.2030458450317383, "learning_rate": 1.9746942934782607e-05, "loss": 1.1828, "step": 11130 }, { "epoch": 1.742485911083281, "grad_norm": 3.168473958969116, "learning_rate": 1.9735054347826085e-05, "loss": 0.9033, "step": 11131 }, { "epoch": 1.7426424546023793, "grad_norm": 1.671071171760559, "learning_rate": 1.9723165760869566e-05, "loss": 0.7213, "step": 11132 }, { "epoch": 1.7427989981214778, "grad_norm": 7.6236958503723145, "learning_rate": 1.971127717391304e-05, "loss": 1.5168, "step": 11133 }, { "epoch": 1.7429555416405762, "grad_norm": 2.32814359664917, "learning_rate": 1.969938858695652e-05, "loss": 0.4802, "step": 11134 }, { "epoch": 1.7431120851596744, "grad_norm": null, "learning_rate": 
1.969938858695652e-05, "loss": 0.0, "step": 11135 }, { "epoch": 1.7432686286787726, "grad_norm": 4.118825912475586, "learning_rate": 1.9687499999999997e-05, "loss": 0.412, "step": 11136 }, { "epoch": 1.7434251721978709, "grad_norm": 1.6117165088653564, "learning_rate": 1.9675611413043476e-05, "loss": 0.6846, "step": 11137 }, { "epoch": 1.7435817157169693, "grad_norm": 2.4359383583068848, "learning_rate": 1.9663722826086957e-05, "loss": 1.2096, "step": 11138 }, { "epoch": 1.7437382592360677, "grad_norm": 0.9280950427055359, "learning_rate": 1.9651834239130432e-05, "loss": 0.2409, "step": 11139 }, { "epoch": 1.743894802755166, "grad_norm": 0.8989452123641968, "learning_rate": 1.963994565217391e-05, "loss": 0.3687, "step": 11140 }, { "epoch": 1.7440513462742642, "grad_norm": 1.8401728868484497, "learning_rate": 1.9628057065217388e-05, "loss": 0.2369, "step": 11141 }, { "epoch": 1.7442078897933626, "grad_norm": 0.8467321395874023, "learning_rate": 1.9616168478260866e-05, "loss": 0.2167, "step": 11142 }, { "epoch": 1.7443644333124608, "grad_norm": 1.5909407138824463, "learning_rate": 1.9604279891304348e-05, "loss": 0.2395, "step": 11143 }, { "epoch": 1.7445209768315593, "grad_norm": 1.0054457187652588, "learning_rate": 1.9592391304347826e-05, "loss": 0.2413, "step": 11144 }, { "epoch": 1.7446775203506575, "grad_norm": 2.1488070487976074, "learning_rate": 1.95805027173913e-05, "loss": 0.26, "step": 11145 }, { "epoch": 1.7448340638697557, "grad_norm": 1.4430122375488281, "learning_rate": 1.956861413043478e-05, "loss": 0.2727, "step": 11146 }, { "epoch": 1.7449906073888541, "grad_norm": 0.9280151128768921, "learning_rate": 1.9556725543478257e-05, "loss": 0.2214, "step": 11147 }, { "epoch": 1.7451471509079524, "grad_norm": 1.5630557537078857, "learning_rate": 1.954483695652174e-05, "loss": 0.3322, "step": 11148 }, { "epoch": 1.7453036944270508, "grad_norm": 1.0685065984725952, "learning_rate": 1.9532948369565217e-05, "loss": 0.2831, "step": 11149 }, { "epoch": 
1.745460237946149, "grad_norm": 2.274332284927368, "learning_rate": 1.9521059782608695e-05, "loss": 0.2051, "step": 11150 }, { "epoch": 1.7456167814652472, "grad_norm": 1.5054755210876465, "learning_rate": 1.950917119565217e-05, "loss": 0.2033, "step": 11151 }, { "epoch": 1.7457733249843457, "grad_norm": 1.6202837228775024, "learning_rate": 1.949728260869565e-05, "loss": 0.3683, "step": 11152 }, { "epoch": 1.7459298685034441, "grad_norm": 1.6025335788726807, "learning_rate": 1.948539402173913e-05, "loss": 0.4456, "step": 11153 }, { "epoch": 1.7460864120225423, "grad_norm": 1.3742411136627197, "learning_rate": 1.9473505434782608e-05, "loss": 0.3095, "step": 11154 }, { "epoch": 1.7462429555416406, "grad_norm": 1.4322175979614258, "learning_rate": 1.9461616847826086e-05, "loss": 0.4119, "step": 11155 }, { "epoch": 1.7463994990607388, "grad_norm": 2.246206760406494, "learning_rate": 1.9449728260869564e-05, "loss": 0.5678, "step": 11156 }, { "epoch": 1.7465560425798372, "grad_norm": 2.883898973464966, "learning_rate": 1.9437839673913042e-05, "loss": 0.7824, "step": 11157 }, { "epoch": 1.7467125860989356, "grad_norm": 3.6438074111938477, "learning_rate": 1.942595108695652e-05, "loss": 0.6123, "step": 11158 }, { "epoch": 1.7468691296180339, "grad_norm": 2.1416208744049072, "learning_rate": 1.94140625e-05, "loss": 0.6209, "step": 11159 }, { "epoch": 1.747025673137132, "grad_norm": 3.1453261375427246, "learning_rate": 1.9402173913043477e-05, "loss": 0.5737, "step": 11160 }, { "epoch": 1.7471822166562303, "grad_norm": 3.0439889430999756, "learning_rate": 1.9390285326086955e-05, "loss": 0.3772, "step": 11161 }, { "epoch": 1.7473387601753287, "grad_norm": 3.6582865715026855, "learning_rate": 1.9378396739130433e-05, "loss": 0.5549, "step": 11162 }, { "epoch": 1.7474953036944272, "grad_norm": 7.128550052642822, "learning_rate": 1.936650815217391e-05, "loss": 0.6758, "step": 11163 }, { "epoch": 1.7476518472135254, "grad_norm": 3.4382741451263428, "learning_rate": 
1.935461956521739e-05, "loss": 0.4275, "step": 11164 }, { "epoch": 1.7478083907326236, "grad_norm": 3.022498607635498, "learning_rate": 1.9342730978260867e-05, "loss": 0.6998, "step": 11165 }, { "epoch": 1.7479649342517218, "grad_norm": 2.681574821472168, "learning_rate": 1.9330842391304345e-05, "loss": 0.8368, "step": 11166 }, { "epoch": 1.7481214777708203, "grad_norm": 3.4111015796661377, "learning_rate": 1.9318953804347827e-05, "loss": 0.4321, "step": 11167 }, { "epoch": 1.7482780212899187, "grad_norm": 5.155689716339111, "learning_rate": 1.9307065217391302e-05, "loss": 0.9593, "step": 11168 }, { "epoch": 1.748434564809017, "grad_norm": 3.662680149078369, "learning_rate": 1.929517663043478e-05, "loss": 0.647, "step": 11169 }, { "epoch": 1.7485911083281152, "grad_norm": 3.315366506576538, "learning_rate": 1.9283288043478258e-05, "loss": 0.6281, "step": 11170 }, { "epoch": 1.7487476518472134, "grad_norm": 3.8381402492523193, "learning_rate": 1.9271399456521736e-05, "loss": 0.8088, "step": 11171 }, { "epoch": 1.7489041953663118, "grad_norm": 2.9327826499938965, "learning_rate": 1.9259510869565218e-05, "loss": 0.8582, "step": 11172 }, { "epoch": 1.7490607388854102, "grad_norm": 3.348848819732666, "learning_rate": 1.9247622282608696e-05, "loss": 0.556, "step": 11173 }, { "epoch": 1.7492172824045085, "grad_norm": 3.701103925704956, "learning_rate": 1.923573369565217e-05, "loss": 0.7816, "step": 11174 }, { "epoch": 1.7493738259236067, "grad_norm": 3.538564443588257, "learning_rate": 1.922384510869565e-05, "loss": 0.7055, "step": 11175 }, { "epoch": 1.7495303694427051, "grad_norm": 6.685811519622803, "learning_rate": 1.9211956521739127e-05, "loss": 1.1809, "step": 11176 }, { "epoch": 1.7496869129618033, "grad_norm": 7.601191520690918, "learning_rate": 1.920006793478261e-05, "loss": 1.2462, "step": 11177 }, { "epoch": 1.7498434564809018, "grad_norm": 2.345186710357666, "learning_rate": 1.9188179347826087e-05, "loss": 0.938, "step": 11178 }, { "epoch": 1.75, "grad_norm": 
3.1795156002044678, "learning_rate": 1.9176290760869565e-05, "loss": 0.4636, "step": 11179 }, { "epoch": 1.7501565435190982, "grad_norm": 3.932297468185425, "learning_rate": 1.916440217391304e-05, "loss": 0.7067, "step": 11180 }, { "epoch": 1.7503130870381967, "grad_norm": 3.493405342102051, "learning_rate": 1.9152513586956518e-05, "loss": 0.6251, "step": 11181 }, { "epoch": 1.7504696305572949, "grad_norm": 2.661776304244995, "learning_rate": 1.9140625e-05, "loss": 1.1345, "step": 11182 }, { "epoch": 1.7506261740763933, "grad_norm": 2.241960287094116, "learning_rate": 1.9128736413043478e-05, "loss": 1.0745, "step": 11183 }, { "epoch": 1.7507827175954915, "grad_norm": 1.8533296585083008, "learning_rate": 1.9116847826086956e-05, "loss": 0.5318, "step": 11184 }, { "epoch": 1.7509392611145898, "grad_norm": 3.137636184692383, "learning_rate": 1.910495923913043e-05, "loss": 0.8941, "step": 11185 }, { "epoch": 1.7510958046336882, "grad_norm": 2.9002151489257812, "learning_rate": 1.909307065217391e-05, "loss": 0.8583, "step": 11186 }, { "epoch": 1.7512523481527866, "grad_norm": 2.170685291290283, "learning_rate": 1.908118206521739e-05, "loss": 0.5629, "step": 11187 }, { "epoch": 1.7514088916718848, "grad_norm": 2.661564350128174, "learning_rate": 1.906929347826087e-05, "loss": 0.8811, "step": 11188 }, { "epoch": 1.751565435190983, "grad_norm": 0.6320489048957825, "learning_rate": 1.9057404891304346e-05, "loss": 0.2385, "step": 11189 }, { "epoch": 1.7517219787100813, "grad_norm": 3.3406424522399902, "learning_rate": 1.9045516304347825e-05, "loss": 0.3143, "step": 11190 }, { "epoch": 1.7518785222291797, "grad_norm": 1.0771323442459106, "learning_rate": 1.90336277173913e-05, "loss": 0.3596, "step": 11191 }, { "epoch": 1.7520350657482782, "grad_norm": 0.7562718391418457, "learning_rate": 1.902173913043478e-05, "loss": 0.2747, "step": 11192 }, { "epoch": 1.7521916092673764, "grad_norm": 0.7402568459510803, "learning_rate": 1.900985054347826e-05, "loss": 0.2965, "step": 11193 }, 
{ "epoch": 1.7523481527864746, "grad_norm": 1.6420753002166748, "learning_rate": 1.8997961956521737e-05, "loss": 0.3405, "step": 11194 }, { "epoch": 1.7525046963055728, "grad_norm": 1.703488826751709, "learning_rate": 1.8986073369565215e-05, "loss": 0.2807, "step": 11195 }, { "epoch": 1.7526612398246713, "grad_norm": 0.8439343571662903, "learning_rate": 1.8974184782608697e-05, "loss": 0.3412, "step": 11196 }, { "epoch": 1.7528177833437697, "grad_norm": 2.6513524055480957, "learning_rate": 1.8962296195652172e-05, "loss": 0.5514, "step": 11197 }, { "epoch": 1.752974326862868, "grad_norm": 1.7805927991867065, "learning_rate": 1.895040760869565e-05, "loss": 0.3099, "step": 11198 }, { "epoch": 1.7531308703819661, "grad_norm": 3.041438341140747, "learning_rate": 1.8938519021739128e-05, "loss": 0.3989, "step": 11199 }, { "epoch": 1.7532874139010644, "grad_norm": 0.9599918127059937, "learning_rate": 1.8926630434782606e-05, "loss": 0.3067, "step": 11200 }, { "epoch": 1.7534439574201628, "grad_norm": 2.1358275413513184, "learning_rate": 1.8914741847826088e-05, "loss": 0.4209, "step": 11201 }, { "epoch": 1.7536005009392612, "grad_norm": 2.2146475315093994, "learning_rate": 1.8902853260869566e-05, "loss": 0.6447, "step": 11202 }, { "epoch": 1.7537570444583594, "grad_norm": 1.1681498289108276, "learning_rate": 1.889096467391304e-05, "loss": 0.3013, "step": 11203 }, { "epoch": 1.7539135879774577, "grad_norm": 2.2345364093780518, "learning_rate": 1.887907608695652e-05, "loss": 0.5223, "step": 11204 }, { "epoch": 1.7540701314965559, "grad_norm": 2.8455159664154053, "learning_rate": 1.8867187499999997e-05, "loss": 0.3544, "step": 11205 }, { "epoch": 1.7542266750156543, "grad_norm": 0.8011422157287598, "learning_rate": 1.885529891304348e-05, "loss": 0.2922, "step": 11206 }, { "epoch": 1.7543832185347528, "grad_norm": 0.712848961353302, "learning_rate": 1.8843410326086957e-05, "loss": 0.2877, "step": 11207 }, { "epoch": 1.754539762053851, "grad_norm": 3.8128013610839844, 
"learning_rate": 1.883152173913043e-05, "loss": 0.4826, "step": 11208 }, { "epoch": 1.7546963055729492, "grad_norm": 2.003769636154175, "learning_rate": 1.881963315217391e-05, "loss": 0.5584, "step": 11209 }, { "epoch": 1.7548528490920476, "grad_norm": 1.2419204711914062, "learning_rate": 1.8807744565217388e-05, "loss": 0.4279, "step": 11210 }, { "epoch": 1.7550093926111459, "grad_norm": 7.065700531005859, "learning_rate": 1.879585597826087e-05, "loss": 0.4255, "step": 11211 }, { "epoch": 1.7551659361302443, "grad_norm": 1.741346001625061, "learning_rate": 1.8783967391304347e-05, "loss": 0.5743, "step": 11212 }, { "epoch": 1.7553224796493425, "grad_norm": 2.3320093154907227, "learning_rate": 1.8772078804347826e-05, "loss": 0.6069, "step": 11213 }, { "epoch": 1.7554790231684407, "grad_norm": 1.9959325790405273, "learning_rate": 1.87601902173913e-05, "loss": 0.4979, "step": 11214 }, { "epoch": 1.7556355666875392, "grad_norm": 1.917410135269165, "learning_rate": 1.874830163043478e-05, "loss": 0.2701, "step": 11215 }, { "epoch": 1.7557921102066374, "grad_norm": 2.7342307567596436, "learning_rate": 1.873641304347826e-05, "loss": 0.3785, "step": 11216 }, { "epoch": 1.7559486537257358, "grad_norm": 8.04577350616455, "learning_rate": 1.8724524456521738e-05, "loss": 0.8256, "step": 11217 }, { "epoch": 1.756105197244834, "grad_norm": 1.7675105333328247, "learning_rate": 1.8712635869565216e-05, "loss": 0.3894, "step": 11218 }, { "epoch": 1.7562617407639323, "grad_norm": 5.6073174476623535, "learning_rate": 1.8700747282608695e-05, "loss": 0.6763, "step": 11219 }, { "epoch": 1.7564182842830307, "grad_norm": 1.7493900060653687, "learning_rate": 1.868885869565217e-05, "loss": 0.6098, "step": 11220 }, { "epoch": 1.7565748278021291, "grad_norm": 3.1888558864593506, "learning_rate": 1.867697010869565e-05, "loss": 0.7333, "step": 11221 }, { "epoch": 1.7567313713212274, "grad_norm": 1.9699641466140747, "learning_rate": 1.866508152173913e-05, "loss": 0.7286, "step": 11222 }, { "epoch": 
1.7568879148403256, "grad_norm": 3.9530234336853027, "learning_rate": 1.8653192934782607e-05, "loss": 1.0222, "step": 11223 }, { "epoch": 1.7570444583594238, "grad_norm": 3.864988327026367, "learning_rate": 1.8641304347826085e-05, "loss": 0.6593, "step": 11224 }, { "epoch": 1.7572010018785222, "grad_norm": 4.030975341796875, "learning_rate": 1.8629415760869567e-05, "loss": 0.613, "step": 11225 }, { "epoch": 1.7573575453976207, "grad_norm": 2.2711424827575684, "learning_rate": 1.861752717391304e-05, "loss": 0.3379, "step": 11226 }, { "epoch": 1.757514088916719, "grad_norm": 2.848745822906494, "learning_rate": 1.860563858695652e-05, "loss": 0.9365, "step": 11227 }, { "epoch": 1.7576706324358171, "grad_norm": 3.2838056087493896, "learning_rate": 1.8593749999999998e-05, "loss": 0.9721, "step": 11228 }, { "epoch": 1.7578271759549153, "grad_norm": 10.494851112365723, "learning_rate": 1.8581861413043476e-05, "loss": 1.5103, "step": 11229 }, { "epoch": 1.7579837194740138, "grad_norm": 3.444397449493408, "learning_rate": 1.8569972826086958e-05, "loss": 1.4009, "step": 11230 }, { "epoch": 1.7581402629931122, "grad_norm": 3.5486323833465576, "learning_rate": 1.8558084239130432e-05, "loss": 1.2307, "step": 11231 }, { "epoch": 1.7582968065122104, "grad_norm": 2.9229483604431152, "learning_rate": 1.854619565217391e-05, "loss": 0.6332, "step": 11232 }, { "epoch": 1.7584533500313086, "grad_norm": 5.921905040740967, "learning_rate": 1.853430706521739e-05, "loss": 0.7968, "step": 11233 }, { "epoch": 1.7586098935504069, "grad_norm": 1.954360842704773, "learning_rate": 1.8522418478260867e-05, "loss": 0.3039, "step": 11234 }, { "epoch": 1.7587664370695053, "grad_norm": 3.823145627975464, "learning_rate": 1.851052989130435e-05, "loss": 0.6258, "step": 11235 }, { "epoch": 1.7589229805886037, "grad_norm": 2.4499051570892334, "learning_rate": 1.8498641304347827e-05, "loss": 0.8284, "step": 11236 }, { "epoch": 1.759079524107702, "grad_norm": 4.8667073249816895, "learning_rate": 
1.84867527173913e-05, "loss": 1.1313, "step": 11237 }, { "epoch": 1.7592360676268002, "grad_norm": 4.8766703605651855, "learning_rate": 1.847486413043478e-05, "loss": 1.6607, "step": 11238 }, { "epoch": 1.7593926111458984, "grad_norm": 0.6056260466575623, "learning_rate": 1.8462975543478258e-05, "loss": 0.2356, "step": 11239 }, { "epoch": 1.7595491546649968, "grad_norm": 0.8070452213287354, "learning_rate": 1.845108695652174e-05, "loss": 0.2361, "step": 11240 }, { "epoch": 1.7597056981840953, "grad_norm": 0.6590806245803833, "learning_rate": 1.8439198369565217e-05, "loss": 0.2238, "step": 11241 }, { "epoch": 1.7598622417031935, "grad_norm": 4.109917163848877, "learning_rate": 1.8427309782608696e-05, "loss": 0.5865, "step": 11242 }, { "epoch": 1.7600187852222917, "grad_norm": 1.2736618518829346, "learning_rate": 1.841542119565217e-05, "loss": 0.3227, "step": 11243 }, { "epoch": 1.7601753287413902, "grad_norm": 0.8944978713989258, "learning_rate": 1.840353260869565e-05, "loss": 0.3384, "step": 11244 }, { "epoch": 1.7603318722604884, "grad_norm": 0.6786588430404663, "learning_rate": 1.839164402173913e-05, "loss": 0.2848, "step": 11245 }, { "epoch": 1.7604884157795868, "grad_norm": 0.994778573513031, "learning_rate": 1.8379755434782608e-05, "loss": 0.2957, "step": 11246 }, { "epoch": 1.760644959298685, "grad_norm": 1.7182221412658691, "learning_rate": 1.8367866847826086e-05, "loss": 0.2632, "step": 11247 }, { "epoch": 1.7608015028177832, "grad_norm": 1.034631371498108, "learning_rate": 1.8355978260869564e-05, "loss": 0.2644, "step": 11248 }, { "epoch": 1.7609580463368817, "grad_norm": 0.7800166606903076, "learning_rate": 1.834408967391304e-05, "loss": 0.2911, "step": 11249 }, { "epoch": 1.7611145898559801, "grad_norm": 3.5346617698669434, "learning_rate": 1.833220108695652e-05, "loss": 0.4009, "step": 11250 }, { "epoch": 1.7612711333750783, "grad_norm": 1.8153001070022583, "learning_rate": 1.83203125e-05, "loss": 0.2564, "step": 11251 }, { "epoch": 1.7614276768941766, 
"grad_norm": 1.1200278997421265, "learning_rate": 1.8308423913043477e-05, "loss": 0.38, "step": 11252 }, { "epoch": 1.7615842204132748, "grad_norm": 1.2438985109329224, "learning_rate": 1.8296535326086955e-05, "loss": 0.4221, "step": 11253 }, { "epoch": 1.7617407639323732, "grad_norm": 1.4060332775115967, "learning_rate": 1.828464673913043e-05, "loss": 0.2981, "step": 11254 }, { "epoch": 1.7618973074514717, "grad_norm": 1.243806004524231, "learning_rate": 1.827275815217391e-05, "loss": 0.4316, "step": 11255 }, { "epoch": 1.7620538509705699, "grad_norm": 7.1107282638549805, "learning_rate": 1.826086956521739e-05, "loss": 1.8341, "step": 11256 }, { "epoch": 1.762210394489668, "grad_norm": 1.2352017164230347, "learning_rate": 1.8248980978260868e-05, "loss": 0.4702, "step": 11257 }, { "epoch": 1.7623669380087663, "grad_norm": 1.596166729927063, "learning_rate": 1.8237092391304346e-05, "loss": 0.4282, "step": 11258 }, { "epoch": 1.7625234815278648, "grad_norm": 1.287471055984497, "learning_rate": 1.8225203804347828e-05, "loss": 0.4125, "step": 11259 }, { "epoch": 1.7626800250469632, "grad_norm": 1.528381109237671, "learning_rate": 1.8213315217391302e-05, "loss": 0.3968, "step": 11260 }, { "epoch": 1.7628365685660614, "grad_norm": 6.7351202964782715, "learning_rate": 1.820142663043478e-05, "loss": 0.5423, "step": 11261 }, { "epoch": 1.7629931120851596, "grad_norm": 1.7201005220413208, "learning_rate": 1.818953804347826e-05, "loss": 0.4111, "step": 11262 }, { "epoch": 1.7631496556042578, "grad_norm": 3.043557643890381, "learning_rate": 1.8177649456521737e-05, "loss": 0.7791, "step": 11263 }, { "epoch": 1.7633061991233563, "grad_norm": 2.1645119190216064, "learning_rate": 1.816576086956522e-05, "loss": 0.6218, "step": 11264 }, { "epoch": 1.7634627426424547, "grad_norm": 2.2830095291137695, "learning_rate": 1.8153872282608696e-05, "loss": 0.4415, "step": 11265 }, { "epoch": 1.763619286161553, "grad_norm": 2.634500741958618, "learning_rate": 1.814198369565217e-05, "loss": 
0.459, "step": 11266 }, { "epoch": 1.7637758296806512, "grad_norm": 1.8987860679626465, "learning_rate": 1.813009510869565e-05, "loss": 0.5967, "step": 11267 }, { "epoch": 1.7639323731997494, "grad_norm": 1.6392006874084473, "learning_rate": 1.8118206521739128e-05, "loss": 0.4801, "step": 11268 }, { "epoch": 1.7640889167188478, "grad_norm": 2.498929500579834, "learning_rate": 1.810631793478261e-05, "loss": 0.7486, "step": 11269 }, { "epoch": 1.7642454602379463, "grad_norm": 3.95797061920166, "learning_rate": 1.8094429347826087e-05, "loss": 0.5866, "step": 11270 }, { "epoch": 1.7644020037570445, "grad_norm": 3.6871840953826904, "learning_rate": 1.8082540760869565e-05, "loss": 0.7484, "step": 11271 }, { "epoch": 1.7645585472761427, "grad_norm": 4.990233421325684, "learning_rate": 1.807065217391304e-05, "loss": 0.9487, "step": 11272 }, { "epoch": 1.764715090795241, "grad_norm": 2.2868871688842773, "learning_rate": 1.805876358695652e-05, "loss": 0.5511, "step": 11273 }, { "epoch": 1.7648716343143394, "grad_norm": 15.600385665893555, "learning_rate": 1.8046875e-05, "loss": 1.0955, "step": 11274 }, { "epoch": 1.7650281778334378, "grad_norm": 4.629461288452148, "learning_rate": 1.8034986413043478e-05, "loss": 0.8655, "step": 11275 }, { "epoch": 1.765184721352536, "grad_norm": 5.301481246948242, "learning_rate": 1.8023097826086956e-05, "loss": 1.0201, "step": 11276 }, { "epoch": 1.7653412648716342, "grad_norm": 2.623561143875122, "learning_rate": 1.801120923913043e-05, "loss": 0.7139, "step": 11277 }, { "epoch": 1.7654978083907327, "grad_norm": 3.0665316581726074, "learning_rate": 1.799932065217391e-05, "loss": 1.0579, "step": 11278 }, { "epoch": 1.7656543519098309, "grad_norm": 3.630793571472168, "learning_rate": 1.798743206521739e-05, "loss": 0.937, "step": 11279 }, { "epoch": 1.7658108954289293, "grad_norm": 3.0544815063476562, "learning_rate": 1.797554347826087e-05, "loss": 1.1832, "step": 11280 }, { "epoch": 1.7659674389480275, "grad_norm": 3.066910743713379, 
"learning_rate": 1.7963654891304347e-05, "loss": 0.9357, "step": 11281 }, { "epoch": 1.7661239824671258, "grad_norm": 4.3148112297058105, "learning_rate": 1.7951766304347825e-05, "loss": 1.2952, "step": 11282 }, { "epoch": 1.7662805259862242, "grad_norm": 2.7109322547912598, "learning_rate": 1.79398777173913e-05, "loss": 1.0719, "step": 11283 }, { "epoch": 1.7664370695053226, "grad_norm": 6.245396137237549, "learning_rate": 1.792798913043478e-05, "loss": 1.2701, "step": 11284 }, { "epoch": 1.7665936130244209, "grad_norm": 2.285378932952881, "learning_rate": 1.791610054347826e-05, "loss": 0.7081, "step": 11285 }, { "epoch": 1.766750156543519, "grad_norm": 1.78818678855896, "learning_rate": 1.7904211956521738e-05, "loss": 0.5093, "step": 11286 }, { "epoch": 1.7669067000626173, "grad_norm": 2.516057014465332, "learning_rate": 1.7892323369565216e-05, "loss": 0.7614, "step": 11287 }, { "epoch": 1.7670632435817157, "grad_norm": 3.569835662841797, "learning_rate": 1.7880434782608694e-05, "loss": 0.7821, "step": 11288 }, { "epoch": 1.7672197871008142, "grad_norm": 0.896522581577301, "learning_rate": 1.7868546195652172e-05, "loss": 0.3083, "step": 11289 }, { "epoch": 1.7673763306199124, "grad_norm": 3.2932307720184326, "learning_rate": 1.785665760869565e-05, "loss": 0.2721, "step": 11290 }, { "epoch": 1.7675328741390106, "grad_norm": 0.8874508738517761, "learning_rate": 1.784476902173913e-05, "loss": 0.2564, "step": 11291 }, { "epoch": 1.7676894176581088, "grad_norm": 0.6055817008018494, "learning_rate": 1.7832880434782607e-05, "loss": 0.2402, "step": 11292 }, { "epoch": 1.7678459611772073, "grad_norm": 0.7097737789154053, "learning_rate": 1.7820991847826088e-05, "loss": 0.3721, "step": 11293 }, { "epoch": 1.7680025046963057, "grad_norm": 0.6264057755470276, "learning_rate": 1.7809103260869566e-05, "loss": 0.2537, "step": 11294 }, { "epoch": 1.768159048215404, "grad_norm": 1.1319694519042969, "learning_rate": 1.779721467391304e-05, "loss": 0.3442, "step": 11295 }, { 
"epoch": 1.7683155917345021, "grad_norm": 1.2115683555603027, "learning_rate": 1.778532608695652e-05, "loss": 0.4576, "step": 11296 }, { "epoch": 1.7684721352536004, "grad_norm": 1.323129415512085, "learning_rate": 1.7773437499999997e-05, "loss": 0.3437, "step": 11297 }, { "epoch": 1.7686286787726988, "grad_norm": 1.095009684562683, "learning_rate": 1.776154891304348e-05, "loss": 0.3785, "step": 11298 }, { "epoch": 1.7687852222917972, "grad_norm": 1.2689013481140137, "learning_rate": 1.7749660326086957e-05, "loss": 0.3278, "step": 11299 }, { "epoch": 1.7689417658108955, "grad_norm": 2.1096243858337402, "learning_rate": 1.7737771739130432e-05, "loss": 0.3109, "step": 11300 }, { "epoch": 1.7690983093299937, "grad_norm": 3.5724730491638184, "learning_rate": 1.772588315217391e-05, "loss": 0.5879, "step": 11301 }, { "epoch": 1.769254852849092, "grad_norm": 1.2708995342254639, "learning_rate": 1.7713994565217388e-05, "loss": 0.3887, "step": 11302 }, { "epoch": 1.7694113963681903, "grad_norm": 1.8379684686660767, "learning_rate": 1.770210597826087e-05, "loss": 0.3846, "step": 11303 }, { "epoch": 1.7695679398872888, "grad_norm": 1.3416743278503418, "learning_rate": 1.7690217391304348e-05, "loss": 0.3259, "step": 11304 }, { "epoch": 1.769724483406387, "grad_norm": 2.7161707878112793, "learning_rate": 1.7678328804347826e-05, "loss": 0.3774, "step": 11305 }, { "epoch": 1.7698810269254852, "grad_norm": 1.6148937940597534, "learning_rate": 1.76664402173913e-05, "loss": 0.8264, "step": 11306 }, { "epoch": 1.7700375704445834, "grad_norm": 1.8048583269119263, "learning_rate": 1.765455163043478e-05, "loss": 0.5246, "step": 11307 }, { "epoch": 1.7701941139636819, "grad_norm": 5.6233229637146, "learning_rate": 1.764266304347826e-05, "loss": 0.6791, "step": 11308 }, { "epoch": 1.7703506574827803, "grad_norm": 1.6927956342697144, "learning_rate": 1.763077445652174e-05, "loss": 0.4469, "step": 11309 }, { "epoch": 1.7705072010018785, "grad_norm": 2.3963711261749268, "learning_rate": 
1.7618885869565217e-05, "loss": 0.592, "step": 11310 }, { "epoch": 1.7706637445209767, "grad_norm": 1.9818974733352661, "learning_rate": 1.7606997282608695e-05, "loss": 0.344, "step": 11311 }, { "epoch": 1.7708202880400752, "grad_norm": 2.5028069019317627, "learning_rate": 1.759510869565217e-05, "loss": 0.5251, "step": 11312 }, { "epoch": 1.7709768315591734, "grad_norm": 1.9453977346420288, "learning_rate": 1.758322010869565e-05, "loss": 0.3891, "step": 11313 }, { "epoch": 1.7711333750782718, "grad_norm": 2.4652061462402344, "learning_rate": 1.757133152173913e-05, "loss": 0.6354, "step": 11314 }, { "epoch": 1.77128991859737, "grad_norm": 5.3793253898620605, "learning_rate": 1.7559442934782608e-05, "loss": 0.5013, "step": 11315 }, { "epoch": 1.7714464621164683, "grad_norm": 3.4663681983947754, "learning_rate": 1.7547554347826086e-05, "loss": 0.6329, "step": 11316 }, { "epoch": 1.7716030056355667, "grad_norm": 2.143653154373169, "learning_rate": 1.7535665760869564e-05, "loss": 0.5624, "step": 11317 }, { "epoch": 1.7717595491546652, "grad_norm": 1.9073190689086914, "learning_rate": 1.7523777173913042e-05, "loss": 0.5824, "step": 11318 }, { "epoch": 1.7719160926737634, "grad_norm": 2.3787996768951416, "learning_rate": 1.751188858695652e-05, "loss": 0.5227, "step": 11319 }, { "epoch": 1.7720726361928616, "grad_norm": 4.829401016235352, "learning_rate": 1.75e-05, "loss": 0.5014, "step": 11320 }, { "epoch": 1.7722291797119598, "grad_norm": 1.970898151397705, "learning_rate": 1.7488111413043477e-05, "loss": 0.5884, "step": 11321 }, { "epoch": 1.7723857232310583, "grad_norm": 1.9393190145492554, "learning_rate": 1.7476222826086955e-05, "loss": 0.5536, "step": 11322 }, { "epoch": 1.7725422667501567, "grad_norm": 2.9396798610687256, "learning_rate": 1.7464334239130433e-05, "loss": 0.8501, "step": 11323 }, { "epoch": 1.772698810269255, "grad_norm": 12.32392692565918, "learning_rate": 1.745244565217391e-05, "loss": 1.0359, "step": 11324 }, { "epoch": 1.7728553537883531, 
"grad_norm": 5.769763946533203, "learning_rate": 1.744055706521739e-05, "loss": 0.7098, "step": 11325 }, { "epoch": 1.7730118973074513, "grad_norm": 6.4470391273498535, "learning_rate": 1.7428668478260867e-05, "loss": 0.8119, "step": 11326 }, { "epoch": 1.7731684408265498, "grad_norm": 2.9760279655456543, "learning_rate": 1.7416779891304346e-05, "loss": 0.6549, "step": 11327 }, { "epoch": 1.7733249843456482, "grad_norm": 4.919402599334717, "learning_rate": 1.7404891304347824e-05, "loss": 0.4948, "step": 11328 }, { "epoch": 1.7734815278647464, "grad_norm": 4.23431396484375, "learning_rate": 1.7393002717391302e-05, "loss": 0.9521, "step": 11329 }, { "epoch": 1.7736380713838447, "grad_norm": 2.6489834785461426, "learning_rate": 1.7381114130434783e-05, "loss": 0.408, "step": 11330 }, { "epoch": 1.7737946149029429, "grad_norm": 2.4848203659057617, "learning_rate": 1.7369225543478258e-05, "loss": 0.9634, "step": 11331 }, { "epoch": 1.7739511584220413, "grad_norm": 3.1066365242004395, "learning_rate": 1.7357336956521736e-05, "loss": 1.0333, "step": 11332 }, { "epoch": 1.7741077019411398, "grad_norm": 2.6274096965789795, "learning_rate": 1.7345448369565214e-05, "loss": 1.4014, "step": 11333 }, { "epoch": 1.774264245460238, "grad_norm": 4.432891368865967, "learning_rate": 1.7333559782608693e-05, "loss": 0.7818, "step": 11334 }, { "epoch": 1.7744207889793362, "grad_norm": 2.37404465675354, "learning_rate": 1.7321671195652174e-05, "loss": 0.4335, "step": 11335 }, { "epoch": 1.7745773324984344, "grad_norm": 3.0328588485717773, "learning_rate": 1.730978260869565e-05, "loss": 0.6621, "step": 11336 }, { "epoch": 1.7747338760175329, "grad_norm": 5.087409019470215, "learning_rate": 1.7297894021739127e-05, "loss": 1.4173, "step": 11337 }, { "epoch": 1.7748904195366313, "grad_norm": 1.5147455930709839, "learning_rate": 1.728600543478261e-05, "loss": 0.8799, "step": 11338 }, { "epoch": 1.7750469630557295, "grad_norm": 0.5886321663856506, "learning_rate": 1.7274116847826083e-05, 
"loss": 0.3513, "step": 11339 }, { "epoch": 1.7752035065748277, "grad_norm": 1.8824142217636108, "learning_rate": 1.7262228260869565e-05, "loss": 0.2785, "step": 11340 }, { "epoch": 1.775360050093926, "grad_norm": 0.6672768592834473, "learning_rate": 1.7250339673913043e-05, "loss": 0.2958, "step": 11341 }, { "epoch": 1.7755165936130244, "grad_norm": 1.2923107147216797, "learning_rate": 1.7238451086956518e-05, "loss": 0.2564, "step": 11342 }, { "epoch": 1.7756731371321228, "grad_norm": 1.3532124757766724, "learning_rate": 1.72265625e-05, "loss": 0.4532, "step": 11343 }, { "epoch": 1.775829680651221, "grad_norm": 1.2327845096588135, "learning_rate": 1.7214673913043478e-05, "loss": 0.4608, "step": 11344 }, { "epoch": 1.7759862241703193, "grad_norm": 1.2150251865386963, "learning_rate": 1.7202785326086956e-05, "loss": 0.3902, "step": 11345 }, { "epoch": 1.7761427676894177, "grad_norm": 0.8737384676933289, "learning_rate": 1.7190896739130434e-05, "loss": 0.3566, "step": 11346 }, { "epoch": 1.776299311208516, "grad_norm": 0.8099036812782288, "learning_rate": 1.7179008152173912e-05, "loss": 0.3205, "step": 11347 }, { "epoch": 1.7764558547276144, "grad_norm": 1.4028183221817017, "learning_rate": 1.716711956521739e-05, "loss": 0.32, "step": 11348 }, { "epoch": 1.7766123982467126, "grad_norm": 1.2058393955230713, "learning_rate": 1.715523097826087e-05, "loss": 0.4439, "step": 11349 }, { "epoch": 1.7767689417658108, "grad_norm": 2.4249942302703857, "learning_rate": 1.7143342391304347e-05, "loss": 0.4391, "step": 11350 }, { "epoch": 1.7769254852849092, "grad_norm": 1.426642894744873, "learning_rate": 1.7131453804347825e-05, "loss": 0.3362, "step": 11351 }, { "epoch": 1.7770820288040077, "grad_norm": 1.5221081972122192, "learning_rate": 1.7119565217391303e-05, "loss": 0.398, "step": 11352 }, { "epoch": 1.777238572323106, "grad_norm": 1.674475908279419, "learning_rate": 1.710767663043478e-05, "loss": 0.292, "step": 11353 }, { "epoch": 1.777395115842204, "grad_norm": 
1.9652501344680786, "learning_rate": 1.709578804347826e-05, "loss": 0.4421, "step": 11354 }, { "epoch": 1.7775516593613023, "grad_norm": 6.1566643714904785, "learning_rate": 1.7083899456521737e-05, "loss": 0.8237, "step": 11355 }, { "epoch": 1.7777082028804008, "grad_norm": 1.369353175163269, "learning_rate": 1.7072010869565215e-05, "loss": 0.5698, "step": 11356 }, { "epoch": 1.7778647463994992, "grad_norm": 1.4722843170166016, "learning_rate": 1.7060122282608694e-05, "loss": 0.5135, "step": 11357 }, { "epoch": 1.7780212899185974, "grad_norm": 2.6609232425689697, "learning_rate": 1.7048233695652172e-05, "loss": 0.4038, "step": 11358 }, { "epoch": 1.7781778334376956, "grad_norm": 3.1817972660064697, "learning_rate": 1.703634510869565e-05, "loss": 0.488, "step": 11359 }, { "epoch": 1.7783343769567939, "grad_norm": 3.2609636783599854, "learning_rate": 1.7024456521739128e-05, "loss": 0.6124, "step": 11360 }, { "epoch": 1.7784909204758923, "grad_norm": 3.509288787841797, "learning_rate": 1.7012567934782606e-05, "loss": 0.644, "step": 11361 }, { "epoch": 1.7786474639949907, "grad_norm": 7.794014930725098, "learning_rate": 1.7000679347826084e-05, "loss": 0.61, "step": 11362 }, { "epoch": 1.778804007514089, "grad_norm": 0.9967325925827026, "learning_rate": 1.6988790760869563e-05, "loss": 0.2736, "step": 11363 }, { "epoch": 1.7789605510331872, "grad_norm": 3.597637891769409, "learning_rate": 1.6976902173913044e-05, "loss": 0.6792, "step": 11364 }, { "epoch": 1.7791170945522854, "grad_norm": 2.970341205596924, "learning_rate": 1.696501358695652e-05, "loss": 0.6543, "step": 11365 }, { "epoch": 1.7792736380713838, "grad_norm": 2.1036324501037598, "learning_rate": 1.6953124999999997e-05, "loss": 0.6522, "step": 11366 }, { "epoch": 1.7794301815904823, "grad_norm": 2.700850248336792, "learning_rate": 1.694123641304348e-05, "loss": 0.4807, "step": 11367 }, { "epoch": 1.7795867251095805, "grad_norm": 2.570119619369507, "learning_rate": 1.6929347826086953e-05, "loss": 0.3957, 
"step": 11368 }, { "epoch": 1.7797432686286787, "grad_norm": 4.361047267913818, "learning_rate": 1.6917459239130435e-05, "loss": 0.4654, "step": 11369 }, { "epoch": 1.779899812147777, "grad_norm": 3.0775258541107178, "learning_rate": 1.6905570652173913e-05, "loss": 0.5222, "step": 11370 }, { "epoch": 1.7800563556668754, "grad_norm": 2.8095953464508057, "learning_rate": 1.6893682065217388e-05, "loss": 0.8551, "step": 11371 }, { "epoch": 1.7802128991859738, "grad_norm": 3.321164846420288, "learning_rate": 1.688179347826087e-05, "loss": 0.5157, "step": 11372 }, { "epoch": 1.780369442705072, "grad_norm": 4.6615142822265625, "learning_rate": 1.6869904891304348e-05, "loss": 0.9162, "step": 11373 }, { "epoch": 1.7805259862241702, "grad_norm": 8.96009635925293, "learning_rate": 1.6858016304347826e-05, "loss": 0.85, "step": 11374 }, { "epoch": 1.7806825297432687, "grad_norm": 4.555318832397461, "learning_rate": 1.6846127717391304e-05, "loss": 1.3187, "step": 11375 }, { "epoch": 1.780839073262367, "grad_norm": 3.572406530380249, "learning_rate": 1.6834239130434782e-05, "loss": 0.8794, "step": 11376 }, { "epoch": 1.7809956167814653, "grad_norm": 3.4434661865234375, "learning_rate": 1.682235054347826e-05, "loss": 1.0321, "step": 11377 }, { "epoch": 1.7811521603005636, "grad_norm": 6.441258907318115, "learning_rate": 1.6810461956521738e-05, "loss": 1.2833, "step": 11378 }, { "epoch": 1.7813087038196618, "grad_norm": 12.40341567993164, "learning_rate": 1.6798573369565216e-05, "loss": 0.8726, "step": 11379 }, { "epoch": 1.7814652473387602, "grad_norm": 3.633530378341675, "learning_rate": 1.6786684782608695e-05, "loss": 1.2184, "step": 11380 }, { "epoch": 1.7816217908578584, "grad_norm": 3.5694260597229004, "learning_rate": 1.6774796195652173e-05, "loss": 1.3503, "step": 11381 }, { "epoch": 1.7817783343769569, "grad_norm": 3.0449893474578857, "learning_rate": 1.676290760869565e-05, "loss": 1.238, "step": 11382 }, { "epoch": 1.781934877896055, "grad_norm": 4.25550651550293, 
"learning_rate": 1.675101902173913e-05, "loss": 0.8688, "step": 11383 }, { "epoch": 1.7820914214151533, "grad_norm": 3.607276678085327, "learning_rate": 1.6739130434782607e-05, "loss": 0.2537, "step": 11384 }, { "epoch": 1.7822479649342517, "grad_norm": 3.1047072410583496, "learning_rate": 1.6727241847826085e-05, "loss": 0.4519, "step": 11385 }, { "epoch": 1.7824045084533502, "grad_norm": 4.58246374130249, "learning_rate": 1.6715353260869564e-05, "loss": 0.784, "step": 11386 }, { "epoch": 1.7825610519724484, "grad_norm": 2.5786168575286865, "learning_rate": 1.6703464673913042e-05, "loss": 0.673, "step": 11387 }, { "epoch": 1.7827175954915466, "grad_norm": 2.4314985275268555, "learning_rate": 1.669157608695652e-05, "loss": 0.7516, "step": 11388 }, { "epoch": 1.7828741390106448, "grad_norm": 0.6186507344245911, "learning_rate": 1.6679687499999998e-05, "loss": 0.3032, "step": 11389 }, { "epoch": 1.7830306825297433, "grad_norm": 0.5592915415763855, "learning_rate": 1.6667798913043476e-05, "loss": 0.3081, "step": 11390 }, { "epoch": 1.7831872260488417, "grad_norm": 0.5584611892700195, "learning_rate": 1.6655910326086954e-05, "loss": 0.3281, "step": 11391 }, { "epoch": 1.78334376956794, "grad_norm": 0.708814263343811, "learning_rate": 1.6644021739130432e-05, "loss": 0.2863, "step": 11392 }, { "epoch": 1.7835003130870382, "grad_norm": 1.714224934577942, "learning_rate": 1.6632133152173914e-05, "loss": 0.3517, "step": 11393 }, { "epoch": 1.7836568566061364, "grad_norm": 0.6835424304008484, "learning_rate": 1.662024456521739e-05, "loss": 0.3275, "step": 11394 }, { "epoch": 1.7838134001252348, "grad_norm": 1.6334304809570312, "learning_rate": 1.6608355978260867e-05, "loss": 0.3662, "step": 11395 }, { "epoch": 1.7839699436443333, "grad_norm": 0.7419809699058533, "learning_rate": 1.659646739130435e-05, "loss": 0.3645, "step": 11396 }, { "epoch": 1.7841264871634315, "grad_norm": 1.3952815532684326, "learning_rate": 1.6584578804347823e-05, "loss": 0.3105, "step": 11397 }, { 
"epoch": 1.7842830306825297, "grad_norm": 1.9412468671798706, "learning_rate": 1.6572690217391305e-05, "loss": 0.4557, "step": 11398 }, { "epoch": 1.784439574201628, "grad_norm": 1.0394041538238525, "learning_rate": 1.6560801630434783e-05, "loss": 0.3808, "step": 11399 }, { "epoch": 1.7845961177207263, "grad_norm": 6.29727840423584, "learning_rate": 1.6548913043478258e-05, "loss": 0.4526, "step": 11400 }, { "epoch": 1.7847526612398248, "grad_norm": 2.1701836585998535, "learning_rate": 1.653702445652174e-05, "loss": 0.3563, "step": 11401 }, { "epoch": 1.784909204758923, "grad_norm": 1.8913041353225708, "learning_rate": 1.6525135869565214e-05, "loss": 0.4322, "step": 11402 }, { "epoch": 1.7850657482780212, "grad_norm": 4.3739728927612305, "learning_rate": 1.6513247282608696e-05, "loss": 0.4363, "step": 11403 }, { "epoch": 1.7852222917971194, "grad_norm": 1.4734845161437988, "learning_rate": 1.6501358695652174e-05, "loss": 0.3773, "step": 11404 }, { "epoch": 1.7853788353162179, "grad_norm": 5.079473972320557, "learning_rate": 1.648947010869565e-05, "loss": 0.3874, "step": 11405 }, { "epoch": 1.7855353788353163, "grad_norm": 3.5311474800109863, "learning_rate": 1.647758152173913e-05, "loss": 0.4747, "step": 11406 }, { "epoch": 1.7856919223544145, "grad_norm": 2.1908745765686035, "learning_rate": 1.6465692934782608e-05, "loss": 0.8274, "step": 11407 }, { "epoch": 1.7858484658735128, "grad_norm": 3.0407676696777344, "learning_rate": 1.6453804347826086e-05, "loss": 0.3156, "step": 11408 }, { "epoch": 1.7860050093926112, "grad_norm": 6.485593795776367, "learning_rate": 1.6441915760869565e-05, "loss": 0.414, "step": 11409 }, { "epoch": 1.7861615529117094, "grad_norm": 2.363544225692749, "learning_rate": 1.6430027173913043e-05, "loss": 0.4307, "step": 11410 }, { "epoch": 1.7863180964308079, "grad_norm": 3.1136202812194824, "learning_rate": 1.641813858695652e-05, "loss": 0.5134, "step": 11411 }, { "epoch": 1.786474639949906, "grad_norm": 2.4016175270080566, "learning_rate": 
1.640625e-05, "loss": 0.7409, "step": 11412 }, { "epoch": 1.7866311834690043, "grad_norm": 2.011056423187256, "learning_rate": 1.6394361413043477e-05, "loss": 0.4716, "step": 11413 }, { "epoch": 1.7867877269881027, "grad_norm": 1.6304811239242554, "learning_rate": 1.6382472826086955e-05, "loss": 0.4118, "step": 11414 }, { "epoch": 1.786944270507201, "grad_norm": 3.4507863521575928, "learning_rate": 1.6370584239130433e-05, "loss": 0.8347, "step": 11415 }, { "epoch": 1.7871008140262994, "grad_norm": 3.81784987449646, "learning_rate": 1.635869565217391e-05, "loss": 0.7486, "step": 11416 }, { "epoch": 1.7872573575453976, "grad_norm": 7.344252109527588, "learning_rate": 1.634680706521739e-05, "loss": 0.7322, "step": 11417 }, { "epoch": 1.7874139010644958, "grad_norm": 4.008902549743652, "learning_rate": 1.6334918478260868e-05, "loss": 0.637, "step": 11418 }, { "epoch": 1.7875704445835943, "grad_norm": 2.1047000885009766, "learning_rate": 1.6323029891304346e-05, "loss": 0.3511, "step": 11419 }, { "epoch": 1.7877269881026927, "grad_norm": 9.726273536682129, "learning_rate": 1.6311141304347824e-05, "loss": 0.5703, "step": 11420 }, { "epoch": 1.787883531621791, "grad_norm": 2.548091411590576, "learning_rate": 1.6299252717391302e-05, "loss": 0.8707, "step": 11421 }, { "epoch": 1.7880400751408891, "grad_norm": 2.388883113861084, "learning_rate": 1.628736413043478e-05, "loss": 0.69, "step": 11422 }, { "epoch": 1.7881966186599874, "grad_norm": 3.313633918762207, "learning_rate": 1.627547554347826e-05, "loss": 0.6767, "step": 11423 }, { "epoch": 1.7883531621790858, "grad_norm": 4.446158409118652, "learning_rate": 1.6263586956521737e-05, "loss": 0.7723, "step": 11424 }, { "epoch": 1.7885097056981842, "grad_norm": 3.0315089225769043, "learning_rate": 1.6251698369565215e-05, "loss": 0.6703, "step": 11425 }, { "epoch": 1.7886662492172825, "grad_norm": 2.648203134536743, "learning_rate": 1.6239809782608693e-05, "loss": 0.7481, "step": 11426 }, { "epoch": 1.7888227927363807, 
"grad_norm": 3.9562647342681885, "learning_rate": 1.622792119565217e-05, "loss": 0.9134, "step": 11427 }, { "epoch": 1.788979336255479, "grad_norm": 3.4605703353881836, "learning_rate": 1.621603260869565e-05, "loss": 1.1013, "step": 11428 }, { "epoch": 1.7891358797745773, "grad_norm": 4.597591400146484, "learning_rate": 1.6204144021739128e-05, "loss": 1.3304, "step": 11429 }, { "epoch": 1.7892924232936758, "grad_norm": 2.80845308303833, "learning_rate": 1.619225543478261e-05, "loss": 1.0751, "step": 11430 }, { "epoch": 1.789448966812774, "grad_norm": 1.9397579431533813, "learning_rate": 1.6180366847826084e-05, "loss": 0.7545, "step": 11431 }, { "epoch": 1.7896055103318722, "grad_norm": 2.6087818145751953, "learning_rate": 1.6168478260869562e-05, "loss": 0.6756, "step": 11432 }, { "epoch": 1.7897620538509704, "grad_norm": 3.605186939239502, "learning_rate": 1.6156589673913044e-05, "loss": 0.6727, "step": 11433 }, { "epoch": 1.7899185973700689, "grad_norm": 2.501699209213257, "learning_rate": 1.614470108695652e-05, "loss": 0.2004, "step": 11434 }, { "epoch": 1.7900751408891673, "grad_norm": 13.99677562713623, "learning_rate": 1.61328125e-05, "loss": 0.7552, "step": 11435 }, { "epoch": 1.7902316844082655, "grad_norm": 4.727803707122803, "learning_rate": 1.6120923913043478e-05, "loss": 0.7211, "step": 11436 }, { "epoch": 1.7903882279273637, "grad_norm": 4.8168864250183105, "learning_rate": 1.6109035326086953e-05, "loss": 1.06, "step": 11437 }, { "epoch": 1.790544771446462, "grad_norm": 1.9007141590118408, "learning_rate": 1.6097146739130434e-05, "loss": 0.6528, "step": 11438 }, { "epoch": 1.7907013149655604, "grad_norm": 1.0465569496154785, "learning_rate": 1.6085258152173913e-05, "loss": 0.4641, "step": 11439 }, { "epoch": 1.7908578584846588, "grad_norm": 0.8564383387565613, "learning_rate": 1.607336956521739e-05, "loss": 0.3007, "step": 11440 }, { "epoch": 1.791014402003757, "grad_norm": 0.6923457980155945, "learning_rate": 1.606148097826087e-05, "loss": 0.3, "step": 
11441 }, { "epoch": 1.7911709455228553, "grad_norm": 0.6333643198013306, "learning_rate": 1.6049592391304347e-05, "loss": 0.2953, "step": 11442 }, { "epoch": 1.7913274890419537, "grad_norm": 0.5421554446220398, "learning_rate": 1.6037703804347825e-05, "loss": 0.2506, "step": 11443 }, { "epoch": 1.791484032561052, "grad_norm": 1.8279391527175903, "learning_rate": 1.6025815217391303e-05, "loss": 0.3805, "step": 11444 }, { "epoch": 1.7916405760801504, "grad_norm": 1.6147462129592896, "learning_rate": 1.601392663043478e-05, "loss": 0.3831, "step": 11445 }, { "epoch": 1.7917971195992486, "grad_norm": 1.2002158164978027, "learning_rate": 1.600203804347826e-05, "loss": 0.4312, "step": 11446 }, { "epoch": 1.7919536631183468, "grad_norm": 1.179093599319458, "learning_rate": 1.5990149456521738e-05, "loss": 0.4268, "step": 11447 }, { "epoch": 1.7921102066374452, "grad_norm": 1.9295125007629395, "learning_rate": 1.5978260869565216e-05, "loss": 0.295, "step": 11448 }, { "epoch": 1.7922667501565435, "grad_norm": 0.9027491211891174, "learning_rate": 1.5966372282608694e-05, "loss": 0.3871, "step": 11449 }, { "epoch": 1.792423293675642, "grad_norm": 1.8549429178237915, "learning_rate": 1.5954483695652172e-05, "loss": 0.3191, "step": 11450 }, { "epoch": 1.7925798371947401, "grad_norm": 1.1193856000900269, "learning_rate": 1.594259510869565e-05, "loss": 0.3954, "step": 11451 }, { "epoch": 1.7927363807138383, "grad_norm": 0.9572113752365112, "learning_rate": 1.593070652173913e-05, "loss": 0.4326, "step": 11452 }, { "epoch": 1.7928929242329368, "grad_norm": 3.7945642471313477, "learning_rate": 1.5918817934782607e-05, "loss": 0.516, "step": 11453 }, { "epoch": 1.7930494677520352, "grad_norm": 2.795273780822754, "learning_rate": 1.5906929347826085e-05, "loss": 0.5867, "step": 11454 }, { "epoch": 1.7932060112711334, "grad_norm": 3.9846508502960205, "learning_rate": 1.5895040760869563e-05, "loss": 0.4319, "step": 11455 }, { "epoch": 1.7933625547902317, "grad_norm": 1.2778617143630981, 
"learning_rate": 1.588315217391304e-05, "loss": 0.3689, "step": 11456 }, { "epoch": 1.7935190983093299, "grad_norm": 1.4320851564407349, "learning_rate": 1.587126358695652e-05, "loss": 0.5012, "step": 11457 }, { "epoch": 1.7936756418284283, "grad_norm": 3.2244153022766113, "learning_rate": 1.5859374999999998e-05, "loss": 0.8284, "step": 11458 }, { "epoch": 1.7938321853475268, "grad_norm": 2.0569026470184326, "learning_rate": 1.584748641304348e-05, "loss": 0.5474, "step": 11459 }, { "epoch": 1.793988728866625, "grad_norm": 1.155800461769104, "learning_rate": 1.5835597826086954e-05, "loss": 0.4451, "step": 11460 }, { "epoch": 1.7941452723857232, "grad_norm": 3.5658118724823, "learning_rate": 1.5823709239130432e-05, "loss": 0.4601, "step": 11461 }, { "epoch": 1.7943018159048214, "grad_norm": 2.890630006790161, "learning_rate": 1.5811820652173914e-05, "loss": 0.5815, "step": 11462 }, { "epoch": 1.7944583594239198, "grad_norm": 2.3349108695983887, "learning_rate": 1.579993206521739e-05, "loss": 0.616, "step": 11463 }, { "epoch": 1.7946149029430183, "grad_norm": 2.4452271461486816, "learning_rate": 1.578804347826087e-05, "loss": 0.5705, "step": 11464 }, { "epoch": 1.7947714464621165, "grad_norm": 6.22318172454834, "learning_rate": 1.5776154891304348e-05, "loss": 0.7121, "step": 11465 }, { "epoch": 1.7949279899812147, "grad_norm": 3.337472677230835, "learning_rate": 1.5764266304347823e-05, "loss": 0.5403, "step": 11466 }, { "epoch": 1.795084533500313, "grad_norm": 2.679748296737671, "learning_rate": 1.5752377717391304e-05, "loss": 0.9027, "step": 11467 }, { "epoch": 1.7952410770194114, "grad_norm": 2.503232002258301, "learning_rate": 1.5740489130434783e-05, "loss": 0.5061, "step": 11468 }, { "epoch": 1.7953976205385098, "grad_norm": 2.412288188934326, "learning_rate": 1.572860054347826e-05, "loss": 0.54, "step": 11469 }, { "epoch": 1.795554164057608, "grad_norm": 3.936673402786255, "learning_rate": 1.571671195652174e-05, "loss": 0.5971, "step": 11470 }, { "epoch": 
1.7957107075767063, "grad_norm": 2.0491783618927, "learning_rate": 1.5704823369565214e-05, "loss": 0.8939, "step": 11471 }, { "epoch": 1.7958672510958045, "grad_norm": 2.24269962310791, "learning_rate": 1.5692934782608695e-05, "loss": 0.4066, "step": 11472 }, { "epoch": 1.796023794614903, "grad_norm": 2.9907517433166504, "learning_rate": 1.5681046195652173e-05, "loss": 0.4909, "step": 11473 }, { "epoch": 1.7961803381340014, "grad_norm": 3.773254871368408, "learning_rate": 1.566915760869565e-05, "loss": 1.0159, "step": 11474 }, { "epoch": 1.7963368816530996, "grad_norm": 2.6915409564971924, "learning_rate": 1.565726902173913e-05, "loss": 0.6951, "step": 11475 }, { "epoch": 1.7964934251721978, "grad_norm": 2.9089393615722656, "learning_rate": 1.5645380434782608e-05, "loss": 0.7806, "step": 11476 }, { "epoch": 1.7966499686912962, "grad_norm": 4.154906749725342, "learning_rate": 1.5633491847826086e-05, "loss": 1.0795, "step": 11477 }, { "epoch": 1.7968065122103944, "grad_norm": 2.033968687057495, "learning_rate": 1.5621603260869564e-05, "loss": 0.6918, "step": 11478 }, { "epoch": 1.7969630557294929, "grad_norm": 4.213752746582031, "learning_rate": 1.5609714673913042e-05, "loss": 1.4185, "step": 11479 }, { "epoch": 1.797119599248591, "grad_norm": 6.594788074493408, "learning_rate": 1.559782608695652e-05, "loss": 1.2918, "step": 11480 }, { "epoch": 1.7972761427676893, "grad_norm": 2.439744472503662, "learning_rate": 1.55859375e-05, "loss": 0.8923, "step": 11481 }, { "epoch": 1.7974326862867878, "grad_norm": 3.434197425842285, "learning_rate": 1.5574048913043477e-05, "loss": 1.4821, "step": 11482 }, { "epoch": 1.7975892298058862, "grad_norm": 4.931283473968506, "learning_rate": 1.5562160326086955e-05, "loss": 1.301, "step": 11483 }, { "epoch": 1.7977457733249844, "grad_norm": 2.4745163917541504, "learning_rate": 1.5550271739130433e-05, "loss": 0.655, "step": 11484 }, { "epoch": 1.7979023168440826, "grad_norm": 1.669519305229187, "learning_rate": 1.553838315217391e-05, 
"loss": 0.3963, "step": 11485 }, { "epoch": 1.7980588603631809, "grad_norm": 1.4170118570327759, "learning_rate": 1.552649456521739e-05, "loss": 0.2721, "step": 11486 }, { "epoch": 1.7982154038822793, "grad_norm": 1.9375007152557373, "learning_rate": 1.5514605978260867e-05, "loss": 0.447, "step": 11487 }, { "epoch": 1.7983719474013777, "grad_norm": 5.847013473510742, "learning_rate": 1.5502717391304346e-05, "loss": 0.8589, "step": 11488 }, { "epoch": 1.798528490920476, "grad_norm": 3.8861327171325684, "learning_rate": 1.5490828804347824e-05, "loss": 0.3426, "step": 11489 }, { "epoch": 1.7986850344395742, "grad_norm": 0.8129429221153259, "learning_rate": 1.5478940217391302e-05, "loss": 0.4923, "step": 11490 }, { "epoch": 1.7988415779586724, "grad_norm": 1.0791816711425781, "learning_rate": 1.5467051630434783e-05, "loss": 0.3791, "step": 11491 }, { "epoch": 1.7989981214777708, "grad_norm": 0.9479283690452576, "learning_rate": 1.5455163043478258e-05, "loss": 0.3637, "step": 11492 }, { "epoch": 1.7991546649968693, "grad_norm": 1.1023740768432617, "learning_rate": 1.544327445652174e-05, "loss": 0.2967, "step": 11493 }, { "epoch": 1.7993112085159675, "grad_norm": 0.6908473968505859, "learning_rate": 1.5431385869565215e-05, "loss": 0.3023, "step": 11494 }, { "epoch": 1.7994677520350657, "grad_norm": 0.8928840756416321, "learning_rate": 1.5419497282608693e-05, "loss": 0.3492, "step": 11495 }, { "epoch": 1.799624295554164, "grad_norm": 2.070225715637207, "learning_rate": 1.5407608695652174e-05, "loss": 0.3816, "step": 11496 }, { "epoch": 1.7997808390732624, "grad_norm": 3.886101484298706, "learning_rate": 1.539572010869565e-05, "loss": 0.5066, "step": 11497 }, { "epoch": 1.7999373825923608, "grad_norm": 2.5631563663482666, "learning_rate": 1.538383152173913e-05, "loss": 0.508, "step": 11498 }, { "epoch": 1.800093926111459, "grad_norm": 1.2707771062850952, "learning_rate": 1.537194293478261e-05, "loss": 0.5244, "step": 11499 }, { "epoch": 1.8002504696305572, "grad_norm": 
1.6968297958374023, "learning_rate": 1.5360054347826083e-05, "loss": 0.6214, "step": 11500 }, { "epoch": 1.8004070131496555, "grad_norm": 3.229480266571045, "learning_rate": 1.5348165760869565e-05, "loss": 0.3571, "step": 11501 }, { "epoch": 1.800563556668754, "grad_norm": 3.290130615234375, "learning_rate": 1.5336277173913043e-05, "loss": 0.3574, "step": 11502 }, { "epoch": 1.8007201001878523, "grad_norm": 3.3783583641052246, "learning_rate": 1.532438858695652e-05, "loss": 0.5088, "step": 11503 }, { "epoch": 1.8008766437069506, "grad_norm": 5.000029563903809, "learning_rate": 1.53125e-05, "loss": 0.3854, "step": 11504 }, { "epoch": 1.8010331872260488, "grad_norm": 3.1020236015319824, "learning_rate": 1.5300611413043478e-05, "loss": 0.5063, "step": 11505 }, { "epoch": 1.801189730745147, "grad_norm": 4.50528621673584, "learning_rate": 1.5288722826086956e-05, "loss": 0.2883, "step": 11506 }, { "epoch": 1.8013462742642454, "grad_norm": 1.551353096961975, "learning_rate": 1.5276834239130434e-05, "loss": 0.4292, "step": 11507 }, { "epoch": 1.8015028177833439, "grad_norm": 1.2391095161437988, "learning_rate": 1.5264945652173912e-05, "loss": 0.5275, "step": 11508 }, { "epoch": 1.801659361302442, "grad_norm": 2.1287803649902344, "learning_rate": 1.525305706521739e-05, "loss": 0.4666, "step": 11509 }, { "epoch": 1.8018159048215403, "grad_norm": 4.427123069763184, "learning_rate": 1.5241168478260868e-05, "loss": 0.4815, "step": 11510 }, { "epoch": 1.8019724483406387, "grad_norm": 3.259688138961792, "learning_rate": 1.5229279891304347e-05, "loss": 0.4906, "step": 11511 }, { "epoch": 1.802128991859737, "grad_norm": 2.492633819580078, "learning_rate": 1.5217391304347825e-05, "loss": 0.6489, "step": 11512 }, { "epoch": 1.8022855353788354, "grad_norm": 1.9147733449935913, "learning_rate": 1.5205502717391303e-05, "loss": 0.7584, "step": 11513 }, { "epoch": 1.8024420788979336, "grad_norm": 2.326695442199707, "learning_rate": 1.5193614130434783e-05, "loss": 0.6135, "step": 11514 }, 
{ "epoch": 1.8025986224170318, "grad_norm": 2.351560115814209, "learning_rate": 1.518172554347826e-05, "loss": 0.5592, "step": 11515 }, { "epoch": 1.8027551659361303, "grad_norm": 5.739931106567383, "learning_rate": 1.5169836956521737e-05, "loss": 0.7496, "step": 11516 }, { "epoch": 1.8029117094552287, "grad_norm": 5.481928825378418, "learning_rate": 1.5157948369565216e-05, "loss": 0.5407, "step": 11517 }, { "epoch": 1.803068252974327, "grad_norm": 4.925919055938721, "learning_rate": 1.5146059782608694e-05, "loss": 0.5466, "step": 11518 }, { "epoch": 1.8032247964934252, "grad_norm": 2.3125240802764893, "learning_rate": 1.5134171195652174e-05, "loss": 0.7008, "step": 11519 }, { "epoch": 1.8033813400125234, "grad_norm": 3.075080633163452, "learning_rate": 1.512228260869565e-05, "loss": 1.0122, "step": 11520 }, { "epoch": 1.8035378835316218, "grad_norm": 3.3945508003234863, "learning_rate": 1.5110394021739128e-05, "loss": 0.8571, "step": 11521 }, { "epoch": 1.8036944270507203, "grad_norm": null, "learning_rate": 1.5110394021739128e-05, "loss": 0.0, "step": 11522 }, { "epoch": 1.8038509705698185, "grad_norm": 5.880252361297607, "learning_rate": 1.5098505434782608e-05, "loss": 0.9378, "step": 11523 }, { "epoch": 1.8040075140889167, "grad_norm": 4.630260944366455, "learning_rate": 1.5086616847826084e-05, "loss": 0.769, "step": 11524 }, { "epoch": 1.804164057608015, "grad_norm": 4.804925441741943, "learning_rate": 1.5074728260869564e-05, "loss": 0.8429, "step": 11525 }, { "epoch": 1.8043206011271133, "grad_norm": 3.2456092834472656, "learning_rate": 1.5062839673913042e-05, "loss": 1.0759, "step": 11526 }, { "epoch": 1.8044771446462118, "grad_norm": 5.225546360015869, "learning_rate": 1.5050951086956519e-05, "loss": 1.291, "step": 11527 }, { "epoch": 1.80463368816531, "grad_norm": 2.365091562271118, "learning_rate": 1.5039062499999999e-05, "loss": 0.759, "step": 11528 }, { "epoch": 1.8047902316844082, "grad_norm": 5.166929244995117, "learning_rate": 1.5027173913043477e-05, 
"loss": 1.1934, "step": 11529 }, { "epoch": 1.8049467752035064, "grad_norm": 3.2344019412994385, "learning_rate": 1.5015285326086955e-05, "loss": 1.1233, "step": 11530 }, { "epoch": 1.8051033187226049, "grad_norm": 6.958945274353027, "learning_rate": 1.5003396739130433e-05, "loss": 1.1321, "step": 11531 }, { "epoch": 1.8052598622417033, "grad_norm": 2.154186248779297, "learning_rate": 1.4991508152173913e-05, "loss": 1.0291, "step": 11532 }, { "epoch": 1.8054164057608015, "grad_norm": 3.533376455307007, "learning_rate": 1.497961956521739e-05, "loss": 0.7436, "step": 11533 }, { "epoch": 1.8055729492798998, "grad_norm": 1.992466926574707, "learning_rate": 1.4967730978260868e-05, "loss": 0.6784, "step": 11534 }, { "epoch": 1.805729492798998, "grad_norm": 4.023629665374756, "learning_rate": 1.4955842391304348e-05, "loss": 0.5576, "step": 11535 }, { "epoch": 1.8058860363180964, "grad_norm": 2.1713736057281494, "learning_rate": 1.4943953804347824e-05, "loss": 0.5278, "step": 11536 }, { "epoch": 1.8060425798371949, "grad_norm": 2.4952237606048584, "learning_rate": 1.4932065217391304e-05, "loss": 0.5617, "step": 11537 }, { "epoch": 1.806199123356293, "grad_norm": 1.40253484249115, "learning_rate": 1.4920176630434782e-05, "loss": 0.8052, "step": 11538 }, { "epoch": 1.8063556668753913, "grad_norm": 1.2611039876937866, "learning_rate": 1.4908288043478259e-05, "loss": 0.4662, "step": 11539 }, { "epoch": 1.8065122103944895, "grad_norm": 0.8030280470848083, "learning_rate": 1.4896399456521738e-05, "loss": 0.417, "step": 11540 }, { "epoch": 1.806668753913588, "grad_norm": 1.0435187816619873, "learning_rate": 1.4884510869565215e-05, "loss": 0.4158, "step": 11541 }, { "epoch": 1.8068252974326864, "grad_norm": 1.038429617881775, "learning_rate": 1.4872622282608695e-05, "loss": 0.461, "step": 11542 }, { "epoch": 1.8069818409517846, "grad_norm": 2.4780592918395996, "learning_rate": 1.4860733695652173e-05, "loss": 0.4675, "step": 11543 }, { "epoch": 1.8071383844708828, "grad_norm": 
1.2772752046585083, "learning_rate": 1.484884510869565e-05, "loss": 0.5174, "step": 11544 }, { "epoch": 1.8072949279899813, "grad_norm": 0.7213178277015686, "learning_rate": 1.4836956521739129e-05, "loss": 0.3825, "step": 11545 }, { "epoch": 1.8074514715090795, "grad_norm": 1.3024121522903442, "learning_rate": 1.4825067934782607e-05, "loss": 0.4152, "step": 11546 }, { "epoch": 1.807608015028178, "grad_norm": 1.4352396726608276, "learning_rate": 1.4813179347826085e-05, "loss": 0.4588, "step": 11547 }, { "epoch": 1.8077645585472761, "grad_norm": 1.1270636320114136, "learning_rate": 1.4801290760869564e-05, "loss": 0.4176, "step": 11548 }, { "epoch": 1.8079211020663744, "grad_norm": 0.8555067777633667, "learning_rate": 1.4789402173913043e-05, "loss": 0.4106, "step": 11549 }, { "epoch": 1.8080776455854728, "grad_norm": 1.4288898706436157, "learning_rate": 1.477751358695652e-05, "loss": 0.5248, "step": 11550 }, { "epoch": 1.8082341891045712, "grad_norm": 1.7652862071990967, "learning_rate": 1.4765624999999998e-05, "loss": 0.5205, "step": 11551 }, { "epoch": 1.8083907326236695, "grad_norm": 2.042452573776245, "learning_rate": 1.4753736413043478e-05, "loss": 0.5681, "step": 11552 }, { "epoch": 1.8085472761427677, "grad_norm": 1.9807960987091064, "learning_rate": 1.4741847826086954e-05, "loss": 0.5268, "step": 11553 }, { "epoch": 1.8087038196618659, "grad_norm": 3.0353925228118896, "learning_rate": 1.4729959239130434e-05, "loss": 0.556, "step": 11554 }, { "epoch": 1.8088603631809643, "grad_norm": 1.516912817955017, "learning_rate": 1.4718070652173912e-05, "loss": 0.5386, "step": 11555 }, { "epoch": 1.8090169067000628, "grad_norm": 3.9036502838134766, "learning_rate": 1.4706182065217389e-05, "loss": 0.4675, "step": 11556 }, { "epoch": 1.809173450219161, "grad_norm": 4.810522556304932, "learning_rate": 1.4694293478260869e-05, "loss": 0.5408, "step": 11557 }, { "epoch": 1.8093299937382592, "grad_norm": 1.9841090440750122, "learning_rate": 1.4682404891304347e-05, "loss": 
0.6276, "step": 11558 }, { "epoch": 1.8094865372573574, "grad_norm": 2.0636041164398193, "learning_rate": 1.4670516304347825e-05, "loss": 0.6685, "step": 11559 }, { "epoch": 1.8096430807764559, "grad_norm": 1.3882534503936768, "learning_rate": 1.4658627717391303e-05, "loss": 0.4928, "step": 11560 }, { "epoch": 1.8097996242955543, "grad_norm": 3.8464691638946533, "learning_rate": 1.4646739130434783e-05, "loss": 0.4421, "step": 11561 }, { "epoch": 1.8099561678146525, "grad_norm": 1.862911581993103, "learning_rate": 1.463485054347826e-05, "loss": 0.5767, "step": 11562 }, { "epoch": 1.8101127113337507, "grad_norm": 2.069758653640747, "learning_rate": 1.4622961956521738e-05, "loss": 0.4718, "step": 11563 }, { "epoch": 1.810269254852849, "grad_norm": 2.835127830505371, "learning_rate": 1.4611073369565216e-05, "loss": 0.691, "step": 11564 }, { "epoch": 1.8104257983719474, "grad_norm": 6.876976490020752, "learning_rate": 1.4599184782608694e-05, "loss": 0.4442, "step": 11565 }, { "epoch": 1.8105823418910458, "grad_norm": 3.9653759002685547, "learning_rate": 1.4587296195652174e-05, "loss": 0.627, "step": 11566 }, { "epoch": 1.810738885410144, "grad_norm": 3.6680614948272705, "learning_rate": 1.457540760869565e-05, "loss": 0.883, "step": 11567 }, { "epoch": 1.8108954289292423, "grad_norm": 4.507615089416504, "learning_rate": 1.4563519021739128e-05, "loss": 1.1512, "step": 11568 }, { "epoch": 1.8110519724483405, "grad_norm": 2.248587131500244, "learning_rate": 1.4551630434782608e-05, "loss": 0.5192, "step": 11569 }, { "epoch": 1.811208515967439, "grad_norm": 3.2906525135040283, "learning_rate": 1.4539741847826085e-05, "loss": 0.6252, "step": 11570 }, { "epoch": 1.8113650594865374, "grad_norm": 6.735452651977539, "learning_rate": 1.4527853260869565e-05, "loss": 0.7121, "step": 11571 }, { "epoch": 1.8115216030056356, "grad_norm": 6.464320182800293, "learning_rate": 1.4515964673913043e-05, "loss": 0.582, "step": 11572 }, { "epoch": 1.8116781465247338, "grad_norm": 
18.49674415588379, "learning_rate": 1.450407608695652e-05, "loss": 0.9791, "step": 11573 }, { "epoch": 1.811834690043832, "grad_norm": 3.563123941421509, "learning_rate": 1.4492187499999999e-05, "loss": 1.2927, "step": 11574 }, { "epoch": 1.8119912335629305, "grad_norm": 9.776533126831055, "learning_rate": 1.4480298913043477e-05, "loss": 0.7699, "step": 11575 }, { "epoch": 1.812147777082029, "grad_norm": 3.9624645709991455, "learning_rate": 1.4468410326086955e-05, "loss": 1.0684, "step": 11576 }, { "epoch": 1.8123043206011271, "grad_norm": 2.0484859943389893, "learning_rate": 1.4456521739130434e-05, "loss": 0.2773, "step": 11577 }, { "epoch": 1.8124608641202253, "grad_norm": 4.831039905548096, "learning_rate": 1.4444633152173913e-05, "loss": 0.7603, "step": 11578 }, { "epoch": 1.8126174076393238, "grad_norm": 6.792863368988037, "learning_rate": 1.443274456521739e-05, "loss": 1.107, "step": 11579 }, { "epoch": 1.812773951158422, "grad_norm": 7.6694440841674805, "learning_rate": 1.4420855978260868e-05, "loss": 0.9403, "step": 11580 }, { "epoch": 1.8129304946775204, "grad_norm": 2.7363121509552, "learning_rate": 1.4408967391304348e-05, "loss": 0.7395, "step": 11581 }, { "epoch": 1.8130870381966186, "grad_norm": 4.994723796844482, "learning_rate": 1.4397078804347824e-05, "loss": 1.6145, "step": 11582 }, { "epoch": 1.8132435817157169, "grad_norm": 4.57357931137085, "learning_rate": 1.4385190217391304e-05, "loss": 0.9747, "step": 11583 }, { "epoch": 1.8134001252348153, "grad_norm": 3.1007907390594482, "learning_rate": 1.4373301630434782e-05, "loss": 1.2984, "step": 11584 }, { "epoch": 1.8135566687539137, "grad_norm": 1.2339773178100586, "learning_rate": 1.4361413043478259e-05, "loss": 0.2013, "step": 11585 }, { "epoch": 1.813713212273012, "grad_norm": 3.640876531600952, "learning_rate": 1.4349524456521739e-05, "loss": 1.0015, "step": 11586 }, { "epoch": 1.8138697557921102, "grad_norm": 3.9746103286743164, "learning_rate": 1.4337635869565215e-05, "loss": 0.5352, "step": 
11587 }, { "epoch": 1.8140262993112084, "grad_norm": 3.135241985321045, "learning_rate": 1.4325747282608695e-05, "loss": 0.6785, "step": 11588 }, { "epoch": 1.8141828428303068, "grad_norm": 0.9610992074012756, "learning_rate": 1.4313858695652173e-05, "loss": 0.5578, "step": 11589 }, { "epoch": 1.8143393863494053, "grad_norm": 1.05708909034729, "learning_rate": 1.430197010869565e-05, "loss": 0.5568, "step": 11590 }, { "epoch": 1.8144959298685035, "grad_norm": 0.7050967812538147, "learning_rate": 1.429008152173913e-05, "loss": 0.4622, "step": 11591 }, { "epoch": 1.8146524733876017, "grad_norm": 0.8031126260757446, "learning_rate": 1.4278192934782608e-05, "loss": 0.446, "step": 11592 }, { "epoch": 1.8148090169067, "grad_norm": 0.7805694937705994, "learning_rate": 1.4266304347826086e-05, "loss": 0.4635, "step": 11593 }, { "epoch": 1.8149655604257984, "grad_norm": 1.0972981452941895, "learning_rate": 1.4254415760869564e-05, "loss": 0.5459, "step": 11594 }, { "epoch": 1.8151221039448968, "grad_norm": 1.316089153289795, "learning_rate": 1.4242527173913042e-05, "loss": 0.5394, "step": 11595 }, { "epoch": 1.815278647463995, "grad_norm": 1.1250461339950562, "learning_rate": 1.423063858695652e-05, "loss": 0.6002, "step": 11596 }, { "epoch": 1.8154351909830932, "grad_norm": 0.9612434506416321, "learning_rate": 1.4218749999999998e-05, "loss": 0.5521, "step": 11597 }, { "epoch": 1.8155917345021915, "grad_norm": 1.118714690208435, "learning_rate": 1.4206861413043478e-05, "loss": 0.4698, "step": 11598 }, { "epoch": 1.81574827802129, "grad_norm": 1.0751357078552246, "learning_rate": 1.4194972826086955e-05, "loss": 0.508, "step": 11599 }, { "epoch": 1.8159048215403883, "grad_norm": 1.2514901161193848, "learning_rate": 1.4183084239130433e-05, "loss": 0.5356, "step": 11600 }, { "epoch": 1.8160613650594866, "grad_norm": 5.002877712249756, "learning_rate": 1.4171195652173913e-05, "loss": 0.5007, "step": 11601 }, { "epoch": 1.8162179085785848, "grad_norm": 1.5809880495071411, 
"learning_rate": 1.4159307065217389e-05, "loss": 0.6951, "step": 11602 }, { "epoch": 1.816374452097683, "grad_norm": 1.1356817483901978, "learning_rate": 1.4147418478260869e-05, "loss": 0.5256, "step": 11603 }, { "epoch": 1.8165309956167814, "grad_norm": 1.7047269344329834, "learning_rate": 1.4135529891304347e-05, "loss": 0.6538, "step": 11604 }, { "epoch": 1.8166875391358799, "grad_norm": 3.118725061416626, "learning_rate": 1.4123641304347825e-05, "loss": 0.6072, "step": 11605 }, { "epoch": 1.816844082654978, "grad_norm": 1.5941252708435059, "learning_rate": 1.4111752717391303e-05, "loss": 0.6441, "step": 11606 }, { "epoch": 1.8170006261740763, "grad_norm": 2.8391330242156982, "learning_rate": 1.4099864130434782e-05, "loss": 0.6657, "step": 11607 }, { "epoch": 1.8171571696931748, "grad_norm": 2.137920379638672, "learning_rate": 1.408797554347826e-05, "loss": 0.6374, "step": 11608 }, { "epoch": 1.817313713212273, "grad_norm": 2.145012378692627, "learning_rate": 1.4076086956521738e-05, "loss": 0.6429, "step": 11609 }, { "epoch": 1.8174702567313714, "grad_norm": 3.135537624359131, "learning_rate": 1.4064198369565216e-05, "loss": 0.545, "step": 11610 }, { "epoch": 1.8176268002504696, "grad_norm": 2.763195037841797, "learning_rate": 1.4052309782608694e-05, "loss": 0.5824, "step": 11611 }, { "epoch": 1.8177833437695678, "grad_norm": 2.606846570968628, "learning_rate": 1.4040421195652172e-05, "loss": 0.5543, "step": 11612 }, { "epoch": 1.8179398872886663, "grad_norm": 2.5494372844696045, "learning_rate": 1.402853260869565e-05, "loss": 0.6699, "step": 11613 }, { "epoch": 1.8180964308077645, "grad_norm": 4.792669773101807, "learning_rate": 1.4016644021739129e-05, "loss": 0.5199, "step": 11614 }, { "epoch": 1.818252974326863, "grad_norm": 8.276554107666016, "learning_rate": 1.4004755434782609e-05, "loss": 1.381, "step": 11615 }, { "epoch": 1.8184095178459612, "grad_norm": 4.396731853485107, "learning_rate": 1.3992866847826085e-05, "loss": 0.6266, "step": 11616 }, { "epoch": 
1.8185660613650594, "grad_norm": 2.415806293487549, "learning_rate": 1.3980978260869563e-05, "loss": 0.6174, "step": 11617 }, { "epoch": 1.8187226048841578, "grad_norm": 7.710570812225342, "learning_rate": 1.3969089673913043e-05, "loss": 0.7871, "step": 11618 }, { "epoch": 1.8188791484032563, "grad_norm": 5.0588884353637695, "learning_rate": 1.395720108695652e-05, "loss": 0.8678, "step": 11619 }, { "epoch": 1.8190356919223545, "grad_norm": 2.4781341552734375, "learning_rate": 1.39453125e-05, "loss": 0.693, "step": 11620 }, { "epoch": 1.8191922354414527, "grad_norm": 2.951472520828247, "learning_rate": 1.3933423913043477e-05, "loss": 0.7177, "step": 11621 }, { "epoch": 1.819348778960551, "grad_norm": 3.727708339691162, "learning_rate": 1.3921535326086954e-05, "loss": 0.8204, "step": 11622 }, { "epoch": 1.8195053224796494, "grad_norm": 3.8138136863708496, "learning_rate": 1.3909646739130434e-05, "loss": 0.792, "step": 11623 }, { "epoch": 1.8196618659987478, "grad_norm": 4.240707874298096, "learning_rate": 1.3897758152173912e-05, "loss": 0.6871, "step": 11624 }, { "epoch": 1.819818409517846, "grad_norm": 3.610234498977661, "learning_rate": 1.388586956521739e-05, "loss": 0.4998, "step": 11625 }, { "epoch": 1.8199749530369442, "grad_norm": 4.674693584442139, "learning_rate": 1.3873980978260868e-05, "loss": 0.5319, "step": 11626 }, { "epoch": 1.8201314965560424, "grad_norm": 3.7276675701141357, "learning_rate": 1.3862092391304348e-05, "loss": 0.9228, "step": 11627 }, { "epoch": 1.820288040075141, "grad_norm": 4.947493076324463, "learning_rate": 1.3850203804347825e-05, "loss": 1.2638, "step": 11628 }, { "epoch": 1.8204445835942393, "grad_norm": 4.886566638946533, "learning_rate": 1.3838315217391303e-05, "loss": 1.1393, "step": 11629 }, { "epoch": 1.8206011271133375, "grad_norm": 3.0675010681152344, "learning_rate": 1.3826426630434783e-05, "loss": 0.7947, "step": 11630 }, { "epoch": 1.8207576706324358, "grad_norm": 2.8147575855255127, "learning_rate": 
1.3814538043478259e-05, "loss": 1.0163, "step": 11631 }, { "epoch": 1.820914214151534, "grad_norm": 2.8309247493743896, "learning_rate": 1.3802649456521739e-05, "loss": 1.0444, "step": 11632 }, { "epoch": 1.8210707576706324, "grad_norm": 4.128608226776123, "learning_rate": 1.3790760869565215e-05, "loss": 0.8624, "step": 11633 }, { "epoch": 1.8212273011897309, "grad_norm": 2.764066696166992, "learning_rate": 1.3778872282608693e-05, "loss": 1.108, "step": 11634 }, { "epoch": 1.821383844708829, "grad_norm": 3.551905632019043, "learning_rate": 1.3766983695652173e-05, "loss": 0.7779, "step": 11635 }, { "epoch": 1.8215403882279273, "grad_norm": 2.033346176147461, "learning_rate": 1.375509510869565e-05, "loss": 0.7562, "step": 11636 }, { "epoch": 1.8216969317470255, "grad_norm": 3.0101144313812256, "learning_rate": 1.374320652173913e-05, "loss": 0.7934, "step": 11637 }, { "epoch": 1.821853475266124, "grad_norm": 4.030338764190674, "learning_rate": 1.3731317934782608e-05, "loss": 0.98, "step": 11638 }, { "epoch": 1.8220100187852224, "grad_norm": 0.880194902420044, "learning_rate": 1.3719429347826084e-05, "loss": 0.595, "step": 11639 }, { "epoch": 1.8221665623043206, "grad_norm": 0.793268620967865, "learning_rate": 1.3707540760869564e-05, "loss": 0.5527, "step": 11640 }, { "epoch": 1.8223231058234188, "grad_norm": 0.9702295064926147, "learning_rate": 1.3695652173913042e-05, "loss": 0.5816, "step": 11641 }, { "epoch": 1.8224796493425173, "grad_norm": 0.9994122982025146, "learning_rate": 1.368376358695652e-05, "loss": 0.6128, "step": 11642 }, { "epoch": 1.8226361928616155, "grad_norm": 1.0329086780548096, "learning_rate": 1.3671874999999999e-05, "loss": 0.524, "step": 11643 }, { "epoch": 1.822792736380714, "grad_norm": 1.5872763395309448, "learning_rate": 1.3659986413043478e-05, "loss": 0.5937, "step": 11644 }, { "epoch": 1.8229492798998121, "grad_norm": 1.8490418195724487, "learning_rate": 1.3648097826086955e-05, "loss": 0.6478, "step": 11645 }, { "epoch": 
1.8231058234189104, "grad_norm": 0.9265679717063904, "learning_rate": 1.3636209239130433e-05, "loss": 0.5924, "step": 11646 }, { "epoch": 1.8232623669380088, "grad_norm": 1.2240145206451416, "learning_rate": 1.3624320652173913e-05, "loss": 0.6748, "step": 11647 }, { "epoch": 1.823418910457107, "grad_norm": 1.7670528888702393, "learning_rate": 1.361243206521739e-05, "loss": 0.6605, "step": 11648 }, { "epoch": 1.8235754539762055, "grad_norm": 1.693172812461853, "learning_rate": 1.360054347826087e-05, "loss": 0.7425, "step": 11649 }, { "epoch": 1.8237319974953037, "grad_norm": 1.1795793771743774, "learning_rate": 1.3588654891304347e-05, "loss": 0.751, "step": 11650 }, { "epoch": 1.823888541014402, "grad_norm": 1.8805882930755615, "learning_rate": 1.3576766304347824e-05, "loss": 0.9385, "step": 11651 }, { "epoch": 1.8240450845335003, "grad_norm": 2.2541515827178955, "learning_rate": 1.3564877717391304e-05, "loss": 0.7651, "step": 11652 }, { "epoch": 1.8242016280525988, "grad_norm": 1.2213319540023804, "learning_rate": 1.3552989130434782e-05, "loss": 0.7045, "step": 11653 }, { "epoch": 1.824358171571697, "grad_norm": 1.6274306774139404, "learning_rate": 1.354110054347826e-05, "loss": 0.9195, "step": 11654 }, { "epoch": 1.8245147150907952, "grad_norm": 0.9693163633346558, "learning_rate": 1.3529211956521738e-05, "loss": 0.6095, "step": 11655 }, { "epoch": 1.8246712586098934, "grad_norm": 1.8522982597351074, "learning_rate": 1.3517323369565215e-05, "loss": 0.7743, "step": 11656 }, { "epoch": 1.8248278021289919, "grad_norm": 2.297701597213745, "learning_rate": 1.3505434782608694e-05, "loss": 0.6972, "step": 11657 }, { "epoch": 1.8249843456480903, "grad_norm": 1.965940237045288, "learning_rate": 1.3493546195652173e-05, "loss": 0.7934, "step": 11658 }, { "epoch": 1.8251408891671885, "grad_norm": 1.625719666481018, "learning_rate": 1.348165760869565e-05, "loss": 0.596, "step": 11659 }, { "epoch": 1.8252974326862867, "grad_norm": 2.3277909755706787, "learning_rate": 
1.3469769021739129e-05, "loss": 0.7958, "step": 11660 }, { "epoch": 1.825453976205385, "grad_norm": 14.112348556518555, "learning_rate": 1.3457880434782609e-05, "loss": 0.8537, "step": 11661 }, { "epoch": 1.8256105197244834, "grad_norm": 1.983139991760254, "learning_rate": 1.3445991847826085e-05, "loss": 0.5384, "step": 11662 }, { "epoch": 1.8257670632435818, "grad_norm": 1.7219626903533936, "learning_rate": 1.3434103260869563e-05, "loss": 0.5163, "step": 11663 }, { "epoch": 1.82592360676268, "grad_norm": 3.6896088123321533, "learning_rate": 1.3422214673913043e-05, "loss": 0.6594, "step": 11664 }, { "epoch": 1.8260801502817783, "grad_norm": 2.6569015979766846, "learning_rate": 1.341032608695652e-05, "loss": 0.6912, "step": 11665 }, { "epoch": 1.8262366938008765, "grad_norm": 1.8287177085876465, "learning_rate": 1.33984375e-05, "loss": 0.4242, "step": 11666 }, { "epoch": 1.826393237319975, "grad_norm": 4.774355888366699, "learning_rate": 1.3386548913043478e-05, "loss": 0.6958, "step": 11667 }, { "epoch": 1.8265497808390734, "grad_norm": 3.057051181793213, "learning_rate": 1.3374660326086954e-05, "loss": 0.5465, "step": 11668 }, { "epoch": 1.8267063243581716, "grad_norm": 2.17354679107666, "learning_rate": 1.3362771739130434e-05, "loss": 0.4537, "step": 11669 }, { "epoch": 1.8268628678772698, "grad_norm": 5.182984828948975, "learning_rate": 1.3350883152173912e-05, "loss": 0.8575, "step": 11670 }, { "epoch": 1.827019411396368, "grad_norm": 2.3805363178253174, "learning_rate": 1.333899456521739e-05, "loss": 0.4428, "step": 11671 }, { "epoch": 1.8271759549154665, "grad_norm": 4.740058422088623, "learning_rate": 1.3327105978260869e-05, "loss": 1.0719, "step": 11672 }, { "epoch": 1.827332498434565, "grad_norm": 6.088620662689209, "learning_rate": 1.3315217391304347e-05, "loss": 1.021, "step": 11673 }, { "epoch": 1.8274890419536631, "grad_norm": 6.261062145233154, "learning_rate": 1.3303328804347825e-05, "loss": 1.4234, "step": 11674 }, { "epoch": 1.8276455854727613, 
"grad_norm": 2.878777503967285, "learning_rate": 1.3291440217391303e-05, "loss": 0.8813, "step": 11675 }, { "epoch": 1.8278021289918598, "grad_norm": 4.0654401779174805, "learning_rate": 1.3279551630434783e-05, "loss": 0.9406, "step": 11676 }, { "epoch": 1.827958672510958, "grad_norm": 4.206693172454834, "learning_rate": 1.326766304347826e-05, "loss": 1.0439, "step": 11677 }, { "epoch": 1.8281152160300564, "grad_norm": 2.9214978218078613, "learning_rate": 1.3255774456521739e-05, "loss": 0.9108, "step": 11678 }, { "epoch": 1.8282717595491547, "grad_norm": 5.085268020629883, "learning_rate": 1.3243885869565216e-05, "loss": 1.3112, "step": 11679 }, { "epoch": 1.8284283030682529, "grad_norm": 8.333154678344727, "learning_rate": 1.3231997282608694e-05, "loss": 1.3386, "step": 11680 }, { "epoch": 1.8285848465873513, "grad_norm": 4.247594833374023, "learning_rate": 1.3220108695652174e-05, "loss": 0.9932, "step": 11681 }, { "epoch": 1.8287413901064495, "grad_norm": 2.916025400161743, "learning_rate": 1.320822010869565e-05, "loss": 0.8609, "step": 11682 }, { "epoch": 1.828897933625548, "grad_norm": 4.466762542724609, "learning_rate": 1.319633152173913e-05, "loss": 1.431, "step": 11683 }, { "epoch": 1.8290544771446462, "grad_norm": 4.595661163330078, "learning_rate": 1.3184442934782608e-05, "loss": 0.778, "step": 11684 }, { "epoch": 1.8292110206637444, "grad_norm": 2.811819553375244, "learning_rate": 1.3172554347826085e-05, "loss": 0.3253, "step": 11685 }, { "epoch": 1.8293675641828429, "grad_norm": 2.969716787338257, "learning_rate": 1.3160665760869564e-05, "loss": 0.477, "step": 11686 }, { "epoch": 1.8295241077019413, "grad_norm": 2.8698160648345947, "learning_rate": 1.3148777173913043e-05, "loss": 0.612, "step": 11687 }, { "epoch": 1.8296806512210395, "grad_norm": 4.939245223999023, "learning_rate": 1.313688858695652e-05, "loss": 0.9608, "step": 11688 }, { "epoch": 1.8298371947401377, "grad_norm": 0.7700476050376892, "learning_rate": 1.3124999999999999e-05, "loss": 
0.5271, "step": 11689 }, { "epoch": 1.829993738259236, "grad_norm": 0.8607412576675415, "learning_rate": 1.3113111413043477e-05, "loss": 0.5242, "step": 11690 }, { "epoch": 1.8301502817783344, "grad_norm": 0.7949037551879883, "learning_rate": 1.3101222826086955e-05, "loss": 0.4852, "step": 11691 }, { "epoch": 1.8303068252974328, "grad_norm": 0.816030740737915, "learning_rate": 1.3089334239130433e-05, "loss": 0.4961, "step": 11692 }, { "epoch": 1.830463368816531, "grad_norm": 1.047945261001587, "learning_rate": 1.3077445652173913e-05, "loss": 0.5724, "step": 11693 }, { "epoch": 1.8306199123356293, "grad_norm": 5.075475215911865, "learning_rate": 1.306555706521739e-05, "loss": 0.5344, "step": 11694 }, { "epoch": 1.8307764558547275, "grad_norm": 1.2386202812194824, "learning_rate": 1.3053668478260868e-05, "loss": 0.621, "step": 11695 }, { "epoch": 1.830932999373826, "grad_norm": 0.8917097449302673, "learning_rate": 1.3041779891304348e-05, "loss": 0.4885, "step": 11696 }, { "epoch": 1.8310895428929244, "grad_norm": 2.5835564136505127, "learning_rate": 1.3029891304347824e-05, "loss": 0.6359, "step": 11697 }, { "epoch": 1.8312460864120226, "grad_norm": 1.4467593431472778, "learning_rate": 1.3018002717391304e-05, "loss": 0.6254, "step": 11698 }, { "epoch": 1.8314026299311208, "grad_norm": 1.0968528985977173, "learning_rate": 1.3006114130434782e-05, "loss": 0.5611, "step": 11699 }, { "epoch": 1.831559173450219, "grad_norm": 2.891395330429077, "learning_rate": 1.2994225543478259e-05, "loss": 0.6099, "step": 11700 }, { "epoch": 1.8317157169693175, "grad_norm": 6.130589962005615, "learning_rate": 1.2982336956521738e-05, "loss": 0.5133, "step": 11701 }, { "epoch": 1.831872260488416, "grad_norm": 1.5518481731414795, "learning_rate": 1.2970448369565215e-05, "loss": 0.6483, "step": 11702 }, { "epoch": 1.8320288040075141, "grad_norm": 1.8250809907913208, "learning_rate": 1.2958559782608695e-05, "loss": 0.5847, "step": 11703 }, { "epoch": 1.8321853475266123, "grad_norm": 
1.7835981845855713, "learning_rate": 1.2946671195652173e-05, "loss": 0.5164, "step": 11704 }, { "epoch": 1.8323418910457105, "grad_norm": 1.8640447854995728, "learning_rate": 1.2934782608695651e-05, "loss": 0.7345, "step": 11705 }, { "epoch": 1.832498434564809, "grad_norm": 1.158438801765442, "learning_rate": 1.292289402173913e-05, "loss": 0.5647, "step": 11706 }, { "epoch": 1.8326549780839074, "grad_norm": 2.537041425704956, "learning_rate": 1.2911005434782607e-05, "loss": 0.6454, "step": 11707 }, { "epoch": 1.8328115216030056, "grad_norm": 2.234606981277466, "learning_rate": 1.2899116847826086e-05, "loss": 0.722, "step": 11708 }, { "epoch": 1.8329680651221039, "grad_norm": 3.1831979751586914, "learning_rate": 1.2887228260869564e-05, "loss": 0.8056, "step": 11709 }, { "epoch": 1.8331246086412023, "grad_norm": 5.272292613983154, "learning_rate": 1.2875339673913044e-05, "loss": 0.7093, "step": 11710 }, { "epoch": 1.8332811521603005, "grad_norm": 4.629453182220459, "learning_rate": 1.286345108695652e-05, "loss": 0.8735, "step": 11711 }, { "epoch": 1.833437695679399, "grad_norm": 1.3826992511749268, "learning_rate": 1.2851562499999998e-05, "loss": 0.6152, "step": 11712 }, { "epoch": 1.8335942391984972, "grad_norm": 1.9304910898208618, "learning_rate": 1.2839673913043478e-05, "loss": 0.7545, "step": 11713 }, { "epoch": 1.8337507827175954, "grad_norm": 1.6856399774551392, "learning_rate": 1.2827785326086954e-05, "loss": 0.5417, "step": 11714 }, { "epoch": 1.8339073262366938, "grad_norm": 8.593427658081055, "learning_rate": 1.2815896739130434e-05, "loss": 0.6872, "step": 11715 }, { "epoch": 1.8340638697557923, "grad_norm": 2.444819450378418, "learning_rate": 1.2804008152173912e-05, "loss": 0.7887, "step": 11716 }, { "epoch": 1.8342204132748905, "grad_norm": 1.715498685836792, "learning_rate": 1.2792119565217389e-05, "loss": 0.8404, "step": 11717 }, { "epoch": 1.8343769567939887, "grad_norm": 8.979711532592773, "learning_rate": 1.2780230978260869e-05, "loss": 0.852, 
"step": 11718 }, { "epoch": 1.834533500313087, "grad_norm": 6.699624061584473, "learning_rate": 1.2768342391304347e-05, "loss": 0.9027, "step": 11719 }, { "epoch": 1.8346900438321854, "grad_norm": 14.520591735839844, "learning_rate": 1.2756453804347825e-05, "loss": 1.0326, "step": 11720 }, { "epoch": 1.8348465873512838, "grad_norm": 6.942943096160889, "learning_rate": 1.2744565217391303e-05, "loss": 0.8178, "step": 11721 }, { "epoch": 1.835003130870382, "grad_norm": 3.229459524154663, "learning_rate": 1.2732676630434783e-05, "loss": 1.0483, "step": 11722 }, { "epoch": 1.8351596743894802, "grad_norm": 2.907601833343506, "learning_rate": 1.272078804347826e-05, "loss": 0.6556, "step": 11723 }, { "epoch": 1.8353162179085785, "grad_norm": 3.0552103519439697, "learning_rate": 1.2708899456521738e-05, "loss": 0.7053, "step": 11724 }, { "epoch": 1.835472761427677, "grad_norm": 3.0855460166931152, "learning_rate": 1.2697010869565216e-05, "loss": 0.7464, "step": 11725 }, { "epoch": 1.8356293049467753, "grad_norm": 3.2043333053588867, "learning_rate": 1.2685122282608694e-05, "loss": 0.619, "step": 11726 }, { "epoch": 1.8357858484658736, "grad_norm": 4.724063396453857, "learning_rate": 1.2673233695652174e-05, "loss": 0.8817, "step": 11727 }, { "epoch": 1.8359423919849718, "grad_norm": 3.62302827835083, "learning_rate": 1.266134510869565e-05, "loss": 0.8394, "step": 11728 }, { "epoch": 1.83609893550407, "grad_norm": 3.203402519226074, "learning_rate": 1.2649456521739128e-05, "loss": 0.9194, "step": 11729 }, { "epoch": 1.8362554790231684, "grad_norm": 3.5860595703125, "learning_rate": 1.2637567934782608e-05, "loss": 1.2299, "step": 11730 }, { "epoch": 1.8364120225422669, "grad_norm": 5.883243083953857, "learning_rate": 1.2625679347826085e-05, "loss": 1.2568, "step": 11731 }, { "epoch": 1.836568566061365, "grad_norm": 6.811415195465088, "learning_rate": 1.2613790760869565e-05, "loss": 1.1156, "step": 11732 }, { "epoch": 1.8367251095804633, "grad_norm": 1.5976145267486572, 
"learning_rate": 1.2601902173913043e-05, "loss": 0.7841, "step": 11733 }, { "epoch": 1.8368816530995615, "grad_norm": 2.018569231033325, "learning_rate": 1.259001358695652e-05, "loss": 0.3608, "step": 11734 }, { "epoch": 1.83703819661866, "grad_norm": 1.6943624019622803, "learning_rate": 1.2578124999999999e-05, "loss": 0.5104, "step": 11735 }, { "epoch": 1.8371947401377584, "grad_norm": 1.6080540418624878, "learning_rate": 1.2566236413043477e-05, "loss": 0.4171, "step": 11736 }, { "epoch": 1.8373512836568566, "grad_norm": 3.6105098724365234, "learning_rate": 1.2554347826086955e-05, "loss": 0.7479, "step": 11737 }, { "epoch": 1.8375078271759548, "grad_norm": 1.4505722522735596, "learning_rate": 1.2542459239130434e-05, "loss": 0.644, "step": 11738 }, { "epoch": 1.837664370695053, "grad_norm": 0.8134494423866272, "learning_rate": 1.2530570652173913e-05, "loss": 0.6929, "step": 11739 }, { "epoch": 1.8378209142141515, "grad_norm": 0.9468308687210083, "learning_rate": 1.251868206521739e-05, "loss": 0.6869, "step": 11740 }, { "epoch": 1.83797745773325, "grad_norm": 0.8600996732711792, "learning_rate": 1.2506793478260868e-05, "loss": 0.7279, "step": 11741 }, { "epoch": 1.8381340012523482, "grad_norm": 1.0099339485168457, "learning_rate": 1.2494904891304348e-05, "loss": 0.6277, "step": 11742 }, { "epoch": 1.8382905447714464, "grad_norm": 1.0652287006378174, "learning_rate": 1.2483016304347824e-05, "loss": 0.6706, "step": 11743 }, { "epoch": 1.8384470882905448, "grad_norm": 2.240809917449951, "learning_rate": 1.2471127717391304e-05, "loss": 0.8038, "step": 11744 }, { "epoch": 1.838603631809643, "grad_norm": 1.1476993560791016, "learning_rate": 1.2459239130434782e-05, "loss": 0.7419, "step": 11745 }, { "epoch": 1.8387601753287415, "grad_norm": 1.101151466369629, "learning_rate": 1.2447350543478259e-05, "loss": 0.7232, "step": 11746 }, { "epoch": 1.8389167188478397, "grad_norm": 1.520784616470337, "learning_rate": 1.2435461956521739e-05, "loss": 0.6715, "step": 11747 }, { 
"epoch": 1.839073262366938, "grad_norm": 1.1553703546524048, "learning_rate": 1.2423573369565215e-05, "loss": 0.6698, "step": 11748 }, { "epoch": 1.8392298058860364, "grad_norm": 7.069875240325928, "learning_rate": 1.2411684782608695e-05, "loss": 1.2644, "step": 11749 }, { "epoch": 1.8393863494051348, "grad_norm": 1.891882300376892, "learning_rate": 1.2399796195652173e-05, "loss": 0.7435, "step": 11750 }, { "epoch": 1.839542892924233, "grad_norm": 1.0902793407440186, "learning_rate": 1.238790760869565e-05, "loss": 0.737, "step": 11751 }, { "epoch": 1.8396994364433312, "grad_norm": 1.5118807554244995, "learning_rate": 1.237601902173913e-05, "loss": 0.7823, "step": 11752 }, { "epoch": 1.8398559799624294, "grad_norm": 3.3943846225738525, "learning_rate": 1.2364130434782608e-05, "loss": 0.6746, "step": 11753 }, { "epoch": 1.8400125234815279, "grad_norm": 2.0187296867370605, "learning_rate": 1.2352241847826086e-05, "loss": 0.7254, "step": 11754 }, { "epoch": 1.8401690670006263, "grad_norm": 1.8669233322143555, "learning_rate": 1.2340353260869564e-05, "loss": 0.949, "step": 11755 }, { "epoch": 1.8403256105197245, "grad_norm": 1.9410851001739502, "learning_rate": 1.2328464673913044e-05, "loss": 0.6997, "step": 11756 }, { "epoch": 1.8404821540388228, "grad_norm": 3.073396682739258, "learning_rate": 1.231657608695652e-05, "loss": 0.743, "step": 11757 }, { "epoch": 1.840638697557921, "grad_norm": 2.140336036682129, "learning_rate": 1.2304687499999998e-05, "loss": 0.7587, "step": 11758 }, { "epoch": 1.8407952410770194, "grad_norm": 2.2942328453063965, "learning_rate": 1.2292798913043478e-05, "loss": 0.83, "step": 11759 }, { "epoch": 1.8409517845961179, "grad_norm": 3.433701515197754, "learning_rate": 1.2280910326086955e-05, "loss": 0.8319, "step": 11760 }, { "epoch": 1.841108328115216, "grad_norm": 2.4815239906311035, "learning_rate": 1.2269021739130435e-05, "loss": 0.6975, "step": 11761 }, { "epoch": 1.8412648716343143, "grad_norm": 2.026676893234253, "learning_rate": 
1.2257133152173913e-05, "loss": 0.7499, "step": 11762 }, { "epoch": 1.8414214151534125, "grad_norm": 7.752603054046631, "learning_rate": 1.224524456521739e-05, "loss": 0.6976, "step": 11763 }, { "epoch": 1.841577958672511, "grad_norm": 1.9761539697647095, "learning_rate": 1.2233355978260869e-05, "loss": 0.9518, "step": 11764 }, { "epoch": 1.8417345021916094, "grad_norm": 1.6831833124160767, "learning_rate": 1.2221467391304347e-05, "loss": 0.6863, "step": 11765 }, { "epoch": 1.8418910457107076, "grad_norm": 3.3128743171691895, "learning_rate": 1.2209578804347825e-05, "loss": 0.6208, "step": 11766 }, { "epoch": 1.8420475892298058, "grad_norm": 1.9860811233520508, "learning_rate": 1.2197690217391303e-05, "loss": 0.7448, "step": 11767 }, { "epoch": 1.842204132748904, "grad_norm": 2.087134838104248, "learning_rate": 1.2185801630434782e-05, "loss": 0.5859, "step": 11768 }, { "epoch": 1.8423606762680025, "grad_norm": 3.0973825454711914, "learning_rate": 1.217391304347826e-05, "loss": 0.8387, "step": 11769 }, { "epoch": 1.842517219787101, "grad_norm": 7.136032581329346, "learning_rate": 1.2162024456521738e-05, "loss": 0.9445, "step": 11770 }, { "epoch": 1.8426737633061991, "grad_norm": 4.930957794189453, "learning_rate": 1.2150135869565216e-05, "loss": 0.391, "step": 11771 }, { "epoch": 1.8428303068252974, "grad_norm": 9.643431663513184, "learning_rate": 1.2138247282608694e-05, "loss": 1.6517, "step": 11772 }, { "epoch": 1.8429868503443956, "grad_norm": 2.1591765880584717, "learning_rate": 1.2126358695652172e-05, "loss": 0.845, "step": 11773 }, { "epoch": 1.843143393863494, "grad_norm": 7.132566928863525, "learning_rate": 1.211447010869565e-05, "loss": 1.0953, "step": 11774 }, { "epoch": 1.8432999373825925, "grad_norm": 6.506346225738525, "learning_rate": 1.2102581521739129e-05, "loss": 0.886, "step": 11775 }, { "epoch": 1.8434564809016907, "grad_norm": 5.46403694152832, "learning_rate": 1.2090692934782609e-05, "loss": 0.9443, "step": 11776 }, { "epoch": 1.843613024420789, 
"grad_norm": 2.778341054916382, "learning_rate": 1.2078804347826085e-05, "loss": 0.6863, "step": 11777 }, { "epoch": 1.8437695679398873, "grad_norm": 3.1285228729248047, "learning_rate": 1.2066915760869565e-05, "loss": 0.6529, "step": 11778 }, { "epoch": 1.8439261114589856, "grad_norm": 2.1411795616149902, "learning_rate": 1.2055027173913043e-05, "loss": 0.8884, "step": 11779 }, { "epoch": 1.844082654978084, "grad_norm": 1.9566324949264526, "learning_rate": 1.204313858695652e-05, "loss": 0.9504, "step": 11780 }, { "epoch": 1.8442391984971822, "grad_norm": 3.501619577407837, "learning_rate": 1.203125e-05, "loss": 1.4719, "step": 11781 }, { "epoch": 1.8443957420162804, "grad_norm": 2.816768169403076, "learning_rate": 1.2019361413043478e-05, "loss": 0.5205, "step": 11782 }, { "epoch": 1.8445522855353789, "grad_norm": 2.136829376220703, "learning_rate": 1.2007472826086956e-05, "loss": 0.9463, "step": 11783 }, { "epoch": 1.8447088290544773, "grad_norm": 1.33564031124115, "learning_rate": 1.1995584239130434e-05, "loss": 0.6578, "step": 11784 }, { "epoch": 1.8448653725735755, "grad_norm": 3.5143399238586426, "learning_rate": 1.1983695652173912e-05, "loss": 0.9467, "step": 11785 }, { "epoch": 1.8450219160926737, "grad_norm": 8.900053977966309, "learning_rate": 1.197180706521739e-05, "loss": 1.148, "step": 11786 }, { "epoch": 1.845178459611772, "grad_norm": 5.358577251434326, "learning_rate": 1.1959918478260868e-05, "loss": 0.4232, "step": 11787 }, { "epoch": 1.8453350031308704, "grad_norm": 3.57761287689209, "learning_rate": 1.1948029891304348e-05, "loss": 1.0219, "step": 11788 }, { "epoch": 1.8454915466499688, "grad_norm": 2.7704148292541504, "learning_rate": 1.1936141304347825e-05, "loss": 1.0849, "step": 11789 }, { "epoch": 1.845648090169067, "grad_norm": 1.1187899112701416, "learning_rate": 1.1924252717391303e-05, "loss": 0.6795, "step": 11790 }, { "epoch": 1.8458046336881653, "grad_norm": 1.1053264141082764, "learning_rate": 1.1912364130434783e-05, "loss": 0.6128, 
"step": 11791 }, { "epoch": 1.8459611772072635, "grad_norm": 0.9697030186653137, "learning_rate": 1.1900475543478259e-05, "loss": 0.7062, "step": 11792 }, { "epoch": 1.846117720726362, "grad_norm": 1.4453489780426025, "learning_rate": 1.1888586956521739e-05, "loss": 0.6072, "step": 11793 }, { "epoch": 1.8462742642454604, "grad_norm": 2.5152268409729004, "learning_rate": 1.1876698369565215e-05, "loss": 1.0442, "step": 11794 }, { "epoch": 1.8464308077645586, "grad_norm": 1.886556625366211, "learning_rate": 1.1864809782608694e-05, "loss": 0.8443, "step": 11795 }, { "epoch": 1.8465873512836568, "grad_norm": 1.0402157306671143, "learning_rate": 1.1852921195652173e-05, "loss": 0.6054, "step": 11796 }, { "epoch": 1.846743894802755, "grad_norm": 1.4284700155258179, "learning_rate": 1.184103260869565e-05, "loss": 0.6998, "step": 11797 }, { "epoch": 1.8469004383218535, "grad_norm": 1.194242000579834, "learning_rate": 1.182914402173913e-05, "loss": 0.7099, "step": 11798 }, { "epoch": 1.847056981840952, "grad_norm": 1.7261736392974854, "learning_rate": 1.1817255434782608e-05, "loss": 0.6981, "step": 11799 }, { "epoch": 1.8472135253600501, "grad_norm": 1.0620402097702026, "learning_rate": 1.1805366847826084e-05, "loss": 0.6847, "step": 11800 }, { "epoch": 1.8473700688791483, "grad_norm": 1.7173264026641846, "learning_rate": 1.1793478260869564e-05, "loss": 0.8662, "step": 11801 }, { "epoch": 1.8475266123982466, "grad_norm": 1.5705076456069946, "learning_rate": 1.1781589673913042e-05, "loss": 0.8291, "step": 11802 }, { "epoch": 1.847683155917345, "grad_norm": 2.5021419525146484, "learning_rate": 1.176970108695652e-05, "loss": 0.9846, "step": 11803 }, { "epoch": 1.8478396994364434, "grad_norm": 1.9561996459960938, "learning_rate": 1.1757812499999999e-05, "loss": 0.8388, "step": 11804 }, { "epoch": 1.8479962429555417, "grad_norm": 2.2462656497955322, "learning_rate": 1.1745923913043479e-05, "loss": 0.8849, "step": 11805 }, { "epoch": 1.8481527864746399, "grad_norm": 
2.975236177444458, "learning_rate": 1.1734035326086955e-05, "loss": 1.1538, "step": 11806 }, { "epoch": 1.8483093299937383, "grad_norm": 6.649317741394043, "learning_rate": 1.1722146739130433e-05, "loss": 0.9738, "step": 11807 }, { "epoch": 1.8484658735128365, "grad_norm": 1.867984414100647, "learning_rate": 1.1710258152173913e-05, "loss": 0.8793, "step": 11808 }, { "epoch": 1.848622417031935, "grad_norm": 2.087751626968384, "learning_rate": 1.169836956521739e-05, "loss": 0.961, "step": 11809 }, { "epoch": 1.8487789605510332, "grad_norm": 4.906894207000732, "learning_rate": 1.168648097826087e-05, "loss": 0.8362, "step": 11810 }, { "epoch": 1.8489355040701314, "grad_norm": 5.577618598937988, "learning_rate": 1.1674592391304347e-05, "loss": 1.0333, "step": 11811 }, { "epoch": 1.8490920475892298, "grad_norm": 4.851244926452637, "learning_rate": 1.1662703804347824e-05, "loss": 0.8753, "step": 11812 }, { "epoch": 1.849248591108328, "grad_norm": 2.5218446254730225, "learning_rate": 1.1650815217391304e-05, "loss": 0.8131, "step": 11813 }, { "epoch": 1.8494051346274265, "grad_norm": 5.630248069763184, "learning_rate": 1.1638926630434782e-05, "loss": 0.8208, "step": 11814 }, { "epoch": 1.8495616781465247, "grad_norm": 0.9262992739677429, "learning_rate": 1.162703804347826e-05, "loss": 0.5261, "step": 11815 }, { "epoch": 1.849718221665623, "grad_norm": 4.158056735992432, "learning_rate": 1.1615149456521738e-05, "loss": 0.6342, "step": 11816 }, { "epoch": 1.8498747651847214, "grad_norm": 2.376826047897339, "learning_rate": 1.1603260869565215e-05, "loss": 0.5514, "step": 11817 }, { "epoch": 1.8500313087038198, "grad_norm": 3.1266512870788574, "learning_rate": 1.1591372282608695e-05, "loss": 0.9756, "step": 11818 }, { "epoch": 1.850187852222918, "grad_norm": 4.6730265617370605, "learning_rate": 1.1579483695652173e-05, "loss": 0.9251, "step": 11819 }, { "epoch": 1.8503443957420163, "grad_norm": 3.07981538772583, "learning_rate": 1.156759510869565e-05, "loss": 0.8654, "step": 
11820 }, { "epoch": 1.8505009392611145, "grad_norm": 3.0340676307678223, "learning_rate": 1.1555706521739129e-05, "loss": 0.8569, "step": 11821 }, { "epoch": 1.850657482780213, "grad_norm": 3.8411240577697754, "learning_rate": 1.1543817934782609e-05, "loss": 0.7894, "step": 11822 }, { "epoch": 1.8508140262993114, "grad_norm": 8.57752799987793, "learning_rate": 1.1531929347826085e-05, "loss": 0.9592, "step": 11823 }, { "epoch": 1.8509705698184096, "grad_norm": 5.382934093475342, "learning_rate": 1.1520040760869563e-05, "loss": 0.8251, "step": 11824 }, { "epoch": 1.8511271133375078, "grad_norm": 1.7327300310134888, "learning_rate": 1.1508152173913043e-05, "loss": 0.725, "step": 11825 }, { "epoch": 1.851283656856606, "grad_norm": 4.7735276222229, "learning_rate": 1.149626358695652e-05, "loss": 0.9354, "step": 11826 }, { "epoch": 1.8514402003757044, "grad_norm": 2.427400827407837, "learning_rate": 1.1484375e-05, "loss": 0.8012, "step": 11827 }, { "epoch": 1.8515967438948029, "grad_norm": 4.492406845092773, "learning_rate": 1.1472486413043478e-05, "loss": 1.3253, "step": 11828 }, { "epoch": 1.851753287413901, "grad_norm": 9.785686492919922, "learning_rate": 1.1460597826086954e-05, "loss": 0.9531, "step": 11829 }, { "epoch": 1.8519098309329993, "grad_norm": 3.153486728668213, "learning_rate": 1.1448709239130434e-05, "loss": 1.3971, "step": 11830 }, { "epoch": 1.8520663744520975, "grad_norm": 2.628453493118286, "learning_rate": 1.1436820652173912e-05, "loss": 0.7638, "step": 11831 }, { "epoch": 1.852222917971196, "grad_norm": 1.8677197694778442, "learning_rate": 1.142493206521739e-05, "loss": 0.7832, "step": 11832 }, { "epoch": 1.8523794614902944, "grad_norm": 7.43255615234375, "learning_rate": 1.1413043478260869e-05, "loss": 1.9918, "step": 11833 }, { "epoch": 1.8525360050093926, "grad_norm": 2.3038229942321777, "learning_rate": 1.1401154891304348e-05, "loss": 0.4871, "step": 11834 }, { "epoch": 1.8526925485284909, "grad_norm": 4.228329181671143, "learning_rate": 
1.1389266304347825e-05, "loss": 0.6499, "step": 11835 }, { "epoch": 1.852849092047589, "grad_norm": 3.30991530418396, "learning_rate": 1.1377377717391303e-05, "loss": 1.2404, "step": 11836 }, { "epoch": 1.8530056355666875, "grad_norm": 2.619241952896118, "learning_rate": 1.1365489130434783e-05, "loss": 0.6728, "step": 11837 }, { "epoch": 1.853162179085786, "grad_norm": 2.2633118629455566, "learning_rate": 1.135360054347826e-05, "loss": 0.5602, "step": 11838 }, { "epoch": 1.8533187226048842, "grad_norm": 2.147017478942871, "learning_rate": 1.134171195652174e-05, "loss": 0.7061, "step": 11839 }, { "epoch": 1.8534752661239824, "grad_norm": 0.856713593006134, "learning_rate": 1.1329823369565216e-05, "loss": 0.6471, "step": 11840 }, { "epoch": 1.8536318096430808, "grad_norm": 1.1503146886825562, "learning_rate": 1.1317934782608694e-05, "loss": 0.6816, "step": 11841 }, { "epoch": 1.853788353162179, "grad_norm": 1.6018335819244385, "learning_rate": 1.1306046195652174e-05, "loss": 0.6697, "step": 11842 }, { "epoch": 1.8539448966812775, "grad_norm": 0.7746901512145996, "learning_rate": 1.129415760869565e-05, "loss": 0.6114, "step": 11843 }, { "epoch": 1.8541014402003757, "grad_norm": 1.0171442031860352, "learning_rate": 1.128226902173913e-05, "loss": 0.6337, "step": 11844 }, { "epoch": 1.854257983719474, "grad_norm": 1.484558343887329, "learning_rate": 1.1270380434782608e-05, "loss": 0.6719, "step": 11845 }, { "epoch": 1.8544145272385724, "grad_norm": 0.9235568046569824, "learning_rate": 1.1258491847826085e-05, "loss": 0.6179, "step": 11846 }, { "epoch": 1.8545710707576706, "grad_norm": 2.2658557891845703, "learning_rate": 1.1246603260869564e-05, "loss": 0.714, "step": 11847 }, { "epoch": 1.854727614276769, "grad_norm": 14.559488296508789, "learning_rate": 1.1234714673913043e-05, "loss": 1.401, "step": 11848 }, { "epoch": 1.8548841577958672, "grad_norm": 1.5318835973739624, "learning_rate": 1.122282608695652e-05, "loss": 0.7299, "step": 11849 }, { "epoch": 
1.8550407013149655, "grad_norm": 4.301366329193115, "learning_rate": 1.1210937499999999e-05, "loss": 0.8046, "step": 11850 }, { "epoch": 1.855197244834064, "grad_norm": 2.4853672981262207, "learning_rate": 1.1199048913043479e-05, "loss": 0.7577, "step": 11851 }, { "epoch": 1.8553537883531623, "grad_norm": 4.447765350341797, "learning_rate": 1.1187160326086955e-05, "loss": 0.5683, "step": 11852 }, { "epoch": 1.8555103318722606, "grad_norm": 1.3220564126968384, "learning_rate": 1.1175271739130433e-05, "loss": 0.5804, "step": 11853 }, { "epoch": 1.8556668753913588, "grad_norm": 4.037609100341797, "learning_rate": 1.1163383152173913e-05, "loss": 0.8844, "step": 11854 }, { "epoch": 1.855823418910457, "grad_norm": 1.9008504152297974, "learning_rate": 1.115149456521739e-05, "loss": 0.7689, "step": 11855 }, { "epoch": 1.8559799624295554, "grad_norm": 2.397475481033325, "learning_rate": 1.113960597826087e-05, "loss": 0.6301, "step": 11856 }, { "epoch": 1.8561365059486539, "grad_norm": 1.657560110092163, "learning_rate": 1.1127717391304348e-05, "loss": 0.5805, "step": 11857 }, { "epoch": 1.856293049467752, "grad_norm": 1.2775987386703491, "learning_rate": 1.1115828804347824e-05, "loss": 0.5745, "step": 11858 }, { "epoch": 1.8564495929868503, "grad_norm": 1.6651252508163452, "learning_rate": 1.1103940217391304e-05, "loss": 0.4843, "step": 11859 }, { "epoch": 1.8566061365059485, "grad_norm": 2.3518178462982178, "learning_rate": 1.1092051630434782e-05, "loss": 0.6576, "step": 11860 }, { "epoch": 1.856762680025047, "grad_norm": 4.213306903839111, "learning_rate": 1.108016304347826e-05, "loss": 1.3154, "step": 11861 }, { "epoch": 1.8569192235441454, "grad_norm": 5.321047306060791, "learning_rate": 1.1068274456521738e-05, "loss": 0.5051, "step": 11862 }, { "epoch": 1.8570757670632436, "grad_norm": 1.5453641414642334, "learning_rate": 1.1056385869565215e-05, "loss": 0.511, "step": 11863 }, { "epoch": 1.8572323105823418, "grad_norm": 3.453407049179077, "learning_rate": 
1.1044497282608695e-05, "loss": 0.7242, "step": 11864 }, { "epoch": 1.85738885410144, "grad_norm": 3.3536922931671143, "learning_rate": 1.1032608695652173e-05, "loss": 0.8011, "step": 11865 }, { "epoch": 1.8575453976205385, "grad_norm": 2.663158893585205, "learning_rate": 1.1020720108695651e-05, "loss": 0.7828, "step": 11866 }, { "epoch": 1.857701941139637, "grad_norm": 7.48060941696167, "learning_rate": 1.100883152173913e-05, "loss": 0.9639, "step": 11867 }, { "epoch": 1.8578584846587352, "grad_norm": 5.491586208343506, "learning_rate": 1.0996942934782607e-05, "loss": 0.8464, "step": 11868 }, { "epoch": 1.8580150281778334, "grad_norm": 2.4743382930755615, "learning_rate": 1.0985054347826086e-05, "loss": 0.4781, "step": 11869 }, { "epoch": 1.8581715716969316, "grad_norm": 3.9874465465545654, "learning_rate": 1.0973165760869564e-05, "loss": 1.0145, "step": 11870 }, { "epoch": 1.85832811521603, "grad_norm": 4.092168807983398, "learning_rate": 1.0961277173913044e-05, "loss": 1.1511, "step": 11871 }, { "epoch": 1.8584846587351285, "grad_norm": 6.746730804443359, "learning_rate": 1.094938858695652e-05, "loss": 1.2254, "step": 11872 }, { "epoch": 1.8586412022542267, "grad_norm": 6.299065589904785, "learning_rate": 1.0937499999999998e-05, "loss": 1.5206, "step": 11873 }, { "epoch": 1.858797745773325, "grad_norm": 1.860451579093933, "learning_rate": 1.0925611413043478e-05, "loss": 0.6032, "step": 11874 }, { "epoch": 1.8589542892924233, "grad_norm": 3.6348369121551514, "learning_rate": 1.0913722826086955e-05, "loss": 0.5323, "step": 11875 }, { "epoch": 1.8591108328115216, "grad_norm": 4.228086948394775, "learning_rate": 1.0901834239130434e-05, "loss": 1.0733, "step": 11876 }, { "epoch": 1.85926737633062, "grad_norm": 3.5700435638427734, "learning_rate": 1.0889945652173913e-05, "loss": 0.8599, "step": 11877 }, { "epoch": 1.8594239198497182, "grad_norm": 2.7225265502929688, "learning_rate": 1.087805706521739e-05, "loss": 0.7252, "step": 11878 }, { "epoch": 1.8595804633688164, 
"grad_norm": 2.7961368560791016, "learning_rate": 1.0866168478260869e-05, "loss": 0.7513, "step": 11879 }, { "epoch": 1.8597370068879149, "grad_norm": 2.1146304607391357, "learning_rate": 1.0854279891304347e-05, "loss": 0.7146, "step": 11880 }, { "epoch": 1.859893550407013, "grad_norm": 5.292323589324951, "learning_rate": 1.0842391304347825e-05, "loss": 1.3248, "step": 11881 }, { "epoch": 1.8600500939261115, "grad_norm": 4.349895477294922, "learning_rate": 1.0830502717391303e-05, "loss": 0.7108, "step": 11882 }, { "epoch": 1.8602066374452098, "grad_norm": 2.801745653152466, "learning_rate": 1.0818614130434783e-05, "loss": 0.7213, "step": 11883 }, { "epoch": 1.860363180964308, "grad_norm": 3.0257046222686768, "learning_rate": 1.080672554347826e-05, "loss": 0.6562, "step": 11884 }, { "epoch": 1.8605197244834064, "grad_norm": 0.6330824494361877, "learning_rate": 1.0794836956521738e-05, "loss": 0.1814, "step": 11885 }, { "epoch": 1.8606762680025049, "grad_norm": 6.967499732971191, "learning_rate": 1.0782948369565216e-05, "loss": 0.3441, "step": 11886 }, { "epoch": 1.860832811521603, "grad_norm": 6.383340358734131, "learning_rate": 1.0771059782608694e-05, "loss": 0.9241, "step": 11887 }, { "epoch": 1.8609893550407013, "grad_norm": 3.8608367443084717, "learning_rate": 1.0759171195652174e-05, "loss": 0.8894, "step": 11888 }, { "epoch": 1.8611458985597995, "grad_norm": 0.9434061646461487, "learning_rate": 1.074728260869565e-05, "loss": 0.8168, "step": 11889 }, { "epoch": 1.861302442078898, "grad_norm": 0.812154233455658, "learning_rate": 1.0735394021739129e-05, "loss": 0.7225, "step": 11890 }, { "epoch": 1.8614589855979964, "grad_norm": 1.2146934270858765, "learning_rate": 1.0723505434782608e-05, "loss": 0.775, "step": 11891 }, { "epoch": 1.8616155291170946, "grad_norm": 1.2662532329559326, "learning_rate": 1.0711616847826085e-05, "loss": 0.7524, "step": 11892 }, { "epoch": 1.8617720726361928, "grad_norm": 1.3590151071548462, "learning_rate": 1.0699728260869565e-05, 
"loss": 0.7892, "step": 11893 }, { "epoch": 1.861928616155291, "grad_norm": 0.9828929305076599, "learning_rate": 1.0687839673913043e-05, "loss": 0.7626, "step": 11894 }, { "epoch": 1.8620851596743895, "grad_norm": 1.1349295377731323, "learning_rate": 1.067595108695652e-05, "loss": 0.7368, "step": 11895 }, { "epoch": 1.862241703193488, "grad_norm": 1.2340049743652344, "learning_rate": 1.06640625e-05, "loss": 0.7593, "step": 11896 }, { "epoch": 1.8623982467125861, "grad_norm": 1.1785348653793335, "learning_rate": 1.0652173913043477e-05, "loss": 0.8427, "step": 11897 }, { "epoch": 1.8625547902316844, "grad_norm": 1.728737711906433, "learning_rate": 1.0640285326086955e-05, "loss": 0.7917, "step": 11898 }, { "epoch": 1.8627113337507826, "grad_norm": 2.432568311691284, "learning_rate": 1.0628396739130434e-05, "loss": 0.7746, "step": 11899 }, { "epoch": 1.862867877269881, "grad_norm": 1.8805451393127441, "learning_rate": 1.0616508152173913e-05, "loss": 0.7565, "step": 11900 }, { "epoch": 1.8630244207889795, "grad_norm": 1.4910606145858765, "learning_rate": 1.060461956521739e-05, "loss": 0.9022, "step": 11901 }, { "epoch": 1.8631809643080777, "grad_norm": 1.5177167654037476, "learning_rate": 1.0592730978260868e-05, "loss": 0.8335, "step": 11902 }, { "epoch": 1.8633375078271759, "grad_norm": 2.6180384159088135, "learning_rate": 1.0580842391304348e-05, "loss": 0.9432, "step": 11903 }, { "epoch": 1.863494051346274, "grad_norm": 1.2594823837280273, "learning_rate": 1.0568953804347824e-05, "loss": 0.758, "step": 11904 }, { "epoch": 1.8636505948653725, "grad_norm": 1.4484513998031616, "learning_rate": 1.0557065217391304e-05, "loss": 0.8206, "step": 11905 }, { "epoch": 1.863807138384471, "grad_norm": 1.315460205078125, "learning_rate": 1.0545176630434782e-05, "loss": 0.6966, "step": 11906 }, { "epoch": 1.8639636819035692, "grad_norm": 2.1102590560913086, "learning_rate": 1.0533288043478259e-05, "loss": 0.8322, "step": 11907 }, { "epoch": 1.8641202254226674, "grad_norm": 
2.98730731010437, "learning_rate": 1.0521399456521739e-05, "loss": 0.6796, "step": 11908 }, { "epoch": 1.8642767689417659, "grad_norm": 2.4673233032226562, "learning_rate": 1.0509510869565215e-05, "loss": 0.8445, "step": 11909 }, { "epoch": 1.864433312460864, "grad_norm": 3.265072822570801, "learning_rate": 1.0497622282608695e-05, "loss": 1.121, "step": 11910 }, { "epoch": 1.8645898559799625, "grad_norm": 1.8012303113937378, "learning_rate": 1.0485733695652173e-05, "loss": 0.8369, "step": 11911 }, { "epoch": 1.8647463994990607, "grad_norm": 1.6519615650177002, "learning_rate": 1.047384510869565e-05, "loss": 0.8714, "step": 11912 }, { "epoch": 1.864902943018159, "grad_norm": 1.887900710105896, "learning_rate": 1.046195652173913e-05, "loss": 0.8428, "step": 11913 }, { "epoch": 1.8650594865372574, "grad_norm": 1.3423926830291748, "learning_rate": 1.0450067934782608e-05, "loss": 0.7123, "step": 11914 }, { "epoch": 1.8652160300563556, "grad_norm": 14.387187957763672, "learning_rate": 1.0438179347826086e-05, "loss": 0.914, "step": 11915 }, { "epoch": 1.865372573575454, "grad_norm": 2.1901135444641113, "learning_rate": 1.0426290760869564e-05, "loss": 0.867, "step": 11916 }, { "epoch": 1.8655291170945523, "grad_norm": 2.0548038482666016, "learning_rate": 1.0414402173913044e-05, "loss": 0.7589, "step": 11917 }, { "epoch": 1.8656856606136505, "grad_norm": 6.974366664886475, "learning_rate": 1.040251358695652e-05, "loss": 1.1515, "step": 11918 }, { "epoch": 1.865842204132749, "grad_norm": 2.2383601665496826, "learning_rate": 1.0390624999999998e-05, "loss": 0.5884, "step": 11919 }, { "epoch": 1.8659987476518474, "grad_norm": 1.8079347610473633, "learning_rate": 1.0378736413043478e-05, "loss": 0.5602, "step": 11920 }, { "epoch": 1.8661552911709456, "grad_norm": 2.1477365493774414, "learning_rate": 1.0366847826086955e-05, "loss": 0.567, "step": 11921 }, { "epoch": 1.8663118346900438, "grad_norm": 2.8556110858917236, "learning_rate": 1.0354959239130435e-05, "loss": 0.661, "step": 
11922 }, { "epoch": 1.866468378209142, "grad_norm": 4.9577178955078125, "learning_rate": 1.0343070652173913e-05, "loss": 1.1681, "step": 11923 }, { "epoch": 1.8666249217282405, "grad_norm": 3.6556363105773926, "learning_rate": 1.033118206521739e-05, "loss": 0.4904, "step": 11924 }, { "epoch": 1.866781465247339, "grad_norm": 8.237056732177734, "learning_rate": 1.0319293478260869e-05, "loss": 0.7197, "step": 11925 }, { "epoch": 1.8669380087664371, "grad_norm": 5.764786720275879, "learning_rate": 1.0307404891304347e-05, "loss": 1.085, "step": 11926 }, { "epoch": 1.8670945522855353, "grad_norm": 5.235495090484619, "learning_rate": 1.0295516304347825e-05, "loss": 1.2987, "step": 11927 }, { "epoch": 1.8672510958046336, "grad_norm": 2.755941390991211, "learning_rate": 1.0283627717391304e-05, "loss": 0.4503, "step": 11928 }, { "epoch": 1.867407639323732, "grad_norm": 4.2379865646362305, "learning_rate": 1.0271739130434783e-05, "loss": 1.3017, "step": 11929 }, { "epoch": 1.8675641828428304, "grad_norm": 4.497812747955322, "learning_rate": 1.025985054347826e-05, "loss": 1.1004, "step": 11930 }, { "epoch": 1.8677207263619287, "grad_norm": 3.5777664184570312, "learning_rate": 1.0247961956521738e-05, "loss": 2.1309, "step": 11931 }, { "epoch": 1.8678772698810269, "grad_norm": 3.291609525680542, "learning_rate": 1.0236073369565216e-05, "loss": 1.1223, "step": 11932 }, { "epoch": 1.868033813400125, "grad_norm": NaN, "learning_rate": 1.0236073369565216e-05, "loss": 0.0, "step": 11933 }, { "epoch": 1.8681903569192235, "grad_norm": 3.2986299991607666, "learning_rate": 1.0224184782608694e-05, "loss": 0.9851, "step": 11934 }, { "epoch": 1.868346900438322, "grad_norm": 2.6432363986968994, "learning_rate": 1.0212296195652174e-05, "loss": 1.0348, "step": 11935 }, { "epoch": 1.8685034439574202, "grad_norm": 1.9279741048812866, "learning_rate": 1.020040760869565e-05, "loss": 0.5342, "step": 11936 }, { "epoch": 1.8686599874765184, "grad_norm": 4.121459007263184, "learning_rate": 
1.0188519021739129e-05, "loss": 0.7302, "step": 11937 }, { "epoch": 1.8688165309956166, "grad_norm": 3.199766159057617, "learning_rate": 1.0176630434782609e-05, "loss": 0.9119, "step": 11938 }, { "epoch": 1.868973074514715, "grad_norm": 1.0377861261367798, "learning_rate": 1.0164741847826085e-05, "loss": 0.6966, "step": 11939 }, { "epoch": 1.8691296180338135, "grad_norm": 0.7815666198730469, "learning_rate": 1.0152853260869565e-05, "loss": 0.7152, "step": 11940 }, { "epoch": 1.8692861615529117, "grad_norm": 1.8625136613845825, "learning_rate": 1.0140964673913043e-05, "loss": 0.7701, "step": 11941 }, { "epoch": 1.86944270507201, "grad_norm": 0.700502872467041, "learning_rate": 1.012907608695652e-05, "loss": 0.6709, "step": 11942 }, { "epoch": 1.8695992485911084, "grad_norm": 1.1567810773849487, "learning_rate": 1.01171875e-05, "loss": 0.909, "step": 11943 }, { "epoch": 1.8697557921102066, "grad_norm": 1.3283026218414307, "learning_rate": 1.0105298913043478e-05, "loss": 0.7153, "step": 11944 }, { "epoch": 1.869912335629305, "grad_norm": 0.9665043354034424, "learning_rate": 1.0093410326086956e-05, "loss": 0.6663, "step": 11945 }, { "epoch": 1.8700688791484033, "grad_norm": 1.098042607307434, "learning_rate": 1.0081521739130434e-05, "loss": 0.7506, "step": 11946 }, { "epoch": 1.8702254226675015, "grad_norm": 0.9527786374092102, "learning_rate": 1.0069633152173914e-05, "loss": 0.7091, "step": 11947 }, { "epoch": 1.8703819661866, "grad_norm": 1.1646610498428345, "learning_rate": 1.005774456521739e-05, "loss": 0.7205, "step": 11948 }, { "epoch": 1.8705385097056983, "grad_norm": 1.8845417499542236, "learning_rate": 1.0045855978260868e-05, "loss": 0.7782, "step": 11949 }, { "epoch": 1.8706950532247966, "grad_norm": 5.306276321411133, "learning_rate": 1.0033967391304348e-05, "loss": 0.8817, "step": 11950 }, { "epoch": 1.8708515967438948, "grad_norm": 1.7659265995025635, "learning_rate": 1.0022078804347825e-05, "loss": 0.8362, "step": 11951 }, { "epoch": 1.871008140262993, 
"grad_norm": 4.730284690856934, "learning_rate": 1.0010190217391305e-05, "loss": 0.8169, "step": 11952 }, { "epoch": 1.8711646837820914, "grad_norm": 1.9359347820281982, "learning_rate": 9.998301630434783e-06, "loss": 0.8305, "step": 11953 }, { "epoch": 1.8713212273011899, "grad_norm": 1.9898897409439087, "learning_rate": 9.986413043478259e-06, "loss": 0.757, "step": 11954 }, { "epoch": 1.871477770820288, "grad_norm": 1.5060153007507324, "learning_rate": 9.974524456521739e-06, "loss": 0.8326, "step": 11955 }, { "epoch": 1.8716343143393863, "grad_norm": 1.703222632408142, "learning_rate": 9.962635869565215e-06, "loss": 0.7882, "step": 11956 }, { "epoch": 1.8717908578584845, "grad_norm": 2.945387363433838, "learning_rate": 9.950747282608695e-06, "loss": 0.7685, "step": 11957 }, { "epoch": 1.871947401377583, "grad_norm": 1.7491660118103027, "learning_rate": 9.938858695652173e-06, "loss": 0.849, "step": 11958 }, { "epoch": 1.8721039448966814, "grad_norm": 6.389097213745117, "learning_rate": 9.92697010869565e-06, "loss": 0.7464, "step": 11959 }, { "epoch": 1.8722604884157796, "grad_norm": 2.254256248474121, "learning_rate": 9.91508152173913e-06, "loss": 0.9293, "step": 11960 }, { "epoch": 1.8724170319348779, "grad_norm": 2.9383091926574707, "learning_rate": 9.903192934782608e-06, "loss": 1.0144, "step": 11961 }, { "epoch": 1.872573575453976, "grad_norm": 2.861985921859741, "learning_rate": 9.891304347826086e-06, "loss": 0.6701, "step": 11962 }, { "epoch": 1.8727301189730745, "grad_norm": 8.57581615447998, "learning_rate": 9.879415760869564e-06, "loss": 0.9934, "step": 11963 }, { "epoch": 1.872886662492173, "grad_norm": 2.173186779022217, "learning_rate": 9.867527173913042e-06, "loss": 0.8271, "step": 11964 }, { "epoch": 1.8730432060112712, "grad_norm": 2.958416700363159, "learning_rate": 9.85563858695652e-06, "loss": 0.7408, "step": 11965 }, { "epoch": 1.8731997495303694, "grad_norm": 20.0357723236084, "learning_rate": 9.843749999999999e-06, "loss": 1.0013, "step": 
11966 }, { "epoch": 1.8733562930494676, "grad_norm": 3.289712905883789, "learning_rate": 9.831861413043479e-06, "loss": 0.9129, "step": 11967 }, { "epoch": 1.873512836568566, "grad_norm": 1.532753348350525, "learning_rate": 9.819972826086955e-06, "loss": 0.6746, "step": 11968 }, { "epoch": 1.8736693800876645, "grad_norm": 2.281484603881836, "learning_rate": 9.808084239130433e-06, "loss": 0.7771, "step": 11969 }, { "epoch": 1.8738259236067627, "grad_norm": 3.7315499782562256, "learning_rate": 9.796195652173913e-06, "loss": 0.8759, "step": 11970 }, { "epoch": 1.873982467125861, "grad_norm": 3.2112386226654053, "learning_rate": 9.78430706521739e-06, "loss": 0.8365, "step": 11971 }, { "epoch": 1.8741390106449591, "grad_norm": 3.5195846557617188, "learning_rate": 9.77241847826087e-06, "loss": 0.7335, "step": 11972 }, { "epoch": 1.8742955541640576, "grad_norm": 3.78289794921875, "learning_rate": 9.760529891304348e-06, "loss": 0.7547, "step": 11973 }, { "epoch": 1.874452097683156, "grad_norm": 2.1299936771392822, "learning_rate": 9.748641304347826e-06, "loss": 0.7682, "step": 11974 }, { "epoch": 1.8746086412022542, "grad_norm": 11.356939315795898, "learning_rate": 9.736752717391304e-06, "loss": 1.0297, "step": 11975 }, { "epoch": 1.8747651847213525, "grad_norm": 5.4350996017456055, "learning_rate": 9.724864130434782e-06, "loss": 1.1829, "step": 11976 }, { "epoch": 1.874921728240451, "grad_norm": 5.5607500076293945, "learning_rate": 9.71297554347826e-06, "loss": 0.8593, "step": 11977 }, { "epoch": 1.875078271759549, "grad_norm": 6.624189376831055, "learning_rate": 9.701086956521738e-06, "loss": 1.0061, "step": 11978 }, { "epoch": 1.8752348152786475, "grad_norm": 3.2392899990081787, "learning_rate": 9.689198369565216e-06, "loss": 1.2416, "step": 11979 }, { "epoch": 1.8753913587977458, "grad_norm": 5.704195022583008, "learning_rate": 9.677309782608695e-06, "loss": 1.0451, "step": 11980 }, { "epoch": 1.875547902316844, "grad_norm": 5.473053932189941, "learning_rate": 
9.665421195652173e-06, "loss": 1.247, "step": 11981 }, { "epoch": 1.8757044458359424, "grad_norm": 1.622187614440918, "learning_rate": 9.653532608695651e-06, "loss": 0.6662, "step": 11982 }, { "epoch": 1.8758609893550409, "grad_norm": 1.9350017309188843, "learning_rate": 9.641644021739129e-06, "loss": 0.4019, "step": 11983 }, { "epoch": 1.876017532874139, "grad_norm": 1.1540865898132324, "learning_rate": 9.629755434782609e-06, "loss": 0.396, "step": 11984 }, { "epoch": 1.8761740763932373, "grad_norm": 2.3918070793151855, "learning_rate": 9.617866847826085e-06, "loss": 0.5356, "step": 11985 }, { "epoch": 1.8763306199123355, "grad_norm": 3.3857901096343994, "learning_rate": 9.605978260869564e-06, "loss": 0.886, "step": 11986 }, { "epoch": 1.876487163431434, "grad_norm": 3.6559536457061768, "learning_rate": 9.594089673913043e-06, "loss": 1.0312, "step": 11987 }, { "epoch": 1.8766437069505324, "grad_norm": 2.2746641635894775, "learning_rate": 9.58220108695652e-06, "loss": 1.1745, "step": 11988 }, { "epoch": 1.8768002504696306, "grad_norm": 0.9185448288917542, "learning_rate": 9.5703125e-06, "loss": 1.3733, "step": 11989 }, { "epoch": 1.8769567939887288, "grad_norm": 1.042295217514038, "learning_rate": 9.558423913043478e-06, "loss": 1.3174, "step": 11990 }, { "epoch": 1.877113337507827, "grad_norm": 1.1402212381362915, "learning_rate": 9.546535326086954e-06, "loss": 1.4177, "step": 11991 }, { "epoch": 1.8772698810269255, "grad_norm": 1.0364875793457031, "learning_rate": 9.534646739130434e-06, "loss": 1.2681, "step": 11992 }, { "epoch": 1.877426424546024, "grad_norm": 1.9072438478469849, "learning_rate": 9.522758152173912e-06, "loss": 1.3398, "step": 11993 }, { "epoch": 1.8775829680651221, "grad_norm": 2.8735013008117676, "learning_rate": 9.51086956521739e-06, "loss": 1.347, "step": 11994 }, { "epoch": 1.8777395115842204, "grad_norm": 1.5179246664047241, "learning_rate": 9.498980978260869e-06, "loss": 1.3606, "step": 11995 }, { "epoch": 1.8778960551033186, "grad_norm": 
1.1752792596817017, "learning_rate": 9.487092391304348e-06, "loss": 1.2199, "step": 11996 }, { "epoch": 1.878052598622417, "grad_norm": 1.1794319152832031, "learning_rate": 9.475203804347825e-06, "loss": 1.2506, "step": 11997 }, { "epoch": 1.8782091421415155, "grad_norm": 1.2492963075637817, "learning_rate": 9.463315217391303e-06, "loss": 1.158, "step": 11998 }, { "epoch": 1.8783656856606137, "grad_norm": 1.800179123878479, "learning_rate": 9.451426630434783e-06, "loss": 1.1817, "step": 11999 }, { "epoch": 1.878522229179712, "grad_norm": 1.3371505737304688, "learning_rate": 9.43953804347826e-06, "loss": 1.1026, "step": 12000 }, { "epoch": 1.878522229179712, "eval_loss": 0.9829882979393005, "eval_runtime": 203.441, "eval_samples_per_second": 60.868, "eval_steps_per_second": 3.805, "eval_wer": 0.8711243810697664, "step": 12000 }, { "epoch": 1.8786787726988101, "grad_norm": 5.182831287384033, "learning_rate": 9.42764945652174e-06, "loss": 1.5862, "step": 12001 }, { "epoch": 1.8788353162179086, "grad_norm": 1.9229707717895508, "learning_rate": 9.415760869565216e-06, "loss": 1.2504, "step": 12002 }, { "epoch": 1.878991859737007, "grad_norm": 1.703960657119751, "learning_rate": 9.403872282608694e-06, "loss": 1.0436, "step": 12003 }, { "epoch": 1.8791484032561052, "grad_norm": 1.8541909456253052, "learning_rate": 9.391983695652174e-06, "loss": 1.0518, "step": 12004 }, { "epoch": 1.8793049467752034, "grad_norm": 3.630406618118286, "learning_rate": 9.38009510869565e-06, "loss": 1.2205, "step": 12005 }, { "epoch": 1.8794614902943017, "grad_norm": 3.123537302017212, "learning_rate": 9.36820652173913e-06, "loss": 1.0678, "step": 12006 }, { "epoch": 1.8796180338134, "grad_norm": 2.5861618518829346, "learning_rate": 9.356317934782608e-06, "loss": 1.0339, "step": 12007 }, { "epoch": 1.8797745773324985, "grad_norm": 1.4743202924728394, "learning_rate": 9.344429347826085e-06, "loss": 1.0361, "step": 12008 }, { "epoch": 1.8799311208515967, "grad_norm": 3.4973602294921875, 
"learning_rate": 9.332540760869565e-06, "loss": 1.0504, "step": 12009 }, { "epoch": 1.880087664370695, "grad_norm": 4.103195667266846, "learning_rate": 9.320652173913043e-06, "loss": 1.1312, "step": 12010 }, { "epoch": 1.8802442078897934, "grad_norm": 1.7003071308135986, "learning_rate": 9.30876358695652e-06, "loss": 0.8502, "step": 12011 }, { "epoch": 1.8804007514088916, "grad_norm": 5.290245056152344, "learning_rate": 9.296874999999999e-06, "loss": 0.9629, "step": 12012 }, { "epoch": 1.88055729492799, "grad_norm": 3.454191207885742, "learning_rate": 9.284986413043479e-06, "loss": 1.0472, "step": 12013 }, { "epoch": 1.8807138384470883, "grad_norm": 2.111452341079712, "learning_rate": 9.273097826086955e-06, "loss": 0.9322, "step": 12014 }, { "epoch": 1.8808703819661865, "grad_norm": 3.9610612392425537, "learning_rate": 9.261209239130433e-06, "loss": 0.7861, "step": 12015 }, { "epoch": 1.881026925485285, "grad_norm": 1.8798471689224243, "learning_rate": 9.249320652173913e-06, "loss": 0.7124, "step": 12016 }, { "epoch": 1.8811834690043834, "grad_norm": 8.46898365020752, "learning_rate": 9.23743206521739e-06, "loss": 0.9694, "step": 12017 }, { "epoch": 1.8813400125234816, "grad_norm": 2.6647167205810547, "learning_rate": 9.22554347826087e-06, "loss": 0.8032, "step": 12018 }, { "epoch": 1.8814965560425798, "grad_norm": 5.257624626159668, "learning_rate": 9.213654891304348e-06, "loss": 1.1153, "step": 12019 }, { "epoch": 1.881653099561678, "grad_norm": 3.5775091648101807, "learning_rate": 9.201766304347824e-06, "loss": 0.9562, "step": 12020 }, { "epoch": 1.8818096430807765, "grad_norm": 5.636914253234863, "learning_rate": 9.189877717391304e-06, "loss": 1.1491, "step": 12021 }, { "epoch": 1.881966186599875, "grad_norm": 3.4716198444366455, "learning_rate": 9.177989130434782e-06, "loss": 1.1312, "step": 12022 }, { "epoch": 1.8821227301189731, "grad_norm": 6.623715877532959, "learning_rate": 9.16610054347826e-06, "loss": 1.0641, "step": 12023 }, { "epoch": 
1.8822792736380713, "grad_norm": 6.110248565673828, "learning_rate": 9.154211956521739e-06, "loss": 1.0388, "step": 12024 }, { "epoch": 1.8824358171571696, "grad_norm": 4.3244805335998535, "learning_rate": 9.142323369565215e-06, "loss": 1.0035, "step": 12025 }, { "epoch": 1.882592360676268, "grad_norm": 5.861422538757324, "learning_rate": 9.130434782608695e-06, "loss": 1.0547, "step": 12026 }, { "epoch": 1.8827489041953664, "grad_norm": 8.071517944335938, "learning_rate": 9.118546195652173e-06, "loss": 1.1296, "step": 12027 }, { "epoch": 1.8829054477144647, "grad_norm": 4.611490726470947, "learning_rate": 9.106657608695651e-06, "loss": 0.9612, "step": 12028 }, { "epoch": 1.8830619912335629, "grad_norm": 10.456551551818848, "learning_rate": 9.09476902173913e-06, "loss": 1.0246, "step": 12029 }, { "epoch": 1.883218534752661, "grad_norm": 3.217658519744873, "learning_rate": 9.08288043478261e-06, "loss": 1.1191, "step": 12030 }, { "epoch": 1.8833750782717595, "grad_norm": 5.065220355987549, "learning_rate": 9.070991847826086e-06, "loss": 1.4525, "step": 12031 }, { "epoch": 1.883531621790858, "grad_norm": 3.779832124710083, "learning_rate": 9.059103260869564e-06, "loss": 1.1933, "step": 12032 }, { "epoch": 1.8836881653099562, "grad_norm": 4.145232200622559, "learning_rate": 9.047214673913044e-06, "loss": 0.6398, "step": 12033 }, { "epoch": 1.8838447088290544, "grad_norm": 2.2179505825042725, "learning_rate": 9.03532608695652e-06, "loss": 0.452, "step": 12034 }, { "epoch": 1.8840012523481526, "grad_norm": 4.44736909866333, "learning_rate": 9.0234375e-06, "loss": 0.5107, "step": 12035 }, { "epoch": 1.884157795867251, "grad_norm": 3.2010610103607178, "learning_rate": 9.011548913043478e-06, "loss": 0.5792, "step": 12036 }, { "epoch": 1.8843143393863495, "grad_norm": 4.158010482788086, "learning_rate": 8.999660326086955e-06, "loss": 0.9871, "step": 12037 }, { "epoch": 1.8844708829054477, "grad_norm": 1.9274159669876099, "learning_rate": 8.987771739130434e-06, "loss": 0.8672, 
"step": 12038 }, { "epoch": 1.884627426424546, "grad_norm": 0.9023780226707458, "learning_rate": 8.975883152173913e-06, "loss": 1.4126, "step": 12039 }, { "epoch": 1.8847839699436444, "grad_norm": 1.2042438983917236, "learning_rate": 8.96399456521739e-06, "loss": 1.3048, "step": 12040 }, { "epoch": 1.8849405134627426, "grad_norm": 1.058908224105835, "learning_rate": 8.952105978260869e-06, "loss": 1.3418, "step": 12041 }, { "epoch": 1.885097056981841, "grad_norm": 2.098128080368042, "learning_rate": 8.940217391304347e-06, "loss": 1.3375, "step": 12042 }, { "epoch": 1.8852536005009393, "grad_norm": 1.547957181930542, "learning_rate": 8.928328804347825e-06, "loss": 1.3077, "step": 12043 }, { "epoch": 1.8854101440200375, "grad_norm": 1.0458471775054932, "learning_rate": 8.916440217391303e-06, "loss": 1.3395, "step": 12044 }, { "epoch": 1.885566687539136, "grad_norm": 1.4712506532669067, "learning_rate": 8.904551630434783e-06, "loss": 1.4697, "step": 12045 }, { "epoch": 1.8857232310582341, "grad_norm": 1.6869171857833862, "learning_rate": 8.89266304347826e-06, "loss": 1.3732, "step": 12046 }, { "epoch": 1.8858797745773326, "grad_norm": 1.2191897630691528, "learning_rate": 8.88077445652174e-06, "loss": 1.3312, "step": 12047 }, { "epoch": 1.8860363180964308, "grad_norm": 1.3537700176239014, "learning_rate": 8.868885869565216e-06, "loss": 1.355, "step": 12048 }, { "epoch": 1.886192861615529, "grad_norm": 1.387945294380188, "learning_rate": 8.856997282608694e-06, "loss": 1.3786, "step": 12049 }, { "epoch": 1.8863494051346275, "grad_norm": 2.3830554485321045, "learning_rate": 8.845108695652174e-06, "loss": 1.3842, "step": 12050 }, { "epoch": 1.886505948653726, "grad_norm": 1.4700684547424316, "learning_rate": 8.83322010869565e-06, "loss": 1.3892, "step": 12051 }, { "epoch": 1.8866624921728241, "grad_norm": 1.3052235841751099, "learning_rate": 8.82133152173913e-06, "loss": 1.6021, "step": 12052 }, { "epoch": 1.8868190356919223, "grad_norm": 1.3360000848770142, 
"learning_rate": 8.809442934782608e-06, "loss": 1.2581, "step": 12053 }, { "epoch": 1.8869755792110205, "grad_norm": 1.6020877361297607, "learning_rate": 8.797554347826085e-06, "loss": 1.3843, "step": 12054 }, { "epoch": 1.887132122730119, "grad_norm": 3.7696011066436768, "learning_rate": 8.785665760869565e-06, "loss": 1.6178, "step": 12055 }, { "epoch": 1.8872886662492174, "grad_norm": 2.4075112342834473, "learning_rate": 8.773777173913043e-06, "loss": 1.7702, "step": 12056 }, { "epoch": 1.8874452097683156, "grad_norm": 4.039999961853027, "learning_rate": 8.761888586956521e-06, "loss": 1.4214, "step": 12057 }, { "epoch": 1.8876017532874139, "grad_norm": 1.1225229501724243, "learning_rate": 8.75e-06, "loss": 1.2267, "step": 12058 }, { "epoch": 1.887758296806512, "grad_norm": 3.2809154987335205, "learning_rate": 8.738111413043477e-06, "loss": 1.6508, "step": 12059 }, { "epoch": 1.8879148403256105, "grad_norm": 3.4546375274658203, "learning_rate": 8.726222826086956e-06, "loss": 1.4301, "step": 12060 }, { "epoch": 1.888071383844709, "grad_norm": 2.9126522541046143, "learning_rate": 8.714334239130434e-06, "loss": 1.2874, "step": 12061 }, { "epoch": 1.8882279273638072, "grad_norm": 2.721062660217285, "learning_rate": 8.702445652173912e-06, "loss": 1.2024, "step": 12062 }, { "epoch": 1.8883844708829054, "grad_norm": 2.6860079765319824, "learning_rate": 8.690557065217392e-06, "loss": 1.1961, "step": 12063 }, { "epoch": 1.8885410144020036, "grad_norm": 2.2013702392578125, "learning_rate": 8.678668478260868e-06, "loss": 1.3263, "step": 12064 }, { "epoch": 1.888697557921102, "grad_norm": 2.529597043991089, "learning_rate": 8.666779891304346e-06, "loss": 1.2285, "step": 12065 }, { "epoch": 1.8888541014402005, "grad_norm": 2.2427406311035156, "learning_rate": 8.654891304347824e-06, "loss": 0.9401, "step": 12066 }, { "epoch": 1.8890106449592987, "grad_norm": 3.6187562942504883, "learning_rate": 8.643002717391304e-06, "loss": 1.1474, "step": 12067 }, { "epoch": 
1.889167188478397, "grad_norm": 2.306546211242676, "learning_rate": 8.631114130434782e-06, "loss": 1.2569, "step": 12068 }, { "epoch": 1.8893237319974951, "grad_norm": 17.232196807861328, "learning_rate": 8.619225543478259e-06, "loss": 1.3307, "step": 12069 }, { "epoch": 1.8894802755165936, "grad_norm": 4.481505870819092, "learning_rate": 8.607336956521739e-06, "loss": 1.2792, "step": 12070 }, { "epoch": 1.889636819035692, "grad_norm": 6.089162826538086, "learning_rate": 8.595448369565217e-06, "loss": 1.2065, "step": 12071 }, { "epoch": 1.8897933625547902, "grad_norm": 3.7049992084503174, "learning_rate": 8.583559782608695e-06, "loss": 1.3675, "step": 12072 }, { "epoch": 1.8899499060738885, "grad_norm": 6.9069695472717285, "learning_rate": 8.571671195652173e-06, "loss": 1.0363, "step": 12073 }, { "epoch": 1.890106449592987, "grad_norm": 3.0067079067230225, "learning_rate": 8.559782608695651e-06, "loss": 1.0475, "step": 12074 }, { "epoch": 1.8902629931120851, "grad_norm": 12.983119010925293, "learning_rate": 8.54789402173913e-06, "loss": 1.5614, "step": 12075 }, { "epoch": 1.8904195366311836, "grad_norm": 4.616128921508789, "learning_rate": 8.536005434782608e-06, "loss": 1.2943, "step": 12076 }, { "epoch": 1.8905760801502818, "grad_norm": 3.1125991344451904, "learning_rate": 8.524116847826086e-06, "loss": 1.2166, "step": 12077 }, { "epoch": 1.89073262366938, "grad_norm": 9.953347206115723, "learning_rate": 8.512228260869564e-06, "loss": 1.1751, "step": 12078 }, { "epoch": 1.8908891671884784, "grad_norm": 3.2846717834472656, "learning_rate": 8.500339673913042e-06, "loss": 1.0908, "step": 12079 }, { "epoch": 1.8910457107075767, "grad_norm": 8.09660530090332, "learning_rate": 8.488451086956522e-06, "loss": 1.2101, "step": 12080 }, { "epoch": 1.891202254226675, "grad_norm": 4.353957176208496, "learning_rate": 8.476562499999999e-06, "loss": 1.3817, "step": 12081 }, { "epoch": 1.8913587977457733, "grad_norm": 3.101376533508301, "learning_rate": 8.464673913043477e-06, 
"loss": 1.3211, "step": 12082 }, { "epoch": 1.8915153412648715, "grad_norm": 2.771514654159546, "learning_rate": 8.452785326086957e-06, "loss": 0.7803, "step": 12083 }, { "epoch": 1.89167188478397, "grad_norm": 5.20182991027832, "learning_rate": 8.440896739130435e-06, "loss": 0.8106, "step": 12084 }, { "epoch": 1.8918284283030684, "grad_norm": 2.454831838607788, "learning_rate": 8.429008152173913e-06, "loss": 0.6204, "step": 12085 }, { "epoch": 1.8919849718221666, "grad_norm": 5.164551734924316, "learning_rate": 8.417119565217391e-06, "loss": 0.6502, "step": 12086 }, { "epoch": 1.8921415153412648, "grad_norm": 4.9991455078125, "learning_rate": 8.405230978260869e-06, "loss": 1.57, "step": 12087 }, { "epoch": 1.892298058860363, "grad_norm": 4.776933670043945, "learning_rate": 8.393342391304347e-06, "loss": 1.1701, "step": 12088 }, { "epoch": 1.8924546023794615, "grad_norm": 1.3836685419082642, "learning_rate": 8.381453804347825e-06, "loss": 1.4908, "step": 12089 }, { "epoch": 1.89261114589856, "grad_norm": 1.4602994918823242, "learning_rate": 8.369565217391304e-06, "loss": 1.4024, "step": 12090 }, { "epoch": 1.8927676894176582, "grad_norm": 0.9581389427185059, "learning_rate": 8.357676630434782e-06, "loss": 1.3685, "step": 12091 }, { "epoch": 1.8929242329367564, "grad_norm": 0.9459466338157654, "learning_rate": 8.34578804347826e-06, "loss": 1.4031, "step": 12092 }, { "epoch": 1.8930807764558546, "grad_norm": 0.9777057766914368, "learning_rate": 8.333899456521738e-06, "loss": 1.3985, "step": 12093 }, { "epoch": 1.893237319974953, "grad_norm": 0.8998606204986572, "learning_rate": 8.322010869565216e-06, "loss": 1.38, "step": 12094 }, { "epoch": 1.8933938634940515, "grad_norm": 2.548150062561035, "learning_rate": 8.310122282608694e-06, "loss": 1.4113, "step": 12095 }, { "epoch": 1.8935504070131497, "grad_norm": 1.3859814405441284, "learning_rate": 8.298233695652174e-06, "loss": 1.4437, "step": 12096 }, { "epoch": 1.893706950532248, "grad_norm": 1.1842947006225586, 
"learning_rate": 8.286345108695652e-06, "loss": 1.5314, "step": 12097 }, { "epoch": 1.8938634940513461, "grad_norm": 1.323096752166748, "learning_rate": 8.274456521739129e-06, "loss": 1.4527, "step": 12098 }, { "epoch": 1.8940200375704446, "grad_norm": 2.0267531871795654, "learning_rate": 8.262567934782607e-06, "loss": 1.3465, "step": 12099 }, { "epoch": 1.894176581089543, "grad_norm": 3.0552027225494385, "learning_rate": 8.250679347826087e-06, "loss": 1.4621, "step": 12100 }, { "epoch": 1.8943331246086412, "grad_norm": 1.6302975416183472, "learning_rate": 8.238790760869565e-06, "loss": 1.4616, "step": 12101 }, { "epoch": 1.8944896681277394, "grad_norm": 2.4284679889678955, "learning_rate": 8.226902173913043e-06, "loss": 1.5669, "step": 12102 }, { "epoch": 1.8946462116468377, "grad_norm": 1.7902894020080566, "learning_rate": 8.215013586956521e-06, "loss": 1.5983, "step": 12103 }, { "epoch": 1.894802755165936, "grad_norm": 3.459280014038086, "learning_rate": 8.203125e-06, "loss": 1.6201, "step": 12104 }, { "epoch": 1.8949592986850345, "grad_norm": 2.106574058532715, "learning_rate": 8.191236413043478e-06, "loss": 1.4236, "step": 12105 }, { "epoch": 1.8951158422041328, "grad_norm": 5.207345485687256, "learning_rate": 8.179347826086956e-06, "loss": 1.59, "step": 12106 }, { "epoch": 1.895272385723231, "grad_norm": 1.9989246129989624, "learning_rate": 8.167459239130434e-06, "loss": 1.4841, "step": 12107 }, { "epoch": 1.8954289292423294, "grad_norm": 1.7357362508773804, "learning_rate": 8.155570652173912e-06, "loss": 1.5716, "step": 12108 }, { "epoch": 1.8955854727614276, "grad_norm": 1.7323013544082642, "learning_rate": 8.14368206521739e-06, "loss": 1.6681, "step": 12109 }, { "epoch": 1.895742016280526, "grad_norm": 1.8410289287567139, "learning_rate": 8.131793478260868e-06, "loss": 1.5594, "step": 12110 }, { "epoch": 1.8958985597996243, "grad_norm": 2.5588555335998535, "learning_rate": 8.119904891304347e-06, "loss": 1.6043, "step": 12111 }, { "epoch": 
1.8960551033187225, "grad_norm": 3.2712881565093994, "learning_rate": 8.108016304347825e-06, "loss": 1.5536, "step": 12112 }, { "epoch": 1.896211646837821, "grad_norm": 2.43414568901062, "learning_rate": 8.096127717391305e-06, "loss": 1.4612, "step": 12113 }, { "epoch": 1.8963681903569192, "grad_norm": 6.389035701751709, "learning_rate": 8.084239130434781e-06, "loss": 1.6381, "step": 12114 }, { "epoch": 1.8965247338760176, "grad_norm": 1.6718326807022095, "learning_rate": 8.07235054347826e-06, "loss": 1.233, "step": 12115 }, { "epoch": 1.8966812773951158, "grad_norm": 2.8170976638793945, "learning_rate": 8.060461956521739e-06, "loss": 1.5445, "step": 12116 }, { "epoch": 1.896837820914214, "grad_norm": 8.611741065979004, "learning_rate": 8.048573369565217e-06, "loss": 1.592, "step": 12117 }, { "epoch": 1.8969943644333125, "grad_norm": 1.9037225246429443, "learning_rate": 8.036684782608695e-06, "loss": 1.2415, "step": 12118 }, { "epoch": 1.897150907952411, "grad_norm": 4.240595817565918, "learning_rate": 8.024796195652174e-06, "loss": 1.3987, "step": 12119 }, { "epoch": 1.8973074514715091, "grad_norm": 4.979689598083496, "learning_rate": 8.012907608695652e-06, "loss": 1.2794, "step": 12120 }, { "epoch": 1.8974639949906074, "grad_norm": 3.253326177597046, "learning_rate": 8.00101902173913e-06, "loss": 1.0924, "step": 12121 }, { "epoch": 1.8976205385097056, "grad_norm": 3.201193332672119, "learning_rate": 7.989130434782608e-06, "loss": 1.0645, "step": 12122 }, { "epoch": 1.897777082028804, "grad_norm": 1.9880077838897705, "learning_rate": 7.977241847826086e-06, "loss": 1.1098, "step": 12123 }, { "epoch": 1.8979336255479025, "grad_norm": 2.412951946258545, "learning_rate": 7.965353260869564e-06, "loss": 1.0516, "step": 12124 }, { "epoch": 1.8980901690670007, "grad_norm": 4.633194923400879, "learning_rate": 7.953464673913042e-06, "loss": 1.4498, "step": 12125 }, { "epoch": 1.898246712586099, "grad_norm": 8.784133911132812, "learning_rate": 7.94157608695652e-06, "loss": 
1.0794, "step": 12126 }, { "epoch": 1.8984032561051971, "grad_norm": 4.3559112548828125, "learning_rate": 7.929687499999999e-06, "loss": 1.0433, "step": 12127 }, { "epoch": 1.8985597996242956, "grad_norm": 5.4930033683776855, "learning_rate": 7.917798913043477e-06, "loss": 1.0553, "step": 12128 }, { "epoch": 1.898716343143394, "grad_norm": 3.861570119857788, "learning_rate": 7.905910326086957e-06, "loss": 0.8358, "step": 12129 }, { "epoch": 1.8988728866624922, "grad_norm": 5.126827239990234, "learning_rate": 7.894021739130435e-06, "loss": 1.5065, "step": 12130 }, { "epoch": 1.8990294301815904, "grad_norm": 4.6554975509643555, "learning_rate": 7.882133152173911e-06, "loss": 1.2077, "step": 12131 }, { "epoch": 1.8991859737006886, "grad_norm": 2.212658405303955, "learning_rate": 7.870244565217391e-06, "loss": 0.9871, "step": 12132 }, { "epoch": 1.899342517219787, "grad_norm": 12.761900901794434, "learning_rate": 7.85835597826087e-06, "loss": 1.2654, "step": 12133 }, { "epoch": 1.8994990607388855, "grad_norm": 2.2142648696899414, "learning_rate": 7.846467391304348e-06, "loss": 0.5523, "step": 12134 }, { "epoch": 1.8996556042579837, "grad_norm": 2.1725971698760986, "learning_rate": 7.834578804347826e-06, "loss": 0.4092, "step": 12135 }, { "epoch": 1.899812147777082, "grad_norm": 2.5697529315948486, "learning_rate": 7.822690217391304e-06, "loss": 0.5833, "step": 12136 }, { "epoch": 1.8999686912961802, "grad_norm": 5.738124847412109, "learning_rate": 7.810801630434782e-06, "loss": 1.3979, "step": 12137 }, { "epoch": 1.9001252348152786, "grad_norm": 2.959599018096924, "learning_rate": 7.79891304347826e-06, "loss": 0.6991, "step": 12138 }, { "epoch": 1.900281778334377, "grad_norm": 1.184004545211792, "learning_rate": 7.787024456521738e-06, "loss": 1.8201, "step": 12139 }, { "epoch": 1.9004383218534753, "grad_norm": 1.2237941026687622, "learning_rate": 7.775135869565216e-06, "loss": 1.6556, "step": 12140 }, { "epoch": 1.9005948653725735, "grad_norm": 1.3715628385543823, 
"learning_rate": 7.763247282608695e-06, "loss": 1.6394, "step": 12141 }, { "epoch": 1.900751408891672, "grad_norm": 1.085321307182312, "learning_rate": 7.751358695652173e-06, "loss": 1.7045, "step": 12142 }, { "epoch": 1.9009079524107702, "grad_norm": 1.206402063369751, "learning_rate": 7.739470108695651e-06, "loss": 1.5404, "step": 12143 }, { "epoch": 1.9010644959298686, "grad_norm": 1.5719717741012573, "learning_rate": 7.727581521739129e-06, "loss": 1.5999, "step": 12144 }, { "epoch": 1.9012210394489668, "grad_norm": 1.0713475942611694, "learning_rate": 7.715692934782607e-06, "loss": 1.5786, "step": 12145 }, { "epoch": 1.901377582968065, "grad_norm": 1.4197187423706055, "learning_rate": 7.703804347826087e-06, "loss": 1.6603, "step": 12146 }, { "epoch": 1.9015341264871635, "grad_norm": 1.3313614130020142, "learning_rate": 7.691915760869565e-06, "loss": 1.4919, "step": 12147 }, { "epoch": 1.9016906700062617, "grad_norm": 1.393036961555481, "learning_rate": 7.680027173913042e-06, "loss": 1.5931, "step": 12148 }, { "epoch": 1.9018472135253601, "grad_norm": 1.5646010637283325, "learning_rate": 7.668138586956522e-06, "loss": 1.4612, "step": 12149 }, { "epoch": 1.9020037570444583, "grad_norm": 1.3110668659210205, "learning_rate": 7.65625e-06, "loss": 1.4685, "step": 12150 }, { "epoch": 1.9021603005635566, "grad_norm": 1.8773938417434692, "learning_rate": 7.644361413043478e-06, "loss": 1.5599, "step": 12151 }, { "epoch": 1.902316844082655, "grad_norm": 1.2994425296783447, "learning_rate": 7.632472826086956e-06, "loss": 1.3498, "step": 12152 }, { "epoch": 1.9024733876017534, "grad_norm": 1.5827687978744507, "learning_rate": 7.620584239130434e-06, "loss": 1.4784, "step": 12153 }, { "epoch": 1.9026299311208517, "grad_norm": 8.549077987670898, "learning_rate": 7.608695652173912e-06, "loss": 1.8154, "step": 12154 }, { "epoch": 1.9027864746399499, "grad_norm": 2.637472629547119, "learning_rate": 7.596807065217391e-06, "loss": 1.2566, "step": 12155 }, { "epoch": 
1.902943018159048, "grad_norm": 3.6172590255737305, "learning_rate": 7.584918478260869e-06, "loss": 1.4257, "step": 12156 }, { "epoch": 1.9030995616781465, "grad_norm": 1.685028314590454, "learning_rate": 7.573029891304347e-06, "loss": 1.4664, "step": 12157 }, { "epoch": 1.903256105197245, "grad_norm": 1.8909518718719482, "learning_rate": 7.561141304347825e-06, "loss": 1.5541, "step": 12158 }, { "epoch": 1.9034126487163432, "grad_norm": 3.684206247329712, "learning_rate": 7.549252717391304e-06, "loss": 1.519, "step": 12159 }, { "epoch": 1.9035691922354414, "grad_norm": 2.628264904022217, "learning_rate": 7.537364130434782e-06, "loss": 1.6628, "step": 12160 }, { "epoch": 1.9037257357545396, "grad_norm": 2.0247416496276855, "learning_rate": 7.5254755434782595e-06, "loss": 1.3409, "step": 12161 }, { "epoch": 1.903882279273638, "grad_norm": 1.917043924331665, "learning_rate": 7.5135869565217385e-06, "loss": 1.4125, "step": 12162 }, { "epoch": 1.9040388227927365, "grad_norm": 2.69948673248291, "learning_rate": 7.501698369565217e-06, "loss": 1.4876, "step": 12163 }, { "epoch": 1.9041953663118347, "grad_norm": 3.4638454914093018, "learning_rate": 7.489809782608695e-06, "loss": 1.523, "step": 12164 }, { "epoch": 1.904351909830933, "grad_norm": 8.092854499816895, "learning_rate": 7.477921195652174e-06, "loss": 1.4877, "step": 12165 }, { "epoch": 1.9045084533500312, "grad_norm": 1.808316946029663, "learning_rate": 7.466032608695652e-06, "loss": 1.2656, "step": 12166 }, { "epoch": 1.9046649968691296, "grad_norm": 3.6249818801879883, "learning_rate": 7.454144021739129e-06, "loss": 1.4334, "step": 12167 }, { "epoch": 1.904821540388228, "grad_norm": 4.060814380645752, "learning_rate": 7.442255434782607e-06, "loss": 1.5012, "step": 12168 }, { "epoch": 1.9049780839073263, "grad_norm": 3.9391379356384277, "learning_rate": 7.430366847826086e-06, "loss": 1.3154, "step": 12169 }, { "epoch": 1.9051346274264245, "grad_norm": 10.226801872253418, "learning_rate": 7.4184782608695646e-06, 
"loss": 1.2949, "step": 12170 }, { "epoch": 1.9052911709455227, "grad_norm": 6.296871185302734, "learning_rate": 7.406589673913043e-06, "loss": 0.9698, "step": 12171 }, { "epoch": 1.9054477144646211, "grad_norm": 2.691234588623047, "learning_rate": 7.394701086956522e-06, "loss": 0.9446, "step": 12172 }, { "epoch": 1.9056042579837196, "grad_norm": 3.575914144515991, "learning_rate": 7.382812499999999e-06, "loss": 1.1816, "step": 12173 }, { "epoch": 1.9057608015028178, "grad_norm": 8.019679069519043, "learning_rate": 7.370923913043477e-06, "loss": 1.3632, "step": 12174 }, { "epoch": 1.905917345021916, "grad_norm": 2.9085819721221924, "learning_rate": 7.359035326086956e-06, "loss": 1.0312, "step": 12175 }, { "epoch": 1.9060738885410144, "grad_norm": 4.2262115478515625, "learning_rate": 7.347146739130434e-06, "loss": 1.0929, "step": 12176 }, { "epoch": 1.9062304320601127, "grad_norm": 4.829705715179443, "learning_rate": 7.3352581521739125e-06, "loss": 0.9001, "step": 12177 }, { "epoch": 1.906386975579211, "grad_norm": 15.931147575378418, "learning_rate": 7.3233695652173915e-06, "loss": 0.9818, "step": 12178 }, { "epoch": 1.9065435190983093, "grad_norm": 9.175643920898438, "learning_rate": 7.311480978260869e-06, "loss": 0.9203, "step": 12179 }, { "epoch": 1.9067000626174075, "grad_norm": 12.908992767333984, "learning_rate": 7.299592391304347e-06, "loss": 1.1341, "step": 12180 }, { "epoch": 1.906856606136506, "grad_norm": 2.3976798057556152, "learning_rate": 7.287703804347825e-06, "loss": 0.7828, "step": 12181 }, { "epoch": 1.9070131496556044, "grad_norm": 4.804961204528809, "learning_rate": 7.275815217391304e-06, "loss": 0.7281, "step": 12182 }, { "epoch": 1.9071696931747026, "grad_norm": 1.3030637502670288, "learning_rate": 7.263926630434782e-06, "loss": 0.4715, "step": 12183 }, { "epoch": 1.9073262366938009, "grad_norm": 2.698925733566284, "learning_rate": 7.25203804347826e-06, "loss": 1.148, "step": 12184 }, { "epoch": 1.907482780212899, "grad_norm": 
1.5554184913635254, "learning_rate": 7.240149456521739e-06, "loss": 0.5496, "step": 12185 }, { "epoch": 1.9076393237319975, "grad_norm": 1.4283099174499512, "learning_rate": 7.228260869565217e-06, "loss": 0.3182, "step": 12186 }, { "epoch": 1.907795867251096, "grad_norm": 2.2281494140625, "learning_rate": 7.216372282608695e-06, "loss": 0.5344, "step": 12187 }, { "epoch": 1.9079524107701942, "grad_norm": 5.111281871795654, "learning_rate": 7.204483695652174e-06, "loss": 1.5576, "step": 12188 }, { "epoch": 1.9081089542892924, "grad_norm": 1.1142865419387817, "learning_rate": 7.192595108695652e-06, "loss": 1.9629, "step": 12189 }, { "epoch": 1.9082654978083906, "grad_norm": 1.2958009243011475, "learning_rate": 7.180706521739129e-06, "loss": 2.0522, "step": 12190 }, { "epoch": 1.908422041327489, "grad_norm": 1.2441725730895996, "learning_rate": 7.1688179347826075e-06, "loss": 1.9105, "step": 12191 }, { "epoch": 1.9085785848465875, "grad_norm": 1.7363780736923218, "learning_rate": 7.1569293478260865e-06, "loss": 1.8997, "step": 12192 }, { "epoch": 1.9087351283656857, "grad_norm": 1.4750323295593262, "learning_rate": 7.145040760869565e-06, "loss": 2.0643, "step": 12193 }, { "epoch": 1.908891671884784, "grad_norm": 1.5276517868041992, "learning_rate": 7.133152173913043e-06, "loss": 2.0296, "step": 12194 }, { "epoch": 1.9090482154038821, "grad_norm": 1.6160048246383667, "learning_rate": 7.121263586956521e-06, "loss": 1.9061, "step": 12195 }, { "epoch": 1.9092047589229806, "grad_norm": 2.2891082763671875, "learning_rate": 7.109374999999999e-06, "loss": 1.8434, "step": 12196 }, { "epoch": 1.909361302442079, "grad_norm": 1.2995740175247192, "learning_rate": 7.097486413043477e-06, "loss": 1.9639, "step": 12197 }, { "epoch": 1.9095178459611772, "grad_norm": 2.75858473777771, "learning_rate": 7.085597826086956e-06, "loss": 1.8775, "step": 12198 }, { "epoch": 1.9096743894802755, "grad_norm": 1.4493021965026855, "learning_rate": 7.0737092391304345e-06, "loss": 1.9862, "step": 
12199 }, { "epoch": 1.9098309329993737, "grad_norm": 1.8264774084091187, "learning_rate": 7.061820652173913e-06, "loss": 2.0962, "step": 12200 }, { "epoch": 1.9099874765184721, "grad_norm": 1.832051396369934, "learning_rate": 7.049932065217391e-06, "loss": 1.8653, "step": 12201 }, { "epoch": 1.9101440200375706, "grad_norm": 4.764091491699219, "learning_rate": 7.038043478260869e-06, "loss": 2.0645, "step": 12202 }, { "epoch": 1.9103005635566688, "grad_norm": 3.600449323654175, "learning_rate": 7.026154891304347e-06, "loss": 2.0106, "step": 12203 }, { "epoch": 1.910457107075767, "grad_norm": 1.5762195587158203, "learning_rate": 7.014266304347825e-06, "loss": 1.6768, "step": 12204 }, { "epoch": 1.9106136505948652, "grad_norm": 1.767044186592102, "learning_rate": 7.002377717391304e-06, "loss": 2.1418, "step": 12205 }, { "epoch": 1.9107701941139636, "grad_norm": 1.8072794675827026, "learning_rate": 6.9904891304347816e-06, "loss": 2.1719, "step": 12206 }, { "epoch": 1.910926737633062, "grad_norm": 2.0602800846099854, "learning_rate": 6.97860054347826e-06, "loss": 2.0311, "step": 12207 }, { "epoch": 1.9110832811521603, "grad_norm": 1.930262565612793, "learning_rate": 6.966711956521739e-06, "loss": 2.1272, "step": 12208 }, { "epoch": 1.9112398246712585, "grad_norm": 2.1120412349700928, "learning_rate": 6.954823369565217e-06, "loss": 1.8811, "step": 12209 }, { "epoch": 1.911396368190357, "grad_norm": 2.8410096168518066, "learning_rate": 6.942934782608695e-06, "loss": 1.7922, "step": 12210 }, { "epoch": 1.9115529117094552, "grad_norm": 2.017179250717163, "learning_rate": 6.931046195652174e-06, "loss": 1.9551, "step": 12211 }, { "epoch": 1.9117094552285536, "grad_norm": 5.58518648147583, "learning_rate": 6.919157608695651e-06, "loss": 1.9546, "step": 12212 }, { "epoch": 1.9118659987476518, "grad_norm": 4.583561420440674, "learning_rate": 6.9072690217391295e-06, "loss": 1.9692, "step": 12213 }, { "epoch": 1.91202254226675, "grad_norm": 2.953556776046753, "learning_rate": 
6.895380434782608e-06, "loss": 1.8738, "step": 12214 }, { "epoch": 1.9121790857858485, "grad_norm": 2.387061834335327, "learning_rate": 6.883491847826087e-06, "loss": 1.8052, "step": 12215 }, { "epoch": 1.912335629304947, "grad_norm": 2.0455925464630127, "learning_rate": 6.871603260869565e-06, "loss": 1.5295, "step": 12216 }, { "epoch": 1.9124921728240452, "grad_norm": 3.9239466190338135, "learning_rate": 6.859714673913042e-06, "loss": 1.9881, "step": 12217 }, { "epoch": 1.9126487163431434, "grad_norm": 2.8050377368927, "learning_rate": 6.847826086956521e-06, "loss": 2.0703, "step": 12218 }, { "epoch": 1.9128052598622416, "grad_norm": 4.846762180328369, "learning_rate": 6.835937499999999e-06, "loss": 1.3685, "step": 12219 }, { "epoch": 1.91296180338134, "grad_norm": 3.1980719566345215, "learning_rate": 6.8240489130434775e-06, "loss": 1.6473, "step": 12220 }, { "epoch": 1.9131183469004385, "grad_norm": 6.816134452819824, "learning_rate": 6.8121603260869565e-06, "loss": 1.3493, "step": 12221 }, { "epoch": 1.9132748904195367, "grad_norm": 3.0004615783691406, "learning_rate": 6.800271739130435e-06, "loss": 1.3422, "step": 12222 }, { "epoch": 1.913431433938635, "grad_norm": 11.231470108032227, "learning_rate": 6.788383152173912e-06, "loss": 1.3851, "step": 12223 }, { "epoch": 1.9135879774577331, "grad_norm": 3.021394729614258, "learning_rate": 6.776494565217391e-06, "loss": 1.1963, "step": 12224 }, { "epoch": 1.9137445209768316, "grad_norm": 3.9974091053009033, "learning_rate": 6.764605978260869e-06, "loss": 0.9768, "step": 12225 }, { "epoch": 1.91390106449593, "grad_norm": 13.338269233703613, "learning_rate": 6.752717391304347e-06, "loss": 1.2967, "step": 12226 }, { "epoch": 1.9140576080150282, "grad_norm": 3.481294870376587, "learning_rate": 6.740828804347825e-06, "loss": 1.6747, "step": 12227 }, { "epoch": 1.9142141515341264, "grad_norm": 4.505405902862549, "learning_rate": 6.728940217391304e-06, "loss": 1.2727, "step": 12228 }, { "epoch": 1.9143706950532247, 
"grad_norm": 9.933453559875488, "learning_rate": 6.717051630434782e-06, "loss": 1.6203, "step": 12229 }, { "epoch": 1.914527238572323, "grad_norm": 5.748189449310303, "learning_rate": 6.70516304347826e-06, "loss": 1.294, "step": 12230 }, { "epoch": 1.9146837820914215, "grad_norm": 3.7171359062194824, "learning_rate": 6.693274456521739e-06, "loss": 0.751, "step": 12231 }, { "epoch": 1.9148403256105198, "grad_norm": 6.425541400909424, "learning_rate": 6.681385869565217e-06, "loss": 1.113, "step": 12232 }, { "epoch": 1.914996869129618, "grad_norm": 4.607865333557129, "learning_rate": 6.669497282608695e-06, "loss": 0.9885, "step": 12233 }, { "epoch": 1.9151534126487162, "grad_norm": 3.9687905311584473, "learning_rate": 6.657608695652173e-06, "loss": 0.3232, "step": 12234 }, { "epoch": 1.9153099561678146, "grad_norm": 2.3024561405181885, "learning_rate": 6.6457201086956515e-06, "loss": 0.5532, "step": 12235 }, { "epoch": 1.915466499686913, "grad_norm": 1.5880708694458008, "learning_rate": 6.63383152173913e-06, "loss": 0.405, "step": 12236 }, { "epoch": 1.9156230432060113, "grad_norm": 4.589909553527832, "learning_rate": 6.621942934782608e-06, "loss": 1.0445, "step": 12237 }, { "epoch": 1.9157795867251095, "grad_norm": 5.7333855628967285, "learning_rate": 6.610054347826087e-06, "loss": 1.0476, "step": 12238 }, { "epoch": 1.9159361302442077, "grad_norm": 1.4561870098114014, "learning_rate": 6.598165760869565e-06, "loss": 2.4521, "step": 12239 }, { "epoch": 1.9160926737633062, "grad_norm": 1.531969428062439, "learning_rate": 6.586277173913042e-06, "loss": 2.4725, "step": 12240 }, { "epoch": 1.9162492172824046, "grad_norm": 1.7158375978469849, "learning_rate": 6.574388586956521e-06, "loss": 2.4139, "step": 12241 }, { "epoch": 1.9164057608015028, "grad_norm": 1.3728220462799072, "learning_rate": 6.5624999999999994e-06, "loss": 2.3588, "step": 12242 }, { "epoch": 1.916562304320601, "grad_norm": 1.8352272510528564, "learning_rate": 6.550611413043478e-06, "loss": 2.3586, 
"step": 12243 }, { "epoch": 1.9167188478396995, "grad_norm": 1.7884385585784912, "learning_rate": 6.538722826086957e-06, "loss": 2.4644, "step": 12244 }, { "epoch": 1.9168753913587977, "grad_norm": 1.7442266941070557, "learning_rate": 6.526834239130434e-06, "loss": 2.2601, "step": 12245 }, { "epoch": 1.9170319348778961, "grad_norm": 1.4835973978042603, "learning_rate": 6.514945652173912e-06, "loss": 2.3666, "step": 12246 }, { "epoch": 1.9171884783969944, "grad_norm": 1.5185145139694214, "learning_rate": 6.503057065217391e-06, "loss": 2.1783, "step": 12247 }, { "epoch": 1.9173450219160926, "grad_norm": 1.6305327415466309, "learning_rate": 6.491168478260869e-06, "loss": 2.193, "step": 12248 }, { "epoch": 1.917501565435191, "grad_norm": 1.9223718643188477, "learning_rate": 6.479279891304347e-06, "loss": 2.1281, "step": 12249 }, { "epoch": 1.9176581089542895, "grad_norm": 1.571926474571228, "learning_rate": 6.4673913043478255e-06, "loss": 2.2227, "step": 12250 }, { "epoch": 1.9178146524733877, "grad_norm": 1.7400630712509155, "learning_rate": 6.455502717391304e-06, "loss": 2.0286, "step": 12251 }, { "epoch": 1.9179711959924859, "grad_norm": 2.0638084411621094, "learning_rate": 6.443614130434782e-06, "loss": 2.0634, "step": 12252 }, { "epoch": 1.918127739511584, "grad_norm": 2.463390350341797, "learning_rate": 6.43172554347826e-06, "loss": 2.0505, "step": 12253 }, { "epoch": 1.9182842830306825, "grad_norm": 2.3745245933532715, "learning_rate": 6.419836956521739e-06, "loss": 1.9492, "step": 12254 }, { "epoch": 1.918440826549781, "grad_norm": 1.8992558717727661, "learning_rate": 6.407948369565217e-06, "loss": 1.8919, "step": 12255 }, { "epoch": 1.9185973700688792, "grad_norm": 1.6064308881759644, "learning_rate": 6.3960597826086945e-06, "loss": 1.8383, "step": 12256 }, { "epoch": 1.9187539135879774, "grad_norm": 3.223004102706909, "learning_rate": 6.3841711956521735e-06, "loss": 1.6631, "step": 12257 }, { "epoch": 1.9189104571070756, "grad_norm": 1.5276122093200684, 
"learning_rate": 6.372282608695652e-06, "loss": 1.8007, "step": 12258 }, { "epoch": 1.919067000626174, "grad_norm": 2.1381149291992188, "learning_rate": 6.36039402173913e-06, "loss": 1.7404, "step": 12259 }, { "epoch": 1.9192235441452725, "grad_norm": 2.0583279132843018, "learning_rate": 6.348505434782608e-06, "loss": 1.6697, "step": 12260 }, { "epoch": 1.9193800876643707, "grad_norm": 2.1569056510925293, "learning_rate": 6.336616847826087e-06, "loss": 1.7404, "step": 12261 }, { "epoch": 1.919536631183469, "grad_norm": 11.211871147155762, "learning_rate": 6.324728260869564e-06, "loss": 2.5128, "step": 12262 }, { "epoch": 1.9196931747025672, "grad_norm": 2.457289457321167, "learning_rate": 6.312839673913042e-06, "loss": 1.5101, "step": 12263 }, { "epoch": 1.9198497182216656, "grad_norm": 2.8831570148468018, "learning_rate": 6.300951086956521e-06, "loss": 1.51, "step": 12264 }, { "epoch": 1.920006261740764, "grad_norm": 2.2516047954559326, "learning_rate": 6.2890624999999996e-06, "loss": 1.126, "step": 12265 }, { "epoch": 1.9201628052598623, "grad_norm": 3.058497428894043, "learning_rate": 6.277173913043478e-06, "loss": 1.5728, "step": 12266 }, { "epoch": 1.9203193487789605, "grad_norm": 2.6846938133239746, "learning_rate": 6.265285326086957e-06, "loss": 1.4643, "step": 12267 }, { "epoch": 1.9204758922980587, "grad_norm": 6.273610591888428, "learning_rate": 6.253396739130434e-06, "loss": 1.3974, "step": 12268 }, { "epoch": 1.9206324358171571, "grad_norm": 5.621363639831543, "learning_rate": 6.241508152173912e-06, "loss": 1.3512, "step": 12269 }, { "epoch": 1.9207889793362556, "grad_norm": 4.470064640045166, "learning_rate": 6.229619565217391e-06, "loss": 1.7671, "step": 12270 }, { "epoch": 1.9209455228553538, "grad_norm": 3.925358533859253, "learning_rate": 6.217730978260869e-06, "loss": 1.3068, "step": 12271 }, { "epoch": 1.921102066374452, "grad_norm": 9.679473876953125, "learning_rate": 6.2058423913043475e-06, "loss": 1.1192, "step": 12272 }, { "epoch": 
1.9212586098935505, "grad_norm": 3.8482863903045654, "learning_rate": 6.193953804347825e-06, "loss": 1.1948, "step": 12273 }, { "epoch": 1.9214151534126487, "grad_norm": 7.325191020965576, "learning_rate": 6.182065217391304e-06, "loss": 1.0242, "step": 12274 }, { "epoch": 1.9215716969317471, "grad_norm": 4.985886573791504, "learning_rate": 6.170176630434782e-06, "loss": 1.6941, "step": 12275 }, { "epoch": 1.9217282404508453, "grad_norm": 6.45041036605835, "learning_rate": 6.15828804347826e-06, "loss": 1.5721, "step": 12276 }, { "epoch": 1.9218847839699436, "grad_norm": 2.5480029582977295, "learning_rate": 6.146399456521739e-06, "loss": 1.0821, "step": 12277 }, { "epoch": 1.922041327489042, "grad_norm": 10.45530891418457, "learning_rate": 6.134510869565217e-06, "loss": 1.4276, "step": 12278 }, { "epoch": 1.9221978710081402, "grad_norm": 5.244997024536133, "learning_rate": 6.122622282608695e-06, "loss": 1.4449, "step": 12279 }, { "epoch": 1.9223544145272387, "grad_norm": 17.423776626586914, "learning_rate": 6.110733695652174e-06, "loss": 1.7577, "step": 12280 }, { "epoch": 1.9225109580463369, "grad_norm": 6.36430549621582, "learning_rate": 6.098845108695652e-06, "loss": 1.3087, "step": 12281 }, { "epoch": 1.922667501565435, "grad_norm": 6.842656135559082, "learning_rate": 6.08695652173913e-06, "loss": 1.0402, "step": 12282 }, { "epoch": 1.9228240450845335, "grad_norm": 2.5672664642333984, "learning_rate": 6.075067934782608e-06, "loss": 0.9173, "step": 12283 }, { "epoch": 1.922980588603632, "grad_norm": 1.7665144205093384, "learning_rate": 6.063179347826086e-06, "loss": 0.5333, "step": 12284 }, { "epoch": 1.9231371321227302, "grad_norm": 3.1151652336120605, "learning_rate": 6.051290760869564e-06, "loss": 0.5859, "step": 12285 }, { "epoch": 1.9232936756418284, "grad_norm": 1.6582437753677368, "learning_rate": 6.0394021739130425e-06, "loss": 0.4135, "step": 12286 }, { "epoch": 1.9234502191609266, "grad_norm": 1.9730229377746582, "learning_rate": 6.0275135869565215e-06, 
"loss": 0.4226, "step": 12287 }, { "epoch": 1.923606762680025, "grad_norm": 3.6888885498046875, "learning_rate": 6.015625e-06, "loss": 0.5781, "step": 12288 }, { "epoch": 1.9237633061991235, "grad_norm": 2.56406307220459, "learning_rate": 6.003736413043478e-06, "loss": 2.3486, "step": 12289 }, { "epoch": 1.9239198497182217, "grad_norm": 1.526360034942627, "learning_rate": 5.991847826086956e-06, "loss": 2.4377, "step": 12290 }, { "epoch": 1.92407639323732, "grad_norm": 1.565175175666809, "learning_rate": 5.979959239130434e-06, "loss": 2.4437, "step": 12291 }, { "epoch": 1.9242329367564182, "grad_norm": 1.5605591535568237, "learning_rate": 5.968070652173912e-06, "loss": 2.4167, "step": 12292 }, { "epoch": 1.9243894802755166, "grad_norm": 1.7146700620651245, "learning_rate": 5.956182065217391e-06, "loss": 2.4604, "step": 12293 }, { "epoch": 1.924546023794615, "grad_norm": 1.737808346748352, "learning_rate": 5.9442934782608695e-06, "loss": 2.5084, "step": 12294 }, { "epoch": 1.9247025673137133, "grad_norm": 1.5015642642974854, "learning_rate": 5.932404891304347e-06, "loss": 2.3111, "step": 12295 }, { "epoch": 1.9248591108328115, "grad_norm": 1.3156194686889648, "learning_rate": 5.920516304347825e-06, "loss": 2.2122, "step": 12296 }, { "epoch": 1.9250156543519097, "grad_norm": 1.581198811531067, "learning_rate": 5.908627717391304e-06, "loss": 2.3939, "step": 12297 }, { "epoch": 1.9251721978710081, "grad_norm": 1.5095152854919434, "learning_rate": 5.896739130434782e-06, "loss": 2.2152, "step": 12298 }, { "epoch": 1.9253287413901066, "grad_norm": 1.5747214555740356, "learning_rate": 5.88485054347826e-06, "loss": 2.236, "step": 12299 }, { "epoch": 1.9254852849092048, "grad_norm": 3.500892400741577, "learning_rate": 5.872961956521739e-06, "loss": 2.564, "step": 12300 }, { "epoch": 1.925641828428303, "grad_norm": 2.4266979694366455, "learning_rate": 5.8610733695652166e-06, "loss": 2.2495, "step": 12301 }, { "epoch": 1.9257983719474012, "grad_norm": 1.8683334589004517, 
"learning_rate": 5.849184782608695e-06, "loss": 2.2537, "step": 12302 }, { "epoch": 1.9259549154664997, "grad_norm": 1.564625859260559, "learning_rate": 5.837296195652174e-06, "loss": 2.1602, "step": 12303 }, { "epoch": 1.926111458985598, "grad_norm": 2.486330032348633, "learning_rate": 5.825407608695652e-06, "loss": 2.2637, "step": 12304 }, { "epoch": 1.9262680025046963, "grad_norm": 2.215700149536133, "learning_rate": 5.81351902173913e-06, "loss": 2.0907, "step": 12305 }, { "epoch": 1.9264245460237945, "grad_norm": 2.592102527618408, "learning_rate": 5.801630434782607e-06, "loss": 2.0098, "step": 12306 }, { "epoch": 1.926581089542893, "grad_norm": 3.026496410369873, "learning_rate": 5.789741847826086e-06, "loss": 2.2483, "step": 12307 }, { "epoch": 1.9267376330619912, "grad_norm": 2.526388645172119, "learning_rate": 5.7778532608695645e-06, "loss": 2.0829, "step": 12308 }, { "epoch": 1.9268941765810896, "grad_norm": 4.068664073944092, "learning_rate": 5.765964673913043e-06, "loss": 1.91, "step": 12309 }, { "epoch": 1.9270507201001879, "grad_norm": 2.7168750762939453, "learning_rate": 5.754076086956522e-06, "loss": 1.9052, "step": 12310 }, { "epoch": 1.927207263619286, "grad_norm": 2.8469324111938477, "learning_rate": 5.7421875e-06, "loss": 1.968, "step": 12311 }, { "epoch": 1.9273638071383845, "grad_norm": 6.543916702270508, "learning_rate": 5.730298913043477e-06, "loss": 1.9744, "step": 12312 }, { "epoch": 1.9275203506574827, "grad_norm": 2.5627217292785645, "learning_rate": 5.718410326086956e-06, "loss": 1.5817, "step": 12313 }, { "epoch": 1.9276768941765812, "grad_norm": 2.7339072227478027, "learning_rate": 5.706521739130434e-06, "loss": 2.0097, "step": 12314 }, { "epoch": 1.9278334376956794, "grad_norm": 3.197436809539795, "learning_rate": 5.6946331521739124e-06, "loss": 2.0305, "step": 12315 }, { "epoch": 1.9279899812147776, "grad_norm": 3.2693614959716797, "learning_rate": 5.6827445652173914e-06, "loss": 1.7463, "step": 12316 }, { "epoch": 1.928146524733876, 
"grad_norm": 4.1048688888549805, "learning_rate": 5.67085597826087e-06, "loss": 1.6916, "step": 12317 }, { "epoch": 1.9283030682529745, "grad_norm": 2.592939615249634, "learning_rate": 5.658967391304347e-06, "loss": 1.3139, "step": 12318 }, { "epoch": 1.9284596117720727, "grad_norm": 6.5928192138671875, "learning_rate": 5.647078804347825e-06, "loss": 1.3491, "step": 12319 }, { "epoch": 1.928616155291171, "grad_norm": 3.9032633304595947, "learning_rate": 5.635190217391304e-06, "loss": 1.6425, "step": 12320 }, { "epoch": 1.9287726988102691, "grad_norm": 5.684948444366455, "learning_rate": 5.623301630434782e-06, "loss": 1.5142, "step": 12321 }, { "epoch": 1.9289292423293676, "grad_norm": 22.180265426635742, "learning_rate": 5.61141304347826e-06, "loss": 1.3238, "step": 12322 }, { "epoch": 1.929085785848466, "grad_norm": 5.6028289794921875, "learning_rate": 5.599524456521739e-06, "loss": 1.6784, "step": 12323 }, { "epoch": 1.9292423293675642, "grad_norm": 3.805162191390991, "learning_rate": 5.587635869565217e-06, "loss": 1.6315, "step": 12324 }, { "epoch": 1.9293988728866625, "grad_norm": 3.011730194091797, "learning_rate": 5.575747282608695e-06, "loss": 1.6055, "step": 12325 }, { "epoch": 1.9295554164057607, "grad_norm": 5.0912675857543945, "learning_rate": 5.563858695652174e-06, "loss": 1.516, "step": 12326 }, { "epoch": 1.929711959924859, "grad_norm": 3.2260942459106445, "learning_rate": 5.551970108695652e-06, "loss": 1.2888, "step": 12327 }, { "epoch": 1.9298685034439576, "grad_norm": 4.288518905639648, "learning_rate": 5.54008152173913e-06, "loss": 1.6404, "step": 12328 }, { "epoch": 1.9300250469630558, "grad_norm": 4.512974739074707, "learning_rate": 5.5281929347826075e-06, "loss": 0.8906, "step": 12329 }, { "epoch": 1.930181590482154, "grad_norm": 4.978224277496338, "learning_rate": 5.5163043478260865e-06, "loss": 1.3315, "step": 12330 }, { "epoch": 1.9303381340012522, "grad_norm": 2.0012240409851074, "learning_rate": 5.504415760869565e-06, "loss": 0.609, 
"step": 12331 }, { "epoch": 1.9304946775203506, "grad_norm": 4.962413787841797, "learning_rate": 5.492527173913043e-06, "loss": 1.297, "step": 12332 }, { "epoch": 1.930651221039449, "grad_norm": 4.97286319732666, "learning_rate": 5.480638586956522e-06, "loss": 1.6308, "step": 12333 }, { "epoch": 1.9308077645585473, "grad_norm": 3.0609452724456787, "learning_rate": 5.468749999999999e-06, "loss": 0.7743, "step": 12334 }, { "epoch": 1.9309643080776455, "grad_norm": 1.812564492225647, "learning_rate": 5.456861413043477e-06, "loss": 0.954, "step": 12335 }, { "epoch": 1.9311208515967437, "grad_norm": 1.797513484954834, "learning_rate": 5.444972826086956e-06, "loss": 0.7786, "step": 12336 }, { "epoch": 1.9312773951158422, "grad_norm": 3.5056521892547607, "learning_rate": 5.433084239130434e-06, "loss": 0.6142, "step": 12337 }, { "epoch": 1.9314339386349406, "grad_norm": 2.938033103942871, "learning_rate": 5.4211956521739126e-06, "loss": 1.1889, "step": 12338 }, { "epoch": 1.9315904821540388, "grad_norm": 4.712599754333496, "learning_rate": 5.4093070652173916e-06, "loss": 2.8541, "step": 12339 }, { "epoch": 1.931747025673137, "grad_norm": 1.3960394859313965, "learning_rate": 5.397418478260869e-06, "loss": 2.5503, "step": 12340 }, { "epoch": 1.9319035691922355, "grad_norm": 1.5266926288604736, "learning_rate": 5.385529891304347e-06, "loss": 2.6479, "step": 12341 }, { "epoch": 1.9320601127113337, "grad_norm": 1.3088434934616089, "learning_rate": 5.373641304347825e-06, "loss": 2.5129, "step": 12342 }, { "epoch": 1.9322166562304322, "grad_norm": 2.471798896789551, "learning_rate": 5.361752717391304e-06, "loss": 2.4529, "step": 12343 }, { "epoch": 1.9323731997495304, "grad_norm": 1.3019988536834717, "learning_rate": 5.349864130434782e-06, "loss": 2.3822, "step": 12344 }, { "epoch": 1.9325297432686286, "grad_norm": 1.5172632932662964, "learning_rate": 5.33797554347826e-06, "loss": 2.5029, "step": 12345 }, { "epoch": 1.932686286787727, "grad_norm": 1.4683221578598022, 
"learning_rate": 5.326086956521739e-06, "loss": 2.3825, "step": 12346 }, { "epoch": 1.9328428303068252, "grad_norm": 1.4488341808319092, "learning_rate": 5.314198369565217e-06, "loss": 2.4549, "step": 12347 }, { "epoch": 1.9329993738259237, "grad_norm": 1.688308835029602, "learning_rate": 5.302309782608695e-06, "loss": 2.4537, "step": 12348 }, { "epoch": 1.933155917345022, "grad_norm": 1.9888386726379395, "learning_rate": 5.290421195652174e-06, "loss": 2.2408, "step": 12349 }, { "epoch": 1.9333124608641201, "grad_norm": 2.3324835300445557, "learning_rate": 5.278532608695652e-06, "loss": 2.4971, "step": 12350 }, { "epoch": 1.9334690043832186, "grad_norm": 1.6445688009262085, "learning_rate": 5.2666440217391294e-06, "loss": 2.3044, "step": 12351 }, { "epoch": 1.933625547902317, "grad_norm": 7.81101131439209, "learning_rate": 5.254755434782608e-06, "loss": 2.5671, "step": 12352 }, { "epoch": 1.9337820914214152, "grad_norm": 2.0722508430480957, "learning_rate": 5.242866847826087e-06, "loss": 2.3613, "step": 12353 }, { "epoch": 1.9339386349405134, "grad_norm": 3.292931318283081, "learning_rate": 5.230978260869565e-06, "loss": 2.2068, "step": 12354 }, { "epoch": 1.9340951784596117, "grad_norm": 3.567390203475952, "learning_rate": 5.219089673913043e-06, "loss": 2.3304, "step": 12355 }, { "epoch": 1.93425172197871, "grad_norm": 1.768709659576416, "learning_rate": 5.207201086956522e-06, "loss": 2.195, "step": 12356 }, { "epoch": 1.9344082654978085, "grad_norm": 3.126570224761963, "learning_rate": 5.195312499999999e-06, "loss": 2.1346, "step": 12357 }, { "epoch": 1.9345648090169068, "grad_norm": 1.7957587242126465, "learning_rate": 5.183423913043477e-06, "loss": 2.2154, "step": 12358 }, { "epoch": 1.934721352536005, "grad_norm": 1.4241621494293213, "learning_rate": 5.171535326086956e-06, "loss": 2.0535, "step": 12359 }, { "epoch": 1.9348778960551032, "grad_norm": 2.418287515640259, "learning_rate": 5.1596467391304345e-06, "loss": 2.2088, "step": 12360 }, { "epoch": 
1.9350344395742016, "grad_norm": 3.2094061374664307, "learning_rate": 5.147758152173913e-06, "loss": 2.3428, "step": 12361 }, { "epoch": 1.9351909830933, "grad_norm": 1.6744602918624878, "learning_rate": 5.135869565217392e-06, "loss": 2.1703, "step": 12362 }, { "epoch": 1.9353475266123983, "grad_norm": 2.290200710296631, "learning_rate": 5.123980978260869e-06, "loss": 1.6555, "step": 12363 }, { "epoch": 1.9355040701314965, "grad_norm": 2.1350643634796143, "learning_rate": 5.112092391304347e-06, "loss": 2.0782, "step": 12364 }, { "epoch": 1.9356606136505947, "grad_norm": 4.533969879150391, "learning_rate": 5.100203804347825e-06, "loss": 1.9981, "step": 12365 }, { "epoch": 1.9358171571696932, "grad_norm": 5.207503795623779, "learning_rate": 5.088315217391304e-06, "loss": 2.3599, "step": 12366 }, { "epoch": 1.9359737006887916, "grad_norm": 1.60601007938385, "learning_rate": 5.0764266304347825e-06, "loss": 1.8609, "step": 12367 }, { "epoch": 1.9361302442078898, "grad_norm": 4.013352394104004, "learning_rate": 5.06453804347826e-06, "loss": 1.6509, "step": 12368 }, { "epoch": 1.936286787726988, "grad_norm": 2.7062578201293945, "learning_rate": 5.052649456521739e-06, "loss": 1.698, "step": 12369 }, { "epoch": 1.9364433312460863, "grad_norm": 7.853254795074463, "learning_rate": 5.040760869565217e-06, "loss": 1.8685, "step": 12370 }, { "epoch": 1.9365998747651847, "grad_norm": 2.6288321018218994, "learning_rate": 5.028872282608695e-06, "loss": 1.4133, "step": 12371 }, { "epoch": 1.9367564182842831, "grad_norm": 5.754298686981201, "learning_rate": 5.016983695652174e-06, "loss": 1.2637, "step": 12372 }, { "epoch": 1.9369129618033814, "grad_norm": 7.580206871032715, "learning_rate": 5.005095108695652e-06, "loss": 1.5466, "step": 12373 }, { "epoch": 1.9370695053224796, "grad_norm": 5.496615409851074, "learning_rate": 4.9932065217391296e-06, "loss": 1.7559, "step": 12374 }, { "epoch": 1.937226048841578, "grad_norm": 5.002652168273926, "learning_rate": 4.981317934782608e-06, 
"loss": 1.4934, "step": 12375 }, { "epoch": 1.9373825923606762, "grad_norm": 9.90695858001709, "learning_rate": 4.969429347826087e-06, "loss": 1.6505, "step": 12376 }, { "epoch": 1.9375391358797747, "grad_norm": 5.895081996917725, "learning_rate": 4.957540760869565e-06, "loss": 1.2809, "step": 12377 }, { "epoch": 1.9376956793988729, "grad_norm": 3.7646005153656006, "learning_rate": 4.945652173913043e-06, "loss": 1.2877, "step": 12378 }, { "epoch": 1.937852222917971, "grad_norm": 3.6830103397369385, "learning_rate": 4.933763586956521e-06, "loss": 1.2802, "step": 12379 }, { "epoch": 1.9380087664370695, "grad_norm": 4.8639817237854, "learning_rate": 4.921874999999999e-06, "loss": 0.951, "step": 12380 }, { "epoch": 1.938165309956168, "grad_norm": 6.020194053649902, "learning_rate": 4.9099864130434775e-06, "loss": 0.8985, "step": 12381 }, { "epoch": 1.9383218534752662, "grad_norm": 2.353076934814453, "learning_rate": 4.8980978260869565e-06, "loss": 1.1693, "step": 12382 }, { "epoch": 1.9384783969943644, "grad_norm": 3.1395928859710693, "learning_rate": 4.886209239130435e-06, "loss": 1.266, "step": 12383 }, { "epoch": 1.9386349405134626, "grad_norm": 4.625245094299316, "learning_rate": 4.874320652173913e-06, "loss": 1.0958, "step": 12384 }, { "epoch": 1.938791484032561, "grad_norm": 2.4252564907073975, "learning_rate": 4.862432065217391e-06, "loss": 0.8905, "step": 12385 }, { "epoch": 1.9389480275516595, "grad_norm": 3.750396251678467, "learning_rate": 4.850543478260869e-06, "loss": 0.7741, "step": 12386 }, { "epoch": 1.9391045710707577, "grad_norm": 2.8295822143554688, "learning_rate": 4.838654891304347e-06, "loss": 0.4392, "step": 12387 }, { "epoch": 1.939261114589856, "grad_norm": 2.692901849746704, "learning_rate": 4.8267663043478255e-06, "loss": 1.2113, "step": 12388 }, { "epoch": 1.9394176581089542, "grad_norm": 1.8979294300079346, "learning_rate": 4.8148777173913045e-06, "loss": 2.8098, "step": 12389 }, { "epoch": 1.9395742016280526, "grad_norm": 
1.8860416412353516, "learning_rate": 4.802989130434782e-06, "loss": 2.9271, "step": 12390 }, { "epoch": 1.939730745147151, "grad_norm": 1.8483679294586182, "learning_rate": 4.79110054347826e-06, "loss": 2.7849, "step": 12391 }, { "epoch": 1.9398872886662493, "grad_norm": 1.7686866521835327, "learning_rate": 4.779211956521739e-06, "loss": 2.8646, "step": 12392 }, { "epoch": 1.9400438321853475, "grad_norm": 1.8225914239883423, "learning_rate": 4.767323369565217e-06, "loss": 2.7714, "step": 12393 }, { "epoch": 1.9402003757044457, "grad_norm": 1.8646214008331299, "learning_rate": 4.755434782608695e-06, "loss": 2.7952, "step": 12394 }, { "epoch": 1.9403569192235441, "grad_norm": 1.984923005104065, "learning_rate": 4.743546195652174e-06, "loss": 2.8671, "step": 12395 }, { "epoch": 1.9405134627426426, "grad_norm": 1.9221899509429932, "learning_rate": 4.7316576086956515e-06, "loss": 2.8426, "step": 12396 }, { "epoch": 1.9406700062617408, "grad_norm": 2.8842101097106934, "learning_rate": 4.71976902173913e-06, "loss": 2.8538, "step": 12397 }, { "epoch": 1.940826549780839, "grad_norm": 1.9751620292663574, "learning_rate": 4.707880434782608e-06, "loss": 2.8075, "step": 12398 }, { "epoch": 1.9409830932999372, "grad_norm": 1.9051334857940674, "learning_rate": 4.695991847826087e-06, "loss": 2.7145, "step": 12399 }, { "epoch": 1.9411396368190357, "grad_norm": 2.5307767391204834, "learning_rate": 4.684103260869565e-06, "loss": 2.7203, "step": 12400 }, { "epoch": 1.9412961803381341, "grad_norm": 1.9505846500396729, "learning_rate": 4.672214673913042e-06, "loss": 2.7193, "step": 12401 }, { "epoch": 1.9414527238572323, "grad_norm": 1.9905164241790771, "learning_rate": 4.660326086956521e-06, "loss": 2.7066, "step": 12402 }, { "epoch": 1.9416092673763305, "grad_norm": 2.1677892208099365, "learning_rate": 4.6484374999999995e-06, "loss": 2.6154, "step": 12403 }, { "epoch": 1.9417658108954288, "grad_norm": 1.947159767150879, "learning_rate": 4.636548913043478e-06, "loss": 2.5567, "step": 
12404 }, { "epoch": 1.9419223544145272, "grad_norm": 1.6114706993103027, "learning_rate": 4.624660326086957e-06, "loss": 2.4335, "step": 12405 }, { "epoch": 1.9420788979336256, "grad_norm": 2.237694501876831, "learning_rate": 4.612771739130435e-06, "loss": 2.5405, "step": 12406 }, { "epoch": 1.9422354414527239, "grad_norm": 2.5481152534484863, "learning_rate": 4.600883152173912e-06, "loss": 2.2862, "step": 12407 }, { "epoch": 1.942391984971822, "grad_norm": 3.030834674835205, "learning_rate": 4.588994565217391e-06, "loss": 2.0471, "step": 12408 }, { "epoch": 1.9425485284909205, "grad_norm": 2.476665735244751, "learning_rate": 4.577105978260869e-06, "loss": 2.2878, "step": 12409 }, { "epoch": 1.9427050720100187, "grad_norm": 2.044889450073242, "learning_rate": 4.5652173913043474e-06, "loss": 2.3835, "step": 12410 }, { "epoch": 1.9428616155291172, "grad_norm": 2.145465135574341, "learning_rate": 4.553328804347826e-06, "loss": 2.3227, "step": 12411 }, { "epoch": 1.9430181590482154, "grad_norm": 2.6731112003326416, "learning_rate": 4.541440217391305e-06, "loss": 2.3893, "step": 12412 }, { "epoch": 1.9431747025673136, "grad_norm": 4.429732322692871, "learning_rate": 4.529551630434782e-06, "loss": 2.2044, "step": 12413 }, { "epoch": 1.943331246086412, "grad_norm": 3.777782678604126, "learning_rate": 4.51766304347826e-06, "loss": 1.9723, "step": 12414 }, { "epoch": 1.9434877896055105, "grad_norm": 3.1434690952301025, "learning_rate": 4.505774456521739e-06, "loss": 2.1492, "step": 12415 }, { "epoch": 1.9436443331246087, "grad_norm": 2.0443484783172607, "learning_rate": 4.493885869565217e-06, "loss": 1.7714, "step": 12416 }, { "epoch": 1.943800876643707, "grad_norm": 16.892101287841797, "learning_rate": 4.481997282608695e-06, "loss": 2.0376, "step": 12417 }, { "epoch": 1.9439574201628051, "grad_norm": 7.687930107116699, "learning_rate": 4.4701086956521735e-06, "loss": 1.4856, "step": 12418 }, { "epoch": 1.9441139636819036, "grad_norm": 2.0008230209350586, "learning_rate": 
4.458220108695652e-06, "loss": 1.5662, "step": 12419 }, { "epoch": 1.944270507201002, "grad_norm": 4.207871913909912, "learning_rate": 4.44633152173913e-06, "loss": 1.701, "step": 12420 }, { "epoch": 1.9444270507201002, "grad_norm": 7.485208511352539, "learning_rate": 4.434442934782608e-06, "loss": 1.4407, "step": 12421 }, { "epoch": 1.9445835942391985, "grad_norm": 3.129462242126465, "learning_rate": 4.422554347826087e-06, "loss": 1.3926, "step": 12422 }, { "epoch": 1.9447401377582967, "grad_norm": 3.452791452407837, "learning_rate": 4.410665760869565e-06, "loss": 1.3933, "step": 12423 }, { "epoch": 1.9448966812773951, "grad_norm": 4.929559707641602, "learning_rate": 4.3987771739130425e-06, "loss": 1.2219, "step": 12424 }, { "epoch": 1.9450532247964936, "grad_norm": 5.192774772644043, "learning_rate": 4.3868885869565215e-06, "loss": 1.7379, "step": 12425 }, { "epoch": 1.9452097683155918, "grad_norm": 3.882925510406494, "learning_rate": 4.375e-06, "loss": 1.7853, "step": 12426 }, { "epoch": 1.94536631183469, "grad_norm": 5.765262126922607, "learning_rate": 4.363111413043478e-06, "loss": 1.2253, "step": 12427 }, { "epoch": 1.9455228553537882, "grad_norm": 7.608963966369629, "learning_rate": 4.351222826086956e-06, "loss": 1.4012, "step": 12428 }, { "epoch": 1.9456793988728867, "grad_norm": 5.276591777801514, "learning_rate": 4.339334239130434e-06, "loss": 1.6758, "step": 12429 }, { "epoch": 1.945835942391985, "grad_norm": 2.5360803604125977, "learning_rate": 4.327445652173912e-06, "loss": 1.0818, "step": 12430 }, { "epoch": 1.9459924859110833, "grad_norm": 3.5321333408355713, "learning_rate": 4.315557065217391e-06, "loss": 1.5546, "step": 12431 }, { "epoch": 1.9461490294301815, "grad_norm": 4.748870372772217, "learning_rate": 4.303668478260869e-06, "loss": 0.9575, "step": 12432 }, { "epoch": 1.9463055729492797, "grad_norm": 5.916277885437012, "learning_rate": 4.2917798913043476e-06, "loss": 1.0332, "step": 12433 }, { "epoch": 1.9464621164683782, "grad_norm": 
3.6434125900268555, "learning_rate": 4.279891304347826e-06, "loss": 0.985, "step": 12434 }, { "epoch": 1.9466186599874766, "grad_norm": 2.884605884552002, "learning_rate": 4.268002717391304e-06, "loss": 0.825, "step": 12435 }, { "epoch": 1.9467752035065748, "grad_norm": 4.030575275421143, "learning_rate": 4.256114130434782e-06, "loss": 0.9684, "step": 12436 }, { "epoch": 1.946931747025673, "grad_norm": 6.394804954528809, "learning_rate": 4.244225543478261e-06, "loss": 0.9487, "step": 12437 }, { "epoch": 1.9470882905447713, "grad_norm": 3.8814713954925537, "learning_rate": 4.232336956521738e-06, "loss": 0.9712, "step": 12438 }, { "epoch": 1.9472448340638697, "grad_norm": 1.8790318965911865, "learning_rate": 4.220448369565217e-06, "loss": 2.8691, "step": 12439 }, { "epoch": 1.9474013775829682, "grad_norm": 1.920149326324463, "learning_rate": 4.2085597826086955e-06, "loss": 2.8091, "step": 12440 }, { "epoch": 1.9475579211020664, "grad_norm": 1.6957205533981323, "learning_rate": 4.196671195652174e-06, "loss": 2.767, "step": 12441 }, { "epoch": 1.9477144646211646, "grad_norm": 1.6975927352905273, "learning_rate": 4.184782608695652e-06, "loss": 2.7267, "step": 12442 }, { "epoch": 1.947871008140263, "grad_norm": 2.0913684368133545, "learning_rate": 4.17289402173913e-06, "loss": 2.9986, "step": 12443 }, { "epoch": 1.9480275516593613, "grad_norm": 2.0443544387817383, "learning_rate": 4.161005434782608e-06, "loss": 2.8989, "step": 12444 }, { "epoch": 1.9481840951784597, "grad_norm": 1.8067764043807983, "learning_rate": 4.149116847826087e-06, "loss": 2.8346, "step": 12445 }, { "epoch": 1.948340638697558, "grad_norm": 1.9730005264282227, "learning_rate": 4.1372282608695644e-06, "loss": 2.8153, "step": 12446 }, { "epoch": 1.9484971822166561, "grad_norm": 1.904576063156128, "learning_rate": 4.1253396739130434e-06, "loss": 2.8218, "step": 12447 }, { "epoch": 1.9486537257357546, "grad_norm": 1.75669264793396, "learning_rate": 4.113451086956522e-06, "loss": 2.6778, "step": 12448 }, 
{ "epoch": 1.948810269254853, "grad_norm": 1.8685898780822754, "learning_rate": 4.1015625e-06, "loss": 2.7076, "step": 12449 }, { "epoch": 1.9489668127739512, "grad_norm": 1.6911829710006714, "learning_rate": 4.089673913043478e-06, "loss": 2.4599, "step": 12450 }, { "epoch": 1.9491233562930494, "grad_norm": 1.463472843170166, "learning_rate": 4.077785326086956e-06, "loss": 2.5366, "step": 12451 }, { "epoch": 1.9492798998121477, "grad_norm": 2.7629687786102295, "learning_rate": 4.065896739130434e-06, "loss": 2.6595, "step": 12452 }, { "epoch": 1.949436443331246, "grad_norm": 2.088066816329956, "learning_rate": 4.054008152173912e-06, "loss": 2.7528, "step": 12453 }, { "epoch": 1.9495929868503445, "grad_norm": 1.545872449874878, "learning_rate": 4.0421195652173905e-06, "loss": 2.4584, "step": 12454 }, { "epoch": 1.9497495303694428, "grad_norm": 2.3832781314849854, "learning_rate": 4.0302309782608695e-06, "loss": 2.5954, "step": 12455 }, { "epoch": 1.949906073888541, "grad_norm": 2.32985782623291, "learning_rate": 4.018342391304348e-06, "loss": 2.3313, "step": 12456 }, { "epoch": 1.9500626174076392, "grad_norm": 3.226716995239258, "learning_rate": 4.006453804347826e-06, "loss": 2.4336, "step": 12457 }, { "epoch": 1.9502191609267376, "grad_norm": 1.9975454807281494, "learning_rate": 3.994565217391304e-06, "loss": 2.4587, "step": 12458 }, { "epoch": 1.950375704445836, "grad_norm": 1.955398678779602, "learning_rate": 3.982676630434782e-06, "loss": 2.2821, "step": 12459 }, { "epoch": 1.9505322479649343, "grad_norm": 3.2975945472717285, "learning_rate": 3.97078804347826e-06, "loss": 2.1856, "step": 12460 }, { "epoch": 1.9506887914840325, "grad_norm": 4.294447898864746, "learning_rate": 3.9588994565217385e-06, "loss": 2.2226, "step": 12461 }, { "epoch": 1.9508453350031307, "grad_norm": 7.3046650886535645, "learning_rate": 3.9470108695652175e-06, "loss": 2.2093, "step": 12462 }, { "epoch": 1.9510018785222292, "grad_norm": 2.883687973022461, "learning_rate": 
3.935122282608696e-06, "loss": 2.112, "step": 12463 }, { "epoch": 1.9511584220413276, "grad_norm": 3.2260773181915283, "learning_rate": 3.923233695652174e-06, "loss": 1.8717, "step": 12464 }, { "epoch": 1.9513149655604258, "grad_norm": 4.389449119567871, "learning_rate": 3.911345108695652e-06, "loss": 1.796, "step": 12465 }, { "epoch": 1.951471509079524, "grad_norm": 2.0444657802581787, "learning_rate": 3.89945652173913e-06, "loss": 1.9771, "step": 12466 }, { "epoch": 1.9516280525986223, "grad_norm": 2.365347385406494, "learning_rate": 3.887567934782608e-06, "loss": 1.3974, "step": 12467 }, { "epoch": 1.9517845961177207, "grad_norm": 3.2177369594573975, "learning_rate": 3.875679347826086e-06, "loss": 1.9022, "step": 12468 }, { "epoch": 1.9519411396368191, "grad_norm": 6.028573989868164, "learning_rate": 3.8637907608695646e-06, "loss": 1.8808, "step": 12469 }, { "epoch": 1.9520976831559174, "grad_norm": 3.925595760345459, "learning_rate": 3.8519021739130436e-06, "loss": 1.7456, "step": 12470 }, { "epoch": 1.9522542266750156, "grad_norm": 3.29742169380188, "learning_rate": 3.840013586956521e-06, "loss": 1.709, "step": 12471 }, { "epoch": 1.9524107701941138, "grad_norm": 2.72920298576355, "learning_rate": 3.828125e-06, "loss": 1.1373, "step": 12472 }, { "epoch": 1.9525673137132122, "grad_norm": 4.040767192840576, "learning_rate": 3.816236413043478e-06, "loss": 1.7889, "step": 12473 }, { "epoch": 1.9527238572323107, "grad_norm": 5.549520492553711, "learning_rate": 3.804347826086956e-06, "loss": 1.555, "step": 12474 }, { "epoch": 1.952880400751409, "grad_norm": 7.571475505828857, "learning_rate": 3.7924592391304343e-06, "loss": 2.0267, "step": 12475 }, { "epoch": 1.9530369442705071, "grad_norm": 4.214211940765381, "learning_rate": 3.7805706521739125e-06, "loss": 1.0803, "step": 12476 }, { "epoch": 1.9531934877896056, "grad_norm": 5.813425064086914, "learning_rate": 3.768682065217391e-06, "loss": 1.3813, "step": 12477 }, { "epoch": 1.9533500313087038, "grad_norm": 
9.466153144836426, "learning_rate": 3.7567934782608692e-06, "loss": 1.0217, "step": 12478 }, { "epoch": 1.9535065748278022, "grad_norm": 2.5356595516204834, "learning_rate": 3.7449048913043474e-06, "loss": 1.14, "step": 12479 }, { "epoch": 1.9536631183469004, "grad_norm": 2.369577169418335, "learning_rate": 3.733016304347826e-06, "loss": 0.6451, "step": 12480 }, { "epoch": 1.9538196618659986, "grad_norm": 4.847519874572754, "learning_rate": 3.7211277173913037e-06, "loss": 1.0832, "step": 12481 }, { "epoch": 1.953976205385097, "grad_norm": 1.9471659660339355, "learning_rate": 3.7092391304347823e-06, "loss": 0.9567, "step": 12482 }, { "epoch": 1.9541327489041955, "grad_norm": 5.0003132820129395, "learning_rate": 3.697350543478261e-06, "loss": 1.4662, "step": 12483 }, { "epoch": 1.9542892924232937, "grad_norm": 2.1182074546813965, "learning_rate": 3.6854619565217386e-06, "loss": 1.0078, "step": 12484 }, { "epoch": 1.954445835942392, "grad_norm": 5.9696197509765625, "learning_rate": 3.673573369565217e-06, "loss": 1.2733, "step": 12485 }, { "epoch": 1.9546023794614902, "grad_norm": 3.5079524517059326, "learning_rate": 3.6616847826086958e-06, "loss": 0.6099, "step": 12486 }, { "epoch": 1.9547589229805886, "grad_norm": 3.484922409057617, "learning_rate": 3.6497961956521735e-06, "loss": 1.0887, "step": 12487 }, { "epoch": 1.954915466499687, "grad_norm": 2.0859992504119873, "learning_rate": 3.637907608695652e-06, "loss": 0.4023, "step": 12488 }, { "epoch": 1.9550720100187853, "grad_norm": 1.6195721626281738, "learning_rate": 3.62601902173913e-06, "loss": 2.8524, "step": 12489 }, { "epoch": 1.9552285535378835, "grad_norm": 1.6305210590362549, "learning_rate": 3.6141304347826084e-06, "loss": 2.8024, "step": 12490 }, { "epoch": 1.9553850970569817, "grad_norm": 2.186833620071411, "learning_rate": 3.602241847826087e-06, "loss": 2.6019, "step": 12491 }, { "epoch": 1.9555416405760802, "grad_norm": 1.7845596075057983, "learning_rate": 3.5903532608695647e-06, "loss": 2.8658, "step": 
12492 }, { "epoch": 1.9556981840951786, "grad_norm": 1.6202934980392456, "learning_rate": 3.5784646739130433e-06, "loss": 2.7818, "step": 12493 }, { "epoch": 1.9558547276142768, "grad_norm": 1.819537878036499, "learning_rate": 3.5665760869565214e-06, "loss": 2.8513, "step": 12494 }, { "epoch": 1.956011271133375, "grad_norm": 1.797844648361206, "learning_rate": 3.5546874999999996e-06, "loss": 2.7691, "step": 12495 }, { "epoch": 1.9561678146524732, "grad_norm": 1.865004539489746, "learning_rate": 3.542798913043478e-06, "loss": 2.787, "step": 12496 }, { "epoch": 1.9563243581715717, "grad_norm": 2.0269100666046143, "learning_rate": 3.5309103260869563e-06, "loss": 2.9069, "step": 12497 }, { "epoch": 1.9564809016906701, "grad_norm": 1.7016339302062988, "learning_rate": 3.5190217391304345e-06, "loss": 2.701, "step": 12498 }, { "epoch": 1.9566374452097683, "grad_norm": 1.845093846321106, "learning_rate": 3.5071331521739126e-06, "loss": 2.7202, "step": 12499 }, { "epoch": 1.9567939887288666, "grad_norm": 1.753563642501831, "learning_rate": 3.4952445652173908e-06, "loss": 2.6582, "step": 12500 }, { "epoch": 1.9569505322479648, "grad_norm": 2.710792303085327, "learning_rate": 3.4833559782608694e-06, "loss": 2.6777, "step": 12501 }, { "epoch": 1.9571070757670632, "grad_norm": 1.3205358982086182, "learning_rate": 3.4714673913043475e-06, "loss": 2.3466, "step": 12502 }, { "epoch": 1.9572636192861617, "grad_norm": 2.837921619415283, "learning_rate": 3.4595788043478257e-06, "loss": 2.5372, "step": 12503 }, { "epoch": 1.9574201628052599, "grad_norm": 2.6978132724761963, "learning_rate": 3.447690217391304e-06, "loss": 2.4738, "step": 12504 }, { "epoch": 1.957576706324358, "grad_norm": 1.4499021768569946, "learning_rate": 3.4358016304347824e-06, "loss": 2.3675, "step": 12505 }, { "epoch": 1.9577332498434565, "grad_norm": 2.3532581329345703, "learning_rate": 3.4239130434782606e-06, "loss": 2.6966, "step": 12506 }, { "epoch": 1.9578897933625548, "grad_norm": 2.144831418991089, 
"learning_rate": 3.4120244565217387e-06, "loss": 2.351, "step": 12507 }, { "epoch": 1.9580463368816532, "grad_norm": 2.2382521629333496, "learning_rate": 3.4001358695652173e-06, "loss": 2.3903, "step": 12508 }, { "epoch": 1.9582028804007514, "grad_norm": 6.0221848487854, "learning_rate": 3.3882472826086955e-06, "loss": 2.181, "step": 12509 }, { "epoch": 1.9583594239198496, "grad_norm": 4.554486274719238, "learning_rate": 3.3763586956521736e-06, "loss": 2.2384, "step": 12510 }, { "epoch": 1.958515967438948, "grad_norm": 2.2452359199523926, "learning_rate": 3.364470108695652e-06, "loss": 2.2839, "step": 12511 }, { "epoch": 1.9586725109580463, "grad_norm": 6.284882068634033, "learning_rate": 3.35258152173913e-06, "loss": 2.555, "step": 12512 }, { "epoch": 1.9588290544771447, "grad_norm": 6.036424160003662, "learning_rate": 3.3406929347826085e-06, "loss": 2.2642, "step": 12513 }, { "epoch": 1.958985597996243, "grad_norm": 2.562342405319214, "learning_rate": 3.3288043478260867e-06, "loss": 1.9869, "step": 12514 }, { "epoch": 1.9591421415153412, "grad_norm": 3.0527312755584717, "learning_rate": 3.316915760869565e-06, "loss": 2.1529, "step": 12515 }, { "epoch": 1.9592986850344396, "grad_norm": 4.1386518478393555, "learning_rate": 3.3050271739130434e-06, "loss": 1.9517, "step": 12516 }, { "epoch": 1.959455228553538, "grad_norm": 2.1945528984069824, "learning_rate": 3.293138586956521e-06, "loss": 1.8863, "step": 12517 }, { "epoch": 1.9596117720726363, "grad_norm": 3.9312920570373535, "learning_rate": 3.2812499999999997e-06, "loss": 1.2921, "step": 12518 }, { "epoch": 1.9597683155917345, "grad_norm": 4.009547710418701, "learning_rate": 3.2693614130434783e-06, "loss": 2.0769, "step": 12519 }, { "epoch": 1.9599248591108327, "grad_norm": 2.622843027114868, "learning_rate": 3.257472826086956e-06, "loss": 1.8681, "step": 12520 }, { "epoch": 1.9600814026299311, "grad_norm": 3.9732813835144043, "learning_rate": 3.2455842391304346e-06, "loss": 1.4681, "step": 12521 }, { "epoch": 
1.9602379461490296, "grad_norm": 6.117884635925293, "learning_rate": 3.2336956521739128e-06, "loss": 1.7161, "step": 12522 }, { "epoch": 1.9603944896681278, "grad_norm": 2.977898597717285, "learning_rate": 3.221807065217391e-06, "loss": 1.3862, "step": 12523 }, { "epoch": 1.960551033187226, "grad_norm": 3.037281036376953, "learning_rate": 3.2099184782608695e-06, "loss": 1.575, "step": 12524 }, { "epoch": 1.9607075767063242, "grad_norm": 10.151965141296387, "learning_rate": 3.1980298913043472e-06, "loss": 1.4734, "step": 12525 }, { "epoch": 1.9608641202254227, "grad_norm": 6.2074785232543945, "learning_rate": 3.186141304347826e-06, "loss": 1.5085, "step": 12526 }, { "epoch": 1.961020663744521, "grad_norm": 3.8054940700531006, "learning_rate": 3.174252717391304e-06, "loss": 1.131, "step": 12527 }, { "epoch": 1.9611772072636193, "grad_norm": 3.748661756515503, "learning_rate": 3.162364130434782e-06, "loss": 1.6626, "step": 12528 }, { "epoch": 1.9613337507827175, "grad_norm": 2.7386834621429443, "learning_rate": 3.1504755434782607e-06, "loss": 1.3107, "step": 12529 }, { "epoch": 1.9614902943018158, "grad_norm": 3.704934597015381, "learning_rate": 3.138586956521739e-06, "loss": 1.0096, "step": 12530 }, { "epoch": 1.9616468378209142, "grad_norm": 6.827402114868164, "learning_rate": 3.126698369565217e-06, "loss": 0.898, "step": 12531 }, { "epoch": 1.9618033813400126, "grad_norm": 7.628554821014404, "learning_rate": 3.1148097826086956e-06, "loss": 1.4949, "step": 12532 }, { "epoch": 1.9619599248591109, "grad_norm": 2.761687755584717, "learning_rate": 3.1029211956521737e-06, "loss": 0.9489, "step": 12533 }, { "epoch": 1.962116468378209, "grad_norm": 3.5451877117156982, "learning_rate": 3.091032608695652e-06, "loss": 1.3161, "step": 12534 }, { "epoch": 1.9622730118973073, "grad_norm": 5.524079322814941, "learning_rate": 3.07914402173913e-06, "loss": 0.7463, "step": 12535 }, { "epoch": 1.9624295554164057, "grad_norm": 3.306706190109253, "learning_rate": 
3.0672554347826086e-06, "loss": 0.8058, "step": 12536 }, { "epoch": 1.9625860989355042, "grad_norm": 3.1859827041625977, "learning_rate": 3.055366847826087e-06, "loss": 0.8385, "step": 12537 }, { "epoch": 1.9627426424546024, "grad_norm": 4.237364292144775, "learning_rate": 3.043478260869565e-06, "loss": 0.7363, "step": 12538 }, { "epoch": 1.9628991859737006, "grad_norm": 1.9224369525909424, "learning_rate": 3.031589673913043e-06, "loss": 2.9373, "step": 12539 }, { "epoch": 1.963055729492799, "grad_norm": 1.8885226249694824, "learning_rate": 3.0197010869565213e-06, "loss": 2.9539, "step": 12540 }, { "epoch": 1.9632122730118973, "grad_norm": 1.647106647491455, "learning_rate": 3.0078125e-06, "loss": 2.8527, "step": 12541 }, { "epoch": 1.9633688165309957, "grad_norm": 1.675878882408142, "learning_rate": 2.995923913043478e-06, "loss": 2.8027, "step": 12542 }, { "epoch": 1.963525360050094, "grad_norm": 1.9008088111877441, "learning_rate": 2.984035326086956e-06, "loss": 2.9663, "step": 12543 }, { "epoch": 1.9636819035691921, "grad_norm": 2.2622673511505127, "learning_rate": 2.9721467391304347e-06, "loss": 2.8424, "step": 12544 }, { "epoch": 1.9638384470882906, "grad_norm": 1.869354248046875, "learning_rate": 2.9602581521739125e-06, "loss": 2.9213, "step": 12545 }, { "epoch": 1.9639949906073888, "grad_norm": 1.7473738193511963, "learning_rate": 2.948369565217391e-06, "loss": 2.7848, "step": 12546 }, { "epoch": 1.9641515341264872, "grad_norm": 1.7721922397613525, "learning_rate": 2.9364809782608696e-06, "loss": 2.7837, "step": 12547 }, { "epoch": 1.9643080776455855, "grad_norm": 1.9002379179000854, "learning_rate": 2.9245923913043474e-06, "loss": 2.9164, "step": 12548 }, { "epoch": 1.9644646211646837, "grad_norm": 2.0860838890075684, "learning_rate": 2.912703804347826e-06, "loss": 2.9268, "step": 12549 }, { "epoch": 1.9646211646837821, "grad_norm": 1.6751253604888916, "learning_rate": 2.9008152173913037e-06, "loss": 2.6252, "step": 12550 }, { "epoch": 1.9647777082028806, 
"grad_norm": 1.7208343744277954, "learning_rate": 2.8889266304347822e-06, "loss": 2.7125, "step": 12551 }, { "epoch": 1.9649342517219788, "grad_norm": 2.0167455673217773, "learning_rate": 2.877038043478261e-06, "loss": 2.7023, "step": 12552 }, { "epoch": 1.965090795241077, "grad_norm": 2.573782444000244, "learning_rate": 2.8651494565217386e-06, "loss": 2.8856, "step": 12553 }, { "epoch": 1.9652473387601752, "grad_norm": 1.7597516775131226, "learning_rate": 2.853260869565217e-06, "loss": 2.5324, "step": 12554 }, { "epoch": 1.9654038822792737, "grad_norm": 1.5842186212539673, "learning_rate": 2.8413722826086957e-06, "loss": 2.5022, "step": 12555 }, { "epoch": 1.965560425798372, "grad_norm": 1.8162561655044556, "learning_rate": 2.8294836956521735e-06, "loss": 2.6755, "step": 12556 }, { "epoch": 1.9657169693174703, "grad_norm": 2.165081024169922, "learning_rate": 2.817595108695652e-06, "loss": 2.6416, "step": 12557 }, { "epoch": 1.9658735128365685, "grad_norm": 1.9893112182617188, "learning_rate": 2.80570652173913e-06, "loss": 2.4898, "step": 12558 }, { "epoch": 1.9660300563556667, "grad_norm": 3.9044454097747803, "learning_rate": 2.7938179347826083e-06, "loss": 2.5939, "step": 12559 }, { "epoch": 1.9661865998747652, "grad_norm": 2.1523690223693848, "learning_rate": 2.781929347826087e-06, "loss": 2.4638, "step": 12560 }, { "epoch": 1.9663431433938636, "grad_norm": 2.106741428375244, "learning_rate": 2.770040760869565e-06, "loss": 2.163, "step": 12561 }, { "epoch": 1.9664996869129618, "grad_norm": 2.121701240539551, "learning_rate": 2.7581521739130432e-06, "loss": 2.1962, "step": 12562 }, { "epoch": 1.96665623043206, "grad_norm": 5.600260257720947, "learning_rate": 2.7462635869565214e-06, "loss": 2.249, "step": 12563 }, { "epoch": 1.9668127739511583, "grad_norm": 3.9027082920074463, "learning_rate": 2.7343749999999995e-06, "loss": 2.5391, "step": 12564 }, { "epoch": 1.9669693174702567, "grad_norm": 3.6926584243774414, "learning_rate": 2.722486413043478e-06, "loss": 
2.117, "step": 12565 }, { "epoch": 1.9671258609893552, "grad_norm": 5.341497421264648, "learning_rate": 2.7105978260869563e-06, "loss": 1.905, "step": 12566 }, { "epoch": 1.9672824045084534, "grad_norm": 3.4666829109191895, "learning_rate": 2.6987092391304344e-06, "loss": 1.7158, "step": 12567 }, { "epoch": 1.9674389480275516, "grad_norm": 2.707318067550659, "learning_rate": 2.6868206521739126e-06, "loss": 1.965, "step": 12568 }, { "epoch": 1.9675954915466498, "grad_norm": 3.5201432704925537, "learning_rate": 2.674932065217391e-06, "loss": 1.8078, "step": 12569 }, { "epoch": 1.9677520350657483, "grad_norm": 2.740238666534424, "learning_rate": 2.6630434782608693e-06, "loss": 1.3845, "step": 12570 }, { "epoch": 1.9679085785848467, "grad_norm": 3.2163589000701904, "learning_rate": 2.6511548913043475e-06, "loss": 2.1915, "step": 12571 }, { "epoch": 1.968065122103945, "grad_norm": 3.3131916522979736, "learning_rate": 2.639266304347826e-06, "loss": 1.5953, "step": 12572 }, { "epoch": 1.9682216656230431, "grad_norm": 3.1740312576293945, "learning_rate": 2.627377717391304e-06, "loss": 1.4148, "step": 12573 }, { "epoch": 1.9683782091421416, "grad_norm": 8.18712043762207, "learning_rate": 2.6154891304347824e-06, "loss": 1.6743, "step": 12574 }, { "epoch": 1.9685347526612398, "grad_norm": 7.017939567565918, "learning_rate": 2.603600543478261e-06, "loss": 1.7036, "step": 12575 }, { "epoch": 1.9686912961803382, "grad_norm": 2.38163161277771, "learning_rate": 2.5917119565217387e-06, "loss": 1.0293, "step": 12576 }, { "epoch": 1.9688478396994364, "grad_norm": 5.189415454864502, "learning_rate": 2.5798233695652173e-06, "loss": 1.3968, "step": 12577 }, { "epoch": 1.9690043832185347, "grad_norm": 7.644651889801025, "learning_rate": 2.567934782608696e-06, "loss": 1.3081, "step": 12578 }, { "epoch": 1.969160926737633, "grad_norm": 3.597391366958618, "learning_rate": 2.5560461956521736e-06, "loss": 1.1556, "step": 12579 }, { "epoch": 1.9693174702567313, "grad_norm": 6.175591468811035, 
"learning_rate": 2.544157608695652e-06, "loss": 0.9449, "step": 12580 }, { "epoch": 1.9694740137758298, "grad_norm": 6.516781330108643, "learning_rate": 2.53226902173913e-06, "loss": 1.065, "step": 12581 }, { "epoch": 1.969630557294928, "grad_norm": 4.915063858032227, "learning_rate": 2.5203804347826085e-06, "loss": 1.9415, "step": 12582 }, { "epoch": 1.9697871008140262, "grad_norm": 2.983365297317505, "learning_rate": 2.508491847826087e-06, "loss": 1.581, "step": 12583 }, { "epoch": 1.9699436443331246, "grad_norm": 1.6841644048690796, "learning_rate": 2.4966032608695648e-06, "loss": 0.5288, "step": 12584 }, { "epoch": 1.970100187852223, "grad_norm": 4.890353202819824, "learning_rate": 2.4847146739130434e-06, "loss": 0.4826, "step": 12585 }, { "epoch": 1.9702567313713213, "grad_norm": 3.981797456741333, "learning_rate": 2.4728260869565215e-06, "loss": 0.6786, "step": 12586 }, { "epoch": 1.9704132748904195, "grad_norm": 2.512864351272583, "learning_rate": 2.4609374999999997e-06, "loss": 0.8367, "step": 12587 }, { "epoch": 1.9705698184095177, "grad_norm": 2.428521156311035, "learning_rate": 2.4490489130434783e-06, "loss": 0.8695, "step": 12588 }, { "epoch": 1.9707263619286162, "grad_norm": 2.0677542686462402, "learning_rate": 2.4371603260869564e-06, "loss": 2.9238, "step": 12589 }, { "epoch": 1.9708829054477146, "grad_norm": 1.9499543905258179, "learning_rate": 2.4252717391304346e-06, "loss": 2.9893, "step": 12590 }, { "epoch": 1.9710394489668128, "grad_norm": 1.9393608570098877, "learning_rate": 2.4133831521739127e-06, "loss": 2.953, "step": 12591 }, { "epoch": 1.971195992485911, "grad_norm": 1.8744410276412964, "learning_rate": 2.401494565217391e-06, "loss": 2.9504, "step": 12592 }, { "epoch": 1.9713525360050093, "grad_norm": 1.923394799232483, "learning_rate": 2.3896059782608695e-06, "loss": 2.9617, "step": 12593 }, { "epoch": 1.9715090795241077, "grad_norm": 1.7987457513809204, "learning_rate": 2.3777173913043476e-06, "loss": 2.8346, "step": 12594 }, { "epoch": 
1.9716656230432061, "grad_norm": 1.794922947883606, "learning_rate": 2.3658288043478258e-06, "loss": 2.8855, "step": 12595 }, { "epoch": 1.9718221665623044, "grad_norm": 1.8506746292114258, "learning_rate": 2.353940217391304e-06, "loss": 2.8324, "step": 12596 }, { "epoch": 1.9719787100814026, "grad_norm": 1.7109731435775757, "learning_rate": 2.3420516304347825e-06, "loss": 2.7943, "step": 12597 }, { "epoch": 1.9721352536005008, "grad_norm": 1.7426645755767822, "learning_rate": 2.3301630434782607e-06, "loss": 2.7674, "step": 12598 }, { "epoch": 1.9722917971195992, "grad_norm": 1.9973618984222412, "learning_rate": 2.318274456521739e-06, "loss": 2.864, "step": 12599 }, { "epoch": 1.9724483406386977, "grad_norm": 1.8339837789535522, "learning_rate": 2.3063858695652174e-06, "loss": 2.846, "step": 12600 }, { "epoch": 1.972604884157796, "grad_norm": 2.1272807121276855, "learning_rate": 2.2944972826086956e-06, "loss": 2.3486, "step": 12601 }, { "epoch": 1.972761427676894, "grad_norm": 1.7438037395477295, "learning_rate": 2.2826086956521737e-06, "loss": 2.7634, "step": 12602 }, { "epoch": 1.9729179711959923, "grad_norm": 2.1287527084350586, "learning_rate": 2.2707201086956523e-06, "loss": 2.9095, "step": 12603 }, { "epoch": 1.9730745147150908, "grad_norm": 5.5958991050720215, "learning_rate": 2.25883152173913e-06, "loss": 2.7054, "step": 12604 }, { "epoch": 1.9732310582341892, "grad_norm": 2.266753911972046, "learning_rate": 2.2469429347826086e-06, "loss": 2.6056, "step": 12605 }, { "epoch": 1.9733876017532874, "grad_norm": 2.019437551498413, "learning_rate": 2.2350543478260868e-06, "loss": 2.4107, "step": 12606 }, { "epoch": 1.9735441452723856, "grad_norm": 6.256537437438965, "learning_rate": 2.223165760869565e-06, "loss": 2.5405, "step": 12607 }, { "epoch": 1.973700688791484, "grad_norm": 2.934673547744751, "learning_rate": 2.2112771739130435e-06, "loss": 2.5735, "step": 12608 }, { "epoch": 1.9738572323105823, "grad_norm": 5.3308868408203125, "learning_rate": 
2.1993885869565212e-06, "loss": 2.6387, "step": 12609 }, { "epoch": 1.9740137758296807, "grad_norm": 2.655078411102295, "learning_rate": 2.1875e-06, "loss": 2.1825, "step": 12610 }, { "epoch": 1.974170319348779, "grad_norm": 3.3241119384765625, "learning_rate": 2.175611413043478e-06, "loss": 1.8332, "step": 12611 }, { "epoch": 1.9743268628678772, "grad_norm": 2.3137104511260986, "learning_rate": 2.163722826086956e-06, "loss": 2.0611, "step": 12612 }, { "epoch": 1.9744834063869756, "grad_norm": 2.0335922241210938, "learning_rate": 2.1518342391304347e-06, "loss": 1.8548, "step": 12613 }, { "epoch": 1.974639949906074, "grad_norm": 4.50117301940918, "learning_rate": 2.139945652173913e-06, "loss": 1.7926, "step": 12614 }, { "epoch": 1.9747964934251723, "grad_norm": 4.447171688079834, "learning_rate": 2.128057065217391e-06, "loss": 1.8589, "step": 12615 }, { "epoch": 1.9749530369442705, "grad_norm": 3.5517117977142334, "learning_rate": 2.116168478260869e-06, "loss": 1.9137, "step": 12616 }, { "epoch": 1.9751095804633687, "grad_norm": 5.070085048675537, "learning_rate": 2.1042798913043477e-06, "loss": 2.1235, "step": 12617 }, { "epoch": 1.9752661239824671, "grad_norm": 5.218674182891846, "learning_rate": 2.092391304347826e-06, "loss": 1.8035, "step": 12618 }, { "epoch": 1.9754226675015656, "grad_norm": 12.603699684143066, "learning_rate": 2.080502717391304e-06, "loss": 1.4979, "step": 12619 }, { "epoch": 1.9755792110206638, "grad_norm": 4.6436767578125, "learning_rate": 2.0686141304347822e-06, "loss": 1.5784, "step": 12620 }, { "epoch": 1.975735754539762, "grad_norm": 2.947314500808716, "learning_rate": 2.056725543478261e-06, "loss": 1.3299, "step": 12621 }, { "epoch": 1.9758922980588602, "grad_norm": 5.476008415222168, "learning_rate": 2.044836956521739e-06, "loss": 1.2229, "step": 12622 }, { "epoch": 1.9760488415779587, "grad_norm": 18.55363655090332, "learning_rate": 2.032948369565217e-06, "loss": 1.3305, "step": 12623 }, { "epoch": 1.9762053850970571, "grad_norm": 
7.535390853881836, "learning_rate": 2.0210597826086953e-06, "loss": 1.5836, "step": 12624 }, { "epoch": 1.9763619286161553, "grad_norm": 14.191845893859863, "learning_rate": 2.009171195652174e-06, "loss": 2.1434, "step": 12625 }, { "epoch": 1.9765184721352536, "grad_norm": 5.055546760559082, "learning_rate": 1.997282608695652e-06, "loss": 1.3991, "step": 12626 }, { "epoch": 1.9766750156543518, "grad_norm": 5.185608863830566, "learning_rate": 1.98539402173913e-06, "loss": 1.114, "step": 12627 }, { "epoch": 1.9768315591734502, "grad_norm": 5.329787731170654, "learning_rate": 1.9735054347826087e-06, "loss": 1.2084, "step": 12628 }, { "epoch": 1.9769881026925487, "grad_norm": 4.282310962677002, "learning_rate": 1.961616847826087e-06, "loss": 1.6564, "step": 12629 }, { "epoch": 1.9771446462116469, "grad_norm": 4.375661849975586, "learning_rate": 1.949728260869565e-06, "loss": 1.3938, "step": 12630 }, { "epoch": 1.977301189730745, "grad_norm": 2.3116767406463623, "learning_rate": 1.937839673913043e-06, "loss": 0.8843, "step": 12631 }, { "epoch": 1.9774577332498433, "grad_norm": 2.709021806716919, "learning_rate": 1.9259510869565218e-06, "loss": 1.2683, "step": 12632 }, { "epoch": 1.9776142767689417, "grad_norm": 5.057296276092529, "learning_rate": 1.9140625e-06, "loss": 1.0681, "step": 12633 }, { "epoch": 1.9777708202880402, "grad_norm": 3.3424010276794434, "learning_rate": 1.902173913043478e-06, "loss": 0.5287, "step": 12634 }, { "epoch": 1.9779273638071384, "grad_norm": 3.986919641494751, "learning_rate": 1.8902853260869563e-06, "loss": 0.9049, "step": 12635 }, { "epoch": 1.9780839073262366, "grad_norm": 1.4179490804672241, "learning_rate": 1.8783967391304346e-06, "loss": 0.3816, "step": 12636 }, { "epoch": 1.9782404508453348, "grad_norm": 2.000173568725586, "learning_rate": 1.866508152173913e-06, "loss": 0.6127, "step": 12637 }, { "epoch": 1.9783969943644333, "grad_norm": 3.031588554382324, "learning_rate": 1.8546195652173911e-06, "loss": 0.9132, "step": 12638 }, { 
"epoch": 1.9785535378835317, "grad_norm": 1.8740705251693726, "learning_rate": 1.8427309782608693e-06, "loss": 2.9644, "step": 12639 }, { "epoch": 1.97871008140263, "grad_norm": 2.0627734661102295, "learning_rate": 1.8308423913043479e-06, "loss": 2.7472, "step": 12640 }, { "epoch": 1.9788666249217282, "grad_norm": 1.8062782287597656, "learning_rate": 1.818953804347826e-06, "loss": 2.8882, "step": 12641 }, { "epoch": 1.9790231684408266, "grad_norm": 1.7078273296356201, "learning_rate": 1.8070652173913042e-06, "loss": 2.9085, "step": 12642 }, { "epoch": 1.9791797119599248, "grad_norm": 1.9415303468704224, "learning_rate": 1.7951766304347823e-06, "loss": 2.9508, "step": 12643 }, { "epoch": 1.9793362554790233, "grad_norm": 2.1048622131347656, "learning_rate": 1.7832880434782607e-06, "loss": 2.9751, "step": 12644 }, { "epoch": 1.9794927989981215, "grad_norm": 1.8078382015228271, "learning_rate": 1.771399456521739e-06, "loss": 2.8587, "step": 12645 }, { "epoch": 1.9796493425172197, "grad_norm": 1.8724548816680908, "learning_rate": 1.7595108695652172e-06, "loss": 2.9505, "step": 12646 }, { "epoch": 1.9798058860363181, "grad_norm": 1.653760552406311, "learning_rate": 1.7476222826086954e-06, "loss": 2.7293, "step": 12647 }, { "epoch": 1.9799624295554166, "grad_norm": 1.7813156843185425, "learning_rate": 1.7357336956521738e-06, "loss": 2.8391, "step": 12648 }, { "epoch": 1.9801189730745148, "grad_norm": 1.5969213247299194, "learning_rate": 1.723845108695652e-06, "loss": 2.6101, "step": 12649 }, { "epoch": 1.980275516593613, "grad_norm": 1.782819151878357, "learning_rate": 1.7119565217391303e-06, "loss": 2.7501, "step": 12650 }, { "epoch": 1.9804320601127112, "grad_norm": 2.9006540775299072, "learning_rate": 1.7000679347826087e-06, "loss": 2.61, "step": 12651 }, { "epoch": 1.9805886036318097, "grad_norm": 1.870467185974121, "learning_rate": 1.6881793478260868e-06, "loss": 2.8035, "step": 12652 }, { "epoch": 1.980745147150908, "grad_norm": 1.9332584142684937, "learning_rate": 
1.676290760869565e-06, "loss": 2.8258, "step": 12653 }, { "epoch": 1.9809016906700063, "grad_norm": 1.8363772630691528, "learning_rate": 1.6644021739130433e-06, "loss": 2.6109, "step": 12654 }, { "epoch": 1.9810582341891045, "grad_norm": 1.523802399635315, "learning_rate": 1.6525135869565217e-06, "loss": 2.347, "step": 12655 }, { "epoch": 1.9812147777082028, "grad_norm": 2.6608991622924805, "learning_rate": 1.6406249999999999e-06, "loss": 2.3349, "step": 12656 }, { "epoch": 1.9813713212273012, "grad_norm": 1.898357629776001, "learning_rate": 1.628736413043478e-06, "loss": 2.6599, "step": 12657 }, { "epoch": 1.9815278647463996, "grad_norm": 4.004353046417236, "learning_rate": 1.6168478260869564e-06, "loss": 2.5054, "step": 12658 }, { "epoch": 1.9816844082654979, "grad_norm": 1.7210047245025635, "learning_rate": 1.6049592391304347e-06, "loss": 2.2397, "step": 12659 }, { "epoch": 1.981840951784596, "grad_norm": 1.9547840356826782, "learning_rate": 1.593070652173913e-06, "loss": 2.2959, "step": 12660 }, { "epoch": 1.9819974953036943, "grad_norm": 4.719292163848877, "learning_rate": 1.581182065217391e-06, "loss": 1.9402, "step": 12661 }, { "epoch": 1.9821540388227927, "grad_norm": 3.2298662662506104, "learning_rate": 1.5692934782608694e-06, "loss": 2.1615, "step": 12662 }, { "epoch": 1.9823105823418912, "grad_norm": 4.730401515960693, "learning_rate": 1.5574048913043478e-06, "loss": 2.2884, "step": 12663 }, { "epoch": 1.9824671258609894, "grad_norm": 8.258781433105469, "learning_rate": 1.545516304347826e-06, "loss": 2.4707, "step": 12664 }, { "epoch": 1.9826236693800876, "grad_norm": 3.2877509593963623, "learning_rate": 1.5336277173913043e-06, "loss": 2.3689, "step": 12665 }, { "epoch": 1.9827802128991858, "grad_norm": 3.8593738079071045, "learning_rate": 1.5217391304347825e-06, "loss": 2.1377, "step": 12666 }, { "epoch": 1.9829367564182843, "grad_norm": 2.1214382648468018, "learning_rate": 1.5098505434782606e-06, "loss": 1.9757, "step": 12667 }, { "epoch": 
1.9830932999373827, "grad_norm": 5.00585412979126, "learning_rate": 1.497961956521739e-06, "loss": 1.8729, "step": 12668 }, { "epoch": 1.983249843456481, "grad_norm": 1.4192140102386475, "learning_rate": 1.4860733695652174e-06, "loss": 1.7005, "step": 12669 }, { "epoch": 1.9834063869755791, "grad_norm": 6.438273906707764, "learning_rate": 1.4741847826086955e-06, "loss": 1.6056, "step": 12670 }, { "epoch": 1.9835629304946774, "grad_norm": 1.9602165222167969, "learning_rate": 1.4622961956521737e-06, "loss": 1.2994, "step": 12671 }, { "epoch": 1.9837194740137758, "grad_norm": 9.215200424194336, "learning_rate": 1.4504076086956518e-06, "loss": 1.812, "step": 12672 }, { "epoch": 1.9838760175328742, "grad_norm": 3.5848751068115234, "learning_rate": 1.4385190217391304e-06, "loss": 1.0818, "step": 12673 }, { "epoch": 1.9840325610519725, "grad_norm": 3.668435573577881, "learning_rate": 1.4266304347826086e-06, "loss": 1.5642, "step": 12674 }, { "epoch": 1.9841891045710707, "grad_norm": 3.286691427230835, "learning_rate": 1.4147418478260867e-06, "loss": 1.6558, "step": 12675 }, { "epoch": 1.9843456480901691, "grad_norm": 6.311059474945068, "learning_rate": 1.402853260869565e-06, "loss": 1.4693, "step": 12676 }, { "epoch": 1.9845021916092673, "grad_norm": 6.454926013946533, "learning_rate": 1.3909646739130435e-06, "loss": 1.2832, "step": 12677 }, { "epoch": 1.9846587351283658, "grad_norm": 4.690300464630127, "learning_rate": 1.3790760869565216e-06, "loss": 1.1196, "step": 12678 }, { "epoch": 1.984815278647464, "grad_norm": 7.509739398956299, "learning_rate": 1.3671874999999998e-06, "loss": 1.1682, "step": 12679 }, { "epoch": 1.9849718221665622, "grad_norm": 9.190909385681152, "learning_rate": 1.3552989130434781e-06, "loss": 0.8935, "step": 12680 }, { "epoch": 1.9851283656856606, "grad_norm": 3.547703981399536, "learning_rate": 1.3434103260869563e-06, "loss": 1.3581, "step": 12681 }, { "epoch": 1.985284909204759, "grad_norm": 4.287250518798828, "learning_rate": 
1.3315217391304347e-06, "loss": 0.9805, "step": 12682 }, { "epoch": 1.9854414527238573, "grad_norm": 3.558534622192383, "learning_rate": 1.319633152173913e-06, "loss": 0.8826, "step": 12683 }, { "epoch": 1.9855979962429555, "grad_norm": 6.892899036407471, "learning_rate": 1.3077445652173912e-06, "loss": 0.5837, "step": 12684 }, { "epoch": 1.9857545397620537, "grad_norm": 1.67763352394104, "learning_rate": 1.2958559782608693e-06, "loss": 0.7445, "step": 12685 }, { "epoch": 1.9859110832811522, "grad_norm": 2.8098526000976562, "learning_rate": 1.283967391304348e-06, "loss": 0.922, "step": 12686 }, { "epoch": 1.9860676268002506, "grad_norm": 1.7574679851531982, "learning_rate": 1.272078804347826e-06, "loss": 0.5418, "step": 12687 }, { "epoch": 1.9862241703193488, "grad_norm": 1.9214035272598267, "learning_rate": 1.2601902173913042e-06, "loss": 0.7568, "step": 12688 }, { "epoch": 1.986380713838447, "grad_norm": 1.9888882637023926, "learning_rate": 1.2483016304347824e-06, "loss": 2.856, "step": 12689 }, { "epoch": 1.9865372573575453, "grad_norm": 1.7812798023223877, "learning_rate": 1.2364130434782608e-06, "loss": 2.9388, "step": 12690 }, { "epoch": 1.9866938008766437, "grad_norm": 1.7115925550460815, "learning_rate": 1.2245244565217391e-06, "loss": 2.9186, "step": 12691 }, { "epoch": 1.9868503443957422, "grad_norm": 1.9333367347717285, "learning_rate": 1.2126358695652173e-06, "loss": 2.9341, "step": 12692 }, { "epoch": 1.9870068879148404, "grad_norm": 1.8189491033554077, "learning_rate": 1.2007472826086954e-06, "loss": 2.9228, "step": 12693 }, { "epoch": 1.9871634314339386, "grad_norm": 2.0311336517333984, "learning_rate": 1.1888586956521738e-06, "loss": 2.7668, "step": 12694 }, { "epoch": 1.9873199749530368, "grad_norm": 4.429170608520508, "learning_rate": 1.176970108695652e-06, "loss": 2.8553, "step": 12695 }, { "epoch": 1.9874765184721352, "grad_norm": 1.9434064626693726, "learning_rate": 1.1650815217391303e-06, "loss": 2.8235, "step": 12696 }, { "epoch": 
1.9876330619912337, "grad_norm": 1.5084233283996582, "learning_rate": 1.1531929347826087e-06, "loss": 2.6301, "step": 12697 }, { "epoch": 1.987789605510332, "grad_norm": 1.4962050914764404, "learning_rate": 1.1413043478260869e-06, "loss": 2.6707, "step": 12698 }, { "epoch": 1.9879461490294301, "grad_norm": 1.6622228622436523, "learning_rate": 1.129415760869565e-06, "loss": 2.6751, "step": 12699 }, { "epoch": 1.9881026925485283, "grad_norm": 1.57244074344635, "learning_rate": 1.1175271739130434e-06, "loss": 2.6027, "step": 12700 }, { "epoch": 1.9882592360676268, "grad_norm": 1.9821975231170654, "learning_rate": 1.1056385869565217e-06, "loss": 2.5564, "step": 12701 }, { "epoch": 1.9884157795867252, "grad_norm": 1.600675344467163, "learning_rate": 1.09375e-06, "loss": 2.5909, "step": 12702 }, { "epoch": 1.9885723231058234, "grad_norm": 1.9700291156768799, "learning_rate": 1.081861413043478e-06, "loss": 2.6146, "step": 12703 }, { "epoch": 1.9887288666249217, "grad_norm": 1.8288556337356567, "learning_rate": 1.0699728260869564e-06, "loss": 2.153, "step": 12704 }, { "epoch": 1.9888854101440199, "grad_norm": 1.6595802307128906, "learning_rate": 1.0580842391304346e-06, "loss": 2.5695, "step": 12705 }, { "epoch": 1.9890419536631183, "grad_norm": 1.905251145362854, "learning_rate": 1.046195652173913e-06, "loss": 2.1581, "step": 12706 }, { "epoch": 1.9891984971822168, "grad_norm": 7.70994234085083, "learning_rate": 1.0343070652173911e-06, "loss": 2.3857, "step": 12707 }, { "epoch": 1.989355040701315, "grad_norm": 2.2474119663238525, "learning_rate": 1.0224184782608695e-06, "loss": 2.1741, "step": 12708 }, { "epoch": 1.9895115842204132, "grad_norm": 3.1898422241210938, "learning_rate": 1.0105298913043476e-06, "loss": 2.5811, "step": 12709 }, { "epoch": 1.9896681277395116, "grad_norm": 12.819948196411133, "learning_rate": 9.98641304347826e-07, "loss": 2.2146, "step": 12710 }, { "epoch": 1.9898246712586098, "grad_norm": 4.550641059875488, "learning_rate": 9.867527173913044e-07, 
"loss": 1.9701, "step": 12711 }, { "epoch": 1.9899812147777083, "grad_norm": 2.790452241897583, "learning_rate": 9.748641304347825e-07, "loss": 2.2031, "step": 12712 }, { "epoch": 1.9901377582968065, "grad_norm": 1.7086151838302612, "learning_rate": 9.629755434782609e-07, "loss": 2.0329, "step": 12713 }, { "epoch": 1.9902943018159047, "grad_norm": 3.3103647232055664, "learning_rate": 9.51086956521739e-07, "loss": 1.9044, "step": 12714 }, { "epoch": 1.9904508453350032, "grad_norm": 2.4661638736724854, "learning_rate": 9.391983695652173e-07, "loss": 1.7576, "step": 12715 }, { "epoch": 1.9906073888541016, "grad_norm": 2.7166128158569336, "learning_rate": 9.273097826086956e-07, "loss": 1.6735, "step": 12716 }, { "epoch": 1.9907639323731998, "grad_norm": 27.716978073120117, "learning_rate": 9.154211956521739e-07, "loss": 1.761, "step": 12717 }, { "epoch": 1.990920475892298, "grad_norm": 4.854394435882568, "learning_rate": 9.035326086956521e-07, "loss": 1.8885, "step": 12718 }, { "epoch": 1.9910770194113963, "grad_norm": 3.518700361251831, "learning_rate": 8.916440217391304e-07, "loss": 1.482, "step": 12719 }, { "epoch": 1.9912335629304947, "grad_norm": 2.0685298442840576, "learning_rate": 8.797554347826086e-07, "loss": 1.3468, "step": 12720 }, { "epoch": 1.9913901064495931, "grad_norm": 4.621035099029541, "learning_rate": 8.678668478260869e-07, "loss": 1.1385, "step": 12721 }, { "epoch": 1.9915466499686914, "grad_norm": 3.4484710693359375, "learning_rate": 8.559782608695651e-07, "loss": 1.2318, "step": 12722 }, { "epoch": 1.9917031934877896, "grad_norm": 5.243100166320801, "learning_rate": 8.440896739130434e-07, "loss": 1.5499, "step": 12723 }, { "epoch": 1.9918597370068878, "grad_norm": 5.906971454620361, "learning_rate": 8.322010869565217e-07, "loss": 1.2049, "step": 12724 }, { "epoch": 1.9920162805259862, "grad_norm": 5.3394856452941895, "learning_rate": 8.203124999999999e-07, "loss": 1.1445, "step": 12725 }, { "epoch": 1.9921728240450847, "grad_norm": 
2.8514204025268555, "learning_rate": 8.084239130434782e-07, "loss": 1.1529, "step": 12726 }, { "epoch": 1.9923293675641829, "grad_norm": 5.576069355010986, "learning_rate": 7.965353260869565e-07, "loss": 1.2134, "step": 12727 }, { "epoch": 1.992485911083281, "grad_norm": 3.3972792625427246, "learning_rate": 7.846467391304347e-07, "loss": 1.1258, "step": 12728 }, { "epoch": 1.9926424546023793, "grad_norm": 6.000216960906982, "learning_rate": 7.72758152173913e-07, "loss": 1.8632, "step": 12729 }, { "epoch": 1.9927989981214778, "grad_norm": 3.4334568977355957, "learning_rate": 7.608695652173912e-07, "loss": 1.1877, "step": 12730 }, { "epoch": 1.9929555416405762, "grad_norm": 4.444576263427734, "learning_rate": 7.489809782608695e-07, "loss": 1.2108, "step": 12731 }, { "epoch": 1.9931120851596744, "grad_norm": 3.6119322776794434, "learning_rate": 7.370923913043478e-07, "loss": 0.7452, "step": 12732 }, { "epoch": 1.9932686286787726, "grad_norm": NaN, "learning_rate": 7.370923913043478e-07, "loss": 0.0, "step": 12733 }, { "epoch": 1.9934251721978709, "grad_norm": 3.437225341796875, "learning_rate": 7.252038043478259e-07, "loss": 0.812, "step": 12734 }, { "epoch": 1.9935817157169693, "grad_norm": 8.215194702148438, "learning_rate": 7.133152173913043e-07, "loss": 1.0311, "step": 12735 }, { "epoch": 1.9937382592360677, "grad_norm": 3.3470985889434814, "learning_rate": 7.014266304347825e-07, "loss": 0.6741, "step": 12736 }, { "epoch": 1.993894802755166, "grad_norm": 3.7932448387145996, "learning_rate": 6.895380434782608e-07, "loss": 1.0251, "step": 12737 }, { "epoch": 1.9940513462742642, "grad_norm": 6.221490859985352, "learning_rate": 6.776494565217391e-07, "loss": 1.1745, "step": 12738 }, { "epoch": 1.9942078897933626, "grad_norm": 1.842808485031128, "learning_rate": 6.657608695652173e-07, "loss": 2.8182, "step": 12739 }, { "epoch": 1.9943644333124608, "grad_norm": 2.083857297897339, "learning_rate": 6.538722826086956e-07, "loss": 2.8712, "step": 12740 }, { "epoch": 
1.9945209768315593, "grad_norm": 1.6432135105133057, "learning_rate": 6.41983695652174e-07, "loss": 2.8298, "step": 12741 }, { "epoch": 1.9946775203506575, "grad_norm": 1.7040554285049438, "learning_rate": 6.300951086956521e-07, "loss": 2.841, "step": 12742 }, { "epoch": 1.9948340638697557, "grad_norm": 1.7833001613616943, "learning_rate": 6.182065217391304e-07, "loss": 2.7148, "step": 12743 }, { "epoch": 1.9949906073888541, "grad_norm": 1.720930576324463, "learning_rate": 6.063179347826086e-07, "loss": 2.8135, "step": 12744 }, { "epoch": 1.9951471509079524, "grad_norm": 1.4388052225112915, "learning_rate": 5.944293478260869e-07, "loss": 2.7699, "step": 12745 }, { "epoch": 1.9953036944270508, "grad_norm": 1.8502174615859985, "learning_rate": 5.825407608695652e-07, "loss": 2.8312, "step": 12746 }, { "epoch": 1.995460237946149, "grad_norm": 1.5378456115722656, "learning_rate": 5.706521739130434e-07, "loss": 2.5635, "step": 12747 }, { "epoch": 1.9956167814652472, "grad_norm": 1.4994385242462158, "learning_rate": 5.587635869565217e-07, "loss": 2.6516, "step": 12748 }, { "epoch": 1.9957733249843457, "grad_norm": 1.7972216606140137, "learning_rate": 5.46875e-07, "loss": 2.7451, "step": 12749 }, { "epoch": 1.9959298685034441, "grad_norm": 1.7638109922409058, "learning_rate": 5.349864130434782e-07, "loss": 2.56, "step": 12750 }, { "epoch": 1.9960864120225423, "grad_norm": 2.7034292221069336, "learning_rate": 5.230978260869565e-07, "loss": 2.5539, "step": 12751 }, { "epoch": 1.9962429555416406, "grad_norm": 1.366592526435852, "learning_rate": 5.112092391304347e-07, "loss": 2.4434, "step": 12752 }, { "epoch": 1.9963994990607388, "grad_norm": 1.6307352781295776, "learning_rate": 4.99320652173913e-07, "loss": 2.5648, "step": 12753 }, { "epoch": 1.9965560425798372, "grad_norm": 1.8082646131515503, "learning_rate": 4.874320652173913e-07, "loss": 2.4413, "step": 12754 }, { "epoch": 1.9967125860989356, "grad_norm": 1.7128336429595947, "learning_rate": 4.755434782608695e-07, 
"loss": 2.4607, "step": 12755 }, { "epoch": 1.9968691296180339, "grad_norm": 2.8533339500427246, "learning_rate": 4.636548913043478e-07, "loss": 2.3888, "step": 12756 }, { "epoch": 1.997025673137132, "grad_norm": 2.9971630573272705, "learning_rate": 4.5176630434782605e-07, "loss": 1.9606, "step": 12757 }, { "epoch": 1.9971822166562303, "grad_norm": 1.4824519157409668, "learning_rate": 4.398777173913043e-07, "loss": 1.9636, "step": 12758 }, { "epoch": 1.9973387601753287, "grad_norm": 2.7305357456207275, "learning_rate": 4.2798913043478257e-07, "loss": 1.8046, "step": 12759 }, { "epoch": 1.9974953036944272, "grad_norm": 2.7387592792510986, "learning_rate": 4.1610054347826083e-07, "loss": 1.6898, "step": 12760 }, { "epoch": 1.9976518472135254, "grad_norm": 3.7550573348999023, "learning_rate": 4.042119565217391e-07, "loss": 1.7429, "step": 12761 }, { "epoch": 1.9978083907326236, "grad_norm": 4.4314866065979, "learning_rate": 3.9232336956521736e-07, "loss": 2.0755, "step": 12762 }, { "epoch": 1.9979649342517218, "grad_norm": 2.387753486633301, "learning_rate": 3.804347826086956e-07, "loss": 1.6166, "step": 12763 }, { "epoch": 1.9981214777708203, "grad_norm": 5.417107582092285, "learning_rate": 3.685461956521739e-07, "loss": 1.6565, "step": 12764 }, { "epoch": 1.9982780212899187, "grad_norm": 3.8149495124816895, "learning_rate": 3.5665760869565214e-07, "loss": 1.8832, "step": 12765 }, { "epoch": 1.998434564809017, "grad_norm": 6.623764991760254, "learning_rate": 3.447690217391304e-07, "loss": 1.5304, "step": 12766 }, { "epoch": 1.9985911083281152, "grad_norm": 3.2151501178741455, "learning_rate": 3.3288043478260867e-07, "loss": 1.0327, "step": 12767 }, { "epoch": 1.9987476518472134, "grad_norm": 6.950605869293213, "learning_rate": 3.20991847826087e-07, "loss": 1.764, "step": 12768 }, { "epoch": 1.9989041953663118, "grad_norm": 38.88340759277344, "learning_rate": 3.091032608695652e-07, "loss": 1.4558, "step": 12769 }, { "epoch": 1.9990607388854102, "grad_norm": 
5.565809726715088, "learning_rate": 2.9721467391304345e-07, "loss": 1.7097, "step": 12770 }, { "epoch": 1.9992172824045085, "grad_norm": 7.170962810516357, "learning_rate": 2.853260869565217e-07, "loss": 1.2168, "step": 12771 }, { "epoch": 1.9993738259236067, "grad_norm": 2.1780800819396973, "learning_rate": 2.734375e-07, "loss": 0.9111, "step": 12772 }, { "epoch": 1.9995303694427051, "grad_norm": 4.814699172973633, "learning_rate": 2.6154891304347824e-07, "loss": 0.6491, "step": 12773 }, { "epoch": 1.9996869129618033, "grad_norm": 1.922163724899292, "learning_rate": 2.496603260869565e-07, "loss": 0.4335, "step": 12774 }, { "epoch": 1.9998434564809018, "grad_norm": 3.4018924236297607, "learning_rate": 2.3777173913043476e-07, "loss": 1.4645, "step": 12775 }, { "epoch": 2.0, "grad_norm": 5.539596080780029, "learning_rate": 2.2588315217391302e-07, "loss": 1.2158, "step": 12776 }, { "epoch": 2.0, "step": 12776, "total_flos": 1.799905283771071e+19, "train_loss": 1.0417570046014482, "train_runtime": 7667.7466, "train_samples_per_second": 26.657, "train_steps_per_second": 1.666 } ], "logging_steps": 1.0, "max_steps": 12776, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.799905283771071e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }