diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,88390 +2,76274 @@ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, - "eval_steps": 350000, - "global_step": 1472295, + "eval_steps": 300000, + "global_step": 1088730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "learning_rate": 1.1294172439890552e-07, - "loss": 9.4022, + "epoch": 0.0013777520597393293, + "grad_norm": 11.65067195892334, + "learning_rate": 4.849375459221161e-07, + "loss": 1.4102, "step": 100 }, { - "epoch": 0.0, - "learning_rate": 2.2821214414624206e-07, - "loss": 9.2918, + "epoch": 0.0027555041194786585, + "grad_norm": 13.264008522033691, + "learning_rate": 9.900808229243204e-07, + "loss": 1.4591, "step": 200 }, { - "epoch": 0.0, - "learning_rate": 3.4464691156779416e-07, - "loss": 9.011, + "epoch": 0.004133256179217988, + "grad_norm": 8.67760181427002, + "learning_rate": 1.4952240999265246e-06, + "loss": 1.2864, "step": 300 }, { - "epoch": 0.0, - "learning_rate": 4.6108167898934623e-07, - "loss": 8.5504, + "epoch": 0.005511008238957317, + "grad_norm": 16.742992401123047, + "learning_rate": 1.9953159441587073e-06, + "loss": 1.3362, "step": 400 }, { - "epoch": 0.01, - "learning_rate": 5.763520987366827e-07, - "loss": 8.043, + "epoch": 0.006888760298696646, + "grad_norm": 24.729005813598633, + "learning_rate": 2.500459221160911e-06, + "loss": 1.2823, "step": 500 }, { - "epoch": 0.01, - "learning_rate": 6.927868661582348e-07, - "loss": 7.7445, + "epoch": 0.008266512358435976, + "grad_norm": 9.72551155090332, + "learning_rate": 3.0056024981631153e-06, + "loss": 1.2514, "step": 600 }, { - "epoch": 0.01, - "learning_rate": 8.092216335797869e-07, - "loss": 7.5088, + "epoch": 0.009644264418175305, + "grad_norm": 5.669283866882324, + "learning_rate": 3.5107457751653195e-06, + "loss": 1.2883, "step": 700 }, { - "epoch": 0.01, - "learning_rate": 9.25656401001339e-07, - "loss": 7.3762, + "epoch": 0.011022016477914634, + "grad_norm": 34.766544342041016, + "learning_rate": 4.015889052167524e-06, + "loss": 1.1993, "step": 800 }, { - "epoch": 0.01, - "learning_rate": 1.0420911684228912e-06, - "loss": 7.2198, + "epoch": 0.012399768537653963, + "grad_norm": 7.9280829429626465, + "learning_rate": 4.521032329169728e-06, + "loss": 1.0601, "step": 900 }, { - "epoch": 0.01, - "learning_rate": 1.158525935844443e-06, - "loss": 7.1566, + "epoch": 0.013777520597393293, + "grad_norm": 5.170280456542969, + "learning_rate": 5.0261756061719325e-06, + "loss": 1.1682, "step": 1000 }, { - "epoch": 0.01, - "learning_rate": 1.2749607032659952e-06, - "loss": 7.0508, + "epoch": 0.015155272657132622, + "grad_norm": 12.251971244812012, + "learning_rate": 5.531318883174137e-06, + "loss": 1.0818, "step": 1100 }, { - "epoch": 0.01, - "learning_rate": 1.3913954706875475e-06, - "loss": 6.9891, + "epoch": 0.01653302471687195, + "grad_norm": 6.913139820098877, + "learning_rate": 6.036462160176341e-06, + "loss": 1.1873, "step": 1200 }, { - "epoch": 0.01, - "learning_rate": 1.5078302381090993e-06, - "loss": 6.8779, + "epoch": 0.017910776776611282, + "grad_norm": 4.224829196929932, + "learning_rate": 6.5416054371785455e-06, + "loss": 1.1132, "step": 1300 }, { - "epoch": 0.01, - "learning_rate": 1.6242650055306516e-06, - "loss": 6.8091, + "epoch": 0.01928852883635061, + "grad_norm": 3.3719537258148193, + "learning_rate": 7.046748714180749e-06, + "loss": 1.0681, "step": 1400 }, { - "epoch": 0.02, - "learning_rate": 1.7406997729522035e-06, - "loss": 6.7784, + "epoch": 0.02066628089608994, + "grad_norm": 9.445250511169434, + "learning_rate": 7.551891991182953e-06, + "loss": 1.0711, "step": 1500 }, { - "epoch": 0.02, - "learning_rate": 1.8571345403737557e-06, - "loss": 6.6996, + "epoch": 0.022044032955829268, + "grad_norm": 21.53278160095215, + "learning_rate": 8.057035268185158e-06, + "loss": 1.1757, "step": 1600 }, { - "epoch": 0.02, - "learning_rate": 1.973569307795308e-06, - "loss": 6.6599, + "epoch": 0.0234217850155686, + "grad_norm": 19.54088592529297, + "learning_rate": 8.562178545187362e-06, + "loss": 1.0576, "step": 1700 }, { - "epoch": 0.02, - "learning_rate": 2.09000407521686e-06, - "loss": 6.6455, + "epoch": 0.024799537075307927, + "grad_norm": 7.3780927658081055, + "learning_rate": 9.067321822189567e-06, + "loss": 1.036, "step": 1800 }, { - "epoch": 0.02, - "learning_rate": 2.206438842638412e-06, - "loss": 6.5771, + "epoch": 0.026177289135047258, + "grad_norm": 9.797306060791016, + "learning_rate": 9.57246509919177e-06, + "loss": 1.1004, "step": 1900 }, { - "epoch": 0.02, - "learning_rate": 2.322873610059964e-06, - "loss": 6.5437, + "epoch": 0.027555041194786585, + "grad_norm": 7.63425350189209, + "learning_rate": 1.0077608376193976e-05, + "loss": 1.0462, "step": 2000 }, { - "epoch": 0.02, - "learning_rate": 2.4393083774815157e-06, - "loss": 6.4812, + "epoch": 0.028932793254525916, + "grad_norm": 6.704458236694336, + "learning_rate": 1.0582751653196178e-05, + "loss": 1.037, "step": 2100 }, { - "epoch": 0.02, - "learning_rate": 2.555743144903068e-06, - "loss": 6.3998, + "epoch": 0.030310545314265244, + "grad_norm": 6.635415077209473, + "learning_rate": 1.1087894930198384e-05, + "loss": 0.9806, "step": 2200 }, { - "epoch": 0.02, - "learning_rate": 2.6721779123246202e-06, - "loss": 6.4008, + "epoch": 0.031688297374004575, + "grad_norm": 4.560552597045898, + "learning_rate": 1.1593038207200588e-05, + "loss": 0.8893, "step": 2300 }, { - "epoch": 0.02, - "learning_rate": 2.7886126797461723e-06, - "loss": 6.3409, + "epoch": 0.0330660494337439, + "grad_norm": 5.239875316619873, + "learning_rate": 1.2098181484202793e-05, + "loss": 1.0697, "step": 2400 }, { - "epoch": 0.03, - "learning_rate": 2.9050474471677244e-06, - "loss": 6.3212, + "epoch": 0.03444380149348323, + "grad_norm": 4.281900882720947, + "learning_rate": 1.2603324761204997e-05, + "loss": 0.9615, "step": 2500 }, { - "epoch": 0.03, - "learning_rate": 3.0214822145892764e-06, - "loss": 6.3122, + "epoch": 0.035821553553222564, + "grad_norm": 11.624236106872559, + "learning_rate": 1.31084680382072e-05, + "loss": 1.0059, "step": 2600 }, { - "epoch": 0.03, - "learning_rate": 3.1379169820108285e-06, - "loss": 6.267, + "epoch": 0.03719930561296189, + "grad_norm": 28.93486785888672, + "learning_rate": 1.3608559882439383e-05, + "loss": 0.8404, "step": 2700 }, { - "epoch": 0.03, - "learning_rate": 3.2543517494323806e-06, - "loss": 6.2042, + "epoch": 0.03857705767270122, + "grad_norm": 11.4780912399292, + "learning_rate": 1.4113703159441589e-05, + "loss": 0.9094, "step": 2800 }, { - "epoch": 0.03, - "learning_rate": 3.3707865168539327e-06, - "loss": 6.2018, + "epoch": 0.03995480973244055, + "grad_norm": 9.215027809143066, + "learning_rate": 1.4618846436443793e-05, + "loss": 0.8866, "step": 2900 }, { - "epoch": 0.03, - "learning_rate": 3.4872212842754847e-06, - "loss": 6.1906, + "epoch": 0.04133256179217988, + "grad_norm": 29.185121536254883, + "learning_rate": 1.5123989713445997e-05, + "loss": 0.9744, "step": 3000 }, { - "epoch": 0.03, - "learning_rate": 3.603656051697037e-06, - "loss": 6.0879, + "epoch": 0.04271031385191921, + "grad_norm": 5.132340908050537, + "learning_rate": 1.56291329904482e-05, + "loss": 1.0274, "step": 3100 }, { - "epoch": 0.03, - "learning_rate": 3.720090819118589e-06, - "loss": 6.062, + "epoch": 0.044088065911658536, + "grad_norm": 7.962031364440918, + "learning_rate": 1.6134276267450406e-05, + "loss": 0.8822, "step": 3200 }, { - "epoch": 0.03, - "learning_rate": 3.836525586540141e-06, - "loss": 6.0293, + "epoch": 0.04546581797139787, + "grad_norm": 6.370624542236328, + "learning_rate": 1.663941954445261e-05, + "loss": 1.0308, "step": 3300 }, { - "epoch": 0.03, - "learning_rate": 3.952960353961693e-06, - "loss": 6.0056, + "epoch": 0.0468435700311372, + "grad_norm": 4.936957836151123, + "learning_rate": 1.7144562821454813e-05, + "loss": 0.927, "step": 3400 }, { - "epoch": 0.04, - "learning_rate": 4.069395121383245e-06, - "loss": 6.0142, + "epoch": 0.048221322090876526, + "grad_norm": 24.894546508789062, + "learning_rate": 1.7649706098457017e-05, + "loss": 0.9564, "step": 3500 }, { - "epoch": 0.04, - "learning_rate": 4.185829888804797e-06, - "loss": 5.98, + "epoch": 0.04959907415061585, + "grad_norm": 9.514641761779785, + "learning_rate": 1.815484937545922e-05, + "loss": 0.9474, "step": 3600 }, { - "epoch": 0.04, - "learning_rate": 4.302264656226349e-06, - "loss": 5.9428, + "epoch": 0.05097682621035519, + "grad_norm": 5.476971626281738, + "learning_rate": 1.8659992652461428e-05, + "loss": 0.9496, "step": 3700 }, { - "epoch": 0.04, - "learning_rate": 4.418699423647901e-06, - "loss": 5.8527, + "epoch": 0.052354578270094515, + "grad_norm": 15.0453462600708, + "learning_rate": 1.9165135929463632e-05, + "loss": 0.9339, "step": 3800 }, { - "epoch": 0.04, - "learning_rate": 4.535134191069453e-06, - "loss": 5.8186, + "epoch": 0.05373233032983384, + "grad_norm": 14.468217849731445, + "learning_rate": 1.9670279206465836e-05, + "loss": 0.9348, "step": 3900 }, { - "epoch": 0.04, - "learning_rate": 4.6515689584910054e-06, - "loss": 5.7424, + "epoch": 0.05511008238957317, + "grad_norm": 7.205018520355225, + "learning_rate": 2.0175422483468036e-05, + "loss": 0.912, "step": 4000 }, { - "epoch": 0.04, - "learning_rate": 4.7680037259125575e-06, - "loss": 5.8399, + "epoch": 0.056487834449312505, + "grad_norm": 4.877631187438965, + "learning_rate": 2.0680565760470243e-05, + "loss": 0.9864, "step": 4100 }, { - "epoch": 0.04, - "learning_rate": 4.8844384933341096e-06, - "loss": 5.6997, + "epoch": 0.05786558650905183, + "grad_norm": 8.84345531463623, + "learning_rate": 2.1185709037472447e-05, + "loss": 1.0205, "step": 4200 }, { - "epoch": 0.04, - "learning_rate": 5.000873260755662e-06, - "loss": 5.6402, + "epoch": 0.05924333856879116, + "grad_norm": 23.226093292236328, + "learning_rate": 2.169085231447465e-05, + "loss": 0.937, "step": 4300 }, { - "epoch": 0.04, - "learning_rate": 5.117308028177214e-06, - "loss": 5.6395, + "epoch": 0.06062109062853049, + "grad_norm": 12.72083568572998, + "learning_rate": 2.2195995591476855e-05, + "loss": 0.9369, "step": 4400 }, { - "epoch": 0.05, - "learning_rate": 5.233742795598766e-06, - "loss": 5.5789, + "epoch": 0.06199884268826982, + "grad_norm": 17.244569778442383, + "learning_rate": 2.270113886847906e-05, + "loss": 0.952, "step": 4500 }, { - "epoch": 0.05, - "learning_rate": 5.350177563020319e-06, - "loss": 5.5653, + "epoch": 0.06337659474800915, + "grad_norm": 6.532393932342529, + "learning_rate": 2.3206282145481265e-05, + "loss": 0.9171, "step": 4600 }, { - "epoch": 0.05, - "learning_rate": 5.46661233044187e-06, - "loss": 5.5319, + "epoch": 0.06475434680774848, + "grad_norm": 10.168941497802734, + "learning_rate": 2.371142542248347e-05, + "loss": 1.0035, "step": 4700 }, { - "epoch": 0.05, - "learning_rate": 5.583047097863422e-06, - "loss": 5.4884, + "epoch": 0.0661320988674878, + "grad_norm": 11.163745880126953, + "learning_rate": 2.4211517266715652e-05, + "loss": 0.8717, "step": 4800 }, { - "epoch": 0.05, - "learning_rate": 5.699481865284974e-06, - "loss": 5.3925, + "epoch": 0.06750985092722714, + "grad_norm": 8.413740158081055, + "learning_rate": 2.4716660543717856e-05, + "loss": 1.0384, "step": 4900 }, { - "epoch": 0.05, - "learning_rate": 5.815916632706526e-06, - "loss": 5.4045, + "epoch": 0.06888760298696646, + "grad_norm": 45.454017639160156, + "learning_rate": 2.521675238795004e-05, + "loss": 0.9217, "step": 5000 }, { - "epoch": 0.05, - "learning_rate": 5.932351400128079e-06, - "loss": 5.4015, + "epoch": 0.0702653550467058, + "grad_norm": 11.021474838256836, + "learning_rate": 2.572189566495224e-05, + "loss": 0.9638, "step": 5100 }, { - "epoch": 0.05, - "learning_rate": 6.04878616754963e-06, - "loss": 5.2608, + "epoch": 0.07164310710644513, + "grad_norm": 4.591658592224121, + "learning_rate": 2.6227038941954446e-05, + "loss": 1.068, "step": 5200 }, { - "epoch": 0.05, - "learning_rate": 6.165220934971182e-06, - "loss": 5.1825, + "epoch": 0.07302085916618445, + "grad_norm": 6.153427600860596, + "learning_rate": 2.673218221895665e-05, + "loss": 0.9792, "step": 5300 }, { - "epoch": 0.06, - "learning_rate": 6.280491354718519e-06, - "loss": 5.057, + "epoch": 0.07439861122592378, + "grad_norm": 5.186223030090332, + "learning_rate": 2.7237325495958854e-05, + "loss": 0.9601, "step": 5400 }, { - "epoch": 0.06, - "learning_rate": 6.3969261221400715e-06, - "loss": 5.0588, + "epoch": 0.07577636328566312, + "grad_norm": 11.95683765411377, + "learning_rate": 2.7742468772961058e-05, + "loss": 0.9659, "step": 5500 }, { - "epoch": 0.06, - "learning_rate": 6.513360889561624e-06, - "loss": 5.0782, + "epoch": 0.07715411534540244, + "grad_norm": 45.106727600097656, + "learning_rate": 2.824761204996326e-05, + "loss": 0.9401, "step": 5600 }, { - "epoch": 0.06, - "learning_rate": 6.629795656983176e-06, - "loss": 5.0982, + "epoch": 0.07853186740514177, + "grad_norm": 42.417320251464844, + "learning_rate": 2.8752755326965465e-05, + "loss": 1.1097, "step": 5700 }, { - "epoch": 0.06, - "learning_rate": 6.746230424404728e-06, - "loss": 4.8921, + "epoch": 0.0799096194648811, + "grad_norm": 9.149765014648438, + "learning_rate": 2.9257898603967676e-05, + "loss": 1.0048, "step": 5800 }, { - "epoch": 0.06, - "learning_rate": 6.862665191826279e-06, - "loss": 4.8616, + "epoch": 0.08128737152462043, + "grad_norm": 10.777480125427246, + "learning_rate": 2.976304188096988e-05, + "loss": 0.9749, "step": 5900 }, { - "epoch": 0.06, - "learning_rate": 6.979099959247832e-06, - "loss": 4.8014, + "epoch": 0.08266512358435976, + "grad_norm": 60.3282470703125, + "learning_rate": 3.0268185157972083e-05, + "loss": 0.9522, "step": 6000 }, { - "epoch": 0.06, - "learning_rate": 7.095534726669384e-06, - "loss": 4.7205, + "epoch": 0.08404287564409908, + "grad_norm": 6.608532428741455, + "learning_rate": 3.0773328434974284e-05, + "loss": 1.0985, "step": 6100 }, { - "epoch": 0.06, - "learning_rate": 7.21080514641672e-06, - "loss": 4.9263, + "epoch": 0.08542062770383842, + "grad_norm": 11.488170623779297, + "learning_rate": 3.127847171197649e-05, + "loss": 1.1379, "step": 6200 }, { - "epoch": 0.06, - "learning_rate": 7.327239913838272e-06, - "loss": 4.7103, + "epoch": 0.08679837976357775, + "grad_norm": 12.194034576416016, + "learning_rate": 3.178361498897869e-05, + "loss": 0.9478, "step": 6300 }, { - "epoch": 0.07, - "learning_rate": 7.4413459859113935e-06, - "loss": 4.6061, + "epoch": 0.08817613182331707, + "grad_norm": 19.647018432617188, + "learning_rate": 3.2288758265980895e-05, + "loss": 1.0554, "step": 6400 }, { - "epoch": 0.07, - "learning_rate": 7.557780753332946e-06, - "loss": 4.5566, + "epoch": 0.0895538838830564, + "grad_norm": 8.758164405822754, + "learning_rate": 3.278885011021308e-05, + "loss": 1.0042, "step": 6500 }, { - "epoch": 0.07, - "learning_rate": 7.674215520754497e-06, - "loss": 4.6434, + "epoch": 0.09093163594279574, + "grad_norm": 5.215201377868652, + "learning_rate": 3.329399338721529e-05, + "loss": 0.9614, "step": 6600 }, { - "epoch": 0.07, - "learning_rate": 7.790650288176048e-06, - "loss": 4.4895, + "epoch": 0.09230938800253506, + "grad_norm": 6.266079902648926, + "learning_rate": 3.379913666421749e-05, + "loss": 1.0105, "step": 6700 }, { - "epoch": 0.07, - "learning_rate": 7.907085055597603e-06, - "loss": 4.3704, + "epoch": 0.0936871400622744, + "grad_norm": 16.860300064086914, + "learning_rate": 3.4304279941219696e-05, + "loss": 1.0777, "step": 6800 }, { - "epoch": 0.07, - "learning_rate": 8.023519823019154e-06, - "loss": 4.322, + "epoch": 0.09506489212201372, + "grad_norm": 19.434036254882812, + "learning_rate": 3.48094232182219e-05, + "loss": 1.0434, "step": 6900 }, { - "epoch": 0.07, - "learning_rate": 8.139954590440705e-06, - "loss": 4.3678, + "epoch": 0.09644264418175305, + "grad_norm": 4.839750289916992, + "learning_rate": 3.5314566495224104e-05, + "loss": 0.9474, "step": 7000 }, { - "epoch": 0.07, - "learning_rate": 8.256389357862258e-06, - "loss": 4.2371, + "epoch": 0.09782039624149239, + "grad_norm": 10.522396087646484, + "learning_rate": 3.581970977222631e-05, + "loss": 1.1231, "step": 7100 }, { - "epoch": 0.07, - "learning_rate": 8.37282412528381e-06, - "loss": 4.197, + "epoch": 0.0991981483012317, + "grad_norm": 160.55545043945312, + "learning_rate": 3.6324853049228504e-05, + "loss": 1.0285, "step": 7200 }, { - "epoch": 0.07, - "learning_rate": 8.489258892705362e-06, - "loss": 4.1948, + "epoch": 0.10057590036097104, + "grad_norm": 9.294129371643066, + "learning_rate": 3.682999632623071e-05, + "loss": 1.005, "step": 7300 }, { - "epoch": 0.08, - "learning_rate": 8.605693660126915e-06, - "loss": 4.0807, + "epoch": 0.10195365242071038, + "grad_norm": 20.62299156188965, + "learning_rate": 3.733513960323292e-05, + "loss": 0.9804, "step": 7400 }, { - "epoch": 0.08, - "learning_rate": 8.722128427548466e-06, - "loss": 4.1448, + "epoch": 0.1033314044804497, + "grad_norm": 13.265617370605469, + "learning_rate": 3.784028288023512e-05, + "loss": 1.1047, "step": 7500 }, { - "epoch": 0.08, - "learning_rate": 8.838563194970018e-06, - "loss": 3.9823, + "epoch": 0.10470915654018903, + "grad_norm": 22.576231002807617, + "learning_rate": 3.8345426157237326e-05, + "loss": 0.9803, "step": 7600 }, { - "epoch": 0.08, - "learning_rate": 8.954997962391569e-06, - "loss": 4.0164, + "epoch": 0.10608690859992835, + "grad_norm": 27.096935272216797, + "learning_rate": 3.885056943423953e-05, + "loss": 0.9725, "step": 7700 }, { - "epoch": 0.08, - "learning_rate": 9.071432729813123e-06, - "loss": 4.0373, + "epoch": 0.10746466065966769, + "grad_norm": 41.65309143066406, + "learning_rate": 3.9355712711241734e-05, + "loss": 0.8945, "step": 7800 }, { - "epoch": 0.08, - "learning_rate": 9.187867497234675e-06, - "loss": 3.8453, + "epoch": 0.10884241271940702, + "grad_norm": 3.681422472000122, + "learning_rate": 3.986085598824394e-05, + "loss": 0.9033, "step": 7900 }, { - "epoch": 0.08, - "learning_rate": 9.304302264656226e-06, - "loss": 3.9272, + "epoch": 0.11022016477914634, + "grad_norm": 4.928839683532715, + "learning_rate": 4.036599926524614e-05, + "loss": 1.0033, "step": 8000 }, { - "epoch": 0.08, - "learning_rate": 9.420737032077779e-06, - "loss": 3.7603, + "epoch": 0.11159791683888567, + "grad_norm": 11.218937873840332, + "learning_rate": 4.0871142542248345e-05, + "loss": 1.039, "step": 8100 }, { - "epoch": 0.08, - "learning_rate": 9.53717179949933e-06, - "loss": 3.8101, + "epoch": 0.11297566889862501, + "grad_norm": 8.559927940368652, + "learning_rate": 4.1376285819250556e-05, + "loss": 1.1383, "step": 8200 }, { - "epoch": 0.08, - "learning_rate": 9.653606566920883e-06, - "loss": 3.7365, + "epoch": 0.11435342095836433, + "grad_norm": 7.633176326751709, + "learning_rate": 4.188142909625276e-05, + "loss": 1.1427, "step": 8300 }, { - "epoch": 0.09, - "learning_rate": 9.770041334342436e-06, - "loss": 3.6733, + "epoch": 0.11573117301810366, + "grad_norm": 8.33764934539795, + "learning_rate": 4.238657237325496e-05, + "loss": 1.0857, "step": 8400 }, { - "epoch": 0.09, - "learning_rate": 9.886476101763987e-06, - "loss": 3.594, + "epoch": 0.11710892507784298, + "grad_norm": 10.000191688537598, + "learning_rate": 4.289171565025717e-05, + "loss": 1.084, "step": 8500 }, { - "epoch": 0.09, - "learning_rate": 1.0002910869185538e-05, - "loss": 3.656, + "epoch": 0.11848667713758232, + "grad_norm": 18.05430030822754, + "learning_rate": 4.339180749448935e-05, + "loss": 1.1712, "step": 8600 }, { - "epoch": 0.09, - "learning_rate": 1.0119345636607091e-05, - "loss": 3.5974, + "epoch": 0.11986442919732165, + "grad_norm": 12.45681381225586, + "learning_rate": 4.3896950771491554e-05, + "loss": 1.0295, "step": 8700 }, { - "epoch": 0.09, - "learning_rate": 1.0235780404028644e-05, - "loss": 3.5924, + "epoch": 0.12124218125706097, + "grad_norm": 9.36514663696289, + "learning_rate": 4.440209404849376e-05, + "loss": 1.2121, "step": 8800 }, { - "epoch": 0.09, - "learning_rate": 1.0352215171450195e-05, - "loss": 3.6154, + "epoch": 0.12261993331680031, + "grad_norm": 9.039582252502441, + "learning_rate": 4.4907237325495955e-05, + "loss": 1.1665, "step": 8900 }, { - "epoch": 0.09, - "learning_rate": 1.0468649938871747e-05, - "loss": 3.583, + "epoch": 0.12399768537653964, + "grad_norm": 16.782058715820312, + "learning_rate": 4.5412380602498165e-05, + "loss": 1.1426, "step": 9000 }, { - "epoch": 0.09, - "learning_rate": 1.05850847062933e-05, - "loss": 3.4384, + "epoch": 0.12537543743627896, + "grad_norm": 17.21622085571289, + "learning_rate": 4.591752387950037e-05, + "loss": 1.1173, "step": 9100 }, { - "epoch": 0.09, - "learning_rate": 1.070151947371485e-05, - "loss": 3.352, + "epoch": 0.1267531894960183, + "grad_norm": 6.7519307136535645, + "learning_rate": 4.642266715650257e-05, + "loss": 1.1164, "step": 9200 }, { - "epoch": 0.09, - "learning_rate": 1.0817954241136404e-05, - "loss": 3.3393, + "epoch": 0.12813094155575763, + "grad_norm": 7.507974624633789, + "learning_rate": 4.6927810433504776e-05, + "loss": 1.1192, "step": 9300 }, { - "epoch": 0.1, - "learning_rate": 1.0934389008557957e-05, - "loss": 3.3016, + "epoch": 0.12950869361549697, + "grad_norm": 4.70835018157959, + "learning_rate": 4.743295371050698e-05, + "loss": 1.0741, "step": 9400 }, { - "epoch": 0.1, - "learning_rate": 1.1050823775979508e-05, - "loss": 3.271, + "epoch": 0.13088644567523627, + "grad_norm": 8.935495376586914, + "learning_rate": 4.7938096987509184e-05, + "loss": 1.069, "step": 9500 }, { - "epoch": 0.1, - "learning_rate": 1.1167258543401059e-05, - "loss": 3.1705, + "epoch": 0.1322641977349756, + "grad_norm": 7.942813873291016, + "learning_rate": 4.844324026451139e-05, + "loss": 1.1075, "step": 9600 }, { - "epoch": 0.1, - "learning_rate": 1.1283693310822612e-05, - "loss": 3.1311, + "epoch": 0.13364194979471494, + "grad_norm": 8.648787498474121, + "learning_rate": 4.894838354151359e-05, + "loss": 1.0331, "step": 9700 }, { - "epoch": 0.1, - "learning_rate": 1.1400128078244165e-05, - "loss": 3.0696, + "epoch": 0.13501970185445428, + "grad_norm": 37.376991271972656, + "learning_rate": 4.94535268185158e-05, + "loss": 1.1188, "step": 9800 }, { - "epoch": 0.1, - "learning_rate": 1.1516562845665716e-05, - "loss": 3.046, + "epoch": 0.1363974539141936, + "grad_norm": 8.2828369140625, + "learning_rate": 4.9958670095518006e-05, + "loss": 1.1222, "step": 9900 }, { - "epoch": 0.1, - "learning_rate": 1.1632997613087269e-05, - "loss": 3.0403, + "epoch": 0.13777520597393292, + "grad_norm": 7.8098225593566895, + "learning_rate": 5.046381337252021e-05, + "loss": 1.0751, "step": 10000 }, { - "epoch": 0.1, - "learning_rate": 1.174943238050882e-05, - "loss": 3.1051, + "epoch": 0.13915295803367225, + "grad_norm": 174.5991668701172, + "learning_rate": 5.0968956649522413e-05, + "loss": 1.2117, "step": 10100 }, { - "epoch": 0.1, - "learning_rate": 1.1865867147930371e-05, - "loss": 3.1563, + "epoch": 0.1405307100934116, + "grad_norm": 9.941773414611816, + "learning_rate": 5.147409992652462e-05, + "loss": 1.1293, "step": 10200 }, { - "epoch": 0.1, - "learning_rate": 1.1982301915351924e-05, - "loss": 3.176, + "epoch": 0.14190846215315092, + "grad_norm": 53.449195861816406, + "learning_rate": 5.197924320352682e-05, + "loss": 1.1948, "step": 10300 }, { - "epoch": 0.11, - "learning_rate": 1.2098736682773477e-05, - "loss": 3.0937, + "epoch": 0.14328621421289026, + "grad_norm": 9.17073917388916, + "learning_rate": 5.2484386480529025e-05, + "loss": 1.105, "step": 10400 }, { - "epoch": 0.11, - "learning_rate": 1.2215171450195028e-05, - "loss": 2.9902, + "epoch": 0.1446639662726296, + "grad_norm": 13.831076622009277, + "learning_rate": 5.298952975753123e-05, + "loss": 1.2271, "step": 10500 }, { - "epoch": 0.11, - "learning_rate": 1.233160621761658e-05, - "loss": 3.0412, + "epoch": 0.1460417183323689, + "grad_norm": 10.086932182312012, + "learning_rate": 5.349467303453344e-05, + "loss": 1.1864, "step": 10600 }, { - "epoch": 0.11, - "learning_rate": 1.2448040985038133e-05, - "loss": 2.9516, + "epoch": 0.14741947039210823, + "grad_norm": 4.469985008239746, + "learning_rate": 5.399981631153564e-05, + "loss": 1.2172, "step": 10700 }, { - "epoch": 0.11, - "learning_rate": 1.2564475752459685e-05, - "loss": 2.8995, + "epoch": 0.14879722245184757, + "grad_norm": 9.413055419921875, + "learning_rate": 5.450495958853784e-05, + "loss": 1.1877, "step": 10800 }, { - "epoch": 0.11, - "learning_rate": 1.2680910519881237e-05, - "loss": 2.943, + "epoch": 0.1501749745115869, + "grad_norm": 26.23496437072754, + "learning_rate": 5.499999999988319e-05, + "loss": 1.2923, "step": 10900 }, { - "epoch": 0.11, - "learning_rate": 1.279734528730279e-05, - "loss": 2.7188, + "epoch": 0.15155272657132624, + "grad_norm": 7.072531223297119, + "learning_rate": 5.499999880838769e-05, + "loss": 1.1533, "step": 11000 }, { - "epoch": 0.11, - "learning_rate": 1.2912615707050124e-05, - "loss": 2.8802, + "epoch": 0.15293047863106554, + "grad_norm": 26.590896606445312, + "learning_rate": 5.499999528062659e-05, + "loss": 1.1943, "step": 11100 }, { - "epoch": 0.11, - "learning_rate": 1.3029050474471679e-05, - "loss": 2.8186, + "epoch": 0.15430823069080488, + "grad_norm": 15.353813171386719, + "learning_rate": 5.499998941660022e-05, + "loss": 1.1975, "step": 11200 }, { - "epoch": 0.12, - "learning_rate": 1.314548524189323e-05, - "loss": 2.8054, + "epoch": 0.1556859827505442, + "grad_norm": 10.522014617919922, + "learning_rate": 5.499998121630905e-05, + "loss": 1.2506, "step": 11300 }, { - "epoch": 0.12, - "learning_rate": 1.3261920009314781e-05, - "loss": 2.725, + "epoch": 0.15706373481028355, + "grad_norm": 127.85610961914062, + "learning_rate": 5.499997067975379e-05, + "loss": 1.2472, "step": 11400 }, { - "epoch": 0.12, - "learning_rate": 1.3378354776736334e-05, - "loss": 2.7341, + "epoch": 0.15844148687002288, + "grad_norm": 16.62019157409668, + "learning_rate": 5.4999957806935333e-05, + "loss": 1.2199, "step": 11500 }, { - "epoch": 0.12, - "learning_rate": 1.3494789544157885e-05, - "loss": 2.8901, + "epoch": 0.1598192389297622, + "grad_norm": 30.395166397094727, + "learning_rate": 5.499994259785477e-05, + "loss": 1.3408, "step": 11600 }, { - "epoch": 0.12, - "learning_rate": 1.3611224311579438e-05, - "loss": 2.7045, + "epoch": 0.16119699098950152, + "grad_norm": 9.531847953796387, + "learning_rate": 5.49999250525134e-05, + "loss": 1.2979, "step": 11700 }, { - "epoch": 0.12, - "learning_rate": 1.372765907900099e-05, - "loss": 2.7249, + "epoch": 0.16257474304924086, + "grad_norm": 13.844141960144043, + "learning_rate": 5.49999051709127e-05, + "loss": 1.1795, "step": 11800 }, { - "epoch": 0.12, - "learning_rate": 1.3844093846422542e-05, - "loss": 2.5729, + "epoch": 0.1639524951089802, + "grad_norm": 10.223821640014648, + "learning_rate": 5.4999882953054366e-05, + "loss": 1.2488, "step": 11900 }, { - "epoch": 0.12, - "learning_rate": 1.3960528613844094e-05, - "loss": 2.588, + "epoch": 0.16533024716871952, + "grad_norm": 4.547140121459961, + "learning_rate": 5.4999858398940294e-05, + "loss": 1.2205, "step": 12000 }, { - "epoch": 0.12, - "learning_rate": 1.4076963381265645e-05, - "loss": 2.6701, + "epoch": 0.16670799922845886, + "grad_norm": 21.750076293945312, + "learning_rate": 5.4999831508572554e-05, + "loss": 1.1902, "step": 12100 }, { - "epoch": 0.12, - "learning_rate": 1.41933981486872e-05, - "loss": 2.5204, + "epoch": 0.16808575128819817, + "grad_norm": 12.58484935760498, + "learning_rate": 5.499980228195345e-05, + "loss": 1.192, "step": 12200 }, { - "epoch": 0.13, - "learning_rate": 1.430983291610875e-05, - "loss": 2.7362, + "epoch": 0.1694635033479375, + "grad_norm": 19.202035903930664, + "learning_rate": 5.499977071908545e-05, + "loss": 1.1948, "step": 12300 }, { - "epoch": 0.13, - "learning_rate": 1.4426267683530302e-05, - "loss": 2.5727, + "epoch": 0.17084125540767683, + "grad_norm": 24.988405227661133, + "learning_rate": 5.4999736819971234e-05, + "loss": 1.2013, "step": 12400 }, { - "epoch": 0.13, - "learning_rate": 1.4542702450951855e-05, - "loss": 2.4632, + "epoch": 0.17221900746741617, + "grad_norm": 12.41451358795166, + "learning_rate": 5.499970058461369e-05, + "loss": 1.2893, "step": 12500 }, { - "epoch": 0.13, - "learning_rate": 1.4659137218373406e-05, - "loss": 2.4762, + "epoch": 0.1735967595271555, + "grad_norm": 10.051517486572266, + "learning_rate": 5.49996620130159e-05, + "loss": 1.2852, "step": 12600 }, { - "epoch": 0.13, - "learning_rate": 1.4775571985794959e-05, - "loss": 2.4738, + "epoch": 0.1749745115868948, + "grad_norm": 7.056366443634033, + "learning_rate": 5.499962110518112e-05, + "loss": 1.2215, "step": 12700 }, { - "epoch": 0.13, - "learning_rate": 1.4892006753216512e-05, - "loss": 2.5803, + "epoch": 0.17635226364663414, + "grad_norm": 15.381092071533203, + "learning_rate": 5.4999577861112856e-05, + "loss": 1.3229, "step": 12800 }, { - "epoch": 0.13, - "learning_rate": 1.5008441520638063e-05, - "loss": 2.5389, + "epoch": 0.17773001570637348, + "grad_norm": 9.423325538635254, + "learning_rate": 5.4999532280814754e-05, + "loss": 1.1513, "step": 12900 }, { - "epoch": 0.13, - "learning_rate": 1.5124876288059614e-05, - "loss": 2.4748, + "epoch": 0.1791077677661128, + "grad_norm": 9.337259292602539, + "learning_rate": 5.4999484364290714e-05, + "loss": 1.1022, "step": 13000 }, { - "epoch": 0.13, - "learning_rate": 1.5241311055481166e-05, - "loss": 2.5047, + "epoch": 0.18048551982585215, + "grad_norm": 8.747264862060547, + "learning_rate": 5.499943411154478e-05, + "loss": 1.2086, "step": 13100 }, { - "epoch": 0.13, - "learning_rate": 1.5357745822902717e-05, - "loss": 2.3426, + "epoch": 0.18186327188559148, + "grad_norm": 74.09689331054688, + "learning_rate": 5.4999381522581234e-05, + "loss": 1.1169, "step": 13200 }, { - "epoch": 0.14, - "learning_rate": 1.547418059032427e-05, - "loss": 2.3343, + "epoch": 0.1832410239453308, + "grad_norm": 14.921662330627441, + "learning_rate": 5.4999326597404536e-05, + "loss": 1.1987, "step": 13300 }, { - "epoch": 0.14, - "learning_rate": 1.5590615357745826e-05, - "loss": 2.4218, + "epoch": 0.18461877600507012, + "grad_norm": 6.766313076019287, + "learning_rate": 5.499926933601937e-05, + "loss": 1.2041, "step": 13400 }, { - "epoch": 0.14, - "learning_rate": 1.5707050125167377e-05, - "loss": 2.435, + "epoch": 0.18599652806480946, + "grad_norm": 15.904729843139648, + "learning_rate": 5.4999209738430575e-05, + "loss": 1.1398, "step": 13500 }, { - "epoch": 0.14, - "learning_rate": 1.582348489258893e-05, - "loss": 2.3447, + "epoch": 0.1873742801245488, + "grad_norm": 4.456528663635254, + "learning_rate": 5.499914780464323e-05, + "loss": 1.2177, "step": 13600 }, { - "epoch": 0.14, - "learning_rate": 1.593991966001048e-05, - "loss": 2.247, + "epoch": 0.18875203218428813, + "grad_norm": 137.29202270507812, + "learning_rate": 5.49990835346626e-05, + "loss": 1.1807, "step": 13700 }, { - "epoch": 0.14, - "learning_rate": 1.605635442743203e-05, - "loss": 2.3916, + "epoch": 0.19012978424402743, + "grad_norm": 4.208902359008789, + "learning_rate": 5.499901692849414e-05, + "loss": 1.2225, "step": 13800 }, { - "epoch": 0.14, - "learning_rate": 1.6172789194853582e-05, - "loss": 2.2815, + "epoch": 0.19150753630376677, + "grad_norm": 28.881311416625977, + "learning_rate": 5.49989479861435e-05, + "loss": 1.2106, "step": 13900 }, { - "epoch": 0.14, - "learning_rate": 1.6289223962275137e-05, - "loss": 2.2443, + "epoch": 0.1928852883635061, + "grad_norm": 26.017873764038086, + "learning_rate": 5.4998876707616556e-05, + "loss": 1.2853, "step": 14000 }, { - "epoch": 0.14, - "learning_rate": 1.6405658729696688e-05, - "loss": 2.3048, + "epoch": 0.19426304042324544, + "grad_norm": 40.50442886352539, + "learning_rate": 5.4998803092919346e-05, + "loss": 1.2832, "step": 14100 }, { - "epoch": 0.14, - "learning_rate": 1.652209349711824e-05, - "loss": 2.0418, + "epoch": 0.19564079248298477, + "grad_norm": 23.924936294555664, + "learning_rate": 5.499872714205813e-05, + "loss": 1.1663, "step": 14200 }, { - "epoch": 0.15, - "learning_rate": 1.663852826453979e-05, - "loss": 2.2515, + "epoch": 0.19701854454272408, + "grad_norm": 15.067869186401367, + "learning_rate": 5.499864964947351e-05, + "loss": 1.4564, "step": 14300 }, { - "epoch": 0.15, - "learning_rate": 1.6753798684287127e-05, - "loss": 2.2287, + "epoch": 0.1983962966024634, + "grad_norm": 17.385555267333984, + "learning_rate": 5.499856904966531e-05, + "loss": 1.3213, "step": 14400 }, { - "epoch": 0.15, - "learning_rate": 1.687023345170868e-05, - "loss": 2.2754, + "epoch": 0.19977404866220275, + "grad_norm": 7.320497035980225, + "learning_rate": 5.499848611371299e-05, + "loss": 1.2966, "step": 14500 }, { - "epoch": 0.15, - "learning_rate": 1.6986668219130233e-05, - "loss": 2.2174, + "epoch": 0.20115180072194208, + "grad_norm": 52.53131866455078, + "learning_rate": 5.49984008416236e-05, + "loss": 1.2964, "step": 14600 }, { - "epoch": 0.15, - "learning_rate": 1.7103102986551784e-05, - "loss": 2.1102, + "epoch": 0.20252955278168142, + "grad_norm": 16.048524856567383, + "learning_rate": 5.499831323340437e-05, + "loss": 1.3015, "step": 14700 }, { - "epoch": 0.15, - "learning_rate": 1.721953775397334e-05, - "loss": 2.2028, + "epoch": 0.20390730484142075, + "grad_norm": 16.255613327026367, + "learning_rate": 5.4998223289062754e-05, + "loss": 1.3644, "step": 14800 }, { - "epoch": 0.15, - "learning_rate": 1.733597252139489e-05, - "loss": 2.0597, + "epoch": 0.20528505690116006, + "grad_norm": 8.616049766540527, + "learning_rate": 5.4998131008606386e-05, + "loss": 1.327, "step": 14900 }, { - "epoch": 0.15, - "learning_rate": 1.745240728881644e-05, - "loss": 2.0455, + "epoch": 0.2066628089608994, + "grad_norm": 51.175228118896484, + "learning_rate": 5.499803639204311e-05, + "loss": 1.2566, "step": 15000 }, { - "epoch": 0.15, - "learning_rate": 1.7568842056237992e-05, - "loss": 2.0336, + "epoch": 0.20804056102063873, + "grad_norm": 7.27580451965332, + "learning_rate": 5.499793943938097e-05, + "loss": 1.3868, "step": 15100 }, { - "epoch": 0.15, - "learning_rate": 1.7685276823659547e-05, - "loss": 2.1017, + "epoch": 0.20941831308037806, + "grad_norm": 7.9837517738342285, + "learning_rate": 5.4997840150628194e-05, + "loss": 1.2163, "step": 15200 }, { - "epoch": 0.16, - "learning_rate": 1.7801711591081098e-05, - "loss": 2.153, + "epoch": 0.2107960651401174, + "grad_norm": 13.115901947021484, + "learning_rate": 5.4997738525793216e-05, + "loss": 1.1793, "step": 15300 }, { - "epoch": 0.16, - "learning_rate": 1.791814635850265e-05, - "loss": 2.0786, + "epoch": 0.2121738171998567, + "grad_norm": 11.005683898925781, + "learning_rate": 5.4997634564884676e-05, + "loss": 1.2811, "step": 15400 }, { - "epoch": 0.16, - "learning_rate": 1.80345811259242e-05, - "loss": 2.0992, + "epoch": 0.21355156925959604, + "grad_norm": 12.336491584777832, + "learning_rate": 5.4997528267911405e-05, + "loss": 1.2793, "step": 15500 }, { - "epoch": 0.16, - "learning_rate": 1.815101589334575e-05, - "loss": 2.1288, + "epoch": 0.21492932131933537, + "grad_norm": 9.77892780303955, + "learning_rate": 5.499741963488242e-05, + "loss": 1.3607, "step": 15600 }, { - "epoch": 0.16, - "learning_rate": 1.8267450660767303e-05, - "loss": 2.16, + "epoch": 0.2163070733790747, + "grad_norm": 9.016510963439941, + "learning_rate": 5.499730866580697e-05, + "loss": 1.2084, "step": 15700 }, { - "epoch": 0.16, - "learning_rate": 1.838388542818886e-05, - "loss": 2.1414, + "epoch": 0.21768482543881404, + "grad_norm": 19.17759895324707, + "learning_rate": 5.4997195360694475e-05, + "loss": 1.2585, "step": 15800 }, { - "epoch": 0.16, - "learning_rate": 1.8500320195610412e-05, - "loss": 2.1669, + "epoch": 0.21906257749855337, + "grad_norm": 12.264784812927246, + "learning_rate": 5.499707971955455e-05, + "loss": 1.2381, "step": 15900 }, { - "epoch": 0.16, - "learning_rate": 1.8616754963031963e-05, - "loss": 2.1529, + "epoch": 0.22044032955829268, + "grad_norm": 14.009553909301758, + "learning_rate": 5.499696174239703e-05, + "loss": 1.2221, "step": 16000 }, { - "epoch": 0.16, - "learning_rate": 1.8733189730453514e-05, - "loss": 2.1362, + "epoch": 0.22181808161803201, + "grad_norm": 5.409763813018799, + "learning_rate": 5.499684142923195e-05, + "loss": 1.2755, "step": 16100 }, { - "epoch": 0.17, - "learning_rate": 1.8849624497875066e-05, - "loss": 2.1828, + "epoch": 0.22319583367777135, + "grad_norm": 14.818997383117676, + "learning_rate": 5.499671878006951e-05, + "loss": 1.3131, "step": 16200 }, { - "epoch": 0.17, - "learning_rate": 1.8966059265296617e-05, - "loss": 2.0154, + "epoch": 0.22457358573751068, + "grad_norm": 18.775480270385742, + "learning_rate": 5.4996593794920135e-05, + "loss": 1.111, "step": 16300 }, { - "epoch": 0.17, - "learning_rate": 1.9082494032718168e-05, - "loss": 2.2411, + "epoch": 0.22595133779725002, + "grad_norm": 2.511378526687622, + "learning_rate": 5.499646647379445e-05, + "loss": 1.1362, "step": 16400 }, { - "epoch": 0.17, - "learning_rate": 1.9198928800139723e-05, - "loss": 2.0872, + "epoch": 0.22732908985698932, + "grad_norm": 8.04574203491211, + "learning_rate": 5.4996336816703265e-05, + "loss": 1.1849, "step": 16500 }, { - "epoch": 0.17, - "learning_rate": 1.9315363567561274e-05, - "loss": 1.8982, + "epoch": 0.22870684191672866, + "grad_norm": 6.202666282653809, + "learning_rate": 5.4996204823657594e-05, + "loss": 1.2502, "step": 16600 }, { - "epoch": 0.17, - "learning_rate": 1.9431798334982825e-05, - "loss": 1.9847, + "epoch": 0.230084593976468, + "grad_norm": 20.663745880126953, + "learning_rate": 5.4996070494668656e-05, + "loss": 1.1894, "step": 16700 }, { - "epoch": 0.17, - "learning_rate": 1.954823310240438e-05, - "loss": 1.9877, + "epoch": 0.23146234603620733, + "grad_norm": 10.979053497314453, + "learning_rate": 5.499593382974786e-05, + "loss": 1.1144, "step": 16800 }, { - "epoch": 0.17, - "learning_rate": 1.966466786982593e-05, - "loss": 2.2184, + "epoch": 0.23284009809594666, + "grad_norm": 28.4576358795166, + "learning_rate": 5.499579482890682e-05, + "loss": 1.1706, "step": 16900 }, { - "epoch": 0.17, - "learning_rate": 1.9781102637247482e-05, - "loss": 2.0553, + "epoch": 0.23421785015568597, + "grad_norm": 24.407394409179688, + "learning_rate": 5.499565349215733e-05, + "loss": 1.3014, "step": 17000 }, { - "epoch": 0.17, - "learning_rate": 1.9897537404669037e-05, - "loss": 2.0305, + "epoch": 0.2355956022154253, + "grad_norm": 6.59324836730957, + "learning_rate": 5.499550981951142e-05, + "loss": 1.2647, "step": 17100 }, { - "epoch": 0.18, - "learning_rate": 2.0013972172090588e-05, - "loss": 1.9608, + "epoch": 0.23697335427516464, + "grad_norm": 19.240928649902344, + "learning_rate": 5.4995363810981284e-05, + "loss": 1.0861, "step": 17200 }, { - "epoch": 0.18, - "learning_rate": 2.013040693951214e-05, - "loss": 1.8904, + "epoch": 0.23835110633490397, + "grad_norm": 19.197452545166016, + "learning_rate": 5.499521546657932e-05, + "loss": 1.1999, "step": 17300 }, { - "epoch": 0.18, - "learning_rate": 2.024684170693369e-05, - "loss": 1.9832, + "epoch": 0.2397288583946433, + "grad_norm": 58.96784210205078, + "learning_rate": 5.4995064786318154e-05, + "loss": 1.2032, "step": 17400 }, { - "epoch": 0.18, - "learning_rate": 2.0363276474355242e-05, - "loss": 1.9605, + "epoch": 0.24110661045438264, + "grad_norm": 9.024454116821289, + "learning_rate": 5.4994911770210557e-05, + "loss": 1.3062, "step": 17500 }, { - "epoch": 0.18, - "learning_rate": 2.0479711241776793e-05, - "loss": 1.9082, + "epoch": 0.24248436251412195, + "grad_norm": 19.052547454833984, + "learning_rate": 5.4994757983351285e-05, + "loss": 1.2405, "step": 17600 }, { - "epoch": 0.18, - "learning_rate": 2.0596146009198344e-05, - "loss": 1.8935, + "epoch": 0.24386211457386128, + "grad_norm": 13.851653099060059, + "learning_rate": 5.4994600318948186e-05, + "loss": 1.3275, "step": 17700 }, { - "epoch": 0.18, - "learning_rate": 2.07125807766199e-05, - "loss": 2.0428, + "epoch": 0.24523986663360062, + "grad_norm": 27.99448013305664, + "learning_rate": 5.499444031873814e-05, + "loss": 1.1994, "step": 17800 }, { - "epoch": 0.18, - "learning_rate": 2.0829015544041453e-05, - "loss": 1.9269, + "epoch": 0.24661761869333995, + "grad_norm": 12.64948844909668, + "learning_rate": 5.4994277982734713e-05, + "loss": 1.1713, "step": 17900 }, { - "epoch": 0.18, - "learning_rate": 2.0945450311463005e-05, - "loss": 1.9049, + "epoch": 0.2479953707530793, + "grad_norm": 8.357621192932129, + "learning_rate": 5.499411331095172e-05, + "loss": 1.2516, "step": 18000 }, { - "epoch": 0.18, - "learning_rate": 2.1061885078884556e-05, - "loss": 1.8559, + "epoch": 0.2493731228128186, + "grad_norm": 14.239299774169922, + "learning_rate": 5.499394630340314e-05, + "loss": 1.2017, "step": 18100 }, { - "epoch": 0.19, - "learning_rate": 2.1178319846306107e-05, - "loss": 1.8794, + "epoch": 0.2507508748725579, + "grad_norm": 11.225935935974121, + "learning_rate": 5.499377696010317e-05, + "loss": 1.1938, "step": 18200 }, { - "epoch": 0.19, - "learning_rate": 2.1294754613727658e-05, - "loss": 1.9616, + "epoch": 0.25212862693229726, + "grad_norm": 17.634973526000977, + "learning_rate": 5.499360528106618e-05, + "loss": 1.254, "step": 18300 }, { - "epoch": 0.19, - "learning_rate": 2.1410025033474998e-05, - "loss": 1.9527, + "epoch": 0.2535063789920366, + "grad_norm": 10.021209716796875, + "learning_rate": 5.499343126630677e-05, + "loss": 1.1603, "step": 18400 }, { - "epoch": 0.19, - "learning_rate": 2.152645980089655e-05, - "loss": 1.8142, + "epoch": 0.25488413105177593, + "grad_norm": 26.679080963134766, + "learning_rate": 5.499325491583972e-05, + "loss": 1.1749, "step": 18500 }, { - "epoch": 0.19, - "learning_rate": 2.16428945683181e-05, - "loss": 1.8426, + "epoch": 0.25626188311151527, + "grad_norm": 14.252337455749512, + "learning_rate": 5.4993076229680005e-05, + "loss": 1.1853, "step": 18600 }, { - "epoch": 0.19, - "learning_rate": 2.175932933573965e-05, - "loss": 1.8306, + "epoch": 0.2576396351712546, + "grad_norm": 14.54755973815918, + "learning_rate": 5.499289520784281e-05, + "loss": 1.2063, "step": 18700 }, { - "epoch": 0.19, - "learning_rate": 2.1875764103161203e-05, - "loss": 1.7197, + "epoch": 0.25901738723099393, + "grad_norm": 10.576868057250977, + "learning_rate": 5.499271185034352e-05, + "loss": 1.197, "step": 18800 }, { - "epoch": 0.19, - "learning_rate": 2.1992198870582757e-05, - "loss": 1.9279, + "epoch": 0.2603951392907332, + "grad_norm": 8.283714294433594, + "learning_rate": 5.499252615719771e-05, + "loss": 1.2528, "step": 18900 }, { - "epoch": 0.19, - "learning_rate": 2.2107469290330094e-05, - "loss": 1.7236, + "epoch": 0.26177289135047255, + "grad_norm": 9.045394897460938, + "learning_rate": 5.499233812842115e-05, + "loss": 1.1031, "step": 19000 }, { - "epoch": 0.19, - "learning_rate": 2.2223904057751645e-05, - "loss": 1.9686, + "epoch": 0.2631506434102119, + "grad_norm": 9.431145668029785, + "learning_rate": 5.4992147764029804e-05, + "loss": 1.113, "step": 19100 }, { - "epoch": 0.2, - "learning_rate": 2.2340338825173196e-05, - "loss": 1.8834, + "epoch": 0.2645283954699512, + "grad_norm": 12.061200141906738, + "learning_rate": 5.499195506403987e-05, + "loss": 1.2396, "step": 19200 }, { - "epoch": 0.2, - "learning_rate": 2.2456773592594747e-05, - "loss": 1.8359, + "epoch": 0.26590614752969055, + "grad_norm": 11.876815795898438, + "learning_rate": 5.49917600284677e-05, + "loss": 1.2091, "step": 19300 }, { - "epoch": 0.2, - "learning_rate": 2.2573208360016302e-05, - "loss": 1.9224, + "epoch": 0.2672838995894299, + "grad_norm": 15.724605560302734, + "learning_rate": 5.499156464260224e-05, + "loss": 1.2188, "step": 19400 }, { - "epoch": 0.2, - "learning_rate": 2.2689643127437853e-05, - "loss": 1.9865, + "epoch": 0.2686616516491692, + "grad_norm": 25.08930015563965, + "learning_rate": 5.499136495927092e-05, + "loss": 1.1141, "step": 19500 }, { - "epoch": 0.2, - "learning_rate": 2.2806077894859408e-05, - "loss": 1.7864, + "epoch": 0.27003940370890855, + "grad_norm": 8.645094871520996, + "learning_rate": 5.499116294040751e-05, + "loss": 1.146, "step": 19600 }, { - "epoch": 0.2, - "learning_rate": 2.292251266228096e-05, - "loss": 1.869, + "epoch": 0.2714171557686479, + "grad_norm": 32.26475143432617, + "learning_rate": 5.499095858602915e-05, + "loss": 1.2498, "step": 19700 }, { - "epoch": 0.2, - "learning_rate": 2.303894742970251e-05, - "loss": 1.7395, + "epoch": 0.2727949078283872, + "grad_norm": 7.541633129119873, + "learning_rate": 5.499075189615322e-05, + "loss": 1.1223, "step": 19800 }, { - "epoch": 0.2, - "learning_rate": 2.315538219712406e-05, - "loss": 1.827, + "epoch": 0.27417265988812656, + "grad_norm": 12.944062232971191, + "learning_rate": 5.4990542870797286e-05, + "loss": 1.1919, "step": 19900 }, { - "epoch": 0.2, - "learning_rate": 2.3271816964545613e-05, - "loss": 1.8106, + "epoch": 0.27555041194786584, + "grad_norm": 18.79230499267578, + "learning_rate": 5.499033150997908e-05, + "loss": 1.2171, "step": 20000 }, { - "epoch": 0.2, - "learning_rate": 2.3388251731967167e-05, - "loss": 1.8351, + "epoch": 0.27692816400760517, + "grad_norm": 5.1296772956848145, + "learning_rate": 5.499011781371659e-05, + "loss": 1.2126, "step": 20100 }, { - "epoch": 0.21, - "learning_rate": 2.350468649938872e-05, - "loss": 2.009, + "epoch": 0.2783059160673445, + "grad_norm": 13.036163330078125, + "learning_rate": 5.4989901782027935e-05, + "loss": 1.2454, "step": 20200 }, { - "epoch": 0.21, - "learning_rate": 2.362112126681027e-05, - "loss": 1.8558, + "epoch": 0.27968366812708384, + "grad_norm": 14.533465385437012, + "learning_rate": 5.498968341493149e-05, + "loss": 1.2029, "step": 20300 }, { - "epoch": 0.21, - "learning_rate": 2.373755603423182e-05, - "loss": 1.9493, + "epoch": 0.2810614201868232, + "grad_norm": 11.073671340942383, + "learning_rate": 5.4989462712445804e-05, + "loss": 1.1959, "step": 20400 }, { - "epoch": 0.21, - "learning_rate": 2.3853990801653372e-05, - "loss": 1.7777, + "epoch": 0.2824391722465625, + "grad_norm": 11.641937255859375, + "learning_rate": 5.4989239674589635e-05, + "loss": 1.2327, "step": 20500 }, { - "epoch": 0.21, - "learning_rate": 2.3970425569074927e-05, - "loss": 1.8565, + "epoch": 0.28381692430630184, + "grad_norm": 9.438309669494629, + "learning_rate": 5.4989014301381915e-05, + "loss": 1.3567, "step": 20600 }, { - "epoch": 0.21, - "learning_rate": 2.408686033649648e-05, - "loss": 1.7888, + "epoch": 0.2851946763660412, + "grad_norm": 14.73698902130127, + "learning_rate": 5.4988786592841795e-05, + "loss": 1.1945, "step": 20700 }, { - "epoch": 0.21, - "learning_rate": 2.4203295103918033e-05, - "loss": 1.6928, + "epoch": 0.2865724284257805, + "grad_norm": 27.582157135009766, + "learning_rate": 5.498855654898862e-05, + "loss": 1.3459, "step": 20800 }, { - "epoch": 0.21, - "learning_rate": 2.4319729871339584e-05, - "loss": 1.8924, + "epoch": 0.28795018048551985, + "grad_norm": 12.265806198120117, + "learning_rate": 5.498832416984193e-05, + "loss": 1.2672, "step": 20900 }, { - "epoch": 0.21, - "learning_rate": 2.4436164638761135e-05, - "loss": 1.9816, + "epoch": 0.2893279325452592, + "grad_norm": 14.940210342407227, + "learning_rate": 5.498808945542149e-05, + "loss": 1.2263, "step": 21000 }, { - "epoch": 0.21, - "learning_rate": 2.4552599406182686e-05, - "loss": 1.9393, + "epoch": 0.29070568460499846, + "grad_norm": 8.485940933227539, + "learning_rate": 5.49878547878034e-05, + "loss": 1.2284, "step": 21100 }, { - "epoch": 0.22, - "learning_rate": 2.4669034173604238e-05, - "loss": 1.8131, + "epoch": 0.2920834366647378, + "grad_norm": 48.134517669677734, + "learning_rate": 5.498761542624767e-05, + "loss": 1.1747, "step": 21200 }, { - "epoch": 0.22, - "learning_rate": 2.478546894102579e-05, - "loss": 1.7091, + "epoch": 0.29346118872447713, + "grad_norm": 10.468015670776367, + "learning_rate": 5.498737372947838e-05, + "loss": 1.2562, "step": 21300 }, { - "epoch": 0.22, - "learning_rate": 2.4901903708447343e-05, - "loss": 1.6654, + "epoch": 0.29483894078421646, + "grad_norm": 6.899281978607178, + "learning_rate": 5.4987129697516074e-05, + "loss": 1.2914, "step": 21400 }, { - "epoch": 0.22, - "learning_rate": 2.5018338475868895e-05, - "loss": 1.7938, + "epoch": 0.2962166928439558, + "grad_norm": 28.62447166442871, + "learning_rate": 5.498688333038148e-05, + "loss": 1.1887, "step": 21500 }, { - "epoch": 0.22, - "learning_rate": 2.513477324329045e-05, - "loss": 1.8864, + "epoch": 0.29759444490369513, + "grad_norm": 15.736666679382324, + "learning_rate": 5.4986634628095516e-05, + "loss": 1.2271, "step": 21600 }, { - "epoch": 0.22, - "learning_rate": 2.5251208010712e-05, - "loss": 1.7026, + "epoch": 0.29897219696343447, + "grad_norm": 15.980098724365234, + "learning_rate": 5.498638359067933e-05, + "loss": 1.2464, "step": 21700 }, { - "epoch": 0.22, - "learning_rate": 2.536764277813355e-05, - "loss": 1.7584, + "epoch": 0.3003499490231738, + "grad_norm": 20.236129760742188, + "learning_rate": 5.498613021815423e-05, + "loss": 1.2531, "step": 21800 }, { - "epoch": 0.22, - "learning_rate": 2.5484077545555103e-05, - "loss": 1.7256, + "epoch": 0.30172770108291314, + "grad_norm": 16.427574157714844, + "learning_rate": 5.498587451054176e-05, + "loss": 1.1831, "step": 21900 }, { - "epoch": 0.22, - "learning_rate": 2.5599347965302443e-05, - "loss": 1.8007, + "epoch": 0.30310545314265247, + "grad_norm": 10.426117897033691, + "learning_rate": 5.4985616467863624e-05, + "loss": 1.1913, "step": 22000 }, { - "epoch": 0.23, - "learning_rate": 2.5715782732723994e-05, - "loss": 1.8303, + "epoch": 0.3044832052023918, + "grad_norm": 11.521761894226074, + "learning_rate": 5.498535609014175e-05, + "loss": 1.0736, "step": 22100 }, { - "epoch": 0.23, - "learning_rate": 2.5832217500145545e-05, - "loss": 1.7349, + "epoch": 0.3058609572621311, + "grad_norm": 6.655721664428711, + "learning_rate": 5.498509337739827e-05, + "loss": 1.2219, "step": 22200 }, { - "epoch": 0.23, - "learning_rate": 2.5948652267567096e-05, - "loss": 1.7948, + "epoch": 0.3072387093218704, + "grad_norm": 19.005769729614258, + "learning_rate": 5.49848283296555e-05, + "loss": 1.2403, "step": 22300 }, { - "epoch": 0.23, - "learning_rate": 2.6065087034988647e-05, - "loss": 1.8688, + "epoch": 0.30861646138160975, + "grad_norm": 10.534347534179688, + "learning_rate": 5.4984560946935936e-05, + "loss": 1.2538, "step": 22400 }, { - "epoch": 0.23, - "learning_rate": 2.61815218024102e-05, - "loss": 1.694, + "epoch": 0.3099942134413491, + "grad_norm": 20.09412956237793, + "learning_rate": 5.498429122926232e-05, + "loss": 1.2081, "step": 22500 }, { - "epoch": 0.23, - "learning_rate": 2.6297956569831753e-05, - "loss": 1.9865, + "epoch": 0.3113719655010884, + "grad_norm": 16.096044540405273, + "learning_rate": 5.498401917665756e-05, + "loss": 1.1684, "step": 22600 }, { - "epoch": 0.23, - "learning_rate": 2.6414391337253304e-05, - "loss": 1.6756, + "epoch": 0.31274971756082776, + "grad_norm": 20.909530639648438, + "learning_rate": 5.498374478914475e-05, + "loss": 1.1313, "step": 22700 }, { - "epoch": 0.23, - "learning_rate": 2.6530826104674856e-05, - "loss": 1.8416, + "epoch": 0.3141274696205671, + "grad_norm": 6.890002250671387, + "learning_rate": 5.4983468066747225e-05, + "loss": 1.0835, "step": 22800 }, { - "epoch": 0.23, - "learning_rate": 2.6647260872096407e-05, - "loss": 1.7703, + "epoch": 0.3155052216803064, + "grad_norm": 16.735523223876953, + "learning_rate": 5.498318900948848e-05, + "loss": 1.2713, "step": 22900 }, { - "epoch": 0.23, - "learning_rate": 2.676369563951796e-05, - "loss": 1.6477, + "epoch": 0.31688297374004576, + "grad_norm": 16.891698837280273, + "learning_rate": 5.498290761739222e-05, + "loss": 1.1909, "step": 23000 }, { - "epoch": 0.24, - "learning_rate": 2.6880130406939513e-05, - "loss": 1.6786, + "epoch": 0.3182607257997851, + "grad_norm": 13.88641357421875, + "learning_rate": 5.498262389048237e-05, + "loss": 1.2079, "step": 23100 }, { - "epoch": 0.24, - "learning_rate": 2.6996565174361067e-05, - "loss": 1.7777, + "epoch": 0.3196384778595244, + "grad_norm": 7.386116981506348, + "learning_rate": 5.498233782878301e-05, + "loss": 1.2487, "step": 23200 }, { - "epoch": 0.24, - "learning_rate": 2.711299994178262e-05, - "loss": 1.714, + "epoch": 0.3210162299192637, + "grad_norm": 13.648170471191406, + "learning_rate": 5.498204943231846e-05, + "loss": 1.1771, "step": 23300 }, { - "epoch": 0.24, - "learning_rate": 2.722943470920417e-05, - "loss": 1.6982, + "epoch": 0.32239398197900304, + "grad_norm": 24.235118865966797, + "learning_rate": 5.49817587011132e-05, + "loss": 1.0437, "step": 23400 }, { - "epoch": 0.24, - "learning_rate": 2.734586947662572e-05, - "loss": 1.9178, + "epoch": 0.3237717340387424, + "grad_norm": 16.021926879882812, + "learning_rate": 5.498146563519196e-05, + "loss": 1.2237, "step": 23500 }, { - "epoch": 0.24, - "learning_rate": 2.7462304244047272e-05, - "loss": 1.8461, + "epoch": 0.3251494860984817, + "grad_norm": 28.705358505249023, + "learning_rate": 5.498117023457961e-05, + "loss": 1.272, "step": 23600 }, { - "epoch": 0.24, - "learning_rate": 2.7578739011468823e-05, - "loss": 1.7734, + "epoch": 0.32652723815822104, + "grad_norm": 12.784554481506348, + "learning_rate": 5.4980872499301254e-05, + "loss": 1.0994, "step": 23700 }, { - "epoch": 0.24, - "learning_rate": 2.7695173778890375e-05, - "loss": 1.7396, + "epoch": 0.3279049902179604, + "grad_norm": 9.825348854064941, + "learning_rate": 5.4980572429382194e-05, + "loss": 1.255, "step": 23800 }, { - "epoch": 0.24, - "learning_rate": 2.781160854631193e-05, - "loss": 1.7885, + "epoch": 0.3292827422776997, + "grad_norm": 12.11705493927002, + "learning_rate": 5.498027002484791e-05, + "loss": 1.2695, "step": 23900 }, { - "epoch": 0.24, - "learning_rate": 2.7928043313733484e-05, - "loss": 1.7568, + "epoch": 0.33066049433743905, + "grad_norm": 8.544857025146484, + "learning_rate": 5.4979965285724105e-05, + "loss": 1.1801, "step": 24000 }, { - "epoch": 0.25, - "learning_rate": 2.8044478081155035e-05, - "loss": 1.7252, + "epoch": 0.3320382463971784, + "grad_norm": 6.105419158935547, + "learning_rate": 5.4979658212036656e-05, + "loss": 1.1384, "step": 24100 }, { - "epoch": 0.25, - "learning_rate": 2.8160912848576586e-05, - "loss": 1.6315, + "epoch": 0.3334159984569177, + "grad_norm": 7.332189083099365, + "learning_rate": 5.497934880381166e-05, + "loss": 1.2623, "step": 24200 }, { - "epoch": 0.25, - "learning_rate": 2.8277347615998138e-05, - "loss": 1.6681, + "epoch": 0.334793750516657, + "grad_norm": 7.7589545249938965, + "learning_rate": 5.497903706107541e-05, + "loss": 1.1905, "step": 24300 }, { - "epoch": 0.25, - "learning_rate": 2.839378238341969e-05, - "loss": 1.7478, + "epoch": 0.33617150257639633, + "grad_norm": 12.13595962524414, + "learning_rate": 5.4978722983854365e-05, + "loss": 1.104, "step": 24400 }, { - "epoch": 0.25, - "learning_rate": 2.8510217150841243e-05, - "loss": 1.7316, + "epoch": 0.33754925463613566, + "grad_norm": 67.7665786743164, + "learning_rate": 5.4978406572175227e-05, + "loss": 1.0483, "step": 24500 }, { - "epoch": 0.25, - "learning_rate": 2.8626651918262795e-05, - "loss": 1.6041, + "epoch": 0.338927006695875, + "grad_norm": 15.1935396194458, + "learning_rate": 5.4978087826064876e-05, + "loss": 1.2435, "step": 24600 }, { - "epoch": 0.25, - "learning_rate": 2.8743086685684346e-05, - "loss": 1.6318, + "epoch": 0.34030475875561433, + "grad_norm": 9.363604545593262, + "learning_rate": 5.497776674555038e-05, + "loss": 1.1054, "step": 24700 }, { - "epoch": 0.25, - "learning_rate": 2.8859521453105897e-05, - "loss": 1.7134, + "epoch": 0.34168251081535367, + "grad_norm": 9.143170356750488, + "learning_rate": 5.497744333065903e-05, + "loss": 1.2164, "step": 24800 }, { - "epoch": 0.25, - "learning_rate": 2.897595622052745e-05, - "loss": 1.5604, + "epoch": 0.343060262875093, + "grad_norm": 9.108330726623535, + "learning_rate": 5.4977117581418295e-05, + "loss": 1.1459, "step": 24900 }, { - "epoch": 0.25, - "learning_rate": 2.9092390987949003e-05, - "loss": 1.6424, + "epoch": 0.34443801493483234, + "grad_norm": 6.112168788909912, + "learning_rate": 5.497678949785585e-05, + "loss": 1.124, "step": 25000 }, { - "epoch": 0.26, - "learning_rate": 2.9208825755370554e-05, - "loss": 1.6891, + "epoch": 0.34581576699457167, + "grad_norm": 12.69692325592041, + "learning_rate": 5.497645907999956e-05, + "loss": 1.1198, "step": 25100 }, { - "epoch": 0.26, - "learning_rate": 2.932526052279211e-05, - "loss": 1.7311, + "epoch": 0.347193519054311, + "grad_norm": 10.93918514251709, + "learning_rate": 5.4976126327877504e-05, + "loss": 1.1381, "step": 25200 }, { - "epoch": 0.26, - "learning_rate": 2.944169529021366e-05, - "loss": 1.5749, + "epoch": 0.34857127111405034, + "grad_norm": 54.03297805786133, + "learning_rate": 5.497579124151796e-05, + "loss": 1.15, "step": 25300 }, { - "epoch": 0.26, - "learning_rate": 2.955813005763521e-05, - "loss": 1.8692, + "epoch": 0.3499490231737896, + "grad_norm": 7.5128631591796875, + "learning_rate": 5.4975453820949375e-05, + "loss": 1.1485, "step": 25400 }, { - "epoch": 0.26, - "learning_rate": 2.9674564825056762e-05, - "loss": 1.8747, + "epoch": 0.35132677523352895, + "grad_norm": 11.75341510772705, + "learning_rate": 5.497511406620042e-05, + "loss": 1.1393, "step": 25500 }, { - "epoch": 0.26, - "learning_rate": 2.9790999592478314e-05, - "loss": 1.7075, + "epoch": 0.3527045272932683, + "grad_norm": 6.413788318634033, + "learning_rate": 5.4974771977299975e-05, + "loss": 1.0141, "step": 25600 }, { - "epoch": 0.26, - "learning_rate": 2.9907434359899865e-05, - "loss": 1.5054, + "epoch": 0.3540822793530076, + "grad_norm": 7.513718605041504, + "learning_rate": 5.497442755427709e-05, + "loss": 1.1473, "step": 25700 }, { - "epoch": 0.26, - "learning_rate": 3.0023869127321416e-05, - "loss": 1.8078, + "epoch": 0.35546003141274696, + "grad_norm": 4.155311584472656, + "learning_rate": 5.497408079716102e-05, + "loss": 1.0039, "step": 25800 }, { - "epoch": 0.26, - "learning_rate": 3.0140303894742967e-05, - "loss": 1.7337, + "epoch": 0.3568377834724863, + "grad_norm": 10.284714698791504, + "learning_rate": 5.4973731705981236e-05, + "loss": 1.1851, "step": 25900 }, { - "epoch": 0.26, - "learning_rate": 3.0256738662164522e-05, - "loss": 1.6756, + "epoch": 0.3582155355322256, + "grad_norm": 32.3874626159668, + "learning_rate": 5.497338028076738e-05, + "loss": 1.0983, "step": 26000 }, { - "epoch": 0.27, - "learning_rate": 3.0373173429586073e-05, - "loss": 1.6192, + "epoch": 0.35959328759196496, + "grad_norm": 13.360755920410156, + "learning_rate": 5.4973026521549324e-05, + "loss": 1.1441, "step": 26100 }, { - "epoch": 0.27, - "learning_rate": 3.048960819700763e-05, - "loss": 1.6467, + "epoch": 0.3609710396517043, + "grad_norm": 26.66537094116211, + "learning_rate": 5.497267042835711e-05, + "loss": 1.1707, "step": 26200 }, { - "epoch": 0.27, - "learning_rate": 3.060604296442918e-05, - "loss": 1.5818, + "epoch": 0.36234879171144363, + "grad_norm": 14.240584373474121, + "learning_rate": 5.497231200122099e-05, + "loss": 1.0371, "step": 26300 }, { - "epoch": 0.27, - "learning_rate": 3.072131338417652e-05, - "loss": 1.6936, + "epoch": 0.36372654377118296, + "grad_norm": 28.889888763427734, + "learning_rate": 5.497195124017142e-05, + "loss": 1.1521, "step": 26400 }, { - "epoch": 0.27, - "learning_rate": 3.083774815159807e-05, - "loss": 1.6342, + "epoch": 0.36510429583092224, + "grad_norm": 10.489887237548828, + "learning_rate": 5.497158814523906e-05, + "loss": 1.1477, "step": 26500 }, { - "epoch": 0.27, - "learning_rate": 3.095418291901962e-05, - "loss": 1.5545, + "epoch": 0.3664820478906616, + "grad_norm": 31.80312156677246, + "learning_rate": 5.497122271645473e-05, + "loss": 1.2026, "step": 26600 }, { - "epoch": 0.27, - "learning_rate": 3.107061768644117e-05, - "loss": 1.7028, + "epoch": 0.3678597999504009, + "grad_norm": 11.199711799621582, + "learning_rate": 5.497085495384949e-05, + "loss": 1.1356, "step": 26700 }, { - "epoch": 0.27, - "learning_rate": 3.1187052453862724e-05, - "loss": 1.6518, + "epoch": 0.36923755201014025, + "grad_norm": 8.920809745788574, + "learning_rate": 5.4970484857454584e-05, + "loss": 1.0267, "step": 26800 }, { - "epoch": 0.27, - "learning_rate": 3.1303487221284275e-05, - "loss": 1.6923, + "epoch": 0.3706153040698796, + "grad_norm": 28.67486572265625, + "learning_rate": 5.4970112427301454e-05, + "loss": 1.1357, "step": 26900 }, { - "epoch": 0.28, - "learning_rate": 3.1419921988705826e-05, - "loss": 1.6728, + "epoch": 0.3719930561296189, + "grad_norm": 50.55158233642578, + "learning_rate": 5.496973766342173e-05, + "loss": 1.1476, "step": 27000 }, { - "epoch": 0.28, - "learning_rate": 3.153635675612738e-05, - "loss": 1.5505, + "epoch": 0.37337080818935825, + "grad_norm": 11.69437026977539, + "learning_rate": 5.496936056584726e-05, + "loss": 1.185, "step": 27100 }, { - "epoch": 0.28, - "learning_rate": 3.165279152354893e-05, - "loss": 1.6107, + "epoch": 0.3747485602490976, + "grad_norm": 20.794321060180664, + "learning_rate": 5.496898113461007e-05, + "loss": 1.1766, "step": 27200 }, { - "epoch": 0.28, - "learning_rate": 3.1769226290970486e-05, - "loss": 1.7175, + "epoch": 0.3761263123088369, + "grad_norm": 5.544865131378174, + "learning_rate": 5.496859936974242e-05, + "loss": 1.0123, "step": 27300 }, { - "epoch": 0.28, - "learning_rate": 3.188566105839204e-05, - "loss": 1.5619, + "epoch": 0.37750406436857625, + "grad_norm": 12.653202056884766, + "learning_rate": 5.4968215271276716e-05, + "loss": 1.1932, "step": 27400 }, { - "epoch": 0.28, - "learning_rate": 3.200209582581359e-05, - "loss": 1.779, + "epoch": 0.3788818164283156, + "grad_norm": 3.6544315814971924, + "learning_rate": 5.49678288392456e-05, + "loss": 1.213, "step": 27500 }, { - "epoch": 0.28, - "learning_rate": 3.211853059323514e-05, - "loss": 1.6193, + "epoch": 0.38025956848805487, + "grad_norm": 36.82583236694336, + "learning_rate": 5.496744007368189e-05, + "loss": 1.1204, "step": 27600 }, { - "epoch": 0.28, - "learning_rate": 3.223496536065669e-05, - "loss": 1.5945, + "epoch": 0.3816373205477942, + "grad_norm": 9.208050727844238, + "learning_rate": 5.4967052897159984e-05, + "loss": 1.156, "step": 27700 }, { - "epoch": 0.28, - "learning_rate": 3.235140012807824e-05, - "loss": 1.6029, + "epoch": 0.38301507260753354, + "grad_norm": 4.28313684463501, + "learning_rate": 5.496665948796489e-05, + "loss": 1.1157, "step": 27800 }, { - "epoch": 0.28, - "learning_rate": 3.24678348954998e-05, - "loss": 1.5127, + "epoch": 0.38439282466727287, + "grad_norm": 17.668710708618164, + "learning_rate": 5.4966263745336553e-05, + "loss": 1.1063, "step": 27900 }, { - "epoch": 0.29, - "learning_rate": 3.258426966292135e-05, - "loss": 1.7253, + "epoch": 0.3857705767270122, + "grad_norm": 7.139005661010742, + "learning_rate": 5.49658656693086e-05, + "loss": 1.1658, "step": 28000 }, { - "epoch": 0.29, - "learning_rate": 3.27007044303429e-05, - "loss": 1.6813, + "epoch": 0.38714832878675154, + "grad_norm": 5.367567539215088, + "learning_rate": 5.496546525991484e-05, + "loss": 1.2344, "step": 28100 }, { - "epoch": 0.29, - "learning_rate": 3.2817139197764454e-05, - "loss": 1.5601, + "epoch": 0.3885260808464909, + "grad_norm": 10.696671485900879, + "learning_rate": 5.49650625171893e-05, + "loss": 1.2177, "step": 28200 }, { - "epoch": 0.29, - "learning_rate": 3.2933573965186005e-05, - "loss": 1.6222, + "epoch": 0.3899038329062302, + "grad_norm": 12.0829439163208, + "learning_rate": 5.496466150347612e-05, + "loss": 1.2213, "step": 28300 }, { - "epoch": 0.29, - "learning_rate": 3.305000873260756e-05, - "loss": 1.6366, + "epoch": 0.39128158496596954, + "grad_norm": 3.8467326164245605, + "learning_rate": 5.496425411752232e-05, + "loss": 1.137, "step": 28400 }, { - "epoch": 0.29, - "learning_rate": 3.316644350002911e-05, - "loss": 1.6782, + "epoch": 0.3926593370257089, + "grad_norm": 18.072893142700195, + "learning_rate": 5.49638443983396e-05, + "loss": 1.256, "step": 28500 }, { - "epoch": 0.29, - "learning_rate": 3.3282878267450666e-05, - "loss": 1.7085, + "epoch": 0.39403708908544816, + "grad_norm": 26.420886993408203, + "learning_rate": 5.4963432345962805e-05, + "loss": 1.2116, "step": 28600 }, { - "epoch": 0.29, - "learning_rate": 3.339931303487222e-05, - "loss": 1.6443, + "epoch": 0.3954148411451875, + "grad_norm": 7.811822891235352, + "learning_rate": 5.496301796042694e-05, + "loss": 1.1855, "step": 28700 }, { - "epoch": 0.29, - "learning_rate": 3.351574780229377e-05, - "loss": 1.7136, + "epoch": 0.3967925932049268, + "grad_norm": 25.796764373779297, + "learning_rate": 5.4962601241767195e-05, + "loss": 1.2127, "step": 28800 }, { - "epoch": 0.29, - "learning_rate": 3.363218256971532e-05, - "loss": 1.743, + "epoch": 0.39817034526466616, + "grad_norm": 24.476287841796875, + "learning_rate": 5.496218219001897e-05, + "loss": 1.1299, "step": 28900 }, { - "epoch": 0.3, - "learning_rate": 3.374861733713687e-05, - "loss": 1.6346, + "epoch": 0.3995480973244055, + "grad_norm": 10.935700416564941, + "learning_rate": 5.4961760805217875e-05, + "loss": 1.2926, "step": 29000 }, { - "epoch": 0.3, - "learning_rate": 3.386505210455842e-05, - "loss": 1.5877, + "epoch": 0.40092584938414483, + "grad_norm": 12.129424095153809, + "learning_rate": 5.496133708739971e-05, + "loss": 1.1798, "step": 29100 }, { - "epoch": 0.3, - "learning_rate": 3.398148687197997e-05, - "loss": 1.7239, + "epoch": 0.40230360144388416, + "grad_norm": 5.727356433868408, + "learning_rate": 5.496091103660047e-05, + "loss": 1.2193, "step": 29200 }, { - "epoch": 0.3, - "learning_rate": 3.4097921639401524e-05, - "loss": 1.4876, + "epoch": 0.4036813535036235, + "grad_norm": 4.554442405700684, + "learning_rate": 5.496048265285634e-05, + "loss": 1.1673, "step": 29300 }, { - "epoch": 0.3, - "learning_rate": 3.4214356406823076e-05, - "loss": 1.6542, + "epoch": 0.40505910556336283, + "grad_norm": 10.333436965942383, + "learning_rate": 5.4960051936203726e-05, + "loss": 1.117, "step": 29400 }, { - "epoch": 0.3, - "learning_rate": 3.433079117424463e-05, - "loss": 1.7042, + "epoch": 0.40643685762310217, + "grad_norm": 14.174511909484863, + "learning_rate": 5.495961888667921e-05, + "loss": 1.2083, "step": 29500 }, { - "epoch": 0.3, - "learning_rate": 3.444722594166618e-05, - "loss": 1.5168, + "epoch": 0.4078146096828415, + "grad_norm": 11.463114738464355, + "learning_rate": 5.4959183504319596e-05, + "loss": 1.1945, "step": 29600 }, { - "epoch": 0.3, - "learning_rate": 3.456366070908773e-05, - "loss": 1.6418, + "epoch": 0.4091923617425808, + "grad_norm": 3.3590152263641357, + "learning_rate": 5.495874578916187e-05, + "loss": 1.247, "step": 29700 }, { - "epoch": 0.3, - "learning_rate": 3.468009547650928e-05, - "loss": 1.6087, + "epoch": 0.4105701138023201, + "grad_norm": 39.54381561279297, + "learning_rate": 5.495830574124319e-05, + "loss": 1.2505, "step": 29800 }, { - "epoch": 0.3, - "learning_rate": 3.479653024393084e-05, - "loss": 1.6075, + "epoch": 0.41194786586205945, + "grad_norm": 6.801312446594238, + "learning_rate": 5.495786336060098e-05, + "loss": 1.1564, "step": 29900 }, { - "epoch": 0.31, - "learning_rate": 3.4912965011352397e-05, - "loss": 1.6207, + "epoch": 0.4133256179217988, + "grad_norm": 3.9892210960388184, + "learning_rate": 5.495741864727279e-05, + "loss": 1.0421, "step": 30000 }, { - "epoch": 0.31, - "learning_rate": 3.502939977877395e-05, - "loss": 1.478, + "epoch": 0.4147033699815381, + "grad_norm": 20.140783309936523, + "learning_rate": 5.495697160129642e-05, + "loss": 1.1145, "step": 30100 }, { - "epoch": 0.31, - "learning_rate": 3.51458345461955e-05, - "loss": 1.7332, + "epoch": 0.41608112204127745, + "grad_norm": 24.042156219482422, + "learning_rate": 5.4956522222709846e-05, + "loss": 1.2111, "step": 30200 }, { - "epoch": 0.31, - "learning_rate": 3.526226931361705e-05, - "loss": 1.6318, + "epoch": 0.4174588741010168, + "grad_norm": 7.5537004470825195, + "learning_rate": 5.495607051155124e-05, + "loss": 1.1163, "step": 30300 }, { - "epoch": 0.31, - "learning_rate": 3.53787040810386e-05, - "loss": 1.6422, + "epoch": 0.4188366261607561, + "grad_norm": 8.79468059539795, + "learning_rate": 5.4955616467858984e-05, + "loss": 1.1754, "step": 30400 }, { - "epoch": 0.31, - "learning_rate": 3.549513884846015e-05, - "loss": 1.6565, + "epoch": 0.42021437822049545, + "grad_norm": 21.197681427001953, + "learning_rate": 5.495516009167164e-05, + "loss": 1.0062, "step": 30500 }, { - "epoch": 0.31, - "learning_rate": 3.5611573615881704e-05, - "loss": 1.5044, + "epoch": 0.4215921302802348, + "grad_norm": 12.178730010986328, + "learning_rate": 5.495470138302799e-05, + "loss": 1.1227, "step": 30600 }, { - "epoch": 0.31, - "learning_rate": 3.5728008383303255e-05, - "loss": 1.5851, + "epoch": 0.4229698823399741, + "grad_norm": 15.641883850097656, + "learning_rate": 5.4954240341967e-05, + "loss": 1.1938, "step": 30700 }, { - "epoch": 0.31, - "learning_rate": 3.5844443150724806e-05, - "loss": 1.7198, + "epoch": 0.4243476343997134, + "grad_norm": 24.71154022216797, + "learning_rate": 5.4953776968527846e-05, + "loss": 1.0528, "step": 30800 }, { - "epoch": 0.31, - "learning_rate": 3.596087791814636e-05, - "loss": 1.6801, + "epoch": 0.42572538645945274, + "grad_norm": 11.14073371887207, + "learning_rate": 5.495331126274987e-05, + "loss": 1.1865, "step": 30900 }, { - "epoch": 0.32, - "learning_rate": 3.607731268556791e-05, - "loss": 1.7126, + "epoch": 0.42710313851919207, + "grad_norm": 7.5139994621276855, + "learning_rate": 5.495284322467267e-05, + "loss": 1.0983, "step": 31000 }, { - "epoch": 0.32, - "learning_rate": 3.619374745298946e-05, - "loss": 1.4165, + "epoch": 0.4284808905789314, + "grad_norm": 13.63111400604248, + "learning_rate": 5.495237285433598e-05, + "loss": 1.2065, "step": 31100 }, { - "epoch": 0.32, - "learning_rate": 3.63090178727368e-05, - "loss": 1.627, + "epoch": 0.42985864263867074, + "grad_norm": 31.978715896606445, + "learning_rate": 5.495190015177977e-05, + "loss": 1.1782, "step": 31200 }, { - "epoch": 0.32, - "learning_rate": 3.642545264015836e-05, - "loss": 1.6585, + "epoch": 0.4312363946984101, + "grad_norm": 9.212420463562012, + "learning_rate": 5.4951425117044204e-05, + "loss": 1.1952, "step": 31300 }, { - "epoch": 0.32, - "learning_rate": 3.654188740757991e-05, - "loss": 1.461, + "epoch": 0.4326141467581494, + "grad_norm": 5.0600690841674805, + "learning_rate": 5.4950947750169625e-05, + "loss": 1.1572, "step": 31400 }, { - "epoch": 0.32, - "learning_rate": 3.665832217500146e-05, - "loss": 1.593, + "epoch": 0.43399189881788874, + "grad_norm": 38.48628616333008, + "learning_rate": 5.49504680511966e-05, + "loss": 1.1557, "step": 31500 }, { - "epoch": 0.32, - "learning_rate": 3.677475694242301e-05, - "loss": 1.5944, + "epoch": 0.4353696508776281, + "grad_norm": 25.28558349609375, + "learning_rate": 5.494999085201974e-05, + "loss": 1.1951, "step": 31600 }, { - "epoch": 0.32, - "learning_rate": 3.689119170984456e-05, - "loss": 1.6941, + "epoch": 0.4367474029373674, + "grad_norm": 9.937192916870117, + "learning_rate": 5.4949506512292245e-05, + "loss": 1.1394, "step": 31700 }, { - "epoch": 0.32, - "learning_rate": 3.7007626477266114e-05, - "loss": 1.4476, + "epoch": 0.43812515499710675, + "grad_norm": 17.897592544555664, + "learning_rate": 5.494901984058873e-05, + "loss": 1.1675, "step": 31800 }, { - "epoch": 0.33, - "learning_rate": 3.7124061244687665e-05, - "loss": 1.6172, + "epoch": 0.439502907056846, + "grad_norm": 28.568443298339844, + "learning_rate": 5.494853083695056e-05, + "loss": 1.2684, "step": 31900 }, { - "epoch": 0.33, - "learning_rate": 3.7239331664435e-05, - "loss": 1.6509, + "epoch": 0.44088065911658536, + "grad_norm": 40.10667419433594, + "learning_rate": 5.494803950141926e-05, + "loss": 1.1271, "step": 32000 }, { - "epoch": 0.33, - "learning_rate": 3.735576643185655e-05, - "loss": 1.7404, + "epoch": 0.4422584111763247, + "grad_norm": 16.480844497680664, + "learning_rate": 5.4947545834036594e-05, + "loss": 1.1347, "step": 32100 }, { - "epoch": 0.33, - "learning_rate": 3.74722011992781e-05, - "loss": 1.5284, + "epoch": 0.44363616323606403, + "grad_norm": 8.691411018371582, + "learning_rate": 5.494704983484448e-05, + "loss": 1.1556, "step": 32200 }, { - "epoch": 0.33, - "learning_rate": 3.758863596669966e-05, - "loss": 1.6119, + "epoch": 0.44501391529580336, + "grad_norm": 9.516257286071777, + "learning_rate": 5.494655150388506e-05, + "loss": 1.2484, "step": 32300 }, { - "epoch": 0.33, - "learning_rate": 3.770507073412121e-05, - "loss": 1.4731, + "epoch": 0.4463916673555427, + "grad_norm": 21.820451736450195, + "learning_rate": 5.494605084120069e-05, + "loss": 1.1766, "step": 32400 }, { - "epoch": 0.33, - "learning_rate": 3.782150550154276e-05, - "loss": 1.7473, + "epoch": 0.44776941941528203, + "grad_norm": 53.97123718261719, + "learning_rate": 5.494555288831923e-05, + "loss": 1.2531, "step": 32500 }, { - "epoch": 0.33, - "learning_rate": 3.793794026896431e-05, - "loss": 1.663, + "epoch": 0.44914717147502137, + "grad_norm": 10.5194673538208, + "learning_rate": 5.49450475856289e-05, + "loss": 1.2607, "step": 32600 }, { - "epoch": 0.33, - "learning_rate": 3.805437503638587e-05, - "loss": 1.5969, + "epoch": 0.4505249235347607, + "grad_norm": 28.1883602142334, + "learning_rate": 5.494453995134138e-05, + "loss": 1.1606, "step": 32700 }, { - "epoch": 0.33, - "learning_rate": 3.817080980380742e-05, - "loss": 1.5234, + "epoch": 0.45190267559450004, + "grad_norm": 16.315658569335938, + "learning_rate": 5.494402998549977e-05, + "loss": 1.1847, "step": 32800 }, { - "epoch": 0.34, - "learning_rate": 3.828724457122897e-05, - "loss": 1.4602, + "epoch": 0.45328042765423937, + "grad_norm": 10.581954002380371, + "learning_rate": 5.494351768814742e-05, + "loss": 1.1807, "step": 32900 }, { - "epoch": 0.34, - "learning_rate": 3.8403679338650524e-05, - "loss": 1.5917, + "epoch": 0.45465817971397865, + "grad_norm": 10.171825408935547, + "learning_rate": 5.494300305932784e-05, + "loss": 1.1268, "step": 33000 }, { - "epoch": 0.34, - "learning_rate": 3.8518949758397857e-05, - "loss": 1.6486, + "epoch": 0.456035931773718, + "grad_norm": 25.388198852539062, + "learning_rate": 5.4942486099084755e-05, + "loss": 1.1395, "step": 33100 }, { - "epoch": 0.34, - "learning_rate": 3.863538452581941e-05, - "loss": 1.6396, + "epoch": 0.4574136838334573, + "grad_norm": 6.69132661819458, + "learning_rate": 5.4941966807462086e-05, + "loss": 1.1545, "step": 33200 }, { - "epoch": 0.34, - "learning_rate": 3.875181929324096e-05, - "loss": 1.4991, + "epoch": 0.45879143589319665, + "grad_norm": 10.449405670166016, + "learning_rate": 5.4941445184503934e-05, + "loss": 1.2649, "step": 33300 }, { - "epoch": 0.34, - "learning_rate": 3.886825406066252e-05, - "loss": 1.4741, + "epoch": 0.460169187952936, + "grad_norm": 9.717305183410645, + "learning_rate": 5.494092123025462e-05, + "loss": 1.1029, "step": 33400 }, { - "epoch": 0.34, - "learning_rate": 3.898468882808407e-05, - "loss": 1.6008, + "epoch": 0.4615469400126753, + "grad_norm": 9.180707931518555, + "learning_rate": 5.494039494475868e-05, + "loss": 1.2296, "step": 33500 }, { - "epoch": 0.34, - "learning_rate": 3.910112359550562e-05, - "loss": 1.6586, + "epoch": 0.46292469207241466, + "grad_norm": 9.62405014038086, + "learning_rate": 5.493987162576708e-05, + "loss": 1.1546, "step": 33600 }, { - "epoch": 0.34, - "learning_rate": 3.921755836292717e-05, - "loss": 1.5204, + "epoch": 0.464302444132154, + "grad_norm": 17.93250846862793, + "learning_rate": 5.4939340701223503e-05, + "loss": 1.3056, "step": 33700 }, { - "epoch": 0.34, - "learning_rate": 3.933399313034872e-05, - "loss": 1.604, + "epoch": 0.4656801961918933, + "grad_norm": 8.866469383239746, + "learning_rate": 5.493880744556757e-05, + "loss": 1.2404, "step": 33800 }, { - "epoch": 0.35, - "learning_rate": 3.945042789777027e-05, - "loss": 1.6618, + "epoch": 0.46705794825163266, + "grad_norm": 10.755928039550781, + "learning_rate": 5.493827185884457e-05, + "loss": 1.1766, "step": 33900 }, { - "epoch": 0.35, - "learning_rate": 3.9566862665191824e-05, - "loss": 1.5671, + "epoch": 0.46843570031137194, + "grad_norm": 48.711204528808594, + "learning_rate": 5.49377339411e-05, + "loss": 1.0883, "step": 34000 }, { - "epoch": 0.35, - "learning_rate": 3.968329743261338e-05, - "loss": 1.6671, + "epoch": 0.4698134523711113, + "grad_norm": 10.83607292175293, + "learning_rate": 5.493719369237956e-05, + "loss": 1.1657, "step": 34100 }, { - "epoch": 0.35, - "learning_rate": 3.9799732200034934e-05, - "loss": 1.6382, + "epoch": 0.4711912044308506, + "grad_norm": 202.4723358154297, + "learning_rate": 5.4936651112729146e-05, + "loss": 1.1673, "step": 34200 }, { - "epoch": 0.35, - "learning_rate": 3.9916166967456485e-05, - "loss": 1.7198, + "epoch": 0.47256895649058994, + "grad_norm": 23.345539093017578, + "learning_rate": 5.493610620219485e-05, + "loss": 1.2544, "step": 34300 }, { - "epoch": 0.35, - "learning_rate": 4.0032601734878036e-05, - "loss": 1.5506, + "epoch": 0.4739467085503293, + "grad_norm": 17.14194107055664, + "learning_rate": 5.4935558960822975e-05, + "loss": 1.2183, "step": 34400 }, { - "epoch": 0.35, - "learning_rate": 4.014903650229959e-05, - "loss": 1.4979, + "epoch": 0.4753244606100686, + "grad_norm": 154.0811004638672, + "learning_rate": 5.493500938866002e-05, + "loss": 1.1882, "step": 34500 }, { - "epoch": 0.35, - "learning_rate": 4.026547126972114e-05, - "loss": 1.5727, + "epoch": 0.47670221266980795, + "grad_norm": 29.115861892700195, + "learning_rate": 5.493445748575265e-05, + "loss": 1.1198, "step": 34600 }, { - "epoch": 0.35, - "learning_rate": 4.0381906037142696e-05, - "loss": 1.5792, + "epoch": 0.4780799647295473, + "grad_norm": 20.269804000854492, + "learning_rate": 5.493390325214776e-05, + "loss": 1.2218, "step": 34700 }, { - "epoch": 0.35, - "learning_rate": 4.049834080456425e-05, - "loss": 1.6914, + "epoch": 0.4794577167892866, + "grad_norm": 16.553653717041016, + "learning_rate": 5.4933346687892435e-05, + "loss": 1.1487, "step": 34800 }, { - "epoch": 0.36, - "learning_rate": 4.06147755719858e-05, - "loss": 1.6395, + "epoch": 0.48083546884902595, + "grad_norm": 25.207561492919922, + "learning_rate": 5.493278779303397e-05, + "loss": 1.1777, "step": 34900 }, { - "epoch": 0.36, - "learning_rate": 4.073121033940735e-05, - "loss": 1.3941, + "epoch": 0.4822132209087653, + "grad_norm": 38.11124038696289, + "learning_rate": 5.493222656761983e-05, + "loss": 1.1786, "step": 35000 }, { - "epoch": 0.36, - "learning_rate": 4.08476451068289e-05, - "loss": 1.5986, + "epoch": 0.48359097296850456, + "grad_norm": 146.48446655273438, + "learning_rate": 5.49316630116977e-05, + "loss": 1.2362, "step": 35100 }, { - "epoch": 0.36, - "learning_rate": 4.096407987425045e-05, - "loss": 1.6196, + "epoch": 0.4849687250282439, + "grad_norm": 21.451509475708008, + "learning_rate": 5.493109712531546e-05, + "loss": 1.1021, "step": 35200 }, { - "epoch": 0.36, - "learning_rate": 4.1080514641672004e-05, - "loss": 1.6662, + "epoch": 0.48634647708798323, + "grad_norm": 181.49642944335938, + "learning_rate": 5.4930528908521185e-05, + "loss": 1.2539, "step": 35300 }, { - "epoch": 0.36, - "learning_rate": 4.1196949409093555e-05, - "loss": 1.6225, + "epoch": 0.48772422914772257, + "grad_norm": 26.78451156616211, + "learning_rate": 5.4929958361363143e-05, + "loss": 1.3365, "step": 35400 }, { - "epoch": 0.36, - "learning_rate": 4.1313384176515106e-05, - "loss": 1.5535, + "epoch": 0.4891019812074619, + "grad_norm": 11.870569229125977, + "learning_rate": 5.492938548388981e-05, + "loss": 1.3135, "step": 35500 }, { - "epoch": 0.36, - "learning_rate": 4.142981894393666e-05, - "loss": 1.5587, + "epoch": 0.49047973326720123, + "grad_norm": 11.67177963256836, + "learning_rate": 5.492881027614985e-05, + "loss": 1.1676, "step": 35600 }, { - "epoch": 0.36, - "learning_rate": 4.154625371135821e-05, - "loss": 1.5462, + "epoch": 0.49185748532694057, + "grad_norm": 43.119224548339844, + "learning_rate": 5.4928232738192135e-05, + "loss": 1.3202, "step": 35700 }, { - "epoch": 0.36, - "learning_rate": 4.166268847877976e-05, - "loss": 1.6372, + "epoch": 0.4932352373866799, + "grad_norm": 26.77092170715332, + "learning_rate": 5.4927652870065726e-05, + "loss": 1.3075, "step": 35800 }, { - "epoch": 0.37, - "learning_rate": 4.177912324620131e-05, - "loss": 1.6065, + "epoch": 0.49461298944641924, + "grad_norm": 12.316910743713379, + "learning_rate": 5.492707067181988e-05, + "loss": 1.352, "step": 35900 }, { - "epoch": 0.37, - "learning_rate": 4.189555801362286e-05, - "loss": 1.6709, + "epoch": 0.4959907415061586, + "grad_norm": 61.07495880126953, + "learning_rate": 5.492648614350407e-05, + "loss": 1.1952, "step": 36000 }, { - "epoch": 0.37, - "learning_rate": 4.201199278104443e-05, - "loss": 1.6939, + "epoch": 0.4973684935658979, + "grad_norm": 49.89175033569336, + "learning_rate": 5.492589928516795e-05, + "loss": 1.1014, "step": 36100 }, { - "epoch": 0.37, - "learning_rate": 4.212842754846598e-05, - "loss": 1.5545, + "epoch": 0.4987462456256372, + "grad_norm": 8.198811531066895, + "learning_rate": 5.492531009686138e-05, + "loss": 1.349, "step": 36200 }, { - "epoch": 0.37, - "learning_rate": 4.224486231588753e-05, - "loss": 1.4577, + "epoch": 0.5001239976853765, + "grad_norm": 24.358966827392578, + "learning_rate": 5.4924718578634405e-05, + "loss": 1.2439, "step": 36300 }, { - "epoch": 0.37, - "learning_rate": 4.236129708330908e-05, - "loss": 1.5861, + "epoch": 0.5015017497451159, + "grad_norm": 15.124456405639648, + "learning_rate": 5.4924124730537284e-05, + "loss": 1.1741, "step": 36400 }, { - "epoch": 0.37, - "learning_rate": 4.247773185073063e-05, - "loss": 1.6461, + "epoch": 0.5028795018048552, + "grad_norm": 40.977848052978516, + "learning_rate": 5.492352855262046e-05, + "loss": 1.2399, "step": 36500 }, { - "epoch": 0.37, - "learning_rate": 4.259416661815218e-05, - "loss": 1.6597, + "epoch": 0.5042572538645945, + "grad_norm": 16.233501434326172, + "learning_rate": 5.49229300449346e-05, + "loss": 1.1527, "step": 36600 }, { - "epoch": 0.37, - "learning_rate": 4.2710601385573734e-05, - "loss": 1.5604, + "epoch": 0.5056350059243339, + "grad_norm": 24.221960067749023, + "learning_rate": 5.4922329207530526e-05, + "loss": 1.311, "step": 36700 }, { - "epoch": 0.37, - "learning_rate": 4.2827036152995286e-05, - "loss": 1.6345, + "epoch": 0.5070127579840732, + "grad_norm": 10.371733665466309, + "learning_rate": 5.4921726040459305e-05, + "loss": 1.2262, "step": 36800 }, { - "epoch": 0.38, - "learning_rate": 4.294347092041684e-05, - "loss": 1.6635, + "epoch": 0.5083905100438125, + "grad_norm": 13.55928897857666, + "learning_rate": 5.492112054377216e-05, + "loss": 1.2579, "step": 36900 }, { - "epoch": 0.38, - "learning_rate": 4.305990568783839e-05, - "loss": 1.4263, + "epoch": 0.5097682621035519, + "grad_norm": 19.904464721679688, + "learning_rate": 5.492051271752054e-05, + "loss": 1.1302, "step": 37000 }, { - "epoch": 0.38, - "learning_rate": 4.317634045525994e-05, - "loss": 1.6068, + "epoch": 0.5111460141632912, + "grad_norm": 8.938076972961426, + "learning_rate": 5.491990256175609e-05, + "loss": 1.2907, "step": 37100 }, { - "epoch": 0.38, - "learning_rate": 4.329277522268149e-05, - "loss": 1.4196, + "epoch": 0.5125237662230305, + "grad_norm": 7.375090599060059, + "learning_rate": 5.491929007653063e-05, + "loss": 1.2576, "step": 37200 }, { - "epoch": 0.38, - "learning_rate": 4.340920999010304e-05, - "loss": 1.6036, + "epoch": 0.5139015182827699, + "grad_norm": 19.95588493347168, + "learning_rate": 5.491868142157295e-05, + "loss": 1.2289, "step": 37300 }, { - "epoch": 0.38, - "learning_rate": 4.35256447575246e-05, - "loss": 1.4094, + "epoch": 0.5152792703425092, + "grad_norm": 38.733131408691406, + "learning_rate": 5.4918064300875095e-05, + "loss": 1.2287, "step": 37400 }, { - "epoch": 0.38, - "learning_rate": 4.364207952494615e-05, - "loss": 1.6328, + "epoch": 0.5166570224022485, + "grad_norm": 23.4934024810791, + "learning_rate": 5.491744485087241e-05, + "loss": 1.1797, "step": 37500 }, { - "epoch": 0.38, - "learning_rate": 4.37585142923677e-05, - "loss": 1.6935, + "epoch": 0.5180347744619879, + "grad_norm": 15.991085052490234, + "learning_rate": 5.4916823071617515e-05, + "loss": 1.2026, "step": 37600 }, { - "epoch": 0.38, - "learning_rate": 4.3874949059789253e-05, - "loss": 1.7379, + "epoch": 0.5194125265217272, + "grad_norm": 6.564786434173584, + "learning_rate": 5.491619896316324e-05, + "loss": 1.2287, "step": 37700 }, { - "epoch": 0.39, - "learning_rate": 4.3991383827210805e-05, - "loss": 1.7062, + "epoch": 0.5207902785814664, + "grad_norm": 21.138957977294922, + "learning_rate": 5.49155725255626e-05, + "loss": 1.2666, "step": 37800 }, { - "epoch": 0.39, - "learning_rate": 4.4107818594632356e-05, - "loss": 1.586, + "epoch": 0.5221680306412058, + "grad_norm": 5.338771820068359, + "learning_rate": 5.491494375886882e-05, + "loss": 1.0798, "step": 37900 }, { - "epoch": 0.39, - "learning_rate": 4.422425336205391e-05, - "loss": 1.7326, + "epoch": 0.5235457827009451, + "grad_norm": 17.468963623046875, + "learning_rate": 5.4914312663135315e-05, + "loss": 1.1097, "step": 38000 }, { - "epoch": 0.39, - "learning_rate": 4.4340688129475465e-05, - "loss": 1.5298, + "epoch": 0.5249235347606844, + "grad_norm": 5.89479398727417, + "learning_rate": 5.491367923841569e-05, + "loss": 1.0782, "step": 38100 }, { - "epoch": 0.39, - "learning_rate": 4.4457122896897016e-05, - "loss": 1.6318, + "epoch": 0.5263012868204238, + "grad_norm": 8.392021179199219, + "learning_rate": 5.491304348476377e-05, + "loss": 1.1417, "step": 38200 }, { - "epoch": 0.39, - "learning_rate": 4.457355766431857e-05, - "loss": 1.5471, + "epoch": 0.5276790388801631, + "grad_norm": 9.740779876708984, + "learning_rate": 5.491240540223357e-05, + "loss": 1.2198, "step": 38300 }, { - "epoch": 0.39, - "learning_rate": 4.468999243174012e-05, - "loss": 1.5413, + "epoch": 0.5290567909399024, + "grad_norm": 8.02989387512207, + "learning_rate": 5.491176499087928e-05, + "loss": 1.2369, "step": 38400 }, { - "epoch": 0.39, - "learning_rate": 4.480642719916167e-05, - "loss": 1.6205, + "epoch": 0.5304345429996418, + "grad_norm": 14.354741096496582, + "learning_rate": 5.491112225075532e-05, + "loss": 1.1563, "step": 38500 }, { - "epoch": 0.39, - "learning_rate": 4.492286196658322e-05, - "loss": 1.7189, + "epoch": 0.5318122950593811, + "grad_norm": 13.512933731079102, + "learning_rate": 5.491047718191629e-05, + "loss": 1.0641, "step": 38600 }, { - "epoch": 0.39, - "learning_rate": 4.503929673400478e-05, - "loss": 1.7002, + "epoch": 0.5331900471191204, + "grad_norm": 8.201644897460938, + "learning_rate": 5.4909829784417e-05, + "loss": 1.1771, "step": 38700 }, { - "epoch": 0.4, - "learning_rate": 4.515573150142633e-05, - "loss": 1.6631, + "epoch": 0.5345677991788598, + "grad_norm": 12.34885025024414, + "learning_rate": 5.490918005831243e-05, + "loss": 1.2186, "step": 38800 }, { - "epoch": 0.4, - "learning_rate": 4.527216626884788e-05, - "loss": 1.5372, + "epoch": 0.5359455512385991, + "grad_norm": 12.697105407714844, + "learning_rate": 5.4908528003657794e-05, + "loss": 1.0416, "step": 38900 }, { - "epoch": 0.4, - "learning_rate": 4.538860103626943e-05, - "loss": 1.5069, + "epoch": 0.5373233032983384, + "grad_norm": 20.74173355102539, + "learning_rate": 5.490787362050848e-05, + "loss": 1.2125, "step": 39000 }, { - "epoch": 0.4, - "learning_rate": 4.5505035803690984e-05, - "loss": 1.6565, + "epoch": 0.5387010553580778, + "grad_norm": 5.475405216217041, + "learning_rate": 5.490721690892009e-05, + "loss": 1.1598, "step": 39100 }, { - "epoch": 0.4, - "learning_rate": 4.5621470571112535e-05, - "loss": 1.6127, + "epoch": 0.5400788074178171, + "grad_norm": 15.600400924682617, + "learning_rate": 5.490655786894841e-05, + "loss": 1.1072, "step": 39200 }, { - "epoch": 0.4, - "learning_rate": 4.573674099085987e-05, - "loss": 1.7147, + "epoch": 0.5414565594775564, + "grad_norm": 13.139054298400879, + "learning_rate": 5.4905896500649416e-05, + "loss": 1.1552, "step": 39300 }, { - "epoch": 0.4, - "learning_rate": 4.585201141060721e-05, - "loss": 1.7017, + "epoch": 0.5428343115372958, + "grad_norm": 62.545928955078125, + "learning_rate": 5.490523280407932e-05, + "loss": 1.0561, "step": 39400 }, { - "epoch": 0.4, - "learning_rate": 4.596844617802876e-05, - "loss": 1.4555, + "epoch": 0.5442120635970351, + "grad_norm": 6.346467971801758, + "learning_rate": 5.4904566779294486e-05, + "loss": 1.1642, "step": 39500 }, { - "epoch": 0.4, - "learning_rate": 4.608488094545031e-05, - "loss": 1.5304, + "epoch": 0.5455898156567744, + "grad_norm": 15.561434745788574, + "learning_rate": 5.490389842635151e-05, + "loss": 1.106, "step": 39600 }, { - "epoch": 0.4, - "learning_rate": 4.620131571287187e-05, - "loss": 1.6392, + "epoch": 0.5469675677165138, + "grad_norm": 12.556934356689453, + "learning_rate": 5.490322774530716e-05, + "loss": 1.0369, "step": 39700 }, { - "epoch": 0.41, - "learning_rate": 4.631775048029342e-05, - "loss": 1.6347, + "epoch": 0.5483453197762531, + "grad_norm": 13.121975898742676, + "learning_rate": 5.490255473621842e-05, + "loss": 1.1628, "step": 39800 }, { - "epoch": 0.41, - "learning_rate": 4.643418524771497e-05, - "loss": 1.5631, + "epoch": 0.5497230718359924, + "grad_norm": 7.027225017547607, + "learning_rate": 5.490187939914246e-05, + "loss": 1.1277, "step": 39900 }, { - "epoch": 0.41, - "learning_rate": 4.655062001513652e-05, - "loss": 1.5137, + "epoch": 0.5511008238957317, + "grad_norm": 12.51138973236084, + "learning_rate": 5.490120173413667e-05, + "loss": 1.1055, "step": 40000 }, { - "epoch": 0.41, - "learning_rate": 4.666705478255807e-05, - "loss": 1.5546, + "epoch": 0.552478575955471, + "grad_norm": 15.838208198547363, + "learning_rate": 5.4900521741258595e-05, + "loss": 1.1405, "step": 40100 }, { - "epoch": 0.41, - "learning_rate": 4.6783489549979624e-05, - "loss": 1.6372, + "epoch": 0.5538563280152103, + "grad_norm": 63.764068603515625, + "learning_rate": 5.4899839420566027e-05, + "loss": 1.2225, "step": 40200 }, { - "epoch": 0.41, - "learning_rate": 4.6899924317401176e-05, - "loss": 1.6029, + "epoch": 0.5552340800749497, + "grad_norm": 24.84697151184082, + "learning_rate": 5.489915477211693e-05, + "loss": 1.0902, "step": 40300 }, { - "epoch": 0.41, - "learning_rate": 4.701635908482273e-05, - "loss": 1.7338, + "epoch": 0.556611832134689, + "grad_norm": 18.65909194946289, + "learning_rate": 5.489846779596945e-05, + "loss": 1.2615, "step": 40400 }, { - "epoch": 0.41, - "learning_rate": 4.713279385224428e-05, - "loss": 1.5, + "epoch": 0.5579895841944283, + "grad_norm": 13.023543357849121, + "learning_rate": 5.489777849218196e-05, + "loss": 1.2782, "step": 40500 }, { - "epoch": 0.41, - "learning_rate": 4.724922861966583e-05, - "loss": 1.4543, + "epoch": 0.5593673362541677, + "grad_norm": 10.606678009033203, + "learning_rate": 5.489708686081303e-05, + "loss": 1.174, "step": 40600 }, { - "epoch": 0.41, - "learning_rate": 4.736566338708738e-05, - "loss": 1.5589, + "epoch": 0.560745088313907, + "grad_norm": 21.74370574951172, + "learning_rate": 5.4896392901921404e-05, + "loss": 1.1712, "step": 40700 }, { - "epoch": 0.42, - "learning_rate": 4.748209815450893e-05, - "loss": 1.5708, + "epoch": 0.5621228403736463, + "grad_norm": 36.1141242980957, + "learning_rate": 5.489569661556604e-05, + "loss": 1.1973, "step": 40800 }, { - "epoch": 0.42, - "learning_rate": 4.759853292193049e-05, - "loss": 1.6656, + "epoch": 0.5635005924333857, + "grad_norm": 20.26376724243164, + "learning_rate": 5.4894998001806094e-05, + "loss": 1.4648, "step": 40900 }, { - "epoch": 0.42, - "learning_rate": 4.771496768935205e-05, - "loss": 1.4874, + "epoch": 0.564878344493125, + "grad_norm": 15.052135467529297, + "learning_rate": 5.4894297060700914e-05, + "loss": 1.2466, "step": 41000 }, { - "epoch": 0.42, - "learning_rate": 4.78314024567736e-05, - "loss": 1.6359, + "epoch": 0.5662560965528644, + "grad_norm": 104.23808288574219, + "learning_rate": 5.489359379231006e-05, + "loss": 1.1266, "step": 41100 }, { - "epoch": 0.42, - "learning_rate": 4.794783722419515e-05, - "loss": 1.5632, + "epoch": 0.5676338486126037, + "grad_norm": 25.820280075073242, + "learning_rate": 5.489288819669326e-05, + "loss": 1.2133, "step": 41200 }, { - "epoch": 0.42, - "learning_rate": 4.80642719916167e-05, - "loss": 1.567, + "epoch": 0.569011600672343, + "grad_norm": 12.302903175354004, + "learning_rate": 5.4892180273910467e-05, + "loss": 1.2712, "step": 41300 }, { - "epoch": 0.42, - "learning_rate": 4.818070675903825e-05, - "loss": 1.6288, + "epoch": 0.5703893527320824, + "grad_norm": 11.3978910446167, + "learning_rate": 5.489147002402182e-05, + "loss": 1.3285, "step": 41400 }, { - "epoch": 0.42, - "learning_rate": 4.8297141526459804e-05, - "loss": 1.6696, + "epoch": 0.5717671047918217, + "grad_norm": 20.72574234008789, + "learning_rate": 5.489075744708767e-05, + "loss": 1.1829, "step": 41500 }, { - "epoch": 0.42, - "learning_rate": 4.8413576293881355e-05, - "loss": 1.5842, + "epoch": 0.573144856851561, + "grad_norm": 8.017884254455566, + "learning_rate": 5.489004254316854e-05, + "loss": 1.2739, "step": 41600 }, { - "epoch": 0.42, - "learning_rate": 4.8530011061302906e-05, - "loss": 1.7026, + "epoch": 0.5745226089113004, + "grad_norm": 11.902440071105957, + "learning_rate": 5.488932531232517e-05, + "loss": 1.3857, "step": 41700 }, { - "epoch": 0.43, - "learning_rate": 4.864644582872446e-05, - "loss": 1.6694, + "epoch": 0.5759003609710397, + "grad_norm": 10.239564895629883, + "learning_rate": 5.488860575461849e-05, + "loss": 1.1585, "step": 41800 }, { - "epoch": 0.43, - "learning_rate": 4.876288059614601e-05, - "loss": 1.5539, + "epoch": 0.577278113030779, + "grad_norm": 20.104263305664062, + "learning_rate": 5.488788387010963e-05, + "loss": 1.2889, "step": 41900 }, { - "epoch": 0.43, - "learning_rate": 4.887931536356756e-05, - "loss": 1.616, + "epoch": 0.5786558650905184, + "grad_norm": 24.809194564819336, + "learning_rate": 5.4887166912489586e-05, + "loss": 1.2151, "step": 42000 }, { - "epoch": 0.43, - "learning_rate": 4.899575013098911e-05, - "loss": 1.7302, + "epoch": 0.5800336171502576, + "grad_norm": 10.520858764648438, + "learning_rate": 5.488644039782705e-05, + "loss": 1.2757, "step": 42100 }, { - "epoch": 0.43, - "learning_rate": 4.911218489841066e-05, - "loss": 1.6773, + "epoch": 0.5814113692099969, + "grad_norm": 10.631114959716797, + "learning_rate": 5.488571155654628e-05, + "loss": 1.1857, "step": 42200 }, { - "epoch": 0.43, - "learning_rate": 4.922861966583222e-05, - "loss": 1.5233, + "epoch": 0.5827891212697363, + "grad_norm": 18.827821731567383, + "learning_rate": 5.488498038870921e-05, + "loss": 1.2018, "step": 42300 }, { - "epoch": 0.43, - "learning_rate": 4.934505443325377e-05, - "loss": 1.6265, + "epoch": 0.5841668733294756, + "grad_norm": 5.8350019454956055, + "learning_rate": 5.488424689437796e-05, + "loss": 1.1799, "step": 42400 }, { - "epoch": 0.43, - "learning_rate": 4.946148920067532e-05, - "loss": 1.7762, + "epoch": 0.5855446253892149, + "grad_norm": 23.99330711364746, + "learning_rate": 5.488351107361484e-05, + "loss": 1.2866, "step": 42500 }, { - "epoch": 0.43, - "learning_rate": 4.9577923968096874e-05, - "loss": 1.477, + "epoch": 0.5869223774489543, + "grad_norm": 122.87985229492188, + "learning_rate": 5.488277292648236e-05, + "loss": 1.1655, "step": 42600 }, { - "epoch": 0.44, - "learning_rate": 4.9694358735518425e-05, - "loss": 1.5589, + "epoch": 0.5883001295086936, + "grad_norm": 24.69963264465332, + "learning_rate": 5.488203245304323e-05, + "loss": 1.2314, "step": 42700 }, { - "epoch": 0.44, - "learning_rate": 4.9810793502939977e-05, - "loss": 1.6193, + "epoch": 0.5896778815684329, + "grad_norm": 103.8561782836914, + "learning_rate": 5.4881289653360364e-05, + "loss": 1.244, "step": 42800 }, { - "epoch": 0.44, - "learning_rate": 4.9927228270361535e-05, - "loss": 1.5742, + "epoch": 0.5910556336281723, + "grad_norm": 9.050652503967285, + "learning_rate": 5.4880544527496854e-05, + "loss": 1.3281, "step": 42900 }, { - "epoch": 0.44, - "learning_rate": 5.0043663037783086e-05, - "loss": 1.7094, + "epoch": 0.5924333856879116, + "grad_norm": 28.411632537841797, + "learning_rate": 5.487979707551601e-05, + "loss": 1.238, "step": 43000 }, { - "epoch": 0.44, - "learning_rate": 5.016009780520464e-05, - "loss": 1.7161, + "epoch": 0.5938111377476509, + "grad_norm": 90.37954711914062, + "learning_rate": 5.487904729748133e-05, + "loss": 1.2811, "step": 43100 }, { - "epoch": 0.44, - "learning_rate": 5.027653257262619e-05, - "loss": 1.5581, + "epoch": 0.5951888898073903, + "grad_norm": 6.304271221160889, + "learning_rate": 5.487829519345651e-05, + "loss": 1.2682, "step": 43200 }, { - "epoch": 0.44, - "learning_rate": 5.039296734004774e-05, - "loss": 1.7793, + "epoch": 0.5965666418671296, + "grad_norm": 7.4491143226623535, + "learning_rate": 5.487754076350545e-05, + "loss": 1.1171, "step": 43300 }, { - "epoch": 0.44, - "learning_rate": 5.050940210746929e-05, - "loss": 1.6299, + "epoch": 0.5979443939268689, + "grad_norm": 11.623374938964844, + "learning_rate": 5.487678400769224e-05, + "loss": 1.323, "step": 43400 }, { - "epoch": 0.44, - "learning_rate": 5.062583687489084e-05, - "loss": 1.6768, + "epoch": 0.5993221459866083, + "grad_norm": 17.040042877197266, + "learning_rate": 5.4876024926081166e-05, + "loss": 1.2878, "step": 43500 }, { - "epoch": 0.44, - "learning_rate": 5.07422716423124e-05, - "loss": 1.4999, + "epoch": 0.6006998980463476, + "grad_norm": 37.76469421386719, + "learning_rate": 5.4875263518736724e-05, + "loss": 1.2728, "step": 43600 }, { - "epoch": 0.45, - "learning_rate": 5.085870640973395e-05, - "loss": 1.6757, + "epoch": 0.6020776501060869, + "grad_norm": 5.725869655609131, + "learning_rate": 5.4874499785723586e-05, + "loss": 1.3025, "step": 43700 }, { - "epoch": 0.45, - "learning_rate": 5.09751411771555e-05, - "loss": 1.6047, + "epoch": 0.6034554021658263, + "grad_norm": 17.790145874023438, + "learning_rate": 5.4873733727106655e-05, + "loss": 1.2354, "step": 43800 }, { - "epoch": 0.45, - "learning_rate": 5.1091575944577054e-05, - "loss": 1.7353, + "epoch": 0.6048331542255656, + "grad_norm": 37.19287872314453, + "learning_rate": 5.4872965342950995e-05, + "loss": 1.1685, "step": 43900 }, { - "epoch": 0.45, - "learning_rate": 5.1206846364324386e-05, - "loss": 1.5894, + "epoch": 0.6062109062853049, + "grad_norm": 19.467134475708008, + "learning_rate": 5.4872194633321896e-05, + "loss": 1.2465, "step": 44000 }, { - "epoch": 0.45, - "learning_rate": 5.1322116784071726e-05, - "loss": 1.6438, + "epoch": 0.6075886583450443, + "grad_norm": 26.84002113342285, + "learning_rate": 5.4871421598284824e-05, + "loss": 1.2126, "step": 44100 }, { - "epoch": 0.45, - "learning_rate": 5.143855155149328e-05, - "loss": 1.685, + "epoch": 0.6089664104047836, + "grad_norm": 40.50870895385742, + "learning_rate": 5.4870646237905455e-05, + "loss": 1.3323, "step": 44200 }, { - "epoch": 0.45, - "learning_rate": 5.155498631891483e-05, - "loss": 1.7013, + "epoch": 0.6103441624645228, + "grad_norm": 16.28245735168457, + "learning_rate": 5.4869868552249666e-05, + "loss": 1.2629, "step": 44300 }, { - "epoch": 0.45, - "learning_rate": 5.167142108633638e-05, - "loss": 1.7027, + "epoch": 0.6117219145242622, + "grad_norm": 14.023327827453613, + "learning_rate": 5.4869088541383514e-05, + "loss": 1.2761, "step": 44400 }, { - "epoch": 0.45, - "learning_rate": 5.178785585375793e-05, - "loss": 1.7189, + "epoch": 0.6130996665840015, + "grad_norm": 7.924612998962402, + "learning_rate": 5.486830620537327e-05, + "loss": 1.189, "step": 44500 }, { - "epoch": 0.45, - "learning_rate": 5.190429062117949e-05, - "loss": 1.5007, + "epoch": 0.6144774186437408, + "grad_norm": 29.270492553710938, + "learning_rate": 5.48675215442854e-05, + "loss": 1.1727, "step": 44600 }, { - "epoch": 0.46, - "learning_rate": 5.202072538860104e-05, - "loss": 1.6081, + "epoch": 0.6158551707034802, + "grad_norm": 12.770021438598633, + "learning_rate": 5.486673455818657e-05, + "loss": 1.2345, "step": 44700 }, { - "epoch": 0.46, - "learning_rate": 5.213716015602259e-05, - "loss": 1.6024, + "epoch": 0.6172329227632195, + "grad_norm": 18.894485473632812, + "learning_rate": 5.486594524714362e-05, + "loss": 1.0552, "step": 44800 }, { - "epoch": 0.46, - "learning_rate": 5.225359492344414e-05, - "loss": 1.4827, + "epoch": 0.6186106748229588, + "grad_norm": 34.18074035644531, + "learning_rate": 5.4865153611223626e-05, + "loss": 1.2627, "step": 44900 }, { - "epoch": 0.46, - "learning_rate": 5.2368865343191476e-05, - "loss": 1.6316, + "epoch": 0.6199884268826982, + "grad_norm": 11.412156105041504, + "learning_rate": 5.4864359650493836e-05, + "loss": 1.2822, "step": 45000 }, { - "epoch": 0.46, - "learning_rate": 5.2485300110613034e-05, - "loss": 1.4994, + "epoch": 0.6213661789424375, + "grad_norm": 25.027217864990234, + "learning_rate": 5.4863563365021686e-05, + "loss": 1.1529, "step": 45100 }, { - "epoch": 0.46, - "learning_rate": 5.2601734878034585e-05, - "loss": 1.6473, + "epoch": 0.6227439310021768, + "grad_norm": 9.224932670593262, + "learning_rate": 5.4862764754874855e-05, + "loss": 1.1406, "step": 45200 }, { - "epoch": 0.46, - "learning_rate": 5.2718169645456136e-05, - "loss": 1.6055, + "epoch": 0.6241216830619162, + "grad_norm": 2.8139286041259766, + "learning_rate": 5.486196382012116e-05, + "loss": 1.3154, "step": 45300 }, { - "epoch": 0.46, - "learning_rate": 5.283460441287769e-05, - "loss": 1.6548, + "epoch": 0.6254994351216555, + "grad_norm": 6.805177211761475, + "learning_rate": 5.486116056082866e-05, + "loss": 1.0983, "step": 45400 }, { - "epoch": 0.46, - "learning_rate": 5.295103918029924e-05, - "loss": 1.6434, + "epoch": 0.6268771871813948, + "grad_norm": 8.907033920288086, + "learning_rate": 5.4860354977065584e-05, + "loss": 1.1609, "step": 45500 }, { - "epoch": 0.46, - "learning_rate": 5.306630960004657e-05, - "loss": 1.8213, + "epoch": 0.6282549392411342, + "grad_norm": 19.110136032104492, + "learning_rate": 5.4859547068900385e-05, + "loss": 1.2155, "step": 45600 }, { - "epoch": 0.47, - "learning_rate": 5.318274436746813e-05, - "loss": 1.7567, + "epoch": 0.6296326913008735, + "grad_norm": 5.0881524085998535, + "learning_rate": 5.485873683640169e-05, + "loss": 1.2043, "step": 45700 }, { - "epoch": 0.47, - "learning_rate": 5.329917913488968e-05, - "loss": 1.6743, + "epoch": 0.6310104433606128, + "grad_norm": 44.61912155151367, + "learning_rate": 5.4857924279638333e-05, + "loss": 1.1461, "step": 45800 }, { - "epoch": 0.47, - "learning_rate": 5.341561390231123e-05, - "loss": 1.8337, + "epoch": 0.6323881954203522, + "grad_norm": 12.612244606018066, + "learning_rate": 5.485710939867935e-05, + "loss": 1.1641, "step": 45900 }, { - "epoch": 0.47, - "learning_rate": 5.353204866973278e-05, - "loss": 1.7429, + "epoch": 0.6337659474800915, + "grad_norm": 40.20648193359375, + "learning_rate": 5.485629219359396e-05, + "loss": 1.131, "step": 46000 }, { - "epoch": 0.47, - "learning_rate": 5.3648483437154334e-05, - "loss": 1.6504, + "epoch": 0.6351436995398309, + "grad_norm": 8.210180282592773, + "learning_rate": 5.48554726644516e-05, + "loss": 1.1889, "step": 46100 }, { - "epoch": 0.47, - "learning_rate": 5.3764918204575885e-05, - "loss": 1.67, + "epoch": 0.6365214515995702, + "grad_norm": 5.954362392425537, + "learning_rate": 5.485465081132189e-05, + "loss": 1.1033, "step": 46200 }, { - "epoch": 0.47, - "learning_rate": 5.388135297199744e-05, - "loss": 1.8022, + "epoch": 0.6378992036593095, + "grad_norm": 19.641450881958008, + "learning_rate": 5.485382663427464e-05, + "loss": 1.0969, "step": 46300 }, { - "epoch": 0.47, - "learning_rate": 5.399778773941899e-05, - "loss": 1.7542, + "epoch": 0.6392769557190487, + "grad_norm": 26.514488220214844, + "learning_rate": 5.485300013337988e-05, + "loss": 1.1515, "step": 46400 }, { - "epoch": 0.47, - "learning_rate": 5.4114222506840546e-05, - "loss": 1.5413, + "epoch": 0.6406547077787881, + "grad_norm": 11.927842140197754, + "learning_rate": 5.485217130870782e-05, + "loss": 1.1535, "step": 46500 }, { - "epoch": 0.47, - "learning_rate": 5.42306572742621e-05, - "loss": 1.6942, + "epoch": 0.6420324598385274, + "grad_norm": 19.73988151550293, + "learning_rate": 5.485134016032888e-05, + "loss": 1.1087, "step": 46600 }, { - "epoch": 0.48, - "learning_rate": 5.434709204168365e-05, - "loss": 1.6963, + "epoch": 0.6434102118982667, + "grad_norm": 5.729079246520996, + "learning_rate": 5.485050668831366e-05, + "loss": 1.0969, "step": 46700 }, { - "epoch": 0.48, - "learning_rate": 5.44635268091052e-05, - "loss": 1.7163, + "epoch": 0.6447879639580061, + "grad_norm": 27.262556076049805, + "learning_rate": 5.484967089273297e-05, + "loss": 1.189, "step": 46800 }, { - "epoch": 0.48, - "learning_rate": 5.457996157652675e-05, - "loss": 1.8332, + "epoch": 0.6461657160177454, + "grad_norm": 9.671031951904297, + "learning_rate": 5.484883277365783e-05, + "loss": 1.2077, "step": 46900 }, { - "epoch": 0.48, - "learning_rate": 5.469639634394831e-05, - "loss": 1.7749, + "epoch": 0.6475434680774848, + "grad_norm": 11.681415557861328, + "learning_rate": 5.4847992331159415e-05, + "loss": 1.1066, "step": 47000 }, { - "epoch": 0.48, - "learning_rate": 5.481283111136986e-05, - "loss": 1.5602, + "epoch": 0.6489212201372241, + "grad_norm": 11.195589065551758, + "learning_rate": 5.4847149565309145e-05, + "loss": 1.116, "step": 47100 }, { - "epoch": 0.48, - "learning_rate": 5.492926587879141e-05, - "loss": 1.6308, + "epoch": 0.6502989721969634, + "grad_norm": 24.623706817626953, + "learning_rate": 5.484630447617862e-05, + "loss": 1.0836, "step": 47200 }, { - "epoch": 0.48, - "learning_rate": 5.504570064621296e-05, - "loss": 1.8576, + "epoch": 0.6516767242567028, + "grad_norm": 5.540821075439453, + "learning_rate": 5.484545706383961e-05, + "loss": 1.1583, "step": 47300 }, { - "epoch": 0.48, - "learning_rate": 5.5162135413634514e-05, - "loss": 1.6232, + "epoch": 0.6530544763164421, + "grad_norm": 11.08587646484375, + "learning_rate": 5.484460732836414e-05, + "loss": 1.176, "step": 47400 }, { - "epoch": 0.48, - "learning_rate": 5.5278570181056065e-05, - "loss": 1.7661, + "epoch": 0.6544322283761814, + "grad_norm": 18.468021392822266, + "learning_rate": 5.484375526982438e-05, + "loss": 1.1749, "step": 47500 }, { - "epoch": 0.48, - "learning_rate": 5.5395004948477616e-05, - "loss": 1.6967, + "epoch": 0.6558099804359208, + "grad_norm": 8.209395408630371, + "learning_rate": 5.484290088829272e-05, + "loss": 1.2228, "step": 47600 }, { - "epoch": 0.49, - "learning_rate": 5.551143971589917e-05, - "loss": 1.6686, + "epoch": 0.6571877324956601, + "grad_norm": 28.284866333007812, + "learning_rate": 5.484204418384174e-05, + "loss": 1.1728, "step": 47700 }, { - "epoch": 0.49, - "learning_rate": 5.562787448332072e-05, - "loss": 1.745, + "epoch": 0.6585654845553994, + "grad_norm": 15.217370986938477, + "learning_rate": 5.484118515654422e-05, + "loss": 1.2026, "step": 47800 }, { - "epoch": 0.49, - "learning_rate": 5.574430925074227e-05, - "loss": 1.6954, + "epoch": 0.6599432366151388, + "grad_norm": 28.61541748046875, + "learning_rate": 5.484032380647316e-05, + "loss": 1.2036, "step": 47900 }, { - "epoch": 0.49, - "learning_rate": 5.586074401816382e-05, - "loss": 1.7496, + "epoch": 0.6613209886748781, + "grad_norm": 17.485252380371094, + "learning_rate": 5.483946013370172e-05, + "loss": 1.2027, "step": 48000 }, { - "epoch": 0.49, - "learning_rate": 5.597717878558537e-05, - "loss": 1.7957, + "epoch": 0.6626987407346174, + "grad_norm": 67.5372543334961, + "learning_rate": 5.483859413830326e-05, + "loss": 1.2129, "step": 48100 }, { - "epoch": 0.49, - "learning_rate": 5.6093613553006923e-05, - "loss": 1.6327, + "epoch": 0.6640764927943568, + "grad_norm": 8.771037101745605, + "learning_rate": 5.483772582035137e-05, + "loss": 1.2718, "step": 48200 }, { - "epoch": 0.49, - "learning_rate": 5.621004832042848e-05, - "loss": 1.6333, + "epoch": 0.6654542448540961, + "grad_norm": 9.3125638961792, + "learning_rate": 5.483685517991982e-05, + "loss": 1.1783, "step": 48300 }, { - "epoch": 0.49, - "learning_rate": 5.632648308785003e-05, - "loss": 1.7373, + "epoch": 0.6668319969138354, + "grad_norm": 13.458366394042969, + "learning_rate": 5.48359909582066e-05, + "loss": 1.2295, "step": 48400 }, { - "epoch": 0.49, - "learning_rate": 5.644291785527159e-05, - "loss": 1.8736, + "epoch": 0.6682097489735748, + "grad_norm": 59.08677291870117, + "learning_rate": 5.4835115696260744e-05, + "loss": 1.2055, "step": 48500 }, { - "epoch": 0.5, - "learning_rate": 5.655935262269314e-05, - "loss": 1.9779, + "epoch": 0.669587501033314, + "grad_norm": 24.833065032958984, + "learning_rate": 5.483423811205697e-05, + "loss": 1.116, "step": 48600 }, { - "epoch": 0.5, - "learning_rate": 5.667578739011469e-05, - "loss": 1.6684, + "epoch": 0.6709652530930533, + "grad_norm": 22.202251434326172, + "learning_rate": 5.4833358205669826e-05, + "loss": 1.2302, "step": 48700 }, { - "epoch": 0.5, - "learning_rate": 5.6792222157536244e-05, - "loss": 1.7345, + "epoch": 0.6723430051527927, + "grad_norm": 15.729240417480469, + "learning_rate": 5.483247597717407e-05, + "loss": 1.0977, "step": 48800 }, { - "epoch": 0.5, - "learning_rate": 5.6908656924957796e-05, - "loss": 1.9195, + "epoch": 0.673720757212532, + "grad_norm": 142.8865966796875, + "learning_rate": 5.483159142664464e-05, + "loss": 1.2123, "step": 48900 }, { - "epoch": 0.5, - "learning_rate": 5.702509169237935e-05, - "loss": 1.7144, + "epoch": 0.6750985092722713, + "grad_norm": 6.221757411956787, + "learning_rate": 5.4830704554156704e-05, + "loss": 1.1948, "step": 49000 }, { - "epoch": 0.5, - "learning_rate": 5.71415264598009e-05, - "loss": 1.6925, + "epoch": 0.6764762613320107, + "grad_norm": 87.30204010009766, + "learning_rate": 5.482981535978559e-05, + "loss": 1.3066, "step": 49100 }, { - "epoch": 0.5, - "learning_rate": 5.725796122722245e-05, - "loss": 1.7665, + "epoch": 0.67785401339175, + "grad_norm": 19.206348419189453, + "learning_rate": 5.4828932770261324e-05, + "loss": 1.2929, "step": 49200 }, { - "epoch": 0.5, - "learning_rate": 5.7374395994644e-05, - "loss": 1.7518, + "epoch": 0.6792317654514893, + "grad_norm": 14.013370513916016, + "learning_rate": 5.4828038955567624e-05, + "loss": 1.2378, "step": 49300 }, { - "epoch": 0.5, - "learning_rate": 5.749083076206555e-05, - "loss": 1.6738, + "epoch": 0.6806095175112287, + "grad_norm": 22.980525970458984, + "learning_rate": 5.4827142819217196e-05, + "loss": 1.1694, "step": 49400 }, { - "epoch": 0.5, - "learning_rate": 5.76072655294871e-05, - "loss": 1.6673, + "epoch": 0.681987269570968, + "grad_norm": 10.68824577331543, + "learning_rate": 5.482624436128619e-05, + "loss": 1.1398, "step": 49500 }, { - "epoch": 0.51, - "learning_rate": 5.772370029690866e-05, - "loss": 1.5949, + "epoch": 0.6833650216307073, + "grad_norm": 20.506732940673828, + "learning_rate": 5.482534358185092e-05, + "loss": 1.2095, "step": 49600 }, { - "epoch": 0.51, - "learning_rate": 5.784013506433021e-05, - "loss": 1.7586, + "epoch": 0.6847427736904467, + "grad_norm": 11.502998352050781, + "learning_rate": 5.482444048098792e-05, + "loss": 1.0841, "step": 49700 }, { - "epoch": 0.51, - "learning_rate": 5.795656983175176e-05, - "loss": 1.7049, + "epoch": 0.686120525750186, + "grad_norm": 10.389013290405273, + "learning_rate": 5.482353505877391e-05, + "loss": 1.1964, "step": 49800 }, { - "epoch": 0.51, - "learning_rate": 5.8073004599173315e-05, - "loss": 1.7876, + "epoch": 0.6874982778099253, + "grad_norm": 8.436223030090332, + "learning_rate": 5.4822627315285815e-05, + "loss": 1.1851, "step": 49900 }, { - "epoch": 0.51, - "learning_rate": 5.8189439366594866e-05, - "loss": 1.7694, + "epoch": 0.6888760298696647, + "grad_norm": 9.904741287231445, + "learning_rate": 5.4821717250600746e-05, + "loss": 1.1134, "step": 50000 }, { - "epoch": 0.51, - "learning_rate": 5.830587413401642e-05, - "loss": 1.8653, + "epoch": 0.690253781929404, + "grad_norm": 45.23259353637695, + "learning_rate": 5.482080486479602e-05, + "loss": 1.1171, "step": 50100 }, { - "epoch": 0.51, - "learning_rate": 5.842230890143797e-05, - "loss": 1.7531, + "epoch": 0.6916315339891433, + "grad_norm": 22.682371139526367, + "learning_rate": 5.481989015794914e-05, + "loss": 1.1765, "step": 50200 }, { - "epoch": 0.51, - "learning_rate": 5.853874366885952e-05, - "loss": 1.7341, + "epoch": 0.6930092860488827, + "grad_norm": 9.918152809143066, + "learning_rate": 5.481897313013783e-05, + "loss": 1.1686, "step": 50300 }, { - "epoch": 0.51, - "learning_rate": 5.865517843628107e-05, - "loss": 1.8113, + "epoch": 0.694387038108622, + "grad_norm": 7.5822553634643555, + "learning_rate": 5.481805378143999e-05, + "loss": 1.1215, "step": 50400 }, { - "epoch": 0.51, - "learning_rate": 5.877161320370263e-05, - "loss": 1.859, + "epoch": 0.6957647901683613, + "grad_norm": 52.38286209106445, + "learning_rate": 5.4817132111933725e-05, + "loss": 1.1647, "step": 50500 }, { - "epoch": 0.52, - "learning_rate": 5.888804797112418e-05, - "loss": 1.8349, + "epoch": 0.6971425422281007, + "grad_norm": 11.827544212341309, + "learning_rate": 5.4816208121697324e-05, + "loss": 1.1653, "step": 50600 }, { - "epoch": 0.52, - "learning_rate": 5.900448273854573e-05, - "loss": 1.823, + "epoch": 0.69852029428784, + "grad_norm": 15.004902839660645, + "learning_rate": 5.4815281810809304e-05, + "loss": 1.2503, "step": 50700 }, { - "epoch": 0.52, - "learning_rate": 5.912091750596728e-05, - "loss": 1.8625, + "epoch": 0.6998980463475792, + "grad_norm": 41.13508605957031, + "learning_rate": 5.4814353179348344e-05, + "loss": 1.2125, "step": 50800 }, { - "epoch": 0.52, - "learning_rate": 5.923618792571462e-05, - "loss": 1.7629, + "epoch": 0.7012757984073186, + "grad_norm": 105.07270812988281, + "learning_rate": 5.481342222739335e-05, + "loss": 1.4533, "step": 50900 }, { - "epoch": 0.52, - "learning_rate": 5.935262269313617e-05, - "loss": 1.8262, + "epoch": 0.7026535504670579, + "grad_norm": 44.4683723449707, + "learning_rate": 5.481249829923289e-05, + "loss": 1.4762, "step": 51000 }, { - "epoch": 0.52, - "learning_rate": 5.9469057460557724e-05, - "loss": 1.7556, + "epoch": 0.7040313025267972, + "grad_norm": 247.8203125, + "learning_rate": 5.481156272973023e-05, + "loss": 1.2595, "step": 51100 }, { - "epoch": 0.52, - "learning_rate": 5.9585492227979276e-05, - "loss": 1.9064, + "epoch": 0.7054090545865366, + "grad_norm": 21.079259872436523, + "learning_rate": 5.48106248399706e-05, + "loss": 1.4128, "step": 51200 }, { - "epoch": 0.52, - "learning_rate": 5.970192699540083e-05, - "loss": 1.8993, + "epoch": 0.7067868066462759, + "grad_norm": 8.392303466796875, + "learning_rate": 5.4809684630033665e-05, + "loss": 1.2437, "step": 51300 }, { - "epoch": 0.52, - "learning_rate": 5.981836176282238e-05, - "loss": 1.8139, + "epoch": 0.7081645587060152, + "grad_norm": 7.86804723739624, + "learning_rate": 5.480874209999932e-05, + "loss": 1.3367, "step": 51400 }, { - "epoch": 0.52, - "learning_rate": 5.993479653024393e-05, - "loss": 1.7898, + "epoch": 0.7095423107657546, + "grad_norm": 13.094411849975586, + "learning_rate": 5.4807797249947604e-05, + "loss": 1.218, "step": 51500 }, { - "epoch": 0.53, - "learning_rate": 5.9999999858011726e-05, - "loss": 1.8265, + "epoch": 0.7109200628254939, + "grad_norm": 8.20193099975586, + "learning_rate": 5.480685007995881e-05, + "loss": 1.3033, "step": 51600 }, { - "epoch": 0.53, - "learning_rate": 5.999999847920002e-05, - "loss": 1.914, + "epoch": 0.7122978148852332, + "grad_norm": 46.50385665893555, + "learning_rate": 5.4805900590113404e-05, + "loss": 1.3322, "step": 51700 }, { - "epoch": 0.53, - "learning_rate": 5.9999995633567425e-05, - "loss": 1.7052, + "epoch": 0.7136755669449726, + "grad_norm": 126.6577377319336, + "learning_rate": 5.4804948780492044e-05, + "loss": 1.3084, "step": 51800 }, { - "epoch": 0.53, - "learning_rate": 5.999999132111407e-05, - "loss": 1.8215, + "epoch": 0.7150533190047119, + "grad_norm": 25.30513572692871, + "learning_rate": 5.4803994651175595e-05, + "loss": 1.3466, "step": 51900 }, { - "epoch": 0.53, - "learning_rate": 5.9999985541840173e-05, - "loss": 1.8507, + "epoch": 0.7164310710644513, + "grad_norm": 11.659921646118164, + "learning_rate": 5.4803038202245116e-05, + "loss": 1.1219, "step": 52000 }, { - "epoch": 0.53, - "learning_rate": 5.999997829574601e-05, - "loss": 1.6986, + "epoch": 0.7178088231241906, + "grad_norm": 16.904481887817383, + "learning_rate": 5.480207943378186e-05, + "loss": 1.0886, "step": 52100 }, { - "epoch": 0.53, - "learning_rate": 5.9999969582831936e-05, - "loss": 1.7754, + "epoch": 0.7191865751839299, + "grad_norm": 8.303357124328613, + "learning_rate": 5.480111834586728e-05, + "loss": 1.2044, "step": 52200 }, { - "epoch": 0.53, - "learning_rate": 5.999995940309839e-05, - "loss": 1.8446, + "epoch": 0.7205643272436693, + "grad_norm": 26.6663875579834, + "learning_rate": 5.480015493858302e-05, + "loss": 1.1989, "step": 52300 }, { - "epoch": 0.53, - "learning_rate": 5.999994775654585e-05, - "loss": 1.8798, + "epoch": 0.7219420793034086, + "grad_norm": 16.30890464782715, + "learning_rate": 5.479918921201093e-05, + "loss": 1.1712, "step": 52400 }, { - "epoch": 0.53, - "learning_rate": 5.99999346431749e-05, - "loss": 1.7133, + "epoch": 0.7233198313631479, + "grad_norm": 6.086356163024902, + "learning_rate": 5.479822116623306e-05, + "loss": 1.1054, "step": 52500 }, { - "epoch": 0.54, - "learning_rate": 5.999992006298617e-05, - "loss": 1.8276, + "epoch": 0.7246975834228873, + "grad_norm": 8.788448333740234, + "learning_rate": 5.4797250801331645e-05, + "loss": 1.1336, "step": 52600 }, { - "epoch": 0.54, - "learning_rate": 5.999990401598039e-05, - "loss": 1.8011, + "epoch": 0.7260753354826266, + "grad_norm": 16.002470016479492, + "learning_rate": 5.4796278117389126e-05, + "loss": 1.2275, "step": 52700 }, { - "epoch": 0.54, - "learning_rate": 5.999988650215832e-05, - "loss": 1.8984, + "epoch": 0.7274530875423659, + "grad_norm": 6.798962116241455, + "learning_rate": 5.4795303114488126e-05, + "loss": 1.2138, "step": 52800 }, { - "epoch": 0.54, - "learning_rate": 5.999986752152084e-05, - "loss": 1.7291, + "epoch": 0.7288308396021052, + "grad_norm": 11.686549186706543, + "learning_rate": 5.479432579271149e-05, + "loss": 1.2924, "step": 52900 }, { - "epoch": 0.54, - "learning_rate": 5.999984707406886e-05, - "loss": 1.6652, + "epoch": 0.7302085916618445, + "grad_norm": 86.74701690673828, + "learning_rate": 5.479334615214224e-05, + "loss": 1.2223, "step": 53000 }, { - "epoch": 0.54, - "learning_rate": 5.9999825386206776e-05, - "loss": 1.7401, + "epoch": 0.7315863437215838, + "grad_norm": 19.557451248168945, + "learning_rate": 5.4792364192863604e-05, + "loss": 1.292, "step": 53100 }, { - "epoch": 0.54, - "learning_rate": 5.999980201979701e-05, - "loss": 1.7484, + "epoch": 0.7329640957813232, + "grad_norm": 14.568532943725586, + "learning_rate": 5.4791379914959e-05, + "loss": 1.0606, "step": 53200 }, { - "epoch": 0.54, - "learning_rate": 5.9999777186575945e-05, - "loss": 1.816, + "epoch": 0.7343418478410625, + "grad_norm": 477.8893737792969, + "learning_rate": 5.479039331851205e-05, + "loss": 1.2518, "step": 53300 }, { - "epoch": 0.54, - "learning_rate": 5.9999750886544816e-05, - "loss": 1.6185, + "epoch": 0.7357195999008018, + "grad_norm": 22.306926727294922, + "learning_rate": 5.4789404403606576e-05, + "loss": 1.2761, "step": 53400 }, { - "epoch": 0.55, - "learning_rate": 5.999972311970489e-05, - "loss": 1.8986, + "epoch": 0.7370973519605412, + "grad_norm": 24.619558334350586, + "learning_rate": 5.4788413170326583e-05, + "loss": 1.1627, "step": 53500 }, { - "epoch": 0.55, - "learning_rate": 5.999969388605754e-05, - "loss": 1.8713, + "epoch": 0.7384751040202805, + "grad_norm": 25.820980072021484, + "learning_rate": 5.478741961875628e-05, + "loss": 1.2325, "step": 53600 }, { - "epoch": 0.55, - "learning_rate": 5.9999663185604177e-05, - "loss": 1.7371, + "epoch": 0.7398528560800198, + "grad_norm": 6.452317237854004, + "learning_rate": 5.4786423748980085e-05, + "loss": 1.2066, "step": 53700 }, { - "epoch": 0.55, - "learning_rate": 5.999963101834632e-05, - "loss": 1.7292, + "epoch": 0.7412306081397592, + "grad_norm": 9.134746551513672, + "learning_rate": 5.4785425561082594e-05, + "loss": 1.2001, "step": 53800 }, { - "epoch": 0.55, - "learning_rate": 5.999959738428553e-05, - "loss": 1.8653, + "epoch": 0.7426083601994985, + "grad_norm": 15.809443473815918, + "learning_rate": 5.478442505514861e-05, + "loss": 1.2269, "step": 53900 }, { - "epoch": 0.55, - "learning_rate": 5.999956228342346e-05, - "loss": 1.8762, + "epoch": 0.7439861122592378, + "grad_norm": 7.069437503814697, + "learning_rate": 5.478342223126313e-05, + "loss": 1.1525, "step": 54000 }, { - "epoch": 0.55, - "learning_rate": 5.999952608869909e-05, - "loss": 1.6714, + "epoch": 0.7453638643189772, + "grad_norm": 9.375837326049805, + "learning_rate": 5.478241708951136e-05, + "loss": 1.279, "step": 54100 }, { - "epoch": 0.55, - "learning_rate": 5.999948806890764e-05, - "loss": 1.8645, + "epoch": 0.7467416163787165, + "grad_norm": 7.450246334075928, + "learning_rate": 5.478140962997867e-05, + "loss": 1.0976, "step": 54200 }, { - "epoch": 0.55, - "learning_rate": 5.9999448582320255e-05, - "loss": 1.7443, + "epoch": 0.7481193684384558, + "grad_norm": 6.898708820343018, + "learning_rate": 5.478039985275067e-05, + "loss": 1.1062, "step": 54300 }, { - "epoch": 0.55, - "learning_rate": 5.999940762893886e-05, - "loss": 1.7701, + "epoch": 0.7494971204981952, + "grad_norm": 40.28762435913086, + "learning_rate": 5.477938775791313e-05, + "loss": 1.1483, "step": 54400 }, { - "epoch": 0.56, - "learning_rate": 5.9999365208765464e-05, - "loss": 1.8186, + "epoch": 0.7508748725579345, + "grad_norm": 14.639007568359375, + "learning_rate": 5.477837334555205e-05, + "loss": 1.2125, "step": 54500 }, { - "epoch": 0.56, - "learning_rate": 5.9999321321802126e-05, - "loss": 1.8848, + "epoch": 0.7522526246176738, + "grad_norm": 39.45486068725586, + "learning_rate": 5.47773667945226e-05, + "loss": 1.1797, "step": 54600 }, { - "epoch": 0.56, - "learning_rate": 5.999927596805101e-05, - "loss": 1.8028, + "epoch": 0.7536303766774132, + "grad_norm": 14.289118766784668, + "learning_rate": 5.477634777054623e-05, + "loss": 1.2033, "step": 54700 }, { - "epoch": 0.56, - "learning_rate": 5.999922914751432e-05, - "loss": 1.6073, + "epoch": 0.7550081287371525, + "grad_norm": 20.10711669921875, + "learning_rate": 5.477532642930456e-05, + "loss": 1.1648, "step": 54800 }, { - "epoch": 0.56, - "learning_rate": 5.999918086019435e-05, - "loss": 1.7682, + "epoch": 0.7563858807968918, + "grad_norm": 18.584131240844727, + "learning_rate": 5.477430277088439e-05, + "loss": 1.1935, "step": 54900 }, { - "epoch": 0.56, - "learning_rate": 5.999913110609347e-05, - "loss": 1.7641, + "epoch": 0.7577636328566312, + "grad_norm": 16.353309631347656, + "learning_rate": 5.477327679537266e-05, + "loss": 1.1627, "step": 55000 }, { - "epoch": 0.56, - "learning_rate": 5.999907988521409e-05, - "loss": 1.6054, + "epoch": 0.7591413849163704, + "grad_norm": 5.6780524253845215, + "learning_rate": 5.477224850285653e-05, + "loss": 1.1773, "step": 55100 }, { - "epoch": 0.56, - "learning_rate": 5.999902719755874e-05, - "loss": 1.5821, + "epoch": 0.7605191369761097, + "grad_norm": 29.311391830444336, + "learning_rate": 5.477121789342337e-05, + "loss": 1.1408, "step": 55200 }, { - "epoch": 0.56, - "learning_rate": 5.999897304312998e-05, - "loss": 1.723, + "epoch": 0.7618968890358491, + "grad_norm": 23.528329849243164, + "learning_rate": 5.477018496716073e-05, + "loss": 1.2482, "step": 55300 }, { - "epoch": 0.56, - "learning_rate": 5.999891742193047e-05, - "loss": 1.8605, + "epoch": 0.7632746410955884, + "grad_norm": 37.14783477783203, + "learning_rate": 5.476914972415636e-05, + "loss": 1.1922, "step": 55400 }, { - "epoch": 0.57, - "learning_rate": 5.9998860333962915e-05, - "loss": 1.7869, + "epoch": 0.7646523931553277, + "grad_norm": 21.471261978149414, + "learning_rate": 5.476811216449822e-05, + "loss": 1.1817, "step": 55500 }, { - "epoch": 0.57, - "learning_rate": 5.9998801779230116e-05, - "loss": 1.7645, + "epoch": 0.7660301452150671, + "grad_norm": 8.953851699829102, + "learning_rate": 5.4767072288274446e-05, + "loss": 1.2108, "step": 55600 }, { - "epoch": 0.57, - "learning_rate": 5.999874175773493e-05, - "loss": 2.0339, + "epoch": 0.7674078972748064, + "grad_norm": 2.856712579727173, + "learning_rate": 5.476603009557338e-05, + "loss": 1.1368, "step": 55700 }, { - "epoch": 0.57, - "learning_rate": 5.9998680269480304e-05, - "loss": 2.0343, + "epoch": 0.7687856493345457, + "grad_norm": 7.848891735076904, + "learning_rate": 5.4764985586483574e-05, + "loss": 1.2068, "step": 55800 }, { - "epoch": 0.57, - "learning_rate": 5.999861731446923e-05, - "loss": 1.7218, + "epoch": 0.7701634013942851, + "grad_norm": 10.449261665344238, + "learning_rate": 5.476393876109375e-05, + "loss": 1.1533, "step": 55900 }, { - "epoch": 0.57, - "learning_rate": 5.999855289270479e-05, - "loss": 1.7438, + "epoch": 0.7715411534540244, + "grad_norm": 15.309361457824707, + "learning_rate": 5.476288961949285e-05, + "loss": 1.1055, "step": 56000 }, { - "epoch": 0.57, - "learning_rate": 5.999848700419014e-05, - "loss": 1.7914, + "epoch": 0.7729189055137637, + "grad_norm": 18.263992309570312, + "learning_rate": 5.4761838161769997e-05, + "loss": 1.2973, "step": 56100 }, { - "epoch": 0.57, - "learning_rate": 5.999841964892849e-05, - "loss": 1.9114, + "epoch": 0.7742966575735031, + "grad_norm": 8.282423973083496, + "learning_rate": 5.4760784388014526e-05, + "loss": 1.0724, "step": 56200 }, { - "epoch": 0.57, - "learning_rate": 5.999835082692314e-05, - "loss": 1.814, + "epoch": 0.7756744096332424, + "grad_norm": 7.8377366065979, + "learning_rate": 5.475972829831595e-05, + "loss": 1.0374, "step": 56300 }, { - "epoch": 0.57, - "learning_rate": 5.9998280538177463e-05, - "loss": 1.8186, + "epoch": 0.7770521616929817, + "grad_norm": 40.84781265258789, + "learning_rate": 5.4758669892764e-05, + "loss": 1.1089, "step": 56400 }, { - "epoch": 0.58, - "learning_rate": 5.999820878269488e-05, - "loss": 1.6235, + "epoch": 0.7784299137527211, + "grad_norm": 6.531658172607422, + "learning_rate": 5.4757609171448586e-05, + "loss": 1.0716, "step": 56500 }, { - "epoch": 0.58, - "learning_rate": 5.9998135560478916e-05, - "loss": 1.7426, + "epoch": 0.7798076658124604, + "grad_norm": 12.462308883666992, + "learning_rate": 5.475654613445982e-05, + "loss": 1.054, "step": 56600 }, { - "epoch": 0.58, - "learning_rate": 5.99980616256829e-05, - "loss": 1.7726, + "epoch": 0.7811854178721997, + "grad_norm": 8.333971977233887, + "learning_rate": 5.475548078188803e-05, + "loss": 1.0387, "step": 56700 }, { - "epoch": 0.58, - "learning_rate": 5.9997985484678205e-05, - "loss": 1.8324, + "epoch": 0.7825631699319391, + "grad_norm": 10.69863510131836, + "learning_rate": 5.47544131138237e-05, + "loss": 1.0798, "step": 56800 }, { - "epoch": 0.58, - "learning_rate": 5.999790787695105e-05, - "loss": 1.7814, + "epoch": 0.7839409219916784, + "grad_norm": 10.501168251037598, + "learning_rate": 5.475334313035754e-05, + "loss": 1.2019, "step": 56900 }, { - "epoch": 0.58, - "learning_rate": 5.999782880250521e-05, - "loss": 1.7815, + "epoch": 0.7853186740514178, + "grad_norm": 7.4806318283081055, + "learning_rate": 5.475227083158045e-05, + "loss": 1.1566, "step": 57000 }, { - "epoch": 0.58, - "learning_rate": 5.9997748261344556e-05, - "loss": 1.9284, + "epoch": 0.7866964261111571, + "grad_norm": 9.43379020690918, + "learning_rate": 5.475119621758355e-05, + "loss": 1.0975, "step": 57100 }, { - "epoch": 0.58, - "learning_rate": 5.999766625347303e-05, - "loss": 1.8563, + "epoch": 0.7880741781708963, + "grad_norm": 22.25967025756836, + "learning_rate": 5.4750119288458095e-05, + "loss": 1.0693, "step": 57200 }, { - "epoch": 0.58, - "learning_rate": 5.999758277889464e-05, - "loss": 1.7365, + "epoch": 0.7894519302306356, + "grad_norm": 45.955928802490234, + "learning_rate": 5.4749040044295605e-05, + "loss": 1.054, "step": 57300 }, { - "epoch": 0.58, - "learning_rate": 5.9997497837613475e-05, - "loss": 1.6019, + "epoch": 0.790829682290375, + "grad_norm": 16.152667999267578, + "learning_rate": 5.474795848518776e-05, + "loss": 1.1842, "step": 57400 }, { - "epoch": 0.59, - "learning_rate": 5.999741142963368e-05, - "loss": 1.7575, + "epoch": 0.7922074343501143, + "grad_norm": 4.7594075202941895, + "learning_rate": 5.474687461122644e-05, + "loss": 1.1633, "step": 57500 }, { - "epoch": 0.59, - "learning_rate": 5.999732355495948e-05, - "loss": 1.8243, + "epoch": 0.7935851864098536, + "grad_norm": 8.323111534118652, + "learning_rate": 5.474578842250373e-05, + "loss": 1.108, "step": 57600 }, { - "epoch": 0.59, - "learning_rate": 5.999723421359517e-05, - "loss": 1.9446, + "epoch": 0.794962938469593, + "grad_norm": 30.226247787475586, + "learning_rate": 5.4744699919111895e-05, + "loss": 1.1067, "step": 57700 }, { - "epoch": 0.59, - "learning_rate": 5.9997143405545124e-05, - "loss": 1.8453, + "epoch": 0.7963406905293323, + "grad_norm": 10.015289306640625, + "learning_rate": 5.474360910114343e-05, + "loss": 1.161, "step": 57800 }, { - "epoch": 0.59, - "learning_rate": 5.999705113081378e-05, - "loss": 1.7528, + "epoch": 0.7977184425890717, + "grad_norm": 12.430333137512207, + "learning_rate": 5.474251596869099e-05, + "loss": 1.0728, "step": 57900 }, { - "epoch": 0.59, - "learning_rate": 5.999695738940565e-05, - "loss": 1.7813, + "epoch": 0.799096194648811, + "grad_norm": 16.917774200439453, + "learning_rate": 5.4741420521847457e-05, + "loss": 1.3061, "step": 58000 }, { - "epoch": 0.59, - "learning_rate": 5.999686218132531e-05, - "loss": 1.756, + "epoch": 0.8004739467085503, + "grad_norm": 23.206104278564453, + "learning_rate": 5.474032276070587e-05, + "loss": 1.1408, "step": 58100 }, { - "epoch": 0.59, - "learning_rate": 5.9996765506577425e-05, - "loss": 1.8373, + "epoch": 0.8018516987682897, + "grad_norm": 2.4593045711517334, + "learning_rate": 5.473922268535951e-05, + "loss": 1.0693, "step": 58200 }, { - "epoch": 0.59, - "learning_rate": 5.9996667365166716e-05, - "loss": 1.9234, + "epoch": 0.803229450828029, + "grad_norm": 9.281790733337402, + "learning_rate": 5.4738120295901824e-05, + "loss": 1.0999, "step": 58300 }, { - "epoch": 0.59, - "learning_rate": 5.9996568760438614e-05, - "loss": 1.7684, + "epoch": 0.8046072028877683, + "grad_norm": 94.02510833740234, + "learning_rate": 5.4737015592426474e-05, + "loss": 1.1631, "step": 58400 }, { - "epoch": 0.6, - "learning_rate": 5.9996467700383237e-05, - "loss": 1.7996, + "epoch": 0.8059849549475077, + "grad_norm": 11.41101360321045, + "learning_rate": 5.4735908575027304e-05, + "loss": 1.1112, "step": 58500 }, { - "epoch": 0.6, - "learning_rate": 5.9996365173679604e-05, - "loss": 1.7491, + "epoch": 0.807362707007247, + "grad_norm": 26.403072357177734, + "learning_rate": 5.4734799243798356e-05, + "loss": 1.079, "step": 58600 }, { - "epoch": 0.6, - "learning_rate": 5.999626118033272e-05, - "loss": 1.681, + "epoch": 0.8087404590669863, + "grad_norm": 10.233325958251953, + "learning_rate": 5.4733687598833895e-05, + "loss": 1.0384, "step": 58700 }, { - "epoch": 0.6, - "learning_rate": 5.999615572034767e-05, - "loss": 1.8637, + "epoch": 0.8101182111267257, + "grad_norm": 1124.02490234375, + "learning_rate": 5.473257364022833e-05, + "loss": 1.0471, "step": 58800 }, { - "epoch": 0.6, - "learning_rate": 5.999604879372961e-05, - "loss": 1.5885, + "epoch": 0.811495963186465, + "grad_norm": 90.05939483642578, + "learning_rate": 5.473145736807632e-05, + "loss": 1.1022, "step": 58900 }, { - "epoch": 0.6, - "learning_rate": 5.999594040048377e-05, - "loss": 1.7716, + "epoch": 0.8128737152462043, + "grad_norm": 11.457798957824707, + "learning_rate": 5.4730338782472696e-05, + "loss": 1.0113, "step": 59000 }, { - "epoch": 0.6, - "learning_rate": 5.9995830540615466e-05, - "loss": 1.7986, + "epoch": 0.8142514673059437, + "grad_norm": 7.373476505279541, + "learning_rate": 5.472921788351248e-05, + "loss": 1.1842, "step": 59100 }, { - "epoch": 0.6, - "learning_rate": 5.9995719214130035e-05, - "loss": 1.8433, + "epoch": 0.815629219365683, + "grad_norm": 42.67975997924805, + "learning_rate": 5.47280946712909e-05, + "loss": 1.0965, "step": 59200 }, { - "epoch": 0.6, - "learning_rate": 5.9995606421032954e-05, - "loss": 1.7293, + "epoch": 0.8170069714254223, + "grad_norm": 8.14940357208252, + "learning_rate": 5.472696914590338e-05, + "loss": 1.1904, "step": 59300 }, { - "epoch": 0.61, - "learning_rate": 5.9995492161329713e-05, - "loss": 1.9343, + "epoch": 0.8183847234851616, + "grad_norm": 15.231611251831055, + "learning_rate": 5.472584130744554e-05, + "loss": 1.083, "step": 59400 }, { - "epoch": 0.61, - "learning_rate": 5.999537643502592e-05, - "loss": 1.6948, + "epoch": 0.8197624755449009, + "grad_norm": 16.02197265625, + "learning_rate": 5.4724711156013194e-05, + "loss": 1.2139, "step": 59500 }, { - "epoch": 0.61, - "learning_rate": 5.999525924212721e-05, - "loss": 1.7785, + "epoch": 0.8211402276046402, + "grad_norm": 14.748459815979004, + "learning_rate": 5.472357869170236e-05, + "loss": 1.051, "step": 59600 }, { - "epoch": 0.61, - "learning_rate": 5.999514058263934e-05, - "loss": 1.8778, + "epoch": 0.8225179796643796, + "grad_norm": 4.968407154083252, + "learning_rate": 5.472244391460923e-05, + "loss": 1.1052, "step": 59700 }, { - "epoch": 0.61, - "learning_rate": 5.999502045656809e-05, - "loss": 1.8336, + "epoch": 0.8238957317241189, + "grad_norm": 15.957460403442383, + "learning_rate": 5.4721306824830225e-05, + "loss": 1.1636, "step": 59800 }, { - "epoch": 0.61, - "learning_rate": 5.999489886391934e-05, - "loss": 1.743, + "epoch": 0.8252734837838582, + "grad_norm": 16.082277297973633, + "learning_rate": 5.472016742246194e-05, + "loss": 1.0048, "step": 59900 }, { - "epoch": 0.61, - "learning_rate": 5.999477580469903e-05, - "loss": 1.7351, + "epoch": 0.8266512358435976, + "grad_norm": 16.96127700805664, + "learning_rate": 5.471902570760118e-05, + "loss": 1.1646, "step": 60000 }, { - "epoch": 0.61, - "learning_rate": 5.999465127891319e-05, - "loss": 1.9071, + "epoch": 0.8280289879033369, + "grad_norm": 10.596650123596191, + "learning_rate": 5.4717881680344935e-05, + "loss": 1.2358, "step": 60100 }, { - "epoch": 0.61, - "learning_rate": 5.99945252865679e-05, - "loss": 1.7476, + "epoch": 0.8294067399630762, + "grad_norm": 19.039791107177734, + "learning_rate": 5.471674681563149e-05, + "loss": 1.2627, "step": 60200 }, { - "epoch": 0.61, - "learning_rate": 5.9994397827669315e-05, - "loss": 1.7562, + "epoch": 0.8307844920228156, + "grad_norm": 16.778432846069336, + "learning_rate": 5.471559818699756e-05, + "loss": 1.1974, "step": 60300 }, { - "epoch": 0.62, - "learning_rate": 5.999426890222368e-05, - "loss": 1.8382, + "epoch": 0.8321622440825549, + "grad_norm": 495.1744689941406, + "learning_rate": 5.471444724625934e-05, + "loss": 1.1441, "step": 60400 }, { - "epoch": 0.62, - "learning_rate": 5.999413982141651e-05, - "loss": 1.7408, + "epoch": 0.8335399961422942, + "grad_norm": 18.594533920288086, + "learning_rate": 5.47132939935146e-05, + "loss": 1.2501, "step": 60500 }, { - "epoch": 0.62, - "learning_rate": 5.999400797756105e-05, - "loss": 1.8513, + "epoch": 0.8349177482020336, + "grad_norm": 15.665253639221191, + "learning_rate": 5.471213842886131e-05, + "loss": 1.1846, "step": 60600 }, { - "epoch": 0.62, - "learning_rate": 5.999387466717761e-05, - "loss": 1.9462, + "epoch": 0.8362955002617729, + "grad_norm": 8.79216480255127, + "learning_rate": 5.4710980552397636e-05, + "loss": 1.1777, "step": 60700 }, { - "epoch": 0.62, - "learning_rate": 5.9993739890272686e-05, - "loss": 1.9378, + "epoch": 0.8376732523215122, + "grad_norm": 80.83506774902344, + "learning_rate": 5.470982036422196e-05, + "loss": 1.0193, "step": 60800 }, { - "epoch": 0.62, - "learning_rate": 5.999360364685287e-05, - "loss": 1.7939, + "epoch": 0.8390510043812516, + "grad_norm": 69.26873016357422, + "learning_rate": 5.4708657864432836e-05, + "loss": 1.2296, "step": 60900 }, { - "epoch": 0.62, - "learning_rate": 5.9993465936924825e-05, - "loss": 1.6532, + "epoch": 0.8404287564409909, + "grad_norm": 23.37373924255371, + "learning_rate": 5.4707493053129036e-05, + "loss": 1.0475, "step": 61000 }, { - "epoch": 0.62, - "learning_rate": 5.9993326760495296e-05, - "loss": 1.6977, + "epoch": 0.8418065085007302, + "grad_norm": 43.963623046875, + "learning_rate": 5.470632593040951e-05, + "loss": 1.1772, "step": 61100 }, { - "epoch": 0.62, - "learning_rate": 5.999318611757107e-05, - "loss": 1.8962, + "epoch": 0.8431842605604696, + "grad_norm": 17.73714828491211, + "learning_rate": 5.4705156496373395e-05, + "loss": 1.1627, "step": 61200 }, { - "epoch": 0.62, - "learning_rate": 5.999304400815905e-05, - "loss": 1.754, + "epoch": 0.8445620126202089, + "grad_norm": 27.438568115234375, + "learning_rate": 5.470398475112007e-05, + "loss": 1.2035, "step": 61300 }, { - "epoch": 0.63, - "learning_rate": 5.9992900432266154e-05, - "loss": 1.8434, + "epoch": 0.8459397646799482, + "grad_norm": 100.108154296875, + "learning_rate": 5.470281069474906e-05, + "loss": 1.0764, "step": 61400 }, { - "epoch": 0.63, - "learning_rate": 5.9992755389899406e-05, - "loss": 1.7021, + "epoch": 0.8473175167396875, + "grad_norm": 27.32681655883789, + "learning_rate": 5.4701634327360105e-05, + "loss": 1.1767, "step": 61500 }, { - "epoch": 0.63, - "learning_rate": 5.9992608881065925e-05, - "loss": 1.9396, + "epoch": 0.8486952687994268, + "grad_norm": 18.406723022460938, + "learning_rate": 5.4700455649053164e-05, + "loss": 1.0829, "step": 61600 }, { - "epoch": 0.63, - "learning_rate": 5.999246090577284e-05, - "loss": 1.757, + "epoch": 0.8500730208591661, + "grad_norm": 14.237214088439941, + "learning_rate": 5.469927465992835e-05, + "loss": 1.0697, "step": 61700 }, { - "epoch": 0.63, - "learning_rate": 5.999231146402742e-05, - "loss": 1.9366, + "epoch": 0.8514507729189055, + "grad_norm": 17.61935043334961, + "learning_rate": 5.4698091360086004e-05, + "loss": 1.1711, "step": 61800 }, { - "epoch": 0.63, - "learning_rate": 5.999216055583694e-05, - "loss": 1.7092, + "epoch": 0.8528285249786448, + "grad_norm": 11.328566551208496, + "learning_rate": 5.469690574962666e-05, + "loss": 1.1212, "step": 61900 }, { - "epoch": 0.63, - "learning_rate": 5.99920081812088e-05, - "loss": 1.8041, + "epoch": 0.8542062770383841, + "grad_norm": 21.75775718688965, + "learning_rate": 5.469571782865103e-05, + "loss": 1.1068, "step": 62000 }, { - "epoch": 0.63, - "learning_rate": 5.999185434015044e-05, - "loss": 1.8746, + "epoch": 0.8555840290981235, + "grad_norm": 8.026041984558105, + "learning_rate": 5.469452759726004e-05, + "loss": 1.1298, "step": 62100 }, { - "epoch": 0.63, - "learning_rate": 5.9991699032669384e-05, - "loss": 1.8011, + "epoch": 0.8569617811578628, + "grad_norm": 48.023170471191406, + "learning_rate": 5.469333505555479e-05, + "loss": 1.1877, "step": 62200 }, { - "epoch": 0.63, - "learning_rate": 5.9991542258773225e-05, - "loss": 1.7387, + "epoch": 0.8583395332176021, + "grad_norm": 25.07050895690918, + "learning_rate": 5.469214020363662e-05, + "loss": 1.1046, "step": 62300 }, { - "epoch": 0.64, - "learning_rate": 5.999138401846964e-05, - "loss": 1.9458, + "epoch": 0.8597172852773415, + "grad_norm": 6.722031116485596, + "learning_rate": 5.4690943041607023e-05, + "loss": 1.0693, "step": 62400 }, { - "epoch": 0.64, - "learning_rate": 5.999122431176634e-05, - "loss": 1.9475, + "epoch": 0.8610950373370808, + "grad_norm": 15.410558700561523, + "learning_rate": 5.468974356956771e-05, + "loss": 1.2216, "step": 62500 }, { - "epoch": 0.64, - "learning_rate": 5.9991063138671167e-05, - "loss": 1.7204, + "epoch": 0.8624727893968201, + "grad_norm": 19.42689323425293, + "learning_rate": 5.468854178762058e-05, + "loss": 1.1332, "step": 62600 }, { - "epoch": 0.64, - "learning_rate": 5.999090049919198e-05, - "loss": 1.7923, + "epoch": 0.8638505414565595, + "grad_norm": 24.845836639404297, + "learning_rate": 5.468733769586772e-05, + "loss": 1.2004, "step": 62700 }, { - "epoch": 0.64, - "learning_rate": 5.9990736393336724e-05, - "loss": 1.7572, + "epoch": 0.8652282935162988, + "grad_norm": 17.53343963623047, + "learning_rate": 5.4686131294411444e-05, + "loss": 1.22, "step": 62800 }, { - "epoch": 0.64, - "learning_rate": 5.999057082111344e-05, - "loss": 1.8297, + "epoch": 0.8666060455760382, + "grad_norm": 6.367466449737549, + "learning_rate": 5.468492258335422e-05, + "loss": 1.0237, "step": 62900 }, { - "epoch": 0.64, - "learning_rate": 5.999040378253021e-05, - "loss": 1.9533, + "epoch": 0.8679837976357775, + "grad_norm": 11.674467086791992, + "learning_rate": 5.468371156279876e-05, + "loss": 1.1647, "step": 63000 }, { - "epoch": 0.64, - "learning_rate": 5.9990235277595224e-05, - "loss": 1.8337, + "epoch": 0.8693615496955168, + "grad_norm": 8.33002758026123, + "learning_rate": 5.468249823284792e-05, + "loss": 1.1642, "step": 63100 }, { - "epoch": 0.64, - "learning_rate": 5.999006530631669e-05, - "loss": 1.6718, + "epoch": 0.8707393017552562, + "grad_norm": 11.837431907653809, + "learning_rate": 5.4681282593604794e-05, + "loss": 1.0929, "step": 63200 }, { - "epoch": 0.64, - "learning_rate": 5.9989893868702944e-05, - "loss": 1.8575, + "epoch": 0.8721170538149955, + "grad_norm": 15.618926048278809, + "learning_rate": 5.4680064645172656e-05, + "loss": 1.1287, "step": 63300 }, { - "epoch": 0.65, - "learning_rate": 5.9989720964762355e-05, - "loss": 1.7373, + "epoch": 0.8734948058747348, + "grad_norm": 11.067187309265137, + "learning_rate": 5.467884438765497e-05, + "loss": 1.3, "step": 63400 }, { - "epoch": 0.65, - "learning_rate": 5.9989546594503374e-05, - "loss": 1.7246, + "epoch": 0.8748725579344742, + "grad_norm": 7.3953537940979, + "learning_rate": 5.467762182115541e-05, + "loss": 1.1744, "step": 63500 }, { - "epoch": 0.65, - "learning_rate": 5.9989370757934544e-05, - "loss": 1.7, + "epoch": 0.8762503099942135, + "grad_norm": 14.800235748291016, + "learning_rate": 5.467639694577783e-05, + "loss": 1.176, "step": 63600 }, { - "epoch": 0.65, - "learning_rate": 5.9989193455064445e-05, - "loss": 1.7548, + "epoch": 0.8776280620539527, + "grad_norm": 24.887876510620117, + "learning_rate": 5.46751697616263e-05, + "loss": 1.127, "step": 63700 }, { - "epoch": 0.65, - "learning_rate": 5.998901468590175e-05, - "loss": 1.684, + "epoch": 0.879005814113692, + "grad_norm": 67.39167785644531, + "learning_rate": 5.4673940268805074e-05, + "loss": 1.1131, "step": 63800 }, { - "epoch": 0.65, - "learning_rate": 5.99888344504552e-05, - "loss": 1.7344, + "epoch": 0.8803835661734314, + "grad_norm": 12.841975212097168, + "learning_rate": 5.46727084674186e-05, + "loss": 1.1908, "step": 63900 }, { - "epoch": 0.65, - "learning_rate": 5.9988652748733614e-05, - "loss": 1.623, + "epoch": 0.8817613182331707, + "grad_norm": 8.85670280456543, + "learning_rate": 5.4671474357571525e-05, + "loss": 1.0991, "step": 64000 }, { - "epoch": 0.65, - "learning_rate": 5.998846958074588e-05, - "loss": 1.7377, + "epoch": 0.8831390702929101, + "grad_norm": 6.850118160247803, + "learning_rate": 5.46702379393687e-05, + "loss": 1.0663, "step": 64100 }, { - "epoch": 0.65, - "learning_rate": 5.9988284946500933e-05, - "loss": 1.8485, + "epoch": 0.8845168223526494, + "grad_norm": 22.98515510559082, + "learning_rate": 5.466899921291516e-05, + "loss": 1.059, "step": 64200 }, { - "epoch": 0.66, - "learning_rate": 5.9988098846007824e-05, - "loss": 1.8087, + "epoch": 0.8858945744123887, + "grad_norm": 11.451286315917969, + "learning_rate": 5.466775817831614e-05, + "loss": 1.1818, "step": 64300 }, { - "epoch": 0.66, - "learning_rate": 5.9987911279275625e-05, - "loss": 1.7151, + "epoch": 0.8872723264721281, + "grad_norm": 7.088813781738281, + "learning_rate": 5.4666514835677075e-05, + "loss": 1.1725, "step": 64400 }, { - "epoch": 0.66, - "learning_rate": 5.998772224631353e-05, - "loss": 1.6868, + "epoch": 0.8886500785318674, + "grad_norm": 11.199012756347656, + "learning_rate": 5.466526918510358e-05, + "loss": 1.1438, "step": 64500 }, { - "epoch": 0.66, - "learning_rate": 5.998753174713078e-05, - "loss": 1.7295, + "epoch": 0.8900278305916067, + "grad_norm": 9.510293960571289, + "learning_rate": 5.4664033717708927e-05, + "loss": 1.1066, "step": 64600 }, { - "epoch": 0.66, - "learning_rate": 5.9987339781736676e-05, - "loss": 1.6257, + "epoch": 0.8914055826513461, + "grad_norm": 10.846967697143555, + "learning_rate": 5.4662783474660975e-05, + "loss": 1.2442, "step": 64700 }, { - "epoch": 0.66, - "learning_rate": 5.998714635014062e-05, - "loss": 1.6688, + "epoch": 0.8927833347110854, + "grad_norm": 12.702670097351074, + "learning_rate": 5.4661530923995595e-05, + "loss": 1.023, "step": 64800 }, { - "epoch": 0.66, - "learning_rate": 5.9986951452352056e-05, - "loss": 1.8821, + "epoch": 0.8941610867708247, + "grad_norm": 5.255120754241943, + "learning_rate": 5.4660276065819214e-05, + "loss": 1.1045, "step": 64900 }, { - "epoch": 0.66, - "learning_rate": 5.998675508838052e-05, - "loss": 1.8325, + "epoch": 0.8955388388305641, + "grad_norm": 27.506940841674805, + "learning_rate": 5.465901890023843e-05, + "loss": 1.0864, "step": 65000 }, { - "epoch": 0.66, - "learning_rate": 5.9986557258235614e-05, - "loss": 1.695, + "epoch": 0.8969165908903034, + "grad_norm": 7.346675395965576, + "learning_rate": 5.465775942736004e-05, + "loss": 1.0771, "step": 65100 }, { - "epoch": 0.66, - "learning_rate": 5.9986357961927005e-05, - "loss": 1.6766, + "epoch": 0.8982943429500427, + "grad_norm": 12.305924415588379, + "learning_rate": 5.465649764729106e-05, + "loss": 1.0462, "step": 65200 }, { - "epoch": 0.67, - "learning_rate": 5.998615719946444e-05, - "loss": 1.7103, + "epoch": 0.8996720950097821, + "grad_norm": 16.626296997070312, + "learning_rate": 5.4655233560138655e-05, + "loss": 1.164, "step": 65300 }, { - "epoch": 0.67, - "learning_rate": 5.998595497085774e-05, - "loss": 1.7861, + "epoch": 0.9010498470695214, + "grad_norm": 11.740323066711426, + "learning_rate": 5.465396716601025e-05, + "loss": 1.1718, "step": 65400 }, { - "epoch": 0.67, - "learning_rate": 5.998575127611678e-05, - "loss": 1.7178, + "epoch": 0.9024275991292607, + "grad_norm": 13.916251182556152, + "learning_rate": 5.465269846501341e-05, + "loss": 1.141, "step": 65500 }, { - "epoch": 0.67, - "learning_rate": 5.9985546115251534e-05, - "loss": 1.771, + "epoch": 0.9038053511890001, + "grad_norm": 8.259647369384766, + "learning_rate": 5.465142745725592e-05, + "loss": 1.1684, "step": 65600 }, { - "epoch": 0.67, - "learning_rate": 5.998534156179906e-05, - "loss": 1.6728, + "epoch": 0.9051831032487394, + "grad_norm": 5.774639129638672, + "learning_rate": 5.465015414284577e-05, + "loss": 1.1246, "step": 65700 }, { - "epoch": 0.67, - "learning_rate": 5.9985133483376386e-05, - "loss": 1.6888, + "epoch": 0.9065608553084787, + "grad_norm": 5.18399715423584, + "learning_rate": 5.464887852189112e-05, + "loss": 1.2111, "step": 65800 }, { - "epoch": 0.67, - "learning_rate": 5.998492393885964e-05, - "loss": 1.7414, + "epoch": 0.907938607368218, + "grad_norm": 14.119433403015137, + "learning_rate": 5.464760059450035e-05, + "loss": 1.0581, "step": 65900 }, { - "epoch": 0.67, - "learning_rate": 5.998471292825903e-05, - "loss": 1.711, + "epoch": 0.9093163594279573, + "grad_norm": 9.2739896774292, + "learning_rate": 5.4646320360782015e-05, + "loss": 1.0555, "step": 66000 }, { - "epoch": 0.67, - "learning_rate": 5.998450045158491e-05, - "loss": 1.7552, + "epoch": 0.9106941114876966, + "grad_norm": 68.081787109375, + "learning_rate": 5.4645037820844895e-05, + "loss": 1.073, "step": 66100 }, { - "epoch": 0.67, - "learning_rate": 5.9984286508847666e-05, - "loss": 1.7269, + "epoch": 0.912071863547436, + "grad_norm": 155.14369201660156, + "learning_rate": 5.464375297479794e-05, + "loss": 1.0979, "step": 66200 }, { - "epoch": 0.68, - "learning_rate": 5.998407110005773e-05, - "loss": 1.6826, + "epoch": 0.9134496156071753, + "grad_norm": 12.8380708694458, + "learning_rate": 5.4642465822750296e-05, + "loss": 1.1742, "step": 66300 }, { - "epoch": 0.68, - "learning_rate": 5.9983854225225655e-05, - "loss": 1.8838, + "epoch": 0.9148273676669146, + "grad_norm": 29.779661178588867, + "learning_rate": 5.464117636481132e-05, + "loss": 1.1311, "step": 66400 }, { - "epoch": 0.68, - "learning_rate": 5.998363588436206e-05, - "loss": 1.7023, + "epoch": 0.916205119726654, + "grad_norm": 10.672758102416992, + "learning_rate": 5.463988460109056e-05, + "loss": 1.0626, "step": 66500 }, { - "epoch": 0.68, - "learning_rate": 5.998341607747759e-05, - "loss": 1.6921, + "epoch": 0.9175828717863933, + "grad_norm": 23.137615203857422, + "learning_rate": 5.463859053169776e-05, + "loss": 1.162, "step": 66600 }, { - "epoch": 0.68, - "learning_rate": 5.9983194804583006e-05, - "loss": 1.7901, + "epoch": 0.9189606238461326, + "grad_norm": 21.966482162475586, + "learning_rate": 5.4637294156742854e-05, + "loss": 1.213, "step": 66700 }, { - "epoch": 0.68, - "learning_rate": 5.9982972065689135e-05, - "loss": 1.6776, + "epoch": 0.920338375905872, + "grad_norm": 23.99468231201172, + "learning_rate": 5.463600847455167e-05, + "loss": 1.262, "step": 66800 }, { - "epoch": 0.68, - "learning_rate": 5.998274786080685e-05, - "loss": 1.7197, + "epoch": 0.9217161279656113, + "grad_norm": 6.36064338684082, + "learning_rate": 5.4634707511856026e-05, + "loss": 1.2435, "step": 66900 }, { - "epoch": 0.68, - "learning_rate": 5.998252218994712e-05, - "loss": 1.7005, + "epoch": 0.9230938800253506, + "grad_norm": 103.19817352294922, + "learning_rate": 5.4633404243928144e-05, + "loss": 1.1248, "step": 67000 }, { - "epoch": 0.68, - "learning_rate": 5.998229505312098e-05, - "loss": 1.5663, + "epoch": 0.92447163208509, + "grad_norm": 97.3226089477539, + "learning_rate": 5.463209867087877e-05, + "loss": 1.3189, "step": 67100 }, { - "epoch": 0.68, - "learning_rate": 5.998206645033955e-05, - "loss": 1.6845, + "epoch": 0.9258493841448293, + "grad_norm": 13.455120086669922, + "learning_rate": 5.463079079281879e-05, + "loss": 1.1353, "step": 67200 }, { - "epoch": 0.69, - "learning_rate": 5.9981836381613985e-05, - "loss": 1.7288, + "epoch": 0.9272271362045686, + "grad_norm": 9.061980247497559, + "learning_rate": 5.462948060985935e-05, + "loss": 1.2046, "step": 67300 }, { - "epoch": 0.69, - "learning_rate": 5.998160484695554e-05, - "loss": 1.8569, + "epoch": 0.928604888264308, + "grad_norm": 25.758026123046875, + "learning_rate": 5.462816812211173e-05, + "loss": 1.1738, "step": 67400 }, { - "epoch": 0.69, - "learning_rate": 5.998137184637555e-05, - "loss": 1.8241, + "epoch": 0.9299826403240473, + "grad_norm": 5.642591953277588, + "learning_rate": 5.4626853329687444e-05, + "loss": 1.1939, "step": 67500 }, { - "epoch": 0.69, - "learning_rate": 5.998113737988538e-05, - "loss": 1.5569, + "epoch": 0.9313603923837867, + "grad_norm": 10.177616119384766, + "learning_rate": 5.4625536232698185e-05, + "loss": 1.1388, "step": 67600 }, { - "epoch": 0.69, - "learning_rate": 5.998090144749653e-05, - "loss": 1.6191, + "epoch": 0.932738144443526, + "grad_norm": 30.894012451171875, + "learning_rate": 5.4624216831255856e-05, + "loss": 1.0834, "step": 67700 }, { - "epoch": 0.69, - "learning_rate": 5.99806640492205e-05, - "loss": 1.8349, + "epoch": 0.9341158965032653, + "grad_norm": 7.21911096572876, + "learning_rate": 5.462289512547254e-05, + "loss": 1.1439, "step": 67800 }, { - "epoch": 0.69, - "learning_rate": 5.9980425185068914e-05, - "loss": 1.7102, + "epoch": 0.9354936485630047, + "grad_norm": 15.170910835266113, + "learning_rate": 5.462157111546052e-05, + "loss": 1.1791, "step": 67900 }, { - "epoch": 0.69, - "learning_rate": 5.998018485505346e-05, - "loss": 1.708, + "epoch": 0.9368714006227439, + "grad_norm": 19.506067276000977, + "learning_rate": 5.462024480133228e-05, + "loss": 1.263, "step": 68000 }, { - "epoch": 0.69, - "learning_rate": 5.9979943059185874e-05, - "loss": 1.8039, + "epoch": 0.9382491526824832, + "grad_norm": 48.98276901245117, + "learning_rate": 5.461891618320049e-05, + "loss": 1.1762, "step": 68100 }, { - "epoch": 0.69, - "learning_rate": 5.997969979747799e-05, - "loss": 1.7602, + "epoch": 0.9396269047422225, + "grad_norm": 6.124650478363037, + "learning_rate": 5.4617585261178045e-05, + "loss": 1.0375, "step": 68200 }, { - "epoch": 0.7, - "learning_rate": 5.9979455069941686e-05, - "loss": 1.8215, + "epoch": 0.9410046568019619, + "grad_norm": 11.622027397155762, + "learning_rate": 5.461625203537799e-05, + "loss": 1.1505, "step": 68300 }, { - "epoch": 0.7, - "learning_rate": 5.997920887658894e-05, - "loss": 1.7234, + "epoch": 0.9423824088617012, + "grad_norm": 9.864316940307617, + "learning_rate": 5.4614916505913604e-05, + "loss": 1.0624, "step": 68400 }, { - "epoch": 0.7, - "learning_rate": 5.997896121743179e-05, - "loss": 1.6635, + "epoch": 0.9437601609214405, + "grad_norm": 28.526643753051758, + "learning_rate": 5.4613578672898343e-05, + "loss": 1.1675, "step": 68500 }, { - "epoch": 0.7, - "learning_rate": 5.997871209248233e-05, - "loss": 1.722, + "epoch": 0.9451379129811799, + "grad_norm": 9.211615562438965, + "learning_rate": 5.461223853644585e-05, + "loss": 1.1768, "step": 68600 }, { - "epoch": 0.7, - "learning_rate": 5.997846150175276e-05, - "loss": 1.6843, + "epoch": 0.9465156650409192, + "grad_norm": 17.06532096862793, + "learning_rate": 5.461089609667e-05, + "loss": 1.0767, "step": 68700 }, { - "epoch": 0.7, - "learning_rate": 5.997820944525533e-05, - "loss": 1.6984, + "epoch": 0.9478934171006586, + "grad_norm": 5.959560871124268, + "learning_rate": 5.460955135368483e-05, + "loss": 1.1172, "step": 68800 }, { - "epoch": 0.7, - "learning_rate": 5.9977955923002354e-05, - "loss": 1.855, + "epoch": 0.9492711691603979, + "grad_norm": 12.894081115722656, + "learning_rate": 5.460820430760457e-05, + "loss": 1.1053, "step": 68900 }, { - "epoch": 0.7, - "learning_rate": 5.9977700935006236e-05, - "loss": 1.6806, + "epoch": 0.9506489212201372, + "grad_norm": 61.03858184814453, + "learning_rate": 5.460685495854367e-05, + "loss": 1.1451, "step": 69000 }, { - "epoch": 0.7, - "learning_rate": 5.997744448127943e-05, - "loss": 1.733, + "epoch": 0.9520266732798766, + "grad_norm": 55.478179931640625, + "learning_rate": 5.460550330661677e-05, + "loss": 1.1774, "step": 69100 }, { - "epoch": 0.71, - "learning_rate": 5.9977189148284206e-05, - "loss": 1.7506, + "epoch": 0.9534044253396159, + "grad_norm": 120.8309555053711, + "learning_rate": 5.46041493519387e-05, + "loss": 1.0696, "step": 69200 }, { - "epoch": 0.71, - "learning_rate": 5.997692977779074e-05, - "loss": 1.7328, + "epoch": 0.9547821773993552, + "grad_norm": 15.431912422180176, + "learning_rate": 5.460279309462447e-05, + "loss": 1.1694, "step": 69300 }, { - "epoch": 0.71, - "learning_rate": 5.997666894160429e-05, - "loss": 1.7325, + "epoch": 0.9561599294590946, + "grad_norm": 13.145162582397461, + "learning_rate": 5.460143453478931e-05, + "loss": 1.1647, "step": 69400 }, { - "epoch": 0.71, - "learning_rate": 5.997640663973762e-05, - "loss": 1.695, + "epoch": 0.9575376815188339, + "grad_norm": 9.228443145751953, + "learning_rate": 5.460007367254863e-05, + "loss": 1.1053, "step": 69500 }, { - "epoch": 0.71, - "learning_rate": 5.997614287220355e-05, - "loss": 1.5971, + "epoch": 0.9589154335785732, + "grad_norm": 15.133450508117676, + "learning_rate": 5.459871050801806e-05, + "loss": 1.0844, "step": 69600 }, { - "epoch": 0.71, - "learning_rate": 5.997587763901499e-05, - "loss": 1.6594, + "epoch": 0.9602931856383126, + "grad_norm": 39.60796356201172, + "learning_rate": 5.459734504131339e-05, + "loss": 1.1495, "step": 69700 }, { - "epoch": 0.71, - "learning_rate": 5.9975610940184904e-05, - "loss": 1.836, + "epoch": 0.9616709376980519, + "grad_norm": 195.38800048828125, + "learning_rate": 5.4595977272550626e-05, + "loss": 1.0921, "step": 69800 }, { - "epoch": 0.71, - "learning_rate": 5.997534277572632e-05, - "loss": 1.5829, + "epoch": 0.9630486897577912, + "grad_norm": 10.106812477111816, + "learning_rate": 5.459460720184598e-05, + "loss": 1.1482, "step": 69900 }, { - "epoch": 0.71, - "learning_rate": 5.997507314565236e-05, - "loss": 1.7333, + "epoch": 0.9644264418175306, + "grad_norm": 19.906232833862305, + "learning_rate": 5.4593234829315834e-05, + "loss": 1.1633, "step": 70000 }, { - "epoch": 0.71, - "learning_rate": 5.997480204997621e-05, - "loss": 1.7039, + "epoch": 0.9658041938772699, + "grad_norm": 14.408281326293945, + "learning_rate": 5.459186015507678e-05, + "loss": 1.249, "step": 70100 }, { - "epoch": 0.72, - "learning_rate": 5.997452948871112e-05, - "loss": 1.6867, + "epoch": 0.9671819459370091, + "grad_norm": 6.585309982299805, + "learning_rate": 5.459048317924561e-05, + "loss": 1.1562, "step": 70200 }, { - "epoch": 0.72, - "learning_rate": 5.9974255461870406e-05, - "loss": 1.5479, + "epoch": 0.9685596979967485, + "grad_norm": 16.187843322753906, + "learning_rate": 5.458910390193929e-05, + "loss": 1.0716, "step": 70300 }, { - "epoch": 0.72, - "learning_rate": 5.997397996946749e-05, - "loss": 1.6165, + "epoch": 0.9699374500564878, + "grad_norm": 25.06880760192871, + "learning_rate": 5.458772232327501e-05, + "loss": 1.1231, "step": 70400 }, { - "epoch": 0.72, - "learning_rate": 5.997370301151582e-05, - "loss": 1.6625, + "epoch": 0.9713152021162271, + "grad_norm": 9.305314064025879, + "learning_rate": 5.458633844337015e-05, + "loss": 1.1219, "step": 70500 }, { - "epoch": 0.72, - "learning_rate": 5.9973424588028944e-05, - "loss": 1.7098, + "epoch": 0.9726929541759665, + "grad_norm": 49.83774948120117, + "learning_rate": 5.458495226234225e-05, + "loss": 1.044, "step": 70600 }, { - "epoch": 0.72, - "learning_rate": 5.997314469902048e-05, - "loss": 1.5972, + "epoch": 0.9740707062357058, + "grad_norm": 10.125755310058594, + "learning_rate": 5.458357767651902e-05, + "loss": 1.0559, "step": 70700 }, { - "epoch": 0.72, - "learning_rate": 5.997286334450412e-05, - "loss": 1.5813, + "epoch": 0.9754484582954451, + "grad_norm": 30.343183517456055, + "learning_rate": 5.458218691660685e-05, + "loss": 1.1131, "step": 70800 }, { - "epoch": 0.72, - "learning_rate": 5.997258052449361e-05, - "loss": 1.683, + "epoch": 0.9768262103551845, + "grad_norm": 9.028037071228027, + "learning_rate": 5.4580793855924345e-05, + "loss": 1.0412, "step": 70900 }, { - "epoch": 0.72, - "learning_rate": 5.997229623900277e-05, - "loss": 1.7293, + "epoch": 0.9782039624149238, + "grad_norm": 3.4295129776000977, + "learning_rate": 5.457939849458987e-05, + "loss": 0.9659, "step": 71000 }, { - "epoch": 0.72, - "learning_rate": 5.997201048804552e-05, - "loss": 1.6516, + "epoch": 0.9795817144746631, + "grad_norm": 9.905821800231934, + "learning_rate": 5.457800083272196e-05, + "loss": 1.0396, "step": 71100 }, { - "epoch": 0.73, - "learning_rate": 5.9971723271635806e-05, - "loss": 1.6689, + "epoch": 0.9809594665344025, + "grad_norm": 161.8009033203125, + "learning_rate": 5.4576600870439336e-05, + "loss": 1.0404, "step": 71200 }, { - "epoch": 0.73, - "learning_rate": 5.997143458978769e-05, - "loss": 1.5891, + "epoch": 0.9823372185941418, + "grad_norm": 8.669951438903809, + "learning_rate": 5.4575212641872804e-05, + "loss": 1.1563, "step": 71300 }, { - "epoch": 0.73, - "learning_rate": 5.9971144442515293e-05, - "loss": 1.7378, + "epoch": 0.9837149706538811, + "grad_norm": 37.04841613769531, + "learning_rate": 5.4573808102118954e-05, + "loss": 1.0112, "step": 71400 }, { - "epoch": 0.73, - "learning_rate": 5.9970852829832785e-05, - "loss": 1.7722, + "epoch": 0.9850927227136205, + "grad_norm": 16.401836395263672, + "learning_rate": 5.45724012623066e-05, + "loss": 1.1272, "step": 71500 }, { - "epoch": 0.73, - "learning_rate": 5.997056268978888e-05, - "loss": 1.6847, + "epoch": 0.9864704747733598, + "grad_norm": 12.509251594543457, + "learning_rate": 5.4570992122555254e-05, + "loss": 1.1502, "step": 71600 }, { - "epoch": 0.73, - "learning_rate": 5.997026816098274e-05, - "loss": 1.6061, + "epoch": 0.9878482268330991, + "grad_norm": 6.93540096282959, + "learning_rate": 5.456958068298463e-05, + "loss": 1.0207, "step": 71700 }, { - "epoch": 0.73, - "learning_rate": 5.9969972166809356e-05, - "loss": 1.7402, + "epoch": 0.9892259788928385, + "grad_norm": 19.594242095947266, + "learning_rate": 5.4568166943714654e-05, + "loss": 1.0321, "step": 71800 }, { - "epoch": 0.73, - "learning_rate": 5.996967470728319e-05, - "loss": 1.6667, + "epoch": 0.9906037309525778, + "grad_norm": 18.886825561523438, + "learning_rate": 5.45667509048654e-05, + "loss": 1.0534, "step": 71900 }, { - "epoch": 0.73, - "learning_rate": 5.996937578241877e-05, - "loss": 1.8258, + "epoch": 0.9919814830123171, + "grad_norm": 8.338887214660645, + "learning_rate": 5.45653325665572e-05, + "loss": 1.092, "step": 72000 }, { - "epoch": 0.73, - "learning_rate": 5.996907539223072e-05, - "loss": 1.664, + "epoch": 0.9933592350720565, + "grad_norm": 8.154275894165039, + "learning_rate": 5.4563911928910524e-05, + "loss": 1.0882, "step": 72100 }, { - "epoch": 0.74, - "learning_rate": 5.996877353673373e-05, - "loss": 1.767, + "epoch": 0.9947369871317958, + "grad_norm": 13.929986000061035, + "learning_rate": 5.456248899204607e-05, + "loss": 0.9251, "step": 72200 }, { - "epoch": 0.74, - "learning_rate": 5.9968470215942564e-05, - "loss": 1.7394, + "epoch": 0.996114739191535, + "grad_norm": 46.77289581298828, + "learning_rate": 5.456106375608472e-05, + "loss": 1.0503, "step": 72300 }, { - "epoch": 0.74, - "learning_rate": 5.9968165429872044e-05, - "loss": 1.7196, + "epoch": 0.9974924912512744, + "grad_norm": 19.588340759277344, + "learning_rate": 5.455963622114758e-05, + "loss": 1.1006, "step": 72400 }, { - "epoch": 0.74, - "learning_rate": 5.9967859178537076e-05, - "loss": 1.7572, + "epoch": 0.9988702433110137, + "grad_norm": 12.388969421386719, + "learning_rate": 5.455820638735589e-05, + "loss": 1.0414, "step": 72500 }, { - "epoch": 0.74, - "learning_rate": 5.996755146195264e-05, - "loss": 1.731, + "epoch": 1.000247995370753, + "grad_norm": 15.652881622314453, + "learning_rate": 5.4556774254831144e-05, + "loss": 1.0677, "step": 72600 }, { - "epoch": 0.74, - "learning_rate": 5.996724228013375e-05, - "loss": 1.7933, + "epoch": 1.0016257474304924, + "grad_norm": 32.35802459716797, + "learning_rate": 5.4555339823695003e-05, + "loss": 1.0572, "step": 72700 }, { - "epoch": 0.74, - "learning_rate": 5.996693163309557e-05, - "loss": 1.7275, + "epoch": 1.0030034994902317, + "grad_norm": 7.339658260345459, + "learning_rate": 5.4553903094069325e-05, + "loss": 1.0814, "step": 72800 }, { - "epoch": 0.74, - "learning_rate": 5.9966619520853266e-05, - "loss": 1.7643, + "epoch": 1.004381251549971, + "grad_norm": 16.973472595214844, + "learning_rate": 5.455246406607618e-05, + "loss": 1.1038, "step": 72900 }, { - "epoch": 0.74, - "learning_rate": 5.9966305943422094e-05, - "loss": 1.7486, + "epoch": 1.0057590036097104, + "grad_norm": 20.264204025268555, + "learning_rate": 5.455102273983781e-05, + "loss": 1.0272, "step": 73000 }, { - "epoch": 0.74, - "learning_rate": 5.9965990900817385e-05, - "loss": 1.8221, + "epoch": 1.0071367556694497, + "grad_norm": 7.178906440734863, + "learning_rate": 5.4549579115476665e-05, + "loss": 1.0812, "step": 73100 }, { - "epoch": 0.75, - "learning_rate": 5.9965674393054556e-05, - "loss": 1.7577, + "epoch": 1.008514507729189, + "grad_norm": 13.878907203674316, + "learning_rate": 5.45481331931154e-05, + "loss": 1.1016, "step": 73200 }, { - "epoch": 0.75, - "learning_rate": 5.996535642014907e-05, - "loss": 1.768, + "epoch": 1.0098922597889284, + "grad_norm": 5.6693925857543945, + "learning_rate": 5.454668497287682e-05, + "loss": 1.0893, "step": 73300 }, { - "epoch": 0.75, - "learning_rate": 5.9965036982116486e-05, - "loss": 1.8148, + "epoch": 1.0112700118486677, + "grad_norm": 9.715073585510254, + "learning_rate": 5.4545234454884e-05, + "loss": 1.0805, "step": 73400 }, { - "epoch": 0.75, - "learning_rate": 5.9964716078972415e-05, - "loss": 1.8267, + "epoch": 1.012647763908407, + "grad_norm": 13.02507209777832, + "learning_rate": 5.454378163926014e-05, + "loss": 1.0722, "step": 73500 }, { - "epoch": 0.75, - "learning_rate": 5.9964393710732545e-05, - "loss": 1.8537, + "epoch": 1.0140255159681464, + "grad_norm": 9.488529205322266, + "learning_rate": 5.4542326526128687e-05, + "loss": 1.1949, "step": 73600 }, { - "epoch": 0.75, - "learning_rate": 5.996406987741264e-05, - "loss": 1.7622, + "epoch": 1.0154032680278857, + "grad_norm": 31.917760848999023, + "learning_rate": 5.454086911561324e-05, + "loss": 1.0524, "step": 73700 }, { - "epoch": 0.75, - "learning_rate": 5.9963744579028535e-05, - "loss": 1.7858, + "epoch": 1.016781020087625, + "grad_norm": 16.788251876831055, + "learning_rate": 5.453942401628641e-05, + "loss": 1.1226, "step": 73800 }, { - "epoch": 0.75, - "learning_rate": 5.9963417815596135e-05, - "loss": 1.735, + "epoch": 1.0181587721473644, + "grad_norm": 15.653600692749023, + "learning_rate": 5.4537962034345385e-05, + "loss": 1.0383, "step": 73900 }, { - "epoch": 0.75, - "learning_rate": 5.996308958713141e-05, - "loss": 1.7192, + "epoch": 1.0195365242071037, + "grad_norm": 14.568146705627441, + "learning_rate": 5.4536497755391145e-05, + "loss": 1.1341, "step": 74000 }, { - "epoch": 0.75, - "learning_rate": 5.9962759893650424e-05, - "loss": 1.7746, + "epoch": 1.020914276266843, + "grad_norm": 22.769643783569336, + "learning_rate": 5.453503117954811e-05, + "loss": 1.0483, "step": 74100 }, { - "epoch": 0.76, - "learning_rate": 5.996242873516928e-05, - "loss": 1.8431, + "epoch": 1.0222920283265824, + "grad_norm": 16.251747131347656, + "learning_rate": 5.4533562306940866e-05, + "loss": 1.0303, "step": 74200 }, { - "epoch": 0.76, - "learning_rate": 5.996209611170418e-05, - "loss": 1.775, + "epoch": 1.0236697803863217, + "grad_norm": 7.284687042236328, + "learning_rate": 5.4532091137694206e-05, + "loss": 0.9862, "step": 74300 }, { - "epoch": 0.76, - "learning_rate": 5.996176202327139e-05, - "loss": 1.5642, + "epoch": 1.025047532446061, + "grad_norm": 13.01443099975586, + "learning_rate": 5.4530617671933106e-05, + "loss": 1.1108, "step": 74400 }, { - "epoch": 0.76, - "learning_rate": 5.996142646988723e-05, - "loss": 1.5735, + "epoch": 1.0264252845058004, + "grad_norm": 20.628028869628906, + "learning_rate": 5.4529141909782745e-05, + "loss": 1.098, "step": 74500 }, { - "epoch": 0.76, - "learning_rate": 5.9961089451568125e-05, - "loss": 1.9178, + "epoch": 1.0278030365655397, + "grad_norm": 104.16590118408203, + "learning_rate": 5.4527663851368504e-05, + "loss": 1.1839, "step": 74600 }, { - "epoch": 0.76, - "learning_rate": 5.996075096833054e-05, - "loss": 1.7321, + "epoch": 1.029180788625279, + "grad_norm": 28.211145401000977, + "learning_rate": 5.452618349681594e-05, + "loss": 1.0293, "step": 74700 }, { - "epoch": 0.76, - "learning_rate": 5.996041102019102e-05, - "loss": 1.8698, + "epoch": 1.0305585406850184, + "grad_norm": 11.115974426269531, + "learning_rate": 5.452470084625083e-05, + "loss": 1.1017, "step": 74800 }, { - "epoch": 0.76, - "learning_rate": 5.996007302854759e-05, - "loss": 1.8204, + "epoch": 1.0319362927447577, + "grad_norm": 16.250064849853516, + "learning_rate": 5.452321589979911e-05, + "loss": 1.1867, "step": 74900 }, { - "epoch": 0.76, - "learning_rate": 5.995973016530276e-05, - "loss": 1.7028, + "epoch": 1.033314044804497, + "grad_norm": 6.32594633102417, + "learning_rate": 5.452172865758696e-05, + "loss": 1.0984, "step": 75000 }, { - "epoch": 0.77, - "learning_rate": 5.9959385837205914e-05, - "loss": 1.7415, + "epoch": 1.0346917968642364, + "grad_norm": 9.784749984741211, + "learning_rate": 5.4520239119740725e-05, + "loss": 1.1673, "step": 75100 }, { - "epoch": 0.77, - "learning_rate": 5.99590400442739e-05, - "loss": 1.7637, + "epoch": 1.0360695489239757, + "grad_norm": 7.5926408767700195, + "learning_rate": 5.4518747286386934e-05, + "loss": 1.0425, "step": 75200 }, { - "epoch": 0.77, - "learning_rate": 5.9958692786523614e-05, - "loss": 1.5989, + "epoch": 1.037447300983715, + "grad_norm": 30.351991653442383, + "learning_rate": 5.451725315765233e-05, + "loss": 1.1489, "step": 75300 }, { - "epoch": 0.77, - "learning_rate": 5.995834406397203e-05, - "loss": 1.7268, + "epoch": 1.0388250530434544, + "grad_norm": 32.387943267822266, + "learning_rate": 5.4515756733663866e-05, + "loss": 1.2019, "step": 75400 }, { - "epoch": 0.77, - "learning_rate": 5.9957993876636205e-05, - "loss": 1.8434, + "epoch": 1.0402028051031937, + "grad_norm": 7.632412433624268, + "learning_rate": 5.4514258014548644e-05, + "loss": 1.1611, "step": 75500 }, { - "epoch": 0.77, - "learning_rate": 5.995764222453326e-05, - "loss": 1.6682, + "epoch": 1.0415805571629329, + "grad_norm": 12.371882438659668, + "learning_rate": 5.4512757000434e-05, + "loss": 1.1791, "step": 75600 }, { - "epoch": 0.77, - "learning_rate": 5.995728910768039e-05, - "loss": 1.8422, + "epoch": 1.0429583092226722, + "grad_norm": 59.52131271362305, + "learning_rate": 5.4511253691447454e-05, + "loss": 1.0491, "step": 75700 }, { - "epoch": 0.77, - "learning_rate": 5.995693452609485e-05, - "loss": 1.7228, + "epoch": 1.0443360612824115, + "grad_norm": 13.746917724609375, + "learning_rate": 5.4509748087716715e-05, + "loss": 1.1139, "step": 75800 }, { - "epoch": 0.77, - "learning_rate": 5.9956578479793994e-05, - "loss": 1.6471, + "epoch": 1.0457138133421509, + "grad_norm": 4.954247951507568, + "learning_rate": 5.45082401893697e-05, + "loss": 1.1633, "step": 75900 }, { - "epoch": 0.77, - "learning_rate": 5.9956220968795216e-05, - "loss": 1.7214, + "epoch": 1.0470915654018902, + "grad_norm": 65.24946594238281, + "learning_rate": 5.450672999653451e-05, + "loss": 1.1437, "step": 76000 }, { - "epoch": 0.78, - "learning_rate": 5.9955861993116e-05, - "loss": 1.8173, + "epoch": 1.0484693174616295, + "grad_norm": 29.954261779785156, + "learning_rate": 5.450521750933944e-05, + "loss": 1.118, "step": 76100 }, { - "epoch": 0.78, - "learning_rate": 5.995550155277391e-05, - "loss": 1.6598, + "epoch": 1.0498470695213689, + "grad_norm": 23.084720611572266, + "learning_rate": 5.450370272791298e-05, + "loss": 1.1746, "step": 76200 }, { - "epoch": 0.78, - "learning_rate": 5.995513964778656e-05, - "loss": 1.7961, + "epoch": 1.0512248215811082, + "grad_norm": 14.409061431884766, + "learning_rate": 5.45022008344945e-05, + "loss": 1.0738, "step": 76300 }, { - "epoch": 0.78, - "learning_rate": 5.995477627817163e-05, - "loss": 1.6887, + "epoch": 1.0526025736408475, + "grad_norm": 20.48155975341797, + "learning_rate": 5.450068148793063e-05, + "loss": 1.1904, "step": 76400 }, { - "epoch": 0.78, - "learning_rate": 5.995441144394691e-05, - "loss": 1.688, + "epoch": 1.0539803257005869, + "grad_norm": 3.173302173614502, + "learning_rate": 5.449917507527895e-05, + "loss": 1.1493, "step": 76500 }, { - "epoch": 0.78, - "learning_rate": 5.9954045145130224e-05, - "loss": 1.5794, + "epoch": 1.0553580777603262, + "grad_norm": 10.985121726989746, + "learning_rate": 5.4497651164088826e-05, + "loss": 1.0501, "step": 76600 }, { - "epoch": 0.78, - "learning_rate": 5.995367738173949e-05, - "loss": 1.5646, + "epoch": 1.0567358298200655, + "grad_norm": 14.65538215637207, + "learning_rate": 5.449612495931011e-05, + "loss": 1.133, "step": 76700 }, { - "epoch": 0.78, - "learning_rate": 5.995330815379269e-05, - "loss": 1.749, + "epoch": 1.0581135818798049, + "grad_norm": 32.66143798828125, + "learning_rate": 5.449459646107248e-05, + "loss": 1.1614, "step": 76800 }, { - "epoch": 0.78, - "learning_rate": 5.9952937461307866e-05, - "loss": 1.6658, + "epoch": 1.0594913339395442, + "grad_norm": 11.89787483215332, + "learning_rate": 5.449306566950577e-05, + "loss": 1.0989, "step": 76900 }, { - "epoch": 0.78, - "learning_rate": 5.995256530430316e-05, - "loss": 1.7042, + "epoch": 1.0608690859992835, + "grad_norm": 5.535636901855469, + "learning_rate": 5.449153258474003e-05, + "loss": 1.0709, "step": 77000 }, { - "epoch": 0.79, - "learning_rate": 5.995219168279675e-05, - "loss": 1.7833, + "epoch": 1.0622468380590229, + "grad_norm": 3.966078758239746, + "learning_rate": 5.4489997206905524e-05, + "loss": 1.088, "step": 77100 }, { - "epoch": 0.79, - "learning_rate": 5.9951816596806914e-05, - "loss": 1.6982, + "epoch": 1.0636245901187622, + "grad_norm": 33.74238204956055, + "learning_rate": 5.448845953613267e-05, + "loss": 1.1627, "step": 77200 }, { - "epoch": 0.79, - "learning_rate": 5.9951440046351994e-05, - "loss": 1.8167, + "epoch": 1.0650023421785015, + "grad_norm": 23.126930236816406, + "learning_rate": 5.4486919572552104e-05, + "loss": 1.124, "step": 77300 }, { - "epoch": 0.79, - "learning_rate": 5.995106203145039e-05, - "loss": 1.4935, + "epoch": 1.0663800942382409, + "grad_norm": 9.946000099182129, + "learning_rate": 5.448537731629465e-05, + "loss": 1.0372, "step": 77400 }, { - "epoch": 0.79, - "learning_rate": 5.995068255212059e-05, - "loss": 1.6009, + "epoch": 1.0677578462979802, + "grad_norm": 43.816619873046875, + "learning_rate": 5.4483832767491345e-05, + "loss": 1.0756, "step": 77500 }, { - "epoch": 0.79, - "learning_rate": 5.995030160838116e-05, - "loss": 1.6238, + "epoch": 1.0691355983577195, + "grad_norm": 11.190947532653809, + "learning_rate": 5.44822859262734e-05, + "loss": 1.1891, "step": 77600 }, { - "epoch": 0.79, - "learning_rate": 5.9949919200250703e-05, - "loss": 1.7839, + "epoch": 1.0705133504174589, + "grad_norm": 15.756780624389648, + "learning_rate": 5.448073679277221e-05, + "loss": 1.1597, "step": 77700 }, { - "epoch": 0.79, - "learning_rate": 5.994953532774793e-05, - "loss": 1.7463, + "epoch": 1.0718911024771982, + "grad_norm": 11.828129768371582, + "learning_rate": 5.447918536711941e-05, + "loss": 1.0724, "step": 77800 }, { - "epoch": 0.79, - "learning_rate": 5.994914999089161e-05, - "loss": 1.5378, + "epoch": 1.0732688545369375, + "grad_norm": 6.720120429992676, + "learning_rate": 5.44776316494468e-05, + "loss": 1.0642, "step": 77900 }, { - "epoch": 0.79, - "learning_rate": 5.994876318970059e-05, - "loss": 1.6648, + "epoch": 1.0746466065966769, + "grad_norm": 50.856388092041016, + "learning_rate": 5.447607563988635e-05, + "loss": 1.1431, "step": 78000 }, { - "epoch": 0.8, - "learning_rate": 5.994837492419377e-05, - "loss": 1.7013, + "epoch": 1.0760243586564162, + "grad_norm": 13.411588668823242, + "learning_rate": 5.447451733857026e-05, + "loss": 1.1438, "step": 78100 }, { - "epoch": 0.8, - "learning_rate": 5.994798519439013e-05, - "loss": 1.5961, + "epoch": 1.0774021107161555, + "grad_norm": 12.107073783874512, + "learning_rate": 5.447295674563093e-05, + "loss": 1.1408, "step": 78200 }, { - "epoch": 0.8, - "learning_rate": 5.994759400030875e-05, - "loss": 1.7069, + "epoch": 1.0787798627758949, + "grad_norm": 22.65484619140625, + "learning_rate": 5.4471393861200924e-05, + "loss": 1.0825, "step": 78300 }, { - "epoch": 0.8, - "learning_rate": 5.994720134196874e-05, - "loss": 1.6798, + "epoch": 1.0801576148356342, + "grad_norm": 9.944424629211426, + "learning_rate": 5.446982868541304e-05, + "loss": 1.1244, "step": 78400 }, { - "epoch": 0.8, - "learning_rate": 5.994680721938929e-05, - "loss": 1.5955, + "epoch": 1.0815353668953736, + "grad_norm": 11.047139167785645, + "learning_rate": 5.446826121840022e-05, + "loss": 0.9826, "step": 78500 }, { - "epoch": 0.8, - "learning_rate": 5.9946411632589686e-05, - "loss": 1.5852, + "epoch": 1.0829131189551129, + "grad_norm": 11.193907737731934, + "learning_rate": 5.446669146029564e-05, + "loss": 1.0336, "step": 78600 }, { - "epoch": 0.8, - "learning_rate": 5.9946014581589264e-05, - "loss": 1.5861, + "epoch": 1.0842908710148522, + "grad_norm": 5.81455135345459, + "learning_rate": 5.4465119411232666e-05, + "loss": 1.1661, "step": 78700 }, { - "epoch": 0.8, - "learning_rate": 5.994561606640744e-05, - "loss": 1.6381, + "epoch": 1.0856686230745916, + "grad_norm": 11.2352294921875, + "learning_rate": 5.446354507134484e-05, + "loss": 1.0469, "step": 78800 }, { - "epoch": 0.8, - "learning_rate": 5.994521608706369e-05, - "loss": 1.4923, + "epoch": 1.0870463751343309, + "grad_norm": 12.239124298095703, + "learning_rate": 5.446196844076592e-05, + "loss": 1.0839, "step": 78900 }, { - "epoch": 0.8, - "learning_rate": 5.994481464357758e-05, - "loss": 1.6126, + "epoch": 1.0884241271940702, + "grad_norm": 13.329319953918457, + "learning_rate": 5.4460389519629845e-05, + "loss": 1.2385, "step": 79000 }, { - "epoch": 0.81, - "learning_rate": 5.994441173596873e-05, - "loss": 1.5464, + "epoch": 1.0898018792538096, + "grad_norm": 11.425338745117188, + "learning_rate": 5.445880830807075e-05, + "loss": 1.1366, "step": 79100 }, { - "epoch": 0.81, - "learning_rate": 5.994400736425685e-05, - "loss": 1.5168, + "epoch": 1.091179631313549, + "grad_norm": 45.51327896118164, + "learning_rate": 5.445722480622297e-05, + "loss": 1.1483, "step": 79200 }, { - "epoch": 0.81, - "learning_rate": 5.9943605594066815e-05, - "loss": 1.5812, + "epoch": 1.0925573833732882, + "grad_norm": 9.737133979797363, + "learning_rate": 5.445563901422103e-05, + "loss": 1.1978, "step": 79300 }, { - "epoch": 0.81, - "learning_rate": 5.9943198308848785e-05, - "loss": 1.5695, + "epoch": 1.0939351354330276, + "grad_norm": 10.909536361694336, + "learning_rate": 5.4454050932199644e-05, + "loss": 1.183, "step": 79400 }, { - "epoch": 0.81, - "learning_rate": 5.994278955958705e-05, - "loss": 1.5285, + "epoch": 1.095312887492767, + "grad_norm": 9.269868850708008, + "learning_rate": 5.445246056029374e-05, + "loss": 1.1329, "step": 79500 }, { - "epoch": 0.81, - "learning_rate": 5.9942379346301594e-05, - "loss": 1.6406, + "epoch": 1.0966906395525062, + "grad_norm": 11.121707916259766, + "learning_rate": 5.445086789863844e-05, + "loss": 1.0166, "step": 79600 }, { - "epoch": 0.81, - "learning_rate": 5.994196766901248e-05, - "loss": 1.6521, + "epoch": 1.0980683916122453, + "grad_norm": 21.100162506103516, + "learning_rate": 5.444928890821485e-05, + "loss": 1.0112, "step": 79700 }, { - "epoch": 0.81, - "learning_rate": 5.994155452773982e-05, - "loss": 1.5834, + "epoch": 1.099446143671985, + "grad_norm": 38.795997619628906, + "learning_rate": 5.4447691690360935e-05, + "loss": 1.0999, "step": 79800 }, { - "epoch": 0.81, - "learning_rate": 5.994113992250384e-05, - "loss": 1.596, + "epoch": 1.100823895731724, + "grad_norm": 38.402748107910156, + "learning_rate": 5.444609218316274e-05, + "loss": 1.2368, "step": 79900 }, { - "epoch": 0.82, - "learning_rate": 5.9940723853324786e-05, - "loss": 1.5746, + "epoch": 1.1022016477914633, + "grad_norm": 8.940460205078125, + "learning_rate": 5.444449038675617e-05, + "loss": 1.0731, "step": 80000 }, { - "epoch": 0.82, - "learning_rate": 5.9940306320223024e-05, - "loss": 1.6426, + "epoch": 1.1035793998512027, + "grad_norm": 13.04713249206543, + "learning_rate": 5.444288630127729e-05, + "loss": 1.1178, "step": 80100 }, { - "epoch": 0.82, - "learning_rate": 5.993988732321894e-05, - "loss": 1.4767, + "epoch": 1.104957151910942, + "grad_norm": 75.44645690917969, + "learning_rate": 5.444127992686238e-05, + "loss": 1.135, "step": 80200 }, { - "epoch": 0.82, - "learning_rate": 5.993946686233305e-05, - "loss": 1.6856, + "epoch": 1.1063349039706813, + "grad_norm": 43.578285217285156, + "learning_rate": 5.4439671263647916e-05, + "loss": 1.0547, "step": 80300 }, { - "epoch": 0.82, - "learning_rate": 5.99390449375859e-05, - "loss": 1.543, + "epoch": 1.1077126560304207, + "grad_norm": 9.73839282989502, + "learning_rate": 5.443806031177055e-05, + "loss": 1.074, "step": 80400 }, { - "epoch": 0.82, - "learning_rate": 5.9938625790129935e-05, - "loss": 1.6273, + "epoch": 1.10909040809016, + "grad_norm": 30.30518913269043, + "learning_rate": 5.443644707136714e-05, + "loss": 1.0595, "step": 80500 }, { - "epoch": 0.82, - "learning_rate": 5.9938200952360323e-05, - "loss": 1.5779, + "epoch": 1.1104681601498994, + "grad_norm": 11.806612968444824, + "learning_rate": 5.443483154257475e-05, + "loss": 1.0632, "step": 80600 }, { - "epoch": 0.82, - "learning_rate": 5.993777465079134e-05, - "loss": 1.6695, + "epoch": 1.1118459122096387, + "grad_norm": 12.915070533752441, + "learning_rate": 5.4433213725530626e-05, + "loss": 1.1425, "step": 80700 }, { - "epoch": 0.82, - "learning_rate": 5.993734688544384e-05, - "loss": 1.6803, + "epoch": 1.113223664269378, + "grad_norm": 1.4595685005187988, + "learning_rate": 5.4431593620372206e-05, + "loss": 1.0396, "step": 80800 }, { - "epoch": 0.82, - "learning_rate": 5.993692195587532e-05, - "loss": 1.5935, + "epoch": 1.1146014163291174, + "grad_norm": 33.75121307373047, + "learning_rate": 5.442997122723712e-05, + "loss": 1.3027, "step": 80900 }, { - "epoch": 0.83, - "learning_rate": 5.9936491277670844e-05, - "loss": 1.571, + "epoch": 1.1159791683888567, + "grad_norm": 46.50124740600586, + "learning_rate": 5.442834654626321e-05, + "loss": 1.109, "step": 81000 }, { - "epoch": 0.83, - "learning_rate": 5.9936059135750614e-05, - "loss": 1.6264, + "epoch": 1.117356920448596, + "grad_norm": 25.134326934814453, + "learning_rate": 5.4426719577588504e-05, + "loss": 1.0968, "step": 81100 }, { - "epoch": 0.83, - "learning_rate": 5.9935625530135734e-05, - "loss": 1.7186, + "epoch": 1.1187346725083354, + "grad_norm": 20.71164321899414, + "learning_rate": 5.44250903213512e-05, + "loss": 1.0733, "step": 81200 }, { - "epoch": 0.83, - "learning_rate": 5.993519046084741e-05, - "loss": 1.5536, + "epoch": 1.1201124245680747, + "grad_norm": 17.346027374267578, + "learning_rate": 5.442347510444865e-05, + "loss": 1.2123, "step": 81300 }, { - "epoch": 0.83, - "learning_rate": 5.993475392790693e-05, - "loss": 1.498, + "epoch": 1.121490176627814, + "grad_norm": 12.854451179504395, + "learning_rate": 5.4421841296373786e-05, + "loss": 1.0808, "step": 81400 }, { - "epoch": 0.83, - "learning_rate": 5.993431593133561e-05, - "loss": 1.7581, + "epoch": 1.1228679286875534, + "grad_norm": 33.05289840698242, + "learning_rate": 5.4420205201150774e-05, + "loss": 1.1255, "step": 81500 }, { - "epoch": 0.83, - "learning_rate": 5.993387647115488e-05, - "loss": 1.6968, + "epoch": 1.1242456807472927, + "grad_norm": 29.11379051208496, + "learning_rate": 5.4418566818918604e-05, + "loss": 1.129, "step": 81600 }, { - "epoch": 0.83, - "learning_rate": 5.9933435547386226e-05, - "loss": 1.5503, + "epoch": 1.125623432807032, + "grad_norm": 8.390328407287598, + "learning_rate": 5.441692614981648e-05, + "loss": 1.131, "step": 81700 }, { - "epoch": 0.83, - "learning_rate": 5.993299316005122e-05, - "loss": 1.7896, + "epoch": 1.1270011848667714, + "grad_norm": 6.617385387420654, + "learning_rate": 5.4415283193983766e-05, + "loss": 1.0623, "step": 81800 }, { - "epoch": 0.83, - "learning_rate": 5.993254930917147e-05, - "loss": 1.6738, + "epoch": 1.1283789369265107, + "grad_norm": 27.417757034301758, + "learning_rate": 5.441363795156004e-05, + "loss": 1.0916, "step": 81900 }, { - "epoch": 0.84, - "learning_rate": 5.993210399476868e-05, - "loss": 1.6336, + "epoch": 1.12975668898625, + "grad_norm": 57.721649169921875, + "learning_rate": 5.441199042268509e-05, + "loss": 1.0992, "step": 82000 }, { - "epoch": 0.84, - "learning_rate": 5.993165721686464e-05, - "loss": 1.6025, + "epoch": 1.1311344410459894, + "grad_norm": 14.383347511291504, + "learning_rate": 5.441034060749888e-05, + "loss": 1.1283, "step": 82100 }, { - "epoch": 0.84, - "learning_rate": 5.993120897548118e-05, - "loss": 1.4988, + "epoch": 1.1325121931057287, + "grad_norm": 11.432974815368652, + "learning_rate": 5.440868850614155e-05, + "loss": 1.0373, "step": 82200 }, { - "epoch": 0.84, - "learning_rate": 5.993075927064022e-05, - "loss": 1.6275, + "epoch": 1.133889945165468, + "grad_norm": 33.512874603271484, + "learning_rate": 5.4407034118753466e-05, + "loss": 1.0787, "step": 82300 }, { - "epoch": 0.84, - "learning_rate": 5.9930308102363755e-05, - "loss": 1.6154, + "epoch": 1.1352676972252074, + "grad_norm": 8.812397003173828, + "learning_rate": 5.440537744547519e-05, + "loss": 1.2184, "step": 82400 }, { - "epoch": 0.84, - "learning_rate": 5.9929855470673834e-05, - "loss": 1.6202, + "epoch": 1.1366454492849467, + "grad_norm": 4.684133052825928, + "learning_rate": 5.440371848644745e-05, + "loss": 0.9759, "step": 82500 }, { - "epoch": 0.84, - "learning_rate": 5.99294013755926e-05, - "loss": 1.4874, + "epoch": 1.138023201344686, + "grad_norm": 8.109193801879883, + "learning_rate": 5.440205724181118e-05, + "loss": 1.0988, "step": 82600 }, { - "epoch": 0.84, - "learning_rate": 5.992894581714224e-05, - "loss": 1.4211, + "epoch": 1.1394009534044254, + "grad_norm": 53.559410095214844, + "learning_rate": 5.440039371170752e-05, + "loss": 1.1205, "step": 82700 }, { - "epoch": 0.84, - "learning_rate": 5.992848879534503e-05, - "loss": 1.5337, + "epoch": 1.1407787054641647, + "grad_norm": 10.38835620880127, + "learning_rate": 5.439872789627779e-05, + "loss": 1.1365, "step": 82800 }, { - "epoch": 0.84, - "learning_rate": 5.992803031022334e-05, - "loss": 1.4893, + "epoch": 1.142156457523904, + "grad_norm": 18.772029876708984, + "learning_rate": 5.439705979566352e-05, + "loss": 1.0443, "step": 82900 }, { - "epoch": 0.85, - "learning_rate": 5.992757496852707e-05, - "loss": 1.6062, + "epoch": 1.1435342095836434, + "grad_norm": 13.300806045532227, + "learning_rate": 5.439538941000641e-05, + "loss": 1.1483, "step": 83000 }, { - "epoch": 0.85, - "learning_rate": 5.9927113571456375e-05, - "loss": 1.6367, + "epoch": 1.1449119616433827, + "grad_norm": 10.32981014251709, + "learning_rate": 5.439371673944837e-05, + "loss": 1.2567, "step": 83100 }, { - "epoch": 0.85, - "learning_rate": 5.992665071112843e-05, - "loss": 1.5922, + "epoch": 1.146289713703122, + "grad_norm": 22.647167205810547, + "learning_rate": 5.439204178413151e-05, + "loss": 1.1153, "step": 83200 }, { - "epoch": 0.85, - "learning_rate": 5.992618638756586e-05, - "loss": 1.5424, + "epoch": 1.1476674657628614, + "grad_norm": 14.755345344543457, + "learning_rate": 5.439036454419812e-05, + "loss": 1.126, "step": 83300 }, { - "epoch": 0.85, - "learning_rate": 5.992572060079136e-05, - "loss": 1.6272, + "epoch": 1.1490452178226007, + "grad_norm": 38.85015106201172, + "learning_rate": 5.43886850197907e-05, + "loss": 1.164, "step": 83400 }, { - "epoch": 0.85, - "learning_rate": 5.9925253350827716e-05, - "loss": 1.5665, + "epoch": 1.15042296988234, + "grad_norm": 6.0228376388549805, + "learning_rate": 5.4387003211051914e-05, + "loss": 1.1508, "step": 83500 }, { - "epoch": 0.85, - "learning_rate": 5.9924784637697755e-05, - "loss": 1.5553, + "epoch": 1.1518007219420794, + "grad_norm": 6.189916610717773, + "learning_rate": 5.4385319118124655e-05, + "loss": 1.114, "step": 83600 }, { - "epoch": 0.85, - "learning_rate": 5.992431446142442e-05, - "loss": 1.494, + "epoch": 1.1531784740018187, + "grad_norm": 12.52712345123291, + "learning_rate": 5.4383632741152e-05, + "loss": 1.0507, "step": 83700 }, { - "epoch": 0.85, - "learning_rate": 5.992384282203069e-05, - "loss": 1.6219, + "epoch": 1.154556226061558, + "grad_norm": 34.678489685058594, + "learning_rate": 5.4381944080277204e-05, + "loss": 1.1446, "step": 83800 }, { - "epoch": 0.85, - "learning_rate": 5.992336971953961e-05, - "loss": 1.6507, + "epoch": 1.1559339781212974, + "grad_norm": 19.96841049194336, + "learning_rate": 5.438025313564373e-05, + "loss": 1.0987, "step": 83900 }, { - "epoch": 0.86, - "learning_rate": 5.992289515397433e-05, - "loss": 1.4922, + "epoch": 1.1573117301810365, + "grad_norm": 11.498269081115723, + "learning_rate": 5.4378559907395235e-05, + "loss": 1.0812, "step": 84000 }, { - "epoch": 0.86, - "learning_rate": 5.992241912535804e-05, - "loss": 1.5944, + "epoch": 1.158689482240776, + "grad_norm": 17.331729888916016, + "learning_rate": 5.4376864395675564e-05, + "loss": 1.124, "step": 84100 }, { - "epoch": 0.86, - "learning_rate": 5.9921941633714034e-05, - "loss": 1.48, + "epoch": 1.1600672343005152, + "grad_norm": 12.576549530029297, + "learning_rate": 5.437516660062876e-05, + "loss": 1.032, "step": 84200 }, { - "epoch": 0.86, - "learning_rate": 5.9921462679065645e-05, - "loss": 1.5063, + "epoch": 1.1614449863602547, + "grad_norm": 32.467193603515625, + "learning_rate": 5.437346652239906e-05, + "loss": 1.1427, "step": 84300 }, { - "epoch": 0.86, - "learning_rate": 5.9920982261436294e-05, - "loss": 1.5678, + "epoch": 1.1628227384199938, + "grad_norm": 8.65752124786377, + "learning_rate": 5.43717641611309e-05, + "loss": 1.0757, "step": 84400 }, { - "epoch": 0.86, - "learning_rate": 5.9920500380849456e-05, - "loss": 1.5608, + "epoch": 1.1642004904797332, + "grad_norm": 27.366947174072266, + "learning_rate": 5.43700595169689e-05, + "loss": 1.2517, "step": 84500 }, { - "epoch": 0.86, - "learning_rate": 5.992001703732872e-05, - "loss": 1.4435, + "epoch": 1.1655782425394725, + "grad_norm": 15.15655517578125, + "learning_rate": 5.436835259005788e-05, + "loss": 1.1006, "step": 84600 }, { - "epoch": 0.86, - "learning_rate": 5.99195322308977e-05, - "loss": 1.469, + "epoch": 1.1669559945992118, + "grad_norm": 9.536809921264648, + "learning_rate": 5.4366643380542846e-05, + "loss": 1.095, "step": 84700 }, { - "epoch": 0.86, - "learning_rate": 5.99190459615801e-05, - "loss": 1.5086, + "epoch": 1.1683337466589512, + "grad_norm": 9.374640464782715, + "learning_rate": 5.436493188856901e-05, + "loss": 1.0874, "step": 84800 }, { - "epoch": 0.86, - "learning_rate": 5.99185582293997e-05, - "loss": 1.4966, + "epoch": 1.1697114987186905, + "grad_norm": 4.268041133880615, + "learning_rate": 5.4363218114281764e-05, + "loss": 1.0426, "step": 84900 }, { - "epoch": 0.87, - "learning_rate": 5.9918069034380356e-05, - "loss": 1.5488, + "epoch": 1.1710892507784298, + "grad_norm": 11.414450645446777, + "learning_rate": 5.436150205782671e-05, + "loss": 1.0633, "step": 85000 }, { - "epoch": 0.87, - "learning_rate": 5.991757837654597e-05, - "loss": 1.5319, + "epoch": 1.1724670028381692, + "grad_norm": 18.067049026489258, + "learning_rate": 5.4359783719349637e-05, + "loss": 1.0085, "step": 85100 }, { - "epoch": 0.87, - "learning_rate": 5.991708625592054e-05, - "loss": 1.5415, + "epoch": 1.1738447548979085, + "grad_norm": 8.926839828491211, + "learning_rate": 5.4358063098996524e-05, + "loss": 1.1346, "step": 85200 }, { - "epoch": 0.87, - "learning_rate": 5.9916592672528134e-05, - "loss": 1.7018, + "epoch": 1.1752225069576479, + "grad_norm": 8.551411628723145, + "learning_rate": 5.435634019691354e-05, + "loss": 1.1359, "step": 85300 }, { - "epoch": 0.87, - "learning_rate": 5.991609762639287e-05, - "loss": 1.5908, + "epoch": 1.1766002590173872, + "grad_norm": 10.83679485321045, + "learning_rate": 5.4354615013247064e-05, + "loss": 1.1378, "step": 85400 }, { - "epoch": 0.87, - "learning_rate": 5.9915601117538976e-05, - "loss": 1.5505, + "epoch": 1.1779780110771265, + "grad_norm": 8.291827201843262, + "learning_rate": 5.435288754814366e-05, + "loss": 1.0827, "step": 85500 }, { - "epoch": 0.87, - "learning_rate": 5.991510314599071e-05, - "loss": 1.4897, + "epoch": 1.1793557631368659, + "grad_norm": 5.762343883514404, + "learning_rate": 5.4351157801750076e-05, + "loss": 1.118, "step": 85600 }, { - "epoch": 0.87, - "learning_rate": 5.9914603711772416e-05, - "loss": 1.5582, + "epoch": 1.1807335151966052, + "grad_norm": 15.36035442352295, + "learning_rate": 5.434942577421326e-05, + "loss": 1.1872, "step": 85700 }, { - "epoch": 0.87, - "learning_rate": 5.991410281490853e-05, - "loss": 1.5034, + "epoch": 1.1821112672563445, + "grad_norm": 189.766845703125, + "learning_rate": 5.434770882005615e-05, + "loss": 1.1369, "step": 85800 }, { - "epoch": 0.88, - "learning_rate": 5.991360045542353e-05, - "loss": 1.7018, + "epoch": 1.1834890193160839, + "grad_norm": 19.740503311157227, + "learning_rate": 5.4345972253482275e-05, + "loss": 1.0792, "step": 85900 }, { - "epoch": 0.88, - "learning_rate": 5.991309663334198e-05, - "loss": 1.5036, + "epoch": 1.1848667713758232, + "grad_norm": 31.87593650817871, + "learning_rate": 5.434423340620572e-05, + "loss": 1.1086, "step": 86000 }, { - "epoch": 0.88, - "learning_rate": 5.991259134868852e-05, - "loss": 1.8881, + "epoch": 1.1862445234355625, + "grad_norm": 31.415952682495117, + "learning_rate": 5.4342492278374195e-05, + "loss": 1.1566, "step": 86100 }, { - "epoch": 0.88, - "learning_rate": 5.9912084601487855e-05, - "loss": 2.5213, + "epoch": 1.1876222754953019, + "grad_norm": 20.956693649291992, + "learning_rate": 5.4340748870135626e-05, + "loss": 1.1708, "step": 86200 }, { - "epoch": 0.88, - "learning_rate": 5.991157639176475e-05, - "loss": 2.3425, + "epoch": 1.1890000275550412, + "grad_norm": 16.08473777770996, + "learning_rate": 5.433900318163812e-05, + "loss": 1.1599, "step": 86300 }, { - "epoch": 0.88, - "learning_rate": 5.991107182350556e-05, - "loss": 1.6305, + "epoch": 1.1903777796147805, + "grad_norm": 7.2160139083862305, + "learning_rate": 5.433725521303e-05, + "loss": 1.1283, "step": 86400 }, { - "epoch": 0.88, - "learning_rate": 5.991056070343682e-05, - "loss": 1.7046, + "epoch": 1.1917555316745199, + "grad_norm": 7.467589855194092, + "learning_rate": 5.433550496445974e-05, + "loss": 1.1068, "step": 86500 }, { - "epoch": 0.88, - "learning_rate": 5.991004812092015e-05, - "loss": 1.7513, + "epoch": 1.1931332837342592, + "grad_norm": 16.656461715698242, + "learning_rate": 5.4333752436076055e-05, + "loss": 1.1579, "step": 86600 }, { - "epoch": 0.88, - "learning_rate": 5.990953407598063e-05, - "loss": 1.7299, + "epoch": 1.1945110357939985, + "grad_norm": 4.586458206176758, + "learning_rate": 5.4331997628027814e-05, + "loss": 1.1274, "step": 86700 }, { - "epoch": 0.88, - "learning_rate": 5.9909018568643387e-05, - "loss": 1.6553, + "epoch": 1.1958887878537379, + "grad_norm": 21.88697052001953, + "learning_rate": 5.43302405404641e-05, + "loss": 1.055, "step": 86800 }, { - "epoch": 0.89, - "learning_rate": 5.9908501598933616e-05, - "loss": 1.7274, + "epoch": 1.1972665399134772, + "grad_norm": 44.077423095703125, + "learning_rate": 5.4328481173534196e-05, + "loss": 1.1449, "step": 86900 }, { - "epoch": 0.89, - "learning_rate": 5.99079831668766e-05, - "loss": 1.6031, + "epoch": 1.1986442919732165, + "grad_norm": 13.202244758605957, + "learning_rate": 5.432671952738755e-05, + "loss": 1.2522, "step": 87000 }, { - "epoch": 0.89, - "learning_rate": 5.99074632724977e-05, - "loss": 1.5621, + "epoch": 1.2000220440329559, + "grad_norm": 12.632771492004395, + "learning_rate": 5.4324955602173845e-05, + "loss": 1.0926, "step": 87100 }, { - "epoch": 0.89, - "learning_rate": 5.990694191582231e-05, - "loss": 1.6214, + "epoch": 1.2013997960926952, + "grad_norm": 97.10979461669922, + "learning_rate": 5.4323189398042925e-05, + "loss": 1.2086, "step": 87200 }, { - "epoch": 0.89, - "learning_rate": 5.990641909687595e-05, - "loss": 1.6506, + "epoch": 1.2027775481524345, + "grad_norm": 1011.980224609375, + "learning_rate": 5.4321420915144845e-05, + "loss": 1.2412, "step": 87300 }, { - "epoch": 0.89, - "learning_rate": 5.990589481568415e-05, - "loss": 1.6717, + "epoch": 1.2041553002121739, + "grad_norm": 169.3117218017578, + "learning_rate": 5.431965015362983e-05, + "loss": 1.2365, "step": 87400 }, { - "epoch": 0.89, - "learning_rate": 5.9905369072272584e-05, - "loss": 1.6402, + "epoch": 1.2055330522719132, + "grad_norm": 9.306952476501465, + "learning_rate": 5.431787711364833e-05, + "loss": 1.1619, "step": 87500 }, { - "epoch": 0.89, - "learning_rate": 5.990484186666692e-05, - "loss": 1.7092, + "epoch": 1.2069108043316525, + "grad_norm": 11.595381736755371, + "learning_rate": 5.431610179535097e-05, + "loss": 1.2372, "step": 87600 }, { - "epoch": 0.89, - "learning_rate": 5.990431319889297e-05, - "loss": 1.6545, + "epoch": 1.2082885563913919, + "grad_norm": 9.741960525512695, + "learning_rate": 5.4314324198888576e-05, + "loss": 1.1783, "step": 87700 }, { - "epoch": 0.89, - "learning_rate": 5.990378306897655e-05, - "loss": 1.6781, + "epoch": 1.2096663084511312, + "grad_norm": 38.73411178588867, + "learning_rate": 5.4312544324412154e-05, + "loss": 1.1611, "step": 87800 }, { - "epoch": 0.9, - "learning_rate": 5.990325680010131e-05, - "loss": 1.7792, + "epoch": 1.2110440605108705, + "grad_norm": 76.61911010742188, + "learning_rate": 5.431076217207292e-05, + "loss": 1.1557, "step": 87900 }, { - "epoch": 0.9, - "learning_rate": 5.990272376059858e-05, - "loss": 1.6132, + "epoch": 1.2124218125706099, + "grad_norm": 106.42626190185547, + "learning_rate": 5.430897774202228e-05, + "loss": 1.1955, "step": 88000 }, { - "epoch": 0.9, - "learning_rate": 5.9902189259031126e-05, - "loss": 1.4612, + "epoch": 1.2137995646303492, + "grad_norm": 41.87074279785156, + "learning_rate": 5.430719103441183e-05, + "loss": 1.152, "step": 88100 }, { - "epoch": 0.9, - "learning_rate": 5.9901653295425055e-05, - "loss": 1.5725, + "epoch": 1.2151773166900885, + "grad_norm": 79.47062683105469, + "learning_rate": 5.430540204939335e-05, + "loss": 1.1882, "step": 88200 }, { - "epoch": 0.9, - "learning_rate": 5.990111586980658e-05, - "loss": 1.5206, + "epoch": 1.2165550687498277, + "grad_norm": 14.044783592224121, + "learning_rate": 5.430361078711884e-05, + "loss": 1.2085, "step": 88300 }, { - "epoch": 0.9, - "learning_rate": 5.990057698220199e-05, - "loss": 1.6188, + "epoch": 1.2179328208095672, + "grad_norm": 101.65431213378906, + "learning_rate": 5.4301817247740456e-05, + "loss": 1.0778, "step": 88400 }, { - "epoch": 0.9, - "learning_rate": 5.990003663263761e-05, - "loss": 1.3889, + "epoch": 1.2193105728693063, + "grad_norm": 20.04885482788086, + "learning_rate": 5.430002143141059e-05, + "loss": 1.0642, "step": 88500 }, { - "epoch": 0.9, - "learning_rate": 5.989949482113988e-05, - "loss": 1.8094, + "epoch": 1.2206883249290459, + "grad_norm": 18.162139892578125, + "learning_rate": 5.4298241330482725e-05, + "loss": 1.1004, "step": 88600 }, { - "epoch": 0.9, - "learning_rate": 5.9898951547735275e-05, - "loss": 1.6204, + "epoch": 1.222066076988785, + "grad_norm": 90.41080474853516, + "learning_rate": 5.429644098347346e-05, + "loss": 1.1673, "step": 88700 }, { - "epoch": 0.9, - "learning_rate": 5.989840681245038e-05, - "loss": 1.7374, + "epoch": 1.2234438290485243, + "grad_norm": 23.455533981323242, + "learning_rate": 5.429463835996944e-05, + "loss": 1.1221, "step": 88800 }, { - "epoch": 0.91, - "learning_rate": 5.989786061531181e-05, - "loss": 1.6129, + "epoch": 1.2248215811082637, + "grad_norm": 12.024797439575195, + "learning_rate": 5.4292833460123815e-05, + "loss": 1.0866, "step": 88900 }, { - "epoch": 0.91, - "learning_rate": 5.989731295634627e-05, - "loss": 1.4584, + "epoch": 1.226199333168003, + "grad_norm": 118.55679321289062, + "learning_rate": 5.429102628408991e-05, + "loss": 1.225, "step": 89000 }, { - "epoch": 0.91, - "learning_rate": 5.9896763835580544e-05, - "loss": 1.5022, + "epoch": 1.2275770852277423, + "grad_norm": 32.787933349609375, + "learning_rate": 5.4289216832021266e-05, + "loss": 1.3286, "step": 89100 }, { - "epoch": 0.91, - "learning_rate": 5.989621325304149e-05, - "loss": 1.5479, + "epoch": 1.2289548372874817, + "grad_norm": 20.325054168701172, + "learning_rate": 5.4287405104071596e-05, + "loss": 1.261, "step": 89200 }, { - "epoch": 0.91, - "learning_rate": 5.989566120875602e-05, - "loss": 1.5201, + "epoch": 1.230332589347221, + "grad_norm": 12.644694328308105, + "learning_rate": 5.428559110039483e-05, + "loss": 1.0944, "step": 89300 }, { - "epoch": 0.91, - "learning_rate": 5.9895107702751125e-05, - "loss": 1.5713, + "epoch": 1.2317103414069603, + "grad_norm": 19.038776397705078, + "learning_rate": 5.428377482114505e-05, + "loss": 1.0673, "step": 89400 }, { - "epoch": 0.91, - "learning_rate": 5.989455273505386e-05, - "loss": 1.6368, + "epoch": 1.2330880934666997, + "grad_norm": 28.572973251342773, + "learning_rate": 5.4281956266476585e-05, + "loss": 1.1157, "step": 89500 }, { - "epoch": 0.91, - "learning_rate": 5.9893996305691374e-05, - "loss": 1.4649, + "epoch": 1.234465845526439, + "grad_norm": 205.89178466796875, + "learning_rate": 5.428013543654391e-05, + "loss": 1.1564, "step": 89600 }, { - "epoch": 0.91, - "learning_rate": 5.989343841469087e-05, - "loss": 1.4484, + "epoch": 1.2358435975861783, + "grad_norm": 5.926337242126465, + "learning_rate": 5.427831233150173e-05, + "loss": 1.0738, "step": 89700 }, { - "epoch": 0.91, - "learning_rate": 5.989287906207961e-05, - "loss": 1.51, + "epoch": 1.2372213496459177, + "grad_norm": 11.620429039001465, + "learning_rate": 5.427648695150492e-05, + "loss": 1.1264, "step": 89800 }, { - "epoch": 0.92, - "learning_rate": 5.989231824788496e-05, - "loss": 1.519, + "epoch": 1.238599101705657, + "grad_norm": 14.232254028320312, + "learning_rate": 5.427465929670856e-05, + "loss": 1.2858, "step": 89900 }, { - "epoch": 0.92, - "learning_rate": 5.989175597213434e-05, - "loss": 1.4558, + "epoch": 1.2399768537653963, + "grad_norm": 20.881052017211914, + "learning_rate": 5.427282936726791e-05, + "loss": 1.2236, "step": 90000 }, { - "epoch": 0.92, - "learning_rate": 5.989119223485524e-05, - "loss": 1.3998, + "epoch": 1.2413546058251357, + "grad_norm": 6.499723434448242, + "learning_rate": 5.427099716333844e-05, + "loss": 1.1673, "step": 90100 }, { - "epoch": 0.92, - "learning_rate": 5.989062703607521e-05, - "loss": 1.4141, + "epoch": 1.242732357884875, + "grad_norm": 88.66902923583984, + "learning_rate": 5.426916268507579e-05, + "loss": 1.367, "step": 90200 }, { - "epoch": 0.92, - "learning_rate": 5.98900603758219e-05, - "loss": 1.6032, + "epoch": 1.2441101099446144, + "grad_norm": 46.79194259643555, + "learning_rate": 5.426732593263583e-05, + "loss": 1.212, "step": 90300 }, { - "epoch": 0.92, - "learning_rate": 5.988949225412302e-05, - "loss": 1.433, + "epoch": 1.2454878620043537, + "grad_norm": 11.282959938049316, + "learning_rate": 5.426548690617459e-05, + "loss": 1.1415, "step": 90400 }, { - "epoch": 0.92, - "learning_rate": 5.988892267100633e-05, - "loss": 1.5292, + "epoch": 1.246865614064093, + "grad_norm": 46.6762809753418, + "learning_rate": 5.426364560584831e-05, + "loss": 1.1784, "step": 90500 }, { - "epoch": 0.92, - "learning_rate": 5.988835162649971e-05, - "loss": 1.4914, + "epoch": 1.2482433661238324, + "grad_norm": 78.02066802978516, + "learning_rate": 5.42618020318134e-05, + "loss": 1.1381, "step": 90600 }, { - "epoch": 0.92, - "learning_rate": 5.988777912063103e-05, - "loss": 1.5033, + "epoch": 1.2496211181835717, + "grad_norm": 6.296343803405762, + "learning_rate": 5.425995618422651e-05, + "loss": 1.1683, "step": 90700 }, { - "epoch": 0.93, - "learning_rate": 5.9887205153428335e-05, - "loss": 1.4194, + "epoch": 1.250998870243311, + "grad_norm": 7.056164741516113, + "learning_rate": 5.425810806324442e-05, + "loss": 1.1351, "step": 90800 }, { - "epoch": 0.93, - "learning_rate": 5.988662972491965e-05, - "loss": 1.516, + "epoch": 1.2523766223030504, + "grad_norm": 24.29969596862793, + "learning_rate": 5.4256257669024176e-05, + "loss": 1.3496, "step": 90900 }, { - "epoch": 0.93, - "learning_rate": 5.9886052835133135e-05, - "loss": 1.5305, + "epoch": 1.2537543743627897, + "grad_norm": 224.34046936035156, + "learning_rate": 5.4254405001722954e-05, + "loss": 1.1915, "step": 91000 }, { - "epoch": 0.93, - "learning_rate": 5.988547448409698e-05, - "loss": 1.5844, + "epoch": 1.255132126422529, + "grad_norm": 18.948619842529297, + "learning_rate": 5.425255006149815e-05, + "loss": 1.115, "step": 91100 }, { - "epoch": 0.93, - "learning_rate": 5.988489467183947e-05, - "loss": 1.4488, + "epoch": 1.2565098784822684, + "grad_norm": 15.805986404418945, + "learning_rate": 5.425069284850735e-05, + "loss": 1.2344, "step": 91200 }, { - "epoch": 0.93, - "learning_rate": 5.9884313398388964e-05, - "loss": 1.5022, + "epoch": 1.2578876305420077, + "grad_norm": 6.287398338317871, + "learning_rate": 5.4248833362908344e-05, + "loss": 1.1521, "step": 91300 }, { - "epoch": 0.93, - "learning_rate": 5.9883730663773865e-05, - "loss": 1.4467, + "epoch": 1.259265382601747, + "grad_norm": 10.883261680603027, + "learning_rate": 5.424697160485909e-05, + "loss": 1.0913, "step": 91400 }, { - "epoch": 0.93, - "learning_rate": 5.9883146468022676e-05, - "loss": 1.4996, + "epoch": 1.2606431346614864, + "grad_norm": 310.649658203125, + "learning_rate": 5.424512622606851e-05, + "loss": 1.1438, "step": 91500 }, { - "epoch": 0.93, - "learning_rate": 5.9882560811163954e-05, - "loss": 1.5119, + "epoch": 1.2620208867212257, + "grad_norm": 12.606843948364258, + "learning_rate": 5.424325994631402e-05, + "loss": 1.1751, "step": 91600 }, { - "epoch": 0.93, - "learning_rate": 5.9881973693226346e-05, - "loss": 1.5147, + "epoch": 1.263398638780965, + "grad_norm": 18.939218521118164, + "learning_rate": 5.424139139458279e-05, + "loss": 1.2275, "step": 91700 }, { - "epoch": 0.94, - "learning_rate": 5.9881385114238545e-05, - "loss": 1.4885, + "epoch": 1.2647763908407044, + "grad_norm": 199.7487030029297, + "learning_rate": 5.423952057103354e-05, + "loss": 1.237, "step": 91800 }, { - "epoch": 0.94, - "learning_rate": 5.9880795074229337e-05, - "loss": 1.4255, + "epoch": 1.2661541429004437, + "grad_norm": 16.321306228637695, + "learning_rate": 5.423764747582522e-05, + "loss": 1.2226, "step": 91900 }, { - "epoch": 0.94, - "learning_rate": 5.988020357322757e-05, - "loss": 1.6254, + "epoch": 1.267531894960183, + "grad_norm": 88.80003356933594, + "learning_rate": 5.4235772109116976e-05, + "loss": 1.2745, "step": 92000 }, { - "epoch": 0.94, - "learning_rate": 5.987961061126216e-05, - "loss": 1.4327, + "epoch": 1.2689096470199224, + "grad_norm": 171.31634521484375, + "learning_rate": 5.4233894471068096e-05, + "loss": 1.2717, "step": 92100 }, { - "epoch": 0.94, - "learning_rate": 5.98790161883621e-05, - "loss": 1.4429, + "epoch": 1.2702873990796617, + "grad_norm": 40.57966613769531, + "learning_rate": 5.423201456183811e-05, + "loss": 1.2254, "step": 92200 }, { - "epoch": 0.94, - "learning_rate": 5.987842030455647e-05, - "loss": 1.5518, + "epoch": 1.271665151139401, + "grad_norm": 35.70359420776367, + "learning_rate": 5.423013238158673e-05, + "loss": 1.3632, "step": 92300 }, { - "epoch": 0.94, - "learning_rate": 5.9877822959874376e-05, - "loss": 1.4686, + "epoch": 1.2730429031991402, + "grad_norm": 89.7231674194336, + "learning_rate": 5.422824793047386e-05, + "loss": 1.8249, "step": 92400 }, { - "epoch": 0.94, - "learning_rate": 5.987722415434505e-05, - "loss": 1.4723, + "epoch": 1.2744206552588797, + "grad_norm": 204.28256225585938, + "learning_rate": 5.422636120865958e-05, + "loss": 1.2999, "step": 92500 }, { - "epoch": 0.94, - "learning_rate": 5.9876623887997756e-05, - "loss": 1.5262, + "epoch": 1.2757984073186188, + "grad_norm": 51.34372329711914, + "learning_rate": 5.422447221630418e-05, + "loss": 1.4106, "step": 92600 }, { - "epoch": 0.94, - "learning_rate": 5.987602216086186e-05, - "loss": 1.4851, + "epoch": 1.2771761593783584, + "grad_norm": 15.5244140625, + "learning_rate": 5.4222580953568154e-05, + "loss": 1.4799, "step": 92700 }, { - "epoch": 0.95, - "learning_rate": 5.9875418972966765e-05, - "loss": 1.5974, + "epoch": 1.2785539114380975, + "grad_norm": 94.4416275024414, + "learning_rate": 5.422068742061216e-05, + "loss": 1.2477, "step": 92800 }, { - "epoch": 0.95, - "learning_rate": 5.987481432434196e-05, - "loss": 1.5091, + "epoch": 1.279931663497837, + "grad_norm": 47.11005401611328, + "learning_rate": 5.421879161759708e-05, + "loss": 1.3798, "step": 92900 }, { - "epoch": 0.95, - "learning_rate": 5.987420821501703e-05, - "loss": 1.596, + "epoch": 1.2813094155575762, + "grad_norm": 95.08572387695312, + "learning_rate": 5.421689354468394e-05, + "loss": 1.2238, "step": 93000 }, { - "epoch": 0.95, - "learning_rate": 5.9873600645021595e-05, - "loss": 1.4018, + "epoch": 1.2826871676173157, + "grad_norm": 36.19763946533203, + "learning_rate": 5.421499320203402e-05, + "loss": 1.341, "step": 93100 }, { - "epoch": 0.95, - "learning_rate": 5.987299161438536e-05, - "loss": 1.4484, + "epoch": 1.2840649196770548, + "grad_norm": 31.36441993713379, + "learning_rate": 5.421309058980876e-05, + "loss": 1.3659, "step": 93200 }, { - "epoch": 0.95, - "learning_rate": 5.987238112313812e-05, - "loss": 1.4128, + "epoch": 1.2854426717367944, + "grad_norm": 28.856643676757812, + "learning_rate": 5.4211185708169776e-05, + "loss": 1.4084, "step": 93300 }, { - "epoch": 0.95, - "learning_rate": 5.98717691713097e-05, - "loss": 1.443, + "epoch": 1.2868204237965335, + "grad_norm": 12.829009056091309, + "learning_rate": 5.4209278557278916e-05, + "loss": 1.3829, "step": 93400 }, { - "epoch": 0.95, - "learning_rate": 5.9871155758930036e-05, - "loss": 1.6698, + "epoch": 1.2881981758562728, + "grad_norm": 41.47861099243164, + "learning_rate": 5.420736913729821e-05, + "loss": 1.2971, "step": 93500 }, { - "epoch": 0.95, - "learning_rate": 5.9870540886029116e-05, - "loss": 1.4533, + "epoch": 1.2895759279160122, + "grad_norm": 20.384492874145508, + "learning_rate": 5.420545744838985e-05, + "loss": 1.295, "step": 93600 }, { - "epoch": 0.95, - "learning_rate": 5.9869924552637004e-05, - "loss": 1.4722, + "epoch": 1.2909536799757515, + "grad_norm": 41.33100128173828, + "learning_rate": 5.420354349071626e-05, + "loss": 1.2833, "step": 93700 }, { - "epoch": 0.96, - "learning_rate": 5.9869306758783847e-05, - "loss": 1.4219, + "epoch": 1.2923314320354908, + "grad_norm": 23.125564575195312, + "learning_rate": 5.420164643793185e-05, + "loss": 1.3758, "step": 93800 }, { - "epoch": 0.96, - "learning_rate": 5.9868687504499826e-05, - "loss": 1.5066, + "epoch": 1.2937091840952302, + "grad_norm": 21.431570053100586, + "learning_rate": 5.419972796589938e-05, + "loss": 1.2996, "step": 93900 }, { - "epoch": 0.96, - "learning_rate": 5.986806678981524e-05, - "loss": 1.4336, + "epoch": 1.2950869361549695, + "grad_norm": 11.668604850769043, + "learning_rate": 5.419780722558842e-05, + "loss": 1.2024, "step": 94000 }, { - "epoch": 0.96, - "learning_rate": 5.986744461476043e-05, - "loss": 1.4057, + "epoch": 1.2964646882147088, + "grad_norm": 39.09562683105469, + "learning_rate": 5.4195884217162155e-05, + "loss": 1.2559, "step": 94100 }, { - "epoch": 0.96, - "learning_rate": 5.9866820979365815e-05, - "loss": 1.5177, + "epoch": 1.2978424402744482, + "grad_norm": 19.265756607055664, + "learning_rate": 5.4193958940783945e-05, + "loss": 1.3205, "step": 94200 }, { - "epoch": 0.96, - "learning_rate": 5.98661958836619e-05, - "loss": 1.3542, + "epoch": 1.2992201923341875, + "grad_norm": 16.87660026550293, + "learning_rate": 5.419203139661737e-05, + "loss": 1.1221, "step": 94300 }, { - "epoch": 0.96, - "learning_rate": 5.986556932767922e-05, - "loss": 1.5767, + "epoch": 1.3005979443939268, + "grad_norm": 12.028328895568848, + "learning_rate": 5.4190101584826164e-05, + "loss": 1.0838, "step": 94400 }, { - "epoch": 0.96, - "learning_rate": 5.986494759883888e-05, - "loss": 1.4592, + "epoch": 1.3019756964536662, + "grad_norm": 47.374629974365234, + "learning_rate": 5.418816950557428e-05, + "loss": 1.1656, "step": 94500 }, { - "epoch": 0.96, - "learning_rate": 5.986431813699271e-05, - "loss": 1.3926, + "epoch": 1.3033534485134055, + "grad_norm": 17.502593994140625, + "learning_rate": 5.4186235159025864e-05, + "loss": 1.2574, "step": 94600 }, { - "epoch": 0.96, - "learning_rate": 5.986368721495961e-05, - "loss": 1.4459, + "epoch": 1.3047312005731448, + "grad_norm": 116.42330169677734, + "learning_rate": 5.418429854534524e-05, + "loss": 1.1281, "step": 94700 }, { - "epoch": 0.97, - "learning_rate": 5.986305483277043e-05, - "loss": 1.3428, + "epoch": 1.3061089526328842, + "grad_norm": 11.536840438842773, + "learning_rate": 5.418235966469695e-05, + "loss": 1.0518, "step": 94800 }, { - "epoch": 0.97, - "learning_rate": 5.9862420990456075e-05, - "loss": 1.4095, + "epoch": 1.3074867046926235, + "grad_norm": 22.08839225769043, + "learning_rate": 5.4180418517245694e-05, + "loss": 1.1579, "step": 94900 }, { - "epoch": 0.97, - "learning_rate": 5.986178568804754e-05, - "loss": 1.4202, + "epoch": 1.3088644567523628, + "grad_norm": 18.14686393737793, + "learning_rate": 5.417847510315639e-05, + "loss": 1.212, "step": 95000 }, { - "epoch": 0.97, - "learning_rate": 5.986114892557589e-05, - "loss": 1.4598, + "epoch": 1.3102422088121022, + "grad_norm": 21.127187728881836, + "learning_rate": 5.4176529422594134e-05, + "loss": 1.172, "step": 95100 }, { - "epoch": 0.97, - "learning_rate": 5.986051070307226e-05, - "loss": 1.5845, + "epoch": 1.3116199608718415, + "grad_norm": 10.276876449584961, + "learning_rate": 5.417458147572423e-05, + "loss": 1.1477, "step": 95200 }, { - "epoch": 0.97, - "learning_rate": 5.9859871020567864e-05, - "loss": 1.4093, + "epoch": 1.3129977129315809, + "grad_norm": 40.1536979675293, + "learning_rate": 5.417263126271217e-05, + "loss": 1.1976, "step": 95300 }, { - "epoch": 0.97, - "learning_rate": 5.985922987809396e-05, - "loss": 1.4548, + "epoch": 1.3143754649913202, + "grad_norm": 16.13709259033203, + "learning_rate": 5.417067878372362e-05, + "loss": 1.2112, "step": 95400 }, { - "epoch": 0.97, - "learning_rate": 5.985858727568192e-05, - "loss": 1.354, + "epoch": 1.3157532170510595, + "grad_norm": 31.721723556518555, + "learning_rate": 5.4168724038924465e-05, + "loss": 1.1679, "step": 95500 }, { - "epoch": 0.97, - "learning_rate": 5.985794321336313e-05, - "loss": 1.4892, + "epoch": 1.3171309691107989, + "grad_norm": 11.788978576660156, + "learning_rate": 5.416676702848076e-05, + "loss": 1.2025, "step": 95600 }, { - "epoch": 0.98, - "learning_rate": 5.9857297691169106e-05, - "loss": 1.5283, + "epoch": 1.3185087211705382, + "grad_norm": 16.537700653076172, + "learning_rate": 5.416480775255878e-05, + "loss": 1.2318, "step": 95700 }, { - "epoch": 0.98, - "learning_rate": 5.985665070913141e-05, - "loss": 1.3423, + "epoch": 1.3198864732302775, + "grad_norm": 20.435949325561523, + "learning_rate": 5.4162846211324964e-05, + "loss": 1.1599, "step": 95800 }, { - "epoch": 0.98, - "learning_rate": 5.985600226728166e-05, - "loss": 1.4428, + "epoch": 1.3212642252900169, + "grad_norm": 14.62575912475586, + "learning_rate": 5.416088240494595e-05, + "loss": 1.1141, "step": 95900 }, { - "epoch": 0.98, - "learning_rate": 5.985535236565157e-05, - "loss": 1.3588, + "epoch": 1.3226419773497562, + "grad_norm": 15.326416015625, + "learning_rate": 5.415891633358858e-05, + "loss": 1.1135, "step": 96000 }, { - "epoch": 0.98, - "learning_rate": 5.9854701004272926e-05, - "loss": 1.4912, + "epoch": 1.3240197294094955, + "grad_norm": 5.411820888519287, + "learning_rate": 5.415694799741989e-05, + "loss": 1.0651, "step": 96100 }, { - "epoch": 0.98, - "learning_rate": 5.985404818317756e-05, - "loss": 1.5576, + "epoch": 1.3253974814692349, + "grad_norm": 11.630475997924805, + "learning_rate": 5.415497739660708e-05, + "loss": 1.0951, "step": 96200 }, { - "epoch": 0.98, - "learning_rate": 5.985340045243054e-05, - "loss": 1.5684, + "epoch": 1.3267752335289742, + "grad_norm": 22.017620086669922, + "learning_rate": 5.415300453131758e-05, + "loss": 1.1332, "step": 96300 }, { - "epoch": 0.98, - "learning_rate": 5.9852744726593947e-05, - "loss": 1.5131, + "epoch": 1.3281529855887135, + "grad_norm": 15.014047622680664, + "learning_rate": 5.4151029401718985e-05, + "loss": 1.2671, "step": 96400 }, { - "epoch": 0.98, - "learning_rate": 5.9852087541136275e-05, - "loss": 1.5162, + "epoch": 1.3295307376484529, + "grad_norm": 9.514591217041016, + "learning_rate": 5.4149052007979106e-05, + "loss": 1.023, "step": 96500 }, { - "epoch": 0.98, - "learning_rate": 5.9851428896089685e-05, - "loss": 1.4749, + "epoch": 1.3309084897081922, + "grad_norm": 9.969331741333008, + "learning_rate": 5.414707235026592e-05, + "loss": 1.0165, "step": 96600 }, { - "epoch": 0.99, - "learning_rate": 5.9850768791486355e-05, - "loss": 1.4942, + "epoch": 1.3322862417679313, + "grad_norm": 6.215322017669678, + "learning_rate": 5.414509042874761e-05, + "loss": 1.0, "step": 96700 }, { - "epoch": 0.99, - "learning_rate": 5.985010722735858e-05, - "loss": 1.4914, + "epoch": 1.3336639938276709, + "grad_norm": 21.648183822631836, + "learning_rate": 5.414310624359255e-05, + "loss": 1.1317, "step": 96800 }, { - "epoch": 0.99, - "learning_rate": 5.9849444203738695e-05, - "loss": 1.4131, + "epoch": 1.33504174588741, + "grad_norm": 21.194229125976562, + "learning_rate": 5.4141119794969316e-05, + "loss": 1.1609, "step": 96900 }, { - "epoch": 0.99, - "learning_rate": 5.9848779720659126e-05, - "loss": 1.3661, + "epoch": 1.3364194979471495, + "grad_norm": 10.29611587524414, + "learning_rate": 5.413913108304666e-05, + "loss": 1.1667, "step": 97000 }, { - "epoch": 0.99, - "learning_rate": 5.9848113778152346e-05, - "loss": 1.4944, + "epoch": 1.3377972500068886, + "grad_norm": 24.378509521484375, + "learning_rate": 5.4137140107993526e-05, + "loss": 1.0474, "step": 97100 }, { - "epoch": 0.99, - "learning_rate": 5.9847446376250936e-05, - "loss": 1.4675, + "epoch": 1.3391750020666282, + "grad_norm": 12.182476997375488, + "learning_rate": 5.413514686997907e-05, + "loss": 1.0979, "step": 97200 }, { - "epoch": 0.99, - "learning_rate": 5.9846777514987516e-05, - "loss": 1.3728, + "epoch": 1.3405527541263673, + "grad_norm": 17.44414710998535, + "learning_rate": 5.413315136917262e-05, + "loss": 1.1522, "step": 97300 }, { - "epoch": 0.99, - "learning_rate": 5.984611390482428e-05, - "loss": 1.5257, + "epoch": 1.3419305061861069, + "grad_norm": 14.256574630737305, + "learning_rate": 5.413115360574371e-05, + "loss": 1.0445, "step": 97400 }, { - "epoch": 0.99, - "learning_rate": 5.984544213952784e-05, - "loss": 1.4841, + "epoch": 1.343308258245846, + "grad_norm": 26.928070068359375, + "learning_rate": 5.412915357986206e-05, + "loss": 1.143, "step": 97500 }, { - "epoch": 0.99, - "learning_rate": 5.984476891496737e-05, - "loss": 1.5172, + "epoch": 1.3446860103055855, + "grad_norm": 7.874545574188232, + "learning_rate": 5.412715129169757e-05, + "loss": 1.1181, "step": 97600 }, { - "epoch": 1.0, - "learning_rate": 5.9844094231175805e-05, - "loss": 1.4262, + "epoch": 1.3460637623653247, + "grad_norm": 6.059437274932861, + "learning_rate": 5.412514674142036e-05, + "loss": 1.1641, "step": 97700 }, { - "epoch": 1.0, - "learning_rate": 5.984341808818614e-05, - "loss": 1.4293, + "epoch": 1.347441514425064, + "grad_norm": 24.248310089111328, + "learning_rate": 5.4123139929200724e-05, + "loss": 1.1107, "step": 97800 }, { - "epoch": 1.0, - "learning_rate": 5.984274048603143e-05, - "loss": 1.6798, + "epoch": 1.3488192664848033, + "grad_norm": 9.78466510772705, + "learning_rate": 5.412113085520915e-05, + "loss": 1.0107, "step": 97900 }, { - "epoch": 1.0, - "learning_rate": 5.984206142474479e-05, - "loss": 1.5527, + "epoch": 1.3501970185445427, + "grad_norm": 38.441551208496094, + "learning_rate": 5.4119119519616306e-05, + "loss": 1.0825, "step": 98000 }, { - "epoch": 1.0, - "learning_rate": 5.984138090435944e-05, - "loss": 1.4024, + "epoch": 1.351574770604282, + "grad_norm": 32.95125961303711, + "learning_rate": 5.411710592259308e-05, + "loss": 1.2035, "step": 98100 }, { - "epoch": 1.0, - "learning_rate": 5.984070575192542e-05, - "loss": 1.5522, + "epoch": 1.3529525226640213, + "grad_norm": 129.1112823486328, + "learning_rate": 5.411509006431054e-05, + "loss": 1.076, "step": 98200 }, { - "epoch": 1.0, - "learning_rate": 5.984002232803269e-05, - "loss": 1.6232, + "epoch": 1.3543302747237607, + "grad_norm": 14.370128631591797, + "learning_rate": 5.411307194493993e-05, + "loss": 1.1042, "step": 98300 }, { - "epoch": 1.0, - "learning_rate": 5.9839337445140936e-05, - "loss": 1.5484, + "epoch": 1.3557080267835, + "grad_norm": 9.850196838378906, + "learning_rate": 5.411105156465271e-05, + "loss": 1.0179, "step": 98400 }, { - "epoch": 1.0, - "learning_rate": 5.983865110328365e-05, - "loss": 1.4182, + "epoch": 1.3570857788432393, + "grad_norm": 3.594674825668335, + "learning_rate": 5.4109028923620524e-05, + "loss": 1.0863, "step": 98500 }, { - "epoch": 1.0, - "learning_rate": 5.983796330249441e-05, - "loss": 1.4759, + "epoch": 1.3584635309029787, + "grad_norm": 9.291257858276367, + "learning_rate": 5.410700402201519e-05, + "loss": 1.0532, "step": 98600 }, { - "epoch": 1.01, - "learning_rate": 5.983727404280682e-05, - "loss": 1.4507, + "epoch": 1.359841282962718, + "grad_norm": 10.405354499816895, + "learning_rate": 5.4104976860008745e-05, + "loss": 1.108, "step": 98700 }, { - "epoch": 1.01, - "learning_rate": 5.983658332425458e-05, - "loss": 1.3354, + "epoch": 1.3612190350224573, + "grad_norm": 7.508618354797363, + "learning_rate": 5.410294743777341e-05, + "loss": 1.0318, "step": 98800 }, { - "epoch": 1.01, - "learning_rate": 5.9835891146871484e-05, - "loss": 1.3849, + "epoch": 1.3625967870821967, + "grad_norm": 9.89979362487793, + "learning_rate": 5.410091575548159e-05, + "loss": 1.0753, "step": 98900 }, { - "epoch": 1.01, - "learning_rate": 5.9835197510691354e-05, - "loss": 1.5347, + "epoch": 1.363974539141936, + "grad_norm": 17.785009384155273, + "learning_rate": 5.4098881813305884e-05, + "loss": 1.2069, "step": 99000 }, { - "epoch": 1.01, - "learning_rate": 5.983450241574812e-05, - "loss": 1.3758, + "epoch": 1.3653522912016753, + "grad_norm": 19.928552627563477, + "learning_rate": 5.40968456114191e-05, + "loss": 1.1077, "step": 99100 }, { - "epoch": 1.01, - "learning_rate": 5.983380586207575e-05, - "loss": 1.3151, + "epoch": 1.3667300432614147, + "grad_norm": 32.78633499145508, + "learning_rate": 5.40948071499942e-05, + "loss": 1.0913, "step": 99200 }, { - "epoch": 1.01, - "learning_rate": 5.9833107849708325e-05, - "loss": 1.4597, + "epoch": 1.368107795321154, + "grad_norm": 36.34523010253906, + "learning_rate": 5.409276642920438e-05, + "loss": 1.1546, "step": 99300 }, { - "epoch": 1.01, - "learning_rate": 5.9832408378679945e-05, - "loss": 1.3496, + "epoch": 1.3694855473808933, + "grad_norm": 12.677918434143066, + "learning_rate": 5.4090723449223e-05, + "loss": 1.1183, "step": 99400 }, { - "epoch": 1.01, - "learning_rate": 5.983170744902484e-05, - "loss": 1.5162, + "epoch": 1.3708632994406327, + "grad_norm": 10.551412582397461, + "learning_rate": 5.408867821022363e-05, + "loss": 1.0943, "step": 99500 }, { - "epoch": 1.01, - "learning_rate": 5.983100506077726e-05, - "loss": 1.2813, + "epoch": 1.372241051500372, + "grad_norm": 15.108549118041992, + "learning_rate": 5.408663071238001e-05, + "loss": 1.0646, "step": 99600 }, { - "epoch": 1.02, - "learning_rate": 5.983030121397156e-05, - "loss": 1.483, + "epoch": 1.3736188035601113, + "grad_norm": 125.39273834228516, + "learning_rate": 5.408458095586611e-05, + "loss": 1.0462, "step": 99700 }, { - "epoch": 1.02, - "learning_rate": 5.982959590864214e-05, - "loss": 1.5009, + "epoch": 1.3749965556198507, + "grad_norm": 15.085184097290039, + "learning_rate": 5.408252894085605e-05, + "loss": 0.9886, "step": 99800 }, { - "epoch": 1.02, - "learning_rate": 5.982888914482349e-05, - "loss": 1.3123, + "epoch": 1.37637430767959, + "grad_norm": 5.552258014678955, + "learning_rate": 5.408047466752415e-05, + "loss": 1.1143, "step": 99900 }, { - "epoch": 1.02, - "learning_rate": 5.9828180922550186e-05, - "loss": 1.4835, + "epoch": 1.3777520597393293, + "grad_norm": 12.217641830444336, + "learning_rate": 5.407841813604495e-05, + "loss": 0.9727, "step": 100000 }, { - "epoch": 1.02, - "learning_rate": 5.982747124185684e-05, - "loss": 1.349, + "epoch": 1.3791298117990687, + "grad_norm": 6.6453423500061035, + "learning_rate": 5.4076359346593153e-05, + "loss": 0.974, "step": 100100 }, { - "epoch": 1.02, - "learning_rate": 5.982676010277814e-05, - "loss": 1.3898, + "epoch": 1.380507563858808, + "grad_norm": 8.828533172607422, + "learning_rate": 5.407431892099168e-05, + "loss": 0.9244, "step": 100200 }, { - "epoch": 1.02, - "learning_rate": 5.982604750534888e-05, - "loss": 1.3675, + "epoch": 1.3818853159185474, + "grad_norm": 4.5843329429626465, + "learning_rate": 5.407225563869496e-05, + "loss": 0.9709, "step": 100300 }, { - "epoch": 1.02, - "learning_rate": 5.982533344960388e-05, - "loss": 1.3503, + "epoch": 1.3832630679782867, + "grad_norm": 8.428098678588867, + "learning_rate": 5.407019009894918e-05, + "loss": 1.1502, "step": 100400 }, { - "epoch": 1.02, - "learning_rate": 5.982461793557806e-05, - "loss": 1.2747, + "epoch": 1.384640820038026, + "grad_norm": 6.951627254486084, + "learning_rate": 5.406812230192981e-05, + "loss": 1.0878, "step": 100500 }, { - "epoch": 1.02, - "learning_rate": 5.9823900963306415e-05, - "loss": 1.3983, + "epoch": 1.3860185720977654, + "grad_norm": 14.449235916137695, + "learning_rate": 5.406605224781254e-05, + "loss": 1.0733, "step": 100600 }, { - "epoch": 1.03, - "learning_rate": 5.9823182532823986e-05, - "loss": 1.3127, + "epoch": 1.3873963241575047, + "grad_norm": 34.672786712646484, + "learning_rate": 5.406397993677322e-05, + "loss": 0.9911, "step": 100700 }, { - "epoch": 1.03, - "learning_rate": 5.982246264416591e-05, - "loss": 1.4465, + "epoch": 1.388774076217244, + "grad_norm": 7.907532215118408, + "learning_rate": 5.406190536898789e-05, + "loss": 0.977, "step": 100800 }, { - "epoch": 1.03, - "learning_rate": 5.982174129736738e-05, - "loss": 1.425, + "epoch": 1.3901518282769834, + "grad_norm": 12.782571792602539, + "learning_rate": 5.4059828544632824e-05, + "loss": 1.2255, "step": 100900 }, { - "epoch": 1.03, - "learning_rate": 5.9821018492463666e-05, - "loss": 1.4288, + "epoch": 1.3915295803367225, + "grad_norm": 18.650449752807617, + "learning_rate": 5.405774946388445e-05, + "loss": 1.0604, "step": 101000 }, { - "epoch": 1.03, - "learning_rate": 5.982029422949012e-05, - "loss": 1.3546, + "epoch": 1.392907332396462, + "grad_norm": 27.865568161010742, + "learning_rate": 5.4055668126919373e-05, + "loss": 1.0506, "step": 101100 }, { - "epoch": 1.03, - "learning_rate": 5.981956850848213e-05, - "loss": 1.3968, + "epoch": 1.3942850844562011, + "grad_norm": 7.016862869262695, + "learning_rate": 5.405358453391444e-05, + "loss": 1.0837, "step": 101200 }, { - "epoch": 1.03, - "learning_rate": 5.98188413294752e-05, - "loss": 1.3547, + "epoch": 1.3956628365159407, + "grad_norm": 47.0599250793457, + "learning_rate": 5.4051498685046655e-05, + "loss": 1.0757, "step": 101300 }, { - "epoch": 1.03, - "learning_rate": 5.981811269250488e-05, - "loss": 1.4353, + "epoch": 1.3970405885756798, + "grad_norm": 47.66703414916992, + "learning_rate": 5.404941058049321e-05, + "loss": 1.1705, "step": 101400 }, { - "epoch": 1.03, - "learning_rate": 5.98173825976068e-05, - "loss": 1.4136, + "epoch": 1.3984183406354194, + "grad_norm": 28.557613372802734, + "learning_rate": 5.4047320220431524e-05, + "loss": 1.1288, "step": 101500 }, { - "epoch": 1.04, - "learning_rate": 5.981665104481665e-05, - "loss": 1.4651, + "epoch": 1.3997960926951585, + "grad_norm": 60.851383209228516, + "learning_rate": 5.404524854235639e-05, + "loss": 1.1702, "step": 101600 }, { - "epoch": 1.04, - "learning_rate": 5.981591803417019e-05, - "loss": 1.4144, + "epoch": 1.401173844754898, + "grad_norm": 87.85972595214844, + "learning_rate": 5.404315369436178e-05, + "loss": 1.1407, "step": 101700 }, { - "epoch": 1.04, - "learning_rate": 5.981518356570328e-05, - "loss": 1.2181, + "epoch": 1.4025515968146371, + "grad_norm": 12.193360328674316, + "learning_rate": 5.404105659139048e-05, + "loss": 1.1541, "step": 101800 }, { - "epoch": 1.04, - "learning_rate": 5.981444763945181e-05, - "loss": 1.3956, + "epoch": 1.4039293488743767, + "grad_norm": 58.03166961669922, + "learning_rate": 5.4038957233620636e-05, + "loss": 1.2748, "step": 101900 }, { - "epoch": 1.04, - "learning_rate": 5.981371025545179e-05, - "loss": 1.3597, + "epoch": 1.4053071009341158, + "grad_norm": 9.301190376281738, + "learning_rate": 5.403685562123061e-05, + "loss": 1.1759, "step": 102000 }, { - "epoch": 1.04, - "learning_rate": 5.9812971413739246e-05, - "loss": 1.3117, + "epoch": 1.4066848529938552, + "grad_norm": 21.133102416992188, + "learning_rate": 5.403475175439893e-05, + "loss": 1.0426, "step": 102100 }, { - "epoch": 1.04, - "learning_rate": 5.9812231114350316e-05, - "loss": 1.3608, + "epoch": 1.4080626050535945, + "grad_norm": 20.334646224975586, + "learning_rate": 5.403264563330434e-05, + "loss": 1.1278, "step": 102200 }, { - "epoch": 1.04, - "learning_rate": 5.98114893573212e-05, - "loss": 1.2655, + "epoch": 1.4094403571133338, + "grad_norm": 12.950143814086914, + "learning_rate": 5.403053725812576e-05, + "loss": 1.1, "step": 102300 }, { - "epoch": 1.04, - "learning_rate": 5.9810746142688154e-05, - "loss": 1.2893, + "epoch": 1.4108181091730732, + "grad_norm": 49.374385833740234, + "learning_rate": 5.402842662904231e-05, + "loss": 0.9982, "step": 102400 }, { - "epoch": 1.04, - "learning_rate": 5.981000147048753e-05, - "loss": 1.3463, + "epoch": 1.4121958612328125, + "grad_norm": 12.126702308654785, + "learning_rate": 5.40263137462333e-05, + "loss": 1.1325, "step": 102500 }, { - "epoch": 1.05, - "learning_rate": 5.9809255340755724e-05, - "loss": 1.5518, + "epoch": 1.4135736132925518, + "grad_norm": 30.65694236755371, + "learning_rate": 5.4024198609878235e-05, + "loss": 1.1715, "step": 102600 }, { - "epoch": 1.05, - "learning_rate": 5.9808507753529225e-05, - "loss": 1.3162, + "epoch": 1.4149513653522912, + "grad_norm": 39.14908218383789, + "learning_rate": 5.40220812201568e-05, + "loss": 1.1905, "step": 102700 }, { - "epoch": 1.05, - "learning_rate": 5.980775870884459e-05, - "loss": 1.2651, + "epoch": 1.4163291174120305, + "grad_norm": 6.459589958190918, + "learning_rate": 5.4019961577248875e-05, + "loss": 1.0743, "step": 102800 }, { - "epoch": 1.05, - "learning_rate": 5.980700820673843e-05, - "loss": 1.4226, + "epoch": 1.4177068694717698, + "grad_norm": 7.655505657196045, + "learning_rate": 5.401783968133454e-05, + "loss": 1.1819, "step": 102900 }, { - "epoch": 1.05, - "learning_rate": 5.980625624724745e-05, - "loss": 1.3878, + "epoch": 1.4190846215315092, + "grad_norm": 36.74433898925781, + "learning_rate": 5.4015715532594056e-05, + "loss": 1.0713, "step": 103000 }, { - "epoch": 1.05, - "learning_rate": 5.980550283040842e-05, - "loss": 1.3676, + "epoch": 1.4204623735912485, + "grad_norm": 8.84647274017334, + "learning_rate": 5.4013589131207894e-05, + "loss": 1.1303, "step": 103100 }, { - "epoch": 1.05, - "learning_rate": 5.9804747956258166e-05, - "loss": 1.3179, + "epoch": 1.4218401256509878, + "grad_norm": 6.3132853507995605, + "learning_rate": 5.401146047735668e-05, + "loss": 0.9917, "step": 103200 }, { - "epoch": 1.05, - "learning_rate": 5.9803991624833594e-05, - "loss": 1.5319, + "epoch": 1.4232178777107272, + "grad_norm": 27.791242599487305, + "learning_rate": 5.4009329571221284e-05, + "loss": 1.1319, "step": 103300 }, { - "epoch": 1.05, - "learning_rate": 5.980323383617171e-05, - "loss": 1.3493, + "epoch": 1.4245956297704665, + "grad_norm": 28.155214309692383, + "learning_rate": 5.400719641298271e-05, + "loss": 1.0323, "step": 103400 }, { - "epoch": 1.05, - "learning_rate": 5.9802474590309544e-05, - "loss": 1.3645, + "epoch": 1.4259733818302058, + "grad_norm": 24.31360626220703, + "learning_rate": 5.400506100282219e-05, + "loss": 1.034, "step": 103500 }, { - "epoch": 1.06, - "learning_rate": 5.980171388728421e-05, - "loss": 1.4624, + "epoch": 1.4273511338899452, + "grad_norm": 8.539220809936523, + "learning_rate": 5.400292334092114e-05, + "loss": 1.0076, "step": 103600 }, { - "epoch": 1.06, - "learning_rate": 5.9800951727132924e-05, - "loss": 1.4435, + "epoch": 1.4287288859496845, + "grad_norm": 8.166019439697266, + "learning_rate": 5.400078342746116e-05, + "loss": 1.0668, "step": 103700 }, { - "epoch": 1.06, - "learning_rate": 5.980018810989294e-05, - "loss": 1.426, + "epoch": 1.4301066380094238, + "grad_norm": 12.023422241210938, + "learning_rate": 5.3998641262624057e-05, + "loss": 1.0229, "step": 103800 }, { - "epoch": 1.06, - "learning_rate": 5.979942303560159e-05, - "loss": 1.3458, + "epoch": 1.4314843900691632, + "grad_norm": 7.248012065887451, + "learning_rate": 5.3996496846591805e-05, + "loss": 1.0255, "step": 103900 }, { - "epoch": 1.06, - "learning_rate": 5.97986565042963e-05, - "loss": 1.4218, + "epoch": 1.4328621421289025, + "grad_norm": 135.61102294921875, + "learning_rate": 5.399435017954659e-05, + "loss": 1.0787, "step": 104000 }, { - "epoch": 1.06, - "learning_rate": 5.9797888516014524e-05, - "loss": 1.2861, + "epoch": 1.4342398941886418, + "grad_norm": 15.248514175415039, + "learning_rate": 5.3992201261670796e-05, + "loss": 1.067, "step": 104100 }, { - "epoch": 1.06, - "learning_rate": 5.979711907079383e-05, - "loss": 1.2324, + "epoch": 1.4356176462483812, + "grad_norm": 7.450089931488037, + "learning_rate": 5.3990050093146966e-05, + "loss": 1.0073, "step": 104200 }, { - "epoch": 1.06, - "learning_rate": 5.979634816867183e-05, - "loss": 1.3904, + "epoch": 1.4369953983081205, + "grad_norm": 17.849102020263672, + "learning_rate": 5.398789667415786e-05, + "loss": 1.0851, "step": 104300 }, { - "epoch": 1.06, - "learning_rate": 5.979557580968621e-05, - "loss": 1.3849, + "epoch": 1.4383731503678598, + "grad_norm": 25.645145416259766, + "learning_rate": 5.398574100488642e-05, + "loss": 1.0042, "step": 104400 }, { - "epoch": 1.06, - "learning_rate": 5.9794801993874764e-05, - "loss": 1.3526, + "epoch": 1.4397509024275992, + "grad_norm": 16.04149055480957, + "learning_rate": 5.398358308551577e-05, + "loss": 1.0395, "step": 104500 }, { - "epoch": 1.07, - "learning_rate": 5.9794026721275295e-05, - "loss": 1.2336, + "epoch": 1.4411286544873385, + "grad_norm": 48.494625091552734, + "learning_rate": 5.398142291622926e-05, + "loss": 1.1061, "step": 104600 }, { - "epoch": 1.07, - "learning_rate": 5.979324999192572e-05, - "loss": 1.3354, + "epoch": 1.4425064065470778, + "grad_norm": 8.755069732666016, + "learning_rate": 5.39792604972104e-05, + "loss": 1.1055, "step": 104700 }, { - "epoch": 1.07, - "learning_rate": 5.979247180586403e-05, - "loss": 1.2959, + "epoch": 1.4438841586068172, + "grad_norm": 12.382411003112793, + "learning_rate": 5.397709582864288e-05, + "loss": 1.0642, "step": 104800 }, { - "epoch": 1.07, - "learning_rate": 5.979169216312825e-05, - "loss": 1.1845, + "epoch": 1.4452619106665565, + "grad_norm": 23.622568130493164, + "learning_rate": 5.3974928910710615e-05, + "loss": 1.0832, "step": 104900 }, { - "epoch": 1.07, - "learning_rate": 5.9790911063756516e-05, - "loss": 1.3007, + "epoch": 1.4466396627262958, + "grad_norm": 5.427385330200195, + "learning_rate": 5.3972759743597696e-05, + "loss": 1.0343, "step": 105000 }, { - "epoch": 1.07, - "learning_rate": 5.979012850778701e-05, - "loss": 1.2666, + "epoch": 1.4480174147860352, + "grad_norm": 10.036494255065918, + "learning_rate": 5.397058832748841e-05, + "loss": 1.049, "step": 105100 }, { - "epoch": 1.07, - "learning_rate": 5.978934449525799e-05, - "loss": 1.3871, + "epoch": 1.4493951668457745, + "grad_norm": 37.23356628417969, + "learning_rate": 5.396841466256722e-05, + "loss": 1.0519, "step": 105200 }, { - "epoch": 1.07, - "learning_rate": 5.978855902620781e-05, - "loss": 1.4556, + "epoch": 1.4507729189055136, + "grad_norm": 10.193114280700684, + "learning_rate": 5.396626051928437e-05, + "loss": 1.1681, "step": 105300 }, { - "epoch": 1.07, - "learning_rate": 5.978777210067486e-05, - "loss": 1.2591, + "epoch": 1.4521506709652532, + "grad_norm": 12.274484634399414, + "learning_rate": 5.3964082379777066e-05, + "loss": 1.0629, "step": 105400 }, { - "epoch": 1.07, - "learning_rate": 5.978698371869762e-05, - "loss": 1.39, + "epoch": 1.4535284230249923, + "grad_norm": 29.097864151000977, + "learning_rate": 5.396190199201058e-05, + "loss": 0.9886, "step": 105500 }, { - "epoch": 1.08, - "learning_rate": 5.978619388031463e-05, - "loss": 1.4079, + "epoch": 1.4549061750847319, + "grad_norm": 35.82841110229492, + "learning_rate": 5.3959719356170134e-05, + "loss": 1.0094, "step": 105600 }, { - "epoch": 1.08, - "learning_rate": 5.978540258556452e-05, - "loss": 1.3003, + "epoch": 1.456283927144471, + "grad_norm": 18.119590759277344, + "learning_rate": 5.395753447244117e-05, + "loss": 1.1827, "step": 105700 }, { - "epoch": 1.08, - "learning_rate": 5.9784609834485965e-05, - "loss": 1.3884, + "epoch": 1.4576616792042105, + "grad_norm": 32.812923431396484, + "learning_rate": 5.395534734100929e-05, + "loss": 1.0668, "step": 105800 }, { - "epoch": 1.08, - "learning_rate": 5.9783815627117745e-05, - "loss": 1.408, + "epoch": 1.4590394312639496, + "grad_norm": 8.903825759887695, + "learning_rate": 5.395315796206031e-05, + "loss": 1.0225, "step": 105900 }, { - "epoch": 1.08, - "learning_rate": 5.9783019963498675e-05, - "loss": 1.3368, + "epoch": 1.4604171833236892, + "grad_norm": 28.307601928710938, + "learning_rate": 5.3950988263166706e-05, + "loss": 0.9488, "step": 106000 }, { - "epoch": 1.08, - "learning_rate": 5.9782222843667663e-05, - "loss": 1.4822, + "epoch": 1.4617949353834283, + "grad_norm": 5.0161542892456055, + "learning_rate": 5.3948794412212234e-05, + "loss": 0.9748, "step": 106100 }, { - "epoch": 1.08, - "learning_rate": 5.978142426766369e-05, - "loss": 1.2155, + "epoch": 1.4631726874431679, + "grad_norm": 32.3056755065918, + "learning_rate": 5.394659831429737e-05, + "loss": 1.0922, "step": 106200 }, { - "epoch": 1.08, - "learning_rate": 5.97806322430549e-05, - "loss": 1.2892, + "epoch": 1.464550439502907, + "grad_norm": 20.302715301513672, + "learning_rate": 5.3944399969608686e-05, + "loss": 1.1104, "step": 106300 }, { - "epoch": 1.08, - "learning_rate": 5.977983076938296e-05, - "loss": 1.3215, + "epoch": 1.4659281915626463, + "grad_norm": 15.857366561889648, + "learning_rate": 5.3942199378332935e-05, + "loss": 1.0989, "step": 106400 }, { - "epoch": 1.09, - "learning_rate": 5.9779027839655e-05, - "loss": 1.3413, + "epoch": 1.4673059436223856, + "grad_norm": 22.21146583557129, + "learning_rate": 5.3939996540657055e-05, + "loss": 1.1167, "step": 106500 }, { - "epoch": 1.09, - "learning_rate": 5.977822345391029e-05, - "loss": 1.3134, + "epoch": 1.468683695682125, + "grad_norm": 8.890013694763184, + "learning_rate": 5.393779145676821e-05, + "loss": 1.1702, "step": 106600 }, { - "epoch": 1.09, - "learning_rate": 5.977741761218818e-05, - "loss": 1.3632, + "epoch": 1.4700614477418643, + "grad_norm": 32.30009841918945, + "learning_rate": 5.393558412685373e-05, + "loss": 1.0053, "step": 106700 }, { - "epoch": 1.09, - "learning_rate": 5.977661031452803e-05, - "loss": 1.2829, + "epoch": 1.4714391998016036, + "grad_norm": 40.002044677734375, + "learning_rate": 5.393337455110113e-05, + "loss": 1.0999, "step": 106800 }, { - "epoch": 1.09, - "learning_rate": 5.977580156096934e-05, - "loss": 1.3444, + "epoch": 1.472816951861343, + "grad_norm": 13.027634620666504, + "learning_rate": 5.393116272969814e-05, + "loss": 1.1368, "step": 106900 }, { - "epoch": 1.09, - "learning_rate": 5.977499135155165e-05, - "loss": 1.3269, + "epoch": 1.4741947039210823, + "grad_norm": 10.368160247802734, + "learning_rate": 5.3928948662832645e-05, + "loss": 1.0435, "step": 107000 }, { - "epoch": 1.09, - "learning_rate": 5.9774179686314575e-05, - "loss": 1.363, + "epoch": 1.4755724559808217, + "grad_norm": 3.22397518157959, + "learning_rate": 5.3926732350692756e-05, + "loss": 1.0691, "step": 107100 }, { - "epoch": 1.09, - "learning_rate": 5.977336656529779e-05, - "loss": 1.3312, + "epoch": 1.476950208040561, + "grad_norm": 4.171754837036133, + "learning_rate": 5.392451379346676e-05, + "loss": 1.0992, "step": 107200 }, { - "epoch": 1.09, - "learning_rate": 5.977255198854105e-05, - "loss": 1.4233, + "epoch": 1.4783279601003003, + "grad_norm": 7.594644069671631, + "learning_rate": 5.392229299134312e-05, + "loss": 1.0613, "step": 107300 }, { - "epoch": 1.09, - "learning_rate": 5.97717359560842e-05, - "loss": 1.2688, + "epoch": 1.4797057121600397, + "grad_norm": 69.45877838134766, + "learning_rate": 5.392006994451051e-05, + "loss": 0.9404, "step": 107400 }, { - "epoch": 1.1, - "learning_rate": 5.977092665005369e-05, - "loss": 1.3501, + "epoch": 1.481083464219779, + "grad_norm": 10.095816612243652, + "learning_rate": 5.3917844653157806e-05, + "loss": 1.0178, "step": 107500 }, { - "epoch": 1.1, - "learning_rate": 5.9770107720872375e-05, - "loss": 1.2075, + "epoch": 1.4824612162795183, + "grad_norm": 14.364834785461426, + "learning_rate": 5.391561711747404e-05, + "loss": 1.0506, "step": 107600 }, { - "epoch": 1.1, - "learning_rate": 5.976928733611045e-05, - "loss": 1.3243, + "epoch": 1.4838389683392577, + "grad_norm": 9.645843505859375, + "learning_rate": 5.3913387337648464e-05, + "loss": 0.9973, "step": 107700 }, { - "epoch": 1.1, - "learning_rate": 5.976846549580803e-05, - "loss": 1.3873, + "epoch": 1.485216720398997, + "grad_norm": 10.550060272216797, + "learning_rate": 5.39111553138705e-05, + "loss": 1.0046, "step": 107800 }, { - "epoch": 1.1, - "learning_rate": 5.97676422000053e-05, - "loss": 1.3019, + "epoch": 1.4865944724587363, + "grad_norm": 29.86020851135254, + "learning_rate": 5.3908921046329774e-05, + "loss": 1.0615, "step": 107900 }, { - "epoch": 1.1, - "learning_rate": 5.9766817448742495e-05, - "loss": 1.2509, + "epoch": 1.4879722245184757, + "grad_norm": 5.713249206542969, + "learning_rate": 5.39066845352161e-05, + "loss": 1.0428, "step": 108000 }, { - "epoch": 1.1, - "learning_rate": 5.9765991242059965e-05, - "loss": 1.3011, + "epoch": 1.489349976578215, + "grad_norm": 15.704887390136719, + "learning_rate": 5.390444578071948e-05, + "loss": 1.1417, "step": 108100 }, { - "epoch": 1.1, - "learning_rate": 5.97651635799981e-05, - "loss": 1.3448, + "epoch": 1.4907277286379543, + "grad_norm": 59.428550720214844, + "learning_rate": 5.3902204783030106e-05, + "loss": 1.002, "step": 108200 }, { - "epoch": 1.1, - "learning_rate": 5.976433446259737e-05, - "loss": 1.2108, + "epoch": 1.4921054806976937, + "grad_norm": 49.97745895385742, + "learning_rate": 5.389996154233835e-05, + "loss": 0.9987, "step": 108300 }, { - "epoch": 1.1, - "learning_rate": 5.9763503889898296e-05, - "loss": 1.2111, + "epoch": 1.493483232757433, + "grad_norm": 5.066510200500488, + "learning_rate": 5.3897716058834815e-05, + "loss": 1.059, "step": 108400 }, { - "epoch": 1.11, - "learning_rate": 5.976267186194151e-05, - "loss": 1.1812, + "epoch": 1.4948609848171723, + "grad_norm": 9.55388355255127, + "learning_rate": 5.3895468332710244e-05, + "loss": 1.0086, "step": 108500 }, { - "epoch": 1.11, - "learning_rate": 5.976183837876768e-05, - "loss": 1.2569, + "epoch": 1.4962387368769117, + "grad_norm": 12.000092506408691, + "learning_rate": 5.38932183641556e-05, + "loss": 1.0447, "step": 108600 }, { - "epoch": 1.11, - "learning_rate": 5.976100344041757e-05, - "loss": 1.3887, + "epoch": 1.497616488936651, + "grad_norm": 19.03859519958496, + "learning_rate": 5.3890966153362034e-05, + "loss": 1.028, "step": 108700 }, { - "epoch": 1.11, - "learning_rate": 5.976016704693198e-05, - "loss": 1.254, + "epoch": 1.4989942409963903, + "grad_norm": 22.942298889160156, + "learning_rate": 5.388871170052088e-05, + "loss": 1.0599, "step": 108800 }, { - "epoch": 1.11, - "learning_rate": 5.975932919835184e-05, - "loss": 1.3807, + "epoch": 1.5003719930561297, + "grad_norm": 8.335474014282227, + "learning_rate": 5.388647758386719e-05, + "loss": 1.0734, "step": 108900 }, { - "epoch": 1.11, - "learning_rate": 5.975848989471809e-05, - "loss": 1.2827, + "epoch": 1.5017497451158688, + "grad_norm": 6.595418453216553, + "learning_rate": 5.388421866992131e-05, + "loss": 1.0177, "step": 109000 }, { - "epoch": 1.11, - "learning_rate": 5.975764913607177e-05, - "loss": 1.3466, + "epoch": 1.5031274971756083, + "grad_norm": 20.5299129486084, + "learning_rate": 5.3881957514501086e-05, + "loss": 0.971, "step": 109100 }, { - "epoch": 1.11, - "learning_rate": 5.975680692245399e-05, - "loss": 1.2963, + "epoch": 1.5045052492353475, + "grad_norm": 25.569503784179688, + "learning_rate": 5.3879694117798595e-05, + "loss": 1.1343, "step": 109200 }, { - "epoch": 1.11, - "learning_rate": 5.975596325390593e-05, - "loss": 1.2993, + "epoch": 1.505883001295087, + "grad_norm": 12.386091232299805, + "learning_rate": 5.387742848000614e-05, + "loss": 0.9886, "step": 109300 }, { - "epoch": 1.11, - "learning_rate": 5.9755118130468846e-05, - "loss": 1.3292, + "epoch": 1.5072607533548261, + "grad_norm": 17.859819412231445, + "learning_rate": 5.3875160601316176e-05, + "loss": 0.9962, "step": 109400 }, { - "epoch": 1.12, - "learning_rate": 5.9754271552184064e-05, - "loss": 1.2301, + "epoch": 1.5086385054145657, + "grad_norm": 12.250004768371582, + "learning_rate": 5.387289048192139e-05, + "loss": 1.1542, "step": 109500 }, { - "epoch": 1.12, - "learning_rate": 5.975342351909295e-05, - "loss": 1.3166, + "epoch": 1.5100162574743048, + "grad_norm": 13.733154296875, + "learning_rate": 5.387061812201464e-05, + "loss": 0.9375, "step": 109600 }, { - "epoch": 1.12, - "learning_rate": 5.975257403123699e-05, - "loss": 1.376, + "epoch": 1.5113940095340443, + "grad_norm": 57.36831283569336, + "learning_rate": 5.386834352178896e-05, + "loss": 1.0113, "step": 109700 }, { - "epoch": 1.12, - "learning_rate": 5.975172308865772e-05, - "loss": 1.3041, + "epoch": 1.5127717615937835, + "grad_norm": 17.72985076904297, + "learning_rate": 5.386606668143761e-05, + "loss": 1.0345, "step": 109800 }, { - "epoch": 1.12, - "learning_rate": 5.975087069139673e-05, - "loss": 1.2162, + "epoch": 1.514149513653523, + "grad_norm": 34.852169036865234, + "learning_rate": 5.3863787601153996e-05, + "loss": 1.0197, "step": 109900 }, { - "epoch": 1.12, - "learning_rate": 5.975001683949572e-05, - "loss": 1.3114, + "epoch": 1.5155272657132621, + "grad_norm": 45.96980667114258, + "learning_rate": 5.386150628113176e-05, + "loss": 1.0378, "step": 110000 }, { - "epoch": 1.12, - "learning_rate": 5.974916153299642e-05, - "loss": 1.3083, + "epoch": 1.5169050177730017, + "grad_norm": 15.316779136657715, + "learning_rate": 5.3859222721564696e-05, + "loss": 1.0835, "step": 110100 }, { - "epoch": 1.12, - "learning_rate": 5.974830477194065e-05, - "loss": 1.3603, + "epoch": 1.5182827698327408, + "grad_norm": 14.019207000732422, + "learning_rate": 5.38569369226468e-05, + "loss": 1.0603, "step": 110200 }, { - "epoch": 1.12, - "learning_rate": 5.974744655637031e-05, - "loss": 1.2566, + "epoch": 1.5196605218924804, + "grad_norm": 4.631565570831299, + "learning_rate": 5.3854648884572284e-05, + "loss": 1.0236, "step": 110300 }, { - "epoch": 1.12, - "learning_rate": 5.9746595490227285e-05, - "loss": 1.3417, + "epoch": 1.5210382739522195, + "grad_norm": 4.182553768157959, + "learning_rate": 5.385235860753551e-05, + "loss": 0.9696, "step": 110400 }, { - "epoch": 1.13, - "learning_rate": 5.974573438029785e-05, - "loss": 1.3569, + "epoch": 1.522416026011959, + "grad_norm": 14.243215560913086, + "learning_rate": 5.385006609173106e-05, + "loss": 1.0056, "step": 110500 }, { - "epoch": 1.13, - "learning_rate": 5.974487181597951e-05, - "loss": 1.3611, + "epoch": 1.5237937780716981, + "grad_norm": 8.640573501586914, + "learning_rate": 5.384777133735368e-05, + "loss": 1.0065, "step": 110600 }, { - "epoch": 1.13, - "learning_rate": 5.974400779731445e-05, - "loss": 1.4259, + "epoch": 1.5251715301314377, + "grad_norm": 13.582250595092773, + "learning_rate": 5.384547434459834e-05, + "loss": 1.0522, "step": 110700 }, { - "epoch": 1.13, - "learning_rate": 5.9743142324344916e-05, - "loss": 1.3045, + "epoch": 1.5265492821911768, + "grad_norm": 17.925390243530273, + "learning_rate": 5.3843175113660164e-05, + "loss": 0.9387, "step": 110800 }, { - "epoch": 1.13, - "learning_rate": 5.974227539711322e-05, - "loss": 1.4275, + "epoch": 1.5279270342509164, + "grad_norm": 42.459449768066406, + "learning_rate": 5.38408736447345e-05, + "loss": 0.9725, "step": 110900 }, { - "epoch": 1.13, - "learning_rate": 5.9741407015661744e-05, - "loss": 1.4117, + "epoch": 1.5293047863106555, + "grad_norm": 6.976260662078857, + "learning_rate": 5.3838569938016854e-05, + "loss": 1.063, "step": 111000 }, { - "epoch": 1.13, - "learning_rate": 5.974053718003295e-05, - "loss": 1.1097, + "epoch": 1.530682538370395, + "grad_norm": 59.334529876708984, + "learning_rate": 5.383626399370295e-05, + "loss": 1.0706, "step": 111100 }, { - "epoch": 1.13, - "learning_rate": 5.9739665890269376e-05, - "loss": 1.2404, + "epoch": 1.5320602904301341, + "grad_norm": 7.620677471160889, + "learning_rate": 5.383395581198867e-05, + "loss": 1.109, "step": 111200 }, { - "epoch": 1.13, - "learning_rate": 5.9738793146413615e-05, - "loss": 1.4136, + "epoch": 1.5334380424898735, + "grad_norm": 37.53221893310547, + "learning_rate": 5.3831645393070136e-05, + "loss": 1.0702, "step": 111300 }, { - "epoch": 1.13, - "learning_rate": 5.97379276976848e-05, - "loss": 1.3324, + "epoch": 1.5348157945496128, + "grad_norm": 35.887611389160156, + "learning_rate": 5.3829332737143606e-05, + "loss": 1.0681, "step": 111400 }, { - "epoch": 1.14, - "learning_rate": 5.973705206031262e-05, - "loss": 1.2511, + "epoch": 1.5361935466093521, + "grad_norm": 16.638322830200195, + "learning_rate": 5.3827017844405564e-05, + "loss": 1.1362, "step": 111500 }, { - "epoch": 1.14, - "learning_rate": 5.973617496897605e-05, - "loss": 1.4575, + "epoch": 1.5375712986690915, + "grad_norm": 20.41179656982422, + "learning_rate": 5.3824700715052664e-05, + "loss": 0.9979, "step": 111600 }, { - "epoch": 1.14, - "learning_rate": 5.973529642371797e-05, - "loss": 1.2321, + "epoch": 1.5389490507288308, + "grad_norm": 17.379352569580078, + "learning_rate": 5.382238134928177e-05, + "loss": 1.0898, "step": 111700 }, { - "epoch": 1.14, - "learning_rate": 5.9734416424581356e-05, - "loss": 1.2767, + "epoch": 1.5403268027885701, + "grad_norm": 33.538482666015625, + "learning_rate": 5.38200597472899e-05, + "loss": 1.1033, "step": 111800 }, { - "epoch": 1.14, - "learning_rate": 5.973353497160922e-05, - "loss": 1.3828, + "epoch": 1.5417045548483095, + "grad_norm": 13.266926765441895, + "learning_rate": 5.381773590927432e-05, + "loss": 0.9791, "step": 111900 }, { - "epoch": 1.14, - "learning_rate": 5.973265206484467e-05, - "loss": 1.314, + "epoch": 1.5430823069080488, + "grad_norm": 12.860810279846191, + "learning_rate": 5.381540983543243e-05, + "loss": 1.088, "step": 112000 }, { - "epoch": 1.14, - "learning_rate": 5.973176770433087e-05, - "loss": 1.4695, + "epoch": 1.5444600589677882, + "grad_norm": 5.510438919067383, + "learning_rate": 5.381308152596184e-05, + "loss": 1.0915, "step": 112100 }, { - "epoch": 1.14, - "learning_rate": 5.973088189011105e-05, - "loss": 1.4346, + "epoch": 1.5458378110275275, + "grad_norm": 17.27724838256836, + "learning_rate": 5.381075098106036e-05, + "loss": 1.0701, "step": 112200 }, { - "epoch": 1.14, - "learning_rate": 5.972999462222854e-05, - "loss": 1.4046, + "epoch": 1.5472155630872668, + "grad_norm": 7.033637046813965, + "learning_rate": 5.3808418200925974e-05, + "loss": 1.0984, "step": 112300 }, { - "epoch": 1.15, - "learning_rate": 5.972910590072671e-05, - "loss": 1.3947, + "epoch": 1.5485933151470062, + "grad_norm": 10.704571723937988, + "learning_rate": 5.380608318575688e-05, + "loss": 0.9909, "step": 112400 }, { - "epoch": 1.15, - "learning_rate": 5.9728215725649016e-05, - "loss": 1.1863, + "epoch": 1.5499710672067455, + "grad_norm": 9.773233413696289, + "learning_rate": 5.3803745935751426e-05, + "loss": 0.9809, "step": 112500 }, { - "epoch": 1.15, - "learning_rate": 5.972732409703898e-05, - "loss": 1.1956, + "epoch": 1.5513488192664848, + "grad_norm": 16.5738468170166, + "learning_rate": 5.38014064511082e-05, + "loss": 1.1881, "step": 112600 }, { - "epoch": 1.15, - "learning_rate": 5.97264310149402e-05, - "loss": 1.3824, + "epoch": 1.5527265713262242, + "grad_norm": 8.641104698181152, + "learning_rate": 5.379906473202593e-05, + "loss": 1.1011, "step": 112700 }, { - "epoch": 1.15, - "learning_rate": 5.972553647939633e-05, - "loss": 1.48, + "epoch": 1.5541043233859635, + "grad_norm": 7.702586650848389, + "learning_rate": 5.379674422929562e-05, + "loss": 0.8744, "step": 112800 }, { - "epoch": 1.15, - "learning_rate": 5.9724640490451135e-05, - "loss": 1.4075, + "epoch": 1.5554820754457028, + "grad_norm": 44.07160568237305, + "learning_rate": 5.379439806427172e-05, + "loss": 1.03, "step": 112900 }, { - "epoch": 1.15, - "learning_rate": 5.9723743048148405e-05, - "loss": 1.3613, + "epoch": 1.5568598275054422, + "grad_norm": 34.84183883666992, + "learning_rate": 5.379204966540418e-05, + "loss": 0.9633, "step": 113000 }, { - "epoch": 1.15, - "learning_rate": 5.9722844152532017e-05, - "loss": 1.3371, + "epoch": 1.5582375795651815, + "grad_norm": 84.15872955322266, + "learning_rate": 5.3789699032892514e-05, + "loss": 1.0958, "step": 113100 }, { - "epoch": 1.15, - "learning_rate": 5.972194380364593e-05, - "loss": 1.3093, + "epoch": 1.5596153316249208, + "grad_norm": 6.767889022827148, + "learning_rate": 5.378734616693641e-05, + "loss": 1.0156, "step": 113200 }, { - "epoch": 1.15, - "learning_rate": 5.9721042001534154e-05, - "loss": 1.3449, + "epoch": 1.56099308368466, + "grad_norm": 12.367714881896973, + "learning_rate": 5.378499106773578e-05, + "loss": 1.0273, "step": 113300 }, { - "epoch": 1.16, - "learning_rate": 5.97201387462408e-05, - "loss": 1.5715, + "epoch": 1.5623708357443995, + "grad_norm": 12.43026351928711, + "learning_rate": 5.378263373549067e-05, + "loss": 1.0319, "step": 113400 }, { - "epoch": 1.16, - "learning_rate": 5.9719234037810015e-05, - "loss": 1.4382, + "epoch": 1.5637485878041386, + "grad_norm": 19.776586532592773, + "learning_rate": 5.3780274170401365e-05, + "loss": 1.0497, "step": 113500 }, { - "epoch": 1.16, - "learning_rate": 5.9718327876286044e-05, - "loss": 1.4426, + "epoch": 1.5651263398638782, + "grad_norm": 30.242477416992188, + "learning_rate": 5.377791237266833e-05, + "loss": 1.0893, "step": 113600 }, { - "epoch": 1.16, - "learning_rate": 5.971742026171319e-05, - "loss": 1.3952, + "epoch": 1.5665040919236173, + "grad_norm": 13.9710693359375, + "learning_rate": 5.3775548342492194e-05, + "loss": 1.0857, "step": 113700 }, { - "epoch": 1.16, - "learning_rate": 5.9716511194135825e-05, - "loss": 1.5162, + "epoch": 1.5678818439833568, + "grad_norm": 28.976524353027344, + "learning_rate": 5.37731820800738e-05, + "loss": 1.0386, "step": 113800 }, { - "epoch": 1.16, - "learning_rate": 5.9715600673598406e-05, - "loss": 1.6213, + "epoch": 1.569259596043096, + "grad_norm": 5.827281475067139, + "learning_rate": 5.377081358561418e-05, + "loss": 1.1341, "step": 113900 }, { - "epoch": 1.16, - "learning_rate": 5.9714688700145454e-05, - "loss": 1.2823, + "epoch": 1.5706373481028355, + "grad_norm": 4.12724494934082, + "learning_rate": 5.3768442859314545e-05, + "loss": 1.0815, "step": 114000 }, { - "epoch": 1.16, - "learning_rate": 5.971377527382154e-05, - "loss": 1.3866, + "epoch": 1.5720151001625746, + "grad_norm": 24.242382049560547, + "learning_rate": 5.37660699013763e-05, + "loss": 1.1044, "step": 114100 }, { - "epoch": 1.16, - "learning_rate": 5.971286039467134e-05, - "loss": 1.4322, + "epoch": 1.5733928522223142, + "grad_norm": 73.92936706542969, + "learning_rate": 5.376369471200104e-05, + "loss": 1.0414, "step": 114200 }, { - "epoch": 1.16, - "learning_rate": 5.9711944062739595e-05, - "loss": 1.3167, + "epoch": 1.5747706042820533, + "grad_norm": 13.798851013183594, + "learning_rate": 5.3761317291390545e-05, + "loss": 1.2288, "step": 114300 }, { - "epoch": 1.17, - "learning_rate": 5.971102627807109e-05, - "loss": 1.4742, + "epoch": 1.5761483563417928, + "grad_norm": 23.716012954711914, + "learning_rate": 5.3758937639746806e-05, + "loss": 1.119, "step": 114400 }, { - "epoch": 1.17, - "learning_rate": 5.97101070407107e-05, - "loss": 1.4664, + "epoch": 1.577526108401532, + "grad_norm": 12.037857055664062, + "learning_rate": 5.375655575727197e-05, + "loss": 0.998, "step": 114500 }, { - "epoch": 1.17, - "learning_rate": 5.970918635070338e-05, - "loss": 1.5229, + "epoch": 1.5789038604612715, + "grad_norm": 8.871610641479492, + "learning_rate": 5.375417164416839e-05, + "loss": 1.0477, "step": 114600 }, { - "epoch": 1.17, - "learning_rate": 5.970826420809414e-05, - "loss": 1.4537, + "epoch": 1.5802816125210106, + "grad_norm": 18.3157958984375, + "learning_rate": 5.375178530063862e-05, + "loss": 1.0899, "step": 114700 }, { - "epoch": 1.17, - "learning_rate": 5.970734061292808e-05, - "loss": 1.526, + "epoch": 1.5816593645807502, + "grad_norm": 24.479957580566406, + "learning_rate": 5.374939672688538e-05, + "loss": 1.0893, "step": 114800 }, { - "epoch": 1.17, - "learning_rate": 5.9706415565250337e-05, - "loss": 1.4143, + "epoch": 1.5830371166404893, + "grad_norm": 15.456332206726074, + "learning_rate": 5.3747005923111596e-05, + "loss": 1.1074, "step": 114900 }, { - "epoch": 1.17, - "learning_rate": 5.970548906510616e-05, - "loss": 1.4729, + "epoch": 1.5844148687002289, + "grad_norm": 43.567195892333984, + "learning_rate": 5.3744612889520384e-05, + "loss": 1.1602, "step": 115000 }, { - "epoch": 1.17, - "learning_rate": 5.970456111254084e-05, - "loss": 1.4249, + "epoch": 1.585792620759968, + "grad_norm": 2.886906623840332, + "learning_rate": 5.374221762631504e-05, + "loss": 1.1854, "step": 115100 }, { - "epoch": 1.17, - "learning_rate": 5.9703631707599744e-05, - "loss": 1.4654, + "epoch": 1.5871703728197075, + "grad_norm": 8.213479995727539, + "learning_rate": 5.373982013369905e-05, + "loss": 1.1184, "step": 115200 }, { - "epoch": 1.17, - "learning_rate": 5.9702700850328326e-05, - "loss": 1.3872, + "epoch": 1.5885481248794466, + "grad_norm": 26.33953094482422, + "learning_rate": 5.3737420411876106e-05, + "loss": 1.0642, "step": 115300 }, { - "epoch": 1.18, - "learning_rate": 5.970176854077208e-05, - "loss": 1.4678, + "epoch": 1.5899258769391862, + "grad_norm": 11.88366985321045, + "learning_rate": 5.373501846105007e-05, + "loss": 1.0355, "step": 115400 }, { - "epoch": 1.18, - "learning_rate": 5.970083477897662e-05, - "loss": 1.3417, + "epoch": 1.5913036289989253, + "grad_norm": 10.156526565551758, + "learning_rate": 5.373261428142499e-05, + "loss": 1.0554, "step": 115500 }, { - "epoch": 1.18, - "learning_rate": 5.9699899564987576e-05, - "loss": 1.6554, + "epoch": 1.5926813810586646, + "grad_norm": 12.289494514465332, + "learning_rate": 5.373020787320512e-05, + "loss": 1.1031, "step": 115600 }, { - "epoch": 1.18, - "learning_rate": 5.9698962898850684e-05, - "loss": 1.4561, + "epoch": 1.594059133118404, + "grad_norm": 3.1095759868621826, + "learning_rate": 5.3727799236594904e-05, + "loss": 1.0181, "step": 115700 }, { - "epoch": 1.18, - "learning_rate": 5.9698024780611736e-05, - "loss": 1.5633, + "epoch": 1.5954368851781433, + "grad_norm": 21.690570831298828, + "learning_rate": 5.372538837179896e-05, + "loss": 1.0868, "step": 115800 }, { - "epoch": 1.18, - "learning_rate": 5.9697085210316614e-05, - "loss": 1.5998, + "epoch": 1.5968146372378826, + "grad_norm": 8.749502182006836, + "learning_rate": 5.372297527902211e-05, + "loss": 1.1044, "step": 115900 }, { - "epoch": 1.18, - "learning_rate": 5.969614418801124e-05, - "loss": 1.3655, + "epoch": 1.598192389297622, + "grad_norm": 9.71464729309082, + "learning_rate": 5.3720559958469344e-05, + "loss": 0.9406, "step": 116000 }, { - "epoch": 1.18, - "learning_rate": 5.9695201713741634e-05, - "loss": 1.446, + "epoch": 1.5995701413573613, + "grad_norm": 58.29253005981445, + "learning_rate": 5.3718142410345875e-05, + "loss": 1.0254, "step": 116100 }, { - "epoch": 1.18, - "learning_rate": 5.969425778755388e-05, - "loss": 1.4751, + "epoch": 1.6009478934171006, + "grad_norm": 14.844298362731934, + "learning_rate": 5.3715722634857074e-05, + "loss": 1.0255, "step": 116200 }, { - "epoch": 1.18, - "learning_rate": 5.969331240949412e-05, - "loss": 1.2872, + "epoch": 1.60232564547684, + "grad_norm": 22.95594024658203, + "learning_rate": 5.371330063220852e-05, + "loss": 1.096, "step": 116300 }, { - "epoch": 1.19, - "learning_rate": 5.969236557960859e-05, - "loss": 1.3157, + "epoch": 1.6037033975365793, + "grad_norm": 25.867507934570312, + "learning_rate": 5.371087640260597e-05, + "loss": 1.0485, "step": 116400 }, { - "epoch": 1.19, - "learning_rate": 5.969141729794358e-05, - "loss": 1.3384, + "epoch": 1.6050811495963186, + "grad_norm": 11.754130363464355, + "learning_rate": 5.370844994625537e-05, + "loss": 1.1064, "step": 116500 }, { - "epoch": 1.19, - "learning_rate": 5.9690467564545444e-05, - "loss": 1.504, + "epoch": 1.606458901656058, + "grad_norm": 11.510786056518555, + "learning_rate": 5.3706021263362867e-05, + "loss": 1.0876, "step": 116600 }, { - "epoch": 1.19, - "learning_rate": 5.968952589849718e-05, - "loss": 1.4771, + "epoch": 1.6078366537157973, + "grad_norm": 16.26561737060547, + "learning_rate": 5.370359035413479e-05, + "loss": 1.1881, "step": 116700 }, { - "epoch": 1.19, - "learning_rate": 5.9688573276288354e-05, - "loss": 1.3557, + "epoch": 1.6092144057755366, + "grad_norm": 36.43238067626953, + "learning_rate": 5.3701157218777656e-05, + "loss": 1.0849, "step": 116800 }, { - "epoch": 1.19, - "learning_rate": 5.968761920248549e-05, - "loss": 1.2829, + "epoch": 1.610592157835276, + "grad_norm": 10.017860412597656, + "learning_rate": 5.3698721857498174e-05, + "loss": 0.9248, "step": 116900 }, { - "epoch": 1.19, - "learning_rate": 5.96866636771352e-05, - "loss": 1.3307, + "epoch": 1.6119699098950153, + "grad_norm": 10.625978469848633, + "learning_rate": 5.369628427050324e-05, + "loss": 0.9446, "step": 117000 }, { - "epoch": 1.19, - "learning_rate": 5.968570670028422e-05, - "loss": 1.395, + "epoch": 1.6133476619547547, + "grad_norm": 38.3299560546875, + "learning_rate": 5.369384445799993e-05, + "loss": 1.1157, "step": 117100 }, { - "epoch": 1.19, - "learning_rate": 5.968474827197934e-05, - "loss": 1.3591, + "epoch": 1.614725414014494, + "grad_norm": 75.82661437988281, + "learning_rate": 5.369142685158813e-05, + "loss": 1.1497, "step": 117200 }, { - "epoch": 1.2, - "learning_rate": 5.968378839226741e-05, - "loss": 1.3328, + "epoch": 1.6161031660742333, + "grad_norm": 25.32996368408203, + "learning_rate": 5.3688982610940014e-05, + "loss": 1.0587, "step": 117300 }, { - "epoch": 1.2, - "learning_rate": 5.968282706119538e-05, - "loss": 1.3879, + "epoch": 1.6174809181339727, + "grad_norm": 36.150203704833984, + "learning_rate": 5.3686536145403844e-05, + "loss": 1.1681, "step": 117400 }, { - "epoch": 1.2, - "learning_rate": 5.9681864278810236e-05, - "loss": 1.339, + "epoch": 1.618858670193712, + "grad_norm": 26.849124908447266, + "learning_rate": 5.368408745518745e-05, + "loss": 1.1352, "step": 117500 }, { - "epoch": 1.2, - "learning_rate": 5.968090004515907e-05, - "loss": 1.2838, + "epoch": 1.620236422253451, + "grad_norm": 23.102895736694336, + "learning_rate": 5.3681636540498876e-05, + "loss": 1.0638, "step": 117600 }, { - "epoch": 1.2, - "learning_rate": 5.967993436028901e-05, - "loss": 1.452, + "epoch": 1.6216141743131907, + "grad_norm": 12.821172714233398, + "learning_rate": 5.367918340154633e-05, + "loss": 1.0358, "step": 117700 }, { - "epoch": 1.2, - "learning_rate": 5.967896722424728e-05, - "loss": 1.3095, + "epoch": 1.6229919263729298, + "grad_norm": 26.357751846313477, + "learning_rate": 5.367672803853823e-05, + "loss": 1.0402, "step": 117800 }, { - "epoch": 1.2, - "learning_rate": 5.967799863708118e-05, - "loss": 1.3719, + "epoch": 1.6243696784326693, + "grad_norm": 52.07672119140625, + "learning_rate": 5.367427045168315e-05, + "loss": 0.9626, "step": 117900 }, { - "epoch": 1.2, - "learning_rate": 5.967702859883805e-05, - "loss": 1.3329, + "epoch": 1.6257474304924084, + "grad_norm": 5.345720291137695, + "learning_rate": 5.36718106411899e-05, + "loss": 0.9762, "step": 118000 }, { - "epoch": 1.2, - "learning_rate": 5.967605710956532e-05, - "loss": 1.5108, + "epoch": 1.627125182552148, + "grad_norm": 18.012269973754883, + "learning_rate": 5.366934860726744e-05, + "loss": 0.9929, "step": 118100 }, { - "epoch": 1.2, - "learning_rate": 5.96750841693105e-05, - "loss": 1.4127, + "epoch": 1.628502934611887, + "grad_norm": 61.259674072265625, + "learning_rate": 5.366688435012493e-05, + "loss": 0.9906, "step": 118200 }, { - "epoch": 1.21, - "learning_rate": 5.967410977812115e-05, - "loss": 1.4059, + "epoch": 1.6298806866716267, + "grad_norm": 4.80804967880249, + "learning_rate": 5.3664417869971725e-05, + "loss": 1.0891, "step": 118300 }, { - "epoch": 1.21, - "learning_rate": 5.967313393604493e-05, - "loss": 1.3228, + "epoch": 1.6312584387313658, + "grad_norm": 80.92395782470703, + "learning_rate": 5.366194916701737e-05, + "loss": 1.0689, "step": 118400 }, { - "epoch": 1.21, - "learning_rate": 5.967215664312952e-05, - "loss": 1.3755, + "epoch": 1.6326361907911053, + "grad_norm": 11.650620460510254, + "learning_rate": 5.3659478241471594e-05, + "loss": 1.1024, "step": 118500 }, { - "epoch": 1.21, - "learning_rate": 5.967117789942274e-05, - "loss": 1.2995, + "epoch": 1.6340139428508444, + "grad_norm": 22.641496658325195, + "learning_rate": 5.36570050935443e-05, + "loss": 0.9952, "step": 118600 }, { - "epoch": 1.21, - "learning_rate": 5.967019770497242e-05, - "loss": 1.2373, + "epoch": 1.635391694910584, + "grad_norm": 6.600434303283691, + "learning_rate": 5.365452972344561e-05, + "loss": 0.9908, "step": 118700 }, { - "epoch": 1.21, - "learning_rate": 5.9669216059826505e-05, - "loss": 1.456, + "epoch": 1.6367694469703231, + "grad_norm": 20.471500396728516, + "learning_rate": 5.365205213138582e-05, + "loss": 0.9861, "step": 118800 }, { - "epoch": 1.21, - "learning_rate": 5.966824280217146e-05, - "loss": 1.3081, + "epoch": 1.6381471990300627, + "grad_norm": 2.438000440597534, + "learning_rate": 5.3649572317575405e-05, + "loss": 0.9061, "step": 118900 }, { - "epoch": 1.21, - "learning_rate": 5.9667268122783324e-05, - "loss": 1.3161, + "epoch": 1.6395249510898018, + "grad_norm": 13.06838321685791, + "learning_rate": 5.364709028222504e-05, + "loss": 0.9864, "step": 119000 }, { - "epoch": 1.21, - "learning_rate": 5.966628215484941e-05, - "loss": 1.2177, + "epoch": 1.6409027031495413, + "grad_norm": 24.539138793945312, + "learning_rate": 5.364460602554557e-05, + "loss": 1.1174, "step": 119100 }, { - "epoch": 1.21, - "learning_rate": 5.9665294736411335e-05, - "loss": 1.269, + "epoch": 1.6422804552092805, + "grad_norm": 114.63029479980469, + "learning_rate": 5.364211954774809e-05, + "loss": 1.1937, "step": 119200 }, { - "epoch": 1.22, - "learning_rate": 5.966430586751738e-05, - "loss": 1.2286, + "epoch": 1.64365820726902, + "grad_norm": 13.547866821289062, + "learning_rate": 5.36396308490438e-05, + "loss": 1.0725, "step": 119300 }, { - "epoch": 1.22, - "learning_rate": 5.9663315548215895e-05, - "loss": 1.2849, + "epoch": 1.6450359593287591, + "grad_norm": 24.412593841552734, + "learning_rate": 5.363713992964415e-05, + "loss": 1.0889, "step": 119400 }, { - "epoch": 1.22, - "learning_rate": 5.9662323778555296e-05, - "loss": 1.2344, + "epoch": 1.6464137113884987, + "grad_norm": 31.08279800415039, + "learning_rate": 5.3634646789760736e-05, + "loss": 1.1982, "step": 119500 }, { - "epoch": 1.22, - "learning_rate": 5.966133055858409e-05, - "loss": 1.4448, + "epoch": 1.6477914634482378, + "grad_norm": 27.614606857299805, + "learning_rate": 5.363217639419657e-05, + "loss": 1.1285, "step": 119600 }, { - "epoch": 1.22, - "learning_rate": 5.9660335888350826e-05, - "loss": 1.5244, + "epoch": 1.6491692155079773, + "grad_norm": 27.000633239746094, + "learning_rate": 5.3629678836180814e-05, + "loss": 1.0992, "step": 119700 }, { - "epoch": 1.22, - "learning_rate": 5.965933976790414e-05, - "loss": 1.4088, + "epoch": 1.6505469675677165, + "grad_norm": 25.138166427612305, + "learning_rate": 5.362717905831515e-05, + "loss": 1.2039, "step": 119800 }, { - "epoch": 1.22, - "learning_rate": 5.9658342197292736e-05, - "loss": 1.4552, + "epoch": 1.6519247196274558, + "grad_norm": 12.283367156982422, + "learning_rate": 5.362467706081196e-05, + "loss": 1.1113, "step": 119900 }, { - "epoch": 1.22, - "learning_rate": 5.965734317656539e-05, - "loss": 1.5194, + "epoch": 1.6533024716871951, + "grad_norm": 51.160743713378906, + "learning_rate": 5.362217284388381e-05, + "loss": 1.105, "step": 120000 }, { - "epoch": 1.22, - "learning_rate": 5.965634270577095e-05, - "loss": 1.3912, + "epoch": 1.6546802237469345, + "grad_norm": 13.717453956604004, + "learning_rate": 5.3619666407743424e-05, + "loss": 1.0964, "step": 120100 }, { - "epoch": 1.22, - "learning_rate": 5.965534078495833e-05, - "loss": 1.4178, + "epoch": 1.6560579758066738, + "grad_norm": 141.6107940673828, + "learning_rate": 5.361715775260376e-05, + "loss": 0.9988, "step": 120200 }, { - "epoch": 1.23, - "learning_rate": 5.9654337414176516e-05, - "loss": 1.4027, + "epoch": 1.6574357278664131, + "grad_norm": 11.551924705505371, + "learning_rate": 5.361464687867792e-05, + "loss": 1.1018, "step": 120300 }, { - "epoch": 1.23, - "learning_rate": 5.9653332593474584e-05, - "loss": 1.3197, + "epoch": 1.6588134799261525, + "grad_norm": 21.031797409057617, + "learning_rate": 5.3612133786179225e-05, + "loss": 1.0463, "step": 120400 }, { - "epoch": 1.23, - "learning_rate": 5.9652326322901634e-05, - "loss": 1.2208, + "epoch": 1.6601912319858918, + "grad_norm": 8.770426750183105, + "learning_rate": 5.360961847532118e-05, + "loss": 1.0918, "step": 120500 }, { - "epoch": 1.23, - "learning_rate": 5.965131860250689e-05, - "loss": 1.399, + "epoch": 1.6615689840456311, + "grad_norm": 22.356077194213867, + "learning_rate": 5.360710094631748e-05, + "loss": 1.0765, "step": 120600 }, { - "epoch": 1.23, - "learning_rate": 5.965030943233962e-05, - "loss": 1.3694, + "epoch": 1.6629467361053705, + "grad_norm": 18.905723571777344, + "learning_rate": 5.360458119938198e-05, + "loss": 1.1508, "step": 120700 }, { - "epoch": 1.23, - "learning_rate": 5.9649298812449154e-05, - "loss": 1.3825, + "epoch": 1.6643244881651098, + "grad_norm": 361.44158935546875, + "learning_rate": 5.360205923472876e-05, + "loss": 1.005, "step": 120800 }, { - "epoch": 1.23, - "learning_rate": 5.964828674288492e-05, - "loss": 1.4419, + "epoch": 1.6657022402248491, + "grad_norm": 5.470854759216309, + "learning_rate": 5.359953505257207e-05, + "loss": 1.0896, "step": 120900 }, { - "epoch": 1.23, - "learning_rate": 5.964727322369639e-05, - "loss": 1.4288, + "epoch": 1.6670799922845885, + "grad_norm": 11.167045593261719, + "learning_rate": 5.3597008653126354e-05, + "loss": 0.9672, "step": 121000 }, { - "epoch": 1.23, - "learning_rate": 5.964625825493312e-05, - "loss": 1.4255, + "epoch": 1.6684577443443278, + "grad_norm": 4.910240650177002, + "learning_rate": 5.3594480036606245e-05, + "loss": 1.0398, "step": 121100 }, { - "epoch": 1.23, - "learning_rate": 5.9645241836644754e-05, - "loss": 1.3312, + "epoch": 1.6698354964040671, + "grad_norm": 6.964974403381348, + "learning_rate": 5.359194920322655e-05, + "loss": 1.0586, "step": 121200 }, { - "epoch": 1.24, - "learning_rate": 5.964422396888098e-05, - "loss": 1.3803, + "epoch": 1.6712132484638065, + "grad_norm": 10.561227798461914, + "learning_rate": 5.358941615320229e-05, + "loss": 1.0649, "step": 121300 }, { - "epoch": 1.24, - "learning_rate": 5.964320465169155e-05, - "loss": 1.5161, + "epoch": 1.6725910005235458, + "grad_norm": 15.112284660339355, + "learning_rate": 5.358688088674866e-05, + "loss": 0.9604, "step": 121400 }, { - "epoch": 1.24, - "learning_rate": 5.9642194099966205e-05, - "loss": 1.3961, + "epoch": 1.6739687525832851, + "grad_norm": 11.338651657104492, + "learning_rate": 5.358434340408103e-05, + "loss": 1.0518, "step": 121500 }, { - "epoch": 1.24, - "learning_rate": 5.9641171898568093e-05, - "loss": 1.3588, + "epoch": 1.6753465046430245, + "grad_norm": 18.34324073791504, + "learning_rate": 5.3581803705414985e-05, + "loss": 0.9173, "step": 121600 }, { - "epoch": 1.24, - "learning_rate": 5.964014824789356e-05, - "loss": 1.203, + "epoch": 1.6767242567027638, + "grad_norm": 18.65089988708496, + "learning_rate": 5.357926179096629e-05, + "loss": 1.0138, "step": 121700 }, { - "epoch": 1.24, - "learning_rate": 5.9639123147992654e-05, - "loss": 1.3503, + "epoch": 1.6781020087625032, + "grad_norm": 10.005807876586914, + "learning_rate": 5.357671766095088e-05, + "loss": 1.0448, "step": 121800 }, { - "epoch": 1.24, - "learning_rate": 5.9638096598915506e-05, - "loss": 1.411, + "epoch": 1.6794797608222425, + "grad_norm": 17.847454071044922, + "learning_rate": 5.3574171315584886e-05, + "loss": 1.1733, "step": 121900 }, { - "epoch": 1.24, - "learning_rate": 5.9637068600712315e-05, - "loss": 1.3845, + "epoch": 1.6808575128819818, + "grad_norm": 11.443880081176758, + "learning_rate": 5.357164825165385e-05, + "loss": 1.0959, "step": 122000 }, { - "epoch": 1.24, - "learning_rate": 5.9636039153433326e-05, - "loss": 1.3192, + "epoch": 1.682235264941721, + "grad_norm": 6.307702541351318, + "learning_rate": 5.3569097498383995e-05, + "loss": 1.219, "step": 122100 }, { - "epoch": 1.24, - "learning_rate": 5.963500825712888e-05, - "loss": 1.2467, + "epoch": 1.6836130170014605, + "grad_norm": 27.30145835876465, + "learning_rate": 5.356654453041093e-05, + "loss": 1.1133, "step": 122200 }, { - "epoch": 1.25, - "learning_rate": 5.963397591184938e-05, - "loss": 1.3739, + "epoch": 1.6849907690611996, + "grad_norm": 69.27091979980469, + "learning_rate": 5.3564014910737146e-05, + "loss": 1.1829, "step": 122300 }, { - "epoch": 1.25, - "learning_rate": 5.963294211764531e-05, - "loss": 1.3546, + "epoch": 1.6863685211209392, + "grad_norm": 87.97235870361328, + "learning_rate": 5.3561457536150146e-05, + "loss": 1.2058, "step": 122400 }, { - "epoch": 1.25, - "learning_rate": 5.963190687456721e-05, - "loss": 1.3216, + "epoch": 1.6877462731806783, + "grad_norm": 11.573745727539062, + "learning_rate": 5.3558897947508997e-05, + "loss": 1.0475, "step": 122500 }, { - "epoch": 1.25, - "learning_rate": 5.96308701826657e-05, - "loss": 1.3243, + "epoch": 1.6891240252404178, + "grad_norm": 137.28189086914062, + "learning_rate": 5.3556336145031156e-05, + "loss": 1.1262, "step": 122600 }, { - "epoch": 1.25, - "learning_rate": 5.962983204199146e-05, - "loss": 1.2892, + "epoch": 1.690501777300157, + "grad_norm": 11.57826042175293, + "learning_rate": 5.3553772128934256e-05, + "loss": 1.2299, "step": 122700 }, { - "epoch": 1.25, - "learning_rate": 5.9628792452595256e-05, - "loss": 1.4663, + "epoch": 1.6918795293598965, + "grad_norm": 18.035369873046875, + "learning_rate": 5.355120589943612e-05, + "loss": 1.1575, "step": 122800 }, { - "epoch": 1.25, - "learning_rate": 5.9627751414527914e-05, - "loss": 1.3289, + "epoch": 1.6932572814196356, + "grad_norm": 27.564599990844727, + "learning_rate": 5.354863745675477e-05, + "loss": 1.1184, "step": 122900 }, { - "epoch": 1.25, - "learning_rate": 5.962670892784034e-05, - "loss": 1.4329, + "epoch": 1.6946350334793752, + "grad_norm": 62.29301452636719, + "learning_rate": 5.35460668011084e-05, + "loss": 0.9375, "step": 123000 }, { - "epoch": 1.25, - "learning_rate": 5.96256649925835e-05, - "loss": 1.4038, + "epoch": 1.6960127855391143, + "grad_norm": 14.21219253540039, + "learning_rate": 5.3543493932715406e-05, + "loss": 1.122, "step": 123100 }, { - "epoch": 1.26, - "learning_rate": 5.962461960880845e-05, - "loss": 1.3818, + "epoch": 1.6973905375988538, + "grad_norm": 31.510623931884766, + "learning_rate": 5.354091885179437e-05, + "loss": 1.1473, "step": 123200 }, { - "epoch": 1.26, - "learning_rate": 5.962357277656628e-05, - "loss": 1.3551, + "epoch": 1.698768289658593, + "grad_norm": 22.348297119140625, + "learning_rate": 5.3538341558564047e-05, + "loss": 1.0537, "step": 123300 }, { - "epoch": 1.26, - "learning_rate": 5.962252449590819e-05, - "loss": 1.3107, + "epoch": 1.7001460417183325, + "grad_norm": 41.6342658996582, + "learning_rate": 5.35357620532434e-05, + "loss": 1.1125, "step": 123400 }, { - "epoch": 1.26, - "learning_rate": 5.962147476688543e-05, - "loss": 1.2925, + "epoch": 1.7015237937780716, + "grad_norm": 89.36843872070312, + "learning_rate": 5.353318033605157e-05, + "loss": 1.2083, "step": 123500 }, { - "epoch": 1.26, - "learning_rate": 5.962042358954932e-05, - "loss": 1.3535, + "epoch": 1.7029015458378112, + "grad_norm": 17.638071060180664, + "learning_rate": 5.3530596407207885e-05, + "loss": 1.0446, "step": 123600 }, { - "epoch": 1.26, - "learning_rate": 5.9619370963951275e-05, - "loss": 1.3602, + "epoch": 1.7042792978975503, + "grad_norm": 15.610889434814453, + "learning_rate": 5.3528010266931856e-05, + "loss": 1.1434, "step": 123700 }, { - "epoch": 1.26, - "learning_rate": 5.961831689014274e-05, - "loss": 1.3391, + "epoch": 1.7056570499572898, + "grad_norm": 7.199135780334473, + "learning_rate": 5.35254219154432e-05, + "loss": 1.0682, "step": 123800 }, { - "epoch": 1.26, - "learning_rate": 5.961726136817526e-05, - "loss": 1.2356, + "epoch": 1.707034802017029, + "grad_norm": 5.713176727294922, + "learning_rate": 5.3522831352961814e-05, + "loss": 1.0126, "step": 123900 }, { - "epoch": 1.26, - "learning_rate": 5.961620439810045e-05, - "loss": 1.4079, + "epoch": 1.7084125540767685, + "grad_norm": 17.724994659423828, + "learning_rate": 5.3520238579707764e-05, + "loss": 1.0586, "step": 124000 }, { - "epoch": 1.26, - "learning_rate": 5.961514597996998e-05, - "loss": 1.3692, + "epoch": 1.7097903061365076, + "grad_norm": 28.14649772644043, + "learning_rate": 5.3517643595901334e-05, + "loss": 0.9933, "step": 124100 }, { - "epoch": 1.27, - "learning_rate": 5.961408611383561e-05, - "loss": 1.4242, + "epoch": 1.711168058196247, + "grad_norm": 6.085136413574219, + "learning_rate": 5.351504640176297e-05, + "loss": 1.0703, "step": 124200 }, { - "epoch": 1.27, - "learning_rate": 5.9613024799749155e-05, - "loss": 1.2418, + "epoch": 1.7125458102559863, + "grad_norm": 26.125911712646484, + "learning_rate": 5.351244699751333e-05, + "loss": 1.1096, "step": 124300 }, { - "epoch": 1.27, - "learning_rate": 5.9611962037762505e-05, - "loss": 1.2937, + "epoch": 1.7139235623157256, + "grad_norm": 16.116865158081055, + "learning_rate": 5.350984538337323e-05, + "loss": 1.1115, "step": 124400 }, { - "epoch": 1.27, - "learning_rate": 5.961089782792763e-05, - "loss": 1.4035, + "epoch": 1.715301314375465, + "grad_norm": 6.875175952911377, + "learning_rate": 5.350724155956371e-05, + "loss": 1.0288, "step": 124500 }, { - "epoch": 1.27, - "learning_rate": 5.9609832170296564e-05, - "loss": 1.458, + "epoch": 1.7166790664352043, + "grad_norm": 16.344148635864258, + "learning_rate": 5.350463552630595e-05, + "loss": 1.0952, "step": 124600 }, { - "epoch": 1.27, - "learning_rate": 5.9608765064921405e-05, - "loss": 1.2646, + "epoch": 1.7180568184949436, + "grad_norm": 33.94709014892578, + "learning_rate": 5.350202728382138e-05, + "loss": 1.1, "step": 124700 }, { - "epoch": 1.27, - "learning_rate": 5.960769651185432e-05, - "loss": 1.3861, + "epoch": 1.719434570554683, + "grad_norm": 9.75704288482666, + "learning_rate": 5.349941683233156e-05, + "loss": 1.0786, "step": 124800 }, { - "epoch": 1.27, - "learning_rate": 5.960662651114758e-05, - "loss": 1.3276, + "epoch": 1.7208123226144223, + "grad_norm": 2.9066145420074463, + "learning_rate": 5.349680417205827e-05, + "loss": 1.0042, "step": 124900 }, { - "epoch": 1.27, - "learning_rate": 5.960555506285348e-05, - "loss": 1.2731, + "epoch": 1.7221900746741616, + "grad_norm": 5.697661399841309, + "learning_rate": 5.349418930322347e-05, + "loss": 1.0052, "step": 125000 }, { - "epoch": 1.27, - "learning_rate": 5.960448216702441e-05, - "loss": 1.2559, + "epoch": 1.723567826733901, + "grad_norm": 4.006572723388672, + "learning_rate": 5.3491572226049287e-05, + "loss": 0.9651, "step": 125100 }, { - "epoch": 1.28, - "learning_rate": 5.960340782371284e-05, - "loss": 1.2045, + "epoch": 1.7249455787936403, + "grad_norm": 16.873310089111328, + "learning_rate": 5.348895294075809e-05, + "loss": 0.9622, "step": 125200 }, { - "epoch": 1.28, - "learning_rate": 5.960233203297129e-05, - "loss": 1.3988, + "epoch": 1.7263233308533796, + "grad_norm": 16.663330078125, + "learning_rate": 5.348633144757237e-05, + "loss": 0.9845, "step": 125300 }, { - "epoch": 1.28, - "learning_rate": 5.960125479485236e-05, - "loss": 1.2827, + "epoch": 1.727701082913119, + "grad_norm": 13.301389694213867, + "learning_rate": 5.3483707746714854e-05, + "loss": 1.0242, "step": 125400 }, { - "epoch": 1.28, - "learning_rate": 5.960017610940872e-05, - "loss": 1.379, + "epoch": 1.7290788349728583, + "grad_norm": 7.422813892364502, + "learning_rate": 5.348108183840844e-05, + "loss": 0.9887, "step": 125500 }, { - "epoch": 1.28, - "learning_rate": 5.9599095976693115e-05, - "loss": 1.3538, + "epoch": 1.7304565870325976, + "grad_norm": 10.25568962097168, + "learning_rate": 5.347845372287619e-05, + "loss": 1.1022, "step": 125600 }, { - "epoch": 1.28, - "learning_rate": 5.959801439675835e-05, - "loss": 1.2616, + "epoch": 1.731834339092337, + "grad_norm": 15.564820289611816, + "learning_rate": 5.3475823400341405e-05, + "loss": 1.0533, "step": 125700 }, { - "epoch": 1.28, - "learning_rate": 5.959693136965732e-05, - "loss": 1.2887, + "epoch": 1.7332120911520763, + "grad_norm": 14.092329025268555, + "learning_rate": 5.347319087102752e-05, + "loss": 1.1312, "step": 125800 }, { - "epoch": 1.28, - "learning_rate": 5.959584689544296e-05, - "loss": 1.3602, + "epoch": 1.7345898432118156, + "grad_norm": 38.11343002319336, + "learning_rate": 5.3470556135158204e-05, + "loss": 1.0647, "step": 125900 }, { - "epoch": 1.28, - "learning_rate": 5.959476097416832e-05, - "loss": 1.3714, + "epoch": 1.735967595271555, + "grad_norm": 45.88645553588867, + "learning_rate": 5.3467919192957265e-05, + "loss": 1.0641, "step": 126000 }, { - "epoch": 1.28, - "learning_rate": 5.9593673605886466e-05, - "loss": 1.39, + "epoch": 1.7373453473312943, + "grad_norm": 7.421977996826172, + "learning_rate": 5.3465280044648756e-05, + "loss": 1.0337, "step": 126100 }, { - "epoch": 1.29, - "learning_rate": 5.959258479065059e-05, - "loss": 1.2831, + "epoch": 1.7387230993910336, + "grad_norm": 6.982689380645752, + "learning_rate": 5.3462638690456856e-05, + "loss": 1.0006, "step": 126200 }, { - "epoch": 1.29, - "learning_rate": 5.959149452851391e-05, - "loss": 1.2548, + "epoch": 1.740100851450773, + "grad_norm": 20.65532875061035, + "learning_rate": 5.345999513060598e-05, + "loss": 1.1318, "step": 126300 }, { - "epoch": 1.29, - "learning_rate": 5.959040281952975e-05, - "loss": 1.3099, + "epoch": 1.741478603510512, + "grad_norm": 19.071537017822266, + "learning_rate": 5.345734936532071e-05, + "loss": 1.1514, "step": 126400 }, { - "epoch": 1.29, - "learning_rate": 5.958930966375147e-05, - "loss": 1.2853, + "epoch": 1.7428563555702516, + "grad_norm": 23.1497802734375, + "learning_rate": 5.345470139482581e-05, + "loss": 1.0037, "step": 126500 }, { - "epoch": 1.29, - "learning_rate": 5.958821506123253e-05, - "loss": 1.1851, + "epoch": 1.7442341076299908, + "grad_norm": 13.743980407714844, + "learning_rate": 5.345205121934625e-05, + "loss": 0.9893, "step": 126600 }, { - "epoch": 1.29, - "learning_rate": 5.958711901202645e-05, - "loss": 1.215, + "epoch": 1.7456118596897303, + "grad_norm": 10.10053825378418, + "learning_rate": 5.344939883910716e-05, + "loss": 0.9411, "step": 126700 }, { - "epoch": 1.29, - "learning_rate": 5.958602151618681e-05, - "loss": 1.2589, + "epoch": 1.7469896117494694, + "grad_norm": 27.273420333862305, + "learning_rate": 5.344674425433389e-05, + "loss": 1.0426, "step": 126800 }, { - "epoch": 1.29, - "learning_rate": 5.9584922573767284e-05, - "loss": 1.1761, + "epoch": 1.748367363809209, + "grad_norm": 8.651666641235352, + "learning_rate": 5.344411404405336e-05, + "loss": 1.0865, "step": 126900 }, { - "epoch": 1.29, - "learning_rate": 5.958382218482158e-05, - "loss": 1.2956, + "epoch": 1.749745115868948, + "grad_norm": 12.816594123840332, + "learning_rate": 5.344145507292818e-05, + "loss": 1.0971, "step": 127000 }, { - "epoch": 1.29, - "learning_rate": 5.958272034940354e-05, - "loss": 1.2954, + "epoch": 1.7511228679286877, + "grad_norm": 7.724693298339844, + "learning_rate": 5.343879389794367e-05, + "loss": 0.9634, "step": 127100 }, { - "epoch": 1.3, - "learning_rate": 5.958161706756701e-05, - "loss": 1.1245, + "epoch": 1.7525006199884268, + "grad_norm": 16.434913635253906, + "learning_rate": 5.3436130519325905e-05, + "loss": 1.1176, "step": 127200 }, { - "epoch": 1.3, - "learning_rate": 5.958051233936594e-05, - "loss": 1.1946, + "epoch": 1.7538783720481663, + "grad_norm": 92.48381042480469, + "learning_rate": 5.343349160402755e-05, + "loss": 1.1346, "step": 127300 }, { - "epoch": 1.3, - "learning_rate": 5.957940616485434e-05, - "loss": 1.2536, + "epoch": 1.7552561241079054, + "grad_norm": 37.74311065673828, + "learning_rate": 5.343082384085298e-05, + "loss": 1.0324, "step": 127400 }, { - "epoch": 1.3, - "learning_rate": 5.95782985440863e-05, - "loss": 1.2432, + "epoch": 1.756633876167645, + "grad_norm": 20.710325241088867, + "learning_rate": 5.342815387472226e-05, + "loss": 1.0489, "step": 127500 }, { - "epoch": 1.3, - "learning_rate": 5.957718947711597e-05, - "loss": 1.3207, + "epoch": 1.758011628227384, + "grad_norm": 101.49832916259766, + "learning_rate": 5.342548170586223e-05, + "loss": 1.157, "step": 127600 }, { - "epoch": 1.3, - "learning_rate": 5.957607896399759e-05, - "loss": 1.352, + "epoch": 1.7593893802871237, + "grad_norm": 13.25818920135498, + "learning_rate": 5.342280733449989e-05, + "loss": 1.2041, "step": 127700 }, { - "epoch": 1.3, - "learning_rate": 5.957496700478545e-05, - "loss": 1.1382, + "epoch": 1.7607671323468628, + "grad_norm": 9.916483879089355, + "learning_rate": 5.3420130760862445e-05, + "loss": 1.0011, "step": 127800 }, { - "epoch": 1.3, - "learning_rate": 5.957385359953391e-05, - "loss": 1.2078, + "epoch": 1.7621448844066023, + "grad_norm": 31.11695098876953, + "learning_rate": 5.341745198517729e-05, + "loss": 0.9658, "step": 127900 }, { - "epoch": 1.3, - "learning_rate": 5.9572749903967236e-05, - "loss": 1.359, + "epoch": 1.7635226364663414, + "grad_norm": 67.74144744873047, + "learning_rate": 5.3414771007671994e-05, + "loss": 0.9779, "step": 128000 }, { - "epoch": 1.31, - "learning_rate": 5.9571633621259345e-05, - "loss": 1.4138, + "epoch": 1.764900388526081, + "grad_norm": 11.926155090332031, + "learning_rate": 5.341208782857433e-05, + "loss": 1.1963, "step": 128100 }, { - "epoch": 1.31, - "learning_rate": 5.9570527077117785e-05, - "loss": 1.3653, + "epoch": 1.7662781405858201, + "grad_norm": 54.3149528503418, + "learning_rate": 5.3409402448112226e-05, + "loss": 1.1097, "step": 128200 }, { - "epoch": 1.31, - "learning_rate": 5.9569407917169666e-05, - "loss": 1.4049, + "epoch": 1.7676558926455597, + "grad_norm": 35.44221878051758, + "learning_rate": 5.340671486651384e-05, + "loss": 1.1201, "step": 128300 }, { - "epoch": 1.31, - "learning_rate": 5.9568287311453955e-05, - "loss": 1.3456, + "epoch": 1.7690336447052988, + "grad_norm": 16.38726234436035, + "learning_rate": 5.340402508400749e-05, + "loss": 1.0558, "step": 128400 }, { - "epoch": 1.31, - "learning_rate": 5.956716526002545e-05, - "loss": 1.194, + "epoch": 1.7704113967650381, + "grad_norm": 12.363265037536621, + "learning_rate": 5.340133310082168e-05, + "loss": 1.2323, "step": 128500 }, { - "epoch": 1.31, - "learning_rate": 5.9566041762939026e-05, - "loss": 1.2401, + "epoch": 1.7717891488247774, + "grad_norm": 15.031146049499512, + "learning_rate": 5.339863891718511e-05, + "loss": 1.1629, "step": 128600 }, { - "epoch": 1.31, - "learning_rate": 5.956491682024958e-05, - "loss": 1.1624, + "epoch": 1.7731669008845168, + "grad_norm": 24.993499755859375, + "learning_rate": 5.3395942533326675e-05, + "loss": 1.1863, "step": 128700 }, { - "epoch": 1.31, - "learning_rate": 5.956379043201215e-05, - "loss": 1.2408, + "epoch": 1.7745446529442561, + "grad_norm": 12.42155647277832, + "learning_rate": 5.3393243949475436e-05, + "loss": 1.0377, "step": 128800 }, { - "epoch": 1.31, - "learning_rate": 5.956266259828179e-05, - "loss": 1.2268, + "epoch": 1.7759224050039955, + "grad_norm": 28.53933334350586, + "learning_rate": 5.339054316586065e-05, + "loss": 1.0024, "step": 128900 }, { - "epoch": 1.31, - "learning_rate": 5.9561533319113645e-05, - "loss": 1.2228, + "epoch": 1.7773001570637348, + "grad_norm": 21.87401008605957, + "learning_rate": 5.338784018271177e-05, + "loss": 1.0078, "step": 129000 }, { - "epoch": 1.32, - "learning_rate": 5.9560402594562945e-05, - "loss": 1.1779, + "epoch": 1.7786779091234741, + "grad_norm": 10.402198791503906, + "learning_rate": 5.338513500025843e-05, + "loss": 1.0056, "step": 129100 }, { - "epoch": 1.32, - "learning_rate": 5.9559270424684954e-05, - "loss": 1.1835, + "epoch": 1.7800556611832135, + "grad_norm": 10.986882209777832, + "learning_rate": 5.338242761873044e-05, + "loss": 1.1049, "step": 129200 }, { - "epoch": 1.32, - "learning_rate": 5.955813680953505e-05, - "loss": 1.2337, + "epoch": 1.7814334132429528, + "grad_norm": 29.090335845947266, + "learning_rate": 5.3379718038357815e-05, + "loss": 1.0349, "step": 129300 }, { - "epoch": 1.32, - "learning_rate": 5.9557001749168646e-05, - "loss": 1.2018, + "epoch": 1.7828111653026921, + "grad_norm": 11.125770568847656, + "learning_rate": 5.337700625937074e-05, + "loss": 0.9962, "step": 129400 }, { - "epoch": 1.32, - "learning_rate": 5.955586524364124e-05, - "loss": 1.2186, + "epoch": 1.7841889173624315, + "grad_norm": 8.75442886352539, + "learning_rate": 5.3374292281999596e-05, + "loss": 1.1801, "step": 129500 }, { - "epoch": 1.32, - "learning_rate": 5.955472729300841e-05, - "loss": 1.098, + "epoch": 1.7855666694221708, + "grad_norm": 51.815059661865234, + "learning_rate": 5.337157610647495e-05, + "loss": 1.1249, "step": 129600 }, { - "epoch": 1.32, - "learning_rate": 5.955358789732578e-05, - "loss": 1.1163, + "epoch": 1.7869444214819101, + "grad_norm": 41.39480972290039, + "learning_rate": 5.3368857733027556e-05, + "loss": 1.0997, "step": 129700 }, { - "epoch": 1.32, - "learning_rate": 5.955244705664908e-05, - "loss": 1.2921, + "epoch": 1.7883221735416495, + "grad_norm": 14.78404426574707, + "learning_rate": 5.336613716188836e-05, + "loss": 1.0338, "step": 129800 }, { - "epoch": 1.32, - "learning_rate": 5.9551304771034066e-05, - "loss": 1.2077, + "epoch": 1.7896999256013888, + "grad_norm": 15.128174781799316, + "learning_rate": 5.3363414393288466e-05, + "loss": 1.0623, "step": 129900 }, { - "epoch": 1.32, - "learning_rate": 5.955016104053661e-05, - "loss": 1.328, + "epoch": 1.7910776776611281, + "grad_norm": 16.106595993041992, + "learning_rate": 5.336068942745922e-05, + "loss": 1.087, "step": 130000 }, { - "epoch": 1.33, - "learning_rate": 5.954901586521263e-05, - "loss": 1.2289, + "epoch": 1.7924554297208675, + "grad_norm": 22.07122039794922, + "learning_rate": 5.3357962264632096e-05, + "loss": 1.1794, "step": 130100 }, { - "epoch": 1.33, - "learning_rate": 5.95478692451181e-05, - "loss": 1.1185, + "epoch": 1.7938331817806068, + "grad_norm": 15.329657554626465, + "learning_rate": 5.335523290503879e-05, + "loss": 1.039, "step": 130200 }, { - "epoch": 1.33, - "learning_rate": 5.954672118030911e-05, - "loss": 1.1346, + "epoch": 1.7952109338403461, + "grad_norm": 12.507319450378418, + "learning_rate": 5.335250134891117e-05, + "loss": 1.0841, "step": 130300 }, { - "epoch": 1.33, - "learning_rate": 5.954557167084177e-05, - "loss": 1.2112, + "epoch": 1.7965886859000855, + "grad_norm": 15.831151962280273, + "learning_rate": 5.334976759648131e-05, + "loss": 1.1097, "step": 130400 }, { - "epoch": 1.33, - "learning_rate": 5.95444207167723e-05, - "loss": 1.1045, + "epoch": 1.7979664379598248, + "grad_norm": 10.346972465515137, + "learning_rate": 5.3347031647981446e-05, + "loss": 1.0386, "step": 130500 }, { - "epoch": 1.33, - "learning_rate": 5.954326831815697e-05, - "loss": 1.2686, + "epoch": 1.7993441900195641, + "grad_norm": 30.961881637573242, + "learning_rate": 5.3344293503644e-05, + "loss": 1.1335, "step": 130600 }, { - "epoch": 1.33, - "learning_rate": 5.954211447505212e-05, - "loss": 1.1579, + "epoch": 1.8007219420793033, + "grad_norm": 19.25282096862793, + "learning_rate": 5.33415531637016e-05, + "loss": 1.191, "step": 130700 }, { - "epoch": 1.33, - "learning_rate": 5.954095918751416e-05, - "loss": 1.2822, + "epoch": 1.8020996941390428, + "grad_norm": 15.14389705657959, + "learning_rate": 5.333881062838707e-05, + "loss": 1.1822, "step": 130800 }, { - "epoch": 1.33, - "learning_rate": 5.953981403006823e-05, - "loss": 1.1962, + "epoch": 1.803477446198782, + "grad_norm": 68.57266235351562, + "learning_rate": 5.333606589793338e-05, + "loss": 1.0386, "step": 130900 }, { - "epoch": 1.33, - "learning_rate": 5.953865586827653e-05, - "loss": 1.3005, + "epoch": 1.8048551982585215, + "grad_norm": 9.26943302154541, + "learning_rate": 5.3333318972573715e-05, + "loss": 1.0345, "step": 131000 }, { - "epoch": 1.34, - "learning_rate": 5.9537496262220834e-05, - "loss": 1.3678, + "epoch": 1.8062329503182606, + "grad_norm": 20.253746032714844, + "learning_rate": 5.3330569852541435e-05, + "loss": 1.0864, "step": 131100 }, { - "epoch": 1.34, - "learning_rate": 5.953633521195784e-05, - "loss": 1.2709, + "epoch": 1.8076107023780001, + "grad_norm": 8.966611862182617, + "learning_rate": 5.332781853807011e-05, + "loss": 1.0157, "step": 131200 }, { - "epoch": 1.34, - "learning_rate": 5.953517271754433e-05, - "loss": 1.1558, + "epoch": 1.8089884544377393, + "grad_norm": 8.893041610717773, + "learning_rate": 5.332506502939346e-05, + "loss": 0.9388, "step": 131300 }, { - "epoch": 1.34, - "learning_rate": 5.9534008779037125e-05, - "loss": 1.2944, + "epoch": 1.8103662064974788, + "grad_norm": 7.839788913726807, + "learning_rate": 5.332230932674542e-05, + "loss": 1.0934, "step": 131400 }, { - "epoch": 1.34, - "learning_rate": 5.953284339649314e-05, - "loss": 1.4518, + "epoch": 1.811743958557218, + "grad_norm": 13.025264739990234, + "learning_rate": 5.331955143036009e-05, + "loss": 1.0612, "step": 131500 }, { - "epoch": 1.34, - "learning_rate": 5.953167656996936e-05, - "loss": 1.2493, + "epoch": 1.8131217106169575, + "grad_norm": 15.265260696411133, + "learning_rate": 5.331679134047177e-05, + "loss": 1.0583, "step": 131600 }, { - "epoch": 1.34, - "learning_rate": 5.953050829952282e-05, - "loss": 1.206, + "epoch": 1.8144994626766966, + "grad_norm": 8.136308670043945, + "learning_rate": 5.3314029057314956e-05, + "loss": 1.0168, "step": 131700 }, { - "epoch": 1.34, - "learning_rate": 5.952933858521066e-05, - "loss": 1.2432, + "epoch": 1.8158772147364362, + "grad_norm": 21.833972930908203, + "learning_rate": 5.33112645811243e-05, + "loss": 1.0654, "step": 131800 }, { - "epoch": 1.34, - "learning_rate": 5.9528167427090065e-05, - "loss": 1.1193, + "epoch": 1.8172549667961753, + "grad_norm": 28.135250091552734, + "learning_rate": 5.330849791213468e-05, + "loss": 1.0604, "step": 131900 }, { - "epoch": 1.34, - "learning_rate": 5.9526994825218296e-05, - "loss": 1.2435, + "epoch": 1.8186327188559148, + "grad_norm": 29.88810157775879, + "learning_rate": 5.330572905058112e-05, + "loss": 0.9664, "step": 132000 }, { - "epoch": 1.35, - "learning_rate": 5.952582077965269e-05, - "loss": 1.3112, + "epoch": 1.820010470915654, + "grad_norm": 21.69635581970215, + "learning_rate": 5.330295799669886e-05, + "loss": 1.0365, "step": 132100 }, { - "epoch": 1.35, - "learning_rate": 5.9524645290450646e-05, - "loss": 1.3123, + "epoch": 1.8213882229753935, + "grad_norm": 9.101287841796875, + "learning_rate": 5.3300184750723314e-05, + "loss": 0.8906, "step": 132200 }, { - "epoch": 1.35, - "learning_rate": 5.952346835766964e-05, - "loss": 1.2641, + "epoch": 1.8227659750351326, + "grad_norm": 60.037200927734375, + "learning_rate": 5.3297409312890075e-05, + "loss": 1.102, "step": 132300 }, { - "epoch": 1.35, - "learning_rate": 5.9522289981367224e-05, - "loss": 1.3837, + "epoch": 1.8241437270948722, + "grad_norm": 5.7496442794799805, + "learning_rate": 5.329463168343494e-05, + "loss": 0.9268, "step": 132400 }, { - "epoch": 1.35, - "learning_rate": 5.952111016160101e-05, - "loss": 1.3454, + "epoch": 1.8255214791546113, + "grad_norm": 18.774789810180664, + "learning_rate": 5.3291851862593874e-05, + "loss": 1.0282, "step": 132500 }, { - "epoch": 1.35, - "learning_rate": 5.951992889842868e-05, - "loss": 1.1997, + "epoch": 1.8268992312143508, + "grad_norm": 2.4276559352874756, + "learning_rate": 5.328906985060305e-05, + "loss": 0.941, "step": 132600 }, { - "epoch": 1.35, - "learning_rate": 5.951875802611757e-05, - "loss": 1.2276, + "epoch": 1.82827698327409, + "grad_norm": 11.795206069946289, + "learning_rate": 5.32862856476988e-05, + "loss": 1.0247, "step": 132700 }, { - "epoch": 1.35, - "learning_rate": 5.9517573890738974e-05, - "loss": 1.2187, + "epoch": 1.8296547353338293, + "grad_norm": 12.539592742919922, + "learning_rate": 5.328349925411766e-05, + "loss": 0.9362, "step": 132800 }, { - "epoch": 1.35, - "learning_rate": 5.951638831212716e-05, - "loss": 1.2339, + "epoch": 1.8310324873935686, + "grad_norm": 11.351883888244629, + "learning_rate": 5.328071067009636e-05, + "loss": 1.0445, "step": 132900 }, { - "epoch": 1.36, - "learning_rate": 5.95152012903401e-05, - "loss": 1.3291, + "epoch": 1.832410239453308, + "grad_norm": 6.156078338623047, + "learning_rate": 5.327791989587179e-05, + "loss": 0.9723, "step": 133000 }, { - "epoch": 1.36, - "learning_rate": 5.9514012825435836e-05, - "loss": 1.2949, + "epoch": 1.8337879915130473, + "grad_norm": 7.734241008758545, + "learning_rate": 5.327512693168104e-05, + "loss": 0.9273, "step": 133100 }, { - "epoch": 1.36, - "learning_rate": 5.951282291747248e-05, - "loss": 1.4713, + "epoch": 1.8351657435727866, + "grad_norm": 12.243995666503906, + "learning_rate": 5.32723317777614e-05, + "loss": 1.0539, "step": 133200 }, { - "epoch": 1.36, - "learning_rate": 5.9511631566508204e-05, - "loss": 1.3082, + "epoch": 1.836543495632526, + "grad_norm": 12.704156875610352, + "learning_rate": 5.326953443435032e-05, + "loss": 0.9992, "step": 133300 }, { - "epoch": 1.36, - "learning_rate": 5.951043877260126e-05, - "loss": 1.351, + "epoch": 1.8379212476922653, + "grad_norm": 10.600850105285645, + "learning_rate": 5.3266734901685454e-05, + "loss": 1.1213, "step": 133400 }, { - "epoch": 1.36, - "learning_rate": 5.950924453580997e-05, - "loss": 1.4038, + "epoch": 1.8392989997520046, + "grad_norm": 19.322465896606445, + "learning_rate": 5.326393318000464e-05, + "loss": 1.1197, "step": 133500 }, { - "epoch": 1.36, - "learning_rate": 5.950804885619272e-05, - "loss": 1.2508, + "epoch": 1.840676751811744, + "grad_norm": 16.50214195251465, + "learning_rate": 5.32611292695459e-05, + "loss": 0.8972, "step": 133600 }, { - "epoch": 1.36, - "learning_rate": 5.950685173380798e-05, - "loss": 1.209, + "epoch": 1.8420545038714833, + "grad_norm": 10.501808166503906, + "learning_rate": 5.325832317054742e-05, + "loss": 1.0425, "step": 133700 }, { - "epoch": 1.36, - "learning_rate": 5.950565316871427e-05, - "loss": 1.1596, + "epoch": 1.8434322559312226, + "grad_norm": 29.4556941986084, + "learning_rate": 5.325551488324762e-05, + "loss": 0.9163, "step": 133800 }, { - "epoch": 1.36, - "learning_rate": 5.950445316097021e-05, - "loss": 1.2576, + "epoch": 1.844810007990962, + "grad_norm": 4.9216485023498535, + "learning_rate": 5.3252704407885074e-05, + "loss": 0.9956, "step": 133900 }, { - "epoch": 1.37, - "learning_rate": 5.950325171063447e-05, - "loss": 1.4181, + "epoch": 1.8461877600507013, + "grad_norm": 12.077922821044922, + "learning_rate": 5.324989174469852e-05, + "loss": 0.9936, "step": 134000 }, { - "epoch": 1.37, - "learning_rate": 5.9502048817765775e-05, - "loss": 1.2478, + "epoch": 1.8475655121104406, + "grad_norm": 27.104740142822266, + "learning_rate": 5.3247105053262415e-05, + "loss": 1.1411, "step": 134100 }, { - "epoch": 1.37, - "learning_rate": 5.9500856532916435e-05, - "loss": 1.2614, + "epoch": 1.84894326417018, + "grad_norm": 9.051424026489258, + "learning_rate": 5.32442880370172e-05, + "loss": 0.9995, "step": 134200 }, { - "epoch": 1.37, - "learning_rate": 5.949965076958223e-05, - "loss": 1.3422, + "epoch": 1.8503210162299193, + "grad_norm": 7.275717735290527, + "learning_rate": 5.324146883366301e-05, + "loss": 1.1227, "step": 134300 }, { - "epoch": 1.37, - "learning_rate": 5.9498443563891155e-05, - "loss": 1.3472, + "epoch": 1.8516987682896586, + "grad_norm": 10.74730396270752, + "learning_rate": 5.323864744343936e-05, + "loss": 0.9496, "step": 134400 }, { - "epoch": 1.37, - "learning_rate": 5.949723491590222e-05, - "loss": 1.3978, + "epoch": 1.853076520349398, + "grad_norm": 32.46671676635742, + "learning_rate": 5.323582386658592e-05, + "loss": 0.9779, "step": 134500 }, { - "epoch": 1.37, - "learning_rate": 5.9496024825674523e-05, - "loss": 1.4959, + "epoch": 1.8544542724091373, + "grad_norm": 6.33997917175293, + "learning_rate": 5.323299810334257e-05, + "loss": 0.948, "step": 134600 }, { - "epoch": 1.37, - "learning_rate": 5.949481329326724e-05, - "loss": 1.3894, + "epoch": 1.8558320244688766, + "grad_norm": 7.527612209320068, + "learning_rate": 5.32301701539494e-05, + "loss": 0.9229, "step": 134700 }, { - "epoch": 1.37, - "learning_rate": 5.949360031873961e-05, - "loss": 1.2948, + "epoch": 1.857209776528616, + "grad_norm": 13.82846450805664, + "learning_rate": 5.322734001864664e-05, + "loss": 0.9606, "step": 134800 }, { - "epoch": 1.37, - "learning_rate": 5.949238590215093e-05, - "loss": 1.2448, + "epoch": 1.8585875285883553, + "grad_norm": 28.552188873291016, + "learning_rate": 5.322450769767472e-05, + "loss": 1.0237, "step": 134900 }, { - "epoch": 1.38, - "learning_rate": 5.9491170043560575e-05, - "loss": 1.4274, + "epoch": 1.8599652806480944, + "grad_norm": 6.158682346343994, + "learning_rate": 5.3221673191274255e-05, + "loss": 1.0853, "step": 135000 }, { - "epoch": 1.38, - "learning_rate": 5.9489952743028004e-05, - "loss": 1.3352, + "epoch": 1.861343032707834, + "grad_norm": 28.212495803833008, + "learning_rate": 5.321883649968608e-05, + "loss": 0.9924, "step": 135100 }, { - "epoch": 1.38, - "learning_rate": 5.9488734000612735e-05, - "loss": 1.3923, + "epoch": 1.862720784767573, + "grad_norm": 9.977272987365723, + "learning_rate": 5.321599762315116e-05, + "loss": 0.9859, "step": 135200 }, { - "epoch": 1.38, - "learning_rate": 5.948751381637434e-05, - "loss": 1.4538, + "epoch": 1.8640985368273126, + "grad_norm": 8.814349174499512, + "learning_rate": 5.321315656191067e-05, + "loss": 0.876, "step": 135300 }, { - "epoch": 1.38, - "learning_rate": 5.94862921903725e-05, - "loss": 1.2411, + "epoch": 1.8654762888870517, + "grad_norm": 11.563972473144531, + "learning_rate": 5.3210313316205995e-05, + "loss": 0.9478, "step": 135400 }, { - "epoch": 1.38, - "learning_rate": 5.948508136048023e-05, - "loss": 1.3221, + "epoch": 1.8668540409467913, + "grad_norm": 6.570127964019775, + "learning_rate": 5.320746788627867e-05, + "loss": 0.9925, "step": 135500 }, { - "epoch": 1.38, - "learning_rate": 5.9483856865546886e-05, - "loss": 1.3336, + "epoch": 1.8682317930065304, + "grad_norm": 14.85019302368164, + "learning_rate": 5.320462027237043e-05, + "loss": 0.9663, "step": 135600 }, { - "epoch": 1.38, - "learning_rate": 5.9482630929028895e-05, - "loss": 1.3068, + "epoch": 1.86960954506627, + "grad_norm": 10.365165710449219, + "learning_rate": 5.320177047472319e-05, + "loss": 1.0398, "step": 135700 }, { - "epoch": 1.38, - "learning_rate": 5.948140355098619e-05, - "loss": 1.2702, + "epoch": 1.870987297126009, + "grad_norm": 17.870952606201172, + "learning_rate": 5.3198918493579054e-05, + "loss": 1.001, "step": 135800 }, { - "epoch": 1.38, - "learning_rate": 5.9480174731478784e-05, - "loss": 1.2973, + "epoch": 1.8723650491857486, + "grad_norm": 22.607162475585938, + "learning_rate": 5.3196064329180326e-05, + "loss": 1.0497, "step": 135900 }, { - "epoch": 1.39, - "learning_rate": 5.947894447056677e-05, - "loss": 1.4826, + "epoch": 1.8737428012454878, + "grad_norm": 4.769906044006348, + "learning_rate": 5.3193207981769465e-05, + "loss": 1.0029, "step": 136000 }, { - "epoch": 1.39, - "learning_rate": 5.9477712768310296e-05, - "loss": 1.279, + "epoch": 1.8751205533052273, + "grad_norm": 22.761455535888672, + "learning_rate": 5.319034945158914e-05, + "loss": 0.9172, "step": 136100 }, { - "epoch": 1.39, - "learning_rate": 5.947647962476957e-05, - "loss": 1.3991, + "epoch": 1.8764983053649664, + "grad_norm": 20.56629180908203, + "learning_rate": 5.318748873888221e-05, + "loss": 0.9593, "step": 136200 }, { - "epoch": 1.39, - "learning_rate": 5.94752450400049e-05, - "loss": 1.3375, + "epoch": 1.877876057424706, + "grad_norm": 14.103052139282227, + "learning_rate": 5.3184654483643094e-05, + "loss": 1.12, "step": 136300 }, { - "epoch": 1.39, - "learning_rate": 5.947400901407665e-05, - "loss": 1.4318, + "epoch": 1.879253809484445, + "grad_norm": 5.662879467010498, + "learning_rate": 5.318178942843141e-05, + "loss": 1.0458, "step": 136400 }, { - "epoch": 1.39, - "learning_rate": 5.9472783928848826e-05, - "loss": 1.2377, + "epoch": 1.8806315615441846, + "grad_norm": 28.704423904418945, + "learning_rate": 5.3178922191420325e-05, + "loss": 1.041, "step": 136500 }, { - "epoch": 1.39, - "learning_rate": 5.9471545035184906e-05, - "loss": 1.2923, + "epoch": 1.8820093136039238, + "grad_norm": 12.396733283996582, + "learning_rate": 5.3176052772853416e-05, + "loss": 0.9739, "step": 136600 }, { - "epoch": 1.39, - "learning_rate": 5.947030470053831e-05, - "loss": 1.4292, + "epoch": 1.8833870656636633, + "grad_norm": 7.1563615798950195, + "learning_rate": 5.317318117297447e-05, + "loss": 1.0313, "step": 136700 }, { - "epoch": 1.39, - "learning_rate": 5.946906292496969e-05, - "loss": 1.4379, + "epoch": 1.8847648177234024, + "grad_norm": 16.226806640625, + "learning_rate": 5.317030739202745e-05, + "loss": 1.0649, "step": 136800 }, { - "epoch": 1.39, - "learning_rate": 5.946781970853975e-05, - "loss": 1.3305, + "epoch": 1.886142569783142, + "grad_norm": 44.95390319824219, + "learning_rate": 5.3167431430256484e-05, + "loss": 1.0083, "step": 136900 }, { - "epoch": 1.4, - "learning_rate": 5.946657505130929e-05, - "loss": 1.3043, + "epoch": 1.887520321842881, + "grad_norm": 9.811665534973145, + "learning_rate": 5.31645532879059e-05, + "loss": 0.9397, "step": 137000 }, { - "epoch": 1.4, - "learning_rate": 5.9465328953339144e-05, - "loss": 1.5034, + "epoch": 1.8888980739026204, + "grad_norm": 8.302355766296387, + "learning_rate": 5.3161672965220227e-05, + "loss": 1.0824, "step": 137100 }, { - "epoch": 1.4, - "learning_rate": 5.9464081414690255e-05, - "loss": 1.4252, + "epoch": 1.8902758259623598, + "grad_norm": 61.96098327636719, + "learning_rate": 5.315879046244414e-05, + "loss": 0.9977, "step": 137200 }, { - "epoch": 1.4, - "learning_rate": 5.946283243542362e-05, - "loss": 1.3432, + "epoch": 1.891653578022099, + "grad_norm": 82.41322326660156, + "learning_rate": 5.3155905779822534e-05, + "loss": 0.9977, "step": 137300 }, { - "epoch": 1.4, - "learning_rate": 5.9461582015600305e-05, - "loss": 1.2562, + "epoch": 1.8930313300818384, + "grad_norm": 16.206649780273438, + "learning_rate": 5.3153018917600475e-05, + "loss": 1.0369, "step": 137400 }, { - "epoch": 1.4, - "learning_rate": 5.946033015528144e-05, - "loss": 1.3676, + "epoch": 1.8944090821415778, + "grad_norm": 12.784920692443848, + "learning_rate": 5.315012987602322e-05, + "loss": 0.9264, "step": 137500 }, { - "epoch": 1.4, - "learning_rate": 5.945907685452825e-05, - "loss": 1.3401, + "epoch": 1.895786834201317, + "grad_norm": 41.18064880371094, + "learning_rate": 5.3147238655336214e-05, + "loss": 1.0255, "step": 137600 }, { - "epoch": 1.4, - "learning_rate": 5.9457822113402e-05, - "loss": 1.4083, + "epoch": 1.8971645862610564, + "grad_norm": 6.60025691986084, + "learning_rate": 5.314434525578507e-05, + "loss": 1.0017, "step": 137700 }, { - "epoch": 1.4, - "learning_rate": 5.9456565931964047e-05, - "loss": 1.3111, + "epoch": 1.8985423383207958, + "grad_norm": 25.215187072753906, + "learning_rate": 5.3141449677615604e-05, + "loss": 1.0696, "step": 137800 }, { - "epoch": 1.4, - "learning_rate": 5.945530831027581e-05, - "loss": 1.3548, + "epoch": 1.899920090380535, + "grad_norm": 7.089141368865967, + "learning_rate": 5.3138551921073804e-05, + "loss": 0.9741, "step": 137900 }, { - "epoch": 1.41, - "learning_rate": 5.9454049248398766e-05, - "loss": 1.3126, + "epoch": 1.9012978424402744, + "grad_norm": 12.382874488830566, + "learning_rate": 5.313565198640585e-05, + "loss": 0.9946, "step": 138000 }, { - "epoch": 1.41, - "learning_rate": 5.945278874639449e-05, - "loss": 1.3558, + "epoch": 1.9026755945000138, + "grad_norm": 8.173346519470215, + "learning_rate": 5.3132749873858116e-05, + "loss": 0.9643, "step": 138100 }, { - "epoch": 1.41, - "learning_rate": 5.9451526804324614e-05, - "loss": 1.1784, + "epoch": 1.9040533465597531, + "grad_norm": 8.598787307739258, + "learning_rate": 5.312984558367713e-05, + "loss": 0.9642, "step": 138200 }, { - "epoch": 1.41, - "learning_rate": 5.945026342225083e-05, - "loss": 1.2846, + "epoch": 1.9054310986194924, + "grad_norm": 13.2457914352417, + "learning_rate": 5.312693911610966e-05, + "loss": 1.0556, "step": 138300 }, { - "epoch": 1.41, - "learning_rate": 5.9448998600234916e-05, - "loss": 1.222, + "epoch": 1.9068088506792318, + "grad_norm": 9.43970775604248, + "learning_rate": 5.312403047140259e-05, + "loss": 0.9242, "step": 138400 }, { - "epoch": 1.41, - "learning_rate": 5.944773233833872e-05, - "loss": 1.2498, + "epoch": 1.9081866027389711, + "grad_norm": 3.202592134475708, + "learning_rate": 5.312111964980305e-05, + "loss": 1.0047, "step": 138500 }, { - "epoch": 1.41, - "learning_rate": 5.944646463662414e-05, - "loss": 1.2831, + "epoch": 1.9095643547987105, + "grad_norm": 8.235738754272461, + "learning_rate": 5.3118206651558315e-05, + "loss": 0.9901, "step": 138600 }, { - "epoch": 1.41, - "learning_rate": 5.9445195495153165e-05, - "loss": 1.3749, + "epoch": 1.9109421068584498, + "grad_norm": 6.716213703155518, + "learning_rate": 5.3115291476915855e-05, + "loss": 0.943, "step": 138700 }, { - "epoch": 1.41, - "learning_rate": 5.944392491398785e-05, - "loss": 1.272, + "epoch": 1.9123198589181891, + "grad_norm": 11.197039604187012, + "learning_rate": 5.311237412612335e-05, + "loss": 1.0723, "step": 138800 }, { - "epoch": 1.42, - "learning_rate": 5.944265289319032e-05, - "loss": 1.1718, + "epoch": 1.9136976109779285, + "grad_norm": 22.365943908691406, + "learning_rate": 5.3109454599428626e-05, + "loss": 1.1182, "step": 138900 }, { - "epoch": 1.42, - "learning_rate": 5.9441379432822776e-05, - "loss": 1.3356, + "epoch": 1.9150753630376678, + "grad_norm": 8.819984436035156, + "learning_rate": 5.310653289707971e-05, + "loss": 1.058, "step": 139000 }, { - "epoch": 1.42, - "learning_rate": 5.944010453294747e-05, - "loss": 1.2766, + "epoch": 1.9164531150974071, + "grad_norm": 8.569857597351074, + "learning_rate": 5.310360901932483e-05, + "loss": 1.074, "step": 139100 }, { - "epoch": 1.42, - "learning_rate": 5.943882819362674e-05, - "loss": 1.3597, + "epoch": 1.9178308671571465, + "grad_norm": 14.33634090423584, + "learning_rate": 5.310068296641237e-05, + "loss": 1.1774, "step": 139200 }, { - "epoch": 1.42, - "learning_rate": 5.943755041492299e-05, - "loss": 1.4762, + "epoch": 1.9192086192168856, + "grad_norm": 4.235088348388672, + "learning_rate": 5.309775473859092e-05, + "loss": 0.9613, "step": 139300 }, { - "epoch": 1.42, - "learning_rate": 5.9436271196898703e-05, - "loss": 1.6504, + "epoch": 1.9205863712766251, + "grad_norm": 22.20789337158203, + "learning_rate": 5.309482433610926e-05, + "loss": 0.9929, "step": 139400 }, { - "epoch": 1.42, - "learning_rate": 5.943499053961642e-05, - "loss": 1.3129, + "epoch": 1.9219641233363642, + "grad_norm": 4.4684343338012695, + "learning_rate": 5.3091891759216326e-05, + "loss": 0.9585, "step": 139500 }, { - "epoch": 1.42, - "learning_rate": 5.943370844313876e-05, - "loss": 1.4621, + "epoch": 1.9233418753961038, + "grad_norm": 16.434553146362305, + "learning_rate": 5.308895700816125e-05, + "loss": 0.9839, "step": 139600 }, { - "epoch": 1.42, - "learning_rate": 5.94324249075284e-05, - "loss": 1.6409, + "epoch": 1.924719627455843, + "grad_norm": 35.85634994506836, + "learning_rate": 5.30860494632031e-05, + "loss": 1.0199, "step": 139700 }, { - "epoch": 1.42, - "learning_rate": 5.943113993284811e-05, - "loss": 1.6821, + "epoch": 1.9260973795155825, + "grad_norm": 18.60834312438965, + "learning_rate": 5.3083139787835095e-05, + "loss": 0.9354, "step": 139800 }, { - "epoch": 1.43, - "learning_rate": 5.942985351916072e-05, - "loss": 1.4077, + "epoch": 1.9274751315753216, + "grad_norm": 9.364298820495605, + "learning_rate": 5.308019855925613e-05, + "loss": 1.017, "step": 139900 }, { - "epoch": 1.43, - "learning_rate": 5.942856566652912e-05, - "loss": 1.4963, + "epoch": 1.9288528836350611, + "grad_norm": 38.099571228027344, + "learning_rate": 5.3077255157508424e-05, + "loss": 0.9935, "step": 140000 }, { - "epoch": 1.43, - "learning_rate": 5.9427276375016275e-05, - "loss": 1.8543, + "epoch": 1.9302306356948002, + "grad_norm": 39.307682037353516, + "learning_rate": 5.3074309582842035e-05, + "loss": 1.0742, "step": 140100 }, { - "epoch": 1.43, - "learning_rate": 5.942598564468523e-05, - "loss": 1.8604, + "epoch": 1.9316083877545398, + "grad_norm": 70.52771759033203, + "learning_rate": 5.3071361835507216e-05, + "loss": 0.9509, "step": 140200 }, { - "epoch": 1.43, - "learning_rate": 5.942469347559909e-05, - "loss": 1.7474, + "epoch": 1.932986139814279, + "grad_norm": 120.77656555175781, + "learning_rate": 5.306841191575439e-05, + "loss": 1.0197, "step": 140300 }, { - "epoch": 1.43, - "learning_rate": 5.9423399867821035e-05, - "loss": 1.7125, + "epoch": 1.9343638918740185, + "grad_norm": 51.007225036621094, + "learning_rate": 5.3065459823834155e-05, + "loss": 1.0303, "step": 140400 }, { - "epoch": 1.43, - "learning_rate": 5.942210482141432e-05, - "loss": 1.691, + "epoch": 1.9357416439337576, + "grad_norm": 16.969768524169922, + "learning_rate": 5.306250555999732e-05, + "loss": 0.952, "step": 140500 }, { - "epoch": 1.43, - "learning_rate": 5.942080833644226e-05, - "loss": 1.3812, + "epoch": 1.9371193959934971, + "grad_norm": 11.94832706451416, + "learning_rate": 5.3059549124494866e-05, + "loss": 1.0485, "step": 140600 }, { - "epoch": 1.43, - "learning_rate": 5.941951041296824e-05, - "loss": 1.4928, + "epoch": 1.9384971480532363, + "grad_norm": 20.453916549682617, + "learning_rate": 5.3056590517577946e-05, + "loss": 1.0441, "step": 140700 }, { - "epoch": 1.43, - "learning_rate": 5.941822405179493e-05, - "loss": 1.4654, + "epoch": 1.9398749001129758, + "grad_norm": 24.510923385620117, + "learning_rate": 5.3053629739497916e-05, + "loss": 0.955, "step": 140800 }, { - "epoch": 1.44, - "learning_rate": 5.941692326589089e-05, - "loss": 1.5307, + "epoch": 1.941252652172715, + "grad_norm": 21.415592193603516, + "learning_rate": 5.3050666790506316e-05, + "loss": 1.0676, "step": 140900 }, { - "epoch": 1.44, - "learning_rate": 5.9415621041674866e-05, - "loss": 1.4875, + "epoch": 1.9426304042324545, + "grad_norm": 10.470038414001465, + "learning_rate": 5.3047701670854844e-05, + "loss": 1.0797, "step": 141000 }, { - "epoch": 1.44, - "learning_rate": 5.941433042295427e-05, - "loss": 1.4396, + "epoch": 1.9440081562921936, + "grad_norm": 5.278820037841797, + "learning_rate": 5.304473438079542e-05, + "loss": 1.0034, "step": 141100 }, { - "epoch": 1.44, - "learning_rate": 5.941302533668686e-05, - "loss": 1.482, + "epoch": 1.9453859083519331, + "grad_norm": 9.555508613586426, + "learning_rate": 5.304176492058012e-05, + "loss": 1.0018, "step": 141200 }, { - "epoch": 1.44, - "learning_rate": 5.941171881229804e-05, - "loss": 1.3334, + "epoch": 1.9467636604116723, + "grad_norm": 5.737916469573975, + "learning_rate": 5.303879329046122e-05, + "loss": 1.0342, "step": 141300 }, { - "epoch": 1.44, - "learning_rate": 5.941041084985169e-05, - "loss": 1.3403, + "epoch": 1.9481414124714116, + "grad_norm": 56.66264724731445, + "learning_rate": 5.303581949069118e-05, + "loss": 1.1203, "step": 141400 }, { - "epoch": 1.44, - "learning_rate": 5.940910144941176e-05, - "loss": 1.4023, + "epoch": 1.949519164531151, + "grad_norm": 11.701165199279785, + "learning_rate": 5.303284352152263e-05, + "loss": 1.1388, "step": 141500 }, { - "epoch": 1.44, - "learning_rate": 5.9407790611042273e-05, - "loss": 1.2677, + "epoch": 1.9508969165908903, + "grad_norm": 16.850122451782227, + "learning_rate": 5.30298653832084e-05, + "loss": 1.1057, "step": 141600 }, { - "epoch": 1.44, - "learning_rate": 5.9406478334807324e-05, - "loss": 1.3845, + "epoch": 1.9522746686506296, + "grad_norm": 5.651546001434326, + "learning_rate": 5.302688507600149e-05, + "loss": 1.0818, "step": 141700 }, { - "epoch": 1.44, - "learning_rate": 5.9405164620771074e-05, - "loss": 1.3756, + "epoch": 1.953652420710369, + "grad_norm": 84.90093994140625, + "learning_rate": 5.302390260015511e-05, + "loss": 1.0933, "step": 141800 }, { - "epoch": 1.45, - "learning_rate": 5.940384946899775e-05, - "loss": 1.3117, + "epoch": 1.9550301727701083, + "grad_norm": 55.46168518066406, + "learning_rate": 5.3020917955922616e-05, + "loss": 1.1006, "step": 141900 }, { - "epoch": 1.45, - "learning_rate": 5.940253287955166e-05, - "loss": 1.3507, + "epoch": 1.9564079248298476, + "grad_norm": 56.97056579589844, + "learning_rate": 5.301793114355758e-05, + "loss": 1.2585, "step": 142000 }, { - "epoch": 1.45, - "learning_rate": 5.9401214852497185e-05, - "loss": 1.3176, + "epoch": 1.957785676889587, + "grad_norm": 45.9921760559082, + "learning_rate": 5.301494216331374e-05, + "loss": 1.4076, "step": 142100 }, { - "epoch": 1.45, - "learning_rate": 5.939989538789875e-05, - "loss": 1.4573, + "epoch": 1.9591634289493263, + "grad_norm": 47.68534851074219, + "learning_rate": 5.301198093765263e-05, + "loss": 1.289, "step": 142200 }, { - "epoch": 1.45, - "learning_rate": 5.939857448582089e-05, - "loss": 1.3671, + "epoch": 1.9605411810090656, + "grad_norm": 14.917179107666016, + "learning_rate": 5.300898764408561e-05, + "loss": 1.1821, "step": 142300 }, { - "epoch": 1.45, - "learning_rate": 5.939725214632817e-05, - "loss": 1.3991, + "epoch": 1.961918933068805, + "grad_norm": 24.069034576416016, + "learning_rate": 5.300602214873286e-05, + "loss": 1.2142, "step": 142400 }, { - "epoch": 1.45, - "learning_rate": 5.939592836948527e-05, - "loss": 1.4889, + "epoch": 1.9632966851285443, + "grad_norm": 19.97752571105957, + "learning_rate": 5.300302454284971e-05, + "loss": 1.2426, "step": 142500 }, { - "epoch": 1.45, - "learning_rate": 5.939460315535688e-05, - "loss": 1.3768, + "epoch": 1.9646744371882836, + "grad_norm": 19.85276985168457, + "learning_rate": 5.300002477035414e-05, + "loss": 1.1953, "step": 142600 }, { - "epoch": 1.45, - "learning_rate": 5.939327650400782e-05, - "loss": 1.4841, + "epoch": 1.966052189248023, + "grad_norm": 16.321653366088867, + "learning_rate": 5.2997022831501014e-05, + "loss": 1.0981, "step": 142700 }, { - "epoch": 1.45, - "learning_rate": 5.9391948415502953e-05, - "loss": 1.4147, + "epoch": 1.9674299413077623, + "grad_norm": 80.24627685546875, + "learning_rate": 5.2994018726545354e-05, + "loss": 1.161, "step": 142800 }, { - "epoch": 1.46, - "learning_rate": 5.93906188899072e-05, - "loss": 1.4139, + "epoch": 1.9688076933675016, + "grad_norm": 6.758548259735107, + "learning_rate": 5.299101245574238e-05, + "loss": 1.1315, "step": 142900 }, { - "epoch": 1.46, - "learning_rate": 5.938928792728559e-05, - "loss": 1.5077, + "epoch": 1.970185445427241, + "grad_norm": 53.158634185791016, + "learning_rate": 5.2988004019347485e-05, + "loss": 1.2088, "step": 143000 }, { - "epoch": 1.46, - "learning_rate": 5.9387955527703184e-05, - "loss": 1.4399, + "epoch": 1.9715631974869803, + "grad_norm": 107.23058319091797, + "learning_rate": 5.2984993417616255e-05, + "loss": 1.2485, "step": 143100 }, { - "epoch": 1.46, - "learning_rate": 5.938662169122513e-05, - "loss": 1.4146, + "epoch": 1.9729409495467196, + "grad_norm": 9.508332252502441, + "learning_rate": 5.298198065080446e-05, + "loss": 0.9912, "step": 143200 }, { - "epoch": 1.46, - "learning_rate": 5.9385286417916634e-05, - "loss": 1.3995, + "epoch": 1.974318701606459, + "grad_norm": 51.10231018066406, + "learning_rate": 5.2978965719168025e-05, + "loss": 1.0465, "step": 143300 }, { - "epoch": 1.46, - "learning_rate": 5.9383949707843004e-05, - "loss": 1.2944, + "epoch": 1.9756964536661983, + "grad_norm": 4.199136257171631, + "learning_rate": 5.2975948622963114e-05, + "loss": 1.0531, "step": 143400 }, { - "epoch": 1.46, - "learning_rate": 5.938261156106959e-05, - "loss": 1.3506, + "epoch": 1.9770742057259376, + "grad_norm": 31.836280822753906, + "learning_rate": 5.297292936244604e-05, + "loss": 1.159, "step": 143500 }, { - "epoch": 1.46, - "learning_rate": 5.9381271977661814e-05, - "loss": 1.188, + "epoch": 1.9784519577856767, + "grad_norm": 37.550933837890625, + "learning_rate": 5.296990793787328e-05, + "loss": 1.1227, "step": 143600 }, { - "epoch": 1.46, - "learning_rate": 5.937993095768518e-05, - "loss": 1.2377, + "epoch": 1.9798297098454163, + "grad_norm": 2.2217354774475098, + "learning_rate": 5.2966884349501555e-05, + "loss": 1.1316, "step": 143700 }, { - "epoch": 1.47, - "learning_rate": 5.937858850120525e-05, - "loss": 1.4885, + "epoch": 1.9812074619051554, + "grad_norm": 26.14838218688965, + "learning_rate": 5.296385859758771e-05, + "loss": 0.9354, "step": 143800 }, { - "epoch": 1.47, - "learning_rate": 5.937724460828766e-05, - "loss": 1.1877, + "epoch": 1.982585213964895, + "grad_norm": 16.10267448425293, + "learning_rate": 5.29608306823888e-05, + "loss": 1.1742, "step": 143900 }, { - "epoch": 1.47, - "learning_rate": 5.9375899278998124e-05, - "loss": 1.2533, + "epoch": 1.983962966024634, + "grad_norm": 27.14734649658203, + "learning_rate": 5.2957800604162075e-05, + "loss": 0.9897, "step": 144000 }, { - "epoch": 1.47, - "learning_rate": 5.937455251340242e-05, - "loss": 1.3347, + "epoch": 1.9853407180843736, + "grad_norm": 9.895814895629883, + "learning_rate": 5.2954768363164945e-05, + "loss": 1.0951, "step": 144100 }, { - "epoch": 1.47, - "learning_rate": 5.93732043115664e-05, - "loss": 1.2398, + "epoch": 1.9867184701441127, + "grad_norm": 12.091023445129395, + "learning_rate": 5.295173395965502e-05, + "loss": 1.0686, "step": 144200 }, { - "epoch": 1.47, - "learning_rate": 5.937185467355598e-05, - "loss": 1.368, + "epoch": 1.9880962222038523, + "grad_norm": 19.649566650390625, + "learning_rate": 5.2948697393890075e-05, + "loss": 1.0196, "step": 144300 }, { - "epoch": 1.47, - "learning_rate": 5.9370503599437145e-05, - "loss": 1.3697, + "epoch": 1.9894739742635914, + "grad_norm": 8.195260047912598, + "learning_rate": 5.29456586661281e-05, + "loss": 1.155, "step": 144400 }, { - "epoch": 1.47, - "learning_rate": 5.936915108927596e-05, - "loss": 1.3518, + "epoch": 1.990851726323331, + "grad_norm": 167.79872131347656, + "learning_rate": 5.294261777662724e-05, + "loss": 1.1521, "step": 144500 }, { - "epoch": 1.47, - "learning_rate": 5.936779714313855e-05, - "loss": 1.3357, + "epoch": 1.99222947838307, + "grad_norm": 9.540665626525879, + "learning_rate": 5.293957472564584e-05, + "loss": 1.1728, "step": 144600 }, { - "epoch": 1.47, - "learning_rate": 5.936644176109112e-05, - "loss": 1.2978, + "epoch": 1.9936072304428096, + "grad_norm": 13.336929321289062, + "learning_rate": 5.2936529513442414e-05, + "loss": 0.9594, "step": 144700 }, { - "epoch": 1.48, - "learning_rate": 5.936508494319994e-05, - "loss": 1.3547, + "epoch": 1.9949849825025487, + "grad_norm": 5.253993511199951, + "learning_rate": 5.293348214027568e-05, + "loss": 0.9967, "step": 144800 }, { - "epoch": 1.48, - "learning_rate": 5.936372668953134e-05, - "loss": 1.2259, + "epoch": 1.9963627345622883, + "grad_norm": 244.40768432617188, + "learning_rate": 5.293043260640451e-05, + "loss": 1.0451, "step": 144900 }, { - "epoch": 1.48, - "learning_rate": 5.936236700015174e-05, - "loss": 1.4325, + "epoch": 1.9977404866220274, + "grad_norm": 9.982230186462402, + "learning_rate": 5.2927380912088e-05, + "loss": 1.0934, "step": 145000 }, { - "epoch": 1.48, - "learning_rate": 5.9361005875127624e-05, - "loss": 1.4178, + "epoch": 1.999118238681767, + "grad_norm": 47.84312057495117, + "learning_rate": 5.292432705758539e-05, + "loss": 1.19, "step": 145100 }, { - "epoch": 1.48, - "learning_rate": 5.9359643314525543e-05, - "loss": 1.2432, + "epoch": 2.000495990741506, + "grad_norm": 8.944329261779785, + "learning_rate": 5.292127104315613e-05, + "loss": 1.0664, "step": 145200 }, { - "epoch": 1.48, - "learning_rate": 5.93582793184121e-05, - "loss": 1.3914, + "epoch": 2.0018737428012456, + "grad_norm": 25.012645721435547, + "learning_rate": 5.291821286905984e-05, + "loss": 1.089, "step": 145300 }, { - "epoch": 1.48, - "learning_rate": 5.935691388685401e-05, - "loss": 1.3872, + "epoch": 2.0032514948609847, + "grad_norm": 7.168227195739746, + "learning_rate": 5.291515253555632e-05, + "loss": 1.0854, "step": 145400 }, { - "epoch": 1.48, - "learning_rate": 5.935554701991801e-05, - "loss": 1.2988, + "epoch": 2.0046292469207243, + "grad_norm": 25.42429542541504, + "learning_rate": 5.291209004290557e-05, + "loss": 1.071, "step": 145500 }, { - "epoch": 1.48, - "learning_rate": 5.935417871767096e-05, - "loss": 1.2208, + "epoch": 2.0060069989804634, + "grad_norm": 16.521404266357422, + "learning_rate": 5.290902539136777e-05, + "loss": 1.0406, "step": 145600 }, { - "epoch": 1.48, - "learning_rate": 5.9352808980179744e-05, - "loss": 1.2235, + "epoch": 2.007384751040203, + "grad_norm": 16.918079376220703, + "learning_rate": 5.290595858120328e-05, + "loss": 1.0292, "step": 145700 }, { - "epoch": 1.49, - "learning_rate": 5.9351437807511335e-05, - "loss": 1.295, + "epoch": 2.008762503099942, + "grad_norm": 5.9267144203186035, + "learning_rate": 5.290288961267261e-05, + "loss": 0.9708, "step": 145800 }, { - "epoch": 1.49, - "learning_rate": 5.935006519973278e-05, - "loss": 1.1823, + "epoch": 2.0101402551596816, + "grad_norm": 16.487794876098633, + "learning_rate": 5.2899818486036525e-05, + "loss": 1.0435, "step": 145900 }, { - "epoch": 1.49, - "learning_rate": 5.934869115691119e-05, - "loss": 1.1615, + "epoch": 2.0115180072194208, + "grad_norm": 11.98873233795166, + "learning_rate": 5.289674520155591e-05, + "loss": 0.9971, "step": 146000 }, { - "epoch": 1.49, - "learning_rate": 5.934731567911374e-05, - "loss": 1.1835, + "epoch": 2.0128957592791603, + "grad_norm": 14.290670394897461, + "learning_rate": 5.289366975949187e-05, + "loss": 1.1232, "step": 146100 }, { - "epoch": 1.49, - "learning_rate": 5.934593876640769e-05, - "loss": 1.2593, + "epoch": 2.0142735113388994, + "grad_norm": 17.280029296875, + "learning_rate": 5.2890592160105656e-05, + "loss": 0.9318, "step": 146200 }, { - "epoch": 1.49, - "learning_rate": 5.9344560418860366e-05, - "loss": 1.2741, + "epoch": 2.015651263398639, + "grad_norm": 9.275042533874512, + "learning_rate": 5.2887512403658746e-05, + "loss": 1.021, "step": 146300 }, { - "epoch": 1.49, - "learning_rate": 5.934318063653915e-05, - "loss": 1.3302, + "epoch": 2.017029015458378, + "grad_norm": 35.25020980834961, + "learning_rate": 5.288443049041277e-05, + "loss": 1.0714, "step": 146400 }, { - "epoch": 1.49, - "learning_rate": 5.9341799419511525e-05, - "loss": 1.2166, + "epoch": 2.0184067675181177, + "grad_norm": 8.693073272705078, + "learning_rate": 5.2881346420629566e-05, + "loss": 1.101, "step": 146500 }, { - "epoch": 1.49, - "learning_rate": 5.9340416767845005e-05, - "loss": 1.2079, + "epoch": 2.0197845195778568, + "grad_norm": 32.61711120605469, + "learning_rate": 5.2878260194571134e-05, + "loss": 1.0787, "step": 146600 }, { - "epoch": 1.49, - "learning_rate": 5.933903268160719e-05, - "loss": 1.2697, + "epoch": 2.0211622716375963, + "grad_norm": 15.949615478515625, + "learning_rate": 5.2875171812499664e-05, + "loss": 1.0578, "step": 146700 }, { - "epoch": 1.5, - "learning_rate": 5.9337647160865774e-05, - "loss": 1.2259, + "epoch": 2.0225400236973354, + "grad_norm": 11.64084243774414, + "learning_rate": 5.2872081274677524e-05, + "loss": 1.0368, "step": 146800 }, { - "epoch": 1.5, - "learning_rate": 5.933626020568849e-05, - "loss": 1.2263, + "epoch": 2.0239177757570745, + "grad_norm": 29.57013511657715, + "learning_rate": 5.286898858136728e-05, + "loss": 1.0844, "step": 146900 }, { - "epoch": 1.5, - "learning_rate": 5.933487181614314e-05, - "loss": 1.234, + "epoch": 2.025295527816814, + "grad_norm": 17.013378143310547, + "learning_rate": 5.2865893732831675e-05, + "loss": 0.997, "step": 147000 }, { - "epoch": 1.5, - "learning_rate": 5.933348199229763e-05, - "loss": 1.2311, + "epoch": 2.026673279876553, + "grad_norm": 3.7932121753692627, + "learning_rate": 5.286279672933362e-05, + "loss": 1.1071, "step": 147100 }, { - "epoch": 1.5, - "learning_rate": 5.93320907342199e-05, - "loss": 1.1367, + "epoch": 2.0280510319362928, + "grad_norm": 3.597313165664673, + "learning_rate": 5.285969757113623e-05, + "loss": 0.9796, "step": 147200 }, { - "epoch": 1.5, - "learning_rate": 5.9330698041977976e-05, - "loss": 1.2094, + "epoch": 2.029428783996032, + "grad_norm": 3.0874714851379395, + "learning_rate": 5.285659625850279e-05, + "loss": 1.0383, "step": 147300 }, { - "epoch": 1.5, - "learning_rate": 5.9329303915639956e-05, - "loss": 1.0596, + "epoch": 2.0308065360557714, + "grad_norm": 110.7660140991211, + "learning_rate": 5.285349279169677e-05, + "loss": 1.0886, "step": 147400 }, { - "epoch": 1.5, - "learning_rate": 5.9327922317975875e-05, - "loss": 1.2459, + "epoch": 2.0321842881155106, + "grad_norm": 34.73955154418945, + "learning_rate": 5.2850387170981836e-05, + "loss": 0.9866, "step": 147500 }, { - "epoch": 1.5, - "learning_rate": 5.932652533798949e-05, - "loss": 1.4044, + "epoch": 2.03356204017525, + "grad_norm": 11.254148483276367, + "learning_rate": 5.284727939662182e-05, + "loss": 1.0124, "step": 147600 }, { - "epoch": 1.5, - "learning_rate": 5.932512692411102e-05, - "loss": 1.1682, + "epoch": 2.034939792234989, + "grad_norm": 40.052974700927734, + "learning_rate": 5.2844169468880746e-05, + "loss": 1.0453, "step": 147700 }, { - "epoch": 1.51, - "learning_rate": 5.932372707640884e-05, - "loss": 1.098, + "epoch": 2.0363175442947288, + "grad_norm": 13.228699684143066, + "learning_rate": 5.2841057388022805e-05, + "loss": 1.0064, "step": 147800 }, { - "epoch": 1.51, - "learning_rate": 5.932232579495141e-05, - "loss": 1.3269, + "epoch": 2.037695296354468, + "grad_norm": 7.61269474029541, + "learning_rate": 5.283794315431241e-05, + "loss": 1.0142, "step": 147900 }, { - "epoch": 1.51, - "learning_rate": 5.932092307980723e-05, - "loss": 1.3592, + "epoch": 2.0390730484142074, + "grad_norm": 22.158586502075195, + "learning_rate": 5.2834826768014106e-05, + "loss": 0.9403, "step": 148000 }, { - "epoch": 1.51, - "learning_rate": 5.9319518931044896e-05, - "loss": 1.179, + "epoch": 2.0404508004739466, + "grad_norm": 65.23225402832031, + "learning_rate": 5.2831708229392656e-05, + "loss": 1.0094, "step": 148100 }, { - "epoch": 1.51, - "learning_rate": 5.931811334873305e-05, - "loss": 1.2472, + "epoch": 2.041828552533686, + "grad_norm": 33.195255279541016, + "learning_rate": 5.2828587538713004e-05, + "loss": 0.9455, "step": 148200 }, { - "epoch": 1.51, - "learning_rate": 5.9316706332940426e-05, - "loss": 1.2411, + "epoch": 2.0432063045934252, + "grad_norm": 27.402624130249023, + "learning_rate": 5.282546469624025e-05, + "loss": 0.9618, "step": 148300 }, { - "epoch": 1.51, - "learning_rate": 5.9315297883735816e-05, - "loss": 1.1603, + "epoch": 2.044584056653165, + "grad_norm": 11.631880760192871, + "learning_rate": 5.282233970223971e-05, + "loss": 1.0161, "step": 148400 }, { - "epoch": 1.51, - "learning_rate": 5.931388800118808e-05, - "loss": 1.2482, + "epoch": 2.045961808712904, + "grad_norm": 9.41140079498291, + "learning_rate": 5.2819212556976865e-05, + "loss": 1.1504, "step": 148500 }, { - "epoch": 1.51, - "learning_rate": 5.931247668536616e-05, - "loss": 1.288, + "epoch": 2.0473395607726435, + "grad_norm": 22.59799575805664, + "learning_rate": 5.2816083260717376e-05, + "loss": 0.8834, "step": 148600 }, { - "epoch": 1.51, - "learning_rate": 5.9311063936339054e-05, - "loss": 1.3329, + "epoch": 2.0487173128323826, + "grad_norm": 212.4950408935547, + "learning_rate": 5.281298313884226e-05, + "loss": 1.1245, "step": 148700 }, { - "epoch": 1.52, - "learning_rate": 5.930964975417584e-05, - "loss": 1.4245, + "epoch": 2.050095064892122, + "grad_norm": 13.342792510986328, + "learning_rate": 5.280984956289055e-05, + "loss": 1.0093, "step": 148800 }, { - "epoch": 1.52, - "learning_rate": 5.930823413894567e-05, - "loss": 1.1962, + "epoch": 2.0514728169518612, + "grad_norm": 23.59154510498047, + "learning_rate": 5.280671383673765e-05, + "loss": 0.9521, "step": 148900 }, { - "epoch": 1.52, - "learning_rate": 5.930681709071775e-05, - "loss": 1.1768, + "epoch": 2.052850569011601, + "grad_norm": 12.94513988494873, + "learning_rate": 5.280357596064993e-05, + "loss": 1.1052, "step": 149000 }, { - "epoch": 1.52, - "learning_rate": 5.930539860956137e-05, - "loss": 1.2369, + "epoch": 2.05422832107134, + "grad_norm": 15.56832218170166, + "learning_rate": 5.280043593489399e-05, + "loss": 1.0303, "step": 149100 }, { - "epoch": 1.52, - "learning_rate": 5.930397869554588e-05, - "loss": 1.2428, + "epoch": 2.0556060731310795, + "grad_norm": 12.582783699035645, + "learning_rate": 5.279729375973658e-05, + "loss": 0.8779, "step": 149200 }, { - "epoch": 1.52, - "learning_rate": 5.9302557348740716e-05, - "loss": 1.2765, + "epoch": 2.0569838251908186, + "grad_norm": 10.843000411987305, + "learning_rate": 5.279414943544464e-05, + "loss": 0.9863, "step": 149300 }, { - "epoch": 1.52, - "learning_rate": 5.930113456921536e-05, - "loss": 1.0246, + "epoch": 2.058361577250558, + "grad_norm": 13.15982437133789, + "learning_rate": 5.27910029622853e-05, + "loss": 1.023, "step": 149400 }, { - "epoch": 1.52, - "learning_rate": 5.929971035703938e-05, - "loss": 1.2969, + "epoch": 2.0597393293102972, + "grad_norm": 10.263442039489746, + "learning_rate": 5.278785434052588e-05, + "loss": 0.978, "step": 149500 }, { - "epoch": 1.52, - "learning_rate": 5.9298284712282424e-05, - "loss": 1.1388, + "epoch": 2.061117081370037, + "grad_norm": 14.632909774780273, + "learning_rate": 5.278470357043385e-05, + "loss": 0.9385, "step": 149600 }, { - "epoch": 1.53, - "learning_rate": 5.929685763501418e-05, - "loss": 1.1419, + "epoch": 2.062494833429776, + "grad_norm": 15.738605499267578, + "learning_rate": 5.2781550652276904e-05, + "loss": 1.0193, "step": 149700 }, { - "epoch": 1.53, - "learning_rate": 5.9295429125304444e-05, - "loss": 1.1103, + "epoch": 2.0638725854895155, + "grad_norm": 8.371048927307129, + "learning_rate": 5.277839558632289e-05, + "loss": 0.919, "step": 149800 }, { - "epoch": 1.53, - "learning_rate": 5.9293999183223034e-05, - "loss": 1.1301, + "epoch": 2.0652503375492546, + "grad_norm": 19.53658103942871, + "learning_rate": 5.277523837283985e-05, + "loss": 1.0999, "step": 149900 }, { - "epoch": 1.53, - "learning_rate": 5.929256780883989e-05, - "loss": 1.3856, + "epoch": 2.066628089608994, + "grad_norm": 5.926362991333008, + "learning_rate": 5.2772079012095995e-05, + "loss": 1.0423, "step": 150000 }, { - "epoch": 1.53, - "learning_rate": 5.9291135002224994e-05, - "loss": 1.147, + "epoch": 2.0680058416687332, + "grad_norm": 72.1875991821289, + "learning_rate": 5.276891750435974e-05, + "loss": 1.0406, "step": 150100 }, { - "epoch": 1.53, - "learning_rate": 5.928970076344839e-05, - "loss": 1.1833, + "epoch": 2.069383593728473, + "grad_norm": 4.526686668395996, + "learning_rate": 5.2765753849899666e-05, + "loss": 0.8817, "step": 150200 }, { - "epoch": 1.53, - "learning_rate": 5.928826509258021e-05, - "loss": 1.1268, + "epoch": 2.070761345788212, + "grad_norm": 23.28188705444336, + "learning_rate": 5.2762588048984555e-05, + "loss": 0.8766, "step": 150300 }, { - "epoch": 1.53, - "learning_rate": 5.928682798969065e-05, - "loss": 1.3398, + "epoch": 2.0721390978479515, + "grad_norm": 5.578934669494629, + "learning_rate": 5.2759420101883335e-05, + "loss": 0.8567, "step": 150400 }, { - "epoch": 1.53, - "learning_rate": 5.9285389454849974e-05, - "loss": 1.1988, + "epoch": 2.0735168499076906, + "grad_norm": 23.463237762451172, + "learning_rate": 5.275625000886516e-05, + "loss": 0.9686, "step": 150500 }, { - "epoch": 1.53, - "learning_rate": 5.928394948812853e-05, - "loss": 1.1704, + "epoch": 2.07489460196743, + "grad_norm": 10.182479858398438, + "learning_rate": 5.2753077770199326e-05, + "loss": 0.8766, "step": 150600 }, { - "epoch": 1.54, - "learning_rate": 5.92825080895967e-05, - "loss": 1.1583, + "epoch": 2.0762723540271693, + "grad_norm": 16.524850845336914, + "learning_rate": 5.274993514061452e-05, + "loss": 0.9731, "step": 150700 }, { - "epoch": 1.54, - "learning_rate": 5.928106525932499e-05, - "loss": 1.0919, + "epoch": 2.077650106086909, + "grad_norm": 7.713785171508789, + "learning_rate": 5.274675863291182e-05, + "loss": 0.9396, "step": 150800 }, { - "epoch": 1.54, - "learning_rate": 5.927962099738391e-05, - "loss": 1.2255, + "epoch": 2.079027858146648, + "grad_norm": 7.996649265289307, + "learning_rate": 5.2743579980367816e-05, + "loss": 1.0065, "step": 150900 }, { - "epoch": 1.54, - "learning_rate": 5.927817530384411e-05, - "loss": 1.1635, + "epoch": 2.0804056102063875, + "grad_norm": 27.859678268432617, + "learning_rate": 5.274039918325255e-05, + "loss": 1.0318, "step": 151000 }, { - "epoch": 1.54, - "learning_rate": 5.9276728178776256e-05, - "loss": 1.2916, + "epoch": 2.0817833622661266, + "grad_norm": 4.3011064529418945, + "learning_rate": 5.273721624183623e-05, + "loss": 0.8812, "step": 151100 }, { - "epoch": 1.54, - "learning_rate": 5.9275279622251105e-05, - "loss": 1.3269, + "epoch": 2.0831611143258657, + "grad_norm": 42.48961639404297, + "learning_rate": 5.2734031156389284e-05, + "loss": 0.9334, "step": 151200 }, { - "epoch": 1.54, - "learning_rate": 5.927382963433949e-05, - "loss": 1.2458, + "epoch": 2.0845388663856053, + "grad_norm": 11.868022918701172, + "learning_rate": 5.273084392718229e-05, + "loss": 0.9865, "step": 151300 }, { - "epoch": 1.54, - "learning_rate": 5.92723782151123e-05, - "loss": 1.2964, + "epoch": 2.0859166184453444, + "grad_norm": 57.42210006713867, + "learning_rate": 5.2727654554486044e-05, + "loss": 0.8884, "step": 151400 }, { - "epoch": 1.54, - "learning_rate": 5.927092536464051e-05, - "loss": 1.2414, + "epoch": 2.087294370505084, + "grad_norm": 14.839951515197754, + "learning_rate": 5.272446303857146e-05, + "loss": 1.0079, "step": 151500 }, { - "epoch": 1.54, - "learning_rate": 5.926947108299514e-05, - "loss": 1.2577, + "epoch": 2.088672122564823, + "grad_norm": 12.66772747039795, + "learning_rate": 5.27212693797097e-05, + "loss": 1.0085, "step": 151600 }, { - "epoch": 1.55, - "learning_rate": 5.926801537024732e-05, - "loss": 1.2845, + "epoch": 2.0900498746245626, + "grad_norm": 5.242060661315918, + "learning_rate": 5.271807357817208e-05, + "loss": 1.0945, "step": 151700 }, { - "epoch": 1.55, - "learning_rate": 5.92665582264682e-05, - "loss": 1.2276, + "epoch": 2.0914276266843017, + "grad_norm": 17.87880516052246, + "learning_rate": 5.2714875634230086e-05, + "loss": 0.9576, "step": 151800 }, { - "epoch": 1.55, - "learning_rate": 5.926509965172903e-05, - "loss": 1.2558, + "epoch": 2.0928053787440413, + "grad_norm": 58.844966888427734, + "learning_rate": 5.271167554815541e-05, + "loss": 0.9756, "step": 151900 }, { - "epoch": 1.55, - "learning_rate": 5.926365425324009e-05, - "loss": 1.1331, + "epoch": 2.0941831308037804, + "grad_norm": 4.86928653717041, + "learning_rate": 5.270847332021991e-05, + "loss": 0.9737, "step": 152000 }, { - "epoch": 1.55, - "learning_rate": 5.926219283110268e-05, - "loss": 1.2888, + "epoch": 2.09556088286352, + "grad_norm": 63.286617279052734, + "learning_rate": 5.270526895069564e-05, + "loss": 1.0174, "step": 152100 }, { - "epoch": 1.55, - "learning_rate": 5.926072997821866e-05, - "loss": 1.2316, + "epoch": 2.096938634923259, + "grad_norm": 28.359848022460938, + "learning_rate": 5.2702062439854825e-05, + "loss": 0.8507, "step": 152200 }, { - "epoch": 1.55, - "learning_rate": 5.925926569465956e-05, - "loss": 1.1572, + "epoch": 2.0983163869829986, + "grad_norm": 19.392351150512695, + "learning_rate": 5.2698853787969885e-05, + "loss": 1.0027, "step": 152300 }, { - "epoch": 1.55, - "learning_rate": 5.925779998049699e-05, - "loss": 1.1334, + "epoch": 2.0996941390427377, + "grad_norm": 2.0332088470458984, + "learning_rate": 5.269564299531338e-05, + "loss": 0.9885, "step": 152400 }, { - "epoch": 1.55, - "learning_rate": 5.9256332835802596e-05, - "loss": 1.1042, + "epoch": 2.1010718911024773, + "grad_norm": 11.420206069946289, + "learning_rate": 5.269243006215811e-05, + "loss": 0.9107, "step": 152500 }, { - "epoch": 1.55, - "learning_rate": 5.925486426064812e-05, - "loss": 1.2739, + "epoch": 2.1024496431622164, + "grad_norm": 9.256026268005371, + "learning_rate": 5.268921498877702e-05, + "loss": 0.8763, "step": 152600 }, { - "epoch": 1.56, - "learning_rate": 5.9253394255105366e-05, - "loss": 1.1874, + "epoch": 2.103827395221956, + "grad_norm": 4.526012420654297, + "learning_rate": 5.2686029958168464e-05, + "loss": 0.9273, "step": 152700 }, { - "epoch": 1.56, - "learning_rate": 5.925192281924621e-05, - "loss": 1.2252, + "epoch": 2.105205147281695, + "grad_norm": 20.233253479003906, + "learning_rate": 5.2682810626550776e-05, + "loss": 0.9727, "step": 152800 }, { - "epoch": 1.56, - "learning_rate": 5.9250449953142586e-05, - "loss": 1.2749, + "epoch": 2.1065828993414346, + "grad_norm": 6.938904762268066, + "learning_rate": 5.2679589155524485e-05, + "loss": 0.9338, "step": 152900 }, { - "epoch": 1.56, - "learning_rate": 5.924897565686653e-05, - "loss": 1.1701, + "epoch": 2.1079606514011737, + "grad_norm": 50.39978790283203, + "learning_rate": 5.267636554536328e-05, + "loss": 1.0513, "step": 153000 }, { - "epoch": 1.56, - "learning_rate": 5.924751469483263e-05, - "loss": 1.234, + "epoch": 2.1093384034609133, + "grad_norm": 27.855424880981445, + "learning_rate": 5.267313979634102e-05, + "loss": 1.0309, "step": 153100 }, { - "epoch": 1.56, - "learning_rate": 5.9246037552727924e-05, - "loss": 1.2122, + "epoch": 2.1107161555206524, + "grad_norm": 8.242439270019531, + "learning_rate": 5.2669911908731754e-05, + "loss": 0.9814, "step": 153200 }, { - "epoch": 1.56, - "learning_rate": 5.924455898066652e-05, - "loss": 1.1334, + "epoch": 2.112093907580392, + "grad_norm": 7.917934417724609, + "learning_rate": 5.26666818828097e-05, + "loss": 0.9505, "step": 153300 }, { - "epoch": 1.56, - "learning_rate": 5.9243078978720694e-05, - "loss": 1.2941, + "epoch": 2.113471659640131, + "grad_norm": 5.1177568435668945, + "learning_rate": 5.266344971884927e-05, + "loss": 0.9524, "step": 153400 }, { - "epoch": 1.56, - "learning_rate": 5.924159754696283e-05, - "loss": 1.1894, + "epoch": 2.1148494116998706, + "grad_norm": 28.833621978759766, + "learning_rate": 5.266021541712505e-05, + "loss": 0.9389, "step": 153500 }, { - "epoch": 1.56, - "learning_rate": 5.924011468546534e-05, - "loss": 1.1101, + "epoch": 2.1162271637596097, + "grad_norm": 2.6027934551239014, + "learning_rate": 5.265701135288361e-05, + "loss": 0.975, "step": 153600 }, { - "epoch": 1.57, - "learning_rate": 5.9238630394300736e-05, - "loss": 1.1649, + "epoch": 2.1176049158193493, + "grad_norm": 98.79273223876953, + "learning_rate": 5.265377279782709e-05, + "loss": 0.9552, "step": 153700 }, { - "epoch": 1.57, - "learning_rate": 5.92371446735416e-05, - "loss": 1.184, + "epoch": 2.1189826678790884, + "grad_norm": 22.76773452758789, + "learning_rate": 5.265053210582888e-05, + "loss": 1.0472, "step": 153800 }, { - "epoch": 1.57, - "learning_rate": 5.923565752326057e-05, - "loss": 1.1365, + "epoch": 2.120360419938828, + "grad_norm": 17.150712966918945, + "learning_rate": 5.26472892771643e-05, + "loss": 0.9667, "step": 153900 }, { - "epoch": 1.57, - "learning_rate": 5.923416894353035e-05, - "loss": 1.2747, + "epoch": 2.121738171998567, + "grad_norm": 14.188424110412598, + "learning_rate": 5.264404431210883e-05, + "loss": 0.9707, "step": 154000 }, { - "epoch": 1.57, - "learning_rate": 5.9232678934423735e-05, - "loss": 1.2643, + "epoch": 2.1231159240583066, + "grad_norm": 4.750355243682861, + "learning_rate": 5.264079721093818e-05, + "loss": 0.9813, "step": 154100 }, { - "epoch": 1.57, - "learning_rate": 5.923118749601357e-05, - "loss": 1.1458, + "epoch": 2.1244936761180457, + "grad_norm": 158.37037658691406, + "learning_rate": 5.263754797392817e-05, + "loss": 0.9929, "step": 154200 }, { - "epoch": 1.57, - "learning_rate": 5.922969462837278e-05, - "loss": 1.0832, + "epoch": 2.1258714281777853, + "grad_norm": 20.168994903564453, + "learning_rate": 5.2634296601354864e-05, + "loss": 0.9156, "step": 154300 }, { - "epoch": 1.57, - "learning_rate": 5.922820033157435e-05, - "loss": 1.0339, + "epoch": 2.1272491802375244, + "grad_norm": 5.984154224395752, + "learning_rate": 5.263104309349448e-05, + "loss": 1.0161, "step": 154400 }, { - "epoch": 1.57, - "learning_rate": 5.9226704605691344e-05, - "loss": 1.1707, + "epoch": 2.128626932297264, + "grad_norm": 37.03858947753906, + "learning_rate": 5.262778745062341e-05, + "loss": 0.9872, "step": 154500 }, { - "epoch": 1.58, - "learning_rate": 5.9225207450796914e-05, - "loss": 1.1772, + "epoch": 2.130004684357003, + "grad_norm": 12.900752067565918, + "learning_rate": 5.262452967301824e-05, + "loss": 0.9569, "step": 154600 }, { - "epoch": 1.58, - "learning_rate": 5.9223708866964244e-05, - "loss": 1.2733, + "epoch": 2.1313824364167426, + "grad_norm": 11.495844841003418, + "learning_rate": 5.2621269760955744e-05, + "loss": 0.9399, "step": 154700 }, { - "epoch": 1.58, - "learning_rate": 5.9222208854266594e-05, - "loss": 1.1665, + "epoch": 2.1327601884764817, + "grad_norm": 10.074590682983398, + "learning_rate": 5.2618007714712864e-05, + "loss": 1.0478, "step": 154800 }, { - "epoch": 1.58, - "learning_rate": 5.9220707412777335e-05, - "loss": 1.1074, + "epoch": 2.1341379405362213, + "grad_norm": 14.428563117980957, + "learning_rate": 5.261474353456672e-05, + "loss": 0.9832, "step": 154900 }, { - "epoch": 1.58, - "learning_rate": 5.9219204542569856e-05, - "loss": 1.181, + "epoch": 2.1355156925959604, + "grad_norm": 4.966893672943115, + "learning_rate": 5.2611477220794635e-05, + "loss": 0.9286, "step": 155000 }, { - "epoch": 1.58, - "learning_rate": 5.921770024371765e-05, - "loss": 1.1131, + "epoch": 2.1368934446556995, + "grad_norm": 11.962916374206543, + "learning_rate": 5.2608208773674084e-05, + "loss": 1.0126, "step": 155100 }, { - "epoch": 1.58, - "learning_rate": 5.921619451629426e-05, - "loss": 1.2697, + "epoch": 2.138271196715439, + "grad_norm": 13.316461563110352, + "learning_rate": 5.260493819348275e-05, + "loss": 0.9576, "step": 155200 }, { - "epoch": 1.58, - "learning_rate": 5.9214687360373324e-05, - "loss": 1.2122, + "epoch": 2.1396489487751786, + "grad_norm": 7.460152626037598, + "learning_rate": 5.260166548049847e-05, + "loss": 1.0248, "step": 155300 }, { - "epoch": 1.58, - "learning_rate": 5.921319386894241e-05, - "loss": 1.1945, + "epoch": 2.1410267008349178, + "grad_norm": 7.871267318725586, + "learning_rate": 5.2598390634999296e-05, + "loss": 1.0188, "step": 155400 }, { - "epoch": 1.58, - "learning_rate": 5.921168387053062e-05, - "loss": 1.1864, + "epoch": 2.142404452894657, + "grad_norm": 13.87922477722168, + "learning_rate": 5.259511365726343e-05, + "loss": 0.985, "step": 155500 }, { - "epoch": 1.59, - "learning_rate": 5.921017244384183e-05, - "loss": 1.1472, + "epoch": 2.1437822049543964, + "grad_norm": 43.95943832397461, + "learning_rate": 5.259183454756928e-05, + "loss": 0.8258, "step": 155600 }, { - "epoch": 1.59, - "learning_rate": 5.9208659588949905e-05, - "loss": 1.1793, + "epoch": 2.1451599570141355, + "grad_norm": 10.786490440368652, + "learning_rate": 5.258855330619541e-05, + "loss": 0.998, "step": 155700 }, { - "epoch": 1.59, - "learning_rate": 5.9207145305928853e-05, - "loss": 1.2403, + "epoch": 2.146537709073875, + "grad_norm": 24.638206481933594, + "learning_rate": 5.258526993342059e-05, + "loss": 1.0332, "step": 155800 }, { - "epoch": 1.59, - "learning_rate": 5.920562959485268e-05, - "loss": 1.121, + "epoch": 2.147915461133614, + "grad_norm": 9.810965538024902, + "learning_rate": 5.258198442952375e-05, + "loss": 1.0727, "step": 155900 }, { - "epoch": 1.59, - "learning_rate": 5.9204112455795514e-05, - "loss": 1.194, + "epoch": 2.1492932131933538, + "grad_norm": 2.26515531539917, + "learning_rate": 5.257869679478402e-05, + "loss": 0.9942, "step": 156000 }, { - "epoch": 1.59, - "learning_rate": 5.920259388883154e-05, - "loss": 1.182, + "epoch": 2.150670965253093, + "grad_norm": 7.039181709289551, + "learning_rate": 5.257540702948069e-05, + "loss": 0.9363, "step": 156100 }, { - "epoch": 1.59, - "learning_rate": 5.920107389403498e-05, - "loss": 1.1897, + "epoch": 2.1520487173128324, + "grad_norm": 28.103984832763672, + "learning_rate": 5.257211513389324e-05, + "loss": 0.8856, "step": 156200 }, { - "epoch": 1.59, - "learning_rate": 5.919955247148018e-05, - "loss": 1.342, + "epoch": 2.1534264693725715, + "grad_norm": 7.266382694244385, + "learning_rate": 5.2568821108301356e-05, + "loss": 0.8907, "step": 156300 }, { - "epoch": 1.59, - "learning_rate": 5.919802962124151e-05, - "loss": 1.172, + "epoch": 2.154804221432311, + "grad_norm": 11.471376419067383, + "learning_rate": 5.256552495298486e-05, + "loss": 0.9448, "step": 156400 }, { - "epoch": 1.59, - "learning_rate": 5.919650534339345e-05, - "loss": 1.1726, + "epoch": 2.15618197349205, + "grad_norm": 8.607222557067871, + "learning_rate": 5.2562226668223785e-05, + "loss": 0.8891, "step": 156500 }, { - "epoch": 1.6, - "learning_rate": 5.9194979638010506e-05, - "loss": 1.0533, + "epoch": 2.1575597255517898, + "grad_norm": 9.009541511535645, + "learning_rate": 5.255892625429834e-05, + "loss": 0.903, "step": 156600 }, { - "epoch": 1.6, - "learning_rate": 5.919345250516728e-05, - "loss": 1.2937, + "epoch": 2.158937477611529, + "grad_norm": 5.408649444580078, + "learning_rate": 5.255562371148891e-05, + "loss": 0.9355, "step": 156700 }, { - "epoch": 1.6, - "learning_rate": 5.919192394493845e-05, - "loss": 1.2577, + "epoch": 2.1603152296712684, + "grad_norm": 10.264758110046387, + "learning_rate": 5.2552319040076056e-05, + "loss": 0.9703, "step": 156800 }, { - "epoch": 1.6, - "learning_rate": 5.919039395739874e-05, - "loss": 1.1842, + "epoch": 2.1616929817310075, + "grad_norm": 7.09464693069458, + "learning_rate": 5.254901224034054e-05, + "loss": 0.8404, "step": 156900 }, { - "epoch": 1.6, - "learning_rate": 5.9188862542622975e-05, - "loss": 1.2523, + "epoch": 2.163070733790747, + "grad_norm": 8.133198738098145, + "learning_rate": 5.254570331256328e-05, + "loss": 0.8998, "step": 157000 }, { - "epoch": 1.6, - "learning_rate": 5.918732970068602e-05, - "loss": 1.3062, + "epoch": 2.164448485850486, + "grad_norm": 22.95881462097168, + "learning_rate": 5.254239225702539e-05, + "loss": 0.9318, "step": 157100 }, { - "epoch": 1.6, - "learning_rate": 5.918579543166282e-05, - "loss": 1.2295, + "epoch": 2.1658262379102258, + "grad_norm": 6.442829132080078, + "learning_rate": 5.2539079074008164e-05, + "loss": 0.9243, "step": 157200 }, { - "epoch": 1.6, - "learning_rate": 5.918425973562839e-05, - "loss": 1.1519, + "epoch": 2.167203989969965, + "grad_norm": 4.950994491577148, + "learning_rate": 5.253576376379307e-05, + "loss": 0.8612, "step": 157300 }, { - "epoch": 1.6, - "learning_rate": 5.918272261265783e-05, - "loss": 1.2288, + "epoch": 2.1685817420297044, + "grad_norm": 26.30369758605957, + "learning_rate": 5.2532446326661764e-05, + "loss": 0.9551, "step": 157400 }, { - "epoch": 1.6, - "learning_rate": 5.918118406282629e-05, - "loss": 1.1961, + "epoch": 2.1699594940894436, + "grad_norm": 16.370683670043945, + "learning_rate": 5.2529126762896076e-05, + "loss": 0.879, "step": 157500 }, { - "epoch": 1.61, - "learning_rate": 5.9179644086208986e-05, - "loss": 1.2629, + "epoch": 2.171337246149183, + "grad_norm": 5.687817096710205, + "learning_rate": 5.2525805072778024e-05, + "loss": 0.8551, "step": 157600 }, { - "epoch": 1.61, - "learning_rate": 5.917810268288122e-05, - "loss": 1.2248, + "epoch": 2.172714998208922, + "grad_norm": 8.148140907287598, + "learning_rate": 5.252248125658979e-05, + "loss": 0.9787, "step": 157700 }, { - "epoch": 1.61, - "learning_rate": 5.917655985291836e-05, - "loss": 1.2071, + "epoch": 2.1740927502686618, + "grad_norm": 9.443391799926758, + "learning_rate": 5.251915531461377e-05, + "loss": 0.9364, "step": 157800 }, { - "epoch": 1.61, - "learning_rate": 5.917501559639585e-05, - "loss": 1.2202, + "epoch": 2.175470502328401, + "grad_norm": 10.445510864257812, + "learning_rate": 5.2515827247132505e-05, + "loss": 0.8426, "step": 157900 }, { - "epoch": 1.61, - "learning_rate": 5.917346991338917e-05, - "loss": 1.2198, + "epoch": 2.1768482543881404, + "grad_norm": 12.912979125976562, + "learning_rate": 5.251249705442874e-05, + "loss": 0.9348, "step": 158000 }, { - "epoch": 1.61, - "learning_rate": 5.917192280397392e-05, - "loss": 1.0927, + "epoch": 2.1782260064478796, + "grad_norm": 5.275145053863525, + "learning_rate": 5.250916473678538e-05, + "loss": 0.8936, "step": 158100 }, { - "epoch": 1.61, - "learning_rate": 5.917037426822573e-05, - "loss": 1.1159, + "epoch": 2.179603758507619, + "grad_norm": 6.103515148162842, + "learning_rate": 5.250583029448553e-05, + "loss": 0.9887, "step": 158200 }, { - "epoch": 1.61, - "learning_rate": 5.916882430622032e-05, - "loss": 1.1849, + "epoch": 2.1809815105673582, + "grad_norm": 10.050488471984863, + "learning_rate": 5.250249372781247e-05, + "loss": 0.941, "step": 158300 }, { - "epoch": 1.61, - "learning_rate": 5.916727291803347e-05, - "loss": 1.1164, + "epoch": 2.182359262627098, + "grad_norm": 26.635068893432617, + "learning_rate": 5.249915503704966e-05, + "loss": 1.0517, "step": 158400 }, { - "epoch": 1.61, - "learning_rate": 5.916572010374103e-05, - "loss": 1.1085, + "epoch": 2.183737014686837, + "grad_norm": 12.143385887145996, + "learning_rate": 5.249581422248073e-05, + "loss": 0.933, "step": 158500 }, { - "epoch": 1.62, - "learning_rate": 5.916416586341893e-05, - "loss": 1.0363, + "epoch": 2.1851147667465765, + "grad_norm": 5.439852714538574, + "learning_rate": 5.24924712843895e-05, + "loss": 0.9008, "step": 158600 }, { - "epoch": 1.62, - "learning_rate": 5.9162610197143174e-05, - "loss": 1.3396, + "epoch": 2.1864925188063156, + "grad_norm": 9.545210838317871, + "learning_rate": 5.2489126223059984e-05, + "loss": 0.9218, "step": 158700 }, { - "epoch": 1.62, - "learning_rate": 5.91610531049898e-05, - "loss": 1.2658, + "epoch": 2.187870270866055, + "grad_norm": 13.009220123291016, + "learning_rate": 5.248577903877635e-05, + "loss": 0.9617, "step": 158800 }, { - "epoch": 1.62, - "learning_rate": 5.915949458703496e-05, - "loss": 1.2445, + "epoch": 2.1892480229257942, + "grad_norm": 15.801142692565918, + "learning_rate": 5.248242973182296e-05, + "loss": 1.0053, "step": 158900 }, { - "epoch": 1.62, - "learning_rate": 5.9157934643354854e-05, - "loss": 1.1969, + "epoch": 2.190625774985534, + "grad_norm": 20.715030670166016, + "learning_rate": 5.2479078302484346e-05, + "loss": 0.9531, "step": 159000 }, { - "epoch": 1.62, - "learning_rate": 5.915637327402574e-05, - "loss": 1.2443, + "epoch": 2.192003527045273, + "grad_norm": 2.668189525604248, + "learning_rate": 5.247572475104524e-05, + "loss": 0.9391, "step": 159100 }, { - "epoch": 1.62, - "learning_rate": 5.915481047912397e-05, - "loss": 1.1003, + "epoch": 2.1933812791050125, + "grad_norm": 10.495641708374023, + "learning_rate": 5.247236907779055e-05, + "loss": 0.8596, "step": 159200 }, { - "epoch": 1.62, - "learning_rate": 5.915324625872595e-05, - "loss": 1.223, + "epoch": 2.1947590311647516, + "grad_norm": 5.542959690093994, + "learning_rate": 5.246901128300534e-05, + "loss": 0.9221, "step": 159300 }, { - "epoch": 1.62, - "learning_rate": 5.915168061290817e-05, - "loss": 1.0726, + "epoch": 2.1961367832244907, + "grad_norm": 8.880916595458984, + "learning_rate": 5.2465651366974884e-05, + "loss": 0.8193, "step": 159400 }, { - "epoch": 1.63, - "learning_rate": 5.9150113541747186e-05, - "loss": 1.1546, + "epoch": 2.1975145352842302, + "grad_norm": 3.6114373207092285, + "learning_rate": 5.2462289329984626e-05, + "loss": 0.8921, "step": 159500 }, { - "epoch": 1.63, - "learning_rate": 5.91485450453196e-05, - "loss": 1.0942, + "epoch": 2.19889228734397, + "grad_norm": 27.882787704467773, + "learning_rate": 5.245892517232018e-05, + "loss": 0.8985, "step": 159600 }, { - "epoch": 1.63, - "learning_rate": 5.914697512370211e-05, - "loss": 1.1822, + "epoch": 2.200270039403709, + "grad_norm": 13.388383865356445, + "learning_rate": 5.245555889426735e-05, + "loss": 0.9079, "step": 159700 }, { - "epoch": 1.63, - "learning_rate": 5.9145403776971484e-05, - "loss": 1.2163, + "epoch": 2.201647791463448, + "grad_norm": 16.630842208862305, + "learning_rate": 5.245219049611212e-05, + "loss": 0.9598, "step": 159800 }, { - "epoch": 1.63, - "learning_rate": 5.914383100520453e-05, - "loss": 1.1197, + "epoch": 2.2030255435231876, + "grad_norm": 4.307223796844482, + "learning_rate": 5.2448819978140654e-05, + "loss": 0.9551, "step": 159900 }, { - "epoch": 1.63, - "learning_rate": 5.9142256808478173e-05, - "loss": 1.1105, + "epoch": 2.2044032955829267, + "grad_norm": 13.902462005615234, + "learning_rate": 5.2445447340639293e-05, + "loss": 0.9426, "step": 160000 }, { - "epoch": 1.63, - "learning_rate": 5.914068118686937e-05, - "loss": 1.147, + "epoch": 2.2057810476426662, + "grad_norm": 5.826269149780273, + "learning_rate": 5.244207258389456e-05, + "loss": 0.9591, "step": 160100 }, { - "epoch": 1.63, - "learning_rate": 5.913910414045515e-05, - "loss": 1.1338, + "epoch": 2.2071587997024054, + "grad_norm": 3.053081512451172, + "learning_rate": 5.2438695708193164e-05, + "loss": 0.926, "step": 160200 }, { - "epoch": 1.63, - "learning_rate": 5.913752566931264e-05, - "loss": 1.1924, + "epoch": 2.208536551762145, + "grad_norm": 23.950437545776367, + "learning_rate": 5.243531671382198e-05, + "loss": 0.9329, "step": 160300 }, { - "epoch": 1.63, - "learning_rate": 5.9135945773519e-05, - "loss": 1.214, + "epoch": 2.209914303821884, + "grad_norm": 5.027472019195557, + "learning_rate": 5.243193560106806e-05, + "loss": 1.0093, "step": 160400 }, { - "epoch": 1.64, - "learning_rate": 5.913436445315149e-05, - "loss": 1.291, + "epoch": 2.2112920558816236, + "grad_norm": 37.77529525756836, + "learning_rate": 5.242855237021868e-05, + "loss": 0.9378, "step": 160500 }, { - "epoch": 1.64, - "learning_rate": 5.9132781708287425e-05, - "loss": 1.2537, + "epoch": 2.2126698079413627, + "grad_norm": 26.79740333557129, + "learning_rate": 5.242516702156123e-05, + "loss": 0.9427, "step": 160600 }, { - "epoch": 1.64, - "learning_rate": 5.9131213387747644e-05, - "loss": 1.0371, + "epoch": 2.2140475600011023, + "grad_norm": 7.521678924560547, + "learning_rate": 5.2421779555383325e-05, + "loss": 0.9531, "step": 160700 }, { - "epoch": 1.64, - "learning_rate": 5.912962780836572e-05, - "loss": 1.0675, + "epoch": 2.2154253120608414, + "grad_norm": 14.960001945495605, + "learning_rate": 5.2418389971972756e-05, + "loss": 0.972, "step": 160800 }, { - "epoch": 1.64, - "learning_rate": 5.9128040804718844e-05, - "loss": 1.1891, + "epoch": 2.216803064120581, + "grad_norm": 47.28500747680664, + "learning_rate": 5.241499827161746e-05, + "loss": 0.8699, "step": 160900 }, { - "epoch": 1.64, - "learning_rate": 5.91264523768846e-05, - "loss": 1.1541, + "epoch": 2.21818081618032, + "grad_norm": 8.531850814819336, + "learning_rate": 5.2411604454605614e-05, + "loss": 1.0195, "step": 161000 }, { - "epoch": 1.64, - "learning_rate": 5.9124862524940636e-05, - "loss": 1.027, + "epoch": 2.2195585682400596, + "grad_norm": 5.627152442932129, + "learning_rate": 5.240820852122551e-05, + "loss": 0.9033, "step": 161100 }, { - "epoch": 1.64, - "learning_rate": 5.912327124896471e-05, - "loss": 1.2509, + "epoch": 2.2209363202997987, + "grad_norm": 38.93098068237305, + "learning_rate": 5.240481047176568e-05, + "loss": 0.9048, "step": 161200 }, { - "epoch": 1.64, - "learning_rate": 5.912167854903463e-05, - "loss": 1.1014, + "epoch": 2.2223140723595383, + "grad_norm": 7.977657794952393, + "learning_rate": 5.240141030651477e-05, + "loss": 0.9873, "step": 161300 }, { - "epoch": 1.64, - "learning_rate": 5.912008442522825e-05, - "loss": 1.1002, + "epoch": 2.2236918244192774, + "grad_norm": 8.564409255981445, + "learning_rate": 5.239800802576167e-05, + "loss": 0.9275, "step": 161400 }, { - "epoch": 1.65, - "learning_rate": 5.911848887762352e-05, - "loss": 1.2381, + "epoch": 2.225069576479017, + "grad_norm": 11.705387115478516, + "learning_rate": 5.239460362979541e-05, + "loss": 0.7914, "step": 161500 }, { - "epoch": 1.65, - "learning_rate": 5.9116891906298465e-05, - "loss": 1.0596, + "epoch": 2.226447328538756, + "grad_norm": 13.273726463317871, + "learning_rate": 5.2391197118905204e-05, + "loss": 0.9548, "step": 161600 }, { - "epoch": 1.65, - "learning_rate": 5.9115293511331144e-05, - "loss": 1.2705, + "epoch": 2.2278250805984956, + "grad_norm": 4.967896938323975, + "learning_rate": 5.238778849338048e-05, + "loss": 0.932, "step": 161700 }, { - "epoch": 1.65, - "learning_rate": 5.9113693692799724e-05, - "loss": 1.0786, + "epoch": 2.2292028326582347, + "grad_norm": 9.133955955505371, + "learning_rate": 5.238437775351078e-05, + "loss": 0.8389, "step": 161800 }, { - "epoch": 1.65, - "learning_rate": 5.911209245078242e-05, - "loss": 1.1559, + "epoch": 2.2305805847179743, + "grad_norm": 14.688834190368652, + "learning_rate": 5.238096489958589e-05, + "loss": 0.8829, "step": 161900 }, { - "epoch": 1.65, - "learning_rate": 5.911048978535753e-05, - "loss": 1.1835, + "epoch": 2.2319583367777134, + "grad_norm": 40.99361038208008, + "learning_rate": 5.2377549931895733e-05, + "loss": 0.8876, "step": 162000 }, { - "epoch": 1.65, - "learning_rate": 5.910888569660341e-05, - "loss": 1.0689, + "epoch": 2.233336088837453, + "grad_norm": 54.10387420654297, + "learning_rate": 5.237413285073044e-05, + "loss": 0.9795, "step": 162100 }, { - "epoch": 1.65, - "learning_rate": 5.91072801845985e-05, - "loss": 1.3089, + "epoch": 2.234713840897192, + "grad_norm": 31.367080688476562, + "learning_rate": 5.237071365638031e-05, + "loss": 0.9148, "step": 162200 }, { - "epoch": 1.65, - "learning_rate": 5.910567324942129e-05, - "loss": 1.0768, + "epoch": 2.2360915929569316, + "grad_norm": 9.466647148132324, + "learning_rate": 5.2367326572666135e-05, + "loss": 1.0082, "step": 162300 }, { - "epoch": 1.65, - "learning_rate": 5.9104064891150344e-05, - "loss": 1.0874, + "epoch": 2.2374693450166707, + "grad_norm": 10.659646987915039, + "learning_rate": 5.236390317394253e-05, + "loss": 0.9343, "step": 162400 }, { - "epoch": 1.66, - "learning_rate": 5.910245510986432e-05, - "loss": 1.1226, + "epoch": 2.2388470970764103, + "grad_norm": 8.352649688720703, + "learning_rate": 5.2360477662903155e-05, + "loss": 0.8674, "step": 162500 }, { - "epoch": 1.66, - "learning_rate": 5.910084390564191e-05, - "loss": 1.077, + "epoch": 2.2402248491361494, + "grad_norm": 11.551562309265137, + "learning_rate": 5.235705003983901e-05, + "loss": 0.9781, "step": 162600 }, { - "epoch": 1.66, - "learning_rate": 5.909923127856189e-05, - "loss": 1.1045, + "epoch": 2.241602601195889, + "grad_norm": 101.57791900634766, + "learning_rate": 5.23536203050413e-05, + "loss": 1.0003, "step": 162700 }, { - "epoch": 1.66, - "learning_rate": 5.9097617228703125e-05, - "loss": 1.2107, + "epoch": 2.242980353255628, + "grad_norm": 33.70100402832031, + "learning_rate": 5.235018845880139e-05, + "loss": 1.015, "step": 162800 }, { - "epoch": 1.66, - "learning_rate": 5.9096001756144524e-05, - "loss": 1.1376, + "epoch": 2.2443581053153676, + "grad_norm": 8.317302703857422, + "learning_rate": 5.234675450141084e-05, + "loss": 0.9795, "step": 162900 }, { - "epoch": 1.66, - "learning_rate": 5.9094384860965065e-05, - "loss": 1.0462, + "epoch": 2.2457358573751067, + "grad_norm": 11.664278030395508, + "learning_rate": 5.234331843316138e-05, + "loss": 0.9885, "step": 163000 }, { - "epoch": 1.66, - "learning_rate": 5.909276654324382e-05, - "loss": 1.1375, + "epoch": 2.2471136094348463, + "grad_norm": 5.981961250305176, + "learning_rate": 5.233988025434493e-05, + "loss": 1.014, "step": 163100 }, { - "epoch": 1.66, - "learning_rate": 5.909114680305991e-05, - "loss": 1.1075, + "epoch": 2.2484913614945854, + "grad_norm": 9.815963745117188, + "learning_rate": 5.233643996525356e-05, + "loss": 1.007, "step": 163200 }, { - "epoch": 1.66, - "learning_rate": 5.908952564049252e-05, - "loss": 1.1731, + "epoch": 2.249869113554325, + "grad_norm": 32.083648681640625, + "learning_rate": 5.2332997566179563e-05, + "loss": 1.0017, "step": 163300 }, { - "epoch": 1.66, - "learning_rate": 5.908790305562092e-05, - "loss": 1.1392, + "epoch": 2.251246865614064, + "grad_norm": 64.1452407836914, + "learning_rate": 5.2329553057415375e-05, + "loss": 1.0477, "step": 163400 }, { - "epoch": 1.67, - "learning_rate": 5.908627904852446e-05, - "loss": 1.0818, + "epoch": 2.2526246176738036, + "grad_norm": 5.024203300476074, + "learning_rate": 5.2326106439253624e-05, + "loss": 0.8959, "step": 163500 }, { - "epoch": 1.67, - "learning_rate": 5.908465361928253e-05, - "loss": 1.1729, + "epoch": 2.2540023697335427, + "grad_norm": 14.767755508422852, + "learning_rate": 5.232272670719978e-05, + "loss": 0.9719, "step": 163600 }, { - "epoch": 1.67, - "learning_rate": 5.90830267679746e-05, - "loss": 1.0986, + "epoch": 2.255380121793282, + "grad_norm": 18.60188102722168, + "learning_rate": 5.231927591329488e-05, + "loss": 0.9632, "step": 163700 }, { - "epoch": 1.67, - "learning_rate": 5.9081398494680226e-05, - "loss": 1.1971, + "epoch": 2.2567578738530214, + "grad_norm": 12.099367141723633, + "learning_rate": 5.231582301086551e-05, + "loss": 0.9621, "step": 163800 }, { - "epoch": 1.67, - "learning_rate": 5.907976879947901e-05, - "loss": 1.0444, + "epoch": 2.258135625912761, + "grad_norm": 16.069604873657227, + "learning_rate": 5.2312368000205026e-05, + "loss": 1.016, "step": 163900 }, { - "epoch": 1.67, - "learning_rate": 5.907813768245064e-05, - "loss": 1.1125, + "epoch": 2.2595133779725, + "grad_norm": 8.607207298278809, + "learning_rate": 5.230891088160694e-05, + "loss": 0.9365, "step": 164000 }, { - "epoch": 1.67, - "learning_rate": 5.9076505143674856e-05, - "loss": 1.0496, + "epoch": 2.260891130032239, + "grad_norm": 5.177475929260254, + "learning_rate": 5.230545165536495e-05, + "loss": 1.0815, "step": 164100 }, { - "epoch": 1.67, - "learning_rate": 5.907487118323151e-05, - "loss": 1.1377, + "epoch": 2.2622688820919787, + "grad_norm": 153.39865112304688, + "learning_rate": 5.230199032177294e-05, + "loss": 1.0885, "step": 164200 }, { - "epoch": 1.67, - "learning_rate": 5.907323580120046e-05, - "loss": 1.2694, + "epoch": 2.2636466341517183, + "grad_norm": 9.966987609863281, + "learning_rate": 5.229852688112496e-05, + "loss": 0.955, "step": 164300 }, { - "epoch": 1.67, - "learning_rate": 5.9071598997661684e-05, - "loss": 1.1462, + "epoch": 2.2650243862114574, + "grad_norm": 6.414557456970215, + "learning_rate": 5.229506133371527e-05, + "loss": 1.0635, "step": 164400 }, { - "epoch": 1.68, - "learning_rate": 5.906996077269521e-05, - "loss": 1.1246, + "epoch": 2.2664021382711965, + "grad_norm": 64.12127685546875, + "learning_rate": 5.2291593679838246e-05, + "loss": 0.922, "step": 164500 }, { - "epoch": 1.68, - "learning_rate": 5.9068321126381126e-05, - "loss": 1.1021, + "epoch": 2.267779890330936, + "grad_norm": 17.174057006835938, + "learning_rate": 5.228812391978852e-05, + "loss": 0.9401, "step": 164600 }, { - "epoch": 1.68, - "learning_rate": 5.9066696476510437e-05, - "loss": 1.0859, + "epoch": 2.269157642390675, + "grad_norm": 6.744351863861084, + "learning_rate": 5.2284652053860846e-05, + "loss": 1.02, "step": 164700 }, { - "epoch": 1.68, - "learning_rate": 5.906505400195321e-05, - "loss": 0.999, + "epoch": 2.2705353944504147, + "grad_norm": 7.3058390617370605, + "learning_rate": 5.228121283248696e-05, + "loss": 0.9649, "step": 164800 }, { - "epoch": 1.68, - "learning_rate": 5.9063410106288284e-05, - "loss": 1.0934, + "epoch": 2.271913146510154, + "grad_norm": 14.383864402770996, + "learning_rate": 5.227773677673986e-05, + "loss": 1.0199, "step": 164900 }, { - "epoch": 1.68, - "learning_rate": 5.906176478959605e-05, - "loss": 1.1116, + "epoch": 2.2732908985698934, + "grad_norm": 212.670654296875, + "learning_rate": 5.227425861599725e-05, + "loss": 0.9731, "step": 165000 }, { - "epoch": 1.68, - "learning_rate": 5.906011805195696e-05, - "loss": 1.0252, + "epoch": 2.2746686506296325, + "grad_norm": 8.136039733886719, + "learning_rate": 5.2270778350554635e-05, + "loss": 0.9479, "step": 165100 }, { - "epoch": 1.68, - "learning_rate": 5.9058469893451504e-05, - "loss": 1.15, + "epoch": 2.276046402689372, + "grad_norm": 59.46452331542969, + "learning_rate": 5.2267295980707675e-05, + "loss": 0.978, "step": 165200 }, { - "epoch": 1.68, - "learning_rate": 5.905682031416028e-05, - "loss": 1.1213, + "epoch": 2.277424154749111, + "grad_norm": 15.516117095947266, + "learning_rate": 5.2263811506752215e-05, + "loss": 1.0419, "step": 165300 }, { - "epoch": 1.69, - "learning_rate": 5.9055169314163956e-05, - "loss": 1.1387, + "epoch": 2.2788019068088508, + "grad_norm": 18.471847534179688, + "learning_rate": 5.226032492898427e-05, + "loss": 0.9516, "step": 165400 }, { - "epoch": 1.69, - "learning_rate": 5.905351689354324e-05, - "loss": 1.0706, + "epoch": 2.28017965886859, + "grad_norm": 9.486163139343262, + "learning_rate": 5.225683624770004e-05, + "loss": 0.9295, "step": 165500 }, { - "epoch": 1.69, - "learning_rate": 5.905186305237893e-05, - "loss": 1.1451, + "epoch": 2.2815574109283294, + "grad_norm": 6.116447925567627, + "learning_rate": 5.2253345463195935e-05, + "loss": 0.9983, "step": 165600 }, { - "epoch": 1.69, - "learning_rate": 5.905020779075188e-05, - "loss": 1.1136, + "epoch": 2.2829351629880685, + "grad_norm": 10.514097213745117, + "learning_rate": 5.224985257576848e-05, + "loss": 0.9038, "step": 165700 }, { - "epoch": 1.69, - "learning_rate": 5.904855110874305e-05, - "loss": 1.1479, + "epoch": 2.284312915047808, + "grad_norm": 27.088058471679688, + "learning_rate": 5.224635758571444e-05, + "loss": 0.98, "step": 165800 }, { - "epoch": 1.69, - "learning_rate": 5.904689300643341e-05, - "loss": 1.0833, + "epoch": 2.285690667107547, + "grad_norm": 27.673587799072266, + "learning_rate": 5.224286049333071e-05, + "loss": 0.9489, "step": 165900 }, { - "epoch": 1.69, - "learning_rate": 5.904523348390405e-05, - "loss": 1.124, + "epoch": 2.2870684191672868, + "grad_norm": 11.980297088623047, + "learning_rate": 5.223936129891441e-05, + "loss": 1.0186, "step": 166000 }, { - "epoch": 1.69, - "learning_rate": 5.904357254123611e-05, - "loss": 1.0448, + "epoch": 2.288446171227026, + "grad_norm": 120.99385833740234, + "learning_rate": 5.22358600027628e-05, + "loss": 0.9645, "step": 166100 }, { - "epoch": 1.69, - "learning_rate": 5.9041910178510786e-05, - "loss": 1.1762, + "epoch": 2.2898239232867654, + "grad_norm": 59.4659538269043, + "learning_rate": 5.2232356605173326e-05, + "loss": 1.0393, "step": 166200 }, { - "epoch": 1.69, - "learning_rate": 5.904024639580937e-05, - "loss": 1.1621, + "epoch": 2.2912016753465045, + "grad_norm": 112.47696685791016, + "learning_rate": 5.222885110644364e-05, + "loss": 1.1404, "step": 166300 }, { - "epoch": 1.7, - "learning_rate": 5.903858119321321e-05, - "loss": 1.1899, + "epoch": 2.292579427406244, + "grad_norm": 22.85833168029785, + "learning_rate": 5.2225378593265444e-05, + "loss": 1.0471, "step": 166400 }, { - "epoch": 1.7, - "learning_rate": 5.903691457080373e-05, - "loss": 1.1389, + "epoch": 2.293957179465983, + "grad_norm": 16.62672233581543, + "learning_rate": 5.222186891415288e-05, + "loss": 1.0863, "step": 166500 }, { - "epoch": 1.7, - "learning_rate": 5.90352465286624e-05, - "loss": 1.1143, + "epoch": 2.2953349315257228, + "grad_norm": 52.5628547668457, + "learning_rate": 5.2218357134791074e-05, + "loss": 0.9931, "step": 166600 }, { - "epoch": 1.7, - "learning_rate": 5.903357706687079e-05, - "loss": 1.0186, + "epoch": 2.296712683585462, + "grad_norm": 32.63602828979492, + "learning_rate": 5.2214843255478376e-05, + "loss": 1.0147, "step": 166700 }, { - "epoch": 1.7, - "learning_rate": 5.9031922901350724e-05, - "loss": 1.1633, + "epoch": 2.2980904356452014, + "grad_norm": 5.233877658843994, + "learning_rate": 5.221132727651331e-05, + "loss": 1.0359, "step": 166800 }, { - "epoch": 1.7, - "learning_rate": 5.903025061469797e-05, - "loss": 1.1949, + "epoch": 2.2994681877049405, + "grad_norm": 14.243330001831055, + "learning_rate": 5.220780919819456e-05, + "loss": 1.1741, "step": 166900 }, { - "epoch": 1.7, - "learning_rate": 5.90285769086392e-05, - "loss": 1.1562, + "epoch": 2.30084593976468, + "grad_norm": 15.849098205566406, + "learning_rate": 5.220428902082102e-05, + "loss": 1.0128, "step": 167000 }, { - "epoch": 1.7, - "learning_rate": 5.902690178325626e-05, - "loss": 1.1467, + "epoch": 2.302223691824419, + "grad_norm": 51.943416595458984, + "learning_rate": 5.220076674469173e-05, + "loss": 1.0337, "step": 167100 }, { - "epoch": 1.7, - "learning_rate": 5.9025225238631035e-05, - "loss": 1.1466, + "epoch": 2.3036014438841588, + "grad_norm": 16.928619384765625, + "learning_rate": 5.219724237010594e-05, + "loss": 1.0639, "step": 167200 }, { - "epoch": 1.7, - "learning_rate": 5.902354727484552e-05, - "loss": 1.0918, + "epoch": 2.304979195943898, + "grad_norm": 10.309880256652832, + "learning_rate": 5.219371589736307e-05, + "loss": 1.019, "step": 167300 }, { - "epoch": 1.71, - "learning_rate": 5.9021867891981734e-05, - "loss": 1.0937, + "epoch": 2.3063569480036374, + "grad_norm": 16.806623458862305, + "learning_rate": 5.21901873267627e-05, + "loss": 1.0678, "step": 167400 }, { - "epoch": 1.71, - "learning_rate": 5.902018709012181e-05, - "loss": 1.184, + "epoch": 2.3077347000633766, + "grad_norm": 17.494747161865234, + "learning_rate": 5.21866566586046e-05, + "loss": 0.9179, "step": 167500 }, { - "epoch": 1.71, - "learning_rate": 5.901850486934792e-05, - "loss": 1.0813, + "epoch": 2.309112452123116, + "grad_norm": 40.3848876953125, + "learning_rate": 5.2183123893188716e-05, + "loss": 0.9432, "step": 167600 }, { - "epoch": 1.71, - "learning_rate": 5.901682122974231e-05, - "loss": 1.056, + "epoch": 2.3104902041828552, + "grad_norm": 135.49258422851562, + "learning_rate": 5.217958903081518e-05, + "loss": 1.0602, "step": 167700 }, { - "epoch": 1.71, - "learning_rate": 5.901513617138731e-05, - "loss": 1.0522, + "epoch": 2.311867956242595, + "grad_norm": 14.116009712219238, + "learning_rate": 5.2176052071784306e-05, + "loss": 1.0863, "step": 167800 }, { - "epoch": 1.71, - "learning_rate": 5.90134496943653e-05, - "loss": 1.2201, + "epoch": 2.313245708302334, + "grad_norm": 15.387771606445312, + "learning_rate": 5.217251301639656e-05, + "loss": 1.1613, "step": 167900 }, { - "epoch": 1.71, - "learning_rate": 5.9011761798758754e-05, - "loss": 1.0886, + "epoch": 2.314623460362073, + "grad_norm": 88.38285064697266, + "learning_rate": 5.216897186495261e-05, + "loss": 1.1406, "step": 168000 }, { - "epoch": 1.71, - "learning_rate": 5.901007248465018e-05, - "loss": 1.2309, + "epoch": 2.3160012124218126, + "grad_norm": 11.002593040466309, + "learning_rate": 5.216542861775329e-05, + "loss": 1.0048, "step": 168100 }, { - "epoch": 1.71, - "learning_rate": 5.9008381752122184e-05, - "loss": 1.0794, + "epoch": 2.317378964481552, + "grad_norm": 8.143379211425781, + "learning_rate": 5.216188327509963e-05, + "loss": 0.9663, "step": 168200 }, { - "epoch": 1.71, - "learning_rate": 5.9006689601257436e-05, - "loss": 1.17, + "epoch": 2.3187567165412912, + "grad_norm": 33.01509475708008, + "learning_rate": 5.21583358372928e-05, + "loss": 1.0433, "step": 168300 }, { - "epoch": 1.72, - "learning_rate": 5.900499603213867e-05, - "loss": 1.1347, + "epoch": 2.3201344686010303, + "grad_norm": 13.792733192443848, + "learning_rate": 5.2154786304634206e-05, + "loss": 0.9347, "step": 168400 }, { - "epoch": 1.72, - "learning_rate": 5.9003301044848694e-05, - "loss": 1.0973, + "epoch": 2.32151222066077, + "grad_norm": 38.95616912841797, + "learning_rate": 5.215123467742538e-05, + "loss": 1.1345, "step": 168500 }, { - "epoch": 1.72, - "learning_rate": 5.900160463947037e-05, - "loss": 1.149, + "epoch": 2.3228899727205095, + "grad_norm": 17.847572326660156, + "learning_rate": 5.214768095596805e-05, + "loss": 1.0973, "step": 168600 }, { - "epoch": 1.72, - "learning_rate": 5.899990681608666e-05, - "loss": 1.0234, + "epoch": 2.3242677247802486, + "grad_norm": 120.91565704345703, + "learning_rate": 5.214412514056413e-05, + "loss": 1.1131, "step": 168700 }, { - "epoch": 1.72, - "learning_rate": 5.899820757478057e-05, - "loss": 1.0795, + "epoch": 2.3256454768399877, + "grad_norm": 11.627106666564941, + "learning_rate": 5.21405672315157e-05, + "loss": 0.9853, "step": 168800 }, { - "epoch": 1.72, - "learning_rate": 5.899650691563518e-05, - "loss": 1.1501, + "epoch": 2.3270232288997272, + "grad_norm": 17.328262329101562, + "learning_rate": 5.2137007229125026e-05, + "loss": 1.0606, "step": 168900 }, { - "epoch": 1.72, - "learning_rate": 5.899480483873364e-05, - "loss": 1.0991, + "epoch": 2.3284009809594663, + "grad_norm": 29.377147674560547, + "learning_rate": 5.2133445133694536e-05, + "loss": 1.0498, "step": 169000 }, { - "epoch": 1.72, - "learning_rate": 5.8993101344159175e-05, - "loss": 1.0569, + "epoch": 2.329778733019206, + "grad_norm": 52.74782943725586, + "learning_rate": 5.212988094552686e-05, + "loss": 1.0497, "step": 169100 }, { - "epoch": 1.72, - "learning_rate": 5.8991396431995085e-05, - "loss": 1.1131, + "epoch": 2.331156485078945, + "grad_norm": 25.169078826904297, + "learning_rate": 5.21263146649248e-05, + "loss": 1.0042, "step": 169200 }, { - "epoch": 1.72, - "learning_rate": 5.89897071726378e-05, - "loss": 1.1271, + "epoch": 2.3325342371386846, + "grad_norm": 66.08953857421875, + "learning_rate": 5.2122746292191314e-05, + "loss": 1.0627, "step": 169300 }, { - "epoch": 1.73, - "learning_rate": 5.89879994397184e-05, - "loss": 1.1876, + "epoch": 2.3339119891984237, + "grad_norm": 34.516014099121094, + "learning_rate": 5.211917582762956e-05, + "loss": 1.1022, "step": 169400 }, { - "epoch": 1.73, - "learning_rate": 5.898629028945882e-05, - "loss": 1.0453, + "epoch": 2.3352897412581632, + "grad_norm": 25.2557373046875, + "learning_rate": 5.211560327154288e-05, + "loss": 1.0437, "step": 169500 }, { - "epoch": 1.73, - "learning_rate": 5.898457972194263e-05, - "loss": 1.1227, + "epoch": 2.3366674933179024, + "grad_norm": 7.747533798217773, + "learning_rate": 5.211202862423476e-05, + "loss": 1.0407, "step": 169600 }, { - "epoch": 1.73, - "learning_rate": 5.898286773725345e-05, - "loss": 1.0569, + "epoch": 2.338045245377642, + "grad_norm": 9.991101264953613, + "learning_rate": 5.2108451886008894e-05, + "loss": 1.0045, "step": 169700 }, { - "epoch": 1.73, - "learning_rate": 5.8981154335475006e-05, - "loss": 1.1641, + "epoch": 2.339422997437381, + "grad_norm": 10.435796737670898, + "learning_rate": 5.210487305716914e-05, + "loss": 1.0129, "step": 169800 }, { - "epoch": 1.73, - "learning_rate": 5.897943951669107e-05, - "loss": 1.0877, + "epoch": 2.3408007494971206, + "grad_norm": 8.149819374084473, + "learning_rate": 5.210129213801955e-05, + "loss": 0.9709, "step": 169900 }, { - "epoch": 1.73, - "learning_rate": 5.897772328098548e-05, - "loss": 1.0179, + "epoch": 2.3421785015568597, + "grad_norm": 32.893741607666016, + "learning_rate": 5.209770912886434e-05, + "loss": 1.0459, "step": 170000 }, { - "epoch": 1.73, - "learning_rate": 5.897600562844215e-05, - "loss": 1.0443, + "epoch": 2.3435562536165992, + "grad_norm": 10.252907752990723, + "learning_rate": 5.209412403000789e-05, + "loss": 0.9632, "step": 170100 }, { - "epoch": 1.73, - "learning_rate": 5.897428655914506e-05, - "loss": 0.9895, + "epoch": 2.3449340056763384, + "grad_norm": 123.37541961669922, + "learning_rate": 5.2090536841754784e-05, + "loss": 1.0678, "step": 170200 }, { - "epoch": 1.74, - "learning_rate": 5.8972566073178265e-05, - "loss": 0.973, + "epoch": 2.346311757736078, + "grad_norm": 15.898186683654785, + "learning_rate": 5.208694756440977e-05, + "loss": 1.0805, "step": 170300 }, { - "epoch": 1.74, - "learning_rate": 5.89708441706259e-05, - "loss": 1.103, + "epoch": 2.347689509795817, + "grad_norm": 11.285989761352539, + "learning_rate": 5.2083356198277765e-05, + "loss": 0.9717, "step": 170400 }, { - "epoch": 1.74, - "learning_rate": 5.8969120851572135e-05, - "loss": 1.1178, + "epoch": 2.3490672618555566, + "grad_norm": 62.288143157958984, + "learning_rate": 5.2079762743663884e-05, + "loss": 1.1064, "step": 170500 }, { - "epoch": 1.74, - "learning_rate": 5.8967396116101244e-05, - "loss": 1.138, + "epoch": 2.3504450139152957, + "grad_norm": 27.738956451416016, + "learning_rate": 5.207616720087343e-05, + "loss": 1.0938, "step": 170600 }, { - "epoch": 1.74, - "learning_rate": 5.896566996429755e-05, - "loss": 1.0396, + "epoch": 2.3518227659750353, + "grad_norm": 9.988628387451172, + "learning_rate": 5.2072569570211815e-05, + "loss": 1.108, "step": 170700 }, { - "epoch": 1.74, - "learning_rate": 5.896394239624544e-05, - "loss": 1.0276, + "epoch": 2.3532005180347744, + "grad_norm": 24.56854820251465, + "learning_rate": 5.206896985198471e-05, + "loss": 1.0553, "step": 170800 }, { - "epoch": 1.74, - "learning_rate": 5.896221341202942e-05, - "loss": 1.0412, + "epoch": 2.354578270094514, + "grad_norm": 4.584719181060791, + "learning_rate": 5.206536804649793e-05, + "loss": 0.9619, "step": 170900 }, { - "epoch": 1.74, - "learning_rate": 5.896048301173398e-05, - "loss": 1.1085, + "epoch": 2.355956022154253, + "grad_norm": 24.88559341430664, + "learning_rate": 5.2061764154057456e-05, + "loss": 1.0665, "step": 171000 }, { - "epoch": 1.74, - "learning_rate": 5.895875119544376e-05, - "loss": 1.1414, + "epoch": 2.3573337742139926, + "grad_norm": 27.593503952026367, + "learning_rate": 5.205815817496946e-05, + "loss": 1.0, "step": 171100 }, { - "epoch": 1.74, - "learning_rate": 5.895701796324341e-05, - "loss": 1.1306, + "epoch": 2.3587115262737317, + "grad_norm": 35.88474655151367, + "learning_rate": 5.2054550109540284e-05, + "loss": 0.9997, "step": 171200 }, { - "epoch": 1.75, - "learning_rate": 5.89552833152177e-05, - "loss": 1.0438, + "epoch": 2.3600892783334713, + "grad_norm": 32.62916946411133, + "learning_rate": 5.205093995807646e-05, + "loss": 1.0596, "step": 171300 }, { - "epoch": 1.75, - "learning_rate": 5.895354725145143e-05, - "loss": 1.0865, + "epoch": 2.3614670303932104, + "grad_norm": 11.110734939575195, + "learning_rate": 5.204732772088468e-05, + "loss": 1.0892, "step": 171400 }, { - "epoch": 1.75, - "learning_rate": 5.895182715383093e-05, - "loss": 1.0463, + "epoch": 2.36284478245295, + "grad_norm": 69.90022277832031, + "learning_rate": 5.204371339827183e-05, + "loss": 1.0014, "step": 171500 }, { - "epoch": 1.75, - "learning_rate": 5.895008827299354e-05, - "loss": 0.988, + "epoch": 2.364222534512689, + "grad_norm": 32.14912414550781, + "learning_rate": 5.2040096990544966e-05, + "loss": 1.0531, "step": 171600 }, { - "epoch": 1.75, - "learning_rate": 5.894834797666961e-05, - "loss": 1.1612, + "epoch": 2.3656002865724286, + "grad_norm": 53.77708435058594, + "learning_rate": 5.203647849801131e-05, + "loss": 1.0277, "step": 171700 }, { - "epoch": 1.75, - "learning_rate": 5.894660626494422e-05, - "loss": 0.998, + "epoch": 2.3669780386321677, + "grad_norm": 8.331936836242676, + "learning_rate": 5.2032857920978283e-05, + "loss": 0.845, "step": 171800 }, { - "epoch": 1.75, - "learning_rate": 5.8944863137902525e-05, - "loss": 1.2281, + "epoch": 2.3683557906919073, + "grad_norm": 7.75789737701416, + "learning_rate": 5.2029235259753464e-05, + "loss": 0.9146, "step": 171900 }, { - "epoch": 1.75, - "learning_rate": 5.894311859562975e-05, - "loss": 1.1075, + "epoch": 2.3697335427516464, + "grad_norm": 48.754512786865234, + "learning_rate": 5.2025610514644614e-05, + "loss": 0.8927, "step": 172000 }, { - "epoch": 1.75, - "learning_rate": 5.89413726382112e-05, - "loss": 0.9906, + "epoch": 2.371111294811386, + "grad_norm": 4.2286248207092285, + "learning_rate": 5.202198368595969e-05, + "loss": 0.9599, "step": 172100 }, { - "epoch": 1.75, - "learning_rate": 5.893962526573225e-05, - "loss": 1.0387, + "epoch": 2.372489046871125, + "grad_norm": 95.67806243896484, + "learning_rate": 5.201835477400679e-05, + "loss": 1.065, "step": 172200 }, { - "epoch": 1.76, - "learning_rate": 5.8937876478278324e-05, - "loss": 1.0356, + "epoch": 2.373866798930864, + "grad_norm": 24.00796127319336, + "learning_rate": 5.2014723779094215e-05, + "loss": 1.0615, "step": 172300 }, { - "epoch": 1.76, - "learning_rate": 5.893612627593492e-05, - "loss": 1.0154, + "epoch": 2.3752445509906037, + "grad_norm": 302.600830078125, + "learning_rate": 5.201109070153044e-05, + "loss": 0.9921, "step": 172400 }, { - "epoch": 1.76, - "learning_rate": 5.893437465878763e-05, - "loss": 1.1299, + "epoch": 2.3766223030503433, + "grad_norm": 16.198699951171875, + "learning_rate": 5.200745554162412e-05, + "loss": 1.0142, "step": 172500 }, { - "epoch": 1.76, - "learning_rate": 5.8932621626922094e-05, - "loss": 1.0584, + "epoch": 2.3780000551100824, + "grad_norm": 6.950168132781982, + "learning_rate": 5.2003818299684066e-05, + "loss": 1.0386, "step": 172600 }, { - "epoch": 1.76, - "learning_rate": 5.8930867180424027e-05, - "loss": 1.1945, + "epoch": 2.3793778071698215, + "grad_norm": 6.602731704711914, + "learning_rate": 5.2000178976019284e-05, + "loss": 1.0338, "step": 172700 }, { - "epoch": 1.76, - "learning_rate": 5.8929111319379195e-05, - "loss": 1.2254, + "epoch": 2.380755559229561, + "grad_norm": 10.891432762145996, + "learning_rate": 5.1996537570938964e-05, + "loss": 0.9747, "step": 172800 }, { - "epoch": 1.76, - "learning_rate": 5.892735404387346e-05, - "loss": 1.0428, + "epoch": 2.3821333112893006, + "grad_norm": 104.41752624511719, + "learning_rate": 5.199289408475245e-05, + "loss": 0.951, "step": 172900 }, { - "epoch": 1.76, - "learning_rate": 5.892559535399275e-05, - "loss": 1.0948, + "epoch": 2.3835110633490397, + "grad_norm": 13.820969581604004, + "learning_rate": 5.198924851776928e-05, + "loss": 0.9435, "step": 173000 }, { - "epoch": 1.76, - "learning_rate": 5.892383524982303e-05, - "loss": 0.9721, + "epoch": 2.384888815408779, + "grad_norm": 21.821395874023438, + "learning_rate": 5.198560087029916e-05, + "loss": 1.0227, "step": 173100 }, { - "epoch": 1.76, - "learning_rate": 5.892207373145039e-05, - "loss": 1.0578, + "epoch": 2.3862665674685184, + "grad_norm": 22.758373260498047, + "learning_rate": 5.1981951142651974e-05, + "loss": 1.0461, "step": 173200 }, { - "epoch": 1.77, - "learning_rate": 5.892031079896094e-05, - "loss": 1.1486, + "epoch": 2.3876443195282575, + "grad_norm": 22.348026275634766, + "learning_rate": 5.197829933513779e-05, + "loss": 1.0892, "step": 173300 }, { - "epoch": 1.77, - "learning_rate": 5.8918546452440874e-05, - "loss": 1.0789, + "epoch": 2.389022071587997, + "grad_norm": 10.256428718566895, + "learning_rate": 5.1974645448066855e-05, + "loss": 0.9678, "step": 173400 }, { - "epoch": 1.77, - "learning_rate": 5.891679835657985e-05, - "loss": 1.1121, + "epoch": 2.390399823647736, + "grad_norm": 9.233114242553711, + "learning_rate": 5.197098948174957e-05, + "loss": 1.0123, "step": 173500 }, { - "epoch": 1.77, - "learning_rate": 5.8915031196395586e-05, - "loss": 1.1555, + "epoch": 2.3917775757074757, + "grad_norm": 8.449286460876465, + "learning_rate": 5.196733143649654e-05, + "loss": 0.9685, "step": 173600 }, { - "epoch": 1.77, - "learning_rate": 5.891326262243885e-05, - "loss": 1.1175, + "epoch": 2.393155327767215, + "grad_norm": 19.234813690185547, + "learning_rate": 5.1963671312618535e-05, + "loss": 0.9747, "step": 173700 }, { - "epoch": 1.77, - "learning_rate": 5.891149263479612e-05, - "loss": 1.0122, + "epoch": 2.3945330798269544, + "grad_norm": 30.018075942993164, + "learning_rate": 5.1960009110426495e-05, + "loss": 0.9633, "step": 173800 }, { - "epoch": 1.77, - "learning_rate": 5.8909721233553926e-05, - "loss": 1.1746, + "epoch": 2.3959108318866935, + "grad_norm": 65.04824829101562, + "learning_rate": 5.195634483023154e-05, + "loss": 1.0313, "step": 173900 }, { - "epoch": 1.77, - "learning_rate": 5.8907948418798904e-05, - "loss": 1.0349, + "epoch": 2.397288583946433, + "grad_norm": 16.118972778320312, + "learning_rate": 5.195267847234498e-05, + "loss": 1.0113, "step": 174000 }, { - "epoch": 1.77, - "learning_rate": 5.890617419061771e-05, - "loss": 1.0381, + "epoch": 2.398666336006172, + "grad_norm": 32.16482162475586, + "learning_rate": 5.194901003707827e-05, + "loss": 1.0195, "step": 174100 }, { - "epoch": 1.77, - "learning_rate": 5.89043985490971e-05, - "loss": 1.012, + "epoch": 2.4000440880659117, + "grad_norm": 33.35468673706055, + "learning_rate": 5.194533952474309e-05, + "loss": 1.0382, "step": 174200 }, { - "epoch": 1.78, - "learning_rate": 5.8902621494323886e-05, - "loss": 1.0617, + "epoch": 2.401421840125651, + "grad_norm": 17.260162353515625, + "learning_rate": 5.1941666935651253e-05, + "loss": 1.0441, "step": 174300 }, { - "epoch": 1.78, - "learning_rate": 5.8900843026384976e-05, - "loss": 1.0212, + "epoch": 2.4027995921853904, + "grad_norm": 10.887475967407227, + "learning_rate": 5.1938029027047506e-05, + "loss": 1.0326, "step": 174400 }, { - "epoch": 1.78, - "learning_rate": 5.8899063145367305e-05, - "loss": 1.1118, + "epoch": 2.4041773442451295, + "grad_norm": 187.56747436523438, + "learning_rate": 5.193435230613833e-05, + "loss": 1.0227, "step": 174500 }, { - "epoch": 1.78, - "learning_rate": 5.8897281851357915e-05, - "loss": 0.9886, + "epoch": 2.405555096304869, + "grad_norm": 25.112625122070312, + "learning_rate": 5.1930673509405926e-05, + "loss": 1.05, "step": 174600 }, { - "epoch": 1.78, - "learning_rate": 5.889549914444388e-05, - "loss": 1.0445, + "epoch": 2.406932848364608, + "grad_norm": 15.029642105102539, + "learning_rate": 5.192699263716282e-05, + "loss": 1.1454, "step": 174700 }, { - "epoch": 1.78, - "learning_rate": 5.8893715024712385e-05, - "loss": 1.1851, + "epoch": 2.4083106004243477, + "grad_norm": 97.6905288696289, + "learning_rate": 5.192330968972171e-05, + "loss": 1.0397, "step": 174800 }, { - "epoch": 1.78, - "learning_rate": 5.889192949225065e-05, - "loss": 0.9408, + "epoch": 2.409688352484087, + "grad_norm": 7.099515914916992, + "learning_rate": 5.19196246673955e-05, + "loss": 1.0869, "step": 174900 }, { - "epoch": 1.78, - "learning_rate": 5.889014254714598e-05, - "loss": 1.1603, + "epoch": 2.4110661045438264, + "grad_norm": 29.096044540405273, + "learning_rate": 5.191593757049726e-05, + "loss": 1.0363, "step": 175000 }, { - "epoch": 1.78, - "learning_rate": 5.888835418948576e-05, - "loss": 0.9967, + "epoch": 2.4124438566035655, + "grad_norm": 10.986734390258789, + "learning_rate": 5.1912248399340194e-05, + "loss": 1.0172, "step": 175100 }, { - "epoch": 1.78, - "learning_rate": 5.88865644193574e-05, - "loss": 1.1263, + "epoch": 2.413821608663305, + "grad_norm": 25.678735733032227, + "learning_rate": 5.1908594076953767e-05, + "loss": 1.0575, "step": 175200 }, { - "epoch": 1.79, - "learning_rate": 5.888479115566452e-05, - "loss": 1.0883, + "epoch": 2.415199360723044, + "grad_norm": 20.406526565551758, + "learning_rate": 5.190490077895427e-05, + "loss": 1.0295, "step": 175300 }, { - "epoch": 1.79, - "learning_rate": 5.888299857498501e-05, - "loss": 1.1163, + "epoch": 2.4165771127827838, + "grad_norm": 21.655616760253906, + "learning_rate": 5.19012054076336e-05, + "loss": 1.0918, "step": 175400 }, { - "epoch": 1.79, - "learning_rate": 5.888120458209924e-05, - "loss": 1.0606, + "epoch": 2.417954864842523, + "grad_norm": 26.264522552490234, + "learning_rate": 5.18975079633057e-05, + "loss": 1.0617, "step": 175500 }, { - "epoch": 1.79, - "learning_rate": 5.887940917709491e-05, - "loss": 1.1004, + "epoch": 2.4193326169022624, + "grad_norm": 77.81526947021484, + "learning_rate": 5.1893808446284675e-05, + "loss": 1.0516, "step": 175600 }, { - "epoch": 1.79, - "learning_rate": 5.8877612360059835e-05, - "loss": 1.0697, + "epoch": 2.4207103689620015, + "grad_norm": 67.17523193359375, + "learning_rate": 5.1890106856884824e-05, + "loss": 1.0606, "step": 175700 }, { - "epoch": 1.79, - "learning_rate": 5.8875814131081826e-05, - "loss": 1.0931, + "epoch": 2.422088121021741, + "grad_norm": 30.02215576171875, + "learning_rate": 5.188640319542062e-05, + "loss": 1.2209, "step": 175800 }, { - "epoch": 1.79, - "learning_rate": 5.887401449024884e-05, - "loss": 1.2231, + "epoch": 2.42346587308148, + "grad_norm": 206.22950744628906, + "learning_rate": 5.1882697462206705e-05, + "loss": 1.0157, "step": 175900 }, { - "epoch": 1.79, - "learning_rate": 5.887221343764886e-05, - "loss": 1.1316, + "epoch": 2.4248436251412198, + "grad_norm": 14.736010551452637, + "learning_rate": 5.18789896575579e-05, + "loss": 1.0217, "step": 176000 }, { - "epoch": 1.79, - "learning_rate": 5.8870429005000236e-05, - "loss": 0.9989, + "epoch": 2.426221377200959, + "grad_norm": 204.29269409179688, + "learning_rate": 5.18752797817892e-05, + "loss": 1.1247, "step": 176100 }, { - "epoch": 1.8, - "learning_rate": 5.886862514324597e-05, - "loss": 1.2651, + "epoch": 2.4275991292606984, + "grad_norm": 140.85768127441406, + "learning_rate": 5.187156783521578e-05, + "loss": 0.9863, "step": 176200 }, { - "epoch": 1.8, - "learning_rate": 5.886681986998823e-05, - "loss": 0.9657, + "epoch": 2.4289768813204375, + "grad_norm": 22.160306930541992, + "learning_rate": 5.186785381815299e-05, + "loss": 0.9873, "step": 176300 }, { - "epoch": 1.8, - "learning_rate": 5.886501318531526e-05, - "loss": 1.1431, + "epoch": 2.430354633380177, + "grad_norm": 8.243571281433105, + "learning_rate": 5.186413773091634e-05, + "loss": 1.0438, "step": 176400 }, { - "epoch": 1.8, - "learning_rate": 5.88632050893154e-05, - "loss": 1.0394, + "epoch": 2.431732385439916, + "grad_norm": 16.818946838378906, + "learning_rate": 5.186041957382156e-05, + "loss": 1.0263, "step": 176500 }, { - "epoch": 1.8, - "learning_rate": 5.8861395582077055e-05, - "loss": 1.0307, + "epoch": 2.4331101374996553, + "grad_norm": 4.928921222686768, + "learning_rate": 5.1856699347184505e-05, + "loss": 1.1613, "step": 176600 }, { - "epoch": 1.8, - "learning_rate": 5.885960277985749e-05, - "loss": 1.0666, + "epoch": 2.434487889559395, + "grad_norm": 129.56832885742188, + "learning_rate": 5.1853014284521496e-05, + "loss": 1.1088, "step": 176700 }, { - "epoch": 1.8, - "learning_rate": 5.885779046451785e-05, - "loss": 1.0585, + "epoch": 2.4358656416191344, + "grad_norm": 44.18943405151367, + "learning_rate": 5.184928994043577e-05, + "loss": 1.1488, "step": 176800 }, { - "epoch": 1.8, - "learning_rate": 5.885597673820447e-05, - "loss": 1.1727, + "epoch": 2.4372433936788735, + "grad_norm": 123.67875671386719, + "learning_rate": 5.184556352775329e-05, + "loss": 1.119, "step": 176900 }, { - "epoch": 1.8, - "learning_rate": 5.885416160100604e-05, - "loss": 1.1319, + "epoch": 2.4386211457386127, + "grad_norm": 9.233516693115234, + "learning_rate": 5.184183504679064e-05, + "loss": 1.0325, "step": 177000 }, { - "epoch": 1.8, - "learning_rate": 5.885234505301129e-05, - "loss": 1.0306, + "epoch": 2.439998897798352, + "grad_norm": 10.702132225036621, + "learning_rate": 5.183810449786457e-05, + "loss": 1.0569, "step": 177100 }, { - "epoch": 1.81, - "learning_rate": 5.885052709430905e-05, - "loss": 1.0385, + "epoch": 2.4413766498580918, + "grad_norm": 21.172279357910156, + "learning_rate": 5.183437188129201e-05, + "loss": 1.1594, "step": 177200 }, { - "epoch": 1.81, - "learning_rate": 5.88487077249882e-05, - "loss": 1.0999, + "epoch": 2.442754401917831, + "grad_norm": 129.290771484375, + "learning_rate": 5.1830637197390064e-05, + "loss": 1.038, "step": 177300 }, { - "epoch": 1.81, - "learning_rate": 5.8846886945137715e-05, - "loss": 0.9836, + "epoch": 2.44413215397757, + "grad_norm": 7.982485771179199, + "learning_rate": 5.182690044647601e-05, + "loss": 1.1337, "step": 177400 }, { - "epoch": 1.81, - "learning_rate": 5.88450647548466e-05, - "loss": 1.1406, + "epoch": 2.4455099060373096, + "grad_norm": 12.42431354522705, + "learning_rate": 5.182316162886731e-05, + "loss": 1.1621, "step": 177500 }, { - "epoch": 1.81, - "learning_rate": 5.884324115420395e-05, - "loss": 1.0799, + "epoch": 2.4468876580970487, + "grad_norm": 176.325927734375, + "learning_rate": 5.1819420744881596e-05, + "loss": 1.0764, "step": 177600 }, { - "epoch": 1.81, - "learning_rate": 5.884141614329895e-05, - "loss": 1.1657, + "epoch": 2.4482654101567882, + "grad_norm": 24.430160522460938, + "learning_rate": 5.181567779483667e-05, + "loss": 0.9985, "step": 177700 }, { - "epoch": 1.81, - "learning_rate": 5.8839589722220814e-05, - "loss": 1.0296, + "epoch": 2.4496431622165273, + "grad_norm": 4.872775077819824, + "learning_rate": 5.1811932779050515e-05, + "loss": 1.0255, "step": 177800 }, { - "epoch": 1.81, - "learning_rate": 5.883776189105885e-05, - "loss": 1.0576, + "epoch": 2.451020914276267, + "grad_norm": 10.347536087036133, + "learning_rate": 5.1808185697841296e-05, + "loss": 1.0522, "step": 177900 }, { - "epoch": 1.81, - "learning_rate": 5.883593264990241e-05, - "loss": 1.1511, + "epoch": 2.452398666336006, + "grad_norm": 3.4556097984313965, + "learning_rate": 5.180443655152734e-05, + "loss": 1.0476, "step": 178000 }, { - "epoch": 1.81, - "learning_rate": 5.883410199884096e-05, - "loss": 1.057, + "epoch": 2.4537764183957456, + "grad_norm": 18.893156051635742, + "learning_rate": 5.1800685340427155e-05, + "loss": 1.0728, "step": 178100 }, { - "epoch": 1.82, - "learning_rate": 5.8832269937963985e-05, - "loss": 1.0776, + "epoch": 2.4551541704554847, + "grad_norm": 18.359251022338867, + "learning_rate": 5.179693206485944e-05, + "loss": 0.9061, "step": 178200 }, { - "epoch": 1.82, - "learning_rate": 5.883043646736108e-05, - "loss": 1.1126, + "epoch": 2.4565319225152242, + "grad_norm": 147.23263549804688, + "learning_rate": 5.1793176725143034e-05, + "loss": 1.0574, "step": 178300 }, { - "epoch": 1.82, - "learning_rate": 5.8828601587121874e-05, - "loss": 0.9161, + "epoch": 2.4579096745749633, + "grad_norm": 5.758816242218018, + "learning_rate": 5.178941932159698e-05, + "loss": 0.9852, "step": 178400 }, { - "epoch": 1.82, - "learning_rate": 5.88267652973361e-05, - "loss": 1.0497, + "epoch": 2.459287426634703, + "grad_norm": 33.01610565185547, + "learning_rate": 5.1785659854540494e-05, + "loss": 0.9077, "step": 178500 }, { - "epoch": 1.82, - "learning_rate": 5.882492759809353e-05, - "loss": 1.0773, + "epoch": 2.460665178694442, + "grad_norm": 17.409378051757812, + "learning_rate": 5.178189832429296e-05, + "loss": 0.9479, "step": 178600 }, { - "epoch": 1.82, - "learning_rate": 5.882308848948401e-05, - "loss": 1.1272, + "epoch": 2.4620429307541816, + "grad_norm": 17.433063507080078, + "learning_rate": 5.1778134731173935e-05, + "loss": 0.9505, "step": 178700 }, { - "epoch": 1.82, - "learning_rate": 5.882124797159748e-05, - "loss": 1.0945, + "epoch": 2.4634206828139207, + "grad_norm": 7.8560261726379395, + "learning_rate": 5.177436907550317e-05, + "loss": 1.1017, "step": 178800 }, { - "epoch": 1.82, - "learning_rate": 5.881940604452392e-05, - "loss": 1.1188, + "epoch": 2.4647984348736602, + "grad_norm": 8.070338249206543, + "learning_rate": 5.177060135760056e-05, + "loss": 0.9516, "step": 178900 }, { - "epoch": 1.82, - "learning_rate": 5.8817562708353386e-05, - "loss": 1.0852, + "epoch": 2.4661761869333994, + "grad_norm": 6.128262519836426, + "learning_rate": 5.176683157778619e-05, + "loss": 0.9875, "step": 179000 }, { - "epoch": 1.82, - "learning_rate": 5.881571796317601e-05, - "loss": 1.0829, + "epoch": 2.467553938993139, + "grad_norm": 13.260516166687012, + "learning_rate": 5.176305973638033e-05, + "loss": 0.8901, "step": 179100 }, { - "epoch": 1.83, - "learning_rate": 5.881387180908199e-05, - "loss": 1.0352, + "epoch": 2.468931691052878, + "grad_norm": 4.990565299987793, + "learning_rate": 5.175928583370342e-05, + "loss": 1.0459, "step": 179200 }, { - "epoch": 1.83, - "learning_rate": 5.881202424616159e-05, - "loss": 1.1804, + "epoch": 2.4703094431126176, + "grad_norm": 19.502981185913086, + "learning_rate": 5.1755509870076066e-05, + "loss": 0.9791, "step": 179300 }, { - "epoch": 1.83, - "learning_rate": 5.881017527450515e-05, - "loss": 1.1625, + "epoch": 2.4716871951723567, + "grad_norm": 9.662759780883789, + "learning_rate": 5.175173184581906e-05, + "loss": 0.8444, "step": 179400 }, { - "epoch": 1.83, - "learning_rate": 5.880832489420306e-05, - "loss": 1.0713, + "epoch": 2.4730649472320962, + "grad_norm": 164.9662628173828, + "learning_rate": 5.174795176125336e-05, + "loss": 0.983, "step": 179500 }, { - "epoch": 1.83, - "learning_rate": 5.8806473105345806e-05, - "loss": 0.9646, + "epoch": 2.4744426992918354, + "grad_norm": 54.17001724243164, + "learning_rate": 5.174416961670011e-05, + "loss": 0.9716, "step": 179600 }, { - "epoch": 1.83, - "learning_rate": 5.880461990802393e-05, - "loss": 1.1763, + "epoch": 2.475820451351575, + "grad_norm": 12.545207977294922, + "learning_rate": 5.174038541248062e-05, + "loss": 0.9388, "step": 179700 }, { - "epoch": 1.83, - "learning_rate": 5.880276530232803e-05, - "loss": 1.1165, + "epoch": 2.477198203411314, + "grad_norm": 7.745179176330566, + "learning_rate": 5.1736599148916374e-05, + "loss": 0.9997, "step": 179800 }, { - "epoch": 1.83, - "learning_rate": 5.8800909288348805e-05, - "loss": 1.1069, + "epoch": 2.4785759554710536, + "grad_norm": 14.741572380065918, + "learning_rate": 5.173281082632903e-05, + "loss": 1.0515, "step": 179900 }, { - "epoch": 1.83, - "learning_rate": 5.879905186617697e-05, - "loss": 1.0449, + "epoch": 2.4799537075307927, + "grad_norm": 10.962214469909668, + "learning_rate": 5.172902044504045e-05, + "loss": 0.9865, "step": 180000 }, { - "epoch": 1.83, - "learning_rate": 5.879719303590338e-05, - "loss": 1.1108, + "epoch": 2.4813314595905323, + "grad_norm": 6.911766052246094, + "learning_rate": 5.172522800537261e-05, + "loss": 0.8742, "step": 180100 }, { - "epoch": 1.84, - "learning_rate": 5.87953327976189e-05, - "loss": 0.9414, + "epoch": 2.4827092116502714, + "grad_norm": 7.514871120452881, + "learning_rate": 5.172143350764773e-05, + "loss": 0.9228, "step": 180200 }, { - "epoch": 1.84, - "learning_rate": 5.8793471151414484e-05, - "loss": 1.0686, + "epoch": 2.484086963710011, + "grad_norm": 7.460342884063721, + "learning_rate": 5.171763695218814e-05, + "loss": 1.0701, "step": 180300 }, { - "epoch": 1.84, - "learning_rate": 5.8791608097381165e-05, - "loss": 1.0198, + "epoch": 2.48546471576975, + "grad_norm": 12.996829986572266, + "learning_rate": 5.1713838339316415e-05, + "loss": 0.912, "step": 180400 }, { - "epoch": 1.84, - "learning_rate": 5.8789743635610015e-05, - "loss": 1.0388, + "epoch": 2.4868424678294896, + "grad_norm": 12.467066764831543, + "learning_rate": 5.171003766935523e-05, + "loss": 0.9402, "step": 180500 }, { - "epoch": 1.84, - "learning_rate": 5.878787776619223e-05, - "loss": 1.0546, + "epoch": 2.4882202198892287, + "grad_norm": 7.390972137451172, + "learning_rate": 5.170623494262749e-05, + "loss": 1.0215, "step": 180600 }, { - "epoch": 1.84, - "learning_rate": 5.8786010489219e-05, - "loss": 1.0832, + "epoch": 2.4895979719489683, + "grad_norm": 5.795259952545166, + "learning_rate": 5.170243015945626e-05, + "loss": 0.8947, "step": 180700 }, { - "epoch": 1.84, - "learning_rate": 5.8784141804781654e-05, - "loss": 1.1233, + "epoch": 2.4909757240087074, + "grad_norm": 12.004082679748535, + "learning_rate": 5.169862332016476e-05, + "loss": 1.0102, "step": 180800 }, { - "epoch": 1.84, - "learning_rate": 5.8782271712971554e-05, - "loss": 1.0932, + "epoch": 2.4923534760684465, + "grad_norm": 12.660604476928711, + "learning_rate": 5.169481442507642e-05, + "loss": 0.9221, "step": 180900 }, { - "epoch": 1.84, - "learning_rate": 5.878040021388012e-05, - "loss": 0.9733, + "epoch": 2.493731228128186, + "grad_norm": 7.932154178619385, + "learning_rate": 5.169100347451481e-05, + "loss": 0.9068, "step": 181000 }, { - "epoch": 1.85, - "learning_rate": 5.877852730759887e-05, - "loss": 0.9454, + "epoch": 2.4951089801879256, + "grad_norm": 14.801414489746094, + "learning_rate": 5.168719046880369e-05, + "loss": 0.8937, "step": 181100 }, { - "epoch": 1.85, - "learning_rate": 5.8776652994219373e-05, - "loss": 0.9843, + "epoch": 2.4964867322476647, + "grad_norm": 41.460079193115234, + "learning_rate": 5.1683375408267006e-05, + "loss": 0.9151, "step": 181200 }, { - "epoch": 1.85, - "learning_rate": 5.877477727383328e-05, - "loss": 1.0707, + "epoch": 2.497864484307404, + "grad_norm": 6.121121406555176, + "learning_rate": 5.167955829322886e-05, + "loss": 0.8914, "step": 181300 }, { - "epoch": 1.85, - "learning_rate": 5.8772900146532296e-05, - "loss": 1.1087, + "epoch": 2.4992422363671434, + "grad_norm": 8.627535820007324, + "learning_rate": 5.167573912401353e-05, + "loss": 0.8863, "step": 181400 }, { - "epoch": 1.85, - "learning_rate": 5.877104040471292e-05, - "loss": 1.0862, + "epoch": 2.500619988426883, + "grad_norm": 9.960061073303223, + "learning_rate": 5.167191790094548e-05, + "loss": 0.8927, "step": 181500 }, { - "epoch": 1.85, - "learning_rate": 5.876916047792442e-05, - "loss": 1.0858, + "epoch": 2.501997740486622, + "grad_norm": 6.799235820770264, + "learning_rate": 5.1668094624349345e-05, + "loss": 0.868, "step": 181600 }, { - "epoch": 1.85, - "learning_rate": 5.876727914449566e-05, - "loss": 1.0127, + "epoch": 2.503375492546361, + "grad_norm": 5.950893402099609, + "learning_rate": 5.166426929454992e-05, + "loss": 0.8329, "step": 181700 }, { - "epoch": 1.85, - "learning_rate": 5.876539640451863e-05, - "loss": 0.997, + "epoch": 2.5047532446061007, + "grad_norm": 12.782723426818848, + "learning_rate": 5.16604419118722e-05, + "loss": 0.9383, "step": 181800 }, { - "epoch": 1.85, - "learning_rate": 5.8763512258085376e-05, - "loss": 0.9871, + "epoch": 2.5061309966658403, + "grad_norm": 11.052231788635254, + "learning_rate": 5.1656612476641346e-05, + "loss": 0.9806, "step": 181900 }, { - "epoch": 1.85, - "learning_rate": 5.876162670528802e-05, - "loss": 1.087, + "epoch": 2.5075087487255794, + "grad_norm": 15.06791877746582, + "learning_rate": 5.165278098918266e-05, + "loss": 0.9745, "step": 182000 }, { - "epoch": 1.86, - "learning_rate": 5.875973974621876e-05, - "loss": 0.9743, + "epoch": 2.5088865007853185, + "grad_norm": 9.330639839172363, + "learning_rate": 5.164894744982167e-05, + "loss": 0.8876, "step": 182100 }, { - "epoch": 1.86, - "learning_rate": 5.8757851380969854e-05, - "loss": 1.0385, + "epoch": 2.510264252845058, + "grad_norm": 15.254342079162598, + "learning_rate": 5.164511185888406e-05, + "loss": 0.9242, "step": 182200 }, { - "epoch": 1.86, - "learning_rate": 5.875596160963364e-05, - "loss": 1.1426, + "epoch": 2.5116420049047976, + "grad_norm": 14.822551727294922, + "learning_rate": 5.1641274216695665e-05, + "loss": 0.9226, "step": 182300 }, { - "epoch": 1.86, - "learning_rate": 5.87540704323025e-05, - "loss": 1.1371, + "epoch": 2.5130197569645367, + "grad_norm": 4.59323263168335, + "learning_rate": 5.1637434523582514e-05, + "loss": 0.9647, "step": 182400 }, { - "epoch": 1.86, - "learning_rate": 5.8752177849068916e-05, - "loss": 1.103, + "epoch": 2.514397509024276, + "grad_norm": 18.973844528198242, + "learning_rate": 5.163359277987081e-05, + "loss": 0.8483, "step": 182500 }, { - "epoch": 1.86, - "learning_rate": 5.875028386002542e-05, - "loss": 1.1149, + "epoch": 2.5157752610840154, + "grad_norm": 10.772690773010254, + "learning_rate": 5.1629748985886946e-05, + "loss": 0.9064, "step": 182600 }, { - "epoch": 1.86, - "learning_rate": 5.874838846526462e-05, - "loss": 1.0722, + "epoch": 2.5171530131437545, + "grad_norm": 7.380763053894043, + "learning_rate": 5.1625903141957455e-05, + "loss": 0.9273, "step": 182700 }, { - "epoch": 1.86, - "learning_rate": 5.874649166487918e-05, - "loss": 1.1381, + "epoch": 2.518530765203494, + "grad_norm": 4.440961837768555, + "learning_rate": 5.1622055248409066e-05, + "loss": 0.8626, "step": 182800 }, { - "epoch": 1.86, - "learning_rate": 5.8744593458961856e-05, - "loss": 0.9331, + "epoch": 2.519908517263233, + "grad_norm": 4.579497337341309, + "learning_rate": 5.161820530556867e-05, + "loss": 0.9964, "step": 182900 }, { - "epoch": 1.86, - "learning_rate": 5.874269384760544e-05, - "loss": 1.0214, + "epoch": 2.5212862693229727, + "grad_norm": 7.019351005554199, + "learning_rate": 5.161435331376335e-05, + "loss": 0.8699, "step": 183000 }, { - "epoch": 1.87, - "learning_rate": 5.874079283090283e-05, - "loss": 1.1014, + "epoch": 2.522664021382712, + "grad_norm": 7.681735515594482, + "learning_rate": 5.161049927332034e-05, + "loss": 0.9026, "step": 183100 }, { - "epoch": 1.87, - "learning_rate": 5.873889040894696e-05, - "loss": 1.1651, + "epoch": 2.5240417734424514, + "grad_norm": 12.842411041259766, + "learning_rate": 5.1606643184567076e-05, + "loss": 0.8147, "step": 183200 }, { - "epoch": 1.87, - "learning_rate": 5.873698658183086e-05, - "loss": 1.0628, + "epoch": 2.5254195255021905, + "grad_norm": 5.394651412963867, + "learning_rate": 5.160278504783114e-05, + "loss": 1.0233, "step": 183300 }, { - "epoch": 1.87, - "learning_rate": 5.8735081349647605e-05, - "loss": 1.0845, + "epoch": 2.52679727756193, + "grad_norm": 11.24244499206543, + "learning_rate": 5.159892486344031e-05, + "loss": 0.7975, "step": 183400 }, { - "epoch": 1.87, - "learning_rate": 5.8733174712490353e-05, - "loss": 0.9792, + "epoch": 2.528175029621669, + "grad_norm": 3.0735654830932617, + "learning_rate": 5.1595062631722525e-05, + "loss": 0.8718, "step": 183500 }, { - "epoch": 1.87, - "learning_rate": 5.873126667045233e-05, - "loss": 0.984, + "epoch": 2.5295527816814087, + "grad_norm": 15.372994422912598, + "learning_rate": 5.1591198353005896e-05, + "loss": 0.9473, "step": 183600 }, { - "epoch": 1.87, - "learning_rate": 5.8729357223626825e-05, - "loss": 0.9166, + "epoch": 2.530930533741148, + "grad_norm": 10.221742630004883, + "learning_rate": 5.158733202761872e-05, + "loss": 0.8634, "step": 183700 }, { - "epoch": 1.87, - "learning_rate": 5.872744637210721e-05, - "loss": 1.0896, + "epoch": 2.5323082858008874, + "grad_norm": 6.982173442840576, + "learning_rate": 5.1583463655889457e-05, + "loss": 0.8723, "step": 183800 }, { - "epoch": 1.87, - "learning_rate": 5.8725534115986896e-05, - "loss": 1.0412, + "epoch": 2.5336860378606265, + "grad_norm": 6.5016303062438965, + "learning_rate": 5.1579593238146746e-05, + "loss": 0.9272, "step": 183900 }, { - "epoch": 1.87, - "learning_rate": 5.8723620455359386e-05, - "loss": 1.0795, + "epoch": 2.535063789920366, + "grad_norm": 6.839928150177002, + "learning_rate": 5.1575720774719396e-05, + "loss": 0.9469, "step": 184000 }, { - "epoch": 1.88, - "learning_rate": 5.872170539031825e-05, - "loss": 0.981, + "epoch": 2.536441541980105, + "grad_norm": 7.967733860015869, + "learning_rate": 5.15718462659364e-05, + "loss": 1.0197, "step": 184100 }, { - "epoch": 1.88, - "learning_rate": 5.8719788920957125e-05, - "loss": 1.1081, + "epoch": 2.5378192940398447, + "grad_norm": 11.0763521194458, + "learning_rate": 5.1567969712126904e-05, + "loss": 0.8719, "step": 184200 }, { - "epoch": 1.88, - "learning_rate": 5.8717871047369704e-05, - "loss": 1.1002, + "epoch": 2.539197046099584, + "grad_norm": 5.825921535491943, + "learning_rate": 5.1564091113620266e-05, + "loss": 0.8551, "step": 184300 }, { - "epoch": 1.88, - "learning_rate": 5.8715951769649777e-05, - "loss": 1.0586, + "epoch": 2.5405747981593234, + "grad_norm": 8.032723426818848, + "learning_rate": 5.156021047074596e-05, + "loss": 0.8672, "step": 184400 }, { - "epoch": 1.88, - "learning_rate": 5.871403108789118e-05, - "loss": 0.9811, + "epoch": 2.5419525502190625, + "grad_norm": 27.34185791015625, + "learning_rate": 5.15563277838337e-05, + "loss": 0.9162, "step": 184500 }, { - "epoch": 1.88, - "learning_rate": 5.8712109002187806e-05, - "loss": 1.1085, + "epoch": 2.543330302278802, + "grad_norm": 13.580251693725586, + "learning_rate": 5.1552443053213316e-05, + "loss": 0.9123, "step": 184600 }, { - "epoch": 1.88, - "learning_rate": 5.8710185512633655e-05, - "loss": 1.053, + "epoch": 2.544708054338541, + "grad_norm": 13.947186470031738, + "learning_rate": 5.154855627921483e-05, + "loss": 0.9009, "step": 184700 }, { - "epoch": 1.88, - "learning_rate": 5.870826061932277e-05, - "loss": 0.9772, + "epoch": 2.5460858063982803, + "grad_norm": 14.822541236877441, + "learning_rate": 5.1544667462168475e-05, + "loss": 0.9732, "step": 184800 }, { - "epoch": 1.88, - "learning_rate": 5.8706334322349254e-05, - "loss": 1.0585, + "epoch": 2.54746355845802, + "grad_norm": 28.581523895263672, + "learning_rate": 5.1540776602404595e-05, + "loss": 0.9473, "step": 184900 }, { - "epoch": 1.88, - "learning_rate": 5.87044066218073e-05, - "loss": 1.0163, + "epoch": 2.5488413105177594, + "grad_norm": 80.29841613769531, + "learning_rate": 5.1536883700253764e-05, + "loss": 0.8771, "step": 185000 }, { - "epoch": 1.89, - "learning_rate": 5.870247751779117e-05, - "loss": 1.043, + "epoch": 2.5502190625774985, + "grad_norm": 1.720413327217102, + "learning_rate": 5.1532988756046684e-05, + "loss": 0.9222, "step": 185100 }, { - "epoch": 1.89, - "learning_rate": 5.8700547010395175e-05, - "loss": 1.0392, + "epoch": 2.5515968146372376, + "grad_norm": 6.913283348083496, + "learning_rate": 5.1529091770114254e-05, + "loss": 0.9653, "step": 185200 }, { - "epoch": 1.89, - "learning_rate": 5.86986150997137e-05, - "loss": 0.9951, + "epoch": 2.552974566696977, + "grad_norm": 18.21382713317871, + "learning_rate": 5.152519274278755e-05, + "loss": 0.837, "step": 185300 }, { - "epoch": 1.89, - "learning_rate": 5.8696681785841225e-05, - "loss": 0.958, + "epoch": 2.5543523187567168, + "grad_norm": 8.810418128967285, + "learning_rate": 5.1521291674397807e-05, + "loss": 0.9495, "step": 185400 }, { - "epoch": 1.89, - "learning_rate": 5.869474706887224e-05, - "loss": 1.0135, + "epoch": 2.555730070816456, + "grad_norm": 30.632896423339844, + "learning_rate": 5.1517388565276446e-05, + "loss": 0.947, "step": 185500 }, { - "epoch": 1.89, - "learning_rate": 5.869281094890138e-05, - "loss": 1.0654, + "epoch": 2.557107822876195, + "grad_norm": 21.800125122070312, + "learning_rate": 5.151348341575506e-05, + "loss": 0.9462, "step": 185600 }, { - "epoch": 1.89, - "learning_rate": 5.8690873426023295e-05, - "loss": 0.9894, + "epoch": 2.5584855749359345, + "grad_norm": 4.775737285614014, + "learning_rate": 5.15095762261654e-05, + "loss": 0.9657, "step": 185700 }, { - "epoch": 1.89, - "learning_rate": 5.868893450033272e-05, - "loss": 1.0581, + "epoch": 2.559863326995674, + "grad_norm": 6.699219703674316, + "learning_rate": 5.15056669968394e-05, + "loss": 0.888, "step": 185800 }, { - "epoch": 1.89, - "learning_rate": 5.868699417192445e-05, - "loss": 1.0247, + "epoch": 2.561241079055413, + "grad_norm": 14.519498825073242, + "learning_rate": 5.150179485089045e-05, + "loss": 0.9088, "step": 185900 }, { - "epoch": 1.9, - "learning_rate": 5.8685052440893355e-05, - "loss": 1.2023, + "epoch": 2.5626188311151523, + "grad_norm": 13.107704162597656, + "learning_rate": 5.149788156347737e-05, + "loss": 1.0211, "step": 186000 }, { - "epoch": 1.9, - "learning_rate": 5.8683109307334384e-05, - "loss": 0.9881, + "epoch": 2.563996583174892, + "grad_norm": 169.83856201171875, + "learning_rate": 5.149396623732147e-05, + "loss": 0.886, "step": 186100 }, { - "epoch": 1.9, - "learning_rate": 5.8681164771342535e-05, - "loss": 1.0777, + "epoch": 2.5653743352346314, + "grad_norm": 4.7520647048950195, + "learning_rate": 5.1490048872755394e-05, + "loss": 0.9538, "step": 186200 }, { - "epoch": 1.9, - "learning_rate": 5.86792188330129e-05, - "loss": 1.0457, + "epoch": 2.5667520872943705, + "grad_norm": 51.831214904785156, + "learning_rate": 5.148612947011193e-05, + "loss": 0.9556, "step": 186300 }, { - "epoch": 1.9, - "learning_rate": 5.8677271492440604e-05, - "loss": 0.9898, + "epoch": 2.5681298393541097, + "grad_norm": 21.457719802856445, + "learning_rate": 5.148220802972406e-05, + "loss": 0.9755, "step": 186400 }, { - "epoch": 1.9, - "learning_rate": 5.8675342244088385e-05, - "loss": 1.0059, + "epoch": 2.569507591413849, + "grad_norm": 13.799973487854004, + "learning_rate": 5.147828455192492e-05, + "loss": 0.9267, "step": 186500 }, { - "epoch": 1.9, - "learning_rate": 5.867339211333655e-05, - "loss": 1.1224, + "epoch": 2.5708853434735888, + "grad_norm": 4.978957653045654, + "learning_rate": 5.147435903704784e-05, + "loss": 0.8939, "step": 186600 }, { - "epoch": 1.9, - "learning_rate": 5.867144058062695e-05, - "loss": 1.0864, + "epoch": 2.572263095533328, + "grad_norm": 44.25135803222656, + "learning_rate": 5.147043148542631e-05, + "loss": 0.9141, "step": 186700 }, { - "epoch": 1.9, - "learning_rate": 5.8669487646055e-05, - "loss": 1.0137, + "epoch": 2.573640847593067, + "grad_norm": 6.83934211730957, + "learning_rate": 5.1466501897393984e-05, + "loss": 0.7704, "step": 186800 }, { - "epoch": 1.9, - "learning_rate": 5.8667533309716205e-05, - "loss": 1.1196, + "epoch": 2.5750185996528066, + "grad_norm": 18.938186645507812, + "learning_rate": 5.146257027328471e-05, + "loss": 0.8558, "step": 186900 }, { - "epoch": 1.91, - "learning_rate": 5.866557757170611e-05, - "loss": 1.1045, + "epoch": 2.5763963517125457, + "grad_norm": 10.271759033203125, + "learning_rate": 5.1458636613432517e-05, + "loss": 0.8964, "step": 187000 }, { - "epoch": 1.91, - "learning_rate": 5.866362043212033e-05, - "loss": 1.0339, + "epoch": 2.577774103772285, + "grad_norm": 9.096436500549316, + "learning_rate": 5.145470091817155e-05, + "loss": 0.8865, "step": 187100 }, { - "epoch": 1.91, - "learning_rate": 5.866166189105457e-05, - "loss": 1.0753, + "epoch": 2.5791518558320243, + "grad_norm": 102.71835327148438, + "learning_rate": 5.145076318783621e-05, + "loss": 1.016, "step": 187200 }, { - "epoch": 1.91, - "learning_rate": 5.86597019486046e-05, - "loss": 1.0649, + "epoch": 2.580529607891764, + "grad_norm": 20.117971420288086, + "learning_rate": 5.144682342276099e-05, + "loss": 0.9383, "step": 187300 }, { - "epoch": 1.91, - "learning_rate": 5.8657740604866216e-05, - "loss": 1.0005, + "epoch": 2.581907359951503, + "grad_norm": 9.525062561035156, + "learning_rate": 5.1442881623280605e-05, + "loss": 1.0051, "step": 187400 }, { - "epoch": 1.91, - "learning_rate": 5.865579749432024e-05, - "loss": 1.0381, + "epoch": 2.5832851120112426, + "grad_norm": 4.5403666496276855, + "learning_rate": 5.143893778972995e-05, + "loss": 0.7994, "step": 187500 }, { - "epoch": 1.91, - "learning_rate": 5.865383336230334e-05, - "loss": 1.0746, + "epoch": 2.5846628640709817, + "grad_norm": 27.31873893737793, + "learning_rate": 5.1434991922444053e-05, + "loss": 0.9004, "step": 187600 }, { - "epoch": 1.91, - "learning_rate": 5.8651867829284974e-05, - "loss": 1.0191, + "epoch": 2.5860406161307212, + "grad_norm": 4.647820949554443, + "learning_rate": 5.1431044021758145e-05, + "loss": 0.9793, "step": 187700 }, { - "epoch": 1.91, - "learning_rate": 5.864990089536127e-05, - "loss": 1.094, + "epoch": 2.5874183681904603, + "grad_norm": 19.6611270904541, + "learning_rate": 5.142709408800761e-05, + "loss": 1.0231, "step": 187800 }, { - "epoch": 1.91, - "learning_rate": 5.864793256062837e-05, - "loss": 1.1974, + "epoch": 2.5887961202502, + "grad_norm": 54.47097396850586, + "learning_rate": 5.1423142121528026e-05, + "loss": 0.9445, "step": 187900 }, { - "epoch": 1.92, - "learning_rate": 5.8645962825182536e-05, - "loss": 1.1302, + "epoch": 2.590173872309939, + "grad_norm": 6.3395609855651855, + "learning_rate": 5.14192276727031e-05, + "loss": 0.8839, "step": 188000 }, { - "epoch": 1.92, - "learning_rate": 5.864399168912007e-05, - "loss": 1.059, + "epoch": 2.5915516243696786, + "grad_norm": 14.5242919921875, + "learning_rate": 5.141527166209171e-05, + "loss": 0.9234, "step": 188100 }, { - "epoch": 1.92, - "learning_rate": 5.8642019152537356e-05, - "loss": 1.1059, + "epoch": 2.5929293764294177, + "grad_norm": 19.29517936706543, + "learning_rate": 5.1411313619755646e-05, + "loss": 0.8739, "step": 188200 }, { - "epoch": 1.92, - "learning_rate": 5.8640045215530824e-05, - "loss": 1.0466, + "epoch": 2.5943071284891572, + "grad_norm": 23.12270736694336, + "learning_rate": 5.140735354603116e-05, + "loss": 0.9486, "step": 188300 }, { - "epoch": 1.92, - "learning_rate": 5.8638069878196996e-05, - "loss": 1.033, + "epoch": 2.5956848805488963, + "grad_norm": 6.882327556610107, + "learning_rate": 5.140339144125468e-05, + "loss": 0.8628, "step": 188400 }, { - "epoch": 1.92, - "learning_rate": 5.863609314063246e-05, - "loss": 1.1171, + "epoch": 2.597062632608636, + "grad_norm": 314.453857421875, + "learning_rate": 5.139946695716866e-05, + "loss": 0.9973, "step": 188500 }, { - "epoch": 1.92, - "learning_rate": 5.8634115002933854e-05, - "loss": 1.0081, + "epoch": 2.598440384668375, + "grad_norm": 4.135373115539551, + "learning_rate": 5.139550081160029e-05, + "loss": 0.9938, "step": 188600 }, { - "epoch": 1.92, - "learning_rate": 5.863213546519791e-05, - "loss": 0.9977, + "epoch": 2.5998181367281146, + "grad_norm": 18.16997528076172, + "learning_rate": 5.139153263598688e-05, + "loss": 0.8673, "step": 188700 }, { - "epoch": 1.92, - "learning_rate": 5.86301545275214e-05, - "loss": 0.9932, + "epoch": 2.6011958887878537, + "grad_norm": 20.263628005981445, + "learning_rate": 5.138756243066554e-05, + "loss": 0.9005, "step": 188800 }, { - "epoch": 1.92, - "learning_rate": 5.86281721900012e-05, - "loss": 1.039, + "epoch": 2.6025736408475932, + "grad_norm": 40.79197692871094, + "learning_rate": 5.138359019597356e-05, + "loss": 0.9471, "step": 188900 }, { - "epoch": 1.93, - "learning_rate": 5.862618845273421e-05, - "loss": 1.1327, + "epoch": 2.6039513929073324, + "grad_norm": 11.931047439575195, + "learning_rate": 5.137961593224842e-05, + "loss": 0.9217, "step": 189000 }, { - "epoch": 1.93, - "learning_rate": 5.862420331581745e-05, - "loss": 1.1378, + "epoch": 2.6053291449670715, + "grad_norm": 10.099672317504883, + "learning_rate": 5.137563963982772e-05, + "loss": 0.9391, "step": 189100 }, { - "epoch": 1.93, - "learning_rate": 5.862221677934796e-05, - "loss": 1.1226, + "epoch": 2.606706897026811, + "grad_norm": 7.533517837524414, + "learning_rate": 5.137166131904929e-05, + "loss": 0.9268, "step": 189200 }, { - "epoch": 1.93, - "learning_rate": 5.862022884342289e-05, - "loss": 1.0431, + "epoch": 2.6080846490865506, + "grad_norm": 18.0084228515625, + "learning_rate": 5.1367680970251106e-05, + "loss": 0.9717, "step": 189300 }, { - "epoch": 1.93, - "learning_rate": 5.861823950813941e-05, - "loss": 1.1207, + "epoch": 2.6094624011462897, + "grad_norm": 18.37828254699707, + "learning_rate": 5.136369859377133e-05, + "loss": 0.8855, "step": 189400 }, { - "epoch": 1.93, - "learning_rate": 5.861624877359481e-05, - "loss": 1.0674, + "epoch": 2.610840153206029, + "grad_norm": 12.89180850982666, + "learning_rate": 5.135971418994826e-05, + "loss": 0.9168, "step": 189500 }, { - "epoch": 1.93, - "learning_rate": 5.8614256639886424e-05, - "loss": 1.0298, + "epoch": 2.6122179052657684, + "grad_norm": 17.849620819091797, + "learning_rate": 5.1355767633461244e-05, + "loss": 0.9399, "step": 189600 }, { - "epoch": 1.93, - "learning_rate": 5.861226310711164e-05, - "loss": 1.0787, + "epoch": 2.613595657325508, + "grad_norm": 32.4993782043457, + "learning_rate": 5.1351779196232266e-05, + "loss": 0.9172, "step": 189700 }, { - "epoch": 1.93, - "learning_rate": 5.861026817536795e-05, - "loss": 1.0701, + "epoch": 2.614973409385247, + "grad_norm": 2.5051157474517822, + "learning_rate": 5.13477887326726e-05, + "loss": 0.9514, "step": 189800 }, { - "epoch": 1.93, - "learning_rate": 5.860829181498311e-05, - "loss": 1.0185, + "epoch": 2.616351161444986, + "grad_norm": 4.378296852111816, + "learning_rate": 5.134379624312129e-05, + "loss": 1.0027, "step": 189900 }, { - "epoch": 1.94, - "learning_rate": 5.860629409958152e-05, - "loss": 1.1304, + "epoch": 2.6177289135047257, + "grad_norm": 7.234669208526611, + "learning_rate": 5.133980172791751e-05, + "loss": 0.9112, "step": 190000 }, { - "epoch": 1.94, - "learning_rate": 5.860429498550287e-05, - "loss": 1.0306, + "epoch": 2.6191066655644653, + "grad_norm": 8.032721519470215, + "learning_rate": 5.1335805187400596e-05, + "loss": 0.9584, "step": 190100 }, { - "epoch": 1.94, - "learning_rate": 5.860229447284489e-05, - "loss": 1.1154, + "epoch": 2.6204844176242044, + "grad_norm": 9.139363288879395, + "learning_rate": 5.1331806621910094e-05, + "loss": 0.9918, "step": 190200 }, { - "epoch": 1.94, - "learning_rate": 5.8600292561705406e-05, - "loss": 0.9679, + "epoch": 2.6218621696839435, + "grad_norm": 8.141135215759277, + "learning_rate": 5.13278060317857e-05, + "loss": 0.9646, "step": 190300 }, { - "epoch": 1.94, - "learning_rate": 5.859828925218228e-05, - "loss": 1.0666, + "epoch": 2.623239921743683, + "grad_norm": 12.721752166748047, + "learning_rate": 5.132380341736727e-05, + "loss": 0.8484, "step": 190400 }, { - "epoch": 1.94, - "learning_rate": 5.859628454437349e-05, - "loss": 0.9944, + "epoch": 2.6246176738034226, + "grad_norm": 1.455106496810913, + "learning_rate": 5.1319798778994874e-05, + "loss": 0.9257, "step": 190500 }, { - "epoch": 1.94, - "learning_rate": 5.859427843837703e-05, - "loss": 0.9664, + "epoch": 2.6259954258631617, + "grad_norm": 9.383966445922852, + "learning_rate": 5.1315792117008703e-05, + "loss": 0.8106, "step": 190600 }, { - "epoch": 1.94, - "learning_rate": 5.8592270934291e-05, - "loss": 0.9415, + "epoch": 2.627373177922901, + "grad_norm": 7.546175003051758, + "learning_rate": 5.131178343174915e-05, + "loss": 0.9302, "step": 190700 }, { - "epoch": 1.94, - "learning_rate": 5.8590262032213554e-05, - "loss": 1.0905, + "epoch": 2.6287509299826404, + "grad_norm": 18.524019241333008, + "learning_rate": 5.1307772723556774e-05, + "loss": 0.8488, "step": 190800 }, { - "epoch": 1.94, - "learning_rate": 5.85882517322429e-05, - "loss": 0.9721, + "epoch": 2.63012868204238, + "grad_norm": 6.301628589630127, + "learning_rate": 5.130375999277231e-05, + "loss": 0.9353, "step": 190900 }, { - "epoch": 1.95, - "learning_rate": 5.858624003447736e-05, - "loss": 0.9859, + "epoch": 2.631506434102119, + "grad_norm": 6.803900718688965, + "learning_rate": 5.1299745239736646e-05, + "loss": 1.0234, "step": 191000 }, { - "epoch": 1.95, - "learning_rate": 5.858422693901526e-05, - "loss": 1.0749, + "epoch": 2.632884186161858, + "grad_norm": 11.359065055847168, + "learning_rate": 5.129572846479088e-05, + "loss": 0.9681, "step": 191100 }, { - "epoch": 1.95, - "learning_rate": 5.858221244595505e-05, - "loss": 1.0592, + "epoch": 2.6342619382215977, + "grad_norm": 7.186391830444336, + "learning_rate": 5.129170966827623e-05, + "loss": 0.9555, "step": 191200 }, { - "epoch": 1.95, - "learning_rate": 5.858019655539522e-05, - "loss": 1.1336, + "epoch": 2.635639690281337, + "grad_norm": 35.95524597167969, + "learning_rate": 5.128768885053413e-05, + "loss": 0.9275, "step": 191300 }, { - "epoch": 1.95, - "learning_rate": 5.8578179267434346e-05, - "loss": 1.0255, + "epoch": 2.6370174423410764, + "grad_norm": 10.369431495666504, + "learning_rate": 5.1283666011906165e-05, + "loss": 0.8901, "step": 191400 }, { - "epoch": 1.95, - "learning_rate": 5.857616058217104e-05, - "loss": 0.9245, + "epoch": 2.6383951944008155, + "grad_norm": 6.025608062744141, + "learning_rate": 5.1279641152734096e-05, + "loss": 0.9045, "step": 191500 }, { - "epoch": 1.95, - "learning_rate": 5.8574140499704026e-05, - "loss": 0.9219, + "epoch": 2.639772946460555, + "grad_norm": 16.589677810668945, + "learning_rate": 5.127561427335986e-05, + "loss": 0.9506, "step": 191600 }, { - "epoch": 1.95, - "learning_rate": 5.8572119020132055e-05, - "loss": 1.0034, + "epoch": 2.641150698520294, + "grad_norm": 61.594337463378906, + "learning_rate": 5.1271585374125547e-05, + "loss": 1.0512, "step": 191700 }, { - "epoch": 1.95, - "learning_rate": 5.857009614355397e-05, - "loss": 1.0574, + "epoch": 2.6425284505800337, + "grad_norm": 14.661096572875977, + "learning_rate": 5.126755445537345e-05, + "loss": 1.042, "step": 191800 }, { - "epoch": 1.96, - "learning_rate": 5.8568071870068686e-05, - "loss": 0.9735, + "epoch": 2.643906202639773, + "grad_norm": 6.875641345977783, + "learning_rate": 5.1263521517446e-05, + "loss": 1.0386, "step": 191900 }, { - "epoch": 1.96, - "learning_rate": 5.856604619977518e-05, - "loss": 0.9821, + "epoch": 2.6452839546995124, + "grad_norm": 19.642898559570312, + "learning_rate": 5.1259486560685824e-05, + "loss": 0.9108, "step": 192000 }, { - "epoch": 1.96, - "learning_rate": 5.856401913277247e-05, - "loss": 1.0241, + "epoch": 2.6466617067592515, + "grad_norm": 3.6995697021484375, + "learning_rate": 5.125544958543572e-05, + "loss": 0.925, "step": 192100 }, { - "epoch": 1.96, - "learning_rate": 5.85619906691597e-05, - "loss": 1.0664, + "epoch": 2.648039458818991, + "grad_norm": 25.81755256652832, + "learning_rate": 5.125141059203864e-05, + "loss": 0.992, "step": 192200 }, { - "epoch": 1.96, - "learning_rate": 5.855996080903603e-05, - "loss": 1.0637, + "epoch": 2.64941721087873, + "grad_norm": 16.51807975769043, + "learning_rate": 5.124736958083771e-05, + "loss": 0.9433, "step": 192300 }, { - "epoch": 1.96, - "learning_rate": 5.8557929552500716e-05, - "loss": 0.9362, + "epoch": 2.6507949629384697, + "grad_norm": 17.831552505493164, + "learning_rate": 5.1243326552176254e-05, + "loss": 0.964, "step": 192400 }, { - "epoch": 1.96, - "learning_rate": 5.8555896899653075e-05, - "loss": 1.0622, + "epoch": 2.652172714998209, + "grad_norm": 28.03275489807129, + "learning_rate": 5.1239321966839124e-05, + "loss": 0.8869, "step": 192500 }, { - "epoch": 1.96, - "learning_rate": 5.8553862850592476e-05, - "loss": 0.9776, + "epoch": 2.6535504670579484, + "grad_norm": 7.905217170715332, + "learning_rate": 5.123527492445322e-05, + "loss": 0.9856, "step": 192600 }, { - "epoch": 1.96, - "learning_rate": 5.85518274054184e-05, - "loss": 0.9896, + "epoch": 2.6549282191176875, + "grad_norm": 52.83687210083008, + "learning_rate": 5.1231225865634286e-05, + "loss": 0.9353, "step": 192700 }, { - "epoch": 1.96, - "learning_rate": 5.854979056423034e-05, - "loss": 1.0154, + "epoch": 2.656305971177427, + "grad_norm": 16.814350128173828, + "learning_rate": 5.12271747907263e-05, + "loss": 0.8925, "step": 192800 }, { - "epoch": 1.97, - "learning_rate": 5.85477523271279e-05, - "loss": 1.0719, + "epoch": 2.657683723237166, + "grad_norm": 23.393108367919922, + "learning_rate": 5.122312170007344e-05, + "loss": 0.9281, "step": 192900 }, { - "epoch": 1.97, - "learning_rate": 5.854571269421074e-05, - "loss": 1.0874, + "epoch": 2.6590614752969057, + "grad_norm": 27.19084930419922, + "learning_rate": 5.121906659402001e-05, + "loss": 0.9208, "step": 193000 }, { - "epoch": 1.97, - "learning_rate": 5.8543671665578575e-05, - "loss": 1.0047, + "epoch": 2.660439227356645, + "grad_norm": 8.059378623962402, + "learning_rate": 5.121500947291054e-05, + "loss": 1.0676, "step": 193100 }, { - "epoch": 1.97, - "learning_rate": 5.854162924133121e-05, - "loss": 0.9661, + "epoch": 2.6618169794163844, + "grad_norm": 36.84059524536133, + "learning_rate": 5.121095033708969e-05, + "loss": 0.9761, "step": 193200 }, { - "epoch": 1.97, - "learning_rate": 5.85395854215685e-05, - "loss": 1.0626, + "epoch": 2.6631947314761235, + "grad_norm": 12.709820747375488, + "learning_rate": 5.12068891869023e-05, + "loss": 1.0557, "step": 193300 }, { - "epoch": 1.97, - "learning_rate": 5.853754020639038e-05, - "loss": 0.9647, + "epoch": 2.6645724835358626, + "grad_norm": 18.64215087890625, + "learning_rate": 5.120282602269339e-05, + "loss": 0.9581, "step": 193400 }, { - "epoch": 1.97, - "learning_rate": 5.853549359589684e-05, - "loss": 1.0495, + "epoch": 2.665950235595602, + "grad_norm": 12.242551803588867, + "learning_rate": 5.119876084480814e-05, + "loss": 0.9817, "step": 193500 }, { - "epoch": 1.97, - "learning_rate": 5.853344559018797e-05, - "loss": 1.0336, + "epoch": 2.6673279876553417, + "grad_norm": 31.557125091552734, + "learning_rate": 5.119469365359192e-05, + "loss": 0.9564, "step": 193600 }, { - "epoch": 1.97, - "learning_rate": 5.853139618936388e-05, - "loss": 0.9776, + "epoch": 2.668705739715081, + "grad_norm": 33.50242614746094, + "learning_rate": 5.119062444939026e-05, + "loss": 0.9729, "step": 193700 }, { - "epoch": 1.97, - "learning_rate": 5.852934539352478e-05, - "loss": 1.0166, + "epoch": 2.67008349177482, + "grad_norm": 7.800500869750977, + "learning_rate": 5.1186553232548844e-05, + "loss": 0.854, "step": 193800 }, { - "epoch": 1.98, - "learning_rate": 5.852729320277095e-05, - "loss": 0.9768, + "epoch": 2.6714612438345595, + "grad_norm": 89.7582015991211, + "learning_rate": 5.1182480003413555e-05, + "loss": 1.0125, "step": 193900 }, { - "epoch": 1.98, - "learning_rate": 5.852523961720272e-05, - "loss": 0.9489, + "epoch": 2.672838995894299, + "grad_norm": 50.66563415527344, + "learning_rate": 5.117840476233043e-05, + "loss": 1.0552, "step": 194000 }, { - "epoch": 1.98, - "learning_rate": 5.85231846369205e-05, - "loss": 1.078, + "epoch": 2.674216747954038, + "grad_norm": 10.461341857910156, + "learning_rate": 5.117432750964568e-05, + "loss": 0.9723, "step": 194100 }, { - "epoch": 1.98, - "learning_rate": 5.852112826202477e-05, - "loss": 0.9232, + "epoch": 2.6755945000137773, + "grad_norm": 126.65116119384766, + "learning_rate": 5.117024824570569e-05, + "loss": 1.0072, "step": 194200 }, { - "epoch": 1.98, - "learning_rate": 5.8519070492616085e-05, - "loss": 1.012, + "epoch": 2.676972252073517, + "grad_norm": 23.27111053466797, + "learning_rate": 5.116616697085702e-05, + "loss": 0.9332, "step": 194300 }, { - "epoch": 1.98, - "learning_rate": 5.8517011328795035e-05, - "loss": 1.0948, + "epoch": 2.6783500041332564, + "grad_norm": 14.857584953308105, + "learning_rate": 5.116208368544639e-05, + "loss": 1.0109, "step": 194400 }, { - "epoch": 1.98, - "learning_rate": 5.8514971383145154e-05, - "loss": 1.0303, + "epoch": 2.6797277561929955, + "grad_norm": 3.397592544555664, + "learning_rate": 5.11579983898207e-05, + "loss": 0.9242, "step": 194500 }, { - "epoch": 1.98, - "learning_rate": 5.851290944474312e-05, - "loss": 1.0161, + "epoch": 2.6811055082527346, + "grad_norm": 4.395207405090332, + "learning_rate": 5.1153911084327004e-05, + "loss": 0.9252, "step": 194600 }, { - "epoch": 1.98, - "learning_rate": 5.851084611222996e-05, - "loss": 0.9773, + "epoch": 2.682483260312474, + "grad_norm": 5.647323131561279, + "learning_rate": 5.114982176931255e-05, + "loss": 0.9692, "step": 194700 }, { - "epoch": 1.98, - "learning_rate": 5.850878138570658e-05, - "loss": 0.9442, + "epoch": 2.6838610123722138, + "grad_norm": 10.908419609069824, + "learning_rate": 5.114573044512475e-05, + "loss": 0.916, "step": 194800 }, { - "epoch": 1.99, - "learning_rate": 5.850671526527392e-05, - "loss": 1.0646, + "epoch": 2.685238764431953, + "grad_norm": 13.371127128601074, + "learning_rate": 5.114163711211117e-05, + "loss": 0.955, "step": 194900 }, { - "epoch": 1.99, - "learning_rate": 5.8504647751033e-05, - "loss": 1.0325, + "epoch": 2.686616516491692, + "grad_norm": 8.732426643371582, + "learning_rate": 5.1137541770619586e-05, + "loss": 0.9925, "step": 195000 }, { - "epoch": 1.99, - "learning_rate": 5.850257884308492e-05, - "loss": 0.9553, + "epoch": 2.6879942685514315, + "grad_norm": 4.614773750305176, + "learning_rate": 5.113344442099789e-05, + "loss": 0.9404, "step": 195100 }, { - "epoch": 1.99, - "learning_rate": 5.8500508541530825e-05, - "loss": 0.9404, + "epoch": 2.689372020611171, + "grad_norm": 7.362372398376465, + "learning_rate": 5.112934506359417e-05, + "loss": 0.9214, "step": 195200 }, { - "epoch": 1.99, - "learning_rate": 5.8498436846471934e-05, - "loss": 0.8782, + "epoch": 2.69074977267091, + "grad_norm": 7.540863037109375, + "learning_rate": 5.1125243698756713e-05, + "loss": 0.9476, "step": 195300 }, { - "epoch": 1.99, - "learning_rate": 5.8496363758009564e-05, - "loss": 1.0195, + "epoch": 2.6921275247306493, + "grad_norm": 15.06735610961914, + "learning_rate": 5.1121140326833934e-05, + "loss": 0.9528, "step": 195400 }, { - "epoch": 1.99, - "learning_rate": 5.849428927624506e-05, - "loss": 1.0409, + "epoch": 2.693505276790389, + "grad_norm": 3.328019857406616, + "learning_rate": 5.111703494817444e-05, + "loss": 1.0005, "step": 195500 }, { - "epoch": 1.99, - "learning_rate": 5.849221340127986e-05, - "loss": 1.0213, + "epoch": 2.694883028850128, + "grad_norm": 11.551410675048828, + "learning_rate": 5.111292756312701e-05, + "loss": 0.9917, "step": 195600 }, { - "epoch": 1.99, - "learning_rate": 5.849013613321544e-05, - "loss": 1.0622, + "epoch": 2.6962607809098675, + "grad_norm": 18.742382049560547, + "learning_rate": 5.110881817204057e-05, + "loss": 0.9887, "step": 195700 }, { - "epoch": 1.99, - "learning_rate": 5.8488057472153406e-05, - "loss": 0.9559, + "epoch": 2.6976385329696067, + "grad_norm": 3.999192476272583, + "learning_rate": 5.110470677526425e-05, + "loss": 0.9466, "step": 195800 }, { - "epoch": 2.0, - "learning_rate": 5.848597741819536e-05, - "loss": 0.9598, + "epoch": 2.699016285029346, + "grad_norm": 6.355814456939697, + "learning_rate": 5.1100593373147325e-05, + "loss": 0.9488, "step": 195900 }, { - "epoch": 2.0, - "learning_rate": 5.848389597144301e-05, - "loss": 0.9517, + "epoch": 2.7003940370890853, + "grad_norm": 12.663365364074707, + "learning_rate": 5.109647796603925e-05, + "loss": 0.9965, "step": 196000 }, { - "epoch": 2.0, - "learning_rate": 5.848181313199812e-05, - "loss": 1.097, + "epoch": 2.701771789148825, + "grad_norm": 47.599945068359375, + "learning_rate": 5.1092360554289656e-05, + "loss": 0.9193, "step": 196100 }, { - "epoch": 2.0, - "learning_rate": 5.8479728899962564e-05, - "loss": 0.8822, + "epoch": 2.703149541208564, + "grad_norm": 31.9816837310791, + "learning_rate": 5.108824113824835e-05, + "loss": 0.9268, "step": 196200 }, { - "epoch": 2.0, - "learning_rate": 5.84776432754382e-05, - "loss": 1.0259, + "epoch": 2.7045272932683035, + "grad_norm": 7.92744779586792, + "learning_rate": 5.1084119718265264e-05, + "loss": 0.9374, "step": 196300 }, { - "epoch": 2.0, - "learning_rate": 5.8475556258527045e-05, - "loss": 0.9112, + "epoch": 2.7059050453280427, + "grad_norm": 10.099730491638184, + "learning_rate": 5.1079996294690556e-05, + "loss": 1.0393, "step": 196400 }, { - "epoch": 2.0, - "learning_rate": 5.847346784933111e-05, - "loss": 0.9234, + "epoch": 2.707282797387782, + "grad_norm": 35.509029388427734, + "learning_rate": 5.107587086787453e-05, + "loss": 0.991, "step": 196500 }, { - "epoch": 2.0, - "learning_rate": 5.847137804795252e-05, - "loss": 1.066, + "epoch": 2.7086605494475213, + "grad_norm": 24.436607360839844, + "learning_rate": 5.107174343816766e-05, + "loss": 0.9262, "step": 196600 }, { - "epoch": 2.0, - "learning_rate": 5.846928685449345e-05, - "loss": 0.903, + "epoch": 2.710038301507261, + "grad_norm": 33.455020904541016, + "learning_rate": 5.1067614005920595e-05, + "loss": 0.9521, "step": 196700 }, { - "epoch": 2.01, - "learning_rate": 5.846719426905615e-05, - "loss": 0.9079, + "epoch": 2.711416053567, + "grad_norm": 11.710322380065918, + "learning_rate": 5.106348257148415e-05, + "loss": 0.874, "step": 196800 }, { - "epoch": 2.01, - "learning_rate": 5.846510029174293e-05, - "loss": 1.0521, + "epoch": 2.7127938056267396, + "grad_norm": 22.525588989257812, + "learning_rate": 5.105934913520931e-05, + "loss": 1.0296, "step": 196900 }, { - "epoch": 2.01, - "learning_rate": 5.846300492265618e-05, - "loss": 0.9344, + "epoch": 2.7141715576864787, + "grad_norm": 6.8205976486206055, + "learning_rate": 5.105521369744723e-05, + "loss": 0.9052, "step": 197000 }, { - "epoch": 2.01, - "learning_rate": 5.846090816189835e-05, - "loss": 1.001, + "epoch": 2.715549309746218, + "grad_norm": 10.318865776062012, + "learning_rate": 5.1051076258549236e-05, + "loss": 0.9833, "step": 197100 }, { - "epoch": 2.01, - "learning_rate": 5.8458810009571964e-05, - "loss": 0.9671, + "epoch": 2.7169270618059573, + "grad_norm": 14.404354095458984, + "learning_rate": 5.104693681886684e-05, + "loss": 0.8851, "step": 197200 }, { - "epoch": 2.01, - "learning_rate": 5.845671046577959e-05, - "loss": 1.0997, + "epoch": 2.718304813865697, + "grad_norm": 9.110835075378418, + "learning_rate": 5.104279537875168e-05, + "loss": 0.8997, "step": 197300 }, { - "epoch": 2.01, - "learning_rate": 5.845463054686236e-05, - "loss": 1.1956, + "epoch": 2.719682565925436, + "grad_norm": 35.47287368774414, + "learning_rate": 5.103865193855561e-05, + "loss": 0.9696, "step": 197400 }, { - "epoch": 2.01, - "learning_rate": 5.845252823435817e-05, - "loss": 1.2579, + "epoch": 2.7210603179851756, + "grad_norm": 23.111448287963867, + "learning_rate": 5.10345479629274e-05, + "loss": 1.0111, "step": 197500 }, { - "epoch": 2.01, - "learning_rate": 5.845042453069514e-05, - "loss": 1.0775, + "epoch": 2.7224380700449147, + "grad_norm": 7.040343284606934, + "learning_rate": 5.103040054361772e-05, + "loss": 0.8598, "step": 197600 }, { - "epoch": 2.01, - "learning_rate": 5.844831943597614e-05, - "loss": 0.9923, + "epoch": 2.723815822104654, + "grad_norm": 27.314756393432617, + "learning_rate": 5.1026251125280146e-05, + "loss": 0.989, "step": 197700 }, { - "epoch": 2.02, - "learning_rate": 5.844621295030408e-05, - "loss": 1.581, + "epoch": 2.7251935741643933, + "grad_norm": 9.709699630737305, + "learning_rate": 5.102209970826717e-05, + "loss": 0.8676, "step": 197800 }, { - "epoch": 2.02, - "learning_rate": 5.844410507378198e-05, - "loss": 1.2333, + "epoch": 2.726571326224133, + "grad_norm": 58.49681091308594, + "learning_rate": 5.101794629293148e-05, + "loss": 0.8694, "step": 197900 }, { - "epoch": 2.02, - "learning_rate": 5.844199580651288e-05, - "loss": 1.031, + "epoch": 2.727949078283872, + "grad_norm": 2.6799161434173584, + "learning_rate": 5.1013790879625944e-05, + "loss": 0.9155, "step": 198000 }, { - "epoch": 2.02, - "learning_rate": 5.843988514859992e-05, - "loss": 1.0202, + "epoch": 2.729326830343611, + "grad_norm": 2.6764700412750244, + "learning_rate": 5.100963346870358e-05, + "loss": 0.9121, "step": 198100 }, { - "epoch": 2.02, - "learning_rate": 5.843777310014628e-05, - "loss": 1.0946, + "epoch": 2.7307045824033507, + "grad_norm": 9.626672744750977, + "learning_rate": 5.100547406051757e-05, + "loss": 0.8487, "step": 198200 }, { - "epoch": 2.02, - "learning_rate": 5.843565966125526e-05, - "loss": 1.1925, + "epoch": 2.7320823344630902, + "grad_norm": 23.776607513427734, + "learning_rate": 5.100131265542129e-05, + "loss": 1.0353, "step": 198300 }, { - "epoch": 2.02, - "learning_rate": 5.843356598720424e-05, - "loss": 1.2114, + "epoch": 2.7334600865228293, + "grad_norm": 6.544618606567383, + "learning_rate": 5.0997232541365294e-05, + "loss": 0.9686, "step": 198400 }, { - "epoch": 2.02, - "learning_rate": 5.843144978165029e-05, - "loss": 1.1792, + "epoch": 2.7348378385825685, + "grad_norm": 126.95794677734375, + "learning_rate": 5.0993067183429824e-05, + "loss": 0.9406, "step": 198500 }, { - "epoch": 2.02, - "learning_rate": 5.842933218596812e-05, - "loss": 1.0674, + "epoch": 2.736215590642308, + "grad_norm": 11.303043365478516, + "learning_rate": 5.098889982963811e-05, + "loss": 0.9429, "step": 198600 }, { - "epoch": 2.02, - "learning_rate": 5.842721320026125e-05, - "loss": 1.1121, + "epoch": 2.7375933427020476, + "grad_norm": 59.51509094238281, + "learning_rate": 5.098473048034419e-05, + "loss": 0.8943, "step": 198700 }, { - "epoch": 2.03, - "learning_rate": 5.8425092824633305e-05, - "loss": 1.0193, + "epoch": 2.7389710947617867, + "grad_norm": 55.9721565246582, + "learning_rate": 5.0980559135902254e-05, + "loss": 0.9932, "step": 198800 }, { - "epoch": 2.03, - "learning_rate": 5.842297105918796e-05, - "loss": 1.0007, + "epoch": 2.740348846821526, + "grad_norm": 16.333709716796875, + "learning_rate": 5.097638579666671e-05, + "loss": 0.914, "step": 198900 }, { - "epoch": 2.03, - "learning_rate": 5.842084790402895e-05, - "loss": 0.9817, + "epoch": 2.7417265988812654, + "grad_norm": 100.32839965820312, + "learning_rate": 5.097221046299208e-05, + "loss": 0.9003, "step": 199000 }, { - "epoch": 2.03, - "learning_rate": 5.841872335926008e-05, - "loss": 1.0131, + "epoch": 2.743104350941005, + "grad_norm": 11.707318305969238, + "learning_rate": 5.0968033135233093e-05, + "loss": 0.9718, "step": 199100 }, { - "epoch": 2.03, - "learning_rate": 5.8416597424985226e-05, - "loss": 0.9842, + "epoch": 2.744482103000744, + "grad_norm": 24.461414337158203, + "learning_rate": 5.096385381374462e-05, + "loss": 1.0454, "step": 199200 }, { - "epoch": 2.03, - "learning_rate": 5.8414470101308345e-05, - "loss": 1.0506, + "epoch": 2.745859855060483, + "grad_norm": 23.23276710510254, + "learning_rate": 5.0959672498881734e-05, + "loss": 1.017, "step": 199300 }, { - "epoch": 2.03, - "learning_rate": 5.8412341388333444e-05, - "loss": 1.0496, + "epoch": 2.7472376071202227, + "grad_norm": 7.538372039794922, + "learning_rate": 5.095548919099965e-05, + "loss": 1.0469, "step": 199400 }, { - "epoch": 2.03, - "learning_rate": 5.841021128616461e-05, - "loss": 0.9779, + "epoch": 2.7486153591799622, + "grad_norm": 6.8459296226501465, + "learning_rate": 5.0951303890453756e-05, + "loss": 0.9409, "step": 199500 }, { - "epoch": 2.03, - "learning_rate": 5.840807979490598e-05, - "loss": 1.0707, + "epoch": 2.7499931112397014, + "grad_norm": 14.233335494995117, + "learning_rate": 5.094711659759962e-05, + "loss": 0.9833, "step": 199600 }, { - "epoch": 2.03, - "learning_rate": 5.840594691466178e-05, - "loss": 1.0097, + "epoch": 2.7513708632994405, + "grad_norm": 12.74863052368164, + "learning_rate": 5.094292731279298e-05, + "loss": 0.8698, "step": 199700 }, { - "epoch": 2.04, - "learning_rate": 5.8403812645536295e-05, - "loss": 1.0124, + "epoch": 2.75274861535918, + "grad_norm": 4.602093696594238, + "learning_rate": 5.0938736036389734e-05, + "loss": 1.0049, "step": 199800 }, { - "epoch": 2.04, - "learning_rate": 5.840167698763388e-05, - "loss": 1.1572, + "epoch": 2.754126367418919, + "grad_norm": 5.2265753746032715, + "learning_rate": 5.093454276874594e-05, + "loss": 1.1012, "step": 199900 }, { - "epoch": 2.04, - "learning_rate": 5.839953994105894e-05, - "loss": 1.3573, + "epoch": 2.7555041194786587, + "grad_norm": 66.78414916992188, + "learning_rate": 5.093034751021785e-05, + "loss": 0.9655, "step": 200000 }, { - "epoch": 2.04, - "learning_rate": 5.8397401505915995e-05, - "loss": 1.4048, + "epoch": 2.756881871538398, + "grad_norm": 95.28353118896484, + "learning_rate": 5.092615026116188e-05, + "loss": 1.0562, "step": 200100 }, { - "epoch": 2.04, - "learning_rate": 5.839526168230957e-05, - "loss": 1.0354, + "epoch": 2.7582596235981374, + "grad_norm": 15.570234298706055, + "learning_rate": 5.092195102193459e-05, + "loss": 0.9923, "step": 200200 }, { - "epoch": 2.04, - "learning_rate": 5.839312047034431e-05, - "loss": 0.9075, + "epoch": 2.7596373756578765, + "grad_norm": 30.775211334228516, + "learning_rate": 5.091774979289274e-05, + "loss": 1.049, "step": 200300 }, { - "epoch": 2.04, - "learning_rate": 5.8390977870124896e-05, - "loss": 1.0124, + "epoch": 2.761015127717616, + "grad_norm": 18.46593475341797, + "learning_rate": 5.0913546574393234e-05, + "loss": 1.0008, "step": 200400 }, { - "epoch": 2.04, - "learning_rate": 5.838883388175609e-05, - "loss": 1.0527, + "epoch": 2.762392879777355, + "grad_norm": 22.81820297241211, + "learning_rate": 5.090934136679317e-05, + "loss": 1.1333, "step": 200500 }, { - "epoch": 2.04, - "learning_rate": 5.838668850534273e-05, - "loss": 0.8876, + "epoch": 2.7637706318370947, + "grad_norm": 47.288787841796875, + "learning_rate": 5.090513417044979e-05, + "loss": 1.2612, "step": 200600 }, { - "epoch": 2.04, - "learning_rate": 5.83845417409897e-05, - "loss": 1.0682, + "epoch": 2.765148383896834, + "grad_norm": 11.055154800415039, + "learning_rate": 5.0900924985720516e-05, + "loss": 1.0377, "step": 200700 }, { - "epoch": 2.05, - "learning_rate": 5.838239358880197e-05, - "loss": 0.9313, + "epoch": 2.7665261359565734, + "grad_norm": 13.395950317382812, + "learning_rate": 5.0896713812962964e-05, + "loss": 1.0821, "step": 200800 }, { - "epoch": 2.05, - "learning_rate": 5.8380244048884576e-05, - "loss": 1.0174, + "epoch": 2.7679038880163125, + "grad_norm": 13.816061019897461, + "learning_rate": 5.089250065253486e-05, + "loss": 1.0163, "step": 200900 }, { - "epoch": 2.05, - "learning_rate": 5.83780931213426e-05, - "loss": 0.9093, + "epoch": 2.769281640076052, + "grad_norm": 30.19580078125, + "learning_rate": 5.088828550479416e-05, + "loss": 1.0871, "step": 201000 }, { - "epoch": 2.05, - "learning_rate": 5.837594080628123e-05, - "loss": 1.0549, + "epoch": 2.770659392135791, + "grad_norm": 28.16486358642578, + "learning_rate": 5.088406837009895e-05, + "loss": 1.0917, "step": 201100 }, { - "epoch": 2.05, - "learning_rate": 5.8373787103805695e-05, - "loss": 1.0736, + "epoch": 2.7720371441955307, + "grad_norm": 31.24152374267578, + "learning_rate": 5.0879849248807506e-05, + "loss": 0.9505, "step": 201200 }, { - "epoch": 2.05, - "learning_rate": 5.837163201402129e-05, - "loss": 0.9724, + "epoch": 2.77341489625527, + "grad_norm": 68.79824829101562, + "learning_rate": 5.0875628141278246e-05, + "loss": 1.0807, "step": 201300 }, { - "epoch": 2.05, - "learning_rate": 5.8369475537033406e-05, - "loss": 0.9633, + "epoch": 2.7747926483150094, + "grad_norm": 33.98255157470703, + "learning_rate": 5.087140504786979e-05, + "loss": 1.0745, "step": 201400 }, { - "epoch": 2.05, - "learning_rate": 5.8367317672947455e-05, - "loss": 0.9577, + "epoch": 2.7761704003747485, + "grad_norm": 14.746597290039062, + "learning_rate": 5.0867179968940906e-05, + "loss": 1.0066, "step": 201500 }, { - "epoch": 2.05, - "learning_rate": 5.836515842186896e-05, - "loss": 1.0551, + "epoch": 2.777548152434488, + "grad_norm": 147.15943908691406, + "learning_rate": 5.086299518531682e-05, + "loss": 1.0723, "step": 201600 }, { - "epoch": 2.05, - "learning_rate": 5.8362997783903496e-05, - "loss": 1.0248, + "epoch": 2.778925904494227, + "grad_norm": 25.66536521911621, + "learning_rate": 5.085876615627033e-05, + "loss": 1.0457, "step": 201700 }, { - "epoch": 2.06, - "learning_rate": 5.83608357591567e-05, - "loss": 0.981, + "epoch": 2.7803036565539667, + "grad_norm": 10.483482360839844, + "learning_rate": 5.085457746273391e-05, + "loss": 0.9462, "step": 201800 }, { - "epoch": 2.06, - "learning_rate": 5.835867234773428e-05, - "loss": 1.0608, + "epoch": 2.781681408613706, + "grad_norm": 16.32695770263672, + "learning_rate": 5.0850344484992584e-05, + "loss": 1.0422, "step": 201900 }, { - "epoch": 2.06, - "learning_rate": 5.835650754974203e-05, - "loss": 1.0628, + "epoch": 2.783059160673445, + "grad_norm": 8.703271865844727, + "learning_rate": 5.084610952352003e-05, + "loss": 1.0052, "step": 202000 }, { - "epoch": 2.06, - "learning_rate": 5.835434136528577e-05, - "loss": 0.9436, + "epoch": 2.7844369127331845, + "grad_norm": 11.93905258178711, + "learning_rate": 5.0841872578676045e-05, + "loss": 0.9117, "step": 202100 }, { - "epoch": 2.06, - "learning_rate": 5.8352173794471435e-05, - "loss": 0.9861, + "epoch": 2.785814664792924, + "grad_norm": 8.124862670898438, + "learning_rate": 5.0837633650820556e-05, + "loss": 0.9941, "step": 202200 }, { - "epoch": 2.06, - "learning_rate": 5.8350004837404994e-05, - "loss": 1.046, + "epoch": 2.787192416852663, + "grad_norm": 29.387733459472656, + "learning_rate": 5.08333927403137e-05, + "loss": 0.9597, "step": 202300 }, { - "epoch": 2.06, - "learning_rate": 5.834783449419251e-05, - "loss": 1.0726, + "epoch": 2.7885701689124023, + "grad_norm": 64.52640533447266, + "learning_rate": 5.0829149847515754e-05, + "loss": 0.957, "step": 202400 }, { - "epoch": 2.06, - "learning_rate": 5.834566276494008e-05, - "loss": 0.8784, + "epoch": 2.789947920972142, + "grad_norm": 13.357098579406738, + "learning_rate": 5.082490497278717e-05, + "loss": 1.0003, "step": 202500 }, { - "epoch": 2.06, - "learning_rate": 5.83434896497539e-05, - "loss": 0.9368, + "epoch": 2.7913256730318814, + "grad_norm": 14.107404708862305, + "learning_rate": 5.08206581164886e-05, + "loss": 1.0488, "step": 202600 }, { - "epoch": 2.07, - "learning_rate": 5.834131514874022e-05, - "loss": 0.9253, + "epoch": 2.7927034250916205, + "grad_norm": 28.695308685302734, + "learning_rate": 5.08164092789808e-05, + "loss": 1.0696, "step": 202700 }, { - "epoch": 2.07, - "learning_rate": 5.8339139262005375e-05, - "loss": 1.0317, + "epoch": 2.7940811771513596, + "grad_norm": 7.728165149688721, + "learning_rate": 5.081215846062475e-05, + "loss": 1.2595, "step": 202800 }, { - "epoch": 2.07, - "learning_rate": 5.833696198965572e-05, - "loss": 0.8432, + "epoch": 2.795458929211099, + "grad_norm": 41.977813720703125, + "learning_rate": 5.0807905661781575e-05, + "loss": 1.1636, "step": 202900 }, { - "epoch": 2.07, - "learning_rate": 5.8334783331797736e-05, - "loss": 0.979, + "epoch": 2.7968366812708387, + "grad_norm": 32.05202865600586, + "learning_rate": 5.080365088281257e-05, + "loss": 0.9333, "step": 203000 }, { - "epoch": 2.07, - "learning_rate": 5.8332603288537944e-05, - "loss": 0.9048, + "epoch": 2.798214433330578, + "grad_norm": 58.59223175048828, + "learning_rate": 5.0799394124079204e-05, + "loss": 0.9932, "step": 203100 }, { - "epoch": 2.07, - "learning_rate": 5.833042185998293e-05, - "loss": 0.935, + "epoch": 2.799592185390317, + "grad_norm": 14.666093826293945, + "learning_rate": 5.07951353859431e-05, + "loss": 1.0129, "step": 203200 }, { - "epoch": 2.07, - "learning_rate": 5.8328239046239354e-05, - "loss": 1.0004, + "epoch": 2.8009699374500565, + "grad_norm": 402.260009765625, + "learning_rate": 5.079087466876607e-05, + "loss": 1.0459, "step": 203300 }, { - "epoch": 2.07, - "learning_rate": 5.832605484741394e-05, - "loss": 1.0152, + "epoch": 2.802347689509796, + "grad_norm": 15.870574951171875, + "learning_rate": 5.078661197291009e-05, + "loss": 1.0175, "step": 203400 }, { - "epoch": 2.07, - "learning_rate": 5.832386926361349e-05, - "loss": 0.9919, + "epoch": 2.803725441569535, + "grad_norm": 10.65597152709961, + "learning_rate": 5.078234729873729e-05, + "loss": 1.0481, "step": 203500 }, { - "epoch": 2.07, - "learning_rate": 5.832168229494486e-05, - "loss": 1.0018, + "epoch": 2.8051031936292743, + "grad_norm": 49.02125549316406, + "learning_rate": 5.077808064660997e-05, + "loss": 1.0732, "step": 203600 }, { - "epoch": 2.08, - "learning_rate": 5.831949394151498e-05, - "loss": 0.9661, + "epoch": 2.806480945689014, + "grad_norm": 27.337841033935547, + "learning_rate": 5.077381201689061e-05, + "loss": 1.0755, "step": 203700 }, { - "epoch": 2.08, - "learning_rate": 5.8317304203430855e-05, - "loss": 0.8944, + "epoch": 2.8078586977487534, + "grad_norm": 7.510376930236816, + "learning_rate": 5.076954140994185e-05, + "loss": 1.1307, "step": 203800 }, { - "epoch": 2.08, - "learning_rate": 5.831511308079953e-05, - "loss": 1.0393, + "epoch": 2.8092364498084925, + "grad_norm": 4.652828216552734, + "learning_rate": 5.0765268826126506e-05, + "loss": 1.0991, "step": 203900 }, { - "epoch": 2.08, - "learning_rate": 5.8312920573728155e-05, - "loss": 0.9409, + "epoch": 2.8106142018682316, + "grad_norm": 27.985675811767578, + "learning_rate": 5.076099426580754e-05, + "loss": 1.162, "step": 204000 }, { - "epoch": 2.08, - "learning_rate": 5.831072668232393e-05, - "loss": 1.0064, + "epoch": 2.811991953927971, + "grad_norm": 14.408230781555176, + "learning_rate": 5.075671772934812e-05, + "loss": 1.0234, "step": 204100 }, { - "epoch": 2.08, - "learning_rate": 5.830853140669412e-05, - "loss": 0.8611, + "epoch": 2.8133697059877103, + "grad_norm": 8.758142471313477, + "learning_rate": 5.0752439217111545e-05, + "loss": 1.076, "step": 204200 }, { - "epoch": 2.08, - "learning_rate": 5.830635672039457e-05, - "loss": 0.9927, + "epoch": 2.81474745804745, + "grad_norm": 38.64096450805664, + "learning_rate": 5.074815872946129e-05, + "loss": 0.9636, "step": 204300 }, { - "epoch": 2.08, - "learning_rate": 5.830415869047524e-05, - "loss": 0.9124, + "epoch": 2.816125210107189, + "grad_norm": 18.52665138244629, + "learning_rate": 5.074387626676103e-05, + "loss": 1.144, "step": 204400 }, { - "epoch": 2.08, - "learning_rate": 5.830198127763967e-05, - "loss": 0.968, + "epoch": 2.8175029621669285, + "grad_norm": 19.902381896972656, + "learning_rate": 5.073959182937455e-05, + "loss": 0.9249, "step": 204500 }, { - "epoch": 2.08, - "learning_rate": 5.829978049385642e-05, - "loss": 0.99, + "epoch": 2.8188807142266676, + "grad_norm": 182.37232971191406, + "learning_rate": 5.0735305417665864e-05, + "loss": 1.0091, "step": 204600 }, { - "epoch": 2.09, - "learning_rate": 5.8297578326382776e-05, - "loss": 0.9251, + "epoch": 2.820258466286407, + "grad_norm": 12.161040306091309, + "learning_rate": 5.073101703199911e-05, + "loss": 1.0376, "step": 204700 }, { - "epoch": 2.09, - "learning_rate": 5.829537477532644e-05, - "loss": 0.9285, + "epoch": 2.8216362183461463, + "grad_norm": 17.16762924194336, + "learning_rate": 5.0726726672738606e-05, + "loss": 0.9096, "step": 204800 }, { - "epoch": 2.09, - "learning_rate": 5.829316984079513e-05, - "loss": 1.0292, + "epoch": 2.823013970405886, + "grad_norm": 6.061741828918457, + "learning_rate": 5.072243434024885e-05, + "loss": 0.9391, "step": 204900 }, { - "epoch": 2.09, - "learning_rate": 5.8290963522896654e-05, - "loss": 0.8976, + "epoch": 2.824391722465625, + "grad_norm": 22.281923294067383, + "learning_rate": 5.0718140034894485e-05, + "loss": 0.9729, "step": 205000 }, { - "epoch": 2.09, - "learning_rate": 5.828875582173889e-05, - "loss": 0.8915, + "epoch": 2.8257694745253645, + "grad_norm": 4.719633102416992, + "learning_rate": 5.0713843757040345e-05, + "loss": 0.8728, "step": 205100 }, { - "epoch": 2.09, - "learning_rate": 5.8286546737429795e-05, - "loss": 0.9122, + "epoch": 2.8271472265851036, + "grad_norm": 100.57250213623047, + "learning_rate": 5.0709545507051414e-05, + "loss": 0.9244, "step": 205200 }, { - "epoch": 2.09, - "learning_rate": 5.828433627007736e-05, - "loss": 0.8964, + "epoch": 2.828524978644843, + "grad_norm": 8.920479774475098, + "learning_rate": 5.070524528529285e-05, + "loss": 0.9892, "step": 205300 }, { - "epoch": 2.09, - "learning_rate": 5.828212441978968e-05, - "loss": 1.0233, + "epoch": 2.8299027307045823, + "grad_norm": 5.643247604370117, + "learning_rate": 5.070094309212999e-05, + "loss": 0.9452, "step": 205400 }, { - "epoch": 2.09, - "learning_rate": 5.8279911186674895e-05, - "loss": 1.0239, + "epoch": 2.831280482764322, + "grad_norm": 4.509352207183838, + "learning_rate": 5.069663892792831e-05, + "loss": 0.9497, "step": 205500 }, { - "epoch": 2.09, - "learning_rate": 5.827769657084122e-05, - "loss": 0.9971, + "epoch": 2.832658234824061, + "grad_norm": 8.125088691711426, + "learning_rate": 5.069233279305349e-05, + "loss": 0.8737, "step": 205600 }, { - "epoch": 2.1, - "learning_rate": 5.827548057239693e-05, - "loss": 0.866, + "epoch": 2.8340359868838005, + "grad_norm": 10.599855422973633, + "learning_rate": 5.068802468787134e-05, + "loss": 0.9742, "step": 205700 }, { - "epoch": 2.1, - "learning_rate": 5.827326319145038e-05, - "loss": 0.8884, + "epoch": 2.8354137389435397, + "grad_norm": 8.889613151550293, + "learning_rate": 5.068371461274787e-05, + "loss": 0.9359, "step": 205800 }, { - "epoch": 2.1, - "learning_rate": 5.827104442810999e-05, - "loss": 0.8648, + "epoch": 2.836791491003279, + "grad_norm": 19.9876651763916, + "learning_rate": 5.067940256804923e-05, + "loss": 0.8721, "step": 205900 }, { - "epoch": 2.1, - "learning_rate": 5.8268824282484233e-05, - "loss": 0.8546, + "epoch": 2.8381692430630183, + "grad_norm": 3.8612754344940186, + "learning_rate": 5.067513170402722e-05, + "loss": 0.9702, "step": 206000 }, { - "epoch": 2.1, - "learning_rate": 5.826660275468167e-05, - "loss": 0.9017, + "epoch": 2.839546995122758, + "grad_norm": 10.275188446044922, + "learning_rate": 5.067081574096402e-05, + "loss": 1.1416, "step": 206100 }, { - "epoch": 2.1, - "learning_rate": 5.826437984481092e-05, - "loss": 1.0009, + "epoch": 2.840924747182497, + "grad_norm": 5.825127124786377, + "learning_rate": 5.066649780942148e-05, + "loss": 0.8829, "step": 206200 }, { - "epoch": 2.1, - "learning_rate": 5.826215555298068e-05, - "loss": 1.0987, + "epoch": 2.842302499242236, + "grad_norm": 14.092430114746094, + "learning_rate": 5.066217790976644e-05, + "loss": 1.0462, "step": 206300 }, { - "epoch": 2.1, - "learning_rate": 5.825992987929968e-05, - "loss": 0.8823, + "epoch": 2.8436802513019757, + "grad_norm": 10.614217758178711, + "learning_rate": 5.0657856042365886e-05, + "loss": 0.9353, "step": 206400 }, { - "epoch": 2.1, - "learning_rate": 5.825770282387675e-05, - "loss": 0.9849, + "epoch": 2.845058003361715, + "grad_norm": 17.03887176513672, + "learning_rate": 5.065353220758699e-05, + "loss": 0.9406, "step": 206500 }, { - "epoch": 2.1, - "learning_rate": 5.82554743868208e-05, - "loss": 0.8931, + "epoch": 2.8464357554214543, + "grad_norm": 82.23941802978516, + "learning_rate": 5.064920640579707e-05, + "loss": 0.8721, "step": 206600 }, { - "epoch": 2.11, - "learning_rate": 5.825324456824077e-05, - "loss": 0.9509, + "epoch": 2.8478135074811934, + "grad_norm": 12.375205039978027, + "learning_rate": 5.064487863736364e-05, + "loss": 0.8346, "step": 206700 }, { - "epoch": 2.11, - "learning_rate": 5.8251013368245674e-05, - "loss": 0.9464, + "epoch": 2.849191259540933, + "grad_norm": 16.048620223999023, + "learning_rate": 5.064054890265437e-05, + "loss": 0.9715, "step": 206800 }, { - "epoch": 2.11, - "learning_rate": 5.824878078694463e-05, - "loss": 0.9809, + "epoch": 2.8505690116006726, + "grad_norm": 16.478927612304688, + "learning_rate": 5.063621720203708e-05, + "loss": 1.0036, "step": 206900 }, { - "epoch": 2.11, - "learning_rate": 5.824654682444678e-05, - "loss": 0.9513, + "epoch": 2.8519467636604117, + "grad_norm": 16.252845764160156, + "learning_rate": 5.063188353587978e-05, + "loss": 0.8991, "step": 207000 }, { - "epoch": 2.11, - "learning_rate": 5.824431148086136e-05, - "loss": 0.9161, + "epoch": 2.853324515720151, + "grad_norm": 15.043439865112305, + "learning_rate": 5.062754790455062e-05, + "loss": 0.9867, "step": 207100 }, { - "epoch": 2.11, - "learning_rate": 5.824207475629765e-05, - "loss": 0.8685, + "epoch": 2.8547022677798903, + "grad_norm": 11.19311237335205, + "learning_rate": 5.0623210308417944e-05, + "loss": 0.9143, "step": 207200 }, { - "epoch": 2.11, - "learning_rate": 5.8239836650865046e-05, - "loss": 0.9233, + "epoch": 2.85608001983963, + "grad_norm": 11.718269348144531, + "learning_rate": 5.061887074785026e-05, + "loss": 0.9183, "step": 207300 }, { - "epoch": 2.11, - "learning_rate": 5.823759716467294e-05, - "loss": 0.9542, + "epoch": 2.857457771899369, + "grad_norm": 18.80988883972168, + "learning_rate": 5.0614529223216235e-05, + "loss": 0.9199, "step": 207400 }, { - "epoch": 2.11, - "learning_rate": 5.823535629783085e-05, - "loss": 0.9409, + "epoch": 2.858835523959108, + "grad_norm": 36.35650634765625, + "learning_rate": 5.06101857348847e-05, + "loss": 0.998, "step": 207500 }, { - "epoch": 2.12, - "learning_rate": 5.823311405044834e-05, - "loss": 0.9343, + "epoch": 2.8602132760188477, + "grad_norm": 46.12152099609375, + "learning_rate": 5.060584028322465e-05, + "loss": 1.0043, "step": 207600 }, { - "epoch": 2.12, - "learning_rate": 5.8230870422635034e-05, - "loss": 1.0041, + "epoch": 2.8615910280785872, + "grad_norm": 6.979639053344727, + "learning_rate": 5.0601492868605255e-05, + "loss": 0.9806, "step": 207700 }, { - "epoch": 2.12, - "learning_rate": 5.822862541450064e-05, - "loss": 0.9272, + "epoch": 2.8629687801383263, + "grad_norm": 827.8052978515625, + "learning_rate": 5.0597143491395866e-05, + "loss": 1.0127, "step": 207800 }, { - "epoch": 2.12, - "learning_rate": 5.8226379026154925e-05, - "loss": 1.2049, + "epoch": 2.8643465321980655, + "grad_norm": 62.50857925415039, + "learning_rate": 5.059279215196597e-05, + "loss": 0.7998, "step": 207900 }, { - "epoch": 2.12, - "learning_rate": 5.822413125770772e-05, - "loss": 1.0693, + "epoch": 2.865724284257805, + "grad_norm": 41.70628356933594, + "learning_rate": 5.0588438850685234e-05, + "loss": 0.9539, "step": 208000 }, { - "epoch": 2.12, - "learning_rate": 5.822188210926893e-05, - "loss": 0.9966, + "epoch": 2.8671020363175446, + "grad_norm": 59.98887252807617, + "learning_rate": 5.05840835879235e-05, + "loss": 0.9108, "step": 208100 }, { - "epoch": 2.12, - "learning_rate": 5.821963158094852e-05, - "loss": 1.006, + "epoch": 2.8684797883772837, + "grad_norm": 8.437341690063477, + "learning_rate": 5.057972636405077e-05, + "loss": 0.8788, "step": 208200 }, { - "epoch": 2.12, - "learning_rate": 5.821737967285654e-05, - "loss": 0.9542, + "epoch": 2.869857540437023, + "grad_norm": 9.392892837524414, + "learning_rate": 5.057536717943721e-05, + "loss": 0.8857, "step": 208300 }, { - "epoch": 2.12, - "learning_rate": 5.8215126385103084e-05, - "loss": 0.9005, + "epoch": 2.8712352924967623, + "grad_norm": 6.860851764678955, + "learning_rate": 5.057100603445315e-05, + "loss": 1.0502, "step": 208400 }, { - "epoch": 2.12, - "learning_rate": 5.821287171779832e-05, - "loss": 0.9195, + "epoch": 2.8726130445565015, + "grad_norm": 34.85822296142578, + "learning_rate": 5.056664292946911e-05, + "loss": 0.9415, "step": 208500 }, { - "epoch": 2.13, - "learning_rate": 5.82106156710525e-05, - "loss": 1.0223, + "epoch": 2.873990796616241, + "grad_norm": 12.538309097290039, + "learning_rate": 5.0562277864855726e-05, + "loss": 0.9393, "step": 208600 }, { - "epoch": 2.13, - "learning_rate": 5.820835824497592e-05, - "loss": 0.9419, + "epoch": 2.87536854867598, + "grad_norm": 7.2170090675354, + "learning_rate": 5.0557910840983865e-05, + "loss": 1.0151, "step": 208700 }, { - "epoch": 2.13, - "learning_rate": 5.820609943967897e-05, - "loss": 0.9852, + "epoch": 2.8767463007357197, + "grad_norm": 181.87399291992188, + "learning_rate": 5.0553541858224504e-05, + "loss": 0.956, "step": 208800 }, { - "epoch": 2.13, - "learning_rate": 5.820383925527208e-05, - "loss": 0.9725, + "epoch": 2.878124052795459, + "grad_norm": 57.523582458496094, + "learning_rate": 5.0549170916948834e-05, + "loss": 0.9321, "step": 208900 }, { - "epoch": 2.13, - "learning_rate": 5.820157769186577e-05, - "loss": 0.9984, + "epoch": 2.8795018048551984, + "grad_norm": 19.816213607788086, + "learning_rate": 5.054479801752817e-05, + "loss": 0.9569, "step": 209000 }, { - "epoch": 2.13, - "learning_rate": 5.819931474957059e-05, - "loss": 0.9217, + "epoch": 2.8808795569149375, + "grad_norm": 5.597317218780518, + "learning_rate": 5.054042316033402e-05, + "loss": 0.9008, "step": 209100 }, { - "epoch": 2.13, - "learning_rate": 5.819705042849721e-05, - "loss": 0.9432, + "epoch": 2.882257308974677, + "grad_norm": 9.473077774047852, + "learning_rate": 5.0536046345738044e-05, + "loss": 1.0578, "step": 209200 }, { - "epoch": 2.13, - "learning_rate": 5.819478472875633e-05, - "loss": 0.9681, + "epoch": 2.883635061034416, + "grad_norm": 63.83018112182617, + "learning_rate": 5.053166757411207e-05, + "loss": 0.9711, "step": 209300 }, { - "epoch": 2.13, - "learning_rate": 5.8192517650458754e-05, - "loss": 0.9399, + "epoch": 2.8850128130941557, + "grad_norm": 7.8863043785095215, + "learning_rate": 5.052728684582813e-05, + "loss": 1.0625, "step": 209400 }, { - "epoch": 2.13, - "learning_rate": 5.8190249193715295e-05, - "loss": 0.9766, + "epoch": 2.886390565153895, + "grad_norm": 62.159629821777344, + "learning_rate": 5.052290416125835e-05, + "loss": 0.9356, "step": 209500 }, { - "epoch": 2.14, - "learning_rate": 5.818797935863689e-05, - "loss": 0.9467, + "epoch": 2.8877683172136344, + "grad_norm": 10.068209648132324, + "learning_rate": 5.051851952077508e-05, + "loss": 0.9454, "step": 209600 }, { - "epoch": 2.14, - "learning_rate": 5.81857081453345e-05, - "loss": 0.8585, + "epoch": 2.8891460692733735, + "grad_norm": 10.165882110595703, + "learning_rate": 5.0514132924750814e-05, + "loss": 0.9771, "step": 209700 }, { - "epoch": 2.14, - "learning_rate": 5.818343555391919e-05, - "loss": 1.061, + "epoch": 2.890523821333113, + "grad_norm": 7.802192211151123, + "learning_rate": 5.050974437355822e-05, + "loss": 0.9354, "step": 209800 }, { - "epoch": 2.14, - "learning_rate": 5.818116158450207e-05, - "loss": 1.0029, + "epoch": 2.891901573392852, + "grad_norm": 14.152220726013184, + "learning_rate": 5.0505353867570126e-05, + "loss": 0.9706, "step": 209900 }, { - "epoch": 2.14, - "learning_rate": 5.8178886237194326e-05, - "loss": 0.9429, + "epoch": 2.8932793254525917, + "grad_norm": 28.09430503845215, + "learning_rate": 5.050096140715952e-05, + "loss": 0.9906, "step": 210000 }, { - "epoch": 2.14, - "learning_rate": 5.8176609512107215e-05, - "loss": 0.9674, + "epoch": 2.894657077512331, + "grad_norm": 22.049821853637695, + "learning_rate": 5.049656699269957e-05, + "loss": 0.9114, "step": 210100 }, { - "epoch": 2.14, - "learning_rate": 5.817433140935204e-05, - "loss": 0.8936, + "epoch": 2.8960348295720704, + "grad_norm": 5.832665920257568, + "learning_rate": 5.049217062456361e-05, + "loss": 0.8558, "step": 210200 }, { - "epoch": 2.14, - "learning_rate": 5.817205192904019e-05, - "loss": 0.831, + "epoch": 2.8974125816318095, + "grad_norm": 102.17266082763672, + "learning_rate": 5.048781629600713e-05, + "loss": 0.8317, "step": 210300 }, { - "epoch": 2.14, - "learning_rate": 5.816977107128313e-05, - "loss": 0.9999, + "epoch": 2.898790333691549, + "grad_norm": 21.945758819580078, + "learning_rate": 5.0483416041167227e-05, + "loss": 0.9463, "step": 210400 }, { - "epoch": 2.14, - "learning_rate": 5.816748883619238e-05, - "loss": 0.9154, + "epoch": 2.900168085751288, + "grad_norm": 5.362823963165283, + "learning_rate": 5.047901383376854e-05, + "loss": 0.9946, "step": 210500 }, { - "epoch": 2.15, - "learning_rate": 5.81652052238795e-05, - "loss": 0.8917, + "epoch": 2.9015458378110273, + "grad_norm": 19.6251163482666, + "learning_rate": 5.047460967418507e-05, + "loss": 0.929, "step": 210600 }, { - "epoch": 2.15, - "learning_rate": 5.8162920234456185e-05, - "loss": 0.921, + "epoch": 2.902923589870767, + "grad_norm": 2.2251875400543213, + "learning_rate": 5.047020356279097e-05, + "loss": 0.8858, "step": 210700 }, { - "epoch": 2.15, - "learning_rate": 5.816063386803413e-05, - "loss": 0.9357, + "epoch": 2.9043013419305064, + "grad_norm": 4.737329959869385, + "learning_rate": 5.0465795499960574e-05, + "loss": 0.9394, "step": 210800 }, { - "epoch": 2.15, - "learning_rate": 5.8158346124725135e-05, - "loss": 0.925, + "epoch": 2.9056790939902455, + "grad_norm": 7.203439712524414, + "learning_rate": 5.046138548606834e-05, + "loss": 0.9765, "step": 210900 }, { - "epoch": 2.15, - "learning_rate": 5.8156057004641056e-05, - "loss": 0.9148, + "epoch": 2.9070568460499846, + "grad_norm": 13.028100967407227, + "learning_rate": 5.0456973521488954e-05, + "loss": 0.9495, "step": 211000 }, { - "epoch": 2.15, - "learning_rate": 5.815376650789381e-05, - "loss": 0.866, + "epoch": 2.908434598109724, + "grad_norm": 109.66581726074219, + "learning_rate": 5.045255960659722e-05, + "loss": 0.8785, "step": 211100 }, { - "epoch": 2.15, - "learning_rate": 5.81514746345954e-05, - "loss": 0.8945, + "epoch": 2.9098123501694637, + "grad_norm": 33.24901580810547, + "learning_rate": 5.044814374176812e-05, + "loss": 0.9389, "step": 211200 }, { - "epoch": 2.15, - "learning_rate": 5.814918138485788e-05, - "loss": 0.9136, + "epoch": 2.911190102229203, + "grad_norm": 176.90284729003906, + "learning_rate": 5.0443725927376795e-05, + "loss": 0.8288, "step": 211300 }, { - "epoch": 2.15, - "learning_rate": 5.814688675879338e-05, - "loss": 0.7931, + "epoch": 2.912567854288942, + "grad_norm": 12.923941612243652, + "learning_rate": 5.043930616379859e-05, + "loss": 0.9319, "step": 211400 }, { - "epoch": 2.15, - "learning_rate": 5.814459075651407e-05, - "loss": 0.926, + "epoch": 2.9139456063486815, + "grad_norm": 20.760425567626953, + "learning_rate": 5.043492867817825e-05, + "loss": 0.9642, "step": 211500 }, { - "epoch": 2.16, - "learning_rate": 5.8142293378132247e-05, - "loss": 0.9469, + "epoch": 2.915323358408421, + "grad_norm": 11.829825401306152, + "learning_rate": 5.043050503683535e-05, + "loss": 1.0146, "step": 211600 }, { - "epoch": 2.16, - "learning_rate": 5.813999462376022e-05, - "loss": 0.9466, + "epoch": 2.91670111046816, + "grad_norm": 20.66179847717285, + "learning_rate": 5.042607944742873e-05, + "loss": 0.9887, "step": 211700 }, { - "epoch": 2.16, - "learning_rate": 5.813769449351038e-05, - "loss": 0.8628, + "epoch": 2.9180788625278993, + "grad_norm": 8.777021408081055, + "learning_rate": 5.042165191033439e-05, + "loss": 0.9347, "step": 211800 }, { - "epoch": 2.16, - "learning_rate": 5.81353929874952e-05, - "loss": 1.0375, + "epoch": 2.919456614587639, + "grad_norm": 3.8520824909210205, + "learning_rate": 5.041722242592844e-05, + "loss": 0.9659, "step": 211900 }, { - "epoch": 2.16, - "learning_rate": 5.813309010582721e-05, - "loss": 0.9182, + "epoch": 2.9208343666473784, + "grad_norm": 8.009589195251465, + "learning_rate": 5.0412790994587204e-05, + "loss": 0.9064, "step": 212000 }, { - "epoch": 2.16, - "learning_rate": 5.8130785848619e-05, - "loss": 0.9331, + "epoch": 2.9222121187071175, + "grad_norm": 6.041582107543945, + "learning_rate": 5.040835761668715e-05, + "loss": 0.8501, "step": 212100 }, { - "epoch": 2.16, - "learning_rate": 5.812848021598324e-05, - "loss": 0.9792, + "epoch": 2.9235898707668566, + "grad_norm": 12.624541282653809, + "learning_rate": 5.040392229260492e-05, + "loss": 0.8915, "step": 212200 }, { - "epoch": 2.16, - "learning_rate": 5.812617320803266e-05, - "loss": 0.9162, + "epoch": 2.924967622826596, + "grad_norm": 5.336921691894531, + "learning_rate": 5.0399485022717325e-05, + "loss": 0.8739, "step": 212300 }, { - "epoch": 2.16, - "learning_rate": 5.812386482488006e-05, - "loss": 0.9305, + "epoch": 2.9263453748863357, + "grad_norm": 20.81460189819336, + "learning_rate": 5.039504580740131e-05, + "loss": 0.9489, "step": 212400 }, { - "epoch": 2.16, - "learning_rate": 5.81215550666383e-05, - "loss": 0.8893, + "epoch": 2.927723126946075, + "grad_norm": 12.097780227661133, + "learning_rate": 5.039060464703403e-05, + "loss": 0.8683, "step": 212500 }, { - "epoch": 2.17, - "learning_rate": 5.811924393342032e-05, - "loss": 0.9299, + "epoch": 2.929100879005814, + "grad_norm": 15.456731796264648, + "learning_rate": 5.038616154199278e-05, + "loss": 1.0058, "step": 212600 }, { - "epoch": 2.17, - "learning_rate": 5.811693142533911e-05, - "loss": 0.9634, + "epoch": 2.9304786310655535, + "grad_norm": 6.603039264678955, + "learning_rate": 5.038171649265502e-05, + "loss": 0.9007, "step": 212700 }, { - "epoch": 2.17, - "learning_rate": 5.811461754250775e-05, - "loss": 0.9035, + "epoch": 2.9318563831252926, + "grad_norm": 12.64572525024414, + "learning_rate": 5.037726949939838e-05, + "loss": 1.0257, "step": 212800 }, { - "epoch": 2.17, - "learning_rate": 5.811230228503938e-05, - "loss": 0.8609, + "epoch": 2.933234135185032, + "grad_norm": 13.877869606018066, + "learning_rate": 5.0372820562600654e-05, + "loss": 0.9532, "step": 212900 }, { - "epoch": 2.17, - "learning_rate": 5.810998565304719e-05, - "loss": 0.9838, + "epoch": 2.9346118872447713, + "grad_norm": 7.404748916625977, + "learning_rate": 5.036836968263981e-05, + "loss": 0.8473, "step": 213000 }, { - "epoch": 2.17, - "learning_rate": 5.8107667646644456e-05, - "loss": 0.9323, + "epoch": 2.935989639304511, + "grad_norm": 9.050270080566406, + "learning_rate": 5.036391685989397e-05, + "loss": 0.8868, "step": 213100 }, { - "epoch": 2.17, - "learning_rate": 5.81053482659445e-05, - "loss": 0.9657, + "epoch": 2.93736739136425, + "grad_norm": 6.573676586151123, + "learning_rate": 5.0359462094741415e-05, + "loss": 0.8853, "step": 213200 }, { - "epoch": 2.17, - "learning_rate": 5.8103027511060754e-05, - "loss": 0.8661, + "epoch": 2.9387451434239895, + "grad_norm": 8.138660430908203, + "learning_rate": 5.035500538756061e-05, + "loss": 1.056, "step": 213300 }, { - "epoch": 2.17, - "learning_rate": 5.810070538210666e-05, - "loss": 0.8446, + "epoch": 2.9401228954837286, + "grad_norm": 3.7192952632904053, + "learning_rate": 5.0350546738730174e-05, + "loss": 0.8238, "step": 213400 }, { - "epoch": 2.18, - "learning_rate": 5.809840512102559e-05, - "loss": 0.8983, + "epoch": 2.941500647543468, + "grad_norm": 145.99395751953125, + "learning_rate": 5.034608614862889e-05, + "loss": 0.9578, "step": 213500 }, { - "epoch": 2.18, - "learning_rate": 5.8096080258009386e-05, - "loss": 1.0379, + "epoch": 2.9428783996032073, + "grad_norm": 5.892977714538574, + "learning_rate": 5.0341623617635706e-05, + "loss": 0.8992, "step": 213600 }, { - "epoch": 2.18, - "learning_rate": 5.809375402126253e-05, - "loss": 0.8791, + "epoch": 2.944256151662947, + "grad_norm": 9.625137329101562, + "learning_rate": 5.0337159146129735e-05, + "loss": 0.898, "step": 213700 }, { - "epoch": 2.18, - "learning_rate": 5.8091449693801406e-05, - "loss": 1.0102, + "epoch": 2.945633903722686, + "grad_norm": 4.5275983810424805, + "learning_rate": 5.0332692734490266e-05, + "loss": 0.8915, "step": 213800 }, { - "epoch": 2.18, - "learning_rate": 5.808912072366899e-05, - "loss": 1.002, + "epoch": 2.9470116557824255, + "grad_norm": 6.736204147338867, + "learning_rate": 5.032822438309673e-05, + "loss": 0.8381, "step": 213900 }, { - "epoch": 2.18, - "learning_rate": 5.8086790380146185e-05, - "loss": 0.9033, + "epoch": 2.9483894078421646, + "grad_norm": 9.65224838256836, + "learning_rate": 5.0323754092328755e-05, + "loss": 0.9175, "step": 214000 }, { - "epoch": 2.18, - "learning_rate": 5.808445866334694e-05, - "loss": 0.8961, + "epoch": 2.949767159901904, + "grad_norm": 9.735228538513184, + "learning_rate": 5.03192818625661e-05, + "loss": 0.8673, "step": 214100 }, { - "epoch": 2.18, - "learning_rate": 5.808212557338528e-05, - "loss": 0.8074, + "epoch": 2.9511449119616433, + "grad_norm": 11.951399803161621, + "learning_rate": 5.03148076941887e-05, + "loss": 0.8943, "step": 214200 }, { - "epoch": 2.18, - "learning_rate": 5.807979111037526e-05, - "loss": 0.9961, + "epoch": 2.952522664021383, + "grad_norm": 20.874267578125, + "learning_rate": 5.0310331587576676e-05, + "loss": 0.868, "step": 214300 }, { - "epoch": 2.18, - "learning_rate": 5.807745527443102e-05, - "loss": 0.9688, + "epoch": 2.953900416081122, + "grad_norm": 11.858498573303223, + "learning_rate": 5.030585354311028e-05, + "loss": 0.824, "step": 214400 }, { - "epoch": 2.19, - "learning_rate": 5.807511806566678e-05, - "loss": 0.8836, + "epoch": 2.9552781681408615, + "grad_norm": 22.140026092529297, + "learning_rate": 5.0301373561169965e-05, + "loss": 0.8791, "step": 214500 }, { - "epoch": 2.19, - "learning_rate": 5.807277948419681e-05, - "loss": 0.8424, + "epoch": 2.9566559202006006, + "grad_norm": 2.468700647354126, + "learning_rate": 5.0296891642136306e-05, + "loss": 0.9103, "step": 214600 }, { - "epoch": 2.19, - "learning_rate": 5.807043953013545e-05, - "loss": 0.8625, + "epoch": 2.95803367226034, + "grad_norm": 11.627277374267578, + "learning_rate": 5.0292407786390076e-05, + "loss": 0.9136, "step": 214700 }, { - "epoch": 2.19, - "learning_rate": 5.806809820359712e-05, - "loss": 0.9227, + "epoch": 2.9594114243200793, + "grad_norm": 9.476028442382812, + "learning_rate": 5.028792199431219e-05, + "loss": 0.9554, "step": 214800 }, { - "epoch": 2.19, - "learning_rate": 5.806575550469628e-05, - "loss": 0.9361, + "epoch": 2.9607891763798184, + "grad_norm": 13.155414581298828, + "learning_rate": 5.028343426628377e-05, + "loss": 0.8592, "step": 214900 }, { - "epoch": 2.19, - "learning_rate": 5.8063411433547486e-05, - "loss": 0.94, + "epoch": 2.962166928439558, + "grad_norm": 17.985248565673828, + "learning_rate": 5.027894460268603e-05, + "loss": 0.9618, "step": 215000 }, { - "epoch": 2.19, - "learning_rate": 5.806106599026535e-05, - "loss": 0.8761, + "epoch": 2.9635446804992975, + "grad_norm": 10.667896270751953, + "learning_rate": 5.027445300390041e-05, + "loss": 0.9412, "step": 215100 }, { - "epoch": 2.19, - "learning_rate": 5.805871917496454e-05, - "loss": 0.9426, + "epoch": 2.9649224325590366, + "grad_norm": 18.606653213500977, + "learning_rate": 5.02699594703085e-05, + "loss": 0.8715, "step": 215200 }, { - "epoch": 2.19, - "learning_rate": 5.8056370987759814e-05, - "loss": 0.8284, + "epoch": 2.9663001846187758, + "grad_norm": 2.976524829864502, + "learning_rate": 5.026546400229204e-05, + "loss": 0.874, "step": 215300 }, { - "epoch": 2.19, - "learning_rate": 5.805402142876598e-05, - "loss": 0.8186, + "epoch": 2.9676779366785153, + "grad_norm": 4.229238033294678, + "learning_rate": 5.0260966600232956e-05, + "loss": 0.8868, "step": 215400 }, { - "epoch": 2.2, - "learning_rate": 5.805167049809791e-05, - "loss": 0.9372, + "epoch": 2.969055688738255, + "grad_norm": 3.0284242630004883, + "learning_rate": 5.0256467264513304e-05, + "loss": 0.797, "step": 215500 }, { - "epoch": 2.2, - "learning_rate": 5.8049318195870564e-05, - "loss": 0.8143, + "epoch": 2.970433440797994, + "grad_norm": 8.253076553344727, + "learning_rate": 5.025196599551534e-05, + "loss": 0.7905, "step": 215600 }, { - "epoch": 2.2, - "learning_rate": 5.804696452219894e-05, - "loss": 0.8323, + "epoch": 2.971811192857733, + "grad_norm": 5.734689712524414, + "learning_rate": 5.024746279362146e-05, + "loss": 0.8638, "step": 215700 }, { - "epoch": 2.2, - "learning_rate": 5.804460947719813e-05, - "loss": 0.8944, + "epoch": 2.9731889449174727, + "grad_norm": 10.03180980682373, + "learning_rate": 5.024295765921424e-05, + "loss": 0.9138, "step": 215800 }, { - "epoch": 2.2, - "learning_rate": 5.804225306098328e-05, - "loss": 0.9302, + "epoch": 2.974566696977212, + "grad_norm": 4.002693176269531, + "learning_rate": 5.0238450592676434e-05, + "loss": 0.8836, "step": 215900 }, { - "epoch": 2.2, - "learning_rate": 5.8039895273669595e-05, - "loss": 1.0083, + "epoch": 2.9759444490369513, + "grad_norm": 40.967979431152344, + "learning_rate": 5.02339415943909e-05, + "loss": 0.7907, "step": 216000 }, { - "epoch": 2.2, - "learning_rate": 5.803753611537237e-05, - "loss": 0.9126, + "epoch": 2.9773222010966904, + "grad_norm": 14.720328330993652, + "learning_rate": 5.0229430664740736e-05, + "loss": 0.8197, "step": 216100 }, { - "epoch": 2.2, - "learning_rate": 5.803517558620695e-05, - "loss": 0.9005, + "epoch": 2.97869995315643, + "grad_norm": 3.3215177059173584, + "learning_rate": 5.022491780410915e-05, + "loss": 0.9149, "step": 216200 }, { - "epoch": 2.2, - "learning_rate": 5.803281368628875e-05, - "loss": 1.0283, + "epoch": 2.9800777052161695, + "grad_norm": 6.096555709838867, + "learning_rate": 5.022040301287953e-05, + "loss": 0.768, "step": 216300 }, { - "epoch": 2.2, - "learning_rate": 5.8030450415733246e-05, - "loss": 0.9941, + "epoch": 2.9814554572759087, + "grad_norm": 5.482794761657715, + "learning_rate": 5.0215886291435445e-05, + "loss": 0.8891, "step": 216400 }, { - "epoch": 2.21, - "learning_rate": 5.8028085774655994e-05, - "loss": 0.9545, + "epoch": 2.9828332093356478, + "grad_norm": 10.852657318115234, + "learning_rate": 5.02113676401606e-05, + "loss": 0.9312, "step": 216500 }, { - "epoch": 2.21, - "learning_rate": 5.8025719763172614e-05, - "loss": 0.9325, + "epoch": 2.9842109613953873, + "grad_norm": 4.301094055175781, + "learning_rate": 5.020684705943889e-05, + "loss": 0.9136, "step": 216600 }, { - "epoch": 2.21, - "learning_rate": 5.802335238139879e-05, - "loss": 0.8211, + "epoch": 2.985588713455127, + "grad_norm": 5.961721420288086, + "learning_rate": 5.020232454965435e-05, + "loss": 0.8515, "step": 216700 }, { - "epoch": 2.21, - "learning_rate": 5.802098362945026e-05, - "loss": 0.8575, + "epoch": 2.986966465514866, + "grad_norm": 9.368277549743652, + "learning_rate": 5.019780011119119e-05, + "loss": 0.961, "step": 216800 }, { - "epoch": 2.21, - "learning_rate": 5.801861350744285e-05, - "loss": 0.9321, + "epoch": 2.988344217574605, + "grad_norm": 5.15329122543335, + "learning_rate": 5.019327374443379e-05, + "loss": 0.7407, "step": 216900 }, { - "epoch": 2.21, - "learning_rate": 5.801624201549245e-05, - "loss": 0.9305, + "epoch": 2.9897219696343447, + "grad_norm": 13.162623405456543, + "learning_rate": 5.01887454497667e-05, + "loss": 0.8083, "step": 217000 }, { - "epoch": 2.21, - "learning_rate": 5.801386915371501e-05, - "loss": 0.8685, + "epoch": 2.991099721694084, + "grad_norm": 7.091092109680176, + "learning_rate": 5.01842152275746e-05, + "loss": 0.9746, "step": 217100 }, { - "epoch": 2.21, - "learning_rate": 5.801149492222655e-05, - "loss": 1.053, + "epoch": 2.9924774737538233, + "grad_norm": 6.582491874694824, + "learning_rate": 5.017968307824236e-05, + "loss": 0.9766, "step": 217200 }, { - "epoch": 2.21, - "learning_rate": 5.800911932114315e-05, - "loss": 0.8439, + "epoch": 2.9938552258135624, + "grad_norm": 6.723458290100098, + "learning_rate": 5.017514900215502e-05, + "loss": 0.9296, "step": 217300 }, { - "epoch": 2.21, - "learning_rate": 5.800674235058096e-05, - "loss": 0.8787, + "epoch": 2.995232977873302, + "grad_norm": 3.6613121032714844, + "learning_rate": 5.017061299969777e-05, + "loss": 0.7399, "step": 217400 }, { - "epoch": 2.22, - "learning_rate": 5.800436401065621e-05, - "loss": 0.8795, + "epoch": 2.996610729933041, + "grad_norm": 6.171582221984863, + "learning_rate": 5.016607507125596e-05, + "loss": 0.8053, "step": 217500 }, { - "epoch": 2.22, - "learning_rate": 5.8002008105354276e-05, - "loss": 0.8945, + "epoch": 2.9979884819927807, + "grad_norm": 6.335168361663818, + "learning_rate": 5.016153521721512e-05, + "loss": 0.7905, "step": 217600 }, { - "epoch": 2.22, - "learning_rate": 5.799962704074404e-05, - "loss": 0.9754, + "epoch": 2.99936623405252, + "grad_norm": 11.291169166564941, + "learning_rate": 5.0156993437960915e-05, + "loss": 0.8497, "step": 217700 }, { - "epoch": 2.22, - "learning_rate": 5.799724460711913e-05, - "loss": 0.8805, + "epoch": 3.0007439861122593, + "grad_norm": 9.230731964111328, + "learning_rate": 5.015244973387922e-05, + "loss": 0.8362, "step": 217800 }, { - "epoch": 2.22, - "learning_rate": 5.7994860804596054e-05, - "loss": 0.9783, + "epoch": 3.0021217381719985, + "grad_norm": 10.072036743164062, + "learning_rate": 5.0147904105356024e-05, + "loss": 0.6933, "step": 217900 }, { - "epoch": 2.22, - "learning_rate": 5.7992475633291334e-05, - "loss": 0.9421, + "epoch": 3.003499490231738, + "grad_norm": 4.5025634765625, + "learning_rate": 5.014335655277751e-05, + "loss": 0.7591, "step": 218000 }, { - "epoch": 2.22, - "learning_rate": 5.799008909332161e-05, - "loss": 0.8175, + "epoch": 3.004877242291477, + "grad_norm": 11.540807723999023, + "learning_rate": 5.013880707653001e-05, + "loss": 0.8106, "step": 218100 }, { - "epoch": 2.22, - "learning_rate": 5.798770118480357e-05, - "loss": 0.9645, + "epoch": 3.0062549943512167, + "grad_norm": 19.40630340576172, + "learning_rate": 5.013425567700003e-05, + "loss": 0.7955, "step": 218200 }, { - "epoch": 2.22, - "learning_rate": 5.7985311907853964e-05, - "loss": 0.8727, + "epoch": 3.007632746410956, + "grad_norm": 21.899789810180664, + "learning_rate": 5.0129702354574234e-05, + "loss": 0.7447, "step": 218300 }, { - "epoch": 2.23, - "learning_rate": 5.7982921262589606e-05, - "loss": 0.8778, + "epoch": 3.0090104984706953, + "grad_norm": 5.635985851287842, + "learning_rate": 5.012514710963945e-05, + "loss": 0.8361, "step": 218400 }, { - "epoch": 2.23, - "learning_rate": 5.79805292491274e-05, - "loss": 0.9507, + "epoch": 3.0103882505304345, + "grad_norm": 2.316899538040161, + "learning_rate": 5.0120589942582674e-05, + "loss": 0.6912, "step": 218500 }, { - "epoch": 2.23, - "learning_rate": 5.7978135867584286e-05, - "loss": 0.8838, + "epoch": 3.011766002590174, + "grad_norm": 4.447273254394531, + "learning_rate": 5.011603085379106e-05, + "loss": 0.709, "step": 218600 }, { - "epoch": 2.23, - "learning_rate": 5.7975741118077294e-05, - "loss": 0.8837, + "epoch": 3.013143754649913, + "grad_norm": 8.26756763458252, + "learning_rate": 5.011146984365191e-05, + "loss": 0.8466, "step": 218700 }, { - "epoch": 2.23, - "learning_rate": 5.7973345000723514e-05, - "loss": 0.9025, + "epoch": 3.0145215067096527, + "grad_norm": 2.827197313308716, + "learning_rate": 5.0106906912552724e-05, + "loss": 0.7533, "step": 218800 }, { - "epoch": 2.23, - "learning_rate": 5.7970947515640095e-05, - "loss": 0.9526, + "epoch": 3.015899258769392, + "grad_norm": 8.207708358764648, + "learning_rate": 5.010234206088114e-05, + "loss": 0.8498, "step": 218900 }, { - "epoch": 2.23, - "learning_rate": 5.796854866294427e-05, - "loss": 0.8226, + "epoch": 3.0172770108291314, + "grad_norm": 3.69254207611084, + "learning_rate": 5.009777528902496e-05, + "loss": 0.8627, "step": 219000 }, { - "epoch": 2.23, - "learning_rate": 5.796614844275332e-05, - "loss": 0.9796, + "epoch": 3.0186547628888705, + "grad_norm": 12.752203941345215, + "learning_rate": 5.009320659737217e-05, + "loss": 0.7736, "step": 219100 }, { - "epoch": 2.23, - "learning_rate": 5.796374685518461e-05, - "loss": 0.9351, + "epoch": 3.02003251494861, + "grad_norm": 10.074353218078613, + "learning_rate": 5.008863598631088e-05, + "loss": 0.7728, "step": 219200 }, { - "epoch": 2.23, - "learning_rate": 5.796134390035554e-05, - "loss": 0.9327, + "epoch": 3.021410267008349, + "grad_norm": 10.07119369506836, + "learning_rate": 5.008406345622941e-05, + "loss": 0.8828, "step": 219300 }, { - "epoch": 2.24, - "learning_rate": 5.7958939578383646e-05, - "loss": 0.85, + "epoch": 3.0227880190680887, + "grad_norm": 11.883198738098145, + "learning_rate": 5.00794890075162e-05, + "loss": 0.7723, "step": 219400 }, { - "epoch": 2.24, - "learning_rate": 5.7956533889386434e-05, - "loss": 0.865, + "epoch": 3.024165771127828, + "grad_norm": 3.8973376750946045, + "learning_rate": 5.0074912640559895e-05, + "loss": 0.7811, "step": 219500 }, { - "epoch": 2.24, - "learning_rate": 5.795412683348156e-05, - "loss": 0.9413, + "epoch": 3.0255435231875674, + "grad_norm": 15.14806842803955, + "learning_rate": 5.0070334355749264e-05, + "loss": 0.8236, "step": 219600 }, { - "epoch": 2.24, - "learning_rate": 5.7951718410786706e-05, - "loss": 0.8716, + "epoch": 3.0269212752473065, + "grad_norm": 4.638752460479736, + "learning_rate": 5.006575415347326e-05, + "loss": 0.7919, "step": 219700 }, { - "epoch": 2.24, - "learning_rate": 5.794930862141963e-05, - "loss": 0.9028, + "epoch": 3.028299027307046, + "grad_norm": 7.042424201965332, + "learning_rate": 5.0061172034121e-05, + "loss": 0.8563, "step": 219800 }, { - "epoch": 2.24, - "learning_rate": 5.794689746549815e-05, - "loss": 0.9877, + "epoch": 3.029676779366785, + "grad_norm": 5.502007007598877, + "learning_rate": 5.0056587998081746e-05, + "loss": 0.8177, "step": 219900 }, { - "epoch": 2.24, - "learning_rate": 5.794448494314017e-05, - "loss": 0.8103, + "epoch": 3.0310545314265247, + "grad_norm": 17.79983901977539, + "learning_rate": 5.005200204574495e-05, + "loss": 0.8254, "step": 220000 }, { - "epoch": 2.24, - "learning_rate": 5.794207105446362e-05, - "loss": 0.9408, + "epoch": 3.032432283486264, + "grad_norm": 43.76643753051758, + "learning_rate": 5.00474141775002e-05, + "loss": 0.897, "step": 220100 }, { - "epoch": 2.24, - "learning_rate": 5.793965579958657e-05, - "loss": 0.9233, + "epoch": 3.033810035546003, + "grad_norm": 4.057260990142822, + "learning_rate": 5.004282439373726e-05, + "loss": 0.8396, "step": 220200 }, { - "epoch": 2.24, - "learning_rate": 5.793723917862707e-05, - "loss": 0.8293, + "epoch": 3.0351877876057425, + "grad_norm": 10.954686164855957, + "learning_rate": 5.003823269484607e-05, + "loss": 0.7823, "step": 220300 }, { - "epoch": 2.25, - "learning_rate": 5.7934821191703306e-05, - "loss": 0.8348, + "epoch": 3.0365655396654816, + "grad_norm": 10.25699520111084, + "learning_rate": 5.00336390812167e-05, + "loss": 0.8326, "step": 220400 }, { - "epoch": 2.25, - "learning_rate": 5.793240183893349e-05, - "loss": 0.7893, + "epoch": 3.037943291725221, + "grad_norm": 6.70409631729126, + "learning_rate": 5.002908951799392e-05, + "loss": 0.8036, "step": 220500 }, { - "epoch": 2.25, - "learning_rate": 5.7929981120435905e-05, - "loss": 0.8719, + "epoch": 3.0393210437849603, + "grad_norm": 9.063064575195312, + "learning_rate": 5.0024492095196766e-05, + "loss": 0.782, "step": 220600 }, { - "epoch": 2.25, - "learning_rate": 5.792755903632893e-05, - "loss": 0.9576, + "epoch": 3.0406987958447, + "grad_norm": 2.866466522216797, + "learning_rate": 5.0019892758828774e-05, + "loss": 0.9106, "step": 220700 }, { - "epoch": 2.25, - "learning_rate": 5.792513558673098e-05, - "loss": 0.9176, + "epoch": 3.042076547904439, + "grad_norm": 26.142011642456055, + "learning_rate": 5.001529150928068e-05, + "loss": 0.7895, "step": 220800 }, { - "epoch": 2.25, - "learning_rate": 5.7922710771760545e-05, - "loss": 0.9514, + "epoch": 3.0434542999641785, + "grad_norm": 9.043352127075195, + "learning_rate": 5.0010688346943393e-05, + "loss": 0.7718, "step": 220900 }, { - "epoch": 2.25, - "learning_rate": 5.79202845915362e-05, - "loss": 1.0007, + "epoch": 3.0448320520239176, + "grad_norm": 23.54694175720215, + "learning_rate": 5.000608327220795e-05, + "loss": 0.7454, "step": 221000 }, { - "epoch": 2.25, - "learning_rate": 5.791785704617654e-05, - "loss": 0.9554, + "epoch": 3.046209804083657, + "grad_norm": 5.502631187438965, + "learning_rate": 5.000147628546561e-05, + "loss": 0.8136, "step": 221100 }, { - "epoch": 2.25, - "learning_rate": 5.791542813580028e-05, - "loss": 0.8683, + "epoch": 3.0475875561433963, + "grad_norm": 3.3556718826293945, + "learning_rate": 4.999686738710772e-05, + "loss": 0.7063, "step": 221200 }, { - "epoch": 2.25, - "learning_rate": 5.791299786052618e-05, - "loss": 0.8543, + "epoch": 3.048965308203136, + "grad_norm": 7.735686779022217, + "learning_rate": 4.999225657752587e-05, + "loss": 0.8155, "step": 221300 }, { - "epoch": 2.26, - "learning_rate": 5.791056622047306e-05, - "loss": 0.8579, + "epoch": 3.050343060262875, + "grad_norm": 8.710448265075684, + "learning_rate": 4.998764385711175e-05, + "loss": 0.8199, "step": 221400 }, { - "epoch": 2.26, - "learning_rate": 5.7908133215759816e-05, - "loss": 0.8674, + "epoch": 3.0517208123226145, + "grad_norm": 8.710021018981934, + "learning_rate": 4.9983029226257244e-05, + "loss": 0.7873, "step": 221500 }, { - "epoch": 2.26, - "learning_rate": 5.79056988465054e-05, - "loss": 0.9754, + "epoch": 3.0530985643823536, + "grad_norm": 2.5617220401763916, + "learning_rate": 4.9978412685354383e-05, + "loss": 0.8495, "step": 221600 }, { - "epoch": 2.26, - "learning_rate": 5.7903263112828845e-05, - "loss": 0.8512, + "epoch": 3.054476316442093, + "grad_norm": 12.85446834564209, + "learning_rate": 4.997379423479536e-05, + "loss": 0.7541, "step": 221700 }, { - "epoch": 2.26, - "learning_rate": 5.790082601484924e-05, - "loss": 0.8862, + "epoch": 3.0558540685018323, + "grad_norm": 10.130171775817871, + "learning_rate": 4.9969173874972534e-05, + "loss": 0.7851, "step": 221800 }, { - "epoch": 2.26, - "learning_rate": 5.789838755268575e-05, - "loss": 1.0098, + "epoch": 3.057231820561572, + "grad_norm": 4.199276924133301, + "learning_rate": 4.9964597838413016e-05, + "loss": 0.8581, "step": 221900 }, { - "epoch": 2.26, - "learning_rate": 5.78959721314716e-05, - "loss": 0.9144, + "epoch": 3.058609572621311, + "grad_norm": 9.939815521240234, + "learning_rate": 4.995997368032318e-05, + "loss": 0.7881, "step": 222000 }, { - "epoch": 2.26, - "learning_rate": 5.7893530954936934e-05, - "loss": 0.8751, + "epoch": 3.0599873246810505, + "grad_norm": 4.393372535705566, + "learning_rate": 4.995534761414367e-05, + "loss": 0.8115, "step": 222100 }, { - "epoch": 2.26, - "learning_rate": 5.789108841457508e-05, - "loss": 0.883, + "epoch": 3.0613650767407896, + "grad_norm": 4.2641215324401855, + "learning_rate": 4.9950719640267503e-05, + "loss": 0.7293, "step": 222200 }, { - "epoch": 2.26, - "learning_rate": 5.788864451050543e-05, - "loss": 1.0011, + "epoch": 3.062742828800529, + "grad_norm": 8.543421745300293, + "learning_rate": 4.994608975908785e-05, + "loss": 0.7358, "step": 222300 }, { - "epoch": 2.27, - "learning_rate": 5.7886199242847496e-05, - "loss": 1.0433, + "epoch": 3.0641205808602683, + "grad_norm": 26.361059188842773, + "learning_rate": 4.994145797099804e-05, + "loss": 0.8521, "step": 222400 }, { - "epoch": 2.27, - "learning_rate": 5.788375261172084e-05, - "loss": 0.8546, + "epoch": 3.065498332920008, + "grad_norm": 6.281571865081787, + "learning_rate": 4.993682427639156e-05, + "loss": 0.8118, "step": 222500 }, { - "epoch": 2.27, - "learning_rate": 5.788130461724508e-05, - "loss": 0.9071, + "epoch": 3.066876084979747, + "grad_norm": 18.130990982055664, + "learning_rate": 4.993218867566208e-05, + "loss": 0.78, "step": 222600 }, { - "epoch": 2.27, - "learning_rate": 5.787885525953992e-05, - "loss": 0.8967, + "epoch": 3.0682538370394865, + "grad_norm": 14.497140884399414, + "learning_rate": 4.99275511692034e-05, + "loss": 0.8034, "step": 222700 }, { - "epoch": 2.27, - "learning_rate": 5.78764045387251e-05, - "loss": 0.9155, + "epoch": 3.0696315890992256, + "grad_norm": 3.5376479625701904, + "learning_rate": 4.992291175740951e-05, + "loss": 0.8179, "step": 222800 }, { - "epoch": 2.27, - "learning_rate": 5.787395245492046e-05, - "loss": 0.9357, + "epoch": 3.071009341158965, + "grad_norm": 14.781637191772461, + "learning_rate": 4.991827044067455e-05, + "loss": 0.8105, "step": 222900 }, { - "epoch": 2.27, - "learning_rate": 5.7871499008245886e-05, - "loss": 0.8242, + "epoch": 3.0723870932187043, + "grad_norm": 7.163496494293213, + "learning_rate": 4.991362721939284e-05, + "loss": 0.7548, "step": 223000 }, { - "epoch": 2.27, - "learning_rate": 5.786904419882134e-05, - "loss": 0.913, + "epoch": 3.073764845278444, + "grad_norm": 2.9227442741394043, + "learning_rate": 4.9908982093958814e-05, + "loss": 0.7841, "step": 223100 }, { - "epoch": 2.27, - "learning_rate": 5.786658802676685e-05, - "loss": 1.0319, + "epoch": 3.075142597338183, + "grad_norm": 10.931370735168457, + "learning_rate": 4.9904335064767126e-05, + "loss": 0.7604, "step": 223200 }, { - "epoch": 2.28, - "learning_rate": 5.78641304922025e-05, - "loss": 0.8648, + "epoch": 3.0765203493979225, + "grad_norm": 11.45769214630127, + "learning_rate": 4.989968613221255e-05, + "loss": 0.8351, "step": 223300 }, { - "epoch": 2.28, - "learning_rate": 5.7861671595248456e-05, - "loss": 0.9501, + "epoch": 3.0778981014576616, + "grad_norm": 99.72901153564453, + "learning_rate": 4.989503529669004e-05, + "loss": 0.8157, "step": 223400 }, { - "epoch": 2.28, - "learning_rate": 5.7859211336024936e-05, - "loss": 0.8371, + "epoch": 3.079275853517401, + "grad_norm": 5.26624870300293, + "learning_rate": 4.9890382558594705e-05, + "loss": 0.8175, "step": 223500 }, { - "epoch": 2.28, - "learning_rate": 5.785674971465225e-05, - "loss": 0.9385, + "epoch": 3.0806536055771403, + "grad_norm": 16.809803009033203, + "learning_rate": 4.9885727918321825e-05, + "loss": 0.7716, "step": 223600 }, { - "epoch": 2.28, - "learning_rate": 5.785428673125073e-05, - "loss": 0.8396, + "epoch": 3.08203135763688, + "grad_norm": 10.616503715515137, + "learning_rate": 4.988107137626684e-05, + "loss": 0.8469, "step": 223700 }, { - "epoch": 2.28, - "learning_rate": 5.785182238594082e-05, - "loss": 0.8969, + "epoch": 3.083409109696619, + "grad_norm": 42.45695877075195, + "learning_rate": 4.9876412932825345e-05, + "loss": 0.7514, "step": 223800 }, { - "epoch": 2.28, - "learning_rate": 5.7849356678843e-05, - "loss": 0.9266, + "epoch": 3.0847868617563585, + "grad_norm": 13.717066764831543, + "learning_rate": 4.9871752588393085e-05, + "loss": 0.8221, "step": 223900 }, { - "epoch": 2.28, - "learning_rate": 5.7846914287505346e-05, - "loss": 0.9591, + "epoch": 3.0861646138160976, + "grad_norm": 40.778228759765625, + "learning_rate": 4.986709034336599e-05, + "loss": 0.7813, "step": 224000 }, { - "epoch": 2.28, - "learning_rate": 5.784444587080834e-05, - "loss": 0.8547, + "epoch": 3.087542365875837, + "grad_norm": 23.3128662109375, + "learning_rate": 4.9862426198140146e-05, + "loss": 0.6969, "step": 224100 }, { - "epoch": 2.28, - "learning_rate": 5.784197609268409e-05, - "loss": 0.8431, + "epoch": 3.0889201179355763, + "grad_norm": 6.894881248474121, + "learning_rate": 4.985776015311178e-05, + "loss": 0.7658, "step": 224200 }, { - "epoch": 2.29, - "learning_rate": 5.7839504953253364e-05, - "loss": 0.9919, + "epoch": 3.0902978699953154, + "grad_norm": 20.713045120239258, + "learning_rate": 4.985309220867732e-05, + "loss": 0.7492, "step": 224300 }, { - "epoch": 2.29, - "learning_rate": 5.783703245263698e-05, - "loss": 0.9913, + "epoch": 3.091675622055055, + "grad_norm": 9.274797439575195, + "learning_rate": 4.984842236523331e-05, + "loss": 0.8069, "step": 224400 }, { - "epoch": 2.29, - "learning_rate": 5.7834558590955834e-05, - "loss": 0.9112, + "epoch": 3.0930533741147945, + "grad_norm": 51.10271453857422, + "learning_rate": 4.9843750623176493e-05, + "loss": 0.7797, "step": 224500 }, { - "epoch": 2.29, - "learning_rate": 5.783208336833088e-05, - "loss": 0.9577, + "epoch": 3.0944311261745336, + "grad_norm": 3.907578468322754, + "learning_rate": 4.983907698290375e-05, + "loss": 0.7626, "step": 224600 }, { - "epoch": 2.29, - "learning_rate": 5.782960678488315e-05, - "loss": 0.9618, + "epoch": 3.0958088782342728, + "grad_norm": 10.407511711120605, + "learning_rate": 4.983440144481213e-05, + "loss": 0.7279, "step": 224700 }, { - "epoch": 2.29, - "learning_rate": 5.7827128840733716e-05, - "loss": 0.7787, + "epoch": 3.0971866302940123, + "grad_norm": 6.9903435707092285, + "learning_rate": 4.9829724009298844e-05, + "loss": 0.7881, "step": 224800 }, { - "epoch": 2.29, - "learning_rate": 5.782464953600375e-05, - "loss": 0.7985, + "epoch": 3.0985643823537514, + "grad_norm": 15.715108871459961, + "learning_rate": 4.9825044676761265e-05, + "loss": 0.7097, "step": 224900 }, { - "epoch": 2.29, - "learning_rate": 5.7822168870814464e-05, - "loss": 0.9151, + "epoch": 3.099942134413491, + "grad_norm": 6.130674362182617, + "learning_rate": 4.9820363447596936e-05, + "loss": 0.8622, "step": 225000 }, { - "epoch": 2.29, - "learning_rate": 5.781971167227572e-05, - "loss": 0.9218, + "epoch": 3.10131988647323, + "grad_norm": 4.548302173614502, + "learning_rate": 4.981568032220353e-05, + "loss": 0.6775, "step": 225100 }, { - "epoch": 2.29, - "learning_rate": 5.7817228300133314e-05, - "loss": 0.9224, + "epoch": 3.1026976385329696, + "grad_norm": 1.7215290069580078, + "learning_rate": 4.9810995300978915e-05, + "loss": 0.8629, "step": 225200 }, { - "epoch": 2.3, - "learning_rate": 5.7814743567894444e-05, - "loss": 0.8241, + "epoch": 3.1040753905927088, + "grad_norm": 10.366414070129395, + "learning_rate": 4.9806308384321115e-05, + "loss": 0.8022, "step": 225300 }, { - "epoch": 2.3, - "learning_rate": 5.781225747568061e-05, - "loss": 0.9015, + "epoch": 3.1054531426524483, + "grad_norm": 37.325042724609375, + "learning_rate": 4.9801619572628296e-05, + "loss": 0.7219, "step": 225400 }, { - "epoch": 2.3, - "learning_rate": 5.780977002361336e-05, - "loss": 0.8417, + "epoch": 3.1068308947121874, + "grad_norm": 8.17691421508789, + "learning_rate": 4.9796928866298794e-05, + "loss": 0.7952, "step": 225500 }, { - "epoch": 2.3, - "learning_rate": 5.7807281211814316e-05, - "loss": 0.7768, + "epoch": 3.108208646771927, + "grad_norm": 5.913504123687744, + "learning_rate": 4.979223626573112e-05, + "loss": 0.8158, "step": 225600 }, { - "epoch": 2.3, - "learning_rate": 5.780479104040517e-05, - "loss": 0.8853, + "epoch": 3.109586398831666, + "grad_norm": 15.628220558166504, + "learning_rate": 4.9787541771323926e-05, + "loss": 0.7727, "step": 225700 }, { - "epoch": 2.3, - "learning_rate": 5.780229950950766e-05, - "loss": 0.9516, + "epoch": 3.1109641508914057, + "grad_norm": 6.115756034851074, + "learning_rate": 4.978284538347604e-05, + "loss": 0.7998, "step": 225800 }, { - "epoch": 2.3, - "learning_rate": 5.779980661924363e-05, - "loss": 1.0194, + "epoch": 3.1123419029511448, + "grad_norm": 1.9399501085281372, + "learning_rate": 4.977814710258644e-05, + "loss": 0.7646, "step": 225900 }, { - "epoch": 2.3, - "learning_rate": 5.779731236973495e-05, - "loss": 0.9821, + "epoch": 3.1137196550108843, + "grad_norm": 52.01592254638672, + "learning_rate": 4.977344692905427e-05, + "loss": 0.7687, "step": 226000 }, { - "epoch": 2.3, - "learning_rate": 5.779481676110359e-05, - "loss": 0.9247, + "epoch": 3.1150974070706234, + "grad_norm": 5.268695831298828, + "learning_rate": 4.9768744863278826e-05, + "loss": 0.8519, "step": 226100 }, { - "epoch": 2.3, - "learning_rate": 5.7792319793471555e-05, - "loss": 0.9386, + "epoch": 3.116475159130363, + "grad_norm": 8.1824951171875, + "learning_rate": 4.976404090565958e-05, + "loss": 0.8065, "step": 226200 }, { - "epoch": 2.31, - "learning_rate": 5.778982146696094e-05, - "loss": 0.9808, + "epoch": 3.117852911190102, + "grad_norm": 9.205942153930664, + "learning_rate": 4.975933505659617e-05, + "loss": 0.8016, "step": 226300 }, { - "epoch": 2.31, - "learning_rate": 5.778732178169389e-05, - "loss": 0.933, + "epoch": 3.1192306632498417, + "grad_norm": 4.955207347869873, + "learning_rate": 4.9754627316488365e-05, + "loss": 0.7826, "step": 226400 }, { - "epoch": 2.31, - "learning_rate": 5.7784820737792644e-05, - "loss": 0.9377, + "epoch": 3.1206084153095808, + "grad_norm": 24.628856658935547, + "learning_rate": 4.974991768573611e-05, + "loss": 0.8405, "step": 226500 }, { - "epoch": 2.31, - "learning_rate": 5.778231833537946e-05, - "loss": 0.928, + "epoch": 3.1219861673693203, + "grad_norm": 15.530047416687012, + "learning_rate": 4.9745206164739515e-05, + "loss": 0.8171, "step": 226600 }, { - "epoch": 2.31, - "learning_rate": 5.777981457457671e-05, - "loss": 0.8999, + "epoch": 3.1233639194290594, + "grad_norm": 14.046979904174805, + "learning_rate": 4.974049275389886e-05, + "loss": 0.7727, "step": 226700 }, { - "epoch": 2.31, - "learning_rate": 5.777730945550682e-05, - "loss": 0.9975, + "epoch": 3.124741671488799, + "grad_norm": 3.7635655403137207, + "learning_rate": 4.973577745361455e-05, + "loss": 0.8115, "step": 226800 }, { - "epoch": 2.31, - "learning_rate": 5.777480297829225e-05, - "loss": 0.8384, + "epoch": 3.126119423548538, + "grad_norm": 12.773242950439453, + "learning_rate": 4.97310602642872e-05, + "loss": 0.8537, "step": 226900 }, { - "epoch": 2.31, - "learning_rate": 5.777229514305557e-05, - "loss": 0.9139, + "epoch": 3.1274971756082777, + "grad_norm": 6.3351030349731445, + "learning_rate": 4.9726341186317545e-05, + "loss": 0.8511, "step": 227000 }, { - "epoch": 2.31, - "learning_rate": 5.77697859499194e-05, - "loss": 0.89, + "epoch": 3.128874927668017, + "grad_norm": 2.8498470783233643, + "learning_rate": 4.9721620220106495e-05, + "loss": 0.8589, "step": 227100 }, { - "epoch": 2.31, - "learning_rate": 5.776727539900641e-05, - "loss": 0.8916, + "epoch": 3.1302526797277563, + "grad_norm": 6.089428901672363, + "learning_rate": 4.9716897366055124e-05, + "loss": 0.8427, "step": 227200 }, { - "epoch": 2.32, - "learning_rate": 5.776476349043936e-05, - "loss": 0.8947, + "epoch": 3.1316304317874954, + "grad_norm": 2.1297292709350586, + "learning_rate": 4.971217262456466e-05, + "loss": 0.7443, "step": 227300 }, { - "epoch": 2.32, - "learning_rate": 5.776225022434107e-05, - "loss": 0.8724, + "epoch": 3.133008183847235, + "grad_norm": 5.4099040031433105, + "learning_rate": 4.97074459960365e-05, + "loss": 0.8339, "step": 227400 }, { - "epoch": 2.32, - "learning_rate": 5.775973560083442e-05, - "loss": 0.9487, + "epoch": 3.134385935906974, + "grad_norm": 6.637537956237793, + "learning_rate": 4.9702717480872186e-05, + "loss": 0.7741, "step": 227500 }, { - "epoch": 2.32, - "learning_rate": 5.775721962004236e-05, - "loss": 0.8521, + "epoch": 3.1357636879667137, + "grad_norm": 9.33049201965332, + "learning_rate": 4.9697987079473424e-05, + "loss": 0.8442, "step": 227600 }, { - "epoch": 2.32, - "learning_rate": 5.775470228208791e-05, - "loss": 0.8946, + "epoch": 3.137141440026453, + "grad_norm": 17.611705780029297, + "learning_rate": 4.969325479224211e-05, + "loss": 0.7534, "step": 227700 }, { - "epoch": 2.32, - "learning_rate": 5.775218358709415e-05, - "loss": 0.9107, + "epoch": 3.1385191920861923, + "grad_norm": 5.704777240753174, + "learning_rate": 4.968852061958025e-05, + "loss": 0.8559, "step": 227800 }, { - "epoch": 2.32, - "learning_rate": 5.774966353518423e-05, - "loss": 0.9622, + "epoch": 3.1398969441459315, + "grad_norm": 16.293712615966797, + "learning_rate": 4.9683784561890056e-05, + "loss": 0.6788, "step": 227900 }, { - "epoch": 2.32, - "learning_rate": 5.7747142126481355e-05, - "loss": 0.8928, + "epoch": 3.141274696205671, + "grad_norm": 7.516731262207031, + "learning_rate": 4.967904661957387e-05, + "loss": 0.7837, "step": 228000 }, { - "epoch": 2.32, - "learning_rate": 5.774461936110882e-05, - "loss": 0.9051, + "epoch": 3.14265244826541, + "grad_norm": 4.9556498527526855, + "learning_rate": 4.967430679303421e-05, + "loss": 0.7702, "step": 228100 }, { - "epoch": 2.32, - "learning_rate": 5.774209523918997e-05, - "loss": 0.8532, + "epoch": 3.1440302003251497, + "grad_norm": 2.994645357131958, + "learning_rate": 4.9669565082673737e-05, + "loss": 0.8642, "step": 228200 }, { - "epoch": 2.33, - "learning_rate": 5.7739569760848226e-05, - "loss": 0.8888, + "epoch": 3.145407952384889, + "grad_norm": 3.5711493492126465, + "learning_rate": 4.96648214888953e-05, + "loss": 0.8233, "step": 228300 }, { - "epoch": 2.33, - "learning_rate": 5.7737042926207056e-05, - "loss": 0.8156, + "epoch": 3.1467857044446284, + "grad_norm": 15.144733428955078, + "learning_rate": 4.9660076012101875e-05, + "loss": 0.796, "step": 228400 }, { - "epoch": 2.33, - "learning_rate": 5.7734514735390006e-05, - "loss": 0.9001, + "epoch": 3.1481634565043675, + "grad_norm": 0.978360652923584, + "learning_rate": 4.965537613560828e-05, + "loss": 0.7949, "step": 228500 }, { - "epoch": 2.33, - "learning_rate": 5.773198518852071e-05, - "loss": 0.9048, + "epoch": 3.1495412085641066, + "grad_norm": 8.534567832946777, + "learning_rate": 4.965062691281461e-05, + "loss": 0.6975, "step": 228600 }, { - "epoch": 2.33, - "learning_rate": 5.7729454285722825e-05, - "loss": 0.8481, + "epoch": 3.150918960623846, + "grad_norm": 6.644782543182373, + "learning_rate": 4.964587580821185e-05, + "loss": 0.8, "step": 228700 }, { - "epoch": 2.33, - "learning_rate": 5.772692202712011e-05, - "loss": 0.8893, + "epoch": 3.1522967126835857, + "grad_norm": 2.691181182861328, + "learning_rate": 4.964112282220365e-05, + "loss": 0.8339, "step": 228800 }, { - "epoch": 2.33, - "learning_rate": 5.772438841283637e-05, - "loss": 0.9067, + "epoch": 3.153674464743325, + "grad_norm": 14.243195533752441, + "learning_rate": 4.963641551317352e-05, + "loss": 0.7791, "step": 228900 }, { - "epoch": 2.33, - "learning_rate": 5.772185344299549e-05, - "loss": 0.8836, + "epoch": 3.155052216803064, + "grad_norm": 3.695988893508911, + "learning_rate": 4.963165878436993e-05, + "loss": 0.7923, "step": 229000 }, { - "epoch": 2.33, - "learning_rate": 5.7719317117721406e-05, - "loss": 0.9468, + "epoch": 3.1564299688628035, + "grad_norm": 10.843060493469238, + "learning_rate": 4.962690017536871e-05, + "loss": 0.6995, "step": 229100 }, { - "epoch": 2.34, - "learning_rate": 5.771677943713815e-05, - "loss": 0.8321, + "epoch": 3.1578077209225426, + "grad_norm": 9.659454345703125, + "learning_rate": 4.962213968657411e-05, + "loss": 0.8871, "step": 229200 }, { - "epoch": 2.34, - "learning_rate": 5.7714240401369774e-05, - "loss": 0.809, + "epoch": 3.159185472982282, + "grad_norm": 4.380928993225098, + "learning_rate": 4.961737731839058e-05, + "loss": 0.7582, "step": 229300 }, { - "epoch": 2.34, - "learning_rate": 5.7711750831635815e-05, - "loss": 1.0896, + "epoch": 3.1605632250420213, + "grad_norm": 8.227472305297852, + "learning_rate": 4.961261307122267e-05, + "loss": 0.7525, "step": 229400 }, { - "epoch": 2.34, - "learning_rate": 5.770920911296723e-05, - "loss": 0.898, + "epoch": 3.161940977101761, + "grad_norm": 5.710347652435303, + "learning_rate": 4.960784694547517e-05, + "loss": 0.8346, "step": 229500 }, { - "epoch": 2.34, - "learning_rate": 5.770666603948368e-05, - "loss": 0.9454, + "epoch": 3.1633187291615, + "grad_norm": 64.2777328491211, + "learning_rate": 4.9603078941552974e-05, + "loss": 0.7342, "step": 229600 }, { - "epoch": 2.34, - "learning_rate": 5.770412161130952e-05, - "loss": 0.9282, + "epoch": 3.1646964812212395, + "grad_norm": 15.041741371154785, + "learning_rate": 4.959830905986113e-05, + "loss": 0.7812, "step": 229700 }, { - "epoch": 2.34, - "learning_rate": 5.770160129310123e-05, - "loss": 0.8343, + "epoch": 3.1660742332809786, + "grad_norm": 9.057526588439941, + "learning_rate": 4.9593537300804886e-05, + "loss": 0.825, "step": 229800 }, { - "epoch": 2.34, - "learning_rate": 5.7699054169462906e-05, - "loss": 0.8691, + "epoch": 3.167451985340718, + "grad_norm": 7.043004035949707, + "learning_rate": 4.958876366478962e-05, + "loss": 0.8314, "step": 229900 }, { - "epoch": 2.34, - "learning_rate": 5.7696505691506124e-05, - "loss": 0.8585, + "epoch": 3.1688297374004573, + "grad_norm": 8.075080871582031, + "learning_rate": 4.9583988152220866e-05, + "loss": 0.7655, "step": 230000 }, { - "epoch": 2.34, - "learning_rate": 5.7693955859355505e-05, - "loss": 1.0687, + "epoch": 3.170207489460197, + "grad_norm": 9.349101066589355, + "learning_rate": 4.957921076350434e-05, + "loss": 0.6921, "step": 230100 }, { - "epoch": 2.35, - "learning_rate": 5.7691404673135706e-05, - "loss": 0.8604, + "epoch": 3.171585241519936, + "grad_norm": 53.45508575439453, + "learning_rate": 4.957443149904591e-05, + "loss": 0.9097, "step": 230200 }, { - "epoch": 2.35, - "learning_rate": 5.768885213297147e-05, - "loss": 0.9415, + "epoch": 3.1729629935796755, + "grad_norm": 4.940711498260498, + "learning_rate": 4.9569650359251587e-05, + "loss": 0.7577, "step": 230300 }, { - "epoch": 2.35, - "learning_rate": 5.768629823898761e-05, - "loss": 0.8666, + "epoch": 3.1743407456394146, + "grad_norm": 10.162055969238281, + "learning_rate": 4.956486734452756e-05, + "loss": 0.7773, "step": 230400 }, { - "epoch": 2.35, - "learning_rate": 5.7683742991309e-05, - "loss": 0.9902, + "epoch": 3.175718497699154, + "grad_norm": 4.496201038360596, + "learning_rate": 4.9560082455280166e-05, + "loss": 0.8011, "step": 230500 }, { - "epoch": 2.35, - "learning_rate": 5.7681186390060543e-05, - "loss": 0.8047, + "epoch": 3.1770962497588933, + "grad_norm": 8.428885459899902, + "learning_rate": 4.9555295691915915e-05, + "loss": 0.7555, "step": 230600 }, { - "epoch": 2.35, - "learning_rate": 5.767862843536728e-05, - "loss": 0.8868, + "epoch": 3.178474001818633, + "grad_norm": 68.13356018066406, + "learning_rate": 4.955050705484146e-05, + "loss": 0.7661, "step": 230700 }, { - "epoch": 2.35, - "learning_rate": 5.767606912735426e-05, - "loss": 0.9105, + "epoch": 3.179851753878372, + "grad_norm": 3.0933313369750977, + "learning_rate": 4.9545716544463616e-05, + "loss": 0.7648, "step": 230800 }, { - "epoch": 2.35, - "learning_rate": 5.7673508466146616e-05, - "loss": 0.8843, + "epoch": 3.1812295059381115, + "grad_norm": 9.075510025024414, + "learning_rate": 4.9540924161189365e-05, + "loss": 0.8338, "step": 230900 }, { - "epoch": 2.35, - "learning_rate": 5.767094645186956e-05, - "loss": 0.9282, + "epoch": 3.1826072579978506, + "grad_norm": 6.76422643661499, + "learning_rate": 4.953612990542585e-05, + "loss": 0.8421, "step": 231000 }, { - "epoch": 2.35, - "learning_rate": 5.766838308464836e-05, - "loss": 0.7779, + "epoch": 3.18398501005759, + "grad_norm": 17.76362419128418, + "learning_rate": 4.953133377758037e-05, + "loss": 0.7331, "step": 231100 }, { - "epoch": 2.36, - "learning_rate": 5.766581836460833e-05, - "loss": 0.764, + "epoch": 3.1853627621173293, + "grad_norm": 18.08804702758789, + "learning_rate": 4.952653577806036e-05, + "loss": 0.85, "step": 231200 }, { - "epoch": 2.36, - "learning_rate": 5.76632522918749e-05, - "loss": 0.9104, + "epoch": 3.186740514177069, + "grad_norm": 32.398704528808594, + "learning_rate": 4.952178391524276e-05, + "loss": 0.9564, "step": 231300 }, { - "epoch": 2.36, - "learning_rate": 5.76606848665735e-05, - "loss": 0.8737, + "epoch": 3.188118266236808, + "grad_norm": 4.246829986572266, + "learning_rate": 4.95169821923033e-05, + "loss": 0.8004, "step": 231400 }, { - "epoch": 2.36, - "learning_rate": 5.765811608882969e-05, - "loss": 0.8657, + "epoch": 3.1894960182965475, + "grad_norm": 57.084571838378906, + "learning_rate": 4.951217859890856e-05, + "loss": 0.8262, "step": 231500 }, { - "epoch": 2.36, - "learning_rate": 5.765554595876906e-05, - "loss": 0.8487, + "epoch": 3.1908737703562866, + "grad_norm": 6.124798774719238, + "learning_rate": 4.950737313546664e-05, + "loss": 0.725, "step": 231600 }, { - "epoch": 2.36, - "learning_rate": 5.765297447651727e-05, - "loss": 0.9241, + "epoch": 3.192251522416026, + "grad_norm": 55.82466506958008, + "learning_rate": 4.950256580238578e-05, + "loss": 0.8259, "step": 231700 }, { - "epoch": 2.36, - "learning_rate": 5.7650401642200054e-05, - "loss": 0.84, + "epoch": 3.1936292744757653, + "grad_norm": 9.523139953613281, + "learning_rate": 4.949775660007439e-05, + "loss": 0.7991, "step": 231800 }, { - "epoch": 2.36, - "learning_rate": 5.764782745594321e-05, - "loss": 0.9761, + "epoch": 3.195007026535505, + "grad_norm": 9.465994834899902, + "learning_rate": 4.949294552894105e-05, + "loss": 0.6896, "step": 231900 }, { - "epoch": 2.36, - "learning_rate": 5.7645251917872584e-05, - "loss": 0.9093, + "epoch": 3.196384778595244, + "grad_norm": 10.29808521270752, + "learning_rate": 4.948813258939446e-05, + "loss": 0.8282, "step": 232000 }, { - "epoch": 2.36, - "learning_rate": 5.764267502811412e-05, - "loss": 0.8984, + "epoch": 3.1977625306549835, + "grad_norm": 4.562504291534424, + "learning_rate": 4.948331778184352e-05, + "loss": 0.8014, "step": 232100 }, { - "epoch": 2.37, - "learning_rate": 5.764009678679382e-05, - "loss": 0.8129, + "epoch": 3.1991402827147226, + "grad_norm": 5.956326961517334, + "learning_rate": 4.9478501106697264e-05, + "loss": 0.8166, "step": 232200 }, { - "epoch": 2.37, - "learning_rate": 5.763751719403773e-05, - "loss": 0.786, + "epoch": 3.200518034774462, + "grad_norm": 4.902593612670898, + "learning_rate": 4.94736825643649e-05, + "loss": 0.9226, "step": 232300 }, { - "epoch": 2.37, - "learning_rate": 5.7634936249971984e-05, - "loss": 0.9368, + "epoch": 3.2018957868342013, + "grad_norm": 1.2039607763290405, + "learning_rate": 4.9468862155255785e-05, + "loss": 0.632, "step": 232400 }, { - "epoch": 2.37, - "learning_rate": 5.763235395472277e-05, - "loss": 0.8499, + "epoch": 3.203273538893941, + "grad_norm": 9.601936340332031, + "learning_rate": 4.946403987977944e-05, + "loss": 0.76, "step": 232500 }, { - "epoch": 2.37, - "learning_rate": 5.762977030841635e-05, - "loss": 0.9226, + "epoch": 3.20465129095368, + "grad_norm": 8.302136421203613, + "learning_rate": 4.9459215738345545e-05, + "loss": 0.8161, "step": 232600 }, { - "epoch": 2.37, - "learning_rate": 5.762718531117904e-05, - "loss": 0.9002, + "epoch": 3.2060290430134195, + "grad_norm": 3.656987428665161, + "learning_rate": 4.945438973136392e-05, + "loss": 0.7357, "step": 232700 }, { - "epoch": 2.37, - "learning_rate": 5.762459896313724e-05, - "loss": 0.8452, + "epoch": 3.2074067950731586, + "grad_norm": 20.771461486816406, + "learning_rate": 4.944961014719685e-05, + "loss": 0.7025, "step": 232800 }, { - "epoch": 2.37, - "learning_rate": 5.7622011264417416e-05, - "loss": 0.841, + "epoch": 3.2087845471328977, + "grad_norm": 6.188084125518799, + "learning_rate": 4.944478042899518e-05, + "loss": 0.7883, "step": 232900 }, { - "epoch": 2.37, - "learning_rate": 5.761942221514608e-05, - "loss": 0.8239, + "epoch": 3.2101622991926373, + "grad_norm": 4.967450141906738, + "learning_rate": 4.943994884647214e-05, + "loss": 0.718, "step": 233000 }, { - "epoch": 2.37, - "learning_rate": 5.761683181544981e-05, - "loss": 0.8115, + "epoch": 3.211540051252377, + "grad_norm": 3.7283780574798584, + "learning_rate": 4.9435115400038196e-05, + "loss": 0.8945, "step": 233100 }, { - "epoch": 2.38, - "learning_rate": 5.761424006545528e-05, - "loss": 0.8338, + "epoch": 3.212917803312116, + "grad_norm": 6.124407768249512, + "learning_rate": 4.943028009010399e-05, + "loss": 0.7377, "step": 233200 }, { - "epoch": 2.38, - "learning_rate": 5.76116469652892e-05, - "loss": 0.8837, + "epoch": 3.214295555371855, + "grad_norm": 6.074686050415039, + "learning_rate": 4.9425442917080286e-05, + "loss": 0.7844, "step": 233300 }, { - "epoch": 2.38, - "learning_rate": 5.760905251507837e-05, - "loss": 0.9065, + "epoch": 3.2156733074315946, + "grad_norm": 7.642630100250244, + "learning_rate": 4.942060388137804e-05, + "loss": 0.7258, "step": 233400 }, { - "epoch": 2.38, - "learning_rate": 5.760645671494963e-05, - "loss": 0.8896, + "epoch": 3.2170510594913337, + "grad_norm": 10.451384544372559, + "learning_rate": 4.9415762983408353e-05, + "loss": 0.8141, "step": 233500 }, { - "epoch": 2.38, - "learning_rate": 5.76038595650299e-05, - "loss": 0.7289, + "epoch": 3.2184288115510733, + "grad_norm": 6.519449234008789, + "learning_rate": 4.941092022358248e-05, + "loss": 0.8014, "step": 233600 }, { - "epoch": 2.38, - "learning_rate": 5.760126106544617e-05, - "loss": 0.8551, + "epoch": 3.2198065636108124, + "grad_norm": 2.3638296127319336, + "learning_rate": 4.9406075602311826e-05, + "loss": 0.8337, "step": 233700 }, { - "epoch": 2.38, - "learning_rate": 5.7598661216325494e-05, - "loss": 0.872, + "epoch": 3.221184315670552, + "grad_norm": 5.68211555480957, + "learning_rate": 4.9401229120008e-05, + "loss": 0.9516, "step": 233800 }, { - "epoch": 2.38, - "learning_rate": 5.7596060017794976e-05, - "loss": 0.8103, + "epoch": 3.222562067730291, + "grad_norm": 13.048260688781738, + "learning_rate": 4.9396380777082695e-05, + "loss": 0.8023, "step": 233900 }, { - "epoch": 2.38, - "learning_rate": 5.759345746998181e-05, - "loss": 0.8642, + "epoch": 3.2239398197900306, + "grad_norm": 9.451485633850098, + "learning_rate": 4.9391530573947836e-05, + "loss": 0.8365, "step": 234000 }, { - "epoch": 2.39, - "learning_rate": 5.759085357301325e-05, - "loss": 0.7973, + "epoch": 3.2253175718497697, + "grad_norm": 7.705934047698975, + "learning_rate": 4.938667851101545e-05, + "loss": 0.8248, "step": 234100 }, { - "epoch": 2.39, - "learning_rate": 5.75882483270166e-05, - "loss": 0.7778, + "epoch": 3.2266953239095093, + "grad_norm": 12.55395221710205, + "learning_rate": 4.938182458869775e-05, + "loss": 0.7854, "step": 234200 }, { - "epoch": 2.39, - "learning_rate": 5.7585641732119245e-05, - "loss": 0.7703, + "epoch": 3.2280730759692484, + "grad_norm": 5.107557773590088, + "learning_rate": 4.937696880740711e-05, + "loss": 0.8152, "step": 234300 }, { - "epoch": 2.39, - "learning_rate": 5.7583033788448636e-05, - "loss": 0.8546, + "epoch": 3.229450828028988, + "grad_norm": 6.501389026641846, + "learning_rate": 4.937211116755605e-05, + "loss": 0.7882, "step": 234400 }, { - "epoch": 2.39, - "learning_rate": 5.758042449613227e-05, - "loss": 0.7968, + "epoch": 3.230828580088727, + "grad_norm": 42.70077133178711, + "learning_rate": 4.936725166955725e-05, + "loss": 0.818, "step": 234500 }, { - "epoch": 2.39, - "learning_rate": 5.757781385529775e-05, - "loss": 0.8611, + "epoch": 3.2322063321484666, + "grad_norm": 9.153830528259277, + "learning_rate": 4.9362390313823545e-05, + "loss": 0.8303, "step": 234600 }, { - "epoch": 2.39, - "learning_rate": 5.75752018660727e-05, - "loss": 1.0151, + "epoch": 3.2335840842082058, + "grad_norm": 25.342079162597656, + "learning_rate": 4.935752710076793e-05, + "loss": 0.6923, "step": 234700 }, { - "epoch": 2.39, - "learning_rate": 5.7572588528584845e-05, - "loss": 0.9415, + "epoch": 3.2349618362679453, + "grad_norm": 7.59571647644043, + "learning_rate": 4.9352662030803575e-05, + "loss": 0.7765, "step": 234800 }, { - "epoch": 2.39, - "learning_rate": 5.75700261498853e-05, - "loss": 0.8674, + "epoch": 3.2363395883276844, + "grad_norm": 18.282854080200195, + "learning_rate": 4.934779510434378e-05, + "loss": 0.8527, "step": 234900 }, { - "epoch": 2.39, - "learning_rate": 5.756741014321411e-05, - "loss": 0.8065, + "epoch": 3.237717340387424, + "grad_norm": 7.603454113006592, + "learning_rate": 4.934292632180202e-05, + "loss": 0.8148, "step": 235000 }, { - "epoch": 2.4, - "learning_rate": 5.756479278866107e-05, - "loss": 0.7896, + "epoch": 3.239095092447163, + "grad_norm": 30.90890884399414, + "learning_rate": 4.9338055683591914e-05, + "loss": 0.7449, "step": 235100 }, { - "epoch": 2.4, - "learning_rate": 5.756217408635416e-05, - "loss": 0.8506, + "epoch": 3.2404728445069026, + "grad_norm": 9.09598445892334, + "learning_rate": 4.933318319012727e-05, + "loss": 0.8409, "step": 235200 }, { - "epoch": 2.4, - "learning_rate": 5.755955403642143e-05, - "loss": 0.7986, + "epoch": 3.2418505965666418, + "grad_norm": 19.666332244873047, + "learning_rate": 4.9328308841822e-05, + "loss": 0.804, "step": 235300 }, { - "epoch": 2.4, - "learning_rate": 5.755693263899097e-05, - "loss": 0.8774, + "epoch": 3.2432283486263813, + "grad_norm": 28.167722702026367, + "learning_rate": 4.932343263909023e-05, + "loss": 0.7589, "step": 235400 }, { - "epoch": 2.4, - "learning_rate": 5.7554309894190954e-05, - "loss": 0.8999, + "epoch": 3.2446061006861204, + "grad_norm": 6.728652000427246, + "learning_rate": 4.93185545823462e-05, + "loss": 0.7256, "step": 235500 }, { - "epoch": 2.4, - "learning_rate": 5.755168580214961e-05, - "loss": 1.0107, + "epoch": 3.24598385274586, + "grad_norm": 10.087672233581543, + "learning_rate": 4.931367467200435e-05, + "loss": 0.8402, "step": 235600 }, { - "epoch": 2.4, - "learning_rate": 5.754906036299527e-05, - "loss": 0.8775, + "epoch": 3.247361604805599, + "grad_norm": 11.853771209716797, + "learning_rate": 4.930879290847923e-05, + "loss": 0.8326, "step": 235700 }, { - "epoch": 2.4, - "learning_rate": 5.754643357685626e-05, - "loss": 0.912, + "epoch": 3.2487393568653387, + "grad_norm": 78.4873046875, + "learning_rate": 4.930390929218558e-05, + "loss": 0.8416, "step": 235800 }, { - "epoch": 2.4, - "learning_rate": 5.7543805443861056e-05, - "loss": 0.8518, + "epoch": 3.2501171089250778, + "grad_norm": 42.94315719604492, + "learning_rate": 4.929902382353828e-05, + "loss": 0.8264, "step": 235900 }, { - "epoch": 2.4, - "learning_rate": 5.754117596413813e-05, - "loss": 0.82, + "epoch": 3.2514948609848173, + "grad_norm": 5.60575532913208, + "learning_rate": 4.92941365029524e-05, + "loss": 0.813, "step": 236000 }, { - "epoch": 2.41, - "learning_rate": 5.7538545137816056e-05, - "loss": 0.822, + "epoch": 3.2528726130445564, + "grad_norm": 4.687264919281006, + "learning_rate": 4.928924733084311e-05, + "loss": 0.839, "step": 236100 }, { - "epoch": 2.41, - "learning_rate": 5.753591296502348e-05, - "loss": 0.8965, + "epoch": 3.254250365104296, + "grad_norm": 8.667098999023438, + "learning_rate": 4.92843563076258e-05, + "loss": 0.8067, "step": 236200 }, { - "epoch": 2.41, - "learning_rate": 5.753327944588906e-05, - "loss": 0.7823, + "epoch": 3.255628117164035, + "grad_norm": 6.4489946365356445, + "learning_rate": 4.927946343371596e-05, + "loss": 0.8006, "step": 236300 }, { - "epoch": 2.41, - "learning_rate": 5.7530644580541604e-05, - "loss": 0.9084, + "epoch": 3.2570058692237747, + "grad_norm": 24.710708618164062, + "learning_rate": 4.927456870952929e-05, + "loss": 0.759, "step": 236400 }, { - "epoch": 2.41, - "learning_rate": 5.752800836910992e-05, - "loss": 0.9449, + "epoch": 3.2583836212835138, + "grad_norm": 6.834887981414795, + "learning_rate": 4.92696721354816e-05, + "loss": 0.7577, "step": 236500 }, { - "epoch": 2.41, - "learning_rate": 5.7525370811722896e-05, - "loss": 0.891, + "epoch": 3.2597613733432533, + "grad_norm": 2.1907448768615723, + "learning_rate": 4.9264773711988895e-05, + "loss": 0.8124, "step": 236600 }, { - "epoch": 2.41, - "learning_rate": 5.7522731908509517e-05, - "loss": 1.0042, + "epoch": 3.2611391254029924, + "grad_norm": 5.3939337730407715, + "learning_rate": 4.925987343946731e-05, + "loss": 0.8307, "step": 236700 }, { - "epoch": 2.41, - "learning_rate": 5.752009165959878e-05, - "loss": 0.9252, + "epoch": 3.2625168774627316, + "grad_norm": 4.920928955078125, + "learning_rate": 4.925497131833316e-05, + "loss": 0.7474, "step": 236800 }, { - "epoch": 2.41, - "learning_rate": 5.75174500651198e-05, - "loss": 0.8921, + "epoch": 3.263894629522471, + "grad_norm": 7.566749572753906, + "learning_rate": 4.92500673490029e-05, + "loss": 0.7992, "step": 236900 }, { - "epoch": 2.41, - "learning_rate": 5.751480712520172e-05, - "loss": 0.8566, + "epoch": 3.2652723815822107, + "grad_norm": 6.4035539627075195, + "learning_rate": 4.924516153189315e-05, + "loss": 0.7469, "step": 237000 }, { - "epoch": 2.42, - "learning_rate": 5.751216283997378e-05, - "loss": 0.8203, + "epoch": 3.26665013364195, + "grad_norm": 13.303295135498047, + "learning_rate": 4.924025386742067e-05, + "loss": 0.7973, "step": 237100 }, { - "epoch": 2.42, - "learning_rate": 5.750951720956526e-05, - "loss": 0.7973, + "epoch": 3.268027885701689, + "grad_norm": 11.453577041625977, + "learning_rate": 4.9235344356002404e-05, + "loss": 0.7975, "step": 237200 }, { - "epoch": 2.42, - "learning_rate": 5.7506870234105515e-05, - "loss": 0.8536, + "epoch": 3.2694056377614285, + "grad_norm": 15.064271926879883, + "learning_rate": 4.9230432998055435e-05, + "loss": 0.7601, "step": 237300 }, { - "epoch": 2.42, - "learning_rate": 5.7504221913723966e-05, - "loss": 0.9, + "epoch": 3.270783389821168, + "grad_norm": 70.24581146240234, + "learning_rate": 4.922551979399702e-05, + "loss": 0.8, "step": 237400 }, { - "epoch": 2.42, - "learning_rate": 5.7501572248550104e-05, - "loss": 0.8613, + "epoch": 3.272161141880907, + "grad_norm": 13.285362243652344, + "learning_rate": 4.922060474424454e-05, + "loss": 0.7647, "step": 237500 }, { - "epoch": 2.42, - "learning_rate": 5.749892123871348e-05, - "loss": 0.8334, + "epoch": 3.2735388939406462, + "grad_norm": 20.040693283081055, + "learning_rate": 4.921568784921557e-05, + "loss": 0.7933, "step": 237600 }, { - "epoch": 2.42, - "learning_rate": 5.7496268884343705e-05, - "loss": 0.9305, + "epoch": 3.274916646000386, + "grad_norm": 27.109277725219727, + "learning_rate": 4.9210769109327825e-05, + "loss": 0.8198, "step": 237700 }, { - "epoch": 2.42, - "learning_rate": 5.749361518557048e-05, - "loss": 0.761, + "epoch": 3.2762943980601253, + "grad_norm": 58.7088737487793, + "learning_rate": 4.9205848524999166e-05, + "loss": 0.8131, "step": 237800 }, { - "epoch": 2.42, - "learning_rate": 5.7490960142523545e-05, - "loss": 0.9899, + "epoch": 3.2776721501198645, + "grad_norm": 26.576208114624023, + "learning_rate": 4.9200926096647635e-05, + "loss": 0.8115, "step": 237900 }, { - "epoch": 2.42, - "learning_rate": 5.748830375533271e-05, - "loss": 0.9732, + "epoch": 3.2790499021796036, + "grad_norm": 9.232324600219727, + "learning_rate": 4.919600182469141e-05, + "loss": 0.7825, "step": 238000 }, { - "epoch": 2.43, - "learning_rate": 5.748564602412788e-05, - "loss": 1.1195, + "epoch": 3.280427654239343, + "grad_norm": 20.623849868774414, + "learning_rate": 4.919112497982265e-05, + "loss": 0.8295, "step": 238100 }, { - "epoch": 2.43, - "learning_rate": 5.748298694903897e-05, - "loss": 0.7973, + "epoch": 3.2818054062990822, + "grad_norm": 21.29308319091797, + "learning_rate": 4.918619704033783e-05, + "loss": 0.739, "step": 238200 }, { - "epoch": 2.43, - "learning_rate": 5.748032653019601e-05, - "loss": 0.8428, + "epoch": 3.283183158358822, + "grad_norm": 44.78761291503906, + "learning_rate": 4.9181267258499615e-05, + "loss": 0.8126, "step": 238300 }, { - "epoch": 2.43, - "learning_rate": 5.7477664767729076e-05, - "loss": 0.8425, + "epoch": 3.284560910418561, + "grad_norm": 27.85205078125, + "learning_rate": 4.917633563472684e-05, + "loss": 0.7799, "step": 238400 }, { - "epoch": 2.43, - "learning_rate": 5.747500166176832e-05, - "loss": 0.8356, + "epoch": 3.2859386624783005, + "grad_norm": 9.93897819519043, + "learning_rate": 4.917140216943844e-05, + "loss": 0.7927, "step": 238500 }, { - "epoch": 2.43, - "learning_rate": 5.747233721244394e-05, - "loss": 0.8686, + "epoch": 3.2873164145380396, + "grad_norm": 20.88914680480957, + "learning_rate": 4.916646686305357e-05, + "loss": 0.8038, "step": 238600 }, { - "epoch": 2.43, - "learning_rate": 5.7469671419886216e-05, - "loss": 0.8782, + "epoch": 3.288694166597779, + "grad_norm": 16.16457748413086, + "learning_rate": 4.9161529715991495e-05, + "loss": 0.8225, "step": 238700 }, { - "epoch": 2.43, - "learning_rate": 5.746700428422549e-05, - "loss": 0.862, + "epoch": 3.2900719186575182, + "grad_norm": 9.250761032104492, + "learning_rate": 4.915659072867164e-05, + "loss": 0.7767, "step": 238800 }, { - "epoch": 2.43, - "learning_rate": 5.746433580559216e-05, - "loss": 0.9365, + "epoch": 3.291449670717258, + "grad_norm": 14.04985523223877, + "learning_rate": 4.915164990151362e-05, + "loss": 0.7441, "step": 238900 }, { - "epoch": 2.43, - "learning_rate": 5.746166598411672e-05, - "loss": 0.965, + "epoch": 3.292827422776997, + "grad_norm": 5.425111770629883, + "learning_rate": 4.9146707234937165e-05, + "loss": 0.858, "step": 239000 }, { - "epoch": 2.44, - "learning_rate": 5.745899481992969e-05, - "loss": 0.7471, + "epoch": 3.2942051748367365, + "grad_norm": 6.852377891540527, + "learning_rate": 4.914176272936219e-05, + "loss": 0.6745, "step": 239100 }, { - "epoch": 2.44, - "learning_rate": 5.745632231316169e-05, - "loss": 0.8996, + "epoch": 3.2955829268964756, + "grad_norm": 3.4075093269348145, + "learning_rate": 4.913681638520875e-05, + "loss": 0.8322, "step": 239200 }, { - "epoch": 2.44, - "learning_rate": 5.745364846394337e-05, - "loss": 0.8262, + "epoch": 3.296960678956215, + "grad_norm": 16.69704246520996, + "learning_rate": 4.9131868202897055e-05, + "loss": 0.791, "step": 239300 }, { - "epoch": 2.44, - "learning_rate": 5.7450973272405486e-05, - "loss": 0.7473, + "epoch": 3.2983384310159543, + "grad_norm": 9.13206672668457, + "learning_rate": 4.9126918182847506e-05, + "loss": 0.843, "step": 239400 }, { - "epoch": 2.44, - "learning_rate": 5.744829673867882e-05, - "loss": 0.8634, + "epoch": 3.299716183075694, + "grad_norm": 8.079787254333496, + "learning_rate": 4.9121966325480604e-05, + "loss": 0.8134, "step": 239500 }, { - "epoch": 2.44, - "learning_rate": 5.744561886289424e-05, - "loss": 0.8106, + "epoch": 3.301093935135433, + "grad_norm": 11.790070533752441, + "learning_rate": 4.911701263121705e-05, + "loss": 0.8127, "step": 239600 }, { - "epoch": 2.44, - "learning_rate": 5.744293964518269e-05, - "loss": 0.8094, + "epoch": 3.3024716871951725, + "grad_norm": 5.893110275268555, + "learning_rate": 4.911205710047768e-05, + "loss": 0.8124, "step": 239700 }, { - "epoch": 2.44, - "learning_rate": 5.744025908567516e-05, - "loss": 0.9496, + "epoch": 3.3038494392549116, + "grad_norm": 9.047781944274902, + "learning_rate": 4.910709973368349e-05, + "loss": 0.8461, "step": 239800 }, { - "epoch": 2.44, - "learning_rate": 5.743757718450272e-05, - "loss": 0.7968, + "epoch": 3.305227191314651, + "grad_norm": 2.1555025577545166, + "learning_rate": 4.910214053125564e-05, + "loss": 0.6842, "step": 239900 }, { - "epoch": 2.45, - "learning_rate": 5.7434893941796495e-05, - "loss": 0.7634, + "epoch": 3.3066049433743903, + "grad_norm": 7.02500581741333, + "learning_rate": 4.9097179493615434e-05, + "loss": 0.7742, "step": 240000 }, { - "epoch": 2.45, - "learning_rate": 5.7432209357687675e-05, - "loss": 0.822, + "epoch": 3.30798269543413, + "grad_norm": 6.156063556671143, + "learning_rate": 4.9092216621184346e-05, + "loss": 0.7644, "step": 240100 }, { - "epoch": 2.45, - "learning_rate": 5.742952343230753e-05, - "loss": 0.9292, + "epoch": 3.309360447493869, + "grad_norm": 8.203913688659668, + "learning_rate": 4.908725191438398e-05, + "loss": 0.8096, "step": 240200 }, { - "epoch": 2.45, - "learning_rate": 5.7426836165787376e-05, - "loss": 0.8505, + "epoch": 3.3107381995536085, + "grad_norm": 12.512221336364746, + "learning_rate": 4.9082285373636135e-05, + "loss": 0.7629, "step": 240300 }, { - "epoch": 2.45, - "learning_rate": 5.7424147558258606e-05, - "loss": 0.9278, + "epoch": 3.3121159516133476, + "grad_norm": 35.61933135986328, + "learning_rate": 4.9077316999362725e-05, + "loss": 0.81, "step": 240400 }, { - "epoch": 2.45, - "learning_rate": 5.7421457609852685e-05, - "loss": 0.7476, + "epoch": 3.313493703673087, + "grad_norm": 7.461085796356201, + "learning_rate": 4.907234679198585e-05, + "loss": 0.8275, "step": 240500 }, { - "epoch": 2.45, - "learning_rate": 5.741876632070113e-05, - "loss": 0.812, + "epoch": 3.3148714557328263, + "grad_norm": 6.215442657470703, + "learning_rate": 4.9067374751927755e-05, + "loss": 0.7793, "step": 240600 }, { - "epoch": 2.45, - "learning_rate": 5.741607369093552e-05, - "loss": 0.974, + "epoch": 3.316249207792566, + "grad_norm": 17.07806968688965, + "learning_rate": 4.906245062740229e-05, + "loss": 0.8535, "step": 240700 }, { - "epoch": 2.45, - "learning_rate": 5.741337972068752e-05, - "loss": 0.9001, + "epoch": 3.317626959852305, + "grad_norm": 27.341754913330078, + "learning_rate": 4.905747494156537e-05, + "loss": 0.8013, "step": 240800 }, { - "epoch": 2.45, - "learning_rate": 5.741068441008885e-05, - "loss": 0.9165, + "epoch": 3.3190047119120445, + "grad_norm": 5.159193515777588, + "learning_rate": 4.9052497424310676e-05, + "loss": 0.8102, "step": 240900 }, { - "epoch": 2.46, - "learning_rate": 5.740798775927129e-05, - "loss": 0.8708, + "epoch": 3.3203824639717836, + "grad_norm": 5.837748050689697, + "learning_rate": 4.9047518076061055e-05, + "loss": 0.8491, "step": 241000 }, { - "epoch": 2.46, - "learning_rate": 5.740528976836669e-05, - "loss": 0.8258, + "epoch": 3.321760216031523, + "grad_norm": 19.702241897583008, + "learning_rate": 4.904253689723953e-05, + "loss": 0.8197, "step": 241100 }, { - "epoch": 2.46, - "learning_rate": 5.740259043750696e-05, - "loss": 0.9042, + "epoch": 3.3231379680912623, + "grad_norm": 32.154605865478516, + "learning_rate": 4.90375538882693e-05, + "loss": 0.8, "step": 241200 }, { - "epoch": 2.46, - "learning_rate": 5.73998897668241e-05, - "loss": 0.8047, + "epoch": 3.324515720151002, + "grad_norm": 5.821761608123779, + "learning_rate": 4.903256904957367e-05, + "loss": 0.7286, "step": 241300 }, { - "epoch": 2.46, - "learning_rate": 5.739718775645014e-05, - "loss": 0.8848, + "epoch": 3.325893472210741, + "grad_norm": 6.600139141082764, + "learning_rate": 4.902758238157615e-05, + "loss": 0.8019, "step": 241400 }, { - "epoch": 2.46, - "learning_rate": 5.73944844065172e-05, - "loss": 0.8763, + "epoch": 3.32727122427048, + "grad_norm": 21.331989288330078, + "learning_rate": 4.902264377872067e-05, + "loss": 0.879, "step": 241500 }, { - "epoch": 2.46, - "learning_rate": 5.7391779717157444e-05, - "loss": 0.802, + "epoch": 3.3286489763302196, + "grad_norm": 5.771658420562744, + "learning_rate": 4.901765347167288e-05, + "loss": 0.8432, "step": 241600 }, { - "epoch": 2.46, - "learning_rate": 5.738907368850313e-05, - "loss": 0.8045, + "epoch": 3.330026728389959, + "grad_norm": 8.749160766601562, + "learning_rate": 4.901266133659034e-05, + "loss": 0.724, "step": 241700 }, { - "epoch": 2.46, - "learning_rate": 5.738636632068656e-05, - "loss": 0.9475, + "epoch": 3.3314044804496983, + "grad_norm": 5.671021938323975, + "learning_rate": 4.900766737389716e-05, + "loss": 0.7721, "step": 241800 }, { - "epoch": 2.46, - "learning_rate": 5.7383657613840115e-05, - "loss": 0.7693, + "epoch": 3.3327822325094374, + "grad_norm": 4.128930568695068, + "learning_rate": 4.90026715840176e-05, + "loss": 0.7717, "step": 241900 }, { - "epoch": 2.47, - "learning_rate": 5.738094756809622e-05, - "loss": 0.9034, + "epoch": 3.334159984569177, + "grad_norm": 3.1032347679138184, + "learning_rate": 4.8997673967376095e-05, + "loss": 0.8735, "step": 242000 }, { - "epoch": 2.47, - "learning_rate": 5.737826330405894e-05, - "loss": 0.9182, + "epoch": 3.3355377366289165, + "grad_norm": 3.7344799041748047, + "learning_rate": 4.899267452439718e-05, + "loss": 0.7318, "step": 242100 }, { - "epoch": 2.47, - "learning_rate": 5.737555059430342e-05, - "loss": 0.9588, + "epoch": 3.3369154886886556, + "grad_norm": 9.54202938079834, + "learning_rate": 4.898767325550563e-05, + "loss": 0.8009, "step": 242200 }, { - "epoch": 2.47, - "learning_rate": 5.737283654604684e-05, - "loss": 0.9221, + "epoch": 3.3382932407483947, + "grad_norm": 2.4578840732574463, + "learning_rate": 4.8982670161126306e-05, + "loss": 0.7739, "step": 242300 }, { - "epoch": 2.47, - "learning_rate": 5.737012115942192e-05, - "loss": 0.8457, + "epoch": 3.3396709928081343, + "grad_norm": 10.420729637145996, + "learning_rate": 4.897766524168424e-05, + "loss": 0.7757, "step": 242400 }, { - "epoch": 2.47, - "learning_rate": 5.7367404434561404e-05, - "loss": 0.8711, + "epoch": 3.3410487448678734, + "grad_norm": 1.3552196025848389, + "learning_rate": 4.897265849760463e-05, + "loss": 0.8338, "step": 242500 }, { - "epoch": 2.47, - "learning_rate": 5.736471355885093e-05, - "loss": 0.8269, + "epoch": 3.342426496927613, + "grad_norm": 8.202522277832031, + "learning_rate": 4.8967649929312836e-05, + "loss": 0.8542, "step": 242600 }, { - "epoch": 2.47, - "learning_rate": 5.7361994171296836e-05, - "loss": 0.9725, + "epoch": 3.343804248987352, + "grad_norm": 5.955030918121338, + "learning_rate": 4.8962639537234344e-05, + "loss": 0.7837, "step": 242700 }, { - "epoch": 2.47, - "learning_rate": 5.735927344590452e-05, - "loss": 0.8244, + "epoch": 3.3451820010470916, + "grad_norm": 6.349592208862305, + "learning_rate": 4.895762732179483e-05, + "loss": 0.8793, "step": 242800 }, { - "epoch": 2.47, - "learning_rate": 5.7356551382807e-05, - "loss": 0.8646, + "epoch": 3.3465597531068307, + "grad_norm": 2.8601889610290527, + "learning_rate": 4.895261328342009e-05, + "loss": 0.8422, "step": 242900 }, { - "epoch": 2.48, - "learning_rate": 5.7353827982137366e-05, - "loss": 0.8295, + "epoch": 3.3479375051665703, + "grad_norm": 9.999490737915039, + "learning_rate": 4.894759742253611e-05, + "loss": 0.7839, "step": 243000 }, { - "epoch": 2.48, - "learning_rate": 5.735110324402879e-05, - "loss": 0.8854, + "epoch": 3.3493152572263094, + "grad_norm": 6.5584516525268555, + "learning_rate": 4.8942579739568993e-05, + "loss": 0.7936, "step": 243100 }, { - "epoch": 2.48, - "learning_rate": 5.734837716861449e-05, - "loss": 0.7545, + "epoch": 3.350693009286049, + "grad_norm": 3.6478030681610107, + "learning_rate": 4.893756023494503e-05, + "loss": 0.7194, "step": 243200 }, { - "epoch": 2.48, - "learning_rate": 5.734564975602775e-05, - "loss": 0.7202, + "epoch": 3.352070761345788, + "grad_norm": 8.727842330932617, + "learning_rate": 4.893253890909066e-05, + "loss": 0.7699, "step": 243300 }, { - "epoch": 2.48, - "learning_rate": 5.734292100640192e-05, - "loss": 0.8857, + "epoch": 3.3534485134055276, + "grad_norm": 6.058422088623047, + "learning_rate": 4.892751576243245e-05, + "loss": 0.7151, "step": 243400 }, { - "epoch": 2.48, - "learning_rate": 5.734019091987044e-05, - "loss": 0.8698, + "epoch": 3.3548262654652667, + "grad_norm": 20.173702239990234, + "learning_rate": 4.8922490795397165e-05, + "loss": 0.8029, "step": 243500 }, { - "epoch": 2.48, - "learning_rate": 5.7337459496566776e-05, - "loss": 0.8136, + "epoch": 3.3562040175250063, + "grad_norm": 14.083977699279785, + "learning_rate": 4.891746400841168e-05, + "loss": 0.8296, "step": 243600 }, { - "epoch": 2.48, - "learning_rate": 5.733472673662448e-05, - "loss": 0.9224, + "epoch": 3.3575817695847454, + "grad_norm": 4.4388651847839355, + "learning_rate": 4.8912435401903064e-05, + "loss": 0.7198, "step": 243700 }, { - "epoch": 2.48, - "learning_rate": 5.7331992640177185e-05, - "loss": 0.9174, + "epoch": 3.358959521644485, + "grad_norm": 5.750528812408447, + "learning_rate": 4.890740497629851e-05, + "loss": 0.7084, "step": 243800 }, { - "epoch": 2.48, - "learning_rate": 5.732925720735854e-05, - "loss": 0.9318, + "epoch": 3.360337273704224, + "grad_norm": 7.963659763336182, + "learning_rate": 4.8902372732025387e-05, + "loss": 0.8059, "step": 243900 }, { - "epoch": 2.49, - "learning_rate": 5.7326520438302324e-05, - "loss": 0.8895, + "epoch": 3.3617150257639636, + "grad_norm": 2.1787214279174805, + "learning_rate": 4.889733866951121e-05, + "loss": 0.7413, "step": 244000 }, { - "epoch": 2.49, - "learning_rate": 5.732378233314233e-05, - "loss": 0.9287, + "epoch": 3.3630927778237028, + "grad_norm": 6.035741329193115, + "learning_rate": 4.889230278918364e-05, + "loss": 0.7222, "step": 244100 }, { - "epoch": 2.49, - "learning_rate": 5.732104289201244e-05, - "loss": 0.8927, + "epoch": 3.3644705298834423, + "grad_norm": 15.489537239074707, + "learning_rate": 4.8887265091470505e-05, + "loss": 0.8222, "step": 244200 }, { - "epoch": 2.49, - "learning_rate": 5.73183021150466e-05, - "loss": 0.9159, + "epoch": 3.3658482819431814, + "grad_norm": 10.179112434387207, + "learning_rate": 4.8882225576799784e-05, + "loss": 0.7614, "step": 244300 }, { - "epoch": 2.49, - "learning_rate": 5.73155600023788e-05, - "loss": 0.8519, + "epoch": 3.367226034002921, + "grad_norm": 3.5675954818725586, + "learning_rate": 4.887718424559961e-05, + "loss": 0.821, "step": 244400 }, { - "epoch": 2.49, - "learning_rate": 5.731281655414314e-05, - "loss": 0.8667, + "epoch": 3.36860378606266, + "grad_norm": 11.235749244689941, + "learning_rate": 4.8872141098298277e-05, + "loss": 0.6896, "step": 244500 }, { - "epoch": 2.49, - "learning_rate": 5.7310071770473745e-05, - "loss": 0.8324, + "epoch": 3.3699815381223996, + "grad_norm": 8.315887451171875, + "learning_rate": 4.886709613532421e-05, + "loss": 0.7544, "step": 244600 }, { - "epoch": 2.49, - "learning_rate": 5.730732565150481e-05, - "loss": 0.854, + "epoch": 3.3713592901821388, + "grad_norm": 10.005486488342285, + "learning_rate": 4.8862049357106016e-05, + "loss": 0.7465, "step": 244700 }, { - "epoch": 2.49, - "learning_rate": 5.7304578197370616e-05, - "loss": 0.9844, + "epoch": 3.3727370422418783, + "grad_norm": 11.52979850769043, + "learning_rate": 4.8857000764072436e-05, + "loss": 0.8028, "step": 244800 }, { - "epoch": 2.5, - "learning_rate": 5.730182940820549e-05, - "loss": 0.9262, + "epoch": 3.3741147943016174, + "grad_norm": 7.157665252685547, + "learning_rate": 4.8851950356652386e-05, + "loss": 0.7716, "step": 244900 }, { - "epoch": 2.5, - "learning_rate": 5.729907928414384e-05, - "loss": 0.9692, + "epoch": 3.375492546361357, + "grad_norm": 6.1718339920043945, + "learning_rate": 4.884689813527491e-05, + "loss": 0.7281, "step": 245000 }, { - "epoch": 2.5, - "learning_rate": 5.729632782532012e-05, - "loss": 0.7791, + "epoch": 3.376870298421096, + "grad_norm": 17.761404037475586, + "learning_rate": 4.8841844100369234e-05, + "loss": 0.7585, "step": 245100 }, { - "epoch": 2.5, - "learning_rate": 5.7293575031868884e-05, - "loss": 0.8821, + "epoch": 3.3782480504808357, + "grad_norm": 5.098033428192139, + "learning_rate": 4.883678825236472e-05, + "loss": 0.7591, "step": 245200 }, { - "epoch": 2.5, - "learning_rate": 5.7290820903924686e-05, - "loss": 0.8035, + "epoch": 3.3796258025405748, + "grad_norm": 11.7904634475708, + "learning_rate": 4.8831730591690885e-05, + "loss": 0.6035, "step": 245300 }, { - "epoch": 2.5, - "learning_rate": 5.728806544162222e-05, - "loss": 0.8593, + "epoch": 3.3810035546003143, + "grad_norm": 102.54789733886719, + "learning_rate": 4.8826671118777396e-05, + "loss": 0.7313, "step": 245400 }, { - "epoch": 2.5, - "learning_rate": 5.7285308645096193e-05, - "loss": 0.868, + "epoch": 3.3823813066600534, + "grad_norm": 18.690048217773438, + "learning_rate": 4.88216098340541e-05, + "loss": 0.7894, "step": 245500 }, { - "epoch": 2.5, - "learning_rate": 5.728255051448141e-05, - "loss": 0.8856, + "epoch": 3.383759058719793, + "grad_norm": 4.71035623550415, + "learning_rate": 4.8816546737950955e-05, + "loss": 0.8117, "step": 245600 }, { - "epoch": 2.5, - "learning_rate": 5.727979104991272e-05, - "loss": 0.9176, + "epoch": 3.385136810779532, + "grad_norm": 4.706634044647217, + "learning_rate": 4.8811481830898115e-05, + "loss": 0.7514, "step": 245700 }, { - "epoch": 2.5, - "learning_rate": 5.727703025152504e-05, - "loss": 0.8087, + "epoch": 3.386514562839271, + "grad_norm": 4.474785804748535, + "learning_rate": 4.880641511332587e-05, + "loss": 0.7543, "step": 245800 }, { - "epoch": 2.51, - "learning_rate": 5.727426811945337e-05, - "loss": 0.8861, + "epoch": 3.3878923148990108, + "grad_norm": 62.48046112060547, + "learning_rate": 4.880134658566466e-05, + "loss": 0.6726, "step": 245900 }, { - "epoch": 2.51, - "learning_rate": 5.727150465383275e-05, - "loss": 0.8416, + "epoch": 3.3892700669587503, + "grad_norm": 5.013173580169678, + "learning_rate": 4.8796276248345066e-05, + "loss": 0.7154, "step": 246000 }, { - "epoch": 2.51, - "learning_rate": 5.726873985479829e-05, - "loss": 0.8767, + "epoch": 3.3906478190184894, + "grad_norm": 20.278827667236328, + "learning_rate": 4.879120410179787e-05, + "loss": 0.6962, "step": 246100 }, { - "epoch": 2.51, - "learning_rate": 5.726597372248519e-05, - "loss": 0.7727, + "epoch": 3.3920255710782286, + "grad_norm": 4.829151630401611, + "learning_rate": 4.8786130146453954e-05, + "loss": 0.7029, "step": 246200 }, { - "epoch": 2.51, - "learning_rate": 5.726320625702869e-05, - "loss": 0.884, + "epoch": 3.393403323137968, + "grad_norm": 7.272677421569824, + "learning_rate": 4.8781054382744374e-05, + "loss": 0.6493, "step": 246300 }, { - "epoch": 2.51, - "learning_rate": 5.72604374585641e-05, - "loss": 0.8328, + "epoch": 3.3947810751977077, + "grad_norm": 9.106263160705566, + "learning_rate": 4.8775976811100356e-05, + "loss": 0.8466, "step": 246400 }, { - "epoch": 2.51, - "learning_rate": 5.7257667327226814e-05, - "loss": 0.8487, + "epoch": 3.3961588272574468, + "grad_norm": 4.589905261993408, + "learning_rate": 4.877089743195327e-05, + "loss": 0.7432, "step": 246500 }, { - "epoch": 2.51, - "learning_rate": 5.725489586315225e-05, - "loss": 0.9323, + "epoch": 3.397536579317186, + "grad_norm": 10.127010345458984, + "learning_rate": 4.8765816245734616e-05, + "loss": 0.7921, "step": 246600 }, { - "epoch": 2.51, - "learning_rate": 5.725212306647592e-05, - "loss": 0.8996, + "epoch": 3.3989143313769254, + "grad_norm": 1.9080126285552979, + "learning_rate": 4.876073325287608e-05, + "loss": 0.7825, "step": 246700 }, { - "epoch": 2.51, - "learning_rate": 5.7249348937333414e-05, - "loss": 0.7903, + "epoch": 3.4002920834366646, + "grad_norm": 6.177097797393799, + "learning_rate": 4.875564845380949e-05, + "loss": 0.7109, "step": 246800 }, { - "epoch": 2.52, - "learning_rate": 5.724657347586036e-05, - "loss": 0.9189, + "epoch": 3.401669835496404, + "grad_norm": 4.822579383850098, + "learning_rate": 4.875056184896682e-05, + "loss": 0.6683, "step": 246900 }, { - "epoch": 2.52, - "learning_rate": 5.724379668219246e-05, - "loss": 0.8507, + "epoch": 3.4030475875561432, + "grad_norm": 2.6892597675323486, + "learning_rate": 4.874547343878019e-05, + "loss": 0.6962, "step": 247000 }, { - "epoch": 2.52, - "learning_rate": 5.7241046344315995e-05, - "loss": 0.9612, + "epoch": 3.404425339615883, + "grad_norm": 4.09142541885376, + "learning_rate": 4.874038322368192e-05, + "loss": 0.6677, "step": 247100 }, { - "epoch": 2.52, - "learning_rate": 5.7238294701020236e-05, - "loss": 0.8118, + "epoch": 3.405803091675622, + "grad_norm": 22.466520309448242, + "learning_rate": 4.8735291204104416e-05, + "loss": 0.7752, "step": 247200 }, { - "epoch": 2.52, - "learning_rate": 5.7235513938217094e-05, - "loss": 0.9644, + "epoch": 3.4071808437353615, + "grad_norm": 14.71407413482666, + "learning_rate": 4.873019738048029e-05, + "loss": 0.7925, "step": 247300 }, { - "epoch": 2.52, - "learning_rate": 5.7232731843759846e-05, - "loss": 0.9178, + "epoch": 3.4085585957951006, + "grad_norm": 1.8812305927276611, + "learning_rate": 4.8725101753242285e-05, + "loss": 0.8216, "step": 247400 }, { - "epoch": 2.52, - "learning_rate": 5.722994841778454e-05, - "loss": 0.9659, + "epoch": 3.40993634785484, + "grad_norm": 20.897409439086914, + "learning_rate": 4.872000432282329e-05, + "loss": 0.7097, "step": 247500 }, { - "epoch": 2.52, - "learning_rate": 5.7227163660427244e-05, - "loss": 0.7966, + "epoch": 3.4113140999145792, + "grad_norm": 8.49991512298584, + "learning_rate": 4.871490508965638e-05, + "loss": 0.7486, "step": 247600 }, { - "epoch": 2.52, - "learning_rate": 5.722437757182413e-05, - "loss": 0.7818, + "epoch": 3.412691851974319, + "grad_norm": 9.218452453613281, + "learning_rate": 4.870980405417475e-05, + "loss": 0.8095, "step": 247700 }, { - "epoch": 2.52, - "learning_rate": 5.7221590152111424e-05, - "loss": 0.8264, + "epoch": 3.414069604034058, + "grad_norm": 10.462718963623047, + "learning_rate": 4.870470121681174e-05, + "loss": 0.713, "step": 247800 }, { - "epoch": 2.53, - "learning_rate": 5.721880140142541e-05, - "loss": 0.8649, + "epoch": 3.4154473560937975, + "grad_norm": 18.287776947021484, + "learning_rate": 4.86995965780009e-05, + "loss": 0.7127, "step": 247900 }, { - "epoch": 2.53, - "learning_rate": 5.721601131990243e-05, - "loss": 0.942, + "epoch": 3.4168251081535366, + "grad_norm": 48.037933349609375, + "learning_rate": 4.869449013817585e-05, + "loss": 0.7077, "step": 248000 }, { - "epoch": 2.53, - "learning_rate": 5.721321990767893e-05, - "loss": 0.8436, + "epoch": 3.418202860213276, + "grad_norm": 7.83187198638916, + "learning_rate": 4.8689381897770454e-05, + "loss": 0.7141, "step": 248100 }, { - "epoch": 2.53, - "learning_rate": 5.721042716489136e-05, - "loss": 0.8783, + "epoch": 3.4195806122730152, + "grad_norm": 20.0269775390625, + "learning_rate": 4.8684271857218645e-05, + "loss": 0.8318, "step": 248200 }, { - "epoch": 2.53, - "learning_rate": 5.7207633091676295e-05, - "loss": 0.8413, + "epoch": 3.420958364332755, + "grad_norm": 13.097688674926758, + "learning_rate": 4.867916001695457e-05, + "loss": 0.6663, "step": 248300 }, { - "epoch": 2.53, - "learning_rate": 5.720483768817034e-05, - "loss": 0.8947, + "epoch": 3.422336116392494, + "grad_norm": 8.687681198120117, + "learning_rate": 4.8674046377412505e-05, + "loss": 0.753, "step": 248400 }, { - "epoch": 2.53, - "learning_rate": 5.720204095451016e-05, - "loss": 0.7859, + "epoch": 3.4237138684522335, + "grad_norm": 5.758730888366699, + "learning_rate": 4.866893093902687e-05, + "loss": 0.7695, "step": 248500 }, { - "epoch": 2.53, - "learning_rate": 5.719924289083252e-05, - "loss": 0.8549, + "epoch": 3.4250916205119726, + "grad_norm": 7.05889892578125, + "learning_rate": 4.8663864883500895e-05, + "loss": 0.6969, "step": 248600 }, { - "epoch": 2.53, - "learning_rate": 5.7196443497274217e-05, - "loss": 0.7906, + "epoch": 3.426469372571712, + "grad_norm": 41.88780975341797, + "learning_rate": 4.865874586670963e-05, + "loss": 0.6694, "step": 248700 }, { - "epoch": 2.53, - "learning_rate": 5.719364277397212e-05, - "loss": 0.9004, + "epoch": 3.4278471246314512, + "grad_norm": 4.313061237335205, + "learning_rate": 4.865362505237464e-05, + "loss": 0.7804, "step": 248800 }, { - "epoch": 2.54, - "learning_rate": 5.7190840721063194e-05, - "loss": 0.7983, + "epoch": 3.429224876691191, + "grad_norm": 10.696359634399414, + "learning_rate": 4.864850244093099e-05, + "loss": 0.7183, "step": 248900 }, { - "epoch": 2.54, - "learning_rate": 5.7188037338684406e-05, - "loss": 0.8178, + "epoch": 3.43060262875093, + "grad_norm": 4.369784355163574, + "learning_rate": 4.864337803281386e-05, + "loss": 0.8142, "step": 249000 }, { - "epoch": 2.54, - "learning_rate": 5.7185232626972856e-05, - "loss": 0.9048, + "epoch": 3.4319803808106695, + "grad_norm": 5.5349202156066895, + "learning_rate": 4.86383030993921e-05, + "loss": 0.8455, "step": 249100 }, { - "epoch": 2.54, - "learning_rate": 5.718242658606565e-05, - "loss": 0.7853, + "epoch": 3.4333581328704086, + "grad_norm": 25.62873649597168, + "learning_rate": 4.863317511719007e-05, + "loss": 0.7835, "step": 249200 }, { - "epoch": 2.54, - "learning_rate": 5.717961921610001e-05, - "loss": 0.8043, + "epoch": 3.434735884930148, + "grad_norm": 7.503832817077637, + "learning_rate": 4.8628045339616694e-05, + "loss": 0.796, "step": 249300 }, { - "epoch": 2.54, - "learning_rate": 5.7176810517213176e-05, - "loss": 0.8754, + "epoch": 3.4361136369898873, + "grad_norm": 6.2945685386657715, + "learning_rate": 4.8622913767107786e-05, + "loss": 0.7037, "step": 249400 }, { - "epoch": 2.54, - "learning_rate": 5.717400048954251e-05, - "loss": 0.9261, + "epoch": 3.437491389049627, + "grad_norm": 12.687012672424316, + "learning_rate": 4.8617780400099285e-05, + "loss": 0.8203, "step": 249500 }, { - "epoch": 2.54, - "learning_rate": 5.717118913322537e-05, - "loss": 0.8122, + "epoch": 3.438869141109366, + "grad_norm": 5.0454816818237305, + "learning_rate": 4.861264523902731e-05, + "loss": 0.7008, "step": 249600 }, { - "epoch": 2.54, - "learning_rate": 5.716837644839923e-05, - "loss": 0.8972, + "epoch": 3.4402468931691055, + "grad_norm": 3.082151412963867, + "learning_rate": 4.8607508284328097e-05, + "loss": 0.8666, "step": 249700 }, { - "epoch": 2.55, - "learning_rate": 5.716556243520163e-05, - "loss": 0.8242, + "epoch": 3.4416246452288446, + "grad_norm": 29.07480812072754, + "learning_rate": 4.860236953643807e-05, + "loss": 0.7788, "step": 249800 }, { - "epoch": 2.55, - "learning_rate": 5.716274709377012e-05, - "loss": 0.7834, + "epoch": 3.443002397288584, + "grad_norm": 4.837975978851318, + "learning_rate": 4.859722899579379e-05, + "loss": 0.6349, "step": 249900 }, { - "epoch": 2.55, - "learning_rate": 5.71599304242424e-05, - "loss": 0.7136, + "epoch": 3.4443801493483233, + "grad_norm": 6.527149200439453, + "learning_rate": 4.859208666283199e-05, + "loss": 0.7828, "step": 250000 }, { - "epoch": 2.55, - "learning_rate": 5.7157112426756146e-05, - "loss": 0.7837, + "epoch": 3.4457579014080624, + "grad_norm": 3.296645402908325, + "learning_rate": 4.858694253798951e-05, + "loss": 0.722, "step": 250100 }, { - "epoch": 2.55, - "learning_rate": 5.715429310144917e-05, - "loss": 0.8747, + "epoch": 3.447135653467802, + "grad_norm": 10.018839836120605, + "learning_rate": 4.8581848089732456e-05, + "loss": 0.7493, "step": 250200 }, { - "epoch": 2.55, - "learning_rate": 5.7151472448459304e-05, - "loss": 0.8101, + "epoch": 3.4485134055275415, + "grad_norm": 22.27680206298828, + "learning_rate": 4.857670040034775e-05, + "loss": 0.6785, "step": 250300 }, { - "epoch": 2.55, - "learning_rate": 5.714865046792447e-05, - "loss": 0.8779, + "epoch": 3.4498911575872806, + "grad_norm": 5.179969787597656, + "learning_rate": 4.857155092038952e-05, + "loss": 0.7364, "step": 250400 }, { - "epoch": 2.55, - "learning_rate": 5.7145827159982635e-05, - "loss": 0.7656, + "epoch": 3.4512689096470197, + "grad_norm": 5.054967403411865, + "learning_rate": 4.856639965029524e-05, + "loss": 0.6686, "step": 250500 }, { - "epoch": 2.55, - "learning_rate": 5.714300252477186e-05, - "loss": 0.8196, + "epoch": 3.4526466617067593, + "grad_norm": 17.167804718017578, + "learning_rate": 4.856124659050253e-05, + "loss": 0.7993, "step": 250600 }, { - "epoch": 2.55, - "learning_rate": 5.714017656243024e-05, - "loss": 0.8955, + "epoch": 3.454024413766499, + "grad_norm": 7.192329406738281, + "learning_rate": 4.8556091741449175e-05, + "loss": 0.7498, "step": 250700 }, { - "epoch": 2.56, - "learning_rate": 5.7137349273095945e-05, - "loss": 0.8981, + "epoch": 3.455402165826238, + "grad_norm": 18.25777244567871, + "learning_rate": 4.855093510357311e-05, + "loss": 0.7532, "step": 250800 }, { - "epoch": 2.56, - "learning_rate": 5.713452065690723e-05, - "loss": 0.7695, + "epoch": 3.456779917885977, + "grad_norm": 6.189223766326904, + "learning_rate": 4.854577667731241e-05, + "loss": 0.7239, "step": 250900 }, { - "epoch": 2.56, - "learning_rate": 5.713169071400237e-05, - "loss": 0.7515, + "epoch": 3.4581576699457166, + "grad_norm": 5.249884605407715, + "learning_rate": 4.854061646310531e-05, + "loss": 0.7946, "step": 251000 }, { - "epoch": 2.56, - "learning_rate": 5.7128859444519756e-05, - "loss": 0.9089, + "epoch": 3.4595354220054557, + "grad_norm": 5.6971659660339355, + "learning_rate": 4.8535454461390194e-05, + "loss": 0.6995, "step": 251100 }, { - "epoch": 2.56, - "learning_rate": 5.712602684859781e-05, - "loss": 1.0361, + "epoch": 3.4609131740651953, + "grad_norm": 40.974647521972656, + "learning_rate": 4.8530290672605605e-05, + "loss": 0.7671, "step": 251200 }, { - "epoch": 2.56, - "learning_rate": 5.7123192926375025e-05, - "loss": 0.8626, + "epoch": 3.4622909261249344, + "grad_norm": 4.172917366027832, + "learning_rate": 4.852512509719024e-05, + "loss": 0.7805, "step": 251300 }, { - "epoch": 2.56, - "learning_rate": 5.712035767798998e-05, - "loss": 0.7959, + "epoch": 3.463668678184674, + "grad_norm": 18.657901763916016, + "learning_rate": 4.851995773558293e-05, + "loss": 0.7516, "step": 251400 }, { - "epoch": 2.56, - "learning_rate": 5.711752110358129e-05, - "loss": 0.8262, + "epoch": 3.465046430244413, + "grad_norm": 9.104113578796387, + "learning_rate": 4.851478858822267e-05, + "loss": 0.7881, "step": 251500 }, { - "epoch": 2.56, - "learning_rate": 5.7114683203287634e-05, - "loss": 0.7986, + "epoch": 3.4664241823041526, + "grad_norm": 15.249149322509766, + "learning_rate": 4.8509617655548614e-05, + "loss": 0.7141, "step": 251600 }, { - "epoch": 2.56, - "learning_rate": 5.711184397724779e-05, - "loss": 0.7732, + "epoch": 3.4678019343638917, + "grad_norm": 4.647822380065918, + "learning_rate": 4.8504444938000054e-05, + "loss": 0.7246, "step": 251700 }, { - "epoch": 2.57, - "learning_rate": 5.710903183767834e-05, - "loss": 0.7471, + "epoch": 3.4691796864236313, + "grad_norm": 9.48563289642334, + "learning_rate": 4.849927043601644e-05, + "loss": 0.7333, "step": 251800 }, { - "epoch": 2.57, - "learning_rate": 5.710618997381663e-05, - "loss": 0.8087, + "epoch": 3.4705574384833704, + "grad_norm": 4.856204986572266, + "learning_rate": 4.849409415003737e-05, + "loss": 0.7581, "step": 251900 }, { - "epoch": 2.57, - "learning_rate": 5.710337522307584e-05, - "loss": 0.7915, + "epoch": 3.47193519054311, + "grad_norm": 3.4496560096740723, + "learning_rate": 4.8488916080502594e-05, + "loss": 0.6648, "step": 252000 }, { - "epoch": 2.57, - "learning_rate": 5.710053072194253e-05, - "loss": 0.921, + "epoch": 3.473312942602849, + "grad_norm": 4.758912086486816, + "learning_rate": 4.848373622785202e-05, + "loss": 0.6926, "step": 252100 }, { - "epoch": 2.57, - "learning_rate": 5.709768489575497e-05, - "loss": 0.8942, + "epoch": 3.4746906946625886, + "grad_norm": 7.947057723999023, + "learning_rate": 4.847855459252569e-05, + "loss": 0.7147, "step": 252200 }, { - "epoch": 2.57, - "learning_rate": 5.709483774465234e-05, - "loss": 0.9489, + "epoch": 3.4760684467223277, + "grad_norm": 3.94535756111145, + "learning_rate": 4.847337117496384e-05, + "loss": 0.7929, "step": 252300 }, { - "epoch": 2.57, - "learning_rate": 5.709198926877384e-05, - "loss": 0.9265, + "epoch": 3.4774461987820673, + "grad_norm": 4.1371917724609375, + "learning_rate": 4.8468185975606806e-05, + "loss": 0.7555, "step": 252400 }, { - "epoch": 2.57, - "learning_rate": 5.708916797282039e-05, - "loss": 0.8836, + "epoch": 3.4788239508418064, + "grad_norm": 22.970508575439453, + "learning_rate": 4.8462998994895086e-05, + "loss": 0.6991, "step": 252500 }, { - "epoch": 2.57, - "learning_rate": 5.708631686105231e-05, - "loss": 0.8376, + "epoch": 3.480201702901546, + "grad_norm": 24.19109344482422, + "learning_rate": 4.845781023326937e-05, + "loss": 0.7397, "step": 252600 }, { - "epoch": 2.57, - "learning_rate": 5.708346442492498e-05, - "loss": 0.8775, + "epoch": 3.481579454961285, + "grad_norm": 8.861376762390137, + "learning_rate": 4.845261969117046e-05, + "loss": 0.7637, "step": 252700 }, { - "epoch": 2.58, - "learning_rate": 5.708061066457787e-05, - "loss": 0.9899, + "epoch": 3.4829572070210246, + "grad_norm": 3.156024694442749, + "learning_rate": 4.84474273690393e-05, + "loss": 0.7181, "step": 252800 }, { - "epoch": 2.58, - "learning_rate": 5.70777555801505e-05, - "loss": 0.882, + "epoch": 3.4843349590807637, + "grad_norm": 5.054050922393799, + "learning_rate": 4.8442233267317024e-05, + "loss": 0.6469, "step": 252900 }, { - "epoch": 2.58, - "learning_rate": 5.707489917178248e-05, - "loss": 0.9497, + "epoch": 3.4857127111405033, + "grad_norm": 11.35214614868164, + "learning_rate": 4.843703738644489e-05, + "loss": 0.7373, "step": 253000 }, { - "epoch": 2.58, - "learning_rate": 5.707204143961346e-05, - "loss": 0.9415, + "epoch": 3.4870904632002424, + "grad_norm": 3.2364251613616943, + "learning_rate": 4.843183972686432e-05, + "loss": 0.7847, "step": 253100 }, { - "epoch": 2.58, - "learning_rate": 5.7069182383783164e-05, - "loss": 0.9093, + "epoch": 3.488468215259982, + "grad_norm": 20.48175811767578, + "learning_rate": 4.842664028901688e-05, + "loss": 0.6701, "step": 253200 }, { - "epoch": 2.58, - "learning_rate": 5.7066322004431405e-05, - "loss": 0.7644, + "epoch": 3.489845967319721, + "grad_norm": 10.260986328125, + "learning_rate": 4.842143907334429e-05, + "loss": 0.7859, "step": 253300 }, { - "epoch": 2.58, - "learning_rate": 5.7063460301698015e-05, - "loss": 0.7724, + "epoch": 3.4912237193794606, + "grad_norm": 5.544047832489014, + "learning_rate": 4.84162360802884e-05, + "loss": 0.678, "step": 253400 }, { - "epoch": 2.58, - "learning_rate": 5.7060597275722925e-05, - "loss": 0.9029, + "epoch": 3.4926014714391997, + "grad_norm": 16.02749252319336, + "learning_rate": 4.841103131029127e-05, + "loss": 0.7296, "step": 253500 }, { - "epoch": 2.58, - "learning_rate": 5.705773292664612e-05, - "loss": 0.9727, + "epoch": 3.4939792234989393, + "grad_norm": 1.9412063360214233, + "learning_rate": 4.8405824763795035e-05, + "loss": 0.7713, "step": 253600 }, { - "epoch": 2.58, - "learning_rate": 5.705486725460764e-05, - "loss": 0.7883, + "epoch": 3.4953569755586784, + "grad_norm": 8.680977821350098, + "learning_rate": 4.840061644124204e-05, + "loss": 0.7368, "step": 253700 }, { - "epoch": 2.59, - "learning_rate": 5.705200025974761e-05, - "loss": 0.8152, + "epoch": 3.496734727618418, + "grad_norm": 6.058280944824219, + "learning_rate": 4.8395406343074755e-05, + "loss": 0.7059, "step": 253800 }, { - "epoch": 2.59, - "learning_rate": 5.70491319422062e-05, - "loss": 0.8392, + "epoch": 3.498112479678157, + "grad_norm": 7.602729797363281, + "learning_rate": 4.8390194469735795e-05, + "loss": 0.7328, "step": 253900 }, { - "epoch": 2.59, - "learning_rate": 5.704626230212367e-05, - "loss": 0.8581, + "epoch": 3.4994902317378966, + "grad_norm": 35.16118240356445, + "learning_rate": 4.8384980821667946e-05, + "loss": 0.7251, "step": 254000 }, { - "epoch": 2.59, - "learning_rate": 5.704339133964031e-05, - "loss": 0.9669, + "epoch": 3.5008679837976358, + "grad_norm": 10.049656867980957, + "learning_rate": 4.8379765399314125e-05, + "loss": 0.6453, "step": 254100 }, { - "epoch": 2.59, - "learning_rate": 5.70405190548965e-05, - "loss": 0.82, + "epoch": 3.5022457358573753, + "grad_norm": 1.452939510345459, + "learning_rate": 4.837454820311741e-05, + "loss": 0.7423, "step": 254200 }, { - "epoch": 2.59, - "learning_rate": 5.703764544803268e-05, - "loss": 0.8998, + "epoch": 3.5036234879171144, + "grad_norm": 4.007043361663818, + "learning_rate": 4.836932923352104e-05, + "loss": 0.6879, "step": 254300 }, { - "epoch": 2.59, - "learning_rate": 5.703477051918935e-05, - "loss": 0.8175, + "epoch": 3.5050012399768535, + "grad_norm": 10.145913124084473, + "learning_rate": 4.836410849096838e-05, + "loss": 0.7283, "step": 254400 }, { - "epoch": 2.59, - "learning_rate": 5.7031894268507075e-05, - "loss": 0.797, + "epoch": 3.506378992036593, + "grad_norm": 4.904376029968262, + "learning_rate": 4.835888597590297e-05, + "loss": 0.7373, "step": 254500 }, { - "epoch": 2.59, - "learning_rate": 5.7029045478392244e-05, - "loss": 0.8205, + "epoch": 3.5077567440963326, + "grad_norm": 6.015730857849121, + "learning_rate": 4.835366168876848e-05, + "loss": 0.6974, "step": 254600 }, { - "epoch": 2.59, - "learning_rate": 5.702616659766891e-05, - "loss": 0.833, + "epoch": 3.5091344961560718, + "grad_norm": 10.618706703186035, + "learning_rate": 4.834843563000873e-05, + "loss": 0.7305, "step": 254700 }, { - "epoch": 2.6, - "learning_rate": 5.702328639552733e-05, - "loss": 0.8664, + "epoch": 3.510512248215811, + "grad_norm": 6.833013534545898, + "learning_rate": 4.834320780006773e-05, + "loss": 0.754, "step": 254800 }, { - "epoch": 2.6, - "learning_rate": 5.7020404872108305e-05, - "loss": 0.8918, + "epoch": 3.5118900002755504, + "grad_norm": 3.680297374725342, + "learning_rate": 4.833797819938959e-05, + "loss": 0.706, "step": 254900 }, { - "epoch": 2.6, - "learning_rate": 5.7017522027552736e-05, - "loss": 0.8842, + "epoch": 3.51326775233529, + "grad_norm": 3.6445000171661377, + "learning_rate": 4.833274682841859e-05, + "loss": 0.6842, "step": 255000 }, { - "epoch": 2.6, - "learning_rate": 5.701463786200156e-05, - "loss": 0.853, + "epoch": 3.514645504395029, + "grad_norm": 7.004006862640381, + "learning_rate": 4.832751368759917e-05, + "loss": 0.6952, "step": 255100 }, { - "epoch": 2.6, - "learning_rate": 5.701175237559582e-05, - "loss": 0.8193, + "epoch": 3.516023256454768, + "grad_norm": 5.290282726287842, + "learning_rate": 4.832227877737591e-05, + "loss": 0.7236, "step": 255200 }, { - "epoch": 2.6, - "learning_rate": 5.700886556847659e-05, - "loss": 0.8422, + "epoch": 3.5174010085145078, + "grad_norm": 5.978407382965088, + "learning_rate": 4.831704209819354e-05, + "loss": 0.7177, "step": 255300 }, { - "epoch": 2.6, - "learning_rate": 5.7006006328598286e-05, - "loss": 0.8899, + "epoch": 3.5187787605742473, + "grad_norm": 8.913142204284668, + "learning_rate": 4.8311803650496945e-05, + "loss": 0.7054, "step": 255400 }, { - "epoch": 2.6, - "learning_rate": 5.700311689367918e-05, - "loss": 0.8576, + "epoch": 3.5201565126339864, + "grad_norm": 6.210485935211182, + "learning_rate": 4.8306563434731164e-05, + "loss": 0.6782, "step": 255500 }, { - "epoch": 2.6, - "learning_rate": 5.70002261384688e-05, - "loss": 0.8841, + "epoch": 3.5215342646937255, + "grad_norm": 49.2089958190918, + "learning_rate": 4.8301321451341366e-05, + "loss": 0.784, "step": 255600 }, { - "epoch": 2.61, - "learning_rate": 5.69973340631085e-05, - "loss": 1.0285, + "epoch": 3.522912016753465, + "grad_norm": 3.744565010070801, + "learning_rate": 4.829607770077289e-05, + "loss": 0.7407, "step": 255700 }, { - "epoch": 2.61, - "learning_rate": 5.6994440667739674e-05, - "loss": 0.8977, + "epoch": 3.524289768813204, + "grad_norm": 13.391924858093262, + "learning_rate": 4.829083218347123e-05, + "loss": 0.6471, "step": 255800 }, { - "epoch": 2.61, - "learning_rate": 5.6991545952503783e-05, - "loss": 0.9608, + "epoch": 3.5256675208729438, + "grad_norm": 3.9713025093078613, + "learning_rate": 4.8285584899882e-05, + "loss": 0.7265, "step": 255900 }, { - "epoch": 2.61, - "learning_rate": 5.698864991754238e-05, - "loss": 0.9759, + "epoch": 3.527045272932683, + "grad_norm": 14.249966621398926, + "learning_rate": 4.8280335850451005e-05, + "loss": 0.7673, "step": 256000 }, { - "epoch": 2.61, - "learning_rate": 5.698575256299706e-05, - "loss": 0.9022, + "epoch": 3.5284230249924224, + "grad_norm": 3.9654293060302734, + "learning_rate": 4.8275085035624166e-05, + "loss": 0.7313, "step": 256100 }, { - "epoch": 2.61, - "learning_rate": 5.698285388900947e-05, - "loss": 0.9024, + "epoch": 3.5298007770521616, + "grad_norm": 3.889775276184082, + "learning_rate": 4.826983245584756e-05, + "loss": 0.7627, "step": 256200 }, { - "epoch": 2.61, - "learning_rate": 5.697995389572137e-05, - "loss": 0.8635, + "epoch": 3.531178529111901, + "grad_norm": 7.995856285095215, + "learning_rate": 4.826457811156744e-05, + "loss": 0.743, "step": 256300 }, { - "epoch": 2.61, - "learning_rate": 5.697705258327451e-05, - "loss": 1.011, + "epoch": 3.5325562811716402, + "grad_norm": 4.204479217529297, + "learning_rate": 4.825932200323017e-05, + "loss": 0.6754, "step": 256400 }, { - "epoch": 2.61, - "learning_rate": 5.697414995181079e-05, - "loss": 0.9641, + "epoch": 3.53393403323138, + "grad_norm": 15.757999420166016, + "learning_rate": 4.8254064131282305e-05, + "loss": 0.6966, "step": 256500 }, { - "epoch": 2.61, - "learning_rate": 5.6971246001472096e-05, - "loss": 0.9, + "epoch": 3.535311785291119, + "grad_norm": 8.793876647949219, + "learning_rate": 4.82488044961705e-05, + "loss": 0.7616, "step": 256600 }, { - "epoch": 2.62, - "learning_rate": 5.696834073240044e-05, - "loss": 0.9274, + "epoch": 3.5366895373508584, + "grad_norm": 8.58087158203125, + "learning_rate": 4.8243543098341614e-05, + "loss": 0.6844, "step": 256700 }, { - "epoch": 2.62, - "learning_rate": 5.696543414473785e-05, - "loss": 0.9429, + "epoch": 3.5380672894105976, + "grad_norm": 3.6597142219543457, + "learning_rate": 4.823833257856536e-05, + "loss": 0.7325, "step": 256800 }, { - "epoch": 2.62, - "learning_rate": 5.696252623862646e-05, - "loss": 0.8981, + "epoch": 3.539445041470337, + "grad_norm": 5.672914028167725, + "learning_rate": 4.823306767425941e-05, + "loss": 0.6947, "step": 256900 }, { - "epoch": 2.62, - "learning_rate": 5.695961701420843e-05, - "loss": 0.9315, + "epoch": 3.5408227935300762, + "grad_norm": 9.928915977478027, + "learning_rate": 4.8227801008573277e-05, + "loss": 0.7355, "step": 257000 }, { - "epoch": 2.62, - "learning_rate": 5.695670647162603e-05, - "loss": 0.9233, + "epoch": 3.542200545589816, + "grad_norm": 6.378506660461426, + "learning_rate": 4.822258527493575e-05, + "loss": 0.7494, "step": 257100 }, { - "epoch": 2.62, - "learning_rate": 5.695379461102154e-05, - "loss": 1.0118, + "epoch": 3.543578297649555, + "grad_norm": 9.644248008728027, + "learning_rate": 4.8217315105434356e-05, + "loss": 0.7181, "step": 257200 }, { - "epoch": 2.62, - "learning_rate": 5.695088143253735e-05, - "loss": 0.9831, + "epoch": 3.5449560497092945, + "grad_norm": 3.024136781692505, + "learning_rate": 4.8212043175891056e-05, + "loss": 0.659, "step": 257300 }, { - "epoch": 2.62, - "learning_rate": 5.694799608780044e-05, - "loss": 0.9721, + "epoch": 3.5463338017690336, + "grad_norm": 29.016172409057617, + "learning_rate": 4.820676948675373e-05, + "loss": 0.7913, "step": 257400 }, { - "epoch": 2.62, - "learning_rate": 5.694508028715945e-05, - "loss": 0.8437, + "epoch": 3.547711553828773, + "grad_norm": 97.14000701904297, + "learning_rate": 4.820149403847039e-05, + "loss": 0.738, "step": 257500 }, { - "epoch": 2.62, - "learning_rate": 5.694216316906485e-05, - "loss": 0.8963, + "epoch": 3.5490893058885122, + "grad_norm": 2.647348642349243, + "learning_rate": 4.819621683148924e-05, + "loss": 0.7436, "step": 257600 }, { - "epoch": 2.63, - "learning_rate": 5.6939244733659246e-05, - "loss": 0.887, + "epoch": 3.550467057948252, + "grad_norm": 4.298648357391357, + "learning_rate": 4.819093786625858e-05, + "loss": 0.6504, "step": 257700 }, { - "epoch": 2.63, - "learning_rate": 5.6936324981085335e-05, - "loss": 0.9561, + "epoch": 3.551844810007991, + "grad_norm": 2.510650396347046, + "learning_rate": 4.818565714322689e-05, + "loss": 0.7227, "step": 257800 }, { - "epoch": 2.63, - "learning_rate": 5.693340391148588e-05, - "loss": 0.9189, + "epoch": 3.55322256206773, + "grad_norm": 7.00123405456543, + "learning_rate": 4.81803746628428e-05, + "loss": 0.7115, "step": 257900 }, { - "epoch": 2.63, - "learning_rate": 5.693048152500371e-05, - "loss": 0.9984, + "epoch": 3.5546003141274696, + "grad_norm": 10.8532075881958, + "learning_rate": 4.817509042555509e-05, + "loss": 0.7083, "step": 258000 }, { - "epoch": 2.63, - "learning_rate": 5.69275578217817e-05, - "loss": 1.0144, + "epoch": 3.555978066187209, + "grad_norm": 21.37672233581543, + "learning_rate": 4.8169804431812665e-05, + "loss": 0.7287, "step": 258100 }, { - "epoch": 2.63, - "learning_rate": 5.692463280196281e-05, - "loss": 0.8798, + "epoch": 3.5573558182469482, + "grad_norm": 9.328338623046875, + "learning_rate": 4.816451668206462e-05, + "loss": 0.7438, "step": 258200 }, { - "epoch": 2.63, - "learning_rate": 5.692170646569006e-05, - "loss": 0.9476, + "epoch": 3.5587335703066874, + "grad_norm": 5.649765491485596, + "learning_rate": 4.8159227176760155e-05, + "loss": 0.7996, "step": 258300 }, { - "epoch": 2.63, - "learning_rate": 5.691877881310652e-05, - "loss": 0.9003, + "epoch": 3.560111322366427, + "grad_norm": 5.415292739868164, + "learning_rate": 4.815393591634866e-05, + "loss": 0.7156, "step": 258400 }, { - "epoch": 2.63, - "learning_rate": 5.6915849844355336e-05, - "loss": 1.0298, + "epoch": 3.5614890744261665, + "grad_norm": 17.550783157348633, + "learning_rate": 4.814864290127963e-05, + "loss": 0.8034, "step": 258500 }, { - "epoch": 2.63, - "learning_rate": 5.691291955957973e-05, - "loss": 0.8374, + "epoch": 3.5628668264859056, + "grad_norm": 13.280574798583984, + "learning_rate": 4.814334813200275e-05, + "loss": 0.7627, "step": 258600 }, { - "epoch": 2.64, - "learning_rate": 5.690998795892295e-05, - "loss": 0.971, + "epoch": 3.5642445785456447, + "grad_norm": 5.680484771728516, + "learning_rate": 4.813805160896784e-05, + "loss": 0.6671, "step": 258700 }, { - "epoch": 2.64, - "learning_rate": 5.690705504252836e-05, - "loss": 0.9238, + "epoch": 3.5656223306053842, + "grad_norm": 3.3929390907287598, + "learning_rate": 4.8132753332624864e-05, + "loss": 0.7065, "step": 258800 }, { - "epoch": 2.64, - "learning_rate": 5.690412081053935e-05, - "loss": 1.0069, + "epoch": 3.567000082665124, + "grad_norm": 4.375004291534424, + "learning_rate": 4.812745330342393e-05, + "loss": 0.7072, "step": 258900 }, { - "epoch": 2.64, - "learning_rate": 5.6901185263099386e-05, - "loss": 0.9382, + "epoch": 3.568377834724863, + "grad_norm": 4.153993606567383, + "learning_rate": 4.8122151521815315e-05, + "loss": 0.8178, "step": 259000 }, { - "epoch": 2.64, - "learning_rate": 5.6898248400351994e-05, - "loss": 0.9315, + "epoch": 3.569755586784602, + "grad_norm": 9.471790313720703, + "learning_rate": 4.811684798824942e-05, + "loss": 0.6827, "step": 259100 }, { - "epoch": 2.64, - "learning_rate": 5.6895310222440784e-05, - "loss": 0.9887, + "epoch": 3.5711333388443416, + "grad_norm": 7.491828441619873, + "learning_rate": 4.811154270317682e-05, + "loss": 0.7637, "step": 259200 }, { - "epoch": 2.64, - "learning_rate": 5.689237072950941e-05, - "loss": 0.8713, + "epoch": 3.572511090904081, + "grad_norm": 53.97184371948242, + "learning_rate": 4.810623566704822e-05, + "loss": 0.7672, "step": 259300 }, { - "epoch": 2.64, - "learning_rate": 5.688942992170158e-05, - "loss": 0.891, + "epoch": 3.5738888429638203, + "grad_norm": 10.1900634765625, + "learning_rate": 4.810092688031447e-05, + "loss": 0.7222, "step": 259400 }, { - "epoch": 2.64, - "learning_rate": 5.68864877991611e-05, - "loss": 0.9766, + "epoch": 3.5752665950235594, + "grad_norm": 25.034801483154297, + "learning_rate": 4.809561634342659e-05, + "loss": 0.7186, "step": 259500 }, { - "epoch": 2.64, - "learning_rate": 5.688354436203183e-05, - "loss": 0.8095, + "epoch": 3.576644347083299, + "grad_norm": 42.99748611450195, + "learning_rate": 4.809030405683574e-05, + "loss": 0.7698, "step": 259600 }, { - "epoch": 2.65, - "learning_rate": 5.688059961045766e-05, - "loss": 0.8364, + "epoch": 3.5780220991430385, + "grad_norm": 7.716062545776367, + "learning_rate": 4.808499002099322e-05, + "loss": 0.7963, "step": 259700 }, { - "epoch": 2.65, - "learning_rate": 5.6877653544582585e-05, - "loss": 0.9632, + "epoch": 3.5793998512027776, + "grad_norm": 4.706084728240967, + "learning_rate": 4.8079674236350485e-05, + "loss": 0.8121, "step": 259800 }, { - "epoch": 2.65, - "learning_rate": 5.687470616455065e-05, - "loss": 0.7766, + "epoch": 3.5807776032625167, + "grad_norm": 7.900924205780029, + "learning_rate": 4.807435670335913e-05, + "loss": 0.7499, "step": 259900 }, { - "epoch": 2.65, - "learning_rate": 5.687175747050597e-05, - "loss": 0.9271, + "epoch": 3.5821553553222563, + "grad_norm": 43.2916145324707, + "learning_rate": 4.806903742247093e-05, + "loss": 0.6282, "step": 260000 }, { - "epoch": 2.65, - "learning_rate": 5.68688074625927e-05, - "loss": 0.8942, + "epoch": 3.5835331073819954, + "grad_norm": 6.506124019622803, + "learning_rate": 4.806371639413777e-05, + "loss": 0.7603, "step": 260100 }, { - "epoch": 2.65, - "learning_rate": 5.68658561409551e-05, - "loss": 0.7587, + "epoch": 3.584910859441735, + "grad_norm": 4.683910369873047, + "learning_rate": 4.8058393618811685e-05, + "loss": 0.7214, "step": 260200 }, { - "epoch": 2.65, - "learning_rate": 5.6862903505737456e-05, - "loss": 0.8526, + "epoch": 3.586288611501474, + "grad_norm": 8.051076889038086, + "learning_rate": 4.80530690969449e-05, + "loss": 0.7195, "step": 260300 }, { - "epoch": 2.65, - "learning_rate": 5.6859949557084145e-05, - "loss": 0.9299, + "epoch": 3.5876663635612136, + "grad_norm": 25.076473236083984, + "learning_rate": 4.804774282898974e-05, + "loss": 0.6599, "step": 260400 }, { - "epoch": 2.65, - "learning_rate": 5.685699429513958e-05, - "loss": 0.848, + "epoch": 3.5890441156209527, + "grad_norm": 11.387847900390625, + "learning_rate": 4.8042414815398704e-05, + "loss": 0.7632, "step": 260500 }, { - "epoch": 2.66, - "learning_rate": 5.685403772004828e-05, - "loss": 0.9296, + "epoch": 3.5904218676806923, + "grad_norm": 41.54713821411133, + "learning_rate": 4.803708505662444e-05, + "loss": 0.7081, "step": 260600 }, { - "epoch": 2.66, - "learning_rate": 5.685107983195479e-05, - "loss": 0.9977, + "epoch": 3.5917996197404314, + "grad_norm": 37.84764099121094, + "learning_rate": 4.803175355311973e-05, + "loss": 0.7368, "step": 260700 }, { - "epoch": 2.66, - "learning_rate": 5.684812063100374e-05, - "loss": 0.8459, + "epoch": 3.593177371800171, + "grad_norm": 14.859453201293945, + "learning_rate": 4.8026420305337515e-05, + "loss": 0.7185, "step": 260800 }, { - "epoch": 2.66, - "learning_rate": 5.68451601173398e-05, - "loss": 0.8624, + "epoch": 3.59455512385991, + "grad_norm": 51.62633514404297, + "learning_rate": 4.802108531373088e-05, + "loss": 0.7543, "step": 260900 }, { - "epoch": 2.66, - "learning_rate": 5.684219829110774e-05, - "loss": 0.8767, + "epoch": 3.5959328759196496, + "grad_norm": 19.463062286376953, + "learning_rate": 4.801574857875307e-05, + "loss": 0.692, "step": 261000 }, { - "epoch": 2.66, - "learning_rate": 5.6839235152452377e-05, - "loss": 0.8299, + "epoch": 3.5973106279793887, + "grad_norm": 9.161649703979492, + "learning_rate": 4.801041010085746e-05, + "loss": 0.6862, "step": 261100 }, { - "epoch": 2.66, - "learning_rate": 5.683627070151856e-05, - "loss": 0.9453, + "epoch": 3.5986883800391283, + "grad_norm": 13.147780418395996, + "learning_rate": 4.800506988049757e-05, + "loss": 0.8044, "step": 261200 }, { - "epoch": 2.66, - "learning_rate": 5.683330493845128e-05, - "loss": 0.9189, + "epoch": 3.6000661320988674, + "grad_norm": 15.129803657531738, + "learning_rate": 4.79997279181271e-05, + "loss": 0.7687, "step": 261300 }, { - "epoch": 2.66, - "learning_rate": 5.6830337863395515e-05, - "loss": 0.8097, + "epoch": 3.601443884158607, + "grad_norm": 31.365673065185547, + "learning_rate": 4.799438421419987e-05, + "loss": 0.7516, "step": 261400 }, { - "epoch": 2.66, - "learning_rate": 5.6827369476496344e-05, - "loss": 0.845, + "epoch": 3.602821636218346, + "grad_norm": 9.364607810974121, + "learning_rate": 4.7989038769169845e-05, + "loss": 0.741, "step": 261500 }, { - "epoch": 2.67, - "learning_rate": 5.68243997778989e-05, - "loss": 0.7803, + "epoch": 3.6041993882780856, + "grad_norm": 2.6934702396392822, + "learning_rate": 4.7983745063962666e-05, + "loss": 0.7155, "step": 261600 }, { - "epoch": 2.67, - "learning_rate": 5.682142876774839e-05, - "loss": 0.8154, + "epoch": 3.6055771403378247, + "grad_norm": 14.262542724609375, + "learning_rate": 4.797839615548928e-05, + "loss": 0.735, "step": 261700 }, { - "epoch": 2.67, - "learning_rate": 5.6818456446190066e-05, - "loss": 0.826, + "epoch": 3.6069548923975643, + "grad_norm": 29.470951080322266, + "learning_rate": 4.797304550727137e-05, + "loss": 0.7384, "step": 261800 }, { - "epoch": 2.67, - "learning_rate": 5.681548281336927e-05, - "loss": 0.8515, + "epoch": 3.6083326444573034, + "grad_norm": 3.852599859237671, + "learning_rate": 4.796769311976351e-05, + "loss": 0.7171, "step": 261900 }, { - "epoch": 2.67, - "learning_rate": 5.6812507869431396e-05, - "loss": 0.9092, + "epoch": 3.609710396517043, + "grad_norm": 2.499370574951172, + "learning_rate": 4.796233899342041e-05, + "loss": 0.8475, "step": 262000 }, { - "epoch": 2.67, - "learning_rate": 5.68095316145219e-05, - "loss": 0.8429, + "epoch": 3.611088148576782, + "grad_norm": 12.833796501159668, + "learning_rate": 4.795698312869693e-05, + "loss": 0.7111, "step": 262100 }, { - "epoch": 2.67, - "learning_rate": 5.6806554048786286e-05, - "loss": 0.7835, + "epoch": 3.612465900636521, + "grad_norm": 16.0988712310791, + "learning_rate": 4.795162552604806e-05, + "loss": 0.689, "step": 262200 }, { - "epoch": 2.67, - "learning_rate": 5.680357517237016e-05, - "loss": 0.7801, + "epoch": 3.6138436526962607, + "grad_norm": 10.9572114944458, + "learning_rate": 4.794626618592899e-05, + "loss": 0.7906, "step": 262300 }, { - "epoch": 2.67, - "learning_rate": 5.680059498541916e-05, - "loss": 0.8571, + "epoch": 3.6152214047560003, + "grad_norm": 5.4418416023254395, + "learning_rate": 4.794090510879499e-05, + "loss": 0.7051, "step": 262400 }, { - "epoch": 2.67, - "learning_rate": 5.679761348807901e-05, - "loss": 0.791, + "epoch": 3.6165991568157394, + "grad_norm": 3.5932974815368652, + "learning_rate": 4.793554229510154e-05, + "loss": 0.7169, "step": 262500 }, { - "epoch": 2.68, - "learning_rate": 5.679463068049547e-05, - "loss": 0.8853, + "epoch": 3.6179769088754785, + "grad_norm": 4.044643878936768, + "learning_rate": 4.793017774530421e-05, + "loss": 0.7507, "step": 262600 }, { - "epoch": 2.68, - "learning_rate": 5.6791646562814395e-05, - "loss": 0.8014, + "epoch": 3.619354660935218, + "grad_norm": 3.4287917613983154, + "learning_rate": 4.792481145985877e-05, + "loss": 0.7921, "step": 262700 }, { - "epoch": 2.68, - "learning_rate": 5.678866113518169e-05, - "loss": 0.8094, + "epoch": 3.6207324129949576, + "grad_norm": 8.930657386779785, + "learning_rate": 4.7919443439221106e-05, + "loss": 0.8014, "step": 262800 }, { - "epoch": 2.68, - "learning_rate": 5.678567439774332e-05, - "loss": 0.8718, + "epoch": 3.6221101650546967, + "grad_norm": 163.52113342285156, + "learning_rate": 4.7914073683847254e-05, + "loss": 0.766, "step": 262900 }, { - "epoch": 2.68, - "learning_rate": 5.6782686350645324e-05, - "loss": 0.9101, + "epoch": 3.623487917114436, + "grad_norm": 2.648787021636963, + "learning_rate": 4.7908702194193406e-05, + "loss": 0.7713, "step": 263000 }, { - "epoch": 2.68, - "learning_rate": 5.677969699403379e-05, - "loss": 0.8268, + "epoch": 3.6248656691741754, + "grad_norm": 4.38862943649292, + "learning_rate": 4.790338271153159e-05, + "loss": 0.662, "step": 263100 }, { - "epoch": 2.68, - "learning_rate": 5.6776706328054896e-05, - "loss": 0.7762, + "epoch": 3.626243421233915, + "grad_norm": 8.300777435302734, + "learning_rate": 4.7898007772018314e-05, + "loss": 0.7063, "step": 263200 }, { - "epoch": 2.68, - "learning_rate": 5.6773714352854844e-05, - "loss": 0.8332, + "epoch": 3.627621173293654, + "grad_norm": 3.531403064727783, + "learning_rate": 4.789263109958992e-05, + "loss": 0.7463, "step": 263300 }, { - "epoch": 2.68, - "learning_rate": 5.677072106857995e-05, - "loss": 0.7458, + "epoch": 3.628998925353393, + "grad_norm": 3.786421775817871, + "learning_rate": 4.788725269470319e-05, + "loss": 0.6612, "step": 263400 }, { - "epoch": 2.68, - "learning_rate": 5.676772647537655e-05, - "loss": 0.8379, + "epoch": 3.6303766774131327, + "grad_norm": 5.484480381011963, + "learning_rate": 4.788187255781504e-05, + "loss": 0.8124, "step": 263500 }, { - "epoch": 2.69, - "learning_rate": 5.6764730573391064e-05, - "loss": 0.8777, + "epoch": 3.6317544294728723, + "grad_norm": 7.373251438140869, + "learning_rate": 4.787649068938254e-05, + "loss": 0.7311, "step": 263600 }, { - "epoch": 2.69, - "learning_rate": 5.6761733362769974e-05, - "loss": 0.8984, + "epoch": 3.6331321815326114, + "grad_norm": 25.674911499023438, + "learning_rate": 4.787110708986291e-05, + "loss": 0.6926, "step": 263700 }, { - "epoch": 2.69, - "learning_rate": 5.6758734843659834e-05, - "loss": 0.8848, + "epoch": 3.6345099335923505, + "grad_norm": 18.366029739379883, + "learning_rate": 4.7865721759713516e-05, + "loss": 0.7466, "step": 263800 }, { - "epoch": 2.69, - "learning_rate": 5.675573501620725e-05, - "loss": 0.8055, + "epoch": 3.63588768565209, + "grad_norm": 7.999305248260498, + "learning_rate": 4.786033469939187e-05, + "loss": 0.7021, "step": 263900 }, { - "epoch": 2.69, - "learning_rate": 5.675273388055889e-05, - "loss": 0.9445, + "epoch": 3.6372654377118296, + "grad_norm": 8.078387260437012, + "learning_rate": 4.7854945909355624e-05, + "loss": 0.7951, "step": 264000 }, { - "epoch": 2.69, - "learning_rate": 5.674973143686149e-05, - "loss": 0.8991, + "epoch": 3.6386431897715688, + "grad_norm": 10.383421897888184, + "learning_rate": 4.78495553900626e-05, + "loss": 0.751, "step": 264100 }, { - "epoch": 2.69, - "learning_rate": 5.674672768526187e-05, - "loss": 0.856, + "epoch": 3.640020941831308, + "grad_norm": 8.066372871398926, + "learning_rate": 4.784416314197073e-05, + "loss": 0.8231, "step": 264200 }, { - "epoch": 2.69, - "learning_rate": 5.674372262590687e-05, - "loss": 0.9428, + "epoch": 3.6413986938910474, + "grad_norm": 5.808138847351074, + "learning_rate": 4.783882311385623e-05, + "loss": 0.7748, "step": 264300 }, { - "epoch": 2.69, - "learning_rate": 5.6740716258943434e-05, - "loss": 0.8495, + "epoch": 3.6427764459507865, + "grad_norm": 11.194897651672363, + "learning_rate": 4.783342742681768e-05, + "loss": 0.6328, "step": 264400 }, { - "epoch": 2.69, - "learning_rate": 5.6737708584518564e-05, - "loss": 0.9197, + "epoch": 3.644154198010526, + "grad_norm": 8.608333587646484, + "learning_rate": 4.782803001235045e-05, + "loss": 0.7876, "step": 264500 }, { - "epoch": 2.7, - "learning_rate": 5.6734699602779296e-05, - "loss": 0.7676, + "epoch": 3.645531950070265, + "grad_norm": 4.367278575897217, + "learning_rate": 4.782263087091307e-05, + "loss": 0.695, "step": 264600 }, { - "epoch": 2.7, - "learning_rate": 5.673168931387276e-05, - "loss": 0.8718, + "epoch": 3.6469097021300048, + "grad_norm": 1.2145054340362549, + "learning_rate": 4.7817230002964225e-05, + "loss": 0.7885, "step": 264700 }, { - "epoch": 2.7, - "learning_rate": 5.672867771794615e-05, - "loss": 0.8502, + "epoch": 3.648287454189744, + "grad_norm": 8.021724700927734, + "learning_rate": 4.7811827408962756e-05, + "loss": 0.7201, "step": 264800 }, { - "epoch": 2.7, - "learning_rate": 5.67256648151467e-05, - "loss": 0.896, + "epoch": 3.6496652062494834, + "grad_norm": 14.330703735351562, + "learning_rate": 4.7806423089367634e-05, + "loss": 0.7589, "step": 264900 }, { - "epoch": 2.7, - "learning_rate": 5.672265060562174e-05, - "loss": 0.8119, + "epoch": 3.6510429583092225, + "grad_norm": 11.726005554199219, + "learning_rate": 4.780101704463798e-05, + "loss": 0.7641, "step": 265000 }, { - "epoch": 2.7, - "learning_rate": 5.671963508951863e-05, - "loss": 0.8941, + "epoch": 3.652420710368962, + "grad_norm": 6.068086624145508, + "learning_rate": 4.779560927523307e-05, + "loss": 0.837, "step": 265100 }, { - "epoch": 2.7, - "learning_rate": 5.6716618266984827e-05, - "loss": 0.7739, + "epoch": 3.653798462428701, + "grad_norm": 25.767248153686523, + "learning_rate": 4.779019978161232e-05, + "loss": 0.802, "step": 265200 }, { - "epoch": 2.7, - "learning_rate": 5.6713600138167816e-05, - "loss": 0.9141, + "epoch": 3.6551762144884408, + "grad_norm": 4.487380027770996, + "learning_rate": 4.7784788564235295e-05, + "loss": 0.8065, "step": 265300 }, { - "epoch": 2.7, - "learning_rate": 5.67106109040296e-05, - "loss": 0.9213, + "epoch": 3.65655396654818, + "grad_norm": 4.216097831726074, + "learning_rate": 4.77793756235617e-05, + "loss": 0.8177, "step": 265400 }, { - "epoch": 2.7, - "learning_rate": 5.670759017614812e-05, - "loss": 0.8806, + "epoch": 3.6579317186079194, + "grad_norm": 9.052308082580566, + "learning_rate": 4.7773960960051406e-05, + "loss": 0.7106, "step": 265500 }, { - "epoch": 2.71, - "learning_rate": 5.6704568142424855e-05, - "loss": 0.9328, + "epoch": 3.6593094706676585, + "grad_norm": 8.382248878479004, + "learning_rate": 4.77685445741644e-05, + "loss": 0.8162, "step": 265600 }, { - "epoch": 2.71, - "learning_rate": 5.670154480300758e-05, - "loss": 0.8146, + "epoch": 3.660687222727398, + "grad_norm": 6.573047161102295, + "learning_rate": 4.776312646636085e-05, + "loss": 0.763, "step": 265700 }, { - "epoch": 2.71, - "learning_rate": 5.669852015804412e-05, - "loss": 0.9284, + "epoch": 3.662064974787137, + "grad_norm": 16.751585006713867, + "learning_rate": 4.775770663710103e-05, + "loss": 0.7174, "step": 265800 }, { - "epoch": 2.71, - "learning_rate": 5.6695494207682345e-05, - "loss": 0.7828, + "epoch": 3.6634427268468768, + "grad_norm": 4.9187822341918945, + "learning_rate": 4.775228508684539e-05, + "loss": 0.7844, "step": 265900 }, { - "epoch": 2.71, - "learning_rate": 5.669246695207021e-05, - "loss": 0.8531, + "epoch": 3.664820478906616, + "grad_norm": 11.159941673278809, + "learning_rate": 4.7746861816054535e-05, + "loss": 0.7923, "step": 266000 }, { - "epoch": 2.71, - "learning_rate": 5.6689438391355734e-05, - "loss": 0.7673, + "epoch": 3.6661982309663554, + "grad_norm": 8.07771110534668, + "learning_rate": 4.774143682518918e-05, + "loss": 0.697, "step": 266100 }, { - "epoch": 2.71, - "learning_rate": 5.6686408525687e-05, - "loss": 0.9254, + "epoch": 3.6675759830260946, + "grad_norm": 3.0037763118743896, + "learning_rate": 4.7736010114710215e-05, + "loss": 0.7301, "step": 266200 }, { - "epoch": 2.71, - "learning_rate": 5.668337735521214e-05, - "loss": 0.9133, + "epoch": 3.668953735085834, + "grad_norm": 15.279373168945312, + "learning_rate": 4.7730581685078664e-05, + "loss": 0.7591, "step": 266300 }, { - "epoch": 2.71, - "learning_rate": 5.668034488007937e-05, - "loss": 0.897, + "epoch": 3.6703314871455732, + "grad_norm": 12.624349594116211, + "learning_rate": 4.77251515367557e-05, + "loss": 0.8636, "step": 266400 }, { - "epoch": 2.72, - "learning_rate": 5.667731110043695e-05, - "loss": 0.7841, + "epoch": 3.6717092392053123, + "grad_norm": 57.029048919677734, + "learning_rate": 4.771971967020264e-05, + "loss": 0.8478, "step": 266500 }, { - "epoch": 2.72, - "learning_rate": 5.667430637372935e-05, - "loss": 0.7991, + "epoch": 3.673086991265052, + "grad_norm": 4.073955059051514, + "learning_rate": 4.771428608588095e-05, + "loss": 0.8166, "step": 266600 }, { - "epoch": 2.72, - "learning_rate": 5.667126999855411e-05, - "loss": 0.8795, + "epoch": 3.6744647433247914, + "grad_norm": 38.54369354248047, + "learning_rate": 4.7708850784252244e-05, + "loss": 0.7079, "step": 266700 }, { - "epoch": 2.72, - "learning_rate": 5.666823231931293e-05, - "loss": 0.8652, + "epoch": 3.6758424953845306, + "grad_norm": 16.506465911865234, + "learning_rate": 4.770341376577827e-05, + "loss": 0.8103, "step": 266800 }, { - "epoch": 2.72, - "learning_rate": 5.666519333615433e-05, - "loss": 0.9181, + "epoch": 3.6772202474442697, + "grad_norm": 22.867197036743164, + "learning_rate": 4.769797503092094e-05, + "loss": 0.816, "step": 266900 }, { - "epoch": 2.72, - "learning_rate": 5.666215304922691e-05, - "loss": 1.0623, + "epoch": 3.6785979995040092, + "grad_norm": 9.203018188476562, + "learning_rate": 4.76925345801423e-05, + "loss": 0.7172, "step": 267000 }, { - "epoch": 2.72, - "learning_rate": 5.6659111458679316e-05, - "loss": 0.8177, + "epoch": 3.679975751563749, + "grad_norm": 7.021424293518066, + "learning_rate": 4.768709241390455e-05, + "loss": 0.73, "step": 267100 }, { - "epoch": 2.72, - "learning_rate": 5.6656068564660264e-05, - "loss": 0.7996, + "epoch": 3.681353503623488, + "grad_norm": 7.541938781738281, + "learning_rate": 4.768164853267001e-05, + "loss": 0.7382, "step": 267200 }, { - "epoch": 2.72, - "learning_rate": 5.665302436731853e-05, - "loss": 0.806, + "epoch": 3.682731255683227, + "grad_norm": 36.13368225097656, + "learning_rate": 4.767620293690118e-05, + "loss": 0.7394, "step": 267300 }, { - "epoch": 2.72, - "learning_rate": 5.664997886680296e-05, - "loss": 0.8959, + "epoch": 3.6841090077429666, + "grad_norm": 3.0906050205230713, + "learning_rate": 4.76707556270607e-05, + "loss": 0.7198, "step": 267400 }, { - "epoch": 2.73, - "learning_rate": 5.6646932063262474e-05, - "loss": 0.8346, + "epoch": 3.685486759802706, + "grad_norm": 40.52449417114258, + "learning_rate": 4.7665306603611334e-05, + "loss": 0.8418, "step": 267500 }, { - "epoch": 2.73, - "learning_rate": 5.664388395684603e-05, - "loss": 0.828, + "epoch": 3.6868645118624452, + "grad_norm": 6.134153366088867, + "learning_rate": 4.7659855867016004e-05, + "loss": 0.7729, "step": 267600 }, { - "epoch": 2.73, - "learning_rate": 5.664083454770266e-05, - "loss": 0.9029, + "epoch": 3.6882422639221843, + "grad_norm": 11.829121589660645, + "learning_rate": 4.7654403417737775e-05, + "loss": 0.8, "step": 267700 }, { - "epoch": 2.73, - "learning_rate": 5.663778383598146e-05, - "loss": 0.7655, + "epoch": 3.689620015981924, + "grad_norm": 87.53887939453125, + "learning_rate": 4.764894925623988e-05, + "loss": 0.7595, "step": 267800 }, { - "epoch": 2.73, - "learning_rate": 5.66347318218316e-05, - "loss": 0.8944, + "epoch": 3.6909977680416635, + "grad_norm": 22.38325309753418, + "learning_rate": 4.764349338298565e-05, + "loss": 0.752, "step": 267900 }, { - "epoch": 2.73, - "learning_rate": 5.6631678505402295e-05, - "loss": 0.9142, + "epoch": 3.6923755201014026, + "grad_norm": 13.194493293762207, + "learning_rate": 4.763803579843861e-05, + "loss": 0.7732, "step": 268000 }, { - "epoch": 2.73, - "learning_rate": 5.662862388684285e-05, - "loss": 0.9283, + "epoch": 3.6937532721611417, + "grad_norm": 4.538683891296387, + "learning_rate": 4.7632576503062405e-05, + "loss": 0.8146, "step": 268100 }, { - "epoch": 2.73, - "learning_rate": 5.6625567966302605e-05, - "loss": 0.7991, + "epoch": 3.6951310242208812, + "grad_norm": 12.0274019241333, + "learning_rate": 4.762711549732083e-05, + "loss": 0.7495, "step": 268200 }, { - "epoch": 2.73, - "learning_rate": 5.6622510743930975e-05, - "loss": 0.9507, + "epoch": 3.696508776280621, + "grad_norm": 21.319379806518555, + "learning_rate": 4.762165278167782e-05, + "loss": 0.6809, "step": 268300 }, { - "epoch": 2.73, - "learning_rate": 5.6619452219877445e-05, - "loss": 0.752, + "epoch": 3.69788652834036, + "grad_norm": 6.008383274078369, + "learning_rate": 4.761618835659746e-05, + "loss": 0.725, "step": 268400 }, { - "epoch": 2.74, - "learning_rate": 5.661639239429157e-05, - "loss": 0.902, + "epoch": 3.699264280400099, + "grad_norm": 34.65384292602539, + "learning_rate": 4.761072222254399e-05, + "loss": 0.7692, "step": 268500 }, { - "epoch": 2.74, - "learning_rate": 5.6613331267322934e-05, - "loss": 0.8358, + "epoch": 3.7006420324598386, + "grad_norm": 73.18565368652344, + "learning_rate": 4.760525437998178e-05, + "loss": 0.7571, "step": 268600 }, { - "epoch": 2.74, - "learning_rate": 5.661026883912122e-05, - "loss": 0.8998, + "epoch": 3.7020197845195777, + "grad_norm": 1.4377433061599731, + "learning_rate": 4.7599784829375354e-05, + "loss": 0.6696, "step": 268700 }, { - "epoch": 2.74, - "learning_rate": 5.660723575356889e-05, - "loss": 0.883, + "epoch": 3.7033975365793173, + "grad_norm": 8.201079368591309, + "learning_rate": 4.7594313571189385e-05, + "loss": 0.762, "step": 268800 }, { - "epoch": 2.74, - "learning_rate": 5.660417073635888e-05, - "loss": 0.9245, + "epoch": 3.7047752886390564, + "grad_norm": 4.367722034454346, + "learning_rate": 4.758884060588867e-05, + "loss": 0.7242, "step": 268900 }, { - "epoch": 2.74, - "learning_rate": 5.660110441836369e-05, - "loss": 0.8388, + "epoch": 3.706153040698796, + "grad_norm": 4.735632419586182, + "learning_rate": 4.758336593393817e-05, + "loss": 0.7481, "step": 269000 }, { - "epoch": 2.74, - "learning_rate": 5.659803679973323e-05, - "loss": 0.8983, + "epoch": 3.707530792758535, + "grad_norm": 9.206520080566406, + "learning_rate": 4.757788955580298e-05, + "loss": 0.7383, "step": 269100 }, { - "epoch": 2.74, - "learning_rate": 5.659496788061751e-05, - "loss": 0.9053, + "epoch": 3.7089085448182746, + "grad_norm": 13.853857040405273, + "learning_rate": 4.757241147194837e-05, + "loss": 0.7151, "step": 269200 }, { - "epoch": 2.74, - "learning_rate": 5.659189766116657e-05, - "loss": 0.8712, + "epoch": 3.7102862968780137, + "grad_norm": 5.568164348602295, + "learning_rate": 4.756693168283971e-05, + "loss": 0.8428, "step": 269300 }, { - "epoch": 2.74, - "learning_rate": 5.6588826141530526e-05, - "loss": 0.7831, + "epoch": 3.7116640489377533, + "grad_norm": 3.7256813049316406, + "learning_rate": 4.756145018894254e-05, + "loss": 0.7207, "step": 269400 }, { - "epoch": 2.75, - "learning_rate": 5.658575332185956e-05, - "loss": 0.8285, + "epoch": 3.7130418009974924, + "grad_norm": 4.671421527862549, + "learning_rate": 4.755596699072254e-05, + "loss": 0.6761, "step": 269500 }, { - "epoch": 2.75, - "learning_rate": 5.65826792023039e-05, - "loss": 0.7277, + "epoch": 3.714419553057232, + "grad_norm": 14.45839786529541, + "learning_rate": 4.755048208864555e-05, + "loss": 0.7421, "step": 269600 }, { - "epoch": 2.75, - "learning_rate": 5.6579603783013876e-05, - "loss": 0.7775, + "epoch": 3.715797305116971, + "grad_norm": 12.860676765441895, + "learning_rate": 4.7544995483177516e-05, + "loss": 0.6924, "step": 269700 }, { - "epoch": 2.75, - "learning_rate": 5.6576527064139845e-05, - "loss": 0.851, + "epoch": 3.7171750571767106, + "grad_norm": 5.4139251708984375, + "learning_rate": 4.753950717478457e-05, + "loss": 0.6845, "step": 269800 }, { - "epoch": 2.75, - "learning_rate": 5.657344904583224e-05, - "loss": 0.8398, + "epoch": 3.7185528092364497, + "grad_norm": 14.10604190826416, + "learning_rate": 4.753401716393297e-05, + "loss": 0.7336, "step": 269900 }, { - "epoch": 2.75, - "learning_rate": 5.657036972824155e-05, - "loss": 0.826, + "epoch": 3.7199305612961893, + "grad_norm": 5.590238571166992, + "learning_rate": 4.752858037664089e-05, + "loss": 0.7572, "step": 270000 }, { - "epoch": 2.75, - "learning_rate": 5.656728911151835e-05, - "loss": 0.972, + "epoch": 3.7213083133559284, + "grad_norm": 68.34678649902344, + "learning_rate": 4.752308697928428e-05, + "loss": 0.7648, "step": 270100 }, { - "epoch": 2.75, - "learning_rate": 5.656420719581326e-05, - "loss": 0.9173, + "epoch": 3.722686065415668, + "grad_norm": 3.155557155609131, + "learning_rate": 4.7517591880864e-05, + "loss": 0.7174, "step": 270200 }, { - "epoch": 2.75, - "learning_rate": 5.656112398127696e-05, - "loss": 0.9713, + "epoch": 3.724063817475407, + "grad_norm": 11.392009735107422, + "learning_rate": 4.751209508184687e-05, + "loss": 0.7653, "step": 270300 }, { - "epoch": 2.75, - "learning_rate": 5.655803946806021e-05, - "loss": 0.8209, + "epoch": 3.7254415695351466, + "grad_norm": 17.823469161987305, + "learning_rate": 4.750659658269989e-05, + "loss": 0.8423, "step": 270400 }, { - "epoch": 2.76, - "learning_rate": 5.6554953656313824e-05, - "loss": 0.8731, + "epoch": 3.7268193215948857, + "grad_norm": 6.774659633636475, + "learning_rate": 4.750109638389017e-05, + "loss": 0.8087, "step": 270500 }, { - "epoch": 2.76, - "learning_rate": 5.6551866546188674e-05, - "loss": 0.8399, + "epoch": 3.7281970736546253, + "grad_norm": 15.389320373535156, + "learning_rate": 4.7495594485885e-05, + "loss": 0.6586, "step": 270600 }, { - "epoch": 2.76, - "learning_rate": 5.65487781378357e-05, - "loss": 0.8209, + "epoch": 3.7295748257143644, + "grad_norm": 14.331101417541504, + "learning_rate": 4.749009088915177e-05, + "loss": 0.7198, "step": 270700 }, { - "epoch": 2.76, - "learning_rate": 5.654568843140591e-05, - "loss": 0.8956, + "epoch": 3.7309525777741035, + "grad_norm": 4.198037624359131, + "learning_rate": 4.748458559415806e-05, + "loss": 0.7551, "step": 270800 }, { - "epoch": 2.76, - "learning_rate": 5.654259742705037e-05, - "loss": 0.7214, + "epoch": 3.732330329833843, + "grad_norm": 3.555859088897705, + "learning_rate": 4.747907860137156e-05, + "loss": 0.6778, "step": 270900 }, { - "epoch": 2.76, - "learning_rate": 5.653950512492022e-05, - "loss": 0.8488, + "epoch": 3.7337080818935826, + "grad_norm": 13.764531135559082, + "learning_rate": 4.7473569911260116e-05, + "loss": 0.7844, "step": 271000 }, { - "epoch": 2.76, - "learning_rate": 5.653641152516665e-05, - "loss": 0.8523, + "epoch": 3.7350858339533217, + "grad_norm": 13.592623710632324, + "learning_rate": 4.7468059524291725e-05, + "loss": 0.739, "step": 271100 }, { - "epoch": 2.76, - "learning_rate": 5.653331662794091e-05, - "loss": 0.8286, + "epoch": 3.736463586013061, + "grad_norm": 3.8996264934539795, + "learning_rate": 4.7462547440934524e-05, + "loss": 0.8977, "step": 271200 }, { - "epoch": 2.76, - "learning_rate": 5.653022043339433e-05, - "loss": 0.8478, + "epoch": 3.7378413380728004, + "grad_norm": 3.623610734939575, + "learning_rate": 4.745703366165679e-05, + "loss": 0.6747, "step": 271300 }, { - "epoch": 2.77, - "learning_rate": 5.6527122941678305e-05, - "loss": 0.91, + "epoch": 3.73921909013254, + "grad_norm": 3.714876174926758, + "learning_rate": 4.745151818692695e-05, + "loss": 0.7487, "step": 271400 }, { - "epoch": 2.77, - "learning_rate": 5.652402415294426e-05, - "loss": 0.7761, + "epoch": 3.740596842192279, + "grad_norm": 8.382322311401367, + "learning_rate": 4.744600101721356e-05, + "loss": 0.7621, "step": 271500 }, { - "epoch": 2.77, - "learning_rate": 5.652092406734373e-05, - "loss": 0.8504, + "epoch": 3.741974594252018, + "grad_norm": 6.062857151031494, + "learning_rate": 4.744048215298535e-05, + "loss": 0.6358, "step": 271600 }, { - "epoch": 2.77, - "learning_rate": 5.6517853705269665e-05, - "loss": 0.8262, + "epoch": 3.7433523463117577, + "grad_norm": 5.738988399505615, + "learning_rate": 4.7434961594711166e-05, + "loss": 0.7446, "step": 271700 }, { - "epoch": 2.77, - "learning_rate": 5.651475103935582e-05, - "loss": 0.7957, + "epoch": 3.7447300983714973, + "grad_norm": 29.230758666992188, + "learning_rate": 4.742943934286e-05, + "loss": 0.7758, "step": 271800 }, { - "epoch": 2.77, - "learning_rate": 5.651164707702888e-05, - "loss": 0.8826, + "epoch": 3.7461078504312364, + "grad_norm": 19.700021743774414, + "learning_rate": 4.7423915397901004e-05, + "loss": 0.7268, "step": 271900 }, { - "epoch": 2.77, - "learning_rate": 5.6508541818440606e-05, - "loss": 0.8813, + "epoch": 3.7474856024909755, + "grad_norm": 1.5176565647125244, + "learning_rate": 4.741838976030347e-05, + "loss": 0.7574, "step": 272000 }, { - "epoch": 2.77, - "learning_rate": 5.650543526374284e-05, - "loss": 0.9414, + "epoch": 3.748863354550715, + "grad_norm": 4.334427356719971, + "learning_rate": 4.741286243053683e-05, + "loss": 0.7013, "step": 272100 }, { - "epoch": 2.77, - "learning_rate": 5.6502327413087455e-05, - "loss": 0.8849, + "epoch": 3.7502411066104546, + "grad_norm": 6.970549583435059, + "learning_rate": 4.740733340907064e-05, + "loss": 0.715, "step": 272200 }, { - "epoch": 2.77, - "learning_rate": 5.649921826662643e-05, - "loss": 0.8659, + "epoch": 3.7516188586701937, + "grad_norm": 6.901124954223633, + "learning_rate": 4.7401802696374635e-05, + "loss": 0.6704, "step": 272300 }, { - "epoch": 2.78, - "learning_rate": 5.649610782451176e-05, - "loss": 0.8527, + "epoch": 3.752996610729933, + "grad_norm": 11.396589279174805, + "learning_rate": 4.7396270292918674e-05, + "loss": 0.8068, "step": 272400 }, { - "epoch": 2.78, - "learning_rate": 5.649299608689554e-05, - "loss": 0.7676, + "epoch": 3.7543743627896724, + "grad_norm": 13.870627403259277, + "learning_rate": 4.7390791548475615e-05, + "loss": 0.7341, "step": 272500 }, { - "epoch": 2.78, - "learning_rate": 5.648988305392991e-05, - "loss": 0.9003, + "epoch": 3.755752114849412, + "grad_norm": 9.901688575744629, + "learning_rate": 4.738525578180578e-05, + "loss": 0.6902, "step": 272600 }, { - "epoch": 2.78, - "learning_rate": 5.648679987545944e-05, - "loss": 0.9116, + "epoch": 3.757129866909151, + "grad_norm": 7.905740737915039, + "learning_rate": 4.7379718325781725e-05, + "loss": 0.7451, "step": 272700 }, { - "epoch": 2.78, - "learning_rate": 5.6483684265201386e-05, - "loss": 0.8178, + "epoch": 3.75850761896889, + "grad_norm": 3.3949713706970215, + "learning_rate": 4.7374179180873905e-05, + "loss": 0.7406, "step": 272800 }, { - "epoch": 2.78, - "learning_rate": 5.648056736004922e-05, - "loss": 0.8899, + "epoch": 3.7598853710286297, + "grad_norm": 10.167157173156738, + "learning_rate": 4.736863834755288e-05, + "loss": 0.861, "step": 272900 }, { - "epoch": 2.78, - "learning_rate": 5.647744916015534e-05, - "loss": 0.9139, + "epoch": 3.761263123088369, + "grad_norm": 566.754150390625, + "learning_rate": 4.736309582628938e-05, + "loss": 0.7593, "step": 273000 }, { - "epoch": 2.78, - "learning_rate": 5.64743296656722e-05, - "loss": 0.8453, + "epoch": 3.7626408751481084, + "grad_norm": 18.167667388916016, + "learning_rate": 4.7357551617554274e-05, + "loss": 0.7495, "step": 273100 }, { - "epoch": 2.78, - "learning_rate": 5.647120887675233e-05, - "loss": 1.0177, + "epoch": 3.7640186272078475, + "grad_norm": 9.464631080627441, + "learning_rate": 4.735200572181857e-05, + "loss": 0.7617, "step": 273200 }, { - "epoch": 2.78, - "learning_rate": 5.646808679354831e-05, - "loss": 0.9189, + "epoch": 3.765396379267587, + "grad_norm": 5.3268232345581055, + "learning_rate": 4.734645813955341e-05, + "loss": 0.8335, "step": 273300 }, { - "epoch": 2.79, - "learning_rate": 5.6464963416212815e-05, - "loss": 0.8525, + "epoch": 3.766774131327326, + "grad_norm": 3.9660980701446533, + "learning_rate": 4.7340908871230105e-05, + "loss": 0.6887, "step": 273400 }, { - "epoch": 2.79, - "learning_rate": 5.6461838744898534e-05, - "loss": 0.8275, + "epoch": 3.7681518833870657, + "grad_norm": 7.923286437988281, + "learning_rate": 4.733535791732008e-05, + "loss": 0.8255, "step": 273500 }, { - "epoch": 2.79, - "learning_rate": 5.645871277975826e-05, - "loss": 0.8679, + "epoch": 3.769529635446805, + "grad_norm": 3.753889322280884, + "learning_rate": 4.732980527829493e-05, + "loss": 0.8136, "step": 273600 }, { - "epoch": 2.79, - "learning_rate": 5.645558552094482e-05, - "loss": 0.8387, + "epoch": 3.7709073875065444, + "grad_norm": 2.2283871173858643, + "learning_rate": 4.732430650620049e-05, + "loss": 0.7593, "step": 273700 }, { - "epoch": 2.79, - "learning_rate": 5.645245696861113e-05, - "loss": 0.885, + "epoch": 3.7722851395662835, + "grad_norm": 6.431221008300781, + "learning_rate": 4.731875051519976e-05, + "loss": 0.8895, "step": 273800 }, { - "epoch": 2.79, - "learning_rate": 5.644932712291016e-05, - "loss": 0.9016, + "epoch": 3.773662891626023, + "grad_norm": 1.4079269170761108, + "learning_rate": 4.731319284049479e-05, + "loss": 0.7761, "step": 273900 }, { - "epoch": 2.79, - "learning_rate": 5.644619598399493e-05, - "loss": 0.91, + "epoch": 3.775040643685762, + "grad_norm": 15.714146614074707, + "learning_rate": 4.730763348255773e-05, + "loss": 0.7908, "step": 274000 }, { - "epoch": 2.79, - "learning_rate": 5.644306355201854e-05, - "loss": 0.966, + "epoch": 3.7764183957455018, + "grad_norm": 3.988079071044922, + "learning_rate": 4.730207244186087e-05, + "loss": 0.7625, "step": 274100 }, { - "epoch": 2.79, - "learning_rate": 5.6439929827134145e-05, - "loss": 0.8628, + "epoch": 3.777796147805241, + "grad_norm": 62.9211311340332, + "learning_rate": 4.7296509718876633e-05, + "loss": 0.7935, "step": 274200 }, { - "epoch": 2.79, - "learning_rate": 5.643679480949497e-05, - "loss": 0.8955, + "epoch": 3.7791738998649804, + "grad_norm": 6.144111156463623, + "learning_rate": 4.7290945314077636e-05, + "loss": 0.7979, "step": 274300 }, { - "epoch": 2.8, - "learning_rate": 5.643365849925429e-05, - "loss": 0.8976, + "epoch": 3.7805516519247195, + "grad_norm": 29.91961097717285, + "learning_rate": 4.7285379227936575e-05, + "loss": 0.8063, "step": 274400 }, { - "epoch": 2.8, - "learning_rate": 5.6430520896565465e-05, - "loss": 0.8053, + "epoch": 3.781929403984459, + "grad_norm": 2.6208667755126953, + "learning_rate": 4.727981146092633e-05, + "loss": 0.8775, "step": 274500 }, { - "epoch": 2.8, - "learning_rate": 5.642738200158189e-05, - "loss": 0.8382, + "epoch": 3.783307156044198, + "grad_norm": 16.388765335083008, + "learning_rate": 4.727424201351991e-05, + "loss": 0.7705, "step": 274600 }, { - "epoch": 2.8, - "learning_rate": 5.6424241814457056e-05, - "loss": 0.8375, + "epoch": 3.7846849081039378, + "grad_norm": 22.490192413330078, + "learning_rate": 4.726867088619047e-05, + "loss": 0.8121, "step": 274700 }, { - "epoch": 2.8, - "learning_rate": 5.642110033534448e-05, - "loss": 0.8948, + "epoch": 3.786062660163677, + "grad_norm": 3.434366464614868, + "learning_rate": 4.7263098079411297e-05, + "loss": 0.8411, "step": 274800 }, { - "epoch": 2.8, - "learning_rate": 5.641795756439776e-05, - "loss": 0.8199, + "epoch": 3.7874404122234164, + "grad_norm": 2.3183443546295166, + "learning_rate": 4.725752359365584e-05, + "loss": 0.8552, "step": 274900 }, { - "epoch": 2.8, - "learning_rate": 5.6414813501770584e-05, - "loss": 0.8989, + "epoch": 3.7888181642831555, + "grad_norm": 10.366097450256348, + "learning_rate": 4.725194742939766e-05, + "loss": 0.8053, "step": 275000 }, { - "epoch": 2.8, - "learning_rate": 5.641166814761667e-05, - "loss": 1.0718, + "epoch": 3.7901959163428947, + "grad_norm": 12.962300300598145, + "learning_rate": 4.724636958711051e-05, + "loss": 0.7189, "step": 275100 }, { - "epoch": 2.8, - "learning_rate": 5.640852150208978e-05, - "loss": 0.8265, + "epoch": 3.791573668402634, + "grad_norm": 5.601036071777344, + "learning_rate": 4.7240790067268236e-05, + "loss": 0.8333, "step": 275200 }, { - "epoch": 2.8, - "learning_rate": 5.64053735653438e-05, - "loss": 0.9047, + "epoch": 3.7929514204623738, + "grad_norm": 16.954980850219727, + "learning_rate": 4.723520887034485e-05, + "loss": 0.7332, "step": 275300 }, { - "epoch": 2.81, - "learning_rate": 5.640222433753263e-05, - "loss": 0.8471, + "epoch": 3.794329172522113, + "grad_norm": 6.905393123626709, + "learning_rate": 4.7229625996814516e-05, + "loss": 0.7401, "step": 275400 }, { - "epoch": 2.81, - "learning_rate": 5.639907381881024e-05, - "loss": 0.9006, + "epoch": 3.795706924581852, + "grad_norm": 36.58599853515625, + "learning_rate": 4.722404144715151e-05, + "loss": 0.7407, "step": 275500 }, { - "epoch": 2.81, - "learning_rate": 5.63959220093307e-05, - "loss": 0.9087, + "epoch": 3.7970846766415915, + "grad_norm": 20.370412826538086, + "learning_rate": 4.721845522183028e-05, + "loss": 0.8583, "step": 275600 }, { - "epoch": 2.81, - "learning_rate": 5.6392768909248085e-05, - "loss": 0.9438, + "epoch": 3.798462428701331, + "grad_norm": 80.74333190917969, + "learning_rate": 4.72128673213254e-05, + "loss": 0.8026, "step": 275700 }, { - "epoch": 2.81, - "learning_rate": 5.638961451871657e-05, - "loss": 0.9513, + "epoch": 3.79984018076107, + "grad_norm": 9.61959457397461, + "learning_rate": 4.7207277746111575e-05, + "loss": 0.7635, "step": 275800 }, { - "epoch": 2.81, - "learning_rate": 5.6386458837890404e-05, - "loss": 0.9192, + "epoch": 3.8012179328208093, + "grad_norm": 21.183671951293945, + "learning_rate": 4.7201686496663705e-05, + "loss": 0.7851, "step": 275900 }, { - "epoch": 2.81, - "learning_rate": 5.638330186692385e-05, - "loss": 0.9952, + "epoch": 3.802595684880549, + "grad_norm": 195.9647216796875, + "learning_rate": 4.719609357345677e-05, + "loss": 0.787, "step": 276000 }, { - "epoch": 2.81, - "learning_rate": 5.638014360597129e-05, - "loss": 0.9226, + "epoch": 3.8039734369402884, + "grad_norm": 3.6417524814605713, + "learning_rate": 4.7190498976965914e-05, + "loss": 0.7718, "step": 276100 }, { - "epoch": 2.81, - "learning_rate": 5.637698405518714e-05, - "loss": 0.9865, + "epoch": 3.8053511890000276, + "grad_norm": 11.64912223815918, + "learning_rate": 4.718490270766643e-05, + "loss": 0.7545, "step": 276200 }, { - "epoch": 2.81, - "learning_rate": 5.6373823214725876e-05, - "loss": 1.042, + "epoch": 3.8067289410597667, + "grad_norm": 10.296647071838379, + "learning_rate": 4.717930476603377e-05, + "loss": 0.8476, "step": 276300 }, { - "epoch": 2.82, - "learning_rate": 5.637066108474204e-05, - "loss": 0.8512, + "epoch": 3.8081066931195062, + "grad_norm": 11.872393608093262, + "learning_rate": 4.717370515254348e-05, + "loss": 0.8365, "step": 276400 }, { - "epoch": 2.82, - "learning_rate": 5.6367497665390266e-05, - "loss": 0.8467, + "epoch": 3.809484445179246, + "grad_norm": 23.96843719482422, + "learning_rate": 4.7168103867671286e-05, + "loss": 0.8164, "step": 276500 }, { - "epoch": 2.82, - "learning_rate": 5.6364332956825194e-05, - "loss": 0.8368, + "epoch": 3.810862197238985, + "grad_norm": 11.160082817077637, + "learning_rate": 4.716255694972026e-05, + "loss": 0.7699, "step": 276600 }, { - "epoch": 2.82, - "learning_rate": 5.636116695920158e-05, - "loss": 0.907, + "epoch": 3.812239949298724, + "grad_norm": 12.165009498596191, + "learning_rate": 4.715695234021391e-05, + "loss": 0.7202, "step": 276700 }, { - "epoch": 2.82, - "learning_rate": 5.6357999672674226e-05, - "loss": 0.8802, + "epoch": 3.8136177013584636, + "grad_norm": 9.358080863952637, + "learning_rate": 4.7151346060748914e-05, + "loss": 0.8503, "step": 276800 }, { - "epoch": 2.82, - "learning_rate": 5.635483109739797e-05, - "loss": 0.9079, + "epoch": 3.814995453418203, + "grad_norm": 35.57636642456055, + "learning_rate": 4.7145738111801534e-05, + "loss": 0.8482, "step": 276900 }, { - "epoch": 2.82, - "learning_rate": 5.6351661233527754e-05, - "loss": 0.9171, + "epoch": 3.8163732054779422, + "grad_norm": 12.582289695739746, + "learning_rate": 4.7140128493848183e-05, + "loss": 0.8424, "step": 277000 }, { - "epoch": 2.82, - "learning_rate": 5.6348490081218564e-05, - "loss": 0.9014, + "epoch": 3.8177509575376813, + "grad_norm": 19.38596534729004, + "learning_rate": 4.713451720736544e-05, + "loss": 0.7397, "step": 277100 }, { - "epoch": 2.82, - "learning_rate": 5.6345317640625455e-05, - "loss": 0.8428, + "epoch": 3.819128709597421, + "grad_norm": 3.3069958686828613, + "learning_rate": 4.712896039063067e-05, + "loss": 0.8366, "step": 277200 }, { - "epoch": 2.83, - "learning_rate": 5.6342143911903516e-05, - "loss": 0.9874, + "epoch": 3.82050646165716, + "grad_norm": 10.138157844543457, + "learning_rate": 4.7123345785192806e-05, + "loss": 0.823, "step": 277300 }, { - "epoch": 2.83, - "learning_rate": 5.633896889520796e-05, - "loss": 0.9019, + "epoch": 3.8218842137168996, + "grad_norm": 2.509814739227295, + "learning_rate": 4.711772951265132e-05, + "loss": 0.703, "step": 277400 }, { - "epoch": 2.83, - "learning_rate": 5.633579259069399e-05, - "loss": 0.8927, + "epoch": 3.8232619657766387, + "grad_norm": 6.17867374420166, + "learning_rate": 4.7112111573483355e-05, + "loss": 0.8611, "step": 277500 }, { - "epoch": 2.83, - "learning_rate": 5.6332614998516945e-05, - "loss": 0.8153, + "epoch": 3.8246397178363782, + "grad_norm": 5.795092582702637, + "learning_rate": 4.710649196816617e-05, + "loss": 0.8168, "step": 277600 }, { - "epoch": 2.83, - "learning_rate": 5.6329436118832165e-05, - "loss": 0.9364, + "epoch": 3.8260174698961174, + "grad_norm": 46.700477600097656, + "learning_rate": 4.71008706971772e-05, + "loss": 0.874, "step": 277700 }, { - "epoch": 2.83, - "learning_rate": 5.632625595179509e-05, - "loss": 0.9679, + "epoch": 3.827395221955857, + "grad_norm": 31.78653907775879, + "learning_rate": 4.7095247760993974e-05, + "loss": 0.7207, "step": 277800 }, { - "epoch": 2.83, - "learning_rate": 5.6323074497561185e-05, - "loss": 0.9408, + "epoch": 3.828772974015596, + "grad_norm": 28.1950740814209, + "learning_rate": 4.70896231600942e-05, + "loss": 0.7582, "step": 277900 }, { - "epoch": 2.83, - "learning_rate": 5.631989175628604e-05, - "loss": 0.9853, + "epoch": 3.8301507260753356, + "grad_norm": 8.862968444824219, + "learning_rate": 4.7083996894955734e-05, + "loss": 0.7385, "step": 278000 }, { - "epoch": 2.83, - "learning_rate": 5.631670772812526e-05, - "loss": 0.8934, + "epoch": 3.8315284781350747, + "grad_norm": 9.290912628173828, + "learning_rate": 4.707836896605653e-05, + "loss": 0.7466, "step": 278100 }, { - "epoch": 2.83, - "learning_rate": 5.631352241323452e-05, - "loss": 0.9214, + "epoch": 3.8329062301948142, + "grad_norm": 6.6076340675354, + "learning_rate": 4.707273937387472e-05, + "loss": 0.8101, "step": 278200 }, { - "epoch": 2.84, - "learning_rate": 5.631033581176956e-05, - "loss": 0.8778, + "epoch": 3.8342839822545534, + "grad_norm": 6.146384239196777, + "learning_rate": 4.7067108118888566e-05, + "loss": 0.8619, "step": 278300 }, { - "epoch": 2.84, - "learning_rate": 5.6307147923886195e-05, - "loss": 0.8867, + "epoch": 3.835661734314293, + "grad_norm": 4.0692572593688965, + "learning_rate": 4.7061475201576475e-05, + "loss": 0.7975, "step": 278400 }, { - "epoch": 2.84, - "learning_rate": 5.630395874974029e-05, - "loss": 0.8421, + "epoch": 3.837039486374032, + "grad_norm": 5.783301830291748, + "learning_rate": 4.705584062241699e-05, + "loss": 0.8069, "step": 278500 }, { - "epoch": 2.84, - "learning_rate": 5.6300768289487774e-05, - "loss": 0.847, + "epoch": 3.8384172384337716, + "grad_norm": 24.528738021850586, + "learning_rate": 4.705020438188879e-05, + "loss": 0.8094, "step": 278600 }, { - "epoch": 2.84, - "learning_rate": 5.629757654328464e-05, - "loss": 0.7722, + "epoch": 3.8397949904935107, + "grad_norm": 16.147613525390625, + "learning_rate": 4.704456648047072e-05, + "loss": 0.8363, "step": 278700 }, { - "epoch": 2.84, - "learning_rate": 5.629438351128695e-05, - "loss": 0.8098, + "epoch": 3.8411727425532503, + "grad_norm": 10.864917755126953, + "learning_rate": 4.7038926918641735e-05, + "loss": 0.7708, "step": 278800 }, { - "epoch": 2.84, - "learning_rate": 5.629118919365083e-05, - "loss": 0.9551, + "epoch": 3.8425504946129894, + "grad_norm": 2.2151923179626465, + "learning_rate": 4.703328569688094e-05, + "loss": 0.7813, "step": 278900 }, { - "epoch": 2.84, - "learning_rate": 5.628799359053245e-05, - "loss": 0.8167, + "epoch": 3.843928246672729, + "grad_norm": 10.98679256439209, + "learning_rate": 4.702764281566761e-05, + "loss": 0.8556, "step": 279000 }, { - "epoch": 2.84, - "learning_rate": 5.628482867733436e-05, - "loss": 0.8282, + "epoch": 3.845305998732468, + "grad_norm": 6.018759727478027, + "learning_rate": 4.702199827548111e-05, + "loss": 0.7443, "step": 279100 }, { - "epoch": 2.84, - "learning_rate": 5.6281662504539915e-05, - "loss": 0.7864, + "epoch": 3.8466837507922076, + "grad_norm": 3.0387134552001953, + "learning_rate": 4.701635207680098e-05, + "loss": 0.6786, "step": 279200 }, { - "epoch": 2.85, - "learning_rate": 5.627846307161123e-05, - "loss": 0.8998, + "epoch": 3.8480615028519467, + "grad_norm": 9.06083869934082, + "learning_rate": 4.7010704220106896e-05, + "loss": 0.808, "step": 279300 }, { - "epoch": 2.85, - "learning_rate": 5.6275262353822524e-05, - "loss": 0.8947, + "epoch": 3.8494392549116863, + "grad_norm": 3.589306592941284, + "learning_rate": 4.700505470587868e-05, + "loss": 0.7647, "step": 279400 }, { - "epoch": 2.85, - "learning_rate": 5.627206035133029e-05, - "loss": 0.8634, + "epoch": 3.8508170069714254, + "grad_norm": 7.072883605957031, + "learning_rate": 4.699940353459628e-05, + "loss": 0.7335, "step": 279500 }, { - "epoch": 2.85, - "learning_rate": 5.62688570642911e-05, - "loss": 0.9087, + "epoch": 3.852194759031165, + "grad_norm": 11.459481239318848, + "learning_rate": 4.699375070673978e-05, + "loss": 0.7742, "step": 279600 }, { - "epoch": 2.85, - "learning_rate": 5.6265652492861565e-05, - "loss": 0.8074, + "epoch": 3.853572511090904, + "grad_norm": 7.030880451202393, + "learning_rate": 4.698809622278943e-05, + "loss": 0.8076, "step": 279700 }, { - "epoch": 2.85, - "learning_rate": 5.6262446637198354e-05, - "loss": 0.8247, + "epoch": 3.854950263150643, + "grad_norm": 109.31099700927734, + "learning_rate": 4.698249665281496e-05, + "loss": 0.6946, "step": 279800 }, { - "epoch": 2.85, - "learning_rate": 5.6259239497458233e-05, - "loss": 0.8478, + "epoch": 3.8563280152103827, + "grad_norm": 3.794438600540161, + "learning_rate": 4.697683887466713e-05, + "loss": 0.7673, "step": 279900 }, { - "epoch": 2.85, - "learning_rate": 5.6256031073798006e-05, - "loss": 0.8564, + "epoch": 3.8577057672701223, + "grad_norm": 4.457272529602051, + "learning_rate": 4.69711794418622e-05, + "loss": 0.747, "step": 280000 }, { - "epoch": 2.85, - "learning_rate": 5.6252821366374556e-05, - "loss": 0.8486, + "epoch": 3.8590835193298614, + "grad_norm": 3.3575732707977295, + "learning_rate": 4.6965518354880966e-05, + "loss": 0.8331, "step": 280100 }, { - "epoch": 2.85, - "learning_rate": 5.6249610375344793e-05, - "loss": 0.8445, + "epoch": 3.8604612713896005, + "grad_norm": 7.636532783508301, + "learning_rate": 4.6959855614204354e-05, + "loss": 0.7142, "step": 280200 }, { - "epoch": 2.86, - "learning_rate": 5.624639810086575e-05, - "loss": 0.9499, + "epoch": 3.86183902344934, + "grad_norm": 13.00849723815918, + "learning_rate": 4.695419122031346e-05, + "loss": 0.8094, "step": 280300 }, { - "epoch": 2.86, - "learning_rate": 5.6243184543094466e-05, - "loss": 0.9415, + "epoch": 3.8632167755090796, + "grad_norm": 5.083268165588379, + "learning_rate": 4.694852517368949e-05, + "loss": 0.7171, "step": 280400 }, { - "epoch": 2.86, - "learning_rate": 5.623996970218806e-05, - "loss": 0.9072, + "epoch": 3.8645945275688187, + "grad_norm": 13.605998039245605, + "learning_rate": 4.69428574748138e-05, + "loss": 0.8005, "step": 280500 }, { - "epoch": 2.86, - "learning_rate": 5.623675357830374e-05, - "loss": 0.8137, + "epoch": 3.865972279628558, + "grad_norm": 10.626866340637207, + "learning_rate": 4.693718812416792e-05, + "loss": 0.7407, "step": 280600 }, { - "epoch": 2.86, - "learning_rate": 5.623353617159874e-05, - "loss": 0.8468, + "epoch": 3.8673500316882974, + "grad_norm": 5.20609712600708, + "learning_rate": 4.693151712223346e-05, + "loss": 0.7583, "step": 280700 }, { - "epoch": 2.86, - "learning_rate": 5.623031748223037e-05, - "loss": 0.9591, + "epoch": 3.868727783748037, + "grad_norm": 12.129583358764648, + "learning_rate": 4.69258444694922e-05, + "loss": 0.6739, "step": 280800 }, { - "epoch": 2.86, - "learning_rate": 5.6227097510356013e-05, - "loss": 0.9549, + "epoch": 3.870105535807776, + "grad_norm": 15.339916229248047, + "learning_rate": 4.692017016642607e-05, + "loss": 0.6958, "step": 280900 }, { - "epoch": 2.86, - "learning_rate": 5.62238762561331e-05, - "loss": 0.8785, + "epoch": 3.871483287867515, + "grad_norm": 27.290355682373047, + "learning_rate": 4.691449421351715e-05, + "loss": 0.756, "step": 281000 }, { - "epoch": 2.86, - "learning_rate": 5.622065371971914e-05, - "loss": 0.9843, + "epoch": 3.8728610399272547, + "grad_norm": 15.555962562561035, + "learning_rate": 4.690881661124761e-05, + "loss": 0.6909, "step": 281100 }, { - "epoch": 2.86, - "learning_rate": 5.621742990127167e-05, - "loss": 0.9294, + "epoch": 3.8742387919869943, + "grad_norm": 3.7568814754486084, + "learning_rate": 4.690313736009979e-05, + "loss": 0.7812, "step": 281200 }, { - "epoch": 2.87, - "learning_rate": 5.621423705829636e-05, - "loss": 1.0382, + "epoch": 3.8756165440467334, + "grad_norm": 4.715430736541748, + "learning_rate": 4.6897456460556204e-05, + "loss": 0.7335, "step": 281300 }, { - "epoch": 2.87, - "learning_rate": 5.6211010689071265e-05, - "loss": 1.1271, + "epoch": 3.8769942961064725, + "grad_norm": 4.514675617218018, + "learning_rate": 4.6891773913099454e-05, + "loss": 0.688, "step": 281400 }, { - "epoch": 2.87, - "learning_rate": 5.6207815321135244e-05, - "loss": 1.1141, + "epoch": 3.878372048166212, + "grad_norm": 3.296437978744507, + "learning_rate": 4.6886089718212295e-05, + "loss": 0.7317, "step": 281500 }, { - "epoch": 2.87, - "learning_rate": 5.6204586401757216e-05, - "loss": 0.9543, + "epoch": 3.879749800225951, + "grad_norm": 11.520378112792969, + "learning_rate": 4.6880403876377646e-05, + "loss": 0.8156, "step": 281600 }, { - "epoch": 2.87, - "learning_rate": 5.62013562011313e-05, - "loss": 1.0317, + "epoch": 3.8811275522856907, + "grad_norm": 12.717551231384277, + "learning_rate": 4.687471638807853e-05, + "loss": 0.8177, "step": 281700 }, { - "epoch": 2.87, - "learning_rate": 5.619812471941542e-05, - "loss": 0.8983, + "epoch": 3.88250530434543, + "grad_norm": 8.70654010772705, + "learning_rate": 4.686902725379814e-05, + "loss": 0.8383, "step": 281800 }, { - "epoch": 2.87, - "learning_rate": 5.619489195676757e-05, - "loss": 0.9502, + "epoch": 3.8838830564051694, + "grad_norm": 26.836902618408203, + "learning_rate": 4.686333647401979e-05, + "loss": 0.8805, "step": 281900 }, { - "epoch": 2.87, - "learning_rate": 5.6191657913345844e-05, - "loss": 0.9722, + "epoch": 3.8852608084649085, + "grad_norm": 18.273624420166016, + "learning_rate": 4.685764404922695e-05, + "loss": 0.8539, "step": 282000 }, { - "epoch": 2.87, - "learning_rate": 5.6188422589308346e-05, - "loss": 0.9966, + "epoch": 3.886638560524648, + "grad_norm": 16.71436309814453, + "learning_rate": 4.685194997990321e-05, + "loss": 0.6822, "step": 282100 }, { - "epoch": 2.88, - "learning_rate": 5.618518598481327e-05, - "loss": 1.0188, + "epoch": 3.888016312584387, + "grad_norm": 13.167102813720703, + "learning_rate": 4.684625426653232e-05, + "loss": 0.8249, "step": 282200 }, { - "epoch": 2.88, - "learning_rate": 5.618194810001886e-05, - "loss": 0.9277, + "epoch": 3.8893940646441267, + "grad_norm": 9.252867698669434, + "learning_rate": 4.684055690959815e-05, + "loss": 0.7913, "step": 282300 }, { - "epoch": 2.88, - "learning_rate": 5.617870893508343e-05, - "loss": 0.8661, + "epoch": 3.890771816703866, + "grad_norm": 18.08372688293457, + "learning_rate": 4.683485790958472e-05, + "loss": 0.735, "step": 282400 }, { - "epoch": 2.88, - "learning_rate": 5.617546849016537e-05, - "loss": 0.8409, + "epoch": 3.8921495687636054, + "grad_norm": 22.017805099487305, + "learning_rate": 4.682915726697621e-05, + "loss": 0.6777, "step": 282500 }, { - "epoch": 2.88, - "learning_rate": 5.61722267654231e-05, - "loss": 0.8358, + "epoch": 3.8935273208233445, + "grad_norm": 7.59044075012207, + "learning_rate": 4.6823454982256896e-05, + "loss": 0.7403, "step": 282600 }, { - "epoch": 2.88, - "learning_rate": 5.6168983761015136e-05, - "loss": 0.8698, + "epoch": 3.894905072883084, + "grad_norm": 19.335826873779297, + "learning_rate": 4.681775105591122e-05, + "loss": 0.8477, "step": 282700 }, { - "epoch": 2.88, - "learning_rate": 5.6165739477100025e-05, - "loss": 0.8788, + "epoch": 3.896282824942823, + "grad_norm": 3.237865924835205, + "learning_rate": 4.681204548842376e-05, + "loss": 0.7686, "step": 282800 }, { - "epoch": 2.88, - "learning_rate": 5.616249391383641e-05, - "loss": 0.8727, + "epoch": 3.8976605770025627, + "grad_norm": 3.7802419662475586, + "learning_rate": 4.680633828027924e-05, + "loss": 0.7711, "step": 282900 }, { - "epoch": 2.88, - "learning_rate": 5.615924707138297e-05, - "loss": 0.9651, + "epoch": 3.899038329062302, + "grad_norm": 9.453742027282715, + "learning_rate": 4.680062943196252e-05, + "loss": 0.7047, "step": 283000 }, { - "epoch": 2.88, - "learning_rate": 5.615599894989846e-05, - "loss": 0.8527, + "epoch": 3.9004160811220414, + "grad_norm": 7.233980655670166, + "learning_rate": 4.679491894395857e-05, + "loss": 0.6415, "step": 283100 }, { - "epoch": 2.89, - "learning_rate": 5.6152749549541684e-05, - "loss": 0.9046, + "epoch": 3.9017938331817805, + "grad_norm": 3.4469568729400635, + "learning_rate": 4.678920681675256e-05, + "loss": 0.6709, "step": 283200 }, { - "epoch": 2.89, - "learning_rate": 5.614949887047154e-05, - "loss": 0.9876, + "epoch": 3.90317158524152, + "grad_norm": 4.129087924957275, + "learning_rate": 4.678349305082975e-05, + "loss": 0.7867, "step": 283300 }, { - "epoch": 2.89, - "learning_rate": 5.614624691284694e-05, - "loss": 1.0558, + "epoch": 3.904549337301259, + "grad_norm": 8.282999992370605, + "learning_rate": 4.677777764667554e-05, + "loss": 0.8287, "step": 283400 }, { - "epoch": 2.89, - "learning_rate": 5.61429936768269e-05, - "loss": 0.8692, + "epoch": 3.9059270893609987, + "grad_norm": 12.684904098510742, + "learning_rate": 4.67720606047755e-05, + "loss": 0.751, "step": 283500 }, { - "epoch": 2.89, - "learning_rate": 5.613973916257049e-05, - "loss": 0.8844, + "epoch": 3.907304841420738, + "grad_norm": 1.003591775894165, + "learning_rate": 4.6766341925615316e-05, + "loss": 0.8116, "step": 283600 }, { - "epoch": 2.89, - "learning_rate": 5.613648337023682e-05, - "loss": 0.861, + "epoch": 3.9086825934804774, + "grad_norm": 6.440825939178467, + "learning_rate": 4.676062160968082e-05, + "loss": 0.7158, "step": 283700 }, { - "epoch": 2.89, - "learning_rate": 5.61332262999851e-05, - "loss": 1.0241, + "epoch": 3.9100603455402165, + "grad_norm": 2.9218430519104004, + "learning_rate": 4.6754899657457974e-05, + "loss": 0.7279, "step": 283800 }, { - "epoch": 2.89, - "learning_rate": 5.612996795197456e-05, - "loss": 0.8861, + "epoch": 3.911438097599956, + "grad_norm": 8.239198684692383, + "learning_rate": 4.674917606943291e-05, + "loss": 0.7192, "step": 283900 }, { - "epoch": 2.89, - "learning_rate": 5.6126708326364526e-05, - "loss": 0.7528, + "epoch": 3.912815849659695, + "grad_norm": 5.853105545043945, + "learning_rate": 4.674345084609184e-05, + "loss": 0.8082, "step": 284000 }, { - "epoch": 2.89, - "learning_rate": 5.6123447423314365e-05, - "loss": 0.9213, + "epoch": 3.9141936017194343, + "grad_norm": 11.474648475646973, + "learning_rate": 4.6737723987921185e-05, + "loss": 0.7637, "step": 284100 }, { - "epoch": 2.9, - "learning_rate": 5.612018524298353e-05, - "loss": 0.9137, + "epoch": 3.915571353779174, + "grad_norm": 16.13724136352539, + "learning_rate": 4.673199549540746e-05, + "loss": 0.7011, "step": 284200 }, { - "epoch": 2.9, - "learning_rate": 5.611692178553151e-05, - "loss": 0.7721, + "epoch": 3.9169491058389134, + "grad_norm": 8.205422401428223, + "learning_rate": 4.6726265369037325e-05, + "loss": 0.7268, "step": 284300 }, { - "epoch": 2.9, - "learning_rate": 5.611365705111787e-05, - "loss": 0.7391, + "epoch": 3.9183268578986525, + "grad_norm": 11.68458080291748, + "learning_rate": 4.6720533609297583e-05, + "loss": 0.7169, "step": 284400 }, { - "epoch": 2.9, - "learning_rate": 5.611042370633404e-05, - "loss": 0.9863, + "epoch": 3.9197046099583917, + "grad_norm": 12.15461254119873, + "learning_rate": 4.671480021667518e-05, + "loss": 0.7506, "step": 284500 }, { - "epoch": 2.9, - "learning_rate": 5.610715643124174e-05, - "loss": 0.9222, + "epoch": 3.921082362018131, + "grad_norm": 14.598043441772461, + "learning_rate": 4.67090651916572e-05, + "loss": 0.717, "step": 284600 }, { - "epoch": 2.9, - "learning_rate": 5.610388787966529e-05, - "loss": 0.9537, + "epoch": 3.9224601140778708, + "grad_norm": 35.984710693359375, + "learning_rate": 4.6703328534730857e-05, + "loss": 0.7073, "step": 284700 }, { - "epoch": 2.9, - "learning_rate": 5.6100618051764506e-05, - "loss": 0.9555, + "epoch": 3.92383786613761, + "grad_norm": 15.192087173461914, + "learning_rate": 4.6697590246383505e-05, + "loss": 0.727, "step": 284800 }, { - "epoch": 2.9, - "learning_rate": 5.609734694769926e-05, - "loss": 0.8755, + "epoch": 3.925215618197349, + "grad_norm": 12.10468578338623, + "learning_rate": 4.669185032710266e-05, + "loss": 0.7586, "step": 284900 }, { - "epoch": 2.9, - "learning_rate": 5.609407456762948e-05, - "loss": 1.0815, + "epoch": 3.9265933702570885, + "grad_norm": 17.737178802490234, + "learning_rate": 4.668610877737594e-05, + "loss": 0.9379, "step": 285000 }, { - "epoch": 2.9, - "learning_rate": 5.6090800911715183e-05, - "loss": 1.0215, + "epoch": 3.927971122316828, + "grad_norm": 29.761802673339844, + "learning_rate": 4.668036559769112e-05, + "loss": 0.7204, "step": 285100 }, { - "epoch": 2.91, - "learning_rate": 5.6087525980116416e-05, - "loss": 0.9752, + "epoch": 3.929348874376567, + "grad_norm": 18.861635208129883, + "learning_rate": 4.667462078853611e-05, + "loss": 0.688, "step": 285200 }, { - "epoch": 2.91, - "learning_rate": 5.608424977299332e-05, - "loss": 1.0344, + "epoch": 3.9307266264363063, + "grad_norm": 114.94448852539062, + "learning_rate": 4.666887435039898e-05, + "loss": 0.7467, "step": 285300 }, { - "epoch": 2.91, - "learning_rate": 5.608097229050606e-05, - "loss": 0.9516, + "epoch": 3.932104378496046, + "grad_norm": 6.358266830444336, + "learning_rate": 4.66631262837679e-05, + "loss": 0.7498, "step": 285400 }, { - "epoch": 2.91, - "learning_rate": 5.60776935328149e-05, - "loss": 1.0518, + "epoch": 3.9334821305557854, + "grad_norm": 6.687554836273193, + "learning_rate": 4.665737658913121e-05, + "loss": 0.723, "step": 285500 }, { - "epoch": 2.91, - "learning_rate": 5.6074413500080156e-05, - "loss": 0.9529, + "epoch": 3.9348598826155246, + "grad_norm": 5.466715335845947, + "learning_rate": 4.6651625266977366e-05, + "loss": 0.7884, "step": 285600 }, { - "epoch": 2.91, - "learning_rate": 5.6071132192462196e-05, - "loss": 0.9231, + "epoch": 3.9362376346752637, + "grad_norm": 30.31229591369629, + "learning_rate": 4.664587231779498e-05, + "loss": 0.7059, "step": 285700 }, { - "epoch": 2.91, - "learning_rate": 5.6067882442254214e-05, - "loss": 0.7926, + "epoch": 3.937615386735003, + "grad_norm": 12.738252639770508, + "learning_rate": 4.6640117742072786e-05, + "loss": 0.7394, "step": 285800 }, { - "epoch": 2.91, - "learning_rate": 5.606459859809602e-05, - "loss": 0.8186, + "epoch": 3.9389931387947423, + "grad_norm": 14.36909294128418, + "learning_rate": 4.663436154029967e-05, + "loss": 0.7453, "step": 285900 }, { - "epoch": 2.91, - "learning_rate": 5.60613134795345e-05, - "loss": 0.9389, + "epoch": 3.940370890854482, + "grad_norm": 6.872807025909424, + "learning_rate": 4.662860371296466e-05, + "loss": 0.6916, "step": 286000 }, { - "epoch": 2.91, - "learning_rate": 5.605802708673028e-05, - "loss": 0.9772, + "epoch": 3.941748642914221, + "grad_norm": 9.961835861206055, + "learning_rate": 4.6622844260556886e-05, + "loss": 0.7001, "step": 286100 }, { - "epoch": 2.92, - "learning_rate": 5.605473941984404e-05, - "loss": 1.0135, + "epoch": 3.9431263949739606, + "grad_norm": 19.924083709716797, + "learning_rate": 4.661708318356567e-05, + "loss": 0.8202, "step": 286200 }, { - "epoch": 2.92, - "learning_rate": 5.605145047903653e-05, - "loss": 0.9971, + "epoch": 3.9445041470336997, + "grad_norm": 18.48103141784668, + "learning_rate": 4.6611320482480436e-05, + "loss": 0.755, "step": 286300 }, { - "epoch": 2.92, - "learning_rate": 5.6048160264468555e-05, - "loss": 0.9645, + "epoch": 3.9458818990934392, + "grad_norm": 7.876520156860352, + "learning_rate": 4.660555615779075e-05, + "loss": 0.8122, "step": 286400 }, { - "epoch": 2.92, - "learning_rate": 5.604486877630099e-05, - "loss": 0.8918, + "epoch": 3.9472596511531783, + "grad_norm": 23.327024459838867, + "learning_rate": 4.659979020998634e-05, + "loss": 0.6568, "step": 286500 }, { - "epoch": 2.92, - "learning_rate": 5.604157601469477e-05, - "loss": 1.034, + "epoch": 3.948637403212918, + "grad_norm": 2.7605140209198, + "learning_rate": 4.659402263955702e-05, + "loss": 0.8279, "step": 286600 }, { - "epoch": 2.92, - "learning_rate": 5.603828197981089e-05, - "loss": 0.9891, + "epoch": 3.950015155272657, + "grad_norm": 11.242687225341797, + "learning_rate": 4.65882534469928e-05, + "loss": 0.7345, "step": 286700 }, { - "epoch": 2.92, - "learning_rate": 5.6034986671810416e-05, - "loss": 0.9316, + "epoch": 3.9513929073323966, + "grad_norm": 4.0618438720703125, + "learning_rate": 4.65824826327838e-05, + "loss": 0.6334, "step": 286800 }, { - "epoch": 2.92, - "learning_rate": 5.6031690090854455e-05, - "loss": 0.8739, + "epoch": 3.9527706593921357, + "grad_norm": 29.51956558227539, + "learning_rate": 4.6576710197420264e-05, + "loss": 0.6697, "step": 286900 }, { - "epoch": 2.92, - "learning_rate": 5.60283922371042e-05, - "loss": 0.8775, + "epoch": 3.9541484114518752, + "grad_norm": 3.382915735244751, + "learning_rate": 4.6570936141392604e-05, + "loss": 0.8269, "step": 287000 }, { - "epoch": 2.93, - "learning_rate": 5.602509311072089e-05, - "loss": 0.989, + "epoch": 3.9555261635116143, + "grad_norm": 1.3904200792312622, + "learning_rate": 4.656516046519135e-05, + "loss": 0.7169, "step": 287100 }, { - "epoch": 2.93, - "learning_rate": 5.602179271186584e-05, - "loss": 0.8074, + "epoch": 3.956903915571354, + "grad_norm": 4.980300426483154, + "learning_rate": 4.655938316930719e-05, + "loss": 0.7243, "step": 287200 }, { - "epoch": 2.93, - "learning_rate": 5.601849104070042e-05, - "loss": 0.8277, + "epoch": 3.958281667631093, + "grad_norm": 11.517420768737793, + "learning_rate": 4.6553604254230914e-05, + "loss": 0.7269, "step": 287300 }, { - "epoch": 2.93, - "learning_rate": 5.6015188097386054e-05, - "loss": 0.9182, + "epoch": 3.9596594196908326, + "grad_norm": 6.000971794128418, + "learning_rate": 4.654782372045348e-05, + "loss": 0.7721, "step": 287400 }, { - "epoch": 2.93, - "learning_rate": 5.601188388208424e-05, - "loss": 0.9339, + "epoch": 3.9610371717505717, + "grad_norm": 2.6846120357513428, + "learning_rate": 4.654204156846597e-05, + "loss": 0.6701, "step": 287500 }, { - "epoch": 2.93, - "learning_rate": 5.600857839495654e-05, - "loss": 0.8433, + "epoch": 3.9624149238103112, + "grad_norm": 7.725025653839111, + "learning_rate": 4.653625779875961e-05, + "loss": 0.614, "step": 287600 }, { - "epoch": 2.93, - "learning_rate": 5.600527163616456e-05, - "loss": 0.9242, + "epoch": 3.9637926758700504, + "grad_norm": 6.2518205642700195, + "learning_rate": 4.653047241182576e-05, + "loss": 0.7316, "step": 287700 }, { - "epoch": 2.93, - "learning_rate": 5.600196360587e-05, - "loss": 0.7592, + "epoch": 3.96517042792979, + "grad_norm": 31.462007522583008, + "learning_rate": 4.652468540815593e-05, + "loss": 0.7132, "step": 287800 }, { - "epoch": 2.93, - "learning_rate": 5.599865430423458e-05, - "loss": 0.8787, + "epoch": 3.966548179989529, + "grad_norm": 5.510139465332031, + "learning_rate": 4.651889678824173e-05, + "loss": 0.7492, "step": 287900 }, { - "epoch": 2.93, - "learning_rate": 5.5995343731420126e-05, - "loss": 0.895, + "epoch": 3.9679259320492686, + "grad_norm": 2.874608278274536, + "learning_rate": 4.6513106552574964e-05, + "loss": 0.6882, "step": 288000 }, { - "epoch": 2.94, - "learning_rate": 5.599203188758848e-05, - "loss": 0.7914, + "epoch": 3.9693036841090077, + "grad_norm": 85.63662719726562, + "learning_rate": 4.650731470164752e-05, + "loss": 0.7373, "step": 288100 }, { - "epoch": 2.94, - "learning_rate": 5.5988718772901604e-05, - "loss": 0.867, + "epoch": 3.9706814361687472, + "grad_norm": 9.946539878845215, + "learning_rate": 4.650152123595144e-05, + "loss": 0.7325, "step": 288200 }, { - "epoch": 2.94, - "learning_rate": 5.598540438752147e-05, - "loss": 0.9595, + "epoch": 3.9720591882284864, + "grad_norm": 18.722869873046875, + "learning_rate": 4.6495726155978936e-05, + "loss": 0.7257, "step": 288300 }, { - "epoch": 2.94, - "learning_rate": 5.598208873161013e-05, - "loss": 0.8361, + "epoch": 3.9734369402882255, + "grad_norm": 7.619026184082031, + "learning_rate": 4.64899294622223e-05, + "loss": 0.7141, "step": 288400 }, { - "epoch": 2.94, - "learning_rate": 5.5978771805329694e-05, - "loss": 0.7567, + "epoch": 3.974814692347965, + "grad_norm": 10.969167709350586, + "learning_rate": 4.648413115517401e-05, + "loss": 0.7081, "step": 288500 }, { - "epoch": 2.94, - "learning_rate": 5.597545360884236e-05, - "loss": 0.9576, + "epoch": 3.9761924444077046, + "grad_norm": 11.60225772857666, + "learning_rate": 4.647833123532667e-05, + "loss": 0.6833, "step": 288600 }, { - "epoch": 2.94, - "learning_rate": 5.5972134142310356e-05, - "loss": 0.8621, + "epoch": 3.9775701964674437, + "grad_norm": 6.163235664367676, + "learning_rate": 4.647252970317297e-05, + "loss": 0.6646, "step": 288700 }, { - "epoch": 2.94, - "learning_rate": 5.5968813405895994e-05, - "loss": 0.8485, + "epoch": 3.978947948527183, + "grad_norm": 9.643515586853027, + "learning_rate": 4.646672655920583e-05, + "loss": 0.6888, "step": 288800 }, { - "epoch": 2.94, - "learning_rate": 5.5965491399761624e-05, - "loss": 0.8123, + "epoch": 3.9803257005869224, + "grad_norm": 3.917431354522705, + "learning_rate": 4.646092180391824e-05, + "loss": 0.685, "step": 288900 }, { - "epoch": 2.94, - "learning_rate": 5.596216812406967e-05, - "loss": 0.9099, + "epoch": 3.981703452646662, + "grad_norm": 9.85342788696289, + "learning_rate": 4.645511543780333e-05, + "loss": 0.6969, "step": 289000 }, { - "epoch": 2.95, - "learning_rate": 5.595884357898265e-05, - "loss": 0.7902, + "epoch": 3.983081204706401, + "grad_norm": 9.19890308380127, + "learning_rate": 4.644930746135438e-05, + "loss": 0.7208, "step": 289100 }, { - "epoch": 2.95, - "learning_rate": 5.595551776466307e-05, - "loss": 0.9385, + "epoch": 3.98445895676614, + "grad_norm": 17.32145118713379, + "learning_rate": 4.644349787506483e-05, + "loss": 0.7199, "step": 289200 }, { - "epoch": 2.95, - "learning_rate": 5.5952190681273575e-05, - "loss": 0.849, + "epoch": 3.9858367088258797, + "grad_norm": 6.630151271820068, + "learning_rate": 4.643768667942821e-05, + "loss": 0.7026, "step": 289300 }, { - "epoch": 2.95, - "learning_rate": 5.5948862328976834e-05, - "loss": 0.8103, + "epoch": 3.9872144608856193, + "grad_norm": 3.8577563762664795, + "learning_rate": 4.6431873874938235e-05, + "loss": 0.7031, "step": 289400 }, { - "epoch": 2.95, - "learning_rate": 5.594553270793558e-05, - "loss": 0.8966, + "epoch": 3.9885922129453584, + "grad_norm": 5.545560836791992, + "learning_rate": 4.642611761417697e-05, + "loss": 0.7269, "step": 289500 }, { - "epoch": 2.95, - "learning_rate": 5.5942201818312605e-05, - "loss": 0.8605, + "epoch": 3.9899699650050975, + "grad_norm": 6.154033660888672, + "learning_rate": 4.6420301609538085e-05, + "loss": 0.7344, "step": 289600 }, { - "epoch": 2.95, - "learning_rate": 5.593886966027078e-05, - "loss": 0.8312, + "epoch": 3.991347717064837, + "grad_norm": 17.962711334228516, + "learning_rate": 4.6414483997522785e-05, + "loss": 0.6977, "step": 289700 }, { - "epoch": 2.95, - "learning_rate": 5.593553623397303e-05, - "loss": 0.8683, + "epoch": 3.9927254691245766, + "grad_norm": 12.827983856201172, + "learning_rate": 4.6408664778625296e-05, + "loss": 0.7511, "step": 289800 }, { - "epoch": 2.95, - "learning_rate": 5.5932201539582324e-05, - "loss": 0.7844, + "epoch": 3.9941032211843157, + "grad_norm": 9.863675117492676, + "learning_rate": 4.640284395334e-05, + "loss": 0.6639, "step": 289900 }, { - "epoch": 2.95, - "learning_rate": 5.592886557726172e-05, - "loss": 0.7137, + "epoch": 3.995480973244055, + "grad_norm": 1.7848522663116455, + "learning_rate": 4.639702152216141e-05, + "loss": 0.7221, "step": 290000 }, { - "epoch": 2.96, - "learning_rate": 5.5925528347174324e-05, - "loss": 0.8806, + "epoch": 3.9968587253037944, + "grad_norm": 9.894287109375, + "learning_rate": 4.6391197485584164e-05, + "loss": 0.6245, "step": 290100 }, { - "epoch": 2.96, - "learning_rate": 5.5922189849483306e-05, - "loss": 0.9878, + "epoch": 3.9982364773635335, + "grad_norm": 3.5770010948181152, + "learning_rate": 4.638537184410304e-05, + "loss": 0.7844, "step": 290200 }, { - "epoch": 2.96, - "learning_rate": 5.591885008435191e-05, - "loss": 0.8321, + "epoch": 3.999614229423273, + "grad_norm": 5.768791198730469, + "learning_rate": 4.637954459821296e-05, + "loss": 0.6162, "step": 290300 }, { - "epoch": 2.96, - "learning_rate": 5.591550905194341e-05, - "loss": 0.7921, + "epoch": 4.000991981483012, + "grad_norm": 2.224738836288452, + "learning_rate": 4.637371574840898e-05, + "loss": 0.6058, "step": 290400 }, { - "epoch": 2.96, - "learning_rate": 5.591216675242118e-05, - "loss": 0.8759, + "epoch": 4.002369733542752, + "grad_norm": 3.0572924613952637, + "learning_rate": 4.636788529518629e-05, + "loss": 0.6076, "step": 290500 }, { - "epoch": 2.96, - "learning_rate": 5.590882318594862e-05, - "loss": 0.8571, + "epoch": 4.003747485602491, + "grad_norm": 2.082035541534424, + "learning_rate": 4.6362053239040225e-05, + "loss": 0.6615, "step": 290600 }, { - "epoch": 2.96, - "learning_rate": 5.590547835268924e-05, - "loss": 0.8859, + "epoch": 4.00512523766223, + "grad_norm": 18.01354217529297, + "learning_rate": 4.635621958046623e-05, + "loss": 0.6938, "step": 290700 }, { - "epoch": 2.96, - "learning_rate": 5.590213225280655e-05, - "loss": 0.92, + "epoch": 4.0065029897219695, + "grad_norm": 5.204951286315918, + "learning_rate": 4.635038431995992e-05, + "loss": 0.7008, "step": 290800 }, { - "epoch": 2.96, - "learning_rate": 5.589878488646417e-05, - "loss": 0.8402, + "epoch": 4.007880741781709, + "grad_norm": 12.204028129577637, + "learning_rate": 4.634454745801702e-05, + "loss": 0.6751, "step": 290900 }, { - "epoch": 2.96, - "learning_rate": 5.589543625382577e-05, - "loss": 0.8618, + "epoch": 4.009258493841449, + "grad_norm": 17.813941955566406, + "learning_rate": 4.6338708995133405e-05, + "loss": 0.6748, "step": 291000 }, { - "epoch": 2.97, - "learning_rate": 5.5892086355055064e-05, - "loss": 0.8839, + "epoch": 4.010636245901187, + "grad_norm": 8.7006196975708, + "learning_rate": 4.6332868931805086e-05, + "loss": 0.6319, "step": 291100 }, { - "epoch": 2.97, - "learning_rate": 5.588873519031587e-05, - "loss": 0.8851, + "epoch": 4.012013997960927, + "grad_norm": 9.431085586547852, + "learning_rate": 4.632702726852821e-05, + "loss": 0.6698, "step": 291200 }, { - "epoch": 2.97, - "learning_rate": 5.588538275977201e-05, - "loss": 0.9046, + "epoch": 4.013391750020666, + "grad_norm": 3.8761565685272217, + "learning_rate": 4.632118400579903e-05, + "loss": 0.6349, "step": 291300 }, { - "epoch": 2.97, - "learning_rate": 5.58820290635874e-05, - "loss": 0.8374, + "epoch": 4.014769502080406, + "grad_norm": 6.5067338943481445, + "learning_rate": 4.6315339144113996e-05, + "loss": 0.6412, "step": 291400 }, { - "epoch": 2.97, - "learning_rate": 5.587867410192603e-05, - "loss": 0.8257, + "epoch": 4.016147254140145, + "grad_norm": 3.528282403945923, + "learning_rate": 4.630949268396964e-05, + "loss": 0.6492, "step": 291500 }, { - "epoch": 2.97, - "learning_rate": 5.5875317874951936e-05, - "loss": 0.7741, + "epoch": 4.017525006199884, + "grad_norm": 10.56949520111084, + "learning_rate": 4.630364462586265e-05, + "loss": 0.7175, "step": 291600 }, { - "epoch": 2.97, - "learning_rate": 5.587196038282921e-05, - "loss": 0.9485, + "epoch": 4.018902758259624, + "grad_norm": 8.316420555114746, + "learning_rate": 4.629779497028985e-05, + "loss": 0.6765, "step": 291700 }, { - "epoch": 2.97, - "learning_rate": 5.586860162572202e-05, - "loss": 1.075, + "epoch": 4.020280510319363, + "grad_norm": 4.828928470611572, + "learning_rate": 4.629194371774819e-05, + "loss": 0.6435, "step": 291800 }, { - "epoch": 2.97, - "learning_rate": 5.5865241603794596e-05, - "loss": 0.892, + "epoch": 4.021658262379102, + "grad_norm": 14.561111450195312, + "learning_rate": 4.628609086873478e-05, + "loss": 0.6342, "step": 291900 }, { - "epoch": 2.97, - "learning_rate": 5.586191393633654e-05, - "loss": 0.87, + "epoch": 4.0230360144388415, + "grad_norm": 2.1733908653259277, + "learning_rate": 4.628023642374684e-05, + "loss": 0.6746, "step": 292000 }, { - "epoch": 2.98, - "learning_rate": 5.585858502954865e-05, - "loss": 0.7792, + "epoch": 4.024413766498581, + "grad_norm": 38.17127227783203, + "learning_rate": 4.627438038328174e-05, + "loss": 0.6031, "step": 292100 }, { - "epoch": 2.98, - "learning_rate": 5.585522123943137e-05, - "loss": 0.9867, + "epoch": 4.025791518558321, + "grad_norm": 151.87948608398438, + "learning_rate": 4.6268522747836986e-05, + "loss": 0.7576, "step": 292200 }, { - "epoch": 2.98, - "learning_rate": 5.585185618514807e-05, - "loss": 0.9321, + "epoch": 4.027169270618059, + "grad_norm": 6.559102535247803, + "learning_rate": 4.626266351791019e-05, + "loss": 0.6567, "step": 292300 }, { - "epoch": 2.98, - "learning_rate": 5.584848986686328e-05, - "loss": 0.7684, + "epoch": 4.028547022677799, + "grad_norm": 3.6919069290161133, + "learning_rate": 4.6256802693999145e-05, + "loss": 0.6294, "step": 292400 }, { - "epoch": 2.98, - "learning_rate": 5.584512228474159e-05, - "loss": 0.7608, + "epoch": 4.029924774737538, + "grad_norm": 6.815934658050537, + "learning_rate": 4.625094027660175e-05, + "loss": 0.7439, "step": 292500 }, { - "epoch": 2.98, - "learning_rate": 5.5841753438947655e-05, - "loss": 0.8884, + "epoch": 4.031302526797278, + "grad_norm": 2.451958417892456, + "learning_rate": 4.6245076266216055e-05, + "loss": 0.7037, "step": 292600 }, { - "epoch": 2.98, - "learning_rate": 5.5838383329646206e-05, - "loss": 0.828, + "epoch": 4.032680278857017, + "grad_norm": 12.651769638061523, + "learning_rate": 4.623921066334022e-05, + "loss": 0.7634, "step": 292700 }, { - "epoch": 2.98, - "learning_rate": 5.583501195700199e-05, - "loss": 0.9038, + "epoch": 4.034058030916756, + "grad_norm": 3.5858993530273438, + "learning_rate": 4.6233343468472586e-05, + "loss": 0.7031, "step": 292800 }, { - "epoch": 2.98, - "learning_rate": 5.583163932117988e-05, - "loss": 0.9312, + "epoch": 4.035435782976496, + "grad_norm": 37.72840118408203, + "learning_rate": 4.622747468211157e-05, + "loss": 0.5645, "step": 292900 }, { - "epoch": 2.99, - "learning_rate": 5.582826542234474e-05, - "loss": 0.9039, + "epoch": 4.036813535036235, + "grad_norm": 8.41267204284668, + "learning_rate": 4.622160430475579e-05, + "loss": 0.5949, "step": 293000 }, { - "epoch": 2.99, - "learning_rate": 5.582489026066157e-05, - "loss": 0.8819, + "epoch": 4.038191287095974, + "grad_norm": 22.63103675842285, + "learning_rate": 4.621579106445377e-05, + "loss": 0.6729, "step": 293100 }, { - "epoch": 2.99, - "learning_rate": 5.5821513836295385e-05, - "loss": 0.8151, + "epoch": 4.0395690391557135, + "grad_norm": 17.30426788330078, + "learning_rate": 4.620991752250219e-05, + "loss": 0.6312, "step": 293200 }, { - "epoch": 2.99, - "learning_rate": 5.5818136149411264e-05, - "loss": 0.9776, + "epoch": 4.040946791215453, + "grad_norm": 10.727584838867188, + "learning_rate": 4.620404239104742e-05, + "loss": 0.5985, "step": 293300 }, { - "epoch": 2.99, - "learning_rate": 5.581475720017436e-05, - "loss": 0.7361, + "epoch": 4.042324543275193, + "grad_norm": 10.214902877807617, + "learning_rate": 4.619816567058855e-05, + "loss": 0.6537, "step": 293400 }, { - "epoch": 2.99, - "learning_rate": 5.581137698874988e-05, - "loss": 0.9316, + "epoch": 4.043702295334931, + "grad_norm": 7.953614234924316, + "learning_rate": 4.6192287361624846e-05, + "loss": 0.7053, "step": 293500 }, { - "epoch": 2.99, - "learning_rate": 5.58079955153031e-05, - "loss": 0.8464, + "epoch": 4.045080047394671, + "grad_norm": 9.201581001281738, + "learning_rate": 4.6186407464655706e-05, + "loss": 0.6549, "step": 293600 }, { - "epoch": 2.99, - "learning_rate": 5.580461277999936e-05, - "loss": 0.8969, + "epoch": 4.04645779945441, + "grad_norm": 5.408073902130127, + "learning_rate": 4.6180525980180656e-05, + "loss": 0.6672, "step": 293700 }, { - "epoch": 2.99, - "learning_rate": 5.580122878300404e-05, - "loss": 0.8936, + "epoch": 4.047835551514149, + "grad_norm": 14.326773643493652, + "learning_rate": 4.617464290869934e-05, + "loss": 0.5852, "step": 293800 }, { - "epoch": 2.99, - "learning_rate": 5.579784352448261e-05, - "loss": 1.052, + "epoch": 4.049213303573889, + "grad_norm": 4.585080146789551, + "learning_rate": 4.6168758250711584e-05, + "loss": 0.6757, "step": 293900 }, { - "epoch": 3.0, - "learning_rate": 5.5794457004600587e-05, - "loss": 0.8714, + "epoch": 4.050591055633628, + "grad_norm": 7.5951080322265625, + "learning_rate": 4.6162872006717305e-05, + "loss": 0.6505, "step": 294000 }, { - "epoch": 3.0, - "learning_rate": 5.579106922352355e-05, - "loss": 0.95, + "epoch": 4.051968807693368, + "grad_norm": 6.346579074859619, + "learning_rate": 4.615698417721657e-05, + "loss": 0.6555, "step": 294100 }, { - "epoch": 3.0, - "learning_rate": 5.5787680181417144e-05, - "loss": 0.9016, + "epoch": 4.053346559753106, + "grad_norm": 14.753069877624512, + "learning_rate": 4.6151094762709573e-05, + "loss": 0.6137, "step": 294200 }, { - "epoch": 3.0, - "learning_rate": 5.578428987844706e-05, - "loss": 0.8257, + "epoch": 4.054724311812846, + "grad_norm": 5.630283355712891, + "learning_rate": 4.6145203763696664e-05, + "loss": 0.7107, "step": 294300 }, { - "epoch": 3.0, - "learning_rate": 5.578089831477908e-05, - "loss": 0.8762, + "epoch": 4.0561020638725855, + "grad_norm": 6.181244850158691, + "learning_rate": 4.6139311180678305e-05, + "loss": 0.7511, "step": 294400 }, { - "epoch": 3.0, - "learning_rate": 5.5777505490579026e-05, - "loss": 0.818, + "epoch": 4.057479815932325, + "grad_norm": 9.801922798156738, + "learning_rate": 4.61334170141551e-05, + "loss": 0.6808, "step": 294500 }, { - "epoch": 3.0, - "learning_rate": 5.5774111406012784e-05, - "loss": 0.792, + "epoch": 4.058857567992064, + "grad_norm": 23.463417053222656, + "learning_rate": 4.6127521264627796e-05, + "loss": 0.7575, "step": 294600 }, { - "epoch": 3.0, - "learning_rate": 5.57707160612463e-05, - "loss": 0.8292, + "epoch": 4.060235320051803, + "grad_norm": 13.619138717651367, + "learning_rate": 4.6121623932597266e-05, + "loss": 0.6982, "step": 294700 }, { - "epoch": 3.0, - "learning_rate": 5.57673194564456e-05, - "loss": 0.7993, + "epoch": 4.061613072111543, + "grad_norm": 1.987518548965454, + "learning_rate": 4.611572501856451e-05, + "loss": 0.7263, "step": 294800 }, { - "epoch": 3.0, - "learning_rate": 5.576392159177675e-05, - "loss": 0.7312, + "epoch": 4.062990824171282, + "grad_norm": 8.392542839050293, + "learning_rate": 4.610982452303068e-05, + "loss": 0.6535, "step": 294900 }, { - "epoch": 3.01, - "learning_rate": 5.576052246740588e-05, - "loss": 0.8982, + "epoch": 4.064368576231021, + "grad_norm": 14.59041976928711, + "learning_rate": 4.6103922446497044e-05, + "loss": 0.6871, "step": 295000 }, { - "epoch": 3.01, - "learning_rate": 5.57571220834992e-05, - "loss": 0.8481, + "epoch": 4.065746328290761, + "grad_norm": 2.0860438346862793, + "learning_rate": 4.6098018789465025e-05, + "loss": 0.6721, "step": 295100 }, { - "epoch": 3.01, - "learning_rate": 5.5753720440222954e-05, - "loss": 0.7562, + "epoch": 4.0671240803505, + "grad_norm": 8.673189163208008, + "learning_rate": 4.6092113552436156e-05, + "loss": 0.7174, "step": 295200 }, { - "epoch": 3.01, - "learning_rate": 5.575031753774346e-05, - "loss": 0.8092, + "epoch": 4.06850183241024, + "grad_norm": 2.694934368133545, + "learning_rate": 4.6086206735912134e-05, + "loss": 0.7058, "step": 295300 }, { - "epoch": 3.01, - "learning_rate": 5.574691337622712e-05, - "loss": 0.7914, + "epoch": 4.069879584469978, + "grad_norm": 22.90232276916504, + "learning_rate": 4.608029834039475e-05, + "loss": 0.6488, "step": 295400 }, { - "epoch": 3.01, - "learning_rate": 5.574350795584037e-05, - "loss": 0.8746, + "epoch": 4.071257336529718, + "grad_norm": 4.197026252746582, + "learning_rate": 4.607438836638598e-05, + "loss": 0.7081, "step": 295500 }, { - "epoch": 3.01, - "learning_rate": 5.574010127674971e-05, - "loss": 0.7733, + "epoch": 4.0726350885894576, + "grad_norm": 5.113185882568359, + "learning_rate": 4.6068476814387886e-05, + "loss": 0.5575, "step": 295600 }, { - "epoch": 3.01, - "learning_rate": 5.5736693339121694e-05, - "loss": 0.7928, + "epoch": 4.074012840649197, + "grad_norm": 29.31284523010254, + "learning_rate": 4.606256368490269e-05, + "loss": 0.6179, "step": 295700 }, { - "epoch": 3.01, - "learning_rate": 5.573328414312297e-05, - "loss": 0.8171, + "epoch": 4.075390592708936, + "grad_norm": 3.3541550636291504, + "learning_rate": 4.605664897843274e-05, + "loss": 0.6748, "step": 295800 }, { - "epoch": 3.01, - "learning_rate": 5.5729873688920223e-05, - "loss": 0.8629, + "epoch": 4.076768344768675, + "grad_norm": 21.39793586730957, + "learning_rate": 4.6050732695480535e-05, + "loss": 0.6716, "step": 295900 }, { - "epoch": 3.02, - "learning_rate": 5.572646197668019e-05, - "loss": 0.753, + "epoch": 4.078146096828415, + "grad_norm": 3.5587472915649414, + "learning_rate": 4.604481483654867e-05, + "loss": 0.6151, "step": 296000 }, { - "epoch": 3.02, - "learning_rate": 5.57230490065697e-05, - "loss": 0.7104, + "epoch": 4.0795238488881544, + "grad_norm": 19.50777816772461, + "learning_rate": 4.603889540213993e-05, + "loss": 0.6334, "step": 296100 }, { - "epoch": 3.02, - "learning_rate": 5.571963477875562e-05, - "loss": 0.7908, + "epoch": 4.080901600947893, + "grad_norm": 3.8695263862609863, + "learning_rate": 4.603297439275716e-05, + "loss": 0.6793, "step": 296200 }, { - "epoch": 3.02, - "learning_rate": 5.571621929340488e-05, - "loss": 0.9207, + "epoch": 4.082279353007633, + "grad_norm": 10.028346061706543, + "learning_rate": 4.602705180890341e-05, + "loss": 0.723, "step": 296300 }, { - "epoch": 3.02, - "learning_rate": 5.5712802550684496e-05, - "loss": 0.8474, + "epoch": 4.083657105067372, + "grad_norm": 9.337726593017578, + "learning_rate": 4.602112765108182e-05, + "loss": 0.6028, "step": 296400 }, { - "epoch": 3.02, - "learning_rate": 5.57093845507615e-05, - "loss": 0.7819, + "epoch": 4.085034857127112, + "grad_norm": 1.486716628074646, + "learning_rate": 4.601526118489555e-05, + "loss": 0.6733, "step": 296500 }, { - "epoch": 3.02, - "learning_rate": 5.5705965293803037e-05, - "loss": 0.8818, + "epoch": 4.0864126091868505, + "grad_norm": 228.84585571289062, + "learning_rate": 4.6009333896375405e-05, + "loss": 0.6992, "step": 296600 }, { - "epoch": 3.02, - "learning_rate": 5.5702544779976274e-05, - "loss": 0.7718, + "epoch": 4.08779036124659, + "grad_norm": 10.276219367980957, + "learning_rate": 4.6003405035392656e-05, + "loss": 0.619, "step": 296700 }, { - "epoch": 3.02, - "learning_rate": 5.5699123009448456e-05, - "loss": 0.8192, + "epoch": 4.08916811330633, + "grad_norm": 7.710974216461182, + "learning_rate": 4.599747460245098e-05, + "loss": 0.6592, "step": 296800 }, { - "epoch": 3.02, - "learning_rate": 5.569569998238688e-05, - "loss": 0.8796, + "epoch": 4.090545865366069, + "grad_norm": 8.020652770996094, + "learning_rate": 4.599154259805422e-05, + "loss": 0.7993, "step": 296900 }, { - "epoch": 3.03, - "learning_rate": 5.5692275698958924e-05, - "loss": 0.7936, + "epoch": 4.091923617425808, + "grad_norm": 3.6239333152770996, + "learning_rate": 4.59856090227063e-05, + "loss": 0.599, "step": 297000 }, { - "epoch": 3.03, - "learning_rate": 5.568885015933201e-05, - "loss": 0.758, + "epoch": 4.093301369485547, + "grad_norm": 8.320853233337402, + "learning_rate": 4.597967387691133e-05, + "loss": 0.6745, "step": 297100 }, { - "epoch": 3.03, - "learning_rate": 5.568542336367362e-05, - "loss": 0.7569, + "epoch": 4.094679121545287, + "grad_norm": 4.688587665557861, + "learning_rate": 4.5973737161173515e-05, + "loss": 0.7208, "step": 297200 }, { - "epoch": 3.03, - "learning_rate": 5.568199531215131e-05, - "loss": 0.829, + "epoch": 4.0960568736050265, + "grad_norm": 9.391998291015625, + "learning_rate": 4.5967798875997224e-05, + "loss": 0.7009, "step": 297300 }, { - "epoch": 3.03, - "learning_rate": 5.5678566004932706e-05, - "loss": 0.8228, + "epoch": 4.097434625664765, + "grad_norm": 8.450928688049316, + "learning_rate": 4.596185902188694e-05, + "loss": 0.7006, "step": 297400 }, { - "epoch": 3.03, - "learning_rate": 5.567513544218546e-05, - "loss": 0.8265, + "epoch": 4.098812377724505, + "grad_norm": 17.44727325439453, + "learning_rate": 4.595591759934728e-05, + "loss": 0.6514, "step": 297500 }, { - "epoch": 3.03, - "learning_rate": 5.56717036240773e-05, - "loss": 0.8532, + "epoch": 4.100190129784244, + "grad_norm": 18.792360305786133, + "learning_rate": 4.5949974608882994e-05, + "loss": 0.6217, "step": 297600 }, { - "epoch": 3.03, - "learning_rate": 5.566827055077605e-05, - "loss": 0.704, + "epoch": 4.101567881843984, + "grad_norm": 35.528717041015625, + "learning_rate": 4.594403005099898e-05, + "loss": 0.7896, "step": 297700 }, { - "epoch": 3.03, - "learning_rate": 5.5664836222449535e-05, - "loss": 0.8346, + "epoch": 4.1029456339037225, + "grad_norm": 45.263118743896484, + "learning_rate": 4.593808392620025e-05, + "loss": 0.721, "step": 297800 }, { - "epoch": 3.04, - "learning_rate": 5.56614006392657e-05, - "loss": 0.7599, + "epoch": 4.104323385963462, + "grad_norm": 8.396429061889648, + "learning_rate": 4.593213623499196e-05, + "loss": 0.6781, "step": 297900 }, { - "epoch": 3.04, - "learning_rate": 5.56579638013925e-05, - "loss": 0.8473, + "epoch": 4.105701138023202, + "grad_norm": 6.4494147300720215, + "learning_rate": 4.5926186977879415e-05, + "loss": 0.6052, "step": 298000 }, { - "epoch": 3.04, - "learning_rate": 5.5654525708998e-05, - "loss": 0.7965, + "epoch": 4.10707889008294, + "grad_norm": 4.370274543762207, + "learning_rate": 4.5920236155368e-05, + "loss": 0.6555, "step": 298100 }, { - "epoch": 3.04, - "learning_rate": 5.565108636225029e-05, - "loss": 0.7055, + "epoch": 4.10845664214268, + "grad_norm": 5.55239200592041, + "learning_rate": 4.59142837679633e-05, + "loss": 0.6822, "step": 298200 }, { - "epoch": 3.04, - "learning_rate": 5.564764576131753e-05, - "loss": 0.9232, + "epoch": 4.109834394202419, + "grad_norm": 3.1298775672912598, + "learning_rate": 4.590832981617098e-05, + "loss": 0.6552, "step": 298300 }, { - "epoch": 3.04, - "learning_rate": 5.564420390636796e-05, - "loss": 0.8211, + "epoch": 4.111212146262159, + "grad_norm": 10.128996849060059, + "learning_rate": 4.590237430049687e-05, + "loss": 0.666, "step": 298400 }, { - "epoch": 3.04, - "learning_rate": 5.564076079756984e-05, - "loss": 0.804, + "epoch": 4.112589898321898, + "grad_norm": 5.206035614013672, + "learning_rate": 4.589641722144691e-05, + "loss": 0.6689, "step": 298500 }, { - "epoch": 3.04, - "learning_rate": 5.563731643509155e-05, - "loss": 0.8233, + "epoch": 4.113967650381637, + "grad_norm": 5.965327739715576, + "learning_rate": 4.5890518173680934e-05, + "loss": 0.7026, "step": 298600 }, { - "epoch": 3.04, - "learning_rate": 5.5633870819101474e-05, - "loss": 0.8289, + "epoch": 4.115345402441377, + "grad_norm": 3.657636880874634, + "learning_rate": 4.58845579850188e-05, + "loss": 0.5738, "step": 298700 }, { - "epoch": 3.04, - "learning_rate": 5.563042394976808e-05, - "loss": 0.8055, + "epoch": 4.116723154501116, + "grad_norm": 7.645824432373047, + "learning_rate": 4.5878596234494424e-05, + "loss": 0.7064, "step": 298800 }, { - "epoch": 3.05, - "learning_rate": 5.562697582725992e-05, - "loss": 0.69, + "epoch": 4.118100906560855, + "grad_norm": 3.154310703277588, + "learning_rate": 4.587263292261427e-05, + "loss": 0.6868, "step": 298900 }, { - "epoch": 3.05, - "learning_rate": 5.562352645174558e-05, - "loss": 0.8177, + "epoch": 4.1194786586205945, + "grad_norm": 14.839678764343262, + "learning_rate": 4.586666804988495e-05, + "loss": 0.6765, "step": 299000 }, { - "epoch": 3.05, - "learning_rate": 5.56200758233937e-05, - "loss": 0.8271, + "epoch": 4.120856410680334, + "grad_norm": 3.0030577182769775, + "learning_rate": 4.586070161681322e-05, + "loss": 0.6241, "step": 299100 }, { - "epoch": 3.05, - "learning_rate": 5.561662394237301e-05, - "loss": 0.7222, + "epoch": 4.122234162740074, + "grad_norm": 6.3814697265625, + "learning_rate": 4.585473362390595e-05, + "loss": 0.7379, "step": 299200 }, { - "epoch": 3.05, - "learning_rate": 5.561317080885228e-05, - "loss": 0.841, + "epoch": 4.123611914799812, + "grad_norm": 2.8625292778015137, + "learning_rate": 4.5848764071670163e-05, + "loss": 0.6466, "step": 299300 }, { - "epoch": 3.05, - "learning_rate": 5.560971642300035e-05, - "loss": 0.8305, + "epoch": 4.124989666859552, + "grad_norm": 4.646816253662109, + "learning_rate": 4.5842792960613e-05, + "loss": 0.702, "step": 299400 }, { - "epoch": 3.05, - "learning_rate": 5.560626078498612e-05, - "loss": 0.8354, + "epoch": 4.126367418919291, + "grad_norm": 2.1967198848724365, + "learning_rate": 4.5836820291241724e-05, + "loss": 0.6001, "step": 299500 }, { - "epoch": 3.05, - "learning_rate": 5.560280389497854e-05, - "loss": 0.8884, + "epoch": 4.127745170979031, + "grad_norm": 3.3459267616271973, + "learning_rate": 4.583084606406376e-05, + "loss": 0.6948, "step": 299600 }, { - "epoch": 3.05, - "learning_rate": 5.559934575314665e-05, - "loss": 0.7171, + "epoch": 4.12912292303877, + "grad_norm": 6.72745943069458, + "learning_rate": 4.582487027958664e-05, + "loss": 0.6781, "step": 299700 }, { - "epoch": 3.05, - "learning_rate": 5.559592095978952e-05, - "loss": 0.8061, + "epoch": 4.130500675098509, + "grad_norm": 4.786750793457031, + "learning_rate": 4.5818892938318034e-05, + "loss": 0.6623, "step": 299800 }, { - "epoch": 3.06, - "learning_rate": 5.5592460327330316e-05, - "loss": 0.8752, + "epoch": 4.131878427158249, + "grad_norm": 4.595633029937744, + "learning_rate": 4.5812914040765766e-05, + "loss": 0.6556, "step": 299900 }, { - "epoch": 3.06, - "learning_rate": 5.5588998443552523e-05, - "loss": 0.6619, + "epoch": 4.133256179217988, + "grad_norm": 11.3757963180542, + "learning_rate": 4.580693358743776e-05, + "loss": 0.6595, + "step": 300000 + }, + { + "epoch": 4.133256179217988, + "eval_accuracy": 0.8888380360022988, + "eval_cer": 0.12308031299907674, + "eval_loss": 0.6750513911247253, + "eval_runtime": 10417.9033, + "eval_samples_per_second": 5.178, + "eval_steps_per_second": 0.324, + "eval_wer": 0.22392130254803114, "step": 300000 }, { - "epoch": 3.06, - "learning_rate": 5.558553530862541e-05, - "loss": 0.7897, + "epoch": 4.134633931277727, + "grad_norm": 3.223041296005249, + "learning_rate": 4.580095157884208e-05, + "loss": 0.6499, "step": 300100 }, { - "epoch": 3.06, - "learning_rate": 5.558207092271832e-05, - "loss": 0.775, + "epoch": 4.1360116833374665, + "grad_norm": 8.322097778320312, + "learning_rate": 4.579496801548694e-05, + "loss": 0.6594, "step": 300200 }, { - "epoch": 3.06, - "learning_rate": 5.5578605286000624e-05, - "loss": 0.9678, + "epoch": 4.137389435397206, + "grad_norm": 4.080111503601074, + "learning_rate": 4.5788982897880676e-05, + "loss": 0.6304, "step": 300300 }, { - "epoch": 3.06, - "learning_rate": 5.557513839864178e-05, - "loss": 0.7759, + "epoch": 4.138767187456946, + "grad_norm": 4.8094353675842285, + "learning_rate": 4.578299622653174e-05, + "loss": 0.6695, "step": 300400 }, { - "epoch": 3.06, - "learning_rate": 5.5571670260811283e-05, - "loss": 0.8929, + "epoch": 4.140144939516684, + "grad_norm": 61.309207916259766, + "learning_rate": 4.5777008001948746e-05, + "loss": 0.6342, "step": 300500 }, { - "epoch": 3.06, - "learning_rate": 5.5568200872678714e-05, - "loss": 0.7326, + "epoch": 4.141522691576424, + "grad_norm": 3.9324393272399902, + "learning_rate": 4.5771078130097816e-05, + "loss": 0.677, "step": 300600 }, { - "epoch": 3.06, - "learning_rate": 5.556473023441371e-05, - "loss": 0.8083, + "epoch": 4.142900443636163, + "grad_norm": 6.68555212020874, + "learning_rate": 4.576508681609266e-05, + "loss": 0.7262, "step": 300700 }, { - "epoch": 3.06, - "learning_rate": 5.5561258346185955e-05, - "loss": 0.7892, + "epoch": 4.144278195695903, + "grad_norm": 14.441776275634766, + "learning_rate": 4.575909395037494e-05, + "loss": 0.7022, "step": 300800 }, { - "epoch": 3.07, - "learning_rate": 5.5557785208165215e-05, - "loss": 0.9065, + "epoch": 4.145655947755642, + "grad_norm": 3.4261410236358643, + "learning_rate": 4.5753099533453766e-05, + "loss": 0.6292, "step": 300900 }, { - "epoch": 3.07, - "learning_rate": 5.5554310820521295e-05, - "loss": 0.8277, + "epoch": 4.147033699815381, + "grad_norm": 1.409772276878357, + "learning_rate": 4.574716353318883e-05, + "loss": 0.713, "step": 301000 }, { - "epoch": 3.07, - "learning_rate": 5.555083518342407e-05, - "loss": 0.9168, + "epoch": 4.148411451875121, + "grad_norm": 12.364428520202637, + "learning_rate": 4.5741166030887994e-05, + "loss": 0.5997, "step": 301100 }, { - "epoch": 3.07, - "learning_rate": 5.554735829704349e-05, - "loss": 0.7899, + "epoch": 4.14978920393486, + "grad_norm": 4.517096519470215, + "learning_rate": 4.5735166978906784e-05, + "loss": 0.7972, "step": 301200 }, { - "epoch": 3.07, - "learning_rate": 5.554388016154955e-05, - "loss": 0.8155, + "epoch": 4.151166955994599, + "grad_norm": 14.583377838134766, + "learning_rate": 4.572916637775486e-05, + "loss": 0.7412, "step": 301300 }, { - "epoch": 3.07, - "learning_rate": 5.554040077711231e-05, - "loss": 0.6979, + "epoch": 4.1525447080543385, + "grad_norm": 24.385231018066406, + "learning_rate": 4.5723164227941985e-05, + "loss": 0.6874, "step": 301400 }, { - "epoch": 3.07, - "learning_rate": 5.553692014390189e-05, - "loss": 0.9031, + "epoch": 4.153922460114078, + "grad_norm": 9.408326148986816, + "learning_rate": 4.571716052997809e-05, + "loss": 0.7456, "step": 301500 }, { - "epoch": 3.07, - "learning_rate": 5.553343826208847e-05, - "loss": 0.6888, + "epoch": 4.155300212173818, + "grad_norm": 37.284271240234375, + "learning_rate": 4.5711155284373186e-05, + "loss": 0.705, "step": 301600 }, { - "epoch": 3.07, - "learning_rate": 5.552995513184229e-05, - "loss": 0.868, + "epoch": 4.156677964233556, + "grad_norm": 4.45246696472168, + "learning_rate": 4.570514849163749e-05, + "loss": 0.7105, "step": 301700 }, { - "epoch": 3.07, - "learning_rate": 5.552647075333366e-05, - "loss": 0.7612, + "epoch": 4.158055716293296, + "grad_norm": 5.8539958000183105, + "learning_rate": 4.569914015228129e-05, + "loss": 0.6115, "step": 301800 }, { - "epoch": 3.08, - "learning_rate": 5.552298512673294e-05, - "loss": 0.8802, + "epoch": 4.159433468353035, + "grad_norm": 2.451735734939575, + "learning_rate": 4.569313026681503e-05, + "loss": 0.6619, "step": 301900 }, { - "epoch": 3.08, - "learning_rate": 5.551949825221056e-05, - "loss": 0.7936, + "epoch": 4.160811220412775, + "grad_norm": 4.320910930633545, + "learning_rate": 4.568711883574927e-05, + "loss": 0.7227, "step": 302000 }, { - "epoch": 3.08, - "learning_rate": 5.551601012993702e-05, - "loss": 0.8174, + "epoch": 4.162188972472514, + "grad_norm": 516.494384765625, + "learning_rate": 4.568110585959473e-05, + "loss": 0.6722, "step": 302100 }, { - "epoch": 3.08, - "learning_rate": 5.551252076008284e-05, - "loss": 0.7172, + "epoch": 4.163566724532253, + "grad_norm": 6.573520183563232, + "learning_rate": 4.567509133886223e-05, + "loss": 0.7271, "step": 302200 }, { - "epoch": 3.08, - "learning_rate": 5.550903014281866e-05, - "loss": 0.8194, + "epoch": 4.164944476591993, + "grad_norm": 8.189583778381348, + "learning_rate": 4.5669075274062726e-05, + "loss": 0.6514, "step": 302300 }, { - "epoch": 3.08, - "learning_rate": 5.5505538278315134e-05, - "loss": 0.7994, + "epoch": 4.166322228651731, + "grad_norm": 10.18826675415039, + "learning_rate": 4.5663057665707346e-05, + "loss": 0.6119, "step": 302400 }, { - "epoch": 3.08, - "learning_rate": 5.550204516674299e-05, - "loss": 0.9413, + "epoch": 4.167699980711471, + "grad_norm": 7.887221336364746, + "learning_rate": 4.565703851430728e-05, + "loss": 0.685, "step": 302500 }, { - "epoch": 3.08, - "learning_rate": 5.549855080827304e-05, - "loss": 0.7975, + "epoch": 4.1690777327712105, + "grad_norm": 6.623042583465576, + "learning_rate": 4.565101782037391e-05, + "loss": 0.594, "step": 302600 }, { - "epoch": 3.08, - "learning_rate": 5.54950552030761e-05, - "loss": 0.8288, + "epoch": 4.17045548483095, + "grad_norm": 5.593002796173096, + "learning_rate": 4.564499558441871e-05, + "loss": 0.5991, "step": 302700 }, { - "epoch": 3.08, - "learning_rate": 5.5491558351323125e-05, - "loss": 0.7787, + "epoch": 4.171833236890689, + "grad_norm": 5.588982582092285, + "learning_rate": 4.563897180695331e-05, + "loss": 0.5978, "step": 302800 }, { - "epoch": 3.09, - "learning_rate": 5.548809524033548e-05, - "loss": 0.7594, + "epoch": 4.173210988950428, + "grad_norm": 12.168755531311035, + "learning_rate": 4.563294648848946e-05, + "loss": 0.5849, "step": 302900 }, { - "epoch": 3.09, - "learning_rate": 5.548459590844467e-05, - "loss": 0.7344, + "epoch": 4.174588741010168, + "grad_norm": 6.639806747436523, + "learning_rate": 4.562691962953903e-05, + "loss": 0.5988, "step": 303000 }, { - "epoch": 3.09, - "learning_rate": 5.548109533050921e-05, - "loss": 0.8782, + "epoch": 4.175966493069907, + "grad_norm": 11.007120132446289, + "learning_rate": 4.562089123061404e-05, + "loss": 0.6065, "step": 303100 }, { - "epoch": 3.09, - "learning_rate": 5.547759350670025e-05, - "loss": 0.8046, + "epoch": 4.177344245129646, + "grad_norm": 5.840813159942627, + "learning_rate": 4.5614921599229176e-05, + "loss": 0.6902, "step": 303200 }, { - "epoch": 3.09, - "learning_rate": 5.547409043718901e-05, - "loss": 0.8336, + "epoch": 4.178721997189386, + "grad_norm": 9.484322547912598, + "learning_rate": 4.56088901372786e-05, + "loss": 0.6262, "step": 303300 }, { - "epoch": 3.09, - "learning_rate": 5.5470586122146764e-05, - "loss": 0.8667, + "epoch": 4.180099749249125, + "grad_norm": 13.334750175476074, + "learning_rate": 4.560285713688516e-05, + "loss": 0.6358, "step": 303400 }, { - "epoch": 3.09, - "learning_rate": 5.5467080561744866e-05, - "loss": 0.7842, + "epoch": 4.181477501308865, + "grad_norm": 7.545956134796143, + "learning_rate": 4.559682259856139e-05, + "loss": 0.6416, "step": 303500 }, { - "epoch": 3.09, - "learning_rate": 5.5463573756154696e-05, - "loss": 0.9258, + "epoch": 4.182855253368603, + "grad_norm": 11.94626522064209, + "learning_rate": 4.559078652281996e-05, + "loss": 0.6969, "step": 303600 }, { - "epoch": 3.09, - "learning_rate": 5.546010079221607e-05, - "loss": 0.8986, + "epoch": 4.184233005428343, + "grad_norm": 4.082379341125488, + "learning_rate": 4.558474891017367e-05, + "loss": 0.6951, "step": 303700 }, { - "epoch": 3.1, - "learning_rate": 5.545659150921144e-05, - "loss": 0.7306, + "epoch": 4.1856107574880825, + "grad_norm": 11.176995277404785, + "learning_rate": 4.557870976113543e-05, + "loss": 0.7071, "step": 303800 }, { - "epoch": 3.1, - "learning_rate": 5.54530809815314e-05, - "loss": 0.8076, + "epoch": 4.186988509547822, + "grad_norm": 10.222467422485352, + "learning_rate": 4.557266907621831e-05, + "loss": 0.7676, "step": 303900 }, { - "epoch": 3.1, - "learning_rate": 5.544960433322916e-05, - "loss": 0.7621, + "epoch": 4.188366261607561, + "grad_norm": 8.333457946777344, + "learning_rate": 4.556662685593549e-05, + "loss": 0.6495, "step": 304000 }, { - "epoch": 3.1, - "learning_rate": 5.544609132915577e-05, - "loss": 0.8193, + "epoch": 4.1897440136673, + "grad_norm": 15.739195823669434, + "learning_rate": 4.5560583100800294e-05, + "loss": 0.6683, "step": 304100 }, { - "epoch": 3.1, - "learning_rate": 5.544257708092036e-05, - "loss": 0.8694, + "epoch": 4.19112176572704, + "grad_norm": 6.781731128692627, + "learning_rate": 4.555453781132616e-05, + "loss": 0.621, "step": 304200 }, { - "epoch": 3.1, - "learning_rate": 5.543906158869477e-05, - "loss": 0.8475, + "epoch": 4.192499517786779, + "grad_norm": 47.04030227661133, + "learning_rate": 4.554849098802668e-05, + "loss": 0.6846, "step": 304300 }, { - "epoch": 3.1, - "learning_rate": 5.543554485265088e-05, - "loss": 0.887, + "epoch": 4.193877269846518, + "grad_norm": 4.148467063903809, + "learning_rate": 4.5542442631415555e-05, + "loss": 0.6307, "step": 304400 }, { - "epoch": 3.1, - "learning_rate": 5.543202687296064e-05, - "loss": 0.8971, + "epoch": 4.195255021906258, + "grad_norm": 5.733232498168945, + "learning_rate": 4.553639274200662e-05, + "loss": 0.7009, "step": 304500 }, { - "epoch": 3.1, - "learning_rate": 5.542850764979605e-05, - "loss": 0.7923, + "epoch": 4.196632773965997, + "grad_norm": 5.764578819274902, + "learning_rate": 4.553034132031384e-05, + "loss": 0.7528, "step": 304600 }, { - "epoch": 3.1, - "learning_rate": 5.5424987183329185e-05, - "loss": 0.7649, + "epoch": 4.198010526025737, + "grad_norm": 24.287540435791016, + "learning_rate": 4.552434890396653e-05, + "loss": 0.7154, "step": 304700 }, { - "epoch": 3.11, - "learning_rate": 5.542146547373218e-05, - "loss": 0.778, + "epoch": 4.199388278085475, + "grad_norm": 7.267184257507324, + "learning_rate": 4.551829443455853e-05, + "loss": 0.6338, "step": 304800 }, { - "epoch": 3.11, - "learning_rate": 5.5417942521177216e-05, - "loss": 0.9036, + "epoch": 4.200766030145215, + "grad_norm": 11.063650131225586, + "learning_rate": 4.551223843440422e-05, + "loss": 0.7643, "step": 304900 }, { - "epoch": 3.11, - "learning_rate": 5.541441832583656e-05, - "loss": 0.7702, + "epoch": 4.2021437822049545, + "grad_norm": 6.391530513763428, + "learning_rate": 4.550618090401811e-05, + "loss": 0.6596, "step": 305000 }, { - "epoch": 3.11, - "learning_rate": 5.54108928878825e-05, - "loss": 0.7857, + "epoch": 4.203521534264694, + "grad_norm": 19.334653854370117, + "learning_rate": 4.550012184391482e-05, + "loss": 0.594, "step": 305100 }, { - "epoch": 3.11, - "learning_rate": 5.540736620748743e-05, - "loss": 0.7958, + "epoch": 4.204899286324433, + "grad_norm": 16.1595516204834, + "learning_rate": 4.5494061254609094e-05, + "loss": 0.6308, "step": 305200 }, { - "epoch": 3.11, - "learning_rate": 5.540383828482377e-05, - "loss": 0.7862, + "epoch": 4.206277038384172, + "grad_norm": 2.512019157409668, + "learning_rate": 4.5487999136615795e-05, + "loss": 0.6856, "step": 305300 }, { - "epoch": 3.11, - "learning_rate": 5.540030912006403e-05, - "loss": 0.7545, + "epoch": 4.207654790443912, + "grad_norm": 11.41607666015625, + "learning_rate": 4.548193549044996e-05, + "loss": 0.6184, "step": 305400 }, { - "epoch": 3.11, - "learning_rate": 5.539677871338074e-05, - "loss": 0.6983, + "epoch": 4.209032542503651, + "grad_norm": 1.7704436779022217, + "learning_rate": 4.5475870316626697e-05, + "loss": 0.6761, "step": 305500 }, { - "epoch": 3.11, - "learning_rate": 5.539324706494654e-05, - "loss": 0.8891, + "epoch": 4.21041029456339, + "grad_norm": 4.644819736480713, + "learning_rate": 4.546980361566129e-05, + "loss": 0.7083, "step": 305600 }, { - "epoch": 3.11, - "learning_rate": 5.5389714174934104e-05, - "loss": 0.8237, + "epoch": 4.21178804662313, + "grad_norm": 6.914735794067383, + "learning_rate": 4.5463735388069126e-05, + "loss": 0.6939, "step": 305700 }, { - "epoch": 3.12, - "learning_rate": 5.5386180043516155e-05, - "loss": 0.7983, + "epoch": 4.213165798682869, + "grad_norm": 6.2788543701171875, + "learning_rate": 4.545766563436575e-05, + "loss": 0.6125, "step": 305800 }, { - "epoch": 3.12, - "learning_rate": 5.53826446708655e-05, - "loss": 0.8323, + "epoch": 4.214543550742609, + "grad_norm": 6.040369510650635, + "learning_rate": 4.545159435506681e-05, + "loss": 0.7533, "step": 305900 }, { - "epoch": 3.12, - "learning_rate": 5.537910805715499e-05, - "loss": 0.8797, + "epoch": 4.2159213028023474, + "grad_norm": 10.040986061096191, + "learning_rate": 4.544552155068808e-05, + "loss": 0.5517, "step": 306000 }, { - "epoch": 3.12, - "learning_rate": 5.537557020255755e-05, - "loss": 0.8403, + "epoch": 4.217299054862087, + "grad_norm": 13.150712966918945, + "learning_rate": 4.543944722174548e-05, + "loss": 0.7302, "step": 306100 }, { - "epoch": 3.12, - "learning_rate": 5.537203110724617e-05, - "loss": 0.8576, + "epoch": 4.218676806921827, + "grad_norm": 10.265639305114746, + "learning_rate": 4.5433371368755074e-05, + "loss": 0.7238, "step": 306200 }, { - "epoch": 3.12, - "learning_rate": 5.5368490771393884e-05, - "loss": 0.8132, + "epoch": 4.220054558981566, + "grad_norm": 2.7123641967773438, + "learning_rate": 4.5427293992233014e-05, + "loss": 0.7632, "step": 306300 }, { - "epoch": 3.12, - "learning_rate": 5.536494919517379e-05, - "loss": 0.7458, + "epoch": 4.221432311041305, + "grad_norm": 10.464102745056152, + "learning_rate": 4.542121509269562e-05, + "loss": 0.6421, "step": 306400 }, { - "epoch": 3.12, - "learning_rate": 5.536140637875904e-05, - "loss": 0.8461, + "epoch": 4.222810063101044, + "grad_norm": 12.108945846557617, + "learning_rate": 4.541513467065933e-05, + "loss": 0.6621, "step": 306500 }, { - "epoch": 3.12, - "learning_rate": 5.535786232232287e-05, - "loss": 0.8237, + "epoch": 4.224187815160784, + "grad_norm": 7.261825084686279, + "learning_rate": 4.5409113553612976e-05, + "loss": 0.6362, "step": 306600 }, { - "epoch": 3.12, - "learning_rate": 5.535431702603856e-05, - "loss": 0.8142, + "epoch": 4.225565567220523, + "grad_norm": 12.817927360534668, + "learning_rate": 4.5403030103340785e-05, + "loss": 0.6375, "step": 306700 }, { - "epoch": 3.13, - "learning_rate": 5.535077049007946e-05, - "loss": 0.8405, + "epoch": 4.226943319280262, + "grad_norm": 5.216375350952148, + "learning_rate": 4.539694513211459e-05, + "loss": 0.6797, "step": 306800 }, { - "epoch": 3.13, - "learning_rate": 5.534722271461896e-05, - "loss": 0.8549, + "epoch": 4.228321071340002, + "grad_norm": 30.395282745361328, + "learning_rate": 4.539085864045135e-05, + "loss": 0.6771, "step": 306900 }, { - "epoch": 3.13, - "learning_rate": 5.534367369983054e-05, - "loss": 0.7919, + "epoch": 4.229698823399741, + "grad_norm": 6.5052666664123535, + "learning_rate": 4.538477062886813e-05, + "loss": 0.6184, "step": 307000 }, { - "epoch": 3.13, - "learning_rate": 5.5340123445887724e-05, - "loss": 0.7916, + "epoch": 4.23107657545948, + "grad_norm": 4.310901165008545, + "learning_rate": 4.5378681097882146e-05, + "loss": 0.6836, "step": 307100 }, { - "epoch": 3.13, - "learning_rate": 5.533657195296408e-05, - "loss": 0.7415, + "epoch": 4.2324543275192195, + "grad_norm": 6.7607221603393555, + "learning_rate": 4.5372590048010735e-05, + "loss": 0.6863, "step": 307200 }, { - "epoch": 3.13, - "learning_rate": 5.533301922123328e-05, - "loss": 0.7761, + "epoch": 4.233832079578959, + "grad_norm": 5.41831636428833, + "learning_rate": 4.536649747977138e-05, + "loss": 0.6654, "step": 307300 }, { - "epoch": 3.13, - "learning_rate": 5.532946525086901e-05, - "loss": 0.8897, + "epoch": 4.235209831638699, + "grad_norm": 33.93351745605469, + "learning_rate": 4.5360403393681644e-05, + "loss": 0.6476, "step": 307400 }, { - "epoch": 3.13, - "learning_rate": 5.532591004204506e-05, - "loss": 0.8798, + "epoch": 4.236587583698437, + "grad_norm": 13.801878929138184, + "learning_rate": 4.5354307790259276e-05, + "loss": 0.7141, "step": 307500 }, { - "epoch": 3.13, - "learning_rate": 5.5322353594935236e-05, - "loss": 0.9649, + "epoch": 4.237965335758177, + "grad_norm": 3.7954368591308594, + "learning_rate": 4.534821067002212e-05, + "loss": 0.7271, "step": 307600 }, { - "epoch": 3.13, - "learning_rate": 5.531879590971344e-05, - "loss": 0.9134, + "epoch": 4.239343087817916, + "grad_norm": 3.5893633365631104, + "learning_rate": 4.534211203348815e-05, + "loss": 0.7161, "step": 307700 }, { - "epoch": 3.14, - "learning_rate": 5.531523698655362e-05, - "loss": 0.9157, + "epoch": 4.240720839877656, + "grad_norm": 3.2294416427612305, + "learning_rate": 4.533601188117549e-05, + "loss": 0.5693, "step": 307800 }, { - "epoch": 3.14, - "learning_rate": 5.531167682562979e-05, - "loss": 0.8242, + "epoch": 4.242098591937395, + "grad_norm": 2.4786124229431152, + "learning_rate": 4.5329910213602365e-05, + "loss": 0.6949, "step": 307900 }, { - "epoch": 3.14, - "learning_rate": 5.5308115427116e-05, - "loss": 0.7929, + "epoch": 4.243476343997134, + "grad_norm": 3.1852173805236816, + "learning_rate": 4.532380703128715e-05, + "loss": 0.5627, "step": 308000 }, { - "epoch": 3.14, - "learning_rate": 5.530455279118641e-05, - "loss": 0.8736, + "epoch": 4.244854096056874, + "grad_norm": 6.080512046813965, + "learning_rate": 4.531770233474835e-05, + "loss": 0.6595, "step": 308100 }, { - "epoch": 3.14, - "learning_rate": 5.530098891801519e-05, - "loss": 0.8234, + "epoch": 4.246231848116613, + "grad_norm": 3.594705820083618, + "learning_rate": 4.531159612450458e-05, + "loss": 0.6604, "step": 308200 }, { - "epoch": 3.14, - "learning_rate": 5.52974238077766e-05, - "loss": 0.7991, + "epoch": 4.247609600176352, + "grad_norm": 9.806014060974121, + "learning_rate": 4.530548840107458e-05, + "loss": 0.7253, "step": 308300 }, { - "epoch": 3.14, - "learning_rate": 5.529385746064496e-05, - "loss": 0.7409, + "epoch": 4.2489873522360915, + "grad_norm": 19.69601821899414, + "learning_rate": 4.529937916497727e-05, + "loss": 0.688, "step": 308400 }, { - "epoch": 3.14, - "learning_rate": 5.5290289876794634e-05, - "loss": 0.9241, + "epoch": 4.250365104295831, + "grad_norm": 9.94734001159668, + "learning_rate": 4.529326841673162e-05, + "loss": 0.6847, "step": 308500 }, { - "epoch": 3.14, - "learning_rate": 5.528672105640005e-05, - "loss": 0.8729, + "epoch": 4.251742856355571, + "grad_norm": 9.905821800231934, + "learning_rate": 4.5287156156856795e-05, + "loss": 0.6948, "step": 308600 }, { - "epoch": 3.15, - "learning_rate": 5.528315099963572e-05, - "loss": 0.8079, + "epoch": 4.253120608415309, + "grad_norm": 12.264094352722168, + "learning_rate": 4.528104238587206e-05, + "loss": 0.6775, "step": 308700 }, { - "epoch": 3.15, - "learning_rate": 5.5279579706676194e-05, - "loss": 0.8226, + "epoch": 4.254498360475049, + "grad_norm": 23.35835075378418, + "learning_rate": 4.527492710429681e-05, + "loss": 0.637, "step": 308800 }, { - "epoch": 3.15, - "learning_rate": 5.5276007177696064e-05, - "loss": 0.7893, + "epoch": 4.255876112534788, + "grad_norm": 4.416563034057617, + "learning_rate": 4.526881031265056e-05, + "loss": 0.6705, "step": 308900 }, { - "epoch": 3.15, - "learning_rate": 5.527243341287003e-05, - "loss": 0.9731, + "epoch": 4.257253864594528, + "grad_norm": 9.018585205078125, + "learning_rate": 4.526269201145298e-05, + "loss": 0.5894, "step": 309000 }, { - "epoch": 3.15, - "learning_rate": 5.5268858412372814e-05, - "loss": 0.692, + "epoch": 4.258631616654267, + "grad_norm": 7.559141635894775, + "learning_rate": 4.525657220122382e-05, + "loss": 0.7091, "step": 309100 }, { - "epoch": 3.15, - "learning_rate": 5.526528217637923e-05, - "loss": 0.837, + "epoch": 4.260009368714006, + "grad_norm": 4.722597122192383, + "learning_rate": 4.525045088248303e-05, + "loss": 0.6905, "step": 309200 }, { - "epoch": 3.15, - "learning_rate": 5.526170470506411e-05, - "loss": 0.791, + "epoch": 4.261387120773746, + "grad_norm": 20.97707176208496, + "learning_rate": 4.524432805575062e-05, + "loss": 0.6744, "step": 309300 }, { - "epoch": 3.15, - "learning_rate": 5.52581259986024e-05, - "loss": 0.8364, + "epoch": 4.262764872833485, + "grad_norm": 6.883830547332764, + "learning_rate": 4.5238203721546756e-05, + "loss": 0.612, "step": 309400 }, { - "epoch": 3.15, - "learning_rate": 5.525454605716905e-05, - "loss": 0.8224, + "epoch": 4.264142624893224, + "grad_norm": 4.53001594543457, + "learning_rate": 4.523207788039175e-05, + "loss": 0.7528, "step": 309500 }, { - "epoch": 3.15, - "learning_rate": 5.5250964880939105e-05, - "loss": 0.7625, + "epoch": 4.2655203769529635, + "grad_norm": 5.905134677886963, + "learning_rate": 4.5225950532806e-05, + "loss": 0.6995, "step": 309600 }, { - "epoch": 3.16, - "learning_rate": 5.524738247008768e-05, - "loss": 0.9721, + "epoch": 4.266898129012703, + "grad_norm": 21.644359588623047, + "learning_rate": 4.521982167931007e-05, + "loss": 0.5514, "step": 309700 }, { - "epoch": 3.16, - "learning_rate": 5.5243798824789905e-05, - "loss": 0.7669, + "epoch": 4.268275881072443, + "grad_norm": 4.295462131500244, + "learning_rate": 4.5213691320424635e-05, + "loss": 0.6912, "step": 309800 }, { - "epoch": 3.16, - "learning_rate": 5.524021394522102e-05, - "loss": 0.8734, + "epoch": 4.269653633132181, + "grad_norm": 7.276356220245361, + "learning_rate": 4.52075594566705e-05, + "loss": 0.6998, "step": 309900 }, { - "epoch": 3.16, - "learning_rate": 5.523662783155631e-05, - "loss": 0.8904, + "epoch": 4.271031385191921, + "grad_norm": 16.975078582763672, + "learning_rate": 4.52014260885686e-05, + "loss": 0.6877, "step": 310000 }, { - "epoch": 3.16, - "learning_rate": 5.523304048397109e-05, - "loss": 0.7375, + "epoch": 4.27240913725166, + "grad_norm": 7.268409252166748, + "learning_rate": 4.5195291216639985e-05, + "loss": 0.6977, "step": 310100 }, { - "epoch": 3.16, - "learning_rate": 5.5229451902640785e-05, - "loss": 0.8315, + "epoch": 4.273786889311399, + "grad_norm": 6.270153999328613, + "learning_rate": 4.518915484140586e-05, + "loss": 0.652, "step": 310200 }, { - "epoch": 3.16, - "learning_rate": 5.5225862087740835e-05, - "loss": 0.8048, + "epoch": 4.275164641371139, + "grad_norm": 1.7775713205337524, + "learning_rate": 4.5183016963387534e-05, + "loss": 0.5626, "step": 310300 }, { - "epoch": 3.16, - "learning_rate": 5.5222271039446774e-05, - "loss": 0.8243, + "epoch": 4.276542393430878, + "grad_norm": 11.436005592346191, + "learning_rate": 4.517687758310645e-05, + "loss": 0.5906, "step": 310400 }, { - "epoch": 3.16, - "learning_rate": 5.5218678757934175e-05, - "loss": 0.762, + "epoch": 4.277920145490618, + "grad_norm": 1.905484914779663, + "learning_rate": 4.517073670108418e-05, + "loss": 0.608, "step": 310500 }, { - "epoch": 3.16, - "learning_rate": 5.521508524337868e-05, - "loss": 0.7946, + "epoch": 4.279297897550357, + "grad_norm": 8.761491775512695, + "learning_rate": 4.5164594317842434e-05, + "loss": 0.6706, "step": 310600 }, { - "epoch": 3.17, - "learning_rate": 5.5211490495955996e-05, - "loss": 0.7114, + "epoch": 4.280675649610096, + "grad_norm": 3.2594447135925293, + "learning_rate": 4.5158450433903017e-05, + "loss": 0.6344, "step": 310700 }, { - "epoch": 3.17, - "learning_rate": 5.520789451584188e-05, - "loss": 0.7924, + "epoch": 4.2820534016698355, + "grad_norm": 32.87424850463867, + "learning_rate": 4.5152305049787904e-05, + "loss": 0.6822, "step": 310800 }, { - "epoch": 3.17, - "learning_rate": 5.5204297303212155e-05, - "loss": 0.8473, + "epoch": 4.283431153729575, + "grad_norm": 17.112659454345703, + "learning_rate": 4.514615816601916e-05, + "loss": 0.6699, "step": 310900 }, { - "epoch": 3.17, - "learning_rate": 5.52006988582427e-05, - "loss": 0.7552, + "epoch": 4.284808905789314, + "grad_norm": 581.0711669921875, + "learning_rate": 4.514000978311901e-05, + "loss": 0.681, "step": 311000 }, { - "epoch": 3.17, - "learning_rate": 5.5197099181109465e-05, - "loss": 0.7998, + "epoch": 4.286186657849053, + "grad_norm": 3.072092056274414, + "learning_rate": 4.513385990160977e-05, + "loss": 0.5739, "step": 311100 }, { - "epoch": 3.17, - "learning_rate": 5.5193498271988446e-05, - "loss": 0.8675, + "epoch": 4.287564409908793, + "grad_norm": 3.4997644424438477, + "learning_rate": 4.512770852201394e-05, + "loss": 0.684, "step": 311200 }, { - "epoch": 3.17, - "learning_rate": 5.518989613105571e-05, - "loss": 0.8509, + "epoch": 4.288942161968532, + "grad_norm": 2.147372007369995, + "learning_rate": 4.5121555644854066e-05, + "loss": 0.5565, "step": 311300 }, { - "epoch": 3.17, - "learning_rate": 5.518629275848738e-05, - "loss": 0.8878, + "epoch": 4.290319914028271, + "grad_norm": 9.841828346252441, + "learning_rate": 4.5115401270652906e-05, + "loss": 0.737, "step": 311400 }, { - "epoch": 3.17, - "learning_rate": 5.5182688154459626e-05, - "loss": 0.7066, + "epoch": 4.291697666088011, + "grad_norm": 1.6798075437545776, + "learning_rate": 4.5109245399933275e-05, + "loss": 0.5695, "step": 311500 }, { - "epoch": 3.17, - "learning_rate": 5.517908231914872e-05, - "loss": 0.8897, + "epoch": 4.29307541814775, + "grad_norm": 8.864051818847656, + "learning_rate": 4.5103088033218165e-05, + "loss": 0.644, "step": 311600 }, { - "epoch": 3.18, - "learning_rate": 5.517547525273094e-05, - "loss": 0.8146, + "epoch": 4.29445317020749, + "grad_norm": 5.417300701141357, + "learning_rate": 4.509692917103067e-05, + "loss": 0.6097, "step": 311700 }, { - "epoch": 3.18, - "learning_rate": 5.5171866955382664e-05, - "loss": 0.7305, + "epoch": 4.295830922267228, + "grad_norm": 24.710216522216797, + "learning_rate": 4.5090768813894014e-05, + "loss": 0.6593, "step": 311800 }, { - "epoch": 3.18, - "learning_rate": 5.5168257427280316e-05, - "loss": 0.8109, + "epoch": 4.297208674326968, + "grad_norm": 27.967021942138672, + "learning_rate": 4.5084606962331555e-05, + "loss": 0.645, "step": 311900 }, { - "epoch": 3.18, - "learning_rate": 5.5164682782277944e-05, - "loss": 0.7405, + "epoch": 4.2985864263867075, + "grad_norm": 23.30682373046875, + "learning_rate": 4.5078443616866777e-05, + "loss": 0.7004, "step": 312000 }, { - "epoch": 3.18, - "learning_rate": 5.516107080550008e-05, - "loss": 0.7427, + "epoch": 4.299964178446447, + "grad_norm": 1.977541446685791, + "learning_rate": 4.5072278778023274e-05, + "loss": 0.6215, "step": 312100 }, { - "epoch": 3.18, - "learning_rate": 5.515745759849602e-05, - "loss": 0.7829, + "epoch": 4.301341930506186, + "grad_norm": 5.870731353759766, + "learning_rate": 4.50661124463248e-05, + "loss": 0.6536, "step": 312200 }, { - "epoch": 3.18, - "learning_rate": 5.515384316144241e-05, - "loss": 0.6698, + "epoch": 4.302719682565925, + "grad_norm": 12.300503730773926, + "learning_rate": 4.50599446222952e-05, + "loss": 0.7482, "step": 312300 }, { - "epoch": 3.18, - "learning_rate": 5.515022749451599e-05, - "loss": 0.7891, + "epoch": 4.304097434625665, + "grad_norm": 108.59202575683594, + "learning_rate": 4.505377530645846e-05, + "loss": 0.685, "step": 312400 }, { - "epoch": 3.18, - "learning_rate": 5.514661059789352e-05, - "loss": 0.7424, + "epoch": 4.305475186685404, + "grad_norm": 145.99563598632812, + "learning_rate": 4.504760449933871e-05, + "loss": 0.6427, "step": 312500 }, { - "epoch": 3.18, - "learning_rate": 5.5142992471751864e-05, - "loss": 0.7787, + "epoch": 4.306852938745143, + "grad_norm": 4.786005020141602, + "learning_rate": 4.5041493931816513e-05, + "loss": 0.6589, "step": 312600 }, { - "epoch": 3.19, - "learning_rate": 5.513937311626792e-05, - "loss": 0.773, + "epoch": 4.308230690804883, + "grad_norm": 15.538369178771973, + "learning_rate": 4.503532015860333e-05, + "loss": 0.6376, "step": 312700 }, { - "epoch": 3.19, - "learning_rate": 5.513578874354893e-05, - "loss": 0.8729, + "epoch": 4.309608442864622, + "grad_norm": 6.881442546844482, + "learning_rate": 4.5029144895674986e-05, + "loss": 0.6426, "step": 312800 }, { - "epoch": 3.19, - "learning_rate": 5.513216694220037e-05, - "loss": 0.7631, + "epoch": 4.310986194924362, + "grad_norm": 7.188647270202637, + "learning_rate": 4.502296814355612e-05, + "loss": 0.7208, "step": 312900 }, { - "epoch": 3.19, - "learning_rate": 5.512854391203884e-05, - "loss": 0.815, + "epoch": 4.3123639469841, + "grad_norm": 7.574893951416016, + "learning_rate": 4.5016789902771444e-05, + "loss": 0.649, "step": 313000 }, { - "epoch": 3.19, - "learning_rate": 5.512491965324146e-05, - "loss": 0.6362, + "epoch": 4.31374169904384, + "grad_norm": 7.721707820892334, + "learning_rate": 4.501061017384586e-05, + "loss": 0.6867, "step": 313100 }, { - "epoch": 3.19, - "learning_rate": 5.512129416598545e-05, - "loss": 0.8626, + "epoch": 4.3151194511035795, + "grad_norm": 40.91524124145508, + "learning_rate": 4.500442895730436e-05, + "loss": 0.5987, "step": 313200 }, { - "epoch": 3.19, - "learning_rate": 5.5117667450448075e-05, - "loss": 0.6971, + "epoch": 4.316497203163319, + "grad_norm": 39.55015182495117, + "learning_rate": 4.499824625367206e-05, + "loss": 0.6682, "step": 313300 }, { - "epoch": 3.19, - "learning_rate": 5.511403950680666e-05, - "loss": 0.8308, + "epoch": 4.317874955223058, + "grad_norm": 53.41413116455078, + "learning_rate": 4.499206206347423e-05, + "loss": 0.6926, "step": 313400 }, { - "epoch": 3.19, - "learning_rate": 5.511041033523857e-05, - "loss": 0.7588, + "epoch": 4.319252707282797, + "grad_norm": 7.61053991317749, + "learning_rate": 4.498587638723623e-05, + "loss": 0.664, "step": 313500 }, { - "epoch": 3.2, - "learning_rate": 5.5106779935921284e-05, - "loss": 0.8053, + "epoch": 4.320630459342537, + "grad_norm": 7.264420032501221, + "learning_rate": 4.497968922548358e-05, + "loss": 0.7242, "step": 313600 }, { - "epoch": 3.2, - "learning_rate": 5.510314830903227e-05, - "loss": 0.7677, + "epoch": 4.322008211402276, + "grad_norm": 2.119528293609619, + "learning_rate": 4.49735005787419e-05, + "loss": 0.6099, "step": 313700 }, { - "epoch": 3.2, - "learning_rate": 5.5099515454749124e-05, - "loss": 0.7565, + "epoch": 4.323385963462015, + "grad_norm": 17.502540588378906, + "learning_rate": 4.496731044753696e-05, + "loss": 0.739, "step": 313800 }, { - "epoch": 3.2, - "learning_rate": 5.5095881373249456e-05, - "loss": 0.7855, + "epoch": 4.324763715521755, + "grad_norm": 4.526692867279053, + "learning_rate": 4.496111883239463e-05, + "loss": 0.6673, "step": 313900 }, { - "epoch": 3.2, - "learning_rate": 5.509224606471095e-05, - "loss": 0.8173, + "epoch": 4.326141467581494, + "grad_norm": 1.956165075302124, + "learning_rate": 4.495492573384092e-05, + "loss": 0.7291, "step": 314000 }, { - "epoch": 3.2, - "learning_rate": 5.508860952931136e-05, - "loss": 0.8468, + "epoch": 4.327519219641234, + "grad_norm": 10.1044340133667, + "learning_rate": 4.494873115240197e-05, + "loss": 0.7305, "step": 314100 }, { - "epoch": 3.2, - "learning_rate": 5.5084971767228484e-05, - "loss": 0.7788, + "epoch": 4.328896971700972, + "grad_norm": 9.427190780639648, + "learning_rate": 4.494253508860405e-05, + "loss": 0.6822, "step": 314200 }, { - "epoch": 3.2, - "learning_rate": 5.508133277864018e-05, - "loss": 0.7336, + "epoch": 4.330274723760712, + "grad_norm": 3.114659547805786, + "learning_rate": 4.493633754297354e-05, + "loss": 0.7219, "step": 314300 }, { - "epoch": 3.2, - "learning_rate": 5.507769256372439e-05, - "loss": 0.7865, + "epoch": 4.3316524758204515, + "grad_norm": 7.070925712585449, + "learning_rate": 4.493013851603694e-05, + "loss": 0.6383, "step": 314400 }, { - "epoch": 3.2, - "learning_rate": 5.507405112265909e-05, - "loss": 0.7773, + "epoch": 4.33303022788019, + "grad_norm": 4.399557113647461, + "learning_rate": 4.4923938008320915e-05, + "loss": 0.692, "step": 314500 }, { - "epoch": 3.21, - "learning_rate": 5.5070444888360655e-05, - "loss": 0.8586, + "epoch": 4.33440797993993, + "grad_norm": 5.275290489196777, + "learning_rate": 4.4917736020352207e-05, + "loss": 0.6595, "step": 314600 }, { - "epoch": 3.21, - "learning_rate": 5.5066801007787576e-05, - "loss": 0.7526, + "epoch": 4.335785731999669, + "grad_norm": 32.72828674316406, + "learning_rate": 4.491153255265772e-05, + "loss": 0.6366, "step": 314700 }, { - "epoch": 3.21, - "learning_rate": 5.506315590159752e-05, - "loss": 0.9598, + "epoch": 4.337163484059409, + "grad_norm": 11.588094711303711, + "learning_rate": 4.490532760576447e-05, + "loss": 0.6871, "step": 314800 }, { - "epoch": 3.21, - "learning_rate": 5.505950956996871e-05, - "loss": 0.8163, + "epoch": 4.338541236119148, + "grad_norm": 12.23223876953125, + "learning_rate": 4.489912118019958e-05, + "loss": 0.6105, "step": 314900 }, { - "epoch": 3.21, - "learning_rate": 5.505586201307942e-05, - "loss": 0.7115, + "epoch": 4.339918988178887, + "grad_norm": 7.590038776397705, + "learning_rate": 4.489291327649034e-05, + "loss": 0.6946, "step": 315000 }, { - "epoch": 3.21, - "learning_rate": 5.505221323110802e-05, - "loss": 0.7441, + "epoch": 4.341296740238627, + "grad_norm": 4.498335838317871, + "learning_rate": 4.488670389516414e-05, + "loss": 0.7135, "step": 315100 }, { - "epoch": 3.21, - "learning_rate": 5.504856322423288e-05, - "loss": 0.8354, + "epoch": 4.342674492298366, + "grad_norm": 5.7012038230896, + "learning_rate": 4.488049303674848e-05, + "loss": 0.7255, "step": 315200 }, { - "epoch": 3.21, - "learning_rate": 5.504491199263249e-05, - "loss": 0.8205, + "epoch": 4.344052244358105, + "grad_norm": 21.774002075195312, + "learning_rate": 4.487428070177104e-05, + "loss": 0.6117, "step": 315300 }, { - "epoch": 3.21, - "learning_rate": 5.5041259536485365e-05, - "loss": 0.8387, + "epoch": 4.345429996417844, + "grad_norm": 12.106348991394043, + "learning_rate": 4.486806689075955e-05, + "loss": 0.6473, "step": 315400 }, { - "epoch": 3.21, - "learning_rate": 5.5037605855970083e-05, - "loss": 0.8436, + "epoch": 4.346807748477584, + "grad_norm": 4.872758865356445, + "learning_rate": 4.486185160424194e-05, + "loss": 0.6807, "step": 315500 }, { - "epoch": 3.22, - "learning_rate": 5.503395095126529e-05, - "loss": 0.7858, + "epoch": 4.3481855005373236, + "grad_norm": 14.71099853515625, + "learning_rate": 4.4855634842746206e-05, + "loss": 0.6505, "step": 315600 }, { - "epoch": 3.22, - "learning_rate": 5.503029482254969e-05, - "loss": 0.8135, + "epoch": 4.349563252597062, + "grad_norm": 4.2744622230529785, + "learning_rate": 4.4849478796456743e-05, + "loss": 0.6094, "step": 315700 }, { - "epoch": 3.22, - "learning_rate": 5.5026637470002056e-05, - "loss": 0.8738, + "epoch": 4.350941004656802, + "grad_norm": 21.7128849029541, + "learning_rate": 4.4843259101325936e-05, + "loss": 0.7034, "step": 315800 }, { - "epoch": 3.22, - "learning_rate": 5.5022978893801185e-05, - "loss": 0.7975, + "epoch": 4.352318756716541, + "grad_norm": 5.750769138336182, + "learning_rate": 4.483703793279654e-05, + "loss": 0.6062, "step": 315900 }, { - "epoch": 3.22, - "learning_rate": 5.501931909412598e-05, - "loss": 0.8659, + "epoch": 4.353696508776281, + "grad_norm": 3.0670692920684814, + "learning_rate": 4.4830815291397086e-05, + "loss": 0.6707, "step": 316000 }, { - "epoch": 3.22, - "learning_rate": 5.501565807115538e-05, - "loss": 0.7053, + "epoch": 4.35507426083602, + "grad_norm": 6.325472354888916, + "learning_rate": 4.48245911776562e-05, + "loss": 0.6914, "step": 316100 }, { - "epoch": 3.22, - "learning_rate": 5.501199582506839e-05, - "loss": 0.8962, + "epoch": 4.356452012895759, + "grad_norm": 4.057070732116699, + "learning_rate": 4.481836559210266e-05, + "loss": 0.6692, "step": 316200 }, { - "epoch": 3.22, - "learning_rate": 5.500833235604407e-05, - "loss": 0.7892, + "epoch": 4.357829764955499, + "grad_norm": 7.541365623474121, + "learning_rate": 4.481213853526536e-05, + "loss": 0.6871, "step": 316300 }, { - "epoch": 3.22, - "learning_rate": 5.500470431723142e-05, - "loss": 0.7523, + "epoch": 4.359207517015238, + "grad_norm": 3.7444095611572266, + "learning_rate": 4.480591000767334e-05, + "loss": 0.6726, "step": 316400 }, { - "epoch": 3.22, - "learning_rate": 5.500103841509477e-05, - "loss": 0.9015, + "epoch": 4.360585269074977, + "grad_norm": 10.029513359069824, + "learning_rate": 4.479968000985572e-05, + "loss": 0.6633, "step": 316500 }, { - "epoch": 3.23, - "learning_rate": 5.499737129055652e-05, - "loss": 0.8482, + "epoch": 4.3619630211347165, + "grad_norm": 15.464344024658203, + "learning_rate": 4.4793448542341774e-05, + "loss": 0.6757, "step": 316600 }, { - "epoch": 3.23, - "learning_rate": 5.499370294379601e-05, - "loss": 0.8045, + "epoch": 4.363340773194456, + "grad_norm": 16.177799224853516, + "learning_rate": 4.4787215605660905e-05, + "loss": 0.5873, "step": 316700 }, { - "epoch": 3.23, - "learning_rate": 5.499003337499256e-05, - "loss": 0.738, + "epoch": 4.364718525254196, + "grad_norm": 104.21092224121094, + "learning_rate": 4.478098120034263e-05, + "loss": 0.7217, "step": 316800 }, { - "epoch": 3.23, - "learning_rate": 5.498636258432563e-05, - "loss": 0.7269, + "epoch": 4.366096277313934, + "grad_norm": 11.626893997192383, + "learning_rate": 4.477474532691659e-05, + "loss": 0.6055, "step": 316900 }, { - "epoch": 3.23, - "learning_rate": 5.498269057197467e-05, - "loss": 0.8382, + "epoch": 4.367474029373674, + "grad_norm": 5.727044105529785, + "learning_rate": 4.476850798591256e-05, + "loss": 0.6408, "step": 317000 }, { - "epoch": 3.23, - "learning_rate": 5.497901733811924e-05, - "loss": 0.871, + "epoch": 4.368851781433413, + "grad_norm": 19.25215721130371, + "learning_rate": 4.476226917786043e-05, + "loss": 0.6554, "step": 317100 }, { - "epoch": 3.23, - "learning_rate": 5.4975342882938915e-05, - "loss": 0.8269, + "epoch": 4.370229533493153, + "grad_norm": 33.02938461303711, + "learning_rate": 4.475602890329022e-05, + "loss": 0.6764, "step": 317200 }, { - "epoch": 3.23, - "learning_rate": 5.4971667206613376e-05, - "loss": 0.8114, + "epoch": 4.371607285552892, + "grad_norm": 7.360673904418945, + "learning_rate": 4.474978716273207e-05, + "loss": 0.6612, "step": 317300 }, { - "epoch": 3.23, - "learning_rate": 5.496799030932233e-05, - "loss": 0.7869, + "epoch": 4.372985037612631, + "grad_norm": 30.094141006469727, + "learning_rate": 4.474354395671626e-05, + "loss": 0.6373, "step": 317400 }, { - "epoch": 3.23, - "learning_rate": 5.4964312191245556e-05, - "loss": 0.7814, + "epoch": 4.374362789672371, + "grad_norm": 6.94580602645874, + "learning_rate": 4.4737299285773175e-05, + "loss": 0.7158, "step": 317500 }, { - "epoch": 3.24, - "learning_rate": 5.4960632852562905e-05, - "loss": 0.7729, + "epoch": 4.37574054173211, + "grad_norm": 3.3117239475250244, + "learning_rate": 4.473105315043332e-05, + "loss": 0.6187, "step": 317600 }, { - "epoch": 3.24, - "learning_rate": 5.4956952293454264e-05, - "loss": 0.8393, + "epoch": 4.377118293791849, + "grad_norm": 24.79054069519043, + "learning_rate": 4.472480555122735e-05, + "loss": 0.6927, "step": 317700 }, { - "epoch": 3.24, - "learning_rate": 5.4953270514099586e-05, - "loss": 0.8461, + "epoch": 4.3784960458515885, + "grad_norm": 3.6354496479034424, + "learning_rate": 4.471855648868603e-05, + "loss": 0.7071, "step": 317800 }, { - "epoch": 3.24, - "learning_rate": 5.4949587514678894e-05, - "loss": 0.8219, + "epoch": 4.379873797911328, + "grad_norm": 10.32691478729248, + "learning_rate": 4.471230596334024e-05, + "loss": 0.7083, "step": 317900 }, { - "epoch": 3.24, - "learning_rate": 5.494590329537227e-05, - "loss": 0.9927, + "epoch": 4.381251549971068, + "grad_norm": 55.8413200378418, + "learning_rate": 4.470605397572101e-05, + "loss": 0.7576, "step": 318000 }, { - "epoch": 3.24, - "learning_rate": 5.4942217856359834e-05, - "loss": 0.7248, + "epoch": 4.382629302030806, + "grad_norm": 5.073696613311768, + "learning_rate": 4.469980052635946e-05, + "loss": 0.6505, "step": 318100 }, { - "epoch": 3.24, - "learning_rate": 5.4938531197821794e-05, - "loss": 0.7973, + "epoch": 4.384007054090546, + "grad_norm": 2.443202257156372, + "learning_rate": 4.4693545615786866e-05, + "loss": 0.5787, "step": 318200 }, { - "epoch": 3.24, - "learning_rate": 5.4934843319938404e-05, - "loss": 0.8014, + "epoch": 4.385384806150285, + "grad_norm": 16.05838966369629, + "learning_rate": 4.468728924453461e-05, + "loss": 0.6481, "step": 318300 }, { - "epoch": 3.24, - "learning_rate": 5.4931154222889985e-05, - "loss": 0.7499, + "epoch": 4.386762558210025, + "grad_norm": 4.32959508895874, + "learning_rate": 4.468103141313421e-05, + "loss": 0.7195, "step": 318400 }, { - "epoch": 3.24, - "learning_rate": 5.4927463906856896e-05, - "loss": 0.7882, + "epoch": 4.388140310269764, + "grad_norm": 1.2014583349227905, + "learning_rate": 4.467477212211728e-05, + "loss": 0.6952, "step": 318500 }, { - "epoch": 3.25, - "learning_rate": 5.492377237201959e-05, - "loss": 0.7338, + "epoch": 4.389518062329503, + "grad_norm": 26.61128044128418, + "learning_rate": 4.46685113720156e-05, + "loss": 0.758, "step": 318600 }, { - "epoch": 3.25, - "learning_rate": 5.4920079618558555e-05, - "loss": 0.7708, + "epoch": 4.390895814389243, + "grad_norm": 7.952206134796143, + "learning_rate": 4.466231179266568e-05, + "loss": 0.6511, "step": 318700 }, { - "epoch": 3.25, - "learning_rate": 5.4916385646654334e-05, - "loss": 0.6926, + "epoch": 4.392273566448981, + "grad_norm": 23.078968048095703, + "learning_rate": 4.4656048140567834e-05, + "loss": 0.6312, "step": 318800 }, { - "epoch": 3.25, - "learning_rate": 5.491269045648755e-05, - "loss": 0.8163, + "epoch": 4.393651318508721, + "grad_norm": 8.388651847839355, + "learning_rate": 4.464978303097593e-05, + "loss": 0.6318, "step": 318900 }, { - "epoch": 3.25, - "learning_rate": 5.4908994048238875e-05, - "loss": 0.7346, + "epoch": 4.3950290705684605, + "grad_norm": 1.8579869270324707, + "learning_rate": 4.464351646442222e-05, + "loss": 0.687, "step": 319000 }, { - "epoch": 3.25, - "learning_rate": 5.490529642208905e-05, - "loss": 0.7821, + "epoch": 4.3964068226282, + "grad_norm": 7.543272495269775, + "learning_rate": 4.463724844143907e-05, + "loss": 0.689, "step": 319100 }, { - "epoch": 3.25, - "learning_rate": 5.490159757821884e-05, - "loss": 0.7507, + "epoch": 4.39778457468794, + "grad_norm": 8.869715690612793, + "learning_rate": 4.4630978962559005e-05, + "loss": 0.7199, "step": 319200 }, { - "epoch": 3.25, - "learning_rate": 5.4897897516809116e-05, - "loss": 0.7696, + "epoch": 4.399162326747678, + "grad_norm": 70.35514831542969, + "learning_rate": 4.462470802831463e-05, + "loss": 0.7331, "step": 319300 }, { - "epoch": 3.25, - "learning_rate": 5.48941962380408e-05, - "loss": 0.7881, + "epoch": 4.400540078807418, + "grad_norm": 13.741540908813477, + "learning_rate": 4.46184356392387e-05, + "loss": 0.6418, "step": 319400 }, { - "epoch": 3.26, - "learning_rate": 5.489049374209483e-05, - "loss": 0.7664, + "epoch": 4.401917830867157, + "grad_norm": 3.9362874031066895, + "learning_rate": 4.461216179586408e-05, + "loss": 0.5425, "step": 319500 }, { - "epoch": 3.26, - "learning_rate": 5.488679002915226e-05, - "loss": 0.8108, + "epoch": 4.403295582926896, + "grad_norm": 5.519320964813232, + "learning_rate": 4.460588649872377e-05, + "loss": 0.6122, "step": 319600 }, { - "epoch": 3.26, - "learning_rate": 5.488308509939418e-05, - "loss": 0.8469, + "epoch": 4.404673334986636, + "grad_norm": 12.589254379272461, + "learning_rate": 4.4599609748350895e-05, + "loss": 0.6953, "step": 319700 }, { - "epoch": 3.26, - "learning_rate": 5.487937895300174e-05, - "loss": 0.7825, + "epoch": 4.406051087046375, + "grad_norm": 3.9567782878875732, + "learning_rate": 4.45933315452787e-05, + "loss": 0.6358, "step": 319800 }, { - "epoch": 3.26, - "learning_rate": 5.4875671590156126e-05, - "loss": 0.8389, + "epoch": 4.407428839106115, + "grad_norm": 12.955341339111328, + "learning_rate": 4.4587051890040515e-05, + "loss": 0.6723, "step": 319900 }, { - "epoch": 3.26, - "learning_rate": 5.487196301103863e-05, - "loss": 0.9161, + "epoch": 4.408806591165853, + "grad_norm": 9.158559799194336, + "learning_rate": 4.458077078316988e-05, + "loss": 0.6046, "step": 320000 }, { - "epoch": 3.26, - "learning_rate": 5.48682903198017e-05, - "loss": 0.9238, + "epoch": 4.410184343225593, + "grad_norm": 9.448963165283203, + "learning_rate": 4.4574488225200364e-05, + "loss": 0.6838, "step": 320100 }, { - "epoch": 3.26, - "learning_rate": 5.4864579320842656e-05, - "loss": 0.7843, + "epoch": 4.4115620952853325, + "grad_norm": 18.403839111328125, + "learning_rate": 4.456820421666573e-05, + "loss": 0.6272, "step": 320200 }, { - "epoch": 3.26, - "learning_rate": 5.486086710615407e-05, - "loss": 0.7142, + "epoch": 4.412939847345072, + "grad_norm": 3.124269723892212, + "learning_rate": 4.4561918758099835e-05, + "loss": 0.5848, "step": 320300 }, { - "epoch": 3.26, - "learning_rate": 5.4857153675917444e-05, - "loss": 0.8741, + "epoch": 4.414317599404811, + "grad_norm": 4.586199760437012, + "learning_rate": 4.455563185003664e-05, + "loss": 0.5837, "step": 320400 }, { - "epoch": 3.27, - "learning_rate": 5.4853439030314346e-05, - "loss": 0.7631, + "epoch": 4.41569535146455, + "grad_norm": 7.89439058303833, + "learning_rate": 4.454934349301026e-05, + "loss": 0.6138, "step": 320500 }, { - "epoch": 3.27, - "learning_rate": 5.484972316952639e-05, - "loss": 0.7563, + "epoch": 4.41707310352429, + "grad_norm": 9.500040054321289, + "learning_rate": 4.454305368755494e-05, + "loss": 0.5725, "step": 320600 }, { - "epoch": 3.27, - "learning_rate": 5.4846006093735274e-05, - "loss": 0.8291, + "epoch": 4.418450855584029, + "grad_norm": 11.009655952453613, + "learning_rate": 4.453676243420501e-05, + "loss": 0.6691, "step": 320700 }, { - "epoch": 3.27, - "learning_rate": 5.484228780312272e-05, - "loss": 0.8175, + "epoch": 4.419828607643768, + "grad_norm": 40.77465057373047, + "learning_rate": 4.4530469733494955e-05, + "loss": 0.7356, "step": 320800 }, { - "epoch": 3.27, - "learning_rate": 5.4838568297870553e-05, - "loss": 0.7718, + "epoch": 4.421206359703508, + "grad_norm": 2.06315016746521, + "learning_rate": 4.4524238534594755e-05, + "loss": 0.5915, "step": 320900 }, { - "epoch": 3.27, - "learning_rate": 5.483484757816063e-05, - "loss": 0.7971, + "epoch": 4.422584111763247, + "grad_norm": 4.759917259216309, + "learning_rate": 4.451794295522862e-05, + "loss": 0.6672, "step": 321000 }, { - "epoch": 3.27, - "learning_rate": 5.483112564417486e-05, - "loss": 0.7681, + "epoch": 4.423961863822987, + "grad_norm": 11.225339889526367, + "learning_rate": 4.451164593010117e-05, + "loss": 0.6679, "step": 321100 }, { - "epoch": 3.27, - "learning_rate": 5.4827402496095225e-05, - "loss": 0.8042, + "epoch": 4.425339615882725, + "grad_norm": 7.234725475311279, + "learning_rate": 4.450534745974736e-05, + "loss": 0.6958, "step": 321200 }, { - "epoch": 3.27, - "learning_rate": 5.4823678134103776e-05, - "loss": 0.8367, + "epoch": 4.426717367942465, + "grad_norm": 25.36356544494629, + "learning_rate": 4.449904754470228e-05, + "loss": 0.5903, "step": 321300 }, { - "epoch": 3.27, - "learning_rate": 5.481995255838261e-05, - "loss": 0.7896, + "epoch": 4.4280951200022045, + "grad_norm": 12.474896430969238, + "learning_rate": 4.449274618550115e-05, + "loss": 0.7154, "step": 321400 }, { - "epoch": 3.28, - "learning_rate": 5.481622576911387e-05, - "loss": 0.8167, + "epoch": 4.429472872061944, + "grad_norm": 28.230289459228516, + "learning_rate": 4.44864433826793e-05, + "loss": 0.7167, "step": 321500 }, { - "epoch": 3.28, - "learning_rate": 5.481249776647979e-05, - "loss": 0.7701, + "epoch": 4.430850624121683, + "grad_norm": 7.706596851348877, + "learning_rate": 4.448013913677218e-05, + "loss": 0.6774, "step": 321600 }, { - "epoch": 3.28, - "learning_rate": 5.480876855066266e-05, - "loss": 0.7352, + "epoch": 4.432228376181422, + "grad_norm": 47.81163787841797, + "learning_rate": 4.447383344831538e-05, + "loss": 0.6442, "step": 321700 }, { - "epoch": 3.28, - "learning_rate": 5.480503812184478e-05, - "loss": 0.8721, + "epoch": 4.433606128241162, + "grad_norm": 4.364405155181885, + "learning_rate": 4.446752631784458e-05, + "loss": 0.6635, "step": 321800 }, { - "epoch": 3.28, - "learning_rate": 5.480130648020857e-05, - "loss": 0.7712, + "epoch": 4.434983880300901, + "grad_norm": 14.025710105895996, + "learning_rate": 4.4461217745895614e-05, + "loss": 0.5836, "step": 321900 }, { - "epoch": 3.28, - "learning_rate": 5.479757362593648e-05, - "loss": 0.7751, + "epoch": 4.43636163236064, + "grad_norm": 3.8178043365478516, + "learning_rate": 4.445490773300443e-05, + "loss": 0.6871, "step": 322000 }, { - "epoch": 3.28, - "learning_rate": 5.479383955921101e-05, - "loss": 0.8118, + "epoch": 4.43773938442038, + "grad_norm": 8.0946044921875, + "learning_rate": 4.4448596279707093e-05, + "loss": 0.6875, "step": 322100 }, { - "epoch": 3.28, - "learning_rate": 5.4790104280214776e-05, - "loss": 0.7559, + "epoch": 4.439117136480119, + "grad_norm": 3.0491600036621094, + "learning_rate": 4.44422833865398e-05, + "loss": 0.7648, "step": 322200 }, { - "epoch": 3.28, - "learning_rate": 5.478636778913036e-05, - "loss": 0.8321, + "epoch": 4.440494888539859, + "grad_norm": 27.746309280395508, + "learning_rate": 4.443596905403885e-05, + "loss": 0.6826, "step": 322300 }, { - "epoch": 3.28, - "learning_rate": 5.478263008614048e-05, - "loss": 0.8676, + "epoch": 4.441872640599597, + "grad_norm": 7.239984512329102, + "learning_rate": 4.442965328274068e-05, + "loss": 0.646, "step": 322400 }, { - "epoch": 3.29, - "learning_rate": 5.477889117142788e-05, - "loss": 0.7253, + "epoch": 4.443250392659337, + "grad_norm": 7.9559831619262695, + "learning_rate": 4.442333607318186e-05, + "loss": 0.7213, "step": 322500 }, { - "epoch": 3.29, - "learning_rate": 5.4775151045175394e-05, - "loss": 0.8268, + "epoch": 4.4446281447190765, + "grad_norm": 19.0515193939209, + "learning_rate": 4.441701742589906e-05, + "loss": 0.6669, "step": 322600 }, { - "epoch": 3.29, - "learning_rate": 5.4771409707565855e-05, - "loss": 0.7629, + "epoch": 4.446005896778816, + "grad_norm": 12.214559555053711, + "learning_rate": 4.4410697341429084e-05, + "loss": 0.6711, "step": 322700 }, { - "epoch": 3.29, - "learning_rate": 5.476766715878222e-05, - "loss": 0.7418, + "epoch": 4.447383648838555, + "grad_norm": 3.0706989765167236, + "learning_rate": 4.440437582030886e-05, + "loss": 0.59, "step": 322800 }, { - "epoch": 3.29, - "learning_rate": 5.4763923399007456e-05, - "loss": 0.7683, + "epoch": 4.448761400898294, + "grad_norm": 5.658199310302734, + "learning_rate": 4.439805286307541e-05, + "loss": 0.6901, "step": 322900 }, { - "epoch": 3.29, - "learning_rate": 5.476017842842462e-05, - "loss": 0.7908, + "epoch": 4.450139152958034, + "grad_norm": 8.028273582458496, + "learning_rate": 4.439172847026593e-05, + "loss": 0.6358, "step": 323000 }, { - "epoch": 3.29, - "learning_rate": 5.475643224721682e-05, - "loss": 0.7812, + "epoch": 4.451516905017773, + "grad_norm": 8.47390079498291, + "learning_rate": 4.4385465907797866e-05, + "loss": 0.6942, "step": 323100 }, { - "epoch": 3.29, - "learning_rate": 5.4752684855567236e-05, - "loss": 0.8362, + "epoch": 4.452894657077512, + "grad_norm": 6.019741535186768, + "learning_rate": 4.437913865979065e-05, + "loss": 0.634, "step": 323200 }, { - "epoch": 3.29, - "learning_rate": 5.4748936253659065e-05, - "loss": 0.7867, + "epoch": 4.454272409137252, + "grad_norm": 39.022335052490234, + "learning_rate": 4.4372809977814246e-05, + "loss": 0.7013, "step": 323300 }, { - "epoch": 3.29, - "learning_rate": 5.474518644167562e-05, - "loss": 0.8036, + "epoch": 4.455650161196991, + "grad_norm": 5.139946937561035, + "learning_rate": 4.4366479862406316e-05, + "loss": 0.6942, "step": 323400 }, { - "epoch": 3.3, - "learning_rate": 5.4741435419800215e-05, - "loss": 0.6976, + "epoch": 4.457027913256731, + "grad_norm": 2.410353422164917, + "learning_rate": 4.436014831410464e-05, + "loss": 0.6475, "step": 323500 }, { - "epoch": 3.3, - "learning_rate": 5.473768318821627e-05, - "loss": 0.7112, + "epoch": 4.458405665316469, + "grad_norm": 39.79777145385742, + "learning_rate": 4.43538153334471e-05, + "loss": 0.6899, "step": 323600 }, { - "epoch": 3.3, - "learning_rate": 5.4733929747107245e-05, - "loss": 0.7624, + "epoch": 4.459783417376209, + "grad_norm": 5.1380414962768555, + "learning_rate": 4.434748092097172e-05, + "loss": 0.6477, "step": 323700 }, { - "epoch": 3.3, - "learning_rate": 5.473017509665666e-05, - "loss": 0.9007, + "epoch": 4.4611611694359485, + "grad_norm": 5.901916980743408, + "learning_rate": 4.434114507721666e-05, + "loss": 0.6342, "step": 323800 }, { - "epoch": 3.3, - "learning_rate": 5.4726419237048096e-05, - "loss": 0.76, + "epoch": 4.462538921495687, + "grad_norm": 4.1130781173706055, + "learning_rate": 4.433480780272016e-05, + "loss": 0.6638, "step": 323900 }, { - "epoch": 3.3, - "learning_rate": 5.472266216846519e-05, - "loss": 0.8195, + "epoch": 4.463916673555427, + "grad_norm": 8.368687629699707, + "learning_rate": 4.4328469098020614e-05, + "loss": 0.6708, "step": 324000 }, { - "epoch": 3.3, - "learning_rate": 5.471890389109164e-05, - "loss": 0.8038, + "epoch": 4.465294425615166, + "grad_norm": 3.7628891468048096, + "learning_rate": 4.432212896365652e-05, + "loss": 0.5853, "step": 324100 }, { - "epoch": 3.3, - "learning_rate": 5.471514440511121e-05, - "loss": 0.7712, + "epoch": 4.466672177674906, + "grad_norm": 12.795971870422363, + "learning_rate": 4.4315787400166504e-05, + "loss": 0.6824, "step": 324200 }, { - "epoch": 3.3, - "learning_rate": 5.4711383710707703e-05, - "loss": 0.7628, + "epoch": 4.4680499297346445, + "grad_norm": 8.752735137939453, + "learning_rate": 4.4309444408089334e-05, + "loss": 0.7174, "step": 324300 }, { - "epoch": 3.31, - "learning_rate": 5.4707621808065e-05, - "loss": 0.8106, + "epoch": 4.469427681794384, + "grad_norm": 8.314553260803223, + "learning_rate": 4.430309998796385e-05, + "loss": 0.6286, "step": 324400 }, { - "epoch": 3.31, - "learning_rate": 5.470385869736705e-05, - "loss": 0.811, + "epoch": 4.470805433854124, + "grad_norm": 6.607656478881836, + "learning_rate": 4.429675414032906e-05, + "loss": 0.6874, "step": 324500 }, { - "epoch": 3.31, - "learning_rate": 5.470009437879783e-05, - "loss": 0.9012, + "epoch": 4.472183185913863, + "grad_norm": 5.661682605743408, + "learning_rate": 4.429040686572408e-05, + "loss": 0.5944, "step": 324600 }, { - "epoch": 3.31, - "learning_rate": 5.4696328852541396e-05, - "loss": 0.8155, + "epoch": 4.473560937973602, + "grad_norm": 4.260201930999756, + "learning_rate": 4.428405816468814e-05, + "loss": 0.6079, "step": 324700 }, { - "epoch": 3.31, - "learning_rate": 5.469256211878186e-05, - "loss": 0.826, + "epoch": 4.474938690033341, + "grad_norm": 9.348628997802734, + "learning_rate": 4.4277708037760586e-05, + "loss": 0.6204, "step": 324800 }, { - "epoch": 3.31, - "learning_rate": 5.468879417770339e-05, - "loss": 0.7043, + "epoch": 4.476316442093081, + "grad_norm": 6.534801006317139, + "learning_rate": 4.4271356485480895e-05, + "loss": 0.5981, "step": 324900 }, { - "epoch": 3.31, - "learning_rate": 5.468502502949023e-05, - "loss": 0.8091, + "epoch": 4.4776941941528206, + "grad_norm": 12.314505577087402, + "learning_rate": 4.4265003508388686e-05, + "loss": 0.6903, "step": 325000 }, { - "epoch": 3.31, - "learning_rate": 5.468125467432665e-05, - "loss": 0.7067, + "epoch": 4.479071946212559, + "grad_norm": 2.194709300994873, + "learning_rate": 4.425864910702364e-05, + "loss": 0.6886, "step": 325100 }, { - "epoch": 3.31, - "learning_rate": 5.4677483112397016e-05, - "loss": 0.7417, + "epoch": 4.480449698272299, + "grad_norm": 8.704954147338867, + "learning_rate": 4.425229328192563e-05, + "loss": 0.6035, "step": 325200 }, { - "epoch": 3.31, - "learning_rate": 5.467371034388571e-05, - "loss": 0.7695, + "epoch": 4.481827450332038, + "grad_norm": 3.1498658657073975, + "learning_rate": 4.42459360336346e-05, + "loss": 0.6406, "step": 325300 }, { - "epoch": 3.32, - "learning_rate": 5.466993636897722e-05, - "loss": 0.816, + "epoch": 4.483205202391778, + "grad_norm": 18.84713363647461, + "learning_rate": 4.423957736269063e-05, + "loss": 0.6397, "step": 325400 }, { - "epoch": 3.32, - "learning_rate": 5.4666161187856073e-05, - "loss": 0.7886, + "epoch": 4.484582954451517, + "grad_norm": 16.160200119018555, + "learning_rate": 4.4233217269633926e-05, + "loss": 0.645, "step": 325500 }, { - "epoch": 3.32, - "learning_rate": 5.4662384800706835e-05, - "loss": 0.8286, + "epoch": 4.485960706511256, + "grad_norm": 10.243626594543457, + "learning_rate": 4.42268557550048e-05, + "loss": 0.6115, "step": 325600 }, { - "epoch": 3.32, - "learning_rate": 5.465860720771416e-05, - "loss": 0.8942, + "epoch": 4.487338458570996, + "grad_norm": 12.973387718200684, + "learning_rate": 4.422049281934371e-05, + "loss": 0.6017, "step": 325700 }, { - "epoch": 3.32, - "learning_rate": 5.465482840906275e-05, - "loss": 0.7467, + "epoch": 4.488716210630735, + "grad_norm": 3.910337448120117, + "learning_rate": 4.4214192113782396e-05, + "loss": 0.6187, "step": 325800 }, { - "epoch": 3.32, - "learning_rate": 5.4651048404937355e-05, - "loss": 0.7576, + "epoch": 4.490093962690474, + "grad_norm": 4.787367820739746, + "learning_rate": 4.4207826351876004e-05, + "loss": 0.5603, "step": 325900 }, { - "epoch": 3.32, - "learning_rate": 5.4647267195522815e-05, - "loss": 0.7707, + "epoch": 4.4914717147502135, + "grad_norm": 4.082210063934326, + "learning_rate": 4.420145917055429e-05, + "loss": 0.7037, "step": 326000 }, { - "epoch": 3.32, - "learning_rate": 5.464348478100399e-05, - "loss": 0.7361, + "epoch": 4.492849466809953, + "grad_norm": 15.9624662399292, + "learning_rate": 4.419509057035818e-05, + "loss": 0.7349, "step": 326100 }, { - "epoch": 3.32, - "learning_rate": 5.463970116156582e-05, - "loss": 0.72, + "epoch": 4.494227218869693, + "grad_norm": 6.187216281890869, + "learning_rate": 4.4188720551828705e-05, + "loss": 0.6383, "step": 326200 }, { - "epoch": 3.32, - "learning_rate": 5.46359163373933e-05, - "loss": 0.7251, + "epoch": 4.495604970929431, + "grad_norm": 3.234055757522583, + "learning_rate": 4.418234911550705e-05, + "loss": 0.6447, "step": 326300 }, { - "epoch": 3.33, - "learning_rate": 5.46321303086715e-05, - "loss": 0.7344, + "epoch": 4.496982722989171, + "grad_norm": 36.49329376220703, + "learning_rate": 4.4175976261934476e-05, + "loss": 0.704, "step": 326400 }, { - "epoch": 3.33, - "learning_rate": 5.4628343075585525e-05, - "loss": 0.6811, + "epoch": 4.49836047504891, + "grad_norm": 6.525084018707275, + "learning_rate": 4.416960199165242e-05, + "loss": 0.6344, "step": 326500 }, { - "epoch": 3.33, - "learning_rate": 5.4624554638320546e-05, - "loss": 0.7626, + "epoch": 4.49973822710865, + "grad_norm": 29.574024200439453, + "learning_rate": 4.41632263052024e-05, + "loss": 0.6527, "step": 326600 }, { - "epoch": 3.33, - "learning_rate": 5.46207649970618e-05, - "loss": 0.8888, + "epoch": 4.501115979168389, + "grad_norm": 19.798866271972656, + "learning_rate": 4.4156849203126034e-05, + "loss": 0.65, "step": 326700 }, { - "epoch": 3.33, - "learning_rate": 5.461701206640349e-05, - "loss": 0.7839, + "epoch": 4.502493731228128, + "grad_norm": 7.43825101852417, + "learning_rate": 4.415047068596513e-05, + "loss": 0.679, "step": 326800 }, { - "epoch": 3.33, - "learning_rate": 5.461322002974844e-05, - "loss": 0.7151, + "epoch": 4.503871483287868, + "grad_norm": 5.0385212898254395, + "learning_rate": 4.414409075426155e-05, + "loss": 0.6786, "step": 326900 }, { - "epoch": 3.33, - "learning_rate": 5.460942678965382e-05, - "loss": 0.7741, + "epoch": 4.505249235347607, + "grad_norm": 10.723860740661621, + "learning_rate": 4.41377094085573e-05, + "loss": 0.6308, "step": 327000 }, { - "epoch": 3.33, - "learning_rate": 5.460567029669409e-05, - "loss": 0.643, + "epoch": 4.506626987407346, + "grad_norm": 30.588050842285156, + "learning_rate": 4.413132664939454e-05, + "loss": 0.6482, "step": 327100 }, { - "epoch": 3.33, - "learning_rate": 5.460187466230654e-05, - "loss": 0.6822, + "epoch": 4.5080047394670855, + "grad_norm": 3.961076259613037, + "learning_rate": 4.412494247731549e-05, + "loss": 0.5487, "step": 327200 }, { - "epoch": 3.33, - "learning_rate": 5.459807782503413e-05, - "loss": 0.7695, + "epoch": 4.509382491526825, + "grad_norm": 3.0004286766052246, + "learning_rate": 4.411855689286252e-05, + "loss": 0.6541, "step": 327300 }, { - "epoch": 3.34, - "learning_rate": 5.459427978506253e-05, - "loss": 0.8733, + "epoch": 4.510760243586564, + "grad_norm": 10.720518112182617, + "learning_rate": 4.4112169896578116e-05, + "loss": 0.7179, "step": 327400 }, { - "epoch": 3.34, - "learning_rate": 5.459048054257741e-05, - "loss": 0.8417, + "epoch": 4.512137995646303, + "grad_norm": 15.561443328857422, + "learning_rate": 4.4105781489004896e-05, + "loss": 0.6546, "step": 327500 }, { - "epoch": 3.34, - "learning_rate": 5.458668009776454e-05, - "loss": 0.8615, + "epoch": 4.513515747706043, + "grad_norm": 3.469061851501465, + "learning_rate": 4.409939167068559e-05, + "loss": 0.6741, "step": 327600 }, { - "epoch": 3.34, - "learning_rate": 5.4582878450809734e-05, - "loss": 0.8834, + "epoch": 4.514893499765782, + "grad_norm": 5.806593894958496, + "learning_rate": 4.4093000442163036e-05, + "loss": 0.6833, "step": 327700 }, { - "epoch": 3.34, - "learning_rate": 5.457907560189888e-05, - "loss": 0.9337, + "epoch": 4.516271251825522, + "grad_norm": 5.602105617523193, + "learning_rate": 4.4086607803980205e-05, + "loss": 0.651, "step": 327800 }, { - "epoch": 3.34, - "learning_rate": 5.4575271551217905e-05, - "loss": 0.7733, + "epoch": 4.517649003885261, + "grad_norm": 4.355697154998779, + "learning_rate": 4.408021375668018e-05, + "loss": 0.6217, "step": 327900 }, { - "epoch": 3.34, - "learning_rate": 5.4571466298952816e-05, - "loss": 0.8469, + "epoch": 4.519026755945, + "grad_norm": 6.771401882171631, + "learning_rate": 4.4073818300806174e-05, + "loss": 0.6178, "step": 328000 }, { - "epoch": 3.34, - "learning_rate": 5.4567659845289656e-05, - "loss": 0.8081, + "epoch": 4.52040450800474, + "grad_norm": 11.900725364685059, + "learning_rate": 4.406742143690152e-05, + "loss": 0.6847, "step": 328100 }, { - "epoch": 3.34, - "learning_rate": 5.4563852190414534e-05, - "loss": 0.867, + "epoch": 4.521782260064478, + "grad_norm": 5.0748372077941895, + "learning_rate": 4.406102316550965e-05, + "loss": 0.691, "step": 328200 }, { - "epoch": 3.34, - "learning_rate": 5.456004333451364e-05, - "loss": 0.7387, + "epoch": 4.523160012124218, + "grad_norm": 4.792394161224365, + "learning_rate": 4.4054623487174137e-05, + "loss": 0.7566, "step": 328300 }, { - "epoch": 3.35, - "learning_rate": 5.4556233277773194e-05, - "loss": 0.8197, + "epoch": 4.5245377641839575, + "grad_norm": 4.872078895568848, + "learning_rate": 4.404828642024591e-05, + "loss": 0.6961, "step": 328400 }, { - "epoch": 3.35, - "learning_rate": 5.4552422020379474e-05, - "loss": 0.7813, + "epoch": 4.525915516243697, + "grad_norm": 7.041717529296875, + "learning_rate": 4.404188394371016e-05, + "loss": 0.6334, "step": 328500 }, { - "epoch": 3.35, - "learning_rate": 5.454860956251885e-05, - "loss": 0.8357, + "epoch": 4.527293268303437, + "grad_norm": 9.934708595275879, + "learning_rate": 4.403548006185674e-05, + "loss": 0.6869, "step": 328600 }, { - "epoch": 3.35, - "learning_rate": 5.4544795904377714e-05, - "loss": 0.7754, + "epoch": 4.528671020363175, + "grad_norm": 7.335161209106445, + "learning_rate": 4.402907477522969e-05, + "loss": 0.6828, "step": 328700 }, { - "epoch": 3.35, - "learning_rate": 5.4540981046142526e-05, - "loss": 0.8294, + "epoch": 4.530048772422915, + "grad_norm": 10.300129890441895, + "learning_rate": 4.4022668084373176e-05, + "loss": 0.591, "step": 328800 }, { - "epoch": 3.35, - "learning_rate": 5.453716498799982e-05, - "loss": 0.817, + "epoch": 4.531426524482654, + "grad_norm": 4.763584136962891, + "learning_rate": 4.4016259989831476e-05, + "loss": 0.5627, "step": 328900 }, { - "epoch": 3.35, - "learning_rate": 5.4533347730136184e-05, - "loss": 0.7979, + "epoch": 4.532804276542393, + "grad_norm": 6.448105335235596, + "learning_rate": 4.4009850492148995e-05, + "loss": 0.6706, "step": 329000 }, { - "epoch": 3.35, - "learning_rate": 5.4529529272738246e-05, - "loss": 0.8336, + "epoch": 4.534182028602133, + "grad_norm": 7.568809986114502, + "learning_rate": 4.400343959187026e-05, + "loss": 0.7217, "step": 329100 }, { - "epoch": 3.35, - "learning_rate": 5.4525709615992714e-05, - "loss": 0.8298, + "epoch": 4.535559780661872, + "grad_norm": 13.323205947875977, + "learning_rate": 4.399702728953989e-05, + "loss": 0.6055, "step": 329200 }, { - "epoch": 3.35, - "learning_rate": 5.452188876008634e-05, - "loss": 0.8159, + "epoch": 4.536937532721612, + "grad_norm": 2.8426015377044678, + "learning_rate": 4.399061358570265e-05, + "loss": 0.6613, "step": 329300 }, { - "epoch": 3.36, - "learning_rate": 5.451806670520595e-05, - "loss": 0.7262, + "epoch": 4.53831528478135, + "grad_norm": 2.6047165393829346, + "learning_rate": 4.3984198480903436e-05, + "loss": 0.6935, "step": 329400 }, { - "epoch": 3.36, - "learning_rate": 5.45142434515384e-05, - "loss": 0.7565, + "epoch": 4.53969303684109, + "grad_norm": 32.55445098876953, + "learning_rate": 4.3977781975687215e-05, + "loss": 0.6591, "step": 329500 }, { - "epoch": 3.36, - "learning_rate": 5.451041899927066e-05, - "loss": 0.8151, + "epoch": 4.5410707889008295, + "grad_norm": 11.835723876953125, + "learning_rate": 4.397136407059912e-05, + "loss": 0.6166, "step": 329600 }, { - "epoch": 3.36, - "learning_rate": 5.4506593348589686e-05, - "loss": 0.8661, + "epoch": 4.542448540960569, + "grad_norm": 5.599956512451172, + "learning_rate": 4.3964944766184374e-05, + "loss": 0.607, "step": 329700 }, { - "epoch": 3.36, - "learning_rate": 5.450276649968255e-05, - "loss": 0.8424, + "epoch": 4.543826293020308, + "grad_norm": 3.1657302379608154, + "learning_rate": 4.3958524062988346e-05, + "loss": 0.6528, "step": 329800 }, { - "epoch": 3.36, - "learning_rate": 5.449893845273636e-05, - "loss": 0.8998, + "epoch": 4.545204045080047, + "grad_norm": 4.893840312957764, + "learning_rate": 4.3952101961556496e-05, + "loss": 0.6333, "step": 329900 }, { - "epoch": 3.36, - "learning_rate": 5.449510920793827e-05, - "loss": 0.6582, + "epoch": 4.546581797139787, + "grad_norm": 16.13520622253418, + "learning_rate": 4.3945678462434414e-05, + "loss": 0.6527, "step": 330000 }, { - "epoch": 3.36, - "learning_rate": 5.449127876547553e-05, - "loss": 0.7585, + "epoch": 4.547959549199526, + "grad_norm": 3.2625110149383545, + "learning_rate": 4.393925356616781e-05, + "loss": 0.7119, "step": 330100 }, { - "epoch": 3.36, - "learning_rate": 5.448744712553541e-05, - "loss": 0.7972, + "epoch": 4.549337301259265, + "grad_norm": 9.866243362426758, + "learning_rate": 4.393282727330252e-05, + "loss": 0.6564, "step": 330200 }, { - "epoch": 3.37, - "learning_rate": 5.448361428830527e-05, - "loss": 0.898, + "epoch": 4.550715053319005, + "grad_norm": 11.611804008483887, + "learning_rate": 4.3926399584384474e-05, + "loss": 0.63, "step": 330300 }, { - "epoch": 3.37, - "learning_rate": 5.447981860024086e-05, - "loss": 0.6729, + "epoch": 4.552092805378744, + "grad_norm": 38.45549392700195, + "learning_rate": 4.3919970499959745e-05, + "loss": 0.5968, "step": 330400 }, { - "epoch": 3.37, - "learning_rate": 5.447598338096113e-05, - "loss": 0.8386, + "epoch": 4.553470557438484, + "grad_norm": 4.421891689300537, + "learning_rate": 4.391354002057453e-05, + "loss": 0.6544, "step": 330500 }, { - "epoch": 3.37, - "learning_rate": 5.4472146964951896e-05, - "loss": 0.8171, + "epoch": 4.554848309498222, + "grad_norm": 29.308429718017578, + "learning_rate": 4.39071081467751e-05, + "loss": 0.6482, "step": 330600 }, { - "epoch": 3.37, - "learning_rate": 5.4468309352400705e-05, - "loss": 0.9008, + "epoch": 4.556226061557962, + "grad_norm": 8.412556648254395, + "learning_rate": 4.390067487910791e-05, + "loss": 0.6941, "step": 330700 }, { - "epoch": 3.37, - "learning_rate": 5.446447054349521e-05, - "loss": 0.8025, + "epoch": 4.5576038136177015, + "grad_norm": 5.871030807495117, + "learning_rate": 4.389424021811948e-05, + "loss": 0.6502, "step": 330800 }, { - "epoch": 3.37, - "learning_rate": 5.44606305384231e-05, - "loss": 0.7965, + "epoch": 4.558981565677441, + "grad_norm": 12.721481323242188, + "learning_rate": 4.388780416435648e-05, + "loss": 0.5806, "step": 330900 }, { - "epoch": 3.37, - "learning_rate": 5.445678933737214e-05, - "loss": 0.8075, + "epoch": 4.56035931773718, + "grad_norm": 5.437663555145264, + "learning_rate": 4.3881366718365664e-05, + "loss": 0.6978, "step": 331000 }, { - "epoch": 3.37, - "learning_rate": 5.445294694053012e-05, - "loss": 0.8148, + "epoch": 4.561737069796919, + "grad_norm": 3.4897611141204834, + "learning_rate": 4.3874927880693945e-05, + "loss": 0.6613, "step": 331100 }, { - "epoch": 3.37, - "learning_rate": 5.4449103348084925e-05, - "loss": 0.8622, + "epoch": 4.563114821856659, + "grad_norm": 9.729260444641113, + "learning_rate": 4.386848765188832e-05, + "loss": 0.7368, "step": 331200 }, { - "epoch": 3.38, - "learning_rate": 5.444525856022448e-05, - "loss": 0.7612, + "epoch": 4.564492573916398, + "grad_norm": 4.953622341156006, + "learning_rate": 4.3862046032495945e-05, + "loss": 0.6203, "step": 331300 }, { - "epoch": 3.38, - "learning_rate": 5.444141257713678e-05, - "loss": 0.8024, + "epoch": 4.565870325976137, + "grad_norm": 11.730154991149902, + "learning_rate": 4.385560302306403e-05, + "loss": 0.6922, "step": 331400 }, { - "epoch": 3.38, - "learning_rate": 5.443756539900986e-05, - "loss": 0.773, + "epoch": 4.567248078035877, + "grad_norm": 13.521852493286133, + "learning_rate": 4.384915862413998e-05, + "loss": 0.6582, "step": 331500 }, { - "epoch": 3.38, - "learning_rate": 5.443371702603182e-05, - "loss": 0.799, + "epoch": 4.568625830095616, + "grad_norm": 5.167218208312988, + "learning_rate": 4.384271283627126e-05, + "loss": 0.6732, "step": 331600 }, { - "epoch": 3.38, - "learning_rate": 5.4429867458390834e-05, - "loss": 0.7603, + "epoch": 4.570003582155355, + "grad_norm": 3.1702427864074707, + "learning_rate": 4.3836265660005474e-05, + "loss": 0.7069, "step": 331700 }, { - "epoch": 3.38, - "learning_rate": 5.4426016696275124e-05, - "loss": 0.8843, + "epoch": 4.571381334215094, + "grad_norm": 9.022198677062988, + "learning_rate": 4.382981709589034e-05, + "loss": 0.6747, "step": 331800 }, { - "epoch": 3.38, - "learning_rate": 5.4422164739872956e-05, - "loss": 0.6728, + "epoch": 4.572759086274834, + "grad_norm": 44.19960403442383, + "learning_rate": 4.3823367144473715e-05, + "loss": 0.6148, "step": 331900 }, { - "epoch": 3.38, - "learning_rate": 5.4418311589372684e-05, - "loss": 0.6997, + "epoch": 4.5741368383345735, + "grad_norm": 1.9782874584197998, + "learning_rate": 4.381691580630353e-05, + "loss": 0.652, "step": 332000 }, { - "epoch": 3.38, - "learning_rate": 5.4414457244962695e-05, - "loss": 0.7906, + "epoch": 4.575514590394313, + "grad_norm": 2.4546005725860596, + "learning_rate": 4.381046308192787e-05, + "loss": 0.596, "step": 332100 }, { - "epoch": 3.38, - "learning_rate": 5.4410601706831444e-05, - "loss": 0.8492, + "epoch": 4.576892342454052, + "grad_norm": 7.0841522216796875, + "learning_rate": 4.3804008971894926e-05, + "loss": 0.564, "step": 332200 }, { - "epoch": 3.39, - "learning_rate": 5.4406744975167445e-05, - "loss": 0.7517, + "epoch": 4.578270094513791, + "grad_norm": 3.3855671882629395, + "learning_rate": 4.3797553476753016e-05, + "loss": 0.6121, "step": 332300 }, { - "epoch": 3.39, - "learning_rate": 5.440288705015926e-05, - "loss": 0.8056, + "epoch": 4.579647846573531, + "grad_norm": 7.465083122253418, + "learning_rate": 4.379109659705056e-05, + "loss": 0.6452, "step": 332400 }, { - "epoch": 3.39, - "learning_rate": 5.439902793199554e-05, - "loss": 0.726, + "epoch": 4.5810255986332695, + "grad_norm": 5.384669303894043, + "learning_rate": 4.37846383333361e-05, + "loss": 0.6134, "step": 332500 }, { - "epoch": 3.39, - "learning_rate": 5.439516762086496e-05, - "loss": 0.7852, + "epoch": 4.582403350693009, + "grad_norm": 3.685746192932129, + "learning_rate": 4.3778178686158304e-05, + "loss": 0.6748, "step": 332600 }, { - "epoch": 3.39, - "learning_rate": 5.439130611695626e-05, - "loss": 0.8238, + "epoch": 4.583781102752749, + "grad_norm": 17.18223762512207, + "learning_rate": 4.3771717656065954e-05, + "loss": 0.674, "step": 332700 }, { - "epoch": 3.39, - "learning_rate": 5.4387482053325926e-05, - "loss": 0.8161, + "epoch": 4.585158854812488, + "grad_norm": 14.064021110534668, + "learning_rate": 4.376525524360793e-05, + "loss": 0.68, "step": 332800 }, { - "epoch": 3.39, - "learning_rate": 5.438361817635054e-05, - "loss": 0.9304, + "epoch": 4.586536606872228, + "grad_norm": 5.873450756072998, + "learning_rate": 4.3758791449333266e-05, + "loss": 0.6449, "step": 332900 }, { - "epoch": 3.39, - "learning_rate": 5.437975310716174e-05, - "loss": 0.8237, + "epoch": 4.587914358931966, + "grad_norm": 4.995344638824463, + "learning_rate": 4.37523262737911e-05, + "loss": 0.6322, "step": 333000 }, { - "epoch": 3.39, - "learning_rate": 5.4375886845948505e-05, - "loss": 0.8337, + "epoch": 4.589292110991706, + "grad_norm": 13.373953819274902, + "learning_rate": 4.374585971753066e-05, + "loss": 0.6165, "step": 333100 }, { - "epoch": 3.39, - "learning_rate": 5.437201939289987e-05, - "loss": 0.8601, + "epoch": 4.5906698630514455, + "grad_norm": 2.717437505722046, + "learning_rate": 4.3739456467295646e-05, + "loss": 0.6183, "step": 333200 }, { - "epoch": 3.4, - "learning_rate": 5.4368150748204926e-05, - "loss": 0.9195, + "epoch": 4.592047615111184, + "grad_norm": 13.884819030761719, + "learning_rate": 4.373298716504038e-05, + "loss": 0.5447, "step": 333300 }, { - "epoch": 3.4, - "learning_rate": 5.436428091205284e-05, - "loss": 0.8568, + "epoch": 4.593425367170924, + "grad_norm": 14.432552337646484, + "learning_rate": 4.37265164837098e-05, + "loss": 0.7218, "step": 333400 }, { - "epoch": 3.4, - "learning_rate": 5.436040988463281e-05, - "loss": 0.8183, + "epoch": 4.594803119230663, + "grad_norm": 7.3644819259643555, + "learning_rate": 4.372004442385363e-05, + "loss": 0.5659, "step": 333500 }, { - "epoch": 3.4, - "learning_rate": 5.4356537666134115e-05, - "loss": 0.7959, + "epoch": 4.596180871290403, + "grad_norm": 2.4393606185913086, + "learning_rate": 4.37136357272192e-05, + "loss": 0.6337, "step": 333600 }, { - "epoch": 3.4, - "learning_rate": 5.4352664256746075e-05, - "loss": 0.7719, + "epoch": 4.5975586233501415, + "grad_norm": 4.671186923980713, + "learning_rate": 4.3707160925733006e-05, + "loss": 0.6657, "step": 333700 }, { - "epoch": 3.4, - "learning_rate": 5.4348789656658085e-05, - "loss": 0.8386, + "epoch": 4.598936375409881, + "grad_norm": 5.4141340255737305, + "learning_rate": 4.3700684747365585e-05, + "loss": 0.6458, "step": 333800 }, { - "epoch": 3.4, - "learning_rate": 5.434491386605959e-05, - "loss": 0.8879, + "epoch": 4.600314127469621, + "grad_norm": 20.662593841552734, + "learning_rate": 4.36942071926671e-05, + "loss": 0.6444, "step": 333900 }, { - "epoch": 3.4, - "learning_rate": 5.4341036885140084e-05, - "loss": 0.8089, + "epoch": 4.60169187952936, + "grad_norm": 7.323440074920654, + "learning_rate": 4.368772826218787e-05, + "loss": 0.6511, "step": 334000 }, { - "epoch": 3.4, - "learning_rate": 5.4337158714089145e-05, - "loss": 0.9675, + "epoch": 4.603069631589099, + "grad_norm": 3.818000316619873, + "learning_rate": 4.368124795647831e-05, + "loss": 0.7214, "step": 334100 }, { - "epoch": 3.4, - "learning_rate": 5.433327935309637e-05, - "loss": 0.8112, + "epoch": 4.604447383648838, + "grad_norm": 13.386191368103027, + "learning_rate": 4.3674766276088964e-05, + "loss": 0.692, "step": 334200 }, { - "epoch": 3.41, - "learning_rate": 5.4329398802351454e-05, - "loss": 0.9335, + "epoch": 4.605825135708578, + "grad_norm": 5.456580638885498, + "learning_rate": 4.366828322157046e-05, + "loss": 0.7098, "step": 334300 }, { - "epoch": 3.41, - "learning_rate": 5.432551706204412e-05, - "loss": 0.8291, + "epoch": 4.6072028877683175, + "grad_norm": 2.9225990772247314, + "learning_rate": 4.366179879347358e-05, + "loss": 0.5972, "step": 334400 }, { - "epoch": 3.41, - "learning_rate": 5.432163413236417e-05, - "loss": 0.8628, + "epoch": 4.608580639828056, + "grad_norm": 3.7065563201904297, + "learning_rate": 4.365531299234921e-05, + "loss": 0.631, "step": 334500 }, { - "epoch": 3.41, - "learning_rate": 5.431775001350147e-05, - "loss": 0.8267, + "epoch": 4.609958391887796, + "grad_norm": 6.768597602844238, + "learning_rate": 4.364882581874835e-05, + "loss": 0.6423, "step": 334600 }, { - "epoch": 3.41, - "learning_rate": 5.43138647056459e-05, - "loss": 0.9392, + "epoch": 4.611336143947535, + "grad_norm": 4.486959934234619, + "learning_rate": 4.364233727322213e-05, + "loss": 0.6353, "step": 334700 }, { - "epoch": 3.41, - "learning_rate": 5.4310017079837985e-05, - "loss": 0.8747, + "epoch": 4.612713896007275, + "grad_norm": 10.104267120361328, + "learning_rate": 4.3635847356321765e-05, + "loss": 0.6059, "step": 334800 }, { - "epoch": 3.41, - "learning_rate": 5.4306129406451854e-05, - "loss": 0.8757, + "epoch": 4.6140916480670136, + "grad_norm": 25.838520050048828, + "learning_rate": 4.3629356068598616e-05, + "loss": 0.573, "step": 334900 }, { - "epoch": 3.41, - "learning_rate": 5.430224054464105e-05, - "loss": 0.9245, + "epoch": 4.615469400126753, + "grad_norm": 2.941793441772461, + "learning_rate": 4.362286341060415e-05, + "loss": 0.7028, "step": 335000 }, { - "epoch": 3.41, - "learning_rate": 5.429835049459572e-05, - "loss": 0.7811, + "epoch": 4.616847152186493, + "grad_norm": 7.484210968017578, + "learning_rate": 4.361636938288997e-05, + "loss": 0.6252, "step": 335100 }, { - "epoch": 3.42, - "learning_rate": 5.429445925650605e-05, - "loss": 0.8549, + "epoch": 4.618224904246232, + "grad_norm": 7.027438640594482, + "learning_rate": 4.360987398600774e-05, + "loss": 0.6118, "step": 335200 }, { - "epoch": 3.42, - "learning_rate": 5.42905668305623e-05, - "loss": 0.9381, + "epoch": 4.619602656305971, + "grad_norm": 4.250925064086914, + "learning_rate": 4.360337722050931e-05, + "loss": 0.6161, "step": 335300 }, { - "epoch": 3.42, - "learning_rate": 5.42866732169548e-05, - "loss": 0.9005, + "epoch": 4.6209804083657104, + "grad_norm": 32.049842834472656, + "learning_rate": 4.359687908694659e-05, + "loss": 0.6447, "step": 335400 }, { - "epoch": 3.42, - "learning_rate": 5.4282778415873913e-05, - "loss": 0.892, + "epoch": 4.62235816042545, + "grad_norm": 5.912929058074951, + "learning_rate": 4.3590379585871654e-05, + "loss": 0.6468, "step": 335500 }, { - "epoch": 3.42, - "learning_rate": 5.427888242751006e-05, - "loss": 0.9165, + "epoch": 4.62373591248519, + "grad_norm": 4.3188886642456055, + "learning_rate": 4.3583878717836646e-05, + "loss": 0.6896, "step": 335600 }, { - "epoch": 3.42, - "learning_rate": 5.427498525205376e-05, - "loss": 1.0178, + "epoch": 4.625113664544928, + "grad_norm": 4.989152908325195, + "learning_rate": 4.357737648339386e-05, + "loss": 0.6276, "step": 335700 }, { - "epoch": 3.42, - "learning_rate": 5.427108688969554e-05, - "loss": 0.9623, + "epoch": 4.626491416604668, + "grad_norm": 4.833415508270264, + "learning_rate": 4.3570872883095676e-05, + "loss": 0.611, "step": 335800 }, { - "epoch": 3.42, - "learning_rate": 5.426718734062601e-05, - "loss": 0.8048, + "epoch": 4.627869168664407, + "grad_norm": 46.401973724365234, + "learning_rate": 4.356436791749464e-05, + "loss": 0.5807, "step": 335900 }, { - "epoch": 3.42, - "learning_rate": 5.426328660503585e-05, - "loss": 0.8591, + "epoch": 4.629246920724146, + "grad_norm": 6.8872175216674805, + "learning_rate": 4.355786158714336e-05, + "loss": 0.5654, "step": 336000 }, { - "epoch": 3.42, - "learning_rate": 5.425938468311576e-05, - "loss": 0.8696, + "epoch": 4.630624672783886, + "grad_norm": 36.346012115478516, + "learning_rate": 4.355135389259459e-05, + "loss": 0.6942, "step": 336100 }, { - "epoch": 3.43, - "learning_rate": 5.425548157505654e-05, - "loss": 0.8766, + "epoch": 4.632002424843625, + "grad_norm": 52.620880126953125, + "learning_rate": 4.354484483440118e-05, + "loss": 0.6396, "step": 336200 }, { - "epoch": 3.43, - "learning_rate": 5.425157728104901e-05, - "loss": 0.8394, + "epoch": 4.633380176903365, + "grad_norm": 3.102942705154419, + "learning_rate": 4.3538334413116125e-05, + "loss": 0.6247, "step": 336300 }, { - "epoch": 3.43, - "learning_rate": 5.424767180128409e-05, - "loss": 0.7747, + "epoch": 4.634757928963104, + "grad_norm": 5.044652462005615, + "learning_rate": 4.3531822629292505e-05, + "loss": 0.6124, "step": 336400 }, { - "epoch": 3.43, - "learning_rate": 5.4243765135952706e-05, - "loss": 0.8581, + "epoch": 4.636135681022843, + "grad_norm": 11.227763175964355, + "learning_rate": 4.352530948348354e-05, + "loss": 0.6181, "step": 336500 }, { - "epoch": 3.43, - "learning_rate": 5.42398572852459e-05, - "loss": 0.7416, + "epoch": 4.6375134330825825, + "grad_norm": 7.284191608428955, + "learning_rate": 4.3518794976242536e-05, + "loss": 0.6899, "step": 336600 }, { - "epoch": 3.43, - "learning_rate": 5.423594824935472e-05, - "loss": 0.8795, + "epoch": 4.638891185142322, + "grad_norm": 13.224371910095215, + "learning_rate": 4.351227910812296e-05, + "loss": 0.665, "step": 336700 }, { - "epoch": 3.43, - "learning_rate": 5.4232038028470304e-05, - "loss": 0.9409, + "epoch": 4.640268937202061, + "grad_norm": 6.4130048751831055, + "learning_rate": 4.3505761879678355e-05, + "loss": 0.5789, "step": 336800 }, { - "epoch": 3.43, - "learning_rate": 5.4228126622783835e-05, - "loss": 0.886, + "epoch": 4.6416466892618, + "grad_norm": 45.207942962646484, + "learning_rate": 4.3499243291462387e-05, + "loss": 0.574, "step": 336900 }, { - "epoch": 3.43, - "learning_rate": 5.4224214032486566e-05, - "loss": 0.8711, + "epoch": 4.64302444132154, + "grad_norm": 5.822239875793457, + "learning_rate": 4.349272334402885e-05, + "loss": 0.6108, "step": 337000 }, { - "epoch": 3.43, - "learning_rate": 5.422030025776979e-05, - "loss": 1.0404, + "epoch": 4.644402193381279, + "grad_norm": 10.178132057189941, + "learning_rate": 4.3486202037931656e-05, + "loss": 0.6054, "step": 337100 }, { - "epoch": 3.44, - "learning_rate": 5.421638529882487e-05, - "loss": 0.838, + "epoch": 4.645779945441019, + "grad_norm": 8.700674057006836, + "learning_rate": 4.3479679373724806e-05, + "loss": 0.6075, "step": 337200 }, { - "epoch": 3.44, - "learning_rate": 5.4212469155843225e-05, - "loss": 0.8097, + "epoch": 4.647157697500758, + "grad_norm": 3.3527495861053467, + "learning_rate": 4.347315535196244e-05, + "loss": 0.6668, "step": 337300 }, { - "epoch": 3.44, - "learning_rate": 5.4208551829016334e-05, - "loss": 0.9381, + "epoch": 4.648535449560497, + "grad_norm": 5.201707363128662, + "learning_rate": 4.346662997319882e-05, + "loss": 0.5955, "step": 337400 }, { - "epoch": 3.44, - "learning_rate": 5.420463331853573e-05, - "loss": 0.7932, + "epoch": 4.649913201620237, + "grad_norm": 4.6474690437316895, + "learning_rate": 4.346010323798828e-05, + "loss": 0.6044, "step": 337500 }, { - "epoch": 3.44, - "learning_rate": 5.4200713624593e-05, - "loss": 0.8826, + "epoch": 4.651290953679975, + "grad_norm": 1.3089040517807007, + "learning_rate": 4.345357514688533e-05, + "loss": 0.5305, "step": 337600 }, { - "epoch": 3.44, - "learning_rate": 5.419679274737979e-05, - "loss": 0.9429, + "epoch": 4.652668705739715, + "grad_norm": 8.0230712890625, + "learning_rate": 4.3447045700444554e-05, + "loss": 0.7267, "step": 337700 }, { - "epoch": 3.44, - "learning_rate": 5.4192870687087816e-05, - "loss": 0.9054, + "epoch": 4.6540464577994545, + "grad_norm": 7.888185024261475, + "learning_rate": 4.3440514899220656e-05, + "loss": 0.6061, "step": 337800 }, { - "epoch": 3.44, - "learning_rate": 5.4188986682195295e-05, - "loss": 0.8673, + "epoch": 4.655424209859194, + "grad_norm": 3.1337671279907227, + "learning_rate": 4.343398274376847e-05, + "loss": 0.6158, "step": 337900 }, { - "epoch": 3.44, - "learning_rate": 5.418506226814715e-05, - "loss": 0.8667, + "epoch": 4.656801961918933, + "grad_norm": 4.105374813079834, + "learning_rate": 4.3427514576433055e-05, + "loss": 0.607, "step": 338000 }, { - "epoch": 3.44, - "learning_rate": 5.418113667159378e-05, - "loss": 0.8229, + "epoch": 4.658179713978672, + "grad_norm": 138.5779266357422, + "learning_rate": 4.3420979727717656e-05, + "loss": 0.6822, "step": 338100 }, { - "epoch": 3.45, - "learning_rate": 5.417720989272714e-05, - "loss": 0.8724, + "epoch": 4.659557466038412, + "grad_norm": 16.21944236755371, + "learning_rate": 4.341444352643358e-05, + "loss": 0.6598, "step": 338200 }, { - "epoch": 3.45, - "learning_rate": 5.417328193173922e-05, - "loss": 0.8777, + "epoch": 4.660935218098151, + "grad_norm": 9.633851051330566, + "learning_rate": 4.3407905973136104e-05, + "loss": 0.678, "step": 338300 }, { - "epoch": 3.45, - "learning_rate": 5.4169352788822064e-05, - "loss": 0.906, + "epoch": 4.66231297015789, + "grad_norm": 12.092757225036621, + "learning_rate": 4.3401367068380635e-05, + "loss": 0.7248, "step": 338400 }, { - "epoch": 3.45, - "learning_rate": 5.41654224641678e-05, - "loss": 0.948, + "epoch": 4.66369072221763, + "grad_norm": 7.795053482055664, + "learning_rate": 4.339482681272268e-05, + "loss": 0.6059, "step": 338500 }, { - "epoch": 3.45, - "learning_rate": 5.416149095796859e-05, - "loss": 0.988, + "epoch": 4.665068474277369, + "grad_norm": 5.384261608123779, + "learning_rate": 4.338828520671787e-05, + "loss": 0.6675, "step": 338600 }, { - "epoch": 3.45, - "learning_rate": 5.4157558270416654e-05, - "loss": 0.9876, + "epoch": 4.666446226337109, + "grad_norm": 7.8802947998046875, + "learning_rate": 4.338174225092195e-05, + "loss": 0.6546, "step": 338700 }, { - "epoch": 3.45, - "learning_rate": 5.415362440170428e-05, - "loss": 0.8309, + "epoch": 4.667823978396847, + "grad_norm": 3.0653738975524902, + "learning_rate": 4.3375197945890775e-05, + "loss": 0.61, "step": 338800 }, { - "epoch": 3.45, - "learning_rate": 5.4149689352023814e-05, - "loss": 0.9018, + "epoch": 4.669201730456587, + "grad_norm": 12.49382495880127, + "learning_rate": 4.336865229218032e-05, + "loss": 0.5998, "step": 338900 }, { - "epoch": 3.45, - "learning_rate": 5.414575312156767e-05, - "loss": 0.9025, + "epoch": 4.6705794825163265, + "grad_norm": 31.43951416015625, + "learning_rate": 4.336210529034667e-05, + "loss": 0.5845, "step": 339000 }, { - "epoch": 3.45, - "learning_rate": 5.414181571052829e-05, - "loss": 0.8341, + "epoch": 4.671957234576066, + "grad_norm": 9.029288291931152, + "learning_rate": 4.335555694094601e-05, + "loss": 0.7507, "step": 339100 }, { - "epoch": 3.46, - "learning_rate": 5.413787711909817e-05, - "loss": 0.8806, + "epoch": 4.673334986635805, + "grad_norm": 4.097837448120117, + "learning_rate": 4.334900724453469e-05, + "loss": 0.6854, "step": 339200 }, { - "epoch": 3.46, - "learning_rate": 5.4133937347469925e-05, - "loss": 0.8346, + "epoch": 4.674712738695544, + "grad_norm": 4.363492012023926, + "learning_rate": 4.334245620166911e-05, + "loss": 0.6521, "step": 339300 }, { - "epoch": 3.46, - "learning_rate": 5.4129996395836176e-05, - "loss": 1.0027, + "epoch": 4.676090490755284, + "grad_norm": 5.626641273498535, + "learning_rate": 4.3335903812905835e-05, + "loss": 0.6246, "step": 339400 }, { - "epoch": 3.46, - "learning_rate": 5.412605426438958e-05, - "loss": 0.9267, + "epoch": 4.677468242815023, + "grad_norm": 7.819990634918213, + "learning_rate": 4.33293500788015e-05, + "loss": 0.639, "step": 339500 }, { - "epoch": 3.46, - "learning_rate": 5.4122110953322926e-05, - "loss": 1.0539, + "epoch": 4.678845994874762, + "grad_norm": 9.912444114685059, + "learning_rate": 4.3322794999912916e-05, + "loss": 0.6164, "step": 339600 }, { - "epoch": 3.46, - "learning_rate": 5.4118166462829e-05, - "loss": 0.8187, + "epoch": 4.680223746934502, + "grad_norm": 5.36249303817749, + "learning_rate": 4.331623857679693e-05, + "loss": 0.666, "step": 339700 }, { - "epoch": 3.46, - "learning_rate": 5.411422079310065e-05, - "loss": 0.9457, + "epoch": 4.681601498994241, + "grad_norm": 7.043378829956055, + "learning_rate": 4.330968081001057e-05, + "loss": 0.658, "step": 339800 }, { - "epoch": 3.46, - "learning_rate": 5.411031341865414e-05, - "loss": 1.0481, + "epoch": 4.682979251053981, + "grad_norm": 81.09571838378906, + "learning_rate": 4.330312170011095e-05, + "loss": 0.6157, "step": 339900 }, { - "epoch": 3.46, - "learning_rate": 5.4106365402823315e-05, - "loss": 0.9842, + "epoch": 4.684357003113719, + "grad_norm": 4.815480709075928, + "learning_rate": 4.3296561247655285e-05, + "loss": 0.5948, "step": 340000 }, { - "epoch": 3.46, - "learning_rate": 5.4102416208335095e-05, - "loss": 1.04, + "epoch": 4.685734755173459, + "grad_norm": 2.4301512241363525, + "learning_rate": 4.3289999453200924e-05, + "loss": 0.698, "step": 340100 }, { - "epoch": 3.47, - "learning_rate": 5.409846583538255e-05, - "loss": 0.9499, + "epoch": 4.6871125072331985, + "grad_norm": 4.584778785705566, + "learning_rate": 4.328343631730533e-05, + "loss": 0.6204, "step": 340200 }, { - "epoch": 3.47, - "learning_rate": 5.4094514284158846e-05, - "loss": 0.8526, + "epoch": 4.688490259292937, + "grad_norm": 5.054748058319092, + "learning_rate": 4.3276871840526074e-05, + "loss": 0.5633, "step": 340300 }, { - "epoch": 3.47, - "learning_rate": 5.409056155485717e-05, - "loss": 0.9673, + "epoch": 4.689868011352677, + "grad_norm": 5.237736225128174, + "learning_rate": 4.327030602342085e-05, + "loss": 0.6407, "step": 340400 }, { - "epoch": 3.47, - "learning_rate": 5.408664719257256e-05, - "loss": 0.7544, + "epoch": 4.691245763412416, + "grad_norm": 7.613414764404297, + "learning_rate": 4.326380454474619e-05, + "loss": 0.5855, "step": 340500 }, { - "epoch": 3.47, - "learning_rate": 5.408269211947078e-05, - "loss": 0.907, + "epoch": 4.692623515472156, + "grad_norm": 6.827023029327393, + "learning_rate": 4.325723606205186e-05, + "loss": 0.6241, "step": 340600 }, { - "epoch": 3.47, - "learning_rate": 5.407873586886908e-05, - "loss": 0.9975, + "epoch": 4.694001267531895, + "grad_norm": 14.288708686828613, + "learning_rate": 4.3250666240699716e-05, + "loss": 0.7284, "step": 340700 }, { - "epoch": 3.47, - "learning_rate": 5.407477844096088e-05, - "loss": 0.8194, + "epoch": 4.695379019591634, + "grad_norm": 2.823258638381958, + "learning_rate": 4.324409508124788e-05, + "loss": 0.6481, "step": 340800 }, { - "epoch": 3.47, - "learning_rate": 5.4070819835939685e-05, - "loss": 0.8418, + "epoch": 4.696756771651374, + "grad_norm": 4.392832279205322, + "learning_rate": 4.323752258425464e-05, + "loss": 0.6244, "step": 340900 }, { - "epoch": 3.47, - "learning_rate": 5.406686005399905e-05, - "loss": 0.8876, + "epoch": 4.698134523711113, + "grad_norm": 4.342037200927734, + "learning_rate": 4.323094875027833e-05, + "loss": 0.655, "step": 341000 }, { - "epoch": 3.48, - "learning_rate": 5.406289909533258e-05, - "loss": 0.9251, + "epoch": 4.699512275770852, + "grad_norm": 11.47658634185791, + "learning_rate": 4.3224373579877446e-05, + "loss": 0.6078, "step": 341100 }, { - "epoch": 3.48, - "learning_rate": 5.405893696013395e-05, - "loss": 0.8449, + "epoch": 4.700890027830591, + "grad_norm": 2.0359914302825928, + "learning_rate": 4.321779707361059e-05, + "loss": 0.6016, "step": 341200 }, { - "epoch": 3.48, - "learning_rate": 5.405497364859687e-05, - "loss": 0.904, + "epoch": 4.702267779890331, + "grad_norm": 12.72111701965332, + "learning_rate": 4.321121923203645e-05, + "loss": 0.6964, "step": 341300 }, { - "epoch": 3.48, - "learning_rate": 5.4051048811613234e-05, - "loss": 0.8262, + "epoch": 4.7036455319500705, + "grad_norm": 10.523122787475586, + "learning_rate": 4.320464005571386e-05, + "loss": 0.6005, "step": 341400 }, { - "epoch": 3.48, - "learning_rate": 5.404708315973923e-05, - "loss": 0.7497, + "epoch": 4.70502328400981, + "grad_norm": 9.06456184387207, + "learning_rate": 4.3198059545201766e-05, + "loss": 0.5746, "step": 341500 }, { - "epoch": 3.48, - "learning_rate": 5.404311633210637e-05, - "loss": 0.7518, + "epoch": 4.706401036069549, + "grad_norm": 8.78666877746582, + "learning_rate": 4.319154352610025e-05, + "loss": 0.5869, "step": 341600 }, { - "epoch": 3.48, - "learning_rate": 5.403914832890859e-05, - "loss": 0.8741, + "epoch": 4.707778788129288, + "grad_norm": 10.096076011657715, + "learning_rate": 4.318496036221431e-05, + "loss": 0.6518, "step": 341700 }, { - "epoch": 3.48, - "learning_rate": 5.403517915033992e-05, - "loss": 1.0219, + "epoch": 4.709156540189028, + "grad_norm": 4.437751293182373, + "learning_rate": 4.317837586581075e-05, + "loss": 0.6404, "step": 341800 }, { - "epoch": 3.48, - "learning_rate": 5.403120879659443e-05, - "loss": 0.8878, + "epoch": 4.7105342922487665, + "grad_norm": 1.933472990989685, + "learning_rate": 4.317179003744895e-05, + "loss": 0.6197, "step": 341900 }, { - "epoch": 3.48, - "learning_rate": 5.402723726786624e-05, - "loss": 0.9455, + "epoch": 4.711912044308506, + "grad_norm": 7.073274612426758, + "learning_rate": 4.316520287768841e-05, + "loss": 0.6471, "step": 342000 }, { - "epoch": 3.49, - "learning_rate": 5.4023264564349535e-05, - "loss": 0.9049, + "epoch": 4.713289796368246, + "grad_norm": 10.69509506225586, + "learning_rate": 4.315861438708874e-05, + "loss": 0.6068, "step": 342100 }, { - "epoch": 3.49, - "learning_rate": 5.401929068623855e-05, - "loss": 0.916, + "epoch": 4.714667548427985, + "grad_norm": 8.324593544006348, + "learning_rate": 4.3152024566209665e-05, + "loss": 0.6434, "step": 342200 }, { - "epoch": 3.49, - "learning_rate": 5.401531563372759e-05, - "loss": 0.7544, + "epoch": 4.716045300487724, + "grad_norm": 10.524890899658203, + "learning_rate": 4.3145499333697296e-05, + "loss": 0.6683, "step": 342300 }, { - "epoch": 3.49, - "learning_rate": 5.4011339407011016e-05, - "loss": 0.868, + "epoch": 4.717423052547463, + "grad_norm": 6.639632225036621, + "learning_rate": 4.313890686722788e-05, + "loss": 0.6266, "step": 342400 }, { - "epoch": 3.49, - "learning_rate": 5.400736200628323e-05, - "loss": 0.7905, + "epoch": 4.718800804607203, + "grad_norm": 5.75840425491333, + "learning_rate": 4.313231307215331e-05, + "loss": 0.6337, "step": 342500 }, { - "epoch": 3.49, - "learning_rate": 5.400338343173872e-05, - "loss": 0.8767, + "epoch": 4.7201785566669425, + "grad_norm": 6.58840274810791, + "learning_rate": 4.312571794903378e-05, + "loss": 0.6424, "step": 342600 }, { - "epoch": 3.49, - "learning_rate": 5.3999403683572e-05, - "loss": 0.8184, + "epoch": 4.721556308726681, + "grad_norm": 45.629913330078125, + "learning_rate": 4.311912149842956e-05, + "loss": 0.7007, "step": 342700 }, { - "epoch": 3.49, - "learning_rate": 5.399542276197766e-05, - "loss": 0.8369, + "epoch": 4.722934060786421, + "grad_norm": 5.736390113830566, + "learning_rate": 4.311252372090107e-05, + "loss": 0.6642, "step": 342800 }, { - "epoch": 3.49, - "learning_rate": 5.3991440667150345e-05, - "loss": 0.9208, + "epoch": 4.72431181284616, + "grad_norm": 54.273075103759766, + "learning_rate": 4.3105924617008807e-05, + "loss": 0.6461, "step": 342900 }, { - "epoch": 3.49, - "learning_rate": 5.398745739928476e-05, - "loss": 0.8268, + "epoch": 4.7256895649059, + "grad_norm": 2.6785812377929688, + "learning_rate": 4.3099324187313416e-05, + "loss": 0.7001, "step": 343000 }, { - "epoch": 3.5, - "learning_rate": 5.398347295857565e-05, - "loss": 0.9143, + "epoch": 4.7270673169656385, + "grad_norm": 11.368042945861816, + "learning_rate": 4.309272243237563e-05, + "loss": 0.6648, "step": 343100 }, { - "epoch": 3.5, - "learning_rate": 5.397948734521784e-05, - "loss": 1.0979, + "epoch": 4.728445069025378, + "grad_norm": 7.296229362487793, + "learning_rate": 4.3086119352756296e-05, + "loss": 0.6033, "step": 343200 }, { - "epoch": 3.5, - "learning_rate": 5.3975500559406205e-05, - "loss": 0.8552, + "epoch": 4.729822821085118, + "grad_norm": 4.869019985198975, + "learning_rate": 4.3079514949016397e-05, + "loss": 0.6019, "step": 343300 }, { - "epoch": 3.5, - "learning_rate": 5.397151260133566e-05, - "loss": 0.9751, + "epoch": 4.731200573144857, + "grad_norm": 21.18593978881836, + "learning_rate": 4.307290922171699e-05, + "loss": 0.7382, "step": 343400 }, { - "epoch": 3.5, - "learning_rate": 5.396752347120122e-05, - "loss": 0.8371, + "epoch": 4.732578325204596, + "grad_norm": 13.06953239440918, + "learning_rate": 4.306630217141928e-05, + "loss": 0.6229, "step": 343500 }, { - "epoch": 3.5, - "learning_rate": 5.3963533169197904e-05, - "loss": 0.8045, + "epoch": 4.733956077264335, + "grad_norm": 24.202054977416992, + "learning_rate": 4.305969379868455e-05, + "loss": 0.7006, "step": 343600 }, { - "epoch": 3.5, - "learning_rate": 5.395954169552082e-05, - "loss": 0.7778, + "epoch": 4.735333829324075, + "grad_norm": 42.04108428955078, + "learning_rate": 4.305308410407424e-05, + "loss": 0.6311, "step": 343700 }, { - "epoch": 3.5, - "learning_rate": 5.395554905036514e-05, - "loss": 0.8089, + "epoch": 4.7367115813838145, + "grad_norm": 7.695801734924316, + "learning_rate": 4.3046473088149866e-05, + "loss": 0.6048, "step": 343800 }, { - "epoch": 3.5, - "learning_rate": 5.395159517788766e-05, - "loss": 0.8448, + "epoch": 4.738089333443553, + "grad_norm": 14.970787048339844, + "learning_rate": 4.303986075147307e-05, + "loss": 0.7253, "step": 343900 }, { - "epoch": 3.5, - "learning_rate": 5.394760020207039e-05, - "loss": 0.8314, + "epoch": 4.739467085503293, + "grad_norm": 7.628849029541016, + "learning_rate": 4.303324709460559e-05, + "loss": 0.6739, "step": 344000 }, { - "epoch": 3.51, - "learning_rate": 5.394360405535837e-05, - "loss": 0.8651, + "epoch": 4.740844837563032, + "grad_norm": 5.008774280548096, + "learning_rate": 4.302663211810931e-05, + "loss": 0.688, "step": 344100 }, { - "epoch": 3.51, - "learning_rate": 5.393960673794701e-05, - "loss": 0.8219, + "epoch": 4.742222589622772, + "grad_norm": 21.251340866088867, + "learning_rate": 4.302001582254619e-05, + "loss": 0.5997, "step": 344200 }, { - "epoch": 3.51, - "learning_rate": 5.393560825003174e-05, - "loss": 0.9103, + "epoch": 4.7436003416825105, + "grad_norm": 4.002758026123047, + "learning_rate": 4.301339820847834e-05, + "loss": 0.6583, "step": 344300 }, { - "epoch": 3.51, - "learning_rate": 5.393160859180807e-05, - "loss": 0.777, + "epoch": 4.74497809374225, + "grad_norm": 7.359635829925537, + "learning_rate": 4.300677927646794e-05, + "loss": 0.5987, "step": 344400 }, { - "epoch": 3.51, - "learning_rate": 5.392760776347155e-05, - "loss": 0.8987, + "epoch": 4.74635584580199, + "grad_norm": 25.74188232421875, + "learning_rate": 4.300015902707731e-05, + "loss": 0.5773, "step": 344500 }, { - "epoch": 3.51, - "learning_rate": 5.39236057652178e-05, - "loss": 0.8541, + "epoch": 4.747733597861728, + "grad_norm": 9.259632110595703, + "learning_rate": 4.299353746086887e-05, + "loss": 0.6032, "step": 344600 }, { - "epoch": 3.51, - "learning_rate": 5.39196025972425e-05, - "loss": 0.9376, + "epoch": 4.749111349921468, + "grad_norm": 248.90753173828125, + "learning_rate": 4.2986914578405154e-05, + "loss": 0.6006, "step": 344700 }, { - "epoch": 3.51, - "learning_rate": 5.391559825974138e-05, - "loss": 0.8821, + "epoch": 4.750489101981207, + "grad_norm": 29.208189010620117, + "learning_rate": 4.298029038024883e-05, + "loss": 0.7067, "step": 344800 }, { - "epoch": 3.51, - "learning_rate": 5.3911592752910225e-05, - "loss": 0.9233, + "epoch": 4.751866854040947, + "grad_norm": 3.7965328693389893, + "learning_rate": 4.2973664866962616e-05, + "loss": 0.6119, "step": 344900 }, { - "epoch": 3.51, - "learning_rate": 5.390758607694488e-05, - "loss": 0.8708, + "epoch": 4.7532446061006866, + "grad_norm": 6.786052227020264, + "learning_rate": 4.296703803910942e-05, + "loss": 0.6829, "step": 345000 }, { - "epoch": 3.52, - "learning_rate": 5.390357823204126e-05, - "loss": 0.8195, + "epoch": 4.754622358160425, + "grad_norm": 9.718019485473633, + "learning_rate": 4.2960409897252224e-05, + "loss": 0.6432, "step": 345100 }, { - "epoch": 3.52, - "learning_rate": 5.38995692183953e-05, - "loss": 0.8252, + "epoch": 4.756000110220165, + "grad_norm": 15.900662422180176, + "learning_rate": 4.2953780441954105e-05, + "loss": 0.651, "step": 345200 }, { - "epoch": 3.52, - "learning_rate": 5.3895559036203036e-05, - "loss": 0.8675, + "epoch": 4.757377862279904, + "grad_norm": 5.348118305206299, + "learning_rate": 4.2947149673778275e-05, + "loss": 0.6101, "step": 345300 }, { - "epoch": 3.52, - "learning_rate": 5.389154768566053e-05, - "loss": 1.0444, + "epoch": 4.758755614339643, + "grad_norm": 5.125334739685059, + "learning_rate": 4.294051759328806e-05, + "loss": 0.6619, "step": 345400 }, { - "epoch": 3.52, - "learning_rate": 5.3887535166963934e-05, - "loss": 0.9186, + "epoch": 4.760133366399383, + "grad_norm": 11.220720291137695, + "learning_rate": 4.293388420104687e-05, + "loss": 0.6458, "step": 345500 }, { - "epoch": 3.52, - "learning_rate": 5.388352148030941e-05, - "loss": 0.9672, + "epoch": 4.761511118459122, + "grad_norm": 18.100278854370117, + "learning_rate": 4.292724949761827e-05, + "loss": 0.5987, "step": 345600 }, { - "epoch": 3.52, - "learning_rate": 5.387950662589321e-05, - "loss": 0.8277, + "epoch": 4.762888870518862, + "grad_norm": 30.092592239379883, + "learning_rate": 4.292061348356589e-05, + "loss": 0.7135, "step": 345700 }, { - "epoch": 3.52, - "learning_rate": 5.387549060391165e-05, - "loss": 0.9781, + "epoch": 4.764266622578601, + "grad_norm": 2.859104871749878, + "learning_rate": 4.291397615945351e-05, + "loss": 0.679, "step": 345800 }, { - "epoch": 3.52, - "learning_rate": 5.3871473414561086e-05, - "loss": 0.9245, + "epoch": 4.76564437463834, + "grad_norm": 2.6178338527679443, + "learning_rate": 4.2907337525844995e-05, + "loss": 0.6723, "step": 345900 }, { - "epoch": 3.53, - "learning_rate": 5.386745505803792e-05, - "loss": 1.068, + "epoch": 4.7670221266980795, + "grad_norm": 3.2841413021087646, + "learning_rate": 4.290069758330433e-05, + "loss": 0.6428, "step": 346000 }, { - "epoch": 3.53, - "learning_rate": 5.386343553453864e-05, - "loss": 0.9302, + "epoch": 4.768399878757819, + "grad_norm": 14.654231071472168, + "learning_rate": 4.289405633239563e-05, + "loss": 0.6737, "step": 346100 }, { - "epoch": 3.53, - "learning_rate": 5.385941484425976e-05, - "loss": 0.9994, + "epoch": 4.769777630817558, + "grad_norm": 3.0337975025177, + "learning_rate": 4.288741377368307e-05, + "loss": 0.5654, "step": 346200 }, { - "epoch": 3.53, - "learning_rate": 5.38553929873979e-05, - "loss": 1.0715, + "epoch": 4.771155382877297, + "grad_norm": 16.27220916748047, + "learning_rate": 4.2880769907731e-05, + "loss": 0.5557, "step": 346300 }, { - "epoch": 3.53, - "learning_rate": 5.385136996414967e-05, - "loss": 0.9614, + "epoch": 4.772533134937037, + "grad_norm": 2.7087109088897705, + "learning_rate": 4.287419119329629e-05, + "loss": 0.5715, "step": 346400 }, { - "epoch": 3.53, - "learning_rate": 5.384734577471179e-05, - "loss": 1.0344, + "epoch": 4.773910886996776, + "grad_norm": 4.579300880432129, + "learning_rate": 4.28675447276169e-05, + "loss": 0.7386, "step": 346500 }, { - "epoch": 3.53, - "learning_rate": 5.384332041928102e-05, - "loss": 0.9919, + "epoch": 4.775288639056515, + "grad_norm": 7.389023303985596, + "learning_rate": 4.2860896956385955e-05, + "loss": 0.6614, "step": 346600 }, { - "epoch": 3.53, - "learning_rate": 5.3839293898054165e-05, - "loss": 1.0086, + "epoch": 4.776666391116255, + "grad_norm": 8.012945175170898, + "learning_rate": 4.285424788016822e-05, + "loss": 0.6886, "step": 346700 }, { - "epoch": 3.53, - "learning_rate": 5.383526621122811e-05, - "loss": 1.0085, + "epoch": 4.778044143175994, + "grad_norm": 23.281654357910156, + "learning_rate": 4.284759749952858e-05, + "loss": 0.6703, "step": 346800 }, { - "epoch": 3.53, - "learning_rate": 5.383123735899977e-05, - "loss": 0.8743, + "epoch": 4.779421895235734, + "grad_norm": 4.183088302612305, + "learning_rate": 4.284094581503202e-05, + "loss": 0.7064, "step": 346900 }, { - "epoch": 3.54, - "learning_rate": 5.382720734156614e-05, - "loss": 0.8637, + "epoch": 4.780799647295472, + "grad_norm": 4.177852630615234, + "learning_rate": 4.283429282724363e-05, + "loss": 0.7442, "step": 347000 }, { - "epoch": 3.54, - "learning_rate": 5.382317615912427e-05, - "loss": 1.0592, + "epoch": 4.782177399355212, + "grad_norm": 11.401872634887695, + "learning_rate": 4.282763853672861e-05, + "loss": 0.5458, "step": 347100 }, { - "epoch": 3.54, - "learning_rate": 5.381914381187126e-05, - "loss": 0.9845, + "epoch": 4.7835551514149515, + "grad_norm": 9.291254043579102, + "learning_rate": 4.282098294405227e-05, + "loss": 0.7176, "step": 347200 }, { - "epoch": 3.54, - "learning_rate": 5.381511030000425e-05, - "loss": 0.8499, + "epoch": 4.784932903474691, + "grad_norm": 9.33425521850586, + "learning_rate": 4.2814326049780064e-05, + "loss": 0.6523, "step": 347300 }, { - "epoch": 3.54, - "learning_rate": 5.3811075623720485e-05, - "loss": 0.9557, + "epoch": 4.78631065553443, + "grad_norm": 6.8329572677612305, + "learning_rate": 4.280766785447751e-05, + "loss": 0.6582, "step": 347400 }, { - "epoch": 3.54, - "learning_rate": 5.38070397832172e-05, - "loss": 0.89, + "epoch": 4.787688407594169, + "grad_norm": 46.750179290771484, + "learning_rate": 4.2801008358710255e-05, + "loss": 0.6308, "step": 347500 }, { - "epoch": 3.54, - "learning_rate": 5.380300277869175e-05, - "loss": 0.9347, + "epoch": 4.789066159653909, + "grad_norm": 4.833880424499512, + "learning_rate": 4.2794347563044064e-05, + "loss": 0.5835, "step": 347600 }, { - "epoch": 3.54, - "learning_rate": 5.3798964610341516e-05, - "loss": 0.9437, + "epoch": 4.790443911713648, + "grad_norm": 5.339344024658203, + "learning_rate": 4.2787685468044795e-05, + "loss": 0.7058, "step": 347700 }, { - "epoch": 3.54, - "learning_rate": 5.379492527836394e-05, - "loss": 0.8372, + "epoch": 4.791821663773387, + "grad_norm": 18.253280639648438, + "learning_rate": 4.278102207427844e-05, + "loss": 0.6121, "step": 347800 }, { - "epoch": 3.54, - "learning_rate": 5.3790884782956514e-05, - "loss": 0.9771, + "epoch": 4.793199415833127, + "grad_norm": 30.998825073242188, + "learning_rate": 4.2774357382311076e-05, + "loss": 0.5985, "step": 347900 }, { - "epoch": 3.55, - "learning_rate": 5.37868431243168e-05, - "loss": 0.7926, + "epoch": 4.794577167892866, + "grad_norm": 6.988937854766846, + "learning_rate": 4.276769139270891e-05, + "loss": 0.6668, "step": 348000 }, { - "epoch": 3.55, - "learning_rate": 5.37828003026424e-05, - "loss": 0.9211, + "epoch": 4.795954919952606, + "grad_norm": 7.60188102722168, + "learning_rate": 4.2761024106038264e-05, + "loss": 0.6605, "step": 348100 }, { - "epoch": 3.55, - "learning_rate": 5.377875631813101e-05, - "loss": 0.9682, + "epoch": 4.797332672012344, + "grad_norm": 2.9623730182647705, + "learning_rate": 4.2754355522865526e-05, + "loss": 0.6786, "step": 348200 }, { - "epoch": 3.55, - "learning_rate": 5.3774711170980325e-05, - "loss": 0.9586, + "epoch": 4.798710424072084, + "grad_norm": 3.613161563873291, + "learning_rate": 4.274768564375726e-05, + "loss": 0.5727, "step": 348300 }, { - "epoch": 3.55, - "learning_rate": 5.3770664861388155e-05, - "loss": 0.925, + "epoch": 4.8000881761318235, + "grad_norm": 2.2245612144470215, + "learning_rate": 4.274101446928009e-05, + "loss": 0.6603, "step": 348400 }, { - "epoch": 3.55, - "learning_rate": 5.376661738955233e-05, - "loss": 0.8547, + "epoch": 4.801465928191563, + "grad_norm": 5.619102478027344, + "learning_rate": 4.273434200000077e-05, + "loss": 0.6145, "step": 348500 }, { - "epoch": 3.55, - "learning_rate": 5.3762568755670745e-05, - "loss": 0.945, + "epoch": 4.802843680251302, + "grad_norm": 59.182674407958984, + "learning_rate": 4.2727668236486144e-05, + "loss": 0.6255, "step": 348600 }, { - "epoch": 3.55, - "learning_rate": 5.375851895994135e-05, - "loss": 0.865, + "epoch": 4.804221432311041, + "grad_norm": 4.695216655731201, + "learning_rate": 4.2720993179303215e-05, + "loss": 0.6451, "step": 348700 }, { - "epoch": 3.55, - "learning_rate": 5.375446800256216e-05, - "loss": 0.8899, + "epoch": 4.805599184370781, + "grad_norm": 5.404073715209961, + "learning_rate": 4.271431682901903e-05, + "loss": 0.5819, "step": 348800 }, { - "epoch": 3.55, - "learning_rate": 5.3750415883731247e-05, - "loss": 0.9063, + "epoch": 4.8069769364305195, + "grad_norm": 8.825237274169922, + "learning_rate": 4.270763918620081e-05, + "loss": 0.6225, "step": 348900 }, { - "epoch": 3.56, - "learning_rate": 5.374636260364673e-05, - "loss": 0.8484, + "epoch": 4.808354688490259, + "grad_norm": 1.9902760982513428, + "learning_rate": 4.270096025141583e-05, + "loss": 0.6198, "step": 349000 }, { - "epoch": 3.56, - "learning_rate": 5.3742308162506805e-05, - "loss": 0.9326, + "epoch": 4.809732440549999, + "grad_norm": 7.778998374938965, + "learning_rate": 4.269428002523151e-05, + "loss": 0.668, "step": 349100 }, { - "epoch": 3.56, - "learning_rate": 5.3738252560509676e-05, - "loss": 0.8755, + "epoch": 4.811110192609738, + "grad_norm": 6.042855262756348, + "learning_rate": 4.268759850821537e-05, + "loss": 0.7092, "step": 349200 }, { - "epoch": 3.56, - "learning_rate": 5.3734195797853677e-05, - "loss": 0.9273, + "epoch": 4.812487944669478, + "grad_norm": 8.571781158447266, + "learning_rate": 4.2680915700935045e-05, + "loss": 0.6834, "step": 349300 }, { - "epoch": 3.56, - "learning_rate": 5.3730137874737135e-05, - "loss": 0.8898, + "epoch": 4.813865696729216, + "grad_norm": 6.674981117248535, + "learning_rate": 4.267423160395825e-05, + "loss": 0.7157, "step": 349400 }, { - "epoch": 3.56, - "learning_rate": 5.372607879135846e-05, - "loss": 0.9333, + "epoch": 4.815243448788956, + "grad_norm": 4.643914699554443, + "learning_rate": 4.266754621785286e-05, + "loss": 0.6347, "step": 349500 }, { - "epoch": 3.56, - "learning_rate": 5.372201854791614e-05, - "loss": 0.9805, + "epoch": 4.8166212008486955, + "grad_norm": 3.260669708251953, + "learning_rate": 4.2660859543186825e-05, + "loss": 0.636, "step": 349600 }, { - "epoch": 3.56, - "learning_rate": 5.3717957144608655e-05, - "loss": 0.8594, + "epoch": 4.817998952908434, + "grad_norm": 59.931602478027344, + "learning_rate": 4.2654171580528196e-05, + "loss": 0.6207, "step": 349700 }, { - "epoch": 3.56, - "learning_rate": 5.3713894581634626e-05, - "loss": 0.8958, + "epoch": 4.819376704968174, + "grad_norm": 7.203336238861084, + "learning_rate": 4.264748233044518e-05, + "loss": 0.6024, "step": 349800 }, { - "epoch": 3.56, - "learning_rate": 5.3709830859192655e-05, - "loss": 0.9918, + "epoch": 4.820754457027913, + "grad_norm": 10.81579303741455, + "learning_rate": 4.264079179350603e-05, + "loss": 0.6292, "step": 349900 }, { - "epoch": 3.57, - "learning_rate": 5.370576597748146e-05, - "loss": 0.8977, - "step": 350000 - }, - { - "epoch": 3.57, - "eval_cer": 0.15400072018086786, - "eval_loss": 0.8868646025657654, - "eval_runtime": 11067.3494, - "eval_samples_per_second": 4.943, - "eval_steps_per_second": 0.309, - "eval_wer": 0.27368996588225003, + "epoch": 4.822132209087653, + "grad_norm": 9.10781192779541, + "learning_rate": 4.2634099970279165e-05, + "loss": 0.6698, "step": 350000 }, { - "epoch": 3.57, - "learning_rate": 5.3701699936699765e-05, - "loss": 0.8122, + "epoch": 4.823509961147392, + "grad_norm": 7.323215961456299, + "learning_rate": 4.262740686133308e-05, + "loss": 0.6258, "step": 350100 }, { - "epoch": 3.57, - "learning_rate": 5.36976327370464e-05, - "loss": 0.8942, + "epoch": 4.824887713207131, + "grad_norm": 7.165981292724609, + "learning_rate": 4.2620712467236394e-05, + "loss": 0.6133, "step": 350200 }, { - "epoch": 3.57, - "learning_rate": 5.3693564378720206e-05, - "loss": 0.8784, + "epoch": 4.826265465266871, + "grad_norm": 3.829420804977417, + "learning_rate": 4.261401678855783e-05, + "loss": 0.5866, "step": 350300 }, { - "epoch": 3.57, - "learning_rate": 5.368949486192012e-05, - "loss": 0.9242, + "epoch": 4.82764321732661, + "grad_norm": 9.75688362121582, + "learning_rate": 4.260731982586621e-05, + "loss": 0.6533, "step": 350400 }, { - "epoch": 3.57, - "learning_rate": 5.368542418684509e-05, - "loss": 0.8039, + "epoch": 4.829020969386349, + "grad_norm": 4.121615409851074, + "learning_rate": 4.260068856854302e-05, + "loss": 0.5646, "step": 350500 }, { - "epoch": 3.57, - "learning_rate": 5.368135235369418e-05, - "loss": 0.8437, + "epoch": 4.830398721446088, + "grad_norm": 5.265509605407715, + "learning_rate": 4.259398905235817e-05, + "loss": 0.6644, "step": 350600 }, { - "epoch": 3.57, - "learning_rate": 5.367727936266645e-05, - "loss": 0.8817, + "epoch": 4.831776473505828, + "grad_norm": 5.581333637237549, + "learning_rate": 4.2587288253861736e-05, + "loss": 0.6661, "step": 350700 }, { - "epoch": 3.57, - "learning_rate": 5.3673205213961065e-05, - "loss": 0.8223, + "epoch": 4.8331542255655675, + "grad_norm": 6.327726364135742, + "learning_rate": 4.258058617362297e-05, + "loss": 0.6045, "step": 350800 }, { - "epoch": 3.58, - "learning_rate": 5.366912990777722e-05, - "loss": 0.6902, + "epoch": 4.834531977625306, + "grad_norm": 3.824824810028076, + "learning_rate": 4.257388281221126e-05, + "loss": 0.6226, "step": 350900 }, { - "epoch": 3.58, - "learning_rate": 5.3665053444314175e-05, - "loss": 0.8682, + "epoch": 4.835909729685046, + "grad_norm": 2.384413480758667, + "learning_rate": 4.2567178170196086e-05, + "loss": 0.6302, "step": 351000 }, { - "epoch": 3.58, - "learning_rate": 5.366097582377123e-05, - "loss": 0.8662, + "epoch": 4.837287481744785, + "grad_norm": 6.625782012939453, + "learning_rate": 4.256047224814705e-05, + "loss": 0.6129, "step": 351100 }, { - "epoch": 3.58, - "learning_rate": 5.3656897046347776e-05, - "loss": 0.8344, + "epoch": 4.838665233804525, + "grad_norm": 33.81018829345703, + "learning_rate": 4.255376504663384e-05, + "loss": 0.6418, "step": 351200 }, { - "epoch": 3.58, - "learning_rate": 5.365281711224324e-05, - "loss": 0.9177, + "epoch": 4.8400429858642635, + "grad_norm": 12.79318618774414, + "learning_rate": 4.254712365735901e-05, + "loss": 0.6311, "step": 351300 }, { - "epoch": 3.58, - "learning_rate": 5.364873602165709e-05, - "loss": 0.9335, + "epoch": 4.841420737924003, + "grad_norm": 5.097681045532227, + "learning_rate": 4.254041391140744e-05, + "loss": 0.6556, "step": 351400 }, { - "epoch": 3.58, - "learning_rate": 5.364465377478888e-05, - "loss": 0.8431, + "epoch": 4.842798489983743, + "grad_norm": 16.894330978393555, + "learning_rate": 4.253370288769575e-05, + "loss": 0.652, "step": 351500 }, { - "epoch": 3.58, - "learning_rate": 5.364057037183821e-05, - "loss": 0.7737, + "epoch": 4.844176242043482, + "grad_norm": 3.3176510334014893, + "learning_rate": 4.252699058679409e-05, + "loss": 0.554, "step": 351600 }, { - "epoch": 3.58, - "learning_rate": 5.363648581300472e-05, - "loss": 0.8538, + "epoch": 4.845553994103221, + "grad_norm": 13.652451515197754, + "learning_rate": 4.2520277009272704e-05, + "loss": 0.6247, "step": 351700 }, { - "epoch": 3.58, - "learning_rate": 5.363240009848813e-05, - "loss": 0.8297, + "epoch": 4.84693174616296, + "grad_norm": 4.763336658477783, + "learning_rate": 4.251356215570195e-05, + "loss": 0.6832, "step": 351800 }, { - "epoch": 3.59, - "learning_rate": 5.362831322848821e-05, - "loss": 0.8857, + "epoch": 4.8483094982227, + "grad_norm": 5.352202415466309, + "learning_rate": 4.2506846026652275e-05, + "loss": 0.6036, "step": 351900 }, { - "epoch": 3.59, - "learning_rate": 5.362422520320478e-05, - "loss": 0.8212, + "epoch": 4.8496872502824395, + "grad_norm": 3.9824962615966797, + "learning_rate": 4.250012862269425e-05, + "loss": 0.6436, "step": 352000 }, { - "epoch": 3.59, - "learning_rate": 5.362013602283771e-05, - "loss": 0.8241, + "epoch": 4.851065002342178, + "grad_norm": 9.790726661682129, + "learning_rate": 4.249340994439858e-05, + "loss": 0.6255, "step": 352100 }, { - "epoch": 3.59, - "learning_rate": 5.3616045687586956e-05, - "loss": 0.9376, + "epoch": 4.852442754401918, + "grad_norm": 29.39607810974121, + "learning_rate": 4.248668999233601e-05, + "loss": 0.6255, "step": 352200 }, { - "epoch": 3.59, - "learning_rate": 5.36119541976525e-05, - "loss": 0.8152, + "epoch": 4.853820506461657, + "grad_norm": 4.3125104904174805, + "learning_rate": 4.247996876707747e-05, + "loss": 0.6313, "step": 352300 }, { - "epoch": 3.59, - "learning_rate": 5.3607861553234384e-05, - "loss": 0.7562, + "epoch": 4.855198258521397, + "grad_norm": 5.499168872833252, + "learning_rate": 4.247324626919392e-05, + "loss": 0.5914, "step": 352400 }, { - "epoch": 3.59, - "learning_rate": 5.360376775453274e-05, - "loss": 0.865, + "epoch": 4.8565760105811355, + "grad_norm": 4.054204940795898, + "learning_rate": 4.246652249925652e-05, + "loss": 0.6141, "step": 352500 }, { - "epoch": 3.59, - "learning_rate": 5.3599672801747704e-05, - "loss": 0.7685, + "epoch": 4.857953762640875, + "grad_norm": 2.557570219039917, + "learning_rate": 4.2459797457836454e-05, + "loss": 0.6582, "step": 352600 }, { - "epoch": 3.59, - "learning_rate": 5.35955766950795e-05, - "loss": 0.8248, + "epoch": 4.859331514700615, + "grad_norm": 8.356891632080078, + "learning_rate": 4.2453071145505064e-05, + "loss": 0.6141, "step": 352700 }, { - "epoch": 3.59, - "learning_rate": 5.3591479434728404e-05, - "loss": 0.7922, + "epoch": 4.860709266760354, + "grad_norm": 29.28074836730957, + "learning_rate": 4.244634356283378e-05, + "loss": 0.569, "step": 352800 }, { - "epoch": 3.6, - "learning_rate": 5.3587381020894754e-05, - "loss": 0.7246, + "epoch": 4.862087018820093, + "grad_norm": 4.220701217651367, + "learning_rate": 4.243961471039415e-05, + "loss": 0.624, "step": 352900 }, { - "epoch": 3.6, - "learning_rate": 5.358328145377893e-05, - "loss": 0.9608, + "epoch": 4.863464770879832, + "grad_norm": 2.6034014225006104, + "learning_rate": 4.243288458875781e-05, + "loss": 0.6679, "step": 353000 }, { - "epoch": 3.6, - "learning_rate": 5.357918073358139e-05, - "loss": 0.7184, + "epoch": 4.864842522939572, + "grad_norm": 3.912343978881836, + "learning_rate": 4.2426153198496535e-05, + "loss": 0.6819, "step": 353100 }, { - "epoch": 3.6, - "learning_rate": 5.3575078860502615e-05, - "loss": 0.7943, + "epoch": 4.866220274999311, + "grad_norm": 7.990548610687256, + "learning_rate": 4.241942054018218e-05, + "loss": 0.6395, "step": 353200 }, { - "epoch": 3.6, - "learning_rate": 5.357097583474319e-05, - "loss": 0.8057, + "epoch": 4.86759802705905, + "grad_norm": 15.660150527954102, + "learning_rate": 4.2412686614386725e-05, + "loss": 0.6778, "step": 353300 }, { - "epoch": 3.6, - "learning_rate": 5.356687165650369e-05, - "loss": 0.8545, + "epoch": 4.86897577911879, + "grad_norm": 29.208141326904297, + "learning_rate": 4.240595142168226e-05, + "loss": 0.6645, "step": 353400 }, { - "epoch": 3.6, - "learning_rate": 5.356276632598482e-05, - "loss": 0.8106, + "epoch": 4.870353531178529, + "grad_norm": 6.159826755523682, + "learning_rate": 4.2399214962640954e-05, + "loss": 0.7131, "step": 353500 }, { - "epoch": 3.6, - "learning_rate": 5.3558659843387284e-05, - "loss": 0.7421, + "epoch": 4.871731283238269, + "grad_norm": 5.614286422729492, + "learning_rate": 4.239247723783511e-05, + "loss": 0.6363, "step": 353600 }, { - "epoch": 3.6, - "learning_rate": 5.355455220891187e-05, - "loss": 0.793, + "epoch": 4.8731090352980075, + "grad_norm": 25.2165470123291, + "learning_rate": 4.238573824783714e-05, + "loss": 0.6855, "step": 353700 }, { - "epoch": 3.6, - "learning_rate": 5.355044342275943e-05, - "loss": 0.8183, + "epoch": 4.874486787357747, + "grad_norm": 4.608912944793701, + "learning_rate": 4.237906540202371e-05, + "loss": 0.7223, "step": 353800 }, { - "epoch": 3.61, - "learning_rate": 5.3546333485130846e-05, - "loss": 0.7912, + "epoch": 4.875864539417487, + "grad_norm": 2.840367555618286, + "learning_rate": 4.2372323895996754e-05, + "loss": 0.5955, "step": 353900 }, { - "epoch": 3.61, - "learning_rate": 5.354222239622706e-05, - "loss": 0.7747, + "epoch": 4.877242291477225, + "grad_norm": 9.986611366271973, + "learning_rate": 4.23655811264898e-05, + "loss": 0.6123, "step": 354000 }, { - "epoch": 3.61, - "learning_rate": 5.35381101562491e-05, - "loss": 0.7183, + "epoch": 4.878620043536965, + "grad_norm": 9.702980995178223, + "learning_rate": 4.2358837094075666e-05, + "loss": 0.653, "step": 354100 }, { - "epoch": 3.61, - "learning_rate": 5.353399676539802e-05, - "loss": 0.7398, + "epoch": 4.879997795596704, + "grad_norm": 5.322096824645996, + "learning_rate": 4.235209179932732e-05, + "loss": 0.6583, "step": 354200 }, { - "epoch": 3.61, - "learning_rate": 5.352988222387494e-05, - "loss": 0.7905, + "epoch": 4.881375547656444, + "grad_norm": 3.6570732593536377, + "learning_rate": 4.234534524281778e-05, + "loss": 0.5954, "step": 354300 }, { - "epoch": 3.61, - "learning_rate": 5.352576653188105e-05, - "loss": 0.7846, + "epoch": 4.8827532997161835, + "grad_norm": 19.012914657592773, + "learning_rate": 4.233859742512022e-05, + "loss": 0.6734, "step": 354400 }, { - "epoch": 3.61, - "learning_rate": 5.352164968961756e-05, - "loss": 0.7737, + "epoch": 4.884131051775922, + "grad_norm": 7.837798595428467, + "learning_rate": 4.2331848346807894e-05, + "loss": 0.6474, "step": 354500 }, { - "epoch": 3.61, - "learning_rate": 5.351753169728578e-05, - "loss": 0.6956, + "epoch": 4.885508803835662, + "grad_norm": 2.2519407272338867, + "learning_rate": 4.232509800845417e-05, + "loss": 0.6591, "step": 354600 }, { - "epoch": 3.61, - "learning_rate": 5.351341255508703e-05, - "loss": 0.8619, + "epoch": 4.886886555895401, + "grad_norm": 7.41689395904541, + "learning_rate": 4.231834641063253e-05, + "loss": 0.6365, "step": 354700 }, { - "epoch": 3.61, - "learning_rate": 5.3509292263222736e-05, - "loss": 0.733, + "epoch": 4.88826430795514, + "grad_norm": 11.385661125183105, + "learning_rate": 4.231159355391655e-05, + "loss": 0.6042, "step": 354800 }, { - "epoch": 3.62, - "learning_rate": 5.350517082189434e-05, - "loss": 0.8307, + "epoch": 4.8896420600148796, + "grad_norm": 11.584607124328613, + "learning_rate": 4.230483943887991e-05, + "loss": 0.6433, "step": 354900 }, { - "epoch": 3.62, - "learning_rate": 5.3501048231303366e-05, - "loss": 0.9157, + "epoch": 4.891019812074619, + "grad_norm": 5.1976189613342285, + "learning_rate": 4.229808406609644e-05, + "loss": 0.7044, "step": 355000 }, { - "epoch": 3.62, - "learning_rate": 5.349692449165138e-05, - "loss": 0.8511, + "epoch": 4.892397564134359, + "grad_norm": 7.088685035705566, + "learning_rate": 4.229132743614e-05, + "loss": 0.5413, "step": 355100 }, { - "epoch": 3.62, - "learning_rate": 5.3492799603140016e-05, - "loss": 0.784, + "epoch": 4.893775316194097, + "grad_norm": 288.20904541015625, + "learning_rate": 4.2284569549584636e-05, + "loss": 0.6139, "step": 355200 }, { - "epoch": 3.62, - "learning_rate": 5.3488673565970943e-05, - "loss": 0.6809, + "epoch": 4.895153068253837, + "grad_norm": 4.635827541351318, + "learning_rate": 4.227781040700445e-05, + "loss": 0.7333, "step": 355300 }, { - "epoch": 3.62, - "learning_rate": 5.34845463803459e-05, - "loss": 0.8163, + "epoch": 4.8965308203135764, + "grad_norm": 15.920924186706543, + "learning_rate": 4.227105000897367e-05, + "loss": 0.5883, "step": 355400 }, { - "epoch": 3.62, - "learning_rate": 5.3480418046466696e-05, - "loss": 0.7569, + "epoch": 4.897908572373316, + "grad_norm": 6.5138421058654785, + "learning_rate": 4.226428835606662e-05, + "loss": 0.6135, "step": 355500 }, { - "epoch": 3.62, - "learning_rate": 5.3476288564535176e-05, - "loss": 0.8531, + "epoch": 4.899286324433055, + "grad_norm": 4.578680038452148, + "learning_rate": 4.2257525448857736e-05, + "loss": 0.6835, "step": 355600 }, { - "epoch": 3.62, - "learning_rate": 5.347215793475324e-05, - "loss": 0.6863, + "epoch": 4.900664076492794, + "grad_norm": 10.249539375305176, + "learning_rate": 4.225076128792157e-05, + "loss": 0.7169, "step": 355700 }, { - "epoch": 3.62, - "learning_rate": 5.346802615732285e-05, - "loss": 0.8009, + "epoch": 4.902041828552534, + "grad_norm": 1.6489287614822388, + "learning_rate": 4.224399587383277e-05, + "loss": 0.6193, "step": 355800 }, { - "epoch": 3.63, - "learning_rate": 5.346389323244603e-05, - "loss": 0.782, + "epoch": 4.903419580612273, + "grad_norm": 6.4056291580200195, + "learning_rate": 4.223722920716609e-05, + "loss": 0.6761, "step": 355900 }, { - "epoch": 3.63, - "learning_rate": 5.3459759160324865e-05, - "loss": 0.809, + "epoch": 4.904797332672012, + "grad_norm": 5.431517124176025, + "learning_rate": 4.223046128849639e-05, + "loss": 0.6166, "step": 356000 }, { - "epoch": 3.63, - "learning_rate": 5.3455665299030306e-05, - "loss": 0.8036, + "epoch": 4.906175084731752, + "grad_norm": 5.034150123596191, + "learning_rate": 4.222369211839865e-05, + "loss": 0.6374, "step": 356100 }, { - "epoch": 3.63, - "learning_rate": 5.3451528944494284e-05, - "loss": 0.849, + "epoch": 4.907552836791491, + "grad_norm": 4.222480297088623, + "learning_rate": 4.2216921697447935e-05, + "loss": 0.6185, "step": 356200 }, { - "epoch": 3.63, - "learning_rate": 5.344739144331844e-05, - "loss": 0.6932, + "epoch": 4.908930588851231, + "grad_norm": 8.255706787109375, + "learning_rate": 4.22102177491187e-05, + "loss": 0.62, "step": 356300 }, { - "epoch": 3.63, - "learning_rate": 5.344325279570509e-05, - "loss": 0.774, + "epoch": 4.910308340910969, + "grad_norm": 13.287421226501465, + "learning_rate": 4.220344484068188e-05, + "loss": 0.617, "step": 356400 }, { - "epoch": 3.63, - "learning_rate": 5.3439113001856564e-05, - "loss": 0.721, + "epoch": 4.911686092970709, + "grad_norm": 8.075258255004883, + "learning_rate": 4.219667068311218e-05, + "loss": 0.6516, "step": 356500 }, { - "epoch": 3.63, - "learning_rate": 5.34349720619753e-05, - "loss": 0.7001, + "epoch": 4.9130638450304485, + "grad_norm": 4.107531547546387, + "learning_rate": 4.218989527698513e-05, + "loss": 0.6652, "step": 356600 }, { - "epoch": 3.63, - "learning_rate": 5.3430829976263764e-05, - "loss": 0.8254, + "epoch": 4.914441597090188, + "grad_norm": 8.637262344360352, + "learning_rate": 4.2183118622876325e-05, + "loss": 0.5962, "step": 356700 }, { - "epoch": 3.64, - "learning_rate": 5.342668674492446e-05, - "loss": 0.7804, + "epoch": 4.915819349149927, + "grad_norm": 18.10711669921875, + "learning_rate": 4.217634072136146e-05, + "loss": 0.5902, "step": 356800 }, { - "epoch": 3.64, - "learning_rate": 5.342254236815998e-05, - "loss": 0.808, + "epoch": 4.917197101209666, + "grad_norm": 3.852318048477173, + "learning_rate": 4.2169629370669745e-05, + "loss": 0.6784, "step": 356900 }, { - "epoch": 3.64, - "learning_rate": 5.341839684617295e-05, - "loss": 0.7551, + "epoch": 4.918574853269406, + "grad_norm": 2.2979257106781006, + "learning_rate": 4.216284898853005e-05, + "loss": 0.5952, "step": 357000 }, { - "epoch": 3.64, - "learning_rate": 5.3414250179166074e-05, - "loss": 0.8123, + "epoch": 4.919952605329145, + "grad_norm": 11.620049476623535, + "learning_rate": 4.215606736070633e-05, + "loss": 0.616, "step": 357100 }, { - "epoch": 3.64, - "learning_rate": 5.341010236734209e-05, - "loss": 0.786, + "epoch": 4.921330357388884, + "grad_norm": 5.2932515144348145, + "learning_rate": 4.214928448777469e-05, + "loss": 0.6233, "step": 357200 }, { - "epoch": 3.64, - "learning_rate": 5.340595341090381e-05, - "loss": 0.7171, + "epoch": 4.922708109448624, + "grad_norm": 5.9200849533081055, + "learning_rate": 4.21425003703114e-05, + "loss": 0.6537, "step": 357300 }, { - "epoch": 3.64, - "learning_rate": 5.3401803310054075e-05, - "loss": 0.8, + "epoch": 4.924085861508363, + "grad_norm": 13.241439819335938, + "learning_rate": 4.2135715008892773e-05, + "loss": 0.7043, "step": 357400 }, { - "epoch": 3.64, - "learning_rate": 5.339765206499583e-05, - "loss": 0.8735, + "epoch": 4.925463613568102, + "grad_norm": 24.02862548828125, + "learning_rate": 4.21289284040953e-05, + "loss": 0.541, "step": 357500 }, { - "epoch": 3.64, - "learning_rate": 5.339349967593202e-05, - "loss": 0.7045, + "epoch": 4.926841365627841, + "grad_norm": 2.1290619373321533, + "learning_rate": 4.212214055649551e-05, + "loss": 0.5816, "step": 357600 }, { - "epoch": 3.64, - "learning_rate": 5.338934614306568e-05, - "loss": 0.7032, + "epoch": 4.928219117687581, + "grad_norm": 5.67216682434082, + "learning_rate": 4.211535146667008e-05, + "loss": 0.6459, "step": 357700 }, { - "epoch": 3.65, - "learning_rate": 5.33851914665999e-05, - "loss": 0.7909, + "epoch": 4.9295968697473205, + "grad_norm": 33.091854095458984, + "learning_rate": 4.210856113519577e-05, + "loss": 0.6935, "step": 357800 }, { - "epoch": 3.65, - "learning_rate": 5.338103564673781e-05, - "loss": 0.7648, + "epoch": 4.93097462180706, + "grad_norm": 6.914583206176758, + "learning_rate": 4.210176956264945e-05, + "loss": 0.6591, "step": 357900 }, { - "epoch": 3.65, - "learning_rate": 5.3376878683682596e-05, - "loss": 0.7387, + "epoch": 4.932352373866799, + "grad_norm": 17.032636642456055, + "learning_rate": 4.2094976749608096e-05, + "loss": 0.6472, "step": 358000 }, { - "epoch": 3.65, - "learning_rate": 5.337272057763754e-05, - "loss": 0.8141, + "epoch": 4.933730125926538, + "grad_norm": 2.5963149070739746, + "learning_rate": 4.208818269664881e-05, + "loss": 0.6091, "step": 358100 }, { - "epoch": 3.65, - "learning_rate": 5.3368561328805914e-05, - "loss": 0.6764, + "epoch": 4.935107877986278, + "grad_norm": 9.638187408447266, + "learning_rate": 4.2081387404348766e-05, + "loss": 0.656, "step": 358200 }, { - "epoch": 3.65, - "learning_rate": 5.33644009373911e-05, - "loss": 0.7655, + "epoch": 4.9364856300460165, + "grad_norm": 12.491456031799316, + "learning_rate": 4.2074590873285274e-05, + "loss": 0.6281, "step": 358300 }, { - "epoch": 3.65, - "learning_rate": 5.336023940359652e-05, - "loss": 0.8875, + "epoch": 4.937863382105756, + "grad_norm": 3.607346534729004, + "learning_rate": 4.206779310403572e-05, + "loss": 0.686, "step": 358400 }, { - "epoch": 3.65, - "learning_rate": 5.3356076727625625e-05, - "loss": 0.7552, + "epoch": 4.939241134165496, + "grad_norm": 23.04509735107422, + "learning_rate": 4.20609940971776e-05, + "loss": 0.6462, "step": 358500 }, { - "epoch": 3.65, - "learning_rate": 5.335191290968197e-05, - "loss": 0.7859, + "epoch": 4.940618886225235, + "grad_norm": 10.33632755279541, + "learning_rate": 4.2054193853288566e-05, + "loss": 0.6242, "step": 358600 }, { - "epoch": 3.65, - "learning_rate": 5.334774794996913e-05, - "loss": 0.7241, + "epoch": 4.941996638284975, + "grad_norm": 2.7325823307037354, + "learning_rate": 4.2047392372946274e-05, + "loss": 0.6599, "step": 358700 }, { - "epoch": 3.66, - "learning_rate": 5.334358184869074e-05, - "loss": 0.7683, + "epoch": 4.943374390344713, + "grad_norm": 4.85376501083374, + "learning_rate": 4.204058965672859e-05, + "loss": 0.6081, "step": 358800 }, { - "epoch": 3.66, - "learning_rate": 5.333945628412599e-05, - "loss": 0.729, + "epoch": 4.944752142404453, + "grad_norm": 58.063602447509766, + "learning_rate": 4.203378570521344e-05, + "loss": 0.5858, "step": 358900 }, { - "epoch": 3.66, - "learning_rate": 5.333528791173823e-05, - "loss": 0.6936, + "epoch": 4.9461298944641925, + "grad_norm": 23.585988998413086, + "learning_rate": 4.2026980518978816e-05, + "loss": 0.5981, "step": 359000 }, { - "epoch": 3.66, - "learning_rate": 5.3331118398394165e-05, - "loss": 0.7482, + "epoch": 4.947507646523931, + "grad_norm": 7.902803897857666, + "learning_rate": 4.202017409860289e-05, + "loss": 0.7191, "step": 359100 }, { - "epoch": 3.66, - "learning_rate": 5.332694774429763e-05, - "loss": 0.7116, + "epoch": 4.948885398583671, + "grad_norm": 5.040465831756592, + "learning_rate": 4.2013366444663885e-05, + "loss": 0.6359, "step": 359200 }, { - "epoch": 3.66, - "learning_rate": 5.3322775949652577e-05, - "loss": 0.78, + "epoch": 4.95026315064341, + "grad_norm": 21.595529556274414, + "learning_rate": 4.2006557557740155e-05, + "loss": 0.6671, "step": 359300 }, { - "epoch": 3.66, - "learning_rate": 5.331860301466295e-05, - "loss": 0.7512, + "epoch": 4.95164090270315, + "grad_norm": 4.803142547607422, + "learning_rate": 4.199974743841015e-05, + "loss": 0.6007, "step": 359400 }, { - "epoch": 3.66, - "learning_rate": 5.331442893953279e-05, - "loss": 0.7099, + "epoch": 4.9530186547628885, + "grad_norm": 6.893428325653076, + "learning_rate": 4.199293608725241e-05, + "loss": 0.5709, "step": 359500 }, { - "epoch": 3.66, - "learning_rate": 5.33102537244662e-05, - "loss": 0.7141, + "epoch": 4.954396406822628, + "grad_norm": 7.387838363647461, + "learning_rate": 4.1986123504845606e-05, + "loss": 0.537, "step": 359600 }, { - "epoch": 3.66, - "learning_rate": 5.33060773696673e-05, - "loss": 0.7281, + "epoch": 4.955774158882368, + "grad_norm": 6.249050617218018, + "learning_rate": 4.197930969176849e-05, + "loss": 0.5397, "step": 359700 }, { - "epoch": 3.67, - "learning_rate": 5.33018998753403e-05, - "loss": 0.7264, + "epoch": 4.957151910942107, + "grad_norm": 18.658788681030273, + "learning_rate": 4.1972494648599964e-05, + "loss": 0.6408, "step": 359800 }, { - "epoch": 3.67, - "learning_rate": 5.329772124168947e-05, - "loss": 0.7272, + "epoch": 4.958529663001846, + "grad_norm": 2.8422133922576904, + "learning_rate": 4.196567837591896e-05, + "loss": 0.7516, "step": 359900 }, { - "epoch": 3.67, - "learning_rate": 5.329354146891909e-05, - "loss": 0.8464, + "epoch": 4.959907415061585, + "grad_norm": 4.506580829620361, + "learning_rate": 4.1958860874304575e-05, + "loss": 0.6432, "step": 360000 }, { - "epoch": 3.67, - "learning_rate": 5.3289360557233535e-05, - "loss": 0.7398, + "epoch": 4.961285167121325, + "grad_norm": 5.355544567108154, + "learning_rate": 4.195204214433599e-05, + "loss": 0.6113, "step": 360100 }, { - "epoch": 3.67, - "learning_rate": 5.3285178506837244e-05, - "loss": 0.7651, + "epoch": 4.9626629191810645, + "grad_norm": 4.7043776512146, + "learning_rate": 4.194522218659249e-05, + "loss": 0.5758, "step": 360200 }, { - "epoch": 3.67, - "learning_rate": 5.3280995317934684e-05, - "loss": 0.6942, + "epoch": 4.964040671240803, + "grad_norm": 6.156986713409424, + "learning_rate": 4.193840100165345e-05, + "loss": 0.5164, "step": 360300 }, { - "epoch": 3.67, - "learning_rate": 5.327681099073038e-05, - "loss": 0.7001, + "epoch": 4.965418423300543, + "grad_norm": 52.45740509033203, + "learning_rate": 4.1931578590098395e-05, + "loss": 0.6351, "step": 360400 }, { - "epoch": 3.67, - "learning_rate": 5.3272625525428934e-05, - "loss": 0.8536, + "epoch": 4.966796175360282, + "grad_norm": 3.1738970279693604, + "learning_rate": 4.19247549525069e-05, + "loss": 0.6273, "step": 360500 }, { - "epoch": 3.67, - "learning_rate": 5.326843892223498e-05, - "loss": 0.8733, + "epoch": 4.968173927420022, + "grad_norm": 16.057491302490234, + "learning_rate": 4.191793008945868e-05, + "loss": 0.6193, "step": 360600 }, { - "epoch": 3.67, - "learning_rate": 5.326425118135322e-05, - "loss": 0.7469, + "epoch": 4.9695516794797605, + "grad_norm": 4.419029235839844, + "learning_rate": 4.191110400153353e-05, + "loss": 0.6523, "step": 360700 }, { - "epoch": 3.68, - "learning_rate": 5.326006230298842e-05, - "loss": 0.7395, + "epoch": 4.9709294315395, + "grad_norm": 37.11264419555664, + "learning_rate": 4.190427668931138e-05, + "loss": 0.6745, "step": 360800 }, { - "epoch": 3.68, - "learning_rate": 5.325587228734537e-05, - "loss": 0.7114, + "epoch": 4.97230718359924, + "grad_norm": 5.937665939331055, + "learning_rate": 4.189744815337223e-05, + "loss": 0.6945, "step": 360900 }, { - "epoch": 3.68, - "learning_rate": 5.325168113462895e-05, - "loss": 0.6886, + "epoch": 4.973684935658979, + "grad_norm": 29.668067932128906, + "learning_rate": 4.1890618394296186e-05, + "loss": 0.5732, "step": 361000 }, { - "epoch": 3.68, - "learning_rate": 5.324748884504409e-05, - "loss": 0.7968, + "epoch": 4.975062687718718, + "grad_norm": 3.8287365436553955, + "learning_rate": 4.1883787412663506e-05, + "loss": 0.7104, "step": 361100 }, { - "epoch": 3.68, - "learning_rate": 5.3243295418795754e-05, - "loss": 0.6924, + "epoch": 4.976440439778457, + "grad_norm": 6.143915176391602, + "learning_rate": 4.18769552090545e-05, + "loss": 0.6377, "step": 361200 }, { - "epoch": 3.68, - "learning_rate": 5.3239142807340846e-05, - "loss": 0.6633, + "epoch": 4.977818191838197, + "grad_norm": 16.557329177856445, + "learning_rate": 4.187012178404958e-05, + "loss": 0.6204, "step": 361300 }, { - "epoch": 3.68, - "learning_rate": 5.323494711974224e-05, - "loss": 0.7264, + "epoch": 4.9791959438979365, + "grad_norm": 2.3030545711517334, + "learning_rate": 4.186328713822931e-05, + "loss": 0.6939, "step": 361400 }, { - "epoch": 3.68, - "learning_rate": 5.323075029609338e-05, - "loss": 0.7324, + "epoch": 4.980573695957675, + "grad_norm": 2.297210216522217, + "learning_rate": 4.185645127217431e-05, + "loss": 0.6302, "step": 361500 }, { - "epoch": 3.68, - "learning_rate": 5.3226552336599474e-05, - "loss": 0.7322, + "epoch": 4.981951448017415, + "grad_norm": 7.1969194412231445, + "learning_rate": 4.1849614186465337e-05, + "loss": 0.6338, "step": 361600 }, { - "epoch": 3.69, - "learning_rate": 5.3222353241465754e-05, - "loss": 0.8226, + "epoch": 4.983329200077154, + "grad_norm": 2.900331735610962, + "learning_rate": 4.184277588168321e-05, + "loss": 0.5577, "step": 361700 }, { - "epoch": 3.69, - "learning_rate": 5.3218153010897556e-05, - "loss": 0.7336, + "epoch": 4.984706952136893, + "grad_norm": 3.4706006050109863, + "learning_rate": 4.18359363584089e-05, + "loss": 0.611, "step": 361800 }, { - "epoch": 3.69, - "learning_rate": 5.3213951645100227e-05, - "loss": 0.6882, + "epoch": 4.9860847041966325, + "grad_norm": 8.389727592468262, + "learning_rate": 4.182909561722347e-05, + "loss": 0.611, "step": 361900 }, { - "epoch": 3.69, - "learning_rate": 5.3209749144279195e-05, - "loss": 0.705, + "epoch": 4.987462456256372, + "grad_norm": 17.859580993652344, + "learning_rate": 4.1822253658708045e-05, + "loss": 0.5764, "step": 362000 }, { - "epoch": 3.69, - "learning_rate": 5.320554550863994e-05, - "loss": 0.689, + "epoch": 4.988840208316112, + "grad_norm": 5.879579544067383, + "learning_rate": 4.1815410483443906e-05, + "loss": 0.6728, "step": 362100 }, { - "epoch": 3.69, - "learning_rate": 5.320134073838799e-05, - "loss": 0.8387, + "epoch": 4.990217960375851, + "grad_norm": 17.218673706054688, + "learning_rate": 4.180856609201241e-05, + "loss": 0.586, "step": 362200 }, { - "epoch": 3.69, - "learning_rate": 5.319713483372894e-05, - "loss": 0.7244, + "epoch": 4.99159571243559, + "grad_norm": 3.2508254051208496, + "learning_rate": 4.180172048499501e-05, + "loss": 0.6029, "step": 362300 }, { - "epoch": 3.69, - "learning_rate": 5.3192927794868434e-05, - "loss": 0.7248, + "epoch": 4.992973464495329, + "grad_norm": 10.402902603149414, + "learning_rate": 4.179487366297329e-05, + "loss": 0.672, "step": 362400 }, { - "epoch": 3.69, - "learning_rate": 5.3188719622012167e-05, - "loss": 0.6811, + "epoch": 4.994351216555069, + "grad_norm": 1.7490906715393066, + "learning_rate": 4.1788025626528935e-05, + "loss": 0.5079, "step": 362500 }, { - "epoch": 3.69, - "learning_rate": 5.318451031536589e-05, - "loss": 0.7096, + "epoch": 4.995728968614808, + "grad_norm": 4.2720794677734375, + "learning_rate": 4.17811763762437e-05, + "loss": 0.7242, "step": 362600 }, { - "epoch": 3.7, - "learning_rate": 5.318029987513541e-05, - "loss": 0.762, + "epoch": 4.997106720674547, + "grad_norm": 12.797200202941895, + "learning_rate": 4.177432591269947e-05, + "loss": 0.6602, "step": 362700 }, { - "epoch": 3.7, - "learning_rate": 5.317608830152661e-05, - "loss": 0.6746, + "epoch": 4.998484472734287, + "grad_norm": 4.103048324584961, + "learning_rate": 4.176747423647822e-05, + "loss": 0.6779, "step": 362800 }, { - "epoch": 3.7, - "learning_rate": 5.3171875594745387e-05, - "loss": 0.7081, + "epoch": 4.999862224794026, + "grad_norm": 5.677615642547607, + "learning_rate": 4.176062134816204e-05, + "loss": 0.5889, "step": 362900 }, { - "epoch": 3.7, - "learning_rate": 5.3167661754997745e-05, - "loss": 0.7599, + "epoch": 5.001239976853765, + "grad_norm": 11.1349458694458, + "learning_rate": 4.1753767248333136e-05, + "loss": 0.5987, "step": 363000 }, { - "epoch": 3.7, - "learning_rate": 5.3163446782489686e-05, - "loss": 0.8074, + "epoch": 5.0026177289135045, + "grad_norm": 28.439786911010742, + "learning_rate": 4.1746911937573764e-05, + "loss": 0.6101, "step": 363100 }, { - "epoch": 3.7, - "learning_rate": 5.3159230677427325e-05, - "loss": 0.6664, + "epoch": 5.003995480973244, + "grad_norm": 22.641624450683594, + "learning_rate": 4.174005541646635e-05, + "loss": 0.5408, "step": 363200 }, { - "epoch": 3.7, - "learning_rate": 5.315501344001678e-05, - "loss": 0.618, + "epoch": 5.005373233032984, + "grad_norm": 11.364294052124023, + "learning_rate": 4.173319768559337e-05, + "loss": 0.575, "step": 363300 }, { - "epoch": 3.7, - "learning_rate": 5.315083725976322e-05, - "loss": 0.863, + "epoch": 5.006750985092722, + "grad_norm": 2.4644153118133545, + "learning_rate": 4.172633874553744e-05, + "loss": 0.5785, "step": 363400 }, { - "epoch": 3.7, - "learning_rate": 5.3146617769593316e-05, - "loss": 0.7698, + "epoch": 5.008128737152462, + "grad_norm": 7.052544116973877, + "learning_rate": 4.1719478596881234e-05, + "loss": 0.6256, "step": 363500 }, { - "epoch": 3.7, - "learning_rate": 5.3142397147691944e-05, - "loss": 0.7323, + "epoch": 5.009506489212201, + "grad_norm": 2.3804948329925537, + "learning_rate": 4.1712617240207584e-05, + "loss": 0.5669, "step": 363600 }, { - "epoch": 3.71, - "learning_rate": 5.3138217617400085e-05, - "loss": 0.7784, + "epoch": 5.010884241271941, + "grad_norm": 6.756119251251221, + "learning_rate": 4.170575467609939e-05, + "loss": 0.6099, "step": 363700 }, { - "epoch": 3.71, - "learning_rate": 5.3133994743967056e-05, - "loss": 0.7617, + "epoch": 5.01226199333168, + "grad_norm": 1.881215214729309, + "learning_rate": 4.1698890905139666e-05, + "loss": 0.5922, "step": 363800 }, { - "epoch": 3.71, - "learning_rate": 5.3129770739419755e-05, - "loss": 0.8117, + "epoch": 5.013639745391419, + "grad_norm": 38.156002044677734, + "learning_rate": 4.16920945836529e-05, + "loss": 0.6207, "step": 363900 }, { - "epoch": 3.71, - "learning_rate": 5.312554560396468e-05, - "loss": 0.7493, + "epoch": 5.015017497451159, + "grad_norm": 3.037376880645752, + "learning_rate": 4.16852284127935e-05, + "loss": 0.6101, "step": 364000 }, { - "epoch": 3.71, - "learning_rate": 5.31213616060663e-05, - "loss": 0.8009, + "epoch": 5.016395249510898, + "grad_norm": 5.417888164520264, + "learning_rate": 4.1678361036826374e-05, + "loss": 0.6517, "step": 364100 }, { - "epoch": 3.71, - "learning_rate": 5.311713422071944e-05, - "loss": 0.8524, + "epoch": 5.017773001570637, + "grad_norm": 82.02448272705078, + "learning_rate": 4.167149245633494e-05, + "loss": 0.5222, "step": 364200 }, { - "epoch": 3.71, - "learning_rate": 5.311290570508268e-05, - "loss": 0.7762, + "epoch": 5.0191507536303765, + "grad_norm": 6.891456127166748, + "learning_rate": 4.1664622671902734e-05, + "loss": 0.6018, "step": 364300 }, { - "epoch": 3.71, - "learning_rate": 5.310867605936276e-05, - "loss": 0.7328, + "epoch": 5.020528505690116, + "grad_norm": 17.33257293701172, + "learning_rate": 4.165775168411336e-05, + "loss": 0.57, "step": 364400 }, { - "epoch": 3.71, - "learning_rate": 5.310444528376649e-05, - "loss": 0.8129, + "epoch": 5.021906257749856, + "grad_norm": 12.862934112548828, + "learning_rate": 4.165087949355055e-05, + "loss": 0.6209, "step": 364500 }, { - "epoch": 3.71, - "learning_rate": 5.3100213378500724e-05, - "loss": 0.6807, + "epoch": 5.023284009809594, + "grad_norm": 8.050983428955078, + "learning_rate": 4.164400610079812e-05, + "loss": 0.5868, "step": 364600 }, { - "epoch": 3.72, - "learning_rate": 5.309598034377237e-05, - "loss": 0.7141, + "epoch": 5.024661761869334, + "grad_norm": 10.230002403259277, + "learning_rate": 4.163713150644002e-05, + "loss": 0.5999, "step": 364700 }, { - "epoch": 3.72, - "learning_rate": 5.309174617978841e-05, - "loss": 0.6923, + "epoch": 5.026039513929073, + "grad_norm": 5.717103004455566, + "learning_rate": 4.163025571106028e-05, + "loss": 0.5354, "step": 364800 }, { - "epoch": 3.72, - "learning_rate": 5.308751088675587e-05, - "loss": 0.7448, + "epoch": 5.027417265988813, + "grad_norm": 4.582232475280762, + "learning_rate": 4.162337871524302e-05, + "loss": 0.6353, "step": 364900 }, { - "epoch": 3.72, - "learning_rate": 5.308327446488183e-05, - "loss": 0.7355, + "epoch": 5.028795018048552, + "grad_norm": 10.818204879760742, + "learning_rate": 4.161650051957249e-05, + "loss": 0.604, "step": 365000 }, { - "epoch": 3.72, - "learning_rate": 5.3079036914373425e-05, - "loss": 0.8136, + "epoch": 5.030172770108291, + "grad_norm": 3.3800549507141113, + "learning_rate": 4.160962112463302e-05, + "loss": 0.5648, "step": 365100 }, { - "epoch": 3.72, - "learning_rate": 5.307479823543783e-05, - "loss": 0.8348, + "epoch": 5.031550522168031, + "grad_norm": 8.884862899780273, + "learning_rate": 4.160274053100904e-05, + "loss": 0.5043, "step": 365200 }, { - "epoch": 3.72, - "learning_rate": 5.307055842828231e-05, - "loss": 0.7547, + "epoch": 5.03292827422777, + "grad_norm": 7.920313835144043, + "learning_rate": 4.159585873928511e-05, + "loss": 0.5536, "step": 365300 }, { - "epoch": 3.72, - "learning_rate": 5.3066317493114166e-05, - "loss": 0.7775, + "epoch": 5.034306026287509, + "grad_norm": 4.136593341827393, + "learning_rate": 4.158897575004587e-05, + "loss": 0.5865, "step": 365400 }, { - "epoch": 3.72, - "learning_rate": 5.3062075430140745e-05, - "loss": 0.8526, + "epoch": 5.035683778347249, + "grad_norm": 5.959545612335205, + "learning_rate": 4.1582091563876045e-05, + "loss": 0.5471, "step": 365500 }, { - "epoch": 3.72, - "learning_rate": 5.305783223956946e-05, - "loss": 0.805, + "epoch": 5.037061530406988, + "grad_norm": 6.247141361236572, + "learning_rate": 4.157520618136052e-05, + "loss": 0.6011, "step": 365600 }, { - "epoch": 3.73, - "learning_rate": 5.305358792160778e-05, - "loss": 0.8505, + "epoch": 5.038439282466728, + "grad_norm": 2.8242380619049072, + "learning_rate": 4.156831960308421e-05, + "loss": 0.5757, "step": 365700 }, { - "epoch": 3.73, - "learning_rate": 5.3049342476463245e-05, - "loss": 0.8073, + "epoch": 5.039817034526466, + "grad_norm": 2.302687168121338, + "learning_rate": 4.1561431829632176e-05, + "loss": 0.6191, "step": 365800 }, { - "epoch": 3.73, - "learning_rate": 5.30450959043434e-05, - "loss": 0.8105, + "epoch": 5.041194786586206, + "grad_norm": 10.013916015625, + "learning_rate": 4.155454286158957e-05, + "loss": 0.5053, "step": 365900 }, { - "epoch": 3.73, - "learning_rate": 5.304084820545588e-05, - "loss": 0.8095, + "epoch": 5.0425725386459455, + "grad_norm": 2.516279697418213, + "learning_rate": 4.154765269954165e-05, + "loss": 0.7116, "step": 366000 }, { - "epoch": 3.73, - "learning_rate": 5.30365993800084e-05, - "loss": 0.9612, + "epoch": 5.043950290705685, + "grad_norm": 5.0328288078308105, + "learning_rate": 4.1540761344073776e-05, + "loss": 0.5504, "step": 366100 }, { - "epoch": 3.73, - "learning_rate": 5.3032391933301436e-05, - "loss": 0.8482, + "epoch": 5.045328042765424, + "grad_norm": 1.6958752870559692, + "learning_rate": 4.153386879577138e-05, + "loss": 0.5612, "step": 366200 }, { - "epoch": 3.73, - "learning_rate": 5.30281408666177e-05, - "loss": 0.8228, + "epoch": 5.046705794825163, + "grad_norm": 9.392889022827148, + "learning_rate": 4.152697505522003e-05, + "loss": 0.5702, "step": 366300 }, { - "epoch": 3.73, - "learning_rate": 5.302388867399529e-05, - "loss": 0.8144, + "epoch": 5.048083546884903, + "grad_norm": 5.647604465484619, + "learning_rate": 4.1520080123005394e-05, + "loss": 0.6137, "step": 366400 }, { - "epoch": 3.73, - "learning_rate": 5.301963535564213e-05, - "loss": 0.8243, + "epoch": 5.049461298944642, + "grad_norm": 18.64179039001465, + "learning_rate": 4.151318399971322e-05, + "loss": 0.6157, "step": 366500 }, { - "epoch": 3.73, - "learning_rate": 5.301538091176617e-05, - "loss": 0.8503, + "epoch": 5.050839051004381, + "grad_norm": 57.31591796875, + "learning_rate": 4.150635566495823e-05, + "loss": 0.552, "step": 366600 }, { - "epoch": 3.74, - "learning_rate": 5.3011125342575435e-05, - "loss": 0.8511, + "epoch": 5.052216803064121, + "grad_norm": 4.111088275909424, + "learning_rate": 4.1499457173164834e-05, + "loss": 0.5856, "step": 366700 }, { - "epoch": 3.74, - "learning_rate": 5.3006868648278e-05, - "loss": 0.834, + "epoch": 5.05359455512386, + "grad_norm": 2.2314295768737793, + "learning_rate": 4.1492626494742364e-05, + "loss": 0.5392, "step": 366800 }, { - "epoch": 3.74, - "learning_rate": 5.300261082908198e-05, - "loss": 0.8478, + "epoch": 5.054972307183599, + "grad_norm": 4.245868682861328, + "learning_rate": 4.148572563676861e-05, + "loss": 0.5763, "step": 366900 }, { - "epoch": 3.74, - "learning_rate": 5.299835188519556e-05, - "loss": 0.7865, + "epoch": 5.056350059243338, + "grad_norm": 14.007862091064453, + "learning_rate": 4.14788235906359e-05, + "loss": 0.5775, "step": 367000 }, { - "epoch": 3.74, - "learning_rate": 5.299409181682698e-05, - "loss": 0.8557, + "epoch": 5.057727811303078, + "grad_norm": 8.79376220703125, + "learning_rate": 4.147192035693063e-05, + "loss": 0.5646, "step": 367100 }, { - "epoch": 3.74, - "learning_rate": 5.2989830624184536e-05, - "loss": 0.733, + "epoch": 5.0591055633628175, + "grad_norm": 4.9509477615356445, + "learning_rate": 4.1465015936239236e-05, + "loss": 0.6033, "step": 367200 }, { - "epoch": 3.74, - "learning_rate": 5.298556830747657e-05, - "loss": 0.7316, + "epoch": 5.060483315422556, + "grad_norm": 22.317596435546875, + "learning_rate": 4.145811032914829e-05, + "loss": 0.6139, "step": 367300 }, { - "epoch": 3.74, - "learning_rate": 5.298130486691149e-05, - "loss": 0.761, + "epoch": 5.061861067482296, + "grad_norm": 3.3228743076324463, + "learning_rate": 4.145120353624448e-05, + "loss": 0.6206, "step": 367400 }, { - "epoch": 3.74, - "learning_rate": 5.2977040302697734e-05, - "loss": 0.8472, + "epoch": 5.063238819542035, + "grad_norm": 1.2921619415283203, + "learning_rate": 4.1444295558114534e-05, + "loss": 0.4629, "step": 367500 }, { - "epoch": 3.75, - "learning_rate": 5.297277461504383e-05, - "loss": 0.9746, + "epoch": 5.064616571601775, + "grad_norm": 11.990192413330078, + "learning_rate": 4.143738639534535e-05, + "loss": 0.5826, "step": 367600 }, { - "epoch": 3.75, - "learning_rate": 5.2968507804158354e-05, - "loss": 0.7905, + "epoch": 5.0659943236615135, + "grad_norm": 1.4604682922363281, + "learning_rate": 4.14304760485239e-05, + "loss": 0.6262, "step": 367700 }, { - "epoch": 3.75, - "learning_rate": 5.296423987024991e-05, - "loss": 0.9817, + "epoch": 5.067372075721253, + "grad_norm": 13.105561256408691, + "learning_rate": 4.142356451823723e-05, + "loss": 0.5814, "step": 367800 }, { - "epoch": 3.75, - "learning_rate": 5.295997081352717e-05, - "loss": 0.9731, + "epoch": 5.068749827780993, + "grad_norm": 4.268446445465088, + "learning_rate": 4.141665180507252e-05, + "loss": 0.5768, "step": 367900 }, { - "epoch": 3.75, - "learning_rate": 5.295570063419888e-05, - "loss": 0.8726, + "epoch": 5.070127579840732, + "grad_norm": 1.995708703994751, + "learning_rate": 4.1409737909617046e-05, + "loss": 0.5649, "step": 368000 }, { - "epoch": 3.75, - "learning_rate": 5.2951429332473824e-05, - "loss": 0.844, + "epoch": 5.071505331900471, + "grad_norm": 9.153339385986328, + "learning_rate": 4.1402822832458176e-05, + "loss": 0.5351, "step": 368100 }, { - "epoch": 3.75, - "learning_rate": 5.2947156908560844e-05, - "loss": 0.7798, + "epoch": 5.07288308396021, + "grad_norm": 2.0645530223846436, + "learning_rate": 4.1395906574183385e-05, + "loss": 0.645, "step": 368200 }, { - "epoch": 3.75, - "learning_rate": 5.294288336266882e-05, - "loss": 0.8535, + "epoch": 5.07426083601995, + "grad_norm": 3.429266929626465, + "learning_rate": 4.138898913538023e-05, + "loss": 0.4891, "step": 368300 }, { - "epoch": 3.75, - "learning_rate": 5.293860869500673e-05, - "loss": 0.8187, + "epoch": 5.0756385880796895, + "grad_norm": 11.953442573547363, + "learning_rate": 4.138207051663639e-05, + "loss": 0.6215, "step": 368400 }, { - "epoch": 3.75, - "learning_rate": 5.293433290578356e-05, - "loss": 0.8617, + "epoch": 5.077016340139428, + "grad_norm": 3.2423934936523438, + "learning_rate": 4.137515071853965e-05, + "loss": 0.5465, "step": 368500 }, { - "epoch": 3.76, - "learning_rate": 5.293005599520838e-05, - "loss": 0.8839, + "epoch": 5.078394092199168, + "grad_norm": 1.6827054023742676, + "learning_rate": 4.1368229741677865e-05, + "loss": 0.5334, "step": 368600 }, { - "epoch": 3.76, - "learning_rate": 5.29257779634903e-05, - "loss": 0.8395, + "epoch": 5.079771844258907, + "grad_norm": 10.879039764404297, + "learning_rate": 4.136130758663901e-05, + "loss": 0.6364, "step": 368700 }, { - "epoch": 3.76, - "learning_rate": 5.2921498810838477e-05, - "loss": 0.8474, + "epoch": 5.081149596318647, + "grad_norm": 2.1504204273223877, + "learning_rate": 4.135438425401117e-05, + "loss": 0.6347, "step": 368800 }, { - "epoch": 3.76, - "learning_rate": 5.291721853746217e-05, - "loss": 0.8224, + "epoch": 5.0825273483783855, + "grad_norm": 7.6515913009643555, + "learning_rate": 4.1347459744382506e-05, + "loss": 0.5976, "step": 368900 }, { - "epoch": 3.76, - "learning_rate": 5.291293714357062e-05, - "loss": 0.8166, + "epoch": 5.083905100438125, + "grad_norm": 21.53762435913086, + "learning_rate": 4.134053405834129e-05, + "loss": 0.5814, "step": 369000 }, { - "epoch": 3.76, - "learning_rate": 5.29086546293732e-05, - "loss": 0.8327, + "epoch": 5.085282852497865, + "grad_norm": 3.4068212509155273, + "learning_rate": 4.1333676470912956e-05, + "loss": 0.6395, "step": 369100 }, { - "epoch": 3.76, - "learning_rate": 5.290437099507927e-05, - "loss": 0.8332, + "epoch": 5.086660604557604, + "grad_norm": 2.371778726577759, + "learning_rate": 4.13267484455613e-05, + "loss": 0.4973, "step": 369200 }, { - "epoch": 3.76, - "learning_rate": 5.2900086240898285e-05, - "loss": 0.8946, + "epoch": 5.088038356617343, + "grad_norm": 6.421906471252441, + "learning_rate": 4.131981924555665e-05, + "loss": 0.5746, "step": 369300 }, { - "epoch": 3.76, - "learning_rate": 5.289580036703974e-05, - "loss": 0.824, + "epoch": 5.089416108677082, + "grad_norm": 2.853595733642578, + "learning_rate": 4.1312888871487625e-05, + "loss": 0.6155, "step": 369400 }, { - "epoch": 3.76, - "learning_rate": 5.2891513373713195e-05, - "loss": 0.8446, + "epoch": 5.090793860736822, + "grad_norm": 3.2667086124420166, + "learning_rate": 4.1305957323943044e-05, + "loss": 0.62, "step": 369500 }, { - "epoch": 3.77, - "learning_rate": 5.288722526112826e-05, - "loss": 0.8777, + "epoch": 5.0921716127965615, + "grad_norm": 2.0608744621276855, + "learning_rate": 4.129902460351175e-05, + "loss": 0.5823, "step": 369600 }, { - "epoch": 3.77, - "learning_rate": 5.2882936029494595e-05, - "loss": 0.9609, + "epoch": 5.0935493648563, + "grad_norm": 2.729482412338257, + "learning_rate": 4.129209071078272e-05, + "loss": 0.5176, "step": 369700 }, { - "epoch": 3.77, - "learning_rate": 5.287864567902192e-05, - "loss": 0.7867, + "epoch": 5.09492711691604, + "grad_norm": 8.697037696838379, + "learning_rate": 4.1285155646345024e-05, + "loss": 0.6126, "step": 369800 }, { - "epoch": 3.77, - "learning_rate": 5.287435420991999e-05, - "loss": 0.764, + "epoch": 5.096304868975779, + "grad_norm": 21.503162384033203, + "learning_rate": 4.127821941078783e-05, + "loss": 0.641, "step": 369900 }, { - "epoch": 3.77, - "learning_rate": 5.287006162239865e-05, - "loss": 0.6431, + "epoch": 5.097682621035519, + "grad_norm": 5.574917793273926, + "learning_rate": 4.127128200470041e-05, + "loss": 0.6696, "step": 370000 }, { - "epoch": 3.77, - "learning_rate": 5.2865767916667784e-05, - "loss": 0.7414, + "epoch": 5.0990603730952575, + "grad_norm": 18.982444763183594, + "learning_rate": 4.1264343428672134e-05, + "loss": 0.7233, "step": 370100 }, { - "epoch": 3.77, - "learning_rate": 5.286147309293731e-05, - "loss": 0.7738, + "epoch": 5.100438125154997, + "grad_norm": 2.718024253845215, + "learning_rate": 4.125740368329246e-05, + "loss": 0.6405, "step": 370200 }, { - "epoch": 3.77, - "learning_rate": 5.285717715141725e-05, - "loss": 0.7223, + "epoch": 5.101815877214737, + "grad_norm": 4.670779705047607, + "learning_rate": 4.125046276915097e-05, + "loss": 0.6026, "step": 370300 }, { - "epoch": 3.77, - "learning_rate": 5.285288009231763e-05, - "loss": 0.9447, + "epoch": 5.103193629274476, + "grad_norm": 8.902013778686523, + "learning_rate": 4.124352068683731e-05, + "loss": 0.5597, "step": 370400 }, { - "epoch": 3.77, - "learning_rate": 5.2848581915848535e-05, - "loss": 0.7832, + "epoch": 5.104571381334215, + "grad_norm": 22.331748962402344, + "learning_rate": 4.123657743694126e-05, + "loss": 0.6467, "step": 370500 }, { - "epoch": 3.78, - "learning_rate": 5.284428262222015e-05, - "loss": 0.9128, + "epoch": 5.105949133393954, + "grad_norm": 10.051376342773438, + "learning_rate": 4.1229633020052684e-05, + "loss": 0.6235, "step": 370600 }, { - "epoch": 3.78, - "learning_rate": 5.283998221164267e-05, - "loss": 0.7135, + "epoch": 5.107326885453694, + "grad_norm": 6.93824577331543, + "learning_rate": 4.1222756898366214e-05, + "loss": 0.6308, "step": 370700 }, { - "epoch": 3.78, - "learning_rate": 5.283568068432635e-05, - "loss": 0.7424, + "epoch": 5.1087046375134335, + "grad_norm": 11.9744291305542, + "learning_rate": 4.121581016091777e-05, + "loss": 0.6979, "step": 370800 }, { - "epoch": 3.78, - "learning_rate": 5.2831378040481545e-05, - "loss": 0.8521, + "epoch": 5.110082389573172, + "grad_norm": 33.28582763671875, + "learning_rate": 4.120886225824108e-05, + "loss": 0.5997, "step": 370900 }, { - "epoch": 3.78, - "learning_rate": 5.282707428031859e-05, - "loss": 0.6903, + "epoch": 5.111460141632912, + "grad_norm": 4.222032070159912, + "learning_rate": 4.1201913190926415e-05, + "loss": 0.6096, "step": 371000 }, { - "epoch": 3.78, - "learning_rate": 5.2822812458334676e-05, - "loss": 0.8208, + "epoch": 5.112837893692651, + "grad_norm": 14.958670616149902, + "learning_rate": 4.119496295956412e-05, + "loss": 0.5833, "step": 371100 }, { - "epoch": 3.78, - "learning_rate": 5.281850647732473e-05, - "loss": 0.7118, + "epoch": 5.11421564575239, + "grad_norm": 2.6425986289978027, + "learning_rate": 4.118801156474466e-05, + "loss": 0.5559, "step": 371200 }, { - "epoch": 3.78, - "learning_rate": 5.2814199380625975e-05, - "loss": 0.8584, + "epoch": 5.1155933978121295, + "grad_norm": 126.07704162597656, + "learning_rate": 4.1181059007058596e-05, + "loss": 0.5604, "step": 371300 }, { - "epoch": 3.78, - "learning_rate": 5.280989116844902e-05, - "loss": 0.7124, + "epoch": 5.116971149871869, + "grad_norm": 6.945819854736328, + "learning_rate": 4.1174105287096576e-05, + "loss": 0.615, "step": 371400 }, { - "epoch": 3.78, - "learning_rate": 5.2805581841004515e-05, - "loss": 0.7787, + "epoch": 5.118348901931609, + "grad_norm": 3.546926736831665, + "learning_rate": 4.1167150405449353e-05, + "loss": 0.6426, "step": 371500 }, { - "epoch": 3.79, - "learning_rate": 5.280127139850315e-05, - "loss": 0.6698, + "epoch": 5.119726653991347, + "grad_norm": 11.858908653259277, + "learning_rate": 4.116019436270778e-05, + "loss": 0.6276, "step": 371600 }, { - "epoch": 3.79, - "learning_rate": 5.27969598411557e-05, - "loss": 0.9614, + "epoch": 5.121104406051087, + "grad_norm": 5.070420265197754, + "learning_rate": 4.115323715946281e-05, + "loss": 0.6204, "step": 371700 }, { - "epoch": 3.79, - "learning_rate": 5.279264716917294e-05, - "loss": 0.7173, + "epoch": 5.122482158110826, + "grad_norm": 4.287106990814209, + "learning_rate": 4.1146278796305495e-05, + "loss": 0.5996, "step": 371800 }, { - "epoch": 3.79, - "learning_rate": 5.278833338276576e-05, - "loss": 0.8584, + "epoch": 5.123859910170566, + "grad_norm": 4.991305828094482, + "learning_rate": 4.113931927382699e-05, + "loss": 0.6209, "step": 371900 }, { - "epoch": 3.79, - "learning_rate": 5.2784018482145076e-05, - "loss": 0.8063, + "epoch": 5.125237662230305, + "grad_norm": 4.611647605895996, + "learning_rate": 4.113242820516437e-05, + "loss": 0.6224, "step": 372000 }, { - "epoch": 3.79, - "learning_rate": 5.2779702467521845e-05, - "loss": 0.8011, + "epoch": 5.126615414290044, + "grad_norm": 7.752304553985596, + "learning_rate": 4.112546637739578e-05, + "loss": 0.6273, "step": 372100 }, { - "epoch": 3.79, - "learning_rate": 5.277538533910712e-05, - "loss": 0.6658, + "epoch": 5.127993166349784, + "grad_norm": 15.775985717773438, + "learning_rate": 4.1118503392074086e-05, + "loss": 0.6353, "step": 372200 }, { - "epoch": 3.79, - "learning_rate": 5.277106709711196e-05, - "loss": 0.9071, + "epoch": 5.129370918409523, + "grad_norm": 6.6105241775512695, + "learning_rate": 4.111160889693872e-05, + "loss": 0.6562, "step": 372300 }, { - "epoch": 3.79, - "learning_rate": 5.2766747741747503e-05, - "loss": 0.8608, + "epoch": 5.130748670469262, + "grad_norm": 2.3871562480926514, + "learning_rate": 4.110464360984637e-05, + "loss": 0.5681, "step": 372400 }, { - "epoch": 3.8, - "learning_rate": 5.276242727322496e-05, - "loss": 0.7413, + "epoch": 5.1321264225290015, + "grad_norm": 1.8231264352798462, + "learning_rate": 4.1097677166969955e-05, + "loss": 0.5977, "step": 372500 }, { - "epoch": 3.8, - "learning_rate": 5.2758148913078644e-05, - "loss": 0.7888, + "epoch": 5.133504174588741, + "grad_norm": 5.6286115646362305, + "learning_rate": 4.109070956890129e-05, + "loss": 0.5465, "step": 372600 }, { - "epoch": 3.8, - "learning_rate": 5.275382622999999e-05, - "loss": 0.7496, + "epoch": 5.134881926648481, + "grad_norm": 9.957012176513672, + "learning_rate": 4.1083740816232326e-05, + "loss": 0.5163, "step": 372700 }, { - "epoch": 3.8, - "learning_rate": 5.2749502434395025e-05, - "loss": 0.7547, + "epoch": 5.136259678708219, + "grad_norm": 9.75981616973877, + "learning_rate": 4.107677090955508e-05, + "loss": 0.5852, "step": 372800 }, { - "epoch": 3.8, - "learning_rate": 5.274517752647515e-05, - "loss": 0.766, + "epoch": 5.137637430767959, + "grad_norm": 7.914633274078369, + "learning_rate": 4.106979984946169e-05, + "loss": 0.6376, "step": 372900 }, { - "epoch": 3.8, - "learning_rate": 5.274085150645183e-05, - "loss": 0.7594, + "epoch": 5.139015182827698, + "grad_norm": 3.5851895809173584, + "learning_rate": 4.106282763654438e-05, + "loss": 0.5709, "step": 373000 }, { - "epoch": 3.8, - "learning_rate": 5.2736524374536595e-05, - "loss": 0.8584, + "epoch": 5.140392934887438, + "grad_norm": 7.711193084716797, + "learning_rate": 4.105585427139547e-05, + "loss": 0.6295, "step": 373100 }, { - "epoch": 3.8, - "learning_rate": 5.2732196130940986e-05, - "loss": 0.8021, + "epoch": 5.141770686947177, + "grad_norm": 32.90951919555664, + "learning_rate": 4.10488797546074e-05, + "loss": 0.5489, "step": 373200 }, { - "epoch": 3.8, - "learning_rate": 5.272786677587666e-05, - "loss": 0.7804, + "epoch": 5.143148439006916, + "grad_norm": 29.91495704650879, + "learning_rate": 4.1041904086772666e-05, + "loss": 0.6764, "step": 373300 }, { - "epoch": 3.8, - "learning_rate": 5.2723579619718504e-05, - "loss": 0.6777, + "epoch": 5.144526191066656, + "grad_norm": 43.439964294433594, + "learning_rate": 4.103492726848391e-05, + "loss": 0.6146, "step": 373400 }, { - "epoch": 3.81, - "learning_rate": 5.271924805346121e-05, - "loss": 0.819, + "epoch": 5.145903943126395, + "grad_norm": 1.0201935768127441, + "learning_rate": 4.102794930033383e-05, + "loss": 0.6339, "step": 373500 }, { - "epoch": 3.81, - "learning_rate": 5.2714915376368255e-05, - "loss": 0.7893, + "epoch": 5.147281695186134, + "grad_norm": 3.2723748683929443, + "learning_rate": 4.102097018291525e-05, + "loss": 0.6299, "step": 373600 }, { - "epoch": 3.81, - "learning_rate": 5.2710581588651494e-05, - "loss": 0.7907, + "epoch": 5.1486594472458735, + "grad_norm": 23.417081832885742, + "learning_rate": 4.101398991682109e-05, + "loss": 0.6017, "step": 373700 }, { - "epoch": 3.81, - "learning_rate": 5.2706246690522834e-05, - "loss": 0.7784, + "epoch": 5.150037199305613, + "grad_norm": 3.727722406387329, + "learning_rate": 4.100700850264434e-05, + "loss": 0.5638, "step": 373800 }, { - "epoch": 3.81, - "learning_rate": 5.2701910682194215e-05, - "loss": 0.8319, + "epoch": 5.151414951365353, + "grad_norm": 12.340597152709961, + "learning_rate": 4.100002594097811e-05, + "loss": 0.6044, "step": 373900 }, { - "epoch": 3.81, - "learning_rate": 5.2697573563877634e-05, - "loss": 0.6599, + "epoch": 5.152792703425091, + "grad_norm": 77.85572052001953, + "learning_rate": 4.099304223241562e-05, + "loss": 0.5691, "step": 374000 }, { - "epoch": 3.81, - "learning_rate": 5.269323533578515e-05, - "loss": 0.6981, + "epoch": 5.154170455484831, + "grad_norm": 5.215542316436768, + "learning_rate": 4.098605737755016e-05, + "loss": 0.5507, "step": 374100 }, { - "epoch": 3.81, - "learning_rate": 5.2688895998128896e-05, - "loss": 0.7587, + "epoch": 5.15554820754457, + "grad_norm": 3.6421759128570557, + "learning_rate": 4.097907137697514e-05, + "loss": 0.5906, "step": 374200 }, { - "epoch": 3.81, - "learning_rate": 5.268455555112103e-05, - "loss": 0.7292, + "epoch": 5.15692595960431, + "grad_norm": 2.5727028846740723, + "learning_rate": 4.0972084231284044e-05, + "loss": 0.5203, "step": 374300 }, { - "epoch": 3.81, - "learning_rate": 5.268021399497376e-05, - "loss": 0.8201, + "epoch": 5.158303711664049, + "grad_norm": 3.6209142208099365, + "learning_rate": 4.0965095941070455e-05, + "loss": 0.5257, "step": 374400 }, { - "epoch": 3.82, - "learning_rate": 5.267587132989938e-05, - "loss": 0.7939, + "epoch": 5.159681463723788, + "grad_norm": 206.15066528320312, + "learning_rate": 4.09581065069281e-05, + "loss": 0.6077, "step": 374500 }, { - "epoch": 3.82, - "learning_rate": 5.2671527556110224e-05, - "loss": 0.7083, + "epoch": 5.161059215783528, + "grad_norm": 36.08523178100586, + "learning_rate": 4.0951115929450726e-05, + "loss": 0.6378, "step": 374600 }, { - "epoch": 3.82, - "learning_rate": 5.2667182673818654e-05, - "loss": 0.7239, + "epoch": 5.162436967843267, + "grad_norm": 12.618553161621094, + "learning_rate": 4.0944124209232255e-05, + "loss": 0.6172, "step": 374700 }, { - "epoch": 3.82, - "learning_rate": 5.266283668323713e-05, - "loss": 0.6955, + "epoch": 5.163814719903006, + "grad_norm": 4.642848491668701, + "learning_rate": 4.0937131346866656e-05, + "loss": 0.6837, "step": 374800 }, { - "epoch": 3.82, - "learning_rate": 5.265848958457813e-05, - "loss": 0.7309, + "epoch": 5.165192471962746, + "grad_norm": 40.28071975708008, + "learning_rate": 4.0930137342947985e-05, + "loss": 0.6628, "step": 374900 }, { - "epoch": 3.82, - "learning_rate": 5.2654141378054216e-05, - "loss": 0.7747, + "epoch": 5.166570224022485, + "grad_norm": 6.119065761566162, + "learning_rate": 4.092314219807045e-05, + "loss": 0.6317, "step": 375000 }, { - "epoch": 3.82, - "learning_rate": 5.264979206387797e-05, - "loss": 0.7928, + "epoch": 5.167947976082225, + "grad_norm": 2.995183229446411, + "learning_rate": 4.0916145912828314e-05, + "loss": 0.6198, "step": 375100 }, { - "epoch": 3.82, - "learning_rate": 5.2645441642262076e-05, - "loss": 0.8331, + "epoch": 5.169325728141963, + "grad_norm": 14.597306251525879, + "learning_rate": 4.0909148487815946e-05, + "loss": 0.5681, "step": 375200 }, { - "epoch": 3.82, - "learning_rate": 5.2641090113419215e-05, - "loss": 0.8312, + "epoch": 5.170703480201703, + "grad_norm": 98.04617309570312, + "learning_rate": 4.090214992362781e-05, + "loss": 0.6087, "step": 375300 }, { - "epoch": 3.82, - "learning_rate": 5.263673747756216e-05, - "loss": 0.7642, + "epoch": 5.1720812322614425, + "grad_norm": 5.577531814575195, + "learning_rate": 4.089515022085848e-05, + "loss": 0.6834, "step": 375400 }, { - "epoch": 3.83, - "learning_rate": 5.263238373490373e-05, - "loss": 0.7614, + "epoch": 5.173458984321181, + "grad_norm": 5.794976234436035, + "learning_rate": 4.088814938010259e-05, + "loss": 0.587, "step": 375500 }, { - "epoch": 3.83, - "learning_rate": 5.262802888565681e-05, - "loss": 0.8131, + "epoch": 5.174836736380921, + "grad_norm": 10.025135040283203, + "learning_rate": 4.088114740195494e-05, + "loss": 0.6494, "step": 375600 }, { - "epoch": 3.83, - "learning_rate": 5.26236729300343e-05, - "loss": 0.8353, + "epoch": 5.17621448844066, + "grad_norm": 2.8370156288146973, + "learning_rate": 4.0874144287010354e-05, + "loss": 0.625, "step": 375700 }, { - "epoch": 3.83, - "learning_rate": 5.261931586824921e-05, - "loss": 0.7098, + "epoch": 5.1775922405004, + "grad_norm": 48.05025100708008, + "learning_rate": 4.0867210083997494e-05, + "loss": 0.6058, "step": 375800 }, { - "epoch": 3.83, - "learning_rate": 5.2614957700514556e-05, - "loss": 0.731, + "epoch": 5.1789699925601385, + "grad_norm": 8.473396301269531, + "learning_rate": 4.0860204708597125e-05, + "loss": 0.6052, "step": 375900 }, { - "epoch": 3.83, - "learning_rate": 5.261059842704342e-05, - "loss": 0.7013, + "epoch": 5.180347744619878, + "grad_norm": 42.94242858886719, + "learning_rate": 4.0853198198179e-05, + "loss": 0.5828, "step": 376000 }, { - "epoch": 3.83, - "learning_rate": 5.260623804804896e-05, - "loss": 0.7853, + "epoch": 5.181725496679618, + "grad_norm": 5.639178276062012, + "learning_rate": 4.084619055333838e-05, + "loss": 0.6109, "step": 376100 }, { - "epoch": 3.83, - "learning_rate": 5.2601876563744373e-05, - "loss": 0.8683, + "epoch": 5.183103248739357, + "grad_norm": 24.022245407104492, + "learning_rate": 4.083918177467061e-05, + "loss": 0.5179, "step": 376200 }, { - "epoch": 3.83, - "learning_rate": 5.259751397434289e-05, - "loss": 0.7507, + "epoch": 5.184481000799096, + "grad_norm": 19.259923934936523, + "learning_rate": 4.083217186277109e-05, + "loss": 0.6196, "step": 376300 }, { - "epoch": 3.83, - "learning_rate": 5.259315028005784e-05, - "loss": 0.7903, + "epoch": 5.185858752858835, + "grad_norm": 7.789968013763428, + "learning_rate": 4.0825160818235366e-05, + "loss": 0.6328, "step": 376400 }, { - "epoch": 3.84, - "learning_rate": 5.2588785481102564e-05, - "loss": 0.7842, + "epoch": 5.187236504918575, + "grad_norm": 5.4027814865112305, + "learning_rate": 4.0818148641659055e-05, + "loss": 0.602, "step": 376500 }, { - "epoch": 3.84, - "learning_rate": 5.258441957769048e-05, - "loss": 0.6854, + "epoch": 5.1886142569783145, + "grad_norm": 10.9011869430542, + "learning_rate": 4.0811135333637884e-05, + "loss": 0.6472, "step": 376600 }, { - "epoch": 3.84, - "learning_rate": 5.258005257003505e-05, - "loss": 0.7699, + "epoch": 5.189992009038053, + "grad_norm": 7.006741523742676, + "learning_rate": 4.080412089476767e-05, + "loss": 0.5627, "step": 376700 }, { - "epoch": 3.84, - "learning_rate": 5.257568445834981e-05, - "loss": 0.7286, + "epoch": 5.191369761097793, + "grad_norm": 9.492364883422852, + "learning_rate": 4.079710532564432e-05, + "loss": 0.5759, "step": 376800 }, { - "epoch": 3.84, - "learning_rate": 5.257131524284833e-05, - "loss": 0.6571, + "epoch": 5.192747513157532, + "grad_norm": 16.567487716674805, + "learning_rate": 4.079008862686385e-05, + "loss": 0.5429, "step": 376900 }, { - "epoch": 3.84, - "learning_rate": 5.2566944923744215e-05, - "loss": 0.8412, + "epoch": 5.194125265217272, + "grad_norm": 8.14239501953125, + "learning_rate": 4.078307079902236e-05, + "loss": 0.5523, "step": 377000 }, { - "epoch": 3.84, - "learning_rate": 5.256257350125118e-05, - "loss": 0.6845, + "epoch": 5.1955030172770105, + "grad_norm": 5.102200031280518, + "learning_rate": 4.0776051842716044e-05, + "loss": 0.538, "step": 377100 }, { - "epoch": 3.84, - "learning_rate": 5.2558200975582944e-05, - "loss": 0.7101, + "epoch": 5.19688076933675, + "grad_norm": 4.339067459106445, + "learning_rate": 4.0769031758541206e-05, + "loss": 0.6056, "step": 377200 }, { - "epoch": 3.84, - "learning_rate": 5.255382734695329e-05, - "loss": 0.6259, + "epoch": 5.19825852139649, + "grad_norm": 2.2519538402557373, + "learning_rate": 4.076201054709424e-05, + "loss": 0.6395, "step": 377300 }, { - "epoch": 3.85, - "learning_rate": 5.254945261557609e-05, - "loss": 0.7434, + "epoch": 5.199636273456229, + "grad_norm": 3.2794904708862305, + "learning_rate": 4.075498820897162e-05, + "loss": 0.5668, "step": 377400 }, { - "epoch": 3.85, - "learning_rate": 5.254507678166521e-05, - "loss": 0.7217, + "epoch": 5.201014025515968, + "grad_norm": 4.9867777824401855, + "learning_rate": 4.0747964744769946e-05, + "loss": 0.514, "step": 377500 }, { - "epoch": 3.85, - "learning_rate": 5.254069984543463e-05, - "loss": 0.7084, + "epoch": 5.202391777575707, + "grad_norm": 1.9453870058059692, + "learning_rate": 4.074094015508589e-05, + "loss": 0.5455, "step": 377600 }, { - "epoch": 3.85, - "learning_rate": 5.253632180709834e-05, - "loss": 0.7055, + "epoch": 5.203769529635447, + "grad_norm": 4.1607346534729, + "learning_rate": 4.073391444051623e-05, + "loss": 0.603, "step": 377700 }, { - "epoch": 3.85, - "learning_rate": 5.253194266687041e-05, - "loss": 0.6917, + "epoch": 5.2051472816951865, + "grad_norm": 5.72292947769165, + "learning_rate": 4.072688760165783e-05, + "loss": 0.6327, "step": 377800 }, { - "epoch": 3.85, - "learning_rate": 5.2527562424964925e-05, - "loss": 0.7846, + "epoch": 5.206525033754925, + "grad_norm": 1.0085132122039795, + "learning_rate": 4.071985963910767e-05, + "loss": 0.6038, "step": 377900 }, { - "epoch": 3.85, - "learning_rate": 5.2523181081596093e-05, - "loss": 0.7481, + "epoch": 5.207902785814665, + "grad_norm": 7.772294044494629, + "learning_rate": 4.071283055346279e-05, + "loss": 0.5819, "step": 378000 }, { - "epoch": 3.85, - "learning_rate": 5.251879863697812e-05, - "loss": 0.6889, + "epoch": 5.209280537874404, + "grad_norm": 5.603379726409912, + "learning_rate": 4.070580034532036e-05, + "loss": 0.5842, "step": 378100 }, { - "epoch": 3.85, - "learning_rate": 5.2514415091325264e-05, - "loss": 0.8596, + "epoch": 5.210658289934144, + "grad_norm": 8.517966270446777, + "learning_rate": 4.0698769015277634e-05, + "loss": 0.574, "step": 378200 }, { - "epoch": 3.85, - "learning_rate": 5.251003044485188e-05, - "loss": 0.6402, + "epoch": 5.2120360419938825, + "grad_norm": 3.800159454345703, + "learning_rate": 4.069173656393195e-05, + "loss": 0.6174, "step": 378300 }, { - "epoch": 3.86, - "learning_rate": 5.250564469777233e-05, - "loss": 0.7096, + "epoch": 5.213413794053622, + "grad_norm": 13.190343856811523, + "learning_rate": 4.068470299188076e-05, + "loss": 0.5524, "step": 378400 }, { - "epoch": 3.86, - "learning_rate": 5.250125785030108e-05, - "loss": 0.6826, + "epoch": 5.214791546113362, + "grad_norm": 4.494522571563721, + "learning_rate": 4.06776682997216e-05, + "loss": 0.6131, "step": 378500 }, { - "epoch": 3.86, - "learning_rate": 5.249686990265258e-05, - "loss": 0.7125, + "epoch": 5.216169298173101, + "grad_norm": 3.176579236984253, + "learning_rate": 4.0670632488052094e-05, + "loss": 0.6198, "step": 378600 }, { - "epoch": 3.86, - "learning_rate": 5.249248085504141e-05, - "loss": 0.7708, + "epoch": 5.21754705023284, + "grad_norm": 2.469952344894409, + "learning_rate": 4.066359555746999e-05, + "loss": 0.6303, "step": 378700 }, { - "epoch": 3.86, - "learning_rate": 5.248809070768215e-05, - "loss": 0.8924, + "epoch": 5.218924802292579, + "grad_norm": 3.082827568054199, + "learning_rate": 4.065655750857309e-05, + "loss": 0.6535, "step": 378800 }, { - "epoch": 3.86, - "learning_rate": 5.248369946078946e-05, - "loss": 0.7337, + "epoch": 5.220302554352319, + "grad_norm": 18.763904571533203, + "learning_rate": 4.0649518341959324e-05, + "loss": 0.5767, "step": 378900 }, { - "epoch": 3.86, - "learning_rate": 5.2479307114578035e-05, - "loss": 0.7011, + "epoch": 5.2216803064120585, + "grad_norm": 4.597332000732422, + "learning_rate": 4.064247805822671e-05, + "loss": 0.5857, "step": 379000 }, { - "epoch": 3.86, - "learning_rate": 5.247491366926264e-05, - "loss": 0.7311, + "epoch": 5.223058058471797, + "grad_norm": 7.7229743003845215, + "learning_rate": 4.063543665797333e-05, + "loss": 0.584, "step": 379100 }, { - "epoch": 3.86, - "learning_rate": 5.2470519125058096e-05, - "loss": 0.8064, + "epoch": 5.224435810531537, + "grad_norm": 2.106656551361084, + "learning_rate": 4.0628394141797424e-05, + "loss": 0.508, "step": 379200 }, { - "epoch": 3.86, - "learning_rate": 5.2466123482179264e-05, - "loss": 0.7126, + "epoch": 5.225813562591276, + "grad_norm": 13.566353797912598, + "learning_rate": 4.062135051029726e-05, + "loss": 0.5658, "step": 379300 }, { - "epoch": 3.87, - "learning_rate": 5.246172674084107e-05, - "loss": 0.7017, + "epoch": 5.227191314651016, + "grad_norm": 11.244034767150879, + "learning_rate": 4.0614305764071236e-05, + "loss": 0.6092, "step": 379400 }, { - "epoch": 3.87, - "learning_rate": 5.2457328901258465e-05, - "loss": 0.727, + "epoch": 5.2285690667107545, + "grad_norm": 1.7361465692520142, + "learning_rate": 4.0607259903717864e-05, + "loss": 0.5772, "step": 379500 }, { - "epoch": 3.87, - "learning_rate": 5.245292996364651e-05, - "loss": 0.7639, + "epoch": 5.229946818770494, + "grad_norm": 2.4289731979370117, + "learning_rate": 4.060021292983569e-05, + "loss": 0.6042, "step": 379600 }, { - "epoch": 3.87, - "learning_rate": 5.244852992822027e-05, - "loss": 0.6514, + "epoch": 5.231324570830234, + "grad_norm": 3.7731006145477295, + "learning_rate": 4.0593164843023416e-05, + "loss": 0.6586, "step": 379700 }, { - "epoch": 3.87, - "learning_rate": 5.2444128795194875e-05, - "loss": 0.8224, + "epoch": 5.232702322889972, + "grad_norm": 8.631765365600586, + "learning_rate": 4.058611564387981e-05, + "loss": 0.5831, "step": 379800 }, { - "epoch": 3.87, - "learning_rate": 5.243972656478553e-05, - "loss": 0.6368, + "epoch": 5.234080074949712, + "grad_norm": 3.49064302444458, + "learning_rate": 4.0579065333003715e-05, + "loss": 0.5367, "step": 379900 }, { - "epoch": 3.87, - "learning_rate": 5.243532323720747e-05, - "loss": 0.7287, + "epoch": 5.235457827009451, + "grad_norm": 3.032696485519409, + "learning_rate": 4.057201391099412e-05, + "loss": 0.5582, "step": 380000 }, { - "epoch": 3.87, - "learning_rate": 5.2430918812675985e-05, - "loss": 0.6993, + "epoch": 5.236835579069191, + "grad_norm": 6.222581386566162, + "learning_rate": 4.056496137845007e-05, + "loss": 0.5183, "step": 380100 }, { - "epoch": 3.87, - "learning_rate": 5.242651329140643e-05, - "loss": 0.647, + "epoch": 5.23821333112893, + "grad_norm": 5.745354652404785, + "learning_rate": 4.05579077359707e-05, + "loss": 0.6431, "step": 380200 }, { - "epoch": 3.87, - "learning_rate": 5.242210667361422e-05, - "loss": 0.6995, + "epoch": 5.239591083188669, + "grad_norm": 37.070980072021484, + "learning_rate": 4.055085298415527e-05, + "loss": 0.5982, "step": 380300 }, { - "epoch": 3.88, - "learning_rate": 5.241769895951479e-05, - "loss": 0.7198, + "epoch": 5.240968835248409, + "grad_norm": 2.498462677001953, + "learning_rate": 4.054379712360311e-05, + "loss": 0.6013, "step": 380400 }, { - "epoch": 3.88, - "learning_rate": 5.241329014932366e-05, - "loss": 0.6671, + "epoch": 5.242346587308148, + "grad_norm": 7.718915939331055, + "learning_rate": 4.0536740154913656e-05, + "loss": 0.5188, "step": 380500 }, { - "epoch": 3.88, - "learning_rate": 5.240888024325641e-05, - "loss": 0.7025, + "epoch": 5.243724339367887, + "grad_norm": 6.567794322967529, + "learning_rate": 4.052968207868643e-05, + "loss": 0.557, "step": 380600 }, { - "epoch": 3.88, - "learning_rate": 5.240446924152864e-05, - "loss": 0.7828, + "epoch": 5.2451020914276265, + "grad_norm": 11.195625305175781, + "learning_rate": 4.052262289552105e-05, + "loss": 0.5007, "step": 380700 }, { - "epoch": 3.88, - "learning_rate": 5.240005714435602e-05, - "loss": 0.6778, + "epoch": 5.246479843487366, + "grad_norm": 5.9724507331848145, + "learning_rate": 4.051556260601723e-05, + "loss": 0.6212, "step": 380800 }, { - "epoch": 3.88, - "learning_rate": 5.2395643951954296e-05, - "loss": 0.6014, + "epoch": 5.247857595547106, + "grad_norm": 4.797862529754639, + "learning_rate": 4.050850121077478e-05, + "loss": 0.5214, "step": 380900 }, { - "epoch": 3.88, - "learning_rate": 5.239127381283298e-05, - "loss": 0.7242, + "epoch": 5.249235347606844, + "grad_norm": 2.588568687438965, + "learning_rate": 4.0501438710393606e-05, + "loss": 0.5557, "step": 381000 }, { - "epoch": 3.88, - "learning_rate": 5.238685844156731e-05, - "loss": 0.8567, + "epoch": 5.250613099666584, + "grad_norm": 9.342585563659668, + "learning_rate": 4.049437510547369e-05, + "loss": 0.5424, "step": 381100 }, { - "epoch": 3.88, - "learning_rate": 5.238244197571785e-05, - "loss": 0.6796, + "epoch": 5.251990851726323, + "grad_norm": 836.8515625, + "learning_rate": 4.0487310396615136e-05, + "loss": 0.6345, "step": 381200 }, { - "epoch": 3.88, - "learning_rate": 5.237802441550057e-05, - "loss": 0.7776, + "epoch": 5.253368603786063, + "grad_norm": 11.792338371276855, + "learning_rate": 4.048024458441812e-05, + "loss": 0.5612, "step": 381300 }, { - "epoch": 3.89, - "learning_rate": 5.237360576113142e-05, - "loss": 0.7667, + "epoch": 5.254746355845802, + "grad_norm": 5.871984481811523, + "learning_rate": 4.0473177669482916e-05, + "loss": 0.5592, "step": 381400 }, { - "epoch": 3.89, - "learning_rate": 5.2369186012826486e-05, - "loss": 0.822, + "epoch": 5.256124107905541, + "grad_norm": 7.3336286544799805, + "learning_rate": 4.046610965240991e-05, + "loss": 0.5213, "step": 381500 }, { - "epoch": 3.89, - "learning_rate": 5.236476517080183e-05, - "loss": 0.6365, + "epoch": 5.257501859965281, + "grad_norm": 11.177349090576172, + "learning_rate": 4.045904053379954e-05, + "loss": 0.6225, "step": 381600 }, { - "epoch": 3.89, - "learning_rate": 5.236034323527363e-05, - "loss": 0.7893, + "epoch": 5.25887961202502, + "grad_norm": 14.836637496948242, + "learning_rate": 4.045197031425239e-05, + "loss": 0.5878, "step": 381700 }, { - "epoch": 3.89, - "learning_rate": 5.235592020645809e-05, - "loss": 0.6565, + "epoch": 5.260257364084759, + "grad_norm": 2.5071139335632324, + "learning_rate": 4.04448989943691e-05, + "loss": 0.5891, "step": 381800 }, { - "epoch": 3.89, - "learning_rate": 5.235149608457146e-05, - "loss": 0.7102, + "epoch": 5.2616351161444985, + "grad_norm": 22.321273803710938, + "learning_rate": 4.0437826574750404e-05, + "loss": 0.6396, "step": 381900 }, { - "epoch": 3.89, - "learning_rate": 5.234707086983006e-05, - "loss": 0.7489, + "epoch": 5.263012868204238, + "grad_norm": 7.139045715332031, + "learning_rate": 4.043075305599716e-05, + "loss": 0.5871, "step": 382000 }, { - "epoch": 3.89, - "learning_rate": 5.2342644562450256e-05, - "loss": 0.6939, + "epoch": 5.264390620263978, + "grad_norm": 6.893263339996338, + "learning_rate": 4.042367843871029e-05, + "loss": 0.5377, "step": 382100 }, { - "epoch": 3.89, - "learning_rate": 5.233821716264847e-05, - "loss": 0.7159, + "epoch": 5.265768372323716, + "grad_norm": 4.769865036010742, + "learning_rate": 4.041660272349082e-05, + "loss": 0.5813, "step": 382200 }, { - "epoch": 3.89, - "learning_rate": 5.2333788670641166e-05, - "loss": 0.7327, + "epoch": 5.267146124383456, + "grad_norm": 6.16351842880249, + "learning_rate": 4.040952591093987e-05, + "loss": 0.6101, "step": 382300 }, { - "epoch": 3.9, - "learning_rate": 5.232935908664487e-05, - "loss": 0.7246, + "epoch": 5.268523876443195, + "grad_norm": 7.79477596282959, + "learning_rate": 4.040244800165864e-05, + "loss": 0.6184, "step": 382400 }, { - "epoch": 3.9, - "learning_rate": 5.232492841087618e-05, - "loss": 0.7495, + "epoch": 5.269901628502935, + "grad_norm": 2.567798137664795, + "learning_rate": 4.039536899624844e-05, + "loss": 0.6354, "step": 382500 }, { - "epoch": 3.9, - "learning_rate": 5.232049664355172e-05, - "loss": 0.7247, + "epoch": 5.271279380562674, + "grad_norm": 5.0070672035217285, + "learning_rate": 4.038828889531069e-05, + "loss": 0.6699, "step": 382600 }, { - "epoch": 3.9, - "learning_rate": 5.231606378488817e-05, - "loss": 0.7211, + "epoch": 5.272657132622413, + "grad_norm": 5.721522808074951, + "learning_rate": 4.038120769944685e-05, + "loss": 0.6626, "step": 382700 }, { - "epoch": 3.9, - "learning_rate": 5.231162983510229e-05, - "loss": 0.8791, + "epoch": 5.274034884682153, + "grad_norm": 19.798641204833984, + "learning_rate": 4.037412540925852e-05, + "loss": 0.602, "step": 382800 }, { - "epoch": 3.9, - "learning_rate": 5.2307194794410846e-05, - "loss": 0.7544, + "epoch": 5.275412636741892, + "grad_norm": 7.47581672668457, + "learning_rate": 4.0367042025347376e-05, + "loss": 0.5653, "step": 382900 }, { - "epoch": 3.9, - "learning_rate": 5.23027586630307e-05, - "loss": 0.8426, + "epoch": 5.276790388801631, + "grad_norm": 14.146540641784668, + "learning_rate": 4.035995754831518e-05, + "loss": 0.5903, "step": 383000 }, { - "epoch": 3.9, - "learning_rate": 5.229832144117875e-05, - "loss": 0.6991, + "epoch": 5.2781681408613705, + "grad_norm": 4.865891456604004, + "learning_rate": 4.03528719787638e-05, + "loss": 0.5535, "step": 383100 }, { - "epoch": 3.9, - "learning_rate": 5.2293883129071956e-05, - "loss": 0.719, + "epoch": 5.27954589292111, + "grad_norm": 13.20900821685791, + "learning_rate": 4.034578531729518e-05, + "loss": 0.5371, "step": 383200 }, { - "epoch": 3.91, - "learning_rate": 5.2289488126343734e-05, - "loss": 0.6817, + "epoch": 5.28092364498085, + "grad_norm": 10.534146308898926, + "learning_rate": 4.0338697564511395e-05, + "loss": 0.5295, "step": 383300 }, { - "epoch": 3.91, - "learning_rate": 5.228504764527545e-05, - "loss": 0.7542, + "epoch": 5.282301397040588, + "grad_norm": 6.142459392547607, + "learning_rate": 4.033167961484659e-05, + "loss": 0.6659, "step": 383400 }, { - "epoch": 3.91, - "learning_rate": 5.2280606074601334e-05, - "loss": 0.7855, + "epoch": 5.283679149100328, + "grad_norm": 8.958629608154297, + "learning_rate": 4.0324589692137076e-05, + "loss": 0.6162, "step": 383500 }, { - "epoch": 3.91, - "learning_rate": 5.2276163414538526e-05, - "loss": 0.7093, + "epoch": 5.285056901160067, + "grad_norm": 90.62395477294922, + "learning_rate": 4.031749867991306e-05, + "loss": 0.6245, "step": 383600 }, { - "epoch": 3.91, - "learning_rate": 5.227171966530427e-05, - "loss": 0.6856, + "epoch": 5.286434653219807, + "grad_norm": 7.336931228637695, + "learning_rate": 4.031040657877696e-05, + "loss": 0.6219, "step": 383700 }, { - "epoch": 3.91, - "learning_rate": 5.2267274827115844e-05, - "loss": 0.7803, + "epoch": 5.287812405279546, + "grad_norm": 2.8121542930603027, + "learning_rate": 4.0303313389331274e-05, + "loss": 0.6539, "step": 383800 }, { - "epoch": 3.91, - "learning_rate": 5.2262828900190553e-05, - "loss": 0.6746, + "epoch": 5.289190157339285, + "grad_norm": 8.606476783752441, + "learning_rate": 4.029629006033232e-05, + "loss": 0.5587, "step": 383900 }, { - "epoch": 3.91, - "learning_rate": 5.225838188474579e-05, - "loss": 0.7051, + "epoch": 5.290567909399025, + "grad_norm": 2.72857928276062, + "learning_rate": 4.0289194706943446e-05, + "loss": 0.6036, "step": 384000 }, { - "epoch": 3.91, - "learning_rate": 5.2253933780998985e-05, - "loss": 0.7314, + "epoch": 5.291945661458763, + "grad_norm": 6.961001396179199, + "learning_rate": 4.028209826704706e-05, + "loss": 0.5443, "step": 384100 }, { - "epoch": 3.91, - "learning_rate": 5.2249484589167615e-05, - "loss": 0.636, + "epoch": 5.293323413518503, + "grad_norm": 1.7218700647354126, + "learning_rate": 4.027500074124602e-05, + "loss": 0.582, "step": 384200 }, { - "epoch": 3.92, - "learning_rate": 5.224503430946923e-05, - "loss": 0.7567, + "epoch": 5.2947011655782426, + "grad_norm": 2.7872872352600098, + "learning_rate": 4.026790213014332e-05, + "loss": 0.5201, "step": 384300 }, { - "epoch": 3.92, - "learning_rate": 5.2240582942121415e-05, - "loss": 0.8312, + "epoch": 5.296078917637982, + "grad_norm": 7.193872928619385, + "learning_rate": 4.026080243434201e-05, + "loss": 0.5944, "step": 384400 }, { - "epoch": 3.92, - "learning_rate": 5.223613048734183e-05, - "loss": 0.7635, + "epoch": 5.297456669697722, + "grad_norm": 3.1091573238372803, + "learning_rate": 4.025370165444525e-05, + "loss": 0.6305, "step": 384500 }, { - "epoch": 3.92, - "learning_rate": 5.223167694534816e-05, - "loss": 0.787, + "epoch": 5.29883442175746, + "grad_norm": 10.786712646484375, + "learning_rate": 4.024659979105629e-05, + "loss": 0.5025, "step": 384600 }, { - "epoch": 3.92, - "learning_rate": 5.222722231635815e-05, - "loss": 0.7603, + "epoch": 5.3002121738172, + "grad_norm": 3.10170578956604, + "learning_rate": 4.0239496844778466e-05, + "loss": 0.562, "step": 384700 }, { - "epoch": 3.92, - "learning_rate": 5.2222766600589624e-05, - "loss": 0.7356, + "epoch": 5.3015899258769394, + "grad_norm": 8.986777305603027, + "learning_rate": 4.023239281621521e-05, + "loss": 0.554, "step": 384800 }, { - "epoch": 3.92, - "learning_rate": 5.221830979826042e-05, - "loss": 0.6904, + "epoch": 5.302967677936678, + "grad_norm": 13.616964340209961, + "learning_rate": 4.0225287705970046e-05, + "loss": 0.623, "step": 384900 }, { - "epoch": 3.92, - "learning_rate": 5.221385190958847e-05, - "loss": 0.7178, + "epoch": 5.304345429996418, + "grad_norm": 3.461974859237671, + "learning_rate": 4.021818151464658e-05, + "loss": 0.6051, "step": 385000 }, { - "epoch": 3.92, - "learning_rate": 5.220939293479172e-05, - "loss": 0.7376, + "epoch": 5.305723182056157, + "grad_norm": 3.282644033432007, + "learning_rate": 4.021107424284854e-05, + "loss": 0.5755, "step": 385100 }, { - "epoch": 3.92, - "learning_rate": 5.2204932874088196e-05, - "loss": 0.6809, + "epoch": 5.307100934115897, + "grad_norm": 4.333539962768555, + "learning_rate": 4.0203965891179716e-05, + "loss": 0.5618, "step": 385200 }, { - "epoch": 3.93, - "learning_rate": 5.220047172769597e-05, - "loss": 0.674, + "epoch": 5.3084786861756355, + "grad_norm": 4.0965189933776855, + "learning_rate": 4.0196856460243986e-05, + "loss": 0.6056, "step": 385300 }, { - "epoch": 3.93, - "learning_rate": 5.2196009495833156e-05, - "loss": 0.7107, + "epoch": 5.309856438235375, + "grad_norm": 36.64126205444336, + "learning_rate": 4.018974595064535e-05, + "loss": 0.6409, "step": 385400 }, { - "epoch": 3.93, - "learning_rate": 5.219154617871794e-05, - "loss": 0.7117, + "epoch": 5.311234190295115, + "grad_norm": 31.54360580444336, + "learning_rate": 4.018263436298787e-05, + "loss": 0.4638, "step": 385500 }, { - "epoch": 3.93, - "learning_rate": 5.218708177656855e-05, - "loss": 0.743, + "epoch": 5.312611942354854, + "grad_norm": 2.8543038368225098, + "learning_rate": 4.017552169787572e-05, + "loss": 0.6129, "step": 385600 }, { - "epoch": 3.93, - "learning_rate": 5.2182616289603256e-05, - "loss": 0.7359, + "epoch": 5.313989694414593, + "grad_norm": 68.8862075805664, + "learning_rate": 4.016840795591315e-05, + "loss": 0.5871, "step": 385700 }, { - "epoch": 3.93, - "learning_rate": 5.2178149718040416e-05, - "loss": 0.7201, + "epoch": 5.315367446474332, + "grad_norm": 46.42893600463867, + "learning_rate": 4.0161293137704517e-05, + "loss": 0.6657, "step": 385800 }, { - "epoch": 3.93, - "learning_rate": 5.21736820620984e-05, - "loss": 0.7052, + "epoch": 5.316745198534072, + "grad_norm": 3.993032932281494, + "learning_rate": 4.0154177243854266e-05, + "loss": 0.6036, "step": 385900 }, { - "epoch": 3.93, - "learning_rate": 5.216921332199566e-05, - "loss": 0.7515, + "epoch": 5.3181229505938115, + "grad_norm": 8.683631896972656, + "learning_rate": 4.01470602749669e-05, + "loss": 0.6358, "step": 386000 }, { - "epoch": 3.93, - "learning_rate": 5.216474349795068e-05, - "loss": 0.6955, + "epoch": 5.31950070265355, + "grad_norm": 38.39250564575195, + "learning_rate": 4.013994223164708e-05, + "loss": 0.6056, "step": 386100 }, { - "epoch": 3.93, - "learning_rate": 5.216027259018202e-05, - "loss": 0.7773, + "epoch": 5.32087845471329, + "grad_norm": 6.656564235687256, + "learning_rate": 4.0132823114499494e-05, + "loss": 0.5059, "step": 386200 }, { - "epoch": 3.94, - "learning_rate": 5.215580059890828e-05, - "loss": 0.8942, + "epoch": 5.322256206773029, + "grad_norm": 4.556617736816406, + "learning_rate": 4.012570292412895e-05, + "loss": 0.5801, "step": 386300 }, { - "epoch": 3.94, - "learning_rate": 5.215132752434811e-05, - "loss": 0.6753, + "epoch": 5.323633958832769, + "grad_norm": 4.628555774688721, + "learning_rate": 4.0118581661140366e-05, + "loss": 0.5386, "step": 386400 }, { - "epoch": 3.94, - "learning_rate": 5.214685336672022e-05, - "loss": 0.6987, + "epoch": 5.3250117108925075, + "grad_norm": 47.28902053833008, + "learning_rate": 4.011145932613869e-05, + "loss": 0.5367, "step": 386500 }, { - "epoch": 3.94, - "learning_rate": 5.214237812624336e-05, - "loss": 0.6251, + "epoch": 5.326389462952247, + "grad_norm": 12.198378562927246, + "learning_rate": 4.0104335919729046e-05, + "loss": 0.6269, "step": 386600 }, { - "epoch": 3.94, - "learning_rate": 5.213790180313635e-05, - "loss": 0.7138, + "epoch": 5.327767215011987, + "grad_norm": 20.024791717529297, + "learning_rate": 4.009721144251658e-05, + "loss": 0.5378, "step": 386700 }, { - "epoch": 3.94, - "learning_rate": 5.2133424397618045e-05, - "loss": 0.7835, + "epoch": 5.329144967071726, + "grad_norm": 6.95658016204834, + "learning_rate": 4.0090085895106536e-05, + "loss": 0.5614, "step": 386800 }, { - "epoch": 3.94, - "learning_rate": 5.212894590990738e-05, - "loss": 0.7938, + "epoch": 5.330522719131465, + "grad_norm": 2.594550609588623, + "learning_rate": 4.0082959278104305e-05, + "loss": 0.5719, "step": 386900 }, { - "epoch": 3.94, - "learning_rate": 5.212446634022332e-05, - "loss": 0.7334, + "epoch": 5.331900471191204, + "grad_norm": 9.886303901672363, + "learning_rate": 4.0075831592115304e-05, + "loss": 0.5286, "step": 387000 }, { - "epoch": 3.94, - "learning_rate": 5.211998568878489e-05, - "loss": 0.6429, + "epoch": 5.333278223250944, + "grad_norm": 6.014042854309082, + "learning_rate": 4.0068702837745065e-05, + "loss": 0.6058, "step": 387100 }, { - "epoch": 3.94, - "learning_rate": 5.211550395581116e-05, - "loss": 0.7707, + "epoch": 5.3346559753106835, + "grad_norm": 17.563846588134766, + "learning_rate": 4.0061573015599225e-05, + "loss": 0.4989, "step": 387200 }, { - "epoch": 3.95, - "learning_rate": 5.211102114152126e-05, - "loss": 0.7496, + "epoch": 5.336033727370422, + "grad_norm": 2.6834816932678223, + "learning_rate": 4.00544421262835e-05, + "loss": 0.5503, "step": 387300 }, { - "epoch": 3.95, - "learning_rate": 5.2106537246134386e-05, - "loss": 0.6907, + "epoch": 5.337411479430162, + "grad_norm": 42.77082824707031, + "learning_rate": 4.0047310170403675e-05, + "loss": 0.6368, "step": 387400 }, { - "epoch": 3.95, - "learning_rate": 5.2102052269869765e-05, - "loss": 0.6363, + "epoch": 5.338789231489901, + "grad_norm": 7.212562084197998, + "learning_rate": 4.004017714856566e-05, + "loss": 0.6061, "step": 387500 }, { - "epoch": 3.95, - "learning_rate": 5.2097566212946686e-05, - "loss": 0.6997, + "epoch": 5.340166983549641, + "grad_norm": 11.456828117370605, + "learning_rate": 4.003304306137545e-05, + "loss": 0.5994, "step": 387600 }, { - "epoch": 3.95, - "learning_rate": 5.209307907558449e-05, - "loss": 0.8146, + "epoch": 5.3415447356093795, + "grad_norm": 13.261024475097656, + "learning_rate": 4.002590790943911e-05, + "loss": 0.5423, "step": 387700 }, { - "epoch": 3.95, - "learning_rate": 5.2088590858002573e-05, - "loss": 0.7243, + "epoch": 5.342922487669119, + "grad_norm": 5.116427898406982, + "learning_rate": 4.001877169336281e-05, + "loss": 0.4804, "step": 387800 }, { - "epoch": 3.95, - "learning_rate": 5.208410156042038e-05, - "loss": 0.7216, + "epoch": 5.344300239728859, + "grad_norm": 9.281394958496094, + "learning_rate": 4.0011634413752814e-05, + "loss": 0.5409, "step": 387900 }, { - "epoch": 3.95, - "learning_rate": 5.2079611183057405e-05, - "loss": 0.7504, + "epoch": 5.345677991788598, + "grad_norm": 3.0764312744140625, + "learning_rate": 4.0004496071215456e-05, + "loss": 0.5561, "step": 388000 }, { - "epoch": 3.95, - "learning_rate": 5.2075119726133215e-05, - "loss": 0.6799, + "epoch": 5.347055743848337, + "grad_norm": 3.0495798587799072, + "learning_rate": 3.999735666635719e-05, + "loss": 0.653, "step": 388100 }, { - "epoch": 3.96, - "learning_rate": 5.2070627189867405e-05, - "loss": 0.7553, + "epoch": 5.348433495908076, + "grad_norm": 5.166534900665283, + "learning_rate": 3.999021619978453e-05, + "loss": 0.5862, "step": 388200 }, { - "epoch": 3.96, - "learning_rate": 5.2066133574479635e-05, - "loss": 0.662, + "epoch": 5.349811247967816, + "grad_norm": 2.615530014038086, + "learning_rate": 3.998307467210411e-05, + "loss": 0.6143, "step": 388300 }, { - "epoch": 3.96, - "learning_rate": 5.206163888018962e-05, - "loss": 0.6261, + "epoch": 5.351189000027555, + "grad_norm": 3.1502695083618164, + "learning_rate": 3.997593208392264e-05, + "loss": 0.5436, "step": 388400 }, { - "epoch": 3.96, - "learning_rate": 5.205714310721712e-05, - "loss": 0.7814, + "epoch": 5.352566752087294, + "grad_norm": 7.142763614654541, + "learning_rate": 3.996878843584691e-05, + "loss": 0.5761, "step": 388500 }, { - "epoch": 3.96, - "learning_rate": 5.205264625578195e-05, - "loss": 0.8209, + "epoch": 5.353944504147034, + "grad_norm": 2.311624765396118, + "learning_rate": 3.9961643728483806e-05, + "loss": 0.6069, "step": 388600 }, { - "epoch": 3.96, - "learning_rate": 5.2048148326103983e-05, - "loss": 0.7429, + "epoch": 5.355322256206773, + "grad_norm": 4.66115665435791, + "learning_rate": 3.995449796244033e-05, + "loss": 0.5677, "step": 388700 }, { - "epoch": 3.96, - "learning_rate": 5.2043649318403146e-05, - "loss": 0.6306, + "epoch": 5.356700008266513, + "grad_norm": 5.503378868103027, + "learning_rate": 3.994735113832352e-05, + "loss": 0.5195, "step": 388800 }, { - "epoch": 3.96, - "learning_rate": 5.2039149232899395e-05, - "loss": 0.758, + "epoch": 5.3580777603262515, + "grad_norm": 5.903469562530518, + "learning_rate": 3.9940203256740554e-05, + "loss": 0.5981, "step": 388900 }, { - "epoch": 3.96, - "learning_rate": 5.203464806981278e-05, - "loss": 0.8028, + "epoch": 5.359455512385991, + "grad_norm": 10.365694046020508, + "learning_rate": 3.9933054318298684e-05, + "loss": 0.6242, "step": 389000 }, { - "epoch": 3.96, - "learning_rate": 5.203014582936336e-05, - "loss": 0.7253, + "epoch": 5.360833264445731, + "grad_norm": 3.0333306789398193, + "learning_rate": 3.992590432360523e-05, + "loss": 0.601, "step": 389100 }, { - "epoch": 3.97, - "learning_rate": 5.202564251177129e-05, - "loss": 0.803, + "epoch": 5.362211016505469, + "grad_norm": 4.537501811981201, + "learning_rate": 3.991875327326764e-05, + "loss": 0.636, "step": 389200 }, { - "epoch": 3.97, - "learning_rate": 5.2021138117256736e-05, - "loss": 0.7481, + "epoch": 5.363588768565209, + "grad_norm": 6.623795509338379, + "learning_rate": 3.991160116789343e-05, + "loss": 0.5535, "step": 389300 }, { - "epoch": 3.97, - "learning_rate": 5.201667770608106e-05, - "loss": 0.7113, + "epoch": 5.364966520624948, + "grad_norm": 0.8408687710762024, + "learning_rate": 3.9904448008090196e-05, + "loss": 0.5941, "step": 389400 }, { - "epoch": 3.97, - "learning_rate": 5.2012216239843275e-05, - "loss": 0.7134, + "epoch": 5.366344272684688, + "grad_norm": 9.286808013916016, + "learning_rate": 3.989729379446565e-05, + "loss": 0.6384, "step": 389500 }, { - "epoch": 3.97, - "learning_rate": 5.2007708637405996e-05, - "loss": 0.7832, + "epoch": 5.367722024744427, + "grad_norm": 5.8450422286987305, + "learning_rate": 3.989013852762757e-05, + "loss": 0.5605, "step": 389600 }, { - "epoch": 3.97, - "learning_rate": 5.200319995892311e-05, - "loss": 0.702, + "epoch": 5.369099776804166, + "grad_norm": 42.73175048828125, + "learning_rate": 3.988305377658667e-05, + "loss": 0.6346, "step": 389700 }, { - "epoch": 3.97, - "learning_rate": 5.199869020461505e-05, - "loss": 0.7066, + "epoch": 5.370477528863906, + "grad_norm": 3.9566800594329834, + "learning_rate": 3.987589641566221e-05, + "loss": 0.6917, "step": 389800 }, { - "epoch": 3.97, - "learning_rate": 5.1994179374702324e-05, - "loss": 0.6146, + "epoch": 5.371855280923645, + "grad_norm": 16.253812789916992, + "learning_rate": 3.986873800334204e-05, + "loss": 0.5924, "step": 389900 }, { - "epoch": 3.97, - "learning_rate": 5.198966746940548e-05, - "loss": 0.7075, + "epoch": 5.373233032983384, + "grad_norm": 19.51742172241211, + "learning_rate": 3.986157854023429e-05, + "loss": 0.6879, "step": 390000 }, { - "epoch": 3.97, - "learning_rate": 5.198515448894511e-05, - "loss": 0.6927, + "epoch": 5.3746107850431235, + "grad_norm": 1.9851157665252686, + "learning_rate": 3.9854418026947206e-05, + "loss": 0.5342, "step": 390100 }, { - "epoch": 3.98, - "learning_rate": 5.1980640433541906e-05, - "loss": 0.6173, + "epoch": 5.375988537102863, + "grad_norm": 17.37412452697754, + "learning_rate": 3.984725646408911e-05, + "loss": 0.6825, "step": 390200 }, { - "epoch": 3.98, - "learning_rate": 5.1976125303416547e-05, - "loss": 0.852, + "epoch": 5.377366289162603, + "grad_norm": 30.969364166259766, + "learning_rate": 3.98400938522684e-05, + "loss": 0.571, "step": 390300 }, { - "epoch": 3.98, - "learning_rate": 5.197160909878981e-05, - "loss": 0.6807, + "epoch": 5.378744041222341, + "grad_norm": 62.48212814331055, + "learning_rate": 3.983293019209359e-05, + "loss": 0.6106, "step": 390400 }, { - "epoch": 3.98, - "learning_rate": 5.1967091819882514e-05, - "loss": 0.7088, + "epoch": 5.380121793282081, + "grad_norm": 11.327045440673828, + "learning_rate": 3.982576548417326e-05, + "loss": 0.6305, "step": 390500 }, { - "epoch": 3.98, - "learning_rate": 5.196257346691551e-05, - "loss": 0.6933, + "epoch": 5.38149954534182, + "grad_norm": 9.365761756896973, + "learning_rate": 3.981867139184801e-05, + "loss": 0.774, "step": 390600 }, { - "epoch": 3.98, - "learning_rate": 5.195805404010974e-05, - "loss": 0.7684, + "epoch": 5.38287729740156, + "grad_norm": 1.2081712484359741, + "learning_rate": 3.9811504600725044e-05, + "loss": 0.5846, "step": 390700 }, { - "epoch": 3.98, - "learning_rate": 5.1953533539686155e-05, - "loss": 0.8055, + "epoch": 5.384255049461299, + "grad_norm": 7.587369918823242, + "learning_rate": 3.980433676367678e-05, + "loss": 0.6117, "step": 390800 }, { - "epoch": 3.98, - "learning_rate": 5.194905718691659e-05, - "loss": 0.7189, + "epoch": 5.385632801521038, + "grad_norm": 3.743663787841797, + "learning_rate": 3.979716788131216e-05, + "loss": 0.6546, "step": 390900 }, { - "epoch": 3.98, - "learning_rate": 5.194453455065119e-05, - "loss": 0.7271, + "epoch": 5.387010553580778, + "grad_norm": 2.2835028171539307, + "learning_rate": 3.9789997954240215e-05, + "loss": 0.5886, "step": 391000 }, { - "epoch": 3.98, - "learning_rate": 5.194001084142902e-05, - "loss": 0.7927, + "epoch": 5.388388305640517, + "grad_norm": 59.94197463989258, + "learning_rate": 3.9782826983070077e-05, + "loss": 0.6517, "step": 391100 }, { - "epoch": 3.99, - "learning_rate": 5.193548605947124e-05, - "loss": 0.7627, + "epoch": 5.389766057700256, + "grad_norm": 9.970721244812012, + "learning_rate": 3.9775654968410946e-05, + "loss": 0.6597, "step": 391200 }, { - "epoch": 3.99, - "learning_rate": 5.193096020499911e-05, - "loss": 0.7761, + "epoch": 5.3911438097599955, + "grad_norm": 17.020177841186523, + "learning_rate": 3.976848191087211e-05, + "loss": 0.5796, "step": 391300 }, { - "epoch": 3.99, - "learning_rate": 5.192643327823389e-05, - "loss": 0.8092, + "epoch": 5.392521561819735, + "grad_norm": 2.878309965133667, + "learning_rate": 3.976130781106298e-05, + "loss": 0.6131, "step": 391400 }, { - "epoch": 3.99, - "learning_rate": 5.192190527939693e-05, - "loss": 0.7005, + "epoch": 5.393899313879475, + "grad_norm": 10.692058563232422, + "learning_rate": 3.9754204426161944e-05, + "loss": 0.6202, "step": 391500 }, { - "epoch": 3.99, - "learning_rate": 5.191737620870963e-05, - "loss": 0.6574, + "epoch": 5.395277065939213, + "grad_norm": 4.510809421539307, + "learning_rate": 3.974702825404823e-05, + "loss": 0.571, "step": 391600 }, { - "epoch": 3.99, - "learning_rate": 5.191284606639344e-05, - "loss": 0.7435, + "epoch": 5.396654817998953, + "grad_norm": 2.2511394023895264, + "learning_rate": 3.9739851041486794e-05, + "loss": 0.619, "step": 391700 }, { - "epoch": 3.99, - "learning_rate": 5.190831485266983e-05, - "loss": 0.6932, + "epoch": 5.398032570058692, + "grad_norm": 89.06232452392578, + "learning_rate": 3.97326727890874e-05, + "loss": 0.6232, "step": 391800 }, { - "epoch": 3.99, - "learning_rate": 5.190378256776037e-05, - "loss": 0.6915, + "epoch": 5.399410322118432, + "grad_norm": 3.4528465270996094, + "learning_rate": 3.9725493497459873e-05, + "loss": 0.5915, "step": 391900 }, { - "epoch": 3.99, - "learning_rate": 5.189924921188667e-05, - "loss": 0.7471, + "epoch": 5.400788074178171, + "grad_norm": 30.032114028930664, + "learning_rate": 3.971831316721412e-05, + "loss": 0.5203, "step": 392000 }, { - "epoch": 3.99, - "learning_rate": 5.1894714785270355e-05, - "loss": 0.7607, + "epoch": 5.40216582623791, + "grad_norm": 2.21948504447937, + "learning_rate": 3.9711131798960166e-05, + "loss": 0.7044, "step": 392100 }, { - "epoch": 4.0, - "learning_rate": 5.1890179288133154e-05, - "loss": 0.6891, + "epoch": 5.40354357829765, + "grad_norm": 206.881591796875, + "learning_rate": 3.9703949393308086e-05, + "loss": 0.6272, "step": 392200 }, { - "epoch": 4.0, - "learning_rate": 5.188564272069682e-05, - "loss": 0.6575, + "epoch": 5.404921330357389, + "grad_norm": 3.525752305984497, + "learning_rate": 3.9696765950868074e-05, + "loss": 0.6257, "step": 392300 }, { - "epoch": 4.0, - "learning_rate": 5.188110508318316e-05, - "loss": 0.7304, + "epoch": 5.406299082417128, + "grad_norm": 6.595919609069824, + "learning_rate": 3.9689581472250394e-05, + "loss": 0.6626, "step": 392400 }, { - "epoch": 4.0, - "learning_rate": 5.187656637581405e-05, - "loss": 0.6718, + "epoch": 5.4076768344768675, + "grad_norm": 57.39454650878906, + "learning_rate": 3.968239595806541e-05, + "loss": 0.68, "step": 392500 }, { - "epoch": 4.0, - "learning_rate": 5.187207200187537e-05, - "loss": 0.6916, + "epoch": 5.409054586536607, + "grad_norm": 7.214725017547607, + "learning_rate": 3.967520940892356e-05, + "loss": 0.5895, "step": 392600 }, { - "epoch": 4.0, - "learning_rate": 5.1867531166154155e-05, - "loss": 0.6901, + "epoch": 5.410432338596346, + "grad_norm": 10.026534080505371, + "learning_rate": 3.966802182543539e-05, + "loss": 0.5966, "step": 392700 }, { - "epoch": 4.0, - "learning_rate": 5.186298926124116e-05, - "loss": 0.6898, + "epoch": 5.411810090656085, + "grad_norm": 9.579185485839844, + "learning_rate": 3.96608332082115e-05, + "loss": 0.6058, "step": 392800 }, { - "epoch": 4.0, - "learning_rate": 5.1858446287358466e-05, - "loss": 0.728, + "epoch": 5.413187842715825, + "grad_norm": 9.621077537536621, + "learning_rate": 3.9653643557862635e-05, + "loss": 0.5532, "step": 392900 }, { - "epoch": 4.0, - "learning_rate": 5.1853902244728195e-05, - "loss": 0.6864, + "epoch": 5.414565594775564, + "grad_norm": 157.06382751464844, + "learning_rate": 3.964645287499955e-05, + "loss": 0.5576, "step": 393000 }, { - "epoch": 4.0, - "learning_rate": 5.1849357133572516e-05, - "loss": 0.6824, + "epoch": 5.415943346835304, + "grad_norm": 6.495384216308594, + "learning_rate": 3.9639261160233174e-05, + "loss": 0.6186, "step": 393100 }, { - "epoch": 4.01, - "learning_rate": 5.184481095411366e-05, - "loss": 0.7674, + "epoch": 5.417321098895043, + "grad_norm": 8.68899154663086, + "learning_rate": 3.963206841417444e-05, + "loss": 0.5747, "step": 393200 }, { - "epoch": 4.01, - "learning_rate": 5.1840263706573917e-05, - "loss": 0.7345, + "epoch": 5.418698850954782, + "grad_norm": 10.314475059509277, + "learning_rate": 3.962487463743444e-05, + "loss": 0.5613, "step": 393300 }, { - "epoch": 4.01, - "learning_rate": 5.183571539117562e-05, - "loss": 0.6891, + "epoch": 5.420076603014522, + "grad_norm": 2.918860912322998, + "learning_rate": 3.961767983062431e-05, + "loss": 0.5409, "step": 393400 }, { - "epoch": 4.01, - "learning_rate": 5.1831166008141145e-05, - "loss": 0.7363, + "epoch": 5.42145435507426, + "grad_norm": 13.488360404968262, + "learning_rate": 3.961048399435527e-05, + "loss": 0.6498, "step": 393500 }, { - "epoch": 4.01, - "learning_rate": 5.182661555769292e-05, - "loss": 0.5846, + "epoch": 5.422832107134, + "grad_norm": 3.7352588176727295, + "learning_rate": 3.960328712923867e-05, + "loss": 0.5815, "step": 393600 }, { - "epoch": 4.01, - "learning_rate": 5.182206404005347e-05, - "loss": 0.6082, + "epoch": 5.4242098591937395, + "grad_norm": 11.204102516174316, + "learning_rate": 3.9596089235885904e-05, + "loss": 0.5885, "step": 393700 }, { - "epoch": 4.01, - "learning_rate": 5.1817511455445306e-05, - "loss": 0.681, + "epoch": 5.425587611253479, + "grad_norm": 4.194942474365234, + "learning_rate": 3.9588890314908475e-05, + "loss": 0.5427, "step": 393800 }, { - "epoch": 4.01, - "learning_rate": 5.181295780409102e-05, - "loss": 0.721, + "epoch": 5.426965363313218, + "grad_norm": 6.698248386383057, + "learning_rate": 3.9581690366917976e-05, + "loss": 0.5564, "step": 393900 }, { - "epoch": 4.01, - "learning_rate": 5.180840308621328e-05, - "loss": 0.7568, + "epoch": 5.428343115372957, + "grad_norm": 9.856194496154785, + "learning_rate": 3.9574489392526054e-05, + "loss": 0.6457, "step": 394000 }, { - "epoch": 4.02, - "learning_rate": 5.180384730203479e-05, - "loss": 0.67, + "epoch": 5.429720867432697, + "grad_norm": 2.5510141849517822, + "learning_rate": 3.9567287392344497e-05, + "loss": 0.6215, "step": 394100 }, { - "epoch": 4.02, - "learning_rate": 5.179929045177827e-05, - "loss": 0.6214, + "epoch": 5.431098619492436, + "grad_norm": 35.342491149902344, + "learning_rate": 3.956008436698514e-05, + "loss": 0.6565, "step": 394200 }, { - "epoch": 4.02, - "learning_rate": 5.179473253566654e-05, - "loss": 0.7553, + "epoch": 5.432476371552175, + "grad_norm": 42.12504577636719, + "learning_rate": 3.9552880317059906e-05, + "loss": 0.6903, "step": 394300 }, { - "epoch": 4.02, - "learning_rate": 5.179017355392245e-05, - "loss": 0.6994, + "epoch": 5.433854123611915, + "grad_norm": 6.624769687652588, + "learning_rate": 3.954567524318084e-05, + "loss": 0.5621, "step": 394400 }, { - "epoch": 4.02, - "learning_rate": 5.178561350676891e-05, - "loss": 0.6939, + "epoch": 5.435231875671654, + "grad_norm": 6.960642337799072, + "learning_rate": 3.9538469145960036e-05, + "loss": 0.5903, "step": 394500 }, { - "epoch": 4.02, - "learning_rate": 5.178105239442889e-05, - "loss": 0.7227, + "epoch": 5.436609627731394, + "grad_norm": 4.90280294418335, + "learning_rate": 3.953126202600968e-05, + "loss": 0.6016, "step": 394600 }, { - "epoch": 4.02, - "learning_rate": 5.177649021712539e-05, - "loss": 0.6822, + "epoch": 5.4379873797911324, + "grad_norm": 18.313831329345703, + "learning_rate": 3.952405388394208e-05, + "loss": 0.5806, "step": 394700 }, { - "epoch": 4.02, - "learning_rate": 5.177192697508146e-05, - "loss": 0.6834, + "epoch": 5.439365131850872, + "grad_norm": 7.8750433921813965, + "learning_rate": 3.9516844720369566e-05, + "loss": 0.5819, "step": 394800 }, { - "epoch": 4.02, - "learning_rate": 5.1767362668520236e-05, - "loss": 0.6197, + "epoch": 5.440742883910612, + "grad_norm": 4.338384628295898, + "learning_rate": 3.9509634535904625e-05, + "loss": 0.6167, "step": 394900 }, { - "epoch": 4.02, - "learning_rate": 5.176279729766488e-05, - "loss": 0.6922, + "epoch": 5.442120635970351, + "grad_norm": 9.941856384277344, + "learning_rate": 3.95024233311598e-05, + "loss": 0.6363, "step": 395000 }, { - "epoch": 4.03, - "learning_rate": 5.175823086273861e-05, - "loss": 0.7451, + "epoch": 5.44349838803009, + "grad_norm": 3.9847335815429688, + "learning_rate": 3.949521110674769e-05, + "loss": 0.6363, "step": 395100 }, { - "epoch": 4.03, - "learning_rate": 5.1753663363964695e-05, - "loss": 0.6559, + "epoch": 5.444876140089829, + "grad_norm": 5.161157608032227, + "learning_rate": 3.948799786328104e-05, + "loss": 0.4969, "step": 395200 }, { - "epoch": 4.03, - "learning_rate": 5.174909480156646e-05, - "loss": 0.6522, + "epoch": 5.446253892149569, + "grad_norm": 4.168668746948242, + "learning_rate": 3.948078360137264e-05, + "loss": 0.5912, "step": 395300 }, { - "epoch": 4.03, - "learning_rate": 5.174452517576728e-05, - "loss": 0.7406, + "epoch": 5.4476316442093085, + "grad_norm": 7.99725866317749, + "learning_rate": 3.9473568321635385e-05, + "loss": 0.5631, "step": 395400 }, { - "epoch": 4.03, - "learning_rate": 5.173995448679059e-05, - "loss": 0.7419, + "epoch": 5.449009396269047, + "grad_norm": 5.162728786468506, + "learning_rate": 3.946635202468224e-05, + "loss": 0.5957, "step": 395500 }, { - "epoch": 4.03, - "learning_rate": 5.1735382734859856e-05, - "loss": 0.6474, + "epoch": 5.450387148328787, + "grad_norm": 27.24070167541504, + "learning_rate": 3.945913471112627e-05, + "loss": 0.5731, "step": 395600 }, { - "epoch": 4.03, - "learning_rate": 5.1730809920198624e-05, - "loss": 0.7253, + "epoch": 5.451764900388526, + "grad_norm": 23.206134796142578, + "learning_rate": 3.945191638158062e-05, + "loss": 0.6385, "step": 395700 }, { - "epoch": 4.03, - "learning_rate": 5.1726236043030464e-05, - "loss": 0.6698, + "epoch": 5.453142652448266, + "grad_norm": 6.573702812194824, + "learning_rate": 3.944469703665853e-05, + "loss": 0.6125, "step": 395800 }, { - "epoch": 4.03, - "learning_rate": 5.1721661103579014e-05, - "loss": 0.7034, + "epoch": 5.4545204045080045, + "grad_norm": 3.9449212551116943, + "learning_rate": 3.943747667697333e-05, + "loss": 0.6089, "step": 395900 }, { - "epoch": 4.03, - "learning_rate": 5.1717085102067964e-05, - "loss": 0.6473, + "epoch": 5.455898156567744, + "grad_norm": 4.865775108337402, + "learning_rate": 3.9430255303138405e-05, + "loss": 0.5685, "step": 396000 }, { - "epoch": 4.04, - "learning_rate": 5.171250803872106e-05, - "loss": 0.7021, + "epoch": 5.457275908627484, + "grad_norm": 70.38639068603516, + "learning_rate": 3.942303291576725e-05, + "loss": 0.6121, "step": 396100 }, { - "epoch": 4.04, - "learning_rate": 5.170792991376208e-05, - "loss": 0.7163, + "epoch": 5.458653660687223, + "grad_norm": 7.472474575042725, + "learning_rate": 3.941580951547346e-05, + "loss": 0.5594, "step": 396200 }, { - "epoch": 4.04, - "learning_rate": 5.170335072741488e-05, - "loss": 0.6545, + "epoch": 5.460031412746962, + "grad_norm": 2.748600482940674, + "learning_rate": 3.9408585102870694e-05, + "loss": 0.5541, "step": 396300 }, { - "epoch": 4.04, - "learning_rate": 5.169877047990335e-05, - "loss": 0.67, + "epoch": 5.461409164806701, + "grad_norm": 9.134875297546387, + "learning_rate": 3.940135967857269e-05, + "loss": 0.5642, "step": 396400 }, { - "epoch": 4.04, - "learning_rate": 5.169418917145142e-05, - "loss": 0.6119, + "epoch": 5.462786916866441, + "grad_norm": 18.136253356933594, + "learning_rate": 3.93941332431933e-05, + "loss": 0.5701, "step": 396500 }, { - "epoch": 4.04, - "learning_rate": 5.1689606802283116e-05, - "loss": 0.6803, + "epoch": 5.4641646689261805, + "grad_norm": 5.452502250671387, + "learning_rate": 3.938690579734644e-05, + "loss": 0.6575, "step": 396600 }, { - "epoch": 4.04, - "learning_rate": 5.168502337262247e-05, - "loss": 0.7381, + "epoch": 5.465542420985919, + "grad_norm": 8.468731880187988, + "learning_rate": 3.937967734164612e-05, + "loss": 0.5374, "step": 396700 }, { - "epoch": 4.04, - "learning_rate": 5.16804388826936e-05, - "loss": 0.7128, + "epoch": 5.466920173045659, + "grad_norm": 6.156403064727783, + "learning_rate": 3.937244787670644e-05, + "loss": 0.5167, "step": 396800 }, { - "epoch": 4.04, - "learning_rate": 5.167585333272064e-05, - "loss": 0.7427, + "epoch": 5.468297925105398, + "grad_norm": 6.814610481262207, + "learning_rate": 3.9365217403141564e-05, + "loss": 0.5811, "step": 396900 }, { - "epoch": 4.04, - "learning_rate": 5.1671266722927816e-05, - "loss": 0.6865, + "epoch": 5.469675677165137, + "grad_norm": 12.494441986083984, + "learning_rate": 3.9357985921565765e-05, + "loss": 0.6224, "step": 397000 }, { - "epoch": 4.05, - "learning_rate": 5.1666679053539374e-05, - "loss": 0.7076, + "epoch": 5.4710534292248765, + "grad_norm": 5.694009780883789, + "learning_rate": 3.93507534325934e-05, + "loss": 0.553, "step": 397100 }, { - "epoch": 4.05, - "learning_rate": 5.1662090324779626e-05, - "loss": 0.7428, + "epoch": 5.472431181284616, + "grad_norm": 4.90285587310791, + "learning_rate": 3.934351993683891e-05, + "loss": 0.5698, "step": 397200 }, { - "epoch": 4.05, - "learning_rate": 5.165750053687293e-05, - "loss": 0.6535, + "epoch": 5.473808933344356, + "grad_norm": 12.405680656433105, + "learning_rate": 3.933628543491681e-05, + "loss": 0.5481, "step": 397300 }, { - "epoch": 4.05, - "learning_rate": 5.165290969004371e-05, - "loss": 0.5942, + "epoch": 5.475186685404095, + "grad_norm": 3.627758264541626, + "learning_rate": 3.93290499274417e-05, + "loss": 0.5757, "step": 397400 }, { - "epoch": 4.05, - "learning_rate": 5.164831778451641e-05, - "loss": 0.7949, + "epoch": 5.476564437463834, + "grad_norm": 2.3824477195739746, + "learning_rate": 3.932188578512485e-05, + "loss": 0.6218, "step": 397500 }, { - "epoch": 4.05, - "learning_rate": 5.164372482051558e-05, - "loss": 0.7817, + "epoch": 5.477942189523573, + "grad_norm": 8.639028549194336, + "learning_rate": 3.9314648278428094e-05, + "loss": 0.5181, "step": 397600 }, { - "epoch": 4.05, - "learning_rate": 5.163913079826576e-05, - "loss": 0.6795, + "epoch": 5.479319941583313, + "grad_norm": 12.686386108398438, + "learning_rate": 3.9307409768016524e-05, + "loss": 0.5235, "step": 397700 }, { - "epoch": 4.05, - "learning_rate": 5.163453571799157e-05, - "loss": 0.688, + "epoch": 5.480697693643052, + "grad_norm": 40.40836715698242, + "learning_rate": 3.93001702545051e-05, + "loss": 0.6171, "step": 397800 }, { - "epoch": 4.05, - "learning_rate": 5.16299395799177e-05, - "loss": 0.6463, + "epoch": 5.482075445702791, + "grad_norm": 11.910573959350586, + "learning_rate": 3.9292929738508833e-05, + "loss": 0.6197, "step": 397900 }, { - "epoch": 4.05, - "learning_rate": 5.162534238426887e-05, - "loss": 0.6954, + "epoch": 5.483453197762531, + "grad_norm": 71.02980041503906, + "learning_rate": 3.9285688220642856e-05, + "loss": 0.5833, "step": 398000 }, { - "epoch": 4.06, - "learning_rate": 5.1620744131269845e-05, - "loss": 0.6094, + "epoch": 5.48483094982227, + "grad_norm": 16.957067489624023, + "learning_rate": 3.9278445701522366e-05, + "loss": 0.638, "step": 398100 }, { - "epoch": 4.06, - "learning_rate": 5.1616144821145454e-05, - "loss": 0.7663, + "epoch": 5.486208701882009, + "grad_norm": 8.490316390991211, + "learning_rate": 3.9271202181762655e-05, + "loss": 0.5639, "step": 398200 }, { - "epoch": 4.06, - "learning_rate": 5.1611544454120586e-05, - "loss": 0.8094, + "epoch": 5.4875864539417485, + "grad_norm": 3.2989583015441895, + "learning_rate": 3.92639576619791e-05, + "loss": 0.6038, "step": 398300 }, { - "epoch": 4.06, - "learning_rate": 5.1606943030420165e-05, - "loss": 0.6036, + "epoch": 5.488964206001488, + "grad_norm": 2.340512752532959, + "learning_rate": 3.925671214278716e-05, + "loss": 0.6419, "step": 398400 }, { - "epoch": 4.06, - "learning_rate": 5.160234055026917e-05, - "loss": 0.7686, + "epoch": 5.490341958061228, + "grad_norm": 4.294698715209961, + "learning_rate": 3.924946562480237e-05, + "loss": 0.5905, "step": 398500 }, { - "epoch": 4.06, - "learning_rate": 5.1597737013892645e-05, - "loss": 0.6722, + "epoch": 5.491719710120966, + "grad_norm": 6.179967403411865, + "learning_rate": 3.9242218108640376e-05, + "loss": 0.57, "step": 398600 }, { - "epoch": 4.06, - "learning_rate": 5.159313242151566e-05, - "loss": 0.6498, + "epoch": 5.493097462180706, + "grad_norm": 6.88069486618042, + "learning_rate": 3.923496959491688e-05, + "loss": 0.5298, "step": 398700 }, { - "epoch": 4.06, - "learning_rate": 5.158852677336336e-05, - "loss": 0.6911, + "epoch": 5.494475214240445, + "grad_norm": 10.764232635498047, + "learning_rate": 3.922772008424767e-05, + "loss": 0.5, "step": 398800 }, { - "epoch": 4.06, - "learning_rate": 5.158392006966094e-05, - "loss": 0.7501, + "epoch": 5.495852966300185, + "grad_norm": 8.831521034240723, + "learning_rate": 3.922046957724865e-05, + "loss": 0.6417, "step": 398900 }, { - "epoch": 4.07, - "learning_rate": 5.1579312310633635e-05, - "loss": 0.6923, + "epoch": 5.497230718359924, + "grad_norm": 11.632200241088867, + "learning_rate": 3.921321807453577e-05, + "loss": 0.5413, "step": 399000 }, { - "epoch": 4.07, - "learning_rate": 5.157470349650674e-05, - "loss": 0.6912, + "epoch": 5.498608470419663, + "grad_norm": 15.019420623779297, + "learning_rate": 3.92059655767251e-05, + "loss": 0.6104, "step": 399100 }, { - "epoch": 4.07, - "learning_rate": 5.1570093627505586e-05, - "loss": 0.7275, + "epoch": 5.499986222479403, + "grad_norm": 6.926292896270752, + "learning_rate": 3.919871208443275e-05, + "loss": 0.6515, "step": 399200 }, { - "epoch": 4.07, - "learning_rate": 5.1565482703855584e-05, - "loss": 0.689, + "epoch": 5.501363974539142, + "grad_norm": 21.12950325012207, + "learning_rate": 3.9191457598274964e-05, + "loss": 0.5821, "step": 399300 }, { - "epoch": 4.07, - "learning_rate": 5.156087072578217e-05, - "loss": 0.7092, + "epoch": 5.502741726598881, + "grad_norm": 3.0918517112731934, + "learning_rate": 3.918420211886806e-05, + "loss": 0.5749, "step": 399400 }, { - "epoch": 4.07, - "learning_rate": 5.155625769351085e-05, - "loss": 0.6666, + "epoch": 5.5041194786586205, + "grad_norm": 3.0059430599212646, + "learning_rate": 3.91769456468284e-05, + "loss": 0.5806, "step": 399500 }, { - "epoch": 4.07, - "learning_rate": 5.155164360726717e-05, - "loss": 0.8005, + "epoch": 5.50549723071836, + "grad_norm": 16.383102416992188, + "learning_rate": 3.916968818277246e-05, + "loss": 0.5962, "step": 399600 }, { - "epoch": 4.07, - "learning_rate": 5.1547028467276726e-05, - "loss": 0.6433, + "epoch": 5.5068749827781, + "grad_norm": 4.6504340171813965, + "learning_rate": 3.916242972731681e-05, + "loss": 0.6096, "step": 399700 }, { - "epoch": 4.07, - "learning_rate": 5.154241227376518e-05, - "loss": 0.7297, + "epoch": 5.508252734837838, + "grad_norm": 1.9566203355789185, + "learning_rate": 3.91551702810781e-05, + "loss": 0.5579, "step": 399800 }, { - "epoch": 4.07, - "learning_rate": 5.1537841204639364e-05, - "loss": 0.7473, + "epoch": 5.509630486897578, + "grad_norm": 13.979101181030273, + "learning_rate": 3.9147909844673035e-05, + "loss": 0.6306, "step": 399900 }, { - "epoch": 4.08, - "learning_rate": 5.1533222915292354e-05, - "loss": 0.725, + "epoch": 5.511008238957317, + "grad_norm": 1.4463425874710083, + "learning_rate": 3.914072103787423e-05, + "loss": 0.5993, "step": 400000 }, { - "epoch": 4.08, - "learning_rate": 5.1528603573099234e-05, - "loss": 0.6883, + "epoch": 5.512385991017057, + "grad_norm": 4.543508052825928, + "learning_rate": 3.913345863287328e-05, + "loss": 0.6163, "step": 400100 }, { - "epoch": 4.08, - "learning_rate": 5.152398317828589e-05, - "loss": 0.6748, + "epoch": 5.513763743076796, + "grad_norm": 6.191413879394531, + "learning_rate": 3.91261952395505e-05, + "loss": 0.5561, "step": 400200 }, { - "epoch": 4.08, - "learning_rate": 5.151936173107821e-05, - "loss": 0.6349, + "epoch": 5.515141495136535, + "grad_norm": 18.279468536376953, + "learning_rate": 3.911893085852295e-05, + "loss": 0.5004, "step": 400300 }, { - "epoch": 4.08, - "learning_rate": 5.15147854619034e-05, - "loss": 0.6914, + "epoch": 5.516519247196275, + "grad_norm": 2.0381975173950195, + "learning_rate": 3.911166549040778e-05, + "loss": 0.5924, "step": 400400 }, { - "epoch": 4.08, - "learning_rate": 5.15101619211033e-05, - "loss": 0.7088, + "epoch": 5.517896999256013, + "grad_norm": 5.469998836517334, + "learning_rate": 3.910439913582223e-05, + "loss": 0.5727, "step": 400500 }, { - "epoch": 4.08, - "learning_rate": 5.150553732858465e-05, - "loss": 0.7469, + "epoch": 5.519274751315753, + "grad_norm": 5.091738700866699, + "learning_rate": 3.9097131795383596e-05, + "loss": 0.6142, "step": 400600 }, { - "epoch": 4.08, - "learning_rate": 5.150091168457356e-05, - "loss": 0.6863, + "epoch": 5.5206525033754925, + "grad_norm": 26.629655838012695, + "learning_rate": 3.9089863469709285e-05, + "loss": 0.658, "step": 400700 }, { - "epoch": 4.08, - "learning_rate": 5.149628498929621e-05, - "loss": 0.7228, + "epoch": 5.522030255435232, + "grad_norm": 11.263328552246094, + "learning_rate": 3.908259415941679e-05, + "loss": 0.534, "step": 400800 }, { - "epoch": 4.08, - "learning_rate": 5.1491657242978797e-05, - "loss": 0.8432, + "epoch": 5.523408007494972, + "grad_norm": 5.415755271911621, + "learning_rate": 3.9075323865123665e-05, + "loss": 0.5332, "step": 400900 }, { - "epoch": 4.09, - "learning_rate": 5.1487028445847596e-05, - "loss": 0.6141, + "epoch": 5.52478575955471, + "grad_norm": 193.8293914794922, + "learning_rate": 3.906805258744755e-05, + "loss": 0.5489, "step": 401000 }, { - "epoch": 4.09, - "learning_rate": 5.1482398598128936e-05, - "loss": 0.5481, + "epoch": 5.52616351161445, + "grad_norm": 5.2032904624938965, + "learning_rate": 3.906078032700619e-05, + "loss": 0.5734, "step": 401100 }, { - "epoch": 4.09, - "learning_rate": 5.1477767700049183e-05, - "loss": 0.7714, + "epoch": 5.527541263674189, + "grad_norm": 4.388431072235107, + "learning_rate": 3.90535070844174e-05, + "loss": 0.5996, "step": 401200 }, { - "epoch": 4.09, - "learning_rate": 5.1473135751834775e-05, - "loss": 0.6902, + "epoch": 5.528919015733928, + "grad_norm": 3.1175475120544434, + "learning_rate": 3.9046232860299075e-05, + "loss": 0.5605, "step": 401300 }, { - "epoch": 4.09, - "learning_rate": 5.146850275371216e-05, - "loss": 0.7318, + "epoch": 5.530296767793668, + "grad_norm": 96.86162567138672, + "learning_rate": 3.903895765526919e-05, + "loss": 0.5437, "step": 401400 }, { - "epoch": 4.09, - "learning_rate": 5.146386870590788e-05, - "loss": 0.6945, + "epoch": 5.531674519853407, + "grad_norm": 15.472757339477539, + "learning_rate": 3.903168146994582e-05, + "loss": 0.5706, "step": 401500 }, { - "epoch": 4.09, - "learning_rate": 5.145923360864851e-05, - "loss": 0.7537, + "epoch": 5.533052271913147, + "grad_norm": 3.812843084335327, + "learning_rate": 3.9024404304947124e-05, + "loss": 0.6388, "step": 401600 }, { - "epoch": 4.09, - "learning_rate": 5.145459746216068e-05, - "loss": 0.7469, + "epoch": 5.534430023972886, + "grad_norm": 8.083906173706055, + "learning_rate": 3.9017126160891314e-05, + "loss": 0.6355, "step": 401700 }, { - "epoch": 4.09, - "learning_rate": 5.1449960266671065e-05, - "loss": 0.7908, + "epoch": 5.535807776032625, + "grad_norm": 26.95009422302246, + "learning_rate": 3.900984703839672e-05, + "loss": 0.5922, "step": 401800 }, { - "epoch": 4.09, - "learning_rate": 5.144532202240641e-05, - "loss": 0.6816, + "epoch": 5.5371855280923645, + "grad_norm": 19.113903045654297, + "learning_rate": 3.900256693808174e-05, + "loss": 0.6205, "step": 401900 }, { - "epoch": 4.1, - "learning_rate": 5.144068272959347e-05, - "loss": 0.6899, + "epoch": 5.538563280152104, + "grad_norm": 9.053086280822754, + "learning_rate": 3.899535867617513e-05, + "loss": 0.5659, "step": 402000 }, { - "epoch": 4.1, - "learning_rate": 5.143604238845912e-05, - "loss": 0.7818, + "epoch": 5.539941032211843, + "grad_norm": 37.57135009765625, + "learning_rate": 3.8988149457113135e-05, + "loss": 0.5775, "step": 402100 }, { - "epoch": 4.1, - "learning_rate": 5.14314009992302e-05, - "loss": 0.8172, + "epoch": 5.541318784271582, + "grad_norm": 5.8613152503967285, + "learning_rate": 3.898086644656142e-05, + "loss": 0.5821, "step": 402200 }, { - "epoch": 4.1, - "learning_rate": 5.1426758562133665e-05, - "loss": 0.645, + "epoch": 5.542696536331322, + "grad_norm": 6.441689968109131, + "learning_rate": 3.897358246065138e-05, + "loss": 0.6273, "step": 402300 }, { - "epoch": 4.1, - "learning_rate": 5.142211507739652e-05, - "loss": 0.6912, + "epoch": 5.544074288391061, + "grad_norm": 4.96812629699707, + "learning_rate": 3.896629750000181e-05, + "loss": 0.6514, "step": 402400 }, { - "epoch": 4.1, - "learning_rate": 5.141747054524578e-05, - "loss": 0.7407, + "epoch": 5.5454520404508, + "grad_norm": 4.849573612213135, + "learning_rate": 3.895901156523161e-05, + "loss": 0.5573, "step": 402500 }, { - "epoch": 4.1, - "learning_rate": 5.141282496590855e-05, - "loss": 0.6894, + "epoch": 5.54682979251054, + "grad_norm": 19.24586296081543, + "learning_rate": 3.895172465695975e-05, + "loss": 0.6427, "step": 402600 }, { - "epoch": 4.1, - "learning_rate": 5.140817833961195e-05, - "loss": 0.6066, + "epoch": 5.548207544570279, + "grad_norm": 6.632706642150879, + "learning_rate": 3.8944436775805294e-05, + "loss": 0.4727, "step": 402700 }, { - "epoch": 4.1, - "learning_rate": 5.1403530666583195e-05, - "loss": 0.7579, + "epoch": 5.549585296630019, + "grad_norm": 14.708337783813477, + "learning_rate": 3.893714792238739e-05, + "loss": 0.5611, "step": 402800 }, { - "epoch": 4.1, - "learning_rate": 5.139888194704953e-05, - "loss": 0.5961, + "epoch": 5.550963048689757, + "grad_norm": 9.718215942382812, + "learning_rate": 3.892985809732525e-05, + "loss": 0.6052, "step": 402900 }, { - "epoch": 4.11, - "learning_rate": 5.139427868407466e-05, - "loss": 0.7051, + "epoch": 5.552340800749497, + "grad_norm": 11.996245384216309, + "learning_rate": 3.892256730123819e-05, + "loss": 0.5922, "step": 403000 }, { - "epoch": 4.11, - "learning_rate": 5.138962788267246e-05, - "loss": 0.7467, + "epoch": 5.5537185528092365, + "grad_norm": 4.267444133758545, + "learning_rate": 3.89152755347456e-05, + "loss": 0.5713, "step": 403100 }, { - "epoch": 4.11, - "learning_rate": 5.1384976035445094e-05, - "loss": 0.7721, + "epoch": 5.555096304868976, + "grad_norm": 9.997715950012207, + "learning_rate": 3.8907982798466956e-05, + "loss": 0.594, "step": 403200 }, { - "epoch": 4.11, - "learning_rate": 5.1380323142620036e-05, - "loss": 0.662, + "epoch": 5.556474056928715, + "grad_norm": 8.01296329498291, + "learning_rate": 3.89007620348716e-05, + "loss": 0.552, "step": 403300 }, { - "epoch": 4.11, - "learning_rate": 5.137566920442475e-05, - "loss": 0.7069, + "epoch": 5.557851808988454, + "grad_norm": 114.51327514648438, + "learning_rate": 3.889346737056199e-05, + "loss": 0.5488, "step": 403400 }, { - "epoch": 4.11, - "learning_rate": 5.137101422108681e-05, - "loss": 0.5968, + "epoch": 5.559229561048194, + "grad_norm": 1.3845990896224976, + "learning_rate": 3.888617173831905e-05, + "loss": 0.5585, "step": 403500 }, { - "epoch": 4.11, - "learning_rate": 5.1366358192833815e-05, - "loss": 0.5803, + "epoch": 5.560607313107933, + "grad_norm": 13.90626335144043, + "learning_rate": 3.887887513876255e-05, + "loss": 0.5187, "step": 403600 }, { - "epoch": 4.11, - "learning_rate": 5.136170111989341e-05, - "loss": 0.5757, + "epoch": 5.561985065167672, + "grad_norm": 4.643988132476807, + "learning_rate": 3.88715775725124e-05, + "loss": 0.5841, "step": 403700 }, { - "epoch": 4.11, - "learning_rate": 5.1357089588836624e-05, - "loss": 0.7869, + "epoch": 5.563362817227412, + "grad_norm": 7.148656845092773, + "learning_rate": 3.8864279040188565e-05, + "loss": 0.6647, "step": 403800 }, { - "epoch": 4.12, - "learning_rate": 5.135243043764575e-05, - "loss": 0.7007, + "epoch": 5.564740569287151, + "grad_norm": 8.253439903259277, + "learning_rate": 3.885697954241108e-05, + "loss": 0.5794, "step": 403900 }, { - "epoch": 4.12, - "learning_rate": 5.134777024244845e-05, - "loss": 0.6961, + "epoch": 5.566118321346891, + "grad_norm": 39.806983947753906, + "learning_rate": 3.884967907980009e-05, + "loss": 0.5727, "step": 404000 }, { - "epoch": 4.12, - "learning_rate": 5.1343109003472596e-05, - "loss": 0.633, + "epoch": 5.567496073406629, + "grad_norm": 12.74814224243164, + "learning_rate": 3.8842377652975784e-05, + "loss": 0.6619, "step": 404100 }, { - "epoch": 4.12, - "learning_rate": 5.133844672094607e-05, - "loss": 0.6078, + "epoch": 5.568873825466369, + "grad_norm": 7.739386081695557, + "learning_rate": 3.883507526255848e-05, + "loss": 0.5363, "step": 404200 }, { - "epoch": 4.12, - "learning_rate": 5.133378339509684e-05, - "loss": 0.7125, + "epoch": 5.5702515775261086, + "grad_norm": 3.6189370155334473, + "learning_rate": 3.8827771909168535e-05, + "loss": 0.5867, "step": 404300 }, { - "epoch": 4.12, - "learning_rate": 5.1329119026152916e-05, - "loss": 0.736, + "epoch": 5.571629329585848, + "grad_norm": 7.51109504699707, + "learning_rate": 3.882046759342641e-05, + "loss": 0.5445, "step": 404400 }, { - "epoch": 4.12, - "learning_rate": 5.132445361434237e-05, - "loss": 0.6975, + "epoch": 5.573007081645587, + "grad_norm": 8.570854187011719, + "learning_rate": 3.881316231595266e-05, + "loss": 0.6674, "step": 404500 }, { - "epoch": 4.12, - "learning_rate": 5.1319787159893285e-05, - "loss": 0.7345, + "epoch": 5.574384833705326, + "grad_norm": 23.80831527709961, + "learning_rate": 3.880585607736789e-05, + "loss": 0.5858, "step": 404600 }, { - "epoch": 4.12, - "learning_rate": 5.1315119663033835e-05, - "loss": 0.69, + "epoch": 5.575762585765066, + "grad_norm": 4.698869705200195, + "learning_rate": 3.87985488782928e-05, + "loss": 0.5427, "step": 404700 }, { - "epoch": 4.12, - "learning_rate": 5.131045112399224e-05, - "loss": 0.708, + "epoch": 5.577140337824805, + "grad_norm": 2.22591233253479, + "learning_rate": 3.8791240719348184e-05, + "loss": 0.5897, "step": 404800 }, { - "epoch": 4.13, - "learning_rate": 5.130578154299677e-05, - "loss": 0.7369, + "epoch": 5.578518089884544, + "grad_norm": 2.73796010017395, + "learning_rate": 3.878393160115491e-05, + "loss": 0.5859, "step": 404900 }, { - "epoch": 4.13, - "learning_rate": 5.130111092027571e-05, - "loss": 0.7915, + "epoch": 5.579895841944284, + "grad_norm": 5.244845390319824, + "learning_rate": 3.877662152433391e-05, + "loss": 0.5895, "step": 405000 }, { - "epoch": 4.13, - "learning_rate": 5.129643925605746e-05, - "loss": 0.7232, + "epoch": 5.581273594004023, + "grad_norm": 21.79768180847168, + "learning_rate": 3.8769310489506224e-05, + "loss": 0.5254, "step": 405100 }, { - "epoch": 4.13, - "learning_rate": 5.1291766550570416e-05, - "loss": 0.7162, + "epoch": 5.582651346063763, + "grad_norm": 3.9913852214813232, + "learning_rate": 3.876199849729295e-05, + "loss": 0.5458, "step": 405200 }, { - "epoch": 4.13, - "learning_rate": 5.128709280404305e-05, - "loss": 0.6872, + "epoch": 5.5840290981235015, + "grad_norm": 6.309573650360107, + "learning_rate": 3.87546855483153e-05, + "loss": 0.5521, "step": 405300 }, { - "epoch": 4.13, - "learning_rate": 5.128241801670388e-05, - "loss": 0.6106, + "epoch": 5.585406850183241, + "grad_norm": 3.8957104682922363, + "learning_rate": 3.874737164319453e-05, + "loss": 0.5393, "step": 405400 }, { - "epoch": 4.13, - "learning_rate": 5.127774218878147e-05, - "loss": 0.7566, + "epoch": 5.586784602242981, + "grad_norm": 14.508999824523926, + "learning_rate": 3.8740056782552e-05, + "loss": 0.5773, "step": 405500 }, { - "epoch": 4.13, - "learning_rate": 5.127306532050445e-05, - "loss": 0.7852, + "epoch": 5.588162354302719, + "grad_norm": 20.58478546142578, + "learning_rate": 3.873274096700913e-05, + "loss": 0.6063, "step": 405600 }, { - "epoch": 4.13, - "learning_rate": 5.1268387412101495e-05, - "loss": 0.7609, + "epoch": 5.589540106362459, + "grad_norm": 7.5302019119262695, + "learning_rate": 3.872542419718746e-05, + "loss": 0.6154, "step": 405700 }, { - "epoch": 4.13, - "learning_rate": 5.126370846380131e-05, - "loss": 0.7256, + "epoch": 5.590917858422198, + "grad_norm": 8.081854820251465, + "learning_rate": 3.871810647370858e-05, + "loss": 0.6599, "step": 405800 }, { - "epoch": 4.14, - "learning_rate": 5.125902847583268e-05, - "loss": 0.6586, + "epoch": 5.592295610481938, + "grad_norm": 18.692920684814453, + "learning_rate": 3.8710787797194145e-05, + "loss": 0.4728, "step": 405900 }, { - "epoch": 4.14, - "learning_rate": 5.125434744842442e-05, - "loss": 0.6315, + "epoch": 5.5936733625416775, + "grad_norm": 26.04582977294922, + "learning_rate": 3.870346816826595e-05, + "loss": 0.5816, "step": 406000 }, { - "epoch": 4.14, - "learning_rate": 5.1249665381805416e-05, - "loss": 0.6643, + "epoch": 5.595051114601416, + "grad_norm": 7.022157192230225, + "learning_rate": 3.869614758754581e-05, + "loss": 0.6078, "step": 406100 }, { - "epoch": 4.14, - "learning_rate": 5.124498227620458e-05, - "loss": 0.7323, + "epoch": 5.596428866661156, + "grad_norm": 10.696815490722656, + "learning_rate": 3.868882605565565e-05, + "loss": 0.668, "step": 406200 }, { - "epoch": 4.14, - "learning_rate": 5.1240298131850896e-05, - "loss": 0.6828, + "epoch": 5.597806618720895, + "grad_norm": 4.490745544433594, + "learning_rate": 3.8681503573217486e-05, + "loss": 0.589, "step": 406300 }, { - "epoch": 4.14, - "learning_rate": 5.123561294897339e-05, - "loss": 0.7082, + "epoch": 5.599184370780634, + "grad_norm": 4.078994274139404, + "learning_rate": 3.867418014085338e-05, + "loss": 0.5682, "step": 406400 }, { - "epoch": 4.14, - "learning_rate": 5.1230926727801136e-05, - "loss": 0.6944, + "epoch": 5.6005621228403735, + "grad_norm": 16.163284301757812, + "learning_rate": 3.866685575918551e-05, + "loss": 0.5954, "step": 406500 }, { - "epoch": 4.14, - "learning_rate": 5.1226239468563266e-05, - "loss": 0.6735, + "epoch": 5.601939874900113, + "grad_norm": 13.411498069763184, + "learning_rate": 3.865953042883612e-05, + "loss": 0.571, "step": 406600 }, { - "epoch": 4.14, - "learning_rate": 5.122155117148896e-05, - "loss": 0.6979, + "epoch": 5.603317626959853, + "grad_norm": 25.4854679107666, + "learning_rate": 3.865220415042751e-05, + "loss": 0.6416, "step": 406700 }, { - "epoch": 4.14, - "learning_rate": 5.121686183680745e-05, - "loss": 0.7746, + "epoch": 5.604695379019591, + "grad_norm": 21.939712524414062, + "learning_rate": 3.8644876924582115e-05, + "loss": 0.5763, "step": 406800 }, { - "epoch": 4.15, - "learning_rate": 5.1212171464747996e-05, - "loss": 0.6306, + "epoch": 5.606073131079331, + "grad_norm": 11.836036682128906, + "learning_rate": 3.8637548751922405e-05, + "loss": 0.6635, "step": 406900 }, { - "epoch": 4.15, - "learning_rate": 5.1207480055539956e-05, - "loss": 0.6502, + "epoch": 5.60745088313907, + "grad_norm": 10.029311180114746, + "learning_rate": 3.863021963307095e-05, + "loss": 0.5576, "step": 407000 }, { - "epoch": 4.15, - "learning_rate": 5.12027876094127e-05, - "loss": 0.6377, + "epoch": 5.60882863519881, + "grad_norm": 15.481919288635254, + "learning_rate": 3.86228895686504e-05, + "loss": 0.5838, "step": 407100 }, { - "epoch": 4.15, - "learning_rate": 5.1198094126595664e-05, - "loss": 0.6257, + "epoch": 5.610206387258549, + "grad_norm": 4.08052396774292, + "learning_rate": 3.861555855928347e-05, + "loss": 0.6174, "step": 407200 }, { - "epoch": 4.15, - "learning_rate": 5.119339960731832e-05, - "loss": 0.7213, + "epoch": 5.611584139318288, + "grad_norm": 8.861710548400879, + "learning_rate": 3.860822660559298e-05, + "loss": 0.6717, "step": 407300 }, { - "epoch": 4.15, - "learning_rate": 5.118870405181022e-05, - "loss": 0.6488, + "epoch": 5.612961891378028, + "grad_norm": 3.937213659286499, + "learning_rate": 3.86008937082018e-05, + "loss": 0.5986, "step": 407400 }, { - "epoch": 4.15, - "learning_rate": 5.1184007460300935e-05, - "loss": 0.6817, + "epoch": 5.614339643437767, + "grad_norm": 6.842425346374512, + "learning_rate": 3.859355986773292e-05, + "loss": 0.5749, "step": 407500 }, { - "epoch": 4.15, - "learning_rate": 5.11793098330201e-05, - "loss": 0.6763, + "epoch": 5.615717395497506, + "grad_norm": 11.892126083374023, + "learning_rate": 3.8586225084809365e-05, + "loss": 0.5783, "step": 407600 }, { - "epoch": 4.15, - "learning_rate": 5.117461117019741e-05, - "loss": 0.7927, + "epoch": 5.6170951475572455, + "grad_norm": 3.5909173488616943, + "learning_rate": 3.857888936005428e-05, + "loss": 0.5823, "step": 407700 }, { - "epoch": 4.15, - "learning_rate": 5.11699114720626e-05, - "loss": 0.7365, + "epoch": 5.618472899616985, + "grad_norm": 15.181912422180176, + "learning_rate": 3.857155269409086e-05, + "loss": 0.6049, "step": 407800 }, { - "epoch": 4.16, - "learning_rate": 5.116521073884545e-05, - "loss": 0.6903, + "epoch": 5.619850651676725, + "grad_norm": 13.354763984680176, + "learning_rate": 3.85642150875424e-05, + "loss": 0.6094, "step": 407900 }, { - "epoch": 4.16, - "learning_rate": 5.1160555993578276e-05, - "loss": 0.6971, + "epoch": 5.621228403736463, + "grad_norm": 23.534334182739258, + "learning_rate": 3.8556876541032264e-05, + "loss": 0.6279, "step": 408000 }, { - "epoch": 4.16, - "learning_rate": 5.11558532012311e-05, - "loss": 0.7181, + "epoch": 5.622606155796203, + "grad_norm": 10.670170783996582, + "learning_rate": 3.8549537055183894e-05, + "loss": 0.6505, "step": 408100 }, { - "epoch": 4.16, - "learning_rate": 5.115114937448896e-05, - "loss": 0.6944, + "epoch": 5.623983907855942, + "grad_norm": 10.856382369995117, + "learning_rate": 3.854219663062083e-05, + "loss": 0.6927, "step": 408200 }, { - "epoch": 4.16, - "learning_rate": 5.1146444513581855e-05, - "loss": 0.6757, + "epoch": 5.625361659915682, + "grad_norm": 19.299663543701172, + "learning_rate": 3.8534855267966665e-05, + "loss": 0.5689, "step": 408300 }, { - "epoch": 4.16, - "learning_rate": 5.114173861873979e-05, - "loss": 0.6896, + "epoch": 5.626739411975421, + "grad_norm": 153.18516540527344, + "learning_rate": 3.852751296784509e-05, + "loss": 0.6135, "step": 408400 }, { - "epoch": 4.16, - "learning_rate": 5.1137031690192884e-05, - "loss": 0.6571, + "epoch": 5.62811716403516, + "grad_norm": 6.1429524421691895, + "learning_rate": 3.852016973087988e-05, + "loss": 0.5947, "step": 408500 }, { - "epoch": 4.16, - "learning_rate": 5.113232372817126e-05, - "loss": 0.7753, + "epoch": 5.6294949160949, + "grad_norm": 11.997945785522461, + "learning_rate": 3.8512825557694865e-05, + "loss": 0.5577, "step": 408600 }, { - "epoch": 4.16, - "learning_rate": 5.112761473290512e-05, - "loss": 0.6441, + "epoch": 5.630872668154639, + "grad_norm": 16.177392959594727, + "learning_rate": 3.850548044891398e-05, + "loss": 0.5729, "step": 408700 }, { - "epoch": 4.16, - "learning_rate": 5.11229047046247e-05, - "loss": 0.6385, + "epoch": 5.632250420214378, + "grad_norm": 4.524254322052002, + "learning_rate": 3.849813440516123e-05, + "loss": 0.6511, "step": 408800 }, { - "epoch": 4.17, - "learning_rate": 5.11181936435603e-05, - "loss": 0.673, + "epoch": 5.6336281722741175, + "grad_norm": 27.117694854736328, + "learning_rate": 3.849078742706069e-05, + "loss": 0.6178, "step": 408900 }, { - "epoch": 4.17, - "learning_rate": 5.111348154994226e-05, - "loss": 0.6175, + "epoch": 5.635005924333857, + "grad_norm": 9.303934097290039, + "learning_rate": 3.848343951523653e-05, + "loss": 0.6131, "step": 409000 }, { - "epoch": 4.17, - "learning_rate": 5.110876842400096e-05, - "loss": 0.6319, + "epoch": 5.636383676393596, + "grad_norm": 3.180523633956909, + "learning_rate": 3.8476090670312996e-05, + "loss": 0.5776, "step": 409100 }, { - "epoch": 4.17, - "learning_rate": 5.110405426596686e-05, - "loss": 0.6489, + "epoch": 5.637761428453335, + "grad_norm": 71.8797836303711, + "learning_rate": 3.8468740892914414e-05, + "loss": 0.6215, "step": 409200 }, { - "epoch": 4.17, - "learning_rate": 5.109933907607045e-05, - "loss": 0.6717, + "epoch": 5.639139180513075, + "grad_norm": 4.7273478507995605, + "learning_rate": 3.8461390183665166e-05, + "loss": 0.6278, "step": 409300 }, { - "epoch": 4.17, - "learning_rate": 5.109462285454228e-05, - "loss": 0.7219, + "epoch": 5.640516932572814, + "grad_norm": 28.959108352661133, + "learning_rate": 3.8454038543189746e-05, + "loss": 0.6483, "step": 409400 }, { - "epoch": 4.17, - "learning_rate": 5.108990560161293e-05, - "loss": 0.6584, + "epoch": 5.641894684632554, + "grad_norm": 6.889366149902344, + "learning_rate": 3.844668597211271e-05, + "loss": 0.5617, "step": 409500 }, { - "epoch": 4.17, - "learning_rate": 5.108518731751305e-05, - "loss": 0.6518, + "epoch": 5.643272436692293, + "grad_norm": 8.85036849975586, + "learning_rate": 3.84393324710587e-05, + "loss": 0.6637, "step": 409600 }, { - "epoch": 4.17, - "learning_rate": 5.108046800247335e-05, - "loss": 0.6894, + "epoch": 5.644650188752032, + "grad_norm": 9.753440856933594, + "learning_rate": 3.843197804065244e-05, + "loss": 0.6394, "step": 409700 }, { - "epoch": 4.18, - "learning_rate": 5.1075747656724564e-05, - "loss": 0.6614, + "epoch": 5.646027940811772, + "grad_norm": 97.58231353759766, + "learning_rate": 3.84246226815187e-05, + "loss": 0.6126, "step": 409800 }, { - "epoch": 4.18, - "learning_rate": 5.107102628049749e-05, - "loss": 0.6827, + "epoch": 5.64740569287151, + "grad_norm": 1.8015433549880981, + "learning_rate": 3.841726639428239e-05, + "loss": 0.5896, "step": 409900 }, { - "epoch": 4.18, - "learning_rate": 5.106630387402297e-05, - "loss": 0.7216, + "epoch": 5.64878344493125, + "grad_norm": 1.3764885663986206, + "learning_rate": 3.840990917956843e-05, + "loss": 0.5802, "step": 410000 }, { - "epoch": 4.18, - "learning_rate": 5.106158043753191e-05, - "loss": 0.7221, + "epoch": 5.6501611969909895, + "grad_norm": 21.17970848083496, + "learning_rate": 3.840255103800188e-05, + "loss": 0.5663, "step": 410100 }, { - "epoch": 4.18, - "learning_rate": 5.105685597125526e-05, - "loss": 0.7594, + "epoch": 5.651538949050729, + "grad_norm": 7.723023891448975, + "learning_rate": 3.839519197020785e-05, + "loss": 0.5891, "step": 410200 }, { - "epoch": 4.18, - "learning_rate": 5.1052130475424015e-05, - "loss": 0.7165, + "epoch": 5.652916701110469, + "grad_norm": 6.013448238372803, + "learning_rate": 3.8387831976811504e-05, + "loss": 0.6004, "step": 410300 }, { - "epoch": 4.18, - "learning_rate": 5.104740395026922e-05, - "loss": 0.6979, + "epoch": 5.654294453170207, + "grad_norm": 11.646791458129883, + "learning_rate": 3.8380471058438155e-05, + "loss": 0.5385, "step": 410400 }, { - "epoch": 4.18, - "learning_rate": 5.104272367665769e-05, - "loss": 0.7685, + "epoch": 5.655672205229947, + "grad_norm": 421.31719970703125, + "learning_rate": 3.83731092157131e-05, + "loss": 0.5981, "step": 410500 }, { - "epoch": 4.18, - "learning_rate": 5.103799510383662e-05, - "loss": 0.721, + "epoch": 5.657049957289686, + "grad_norm": 11.343670845031738, + "learning_rate": 3.8365746449261806e-05, + "loss": 0.5304, "step": 410600 }, { - "epoch": 4.18, - "learning_rate": 5.103326550238314e-05, - "loss": 0.6945, + "epoch": 5.658427709349425, + "grad_norm": 7.379881381988525, + "learning_rate": 3.8358382759709765e-05, + "loss": 0.6346, "step": 410700 }, { - "epoch": 4.19, - "learning_rate": 5.1028534872528484e-05, - "loss": 0.683, + "epoch": 5.659805461409165, + "grad_norm": 11.422622680664062, + "learning_rate": 3.835101814768255e-05, + "loss": 0.6584, "step": 410800 }, { - "epoch": 4.19, - "learning_rate": 5.102380321450397e-05, - "loss": 0.6429, + "epoch": 5.661183213468904, + "grad_norm": 2.0989139080047607, + "learning_rate": 3.8343652613805826e-05, + "loss": 0.634, "step": 410900 }, { - "epoch": 4.19, - "learning_rate": 5.101907052854094e-05, - "loss": 0.7654, + "epoch": 5.662560965528644, + "grad_norm": 2.7042441368103027, + "learning_rate": 3.833628615870534e-05, + "loss": 0.5042, "step": 411000 }, { - "epoch": 4.19, - "learning_rate": 5.10143368148708e-05, - "loss": 0.7617, + "epoch": 5.663938717588382, + "grad_norm": 6.44378662109375, + "learning_rate": 3.8328918783006913e-05, + "loss": 0.5971, "step": 411100 }, { - "epoch": 4.19, - "learning_rate": 5.1009602073724995e-05, - "loss": 0.7148, + "epoch": 5.665316469648122, + "grad_norm": 2.921806573867798, + "learning_rate": 3.832155048733643e-05, + "loss": 0.5016, "step": 411200 }, { - "epoch": 4.19, - "learning_rate": 5.1004866305335025e-05, - "loss": 0.7772, + "epoch": 5.6666942217078615, + "grad_norm": 5.057344913482666, + "learning_rate": 3.831425496901874e-05, + "loss": 0.6401, "step": 411300 }, { - "epoch": 4.19, - "learning_rate": 5.1000129509932455e-05, - "loss": 0.7595, + "epoch": 5.668071973767601, + "grad_norm": 3.4876370429992676, + "learning_rate": 3.8306884844466255e-05, + "loss": 0.5588, "step": 411400 }, { - "epoch": 4.19, - "learning_rate": 5.099539168774886e-05, - "loss": 0.7132, + "epoch": 5.66944972582734, + "grad_norm": 6.4181413650512695, + "learning_rate": 3.829951380181362e-05, + "loss": 0.5348, "step": 411500 }, { - "epoch": 4.19, - "learning_rate": 5.099065283901591e-05, - "loss": 0.7462, + "epoch": 5.670827477887079, + "grad_norm": 16.41754722595215, + "learning_rate": 3.829214184168704e-05, + "loss": 0.5263, "step": 411600 }, { - "epoch": 4.19, - "learning_rate": 5.09859129639653e-05, - "loss": 0.7118, + "epoch": 5.672205229946819, + "grad_norm": 25.24420928955078, + "learning_rate": 3.828476896471279e-05, + "loss": 0.578, "step": 411700 }, { - "epoch": 4.2, - "learning_rate": 5.0981172062828787e-05, - "loss": 0.7628, + "epoch": 5.673582982006558, + "grad_norm": 9.882627487182617, + "learning_rate": 3.8277395171517253e-05, + "loss": 0.4634, "step": 411800 }, { - "epoch": 4.2, - "learning_rate": 5.097643013583817e-05, - "loss": 0.6506, + "epoch": 5.674960734066297, + "grad_norm": 16.55098533630371, + "learning_rate": 3.827002046272684e-05, + "loss": 0.5444, "step": 411900 }, { - "epoch": 4.2, - "learning_rate": 5.09716871832253e-05, - "loss": 0.669, + "epoch": 5.676338486126037, + "grad_norm": 8.782390594482422, + "learning_rate": 3.8262718599732725e-05, + "loss": 0.5511, "step": 412000 }, { - "epoch": 4.2, - "learning_rate": 5.096694320522207e-05, - "loss": 0.6966, + "epoch": 5.677716238185776, + "grad_norm": 5.046112060546875, + "learning_rate": 3.8255342070772565e-05, + "loss": 0.5717, "step": 412100 }, { - "epoch": 4.2, - "learning_rate": 5.0962198202060454e-05, - "loss": 0.64, + "epoch": 5.679093990245516, + "grad_norm": 4.593100070953369, + "learning_rate": 3.824796462809107e-05, + "loss": 0.614, "step": 412200 }, { - "epoch": 4.2, - "learning_rate": 5.095745217397243e-05, - "loss": 0.6732, + "epoch": 5.680471742305254, + "grad_norm": 4.590670108795166, + "learning_rate": 3.8240586272315e-05, + "loss": 0.6133, "step": 412300 }, { - "epoch": 4.2, - "learning_rate": 5.095270512119008e-05, - "loss": 0.679, + "epoch": 5.681849494364994, + "grad_norm": 10.165679931640625, + "learning_rate": 3.823320700407116e-05, + "loss": 0.5372, "step": 412400 }, { - "epoch": 4.2, - "learning_rate": 5.0947957043945475e-05, - "loss": 0.6008, + "epoch": 5.6832272464247335, + "grad_norm": 4.057440280914307, + "learning_rate": 3.822582682398649e-05, + "loss": 0.7187, "step": 412500 }, { - "epoch": 4.2, - "learning_rate": 5.094320794247078e-05, - "loss": 0.6721, + "epoch": 5.684604998484473, + "grad_norm": 13.169560432434082, + "learning_rate": 3.821844573268795e-05, + "loss": 0.5909, "step": 412600 }, { - "epoch": 4.2, - "learning_rate": 5.0938457816998205e-05, - "loss": 0.7097, + "epoch": 5.685982750544212, + "grad_norm": 9.047806739807129, + "learning_rate": 3.821106373080261e-05, + "loss": 0.5086, "step": 412700 }, { - "epoch": 4.21, - "learning_rate": 5.0933706667759996e-05, - "loss": 0.6769, + "epoch": 5.687360502603951, + "grad_norm": 9.026451110839844, + "learning_rate": 3.820368081895761e-05, + "loss": 0.64, "step": 412800 }, { - "epoch": 4.21, - "learning_rate": 5.092895449498845e-05, - "loss": 0.6894, + "epoch": 5.688738254663691, + "grad_norm": 20.203289031982422, + "learning_rate": 3.819629699778017e-05, + "loss": 0.5369, "step": 412900 }, { - "epoch": 4.21, - "learning_rate": 5.092420129891594e-05, - "loss": 0.7114, + "epoch": 5.69011600672343, + "grad_norm": 13.962346076965332, + "learning_rate": 3.818891226789757e-05, + "loss": 0.5912, "step": 413000 }, { - "epoch": 4.21, - "learning_rate": 5.0919447079774845e-05, - "loss": 0.7359, + "epoch": 5.691493758783169, + "grad_norm": 6.664554119110107, + "learning_rate": 3.818152662993719e-05, + "loss": 0.6237, "step": 413100 }, { - "epoch": 4.21, - "learning_rate": 5.091469183779763e-05, - "loss": 0.6115, + "epoch": 5.692871510842909, + "grad_norm": 38.70198440551758, + "learning_rate": 3.817414008452648e-05, + "loss": 0.6179, "step": 413200 }, { - "epoch": 4.21, - "learning_rate": 5.0909935573216795e-05, - "loss": 0.7034, + "epoch": 5.694249262902648, + "grad_norm": 3.2160747051239014, + "learning_rate": 3.816675263229296e-05, + "loss": 0.5593, "step": 413300 }, { - "epoch": 4.21, - "learning_rate": 5.09051782862649e-05, - "loss": 0.5739, + "epoch": 5.695627014962387, + "grad_norm": 4.834950923919678, + "learning_rate": 3.815936427386424e-05, + "loss": 0.6315, "step": 413400 }, { - "epoch": 4.21, - "learning_rate": 5.0900419977174535e-05, - "loss": 0.7178, + "epoch": 5.697004767022126, + "grad_norm": 8.595553398132324, + "learning_rate": 3.815197500986799e-05, + "loss": 0.5227, "step": 413500 }, { - "epoch": 4.21, - "learning_rate": 5.089566064617837e-05, - "loss": 0.6378, + "epoch": 5.698382519081866, + "grad_norm": 20.681102752685547, + "learning_rate": 3.8144584840931956e-05, + "loss": 0.5941, "step": 413600 }, { - "epoch": 4.21, - "learning_rate": 5.089090029350909e-05, - "loss": 0.7007, + "epoch": 5.6997602711416055, + "grad_norm": 7.695576190948486, + "learning_rate": 3.813719376768399e-05, + "loss": 0.5486, "step": 413700 }, { - "epoch": 4.22, - "learning_rate": 5.088613891939946e-05, - "loss": 0.7757, + "epoch": 5.701138023201345, + "grad_norm": 1.5854418277740479, + "learning_rate": 3.812980179075199e-05, + "loss": 0.609, "step": 413800 }, { - "epoch": 4.22, - "learning_rate": 5.088137652408228e-05, - "loss": 0.6297, + "epoch": 5.702515775261084, + "grad_norm": 4.077051162719727, + "learning_rate": 3.812240891076395e-05, + "loss": 0.6083, "step": 413900 }, { - "epoch": 4.22, - "learning_rate": 5.08766131077904e-05, - "loss": 0.6046, + "epoch": 5.703893527320823, + "grad_norm": 3.825028657913208, + "learning_rate": 3.811501512834793e-05, + "loss": 0.6136, "step": 414000 }, { - "epoch": 4.22, - "learning_rate": 5.0871848670756725e-05, - "loss": 0.7112, + "epoch": 5.705271279380563, + "grad_norm": 3.4526240825653076, + "learning_rate": 3.810762044413207e-05, + "loss": 0.5935, "step": 414100 }, { - "epoch": 4.22, - "learning_rate": 5.08670832132142e-05, - "loss": 0.725, + "epoch": 5.706649031440302, + "grad_norm": 9.169675827026367, + "learning_rate": 3.810022485874458e-05, + "loss": 0.635, "step": 414200 }, { - "epoch": 4.22, - "learning_rate": 5.086231673539584e-05, - "loss": 0.6296, + "epoch": 5.708026783500041, + "grad_norm": 5.189697265625, + "learning_rate": 3.809282837281376e-05, + "loss": 0.6432, "step": 414300 }, { - "epoch": 4.22, - "learning_rate": 5.0857549237534696e-05, - "loss": 0.6737, + "epoch": 5.709404535559781, + "grad_norm": 18.285369873046875, + "learning_rate": 3.808543098696798e-05, + "loss": 0.5431, "step": 414400 }, { - "epoch": 4.22, - "learning_rate": 5.085278071986386e-05, - "loss": 0.6706, + "epoch": 5.71078228761952, + "grad_norm": 6.748597145080566, + "learning_rate": 3.807803270183568e-05, + "loss": 0.5442, "step": 414500 }, { - "epoch": 4.22, - "learning_rate": 5.084801118261649e-05, - "loss": 0.7943, + "epoch": 5.71216003967926, + "grad_norm": 3.252908945083618, + "learning_rate": 3.807063351804539e-05, + "loss": 0.5767, "step": 414600 }, { - "epoch": 4.23, - "learning_rate": 5.0843240626025786e-05, - "loss": 0.6823, + "epoch": 5.7135377917389985, + "grad_norm": 9.5114164352417, + "learning_rate": 3.806323343622569e-05, + "loss": 0.6152, "step": 414700 }, { - "epoch": 4.23, - "learning_rate": 5.0838469050325005e-05, - "loss": 0.6725, + "epoch": 5.714915543798738, + "grad_norm": 1.6165428161621094, + "learning_rate": 3.8055832457005287e-05, + "loss": 0.5425, "step": 414800 }, { - "epoch": 4.23, - "learning_rate": 5.0833696455747436e-05, - "loss": 0.822, + "epoch": 5.716293295858478, + "grad_norm": 7.752899646759033, + "learning_rate": 3.804843058101291e-05, + "loss": 0.5609, "step": 414900 }, { - "epoch": 4.23, - "learning_rate": 5.0828970583700184e-05, - "loss": 0.6816, + "epoch": 5.717671047918216, + "grad_norm": 3.8368735313415527, + "learning_rate": 3.804102780887738e-05, + "loss": 0.6638, "step": 415000 }, { - "epoch": 4.23, - "learning_rate": 5.082419596225211e-05, - "loss": 0.7256, + "epoch": 5.719048799977956, + "grad_norm": 16.15721321105957, + "learning_rate": 3.803362414122761e-05, + "loss": 0.5415, "step": 415100 }, { - "epoch": 4.23, - "learning_rate": 5.0819420322625124e-05, - "loss": 0.7042, + "epoch": 5.720426552037695, + "grad_norm": 399.3175048828125, + "learning_rate": 3.802621957869259e-05, + "loss": 0.577, "step": 415200 }, { - "epoch": 4.23, - "learning_rate": 5.081464366505274e-05, - "loss": 0.693, + "epoch": 5.721804304097435, + "grad_norm": 6.12244176864624, + "learning_rate": 3.801881412190135e-05, + "loss": 0.6148, "step": 415300 }, { - "epoch": 4.23, - "learning_rate": 5.080986598976849e-05, - "loss": 0.6159, + "epoch": 5.723182056157174, + "grad_norm": 3.3543612957000732, + "learning_rate": 3.8011407771483056e-05, + "loss": 0.6247, "step": 415400 }, { - "epoch": 4.23, - "learning_rate": 5.080508729700598e-05, - "loss": 0.6622, + "epoch": 5.724559808216913, + "grad_norm": 6.6794538497924805, + "learning_rate": 3.800400052806688e-05, + "loss": 0.6077, "step": 415500 }, { - "epoch": 4.23, - "learning_rate": 5.0800355389133515e-05, - "loss": 0.6557, + "epoch": 5.725937560276653, + "grad_norm": 4.871297359466553, + "learning_rate": 3.799659239228212e-05, + "loss": 0.6212, "step": 415600 }, { - "epoch": 4.24, - "learning_rate": 5.0795574672284434e-05, - "loss": 0.6944, + "epoch": 5.727315312336392, + "grad_norm": 15.35666275024414, + "learning_rate": 3.798918336475815e-05, + "loss": 0.5743, "step": 415700 }, { - "epoch": 4.24, - "learning_rate": 5.079079293865585e-05, - "loss": 0.7134, + "epoch": 5.728693064396131, + "grad_norm": 10.151765823364258, + "learning_rate": 3.798177344612438e-05, + "loss": 0.5293, "step": 415800 }, { - "epoch": 4.24, - "learning_rate": 5.078601018848157e-05, - "loss": 0.6673, + "epoch": 5.7300708164558705, + "grad_norm": 1.2828086614608765, + "learning_rate": 3.797436263701034e-05, + "loss": 0.6429, "step": 415900 }, { - "epoch": 4.24, - "learning_rate": 5.078122642199544e-05, - "loss": 0.6115, + "epoch": 5.73144856851561, + "grad_norm": 7.951080322265625, + "learning_rate": 3.79669509380456e-05, + "loss": 0.5442, "step": 416000 }, { - "epoch": 4.24, - "learning_rate": 5.0776441639431334e-05, - "loss": 0.6758, + "epoch": 5.73282632057535, + "grad_norm": 10.235562324523926, + "learning_rate": 3.795953834985983e-05, + "loss": 0.599, "step": 416100 }, { - "epoch": 4.24, - "learning_rate": 5.077165584102324e-05, - "loss": 0.6866, + "epoch": 5.734204072635088, + "grad_norm": 5.476729869842529, + "learning_rate": 3.7952124873082766e-05, + "loss": 0.5723, "step": 416200 }, { - "epoch": 4.24, - "learning_rate": 5.076686902700513e-05, - "loss": 0.6241, + "epoch": 5.735581824694828, + "grad_norm": 6.906072616577148, + "learning_rate": 3.7944710508344225e-05, + "loss": 0.6486, "step": 416300 }, { - "epoch": 4.24, - "learning_rate": 5.0762081197611046e-05, - "loss": 0.5715, + "epoch": 5.736959576754567, + "grad_norm": 7.615192413330078, + "learning_rate": 3.793729525627409e-05, + "loss": 0.5485, "step": 416400 }, { - "epoch": 4.24, - "learning_rate": 5.07572923530751e-05, - "loss": 0.7165, + "epoch": 5.738337328814307, + "grad_norm": 6.2774810791015625, + "learning_rate": 3.792987911750233e-05, + "loss": 0.498, "step": 416500 }, { - "epoch": 4.24, - "learning_rate": 5.0752550397248893e-05, - "loss": 0.7817, + "epoch": 5.739715080874046, + "grad_norm": 11.927968978881836, + "learning_rate": 3.792246209265897e-05, + "loss": 0.6452, "step": 416600 }, { - "epoch": 4.25, - "learning_rate": 5.0747759533277276e-05, - "loss": 0.6904, + "epoch": 5.741092832933785, + "grad_norm": 3.4691221714019775, + "learning_rate": 3.791504418237414e-05, + "loss": 0.5523, "step": 416700 }, { - "epoch": 4.25, - "learning_rate": 5.074296765486402e-05, - "loss": 0.6678, + "epoch": 5.742470584993525, + "grad_norm": 33.873504638671875, + "learning_rate": 3.7907625387278023e-05, + "loss": 0.607, "step": 416800 }, { - "epoch": 4.25, - "learning_rate": 5.073817476224345e-05, - "loss": 0.7303, + "epoch": 5.743848337053264, + "grad_norm": 14.57044792175293, + "learning_rate": 3.790020570800088e-05, + "loss": 0.6101, "step": 416900 }, { - "epoch": 4.25, - "learning_rate": 5.073338085564988e-05, - "loss": 0.6607, + "epoch": 5.745226089113003, + "grad_norm": 0.9322608709335327, + "learning_rate": 3.789278514517307e-05, + "loss": 0.6007, "step": 417000 }, { - "epoch": 4.25, - "learning_rate": 5.072858593531772e-05, - "loss": 0.7032, + "epoch": 5.7466038411727425, + "grad_norm": 6.976534843444824, + "learning_rate": 3.788536369942498e-05, + "loss": 0.5511, "step": 417100 }, { - "epoch": 4.25, - "learning_rate": 5.072379000148141e-05, - "loss": 0.709, + "epoch": 5.747981593232482, + "grad_norm": 5.832825183868408, + "learning_rate": 3.787794137138711e-05, + "loss": 0.5612, "step": 417200 }, { - "epoch": 4.25, - "learning_rate": 5.071899305437545e-05, - "loss": 0.7181, + "epoch": 5.749359345292222, + "grad_norm": 5.226287841796875, + "learning_rate": 3.787051816169004e-05, + "loss": 0.6086, "step": 417300 }, { - "epoch": 4.25, - "learning_rate": 5.0714195094234356e-05, - "loss": 0.6621, + "epoch": 5.75073709735196, + "grad_norm": 4.8902974128723145, + "learning_rate": 3.786309407096439e-05, + "loss": 0.5356, "step": 417400 }, { - "epoch": 4.25, - "learning_rate": 5.070939612129276e-05, - "loss": 0.6692, + "epoch": 5.7521148494117, + "grad_norm": 5.891170024871826, + "learning_rate": 3.785566909984088e-05, + "loss": 0.5896, "step": 417500 }, { - "epoch": 4.25, - "learning_rate": 5.070459613578527e-05, - "loss": 0.6466, + "epoch": 5.753492601471439, + "grad_norm": 4.325453281402588, + "learning_rate": 3.78482432489503e-05, + "loss": 0.5538, "step": 417600 }, { - "epoch": 4.26, - "learning_rate": 5.069979513794658e-05, - "loss": 0.6777, + "epoch": 5.754870353531178, + "grad_norm": 10.180216789245605, + "learning_rate": 3.7840816518923516e-05, + "loss": 0.5688, "step": 417700 }, { - "epoch": 4.26, - "learning_rate": 5.069499312801145e-05, - "loss": 0.6604, + "epoch": 5.756248105590918, + "grad_norm": 6.114187240600586, + "learning_rate": 3.783338891039146e-05, + "loss": 0.6346, "step": 417800 }, { - "epoch": 4.26, - "learning_rate": 5.069019010621465e-05, - "loss": 0.7097, + "epoch": 5.757625857650657, + "grad_norm": 4.8008036613464355, + "learning_rate": 3.7825960423985165e-05, + "loss": 0.5841, "step": 417900 }, { - "epoch": 4.26, - "learning_rate": 5.0685386072791026e-05, - "loss": 0.6308, + "epoch": 5.759003609710397, + "grad_norm": 22.57602882385254, + "learning_rate": 3.781853106033569e-05, + "loss": 0.5017, "step": 418000 }, { - "epoch": 4.26, - "learning_rate": 5.068058102797547e-05, - "loss": 0.7607, + "epoch": 5.760381361770136, + "grad_norm": 5.566181659698486, + "learning_rate": 3.781110082007423e-05, + "loss": 0.6026, "step": 418100 }, { - "epoch": 4.26, - "learning_rate": 5.067577497200291e-05, - "loss": 0.6231, + "epoch": 5.761759113829875, + "grad_norm": 16.178930282592773, + "learning_rate": 3.7803669703831986e-05, + "loss": 0.641, "step": 418200 }, { - "epoch": 4.26, - "learning_rate": 5.067096790510836e-05, - "loss": 0.7091, + "epoch": 5.7631368658896145, + "grad_norm": 12.21933650970459, + "learning_rate": 3.7796237712240295e-05, + "loss": 0.5489, "step": 418300 }, { - "epoch": 4.26, - "learning_rate": 5.0666159827526816e-05, - "loss": 0.6366, + "epoch": 5.764514617949354, + "grad_norm": 5.758997440338135, + "learning_rate": 3.7788804845930535e-05, + "loss": 0.5397, "step": 418400 }, { - "epoch": 4.26, - "learning_rate": 5.066135073949339e-05, - "loss": 0.6548, + "epoch": 5.765892370009093, + "grad_norm": 6.082283973693848, + "learning_rate": 3.778137110553417e-05, + "loss": 0.6224, "step": 418500 }, { - "epoch": 4.26, - "learning_rate": 5.0656540641243224e-05, - "loss": 0.6548, + "epoch": 5.767270122068832, + "grad_norm": 11.053380966186523, + "learning_rate": 3.7773936491682723e-05, + "loss": 0.5798, "step": 418600 }, { - "epoch": 4.27, - "learning_rate": 5.0651729533011495e-05, - "loss": 0.7095, + "epoch": 5.768647874128572, + "grad_norm": 4.975136756896973, + "learning_rate": 3.7766501005007816e-05, + "loss": 0.6402, "step": 418700 }, { - "epoch": 4.27, - "learning_rate": 5.0646917415033416e-05, - "loss": 0.6764, + "epoch": 5.770025626188311, + "grad_norm": 29.005327224731445, + "learning_rate": 3.775906464614112e-05, + "loss": 0.5531, "step": 418800 }, { - "epoch": 4.27, - "learning_rate": 5.06421042875443e-05, - "loss": 0.7279, + "epoch": 5.771403378248051, + "grad_norm": 28.622915267944336, + "learning_rate": 3.7751627415714406e-05, + "loss": 0.617, "step": 418900 }, { - "epoch": 4.27, - "learning_rate": 5.063729015077947e-05, - "loss": 0.6231, + "epoch": 5.77278113030779, + "grad_norm": 31.343488693237305, + "learning_rate": 3.774418931435949e-05, + "loss": 0.5954, "step": 419000 }, { - "epoch": 4.27, - "learning_rate": 5.063247500497431e-05, - "loss": 0.6691, + "epoch": 5.774158882367529, + "grad_norm": 6.084015369415283, + "learning_rate": 3.773682473673068e-05, + "loss": 0.6142, "step": 419100 }, { - "epoch": 4.27, - "learning_rate": 5.062765885036425e-05, - "loss": 0.666, + "epoch": 5.775536634427269, + "grad_norm": 17.0856990814209, + "learning_rate": 3.772938490410868e-05, + "loss": 0.5966, "step": 419200 }, { - "epoch": 4.27, - "learning_rate": 5.0622841687184763e-05, - "loss": 0.7201, + "epoch": 5.776914386487007, + "grad_norm": 19.9857234954834, + "learning_rate": 3.772194420244809e-05, + "loss": 0.6046, "step": 419300 }, { - "epoch": 4.27, - "learning_rate": 5.06180235156714e-05, - "loss": 0.6714, + "epoch": 5.778292138546747, + "grad_norm": 2.3278746604919434, + "learning_rate": 3.771450263238105e-05, + "loss": 0.5507, "step": 419400 }, { - "epoch": 4.27, - "learning_rate": 5.061320433605973e-05, - "loss": 0.6416, + "epoch": 5.7796698906064865, + "grad_norm": 2.851388931274414, + "learning_rate": 3.770706019453976e-05, + "loss": 0.6338, "step": 419500 }, { - "epoch": 4.27, - "learning_rate": 5.060843235544825e-05, - "loss": 0.7886, + "epoch": 5.781047642666226, + "grad_norm": 6.879894256591797, + "learning_rate": 3.769961688955647e-05, + "loss": 0.6018, "step": 419600 }, { - "epoch": 4.28, - "learning_rate": 5.060361117042201e-05, - "loss": 0.7039, + "epoch": 5.782425394725965, + "grad_norm": 12.186042785644531, + "learning_rate": 3.769224716406563e-05, + "loss": 0.6212, "step": 419700 }, { - "epoch": 4.28, - "learning_rate": 5.0598788978002125e-05, - "loss": 0.6805, + "epoch": 5.783803146785704, + "grad_norm": 17.784088134765625, + "learning_rate": 3.7684876589922334e-05, + "loss": 0.6063, "step": 419800 }, { - "epoch": 4.28, - "learning_rate": 5.0593965778424384e-05, - "loss": 0.6682, + "epoch": 5.785180898845444, + "grad_norm": 392.12445068359375, + "learning_rate": 3.767743070460618e-05, + "loss": 0.5581, "step": 419900 }, { - "epoch": 4.28, - "learning_rate": 5.058914157192463e-05, - "loss": 0.6872, + "epoch": 5.786558650905183, + "grad_norm": 4.219203948974609, + "learning_rate": 3.766998395466522e-05, + "loss": 0.567, "step": 420000 }, { - "epoch": 4.28, - "learning_rate": 5.058431635873871e-05, - "loss": 0.7635, + "epoch": 5.787936402964922, + "grad_norm": 8.193329811096191, + "learning_rate": 3.7662536340732094e-05, + "loss": 0.5591, "step": 420100 }, { - "epoch": 4.28, - "learning_rate": 5.0579490139102554e-05, - "loss": 0.5902, + "epoch": 5.789314155024662, + "grad_norm": 9.114564895629883, + "learning_rate": 3.765508786343951e-05, + "loss": 0.608, "step": 420200 }, { - "epoch": 4.28, - "learning_rate": 5.057466291325216e-05, - "loss": 0.7591, + "epoch": 5.790691907084401, + "grad_norm": 15.776185035705566, + "learning_rate": 3.764763852342026e-05, + "loss": 0.5622, "step": 420300 }, { - "epoch": 4.28, - "learning_rate": 5.0569834681423516e-05, - "loss": 0.7737, + "epoch": 5.792069659144141, + "grad_norm": 14.720494270324707, + "learning_rate": 3.764018832130721e-05, + "loss": 0.6298, "step": 420400 }, { - "epoch": 4.28, - "learning_rate": 5.056500544385272e-05, - "loss": 0.6636, + "epoch": 5.793447411203879, + "grad_norm": 59.077117919921875, + "learning_rate": 3.763273725773328e-05, + "loss": 0.5939, "step": 420500 }, { - "epoch": 4.29, - "learning_rate": 5.056017520077588e-05, - "loss": 0.6258, + "epoch": 5.794825163263619, + "grad_norm": 18.532920837402344, + "learning_rate": 3.762528533333148e-05, + "loss": 0.6347, "step": 420600 }, { - "epoch": 4.29, - "learning_rate": 5.0555343952429173e-05, - "loss": 0.6943, + "epoch": 5.7962029153233585, + "grad_norm": 116.89422607421875, + "learning_rate": 3.76178325487349e-05, + "loss": 0.6525, "step": 420700 }, { - "epoch": 4.29, - "learning_rate": 5.055051169904882e-05, - "loss": 0.6887, + "epoch": 5.797580667383098, + "grad_norm": 178.13636779785156, + "learning_rate": 3.761037890457666e-05, + "loss": 0.693, "step": 420800 }, { - "epoch": 4.29, - "learning_rate": 5.0545678440871074e-05, - "loss": 0.6954, + "epoch": 5.798958419442837, + "grad_norm": 5.303542137145996, + "learning_rate": 3.760292440149002e-05, + "loss": 0.6456, "step": 420900 }, { - "epoch": 4.29, - "learning_rate": 5.054084417813227e-05, - "loss": 0.726, + "epoch": 5.800336171502576, + "grad_norm": 9.722251892089844, + "learning_rate": 3.7595469040108275e-05, + "loss": 0.6326, "step": 421000 }, { - "epoch": 4.29, - "learning_rate": 5.0536008911068754e-05, - "loss": 0.7022, + "epoch": 5.801713923562316, + "grad_norm": 12.68631649017334, + "learning_rate": 3.758801282106477e-05, + "loss": 0.5819, "step": 421100 }, { - "epoch": 4.29, - "learning_rate": 5.0531172639916964e-05, - "loss": 0.7421, + "epoch": 5.803091675622055, + "grad_norm": 36.83993148803711, + "learning_rate": 3.758055574499298e-05, + "loss": 0.597, "step": 421200 }, { - "epoch": 4.29, - "learning_rate": 5.052633536491336e-05, - "loss": 0.7415, + "epoch": 5.804469427681794, + "grad_norm": 13.51420783996582, + "learning_rate": 3.7573097812526403e-05, + "loss": 0.5683, "step": 421300 }, { - "epoch": 4.29, - "learning_rate": 5.052149708629445e-05, - "loss": 0.6707, + "epoch": 5.805847179741534, + "grad_norm": 23.411298751831055, + "learning_rate": 3.7565639024298636e-05, + "loss": 0.5431, "step": 421400 }, { - "epoch": 4.29, - "learning_rate": 5.051665780429679e-05, - "loss": 0.6245, + "epoch": 5.807224931801273, + "grad_norm": 6.391308784484863, + "learning_rate": 3.7558179380943335e-05, + "loss": 0.6411, "step": 421500 }, { - "epoch": 4.3, - "learning_rate": 5.0511817519157e-05, - "loss": 0.6403, + "epoch": 5.808602683861013, + "grad_norm": 36.90744400024414, + "learning_rate": 3.755071888309423e-05, + "loss": 0.6675, "step": 421600 }, { - "epoch": 4.3, - "learning_rate": 5.050697623111175e-05, - "loss": 0.5027, + "epoch": 5.809980435920751, + "grad_norm": 27.961341857910156, + "learning_rate": 3.7543257531385156e-05, + "loss": 0.7391, "step": 421700 }, { - "epoch": 4.3, - "learning_rate": 5.050213394039774e-05, - "loss": 0.6563, + "epoch": 5.811358187980491, + "grad_norm": 8.203859329223633, + "learning_rate": 3.753579532644997e-05, + "loss": 0.6663, "step": 421800 }, { - "epoch": 4.3, - "learning_rate": 5.049729064725173e-05, - "loss": 0.6283, + "epoch": 5.8127359400402305, + "grad_norm": 29.1221923828125, + "learning_rate": 3.752833226892261e-05, + "loss": 0.5969, "step": 421900 }, { - "epoch": 4.3, - "learning_rate": 5.049244635191052e-05, - "loss": 0.7218, + "epoch": 5.814113692099969, + "grad_norm": 17.87327003479004, + "learning_rate": 3.752086835943714e-05, + "loss": 0.5829, "step": 422000 }, { - "epoch": 4.3, - "learning_rate": 5.048760105461098e-05, - "loss": 0.735, + "epoch": 5.815491444159709, + "grad_norm": 9.801362991333008, + "learning_rate": 3.7513403598627614e-05, + "loss": 0.6016, "step": 422100 }, { - "epoch": 4.3, - "learning_rate": 5.0482754755590026e-05, - "loss": 0.6434, + "epoch": 5.816869196219448, + "grad_norm": 4.09297513961792, + "learning_rate": 3.750593798712824e-05, + "loss": 0.6083, "step": 422200 }, { - "epoch": 4.3, - "learning_rate": 5.0477907455084584e-05, - "loss": 0.7977, + "epoch": 5.818246948279188, + "grad_norm": 26.39430809020996, + "learning_rate": 3.749847152557325e-05, + "loss": 0.555, "step": 422300 }, { - "epoch": 4.3, - "learning_rate": 5.047305915333168e-05, - "loss": 0.728, + "epoch": 5.819624700338927, + "grad_norm": 20.324661254882812, + "learning_rate": 3.749100421459694e-05, + "loss": 0.6792, "step": 422400 }, { - "epoch": 4.3, - "learning_rate": 5.046820985056835e-05, - "loss": 0.7504, + "epoch": 5.821002452398666, + "grad_norm": 5.561123371124268, + "learning_rate": 3.7483536054833714e-05, + "loss": 0.6992, "step": 422500 }, { - "epoch": 4.31, - "learning_rate": 5.046335954703172e-05, - "loss": 0.6785, + "epoch": 5.822380204458406, + "grad_norm": 14.129959106445312, + "learning_rate": 3.747606704691801e-05, + "loss": 0.6028, "step": 422600 }, { - "epoch": 4.31, - "learning_rate": 5.045850824295892e-05, - "loss": 0.669, + "epoch": 5.823757956518145, + "grad_norm": 9.802308082580566, + "learning_rate": 3.746859719148439e-05, + "loss": 0.6703, "step": 422700 }, { - "epoch": 4.31, - "learning_rate": 5.045365593858716e-05, - "loss": 0.6211, + "epoch": 5.825135708577884, + "grad_norm": 11.730709075927734, + "learning_rate": 3.746112648916745e-05, + "loss": 0.601, "step": 422800 }, { - "epoch": 4.31, - "learning_rate": 5.044880263415368e-05, - "loss": 0.6683, + "epoch": 5.826513460637623, + "grad_norm": 3.9638137817382812, + "learning_rate": 3.745365494060183e-05, + "loss": 0.5589, "step": 422900 }, { - "epoch": 4.31, - "learning_rate": 5.044394832989579e-05, - "loss": 0.6364, + "epoch": 5.827891212697363, + "grad_norm": 5.916142463684082, + "learning_rate": 3.744618254642231e-05, + "loss": 0.6173, "step": 423000 }, { - "epoch": 4.31, - "learning_rate": 5.043909302605083e-05, - "loss": 0.6646, + "epoch": 5.8292689647571025, + "grad_norm": 4.996461868286133, + "learning_rate": 3.743870930726369e-05, + "loss": 0.6419, "step": 423100 }, { - "epoch": 4.31, - "learning_rate": 5.043423672285619e-05, - "loss": 0.7411, + "epoch": 5.830646716816842, + "grad_norm": 94.03470611572266, + "learning_rate": 3.743123522376088e-05, + "loss": 0.6925, "step": 423200 }, { - "epoch": 4.31, - "learning_rate": 5.042937942054932e-05, - "loss": 0.6846, + "epoch": 5.832024468876581, + "grad_norm": 10.281254768371582, + "learning_rate": 3.7423760296548826e-05, + "loss": 0.6221, "step": 423300 }, { - "epoch": 4.31, - "learning_rate": 5.0424521119367714e-05, - "loss": 0.654, + "epoch": 5.83340222093632, + "grad_norm": 33.95487976074219, + "learning_rate": 3.7416284526262565e-05, + "loss": 0.6585, "step": 423400 }, { - "epoch": 4.31, - "learning_rate": 5.041966181954891e-05, - "loss": 0.7416, + "epoch": 5.83477997299606, + "grad_norm": 3.6431691646575928, + "learning_rate": 3.74088079135372e-05, + "loss": 0.6664, "step": 423500 }, { - "epoch": 4.32, - "learning_rate": 5.04148015213305e-05, - "loss": 0.6738, + "epoch": 5.8361577250557986, + "grad_norm": 3.5131843090057373, + "learning_rate": 3.740133045900791e-05, + "loss": 0.6495, "step": 423600 }, { - "epoch": 4.32, - "learning_rate": 5.0409940224950134e-05, - "loss": 0.5921, + "epoch": 5.837535477115538, + "grad_norm": 17.174955368041992, + "learning_rate": 3.739385216330995e-05, + "loss": 0.6284, "step": 423700 }, { - "epoch": 4.32, - "learning_rate": 5.040507793064548e-05, - "loss": 0.6995, + "epoch": 5.838913229175278, + "grad_norm": 174.90777587890625, + "learning_rate": 3.7386373027078625e-05, + "loss": 0.6971, "step": 423800 }, { - "epoch": 4.32, - "learning_rate": 5.0400214638654295e-05, - "loss": 0.6897, + "epoch": 5.840290981235017, + "grad_norm": 51.398136138916016, + "learning_rate": 3.7378967854866036e-05, + "loss": 0.6496, "step": 423900 }, { - "epoch": 4.32, - "learning_rate": 5.0395350349214356e-05, - "loss": 0.6112, + "epoch": 5.841668733294756, + "grad_norm": 9.858229637145996, + "learning_rate": 3.7371487047863715e-05, + "loss": 0.6288, "step": 424000 }, { - "epoch": 4.32, - "learning_rate": 5.039048506256349e-05, - "loss": 0.7192, + "epoch": 5.8430464853544954, + "grad_norm": 3.9244627952575684, + "learning_rate": 3.736400540222808e-05, + "loss": 0.6168, "step": 424100 }, { - "epoch": 4.32, - "learning_rate": 5.038561877893959e-05, - "loss": 0.6194, + "epoch": 5.844424237414235, + "grad_norm": 28.409439086914062, + "learning_rate": 3.7356522918594716e-05, + "loss": 0.6231, "step": 424200 }, { - "epoch": 4.32, - "learning_rate": 5.03807514985806e-05, - "loss": 0.6337, + "epoch": 5.845801989473975, + "grad_norm": 30.201217651367188, + "learning_rate": 3.734903959759929e-05, + "loss": 0.6951, "step": 424300 }, { - "epoch": 4.32, - "learning_rate": 5.0375883221724474e-05, - "loss": 0.6753, + "epoch": 5.847179741533713, + "grad_norm": 7.8538312911987305, + "learning_rate": 3.734155543987758e-05, + "loss": 0.6718, "step": 424400 }, { - "epoch": 4.32, - "learning_rate": 5.037101394860926e-05, - "loss": 0.6177, + "epoch": 5.848557493593453, + "grad_norm": 46.438236236572266, + "learning_rate": 3.733407044606538e-05, + "loss": 0.6381, "step": 424500 }, { - "epoch": 4.33, - "learning_rate": 5.036614367947303e-05, - "loss": 0.6861, + "epoch": 5.849935245653192, + "grad_norm": 19.912607192993164, + "learning_rate": 3.732658461679859e-05, + "loss": 0.6957, "step": 424600 }, { - "epoch": 4.33, - "learning_rate": 5.0361272414553915e-05, - "loss": 0.7046, + "epoch": 5.851312997712932, + "grad_norm": 9.96639633178711, + "learning_rate": 3.731917282348428e-05, + "loss": 0.6889, "step": 424700 }, { - "epoch": 4.33, - "learning_rate": 5.035640015409009e-05, - "loss": 0.7382, + "epoch": 5.852690749772671, + "grad_norm": 1.5888310670852661, + "learning_rate": 3.731168533355492e-05, + "loss": 0.6641, "step": 424800 }, { - "epoch": 4.33, - "learning_rate": 5.035152689831977e-05, - "loss": 0.6595, + "epoch": 5.85406850183241, + "grad_norm": 76.31163787841797, + "learning_rate": 3.73041970100727e-05, + "loss": 0.6884, "step": 424900 }, { - "epoch": 4.33, - "learning_rate": 5.034665264748125e-05, - "loss": 0.6663, + "epoch": 5.85544625389215, + "grad_norm": 7.269099712371826, + "learning_rate": 3.729670785367379e-05, + "loss": 0.7136, "step": 425000 }, { - "epoch": 4.33, - "learning_rate": 5.034177740181282e-05, - "loss": 0.669, + "epoch": 5.856824005951889, + "grad_norm": 8.136226654052734, + "learning_rate": 3.728921786499442e-05, + "loss": 0.566, "step": 425100 }, { - "epoch": 4.33, - "learning_rate": 5.033690116155288e-05, - "loss": 0.6744, + "epoch": 5.858201758011628, + "grad_norm": 14.660557746887207, + "learning_rate": 3.7281727044670905e-05, + "loss": 0.6157, "step": 425200 }, { - "epoch": 4.33, - "learning_rate": 5.0332023926939835e-05, - "loss": 0.7273, + "epoch": 5.8595795100713675, + "grad_norm": 40.86747360229492, + "learning_rate": 3.727423539333965e-05, + "loss": 0.6475, "step": 425300 }, { - "epoch": 4.33, - "learning_rate": 5.0327145698212156e-05, - "loss": 0.6032, + "epoch": 5.860957262131107, + "grad_norm": 374.0465087890625, + "learning_rate": 3.726674291163709e-05, + "loss": 0.662, "step": 425400 }, { - "epoch": 4.34, - "learning_rate": 5.032226647560836e-05, - "loss": 0.6687, + "epoch": 5.862335014190847, + "grad_norm": 7.229824066162109, + "learning_rate": 3.7259249600199745e-05, + "loss": 0.7103, "step": 425500 }, { - "epoch": 4.34, - "learning_rate": 5.0317386259367006e-05, - "loss": 0.7383, + "epoch": 5.863712766250585, + "grad_norm": 25.638059616088867, + "learning_rate": 3.7251755459664226e-05, + "loss": 0.6467, "step": 425600 }, { - "epoch": 4.34, - "learning_rate": 5.031250504972672e-05, - "loss": 0.6944, + "epoch": 5.865090518310325, + "grad_norm": 353.8828430175781, + "learning_rate": 3.724426049066718e-05, + "loss": 0.7214, "step": 425700 }, { - "epoch": 4.34, - "learning_rate": 5.030762284692615e-05, - "loss": 0.7215, + "epoch": 5.866468270370064, + "grad_norm": 35.603641510009766, + "learning_rate": 3.7236764693845376e-05, + "loss": 0.7342, "step": 425800 }, { - "epoch": 4.34, - "learning_rate": 5.030278848807541e-05, - "loss": 0.7764, + "epoch": 5.867846022429804, + "grad_norm": 4.34743595123291, + "learning_rate": 3.7229268069835575e-05, + "loss": 0.724, "step": 425900 }, { - "epoch": 4.34, - "learning_rate": 5.0297953156293906e-05, - "loss": 0.7145, + "epoch": 5.869223774489543, + "grad_norm": 11.061627388000488, + "learning_rate": 3.7221770619274686e-05, + "loss": 0.7567, "step": 426000 }, { - "epoch": 4.34, - "learning_rate": 5.02930679952915e-05, - "loss": 0.7341, + "epoch": 5.870601526549282, + "grad_norm": 37.35090637207031, + "learning_rate": 3.721427234279965e-05, + "loss": 0.6375, "step": 426100 }, { - "epoch": 4.34, - "learning_rate": 5.0288230708521945e-05, - "loss": 0.7256, + "epoch": 5.871979278609022, + "grad_norm": 14.603950500488281, + "learning_rate": 3.720677324104748e-05, + "loss": 0.6786, "step": 426200 }, { - "epoch": 4.34, - "learning_rate": 5.0283343573257124e-05, - "loss": 0.6768, + "epoch": 5.87335703066876, + "grad_norm": 1196.724853515625, + "learning_rate": 3.719927331465526e-05, + "loss": 0.595, "step": 426300 }, { - "epoch": 4.34, - "learning_rate": 5.0278504332436146e-05, - "loss": 0.7276, + "epoch": 5.8747347827285, + "grad_norm": 5.4786577224731445, + "learning_rate": 3.7191772564260155e-05, + "loss": 0.6756, "step": 426400 }, { - "epoch": 4.35, - "learning_rate": 5.027361522385518e-05, - "loss": 0.7011, + "epoch": 5.8761125347882395, + "grad_norm": 24.734024047851562, + "learning_rate": 3.7184270990499383e-05, + "loss": 0.6679, "step": 426500 }, { - "epoch": 4.35, - "learning_rate": 5.0268725124015426e-05, - "loss": 0.6594, + "epoch": 5.877490286847979, + "grad_norm": 18.665855407714844, + "learning_rate": 3.7176768594010245e-05, + "loss": 0.6471, "step": 426600 }, { - "epoch": 4.35, - "learning_rate": 5.026383403315596e-05, - "loss": 0.7124, + "epoch": 5.878868038907719, + "grad_norm": 8.035333633422852, + "learning_rate": 3.7169265375430116e-05, + "loss": 0.6225, "step": 426700 }, { - "epoch": 4.35, - "learning_rate": 5.025894195151595e-05, - "loss": 0.6659, + "epoch": 5.880245790967457, + "grad_norm": 12.994061470031738, + "learning_rate": 3.7161761335396425e-05, + "loss": 0.7333, "step": 426800 }, { - "epoch": 4.35, - "learning_rate": 5.025404887933457e-05, - "loss": 0.6809, + "epoch": 5.881623543027197, + "grad_norm": 32.16779327392578, + "learning_rate": 3.715425647454667e-05, + "loss": 0.7809, "step": 426900 }, { - "epoch": 4.35, - "learning_rate": 5.024915481685107e-05, - "loss": 0.6606, + "epoch": 5.883001295086936, + "grad_norm": 6.404416084289551, + "learning_rate": 3.714675079351844e-05, + "loss": 0.6168, "step": 427000 }, { - "epoch": 4.35, - "learning_rate": 5.024425976430474e-05, - "loss": 0.671, + "epoch": 5.884379047146675, + "grad_norm": 4.859826564788818, + "learning_rate": 3.7139244292949386e-05, + "loss": 0.7034, "step": 427100 }, { - "epoch": 4.35, - "learning_rate": 5.023936372193492e-05, - "loss": 0.6549, + "epoch": 5.885756799206415, + "grad_norm": 7.830059051513672, + "learning_rate": 3.7131736973477206e-05, + "loss": 0.6149, "step": 427200 }, { - "epoch": 4.35, - "learning_rate": 5.023446668998099e-05, - "loss": 0.7506, + "epoch": 5.887134551266154, + "grad_norm": 5.535546779632568, + "learning_rate": 3.7124228835739704e-05, + "loss": 0.5354, "step": 427300 }, { - "epoch": 4.35, - "learning_rate": 5.0229568668682405e-05, - "loss": 0.6669, + "epoch": 5.888512303325894, + "grad_norm": 2.0457115173339844, + "learning_rate": 3.7116719880374714e-05, + "loss": 0.6405, "step": 427400 }, { - "epoch": 4.36, - "learning_rate": 5.022466965827863e-05, - "loss": 0.7246, + "epoch": 5.889890055385633, + "grad_norm": 47.52168655395508, + "learning_rate": 3.710921010802018e-05, + "loss": 0.5588, "step": 427500 }, { - "epoch": 4.36, - "learning_rate": 5.0219769659009195e-05, - "loss": 0.7291, + "epoch": 5.891267807445372, + "grad_norm": 2.555983304977417, + "learning_rate": 3.7101699519314085e-05, + "loss": 0.6799, "step": 427600 }, { - "epoch": 4.36, - "learning_rate": 5.021486867111371e-05, - "loss": 0.6716, + "epoch": 5.8926455595051115, + "grad_norm": 48.739097595214844, + "learning_rate": 3.709418811489449e-05, + "loss": 0.6886, "step": 427700 }, { - "epoch": 4.36, - "learning_rate": 5.020996669483176e-05, - "loss": 0.7209, + "epoch": 5.894023311564851, + "grad_norm": 14.525940895080566, + "learning_rate": 3.7086675895399535e-05, + "loss": 0.6192, "step": 427800 }, { - "epoch": 4.36, - "learning_rate": 5.020506373040305e-05, - "loss": 0.7368, + "epoch": 5.89540106362459, + "grad_norm": 5.6463141441345215, + "learning_rate": 3.707916286146741e-05, + "loss": 0.7687, "step": 427900 }, { - "epoch": 4.36, - "learning_rate": 5.020015977806731e-05, - "loss": 0.7203, + "epoch": 5.896778815684329, + "grad_norm": 7.920635223388672, + "learning_rate": 3.70716490137364e-05, + "loss": 0.6485, "step": 428000 }, { - "epoch": 4.36, - "learning_rate": 5.01952548380643e-05, - "loss": 0.713, + "epoch": 5.898156567744069, + "grad_norm": 5.616747856140137, + "learning_rate": 3.706413435284484e-05, + "loss": 0.7273, "step": 428100 }, { - "epoch": 4.36, - "learning_rate": 5.019034891063385e-05, - "loss": 0.7189, + "epoch": 5.899534319803808, + "grad_norm": 3.455004930496216, + "learning_rate": 3.7056618879431134e-05, + "loss": 0.6811, "step": 428200 }, { - "epoch": 4.36, - "learning_rate": 5.0185441996015825e-05, - "loss": 0.6492, + "epoch": 5.900912071863547, + "grad_norm": 7.980852127075195, + "learning_rate": 3.704910259413377e-05, + "loss": 0.639, "step": 428300 }, { - "epoch": 4.36, - "learning_rate": 5.018053409445015e-05, - "loss": 0.8632, + "epoch": 5.902289823923287, + "grad_norm": 6.100063323974609, + "learning_rate": 3.7041585497591274e-05, + "loss": 0.5664, "step": 428400 }, { - "epoch": 4.37, - "learning_rate": 5.017562520617679e-05, - "loss": 0.6912, + "epoch": 5.903667575983026, + "grad_norm": 6.610986232757568, + "learning_rate": 3.703406759044228e-05, + "loss": 0.6016, "step": 428500 }, { - "epoch": 4.37, - "learning_rate": 5.0170715331435744e-05, - "loss": 0.7184, + "epoch": 5.905045328042766, + "grad_norm": 4.744160175323486, + "learning_rate": 3.702654887332547e-05, + "loss": 0.6221, "step": 428600 }, { - "epoch": 4.37, - "learning_rate": 5.016580447046711e-05, - "loss": 0.7393, + "epoch": 5.906423080102504, + "grad_norm": 13.851295471191406, + "learning_rate": 3.70190293468796e-05, + "loss": 0.7111, "step": 428700 }, { - "epoch": 4.37, - "learning_rate": 5.0160892623510975e-05, - "loss": 0.7107, + "epoch": 5.907800832162244, + "grad_norm": 33.78352737426758, + "learning_rate": 3.701150901174348e-05, + "loss": 0.6226, "step": 428800 }, { - "epoch": 4.37, - "learning_rate": 5.01559797908075e-05, - "loss": 0.6983, + "epoch": 5.9091785842219835, + "grad_norm": 6.320300579071045, + "learning_rate": 3.700398786855602e-05, + "loss": 0.5655, "step": 428900 }, { - "epoch": 4.37, - "learning_rate": 5.015106597259691e-05, - "loss": 0.7329, + "epoch": 5.910556336281723, + "grad_norm": 3.313681125640869, + "learning_rate": 3.699646591795616e-05, + "loss": 0.5985, "step": 429000 }, { - "epoch": 4.37, - "learning_rate": 5.014615116911943e-05, - "loss": 0.6207, + "epoch": 5.911934088341462, + "grad_norm": 10.409984588623047, + "learning_rate": 3.698894316058294e-05, + "loss": 0.6006, "step": 429100 }, { - "epoch": 4.37, - "learning_rate": 5.0141235380615396e-05, - "loss": 0.6532, + "epoch": 5.913311840401201, + "grad_norm": 12.074596405029297, + "learning_rate": 3.698141959707546e-05, + "loss": 0.5957, "step": 429200 }, { - "epoch": 4.37, - "learning_rate": 5.013631860732514e-05, - "loss": 0.7355, + "epoch": 5.914689592460941, + "grad_norm": 8.207657814025879, + "learning_rate": 3.6973895228072865e-05, + "loss": 0.6958, "step": 429300 }, { - "epoch": 4.37, - "learning_rate": 5.0131400849489084e-05, - "loss": 0.6442, + "epoch": 5.91606734452068, + "grad_norm": 5.099185943603516, + "learning_rate": 3.69663700542144e-05, + "loss": 0.5632, "step": 429400 }, { - "epoch": 4.38, - "learning_rate": 5.0126482107347656e-05, - "loss": 0.6659, + "epoch": 5.917445096580419, + "grad_norm": 3.278627395629883, + "learning_rate": 3.695884407613938e-05, + "loss": 0.6279, "step": 429500 }, { - "epoch": 4.38, - "learning_rate": 5.012156238114137e-05, - "loss": 0.7708, + "epoch": 5.918822848640159, + "grad_norm": 2.1374051570892334, + "learning_rate": 3.6951317294487165e-05, + "loss": 0.5715, "step": 429600 }, { - "epoch": 4.38, - "learning_rate": 5.011664167111075e-05, - "loss": 0.6709, + "epoch": 5.920200600699898, + "grad_norm": 2.794398784637451, + "learning_rate": 3.6943789709897195e-05, + "loss": 0.5368, "step": 429700 }, { - "epoch": 4.38, - "learning_rate": 5.011171997749641e-05, - "loss": 0.7167, + "epoch": 5.921578352759638, + "grad_norm": 448.9095458984375, + "learning_rate": 3.693626132300896e-05, + "loss": 0.5971, "step": 429800 }, { - "epoch": 4.38, - "learning_rate": 5.010679730053898e-05, - "loss": 0.6385, + "epoch": 5.922956104819376, + "grad_norm": 8.30272102355957, + "learning_rate": 3.692873213446206e-05, + "loss": 0.5373, "step": 429900 }, { - "epoch": 4.38, - "learning_rate": 5.0101873640479154e-05, - "loss": 0.7165, + "epoch": 5.924333856879116, + "grad_norm": 3.7461960315704346, + "learning_rate": 3.692120214489613e-05, + "loss": 0.5221, "step": 430000 }, { - "epoch": 4.38, - "learning_rate": 5.009694899755767e-05, - "loss": 0.572, + "epoch": 5.9257116089388555, + "grad_norm": 4.689380168914795, + "learning_rate": 3.691367135495088e-05, + "loss": 0.6562, "step": 430100 }, { - "epoch": 4.38, - "learning_rate": 5.009202337201531e-05, - "loss": 0.7944, + "epoch": 5.927089360998595, + "grad_norm": 8.454404830932617, + "learning_rate": 3.690613976526608e-05, + "loss": 0.5559, "step": 430200 }, { - "epoch": 4.38, - "learning_rate": 5.008709676409292e-05, - "loss": 0.5477, + "epoch": 5.928467113058334, + "grad_norm": 5.8494768142700195, + "learning_rate": 3.689860737648159e-05, + "loss": 0.6162, "step": 430300 }, { - "epoch": 4.38, - "learning_rate": 5.008216917403136e-05, - "loss": 0.6377, + "epoch": 5.929844865118073, + "grad_norm": 3.5328571796417236, + "learning_rate": 3.6891074189237324e-05, + "loss": 0.576, "step": 430400 }, { - "epoch": 4.39, - "learning_rate": 5.007724060207157e-05, - "loss": 0.7971, + "epoch": 5.931222617177813, + "grad_norm": 3.603300094604492, + "learning_rate": 3.688354020417326e-05, + "loss": 0.5384, "step": 430500 }, { - "epoch": 4.39, - "learning_rate": 5.007231104845454e-05, - "loss": 0.7742, + "epoch": 5.9326003692375515, + "grad_norm": 10.173060417175293, + "learning_rate": 3.687600542192945e-05, + "loss": 0.5893, "step": 430600 }, { - "epoch": 4.39, - "learning_rate": 5.006738051342128e-05, - "loss": 0.7246, + "epoch": 5.933978121297291, + "grad_norm": 8.62248706817627, + "learning_rate": 3.686846984314601e-05, + "loss": 0.6259, "step": 430700 }, { - "epoch": 4.39, - "learning_rate": 5.0062448997212866e-05, - "loss": 0.7557, + "epoch": 5.935355873357031, + "grad_norm": 8.846124649047852, + "learning_rate": 3.686093346846313e-05, + "loss": 0.521, "step": 430800 }, { - "epoch": 4.39, - "learning_rate": 5.005751650007043e-05, - "loss": 0.7402, + "epoch": 5.93673362541677, + "grad_norm": 1.8469078540802002, + "learning_rate": 3.685339629852106e-05, + "loss": 0.5536, "step": 430900 }, { - "epoch": 4.39, - "learning_rate": 5.005258302223512e-05, - "loss": 0.7289, + "epoch": 5.93811137747651, + "grad_norm": 6.402103424072266, + "learning_rate": 3.6845858333960125e-05, + "loss": 0.5646, "step": 431000 }, { - "epoch": 4.39, - "learning_rate": 5.004764856394818e-05, - "loss": 0.6567, + "epoch": 5.939489129536248, + "grad_norm": 7.487903594970703, + "learning_rate": 3.683831957542071e-05, + "loss": 0.5571, "step": 431100 }, { - "epoch": 4.39, - "learning_rate": 5.004271312545087e-05, - "loss": 0.712, + "epoch": 5.940866881595988, + "grad_norm": 3.5843000411987305, + "learning_rate": 3.6830780023543266e-05, + "loss": 0.5761, "step": 431200 }, { - "epoch": 4.39, - "learning_rate": 5.0037776706984484e-05, - "loss": 0.6962, + "epoch": 5.9422446336557275, + "grad_norm": 4.272304058074951, + "learning_rate": 3.6823239678968326e-05, + "loss": 0.5372, "step": 431300 }, { - "epoch": 4.4, - "learning_rate": 5.0032839308790394e-05, - "loss": 0.7904, + "epoch": 5.943622385715466, + "grad_norm": 12.894792556762695, + "learning_rate": 3.6815773957621386e-05, + "loss": 0.5875, "step": 431400 }, { - "epoch": 4.4, - "learning_rate": 5.002790093111002e-05, - "loss": 0.567, + "epoch": 5.945000137775206, + "grad_norm": 132.09768676757812, + "learning_rate": 3.6808232037484276e-05, + "loss": 0.6061, "step": 431500 }, { - "epoch": 4.4, - "learning_rate": 5.00229615741848e-05, - "loss": 0.6883, + "epoch": 5.946377889834945, + "grad_norm": 24.59657859802246, + "learning_rate": 3.680068932656524e-05, + "loss": 0.5844, "step": 431600 }, { - "epoch": 4.4, - "learning_rate": 5.0018021238256265e-05, - "loss": 0.671, + "epoch": 5.947755641894685, + "grad_norm": 5.0189080238342285, + "learning_rate": 3.6793145825505064e-05, + "loss": 0.5308, "step": 431700 }, { - "epoch": 4.4, - "learning_rate": 5.001307992356594e-05, - "loss": 0.801, + "epoch": 5.949133393954424, + "grad_norm": 1.8234995603561401, + "learning_rate": 3.6785601534944605e-05, + "loss": 0.6098, "step": 431800 }, { - "epoch": 4.4, - "learning_rate": 5.000813763035545e-05, - "loss": 0.6168, + "epoch": 5.950511146014163, + "grad_norm": 9.298529624938965, + "learning_rate": 3.67780564555248e-05, + "loss": 0.5942, "step": 431900 }, { - "epoch": 4.4, - "learning_rate": 5.0003194358866426e-05, - "loss": 0.6935, + "epoch": 5.951888898073903, + "grad_norm": 10.730475425720215, + "learning_rate": 3.677051058788662e-05, + "loss": 0.596, "step": 432000 }, { - "epoch": 4.4, - "learning_rate": 4.999825010934057e-05, - "loss": 0.6424, + "epoch": 5.953266650133642, + "grad_norm": 11.711047172546387, + "learning_rate": 3.676296393267115e-05, + "loss": 0.523, "step": 432100 }, { - "epoch": 4.4, - "learning_rate": 4.999330488201962e-05, - "loss": 0.6875, + "epoch": 5.954644402193381, + "grad_norm": 8.184954643249512, + "learning_rate": 3.67554164905195e-05, + "loss": 0.5469, "step": 432200 }, { - "epoch": 4.4, - "learning_rate": 4.998835867714539e-05, - "loss": 0.6773, + "epoch": 5.95602215425312, + "grad_norm": 3.974088430404663, + "learning_rate": 3.674786826207286e-05, + "loss": 0.5885, "step": 432300 }, { - "epoch": 4.41, - "learning_rate": 4.99834114949597e-05, - "loss": 0.6703, + "epoch": 5.95739990631286, + "grad_norm": 5.20788049697876, + "learning_rate": 3.674031924797251e-05, + "loss": 0.5494, "step": 432400 }, { - "epoch": 4.41, - "learning_rate": 4.9978512822132695e-05, - "loss": 0.684, + "epoch": 5.9587776583725995, + "grad_norm": 5.296034336090088, + "learning_rate": 3.673276944885976e-05, + "loss": 0.5949, "step": 432500 }, { - "epoch": 4.41, - "learning_rate": 4.997356369581687e-05, - "loss": 0.7165, + "epoch": 5.960155410432339, + "grad_norm": 5.673565864562988, + "learning_rate": 3.672521886537601e-05, + "loss": 0.5615, "step": 432600 }, { - "epoch": 4.41, - "learning_rate": 4.9968613592913e-05, - "loss": 0.7496, + "epoch": 5.961533162492078, + "grad_norm": 10.703372955322266, + "learning_rate": 3.671766749816273e-05, + "loss": 0.5773, "step": 432700 }, { - "epoch": 4.41, - "learning_rate": 4.996366251366307e-05, - "loss": 0.6658, + "epoch": 5.962910914551817, + "grad_norm": 7.864493370056152, + "learning_rate": 3.6710115347861436e-05, + "loss": 0.5727, "step": 432800 }, { - "epoch": 4.41, - "learning_rate": 4.995871045830919e-05, - "loss": 0.5591, + "epoch": 5.964288666611557, + "grad_norm": 3.806501626968384, + "learning_rate": 3.670256241511372e-05, + "loss": 0.5684, "step": 432900 }, { - "epoch": 4.41, - "learning_rate": 4.995375742709348e-05, - "loss": 0.6732, + "epoch": 5.9656664186712955, + "grad_norm": 12.322609901428223, + "learning_rate": 3.6695008700561254e-05, + "loss": 0.5526, "step": 433000 }, { - "epoch": 4.41, - "learning_rate": 4.9948803420258114e-05, - "loss": 0.6448, + "epoch": 5.967044170731035, + "grad_norm": 25.0034122467041, + "learning_rate": 3.668745420484574e-05, + "loss": 0.6101, "step": 433100 }, { - "epoch": 4.41, - "learning_rate": 4.994384843804529e-05, - "loss": 0.7854, + "epoch": 5.968421922790775, + "grad_norm": 3.787853717803955, + "learning_rate": 3.6679898928609005e-05, + "loss": 0.5057, "step": 433200 }, { - "epoch": 4.41, - "learning_rate": 4.993889248069731e-05, - "loss": 0.6784, + "epoch": 5.969799674850514, + "grad_norm": 11.54350757598877, + "learning_rate": 3.6672342872492894e-05, + "loss": 0.5054, "step": 433300 }, { - "epoch": 4.42, - "learning_rate": 4.993393554845647e-05, - "loss": 0.7174, + "epoch": 5.971177426910253, + "grad_norm": 7.111335754394531, + "learning_rate": 3.666478603713932e-05, + "loss": 0.5422, "step": 433400 }, { - "epoch": 4.42, - "learning_rate": 4.992897764156515e-05, - "loss": 0.6565, + "epoch": 5.972555178969992, + "grad_norm": 5.190135955810547, + "learning_rate": 3.6657228423190287e-05, + "loss": 0.5514, "step": 433500 }, { - "epoch": 4.42, - "learning_rate": 4.9924018760265746e-05, - "loss": 0.5781, + "epoch": 5.973932931029732, + "grad_norm": 1.7636561393737793, + "learning_rate": 3.6649670031287866e-05, + "loss": 0.5779, "step": 433600 }, { - "epoch": 4.42, - "learning_rate": 4.991905890480073e-05, - "loss": 0.7323, + "epoch": 5.9753106830894716, + "grad_norm": 11.792951583862305, + "learning_rate": 3.664211086207415e-05, + "loss": 0.5175, "step": 433700 }, { - "epoch": 4.42, - "learning_rate": 4.9914098075412595e-05, - "loss": 0.6291, + "epoch": 5.97668843514921, + "grad_norm": 18.334692001342773, + "learning_rate": 3.6634550916191344e-05, + "loss": 0.5519, "step": 433800 }, { - "epoch": 4.42, - "learning_rate": 4.990913627234391e-05, - "loss": 0.6552, + "epoch": 5.97806618720895, + "grad_norm": 9.421588897705078, + "learning_rate": 3.662699019428172e-05, + "loss": 0.6008, "step": 433900 }, { - "epoch": 4.42, - "learning_rate": 4.990417349583728e-05, - "loss": 0.7695, + "epoch": 5.979443939268689, + "grad_norm": 0.761475145816803, + "learning_rate": 3.6619428696987566e-05, + "loss": 0.5441, "step": 434000 }, { - "epoch": 4.42, - "learning_rate": 4.989920974613534e-05, - "loss": 0.6178, + "epoch": 5.980821691328429, + "grad_norm": 2.420869827270508, + "learning_rate": 3.661186642495131e-05, + "loss": 0.5136, "step": 434100 }, { - "epoch": 4.42, - "learning_rate": 4.9894245023480794e-05, - "loss": 0.6355, + "epoch": 5.982199443388168, + "grad_norm": 56.12458038330078, + "learning_rate": 3.660430337881538e-05, + "loss": 0.565, "step": 434200 }, { - "epoch": 4.42, - "learning_rate": 4.98892793281164e-05, - "loss": 0.6772, + "epoch": 5.983577195447907, + "grad_norm": 3.856255292892456, + "learning_rate": 3.659673955922229e-05, + "loss": 0.6038, "step": 434300 }, { - "epoch": 4.43, - "learning_rate": 4.988431266028492e-05, - "loss": 0.7564, + "epoch": 5.984954947507647, + "grad_norm": 4.925717353820801, + "learning_rate": 3.658917496681465e-05, + "loss": 0.5903, "step": 434400 }, { - "epoch": 4.43, - "learning_rate": 4.987934502022923e-05, - "loss": 0.7043, + "epoch": 5.986332699567386, + "grad_norm": 5.484774589538574, + "learning_rate": 3.6581609602235086e-05, + "loss": 0.5318, "step": 434500 }, { - "epoch": 4.43, - "learning_rate": 4.9874376408192197e-05, - "loss": 0.6809, + "epoch": 5.987710451627125, + "grad_norm": 5.931771755218506, + "learning_rate": 3.657404346612633e-05, + "loss": 0.6118, "step": 434600 }, { - "epoch": 4.43, - "learning_rate": 4.986940682441677e-05, - "loss": 0.7409, + "epoch": 5.9890882036868645, + "grad_norm": 2.350668430328369, + "learning_rate": 3.656647655913116e-05, + "loss": 0.4815, "step": 434700 }, { - "epoch": 4.43, - "learning_rate": 4.986443626914591e-05, - "loss": 0.6305, + "epoch": 5.990465955746604, + "grad_norm": 3.3989789485931396, + "learning_rate": 3.655890888189242e-05, + "loss": 0.5539, "step": 434800 }, { - "epoch": 4.43, - "learning_rate": 4.985946474262268e-05, - "loss": 0.6624, + "epoch": 5.991843707806343, + "grad_norm": 3.3227851390838623, + "learning_rate": 3.655134043505302e-05, + "loss": 0.5306, "step": 434900 }, { - "epoch": 4.43, - "learning_rate": 4.985449224509013e-05, - "loss": 0.6669, + "epoch": 5.993221459866082, + "grad_norm": 23.648752212524414, + "learning_rate": 3.654377121925595e-05, + "loss": 0.5452, "step": 435000 }, { - "epoch": 4.43, - "learning_rate": 4.9849518776791385e-05, - "loss": 0.6623, + "epoch": 5.994599211925822, + "grad_norm": 434.21368408203125, + "learning_rate": 3.653620123514425e-05, + "loss": 0.5292, "step": 435100 }, { - "epoch": 4.43, - "learning_rate": 4.9844594087161145e-05, - "loss": 0.7072, + "epoch": 5.995976963985561, + "grad_norm": 7.6446404457092285, + "learning_rate": 3.652863048336102e-05, + "loss": 0.5151, "step": 435200 }, { - "epoch": 4.43, - "learning_rate": 4.9839618687761194e-05, - "loss": 0.6677, + "epoch": 5.997354716045301, + "grad_norm": 8.195099830627441, + "learning_rate": 3.652105896454945e-05, + "loss": 0.5419, "step": 435300 }, { - "epoch": 4.44, - "learning_rate": 4.9834642318322294e-05, - "loss": 0.6886, + "epoch": 5.99873246810504, + "grad_norm": 5.087191581726074, + "learning_rate": 3.651348667935276e-05, + "loss": 0.5407, "step": 435400 }, { - "epoch": 4.44, - "learning_rate": 4.982966497908775e-05, - "loss": 0.6502, + "epoch": 6.000110220164779, + "grad_norm": 3.932257890701294, + "learning_rate": 3.650591362841426e-05, + "loss": 0.5739, "step": 435500 }, { - "epoch": 4.44, - "learning_rate": 4.982468667030092e-05, - "loss": 0.6635, + "epoch": 6.001487972224519, + "grad_norm": 6.996191501617432, + "learning_rate": 3.6498339812377335e-05, + "loss": 0.4838, "step": 435600 }, { - "epoch": 4.44, - "learning_rate": 4.9819707392205216e-05, - "loss": 0.7045, + "epoch": 6.002865724284258, + "grad_norm": 4.764026165008545, + "learning_rate": 3.6490765231885404e-05, + "loss": 0.5172, "step": 435700 }, { - "epoch": 4.44, - "learning_rate": 4.98147271450441e-05, - "loss": 0.6756, + "epoch": 6.004243476343997, + "grad_norm": 11.481037139892578, + "learning_rate": 3.6483189887581966e-05, + "loss": 0.5502, "step": 435800 }, { - "epoch": 4.44, - "learning_rate": 4.980974592906108e-05, - "loss": 0.7234, + "epoch": 6.0056212284037365, + "grad_norm": 15.15224838256836, + "learning_rate": 3.647561378011059e-05, + "loss": 0.4605, "step": 435900 }, { - "epoch": 4.44, - "learning_rate": 4.9804763744499704e-05, - "loss": 0.6781, + "epoch": 6.006998980463476, + "grad_norm": 4.166591644287109, + "learning_rate": 3.6468036910114915e-05, + "loss": 0.5327, "step": 436000 }, { - "epoch": 4.44, - "learning_rate": 4.9799780591603564e-05, - "loss": 0.7439, + "epoch": 6.008376732523215, + "grad_norm": 4.7048821449279785, + "learning_rate": 3.6460459278238616e-05, + "loss": 0.5222, "step": 436100 }, { - "epoch": 4.44, - "learning_rate": 4.979479647061632e-05, - "loss": 0.755, + "epoch": 6.009754484582954, + "grad_norm": 10.500849723815918, + "learning_rate": 3.645288088512546e-05, + "loss": 0.5579, "step": 436200 }, { - "epoch": 4.45, - "learning_rate": 4.978981138178165e-05, - "loss": 0.7213, + "epoch": 6.011132236642694, + "grad_norm": 10.437847137451172, + "learning_rate": 3.644530173141926e-05, + "loss": 0.5227, "step": 436300 }, { - "epoch": 4.45, - "learning_rate": 4.97848253253433e-05, - "loss": 0.6981, + "epoch": 6.012509988702433, + "grad_norm": 243.36138916015625, + "learning_rate": 3.643772181776393e-05, + "loss": 0.566, "step": 436400 }, { - "epoch": 4.45, - "learning_rate": 4.9779838301545076e-05, - "loss": 0.7547, + "epoch": 6.013887740762172, + "grad_norm": 5.659040927886963, + "learning_rate": 3.643021695528945e-05, + "loss": 0.5845, "step": 436500 }, { - "epoch": 4.45, - "learning_rate": 4.977485031063078e-05, - "loss": 0.7776, + "epoch": 6.015265492821912, + "grad_norm": 7.633718967437744, + "learning_rate": 3.642263553125117e-05, + "loss": 0.487, "step": 436600 }, { - "epoch": 4.45, - "learning_rate": 4.9769861352844326e-05, - "loss": 0.7326, + "epoch": 6.016643244881651, + "grad_norm": 7.7264227867126465, + "learning_rate": 3.6415053349189354e-05, + "loss": 0.5635, "step": 436700 }, { - "epoch": 4.45, - "learning_rate": 4.9764871428429626e-05, - "loss": 0.6508, + "epoch": 6.018020996941391, + "grad_norm": 3.5479843616485596, + "learning_rate": 3.640747040974815e-05, + "loss": 0.6268, "step": 436800 }, { - "epoch": 4.45, - "learning_rate": 4.9759880537630666e-05, - "loss": 0.7494, + "epoch": 6.019398749001129, + "grad_norm": 2.445791482925415, + "learning_rate": 3.6399962554277246e-05, + "loss": 0.5395, "step": 436900 }, { - "epoch": 4.45, - "learning_rate": 4.975493860404246e-05, - "loss": 0.6432, + "epoch": 6.020776501060869, + "grad_norm": 3.3734915256500244, + "learning_rate": 3.639237810956767e-05, + "loss": 0.5019, "step": 437000 }, { - "epoch": 4.45, - "learning_rate": 4.974994579086486e-05, - "loss": 0.7281, + "epoch": 6.0221542531206085, + "grad_norm": 2.492452621459961, + "learning_rate": 3.638479290940507e-05, + "loss": 0.5262, "step": 437100 }, { - "epoch": 4.45, - "learning_rate": 4.9744952012032755e-05, - "loss": 0.5645, + "epoch": 6.023532005180348, + "grad_norm": 3.395153284072876, + "learning_rate": 3.637720695443387e-05, + "loss": 0.5754, "step": 437200 }, { - "epoch": 4.46, - "learning_rate": 4.973995726779034e-05, - "loss": 0.6535, + "epoch": 6.024909757240087, + "grad_norm": 7.3772969245910645, + "learning_rate": 3.636962024529851e-05, + "loss": 0.5231, "step": 437300 }, { - "epoch": 4.46, - "learning_rate": 4.973496155838181e-05, - "loss": 0.6945, + "epoch": 6.026287509299826, + "grad_norm": 3.5467405319213867, + "learning_rate": 3.636203278264355e-05, + "loss": 0.5894, "step": 437400 }, { - "epoch": 4.46, - "learning_rate": 4.972996488405144e-05, - "loss": 0.7149, + "epoch": 6.027665261359566, + "grad_norm": 3.104058027267456, + "learning_rate": 3.6354444567113555e-05, + "loss": 0.4561, "step": 437500 }, { - "epoch": 4.46, - "learning_rate": 4.972496724504352e-05, - "loss": 0.7294, + "epoch": 6.029043013419305, + "grad_norm": 4.787964344024658, + "learning_rate": 3.6346855599353195e-05, + "loss": 0.5876, "step": 437600 }, { - "epoch": 4.46, - "learning_rate": 4.971996864160242e-05, - "loss": 0.7339, + "epoch": 6.030420765479044, + "grad_norm": 12.891325950622559, + "learning_rate": 3.6339265880007205e-05, + "loss": 0.4888, "step": 437700 }, { - "epoch": 4.46, - "learning_rate": 4.9714969073972535e-05, - "loss": 0.6449, + "epoch": 6.031798517538784, + "grad_norm": 9.917616844177246, + "learning_rate": 3.633167540972034e-05, + "loss": 0.5959, "step": 437800 }, { - "epoch": 4.46, - "learning_rate": 4.9709968542398316e-05, - "loss": 0.6048, + "epoch": 6.033176269598523, + "grad_norm": 2.963008403778076, + "learning_rate": 3.632408418913747e-05, + "loss": 0.4673, "step": 437900 }, { - "epoch": 4.46, - "learning_rate": 4.9704967047124254e-05, - "loss": 0.8038, + "epoch": 6.034554021658263, + "grad_norm": 8.004764556884766, + "learning_rate": 3.6316492218903524e-05, + "loss": 0.5567, "step": 438000 }, { - "epoch": 4.46, - "learning_rate": 4.96999645883949e-05, - "loss": 0.6365, + "epoch": 6.035931773718001, + "grad_norm": 2.5110106468200684, + "learning_rate": 3.630889949966344e-05, + "loss": 0.4351, "step": 438100 }, { - "epoch": 4.46, - "learning_rate": 4.969496116645485e-05, - "loss": 0.709, + "epoch": 6.037309525777741, + "grad_norm": 2.3889193534851074, + "learning_rate": 3.6301306032062285e-05, + "loss": 0.5374, "step": 438200 }, { - "epoch": 4.47, - "learning_rate": 4.968995678154872e-05, - "loss": 0.6996, + "epoch": 6.0386872778374805, + "grad_norm": 5.569151878356934, + "learning_rate": 3.629371181674515e-05, + "loss": 0.4994, "step": 438300 }, { - "epoch": 4.47, - "learning_rate": 4.9684951433921205e-05, - "loss": 0.7499, + "epoch": 6.04006502989722, + "grad_norm": 3.1107232570648193, + "learning_rate": 3.628611685435721e-05, + "loss": 0.4785, "step": 438400 }, { - "epoch": 4.47, - "learning_rate": 4.967994512381705e-05, - "loss": 0.7049, + "epoch": 6.041442781956959, + "grad_norm": 11.284457206726074, + "learning_rate": 3.627852114554369e-05, + "loss": 0.5848, "step": 438500 }, { - "epoch": 4.47, - "learning_rate": 4.967493785148102e-05, - "loss": 0.7133, + "epoch": 6.042820534016698, + "grad_norm": 24.089033126831055, + "learning_rate": 3.627092469094989e-05, + "loss": 0.6009, "step": 438600 }, { - "epoch": 4.47, - "learning_rate": 4.966992961715793e-05, - "loss": 0.5767, + "epoch": 6.044198286076438, + "grad_norm": 9.23412799835205, + "learning_rate": 3.626332749122117e-05, + "loss": 0.5247, "step": 438700 }, { - "epoch": 4.47, - "learning_rate": 4.966492042109267e-05, - "loss": 0.6975, + "epoch": 6.045576038136177, + "grad_norm": 1.5072345733642578, + "learning_rate": 3.6255729547002946e-05, + "loss": 0.4833, "step": 438800 }, { - "epoch": 4.47, - "learning_rate": 4.965991026353017e-05, - "loss": 0.6571, + "epoch": 6.046953790195916, + "grad_norm": 4.464546203613281, + "learning_rate": 3.62481308589407e-05, + "loss": 0.5437, "step": 438900 }, { - "epoch": 4.47, - "learning_rate": 4.9654899144715364e-05, - "loss": 0.7542, + "epoch": 6.048331542255656, + "grad_norm": 1.1243488788604736, + "learning_rate": 3.6240531427679984e-05, + "loss": 0.5059, "step": 439000 }, { - "epoch": 4.47, - "learning_rate": 4.964988706489329e-05, - "loss": 0.7228, + "epoch": 6.049709294315395, + "grad_norm": 4.448837757110596, + "learning_rate": 3.623293125386641e-05, + "loss": 0.608, "step": 439100 }, { - "epoch": 4.47, - "learning_rate": 4.964487402430901e-05, - "loss": 0.7408, + "epoch": 6.051087046375135, + "grad_norm": 25.50322723388672, + "learning_rate": 3.622540635097317e-05, + "loss": 0.5858, "step": 439200 }, { - "epoch": 4.48, - "learning_rate": 4.963986002320762e-05, - "loss": 0.6805, + "epoch": 6.052464798434873, + "grad_norm": 2.336292266845703, + "learning_rate": 3.621780470140037e-05, + "loss": 0.5671, "step": 439300 }, { - "epoch": 4.48, - "learning_rate": 4.963484506183428e-05, - "loss": 0.6408, + "epoch": 6.053842550494613, + "grad_norm": 9.667577743530273, + "learning_rate": 3.6210202311205475e-05, + "loss": 0.5473, "step": 439400 }, { - "epoch": 4.48, - "learning_rate": 4.9629829140434197e-05, - "loss": 0.6306, + "epoch": 6.0552203025543525, + "grad_norm": 5.689519882202148, + "learning_rate": 3.6202599181034314e-05, + "loss": 0.5299, "step": 439500 }, { - "epoch": 4.48, - "learning_rate": 4.9624812259252607e-05, - "loss": 0.7145, + "epoch": 6.056598054614092, + "grad_norm": 2.0845119953155518, + "learning_rate": 3.6194995311532844e-05, + "loss": 0.4396, "step": 439600 }, { - "epoch": 4.48, - "learning_rate": 4.961979441853481e-05, - "loss": 0.7239, + "epoch": 6.057975806673831, + "grad_norm": 3.8977465629577637, + "learning_rate": 3.618739070334702e-05, + "loss": 0.5506, "step": 439700 }, { - "epoch": 4.48, - "learning_rate": 4.961477561852615e-05, - "loss": 0.7509, + "epoch": 6.05935355873357, + "grad_norm": 5.027807235717773, + "learning_rate": 3.617978535712292e-05, + "loss": 0.4433, "step": 439800 }, { - "epoch": 4.48, - "learning_rate": 4.9609755859472017e-05, - "loss": 0.649, + "epoch": 6.06073131079331, + "grad_norm": 1.3472216129302979, + "learning_rate": 3.617217927350666e-05, + "loss": 0.4872, "step": 439900 }, { - "epoch": 4.48, - "learning_rate": 4.9604735141617856e-05, - "loss": 0.5838, + "epoch": 6.062109062853049, + "grad_norm": 4.526773452758789, + "learning_rate": 3.616457245314439e-05, + "loss": 0.4335, "step": 440000 }, { - "epoch": 4.48, - "learning_rate": 4.959971346520913e-05, - "loss": 0.7609, + "epoch": 6.063486814912788, + "grad_norm": 6.204596996307373, + "learning_rate": 3.6156964896682374e-05, + "loss": 0.5058, "step": 440100 }, { - "epoch": 4.48, - "learning_rate": 4.9594690830491384e-05, - "loss": 0.663, + "epoch": 6.064864566972528, + "grad_norm": 2.962155818939209, + "learning_rate": 3.61493566047669e-05, + "loss": 0.5986, "step": 440200 }, { - "epoch": 4.49, - "learning_rate": 4.95896672377102e-05, - "loss": 0.6679, + "epoch": 6.066242319032267, + "grad_norm": 3.6378650665283203, + "learning_rate": 3.614174757804432e-05, + "loss": 0.4689, "step": 440300 }, { - "epoch": 4.49, - "learning_rate": 4.958464268711118e-05, - "loss": 0.6525, + "epoch": 6.067620071092006, + "grad_norm": 3.064312219619751, + "learning_rate": 3.613413781716109e-05, + "loss": 0.5226, "step": 440400 }, { - "epoch": 4.49, - "learning_rate": 4.957961717894002e-05, - "loss": 0.784, + "epoch": 6.068997823151745, + "grad_norm": 1.3371057510375977, + "learning_rate": 3.612652732276367e-05, + "loss": 0.5057, "step": 440500 }, { - "epoch": 4.49, - "learning_rate": 4.9574590713442425e-05, - "loss": 0.722, + "epoch": 6.070375575211485, + "grad_norm": 7.146762371063232, + "learning_rate": 3.6118916095498616e-05, + "loss": 0.5122, "step": 440600 }, { - "epoch": 4.49, - "learning_rate": 4.956956329086415e-05, - "loss": 0.6848, + "epoch": 6.0717533272712245, + "grad_norm": 3.738504409790039, + "learning_rate": 3.611130413601255e-05, + "loss": 0.5273, "step": 440700 }, { - "epoch": 4.49, - "learning_rate": 4.956453491145102e-05, - "loss": 0.6269, + "epoch": 6.073131079330963, + "grad_norm": 7.647688865661621, + "learning_rate": 3.6103691444952153e-05, + "loss": 0.5517, "step": 440800 }, { - "epoch": 4.49, - "learning_rate": 4.9559505575448894e-05, - "loss": 0.6028, + "epoch": 6.074508831390703, + "grad_norm": 6.716989517211914, + "learning_rate": 3.6096078022964135e-05, + "loss": 0.5269, "step": 440900 }, { - "epoch": 4.49, - "learning_rate": 4.955447528310366e-05, - "loss": 0.65, + "epoch": 6.075886583450442, + "grad_norm": 2.3925464153289795, + "learning_rate": 3.6088463870695325e-05, + "loss": 0.4833, "step": 441000 }, { - "epoch": 4.49, - "learning_rate": 4.954944403466128e-05, - "loss": 0.6295, + "epoch": 6.077264335510182, + "grad_norm": 5.472541809082031, + "learning_rate": 3.6080848988792564e-05, + "loss": 0.5763, "step": 441100 }, { - "epoch": 4.5, - "learning_rate": 4.954441183036775e-05, - "loss": 0.6594, + "epoch": 6.0786420875699205, + "grad_norm": 10.458732604980469, + "learning_rate": 3.607323337790278e-05, + "loss": 0.4669, "step": 441200 }, { - "epoch": 4.5, - "learning_rate": 4.953937867046913e-05, - "loss": 0.7185, + "epoch": 6.08001983962966, + "grad_norm": 2.846069812774658, + "learning_rate": 3.6065617038672965e-05, + "loss": 0.5704, "step": 441300 }, { - "epoch": 4.5, - "learning_rate": 4.953434455521149e-05, - "loss": 0.6824, + "epoch": 6.0813975916894, + "grad_norm": 5.048154830932617, + "learning_rate": 3.605799997175016e-05, + "loss": 0.5151, "step": 441400 }, { - "epoch": 4.5, - "learning_rate": 4.952930948484097e-05, - "loss": 0.6306, + "epoch": 6.082775343749139, + "grad_norm": 7.898248195648193, + "learning_rate": 3.6050382177781476e-05, + "loss": 0.5817, "step": 441500 }, { - "epoch": 4.5, - "learning_rate": 4.952427345960377e-05, - "loss": 0.6899, + "epoch": 6.084153095808878, + "grad_norm": 2.1498281955718994, + "learning_rate": 3.6042763657414084e-05, + "loss": 0.5335, "step": 441600 }, { - "epoch": 4.5, - "learning_rate": 4.951923647974611e-05, - "loss": 0.6913, + "epoch": 6.085530847868617, + "grad_norm": 11.37043285369873, + "learning_rate": 3.6035144411295215e-05, + "loss": 0.4992, "step": 441700 }, { - "epoch": 4.5, - "learning_rate": 4.9514198545514265e-05, - "loss": 0.6832, + "epoch": 6.086908599928357, + "grad_norm": 5.462133407592773, + "learning_rate": 3.602752444007215e-05, + "loss": 0.4969, "step": 441800 }, { - "epoch": 4.5, - "learning_rate": 4.950915965715457e-05, - "loss": 0.7589, + "epoch": 6.0882863519880965, + "grad_norm": 5.392831802368164, + "learning_rate": 3.601990374439227e-05, + "loss": 0.5282, "step": 441900 }, { - "epoch": 4.5, - "learning_rate": 4.9504119814913395e-05, - "loss": 0.587, + "epoch": 6.089664104047835, + "grad_norm": 6.102015018463135, + "learning_rate": 3.6012282324902985e-05, + "loss": 0.4772, "step": 442000 }, { - "epoch": 4.5, - "learning_rate": 4.949907901903716e-05, - "loss": 0.678, + "epoch": 6.091041856107575, + "grad_norm": 5.128676891326904, + "learning_rate": 3.600466018225176e-05, + "loss": 0.4876, "step": 442100 }, { - "epoch": 4.51, - "learning_rate": 4.949403726977232e-05, - "loss": 0.7047, + "epoch": 6.092419608167314, + "grad_norm": 1.8960788249969482, + "learning_rate": 3.599703731708614e-05, + "loss": 0.5109, "step": 442200 }, { - "epoch": 4.51, - "learning_rate": 4.9488994567365386e-05, - "loss": 0.6576, + "epoch": 6.093797360227054, + "grad_norm": 3.5986814498901367, + "learning_rate": 3.5989413730053727e-05, + "loss": 0.4788, "step": 442300 }, { - "epoch": 4.51, - "learning_rate": 4.9483950912062925e-05, - "loss": 0.7054, + "epoch": 6.0951751122867925, + "grad_norm": 5.08997106552124, + "learning_rate": 3.598178942180219e-05, + "loss": 0.5027, "step": 442400 }, { - "epoch": 4.51, - "learning_rate": 4.947890630411154e-05, - "loss": 0.64, + "epoch": 6.096552864346532, + "grad_norm": 2.4945812225341797, + "learning_rate": 3.5974164392979255e-05, + "loss": 0.4899, "step": 442500 }, { - "epoch": 4.51, - "learning_rate": 4.9473860743757884e-05, - "loss": 0.6651, + "epoch": 6.097930616406272, + "grad_norm": 3.864180326461792, + "learning_rate": 3.596653864423269e-05, + "loss": 0.4705, "step": 442600 }, { - "epoch": 4.51, - "learning_rate": 4.946881423124865e-05, - "loss": 0.6082, + "epoch": 6.099308368466011, + "grad_norm": 3.879204511642456, + "learning_rate": 3.595891217621037e-05, + "loss": 0.5654, "step": 442700 }, { - "epoch": 4.51, - "learning_rate": 4.9463766766830575e-05, - "loss": 0.6219, + "epoch": 6.10068612052575, + "grad_norm": 4.761159420013428, + "learning_rate": 3.5951284989560165e-05, + "loss": 0.4777, "step": 442800 }, { - "epoch": 4.51, - "learning_rate": 4.945871835075047e-05, - "loss": 0.628, + "epoch": 6.102063872585489, + "grad_norm": 1.4389569759368896, + "learning_rate": 3.594365708493008e-05, + "loss": 0.4663, "step": 442900 }, { - "epoch": 4.51, - "learning_rate": 4.945366898325516e-05, - "loss": 0.6402, + "epoch": 6.103441624645229, + "grad_norm": 21.642112731933594, + "learning_rate": 3.5936028462968124e-05, + "loss": 0.5343, "step": 443000 }, { - "epoch": 4.51, - "learning_rate": 4.944861866459152e-05, - "loss": 0.666, + "epoch": 6.1048193767049685, + "grad_norm": 10.866023063659668, + "learning_rate": 3.592839912432238e-05, + "loss": 0.5379, "step": 443100 }, { - "epoch": 4.52, - "learning_rate": 4.94435673950065e-05, - "loss": 0.6585, + "epoch": 6.106197128764707, + "grad_norm": 6.113267421722412, + "learning_rate": 3.592076906964102e-05, + "loss": 0.5235, "step": 443200 }, { - "epoch": 4.52, - "learning_rate": 4.943851517474707e-05, - "loss": 0.7371, + "epoch": 6.107574880824447, + "grad_norm": 7.626857280731201, + "learning_rate": 3.591313829957225e-05, + "loss": 0.4854, "step": 443300 }, { - "epoch": 4.52, - "learning_rate": 4.9433462004060245e-05, - "loss": 0.7183, + "epoch": 6.108952632884186, + "grad_norm": 70.18658447265625, + "learning_rate": 3.590550681476433e-05, + "loss": 0.4979, "step": 443400 }, { - "epoch": 4.52, - "learning_rate": 4.942840788319311e-05, - "loss": 0.581, + "epoch": 6.110330384943926, + "grad_norm": 17.8839054107666, + "learning_rate": 3.589787461586562e-05, + "loss": 0.5445, "step": 443500 }, { - "epoch": 4.52, - "learning_rate": 4.9423352812392756e-05, - "loss": 0.7019, + "epoch": 6.1117081370036646, + "grad_norm": 37.61186218261719, + "learning_rate": 3.589024170352448e-05, + "loss": 0.4973, "step": 443600 }, { - "epoch": 4.52, - "learning_rate": 4.941829679190637e-05, - "loss": 0.7097, + "epoch": 6.113085889063404, + "grad_norm": 327.0575256347656, + "learning_rate": 3.588260807838939e-05, + "loss": 0.4814, "step": 443700 }, { - "epoch": 4.52, - "learning_rate": 4.9413290396379316e-05, - "loss": 0.7073, + "epoch": 6.114463641123144, + "grad_norm": 5.977665901184082, + "learning_rate": 3.5874973741108866e-05, + "loss": 0.5655, "step": 443800 }, { - "epoch": 4.52, - "learning_rate": 4.9408232486753216e-05, - "loss": 0.6426, + "epoch": 6.115841393182883, + "grad_norm": 1.4254932403564453, + "learning_rate": 3.586733869233147e-05, + "loss": 0.5399, "step": 443900 }, { - "epoch": 4.52, - "learning_rate": 4.940317362818037e-05, - "loss": 0.6822, + "epoch": 6.117219145242622, + "grad_norm": 8.30862045288086, + "learning_rate": 3.585970293270585e-05, + "loss": 0.4537, "step": 444000 }, { - "epoch": 4.52, - "learning_rate": 4.9398113820908126e-05, - "loss": 0.7162, + "epoch": 6.1185968973023614, + "grad_norm": 2.7500791549682617, + "learning_rate": 3.585206646288069e-05, + "loss": 0.5098, "step": 444100 }, { - "epoch": 4.53, - "learning_rate": 4.939305306518388e-05, - "loss": 0.6753, + "epoch": 6.119974649362101, + "grad_norm": 5.256394386291504, + "learning_rate": 3.5844429283504765e-05, + "loss": 0.5182, "step": 444200 }, { - "epoch": 4.53, - "learning_rate": 4.938804198298716e-05, - "loss": 0.5966, + "epoch": 6.121352401421841, + "grad_norm": 4.8288350105285645, + "learning_rate": 3.583679139522687e-05, + "loss": 0.4725, "step": 444300 }, { - "epoch": 4.53, - "learning_rate": 4.9382979340579616e-05, - "loss": 0.7108, + "epoch": 6.122730153481579, + "grad_norm": 26.06218910217285, + "learning_rate": 3.582915279869591e-05, + "loss": 0.4934, "step": 444400 }, { - "epoch": 4.53, - "learning_rate": 4.937791575046005e-05, - "loss": 0.6441, + "epoch": 6.124107905541319, + "grad_norm": 4.325410842895508, + "learning_rate": 3.58215134945608e-05, + "loss": 0.5636, "step": 444500 }, { - "epoch": 4.53, - "learning_rate": 4.937285121287605e-05, - "loss": 0.693, + "epoch": 6.125485657601058, + "grad_norm": 15.727971076965332, + "learning_rate": 3.581387348347054e-05, + "loss": 0.5729, "step": 444600 }, { - "epoch": 4.53, - "learning_rate": 4.9367785728075245e-05, - "loss": 0.7163, + "epoch": 6.126863409660798, + "grad_norm": 4.680234432220459, + "learning_rate": 3.58062327660742e-05, + "loss": 0.4681, "step": 444700 }, { - "epoch": 4.53, - "learning_rate": 4.936271929630529e-05, - "loss": 0.7184, + "epoch": 6.128241161720537, + "grad_norm": 3.360046863555908, + "learning_rate": 3.57985913430209e-05, + "loss": 0.5741, "step": 444800 }, { - "epoch": 4.53, - "learning_rate": 4.935765191781391e-05, - "loss": 0.6917, + "epoch": 6.129618913780276, + "grad_norm": 9.207067489624023, + "learning_rate": 3.5790949214959795e-05, + "loss": 0.5031, "step": 444900 }, { - "epoch": 4.53, - "learning_rate": 4.935258359284888e-05, - "loss": 0.6552, + "epoch": 6.130996665840016, + "grad_norm": 11.302955627441406, + "learning_rate": 3.578330638254015e-05, + "loss": 0.5874, "step": 445000 }, { - "epoch": 4.53, - "learning_rate": 4.9347514321658003e-05, - "loss": 0.7358, + "epoch": 6.132374417899754, + "grad_norm": 7.2201738357543945, + "learning_rate": 3.577566284641124e-05, + "loss": 0.4837, "step": 445100 }, { - "epoch": 4.54, - "learning_rate": 4.934244410448913e-05, - "loss": 0.7435, + "epoch": 6.133752169959494, + "grad_norm": 1.8146162033081055, + "learning_rate": 3.576801860722245e-05, + "loss": 0.5133, "step": 445200 }, { - "epoch": 4.54, - "learning_rate": 4.9337372941590166e-05, - "loss": 0.6113, + "epoch": 6.1351299220192335, + "grad_norm": 7.022237777709961, + "learning_rate": 3.5760373665623166e-05, + "loss": 0.4991, "step": 445300 }, { - "epoch": 4.54, - "learning_rate": 4.933230083320907e-05, - "loss": 0.7115, + "epoch": 6.136507674078973, + "grad_norm": 10.87583065032959, + "learning_rate": 3.5752728022262876e-05, + "loss": 0.5313, "step": 445400 }, { - "epoch": 4.54, - "learning_rate": 4.932722777959383e-05, - "loss": 0.6758, + "epoch": 6.137885426138712, + "grad_norm": 4.28066873550415, + "learning_rate": 3.5745081677791135e-05, + "loss": 0.5797, "step": 445500 }, { - "epoch": 4.54, - "learning_rate": 4.932215378099249e-05, - "loss": 0.7169, + "epoch": 6.139263178198451, + "grad_norm": 12.023751258850098, + "learning_rate": 3.5737511106772004e-05, + "loss": 0.5142, "step": 445600 }, { - "epoch": 4.54, - "learning_rate": 4.931707883765314e-05, - "loss": 0.6167, + "epoch": 6.140640930258191, + "grad_norm": 43.1213493347168, + "learning_rate": 3.5729863369021074e-05, + "loss": 0.5477, "step": 445700 }, { - "epoch": 4.54, - "learning_rate": 4.9312002949823916e-05, - "loss": 0.6903, + "epoch": 6.14201868231793, + "grad_norm": 3.95560884475708, + "learning_rate": 3.572221493210115e-05, + "loss": 0.5364, "step": 445800 }, { - "epoch": 4.54, - "learning_rate": 4.9306926117752994e-05, - "loss": 0.7334, + "epoch": 6.143396434377669, + "grad_norm": 7.4035444259643555, + "learning_rate": 3.5714642291471925e-05, + "loss": 0.5676, "step": 445900 }, { - "epoch": 4.54, - "learning_rate": 4.9301848341688606e-05, - "loss": 0.6055, + "epoch": 6.144774186437409, + "grad_norm": 86.10315704345703, + "learning_rate": 3.570699246513887e-05, + "loss": 0.5262, "step": 446000 }, { - "epoch": 4.54, - "learning_rate": 4.9296769621879014e-05, - "loss": 0.705, + "epoch": 6.146151938497148, + "grad_norm": 3.5248169898986816, + "learning_rate": 3.569941845026454e-05, + "loss": 0.4538, "step": 446100 }, { - "epoch": 4.55, - "learning_rate": 4.929168995857255e-05, - "loss": 0.7207, + "epoch": 6.147529690556888, + "grad_norm": 28.532821655273438, + "learning_rate": 3.569176723709198e-05, + "loss": 0.4902, "step": 446200 }, { - "epoch": 4.55, - "learning_rate": 4.928660935201756e-05, - "loss": 0.6678, + "epoch": 6.148907442616626, + "grad_norm": 34.63907241821289, + "learning_rate": 3.5684115327986885e-05, + "loss": 0.5433, "step": 446300 }, { - "epoch": 4.55, - "learning_rate": 4.928157862262508e-05, - "loss": 0.7134, + "epoch": 6.150285194676366, + "grad_norm": 1.4430421590805054, + "learning_rate": 3.567646272359932e-05, + "loss": 0.57, "step": 446400 }, { - "epoch": 4.55, - "learning_rate": 4.927649613974464e-05, - "loss": 0.696, + "epoch": 6.1516629467361055, + "grad_norm": 2.3084681034088135, + "learning_rate": 3.566880942457942e-05, + "loss": 0.4334, "step": 446500 }, { - "epoch": 4.55, - "learning_rate": 4.927141271435858e-05, - "loss": 0.7637, + "epoch": 6.153040698795845, + "grad_norm": 3.7998335361480713, + "learning_rate": 3.5661155431577375e-05, + "loss": 0.5677, "step": 446600 }, { - "epoch": 4.55, - "learning_rate": 4.9266328346715444e-05, - "loss": 0.7812, + "epoch": 6.154418450855584, + "grad_norm": 6.744919776916504, + "learning_rate": 3.56535772955366e-05, + "loss": 0.4305, "step": 446700 }, { - "epoch": 4.55, - "learning_rate": 4.9261243037063823e-05, - "loss": 0.617, + "epoch": 6.155796202915323, + "grad_norm": 109.22599029541016, + "learning_rate": 3.5645921923444644e-05, + "loss": 0.5926, "step": 446800 }, { - "epoch": 4.55, - "learning_rate": 4.925615678565236e-05, - "loss": 0.6841, + "epoch": 6.157173954975063, + "grad_norm": 46.428497314453125, + "learning_rate": 3.5638265859314944e-05, + "loss": 0.4879, "step": 446900 }, { - "epoch": 4.55, - "learning_rate": 4.925106959272976e-05, - "loss": 0.6576, + "epoch": 6.158551707034802, + "grad_norm": 10.422097206115723, + "learning_rate": 3.563060910379791e-05, + "loss": 0.5638, "step": 447000 }, { - "epoch": 4.56, - "learning_rate": 4.9245981458544725e-05, - "loss": 0.6342, + "epoch": 6.159929459094541, + "grad_norm": 46.52714538574219, + "learning_rate": 3.562295165754405e-05, + "loss": 0.551, "step": 447100 }, { - "epoch": 4.56, - "learning_rate": 4.924089238334606e-05, - "loss": 0.6584, + "epoch": 6.161307211154281, + "grad_norm": 32.69171142578125, + "learning_rate": 3.561529352120389e-05, + "loss": 0.5483, "step": 447200 }, { - "epoch": 4.56, - "learning_rate": 4.923580236738258e-05, - "loss": 0.58, + "epoch": 6.16268496321402, + "grad_norm": 8.378368377685547, + "learning_rate": 3.5607634695428016e-05, + "loss": 0.5729, "step": 447300 }, { - "epoch": 4.56, - "learning_rate": 4.923071141090316e-05, - "loss": 0.6756, + "epoch": 6.16406271527376, + "grad_norm": 68.1645736694336, + "learning_rate": 3.559997518086711e-05, + "loss": 0.5117, "step": 447400 }, { - "epoch": 4.56, - "learning_rate": 4.922561951415671e-05, - "loss": 0.7154, + "epoch": 6.165440467333498, + "grad_norm": 40.23468017578125, + "learning_rate": 3.559231497817187e-05, + "loss": 0.5395, "step": 447500 }, { - "epoch": 4.56, - "learning_rate": 4.9220526677392196e-05, - "loss": 0.76, + "epoch": 6.166818219393238, + "grad_norm": 11.316801071166992, + "learning_rate": 3.558465408799307e-05, + "loss": 0.5486, "step": 447600 }, { - "epoch": 4.56, - "learning_rate": 4.9215432900858636e-05, - "loss": 0.7186, + "epoch": 6.1681959714529775, + "grad_norm": 4.000333309173584, + "learning_rate": 3.557699251098154e-05, + "loss": 0.5308, "step": 447700 }, { - "epoch": 4.56, - "learning_rate": 4.921033818480507e-05, - "loss": 0.6713, + "epoch": 6.169573723512717, + "grad_norm": 11.885077476501465, + "learning_rate": 3.5569330247788186e-05, + "loss": 0.5615, "step": 447800 }, { - "epoch": 4.56, - "learning_rate": 4.9205242529480617e-05, - "loss": 0.6451, + "epoch": 6.170951475572456, + "grad_norm": 8.210104942321777, + "learning_rate": 3.5561667299063934e-05, + "loss": 0.5312, "step": 447900 }, { - "epoch": 4.56, - "learning_rate": 4.9200145935134416e-05, - "loss": 0.6712, + "epoch": 6.172329227632195, + "grad_norm": 2.7470293045043945, + "learning_rate": 3.555400366545981e-05, + "loss": 0.5195, "step": 448000 }, { - "epoch": 4.57, - "learning_rate": 4.919504840201566e-05, - "loss": 0.6011, + "epoch": 6.173706979691935, + "grad_norm": 5.2759108543396, + "learning_rate": 3.5546339347626864e-05, + "loss": 0.4933, "step": 448100 }, { - "epoch": 4.57, - "learning_rate": 4.918994993037357e-05, - "loss": 0.7085, + "epoch": 6.175084731751674, + "grad_norm": 17.13570213317871, + "learning_rate": 3.553867434621622e-05, + "loss": 0.5528, "step": 448200 }, { - "epoch": 4.57, - "learning_rate": 4.918485052045747e-05, - "loss": 0.6702, + "epoch": 6.176462483811413, + "grad_norm": 67.52412414550781, + "learning_rate": 3.553100866187908e-05, + "loss": 0.5231, "step": 448300 }, { - "epoch": 4.57, - "learning_rate": 4.9179750172516653e-05, - "loss": 0.6455, + "epoch": 6.177840235871153, + "grad_norm": 3.0787155628204346, + "learning_rate": 3.5523342295266653e-05, + "loss": 0.537, "step": 448400 }, { - "epoch": 4.57, - "learning_rate": 4.917464888680052e-05, - "loss": 0.7097, + "epoch": 6.179217987930892, + "grad_norm": 2.602759838104248, + "learning_rate": 3.551567524703026e-05, + "loss": 0.5399, "step": 448500 }, { - "epoch": 4.57, - "learning_rate": 4.9169546663558486e-05, - "loss": 0.7222, + "epoch": 6.180595739990631, + "grad_norm": 22.202714920043945, + "learning_rate": 3.550800751782125e-05, + "loss": 0.5407, "step": 448600 }, { - "epoch": 4.57, - "learning_rate": 4.916444350304001e-05, - "loss": 0.718, + "epoch": 6.18197349205037, + "grad_norm": 1.3842966556549072, + "learning_rate": 3.5500339108291024e-05, + "loss": 0.571, "step": 448700 }, { - "epoch": 4.57, - "learning_rate": 4.9159390451107544e-05, - "loss": 0.7327, + "epoch": 6.18335124411011, + "grad_norm": 11.04072380065918, + "learning_rate": 3.549267001909107e-05, + "loss": 0.5185, "step": 448800 }, { - "epoch": 4.57, - "learning_rate": 4.9154285426151326e-05, - "loss": 0.7654, + "epoch": 6.1847289961698495, + "grad_norm": 5.193368911743164, + "learning_rate": 3.5485000250872893e-05, + "loss": 0.5623, "step": 448900 }, { - "epoch": 4.57, - "learning_rate": 4.914917946466487e-05, - "loss": 0.847, + "epoch": 6.186106748229589, + "grad_norm": 1.3192013502120972, + "learning_rate": 3.547732980428811e-05, + "loss": 0.4875, "step": 449000 }, { - "epoch": 4.58, - "learning_rate": 4.914407256689779e-05, - "loss": 0.7199, + "epoch": 6.187484500289328, + "grad_norm": 9.564032554626465, + "learning_rate": 3.546965867998834e-05, + "loss": 0.5589, "step": 449100 }, { - "epoch": 4.58, - "learning_rate": 4.913896473309983e-05, - "loss": 0.623, + "epoch": 6.188862252349067, + "grad_norm": 1.0402576923370361, + "learning_rate": 3.546198687862529e-05, + "loss": 0.5751, "step": 449200 }, { - "epoch": 4.58, - "learning_rate": 4.913385596352071e-05, - "loss": 0.6899, + "epoch": 6.190240004408807, + "grad_norm": 17.56105613708496, + "learning_rate": 3.5454314400850735e-05, + "loss": 0.5944, "step": 449300 }, { - "epoch": 4.58, - "learning_rate": 4.9128746258410205e-05, - "loss": 0.7079, + "epoch": 6.1916177564685455, + "grad_norm": 4.507169246673584, + "learning_rate": 3.544664124731646e-05, + "loss": 0.5029, "step": 449400 }, { - "epoch": 4.58, - "learning_rate": 4.912363561801817e-05, - "loss": 0.749, + "epoch": 6.192995508528285, + "grad_norm": 7.020826816558838, + "learning_rate": 3.5438967418674366e-05, + "loss": 0.5375, "step": 449500 }, { - "epoch": 4.58, - "learning_rate": 4.911852404259448e-05, - "loss": 0.7002, + "epoch": 6.194373260588025, + "grad_norm": 10.615067481994629, + "learning_rate": 3.5431292915576384e-05, + "loss": 0.537, "step": 449600 }, { - "epoch": 4.58, - "learning_rate": 4.911346266211746e-05, - "loss": 0.7784, + "epoch": 6.195751012647764, + "grad_norm": 18.18393325805664, + "learning_rate": 3.542361773867447e-05, + "loss": 0.5349, "step": 449700 }, { - "epoch": 4.58, - "learning_rate": 4.910834922672435e-05, - "loss": 0.7385, + "epoch": 6.197128764707503, + "grad_norm": 12.330747604370117, + "learning_rate": 3.541594188862071e-05, + "loss": 0.4997, "step": 449800 }, { - "epoch": 4.58, - "learning_rate": 4.9103234857047005e-05, - "loss": 0.6373, + "epoch": 6.198506516767242, + "grad_norm": 8.2967529296875, + "learning_rate": 3.540826536606718e-05, + "loss": 0.5679, "step": 449900 }, { - "epoch": 4.58, - "learning_rate": 4.909811955333547e-05, - "loss": 0.6936, + "epoch": 6.199884268826982, + "grad_norm": 0.19264723360538483, + "learning_rate": 3.540058817166605e-05, + "loss": 0.5127, "step": 450000 }, { - "epoch": 4.59, - "learning_rate": 4.9093003315839864e-05, - "loss": 0.6123, + "epoch": 6.2012620208867215, + "grad_norm": 29.366424560546875, + "learning_rate": 3.539291030606954e-05, + "loss": 0.6451, "step": 450100 }, { - "epoch": 4.59, - "learning_rate": 4.908788614481033e-05, - "loss": 0.6282, + "epoch": 6.20263977294646, + "grad_norm": 11.469525337219238, + "learning_rate": 3.5385231769929916e-05, + "loss": 0.5369, "step": 450200 }, { - "epoch": 4.59, - "learning_rate": 4.908276804049708e-05, - "loss": 0.6454, + "epoch": 6.2040175250062, + "grad_norm": 4.140508651733398, + "learning_rate": 3.537755256389951e-05, + "loss": 0.5039, "step": 450300 }, { - "epoch": 4.59, - "learning_rate": 4.907764900315035e-05, - "loss": 0.7118, + "epoch": 6.205395277065939, + "grad_norm": 3.899909496307373, + "learning_rate": 3.536987268863072e-05, + "loss": 0.5257, "step": 450400 }, { - "epoch": 4.59, - "learning_rate": 4.9072529033020435e-05, - "loss": 0.615, + "epoch": 6.206773029125679, + "grad_norm": 7.393928050994873, + "learning_rate": 3.536219214477598e-05, + "loss": 0.5299, "step": 450500 }, { - "epoch": 4.59, - "learning_rate": 4.906740813035767e-05, - "loss": 0.7156, + "epoch": 6.2081507811854175, + "grad_norm": 4.502346992492676, + "learning_rate": 3.535451093298779e-05, + "loss": 0.5149, "step": 450600 }, { - "epoch": 4.59, - "learning_rate": 4.906228629541244e-05, - "loss": 0.7318, + "epoch": 6.209528533245157, + "grad_norm": 2.4106757640838623, + "learning_rate": 3.534682905391872e-05, + "loss": 0.6581, "step": 450700 }, { - "epoch": 4.59, - "learning_rate": 4.905716352843516e-05, - "loss": 0.7111, + "epoch": 6.210906285304897, + "grad_norm": 9.251667976379395, + "learning_rate": 3.5339146508221375e-05, + "loss": 0.5916, "step": 450800 }, { - "epoch": 4.59, - "learning_rate": 4.905203982967631e-05, - "loss": 0.6226, + "epoch": 6.212284037364636, + "grad_norm": 5.105830192565918, + "learning_rate": 3.533146329654843e-05, + "loss": 0.6143, "step": 450900 }, { - "epoch": 4.59, - "learning_rate": 4.904696645029957e-05, - "loss": 0.697, + "epoch": 6.213661789424375, + "grad_norm": 4.3433685302734375, + "learning_rate": 3.532377941955261e-05, + "loss": 0.5629, "step": 451000 }, { - "epoch": 4.6, - "learning_rate": 4.904184089804075e-05, - "loss": 0.6854, + "epoch": 6.215039541484114, + "grad_norm": 7.390102863311768, + "learning_rate": 3.53160948778867e-05, + "loss": 0.6408, "step": 451100 }, { - "epoch": 4.6, - "learning_rate": 4.903671441474955e-05, - "loss": 0.7058, + "epoch": 6.216417293543854, + "grad_norm": 6.294495105743408, + "learning_rate": 3.5308409672203545e-05, + "loss": 0.4777, "step": 451200 }, { - "epoch": 4.6, - "learning_rate": 4.9031587000676614e-05, - "loss": 0.6228, + "epoch": 6.2177950456035935, + "grad_norm": 14.88788890838623, + "learning_rate": 3.5300723803156046e-05, + "loss": 0.4797, "step": 451300 }, { - "epoch": 4.6, - "learning_rate": 4.902645865607265e-05, - "loss": 0.772, + "epoch": 6.219172797663332, + "grad_norm": 13.858607292175293, + "learning_rate": 3.529311413999301e-05, + "loss": 0.5567, "step": 451400 }, { - "epoch": 4.6, - "learning_rate": 4.9021329381188405e-05, - "loss": 0.6994, + "epoch": 6.220550549723072, + "grad_norm": 4.591149806976318, + "learning_rate": 3.5285426952793085e-05, + "loss": 0.5709, "step": 451500 }, { - "epoch": 4.6, - "learning_rate": 4.901619917627466e-05, - "loss": 0.7291, + "epoch": 6.221928301782811, + "grad_norm": 2.202214002609253, + "learning_rate": 3.5277739104181294e-05, + "loss": 0.5549, "step": 451600 }, { - "epoch": 4.6, - "learning_rate": 4.9011068041582265e-05, - "loss": 0.5663, + "epoch": 6.223306053842551, + "grad_norm": 6.87120246887207, + "learning_rate": 3.527005059481079e-05, + "loss": 0.5243, "step": 451700 }, { - "epoch": 4.6, - "learning_rate": 4.9005935977362085e-05, - "loss": 0.5662, + "epoch": 6.2246838059022895, + "grad_norm": 8.729130744934082, + "learning_rate": 3.526236142533473e-05, + "loss": 0.5646, "step": 451800 }, { - "epoch": 4.6, - "learning_rate": 4.900080298386507e-05, - "loss": 0.7569, + "epoch": 6.226061557962029, + "grad_norm": 10.625425338745117, + "learning_rate": 3.5254671596406354e-05, + "loss": 0.5251, "step": 451900 }, { - "epoch": 4.61, - "learning_rate": 4.899566906134218e-05, - "loss": 0.6922, + "epoch": 6.227439310021769, + "grad_norm": 2.1450891494750977, + "learning_rate": 3.524698110867896e-05, + "loss": 0.5895, "step": 452000 }, { - "epoch": 4.61, - "learning_rate": 4.8990534210044416e-05, - "loss": 0.595, + "epoch": 6.228817062081508, + "grad_norm": 5.816488742828369, + "learning_rate": 3.523928996280588e-05, + "loss": 0.5044, "step": 452100 }, { - "epoch": 4.61, - "learning_rate": 4.8985398430222875e-05, - "loss": 0.6483, + "epoch": 6.230194814141247, + "grad_norm": 4.585272789001465, + "learning_rate": 3.5231598159440526e-05, + "loss": 0.5171, "step": 452200 }, { - "epoch": 4.61, - "learning_rate": 4.898026172212864e-05, - "loss": 0.7641, + "epoch": 6.231572566200986, + "grad_norm": 6.558437347412109, + "learning_rate": 3.522390569923636e-05, + "loss": 0.5942, "step": 452300 }, { - "epoch": 4.61, - "learning_rate": 4.897512408601288e-05, - "loss": 0.7983, + "epoch": 6.232950318260726, + "grad_norm": 8.557476997375488, + "learning_rate": 3.521621258284688e-05, + "loss": 0.4391, "step": 452400 }, { - "epoch": 4.61, - "learning_rate": 4.896998552212679e-05, - "loss": 0.6291, + "epoch": 6.2343280703204655, + "grad_norm": 2.2243638038635254, + "learning_rate": 3.520851881092567e-05, + "loss": 0.6063, "step": 452500 }, { - "epoch": 4.61, - "learning_rate": 4.896484603072161e-05, - "loss": 0.7403, + "epoch": 6.235705822380204, + "grad_norm": 3.2788567543029785, + "learning_rate": 3.520082438412635e-05, + "loss": 0.5307, "step": 452600 }, { - "epoch": 4.61, - "learning_rate": 4.895970561204864e-05, - "loss": 0.6618, + "epoch": 6.237083574439944, + "grad_norm": 95.05602264404297, + "learning_rate": 3.51931293031026e-05, + "loss": 0.5932, "step": 452700 }, { - "epoch": 4.61, - "learning_rate": 4.8954564266359216e-05, - "loss": 0.638, + "epoch": 6.238461326499683, + "grad_norm": 6.088125705718994, + "learning_rate": 3.518543356850816e-05, + "loss": 0.5323, "step": 452800 }, { - "epoch": 4.61, - "learning_rate": 4.8949421993904706e-05, - "loss": 0.6913, + "epoch": 6.239839078559422, + "grad_norm": 3.636418581008911, + "learning_rate": 3.517773718099682e-05, + "loss": 0.5586, "step": 452900 }, { - "epoch": 4.62, - "learning_rate": 4.894427879493655e-05, - "loss": 0.608, + "epoch": 6.2412168306191615, + "grad_norm": 4.503596305847168, + "learning_rate": 3.517004014122243e-05, + "loss": 0.4682, "step": 453000 }, { - "epoch": 4.62, - "learning_rate": 4.8939134669706214e-05, - "loss": 0.7368, + "epoch": 6.242594582678901, + "grad_norm": 4.1418538093566895, + "learning_rate": 3.5162342449838885e-05, + "loss": 0.5484, "step": 453100 }, { - "epoch": 4.62, - "learning_rate": 4.893398961846521e-05, - "loss": 0.7245, + "epoch": 6.243972334738641, + "grad_norm": 5.79539680480957, + "learning_rate": 3.515464410750015e-05, + "loss": 0.6073, "step": 453200 }, { - "epoch": 4.62, - "learning_rate": 4.892884364146511e-05, - "loss": 0.5872, + "epoch": 6.24535008679838, + "grad_norm": 41.32741165161133, + "learning_rate": 3.514694511486024e-05, + "loss": 0.5863, "step": 453300 }, { - "epoch": 4.62, - "learning_rate": 4.8923696738957516e-05, - "loss": 0.6046, + "epoch": 6.246727838858119, + "grad_norm": 2.4046976566314697, + "learning_rate": 3.513924547257322e-05, + "loss": 0.484, "step": 453400 }, { - "epoch": 4.62, - "learning_rate": 4.891854891119408e-05, - "loss": 0.6054, + "epoch": 6.248105590917858, + "grad_norm": 3.5096065998077393, + "learning_rate": 3.513154518129321e-05, + "loss": 0.5985, "step": 453500 }, { - "epoch": 4.62, - "learning_rate": 4.891340015842651e-05, - "loss": 0.7208, + "epoch": 6.249483342977598, + "grad_norm": 2.9447455406188965, + "learning_rate": 3.5123844241674395e-05, + "loss": 0.5178, "step": 453600 }, { - "epoch": 4.62, - "learning_rate": 4.890825048090653e-05, - "loss": 0.711, + "epoch": 6.250861095037337, + "grad_norm": 1.9499253034591675, + "learning_rate": 3.511621967344794e-05, + "loss": 0.5323, "step": 453700 }, { - "epoch": 4.62, - "learning_rate": 4.890309987888595e-05, - "loss": 0.7249, + "epoch": 6.252238847097076, + "grad_norm": 4.988081932067871, + "learning_rate": 3.5108517445581334e-05, + "loss": 0.5229, "step": 453800 }, { - "epoch": 4.62, - "learning_rate": 4.889794835261659e-05, - "loss": 0.6263, + "epoch": 6.253616599156816, + "grad_norm": 13.288809776306152, + "learning_rate": 3.510081457133225e-05, + "loss": 0.5018, "step": 453900 }, { - "epoch": 4.63, - "learning_rate": 4.889279590235033e-05, - "loss": 0.58, + "epoch": 6.254994351216555, + "grad_norm": 2.431906223297119, + "learning_rate": 3.509311105135509e-05, + "loss": 0.5333, "step": 454000 }, { - "epoch": 4.63, - "learning_rate": 4.88876425283391e-05, - "loss": 0.666, + "epoch": 6.256372103276294, + "grad_norm": 2.840204954147339, + "learning_rate": 3.508540688630428e-05, + "loss": 0.4758, "step": 454100 }, { - "epoch": 4.63, - "learning_rate": 4.888248823083487e-05, - "loss": 0.6199, + "epoch": 6.257749855336034, + "grad_norm": 5.37916374206543, + "learning_rate": 3.507770207683436e-05, + "loss": 0.5634, "step": 454200 }, { - "epoch": 4.63, - "learning_rate": 4.8877333010089644e-05, - "loss": 0.5923, + "epoch": 6.259127607395773, + "grad_norm": 13.765035629272461, + "learning_rate": 3.506999662359988e-05, + "loss": 0.6393, "step": 454300 }, { - "epoch": 4.63, - "learning_rate": 4.8872176866355494e-05, - "loss": 0.8148, + "epoch": 6.260505359455513, + "grad_norm": 3.532738208770752, + "learning_rate": 3.506229052725547e-05, + "loss": 0.5693, "step": 454400 }, { - "epoch": 4.63, - "learning_rate": 4.8867019799884515e-05, - "loss": 0.6852, + "epoch": 6.261883111515251, + "grad_norm": 0.5706472992897034, + "learning_rate": 3.505458378845578e-05, + "loss": 0.5661, "step": 454500 }, { - "epoch": 4.63, - "learning_rate": 4.886186181092885e-05, - "loss": 0.726, + "epoch": 6.263260863574991, + "grad_norm": 34.04207229614258, + "learning_rate": 3.504687640785555e-05, + "loss": 0.5841, "step": 454600 }, { - "epoch": 4.63, - "learning_rate": 4.885670289974072e-05, - "loss": 0.7535, + "epoch": 6.2646386156347305, + "grad_norm": 6.770870685577393, + "learning_rate": 3.503916838610956e-05, + "loss": 0.5061, "step": 454700 }, { - "epoch": 4.63, - "learning_rate": 4.885154306657234e-05, - "loss": 0.6558, + "epoch": 6.26601636769447, + "grad_norm": 4.09011173248291, + "learning_rate": 3.503145972387265e-05, + "loss": 0.5853, "step": 454800 }, { - "epoch": 4.63, - "learning_rate": 4.8846382311676016e-05, - "loss": 0.7754, + "epoch": 6.267394119754209, + "grad_norm": 5.528512477874756, + "learning_rate": 3.50237504217997e-05, + "loss": 0.5813, "step": 454900 }, { - "epoch": 4.64, - "learning_rate": 4.884122063530405e-05, - "loss": 0.6463, + "epoch": 6.268771871813948, + "grad_norm": 5.804129600524902, + "learning_rate": 3.5016040480545665e-05, + "loss": 0.5565, "step": 455000 }, { - "epoch": 4.64, - "learning_rate": 4.8836058037708846e-05, - "loss": 0.6451, + "epoch": 6.270149623873688, + "grad_norm": 2.95120906829834, + "learning_rate": 3.5008329900765533e-05, + "loss": 0.567, "step": 455100 }, { - "epoch": 4.64, - "learning_rate": 4.8830894519142804e-05, - "loss": 0.5607, + "epoch": 6.271527375933427, + "grad_norm": 21.578901290893555, + "learning_rate": 3.5000618683114366e-05, + "loss": 0.4828, "step": 455200 }, { - "epoch": 4.64, - "learning_rate": 4.882573007985839e-05, - "loss": 0.6176, + "epoch": 6.272905127993166, + "grad_norm": 0.12653601169586182, + "learning_rate": 3.4992906828247266e-05, + "loss": 0.5381, "step": 455300 }, { - "epoch": 4.64, - "learning_rate": 4.8820564720108136e-05, - "loss": 0.6714, + "epoch": 6.274282880052906, + "grad_norm": 5.963935375213623, + "learning_rate": 3.498519433681941e-05, + "loss": 0.5394, "step": 455400 }, { - "epoch": 4.64, - "learning_rate": 4.8815398440144586e-05, - "loss": 0.6445, + "epoch": 6.275660632112645, + "grad_norm": 36.67667770385742, + "learning_rate": 3.497748120948599e-05, + "loss": 0.503, "step": 455500 }, { - "epoch": 4.64, - "learning_rate": 4.8810231240220326e-05, - "loss": 0.6231, + "epoch": 6.277038384172385, + "grad_norm": 4.799067497253418, + "learning_rate": 3.496976744690229e-05, + "loss": 0.5629, "step": 455600 }, { - "epoch": 4.64, - "learning_rate": 4.880506312058801e-05, - "loss": 0.7309, + "epoch": 6.278416136232123, + "grad_norm": 2.2575008869171143, + "learning_rate": 3.496205304972363e-05, + "loss": 0.583, "step": 455700 }, { - "epoch": 4.64, - "learning_rate": 4.879989408150033e-05, - "loss": 0.7339, + "epoch": 6.279793888291863, + "grad_norm": 4.8926897048950195, + "learning_rate": 3.495433801860538e-05, + "loss": 0.5549, "step": 455800 }, { - "epoch": 4.64, - "learning_rate": 4.879472412321003e-05, - "loss": 0.6953, + "epoch": 6.2811716403516025, + "grad_norm": 2.7254655361175537, + "learning_rate": 3.494662235420299e-05, + "loss": 0.5376, "step": 455900 }, { - "epoch": 4.65, - "learning_rate": 4.878955324596988e-05, - "loss": 0.6229, + "epoch": 6.282549392411342, + "grad_norm": 0.46621039509773254, + "learning_rate": 3.4938906057171934e-05, + "loss": 0.5434, "step": 456000 }, { - "epoch": 4.65, - "learning_rate": 4.878443317253879e-05, - "loss": 0.7008, + "epoch": 6.283927144471081, + "grad_norm": 14.060787200927734, + "learning_rate": 3.4931189128167745e-05, + "loss": 0.5894, "step": 456100 }, { - "epoch": 4.65, - "learning_rate": 4.877926046734065e-05, - "loss": 0.728, + "epoch": 6.28530489653082, + "grad_norm": 3.1941330432891846, + "learning_rate": 3.492347156784603e-05, + "loss": 0.689, "step": 456200 }, { - "epoch": 4.65, - "learning_rate": 4.877408684394874e-05, - "loss": 0.7246, + "epoch": 6.28668264859056, + "grad_norm": 38.39863586425781, + "learning_rate": 3.4915753376862414e-05, + "loss": 0.5464, "step": 456300 }, { - "epoch": 4.65, - "learning_rate": 4.8768912302616025e-05, - "loss": 0.6537, + "epoch": 6.288060400650299, + "grad_norm": 120.04096984863281, + "learning_rate": 3.490803455587262e-05, + "loss": 0.5847, "step": 456400 }, { - "epoch": 4.65, - "learning_rate": 4.8763736843595505e-05, - "loss": 0.7256, + "epoch": 6.289438152710038, + "grad_norm": 13.030728340148926, + "learning_rate": 3.4900315105532394e-05, + "loss": 0.5741, "step": 456500 }, { - "epoch": 4.65, - "learning_rate": 4.875856046714022e-05, - "loss": 0.7296, + "epoch": 6.290815904769778, + "grad_norm": 2.070629358291626, + "learning_rate": 3.489259502649753e-05, + "loss": 0.4826, "step": 456600 }, { - "epoch": 4.65, - "learning_rate": 4.875338317350328e-05, - "loss": 0.673, + "epoch": 6.292193656829517, + "grad_norm": 3.9621589183807373, + "learning_rate": 3.488487431942391e-05, + "loss": 0.5095, "step": 456700 }, { - "epoch": 4.65, - "learning_rate": 4.8748204962937825e-05, - "loss": 0.6348, + "epoch": 6.293571408889257, + "grad_norm": 92.44857788085938, + "learning_rate": 3.487715298496742e-05, + "loss": 0.4997, "step": 456800 }, { - "epoch": 4.65, - "learning_rate": 4.8743025835697016e-05, - "loss": 0.713, + "epoch": 6.294949160948995, + "grad_norm": 6.448740005493164, + "learning_rate": 3.4869431023784055e-05, + "loss": 0.507, "step": 456900 }, { - "epoch": 4.66, - "learning_rate": 4.8737845792034104e-05, - "loss": 0.5801, + "epoch": 6.296326913008735, + "grad_norm": 3.7643239498138428, + "learning_rate": 3.486170843652983e-05, + "loss": 0.5531, "step": 457000 }, { - "epoch": 4.66, - "learning_rate": 4.873266483220236e-05, - "loss": 0.6052, + "epoch": 6.2977046650684745, + "grad_norm": 1.9391893148422241, + "learning_rate": 3.485398522386079e-05, + "loss": 0.555, "step": 457100 }, { - "epoch": 4.66, - "learning_rate": 4.8727482956455084e-05, - "loss": 0.5799, + "epoch": 6.299082417128213, + "grad_norm": 8.732494354248047, + "learning_rate": 3.484626138643309e-05, + "loss": 0.5667, "step": 457200 }, { - "epoch": 4.66, - "learning_rate": 4.8722300165045663e-05, - "loss": 0.6046, + "epoch": 6.300460169187953, + "grad_norm": 4.021539688110352, + "learning_rate": 3.48385369249029e-05, + "loss": 0.6163, "step": 457300 }, { - "epoch": 4.66, - "learning_rate": 4.871711645822749e-05, - "loss": 0.7177, + "epoch": 6.301837921247692, + "grad_norm": 1.5873167514801025, + "learning_rate": 3.4830811839926465e-05, + "loss": 0.5065, "step": 457400 }, { - "epoch": 4.66, - "learning_rate": 4.8711931836254014e-05, - "loss": 0.7043, + "epoch": 6.303215673307432, + "grad_norm": 69.21484375, + "learning_rate": 3.4823086132160054e-05, + "loss": 0.5174, "step": 457500 }, { - "epoch": 4.66, - "learning_rate": 4.870674629937874e-05, - "loss": 0.637, + "epoch": 6.304593425367171, + "grad_norm": 7.337741851806641, + "learning_rate": 3.481535980226001e-05, + "loss": 0.5122, "step": 457600 }, { - "epoch": 4.66, - "learning_rate": 4.870155984785521e-05, - "loss": 0.6485, + "epoch": 6.30597117742691, + "grad_norm": 5.86789608001709, + "learning_rate": 3.4807632850882735e-05, + "loss": 0.557, "step": 457700 }, { - "epoch": 4.66, - "learning_rate": 4.869637248193701e-05, - "loss": 0.7934, + "epoch": 6.30734892948665, + "grad_norm": 3.5734307765960693, + "learning_rate": 3.479990527868466e-05, + "loss": 0.519, "step": 457800 }, { - "epoch": 4.67, - "learning_rate": 4.869118420187776e-05, - "loss": 0.6673, + "epoch": 6.308726681546389, + "grad_norm": 0.16667194664478302, + "learning_rate": 3.479217708632227e-05, + "loss": 0.4791, "step": 457900 }, { - "epoch": 4.67, - "learning_rate": 4.8685995007931156e-05, - "loss": 0.6962, + "epoch": 6.310104433606128, + "grad_norm": 7.866695404052734, + "learning_rate": 3.478444827445215e-05, + "loss": 0.5163, "step": 458000 }, { - "epoch": 4.67, - "learning_rate": 4.8680804900350914e-05, - "loss": 0.726, + "epoch": 6.311482185665867, + "grad_norm": 8.209754943847656, + "learning_rate": 3.477671884373087e-05, + "loss": 0.4649, "step": 458100 }, { - "epoch": 4.67, - "learning_rate": 4.867561387939078e-05, - "loss": 0.6348, + "epoch": 6.312859937725607, + "grad_norm": 2.742061138153076, + "learning_rate": 3.476898879481511e-05, + "loss": 0.5171, "step": 458200 }, { - "epoch": 4.67, - "learning_rate": 4.867042194530458e-05, - "loss": 0.6547, + "epoch": 6.3142376897853465, + "grad_norm": 3.355088472366333, + "learning_rate": 3.476125812836155e-05, + "loss": 0.5451, "step": 458300 }, { - "epoch": 4.67, - "learning_rate": 4.866522909834618e-05, - "loss": 0.7325, + "epoch": 6.315615441845085, + "grad_norm": 1.8031038045883179, + "learning_rate": 3.475352684502697e-05, + "loss": 0.6196, "step": 458400 }, { - "epoch": 4.67, - "learning_rate": 4.866003533876944e-05, - "loss": 0.6568, + "epoch": 6.316993193904825, + "grad_norm": 38.720279693603516, + "learning_rate": 3.474587226751191e-05, + "loss": 0.5181, "step": 458500 }, { - "epoch": 4.67, - "learning_rate": 4.865484066682835e-05, - "loss": 0.605, + "epoch": 6.318370945964564, + "grad_norm": 13.696335792541504, + "learning_rate": 3.473813975853819e-05, + "loss": 0.4956, "step": 458600 }, { - "epoch": 4.67, - "learning_rate": 4.8649645082776876e-05, - "loss": 0.7516, + "epoch": 6.319748698024304, + "grad_norm": 6.031965732574463, + "learning_rate": 3.4730406634647464e-05, + "loss": 0.4961, "step": 458700 }, { - "epoch": 4.67, - "learning_rate": 4.8644448586869043e-05, - "loss": 0.6796, + "epoch": 6.3211264500840425, + "grad_norm": 4.09878396987915, + "learning_rate": 3.47226728964967e-05, + "loss": 0.5304, "step": 458800 }, { - "epoch": 4.68, - "learning_rate": 4.863925117935895e-05, - "loss": 0.599, + "epoch": 6.322504202143782, + "grad_norm": 4.406709671020508, + "learning_rate": 3.4714938544742934e-05, + "loss": 0.5477, "step": 458900 }, { - "epoch": 4.68, - "learning_rate": 4.8634052860500694e-05, - "loss": 0.7571, + "epoch": 6.323881954203522, + "grad_norm": 15.194746971130371, + "learning_rate": 3.470728093272214e-05, + "loss": 0.5639, "step": 459000 }, { - "epoch": 4.68, - "learning_rate": 4.862885363054846e-05, - "loss": 0.6359, + "epoch": 6.325259706263261, + "grad_norm": 6.894687652587891, + "learning_rate": 3.469954536185324e-05, + "loss": 0.5385, "step": 459100 }, { - "epoch": 4.68, - "learning_rate": 4.862365348975646e-05, - "loss": 0.6363, + "epoch": 6.326637458323, + "grad_norm": 4.836931228637695, + "learning_rate": 3.469180917934614e-05, + "loss": 0.5193, "step": 459200 }, { - "epoch": 4.68, - "learning_rate": 4.861845243837894e-05, - "loss": 0.6438, + "epoch": 6.328015210382739, + "grad_norm": 37.81837844848633, + "learning_rate": 3.4684072385858045e-05, + "loss": 0.5059, "step": 459300 }, { - "epoch": 4.68, - "learning_rate": 4.861325047667021e-05, + "epoch": 6.329392962442479, + "grad_norm": 4.395566940307617, + "learning_rate": 3.467633498204625e-05, "loss": 0.6069, "step": 459400 }, { - "epoch": 4.68, - "learning_rate": 4.8608047604884606e-05, - "loss": 0.7083, + "epoch": 6.3307707145022185, + "grad_norm": 7.261143207550049, + "learning_rate": 3.466859696856809e-05, + "loss": 0.5075, "step": 459500 }, { - "epoch": 4.68, - "learning_rate": 4.860284382327652e-05, - "loss": 0.7087, + "epoch": 6.332148466561957, + "grad_norm": 2.6545705795288086, + "learning_rate": 3.4660858346080936e-05, + "loss": 0.5434, "step": 459600 }, { - "epoch": 4.68, - "learning_rate": 4.859763913210039e-05, - "loss": 0.6578, + "epoch": 6.333526218621697, + "grad_norm": 58.85383605957031, + "learning_rate": 3.465311911524224e-05, + "loss": 0.5759, "step": 459700 }, { - "epoch": 4.68, - "learning_rate": 4.859243353161069e-05, - "loss": 0.7035, + "epoch": 6.334903970681436, + "grad_norm": 14.57435131072998, + "learning_rate": 3.464537927670948e-05, + "loss": 0.5165, "step": 459800 }, { - "epoch": 4.69, - "learning_rate": 4.858722702206194e-05, - "loss": 0.6706, + "epoch": 6.336281722741176, + "grad_norm": 11.907734870910645, + "learning_rate": 3.463763883114018e-05, + "loss": 0.4545, "step": 459900 }, { - "epoch": 4.69, - "learning_rate": 4.8582019603708715e-05, - "loss": 0.608, + "epoch": 6.3376594748009145, + "grad_norm": 5.383899688720703, + "learning_rate": 3.462989777919197e-05, + "loss": 0.5175, "step": 460000 }, { - "epoch": 4.69, - "learning_rate": 4.857681127680562e-05, - "loss": 0.5823, + "epoch": 6.339037226860654, + "grad_norm": 3.570643663406372, + "learning_rate": 3.462215612152244e-05, + "loss": 0.471, "step": 460100 }, { - "epoch": 4.69, - "learning_rate": 4.857160204160732e-05, - "loss": 0.6734, + "epoch": 6.340414978920394, + "grad_norm": 4.963911056518555, + "learning_rate": 3.4614413858789325e-05, + "loss": 0.4757, "step": 460200 }, { - "epoch": 4.69, - "learning_rate": 4.856639189836851e-05, - "loss": 0.6129, + "epoch": 6.341792730980133, + "grad_norm": 14.90195369720459, + "learning_rate": 3.460667099165036e-05, + "loss": 0.5004, "step": 460300 }, { - "epoch": 4.69, - "learning_rate": 4.856118084734393e-05, - "loss": 0.6144, + "epoch": 6.343170483039872, + "grad_norm": 5.259658336639404, + "learning_rate": 3.459892752076333e-05, + "loss": 0.5133, "step": 460400 }, { - "epoch": 4.69, - "learning_rate": 4.855596888878838e-05, - "loss": 0.58, + "epoch": 6.344548235099611, + "grad_norm": 8.458319664001465, + "learning_rate": 3.459118344678609e-05, + "loss": 0.5886, "step": 460500 }, { - "epoch": 4.69, - "learning_rate": 4.855075602295668e-05, - "loss": 0.7306, + "epoch": 6.345925987159351, + "grad_norm": 13.110440254211426, + "learning_rate": 3.4583438770376536e-05, + "loss": 0.6334, "step": 460600 }, { - "epoch": 4.69, - "learning_rate": 4.8545542250103716e-05, - "loss": 0.6818, + "epoch": 6.3473037392190905, + "grad_norm": 20.464799880981445, + "learning_rate": 3.457569349219262e-05, + "loss": 0.6236, "step": 460700 }, { - "epoch": 4.69, - "learning_rate": 4.8540327570484416e-05, - "loss": 0.6426, + "epoch": 6.348681491278829, + "grad_norm": 7.437765598297119, + "learning_rate": 3.456794761289235e-05, + "loss": 0.5151, "step": 460800 }, { - "epoch": 4.7, - "learning_rate": 4.853511198435374e-05, - "loss": 0.7697, + "epoch": 6.350059243338569, + "grad_norm": 4.299627304077148, + "learning_rate": 3.456020113313376e-05, + "loss": 0.5655, "step": 460900 }, { - "epoch": 4.7, - "learning_rate": 4.8529895491966694e-05, - "loss": 0.6117, + "epoch": 6.351436995398308, + "grad_norm": 5.7341203689575195, + "learning_rate": 3.4552454053574975e-05, + "loss": 0.4883, "step": 461000 }, { - "epoch": 4.7, - "learning_rate": 4.852467809357834e-05, - "loss": 0.6062, + "epoch": 6.352814747458048, + "grad_norm": 9.073986053466797, + "learning_rate": 3.4544706374874134e-05, + "loss": 0.623, "step": 461100 }, { - "epoch": 4.7, - "learning_rate": 4.8519459789443777e-05, - "loss": 0.6913, + "epoch": 6.3541924995177865, + "grad_norm": 6.882385730743408, + "learning_rate": 3.453695809768944e-05, + "loss": 0.5603, "step": 461200 }, { - "epoch": 4.7, - "learning_rate": 4.8514240579818155e-05, - "loss": 0.5542, + "epoch": 6.355570251577526, + "grad_norm": 2.997835159301758, + "learning_rate": 3.452920922267917e-05, + "loss": 0.5211, "step": 461300 }, { - "epoch": 4.7, - "learning_rate": 4.850902046495665e-05, - "loss": 0.7515, + "epoch": 6.356948003637266, + "grad_norm": 23.735191345214844, + "learning_rate": 3.45214597505016e-05, + "loss": 0.5414, "step": 461400 }, { - "epoch": 4.7, - "learning_rate": 4.8503799445114505e-05, - "loss": 0.6467, + "epoch": 6.358325755697004, + "grad_norm": 1.2504786252975464, + "learning_rate": 3.451370968181511e-05, + "loss": 0.5107, "step": 461500 }, { - "epoch": 4.7, - "learning_rate": 4.8498577520546985e-05, - "loss": 0.7683, + "epoch": 6.359703507756744, + "grad_norm": 46.827781677246094, + "learning_rate": 3.4505959017278096e-05, + "loss": 0.5035, "step": 461600 }, { - "epoch": 4.7, - "learning_rate": 4.8493354691509424e-05, - "loss": 0.6353, + "epoch": 6.361081259816483, + "grad_norm": 4.429317951202393, + "learning_rate": 3.4498207757549026e-05, + "loss": 0.5114, "step": 461700 }, { - "epoch": 4.7, - "learning_rate": 4.848813095825718e-05, - "loss": 0.6467, + "epoch": 6.362459011876223, + "grad_norm": 9.95505142211914, + "learning_rate": 3.44904559032864e-05, + "loss": 0.5657, "step": 461800 }, { - "epoch": 4.71, - "learning_rate": 4.8482906321045666e-05, - "loss": 0.6308, + "epoch": 6.3638367639359625, + "grad_norm": 42.97420120239258, + "learning_rate": 3.4482703455148786e-05, + "loss": 0.5305, "step": 461900 }, { - "epoch": 4.71, - "learning_rate": 4.847768078013032e-05, - "loss": 0.7193, + "epoch": 6.365214515995701, + "grad_norm": 4.830787181854248, + "learning_rate": 3.4474950413794786e-05, + "loss": 0.5017, "step": 462000 }, { - "epoch": 4.71, - "learning_rate": 4.847245433576667e-05, - "loss": 0.6654, + "epoch": 6.366592268055441, + "grad_norm": 3.6229782104492188, + "learning_rate": 3.4467196779883065e-05, + "loss": 0.522, "step": 462100 }, { - "epoch": 4.71, - "learning_rate": 4.846722698821023e-05, - "loss": 0.6543, + "epoch": 6.36797002011518, + "grad_norm": 3.63191294670105, + "learning_rate": 3.445944255407233e-05, + "loss": 0.5688, "step": 462200 }, { - "epoch": 4.71, - "learning_rate": 4.84619987377166e-05, - "loss": 0.6524, + "epoch": 6.369347772174919, + "grad_norm": 20.11029624938965, + "learning_rate": 3.4451687737021364e-05, + "loss": 0.5197, "step": 462300 }, { - "epoch": 4.71, - "learning_rate": 4.845676958454141e-05, - "loss": 0.6761, + "epoch": 6.3707255242346585, + "grad_norm": 2.780252456665039, + "learning_rate": 3.4443932329388946e-05, + "loss": 0.4674, "step": 462400 }, { - "epoch": 4.71, - "learning_rate": 4.8451539528940324e-05, - "loss": 0.5824, + "epoch": 6.372103276294398, + "grad_norm": 9.972923278808594, + "learning_rate": 3.4436176331833954e-05, + "loss": 0.5934, "step": 462500 }, { - "epoch": 4.71, - "learning_rate": 4.84463608852117e-05, - "loss": 0.6291, + "epoch": 6.373481028354138, + "grad_norm": 3.0733118057250977, + "learning_rate": 3.44284197450153e-05, + "loss": 0.5463, "step": 462600 }, { - "epoch": 4.71, - "learning_rate": 4.844118135751424e-05, - "loss": 0.7199, + "epoch": 6.374858780413876, + "grad_norm": 12.761159896850586, + "learning_rate": 3.442066256959193e-05, + "loss": 0.5579, "step": 462700 }, { - "epoch": 4.72, - "learning_rate": 4.8435948614200645e-05, - "loss": 0.628, + "epoch": 6.376236532473616, + "grad_norm": 4.008204936981201, + "learning_rate": 3.441290480622289e-05, + "loss": 0.5283, "step": 462800 }, { - "epoch": 4.72, - "learning_rate": 4.8430714969479185e-05, - "loss": 0.605, + "epoch": 6.377614284533355, + "grad_norm": 3.52803373336792, + "learning_rate": 3.440522404197866e-05, + "loss": 0.5378, "step": 462900 }, { - "epoch": 4.72, - "learning_rate": 4.842548042360575e-05, - "loss": 0.6613, + "epoch": 6.378992036593095, + "grad_norm": 18.950180053710938, + "learning_rate": 3.4397465110558476e-05, + "loss": 0.5544, "step": 463000 }, { - "epoch": 4.72, - "learning_rate": 4.842024497683628e-05, - "loss": 0.6611, + "epoch": 6.380369788652834, + "grad_norm": 4.705127716064453, + "learning_rate": 3.438970559316334e-05, + "loss": 0.5723, "step": 463100 }, { - "epoch": 4.72, - "learning_rate": 4.841500862942676e-05, - "loss": 0.6552, + "epoch": 6.381747540712573, + "grad_norm": 54.34796905517578, + "learning_rate": 3.4381945490452466e-05, + "loss": 0.495, "step": 463200 }, { - "epoch": 4.72, - "learning_rate": 4.84097713816332e-05, - "loss": 0.6531, + "epoch": 6.383125292772313, + "grad_norm": 13.651488304138184, + "learning_rate": 3.437418480308512e-05, + "loss": 0.6149, "step": 463300 }, { - "epoch": 4.72, - "learning_rate": 4.84045332337117e-05, - "loss": 0.7, + "epoch": 6.384503044832052, + "grad_norm": 5.200097560882568, + "learning_rate": 3.436642353172061e-05, + "loss": 0.5014, "step": 463400 }, { - "epoch": 4.72, - "learning_rate": 4.839929418591835e-05, - "loss": 0.7042, + "epoch": 6.385880796891791, + "grad_norm": 5.135319709777832, + "learning_rate": 3.4358661677018276e-05, + "loss": 0.5211, "step": 463500 }, { - "epoch": 4.72, - "learning_rate": 4.839405423850931e-05, - "loss": 0.6734, + "epoch": 6.387258548951531, + "grad_norm": 6.770837783813477, + "learning_rate": 3.4350899239637554e-05, + "loss": 0.5376, "step": 463600 }, { - "epoch": 4.72, - "learning_rate": 4.83888133917408e-05, - "loss": 0.6635, + "epoch": 6.38863630101127, + "grad_norm": 3.2738966941833496, + "learning_rate": 3.4343136220237896e-05, + "loss": 0.4791, "step": 463700 }, { - "epoch": 4.73, - "learning_rate": 4.838357164586905e-05, - "loss": 0.6787, + "epoch": 6.39001405307101, + "grad_norm": 35.05431365966797, + "learning_rate": 3.43353726194788e-05, + "loss": 0.5216, "step": 463800 }, { - "epoch": 4.73, - "learning_rate": 4.8378329001150355e-05, - "loss": 0.6788, + "epoch": 6.391391805130748, + "grad_norm": 5.473015785217285, + "learning_rate": 3.4327608438019834e-05, + "loss": 0.5852, "step": 463900 }, { - "epoch": 4.73, - "learning_rate": 4.837308545784104e-05, - "loss": 0.7333, + "epoch": 6.392769557190488, + "grad_norm": 4.0664448738098145, + "learning_rate": 3.43198436765206e-05, + "loss": 0.568, "step": 464000 }, { - "epoch": 4.73, - "learning_rate": 4.836784101619749e-05, - "loss": 0.6748, + "epoch": 6.3941473092502275, + "grad_norm": 20.636764526367188, + "learning_rate": 3.4312078335640755e-05, + "loss": 0.464, "step": 464100 }, { - "epoch": 4.73, - "learning_rate": 4.836259567647614e-05, - "loss": 0.6543, + "epoch": 6.395525061309967, + "grad_norm": 4.607789039611816, + "learning_rate": 3.4304312416040014e-05, + "loss": 0.5298, "step": 464200 }, { - "epoch": 4.73, - "learning_rate": 4.835734943893344e-05, - "loss": 0.6279, + "epoch": 6.396902813369706, + "grad_norm": 4.129605770111084, + "learning_rate": 3.4296545918378115e-05, + "loss": 0.5272, "step": 464300 }, { - "epoch": 4.73, - "learning_rate": 4.835210230382589e-05, - "loss": 0.6538, + "epoch": 6.398280565429445, + "grad_norm": 11.238272666931152, + "learning_rate": 3.428877884331486e-05, + "loss": 0.4859, "step": 464400 }, { - "epoch": 4.73, - "learning_rate": 4.8346854271410075e-05, - "loss": 0.6638, + "epoch": 6.399658317489185, + "grad_norm": 2.492154836654663, + "learning_rate": 3.4281011191510134e-05, + "loss": 0.5288, "step": 464500 }, { - "epoch": 4.73, - "learning_rate": 4.834160534194257e-05, - "loss": 0.6734, + "epoch": 6.401036069548924, + "grad_norm": 4.2271599769592285, + "learning_rate": 3.427324296362379e-05, + "loss": 0.5255, "step": 464600 }, { - "epoch": 4.73, - "learning_rate": 4.833635551568002e-05, - "loss": 0.7441, + "epoch": 6.402413821608663, + "grad_norm": 17.49359130859375, + "learning_rate": 3.426547416031583e-05, + "loss": 0.5483, "step": 464700 }, { - "epoch": 4.74, - "learning_rate": 4.833110479287911e-05, - "loss": 0.6416, + "epoch": 6.403791573668403, + "grad_norm": 26.748050689697266, + "learning_rate": 3.4257704782246216e-05, + "loss": 0.4936, "step": 464800 }, { - "epoch": 4.74, - "learning_rate": 4.832585317379656e-05, - "loss": 0.7301, + "epoch": 6.405169325728142, + "grad_norm": 23.832195281982422, + "learning_rate": 3.4249934830075e-05, + "loss": 0.5243, "step": 464900 }, { - "epoch": 4.74, - "learning_rate": 4.832060065868916e-05, - "loss": 0.6981, + "epoch": 6.406547077787882, + "grad_norm": 3.791219711303711, + "learning_rate": 3.4242164304462304e-05, + "loss": 0.5899, "step": 465000 }, { - "epoch": 4.74, - "learning_rate": 4.831534724781373e-05, - "loss": 0.7026, + "epoch": 6.40792482984762, + "grad_norm": 3.6568827629089355, + "learning_rate": 3.423439320606824e-05, + "loss": 0.566, "step": 465100 }, { - "epoch": 4.74, - "learning_rate": 4.831009294142711e-05, - "loss": 0.6898, + "epoch": 6.40930258190736, + "grad_norm": 17.30653953552246, + "learning_rate": 3.4226621535553026e-05, + "loss": 0.5962, "step": 465200 }, { - "epoch": 4.74, - "learning_rate": 4.830483773978621e-05, - "loss": 0.7516, + "epoch": 6.4106803339670995, + "grad_norm": 10.36335563659668, + "learning_rate": 3.4218849293576916e-05, + "loss": 0.5667, "step": 465300 }, { - "epoch": 4.74, - "learning_rate": 4.829958164314799e-05, - "loss": 0.6559, + "epoch": 6.412058086026839, + "grad_norm": 2.072679042816162, + "learning_rate": 3.421107648080016e-05, + "loss": 0.5336, "step": 465400 }, { - "epoch": 4.74, - "learning_rate": 4.8294324651769425e-05, - "loss": 0.5516, + "epoch": 6.413435838086578, + "grad_norm": 3.226314067840576, + "learning_rate": 3.420330309788314e-05, + "loss": 0.5372, "step": 465500 }, { - "epoch": 4.74, - "learning_rate": 4.8289066765907566e-05, - "loss": 0.6529, + "epoch": 6.414813590146317, + "grad_norm": 50.06383514404297, + "learning_rate": 3.4195529145486224e-05, + "loss": 0.5435, "step": 465600 }, { - "epoch": 4.74, - "learning_rate": 4.8283807985819485e-05, - "loss": 0.7609, + "epoch": 6.416191342206057, + "grad_norm": 4.390735149383545, + "learning_rate": 3.418775462426985e-05, + "loss": 0.4922, "step": 465700 }, { - "epoch": 4.75, - "learning_rate": 4.827854831176231e-05, - "loss": 0.6185, + "epoch": 6.4175690942657955, + "grad_norm": 5.059786796569824, + "learning_rate": 3.4179979534894514e-05, + "loss": 0.557, "step": 465800 }, { - "epoch": 4.75, - "learning_rate": 4.827334035409392e-05, - "loss": 0.6474, + "epoch": 6.418946846325535, + "grad_norm": 3.7022817134857178, + "learning_rate": 3.417220387802073e-05, + "loss": 0.5162, "step": 465900 }, { - "epoch": 4.75, - "learning_rate": 4.826807890180336e-05, - "loss": 0.7141, + "epoch": 6.420324598385275, + "grad_norm": 4.939906597137451, + "learning_rate": 3.41644276543091e-05, + "loss": 0.4894, "step": 466000 }, { - "epoch": 4.75, - "learning_rate": 4.826281655631276e-05, - "loss": 0.6863, + "epoch": 6.421702350445014, + "grad_norm": 4.897161483764648, + "learning_rate": 3.4156650864420244e-05, + "loss": 0.5312, "step": 466100 }, { - "epoch": 4.75, - "learning_rate": 4.825755331787942e-05, - "loss": 0.7209, + "epoch": 6.423080102504754, + "grad_norm": 8.984227180480957, + "learning_rate": 3.414887350901485e-05, + "loss": 0.4259, "step": 466200 }, { - "epoch": 4.75, - "learning_rate": 4.825228918676068e-05, - "loss": 0.7162, + "epoch": 6.424457854564492, + "grad_norm": 1.7079793214797974, + "learning_rate": 3.41411733707501e-05, + "loss": 0.4979, "step": 466300 }, { - "epoch": 4.75, - "learning_rate": 4.824702416321391e-05, - "loss": 0.7492, + "epoch": 6.425835606624232, + "grad_norm": 5.8413615226745605, + "learning_rate": 3.4133394891932515e-05, + "loss": 0.5137, "step": 466400 }, { - "epoch": 4.75, - "learning_rate": 4.8241758247496564e-05, - "loss": 0.5179, + "epoch": 6.4272133586839715, + "grad_norm": 9.191996574401855, + "learning_rate": 3.412569364278504e-05, + "loss": 0.5454, "step": 466500 }, { - "epoch": 4.75, - "learning_rate": 4.8236491439866093e-05, - "loss": 0.682, + "epoch": 6.42859111074371, + "grad_norm": 3.10945463180542, + "learning_rate": 3.411791404317218e-05, + "loss": 0.5298, "step": 466600 }, { - "epoch": 4.75, - "learning_rate": 4.823122374058003e-05, - "loss": 0.6177, + "epoch": 6.42996886280345, + "grad_norm": 4.685968399047852, + "learning_rate": 3.411013388133367e-05, + "loss": 0.4721, "step": 466700 }, { - "epoch": 4.76, - "learning_rate": 4.822595514989591e-05, - "loss": 0.6371, + "epoch": 6.431346614863189, + "grad_norm": 103.82674407958984, + "learning_rate": 3.4102353157930464e-05, + "loss": 0.4824, "step": 466800 }, { - "epoch": 4.76, - "learning_rate": 4.822068566807136e-05, - "loss": 0.5733, + "epoch": 6.432724366922929, + "grad_norm": 14.167476654052734, + "learning_rate": 3.4094571873623586e-05, + "loss": 0.5477, "step": 466900 }, { - "epoch": 4.76, - "learning_rate": 4.821541529536402e-05, - "loss": 0.631, + "epoch": 6.4341021189826675, + "grad_norm": 10.491928100585938, + "learning_rate": 3.4086790029074094e-05, + "loss": 0.4711, "step": 467000 }, { - "epoch": 4.76, - "learning_rate": 4.821014403203156e-05, - "loss": 0.6588, + "epoch": 6.435479871042407, + "grad_norm": 6.281071662902832, + "learning_rate": 3.4079007624943086e-05, + "loss": 0.5018, "step": 467100 }, { - "epoch": 4.76, - "learning_rate": 4.820487187833175e-05, - "loss": 0.6983, + "epoch": 6.436857623102147, + "grad_norm": 14.812719345092773, + "learning_rate": 3.4071224661891724e-05, + "loss": 0.5254, "step": 467200 }, { - "epoch": 4.76, - "learning_rate": 4.819959883452233e-05, - "loss": 0.6345, + "epoch": 6.438235375161886, + "grad_norm": 1.3977235555648804, + "learning_rate": 3.4063441140581206e-05, + "loss": 0.5088, "step": 467300 }, { - "epoch": 4.76, - "learning_rate": 4.8194324900861145e-05, - "loss": 0.673, + "epoch": 6.439613127221625, + "grad_norm": 5.854102611541748, + "learning_rate": 3.4055657061672776e-05, + "loss": 0.4982, "step": 467400 }, { - "epoch": 4.76, - "learning_rate": 4.8189102830241245e-05, - "loss": 0.6055, + "epoch": 6.440990879281364, + "grad_norm": 13.613001823425293, + "learning_rate": 3.404787242582776e-05, + "loss": 0.5188, "step": 467500 }, { - "epoch": 4.76, - "learning_rate": 4.818382712654222e-05, - "loss": 0.6946, + "epoch": 6.442368631341104, + "grad_norm": 3.4949545860290527, + "learning_rate": 3.4040087233707473e-05, + "loss": 0.5313, "step": 467600 }, { - "epoch": 4.77, - "learning_rate": 4.817855053376258e-05, - "loss": 0.6567, + "epoch": 6.4437463834008435, + "grad_norm": 10.11595344543457, + "learning_rate": 3.4032301485973314e-05, + "loss": 0.4718, "step": 467700 }, { - "epoch": 4.77, - "learning_rate": 4.81732730521603e-05, - "loss": 0.6913, + "epoch": 6.445124135460582, + "grad_norm": 4.291743755340576, + "learning_rate": 3.402451518328673e-05, + "loss": 0.4898, "step": 467800 }, { - "epoch": 4.77, - "learning_rate": 4.816799468199341e-05, - "loss": 0.7095, + "epoch": 6.446501887520322, + "grad_norm": 4.582491397857666, + "learning_rate": 3.4016728326309196e-05, + "loss": 0.549, "step": 467900 }, { - "epoch": 4.77, - "learning_rate": 4.816271542352001e-05, - "loss": 0.6767, + "epoch": 6.447879639580061, + "grad_norm": 2.074584722518921, + "learning_rate": 3.4008940915702256e-05, + "loss": 0.4883, "step": 468000 }, { - "epoch": 4.77, - "learning_rate": 4.8157435276998215e-05, - "loss": 0.6345, + "epoch": 6.449257391639801, + "grad_norm": 5.900487899780273, + "learning_rate": 3.400115295212749e-05, + "loss": 0.5176, "step": 468100 }, { - "epoch": 4.77, - "learning_rate": 4.8152154242686185e-05, - "loss": 0.6577, + "epoch": 6.4506351436995395, + "grad_norm": 7.9630937576293945, + "learning_rate": 3.3993364436246505e-05, + "loss": 0.5398, "step": 468200 }, { - "epoch": 4.77, - "learning_rate": 4.814687232084215e-05, - "loss": 0.653, + "epoch": 6.452012895759279, + "grad_norm": 3.6541032791137695, + "learning_rate": 3.3985575368721005e-05, + "loss": 0.4832, "step": 468300 }, { - "epoch": 4.77, - "learning_rate": 4.814158951172434e-05, - "loss": 0.5682, + "epoch": 6.453390647819019, + "grad_norm": 3.677736759185791, + "learning_rate": 3.3977785750212695e-05, + "loss": 0.463, "step": 468400 }, { - "epoch": 4.77, - "learning_rate": 4.813630581559107e-05, - "loss": 0.7012, + "epoch": 6.454768399878758, + "grad_norm": 4.9105544090271, + "learning_rate": 3.396999558138334e-05, + "loss": 0.5095, "step": 468500 }, { - "epoch": 4.77, - "learning_rate": 4.813102123270068e-05, - "loss": 0.6532, + "epoch": 6.456146151938497, + "grad_norm": 9.214556694030762, + "learning_rate": 3.396220486289477e-05, + "loss": 0.5653, "step": 468600 }, { - "epoch": 4.78, - "learning_rate": 4.8125735763311555e-05, - "loss": 0.702, + "epoch": 6.457523903998236, + "grad_norm": 6.541526794433594, + "learning_rate": 3.395441359540883e-05, + "loss": 0.567, "step": 468700 }, { - "epoch": 4.78, - "learning_rate": 4.8120449407682116e-05, - "loss": 0.6736, + "epoch": 6.458901656057976, + "grad_norm": 6.857761383056641, + "learning_rate": 3.394662177958743e-05, + "loss": 0.488, "step": 468800 }, { - "epoch": 4.78, - "learning_rate": 4.811516216607084e-05, - "loss": 0.6852, + "epoch": 6.4602794081177155, + "grad_norm": 4.510544300079346, + "learning_rate": 3.3938829416092535e-05, + "loss": 0.5462, "step": 468900 }, { - "epoch": 4.78, - "learning_rate": 4.810987403873624e-05, - "loss": 0.6739, + "epoch": 6.461657160177454, + "grad_norm": 2.774104356765747, + "learning_rate": 3.393103650558614e-05, + "loss": 0.4651, "step": 469000 }, { - "epoch": 4.78, - "learning_rate": 4.810458502593687e-05, - "loss": 0.6579, + "epoch": 6.463034912237194, + "grad_norm": 7.511016845703125, + "learning_rate": 3.39232430487303e-05, + "loss": 0.5954, "step": 469100 }, { - "epoch": 4.78, - "learning_rate": 4.8099295127931345e-05, - "loss": 0.6383, + "epoch": 6.464412664296933, + "grad_norm": 2.5176055431365967, + "learning_rate": 3.3915449046187085e-05, + "loss": 0.4857, "step": 469200 }, { - "epoch": 4.78, - "learning_rate": 4.809400434497829e-05, - "loss": 0.6571, + "epoch": 6.465790416356673, + "grad_norm": 3.5668587684631348, + "learning_rate": 3.3907654498618665e-05, + "loss": 0.4752, "step": 469300 }, { - "epoch": 4.78, - "learning_rate": 4.808871267733641e-05, - "loss": 0.6376, + "epoch": 6.4671681684164115, + "grad_norm": 2.3010289669036865, + "learning_rate": 3.389985940668721e-05, + "loss": 0.508, "step": 469400 }, { - "epoch": 4.78, - "learning_rate": 4.808342012526442e-05, - "loss": 0.7185, + "epoch": 6.468545920476151, + "grad_norm": 585.8804321289062, + "learning_rate": 3.389206377105496e-05, + "loss": 0.525, "step": 469500 }, { - "epoch": 4.78, - "learning_rate": 4.80781266890211e-05, - "loss": 0.6646, + "epoch": 6.469923672535891, + "grad_norm": 2.168923854827881, + "learning_rate": 3.3884267592384194e-05, + "loss": 0.5275, "step": 469600 }, { - "epoch": 4.79, - "learning_rate": 4.807283236886527e-05, - "loss": 0.7991, + "epoch": 6.47130142459563, + "grad_norm": 6.4968438148498535, + "learning_rate": 3.387647087133723e-05, + "loss": 0.4872, "step": 469700 }, { - "epoch": 4.79, - "learning_rate": 4.8067537165055796e-05, - "loss": 0.6664, + "epoch": 6.472679176655369, + "grad_norm": 4.322395324707031, + "learning_rate": 3.386867360857644e-05, + "loss": 0.5246, "step": 469800 }, { - "epoch": 4.79, - "learning_rate": 4.806224107785158e-05, - "loss": 0.6859, + "epoch": 6.474056928715108, + "grad_norm": 1.8616739511489868, + "learning_rate": 3.3860875804764246e-05, + "loss": 0.5923, "step": 469900 }, { - "epoch": 4.79, - "learning_rate": 4.805694410751155e-05, - "loss": 0.6428, + "epoch": 6.475434680774848, + "grad_norm": 4.919229030609131, + "learning_rate": 3.385307746056311e-05, + "loss": 0.57, "step": 470000 }, { - "epoch": 4.79, - "learning_rate": 4.805169923719628e-05, - "loss": 0.7105, + "epoch": 6.476812432834587, + "grad_norm": 26.47780418395996, + "learning_rate": 3.3845278576635554e-05, + "loss": 0.5623, "step": 470100 }, { - "epoch": 4.79, - "learning_rate": 4.8046400510186573e-05, - "loss": 0.6545, + "epoch": 6.478190184894326, + "grad_norm": 15.909097671508789, + "learning_rate": 3.3837479153644104e-05, + "loss": 0.6046, "step": 470200 }, { - "epoch": 4.79, - "learning_rate": 4.804110090081558e-05, - "loss": 0.6723, + "epoch": 6.479567936954066, + "grad_norm": 4.691531658172607, + "learning_rate": 3.382967919225139e-05, + "loss": 0.4628, "step": 470300 }, { - "epoch": 4.79, - "learning_rate": 4.8035800409342406e-05, - "loss": 0.6281, + "epoch": 6.480945689013805, + "grad_norm": 17.47124481201172, + "learning_rate": 3.3821878693120036e-05, + "loss": 0.4563, "step": 470400 }, { - "epoch": 4.79, - "learning_rate": 4.803049903602622e-05, - "loss": 0.7299, + "epoch": 6.482323441073545, + "grad_norm": 3.935624599456787, + "learning_rate": 3.381407765691275e-05, + "loss": 0.5361, "step": 470500 }, { - "epoch": 4.79, - "learning_rate": 4.802524980803824e-05, - "loss": 0.7251, + "epoch": 6.4837011931332835, + "grad_norm": 2.7093863487243652, + "learning_rate": 3.380627608429226e-05, + "loss": 0.5025, "step": 470600 }, { - "epoch": 4.8, - "learning_rate": 4.8019946680625664e-05, - "loss": 0.718, + "epoch": 6.485078945193023, + "grad_norm": 4.365957736968994, + "learning_rate": 3.379847397592136e-05, + "loss": 0.4969, "step": 470700 }, { - "epoch": 4.8, - "learning_rate": 4.8014642672145236e-05, - "loss": 0.6282, + "epoch": 6.486456697252763, + "grad_norm": 0.8611201643943787, + "learning_rate": 3.3790671332462874e-05, + "loss": 0.4982, "step": 470800 }, { - "epoch": 4.8, - "learning_rate": 4.8009337782856286e-05, - "loss": 0.7017, + "epoch": 6.487834449312501, + "grad_norm": 5.220925331115723, + "learning_rate": 3.378286815457967e-05, + "loss": 0.535, "step": 470900 }, { - "epoch": 4.8, - "learning_rate": 4.8004032013018196e-05, - "loss": 0.7218, + "epoch": 6.489212201372241, + "grad_norm": 12.679610252380371, + "learning_rate": 3.3775064442934666e-05, + "loss": 0.4894, "step": 471000 }, { - "epoch": 4.8, - "learning_rate": 4.799872536289038e-05, - "loss": 0.5841, + "epoch": 6.49058995343198, + "grad_norm": 0.8816003203392029, + "learning_rate": 3.376726019819085e-05, + "loss": 0.4809, "step": 471100 }, { - "epoch": 4.8, - "learning_rate": 4.799341783273231e-05, - "loss": 0.6282, + "epoch": 6.49196770549172, + "grad_norm": 13.836054801940918, + "learning_rate": 3.375945542101121e-05, + "loss": 0.5055, "step": 471200 }, { - "epoch": 4.8, - "learning_rate": 4.7988109422803475e-05, - "loss": 0.7199, + "epoch": 6.493345457551459, + "grad_norm": 88.91038513183594, + "learning_rate": 3.375165011205881e-05, + "loss": 0.539, "step": 471300 }, { - "epoch": 4.8, - "learning_rate": 4.798280013336345e-05, - "loss": 0.7139, + "epoch": 6.494723209611198, + "grad_norm": 8.592035293579102, + "learning_rate": 3.3743844271996754e-05, + "loss": 0.5181, "step": 471400 }, { - "epoch": 4.8, - "learning_rate": 4.7977489964671806e-05, - "loss": 0.6759, + "epoch": 6.496100961670938, + "grad_norm": 1.4551607370376587, + "learning_rate": 3.373603790148817e-05, + "loss": 0.5676, "step": 471500 }, { - "epoch": 4.8, - "learning_rate": 4.797217891698819e-05, - "loss": 0.6639, + "epoch": 6.497478713730677, + "grad_norm": 2.1242740154266357, + "learning_rate": 3.372823100119628e-05, + "loss": 0.5473, "step": 471600 }, { - "epoch": 4.81, - "learning_rate": 4.7966866990572286e-05, - "loss": 0.7671, + "epoch": 6.498856465790416, + "grad_norm": 3.7006642818450928, + "learning_rate": 3.372042357178429e-05, + "loss": 0.5132, "step": 471700 }, { - "epoch": 4.81, - "learning_rate": 4.796155418568379e-05, - "loss": 0.7142, + "epoch": 6.5002342178501555, + "grad_norm": 3.8233206272125244, + "learning_rate": 3.37126156139155e-05, + "loss": 0.6044, "step": 471800 }, { - "epoch": 4.81, - "learning_rate": 4.795624050258249e-05, - "loss": 0.7303, + "epoch": 6.501611969909895, + "grad_norm": 7.363094329833984, + "learning_rate": 3.370496330313456e-05, + "loss": 0.4758, "step": 471900 }, { - "epoch": 4.81, - "learning_rate": 4.79509259415282e-05, - "loss": 0.7087, + "epoch": 6.502989721969635, + "grad_norm": 4.851779937744141, + "learning_rate": 3.369715430087828e-05, + "loss": 0.5753, "step": 472000 }, { - "epoch": 4.81, - "learning_rate": 4.794561050278073e-05, - "loss": 0.6714, + "epoch": 6.504367474029373, + "grad_norm": 7.83793830871582, + "learning_rate": 3.3689344772142045e-05, + "loss": 0.5238, "step": 472100 }, { - "epoch": 4.81, - "learning_rate": 4.7940294186600026e-05, - "loss": 0.6853, + "epoch": 6.505745226089113, + "grad_norm": 2.358006477355957, + "learning_rate": 3.36815347175893e-05, + "loss": 0.5111, "step": 472200 }, { - "epoch": 4.81, - "learning_rate": 4.793497699324598e-05, - "loss": 0.6369, + "epoch": 6.507122978148852, + "grad_norm": 37.04079055786133, + "learning_rate": 3.3673724137883555e-05, + "loss": 0.4996, "step": 472300 }, { - "epoch": 4.81, - "learning_rate": 4.79296589229786e-05, - "loss": 0.6476, + "epoch": 6.508500730208592, + "grad_norm": 13.890275955200195, + "learning_rate": 3.3665913033688365e-05, + "loss": 0.5195, "step": 472400 }, { - "epoch": 4.81, - "learning_rate": 4.792433997605789e-05, - "loss": 0.6402, + "epoch": 6.509878482268331, + "grad_norm": 4.729398727416992, + "learning_rate": 3.365810140566731e-05, + "loss": 0.4996, "step": 472500 }, { - "epoch": 4.81, - "learning_rate": 4.791902015274393e-05, - "loss": 0.6429, + "epoch": 6.51125623432807, + "grad_norm": 1.6167852878570557, + "learning_rate": 3.3650289254484044e-05, + "loss": 0.5998, "step": 472600 }, { - "epoch": 4.82, - "learning_rate": 4.7913699453296816e-05, - "loss": 0.6589, + "epoch": 6.51263398638781, + "grad_norm": 21.66167449951172, + "learning_rate": 3.364247658080224e-05, + "loss": 0.5722, "step": 472700 }, { - "epoch": 4.82, - "learning_rate": 4.7908377877976695e-05, - "loss": 0.6191, + "epoch": 6.514011738447549, + "grad_norm": 5.861439228057861, + "learning_rate": 3.363466338528562e-05, + "loss": 0.5368, "step": 472800 }, { - "epoch": 4.82, - "learning_rate": 4.790310865588653e-05, - "loss": 0.6958, + "epoch": 6.515389490507288, + "grad_norm": 10.542923927307129, + "learning_rate": 3.3626849668597964e-05, + "loss": 0.5995, "step": 472900 }, { - "epoch": 4.82, - "learning_rate": 4.789778533835329e-05, - "loss": 0.5885, + "epoch": 6.5167672425670276, + "grad_norm": 8.882357597351074, + "learning_rate": 3.361903543140309e-05, + "loss": 0.5322, "step": 473000 }, { - "epoch": 4.82, - "learning_rate": 4.7892461145725137e-05, - "loss": 0.643, + "epoch": 6.518144994626767, + "grad_norm": 6.442994117736816, + "learning_rate": 3.361122067436484e-05, + "loss": 0.5019, "step": 473100 }, { - "epoch": 4.82, - "learning_rate": 4.788713607826241e-05, - "loss": 0.6939, + "epoch": 6.519522746686507, + "grad_norm": 1.5847370624542236, + "learning_rate": 3.360340539814714e-05, + "loss": 0.4784, "step": 473200 }, { - "epoch": 4.82, - "learning_rate": 4.788181013622549e-05, - "loss": 0.6922, + "epoch": 6.520900498746245, + "grad_norm": 3.729393482208252, + "learning_rate": 3.359558960341392e-05, + "loss": 0.5788, "step": 473300 }, { - "epoch": 4.82, - "learning_rate": 4.7876483319874744e-05, - "loss": 0.5744, + "epoch": 6.522278250805985, + "grad_norm": 22.281179428100586, + "learning_rate": 3.358777329082918e-05, + "loss": 0.4523, "step": 473400 }, { - "epoch": 4.82, - "learning_rate": 4.787115562947066e-05, - "loss": 0.7081, + "epoch": 6.5236560028657244, + "grad_norm": 10.255950927734375, + "learning_rate": 3.3579956461056954e-05, + "loss": 0.5112, "step": 473500 }, { - "epoch": 4.83, - "learning_rate": 4.786582706527371e-05, - "loss": 0.6653, + "epoch": 6.525033754925463, + "grad_norm": 21.808143615722656, + "learning_rate": 3.357213911476132e-05, + "loss": 0.5197, "step": 473600 }, { - "epoch": 4.83, - "learning_rate": 4.7860497627544436e-05, - "loss": 0.6459, + "epoch": 6.526411506985203, + "grad_norm": 18.429672241210938, + "learning_rate": 3.3564321252606405e-05, + "loss": 0.4994, "step": 473700 }, { - "epoch": 4.83, - "learning_rate": 4.785516731654342e-05, - "loss": 0.6879, + "epoch": 6.527789259044942, + "grad_norm": 4.4939470291137695, + "learning_rate": 3.355650287525637e-05, + "loss": 0.4903, "step": 473800 }, { - "epoch": 4.83, - "learning_rate": 4.784983613253126e-05, - "loss": 0.6847, + "epoch": 6.529167011104682, + "grad_norm": 6.932443618774414, + "learning_rate": 3.354868398337544e-05, + "loss": 0.4575, "step": 473900 }, { - "epoch": 4.83, - "learning_rate": 4.784450407576865e-05, - "loss": 0.6414, + "epoch": 6.530544763164421, + "grad_norm": 6.450246334075928, + "learning_rate": 3.354086457762786e-05, + "loss": 0.5593, "step": 474000 }, { - "epoch": 4.83, - "learning_rate": 4.783917114651628e-05, - "loss": 0.7259, + "epoch": 6.53192251522416, + "grad_norm": 5.7159318923950195, + "learning_rate": 3.3533044658677925e-05, + "loss": 0.4866, "step": 474100 }, { - "epoch": 4.83, - "learning_rate": 4.78338373450349e-05, - "loss": 0.6421, + "epoch": 6.5333002672839, + "grad_norm": 2.7029058933258057, + "learning_rate": 3.3525224227189986e-05, + "loss": 0.4919, "step": 474200 }, { - "epoch": 4.83, - "learning_rate": 4.78285026715853e-05, - "loss": 0.672, + "epoch": 6.534678019343639, + "grad_norm": 3.650818109512329, + "learning_rate": 3.3517403283828415e-05, + "loss": 0.5296, "step": 474300 }, { - "epoch": 4.83, - "learning_rate": 4.78231671264283e-05, - "loss": 0.6649, + "epoch": 6.536055771403378, + "grad_norm": 9.655908584594727, + "learning_rate": 3.350958182925766e-05, + "loss": 0.5935, "step": 474400 }, { - "epoch": 4.83, - "learning_rate": 4.781783070982481e-05, - "loss": 0.6962, + "epoch": 6.537433523463117, + "grad_norm": 53.43095397949219, + "learning_rate": 3.350175986414218e-05, + "loss": 0.4687, "step": 474500 }, { - "epoch": 4.84, - "learning_rate": 4.781249342203572e-05, - "loss": 0.6613, + "epoch": 6.538811275522857, + "grad_norm": 4.060654640197754, + "learning_rate": 3.349393738914649e-05, + "loss": 0.5496, "step": 474600 }, { - "epoch": 4.84, - "learning_rate": 4.780715526332201e-05, - "loss": 0.5756, + "epoch": 6.5401890275825965, + "grad_norm": 3.6551802158355713, + "learning_rate": 3.3486114404935155e-05, + "loss": 0.5736, "step": 474700 }, { - "epoch": 4.84, - "learning_rate": 4.7801816233944656e-05, - "loss": 0.5518, + "epoch": 6.541566779642336, + "grad_norm": 21.855300903320312, + "learning_rate": 3.3478290912172776e-05, + "loss": 0.5015, "step": 474800 }, { - "epoch": 4.84, - "learning_rate": 4.779647633416474e-05, - "loss": 0.718, + "epoch": 6.542944531702075, + "grad_norm": 2.1812844276428223, + "learning_rate": 3.3470466911524014e-05, + "loss": 0.5123, "step": 474900 }, { - "epoch": 4.84, - "learning_rate": 4.7791135564243333e-05, - "loss": 0.6741, + "epoch": 6.544322283761814, + "grad_norm": 1.5420588254928589, + "learning_rate": 3.3462642403653526e-05, + "loss": 0.4814, "step": 475000 }, { - "epoch": 4.84, - "learning_rate": 4.778579392444156e-05, - "loss": 0.5952, + "epoch": 6.545700035821554, + "grad_norm": 12.950900077819824, + "learning_rate": 3.345481738922606e-05, + "loss": 0.5513, "step": 475100 }, { - "epoch": 4.84, - "learning_rate": 4.778045141502062e-05, - "loss": 0.6397, + "epoch": 6.5470777878812925, + "grad_norm": 47.913818359375, + "learning_rate": 3.344699186890641e-05, + "loss": 0.4856, "step": 475200 }, { - "epoch": 4.84, - "learning_rate": 4.777510803624169e-05, - "loss": 0.6106, + "epoch": 6.548455539941032, + "grad_norm": 9.541970252990723, + "learning_rate": 3.343916584335936e-05, + "loss": 0.5612, "step": 475300 }, { - "epoch": 4.84, - "learning_rate": 4.776976378836607e-05, - "loss": 0.7089, + "epoch": 6.549833292000772, + "grad_norm": 1.2268084287643433, + "learning_rate": 3.343133931324979e-05, + "loss": 0.5352, "step": 475400 }, { - "epoch": 4.84, - "learning_rate": 4.7764418671655056e-05, - "loss": 0.6893, + "epoch": 6.551211044060511, + "grad_norm": 20.780214309692383, + "learning_rate": 3.3423512279242614e-05, + "loss": 0.4917, "step": 475500 }, { - "epoch": 4.85, - "learning_rate": 4.775907268636996e-05, - "loss": 0.6144, + "epoch": 6.552588796120251, + "grad_norm": 1.8556984663009644, + "learning_rate": 3.341568474200274e-05, + "loss": 0.529, "step": 475600 }, { - "epoch": 4.85, - "learning_rate": 4.7753725832772204e-05, - "loss": 0.7305, + "epoch": 6.553966548179989, + "grad_norm": 2.5443620681762695, + "learning_rate": 3.340785670219521e-05, + "loss": 0.5013, "step": 475700 }, { - "epoch": 4.85, - "learning_rate": 4.774837811112319e-05, - "loss": 0.6723, + "epoch": 6.555344300239729, + "grad_norm": 5.767784118652344, + "learning_rate": 3.340002816048501e-05, + "loss": 0.592, "step": 475800 }, { - "epoch": 4.85, - "learning_rate": 4.7743029521684414e-05, - "loss": 0.7739, + "epoch": 6.5567220522994685, + "grad_norm": 5.811985969543457, + "learning_rate": 3.3392199117537245e-05, + "loss": 0.5086, "step": 475900 }, { - "epoch": 4.85, - "learning_rate": 4.773768006471738e-05, - "loss": 0.6641, + "epoch": 6.558099804359207, + "grad_norm": 39.33713150024414, + "learning_rate": 3.338436957401702e-05, + "loss": 0.4897, "step": 476000 }, { - "epoch": 4.85, - "learning_rate": 4.7732329740483646e-05, - "loss": 0.7364, + "epoch": 6.559477556418947, + "grad_norm": 5.036971092224121, + "learning_rate": 3.337653953058948e-05, + "loss": 0.4779, "step": 476100 }, { - "epoch": 4.85, - "learning_rate": 4.7726978549244805e-05, - "loss": 0.5834, + "epoch": 6.560855308478686, + "grad_norm": 7.764959812164307, + "learning_rate": 3.336870898791985e-05, + "loss": 0.5157, "step": 476200 }, { - "epoch": 4.85, - "learning_rate": 4.772162649126249e-05, - "loss": 0.6768, + "epoch": 6.562233060538426, + "grad_norm": 13.46796703338623, + "learning_rate": 3.3360877946673375e-05, + "loss": 0.5134, "step": 476300 }, { - "epoch": 4.85, - "learning_rate": 4.771627356679841e-05, - "loss": 0.7498, + "epoch": 6.5636108125981645, + "grad_norm": 2.08974289894104, + "learning_rate": 3.335304640751533e-05, + "loss": 0.4405, "step": 476400 }, { - "epoch": 4.85, - "learning_rate": 4.771091977611428e-05, - "loss": 0.6843, + "epoch": 6.564988564657904, + "grad_norm": 82.97801208496094, + "learning_rate": 3.334521437111105e-05, + "loss": 0.4729, "step": 476500 }, { - "epoch": 4.86, - "learning_rate": 4.770556511947186e-05, - "loss": 0.5953, + "epoch": 6.566366316717644, + "grad_norm": 21.792648315429688, + "learning_rate": 3.33373818381259e-05, + "loss": 0.5104, "step": 476600 }, { - "epoch": 4.86, - "learning_rate": 4.7700209597132965e-05, - "loss": 0.6419, + "epoch": 6.567744068777383, + "grad_norm": 2.1306300163269043, + "learning_rate": 3.332954880922531e-05, + "loss": 0.616, "step": 476700 }, { - "epoch": 4.86, - "learning_rate": 4.7694853209359454e-05, - "loss": 0.6724, + "epoch": 6.569121820837122, + "grad_norm": 3.446024179458618, + "learning_rate": 3.3321715285074716e-05, + "loss": 0.4872, "step": 476800 }, { - "epoch": 4.86, - "learning_rate": 4.7689495956413216e-05, - "loss": 0.6993, + "epoch": 6.570499572896861, + "grad_norm": 4.544395446777344, + "learning_rate": 3.3313881266339625e-05, + "loss": 0.4667, "step": 476900 }, { - "epoch": 4.86, - "learning_rate": 4.7684137838556195e-05, - "loss": 0.6677, + "epoch": 6.571877324956601, + "grad_norm": 6.018442153930664, + "learning_rate": 3.330604675368558e-05, + "loss": 0.4431, "step": 477000 }, { - "epoch": 4.86, - "learning_rate": 4.7678778856050367e-05, - "loss": 0.7117, + "epoch": 6.5732550770163405, + "grad_norm": 8.804194450378418, + "learning_rate": 3.329821174777816e-05, + "loss": 0.4841, "step": 477100 }, { - "epoch": 4.86, - "learning_rate": 4.767341900915775e-05, - "loss": 0.5953, + "epoch": 6.574632829076079, + "grad_norm": 2.9686954021453857, + "learning_rate": 3.3290376249282986e-05, + "loss": 0.5417, "step": 477200 }, { - "epoch": 4.86, - "learning_rate": 4.766805829814041e-05, - "loss": 0.7442, + "epoch": 6.576010581135819, + "grad_norm": 15.990909576416016, + "learning_rate": 3.3282540258865725e-05, + "loss": 0.4967, "step": 477300 }, { - "epoch": 4.86, - "learning_rate": 4.7662696723260464e-05, - "loss": 0.6792, + "epoch": 6.577388333195558, + "grad_norm": 4.4878034591674805, + "learning_rate": 3.327470377719209e-05, + "loss": 0.6021, "step": 477400 }, { - "epoch": 4.86, - "learning_rate": 4.7657334284780054e-05, - "loss": 0.683, + "epoch": 6.578766085255298, + "grad_norm": 3.6620984077453613, + "learning_rate": 3.3266866804927826e-05, + "loss": 0.5129, "step": 477500 }, { - "epoch": 4.87, - "learning_rate": 4.765197098296136e-05, - "loss": 0.6616, + "epoch": 6.5801438373150365, + "grad_norm": 14.283812522888184, + "learning_rate": 3.3259029342738716e-05, + "loss": 0.5296, "step": 477600 }, { - "epoch": 4.87, - "learning_rate": 4.764660681806664e-05, - "loss": 0.6808, + "epoch": 6.581521589374776, + "grad_norm": 4.429503917694092, + "learning_rate": 3.325119139129061e-05, + "loss": 0.5294, "step": 477700 }, { - "epoch": 4.87, - "learning_rate": 4.7641241790358155e-05, - "loss": 0.6228, + "epoch": 6.582899341434516, + "grad_norm": 39.468849182128906, + "learning_rate": 3.324335295124937e-05, + "loss": 0.5457, "step": 477800 }, { - "epoch": 4.87, - "learning_rate": 4.763592956326958e-05, - "loss": 0.6211, + "epoch": 6.584277093494255, + "grad_norm": 4.502475738525391, + "learning_rate": 3.3235514023280906e-05, + "loss": 0.5436, "step": 477900 }, { - "epoch": 4.87, - "learning_rate": 4.7630562819342146e-05, - "loss": 0.6092, + "epoch": 6.585654845553994, + "grad_norm": 33.94144058227539, + "learning_rate": 3.3227674608051196e-05, + "loss": 0.5207, "step": 478000 }, { - "epoch": 4.87, - "learning_rate": 4.762519521338541e-05, - "loss": 0.6329, + "epoch": 6.587032597613733, + "grad_norm": 7.2556471824646, + "learning_rate": 3.321983470622622e-05, + "loss": 0.5627, "step": 478100 }, { - "epoch": 4.87, - "learning_rate": 4.761982674566181e-05, - "loss": 0.7091, + "epoch": 6.588410349673473, + "grad_norm": 18.895158767700195, + "learning_rate": 3.321199431847201e-05, + "loss": 0.5371, "step": 478200 }, { - "epoch": 4.87, - "learning_rate": 4.761445741643383e-05, - "loss": 0.7075, + "epoch": 6.5897881017332125, + "grad_norm": 1.9143149852752686, + "learning_rate": 3.320415344545468e-05, + "loss": 0.4969, "step": 478300 }, { - "epoch": 4.87, - "learning_rate": 4.7609087225964005e-05, - "loss": 0.6638, + "epoch": 6.591165853792951, + "grad_norm": 11.343670845031738, + "learning_rate": 3.3196312087840324e-05, + "loss": 0.5592, "step": 478400 }, { - "epoch": 4.88, - "learning_rate": 4.760371617451489e-05, - "loss": 0.6205, + "epoch": 6.592543605852691, + "grad_norm": 3.6983842849731445, + "learning_rate": 3.3188470246295125e-05, + "loss": 0.5377, "step": 478500 }, { - "epoch": 4.88, - "learning_rate": 4.759834426234912e-05, - "loss": 0.5948, + "epoch": 6.59392135791243, + "grad_norm": 11.56848430633545, + "learning_rate": 3.318062792148526e-05, + "loss": 0.459, "step": 478600 }, { - "epoch": 4.88, - "learning_rate": 4.759297148972933e-05, - "loss": 0.6264, + "epoch": 6.595299109972169, + "grad_norm": 4.3684000968933105, + "learning_rate": 3.317278511407699e-05, + "loss": 0.5941, "step": 478700 }, { - "epoch": 4.88, - "learning_rate": 4.758759785691821e-05, - "loss": 0.6077, + "epoch": 6.5966768620319085, + "grad_norm": 3.685560703277588, + "learning_rate": 3.316494182473662e-05, + "loss": 0.4777, "step": 478800 }, { - "epoch": 4.88, - "learning_rate": 4.758222336417854e-05, - "loss": 0.5524, + "epoch": 6.598054614091648, + "grad_norm": 4.0189738273620605, + "learning_rate": 3.315709805413044e-05, + "loss": 0.4787, "step": 478900 }, { - "epoch": 4.88, - "learning_rate": 4.757684801177305e-05, - "loss": 0.674, + "epoch": 6.599432366151388, + "grad_norm": 2.0849928855895996, + "learning_rate": 3.314933224781369e-05, + "loss": 0.537, "step": 479000 }, { - "epoch": 4.88, - "learning_rate": 4.757147179996459e-05, - "loss": 0.7099, + "epoch": 6.600810118211127, + "grad_norm": 3.9438023567199707, + "learning_rate": 3.314156597110806e-05, + "loss": 0.4727, "step": 479100 }, { - "epoch": 4.88, - "learning_rate": 4.756609472901603e-05, - "loss": 0.6849, + "epoch": 6.602187870270866, + "grad_norm": 7.533387660980225, + "learning_rate": 3.313372077028169e-05, + "loss": 0.5242, "step": 479200 }, { - "epoch": 4.88, - "learning_rate": 4.7560716799190254e-05, - "loss": 0.6719, + "epoch": 6.603565622330605, + "grad_norm": 23.13103485107422, + "learning_rate": 3.312587509084193e-05, + "loss": 0.518, "step": 479300 }, { - "epoch": 4.88, - "learning_rate": 4.7555338010750236e-05, - "loss": 0.6419, + "epoch": 6.604943374390345, + "grad_norm": 4.273341178894043, + "learning_rate": 3.311802893345529e-05, + "loss": 0.4648, "step": 479400 }, { - "epoch": 4.89, - "learning_rate": 4.7549958363958947e-05, - "loss": 0.6565, + "epoch": 6.606321126450084, + "grad_norm": 10.277251243591309, + "learning_rate": 3.3110182298788356e-05, + "loss": 0.5075, "step": 479500 }, { - "epoch": 4.89, - "learning_rate": 4.7544577859079425e-05, - "loss": 0.7481, + "epoch": 6.607698878509823, + "grad_norm": 3.7137677669525146, + "learning_rate": 3.310233518750773e-05, + "loss": 0.5354, "step": 479600 }, { - "epoch": 4.89, - "learning_rate": 4.7539196496374754e-05, - "loss": 0.6183, + "epoch": 6.609076630569563, + "grad_norm": 2.8192920684814453, + "learning_rate": 3.309448760028008e-05, + "loss": 0.5696, "step": 479700 }, { - "epoch": 4.89, - "learning_rate": 4.753381427610803e-05, - "loss": 0.6719, + "epoch": 6.610454382629302, + "grad_norm": 21.763931274414062, + "learning_rate": 3.308663953777207e-05, + "loss": 0.4731, "step": 479800 }, { - "epoch": 4.89, - "learning_rate": 4.752843119854244e-05, - "loss": 0.6301, + "epoch": 6.611832134689042, + "grad_norm": 14.947737693786621, + "learning_rate": 3.3078791000650455e-05, + "loss": 0.4877, "step": 479900 }, { - "epoch": 4.89, - "learning_rate": 4.7523101107528625e-05, - "loss": 0.7285, + "epoch": 6.6132098867487805, + "grad_norm": 2.8194727897644043, + "learning_rate": 3.3070941989582013e-05, + "loss": 0.5482, "step": 480000 }, { - "epoch": 4.89, - "learning_rate": 4.751771632472133e-05, - "loss": 0.6604, + "epoch": 6.61458763880852, + "grad_norm": 11.757162094116211, + "learning_rate": 3.306309250523354e-05, + "loss": 0.5233, "step": 480100 }, { - "epoch": 4.89, - "learning_rate": 4.751233068540224e-05, - "loss": 0.7168, + "epoch": 6.61596539086826, + "grad_norm": 1.4464075565338135, + "learning_rate": 3.30552425482719e-05, + "loss": 0.4637, "step": 480200 }, { - "epoch": 4.89, - "learning_rate": 4.75069441898347e-05, - "loss": 0.6833, + "epoch": 6.617343142927998, + "grad_norm": 18.960248947143555, + "learning_rate": 3.304739211936398e-05, + "loss": 0.5695, "step": 480300 }, { - "epoch": 4.89, - "learning_rate": 4.750155683828206e-05, - "loss": 0.6717, + "epoch": 6.618720894987738, + "grad_norm": 1.988845705986023, + "learning_rate": 3.303954121917673e-05, + "loss": 0.4336, "step": 480400 }, { - "epoch": 4.9, - "learning_rate": 4.7496168631007725e-05, - "loss": 0.7384, + "epoch": 6.620098647047477, + "grad_norm": 1.8592430353164673, + "learning_rate": 3.303168984837711e-05, + "loss": 0.5139, "step": 480500 }, { - "epoch": 4.9, - "learning_rate": 4.7490779568275155e-05, - "loss": 0.6648, + "epoch": 6.621476399107217, + "grad_norm": 1.5606125593185425, + "learning_rate": 3.302383800763213e-05, + "loss": 0.5365, "step": 480600 }, { - "epoch": 4.9, - "learning_rate": 4.748538965034785e-05, - "loss": 0.6064, + "epoch": 6.622854151166956, + "grad_norm": 5.78675651550293, + "learning_rate": 3.301598569760886e-05, + "loss": 0.5418, "step": 480700 }, { - "epoch": 4.9, - "learning_rate": 4.7479998877489346e-05, - "loss": 0.6804, + "epoch": 6.624231903226695, + "grad_norm": 3.0970494747161865, + "learning_rate": 3.3008132918974384e-05, + "loss": 0.4604, "step": 480800 }, { - "epoch": 4.9, - "learning_rate": 4.7474607249963204e-05, - "loss": 0.6584, + "epoch": 6.625609655286435, + "grad_norm": 7.67454719543457, + "learning_rate": 3.3000279672395833e-05, + "loss": 0.5215, "step": 480900 }, { - "epoch": 4.9, - "learning_rate": 4.746921476803305e-05, - "loss": 0.5439, + "epoch": 6.626987407346174, + "grad_norm": 8.267184257507324, + "learning_rate": 3.2992425958540384e-05, + "loss": 0.4412, "step": 481000 }, { - "epoch": 4.9, - "learning_rate": 4.7463821431962546e-05, - "loss": 0.6715, + "epoch": 6.628365159405913, + "grad_norm": 7.434189796447754, + "learning_rate": 3.298457177807525e-05, + "loss": 0.4746, "step": 481100 }, { - "epoch": 4.9, - "learning_rate": 4.745842724201539e-05, - "loss": 0.5925, + "epoch": 6.6297429114656525, + "grad_norm": 4.048120021820068, + "learning_rate": 3.297671713166769e-05, + "loss": 0.4731, "step": 481200 }, { - "epoch": 4.9, - "learning_rate": 4.7453032198455336e-05, - "loss": 0.5991, + "epoch": 6.631120663525392, + "grad_norm": 7.0578508377075195, + "learning_rate": 3.2968862019984995e-05, + "loss": 0.46, "step": 481300 }, { - "epoch": 4.9, - "learning_rate": 4.744763630154615e-05, - "loss": 0.6417, + "epoch": 6.632498415585132, + "grad_norm": 6.383736610412598, + "learning_rate": 3.296100644369448e-05, + "loss": 0.5021, "step": 481400 }, { - "epoch": 4.91, - "learning_rate": 4.744223955155168e-05, - "loss": 0.664, + "epoch": 6.63387616764487, + "grad_norm": 10.01197624206543, + "learning_rate": 3.2953150403463536e-05, + "loss": 0.5876, "step": 481500 }, { - "epoch": 4.91, - "learning_rate": 4.7436841948735785e-05, - "loss": 0.6533, + "epoch": 6.63525391970461, + "grad_norm": 22.271011352539062, + "learning_rate": 3.294529389995957e-05, + "loss": 0.3953, "step": 481600 }, { - "epoch": 4.91, - "learning_rate": 4.7431443493362376e-05, - "loss": 0.7175, + "epoch": 6.636631671764349, + "grad_norm": 3.4908883571624756, + "learning_rate": 3.293743693385002e-05, + "loss": 0.4916, "step": 481700 }, { - "epoch": 4.91, - "learning_rate": 4.74260441856954e-05, - "loss": 0.6662, + "epoch": 6.638009423824089, + "grad_norm": 6.715983867645264, + "learning_rate": 3.29295795058024e-05, + "loss": 0.5247, "step": 481800 }, { - "epoch": 4.91, - "learning_rate": 4.742064402599887e-05, - "loss": 0.6124, + "epoch": 6.639387175883828, + "grad_norm": 4.584227085113525, + "learning_rate": 3.292172161648421e-05, + "loss": 0.4767, "step": 481900 }, { - "epoch": 4.91, - "learning_rate": 4.74152430145368e-05, - "loss": 0.5611, + "epoch": 6.640764927943567, + "grad_norm": 6.77118444442749, + "learning_rate": 3.291386326656303e-05, + "loss": 0.5416, "step": 482000 }, { - "epoch": 4.91, - "learning_rate": 4.740984115157327e-05, - "loss": 0.6502, + "epoch": 6.642142680003307, + "grad_norm": 2.8333487510681152, + "learning_rate": 3.290600445670649e-05, + "loss": 0.4468, "step": 482100 }, { - "epoch": 4.91, - "learning_rate": 4.740443843737241e-05, - "loss": 0.6868, + "epoch": 6.643520432063046, + "grad_norm": 2.227372169494629, + "learning_rate": 3.289814518758219e-05, + "loss": 0.4468, "step": 482200 }, { - "epoch": 4.91, - "learning_rate": 4.739908891206156e-05, - "loss": 0.6171, + "epoch": 6.644898184122785, + "grad_norm": 1.9961074590682983, + "learning_rate": 3.289028545985785e-05, + "loss": 0.473, "step": 482300 }, { - "epoch": 4.91, - "learning_rate": 4.739368450468433e-05, - "loss": 0.7234, + "epoch": 6.6462759361825245, + "grad_norm": 2.170229196548462, + "learning_rate": 3.2882425274201184e-05, + "loss": 0.4622, "step": 482400 }, { - "epoch": 4.92, - "learning_rate": 4.738827924685973e-05, - "loss": 0.5608, + "epoch": 6.647653688242264, + "grad_norm": 23.161144256591797, + "learning_rate": 3.287456463127995e-05, + "loss": 0.563, "step": 482500 }, { - "epoch": 4.92, - "learning_rate": 4.738287313885204e-05, - "loss": 0.7062, + "epoch": 6.649031440302004, + "grad_norm": 54.195125579833984, + "learning_rate": 3.2866703531761955e-05, + "loss": 0.532, "step": 482600 }, { - "epoch": 4.92, - "learning_rate": 4.737746618092559e-05, - "loss": 0.7252, + "epoch": 6.650409192361742, + "grad_norm": 3.0172743797302246, + "learning_rate": 3.285884197631504e-05, + "loss": 0.5573, "step": 482700 }, { - "epoch": 4.92, - "learning_rate": 4.737205837334475e-05, - "loss": 0.5987, + "epoch": 6.651786944421482, + "grad_norm": 2.6992311477661133, + "learning_rate": 3.285097996560709e-05, + "loss": 0.4854, "step": 482800 }, { - "epoch": 4.92, - "learning_rate": 4.736664971637393e-05, - "loss": 0.5758, + "epoch": 6.653164696481221, + "grad_norm": 8.62267017364502, + "learning_rate": 3.284311750030601e-05, + "loss": 0.5506, "step": 482900 }, { - "epoch": 4.92, - "learning_rate": 4.736124021027756e-05, - "loss": 0.7343, + "epoch": 6.65454244854096, + "grad_norm": 5.223740577697754, + "learning_rate": 3.283525458107976e-05, + "loss": 0.472, "step": 483000 }, { - "epoch": 4.92, - "learning_rate": 4.735582985532017e-05, - "loss": 0.7125, + "epoch": 6.6559202006007, + "grad_norm": 4.635311603546143, + "learning_rate": 3.282739120859634e-05, + "loss": 0.4923, "step": 483100 }, { - "epoch": 4.92, - "learning_rate": 4.735041865176627e-05, - "loss": 0.6986, + "epoch": 6.657297952660439, + "grad_norm": 5.964470386505127, + "learning_rate": 3.2819527383523786e-05, + "loss": 0.531, "step": 483200 }, { - "epoch": 4.92, - "learning_rate": 4.7345006599880434e-05, - "loss": 0.6769, + "epoch": 6.658675704720179, + "grad_norm": 97.39588928222656, + "learning_rate": 3.281166310653017e-05, + "loss": 0.466, "step": 483300 }, { - "epoch": 4.92, - "learning_rate": 4.733959369992729e-05, - "loss": 0.6472, + "epoch": 6.660053456779918, + "grad_norm": 11.894049644470215, + "learning_rate": 3.280379837828359e-05, + "loss": 0.5036, "step": 483400 }, { - "epoch": 4.93, - "learning_rate": 4.7334179952171495e-05, - "loss": 0.6563, + "epoch": 6.661431208839657, + "grad_norm": 5.156511306762695, + "learning_rate": 3.279593319945221e-05, + "loss": 0.502, "step": 483500 }, { - "epoch": 4.93, - "learning_rate": 4.732876535687773e-05, - "loss": 0.6527, + "epoch": 6.662808960899397, + "grad_norm": 19.589689254760742, + "learning_rate": 3.278806757070421e-05, + "loss": 0.5605, "step": 483600 }, { - "epoch": 4.93, - "learning_rate": 4.732334991431076e-05, - "loss": 0.5603, + "epoch": 6.664186712959136, + "grad_norm": 4.9715352058410645, + "learning_rate": 3.2780201492707815e-05, + "loss": 0.5309, "step": 483700 }, { - "epoch": 4.93, - "learning_rate": 4.7317933624735364e-05, - "loss": 0.6313, + "epoch": 6.665564465018875, + "grad_norm": 5.03038215637207, + "learning_rate": 3.2772334966131286e-05, + "loss": 0.5306, "step": 483800 }, { - "epoch": 4.93, - "learning_rate": 4.731251648841636e-05, - "loss": 0.727, + "epoch": 6.666942217078614, + "grad_norm": 1.5268027782440186, + "learning_rate": 3.2764467991642936e-05, + "loss": 0.4856, "step": 483900 }, { - "epoch": 4.93, - "learning_rate": 4.730709850561861e-05, - "loss": 0.649, + "epoch": 6.668319969138354, + "grad_norm": 14.442225456237793, + "learning_rate": 3.275660056991109e-05, + "loss": 0.531, "step": 484000 }, { - "epoch": 4.93, - "learning_rate": 4.730167967660703e-05, - "loss": 0.6101, + "epoch": 6.6696977211980935, + "grad_norm": 4.5085768699646, + "learning_rate": 3.274873270160414e-05, + "loss": 0.4886, "step": 484100 }, { - "epoch": 4.93, - "learning_rate": 4.7296260001646565e-05, - "loss": 0.6724, + "epoch": 6.671075473257833, + "grad_norm": 4.754798412322998, + "learning_rate": 3.2740864387390496e-05, + "loss": 0.5464, "step": 484200 }, { - "epoch": 4.93, - "learning_rate": 4.7290839481002206e-05, - "loss": 0.5981, + "epoch": 6.672453225317572, + "grad_norm": 5.900638103485107, + "learning_rate": 3.27329956279386e-05, + "loss": 0.5197, "step": 484300 }, { - "epoch": 4.94, - "learning_rate": 4.728541811493898e-05, - "loss": 0.765, + "epoch": 6.673830977377311, + "grad_norm": 14.146143913269043, + "learning_rate": 3.272512642391696e-05, + "loss": 0.4744, "step": 484400 }, { - "epoch": 4.94, - "learning_rate": 4.727999590372196e-05, - "loss": 0.7483, + "epoch": 6.675208729437051, + "grad_norm": 6.912065029144287, + "learning_rate": 3.2717256775994095e-05, + "loss": 0.5196, "step": 484500 }, { - "epoch": 4.94, - "learning_rate": 4.727457284761627e-05, - "loss": 0.7394, + "epoch": 6.6765864814967895, + "grad_norm": 30.106733322143555, + "learning_rate": 3.270938668483858e-05, + "loss": 0.4377, "step": 484600 }, { - "epoch": 4.94, - "learning_rate": 4.726914894688705e-05, - "loss": 0.6585, + "epoch": 6.677964233556529, + "grad_norm": 7.502816677093506, + "learning_rate": 3.2701516151119015e-05, + "loss": 0.4493, "step": 484700 }, { - "epoch": 4.94, - "learning_rate": 4.726372420179951e-05, - "loss": 0.6969, + "epoch": 6.679341985616269, + "grad_norm": 31.605297088623047, + "learning_rate": 3.269364517550403e-05, + "loss": 0.4129, "step": 484800 }, { - "epoch": 4.94, - "learning_rate": 4.725829861261888e-05, - "loss": 0.7463, + "epoch": 6.680719737676008, + "grad_norm": 4.691681861877441, + "learning_rate": 3.2685773758662336e-05, + "loss": 0.5738, "step": 484900 }, { - "epoch": 4.94, - "learning_rate": 4.7252872179610436e-05, - "loss": 0.596, + "epoch": 6.682097489735747, + "grad_norm": 8.39633560180664, + "learning_rate": 3.267790190126262e-05, + "loss": 0.56, "step": 485000 }, { - "epoch": 4.94, - "learning_rate": 4.72474449030395e-05, - "loss": 0.6844, + "epoch": 6.683475241795486, + "grad_norm": 1.8914600610733032, + "learning_rate": 3.267002960397365e-05, + "loss": 0.5342, "step": 485100 }, { - "epoch": 4.94, - "learning_rate": 4.724201678317144e-05, - "loss": 0.6285, + "epoch": 6.684852993855226, + "grad_norm": 2.823514223098755, + "learning_rate": 3.2662156867464215e-05, + "loss": 0.4553, "step": 485200 }, { - "epoch": 4.94, - "learning_rate": 4.723658782027165e-05, - "loss": 0.6742, + "epoch": 6.6862307459149655, + "grad_norm": 3.148958206176758, + "learning_rate": 3.265428369240314e-05, + "loss": 0.4745, "step": 485300 }, { - "epoch": 4.95, - "learning_rate": 4.723115801460558e-05, - "loss": 0.585, + "epoch": 6.687608497974704, + "grad_norm": 5.808370590209961, + "learning_rate": 3.264648881775407e-05, + "loss": 0.4824, "step": 485400 }, { - "epoch": 4.95, - "learning_rate": 4.7225727366438716e-05, - "loss": 0.6094, + "epoch": 6.688986250034444, + "grad_norm": 3.830756425857544, + "learning_rate": 3.263861477196519e-05, + "loss": 0.4953, "step": 485500 }, { - "epoch": 4.95, - "learning_rate": 4.7220295876036574e-05, - "loss": 0.6066, + "epoch": 6.690364002094183, + "grad_norm": 19.666732788085938, + "learning_rate": 3.26307402896247e-05, + "loss": 0.4816, "step": 485600 }, { - "epoch": 4.95, - "learning_rate": 4.7214863543664736e-05, - "loss": 0.6703, + "epoch": 6.691741754153923, + "grad_norm": 12.761123657226562, + "learning_rate": 3.262286537140157e-05, + "loss": 0.5555, "step": 485700 }, { - "epoch": 4.95, - "learning_rate": 4.720943036958881e-05, - "loss": 0.6886, + "epoch": 6.6931195062136615, + "grad_norm": 5.290180683135986, + "learning_rate": 3.261499001796483e-05, + "loss": 0.4334, "step": 485800 }, { - "epoch": 4.95, - "learning_rate": 4.720399635407443e-05, - "loss": 0.6401, + "epoch": 6.694497258273401, + "grad_norm": 2.622549533843994, + "learning_rate": 3.2607114229983506e-05, + "loss": 0.4774, "step": 485900 }, { - "epoch": 4.95, - "learning_rate": 4.7198561497387293e-05, - "loss": 0.7454, + "epoch": 6.695875010333141, + "grad_norm": 2.8366758823394775, + "learning_rate": 3.259923800812671e-05, + "loss": 0.5055, "step": 486000 }, { - "epoch": 4.95, - "learning_rate": 4.719312579979315e-05, - "loss": 0.6822, + "epoch": 6.69725276239288, + "grad_norm": 8.155316352844238, + "learning_rate": 3.2591440121756355e-05, + "loss": 0.5687, "step": 486100 }, { - "epoch": 4.95, - "learning_rate": 4.718768926155774e-05, - "loss": 0.6647, + "epoch": 6.698630514452619, + "grad_norm": 33.312744140625, + "learning_rate": 3.258356303847807e-05, + "loss": 0.4846, "step": 486200 }, { - "epoch": 4.95, - "learning_rate": 4.7182251882946904e-05, - "loss": 0.7062, + "epoch": 6.700008266512358, + "grad_norm": 3.404679775238037, + "learning_rate": 3.257568552332508e-05, + "loss": 0.5001, "step": 486300 }, { - "epoch": 4.96, - "learning_rate": 4.717681366422649e-05, - "loss": 0.6276, + "epoch": 6.701386018572098, + "grad_norm": 3.142570734024048, + "learning_rate": 3.256780757696665e-05, + "loss": 0.468, "step": 486400 }, { - "epoch": 4.96, - "learning_rate": 4.717142900040438e-05, - "loss": 0.6886, + "epoch": 6.7027637706318375, + "grad_norm": 3.5822596549987793, + "learning_rate": 3.255992920007205e-05, + "loss": 0.5092, "step": 486500 }, { - "epoch": 4.96, - "learning_rate": 4.7165989110657e-05, - "loss": 0.7222, + "epoch": 6.704141522691576, + "grad_norm": 2.6791110038757324, + "learning_rate": 3.255205039331056e-05, + "loss": 0.441, "step": 486600 }, { - "epoch": 4.96, - "learning_rate": 4.71605483815952e-05, - "loss": 0.6439, + "epoch": 6.705519274751316, + "grad_norm": 1.9100193977355957, + "learning_rate": 3.254417115735155e-05, + "loss": 0.53, "step": 486700 }, { - "epoch": 4.96, - "learning_rate": 4.715510681348499e-05, - "loss": 0.6439, + "epoch": 6.706897026811055, + "grad_norm": 24.22588539123535, + "learning_rate": 3.2536291492864393e-05, + "loss": 0.4945, "step": 486800 }, { - "epoch": 4.96, - "learning_rate": 4.714966440659242e-05, - "loss": 0.6056, + "epoch": 6.708274778870795, + "grad_norm": 3.6156580448150635, + "learning_rate": 3.252841140051851e-05, + "loss": 0.4579, "step": 486900 }, { - "epoch": 4.96, - "learning_rate": 4.7144221161183627e-05, - "loss": 0.6484, + "epoch": 6.7096525309305335, + "grad_norm": 1.2529386281967163, + "learning_rate": 3.252053088098335e-05, + "loss": 0.5118, "step": 487000 }, { - "epoch": 4.96, - "learning_rate": 4.7138777077524705e-05, - "loss": 0.7028, + "epoch": 6.711030282990273, + "grad_norm": 1.7775460481643677, + "learning_rate": 3.2512649934928416e-05, + "loss": 0.5469, "step": 487100 }, { - "epoch": 4.96, - "learning_rate": 4.713333215588188e-05, - "loss": 0.6561, + "epoch": 6.712408035050013, + "grad_norm": 5.117815017700195, + "learning_rate": 3.2504768563023213e-05, + "loss": 0.4713, "step": 487200 }, { - "epoch": 4.96, - "learning_rate": 4.712788639652135e-05, - "loss": 0.815, + "epoch": 6.713785787109751, + "grad_norm": 0.360550194978714, + "learning_rate": 3.249688676593733e-05, + "loss": 0.6273, "step": 487300 }, { - "epoch": 4.97, - "learning_rate": 4.7122439799709394e-05, - "loss": 0.6578, + "epoch": 6.715163539169491, + "grad_norm": 4.095431327819824, + "learning_rate": 3.248900454434034e-05, + "loss": 0.4707, "step": 487400 }, { - "epoch": 4.97, - "learning_rate": 4.711699236571231e-05, - "loss": 0.6664, + "epoch": 6.71654129122923, + "grad_norm": 0.3416297733783722, + "learning_rate": 3.24811218989019e-05, + "loss": 0.4315, "step": 487500 }, { - "epoch": 4.97, - "learning_rate": 4.711154409479645e-05, - "loss": 0.5999, + "epoch": 6.71791904328897, + "grad_norm": 2.7688403129577637, + "learning_rate": 3.247323883029166e-05, + "loss": 0.4854, "step": 487600 }, { - "epoch": 4.97, - "learning_rate": 4.7106094987228206e-05, - "loss": 0.5817, + "epoch": 6.7192967953487095, + "grad_norm": 3.3573837280273438, + "learning_rate": 3.246535533917935e-05, + "loss": 0.5234, "step": 487700 }, { - "epoch": 4.97, - "learning_rate": 4.710064504327399e-05, - "loss": 0.7061, + "epoch": 6.720674547408448, + "grad_norm": 4.6890177726745605, + "learning_rate": 3.24574714262347e-05, + "loss": 0.4517, "step": 487800 }, { - "epoch": 4.97, - "learning_rate": 4.709519426320029e-05, - "loss": 0.5745, + "epoch": 6.722052299468188, + "grad_norm": 2.5689754486083984, + "learning_rate": 3.2449587092127484e-05, + "loss": 0.5398, "step": 487900 }, { - "epoch": 4.97, - "learning_rate": 4.708974264727361e-05, - "loss": 0.5967, + "epoch": 6.723430051527927, + "grad_norm": 9.631098747253418, + "learning_rate": 3.2441702337527515e-05, + "loss": 0.5326, "step": 488000 }, { - "epoch": 4.97, - "learning_rate": 4.70842901957605e-05, - "loss": 0.6267, + "epoch": 6.724807803587666, + "grad_norm": 5.43438196182251, + "learning_rate": 3.243381716310466e-05, + "loss": 0.5599, "step": 488100 }, { - "epoch": 4.97, - "learning_rate": 4.707883690892755e-05, - "loss": 0.6878, + "epoch": 6.7261855556474055, + "grad_norm": 7.3053364753723145, + "learning_rate": 3.24259315695288e-05, + "loss": 0.5374, "step": 488200 }, { - "epoch": 4.97, - "learning_rate": 4.70733827870414e-05, - "loss": 0.6855, + "epoch": 6.727563307707145, + "grad_norm": 1.1146855354309082, + "learning_rate": 3.241804555746985e-05, + "loss": 0.4776, "step": 488300 }, { - "epoch": 4.98, - "learning_rate": 4.706792783036872e-05, - "loss": 0.7513, + "epoch": 6.728941059766885, + "grad_norm": 11.632970809936523, + "learning_rate": 3.2410159127597766e-05, + "loss": 0.5525, "step": 488400 }, { - "epoch": 4.98, - "learning_rate": 4.706247203917621e-05, - "loss": 0.5748, + "epoch": 6.730318811826624, + "grad_norm": 2.38018536567688, + "learning_rate": 3.240227228058255e-05, + "loss": 0.5466, "step": 488500 }, { - "epoch": 4.98, - "learning_rate": 4.705701541373066e-05, - "loss": 0.6151, + "epoch": 6.731696563886363, + "grad_norm": 6.1933817863464355, + "learning_rate": 3.239438501709422e-05, + "loss": 0.5129, "step": 488600 }, { - "epoch": 4.98, - "learning_rate": 4.705155795429884e-05, - "loss": 0.6374, + "epoch": 6.733074315946102, + "grad_norm": 2.9498612880706787, + "learning_rate": 3.238649733780284e-05, + "loss": 0.5218, "step": 488700 }, { - "epoch": 4.98, - "learning_rate": 4.704615424820515e-05, - "loss": 0.6312, + "epoch": 6.734452068005842, + "grad_norm": 5.941333770751953, + "learning_rate": 3.237860924337852e-05, + "loss": 0.5081, "step": 488800 }, { - "epoch": 4.98, - "learning_rate": 4.704069512993455e-05, - "loss": 0.6967, + "epoch": 6.735829820065581, + "grad_norm": 3.954420328140259, + "learning_rate": 3.237072073449137e-05, + "loss": 0.5357, "step": 488900 }, { - "epoch": 4.98, - "learning_rate": 4.703523517847567e-05, - "loss": 0.6383, + "epoch": 6.73720757212532, + "grad_norm": 8.380908966064453, + "learning_rate": 3.2362831811811595e-05, + "loss": 0.4416, "step": 489000 }, { - "epoch": 4.98, - "learning_rate": 4.7029774394095445e-05, - "loss": 0.7385, + "epoch": 6.73858532418506, + "grad_norm": 1.4821916818618774, + "learning_rate": 3.235494247600937e-05, + "loss": 0.5783, "step": 489100 }, { - "epoch": 4.98, - "learning_rate": 4.7024367397351985e-05, - "loss": 0.6068, + "epoch": 6.739963076244799, + "grad_norm": 4.570451259613037, + "learning_rate": 3.234705272775494e-05, + "loss": 0.5671, "step": 489200 }, { - "epoch": 4.99, - "learning_rate": 4.701890495625267e-05, - "loss": 0.6275, + "epoch": 6.741340828304538, + "grad_norm": 5.681606292724609, + "learning_rate": 3.2339162567718595e-05, + "loss": 0.4956, "step": 489300 }, { - "epoch": 4.99, - "learning_rate": 4.701344168303047e-05, - "loss": 0.6722, + "epoch": 6.7427185803642775, + "grad_norm": 5.003773212432861, + "learning_rate": 3.233135090431491e-05, + "loss": 0.5427, "step": 489400 }, { - "epoch": 4.99, - "learning_rate": 4.7007977577952506e-05, - "loss": 0.6999, + "epoch": 6.744096332424017, + "grad_norm": 19.467884063720703, + "learning_rate": 3.2323459926826755e-05, + "loss": 0.4236, "step": 489500 }, { - "epoch": 4.99, - "learning_rate": 4.7002512641285936e-05, - "loss": 0.5853, + "epoch": 6.745474084483757, + "grad_norm": 7.783166885375977, + "learning_rate": 3.2315568539561016e-05, + "loss": 0.5016, "step": 489600 }, { - "epoch": 4.99, - "learning_rate": 4.699704687329795e-05, - "loss": 0.5723, + "epoch": 6.746851836543495, + "grad_norm": 6.3391432762146, + "learning_rate": 3.230767674318809e-05, + "loss": 0.5005, "step": 489700 }, { - "epoch": 4.99, - "learning_rate": 4.699158027425582e-05, - "loss": 0.606, + "epoch": 6.748229588603235, + "grad_norm": 4.056929588317871, + "learning_rate": 3.229978453837842e-05, + "loss": 0.4315, "step": 489800 }, { - "epoch": 4.99, - "learning_rate": 4.698611284442681e-05, - "loss": 0.6646, + "epoch": 6.749607340662974, + "grad_norm": 3.4065561294555664, + "learning_rate": 3.229189192580251e-05, + "loss": 0.5535, "step": 489900 }, { - "epoch": 4.99, - "learning_rate": 4.6980644584078246e-05, - "loss": 0.6526, + "epoch": 6.750985092722714, + "grad_norm": 2.0479066371917725, + "learning_rate": 3.2283998906130855e-05, + "loss": 0.4908, "step": 490000 }, { - "epoch": 4.99, - "learning_rate": 4.69751754934775e-05, - "loss": 0.6882, + "epoch": 6.752362844782453, + "grad_norm": 9.683295249938965, + "learning_rate": 3.227610548003402e-05, + "loss": 0.5337, "step": 490100 }, { - "epoch": 4.99, - "learning_rate": 4.696970557289197e-05, - "loss": 0.672, + "epoch": 6.753740596842192, + "grad_norm": 8.605391502380371, + "learning_rate": 3.2268211648182605e-05, + "loss": 0.522, "step": 490200 }, { - "epoch": 5.0, - "learning_rate": 4.696423482258912e-05, - "loss": 0.5691, + "epoch": 6.755118348901932, + "grad_norm": 6.824690341949463, + "learning_rate": 3.2260317411247195e-05, + "loss": 0.4674, "step": 490300 }, { - "epoch": 5.0, - "learning_rate": 4.695876324283641e-05, - "loss": 0.5944, + "epoch": 6.756496100961671, + "grad_norm": 8.162220001220703, + "learning_rate": 3.225242276989848e-05, + "loss": 0.5158, "step": 490400 }, { - "epoch": 5.0, - "learning_rate": 4.695329083390139e-05, - "loss": 0.6865, + "epoch": 6.75787385302141, + "grad_norm": 1.6816174983978271, + "learning_rate": 3.224452772480713e-05, + "loss": 0.4538, "step": 490500 }, { - "epoch": 5.0, - "learning_rate": 4.694781759605162e-05, - "loss": 0.5747, + "epoch": 6.7592516050811495, + "grad_norm": 41.01235580444336, + "learning_rate": 3.2236632276643884e-05, + "loss": 0.4143, "step": 490600 }, { - "epoch": 5.0, - "learning_rate": 4.694234352955471e-05, - "loss": 0.5783, + "epoch": 6.760629357140889, + "grad_norm": 6.8301100730896, + "learning_rate": 3.222873642607949e-05, + "loss": 0.5168, "step": 490700 }, { - "epoch": 5.0, - "learning_rate": 4.6936868634678305e-05, - "loss": 0.6435, + "epoch": 6.762007109200629, + "grad_norm": 5.045297622680664, + "learning_rate": 3.2220840173784745e-05, + "loss": 0.4729, "step": 490800 }, { - "epoch": 5.0, - "learning_rate": 4.69313929116901e-05, - "loss": 0.6459, + "epoch": 6.763384861260367, + "grad_norm": 34.562564849853516, + "learning_rate": 3.221294352043048e-05, + "loss": 0.5204, "step": 490900 }, { - "epoch": 5.0, - "learning_rate": 4.6925916360857816e-05, - "loss": 0.6217, + "epoch": 6.764762613320107, + "grad_norm": 1.4532976150512695, + "learning_rate": 3.220504646668755e-05, + "loss": 0.4942, "step": 491000 }, { - "epoch": 5.0, - "learning_rate": 4.692043898244923e-05, - "loss": 0.5496, + "epoch": 6.766140365379846, + "grad_norm": 3.980806827545166, + "learning_rate": 3.2197149013226844e-05, + "loss": 0.4614, "step": 491100 }, { - "epoch": 5.0, - "learning_rate": 4.691496077673217e-05, - "loss": 0.5992, + "epoch": 6.767518117439586, + "grad_norm": 6.934349536895752, + "learning_rate": 3.218925116071931e-05, + "loss": 0.4826, "step": 491200 }, { - "epoch": 5.01, - "learning_rate": 4.690948174397446e-05, - "loss": 0.7196, + "epoch": 6.768895869499325, + "grad_norm": 4.31529426574707, + "learning_rate": 3.218135290983589e-05, + "loss": 0.4803, "step": 491300 }, { - "epoch": 5.01, - "learning_rate": 4.690400188444401e-05, - "loss": 0.5553, + "epoch": 6.770273621559064, + "grad_norm": 7.444957256317139, + "learning_rate": 3.2173454261247594e-05, + "loss": 0.4711, "step": 491400 }, { - "epoch": 5.01, - "learning_rate": 4.689852119840873e-05, - "loss": 0.5504, + "epoch": 6.771651373618804, + "grad_norm": 2.31036114692688, + "learning_rate": 3.2165555215625457e-05, + "loss": 0.4993, "step": 491500 }, { - "epoch": 5.01, - "learning_rate": 4.689303968613661e-05, - "loss": 0.6328, + "epoch": 6.773029125678542, + "grad_norm": 2.6928467750549316, + "learning_rate": 3.2157655773640526e-05, + "loss": 0.5387, "step": 491600 }, { - "epoch": 5.01, - "learning_rate": 4.688755734789568e-05, - "loss": 0.5754, + "epoch": 6.774406877738282, + "grad_norm": 5.866816997528076, + "learning_rate": 3.214975593596391e-05, + "loss": 0.4708, "step": 491700 }, { - "epoch": 5.01, - "learning_rate": 4.688207418395396e-05, - "loss": 0.535, + "epoch": 6.7757846297980215, + "grad_norm": 1.9427406787872314, + "learning_rate": 3.2141855703266745e-05, + "loss": 0.5387, "step": 491800 }, { - "epoch": 5.01, - "learning_rate": 4.687659019457957e-05, - "loss": 0.6402, + "epoch": 6.777162381857761, + "grad_norm": 4.013765335083008, + "learning_rate": 3.213395507622018e-05, + "loss": 0.4648, "step": 491900 }, { - "epoch": 5.01, - "learning_rate": 4.687110538004063e-05, - "loss": 0.6182, + "epoch": 6.778540133917501, + "grad_norm": 7.95597505569458, + "learning_rate": 3.212605405549543e-05, + "loss": 0.4618, "step": 492000 }, { - "epoch": 5.01, - "learning_rate": 4.686561974060532e-05, - "loss": 0.5961, + "epoch": 6.779917885977239, + "grad_norm": 5.17407751083374, + "learning_rate": 3.2118152641763724e-05, + "loss": 0.4951, "step": 492100 }, { - "epoch": 5.01, - "learning_rate": 4.686013327654184e-05, - "loss": 0.6268, + "epoch": 6.781295638036979, + "grad_norm": 4.269516944885254, + "learning_rate": 3.2110250835696326e-05, + "loss": 0.491, "step": 492200 }, { - "epoch": 5.02, - "learning_rate": 4.6854645988118475e-05, - "loss": 0.6136, + "epoch": 6.782673390096718, + "grad_norm": 4.915322780609131, + "learning_rate": 3.2102348637964525e-05, + "loss": 0.5258, "step": 492300 }, { - "epoch": 5.02, - "learning_rate": 4.68491578756035e-05, - "loss": 0.6538, + "epoch": 6.784051142156457, + "grad_norm": 2.7525646686553955, + "learning_rate": 3.2094446049239666e-05, + "loss": 0.5491, "step": 492400 }, { - "epoch": 5.02, - "learning_rate": 4.684366893926525e-05, - "loss": 0.6573, + "epoch": 6.785428894216197, + "grad_norm": 5.786526203155518, + "learning_rate": 3.2086543070193115e-05, + "loss": 0.4285, "step": 492500 }, { - "epoch": 5.02, - "learning_rate": 4.683817917937211e-05, - "loss": 0.6614, + "epoch": 6.786806646275936, + "grad_norm": 12.089977264404297, + "learning_rate": 3.207863970149626e-05, + "loss": 0.4866, "step": 492600 }, { - "epoch": 5.02, - "learning_rate": 4.683268859619249e-05, - "loss": 0.6107, + "epoch": 6.788184398335676, + "grad_norm": 27.45583152770996, + "learning_rate": 3.207073594382054e-05, + "loss": 0.4653, "step": 492700 }, { - "epoch": 5.02, - "learning_rate": 4.682719718999486e-05, - "loss": 0.5632, + "epoch": 6.789562150395415, + "grad_norm": 6.595381736755371, + "learning_rate": 3.2062831797837415e-05, + "loss": 0.4734, "step": 492800 }, { - "epoch": 5.02, - "learning_rate": 4.68217049610477e-05, - "loss": 0.614, + "epoch": 6.790939902455154, + "grad_norm": 2.7230958938598633, + "learning_rate": 3.205492726421839e-05, + "loss": 0.4776, "step": 492900 }, { - "epoch": 5.02, - "learning_rate": 4.681621190961955e-05, - "loss": 0.671, + "epoch": 6.7923176545148936, + "grad_norm": 3.9158592224121094, + "learning_rate": 3.204702234363499e-05, + "loss": 0.4275, "step": 493000 }, { - "epoch": 5.02, - "learning_rate": 4.6810718035978996e-05, - "loss": 0.6693, + "epoch": 6.793695406574633, + "grad_norm": 4.048093795776367, + "learning_rate": 3.203911703675877e-05, + "loss": 0.5006, "step": 493100 }, { - "epoch": 5.02, - "learning_rate": 4.680522334039466e-05, - "loss": 0.5117, + "epoch": 6.795073158634372, + "grad_norm": 3.271937131881714, + "learning_rate": 3.203121134426136e-05, + "loss": 0.5368, "step": 493200 }, { - "epoch": 5.03, - "learning_rate": 4.679978278237418e-05, - "loss": 0.6432, + "epoch": 6.796450910694111, + "grad_norm": 3.249682664871216, + "learning_rate": 3.2023305266814344e-05, + "loss": 0.532, "step": 493300 }, { - "epoch": 5.03, - "learning_rate": 4.6794286451921e-05, - "loss": 0.6625, + "epoch": 6.797828662753851, + "grad_norm": 1.9720748662948608, + "learning_rate": 3.201539880508941e-05, + "loss": 0.4613, "step": 493400 }, { - "epoch": 5.03, - "learning_rate": 4.6788789300327436e-05, - "loss": 0.6655, + "epoch": 6.7992064148135904, + "grad_norm": 0.5858930349349976, + "learning_rate": 3.200749195975825e-05, + "loss": 0.4689, "step": 493500 }, { - "epoch": 5.03, - "learning_rate": 4.6783291327862275e-05, - "loss": 0.6274, + "epoch": 6.800584166873329, + "grad_norm": 13.444096565246582, + "learning_rate": 3.199958473149258e-05, + "loss": 0.4849, "step": 493600 }, { - "epoch": 5.03, - "learning_rate": 4.677779253479431e-05, - "loss": 0.5649, + "epoch": 6.801961918933069, + "grad_norm": 0.05797514319419861, + "learning_rate": 3.199167712096417e-05, + "loss": 0.4725, "step": 493700 }, { - "epoch": 5.03, - "learning_rate": 4.677229292139242e-05, - "loss": 0.6773, + "epoch": 6.803339670992808, + "grad_norm": 3.8541224002838135, + "learning_rate": 3.198384821065268e-05, + "loss": 0.4986, "step": 493800 }, { - "epoch": 5.03, - "learning_rate": 4.67667924879255e-05, - "loss": 0.6521, + "epoch": 6.804717423052548, + "grad_norm": 0.6057089567184448, + "learning_rate": 3.197593984142005e-05, + "loss": 0.4329, "step": 493900 }, { - "epoch": 5.03, - "learning_rate": 4.6761291234662487e-05, - "loss": 0.6483, + "epoch": 6.8060951751122865, + "grad_norm": 5.346893310546875, + "learning_rate": 3.196803109193345e-05, + "loss": 0.493, "step": 494000 }, { - "epoch": 5.03, - "learning_rate": 4.675578916187236e-05, - "loss": 0.6698, + "epoch": 6.807472927172026, + "grad_norm": 4.722138404846191, + "learning_rate": 3.196012196286474e-05, + "loss": 0.4863, "step": 494100 }, { - "epoch": 5.03, - "learning_rate": 4.675028626982413e-05, - "loss": 0.6534, + "epoch": 6.808850679231766, + "grad_norm": 7.602149963378906, + "learning_rate": 3.195221245488585e-05, + "loss": 0.4951, "step": 494200 }, { - "epoch": 5.04, - "learning_rate": 4.674478255878687e-05, - "loss": 0.6904, + "epoch": 6.810228431291505, + "grad_norm": 1.5293307304382324, + "learning_rate": 3.1944302568668746e-05, + "loss": 0.445, "step": 494300 }, { - "epoch": 5.04, - "learning_rate": 4.673927802902967e-05, - "loss": 0.7136, + "epoch": 6.811606183351244, + "grad_norm": 7.256683349609375, + "learning_rate": 3.1936392304885396e-05, + "loss": 0.5257, "step": 494400 }, { - "epoch": 5.04, - "learning_rate": 4.673377268082166e-05, - "loss": 0.682, + "epoch": 6.812983935410983, + "grad_norm": 5.243493556976318, + "learning_rate": 3.192848166420782e-05, + "loss": 0.505, "step": 494500 }, { - "epoch": 5.04, - "learning_rate": 4.6728321580145044e-05, - "loss": 0.5822, + "epoch": 6.814361687470723, + "grad_norm": 2.9806153774261475, + "learning_rate": 3.192057064730807e-05, + "loss": 0.5492, "step": 494600 }, { - "epoch": 5.04, - "learning_rate": 4.672281460402081e-05, - "loss": 0.5781, + "epoch": 6.8157394395304625, + "grad_norm": 5.731026649475098, + "learning_rate": 3.1912738370639494e-05, + "loss": 0.5791, "step": 494700 }, { - "epoch": 5.04, - "learning_rate": 4.671730681025073e-05, - "loss": 0.6858, + "epoch": 6.817117191590201, + "grad_norm": 4.773301124572754, + "learning_rate": 3.190482660705713e-05, + "loss": 0.5285, "step": 494800 }, { - "epoch": 5.04, - "learning_rate": 4.671179819910411e-05, - "loss": 0.6818, + "epoch": 6.818494943649941, + "grad_norm": 1.94752037525177, + "learning_rate": 3.1896914469262203e-05, + "loss": 0.4838, "step": 494900 }, { - "epoch": 5.04, - "learning_rate": 4.670628877085029e-05, - "loss": 0.6579, + "epoch": 6.81987269570968, + "grad_norm": 1.5738259553909302, + "learning_rate": 3.18890019579269e-05, + "loss": 0.4624, "step": 495000 }, { - "epoch": 5.04, - "learning_rate": 4.6700778525758654e-05, - "loss": 0.6935, + "epoch": 6.82125044776942, + "grad_norm": 13.824067115783691, + "learning_rate": 3.1881089073723436e-05, + "loss": 0.4716, "step": 495100 }, { - "epoch": 5.05, - "learning_rate": 4.669526746409861e-05, - "loss": 0.6435, + "epoch": 6.8226281998291585, + "grad_norm": 1.0518079996109009, + "learning_rate": 3.187317581732404e-05, + "loss": 0.4049, "step": 495200 }, { - "epoch": 5.05, - "learning_rate": 4.6689755586139615e-05, - "loss": 0.7002, + "epoch": 6.824005951888898, + "grad_norm": 6.62912130355835, + "learning_rate": 3.186526218940098e-05, + "loss": 0.492, "step": 495300 }, { - "epoch": 5.05, - "learning_rate": 4.668424289215118e-05, - "loss": 0.5883, + "epoch": 6.825383703948638, + "grad_norm": 4.1702880859375, + "learning_rate": 3.185734819062656e-05, + "loss": 0.4524, "step": 495400 }, { - "epoch": 5.05, - "learning_rate": 4.6678729382402826e-05, - "loss": 0.7353, + "epoch": 6.826761456008377, + "grad_norm": 2.934587001800537, + "learning_rate": 3.184943382167313e-05, + "loss": 0.5201, "step": 495500 }, { - "epoch": 5.05, - "learning_rate": 4.667321505716414e-05, - "loss": 0.7024, + "epoch": 6.828139208068116, + "grad_norm": 7.047045707702637, + "learning_rate": 3.184151908321304e-05, + "loss": 0.4311, "step": 495600 }, { - "epoch": 5.05, - "learning_rate": 4.666769991670474e-05, - "loss": 0.6122, + "epoch": 6.829516960127855, + "grad_norm": 3.5394976139068604, + "learning_rate": 3.1833603975918695e-05, + "loss": 0.506, "step": 495700 }, { - "epoch": 5.05, - "learning_rate": 4.666218396129428e-05, - "loss": 0.5887, + "epoch": 6.830894712187595, + "grad_norm": 6.301085472106934, + "learning_rate": 3.182568850046252e-05, + "loss": 0.5721, "step": 495800 }, { - "epoch": 5.05, - "learning_rate": 4.665666719120246e-05, - "loss": 0.6482, + "epoch": 6.832272464247334, + "grad_norm": 10.820953369140625, + "learning_rate": 3.181777265751698e-05, + "loss": 0.5412, "step": 495900 }, { - "epoch": 5.05, - "learning_rate": 4.6651149606699016e-05, - "loss": 0.6784, + "epoch": 6.833650216307073, + "grad_norm": 2.890017032623291, + "learning_rate": 3.180985644775455e-05, + "loss": 0.5362, "step": 496000 }, { - "epoch": 5.05, - "learning_rate": 4.664563120805372e-05, - "loss": 0.6589, + "epoch": 6.835027968366813, + "grad_norm": 6.602724075317383, + "learning_rate": 3.180193987184778e-05, + "loss": 0.5041, "step": 496100 }, { - "epoch": 5.06, - "learning_rate": 4.6640167191689354e-05, - "loss": 0.6455, + "epoch": 6.836405720426552, + "grad_norm": 5.462466716766357, + "learning_rate": 3.1794022930469204e-05, + "loss": 0.4821, "step": 496200 }, { - "epoch": 5.06, - "learning_rate": 4.6634647173704554e-05, - "loss": 0.6005, + "epoch": 6.837783472486292, + "grad_norm": 14.555152893066406, + "learning_rate": 3.1786105624291415e-05, + "loss": 0.5095, "step": 496300 }, { - "epoch": 5.06, - "learning_rate": 4.662912634238476e-05, - "loss": 0.7014, + "epoch": 6.8391612245460305, + "grad_norm": 3.682447671890259, + "learning_rate": 3.1778187953987016e-05, + "loss": 0.4157, "step": 496400 }, { - "epoch": 5.06, - "learning_rate": 4.662360469799994e-05, - "loss": 0.6144, + "epoch": 6.84053897660577, + "grad_norm": 1.9888455867767334, + "learning_rate": 3.177026992022868e-05, + "loss": 0.5603, "step": 496500 }, { - "epoch": 5.06, - "learning_rate": 4.661808224082004e-05, - "loss": 0.5828, + "epoch": 6.84191672866551, + "grad_norm": 2.1429059505462646, + "learning_rate": 3.1762351523689055e-05, + "loss": 0.4713, "step": 496600 }, { - "epoch": 5.06, - "learning_rate": 4.66125589711151e-05, - "loss": 0.6791, + "epoch": 6.843294480725248, + "grad_norm": 4.387076377868652, + "learning_rate": 3.1754432765040866e-05, + "loss": 0.5193, "step": 496700 }, { - "epoch": 5.06, - "learning_rate": 4.6607034889155166e-05, - "loss": 0.5954, + "epoch": 6.844672232784988, + "grad_norm": 10.121049880981445, + "learning_rate": 3.174651364495685e-05, + "loss": 0.5285, "step": 496800 }, { - "epoch": 5.06, - "learning_rate": 4.660150999521033e-05, - "loss": 0.5959, + "epoch": 6.846049984844727, + "grad_norm": 8.326859474182129, + "learning_rate": 3.1738594164109765e-05, + "loss": 0.4851, "step": 496900 }, { - "epoch": 5.06, - "learning_rate": 4.659598428955072e-05, - "loss": 0.6001, + "epoch": 6.847427736904467, + "grad_norm": 4.124664306640625, + "learning_rate": 3.173067432317244e-05, + "loss": 0.4572, "step": 497000 }, { - "epoch": 5.06, - "learning_rate": 4.6590457772446536e-05, - "loss": 0.6161, + "epoch": 6.8488054889642065, + "grad_norm": 6.335623264312744, + "learning_rate": 3.1722754122817665e-05, + "loss": 0.4621, "step": 497100 }, { - "epoch": 5.07, - "learning_rate": 4.658493044416796e-05, - "loss": 0.7238, + "epoch": 6.850183241023945, + "grad_norm": 4.623098850250244, + "learning_rate": 3.1714833563718336e-05, + "loss": 0.503, "step": 497200 }, { - "epoch": 5.07, - "learning_rate": 4.657940230498527e-05, - "loss": 0.6141, + "epoch": 6.851560993083685, + "grad_norm": 12.144587516784668, + "learning_rate": 3.170691264654733e-05, + "loss": 0.4547, "step": 497300 }, { - "epoch": 5.07, - "learning_rate": 4.6573873355168725e-05, - "loss": 0.5851, + "epoch": 6.852938745143424, + "grad_norm": 1.749037742614746, + "learning_rate": 3.169899137197757e-05, + "loss": 0.4806, "step": 497400 }, { - "epoch": 5.07, - "learning_rate": 4.65683435949887e-05, - "loss": 0.716, + "epoch": 6.854316497203163, + "grad_norm": 3.978651523590088, + "learning_rate": 3.169106974068201e-05, + "loss": 0.4687, "step": 497500 }, { - "epoch": 5.07, - "learning_rate": 4.6562813024715544e-05, - "loss": 0.6883, + "epoch": 6.8556942492629025, + "grad_norm": 0.904420793056488, + "learning_rate": 3.168314775333365e-05, + "loss": 0.5043, "step": 497600 }, { - "epoch": 5.07, - "learning_rate": 4.655728164461967e-05, - "loss": 0.6175, + "epoch": 6.857072001322642, + "grad_norm": 3.9423272609710693, + "learning_rate": 3.167522541060547e-05, + "loss": 0.489, "step": 497700 }, { - "epoch": 5.07, - "learning_rate": 4.6551749454971534e-05, - "loss": 0.6378, + "epoch": 6.858449753382382, + "grad_norm": 31.07257843017578, + "learning_rate": 3.166730271317054e-05, + "loss": 0.417, "step": 497800 }, { - "epoch": 5.07, - "learning_rate": 4.6546216456041626e-05, - "loss": 0.5727, + "epoch": 6.85982750544212, + "grad_norm": 9.66943645477295, + "learning_rate": 3.165937966170192e-05, + "loss": 0.5451, "step": 497900 }, { - "epoch": 5.07, - "learning_rate": 4.654068264810048e-05, - "loss": 0.6134, + "epoch": 6.86120525750186, + "grad_norm": 1.4526536464691162, + "learning_rate": 3.165145625687272e-05, + "loss": 0.4968, "step": 498000 }, { - "epoch": 5.07, - "learning_rate": 4.653514803141865e-05, - "loss": 0.5859, + "epoch": 6.862583009561599, + "grad_norm": 20.55359649658203, + "learning_rate": 3.164353249935608e-05, + "loss": 0.5431, "step": 498100 }, { - "epoch": 5.08, - "learning_rate": 4.652961260626677e-05, - "loss": 0.7131, + "epoch": 6.863960761621339, + "grad_norm": 2.7309725284576416, + "learning_rate": 3.163560838982515e-05, + "loss": 0.4878, "step": 498200 }, { - "epoch": 5.08, - "learning_rate": 4.652407637291547e-05, - "loss": 0.6403, + "epoch": 6.865338513681078, + "grad_norm": 1.5482150316238403, + "learning_rate": 3.1627683928953124e-05, + "loss": 0.561, "step": 498300 }, { - "epoch": 5.08, - "learning_rate": 4.6518539331635454e-05, - "loss": 0.5428, + "epoch": 6.866716265740817, + "grad_norm": 10.03954029083252, + "learning_rate": 3.161975911741324e-05, + "loss": 0.5143, "step": 498400 }, { - "epoch": 5.08, - "learning_rate": 4.6513001482697435e-05, - "loss": 0.7137, + "epoch": 6.868094017800557, + "grad_norm": 4.985734939575195, + "learning_rate": 3.161183395587874e-05, + "loss": 0.5662, "step": 498500 }, { - "epoch": 5.08, - "learning_rate": 4.650746282637219e-05, - "loss": 0.6813, + "epoch": 6.869471769860296, + "grad_norm": 2.313674211502075, + "learning_rate": 3.1603908445022904e-05, + "loss": 0.4884, "step": 498600 }, { - "epoch": 5.08, - "learning_rate": 4.650192336293052e-05, - "loss": 0.6401, + "epoch": 6.870849521920035, + "grad_norm": 6.781418800354004, + "learning_rate": 3.159606184583769e-05, + "loss": 0.5005, "step": 498700 }, { - "epoch": 5.08, - "learning_rate": 4.6496383092643284e-05, - "loss": 0.6313, + "epoch": 6.8722272739797745, + "grad_norm": 2.4639346599578857, + "learning_rate": 3.1588135641835576e-05, + "loss": 0.5789, "step": 498800 }, { - "epoch": 5.08, - "learning_rate": 4.649084201578135e-05, - "loss": 0.6363, + "epoch": 6.873605026039514, + "grad_norm": 0.7739230990409851, + "learning_rate": 3.158020909052543e-05, + "loss": 0.4535, "step": 498900 }, { - "epoch": 5.08, - "learning_rate": 4.648530013261566e-05, - "loss": 0.5828, + "epoch": 6.874982778099254, + "grad_norm": 5.304497241973877, + "learning_rate": 3.157228219258064e-05, + "loss": 0.4985, "step": 499000 }, { - "epoch": 5.08, - "learning_rate": 4.647975744341718e-05, - "loss": 0.5838, + "epoch": 6.876360530158992, + "grad_norm": 6.326128005981445, + "learning_rate": 3.1564354948674644e-05, + "loss": 0.4601, "step": 499100 }, { - "epoch": 5.09, - "learning_rate": 4.6474213948456895e-05, - "loss": 0.5584, + "epoch": 6.877738282218732, + "grad_norm": 6.563399791717529, + "learning_rate": 3.1556427359480914e-05, + "loss": 0.5091, "step": 499200 }, { - "epoch": 5.09, - "learning_rate": 4.646866964800586e-05, - "loss": 0.6174, + "epoch": 6.879116034278471, + "grad_norm": 4.808310508728027, + "learning_rate": 3.154849942567291e-05, + "loss": 0.4562, "step": 499300 }, { - "epoch": 5.09, - "learning_rate": 4.6463179997376825e-05, - "loss": 0.617, + "epoch": 6.880493786338211, + "grad_norm": 4.943713665008545, + "learning_rate": 3.154057114792418e-05, + "loss": 0.4924, "step": 499400 }, { - "epoch": 5.09, - "learning_rate": 4.645763409480572e-05, - "loss": 0.5597, + "epoch": 6.88187153839795, + "grad_norm": 3.5838000774383545, + "learning_rate": 3.153264252690826e-05, + "loss": 0.3959, "step": 499500 }, { - "epoch": 5.09, - "learning_rate": 4.645208738755453e-05, - "loss": 0.7158, + "epoch": 6.883249290457689, + "grad_norm": 1.174947738647461, + "learning_rate": 3.152471356329872e-05, + "loss": 0.4396, "step": 499600 }, { - "epoch": 5.09, - "learning_rate": 4.644653987589446e-05, - "loss": 0.6066, + "epoch": 6.884627042517429, + "grad_norm": 4.0670390129089355, + "learning_rate": 3.151678425776918e-05, + "loss": 0.5085, "step": 499700 }, { - "epoch": 5.09, - "learning_rate": 4.644099156009672e-05, - "loss": 0.6791, + "epoch": 6.886004794577168, + "grad_norm": 3.499330759048462, + "learning_rate": 3.150885461099326e-05, + "loss": 0.46, "step": 499800 }, { - "epoch": 5.09, - "learning_rate": 4.643544244043262e-05, - "loss": 0.6103, + "epoch": 6.887382546636907, + "grad_norm": 4.000747203826904, + "learning_rate": 3.150092462364464e-05, + "loss": 0.4708, "step": 499900 }, { - "epoch": 5.09, - "learning_rate": 4.642989251717346e-05, - "loss": 0.577, + "epoch": 6.8887602986966465, + "grad_norm": 9.867107391357422, + "learning_rate": 3.1492994296396996e-05, + "loss": 0.4882, "step": 500000 }, { - "epoch": 5.1, - "learning_rate": 4.6424341790590596e-05, - "loss": 0.6842, + "epoch": 6.890138050756386, + "grad_norm": 90.97855377197266, + "learning_rate": 3.148506362992406e-05, + "loss": 0.4481, "step": 500100 }, { - "epoch": 5.1, - "learning_rate": 4.641879026095544e-05, - "loss": 0.6613, + "epoch": 6.891515802816125, + "grad_norm": 1.7543073892593384, + "learning_rate": 3.147713262489958e-05, + "loss": 0.564, "step": 500200 }, { - "epoch": 5.1, - "learning_rate": 4.641323792853943e-05, - "loss": 0.5771, + "epoch": 6.892893554875864, + "grad_norm": 4.410828590393066, + "learning_rate": 3.146920128199734e-05, + "loss": 0.4306, "step": 500300 }, { - "epoch": 5.1, - "learning_rate": 4.6407684793614024e-05, - "loss": 0.6796, + "epoch": 6.894271306935604, + "grad_norm": 16.284709930419922, + "learning_rate": 3.146126960189114e-05, + "loss": 0.4977, "step": 500400 }, { - "epoch": 5.1, - "learning_rate": 4.640213085645076e-05, - "loss": 0.6304, + "epoch": 6.895649058995343, + "grad_norm": 4.696292400360107, + "learning_rate": 3.1453337585254834e-05, + "loss": 0.4558, "step": 500500 }, { - "epoch": 5.1, - "learning_rate": 4.639657611732116e-05, - "loss": 0.6233, + "epoch": 6.897026811055083, + "grad_norm": 0.5019664764404297, + "learning_rate": 3.144540523276227e-05, + "loss": 0.4428, "step": 500600 }, { - "epoch": 5.1, - "learning_rate": 4.6391020576496856e-05, - "loss": 0.6715, + "epoch": 6.898404563114822, + "grad_norm": 2.202000856399536, + "learning_rate": 3.143747254508734e-05, + "loss": 0.5189, "step": 500700 }, { - "epoch": 5.1, - "learning_rate": 4.6385464234249455e-05, - "loss": 0.7335, + "epoch": 6.899782315174561, + "grad_norm": 4.785269260406494, + "learning_rate": 3.1429539522903973e-05, + "loss": 0.4897, "step": 500800 }, { - "epoch": 5.1, - "learning_rate": 4.637990709085064e-05, - "loss": 0.6301, + "epoch": 6.901160067234301, + "grad_norm": 3.115980386734009, + "learning_rate": 3.1421606166886125e-05, + "loss": 0.4327, "step": 500900 }, { - "epoch": 5.1, - "learning_rate": 4.6374349146572114e-05, - "loss": 0.5899, + "epoch": 6.902537819294039, + "grad_norm": 2.6305105686187744, + "learning_rate": 3.1413672477707775e-05, + "loss": 0.5207, "step": 501000 }, { - "epoch": 5.11, - "learning_rate": 4.636879040168563e-05, - "loss": 0.6519, + "epoch": 6.903915571353779, + "grad_norm": 3.773603677749634, + "learning_rate": 3.140573845604292e-05, + "loss": 0.4767, "step": 501100 }, { - "epoch": 5.11, - "learning_rate": 4.636323085646299e-05, - "loss": 0.5941, + "epoch": 6.9052933234135185, + "grad_norm": 3.1492981910705566, + "learning_rate": 3.13978041025656e-05, + "loss": 0.5809, "step": 501200 }, { - "epoch": 5.11, - "learning_rate": 4.6357670511176e-05, - "loss": 0.6369, + "epoch": 6.906671075473258, + "grad_norm": 24.479820251464844, + "learning_rate": 3.138986941794988e-05, + "loss": 0.4322, "step": 501300 }, { - "epoch": 5.11, - "learning_rate": 4.635210936609654e-05, - "loss": 0.5861, + "epoch": 6.908048827532998, + "grad_norm": 1.3926104307174683, + "learning_rate": 3.138193440286986e-05, + "loss": 0.4288, "step": 501400 }, { - "epoch": 5.11, - "learning_rate": 4.6346547421496526e-05, - "loss": 0.6015, + "epoch": 6.909426579592736, + "grad_norm": 10.369736671447754, + "learning_rate": 3.1373999057999645e-05, + "loss": 0.49, "step": 501500 }, { - "epoch": 5.11, - "learning_rate": 4.634098467764789e-05, - "loss": 0.6181, + "epoch": 6.910804331652476, + "grad_norm": 1.4693257808685303, + "learning_rate": 3.13660633840134e-05, + "loss": 0.543, "step": 501600 }, { - "epoch": 5.11, - "learning_rate": 4.633542113482262e-05, - "loss": 0.587, + "epoch": 6.912182083712215, + "grad_norm": 3.880431652069092, + "learning_rate": 3.135812738158528e-05, + "loss": 0.5376, "step": 501700 }, { - "epoch": 5.11, - "learning_rate": 4.6329856793292737e-05, - "loss": 0.6811, + "epoch": 6.913559835771954, + "grad_norm": 6.648316860198975, + "learning_rate": 3.13502704163117e-05, + "loss": 0.4842, "step": 501800 }, { - "epoch": 5.11, - "learning_rate": 4.632429165333032e-05, - "loss": 0.5608, + "epoch": 6.914937587831694, + "grad_norm": 5.535737037658691, + "learning_rate": 3.1342333762290096e-05, + "loss": 0.5468, "step": 501900 }, { - "epoch": 5.11, - "learning_rate": 4.631872571520745e-05, - "loss": 0.6396, + "epoch": 6.916315339891433, + "grad_norm": 8.411781311035156, + "learning_rate": 3.133439678184258e-05, + "loss": 0.4651, "step": 502000 }, { - "epoch": 5.12, - "learning_rate": 4.631315897919628e-05, - "loss": 0.5888, + "epoch": 6.917693091951173, + "grad_norm": 4.188355445861816, + "learning_rate": 3.1326459475643444e-05, + "loss": 0.4832, "step": 502100 }, { - "epoch": 5.12, - "learning_rate": 4.630759144556898e-05, - "loss": 0.6202, + "epoch": 6.919070844010911, + "grad_norm": 4.7604217529296875, + "learning_rate": 3.1318521844366995e-05, + "loss": 0.4266, "step": 502200 }, { - "epoch": 5.12, - "learning_rate": 4.6302023114597786e-05, - "loss": 0.6958, + "epoch": 6.920448596070651, + "grad_norm": 2.426968574523926, + "learning_rate": 3.1310583888687586e-05, + "loss": 0.4613, "step": 502300 }, { - "epoch": 5.12, - "learning_rate": 4.6296453986554945e-05, - "loss": 0.5922, + "epoch": 6.9218263481303905, + "grad_norm": 4.971142768859863, + "learning_rate": 3.130264560927958e-05, + "loss": 0.4702, "step": 502400 }, { - "epoch": 5.12, - "learning_rate": 4.629088406171276e-05, - "loss": 0.6118, + "epoch": 6.92320410019013, + "grad_norm": 31.65337562561035, + "learning_rate": 3.1294707006817365e-05, + "loss": 0.5103, "step": 502500 }, { - "epoch": 5.12, - "learning_rate": 4.6285313340343555e-05, - "loss": 0.609, + "epoch": 6.924581852249869, + "grad_norm": 5.310464859008789, + "learning_rate": 3.128676808197539e-05, + "loss": 0.4752, "step": 502600 }, { - "epoch": 5.12, - "learning_rate": 4.6279741822719716e-05, - "loss": 0.6568, + "epoch": 6.925959604309608, + "grad_norm": 3.0934910774230957, + "learning_rate": 3.127882883542809e-05, + "loss": 0.518, "step": 502700 }, { - "epoch": 5.12, - "learning_rate": 4.627416950911366e-05, - "loss": 0.6791, + "epoch": 6.927337356369348, + "grad_norm": 3.8131024837493896, + "learning_rate": 3.127088926784994e-05, + "loss": 0.4081, "step": 502800 }, { - "epoch": 5.12, - "learning_rate": 4.626859639979784e-05, - "loss": 0.5671, + "epoch": 6.928715108429087, + "grad_norm": 6.654551029205322, + "learning_rate": 3.126294937991546e-05, + "loss": 0.5059, "step": 502900 }, { - "epoch": 5.12, - "learning_rate": 4.626302249504472e-05, - "loss": 0.6345, + "epoch": 6.930092860488826, + "grad_norm": 9.5676908493042, + "learning_rate": 3.125500917229918e-05, + "loss": 0.5122, "step": 503000 }, { - "epoch": 5.13, - "learning_rate": 4.625744779512686e-05, - "loss": 0.679, + "epoch": 6.931470612548566, + "grad_norm": 8.509909629821777, + "learning_rate": 3.124706864567566e-05, + "loss": 0.5514, "step": 503100 }, { - "epoch": 5.13, - "learning_rate": 4.625187230031683e-05, - "loss": 0.6027, + "epoch": 6.932848364608305, + "grad_norm": 2.3952219486236572, + "learning_rate": 3.123912780071949e-05, + "loss": 0.4621, "step": 503200 }, { - "epoch": 5.13, - "learning_rate": 4.624629601088723e-05, - "loss": 0.6528, + "epoch": 6.934226116668045, + "grad_norm": 31.232297897338867, + "learning_rate": 3.123118663810527e-05, + "loss": 0.4865, "step": 503300 }, { - "epoch": 5.13, - "learning_rate": 4.6240718927110705e-05, - "loss": 0.6277, + "epoch": 6.9356038687277834, + "grad_norm": 4.742006301879883, + "learning_rate": 3.122324515850768e-05, + "loss": 0.4909, "step": 503400 }, { - "epoch": 5.13, - "learning_rate": 4.623514104925995e-05, - "loss": 0.722, + "epoch": 6.936981620787523, + "grad_norm": 2.189765453338623, + "learning_rate": 3.1215303362601355e-05, + "loss": 0.5231, "step": 503500 }, { - "epoch": 5.13, - "learning_rate": 4.622956237760767e-05, - "loss": 0.6678, + "epoch": 6.938359372847263, + "grad_norm": 3.0376875400543213, + "learning_rate": 3.1207361251061e-05, + "loss": 0.474, "step": 503600 }, { - "epoch": 5.13, - "learning_rate": 4.6223982912426656e-05, - "loss": 0.6401, + "epoch": 6.939737124907002, + "grad_norm": 4.776489734649658, + "learning_rate": 3.119941882456134e-05, + "loss": 0.4679, "step": 503700 }, { - "epoch": 5.13, - "learning_rate": 4.62184026539897e-05, - "loss": 0.6601, + "epoch": 6.941114876966741, + "grad_norm": 44.82393264770508, + "learning_rate": 3.1191476083777124e-05, + "loss": 0.4496, "step": 503800 }, { - "epoch": 5.13, - "learning_rate": 4.6212821602569626e-05, - "loss": 0.6296, + "epoch": 6.94249262902648, + "grad_norm": 0.723544716835022, + "learning_rate": 3.118361246147723e-05, + "loss": 0.4407, "step": 503900 }, { - "epoch": 5.13, - "learning_rate": 4.620723975843934e-05, - "loss": 0.6904, + "epoch": 6.94387038108622, + "grad_norm": 3.5784547328948975, + "learning_rate": 3.117566909727427e-05, + "loss": 0.4546, "step": 504000 }, { - "epoch": 5.14, - "learning_rate": 4.620165712187174e-05, - "loss": 0.6156, + "epoch": 6.9452481331459595, + "grad_norm": 3.118750810623169, + "learning_rate": 3.116772542080442e-05, + "loss": 0.4263, "step": 504100 }, { - "epoch": 5.14, - "learning_rate": 4.61960736931398e-05, - "loss": 0.6183, + "epoch": 6.946625885205698, + "grad_norm": 2.2398905754089355, + "learning_rate": 3.115978143274253e-05, + "loss": 0.4868, "step": 504200 }, { - "epoch": 5.14, - "learning_rate": 4.619048947251651e-05, - "loss": 0.6709, + "epoch": 6.948003637265438, + "grad_norm": 2.22751522064209, + "learning_rate": 3.115183713376348e-05, + "loss": 0.4383, "step": 504300 }, { - "epoch": 5.14, - "learning_rate": 4.618490446027491e-05, - "loss": 0.634, + "epoch": 6.949381389325177, + "grad_norm": 2.2035436630249023, + "learning_rate": 3.114397197216788e-05, + "loss": 0.4788, "step": 504400 }, { - "epoch": 5.14, - "learning_rate": 4.6179318656688045e-05, - "loss": 0.5894, + "epoch": 6.950759141384916, + "grad_norm": 3.2892355918884277, + "learning_rate": 3.11360270564716e-05, + "loss": 0.479, "step": 504500 }, { - "epoch": 5.14, - "learning_rate": 4.617373206202907e-05, - "loss": 0.687, + "epoch": 6.9521368934446555, + "grad_norm": 2.4126551151275635, + "learning_rate": 3.112808183187623e-05, + "loss": 0.4948, "step": 504600 }, { - "epoch": 5.14, - "learning_rate": 4.6168144676571114e-05, - "loss": 0.5318, + "epoch": 6.953514645504395, + "grad_norm": 11.513557434082031, + "learning_rate": 3.1120136299056744e-05, + "loss": 0.4821, "step": 504700 }, { - "epoch": 5.14, - "learning_rate": 4.6162556500587375e-05, - "loss": 0.5696, + "epoch": 6.954892397564135, + "grad_norm": 2.8434507846832275, + "learning_rate": 3.111219045868816e-05, + "loss": 0.4704, "step": 504800 }, { - "epoch": 5.14, - "learning_rate": 4.615696753435107e-05, - "loss": 0.6464, + "epoch": 6.956270149623874, + "grad_norm": 9.525400161743164, + "learning_rate": 3.110424431144551e-05, + "loss": 0.4625, "step": 504900 }, { - "epoch": 5.15, - "learning_rate": 4.615137777813547e-05, - "loss": 0.6614, + "epoch": 6.957647901683613, + "grad_norm": 0.6736272573471069, + "learning_rate": 3.109629785800387e-05, + "loss": 0.4282, "step": 505000 }, { - "epoch": 5.15, - "learning_rate": 4.6145787232213885e-05, - "loss": 0.5522, + "epoch": 6.959025653743352, + "grad_norm": 1.9305440187454224, + "learning_rate": 3.108835109903833e-05, + "loss": 0.4173, "step": 505100 }, { - "epoch": 5.15, - "learning_rate": 4.6140195896859656e-05, - "loss": 0.618, + "epoch": 6.960403405803092, + "grad_norm": 1.8894736766815186, + "learning_rate": 3.1080404035224006e-05, + "loss": 0.4974, "step": 505200 }, { - "epoch": 5.15, - "learning_rate": 4.6134603772346175e-05, - "loss": 0.5846, + "epoch": 6.961781157862831, + "grad_norm": 2.7600796222686768, + "learning_rate": 3.107245666723604e-05, + "loss": 0.5096, "step": 505300 }, { - "epoch": 5.15, - "learning_rate": 4.612901085894685e-05, - "loss": 0.7239, + "epoch": 6.96315890992257, + "grad_norm": 3.5628931522369385, + "learning_rate": 3.106450899574961e-05, + "loss": 0.4847, "step": 505400 }, { - "epoch": 5.15, - "learning_rate": 4.612341715693515e-05, - "loss": 0.6547, + "epoch": 6.96453666198231, + "grad_norm": 6.958978652954102, + "learning_rate": 3.105656102143989e-05, + "loss": 0.5269, "step": 505500 }, { - "epoch": 5.15, - "learning_rate": 4.6117822666584566e-05, - "loss": 0.6415, + "epoch": 6.965914414042049, + "grad_norm": 2.5903046131134033, + "learning_rate": 3.1048612744982125e-05, + "loss": 0.3984, "step": 505600 }, { - "epoch": 5.15, - "learning_rate": 4.611222738816864e-05, - "loss": 0.6347, + "epoch": 6.967292166101789, + "grad_norm": 4.880198955535889, + "learning_rate": 3.104066416705156e-05, + "loss": 0.4831, "step": 505700 }, { - "epoch": 5.15, - "learning_rate": 4.6106631321960945e-05, - "loss": 0.56, + "epoch": 6.9686699181615275, + "grad_norm": 1.1121947765350342, + "learning_rate": 3.1032715288323455e-05, + "loss": 0.4608, "step": 505800 }, { - "epoch": 5.15, - "learning_rate": 4.610103446823511e-05, - "loss": 0.6357, + "epoch": 6.970047670221267, + "grad_norm": 6.725651741027832, + "learning_rate": 3.102476610947311e-05, + "loss": 0.443, "step": 505900 }, { - "epoch": 5.16, - "learning_rate": 4.609543682726476e-05, - "loss": 0.6406, + "epoch": 6.971425422281007, + "grad_norm": 3.4882566928863525, + "learning_rate": 3.101681663117585e-05, + "loss": 0.4318, "step": 506000 }, { - "epoch": 5.16, - "learning_rate": 4.608983839932361e-05, - "loss": 0.5634, + "epoch": 6.972803174340745, + "grad_norm": 5.540103435516357, + "learning_rate": 3.100886685410703e-05, + "loss": 0.4187, "step": 506100 }, { - "epoch": 5.16, - "learning_rate": 4.6084239184685385e-05, - "loss": 0.6373, + "epoch": 6.974180926400485, + "grad_norm": 3.5153932571411133, + "learning_rate": 3.100091677894202e-05, + "loss": 0.4448, "step": 506200 }, { - "epoch": 5.16, - "learning_rate": 4.6078639183623844e-05, - "loss": 0.6174, + "epoch": 6.975558678460224, + "grad_norm": 9.365389823913574, + "learning_rate": 3.0992966406356216e-05, + "loss": 0.4937, "step": 506300 }, { - "epoch": 5.16, - "learning_rate": 4.6073038396412796e-05, - "loss": 0.5114, + "epoch": 6.976936430519964, + "grad_norm": 2.807455539703369, + "learning_rate": 3.098501573702505e-05, + "loss": 0.4133, "step": 506400 }, { - "epoch": 5.16, - "learning_rate": 4.606743682332609e-05, - "loss": 0.6216, + "epoch": 6.978314182579703, + "grad_norm": 2.1231842041015625, + "learning_rate": 3.097706477162396e-05, + "loss": 0.4902, "step": 506500 }, { - "epoch": 5.16, - "learning_rate": 4.606183446463762e-05, - "loss": 0.5393, + "epoch": 6.979691934639442, + "grad_norm": 1.3850544691085815, + "learning_rate": 3.0969113510828423e-05, + "loss": 0.4493, "step": 506600 }, { - "epoch": 5.16, - "learning_rate": 4.605623132062129e-05, - "loss": 0.6204, + "epoch": 6.981069686699182, + "grad_norm": 8.940996170043945, + "learning_rate": 3.096116195531397e-05, + "loss": 0.4689, "step": 506700 }, { - "epoch": 5.16, - "learning_rate": 4.6050627391551066e-05, - "loss": 0.6207, + "epoch": 6.982447438758921, + "grad_norm": 2.3018627166748047, + "learning_rate": 3.095321010575608e-05, + "loss": 0.5327, "step": 506800 }, { - "epoch": 5.16, - "learning_rate": 4.604502267770095e-05, - "loss": 0.6052, + "epoch": 6.98382519081866, + "grad_norm": 5.0102643966674805, + "learning_rate": 3.094525796283034e-05, + "loss": 0.4557, "step": 506900 }, { - "epoch": 5.17, - "learning_rate": 4.603941717934497e-05, - "loss": 0.6617, + "epoch": 6.9852029428783995, + "grad_norm": 6.3492536544799805, + "learning_rate": 3.09373055272123e-05, + "loss": 0.5399, "step": 507000 }, { - "epoch": 5.17, - "learning_rate": 4.603381089675721e-05, - "loss": 0.5592, + "epoch": 6.986580694938139, + "grad_norm": 3.183195114135742, + "learning_rate": 3.092935279957757e-05, + "loss": 0.4557, "step": 507100 }, { - "epoch": 5.17, - "learning_rate": 4.602825990475694e-05, - "loss": 0.6749, + "epoch": 6.987958446997879, + "grad_norm": 16.109149932861328, + "learning_rate": 3.092139978060178e-05, + "loss": 0.484, "step": 507200 }, { - "epoch": 5.17, - "learning_rate": 4.602265206236347e-05, - "loss": 0.7209, + "epoch": 6.989336199057617, + "grad_norm": 3.767611026763916, + "learning_rate": 3.091352600549356e-05, + "loss": 0.4853, "step": 507300 }, { - "epoch": 5.17, - "learning_rate": 4.601704343655794e-05, - "loss": 0.6787, + "epoch": 6.990713951117357, + "grad_norm": 5.4513068199157715, + "learning_rate": 3.090557240875917e-05, + "loss": 0.5395, "step": 507400 }, { - "epoch": 5.17, - "learning_rate": 4.601143402761457e-05, - "loss": 0.6493, + "epoch": 6.992091703177096, + "grad_norm": 5.304195404052734, + "learning_rate": 3.0897618522703974e-05, + "loss": 0.5016, "step": 507500 }, { - "epoch": 5.17, - "learning_rate": 4.600582383580763e-05, - "loss": 0.6847, + "epoch": 6.993469455236836, + "grad_norm": 9.135114669799805, + "learning_rate": 3.088966434800371e-05, + "loss": 0.4338, "step": 507600 }, { - "epoch": 5.17, - "learning_rate": 4.600021286141141e-05, - "loss": 0.7071, + "epoch": 6.994847207296575, + "grad_norm": 2.1700408458709717, + "learning_rate": 3.088170988533411e-05, + "loss": 0.4509, "step": 507700 }, { - "epoch": 5.17, - "learning_rate": 4.5994601104700283e-05, - "loss": 0.607, + "epoch": 6.996224959356314, + "grad_norm": 3.658083200454712, + "learning_rate": 3.087375513537096e-05, + "loss": 0.4806, "step": 507800 }, { - "epoch": 5.17, - "learning_rate": 4.598898856594859e-05, - "loss": 0.678, + "epoch": 6.997602711416054, + "grad_norm": 1.2274701595306396, + "learning_rate": 3.086580009879005e-05, + "loss": 0.5319, "step": 507900 }, { - "epoch": 5.18, - "learning_rate": 4.59833752454308e-05, - "loss": 0.6539, + "epoch": 6.998980463475793, + "grad_norm": 1.8919082880020142, + "learning_rate": 3.0857844776267195e-05, + "loss": 0.4602, "step": 508000 }, { - "epoch": 5.18, - "learning_rate": 4.5977761143421325e-05, - "loss": 0.6634, + "epoch": 7.000358215535532, + "grad_norm": 3.0730230808258057, + "learning_rate": 3.0849889168478256e-05, + "loss": 0.4708, "step": 508100 }, { - "epoch": 5.18, - "learning_rate": 4.597214626019469e-05, - "loss": 0.5979, + "epoch": 7.0017359675952715, + "grad_norm": 3.043363094329834, + "learning_rate": 3.0841933276099094e-05, + "loss": 0.4178, "step": 508200 }, { - "epoch": 5.18, - "learning_rate": 4.5966530596025416e-05, - "loss": 0.6044, + "epoch": 7.003113719655011, + "grad_norm": 3.6803717613220215, + "learning_rate": 3.0833977099805594e-05, + "loss": 0.4457, "step": 508300 }, { - "epoch": 5.18, - "learning_rate": 4.5960914151188084e-05, - "loss": 0.66, + "epoch": 7.004491471714751, + "grad_norm": 3.0459094047546387, + "learning_rate": 3.0826020640273696e-05, + "loss": 0.4497, "step": 508400 }, { - "epoch": 5.18, - "learning_rate": 4.5955296925957295e-05, - "loss": 0.6885, + "epoch": 7.005869223774489, + "grad_norm": 3.3015036582946777, + "learning_rate": 3.081806389817931e-05, + "loss": 0.5178, "step": 508500 }, { - "epoch": 5.18, - "learning_rate": 4.5949678920607704e-05, - "loss": 0.602, + "epoch": 7.007246975834229, + "grad_norm": 4.332216262817383, + "learning_rate": 3.081010687419845e-05, + "loss": 0.4321, "step": 508600 }, { - "epoch": 5.18, - "learning_rate": 4.5944060135413995e-05, - "loss": 0.7248, + "epoch": 7.008624727893968, + "grad_norm": 2.6249611377716064, + "learning_rate": 3.0802149569007056e-05, + "loss": 0.4651, "step": 508700 }, { - "epoch": 5.18, - "learning_rate": 4.59384405706509e-05, - "loss": 0.626, + "epoch": 7.010002479953708, + "grad_norm": 11.402806282043457, + "learning_rate": 3.079419198328116e-05, + "loss": 0.4067, "step": 508800 }, { - "epoch": 5.18, - "learning_rate": 4.593282022659317e-05, - "loss": 0.5859, + "epoch": 7.011380232013447, + "grad_norm": 5.703737258911133, + "learning_rate": 3.0786234117696813e-05, + "loss": 0.397, "step": 508900 }, { - "epoch": 5.19, - "learning_rate": 4.5927199103515605e-05, - "loss": 0.6129, + "epoch": 7.012757984073186, + "grad_norm": 13.51340389251709, + "learning_rate": 3.077827597293006e-05, + "loss": 0.4753, "step": 509000 }, { - "epoch": 5.19, - "learning_rate": 4.592157720169307e-05, - "loss": 0.6182, + "epoch": 7.014135736132926, + "grad_norm": 2.3686861991882324, + "learning_rate": 3.0770317549657e-05, + "loss": 0.4537, "step": 509100 }, { - "epoch": 5.19, - "learning_rate": 4.5915954521400414e-05, - "loss": 0.733, + "epoch": 7.015513488192665, + "grad_norm": 0.9951190948486328, + "learning_rate": 3.0762358848553724e-05, + "loss": 0.4088, "step": 509200 }, { - "epoch": 5.19, - "learning_rate": 4.591033106291257e-05, - "loss": 0.5552, + "epoch": 7.016891240252404, + "grad_norm": 7.1610517501831055, + "learning_rate": 3.075439987029637e-05, + "loss": 0.4219, "step": 509300 }, { - "epoch": 5.19, - "learning_rate": 4.590470682650448e-05, - "loss": 0.6554, + "epoch": 7.0182689923121435, + "grad_norm": 7.076640605926514, + "learning_rate": 3.0746440615561107e-05, + "loss": 0.421, "step": 509400 }, { - "epoch": 5.19, - "learning_rate": 4.589908181245115e-05, - "loss": 0.6512, + "epoch": 7.019646744371883, + "grad_norm": 3.9407331943511963, + "learning_rate": 3.0738481085024095e-05, + "loss": 0.4539, "step": 509500 }, { - "epoch": 5.19, - "learning_rate": 4.589345602102758e-05, - "loss": 0.6416, + "epoch": 7.021024496431622, + "grad_norm": 4.092820167541504, + "learning_rate": 3.073052127936155e-05, + "loss": 0.4536, "step": 509600 }, { - "epoch": 5.19, - "learning_rate": 4.588782945250888e-05, - "loss": 0.6442, + "epoch": 7.022402248491361, + "grad_norm": 14.47374439239502, + "learning_rate": 3.072256119924971e-05, + "loss": 0.4627, "step": 509700 }, { - "epoch": 5.19, - "learning_rate": 4.5882202107170117e-05, - "loss": 0.6469, + "epoch": 7.023780000551101, + "grad_norm": 5.377509117126465, + "learning_rate": 3.071460084536479e-05, + "loss": 0.4174, "step": 509800 }, { - "epoch": 5.19, - "learning_rate": 4.587657398528646e-05, - "loss": 0.5413, + "epoch": 7.02515775261084, + "grad_norm": 6.398963928222656, + "learning_rate": 3.0706640218383096e-05, + "loss": 0.4497, "step": 509900 }, { - "epoch": 5.2, - "learning_rate": 4.587094508713308e-05, - "loss": 0.7031, + "epoch": 7.026535504670579, + "grad_norm": 1.2777804136276245, + "learning_rate": 3.06986793189809e-05, + "loss": 0.4446, "step": 510000 }, { - "epoch": 5.2, - "learning_rate": 4.5865315412985195e-05, - "loss": 0.6487, + "epoch": 7.027913256730319, + "grad_norm": 3.766404390335083, + "learning_rate": 3.0690718147834534e-05, + "loss": 0.3517, "step": 510100 }, { - "epoch": 5.2, - "learning_rate": 4.5859684963118074e-05, - "loss": 0.6169, + "epoch": 7.029291008790058, + "grad_norm": 5.022616386413574, + "learning_rate": 3.0682756705620346e-05, + "loss": 0.4452, "step": 510200 }, { - "epoch": 5.2, - "learning_rate": 4.585405373780699e-05, - "loss": 0.6437, + "epoch": 7.030668760849798, + "grad_norm": 3.8178772926330566, + "learning_rate": 3.067479499301468e-05, + "loss": 0.3791, "step": 510300 }, { - "epoch": 5.2, - "learning_rate": 4.5848421737327304e-05, - "loss": 0.6578, + "epoch": 7.032046512909536, + "grad_norm": 2.398712158203125, + "learning_rate": 3.0666912631850016e-05, + "loss": 0.3962, "step": 510400 }, { - "epoch": 5.2, - "learning_rate": 4.584278896195437e-05, - "loss": 0.6294, + "epoch": 7.033424264969276, + "grad_norm": 1.932659387588501, + "learning_rate": 3.0658950383177663e-05, + "loss": 0.436, "step": 510500 }, { - "epoch": 5.2, - "learning_rate": 4.583715541196359e-05, - "loss": 0.6365, + "epoch": 7.0348020170290155, + "grad_norm": 1.6256223917007446, + "learning_rate": 3.0650987866136304e-05, + "loss": 0.4505, "step": 510600 }, { - "epoch": 5.2, - "learning_rate": 4.583152108763044e-05, - "loss": 0.6542, + "epoch": 7.036179769088755, + "grad_norm": 3.6010141372680664, + "learning_rate": 3.064302508140239e-05, + "loss": 0.3881, "step": 510700 }, { - "epoch": 5.2, - "learning_rate": 4.582588598923036e-05, - "loss": 0.6135, + "epoch": 7.037557521148494, + "grad_norm": 21.118865966796875, + "learning_rate": 3.0635062029652445e-05, + "loss": 0.457, "step": 510800 }, { - "epoch": 5.21, - "learning_rate": 4.5820250117038915e-05, - "loss": 0.5599, + "epoch": 7.038935273208233, + "grad_norm": 3.9681448936462402, + "learning_rate": 3.062709871156292e-05, + "loss": 0.4641, "step": 510900 }, { - "epoch": 5.21, - "learning_rate": 4.581461347133166e-05, - "loss": 0.6369, + "epoch": 7.040313025267973, + "grad_norm": 3.96144962310791, + "learning_rate": 3.0619135127810376e-05, + "loss": 0.4246, "step": 511000 }, { - "epoch": 5.21, - "learning_rate": 4.580897605238416e-05, - "loss": 0.5911, + "epoch": 7.041690777327712, + "grad_norm": 7.626237392425537, + "learning_rate": 3.061117127907134e-05, + "loss": 0.4033, "step": 511100 }, { - "epoch": 5.21, - "learning_rate": 4.580333786047209e-05, - "loss": 0.6171, + "epoch": 7.043068529387451, + "grad_norm": 1.7304155826568604, + "learning_rate": 3.060320716602238e-05, + "loss": 0.4325, "step": 511200 }, { - "epoch": 5.21, - "learning_rate": 4.579769889587111e-05, - "loss": 0.598, + "epoch": 7.044446281447191, + "grad_norm": 3.747910976409912, + "learning_rate": 3.0595242789340106e-05, + "loss": 0.3817, "step": 511300 }, { - "epoch": 5.21, - "learning_rate": 4.5792115560049605e-05, - "loss": 0.6592, + "epoch": 7.04582403350693, + "grad_norm": 2.054321050643921, + "learning_rate": 3.058727814970111e-05, + "loss": 0.4156, "step": 511400 }, { - "epoch": 5.21, - "learning_rate": 4.5786475058617984e-05, - "loss": 0.6675, + "epoch": 7.04720178556667, + "grad_norm": 12.061721801757812, + "learning_rate": 3.057931324778205e-05, + "loss": 0.4536, "step": 511500 }, { - "epoch": 5.21, - "learning_rate": 4.5780833785321926e-05, - "loss": 0.6724, + "epoch": 7.048579537626408, + "grad_norm": 2.83654522895813, + "learning_rate": 3.057134808425958e-05, + "loss": 0.3963, "step": 511600 }, { - "epoch": 5.21, - "learning_rate": 4.5775191740437283e-05, - "loss": 0.691, + "epoch": 7.049957289686148, + "grad_norm": 1.5100481510162354, + "learning_rate": 3.0563382659810365e-05, + "loss": 0.4602, "step": 511700 }, { - "epoch": 5.21, - "learning_rate": 4.5769548924239906e-05, - "loss": 0.6051, + "epoch": 7.0513350417458875, + "grad_norm": 3.5936434268951416, + "learning_rate": 3.0555416975111125e-05, + "loss": 0.4491, "step": 511800 }, { - "epoch": 5.22, - "learning_rate": 4.576390533700569e-05, - "loss": 0.6817, + "epoch": 7.052712793805627, + "grad_norm": 2.6990315914154053, + "learning_rate": 3.0547451030838584e-05, + "loss": 0.4064, "step": 511900 }, { - "epoch": 5.22, - "learning_rate": 4.575826097901058e-05, - "loss": 0.5349, + "epoch": 7.054090545865366, + "grad_norm": 4.937404632568359, + "learning_rate": 3.053948482766949e-05, + "loss": 0.4176, "step": 512000 }, { - "epoch": 5.22, - "learning_rate": 4.5752615850530544e-05, - "loss": 0.6362, + "epoch": 7.055468297925105, + "grad_norm": 1.8632651567459106, + "learning_rate": 3.05315183662806e-05, + "loss": 0.4221, "step": 512100 }, { - "epoch": 5.22, - "learning_rate": 4.5746969951841595e-05, - "loss": 0.5957, + "epoch": 7.056846049984845, + "grad_norm": 3.9911539554595947, + "learning_rate": 3.0523551647348724e-05, + "loss": 0.4019, "step": 512200 }, { - "epoch": 5.22, - "learning_rate": 4.574137975371628e-05, - "loss": 0.5662, + "epoch": 7.058223802044584, + "grad_norm": 61.9040641784668, + "learning_rate": 3.0515584671550668e-05, + "loss": 0.4738, "step": 512300 }, { - "epoch": 5.22, - "learning_rate": 4.5735732323132916e-05, - "loss": 0.5816, + "epoch": 7.059601554104323, + "grad_norm": 3.3290135860443115, + "learning_rate": 3.050761743956326e-05, + "loss": 0.4035, "step": 512400 }, { - "epoch": 5.22, - "learning_rate": 4.573008412316614e-05, - "loss": 0.605, + "epoch": 7.060979306164063, + "grad_norm": 2.886779308319092, + "learning_rate": 3.049964995206337e-05, + "loss": 0.4099, "step": 512500 }, { - "epoch": 5.22, - "learning_rate": 4.572443515409213e-05, - "loss": 0.7491, + "epoch": 7.062357058223802, + "grad_norm": 3.3119235038757324, + "learning_rate": 3.0491682209727864e-05, + "loss": 0.4687, "step": 512600 }, { - "epoch": 5.22, - "learning_rate": 4.5718785416187076e-05, - "loss": 0.6556, + "epoch": 7.063734810283542, + "grad_norm": 2.0905604362487793, + "learning_rate": 3.048371421323366e-05, + "loss": 0.4001, "step": 512700 }, { - "epoch": 5.22, - "learning_rate": 4.5713134909727217e-05, - "loss": 0.5915, + "epoch": 7.0651125623432804, + "grad_norm": 4.203014850616455, + "learning_rate": 3.0475745963257655e-05, + "loss": 0.4361, "step": 512800 }, { - "epoch": 5.23, - "learning_rate": 4.5707483634988836e-05, - "loss": 0.7017, + "epoch": 7.06649031440302, + "grad_norm": 3.405407667160034, + "learning_rate": 3.0467777460476818e-05, + "loss": 0.4542, "step": 512900 }, { - "epoch": 5.23, - "learning_rate": 4.570183159224823e-05, - "loss": 0.5975, + "epoch": 7.06786806646276, + "grad_norm": 0.03760630637407303, + "learning_rate": 3.04598087055681e-05, + "loss": 0.4496, "step": 513000 }, { - "epoch": 5.23, - "learning_rate": 4.569617878178178e-05, - "loss": 0.5428, + "epoch": 7.069245818522499, + "grad_norm": 3.6018900871276855, + "learning_rate": 3.0451839699208485e-05, + "loss": 0.4487, "step": 513100 }, { - "epoch": 5.23, - "learning_rate": 4.5690525203865856e-05, - "loss": 0.5692, + "epoch": 7.070623570582238, + "grad_norm": 15.080385208129883, + "learning_rate": 3.044387044207499e-05, + "loss": 0.3994, "step": 513200 }, { - "epoch": 5.23, - "learning_rate": 4.568487085877688e-05, - "loss": 0.6423, + "epoch": 7.072001322641977, + "grad_norm": 8.081743240356445, + "learning_rate": 3.0435900934844637e-05, + "loss": 0.4801, "step": 513300 }, { - "epoch": 5.23, - "learning_rate": 4.5679215746791323e-05, - "loss": 0.5668, + "epoch": 7.073379074701717, + "grad_norm": 6.781630992889404, + "learning_rate": 3.0427931178194484e-05, + "loss": 0.474, "step": 513400 }, { - "epoch": 5.23, - "learning_rate": 4.567355986818569e-05, - "loss": 0.6039, + "epoch": 7.0747568267614565, + "grad_norm": 4.604904651641846, + "learning_rate": 3.0419961172801592e-05, + "loss": 0.4059, "step": 513500 }, { - "epoch": 5.23, - "learning_rate": 4.56679032232365e-05, - "loss": 0.6207, + "epoch": 7.076134578821195, + "grad_norm": 5.984791278839111, + "learning_rate": 3.0411990919343068e-05, + "loss": 0.4695, "step": 513600 }, { - "epoch": 5.23, - "learning_rate": 4.566224581222035e-05, - "loss": 0.5873, + "epoch": 7.077512330880935, + "grad_norm": 9.89235782623291, + "learning_rate": 3.0404020418496015e-05, + "loss": 0.4242, "step": 513700 }, { - "epoch": 5.23, - "learning_rate": 4.565658763541386e-05, - "loss": 0.6084, + "epoch": 7.078890082940674, + "grad_norm": 1.737723708152771, + "learning_rate": 3.039604967093757e-05, + "loss": 0.492, "step": 513800 }, { - "epoch": 5.24, - "learning_rate": 4.565092869309365e-05, - "loss": 0.6509, + "epoch": 7.080267835000413, + "grad_norm": 3.33735990524292, + "learning_rate": 3.0388078677344887e-05, + "loss": 0.4114, "step": 513900 }, { - "epoch": 5.24, - "learning_rate": 4.5645268985536435e-05, - "loss": 0.511, + "epoch": 7.0816455870601525, + "grad_norm": 2.9308416843414307, + "learning_rate": 3.0380107438395146e-05, + "loss": 0.4311, "step": 514000 }, { - "epoch": 5.24, - "learning_rate": 4.563960851301894e-05, - "loss": 0.6214, + "epoch": 7.083023339119892, + "grad_norm": 12.007293701171875, + "learning_rate": 3.037213595476555e-05, + "loss": 0.3914, "step": 514100 }, { - "epoch": 5.24, - "learning_rate": 4.5633947275817914e-05, - "loss": 0.5757, + "epoch": 7.084401091179632, + "grad_norm": 2.809816360473633, + "learning_rate": 3.0364164227133306e-05, + "loss": 0.4261, "step": 514200 }, { - "epoch": 5.24, - "learning_rate": 4.5628285274210156e-05, - "loss": 0.5345, + "epoch": 7.08577884323937, + "grad_norm": 10.038213729858398, + "learning_rate": 3.035619225617565e-05, + "loss": 0.4254, "step": 514300 }, { - "epoch": 5.24, - "learning_rate": 4.562262250847252e-05, - "loss": 0.7158, + "epoch": 7.08715659529911, + "grad_norm": 3.3328492641448975, + "learning_rate": 3.0348220042569863e-05, + "loss": 0.4583, "step": 514400 }, { - "epoch": 5.24, - "learning_rate": 4.561695897888188e-05, - "loss": 0.6437, + "epoch": 7.088534347358849, + "grad_norm": 1.0093694925308228, + "learning_rate": 3.0340247586993202e-05, + "loss": 0.4324, "step": 514500 }, { - "epoch": 5.24, - "learning_rate": 4.561135133242559e-05, - "loss": 0.6941, + "epoch": 7.089912099418589, + "grad_norm": 5.072347640991211, + "learning_rate": 3.0332354618283865e-05, + "loss": 0.4712, "step": 514600 }, { - "epoch": 5.24, - "learning_rate": 4.5605686283591324e-05, - "loss": 0.6419, + "epoch": 7.091289851478328, + "grad_norm": 4.36110258102417, + "learning_rate": 3.0324381683200216e-05, + "loss": 0.5111, "step": 514700 }, { - "epoch": 5.24, - "learning_rate": 4.560002047173212e-05, - "loss": 0.7149, + "epoch": 7.092667603538067, + "grad_norm": 7.218010902404785, + "learning_rate": 3.0316408508170893e-05, + "loss": 0.4284, "step": 514800 }, { - "epoch": 5.25, - "learning_rate": 4.559435389712502e-05, - "loss": 0.5452, + "epoch": 7.094045355597807, + "grad_norm": 10.516966819763184, + "learning_rate": 3.030843509387325e-05, + "loss": 0.4565, "step": 514900 }, { - "epoch": 5.25, - "learning_rate": 4.5588686560047086e-05, - "loss": 0.6719, + "epoch": 7.095423107657546, + "grad_norm": 2.4566447734832764, + "learning_rate": 3.0300461440984687e-05, + "loss": 0.4645, "step": 515000 }, { - "epoch": 5.25, - "learning_rate": 4.558301846077538e-05, - "loss": 0.6272, + "epoch": 7.096800859717285, + "grad_norm": 2.3622419834136963, + "learning_rate": 3.0292487550182597e-05, + "loss": 0.4296, "step": 515100 }, { - "epoch": 5.25, - "learning_rate": 4.5577349599587094e-05, - "loss": 0.5727, + "epoch": 7.0981786117770245, + "grad_norm": 4.899537563323975, + "learning_rate": 3.0284593164596874e-05, + "loss": 0.4866, "step": 515200 }, { - "epoch": 5.25, - "learning_rate": 4.557167997675935e-05, - "loss": 0.5979, + "epoch": 7.099556363836764, + "grad_norm": 8.881457328796387, + "learning_rate": 3.027661880236225e-05, + "loss": 0.3999, "step": 515300 }, { - "epoch": 5.25, - "learning_rate": 4.55660095925694e-05, - "loss": 0.6044, + "epoch": 7.100934115896504, + "grad_norm": 6.3609747886657715, + "learning_rate": 3.0268644204239648e-05, + "loss": 0.4221, "step": 515400 }, { - "epoch": 5.25, - "learning_rate": 4.5560338447294466e-05, - "loss": 0.6781, + "epoch": 7.102311867956242, + "grad_norm": 2.240654230117798, + "learning_rate": 3.0260669370906568e-05, + "loss": 0.4363, "step": 515500 }, { - "epoch": 5.25, - "learning_rate": 4.555466654121184e-05, - "loss": 0.6644, + "epoch": 7.103689620015982, + "grad_norm": 9.152902603149414, + "learning_rate": 3.0252694303040497e-05, + "loss": 0.4064, "step": 515600 }, { - "epoch": 5.25, - "learning_rate": 4.554899387459885e-05, - "loss": 0.625, + "epoch": 7.105067372075721, + "grad_norm": 2.1131410598754883, + "learning_rate": 3.0244719001318956e-05, + "loss": 0.4849, "step": 515700 }, { - "epoch": 5.26, - "learning_rate": 4.554332044773285e-05, - "loss": 0.6293, + "epoch": 7.106445124135461, + "grad_norm": 6.97761344909668, + "learning_rate": 3.02367434664195e-05, + "loss": 0.4491, "step": 515800 }, { - "epoch": 5.26, - "learning_rate": 4.553764626089123e-05, - "loss": 0.6183, + "epoch": 7.1078228761952, + "grad_norm": 9.90329647064209, + "learning_rate": 3.0228767699019687e-05, + "loss": 0.4905, "step": 515900 }, { - "epoch": 5.26, - "learning_rate": 4.553197131435145e-05, - "loss": 0.5868, + "epoch": 7.109200628254939, + "grad_norm": 12.74622631072998, + "learning_rate": 3.02207916997971e-05, + "loss": 0.4048, "step": 516000 }, { - "epoch": 5.26, - "learning_rate": 4.552629560839094e-05, - "loss": 0.6978, + "epoch": 7.110578380314679, + "grad_norm": 14.02765941619873, + "learning_rate": 3.0212815469429326e-05, + "loss": 0.421, "step": 516100 }, { - "epoch": 5.26, - "learning_rate": 4.5520619143287235e-05, - "loss": 0.6151, + "epoch": 7.111956132374418, + "grad_norm": 2.7014546394348145, + "learning_rate": 3.0204839008594012e-05, + "loss": 0.365, "step": 516200 }, { - "epoch": 5.26, - "learning_rate": 4.5514941919317884e-05, - "loss": 0.6742, + "epoch": 7.113333884434157, + "grad_norm": 2.474792003631592, + "learning_rate": 3.019686231796878e-05, + "loss": 0.4447, "step": 516300 }, { - "epoch": 5.26, - "learning_rate": 4.550926393676047e-05, - "loss": 0.6533, + "epoch": 7.1147116364938965, + "grad_norm": 3.366610288619995, + "learning_rate": 3.01888853982313e-05, + "loss": 0.4626, "step": 516400 }, { - "epoch": 5.26, - "learning_rate": 4.550358519589259e-05, - "loss": 0.5821, + "epoch": 7.116089388553636, + "grad_norm": 1.4523404836654663, + "learning_rate": 3.0180908250059247e-05, + "loss": 0.412, "step": 516500 }, { - "epoch": 5.26, - "learning_rate": 4.549796249573228e-05, - "loss": 0.5985, + "epoch": 7.117467140613376, + "grad_norm": 1.7142333984375, + "learning_rate": 3.0172930874130323e-05, + "loss": 0.4165, "step": 516600 }, { - "epoch": 5.26, - "learning_rate": 4.5492282246652676e-05, - "loss": 0.6606, + "epoch": 7.118844892673114, + "grad_norm": 4.076796531677246, + "learning_rate": 3.0164953271122248e-05, + "loss": 0.4068, "step": 516700 }, { - "epoch": 5.27, - "learning_rate": 4.5486601240092934e-05, - "loss": 0.7584, + "epoch": 7.120222644732854, + "grad_norm": 3.382675886154175, + "learning_rate": 3.0156975441712762e-05, + "loss": 0.4397, "step": 516800 }, { - "epoch": 5.27, - "learning_rate": 4.54809194763308e-05, - "loss": 0.5426, + "epoch": 7.121600396792593, + "grad_norm": 3.0515313148498535, + "learning_rate": 3.014899738657961e-05, + "loss": 0.4513, "step": 516900 }, { - "epoch": 5.27, - "learning_rate": 4.54752369556441e-05, - "loss": 0.5782, + "epoch": 7.122978148852333, + "grad_norm": 7.7009711265563965, + "learning_rate": 3.0141019106400586e-05, + "loss": 0.4646, "step": 517000 }, { - "epoch": 5.27, - "learning_rate": 4.546955367831066e-05, - "loss": 0.663, + "epoch": 7.124355900912072, + "grad_norm": 2.972747325897217, + "learning_rate": 3.0133040601853478e-05, + "loss": 0.4453, "step": 517100 }, { - "epoch": 5.27, - "learning_rate": 4.546386964460836e-05, - "loss": 0.6505, + "epoch": 7.125733652971811, + "grad_norm": 4.326071739196777, + "learning_rate": 3.0125061873616094e-05, + "loss": 0.4013, "step": 517200 }, { - "epoch": 5.27, - "learning_rate": 4.545818485481513e-05, - "loss": 0.6099, + "epoch": 7.127111405031551, + "grad_norm": 4.487992763519287, + "learning_rate": 3.0117082922366266e-05, + "loss": 0.4091, "step": 517300 }, { - "epoch": 5.27, - "learning_rate": 4.54524993092089e-05, - "loss": 0.6174, + "epoch": 7.12848915709129, + "grad_norm": 2.09557843208313, + "learning_rate": 3.0109103748781877e-05, + "loss": 0.4303, "step": 517400 }, { - "epoch": 5.27, - "learning_rate": 4.544681300806766e-05, - "loss": 0.6313, + "epoch": 7.129866909151029, + "grad_norm": 12.754274368286133, + "learning_rate": 3.0101124353540762e-05, + "loss": 0.4457, "step": 517500 }, { - "epoch": 5.27, - "learning_rate": 4.5441125951669464e-05, - "loss": 0.6208, + "epoch": 7.1312446612107685, + "grad_norm": 3.8575031757354736, + "learning_rate": 3.0093144737320827e-05, + "loss": 0.4703, "step": 517600 }, { - "epoch": 5.27, - "learning_rate": 4.543543814029234e-05, - "loss": 0.562, + "epoch": 7.132622413270508, + "grad_norm": 4.321773052215576, + "learning_rate": 3.0085164900799986e-05, + "loss": 0.456, "step": 517700 }, { - "epoch": 5.28, - "learning_rate": 4.542974957421442e-05, - "loss": 0.6162, + "epoch": 7.134000165330248, + "grad_norm": 8.63968563079834, + "learning_rate": 3.0077184844656153e-05, + "loss": 0.4482, "step": 517800 }, { - "epoch": 5.28, - "learning_rate": 4.542406025371383e-05, - "loss": 0.6186, + "epoch": 7.135377917389986, + "grad_norm": 2.6466023921966553, + "learning_rate": 3.0069204569567286e-05, + "loss": 0.402, "step": 517900 }, { - "epoch": 5.28, - "learning_rate": 4.541837017906871e-05, - "loss": 0.5819, + "epoch": 7.136755669449726, + "grad_norm": 8.302736282348633, + "learning_rate": 3.0061224076211354e-05, + "loss": 0.4454, "step": 518000 }, { - "epoch": 5.28, - "learning_rate": 4.541267935055733e-05, - "loss": 0.6957, + "epoch": 7.138133421509465, + "grad_norm": 0.9995610117912292, + "learning_rate": 3.0053243365266326e-05, + "loss": 0.3898, "step": 518100 }, { - "epoch": 5.28, - "learning_rate": 4.5406987768457894e-05, - "loss": 0.697, + "epoch": 7.139511173569204, + "grad_norm": 55.964229583740234, + "learning_rate": 3.0045262437410217e-05, + "loss": 0.471, "step": 518200 }, { - "epoch": 5.28, - "learning_rate": 4.54012954330487e-05, - "loss": 0.6091, + "epoch": 7.140888925628944, + "grad_norm": 3.818483352661133, + "learning_rate": 3.0037361105830053e-05, + "loss": 0.4831, "step": 518300 }, { - "epoch": 5.28, - "learning_rate": 4.539560234460808e-05, - "loss": 0.5423, + "epoch": 7.142266677688683, + "grad_norm": 3.3601295948028564, + "learning_rate": 3.0029379748338044e-05, + "loss": 0.4791, "step": 518400 }, { - "epoch": 5.28, - "learning_rate": 4.538990850341437e-05, - "loss": 0.6348, + "epoch": 7.143644429748423, + "grad_norm": 3.047697067260742, + "learning_rate": 3.0021398175962284e-05, + "loss": 0.4197, "step": 518500 }, { - "epoch": 5.28, - "learning_rate": 4.538421390974598e-05, - "loss": 0.5592, + "epoch": 7.145022181808161, + "grad_norm": 1.284662127494812, + "learning_rate": 3.0013416389380846e-05, + "loss": 0.3649, "step": 518600 }, { - "epoch": 5.28, - "learning_rate": 4.5378518563881334e-05, - "loss": 0.6101, + "epoch": 7.146399933867901, + "grad_norm": 5.258458614349365, + "learning_rate": 3.0005434389271828e-05, + "loss": 0.3916, "step": 518700 }, { - "epoch": 5.29, - "learning_rate": 4.537282246609891e-05, - "loss": 0.6071, + "epoch": 7.1477776859276405, + "grad_norm": 30.18373680114746, + "learning_rate": 2.999745217631335e-05, + "loss": 0.4054, "step": 518800 }, { - "epoch": 5.29, - "learning_rate": 4.536712561667721e-05, - "loss": 0.4949, + "epoch": 7.14915543798738, + "grad_norm": 7.279288291931152, + "learning_rate": 2.9989469751183526e-05, + "loss": 0.485, "step": 518900 }, { - "epoch": 5.29, - "learning_rate": 4.5361428015894776e-05, - "loss": 0.63, + "epoch": 7.150533190047119, + "grad_norm": 1.3396530151367188, + "learning_rate": 2.998148711456051e-05, + "loss": 0.4461, "step": 519000 }, { - "epoch": 5.29, - "learning_rate": 4.535572966403017e-05, - "loss": 0.5384, + "epoch": 7.151910942106858, + "grad_norm": 34.63507080078125, + "learning_rate": 2.997350426712247e-05, + "loss": 0.4164, "step": 519100 }, { - "epoch": 5.29, - "learning_rate": 4.535003056136203e-05, - "loss": 0.5711, + "epoch": 7.153288694166598, + "grad_norm": 5.330453872680664, + "learning_rate": 2.9965521209547576e-05, + "loss": 0.4527, "step": 519200 }, { - "epoch": 5.29, - "learning_rate": 4.5344330708169e-05, - "loss": 0.5931, + "epoch": 7.154666446226337, + "grad_norm": 1.974777102470398, + "learning_rate": 2.995753794251406e-05, + "loss": 0.4552, "step": 519300 }, { - "epoch": 5.29, - "learning_rate": 4.5338630104729764e-05, - "loss": 0.6595, + "epoch": 7.156044198286076, + "grad_norm": 3.270253896713257, + "learning_rate": 2.9949554466700117e-05, + "loss": 0.4305, "step": 519400 }, { - "epoch": 5.29, - "learning_rate": 4.533292875132306e-05, - "loss": 0.6582, + "epoch": 7.157421950345816, + "grad_norm": 27.618839263916016, + "learning_rate": 2.9941570782783983e-05, + "loss": 0.4638, "step": 519500 }, { - "epoch": 5.29, - "learning_rate": 4.532722664822764e-05, - "loss": 0.6752, + "epoch": 7.158799702405555, + "grad_norm": 3.824273109436035, + "learning_rate": 2.993358689144393e-05, + "loss": 0.4341, "step": 519600 }, { - "epoch": 5.29, - "learning_rate": 4.53215237957223e-05, - "loss": 0.7087, + "epoch": 7.160177454465295, + "grad_norm": 1.5950987339019775, + "learning_rate": 2.9925602793358213e-05, + "loss": 0.4206, "step": 519700 }, { - "epoch": 5.3, - "learning_rate": 4.531587723380954e-05, - "loss": 0.618, + "epoch": 7.161555206525033, + "grad_norm": 151.4424285888672, + "learning_rate": 2.991761848920513e-05, + "loss": 0.37, "step": 519800 }, { - "epoch": 5.3, - "learning_rate": 4.531017289080805e-05, - "loss": 0.5747, + "epoch": 7.162932958584773, + "grad_norm": 5.675760746002197, + "learning_rate": 2.9909633979662995e-05, + "loss": 0.3889, "step": 519900 }, { - "epoch": 5.3, - "learning_rate": 4.530446779923048e-05, - "loss": 0.6026, + "epoch": 7.1643107106445125, + "grad_norm": 2.904588460922241, + "learning_rate": 2.990164926541012e-05, + "loss": 0.517, "step": 520000 }, { - "epoch": 5.3, - "learning_rate": 4.5298761959355764e-05, - "loss": 0.5242, + "epoch": 7.165688462704252, + "grad_norm": 1.7478234767913818, + "learning_rate": 2.9893664347124852e-05, + "loss": 0.4609, "step": 520100 }, { - "epoch": 5.3, - "learning_rate": 4.529305537146289e-05, - "loss": 0.6124, + "epoch": 7.167066214763991, + "grad_norm": 4.9433274269104, + "learning_rate": 2.9885679225485554e-05, + "loss": 0.4542, "step": 520200 }, { - "epoch": 5.3, - "learning_rate": 4.5287348035830877e-05, - "loss": 0.6456, + "epoch": 7.16844396682373, + "grad_norm": 1.3795069456100464, + "learning_rate": 2.9877693901170602e-05, + "loss": 0.4164, "step": 520300 }, { - "epoch": 5.3, - "learning_rate": 4.5281639952738766e-05, - "loss": 0.5777, + "epoch": 7.16982171888347, + "grad_norm": 3.183743953704834, + "learning_rate": 2.9869708374858393e-05, + "loss": 0.4676, "step": 520400 }, { - "epoch": 5.3, - "learning_rate": 4.5275931122465666e-05, - "loss": 0.6726, + "epoch": 7.171199470943209, + "grad_norm": 5.469613075256348, + "learning_rate": 2.9861722647227324e-05, + "loss": 0.4687, "step": 520500 }, { - "epoch": 5.3, - "learning_rate": 4.5270221545290683e-05, - "loss": 0.6206, + "epoch": 7.172577223002948, + "grad_norm": 6.8705339431762695, + "learning_rate": 2.985373671895584e-05, + "loss": 0.4248, "step": 520600 }, { - "epoch": 5.3, - "learning_rate": 4.5264511221493e-05, - "loss": 0.6183, + "epoch": 7.173954975062688, + "grad_norm": 2.3069798946380615, + "learning_rate": 2.9845750590722378e-05, + "loss": 0.4291, "step": 520700 }, { - "epoch": 5.31, - "learning_rate": 4.525880015135182e-05, - "loss": 0.5969, + "epoch": 7.175332727122427, + "grad_norm": 3.884575128555298, + "learning_rate": 2.9837764263205408e-05, + "loss": 0.4307, "step": 520800 }, { - "epoch": 5.31, - "learning_rate": 4.525308833514636e-05, - "loss": 0.6297, + "epoch": 7.176710479182167, + "grad_norm": 10.956753730773926, + "learning_rate": 2.98297777370834e-05, + "loss": 0.4638, "step": 520900 }, { - "epoch": 5.31, - "learning_rate": 4.524737577315592e-05, - "loss": 0.6263, + "epoch": 7.178088231241905, + "grad_norm": 3.845417022705078, + "learning_rate": 2.9821791013034853e-05, + "loss": 0.391, "step": 521000 }, { - "epoch": 5.31, - "learning_rate": 4.524166246565979e-05, - "loss": 0.5519, + "epoch": 7.179465983301645, + "grad_norm": 3.302147388458252, + "learning_rate": 2.981380409173828e-05, + "loss": 0.4312, "step": 521100 }, { - "epoch": 5.31, - "learning_rate": 4.5235948412937327e-05, - "loss": 0.5599, + "epoch": 7.1808437353613845, + "grad_norm": 2.3778023719787598, + "learning_rate": 2.9805816973872217e-05, + "loss": 0.4814, "step": 521200 }, { - "epoch": 5.31, - "learning_rate": 4.52302336152679e-05, - "loss": 0.6462, + "epoch": 7.182221487421124, + "grad_norm": 1.9851925373077393, + "learning_rate": 2.97978296601152e-05, + "loss": 0.366, "step": 521300 }, { - "epoch": 5.31, - "learning_rate": 4.522451807293094e-05, - "loss": 0.6443, + "epoch": 7.183599239480863, + "grad_norm": 3.7024972438812256, + "learning_rate": 2.97898421511458e-05, + "loss": 0.4017, "step": 521400 }, { - "epoch": 5.31, - "learning_rate": 4.521880178620591e-05, - "loss": 0.5655, + "epoch": 7.184976991540602, + "grad_norm": 5.1820149421691895, + "learning_rate": 2.97818544476426e-05, + "loss": 0.4547, "step": 521500 }, { - "epoch": 5.31, - "learning_rate": 4.521308475537229e-05, - "loss": 0.7267, + "epoch": 7.186354743600342, + "grad_norm": 1.1525753736495972, + "learning_rate": 2.977386655028418e-05, + "loss": 0.4352, "step": 521600 }, { - "epoch": 5.32, - "learning_rate": 4.5207366980709604e-05, - "loss": 0.5959, + "epoch": 7.187732495660081, + "grad_norm": 0.24104857444763184, + "learning_rate": 2.9765878459749174e-05, + "loss": 0.4042, "step": 521700 }, { - "epoch": 5.32, - "learning_rate": 4.520164846249745e-05, - "loss": 0.6693, + "epoch": 7.18911024771982, + "grad_norm": 5.256969451904297, + "learning_rate": 2.9757890176716194e-05, + "loss": 0.4602, "step": 521800 }, { - "epoch": 5.32, - "learning_rate": 4.519592920101538e-05, - "loss": 0.6673, + "epoch": 7.19048799977956, + "grad_norm": 2.743558168411255, + "learning_rate": 2.974990170186389e-05, + "loss": 0.4433, "step": 521900 }, { - "epoch": 5.32, - "learning_rate": 4.519026640026467e-05, - "loss": 0.6553, + "epoch": 7.191865751839299, + "grad_norm": 6.436760902404785, + "learning_rate": 2.974199292347478e-05, + "loss": 0.4772, "step": 522000 }, { - "epoch": 5.32, - "learning_rate": 4.5184545660507494e-05, - "loss": 0.6814, + "epoch": 7.193243503899039, + "grad_norm": 0.617780327796936, + "learning_rate": 2.9734004068921097e-05, + "loss": 0.4326, "step": 522100 }, { - "epoch": 5.32, - "learning_rate": 4.517882417831666e-05, - "loss": 0.6389, + "epoch": 7.194621255958777, + "grad_norm": 2.4693448543548584, + "learning_rate": 2.9726015024577336e-05, + "loss": 0.4073, "step": 522200 }, { - "epoch": 5.32, - "learning_rate": 4.517310195397189e-05, - "loss": 0.6398, + "epoch": 7.195999008018517, + "grad_norm": 6.432834625244141, + "learning_rate": 2.9718025791122218e-05, + "loss": 0.4747, "step": 522300 }, { - "epoch": 5.32, - "learning_rate": 4.5167378987753e-05, - "loss": 0.6461, + "epoch": 7.1973767600782566, + "grad_norm": 2.1386616230010986, + "learning_rate": 2.971011626438385e-05, + "loss": 0.4159, "step": 522400 }, { - "epoch": 5.32, - "learning_rate": 4.516165527993978e-05, - "loss": 0.6148, + "epoch": 7.198754512137995, + "grad_norm": 4.946809768676758, + "learning_rate": 2.970212665661637e-05, + "loss": 0.4102, "step": 522500 }, { - "epoch": 5.32, - "learning_rate": 4.51559308308121e-05, - "loss": 0.643, + "epoch": 7.200132264197735, + "grad_norm": 0.419688880443573, + "learning_rate": 2.9694136861766973e-05, + "loss": 0.4213, "step": 522600 }, { - "epoch": 5.33, - "learning_rate": 4.515020564064985e-05, - "loss": 0.543, + "epoch": 7.201510016257474, + "grad_norm": 5.979204177856445, + "learning_rate": 2.968614688051442e-05, + "loss": 0.4521, "step": 522700 }, { - "epoch": 5.33, - "learning_rate": 4.514447970973296e-05, - "loss": 0.6343, + "epoch": 7.202887768317214, + "grad_norm": 14.80136489868164, + "learning_rate": 2.9678156713537505e-05, + "loss": 0.4867, "step": 522800 }, { - "epoch": 5.33, - "learning_rate": 4.513875303834139e-05, - "loss": 0.5857, + "epoch": 7.204265520376953, + "grad_norm": 3.5998952388763428, + "learning_rate": 2.9670166361515034e-05, + "loss": 0.4268, "step": 522900 }, { - "epoch": 5.33, - "learning_rate": 4.5133025626755136e-05, - "loss": 0.6159, + "epoch": 7.205643272436692, + "grad_norm": 7.841350555419922, + "learning_rate": 2.9662175825125823e-05, + "loss": 0.4179, "step": 523000 }, { - "epoch": 5.33, - "learning_rate": 4.512729747525424e-05, - "loss": 0.6061, + "epoch": 7.207021024496432, + "grad_norm": 3.468855619430542, + "learning_rate": 2.9654185105048718e-05, + "loss": 0.495, "step": 523100 }, { - "epoch": 5.33, - "learning_rate": 4.512156858411876e-05, - "loss": 0.6386, + "epoch": 7.208398776556171, + "grad_norm": 3.9456002712249756, + "learning_rate": 2.964619420196258e-05, + "loss": 0.4701, "step": 523200 }, { - "epoch": 5.33, - "learning_rate": 4.511583895362883e-05, - "loss": 0.6936, + "epoch": 7.20977652861591, + "grad_norm": 18.172672271728516, + "learning_rate": 2.9638203116546247e-05, + "loss": 0.4695, "step": 523300 }, { - "epoch": 5.33, - "learning_rate": 4.5110108584064585e-05, - "loss": 0.46, + "epoch": 7.2111542806756495, + "grad_norm": 21.43109893798828, + "learning_rate": 2.9630211849478623e-05, + "loss": 0.4594, "step": 523400 }, { - "epoch": 5.33, - "learning_rate": 4.510437747570619e-05, - "loss": 0.6174, + "epoch": 7.212532032735389, + "grad_norm": 13.627384185791016, + "learning_rate": 2.962222040143861e-05, + "loss": 0.3979, "step": 523500 }, { - "epoch": 5.33, - "learning_rate": 4.509864562883388e-05, - "loss": 0.5373, + "epoch": 7.213909784795129, + "grad_norm": 3.7063870429992676, + "learning_rate": 2.9614228773105113e-05, + "loss": 0.4841, "step": 523600 }, { - "epoch": 5.34, - "learning_rate": 4.5092913043727905e-05, - "loss": 0.6647, + "epoch": 7.215287536854867, + "grad_norm": 4.228190898895264, + "learning_rate": 2.9606236965157075e-05, + "loss": 0.4706, "step": 523700 }, { - "epoch": 5.34, - "learning_rate": 4.5087179720668554e-05, - "loss": 0.6418, + "epoch": 7.216665288914607, + "grad_norm": 2.6452341079711914, + "learning_rate": 2.959824497827342e-05, + "loss": 0.4628, "step": 523800 }, { - "epoch": 5.34, - "learning_rate": 4.508144565993614e-05, - "loss": 0.6397, + "epoch": 7.218043040974346, + "grad_norm": 2.161000967025757, + "learning_rate": 2.9590332735664657e-05, + "loss": 0.4914, "step": 523900 }, { - "epoch": 5.34, - "learning_rate": 4.507571086181104e-05, - "loss": 0.5518, + "epoch": 7.219420793034086, + "grad_norm": 2.0252413749694824, + "learning_rate": 2.958234039471911e-05, + "loss": 0.4206, "step": 524000 }, { - "epoch": 5.34, - "learning_rate": 4.5070032685573805e-05, - "loss": 0.7298, + "epoch": 7.220798545093825, + "grad_norm": 2.671046495437622, + "learning_rate": 2.9574347876868095e-05, + "loss": 0.4021, "step": 524100 }, { - "epoch": 5.34, - "learning_rate": 4.506429642087149e-05, - "loss": 0.7373, + "epoch": 7.222176297153564, + "grad_norm": 2.3486526012420654, + "learning_rate": 2.9566355182790603e-05, + "loss": 0.4204, "step": 524200 }, { - "epoch": 5.34, - "learning_rate": 4.5058559419614976e-05, - "loss": 0.7065, + "epoch": 7.223554049213304, + "grad_norm": 5.901416778564453, + "learning_rate": 2.955836231316568e-05, + "loss": 0.412, "step": 524300 }, { - "epoch": 5.34, - "learning_rate": 4.505282168208476e-05, - "loss": 0.5687, + "epoch": 7.224931801273043, + "grad_norm": 9.44016170501709, + "learning_rate": 2.955036926867233e-05, + "loss": 0.4258, "step": 524400 }, { - "epoch": 5.34, - "learning_rate": 4.5047083208561404e-05, - "loss": 0.5042, + "epoch": 7.226309553332782, + "grad_norm": 2.9871771335601807, + "learning_rate": 2.9542376049989646e-05, + "loss": 0.4158, "step": 524500 }, { - "epoch": 5.34, - "learning_rate": 4.504134399932547e-05, - "loss": 0.6122, + "epoch": 7.2276873053925215, + "grad_norm": 14.95018196105957, + "learning_rate": 2.9534382657796653e-05, + "loss": 0.4722, "step": 524600 }, { - "epoch": 5.35, - "learning_rate": 4.5035604054657576e-05, - "loss": 0.4681, + "epoch": 7.229065057452261, + "grad_norm": 1.974859595298767, + "learning_rate": 2.9526389092772434e-05, + "loss": 0.4452, "step": 524700 }, { - "epoch": 5.35, - "learning_rate": 4.502986337483837e-05, - "loss": 0.6303, + "epoch": 7.230442809512001, + "grad_norm": 2.981731653213501, + "learning_rate": 2.9518395355596115e-05, + "loss": 0.4352, "step": 524800 }, { - "epoch": 5.35, - "learning_rate": 4.502412196014855e-05, - "loss": 0.6399, + "epoch": 7.231820561571739, + "grad_norm": 4.314277648925781, + "learning_rate": 2.9510401446946774e-05, + "loss": 0.4807, "step": 524900 }, { - "epoch": 5.35, - "learning_rate": 4.501837981086882e-05, - "loss": 0.556, + "epoch": 7.233198313631479, + "grad_norm": 9.199114799499512, + "learning_rate": 2.950240736750355e-05, + "loss": 0.4366, "step": 525000 }, { - "epoch": 5.35, - "learning_rate": 4.5012636927279936e-05, - "loss": 0.6739, + "epoch": 7.234576065691218, + "grad_norm": 1.1397278308868408, + "learning_rate": 2.9494413117945576e-05, + "loss": 0.4193, "step": 525100 }, { - "epoch": 5.35, - "learning_rate": 4.500695074947139e-05, - "loss": 0.5258, + "epoch": 7.235953817750958, + "grad_norm": 5.191973686218262, + "learning_rate": 2.9486418698951997e-05, + "loss": 0.4511, "step": 525200 }, { - "epoch": 5.35, - "learning_rate": 4.500120640544272e-05, - "loss": 0.6643, + "epoch": 7.237331569810697, + "grad_norm": 2.0410349369049072, + "learning_rate": 2.9478424111201993e-05, + "loss": 0.3834, "step": 525300 }, { - "epoch": 5.35, - "learning_rate": 4.499546132794457e-05, - "loss": 0.5412, + "epoch": 7.238709321870436, + "grad_norm": 3.705223321914673, + "learning_rate": 2.9470429355374737e-05, + "loss": 0.4109, "step": 525400 }, { - "epoch": 5.35, - "learning_rate": 4.4989715517257855e-05, - "loss": 0.5702, + "epoch": 7.240087073930176, + "grad_norm": 4.902582168579102, + "learning_rate": 2.946243443214943e-05, + "loss": 0.416, "step": 525500 }, { - "epoch": 5.35, - "learning_rate": 4.4983968973663506e-05, - "loss": 0.6078, + "epoch": 7.241464825989915, + "grad_norm": 3.0717673301696777, + "learning_rate": 2.9454439342205272e-05, + "loss": 0.4985, "step": 525600 }, { - "epoch": 5.36, - "learning_rate": 4.497822169744249e-05, - "loss": 0.6462, + "epoch": 7.242842578049654, + "grad_norm": 2.995602607727051, + "learning_rate": 2.94464440862215e-05, + "loss": 0.4075, "step": 525700 }, { - "epoch": 5.36, - "learning_rate": 4.497247368887582e-05, - "loss": 0.5826, + "epoch": 7.2442203301093935, + "grad_norm": 4.364591598510742, + "learning_rate": 2.943844866487734e-05, + "loss": 0.4497, "step": 525800 }, { - "epoch": 5.36, - "learning_rate": 4.496672494824453e-05, - "loss": 0.6826, + "epoch": 7.245598082169133, + "grad_norm": 11.302666664123535, + "learning_rate": 2.9430453078852052e-05, + "loss": 0.4524, "step": 525900 }, { - "epoch": 5.36, - "learning_rate": 4.496097547582971e-05, - "loss": 0.5613, + "epoch": 7.246975834228873, + "grad_norm": 3.318530321121216, + "learning_rate": 2.9422457328824896e-05, + "loss": 0.4194, "step": 526000 }, { - "epoch": 5.36, - "learning_rate": 4.4955225271912485e-05, - "loss": 0.5629, + "epoch": 7.248353586288611, + "grad_norm": 4.413473129272461, + "learning_rate": 2.9414461415475154e-05, + "loss": 0.5009, "step": 526100 }, { - "epoch": 5.36, - "learning_rate": 4.494947433677398e-05, - "loss": 0.6689, + "epoch": 7.249731338348351, + "grad_norm": 3.0524516105651855, + "learning_rate": 2.9406465339482126e-05, + "loss": 0.389, "step": 526200 }, { - "epoch": 5.36, - "learning_rate": 4.4943722670695394e-05, - "loss": 0.4877, + "epoch": 7.25110909040809, + "grad_norm": 2.052962064743042, + "learning_rate": 2.9398469101525107e-05, + "loss": 0.4102, "step": 526300 }, { - "epoch": 5.36, - "learning_rate": 4.4937970273957954e-05, - "loss": 0.5553, + "epoch": 7.25248684246783, + "grad_norm": 5.107116222381592, + "learning_rate": 2.9390472702283432e-05, + "loss": 0.4026, "step": 526400 }, { - "epoch": 5.36, - "learning_rate": 4.49322171468429e-05, - "loss": 0.5388, + "epoch": 7.253864594527569, + "grad_norm": 15.659510612487793, + "learning_rate": 2.9382476142436423e-05, + "loss": 0.4649, "step": 526500 }, { - "epoch": 5.37, - "learning_rate": 4.4926463289631556e-05, - "loss": 0.6236, + "epoch": 7.255242346587308, + "grad_norm": 3.75313401222229, + "learning_rate": 2.937447942266344e-05, + "loss": 0.4052, "step": 526600 }, { - "epoch": 5.37, - "learning_rate": 4.4920708702605224e-05, - "loss": 0.679, + "epoch": 7.256620098647048, + "grad_norm": 3.125775098800659, + "learning_rate": 2.936648254364384e-05, + "loss": 0.4134, "step": 526700 }, { - "epoch": 5.37, - "learning_rate": 4.491495338604528e-05, - "loss": 0.6725, + "epoch": 7.257997850706786, + "grad_norm": 3.8388991355895996, + "learning_rate": 2.9358485506056994e-05, + "loss": 0.4775, "step": 526800 }, { - "epoch": 5.37, - "learning_rate": 4.490919734023312e-05, - "loss": 0.6508, + "epoch": 7.259375602766526, + "grad_norm": 4.062264919281006, + "learning_rate": 2.9350488310582303e-05, + "loss": 0.4405, "step": 526900 }, { - "epoch": 5.37, - "learning_rate": 4.490344056545019e-05, - "loss": 0.6255, + "epoch": 7.2607533548262655, + "grad_norm": 3.9900310039520264, + "learning_rate": 2.9342490957899157e-05, + "loss": 0.4336, "step": 527000 }, { - "epoch": 5.37, - "learning_rate": 4.4897683061977945e-05, - "loss": 0.687, + "epoch": 7.262131106886005, + "grad_norm": 4.169195175170898, + "learning_rate": 2.9334493448686982e-05, + "loss": 0.4213, "step": 527100 }, { - "epoch": 5.37, - "learning_rate": 4.4891924830097915e-05, - "loss": 0.5674, + "epoch": 7.263508858945744, + "grad_norm": 0.6121081709861755, + "learning_rate": 2.9326495783625203e-05, + "loss": 0.4636, "step": 527200 }, { - "epoch": 5.37, - "learning_rate": 4.4886165870091625e-05, - "loss": 0.5086, + "epoch": 7.264886611005483, + "grad_norm": 6.136238098144531, + "learning_rate": 2.9318497963393264e-05, + "loss": 0.462, "step": 527300 }, { - "epoch": 5.37, - "learning_rate": 4.4880406182240664e-05, - "loss": 0.6606, + "epoch": 7.266264363065223, + "grad_norm": 5.410613059997559, + "learning_rate": 2.931049998867062e-05, + "loss": 0.4871, "step": 527400 }, { - "epoch": 5.37, - "learning_rate": 4.487464576682664e-05, - "loss": 0.6007, + "epoch": 7.267642115124962, + "grad_norm": 5.456384181976318, + "learning_rate": 2.9302501860136725e-05, + "loss": 0.4255, "step": 527500 }, { - "epoch": 5.38, - "learning_rate": 4.48688846241312e-05, - "loss": 0.6548, + "epoch": 7.269019867184701, + "grad_norm": 15.0463228225708, + "learning_rate": 2.9294503578471096e-05, + "loss": 0.4193, "step": 527600 }, { - "epoch": 5.38, - "learning_rate": 4.486312275443605e-05, - "loss": 0.6419, + "epoch": 7.270397619244441, + "grad_norm": 5.020733833312988, + "learning_rate": 2.92865051443532e-05, + "loss": 0.4931, "step": 527700 }, { - "epoch": 5.38, - "learning_rate": 4.485736015802288e-05, - "loss": 0.582, + "epoch": 7.27177537130418, + "grad_norm": 7.640762805938721, + "learning_rate": 2.9278506558462548e-05, + "loss": 0.3628, "step": 527800 }, { - "epoch": 5.38, - "learning_rate": 4.485159683517347e-05, - "loss": 0.6252, + "epoch": 7.27315312336392, + "grad_norm": 6.148104190826416, + "learning_rate": 2.927050782147867e-05, + "loss": 0.4479, "step": 527900 }, { - "epoch": 5.38, - "learning_rate": 4.48458327861696e-05, - "loss": 0.5997, + "epoch": 7.274530875423658, + "grad_norm": 3.0570690631866455, + "learning_rate": 2.9262508934081092e-05, + "loss": 0.4184, "step": 528000 }, { - "epoch": 5.38, - "learning_rate": 4.48400680112931e-05, - "loss": 0.652, + "epoch": 7.275908627483398, + "grad_norm": 0.2889871597290039, + "learning_rate": 2.9254509896949365e-05, + "loss": 0.4049, "step": 528100 }, { - "epoch": 5.38, - "learning_rate": 4.483430251082585e-05, - "loss": 0.5784, + "epoch": 7.2772863795431375, + "grad_norm": 3.989398717880249, + "learning_rate": 2.924651071076305e-05, + "loss": 0.4074, "step": 528200 }, { - "epoch": 5.38, - "learning_rate": 4.482853628504972e-05, - "loss": 0.5931, + "epoch": 7.278664131602877, + "grad_norm": 14.667363166809082, + "learning_rate": 2.92385113762017e-05, + "loss": 0.4336, "step": 528300 }, { - "epoch": 5.38, - "learning_rate": 4.482276933424666e-05, - "loss": 0.6495, + "epoch": 7.280041883662616, + "grad_norm": 11.679713249206543, + "learning_rate": 2.9230511893944936e-05, + "loss": 0.4514, "step": 528400 }, { - "epoch": 5.38, - "learning_rate": 4.481700165869864e-05, - "loss": 0.538, + "epoch": 7.281419635722355, + "grad_norm": 1.3945144414901733, + "learning_rate": 2.9222512264672317e-05, + "loss": 0.4507, "step": 528500 }, { - "epoch": 5.39, - "learning_rate": 4.481123325868766e-05, - "loss": 0.592, + "epoch": 7.282797387782095, + "grad_norm": 2.6839096546173096, + "learning_rate": 2.9214512489063474e-05, + "loss": 0.3817, "step": 528600 }, { - "epoch": 5.39, - "learning_rate": 4.480546413449576e-05, - "loss": 0.5635, + "epoch": 7.284175139841834, + "grad_norm": 5.41428279876709, + "learning_rate": 2.9206512567798028e-05, + "loss": 0.4908, "step": 528700 }, { - "epoch": 5.39, - "learning_rate": 4.479969428640502e-05, - "loss": 0.6909, + "epoch": 7.285552891901573, + "grad_norm": 1.5273067951202393, + "learning_rate": 2.91985125015556e-05, + "loss": 0.3675, "step": 528800 }, { - "epoch": 5.39, - "learning_rate": 4.479392371469755e-05, - "loss": 0.5876, + "epoch": 7.286930643961313, + "grad_norm": 3.85188889503479, + "learning_rate": 2.9190512291015852e-05, + "loss": 0.4272, "step": 528900 }, { - "epoch": 5.39, - "learning_rate": 4.4788152419655495e-05, - "loss": 0.5047, + "epoch": 7.288308396021052, + "grad_norm": 4.808576583862305, + "learning_rate": 2.918251193685843e-05, + "loss": 0.4369, "step": 529000 }, { - "epoch": 5.39, - "learning_rate": 4.478238040156104e-05, - "loss": 0.6059, + "epoch": 7.289686148080792, + "grad_norm": 5.606349945068359, + "learning_rate": 2.917451143976302e-05, + "loss": 0.4907, "step": 529100 }, { - "epoch": 5.39, - "learning_rate": 4.47766076606964e-05, - "loss": 0.5658, + "epoch": 7.29106390014053, + "grad_norm": 6.450354099273682, + "learning_rate": 2.916651080040929e-05, + "loss": 0.511, "step": 529200 }, { - "epoch": 5.39, - "learning_rate": 4.477083419734383e-05, - "loss": 0.6419, + "epoch": 7.29244165220027, + "grad_norm": 13.357542037963867, + "learning_rate": 2.915851001947694e-05, + "loss": 0.3752, "step": 529300 }, { - "epoch": 5.39, - "learning_rate": 4.4765060011785616e-05, - "loss": 0.5918, + "epoch": 7.2938194042600095, + "grad_norm": 7.168420791625977, + "learning_rate": 2.9150509097645687e-05, + "loss": 0.4906, "step": 529400 }, { - "epoch": 5.39, - "learning_rate": 4.475928510430407e-05, - "loss": 0.7025, + "epoch": 7.295197156319749, + "grad_norm": 2.673644781112671, + "learning_rate": 2.9142508035595238e-05, + "loss": 0.461, "step": 529500 }, { - "epoch": 5.4, - "learning_rate": 4.475350947518157e-05, - "loss": 0.6346, + "epoch": 7.296574908379488, + "grad_norm": 4.309229850769043, + "learning_rate": 2.9134506834005326e-05, + "loss": 0.4162, "step": 529600 }, { - "epoch": 5.4, - "learning_rate": 4.4747733124700504e-05, - "loss": 0.556, + "epoch": 7.297952660439227, + "grad_norm": 0.18738001585006714, + "learning_rate": 2.9126505493555704e-05, + "loss": 0.479, "step": 529700 }, { - "epoch": 5.4, - "learning_rate": 4.474195605314329e-05, - "loss": 0.6855, + "epoch": 7.299330412498967, + "grad_norm": 14.323974609375, + "learning_rate": 2.9118504014926113e-05, + "loss": 0.4673, "step": 529800 }, { - "epoch": 5.4, - "learning_rate": 4.4736178260792405e-05, - "loss": 0.6182, + "epoch": 7.300708164558706, + "grad_norm": 1.7647420167922974, + "learning_rate": 2.9110502398796323e-05, + "loss": 0.4056, "step": 529900 }, { - "epoch": 5.4, - "learning_rate": 4.4730399747930335e-05, - "loss": 0.59, + "epoch": 7.302085916618445, + "grad_norm": 15.972511291503906, + "learning_rate": 2.910250064584612e-05, + "loss": 0.3977, "step": 530000 }, { - "epoch": 5.4, - "learning_rate": 4.472462051483964e-05, - "loss": 0.5916, + "epoch": 7.303463668678185, + "grad_norm": 2.2886714935302734, + "learning_rate": 2.9094498756755287e-05, + "loss": 0.4233, "step": 530100 }, { - "epoch": 5.4, - "learning_rate": 4.471884056180286e-05, - "loss": 0.5715, + "epoch": 7.304841420737924, + "grad_norm": 3.633195400238037, + "learning_rate": 2.908649673220363e-05, + "loss": 0.4259, "step": 530200 }, { - "epoch": 5.4, - "learning_rate": 4.471305988910262e-05, - "loss": 0.6999, + "epoch": 7.306219172797663, + "grad_norm": 13.278536796569824, + "learning_rate": 2.9078494572870957e-05, + "loss": 0.4427, "step": 530300 }, { - "epoch": 5.4, - "learning_rate": 4.470727849702155e-05, - "loss": 0.4921, + "epoch": 7.307596924857402, + "grad_norm": 9.276966094970703, + "learning_rate": 2.9070492279437095e-05, + "loss": 0.4599, "step": 530400 }, { - "epoch": 5.4, - "learning_rate": 4.470149638584234e-05, - "loss": 0.537, + "epoch": 7.308974676917142, + "grad_norm": 7.163000106811523, + "learning_rate": 2.906248985258188e-05, + "loss": 0.386, "step": 530500 }, { - "epoch": 5.41, - "learning_rate": 4.4695713555847674e-05, - "loss": 0.5916, + "epoch": 7.3103524289768815, + "grad_norm": 8.477914810180664, + "learning_rate": 2.9054487292985148e-05, + "loss": 0.3981, "step": 530600 }, { - "epoch": 5.41, - "learning_rate": 4.468993000732033e-05, - "loss": 0.5762, + "epoch": 7.311730181036621, + "grad_norm": 7.509415149688721, + "learning_rate": 2.9046484601326782e-05, + "loss": 0.4666, "step": 530700 }, { - "epoch": 5.41, - "learning_rate": 4.468414574054307e-05, - "loss": 0.5568, + "epoch": 7.31310793309636, + "grad_norm": 5.5561723709106445, + "learning_rate": 2.903848177828663e-05, + "loss": 0.4509, "step": 530800 }, { - "epoch": 5.41, - "learning_rate": 4.467836075579871e-05, - "loss": 0.6476, + "epoch": 7.314485685156099, + "grad_norm": 2.2281110286712646, + "learning_rate": 2.9030478824544584e-05, + "loss": 0.5091, "step": 530900 }, { - "epoch": 5.41, - "learning_rate": 4.4672575053370104e-05, - "loss": 0.6192, + "epoch": 7.315863437215839, + "grad_norm": 3.82918119430542, + "learning_rate": 2.902247574078053e-05, + "loss": 0.3843, "step": 531000 }, { - "epoch": 5.41, - "learning_rate": 4.466678863354014e-05, - "loss": 0.5528, + "epoch": 7.3172411892755775, + "grad_norm": 2.1636435985565186, + "learning_rate": 2.9014472527674365e-05, + "loss": 0.41, "step": 531100 }, { - "epoch": 5.41, - "learning_rate": 4.466100149659175e-05, - "loss": 0.6199, + "epoch": 7.318618941335317, + "grad_norm": 2.1685776710510254, + "learning_rate": 2.9006469185906032e-05, + "loss": 0.3967, "step": 531200 }, { - "epoch": 5.41, - "learning_rate": 4.4655213642807864e-05, - "loss": 0.5983, + "epoch": 7.319996693395057, + "grad_norm": 6.576162815093994, + "learning_rate": 2.8998465716155414e-05, + "loss": 0.402, "step": 531300 }, { - "epoch": 5.41, - "learning_rate": 4.46494250724715e-05, - "loss": 0.6362, + "epoch": 7.321374445454796, + "grad_norm": 0.6103913187980652, + "learning_rate": 2.8990462119102477e-05, + "loss": 0.3635, "step": 531400 }, { - "epoch": 5.42, - "learning_rate": 4.4643635785865665e-05, - "loss": 0.5769, + "epoch": 7.322752197514535, + "grad_norm": 1.9723066091537476, + "learning_rate": 2.8982458395427158e-05, + "loss": 0.5149, "step": 531500 }, { - "epoch": 5.42, - "learning_rate": 4.463784578327343e-05, - "loss": 0.6321, + "epoch": 7.324129949574274, + "grad_norm": 7.15331506729126, + "learning_rate": 2.8974454545809406e-05, + "loss": 0.3947, "step": 531600 }, { - "epoch": 5.42, - "learning_rate": 4.463205506497789e-05, - "loss": 0.546, + "epoch": 7.325507701634014, + "grad_norm": 0.49902448058128357, + "learning_rate": 2.8966450570929203e-05, + "loss": 0.415, "step": 531700 }, { - "epoch": 5.42, - "learning_rate": 4.462626363126218e-05, - "loss": 0.6022, + "epoch": 7.3268854536937535, + "grad_norm": 2.9225857257843018, + "learning_rate": 2.895844647146653e-05, + "loss": 0.4571, "step": 531800 }, { - "epoch": 5.42, - "learning_rate": 4.462047148240946e-05, - "loss": 0.5746, + "epoch": 7.328263205753492, + "grad_norm": 2.2372071743011475, + "learning_rate": 2.895044224810135e-05, + "loss": 0.4725, "step": 531900 }, { - "epoch": 5.42, - "learning_rate": 4.4614678618702936e-05, - "loss": 0.7018, + "epoch": 7.329640957813232, + "grad_norm": 13.383376121520996, + "learning_rate": 2.8942437901513694e-05, + "loss": 0.4283, "step": 532000 }, { - "epoch": 5.42, - "learning_rate": 4.460888504042585e-05, - "loss": 0.6679, + "epoch": 7.331018709872971, + "grad_norm": 7.015833377838135, + "learning_rate": 2.893451347767921e-05, + "loss": 0.4322, "step": 532100 }, { - "epoch": 5.42, - "learning_rate": 4.460309074786145e-05, - "loss": 0.6041, + "epoch": 7.332396461932711, + "grad_norm": 2.4232017993927, + "learning_rate": 2.892650888790187e-05, + "loss": 0.3958, "step": 532200 }, { - "epoch": 5.42, - "learning_rate": 4.4597295741293075e-05, - "loss": 0.7303, + "epoch": 7.3337742139924496, + "grad_norm": 9.83591365814209, + "learning_rate": 2.89185041769353e-05, + "loss": 0.506, "step": 532300 }, { - "epoch": 5.42, - "learning_rate": 4.459150002100404e-05, - "loss": 0.6175, + "epoch": 7.335151966052189, + "grad_norm": 5.600688934326172, + "learning_rate": 2.8910499345459546e-05, + "loss": 0.4758, "step": 532400 }, { - "epoch": 5.43, - "learning_rate": 4.458570358727774e-05, - "loss": 0.5557, + "epoch": 7.336529718111929, + "grad_norm": 7.676602363586426, + "learning_rate": 2.8902494394154653e-05, + "loss": 0.4548, "step": 532500 }, { - "epoch": 5.43, - "learning_rate": 4.457990644039758e-05, - "loss": 0.6733, + "epoch": 7.337907470171668, + "grad_norm": 5.2926859855651855, + "learning_rate": 2.8894489323700694e-05, + "loss": 0.4022, "step": 532600 }, { - "epoch": 5.43, - "learning_rate": 4.4574108580646993e-05, - "loss": 0.7231, + "epoch": 7.339285222231407, + "grad_norm": 1.8817243576049805, + "learning_rate": 2.888648413477773e-05, + "loss": 0.4084, "step": 532700 }, { - "epoch": 5.43, - "learning_rate": 4.456836799755923e-05, - "loss": 0.6454, + "epoch": 7.3406629742911464, + "grad_norm": 3.6371889114379883, + "learning_rate": 2.8878478828065834e-05, + "loss": 0.4387, "step": 532800 }, { - "epoch": 5.43, - "learning_rate": 4.456256872003992e-05, - "loss": 0.5856, + "epoch": 7.342040726350886, + "grad_norm": 8.56273365020752, + "learning_rate": 2.8870473404245126e-05, + "loss": 0.4313, "step": 532900 }, { - "epoch": 5.43, - "learning_rate": 4.45567687304979e-05, - "loss": 0.6081, + "epoch": 7.343418478410626, + "grad_norm": 1.4176666736602783, + "learning_rate": 2.8862467863995676e-05, + "loss": 0.4533, "step": 533000 }, { - "epoch": 5.43, - "learning_rate": 4.455096802921677e-05, - "loss": 0.6131, + "epoch": 7.344796230470364, + "grad_norm": 5.151734352111816, + "learning_rate": 2.885446220799763e-05, + "loss": 0.407, "step": 533100 }, { - "epoch": 5.43, - "learning_rate": 4.454516661648013e-05, - "loss": 0.578, + "epoch": 7.346173982530104, + "grad_norm": 1.582909107208252, + "learning_rate": 2.8846456436931075e-05, + "loss": 0.4715, "step": 533200 }, { - "epoch": 5.43, - "learning_rate": 4.453936449257165e-05, - "loss": 0.6074, + "epoch": 7.347551734589843, + "grad_norm": 3.3835840225219727, + "learning_rate": 2.8838450551476157e-05, + "loss": 0.4339, "step": 533300 }, { - "epoch": 5.43, - "learning_rate": 4.453356165777501e-05, - "loss": 0.5656, + "epoch": 7.348929486649583, + "grad_norm": 3.6356005668640137, + "learning_rate": 2.883044455231303e-05, + "loss": 0.4139, "step": 533400 }, { - "epoch": 5.44, - "learning_rate": 4.4527758112373946e-05, - "loss": 0.6126, + "epoch": 7.350307238709322, + "grad_norm": 2.878119468688965, + "learning_rate": 2.8822438440121817e-05, + "loss": 0.3802, "step": 533500 }, { - "epoch": 5.44, - "learning_rate": 4.452195385665219e-05, - "loss": 0.5385, + "epoch": 7.351684990769061, + "grad_norm": 1.8025470972061157, + "learning_rate": 2.8814432215582696e-05, + "loss": 0.4256, "step": 533600 }, { - "epoch": 5.44, - "learning_rate": 4.451614889089357e-05, - "loss": 0.5525, + "epoch": 7.353062742828801, + "grad_norm": 4.973130702972412, + "learning_rate": 2.8806425879375844e-05, + "loss": 0.4103, "step": 533700 }, { - "epoch": 5.44, - "learning_rate": 4.451034321538188e-05, - "loss": 0.6747, + "epoch": 7.35444049488854, + "grad_norm": 3.463181495666504, + "learning_rate": 2.8798419432181415e-05, + "loss": 0.4916, "step": 533800 }, { - "epoch": 5.44, - "learning_rate": 4.4504536830401014e-05, - "loss": 0.5421, + "epoch": 7.355818246948279, + "grad_norm": 6.862941741943359, + "learning_rate": 2.8790412874679622e-05, + "loss": 0.4747, "step": 533900 }, { - "epoch": 5.44, - "learning_rate": 4.449872973623486e-05, - "loss": 0.6221, + "epoch": 7.3571959990080185, + "grad_norm": 4.2563323974609375, + "learning_rate": 2.8782406207550652e-05, + "loss": 0.4257, "step": 534000 }, { - "epoch": 5.44, - "learning_rate": 4.4492921933167334e-05, - "loss": 0.6298, + "epoch": 7.358573751067758, + "grad_norm": 2.533911943435669, + "learning_rate": 2.877439943147471e-05, + "loss": 0.4251, "step": 534100 }, { - "epoch": 5.44, - "learning_rate": 4.448711342148241e-05, - "loss": 0.5338, + "epoch": 7.359951503127498, + "grad_norm": 6.186253070831299, + "learning_rate": 2.876639254713203e-05, + "loss": 0.4444, "step": 534200 }, { - "epoch": 5.44, - "learning_rate": 4.44813042014641e-05, - "loss": 0.6281, + "epoch": 7.361329255187236, + "grad_norm": 75.2963638305664, + "learning_rate": 2.8758385555202807e-05, + "loss": 0.4514, "step": 534300 }, { - "epoch": 5.44, - "learning_rate": 4.447549427339644e-05, - "loss": 0.5767, + "epoch": 7.362707007246976, + "grad_norm": 2.9996325969696045, + "learning_rate": 2.87503784563673e-05, + "loss": 0.4059, "step": 534400 }, { - "epoch": 5.45, - "learning_rate": 4.4469683637563494e-05, - "loss": 0.7092, + "epoch": 7.364084759306715, + "grad_norm": 1.9278546571731567, + "learning_rate": 2.874245132387995e-05, + "loss": 0.4366, "step": 534500 }, { - "epoch": 5.45, - "learning_rate": 4.446387229424937e-05, - "loss": 0.593, + "epoch": 7.365462511366454, + "grad_norm": 8.845470428466797, + "learning_rate": 2.8734444014324697e-05, + "loss": 0.4597, "step": 534600 }, { - "epoch": 5.45, - "learning_rate": 4.44580602437382e-05, - "loss": 0.6403, + "epoch": 7.366840263426194, + "grad_norm": 6.135219097137451, + "learning_rate": 2.8726436599897097e-05, + "loss": 0.3821, "step": 534700 }, { - "epoch": 5.45, - "learning_rate": 4.4452247486314164e-05, - "loss": 0.6177, + "epoch": 7.368218015485933, + "grad_norm": 7.168758392333984, + "learning_rate": 2.871842908127745e-05, + "loss": 0.4434, "step": 534800 }, { - "epoch": 5.45, - "learning_rate": 4.444643402226147e-05, - "loss": 0.6262, + "epoch": 7.369595767545673, + "grad_norm": 2.1625781059265137, + "learning_rate": 2.8710421459146006e-05, + "loss": 0.4486, "step": 534900 }, { - "epoch": 5.45, - "learning_rate": 4.444061985186438e-05, - "loss": 0.6235, + "epoch": 7.370973519605412, + "grad_norm": 5.022161960601807, + "learning_rate": 2.8702413734183077e-05, + "loss": 0.4862, "step": 535000 }, { - "epoch": 5.45, - "learning_rate": 4.443480497540714e-05, - "loss": 0.5891, + "epoch": 7.372351271665151, + "grad_norm": 4.327476978302002, + "learning_rate": 2.8694405907068946e-05, + "loss": 0.4424, "step": 535100 }, { - "epoch": 5.45, - "learning_rate": 4.4428989393174094e-05, - "loss": 0.5875, + "epoch": 7.3737290237248905, + "grad_norm": 4.025018215179443, + "learning_rate": 2.8686397978483918e-05, + "loss": 0.5015, "step": 535200 }, { - "epoch": 5.45, - "learning_rate": 4.442317310544957e-05, - "loss": 0.6354, + "epoch": 7.37510677578463, + "grad_norm": 19.758026123046875, + "learning_rate": 2.8678389949108325e-05, + "loss": 0.4413, "step": 535300 }, { - "epoch": 5.45, - "learning_rate": 4.4417356112517945e-05, - "loss": 0.6336, + "epoch": 7.376484527844369, + "grad_norm": 79.03694152832031, + "learning_rate": 2.8670381819622464e-05, + "loss": 0.4203, "step": 535400 }, { - "epoch": 5.46, - "learning_rate": 4.441153841466365e-05, - "loss": 0.615, + "epoch": 7.377862279904108, + "grad_norm": 15.999736785888672, + "learning_rate": 2.8662373590706677e-05, + "loss": 0.4213, "step": 535500 }, { - "epoch": 5.46, - "learning_rate": 4.440572001217113e-05, - "loss": 0.5998, + "epoch": 7.379240031963848, + "grad_norm": 4.96759033203125, + "learning_rate": 2.865444534680455e-05, + "loss": 0.4026, "step": 535600 }, { - "epoch": 5.46, - "learning_rate": 4.439990090532487e-05, - "loss": 0.5982, + "epoch": 7.380617784023587, + "grad_norm": 5.163212299346924, + "learning_rate": 2.8646436922047275e-05, + "loss": 0.4621, "step": 535700 }, { - "epoch": 5.46, - "learning_rate": 4.439408109440939e-05, - "loss": 0.6228, + "epoch": 7.381995536083326, + "grad_norm": 2.5682151317596436, + "learning_rate": 2.8638428399894317e-05, + "loss": 0.4248, "step": 535800 }, { - "epoch": 5.46, - "learning_rate": 4.438826057970924e-05, - "loss": 0.5754, + "epoch": 7.383373288143066, + "grad_norm": 5.439509391784668, + "learning_rate": 2.8630419781026053e-05, + "loss": 0.401, "step": 535900 }, { - "epoch": 5.46, - "learning_rate": 4.438243936150902e-05, - "loss": 0.6568, + "epoch": 7.384751040202805, + "grad_norm": 3.276240110397339, + "learning_rate": 2.8622411066122833e-05, + "loss": 0.3952, "step": 536000 }, { - "epoch": 5.46, - "learning_rate": 4.437661744009334e-05, - "loss": 0.5877, + "epoch": 7.386128792262545, + "grad_norm": 3.118412971496582, + "learning_rate": 2.861440225586506e-05, + "loss": 0.4748, "step": 536100 }, { - "epoch": 5.46, - "learning_rate": 4.437079481574686e-05, - "loss": 0.5033, + "epoch": 7.387506544322283, + "grad_norm": 2.562843084335327, + "learning_rate": 2.8606393350933108e-05, + "loss": 0.4624, "step": 536200 }, { - "epoch": 5.46, - "learning_rate": 4.436497148875428e-05, - "loss": 0.6899, + "epoch": 7.388884296382023, + "grad_norm": 9.925040245056152, + "learning_rate": 2.8598384352007377e-05, + "loss": 0.4602, "step": 536300 }, { - "epoch": 5.46, - "learning_rate": 4.435914745940032e-05, - "loss": 0.5746, + "epoch": 7.3902620484417625, + "grad_norm": 5.952777862548828, + "learning_rate": 2.8590375259768286e-05, + "loss": 0.44, "step": 536400 }, { - "epoch": 5.47, - "learning_rate": 4.435338097875839e-05, - "loss": 0.6168, + "epoch": 7.391639800501502, + "grad_norm": 3.4073009490966797, + "learning_rate": 2.858236607489624e-05, + "loss": 0.4874, "step": 536500 }, { - "epoch": 5.47, - "learning_rate": 4.434755555255249e-05, - "loss": 0.6057, + "epoch": 7.393017552561241, + "grad_norm": 2.9499261379241943, + "learning_rate": 2.857435679807164e-05, + "loss": 0.4362, "step": 536600 }, { - "epoch": 5.47, - "learning_rate": 4.434172942483674e-05, - "loss": 0.6808, + "epoch": 7.39439530462098, + "grad_norm": 27.421464920043945, + "learning_rate": 2.8566347429974954e-05, + "loss": 0.4466, "step": 536700 }, { - "epoch": 5.47, - "learning_rate": 4.433590259589601e-05, - "loss": 0.6403, + "epoch": 7.39577305668072, + "grad_norm": 3.419938802719116, + "learning_rate": 2.855833797128658e-05, + "loss": 0.4359, "step": 536800 }, { - "epoch": 5.47, - "learning_rate": 4.43300750660152e-05, - "loss": 0.5827, + "epoch": 7.397150808740459, + "grad_norm": 0.5551576614379883, + "learning_rate": 2.8550328422686995e-05, + "loss": 0.4982, "step": 536900 }, { - "epoch": 5.47, - "learning_rate": 4.432424683547924e-05, - "loss": 0.6541, + "epoch": 7.398528560800198, + "grad_norm": 2.9478471279144287, + "learning_rate": 2.8542318784856638e-05, + "loss": 0.4494, "step": 537000 }, { - "epoch": 5.47, - "learning_rate": 4.431841790457309e-05, - "loss": 0.6105, + "epoch": 7.399906312859938, + "grad_norm": 2.561605453491211, + "learning_rate": 2.8534309058475953e-05, + "loss": 0.477, "step": 537100 }, { - "epoch": 5.47, - "learning_rate": 4.431258827358175e-05, - "loss": 0.558, + "epoch": 7.401284064919677, + "grad_norm": 9.127660751342773, + "learning_rate": 2.8526299244225435e-05, + "loss": 0.3674, "step": 537200 }, { - "epoch": 5.47, - "learning_rate": 4.430675794279026e-05, - "loss": 0.6587, + "epoch": 7.402661816979417, + "grad_norm": 31.75274085998535, + "learning_rate": 2.8518289342785544e-05, + "loss": 0.3494, "step": 537300 }, { - "epoch": 5.48, - "learning_rate": 4.430092691248368e-05, - "loss": 0.5687, + "epoch": 7.404039569039155, + "grad_norm": 4.384542465209961, + "learning_rate": 2.8510279354836762e-05, + "loss": 0.4763, "step": 537400 }, { - "epoch": 5.48, - "learning_rate": 4.429515350370273e-05, - "loss": 0.6228, + "epoch": 7.405417321098895, + "grad_norm": 5.3455915451049805, + "learning_rate": 2.8502269281059588e-05, + "loss": 0.4089, "step": 537500 }, { - "epoch": 5.48, - "learning_rate": 4.4289321082209366e-05, - "loss": 0.622, + "epoch": 7.4067950731586345, + "grad_norm": 2.7535457611083984, + "learning_rate": 2.8494259122134498e-05, + "loss": 0.3736, "step": 537600 }, { - "epoch": 5.48, - "learning_rate": 4.428348796205348e-05, - "loss": 0.5678, + "epoch": 7.408172825218374, + "grad_norm": 14.361881256103516, + "learning_rate": 2.848624887874201e-05, + "loss": 0.4311, "step": 537700 }, { - "epoch": 5.48, - "learning_rate": 4.427765414352025e-05, - "loss": 0.5563, + "epoch": 7.409550577278113, + "grad_norm": 8.127216339111328, + "learning_rate": 2.8478238551562643e-05, + "loss": 0.4814, "step": 537800 }, { - "epoch": 5.48, - "learning_rate": 4.4271819626894956e-05, - "loss": 0.541, + "epoch": 7.410928329337852, + "grad_norm": 5.568881034851074, + "learning_rate": 2.8470228141276904e-05, + "loss": 0.4091, "step": 537900 }, { - "epoch": 5.48, - "learning_rate": 4.426598441246284e-05, - "loss": 0.6585, + "epoch": 7.412306081397592, + "grad_norm": 4.130171298980713, + "learning_rate": 2.8462217648565315e-05, + "loss": 0.4916, "step": 538000 }, { - "epoch": 5.48, - "learning_rate": 4.4260148500509225e-05, - "loss": 0.6518, + "epoch": 7.413683833457331, + "grad_norm": 3.00921368598938, + "learning_rate": 2.845420707410842e-05, + "loss": 0.4329, "step": 538100 }, { - "epoch": 5.48, - "learning_rate": 4.425431189131945e-05, - "loss": 0.6301, + "epoch": 7.41506158551707, + "grad_norm": 6.275873184204102, + "learning_rate": 2.8446196418586746e-05, + "loss": 0.4459, "step": 538200 }, { - "epoch": 5.48, - "learning_rate": 4.424847458517889e-05, - "loss": 0.6119, + "epoch": 7.41643933757681, + "grad_norm": 2.6935315132141113, + "learning_rate": 2.8438185682680855e-05, + "loss": 0.4002, "step": 538300 }, { - "epoch": 5.49, - "learning_rate": 4.424263658237295e-05, - "loss": 0.6751, + "epoch": 7.417817089636549, + "grad_norm": 1.236141562461853, + "learning_rate": 2.8430174867071287e-05, + "loss": 0.4465, "step": 538400 }, { - "epoch": 5.49, - "learning_rate": 4.4236797883187086e-05, - "loss": 0.5911, + "epoch": 7.419194841696289, + "grad_norm": 7.577119827270508, + "learning_rate": 2.8422163972438604e-05, + "loss": 0.4508, "step": 538500 }, { - "epoch": 5.49, - "learning_rate": 4.4230958487906755e-05, - "loss": 0.5835, + "epoch": 7.420572593756027, + "grad_norm": 3.757509231567383, + "learning_rate": 2.841415299946337e-05, + "loss": 0.4487, "step": 538600 }, { - "epoch": 5.49, - "learning_rate": 4.4225118396817474e-05, - "loss": 0.555, + "epoch": 7.421950345815767, + "grad_norm": 2.673419952392578, + "learning_rate": 2.8406141948826166e-05, + "loss": 0.4198, "step": 538700 }, { - "epoch": 5.49, - "learning_rate": 4.421927761020481e-05, - "loss": 0.6717, + "epoch": 7.4233280978755065, + "grad_norm": 5.737542152404785, + "learning_rate": 2.8398130821207567e-05, + "loss": 0.3899, "step": 538800 }, { - "epoch": 5.49, - "learning_rate": 4.4213436128354315e-05, - "loss": 0.584, + "epoch": 7.424705849935245, + "grad_norm": 0.56877201795578, + "learning_rate": 2.839011961728816e-05, + "loss": 0.4721, "step": 538900 }, { - "epoch": 5.49, - "learning_rate": 4.420759395155163e-05, - "loss": 0.5698, + "epoch": 7.426083601994985, + "grad_norm": 2.6419119834899902, + "learning_rate": 2.8382108337748535e-05, + "loss": 0.3868, "step": 539000 }, { - "epoch": 5.49, - "learning_rate": 4.420175108008237e-05, - "loss": 0.668, + "epoch": 7.427461354054724, + "grad_norm": 5.866122245788574, + "learning_rate": 2.83740969832693e-05, + "loss": 0.4438, "step": 539100 }, { - "epoch": 5.49, - "learning_rate": 4.4195907514232246e-05, - "loss": 0.5092, + "epoch": 7.428839106114464, + "grad_norm": 5.372983932495117, + "learning_rate": 2.8366085554531052e-05, + "loss": 0.4033, "step": 539200 }, { - "epoch": 5.49, - "learning_rate": 4.4190063254286956e-05, - "loss": 0.6313, + "epoch": 7.430216858174203, + "grad_norm": 1.023581624031067, + "learning_rate": 2.8358154167599546e-05, + "loss": 0.4075, "step": 539300 }, { - "epoch": 5.5, - "learning_rate": 4.4184218300532255e-05, - "loss": 0.6157, + "epoch": 7.431594610233942, + "grad_norm": 7.562337398529053, + "learning_rate": 2.835014259311072e-05, + "loss": 0.4169, "step": 539400 }, { - "epoch": 5.5, - "learning_rate": 4.4178372653253934e-05, - "loss": 0.7111, + "epoch": 7.432972362293682, + "grad_norm": 6.716611862182617, + "learning_rate": 2.8342130946397945e-05, + "loss": 0.4291, "step": 539500 }, { - "epoch": 5.5, - "learning_rate": 4.4172526312737794e-05, - "loss": 0.6334, + "epoch": 7.434350114353421, + "grad_norm": 2.795417070388794, + "learning_rate": 2.833411922814183e-05, + "loss": 0.4276, "step": 539600 }, { - "epoch": 5.5, - "learning_rate": 4.416667927926969e-05, - "loss": 0.6465, + "epoch": 7.43572786641316, + "grad_norm": 2.315000295639038, + "learning_rate": 2.8326107439023017e-05, + "loss": 0.3747, "step": 539700 }, { - "epoch": 5.5, - "learning_rate": 4.4160831553135524e-05, - "loss": 0.5948, + "epoch": 7.437105618472899, + "grad_norm": 3.9713361263275146, + "learning_rate": 2.8318095579722155e-05, + "loss": 0.422, "step": 539800 }, { - "epoch": 5.5, - "learning_rate": 4.4154983134621196e-05, - "loss": 0.6976, + "epoch": 7.438483370532639, + "grad_norm": 7.583224773406982, + "learning_rate": 2.831008365091989e-05, + "loss": 0.3877, "step": 539900 }, { - "epoch": 5.5, - "learning_rate": 4.414913402401266e-05, - "loss": 0.6168, + "epoch": 7.4398611225923785, + "grad_norm": 2.159616231918335, + "learning_rate": 2.8302071653296866e-05, + "loss": 0.4354, "step": 540000 }, { - "epoch": 5.5, - "learning_rate": 4.414328422159591e-05, - "loss": 0.6072, + "epoch": 7.441238874652117, + "grad_norm": 12.720037460327148, + "learning_rate": 2.8294059587533757e-05, + "loss": 0.432, "step": 540100 }, { - "epoch": 5.5, - "learning_rate": 4.4137433727656974e-05, - "loss": 0.7067, + "epoch": 7.442616626711857, + "grad_norm": 5.1632304191589355, + "learning_rate": 2.8286047454311226e-05, + "loss": 0.4922, "step": 540200 }, { - "epoch": 5.5, - "learning_rate": 4.413158254248189e-05, - "loss": 0.5905, + "epoch": 7.443994378771596, + "grad_norm": 10.606891632080078, + "learning_rate": 2.827803525430994e-05, + "loss": 0.456, "step": 540300 }, { - "epoch": 5.51, - "learning_rate": 4.412573066635675e-05, - "loss": 0.6042, + "epoch": 7.445372130831336, + "grad_norm": 5.939263820648193, + "learning_rate": 2.827002298821058e-05, + "loss": 0.4572, "step": 540400 }, { - "epoch": 5.51, - "learning_rate": 4.4119878099567683e-05, - "loss": 0.6284, + "epoch": 7.4467498828910745, + "grad_norm": 1.9471814632415771, + "learning_rate": 2.8262010656693828e-05, + "loss": 0.4526, "step": 540500 }, { - "epoch": 5.51, - "learning_rate": 4.411402484240083e-05, - "loss": 0.6194, + "epoch": 7.448127634950814, + "grad_norm": 18.333885192871094, + "learning_rate": 2.825399826044037e-05, + "loss": 0.4129, "step": 540600 }, { - "epoch": 5.51, - "learning_rate": 4.4108170895142396e-05, - "loss": 0.6498, + "epoch": 7.449505387010554, + "grad_norm": 3.623263120651245, + "learning_rate": 2.82459858001309e-05, + "loss": 0.4514, "step": 540700 }, { - "epoch": 5.51, - "learning_rate": 4.41023162580786e-05, - "loss": 0.5895, + "epoch": 7.450883139070293, + "grad_norm": 4.437058925628662, + "learning_rate": 2.8237973276446122e-05, + "loss": 0.4565, "step": 540800 }, { - "epoch": 5.51, - "learning_rate": 4.40964609314957e-05, - "loss": 0.5994, + "epoch": 7.452260891130032, + "grad_norm": 2.5760324001312256, + "learning_rate": 2.822996069006674e-05, + "loss": 0.4344, "step": 540900 }, { - "epoch": 5.51, - "learning_rate": 4.409060491567998e-05, - "loss": 0.6237, + "epoch": 7.453638643189771, + "grad_norm": 3.3212900161743164, + "learning_rate": 2.822194804167346e-05, + "loss": 0.4333, "step": 541000 }, { - "epoch": 5.51, - "learning_rate": 4.4084748210917776e-05, - "loss": 0.6029, + "epoch": 7.455016395249511, + "grad_norm": 5.009664535522461, + "learning_rate": 2.8213935331947008e-05, + "loss": 0.3425, "step": 541100 }, { - "epoch": 5.51, - "learning_rate": 4.407889081749543e-05, - "loss": 0.5726, + "epoch": 7.4563941473092505, + "grad_norm": 1.490462064743042, + "learning_rate": 2.820592256156809e-05, + "loss": 0.5042, "step": 541200 }, { - "epoch": 5.51, - "learning_rate": 4.407303273569934e-05, - "loss": 0.6478, + "epoch": 7.457771899368989, + "grad_norm": 2.1359567642211914, + "learning_rate": 2.8197909731217436e-05, + "loss": 0.3638, "step": 541300 }, { - "epoch": 5.52, - "learning_rate": 4.406717396581594e-05, - "loss": 0.6845, + "epoch": 7.459149651428729, + "grad_norm": 1.5745075941085815, + "learning_rate": 2.818989684157579e-05, + "loss": 0.5117, "step": 541400 }, { - "epoch": 5.52, - "learning_rate": 4.4061314508131666e-05, - "loss": 0.7196, + "epoch": 7.460527403488468, + "grad_norm": 4.589914321899414, + "learning_rate": 2.8181883893323876e-05, + "loss": 0.4293, "step": 541500 }, { - "epoch": 5.52, - "learning_rate": 4.4055454362933045e-05, - "loss": 0.6521, + "epoch": 7.461905155548208, + "grad_norm": 6.481187343597412, + "learning_rate": 2.8173870887142427e-05, + "loss": 0.4427, "step": 541600 }, { - "epoch": 5.52, - "learning_rate": 4.4049593530506574e-05, - "loss": 0.6904, + "epoch": 7.4632829076079465, + "grad_norm": 0.042053285986185074, + "learning_rate": 2.816585782371221e-05, + "loss": 0.3851, "step": 541700 }, { - "epoch": 5.52, - "learning_rate": 4.404373201113882e-05, - "loss": 0.7974, + "epoch": 7.464660659667686, + "grad_norm": 8.815479278564453, + "learning_rate": 2.8157844703713953e-05, + "loss": 0.4387, "step": 541800 }, { - "epoch": 5.52, - "learning_rate": 4.4037869805116386e-05, - "loss": 0.5769, + "epoch": 7.466038411727426, + "grad_norm": 2.8633878231048584, + "learning_rate": 2.8149911659861695e-05, + "loss": 0.4418, "step": 541900 }, { - "epoch": 5.52, - "learning_rate": 4.40320069127259e-05, - "loss": 0.5989, + "epoch": 7.467416163787165, + "grad_norm": 6.054144859313965, + "learning_rate": 2.8141898429318355e-05, + "loss": 0.4278, "step": 542000 }, { - "epoch": 5.52, - "learning_rate": 4.4026143334254e-05, - "loss": 0.6951, + "epoch": 7.468793915846904, + "grad_norm": 3.6870791912078857, + "learning_rate": 2.8133885144242454e-05, + "loss": 0.4411, "step": 542100 }, { - "epoch": 5.52, - "learning_rate": 4.4020279069987404e-05, - "loss": 0.6586, + "epoch": 7.470171667906643, + "grad_norm": 28.393657684326172, + "learning_rate": 2.8125871805314777e-05, + "loss": 0.3942, "step": 542200 }, { - "epoch": 5.53, - "learning_rate": 4.4014414120212815e-05, - "loss": 0.6265, + "epoch": 7.471549419966383, + "grad_norm": 8.25516128540039, + "learning_rate": 2.811785841321608e-05, + "loss": 0.4861, "step": 542300 }, { - "epoch": 5.53, - "learning_rate": 4.400860714495789e-05, - "loss": 0.6445, + "epoch": 7.4729271720261226, + "grad_norm": 4.084705352783203, + "learning_rate": 2.8109844968627156e-05, + "loss": 0.404, "step": 542400 }, { - "epoch": 5.53, - "learning_rate": 4.40027408318756e-05, - "loss": 0.6343, + "epoch": 7.474304924085861, + "grad_norm": 3.2064032554626465, + "learning_rate": 2.8101831472228777e-05, + "loss": 0.4117, "step": 542500 }, { - "epoch": 5.53, - "learning_rate": 4.399687383414284e-05, - "loss": 0.7116, + "epoch": 7.475682676145601, + "grad_norm": 0.1507849395275116, + "learning_rate": 2.8093817924701737e-05, + "loss": 0.4073, "step": 542600 }, { - "epoch": 5.53, - "learning_rate": 4.399100615204648e-05, - "loss": 0.5308, + "epoch": 7.47706042820534, + "grad_norm": 10.756246566772461, + "learning_rate": 2.8085804326726814e-05, + "loss": 0.4548, "step": 542700 }, { - "epoch": 5.53, - "learning_rate": 4.3985137785873416e-05, - "loss": 0.6299, + "epoch": 7.47843818026508, + "grad_norm": 2.5515196323394775, + "learning_rate": 2.807779067898484e-05, + "loss": 0.4752, "step": 542800 }, { - "epoch": 5.53, - "learning_rate": 4.397926873591058e-05, - "loss": 0.7087, + "epoch": 7.479815932324819, + "grad_norm": 6.677034378051758, + "learning_rate": 2.8069776982156577e-05, + "loss": 0.4321, "step": 542900 }, { - "epoch": 5.53, - "learning_rate": 4.3973399002444915e-05, - "loss": 0.5172, + "epoch": 7.481193684384558, + "grad_norm": 2.3016717433929443, + "learning_rate": 2.8061763236922845e-05, + "loss": 0.38, "step": 543000 }, { - "epoch": 5.53, - "learning_rate": 4.396752858576344e-05, - "loss": 0.6207, + "epoch": 7.482571436444298, + "grad_norm": 3.4783923625946045, + "learning_rate": 2.805374944396446e-05, + "loss": 0.4317, "step": 543100 }, { - "epoch": 5.53, - "learning_rate": 4.396165748615315e-05, - "loss": 0.5287, + "epoch": 7.483949188504036, + "grad_norm": 2.9948785305023193, + "learning_rate": 2.8045735603962214e-05, + "loss": 0.3922, "step": 543200 }, { - "epoch": 5.54, - "learning_rate": 4.3955785703901146e-05, - "loss": 0.5323, + "epoch": 7.485326940563776, + "grad_norm": 6.320486545562744, + "learning_rate": 2.8037721717596953e-05, + "loss": 0.3554, "step": 543300 }, { - "epoch": 5.54, - "learning_rate": 4.39499132392945e-05, - "loss": 0.7058, + "epoch": 7.4867046926235155, + "grad_norm": 7.366561412811279, + "learning_rate": 2.8029707785549475e-05, + "loss": 0.4667, "step": 543400 }, { - "epoch": 5.54, - "learning_rate": 4.394404009262035e-05, - "loss": 0.6369, + "epoch": 7.488082444683255, + "grad_norm": 0.17292086780071259, + "learning_rate": 2.8021693808500603e-05, + "loss": 0.4258, "step": 543500 }, { - "epoch": 5.54, - "learning_rate": 4.3938166264165845e-05, - "loss": 0.6712, + "epoch": 7.489460196742995, + "grad_norm": 3.4930570125579834, + "learning_rate": 2.801367978713118e-05, + "loss": 0.3629, "step": 543600 }, { - "epoch": 5.54, - "learning_rate": 4.3932291754218196e-05, - "loss": 0.6051, + "epoch": 7.490837948802733, + "grad_norm": 3.487156629562378, + "learning_rate": 2.800566572212202e-05, + "loss": 0.4246, "step": 543700 }, { - "epoch": 5.54, - "learning_rate": 4.3926416563064614e-05, - "loss": 0.7129, + "epoch": 7.492215700862473, + "grad_norm": 12.597655296325684, + "learning_rate": 2.7997651614153985e-05, + "loss": 0.4273, "step": 543800 }, { - "epoch": 5.54, - "learning_rate": 4.392054069099237e-05, - "loss": 0.5879, + "epoch": 7.493593452922212, + "grad_norm": 11.945016860961914, + "learning_rate": 2.79896374639079e-05, + "loss": 0.4542, "step": 543900 }, { - "epoch": 5.54, - "learning_rate": 4.391466413828877e-05, - "loss": 0.5682, + "epoch": 7.494971204981951, + "grad_norm": 0.7353599071502686, + "learning_rate": 2.798162327206459e-05, + "loss": 0.4616, "step": 544000 }, { - "epoch": 5.54, - "learning_rate": 4.3908786905241137e-05, - "loss": 0.6544, + "epoch": 7.496348957041691, + "grad_norm": 3.4027493000030518, + "learning_rate": 2.797360903930493e-05, + "loss": 0.4658, "step": 544100 }, { - "epoch": 5.54, - "learning_rate": 4.390290899213682e-05, - "loss": 0.6695, + "epoch": 7.49772670910143, + "grad_norm": 5.9105544090271, + "learning_rate": 2.796559476630976e-05, + "loss": 0.4275, "step": 544200 }, { - "epoch": 5.55, - "learning_rate": 4.3897030399263215e-05, - "loss": 0.5555, + "epoch": 7.49910446116117, + "grad_norm": 15.132706642150879, + "learning_rate": 2.7957580453759934e-05, + "loss": 0.4558, "step": 544300 }, { - "epoch": 5.55, - "learning_rate": 4.389120992299381e-05, - "loss": 0.6823, + "epoch": 7.500482213220909, + "grad_norm": 9.754582405090332, + "learning_rate": 2.794956610233631e-05, + "loss": 0.4595, "step": 544400 }, { - "epoch": 5.55, - "learning_rate": 4.388532997823448e-05, - "loss": 0.6049, + "epoch": 7.501859965280648, + "grad_norm": 3.4021215438842773, + "learning_rate": 2.7941551712719735e-05, + "loss": 0.453, "step": 544500 }, { - "epoch": 5.55, - "learning_rate": 4.387944935456538e-05, - "loss": 0.6513, + "epoch": 7.5032377173403875, + "grad_norm": 3.115478038787842, + "learning_rate": 2.7933537285591087e-05, + "loss": 0.3945, "step": 544600 }, { - "epoch": 5.55, - "learning_rate": 4.387356805227405e-05, - "loss": 0.6894, + "epoch": 7.504615469400127, + "grad_norm": 5.966651916503906, + "learning_rate": 2.7925522821631235e-05, + "loss": 0.4195, "step": 544700 }, { - "epoch": 5.55, - "learning_rate": 4.386768607164802e-05, - "loss": 0.5959, + "epoch": 7.505993221459866, + "grad_norm": 5.1514105796813965, + "learning_rate": 2.7917508321521035e-05, + "loss": 0.4192, "step": 544800 }, { - "epoch": 5.55, - "learning_rate": 4.38618034129749e-05, - "loss": 0.5943, + "epoch": 7.507370973519605, + "grad_norm": 4.739969253540039, + "learning_rate": 2.7909573931470508e-05, + "loss": 0.4486, "step": 544900 }, { - "epoch": 5.55, - "learning_rate": 4.385592007654232e-05, - "loss": 0.5475, + "epoch": 7.508748725579345, + "grad_norm": 7.577914237976074, + "learning_rate": 2.790155936144677e-05, + "loss": 0.4292, "step": 545000 }, { - "epoch": 5.55, - "learning_rate": 4.385003606263793e-05, - "loss": 0.6197, + "epoch": 7.510126477639084, + "grad_norm": 3.4212565422058105, + "learning_rate": 2.7893544757308502e-05, + "loss": 0.4494, "step": 545100 }, { - "epoch": 5.55, - "learning_rate": 4.384415137154943e-05, - "loss": 0.6858, + "epoch": 7.511504229698823, + "grad_norm": 4.344147205352783, + "learning_rate": 2.7885530119736596e-05, + "loss": 0.4346, "step": 545200 }, { - "epoch": 5.56, - "learning_rate": 4.383826600356455e-05, - "loss": 0.5553, + "epoch": 7.512881981758563, + "grad_norm": 13.158196449279785, + "learning_rate": 2.787751544941194e-05, + "loss": 0.4817, "step": 545300 }, { - "epoch": 5.56, - "learning_rate": 4.383237995897105e-05, - "loss": 0.5994, + "epoch": 7.514259733818302, + "grad_norm": 9.003913879394531, + "learning_rate": 2.786950074701541e-05, + "loss": 0.4278, "step": 545400 }, { - "epoch": 5.56, - "learning_rate": 4.382649323805671e-05, - "loss": 0.6438, + "epoch": 7.515637485878042, + "grad_norm": 9.059292793273926, + "learning_rate": 2.7861486013227906e-05, + "loss": 0.4371, "step": 545500 }, { - "epoch": 5.56, - "learning_rate": 4.382060584110937e-05, - "loss": 0.587, + "epoch": 7.51701523793778, + "grad_norm": 0.09797698259353638, + "learning_rate": 2.7853471248730304e-05, + "loss": 0.452, "step": 545600 }, { - "epoch": 5.56, - "learning_rate": 4.381471776841687e-05, - "loss": 0.6921, + "epoch": 7.51839298999752, + "grad_norm": 5.361985683441162, + "learning_rate": 2.784545645420352e-05, + "loss": 0.3916, "step": 545700 }, { - "epoch": 5.56, - "learning_rate": 4.380882902026711e-05, - "loss": 0.531, + "epoch": 7.5197707420572595, + "grad_norm": 4.8792643547058105, + "learning_rate": 2.783744163032845e-05, + "loss": 0.4467, "step": 545800 }, { - "epoch": 5.56, - "learning_rate": 4.3802939596948035e-05, - "loss": 0.5933, + "epoch": 7.521148494116999, + "grad_norm": 6.651999473571777, + "learning_rate": 2.7829426777785974e-05, + "loss": 0.4608, "step": 545900 }, { - "epoch": 5.56, - "learning_rate": 4.379704949874758e-05, - "loss": 0.6053, + "epoch": 7.522526246176738, + "grad_norm": 4.2536492347717285, + "learning_rate": 2.782141189725701e-05, + "loss": 0.4687, "step": 546000 }, { - "epoch": 5.56, - "learning_rate": 4.3791158725953735e-05, - "loss": 0.6528, + "epoch": 7.523903998236477, + "grad_norm": 2.4149169921875, + "learning_rate": 2.7813396989422468e-05, + "loss": 0.4177, "step": 546100 }, { - "epoch": 5.56, - "learning_rate": 4.3785267278854534e-05, - "loss": 0.5721, + "epoch": 7.525281750296217, + "grad_norm": 4.404573440551758, + "learning_rate": 2.7805382054963247e-05, + "loss": 0.3984, "step": 546200 }, { - "epoch": 5.57, - "learning_rate": 4.377937515773803e-05, - "loss": 0.6541, + "epoch": 7.526659502355956, + "grad_norm": 2.8749806880950928, + "learning_rate": 2.7797367094560257e-05, + "loss": 0.4649, "step": 546300 }, { - "epoch": 5.57, - "learning_rate": 4.377348236289231e-05, - "loss": 0.5926, + "epoch": 7.528037254415695, + "grad_norm": 4.871800899505615, + "learning_rate": 2.7789352108894404e-05, + "loss": 0.4709, "step": 546400 }, { - "epoch": 5.57, - "learning_rate": 4.3767588894605494e-05, - "loss": 0.645, + "epoch": 7.529415006475435, + "grad_norm": 22.180395126342773, + "learning_rate": 2.7781337098646615e-05, + "loss": 0.4227, "step": 546500 }, { - "epoch": 5.57, - "learning_rate": 4.376169475316574e-05, - "loss": 0.6006, + "epoch": 7.530792758535174, + "grad_norm": 0.8249420523643494, + "learning_rate": 2.7773322064497802e-05, + "loss": 0.4538, "step": 546600 }, { - "epoch": 5.57, - "learning_rate": 4.375579993886125e-05, - "loss": 0.648, + "epoch": 7.532170510594914, + "grad_norm": 10.610916137695312, + "learning_rate": 2.776530700712888e-05, + "loss": 0.4457, "step": 546700 }, { - "epoch": 5.57, - "learning_rate": 4.374990445198023e-05, - "loss": 0.6921, + "epoch": 7.533548262654652, + "grad_norm": 2.190502643585205, + "learning_rate": 2.775729192722077e-05, + "loss": 0.4445, "step": 546800 }, { - "epoch": 5.57, - "learning_rate": 4.374400829281093e-05, - "loss": 0.5948, + "epoch": 7.534926014714392, + "grad_norm": 9.598668098449707, + "learning_rate": 2.774927682545439e-05, + "loss": 0.5067, "step": 546900 }, { - "epoch": 5.57, - "learning_rate": 4.373811146164164e-05, - "loss": 0.612, + "epoch": 7.5363037667741315, + "grad_norm": 0.7223147749900818, + "learning_rate": 2.7741261702510673e-05, + "loss": 0.4449, "step": 547000 }, { - "epoch": 5.57, - "learning_rate": 4.3732213958760696e-05, - "loss": 0.595, + "epoch": 7.537681518833871, + "grad_norm": 3.7646493911743164, + "learning_rate": 2.773324655907054e-05, + "loss": 0.5046, "step": 547100 }, { - "epoch": 5.57, - "learning_rate": 4.372631578445644e-05, - "loss": 0.6331, + "epoch": 7.53905927089361, + "grad_norm": 2.554262161254883, + "learning_rate": 2.772523139581492e-05, + "loss": 0.3902, "step": 547200 }, { - "epoch": 5.58, - "learning_rate": 4.3720416939017255e-05, - "loss": 0.5206, + "epoch": 7.540437022953349, + "grad_norm": 3.9231226444244385, + "learning_rate": 2.7717216213424737e-05, + "loss": 0.4376, "step": 547300 }, { - "epoch": 5.58, - "learning_rate": 4.371451742273155e-05, - "loss": 0.5713, + "epoch": 7.541814775013089, + "grad_norm": 3.263807535171509, + "learning_rate": 2.770920101258093e-05, + "loss": 0.3812, "step": 547400 }, { - "epoch": 5.58, - "learning_rate": 4.37086172358878e-05, - "loss": 0.5669, + "epoch": 7.5431925270728275, + "grad_norm": 4.426365375518799, + "learning_rate": 2.7701185793964423e-05, + "loss": 0.4195, "step": 547500 }, { - "epoch": 5.58, - "learning_rate": 4.3702716378774473e-05, - "loss": 0.6388, + "epoch": 7.544570279132567, + "grad_norm": 0.9327697157859802, + "learning_rate": 2.769317055825616e-05, + "loss": 0.4011, "step": 547600 }, { - "epoch": 5.58, - "learning_rate": 4.3696814851680085e-05, - "loss": 0.5884, + "epoch": 7.545948031192307, + "grad_norm": 0.11842532455921173, + "learning_rate": 2.7685155306137054e-05, + "loss": 0.559, "step": 547700 }, { - "epoch": 5.58, - "learning_rate": 4.369091265489319e-05, - "loss": 0.5899, + "epoch": 7.547325783252046, + "grad_norm": 2.767132043838501, + "learning_rate": 2.7677140038288074e-05, + "loss": 0.4519, "step": 547800 }, { - "epoch": 5.58, - "learning_rate": 4.3685009788702366e-05, - "loss": 0.61, + "epoch": 7.548703535311786, + "grad_norm": 4.038454532623291, + "learning_rate": 2.7669124755390133e-05, + "loss": 0.4346, "step": 547900 }, { - "epoch": 5.58, - "learning_rate": 4.367910625339624e-05, - "loss": 0.5401, + "epoch": 7.550081287371524, + "grad_norm": 2.855855703353882, + "learning_rate": 2.7661109458124186e-05, + "loss": 0.4632, "step": 548000 }, { - "epoch": 5.58, - "learning_rate": 4.367320204926345e-05, - "loss": 0.6246, + "epoch": 7.551459039431264, + "grad_norm": 2.4040982723236084, + "learning_rate": 2.7653094147171163e-05, + "loss": 0.4184, "step": 548100 }, { - "epoch": 5.59, - "learning_rate": 4.366729717659267e-05, - "loss": 0.6131, + "epoch": 7.5528367914910035, + "grad_norm": 4.255384922027588, + "learning_rate": 2.7645078823211996e-05, + "loss": 0.3831, "step": 548200 }, { - "epoch": 5.59, - "learning_rate": 4.366139163567262e-05, - "loss": 0.6377, + "epoch": 7.554214543550742, + "grad_norm": 6.381831169128418, + "learning_rate": 2.763706348692766e-05, + "loss": 0.485, "step": 548300 }, { - "epoch": 5.59, - "learning_rate": 4.365548542679205e-05, - "loss": 0.5178, + "epoch": 7.555592295610482, + "grad_norm": 2.5923430919647217, + "learning_rate": 2.762904813899907e-05, + "loss": 0.43, "step": 548400 }, { - "epoch": 5.59, - "learning_rate": 4.364957855023974e-05, - "loss": 0.6348, + "epoch": 7.556970047670221, + "grad_norm": 1.9660134315490723, + "learning_rate": 2.7621032780107176e-05, + "loss": 0.3912, "step": 548500 }, { - "epoch": 5.59, - "learning_rate": 4.364367100630449e-05, - "loss": 0.5241, + "epoch": 7.558347799729961, + "grad_norm": 21.31551742553711, + "learning_rate": 2.7613017410932943e-05, + "loss": 0.4232, "step": 548600 }, { - "epoch": 5.59, - "learning_rate": 4.363776279527514e-05, - "loss": 0.552, + "epoch": 7.5597255517897, + "grad_norm": 1.1674977540969849, + "learning_rate": 2.760508218599034e-05, + "loss": 0.4445, "step": 548700 }, { - "epoch": 5.59, - "learning_rate": 4.363185391744059e-05, - "loss": 0.5183, + "epoch": 7.561103303849439, + "grad_norm": 1.6215500831604004, + "learning_rate": 2.7597066798380063e-05, + "loss": 0.4453, "step": 548800 }, { - "epoch": 5.59, - "learning_rate": 4.362594437308973e-05, - "loss": 0.5679, + "epoch": 7.562481055909179, + "grad_norm": 2.0366902351379395, + "learning_rate": 2.7589051402523462e-05, + "loss": 0.3568, "step": 548900 }, { - "epoch": 5.59, - "learning_rate": 4.3620034162511496e-05, - "loss": 0.5126, + "epoch": 7.563858807968918, + "grad_norm": 3.8983054161071777, + "learning_rate": 2.75810359991015e-05, + "loss": 0.4205, "step": 549000 }, { - "epoch": 5.59, - "learning_rate": 4.361412328599488e-05, - "loss": 0.5286, + "epoch": 7.565236560028657, + "grad_norm": 4.745150089263916, + "learning_rate": 2.7573020588795105e-05, + "loss": 0.3978, "step": 549100 }, { - "epoch": 5.6, - "learning_rate": 4.360821174382888e-05, - "loss": 0.6341, + "epoch": 7.566614312088396, + "grad_norm": 1.7292938232421875, + "learning_rate": 2.7565005172285243e-05, + "loss": 0.3805, "step": 549200 }, { - "epoch": 5.6, - "learning_rate": 4.360229953630253e-05, - "loss": 0.6589, + "epoch": 7.567992064148136, + "grad_norm": 4.7617573738098145, + "learning_rate": 2.7556989750252857e-05, + "loss": 0.4486, "step": 549300 }, { - "epoch": 5.6, - "learning_rate": 4.3596386663704914e-05, - "loss": 0.5656, + "epoch": 7.5693698162078755, + "grad_norm": 4.335742473602295, + "learning_rate": 2.7548974323378903e-05, + "loss": 0.4752, "step": 549400 }, { - "epoch": 5.6, - "learning_rate": 4.3590473126325127e-05, - "loss": 0.6225, + "epoch": 7.570747568267614, + "grad_norm": 7.225325107574463, + "learning_rate": 2.754095889234433e-05, + "loss": 0.4665, "step": 549500 }, { - "epoch": 5.6, - "learning_rate": 4.35845589244523e-05, - "loss": 0.6008, + "epoch": 7.572125320327354, + "grad_norm": 7.217161178588867, + "learning_rate": 2.7532943457830076e-05, + "loss": 0.4916, "step": 549600 }, { - "epoch": 5.6, - "learning_rate": 4.3578644058375624e-05, - "loss": 0.6276, + "epoch": 7.573503072387093, + "grad_norm": 6.872833251953125, + "learning_rate": 2.7524928020517125e-05, + "loss": 0.4356, "step": 549700 }, { - "epoch": 5.6, - "learning_rate": 4.3572728528384276e-05, - "loss": 0.6903, + "epoch": 7.574880824446833, + "grad_norm": 3.390831232070923, + "learning_rate": 2.7516912581086405e-05, + "loss": 0.3905, "step": 549800 }, { - "epoch": 5.6, - "learning_rate": 4.356687149998767e-05, - "loss": 0.627, + "epoch": 7.5762585765065715, + "grad_norm": 1.4677950143814087, + "learning_rate": 2.7508897140218866e-05, + "loss": 0.3942, "step": 549900 }, { - "epoch": 5.6, - "learning_rate": 4.3560954649666674e-05, - "loss": 0.5396, + "epoch": 7.577636328566311, + "grad_norm": 5.779827117919922, + "learning_rate": 2.7500881698595486e-05, + "loss": 0.4291, "step": 550000 }, { - "epoch": 5.6, - "learning_rate": 4.355503713629591e-05, - "loss": 0.5334, + "epoch": 7.579014080626051, + "grad_norm": 5.80511474609375, + "learning_rate": 2.749286625689719e-05, + "loss": 0.45, "step": 550100 }, { - "epoch": 5.61, - "learning_rate": 4.354911896016473e-05, - "loss": 0.5482, + "epoch": 7.58039183268579, + "grad_norm": 3.532496213912964, + "learning_rate": 2.7484850815804945e-05, + "loss": 0.4133, "step": 550200 }, { - "epoch": 5.61, - "learning_rate": 4.354320012156248e-05, - "loss": 0.5601, + "epoch": 7.581769584745529, + "grad_norm": 4.331103324890137, + "learning_rate": 2.7476835375999697e-05, + "loss": 0.4377, "step": 550300 }, { - "epoch": 5.61, - "learning_rate": 4.353728062077858e-05, - "loss": 0.6359, + "epoch": 7.583147336805268, + "grad_norm": 3.1047475337982178, + "learning_rate": 2.7468819938162402e-05, + "loss": 0.4224, "step": 550400 }, { - "epoch": 5.61, - "learning_rate": 4.353136045810244e-05, - "loss": 0.4822, + "epoch": 7.584525088865008, + "grad_norm": 3.1765358448028564, + "learning_rate": 2.7460804502974013e-05, + "loss": 0.4322, "step": 550500 }, { - "epoch": 5.61, - "learning_rate": 4.352543963382352e-05, - "loss": 0.6395, + "epoch": 7.5859028409247475, + "grad_norm": 15.844441413879395, + "learning_rate": 2.7452789071115486e-05, + "loss": 0.4464, "step": 550600 }, { - "epoch": 5.61, - "learning_rate": 4.3519518148231326e-05, - "loss": 0.6078, + "epoch": 7.587280592984486, + "grad_norm": 3.4814414978027344, + "learning_rate": 2.7444773643267775e-05, + "loss": 0.4075, "step": 550700 }, { - "epoch": 5.61, - "learning_rate": 4.3513596001615365e-05, - "loss": 0.6139, + "epoch": 7.588658345044226, + "grad_norm": 2.5962369441986084, + "learning_rate": 2.7436758220111818e-05, + "loss": 0.4031, "step": 550800 }, { - "epoch": 5.61, - "learning_rate": 4.350767319426521e-05, - "loss": 0.5345, + "epoch": 7.590036097103965, + "grad_norm": 2.559882879257202, + "learning_rate": 2.742882295647757e-05, + "loss": 0.4743, "step": 550900 }, { - "epoch": 5.61, - "learning_rate": 4.350174972647044e-05, - "loss": 0.5326, + "epoch": 7.591413849163705, + "grad_norm": 0.9008790254592896, + "learning_rate": 2.742080754468408e-05, + "loss": 0.4486, "step": 551000 }, { - "epoch": 5.61, - "learning_rate": 4.349582559852069e-05, - "loss": 0.5607, + "epoch": 7.5927916012234435, + "grad_norm": 8.004463195800781, + "learning_rate": 2.7412792139618397e-05, + "loss": 0.4016, "step": 551100 }, { - "epoch": 5.62, - "learning_rate": 4.34899008107056e-05, - "loss": 0.6106, + "epoch": 7.594169353283183, + "grad_norm": 17.060848236083984, + "learning_rate": 2.740477674196147e-05, + "loss": 0.4366, "step": 551200 }, { - "epoch": 5.62, - "learning_rate": 4.348397536331488e-05, - "loss": 0.6167, + "epoch": 7.595547105342923, + "grad_norm": 1.7429239749908447, + "learning_rate": 2.7396761352394246e-05, + "loss": 0.4756, "step": 551300 }, { - "epoch": 5.62, - "learning_rate": 4.347804925663821e-05, - "loss": 0.6763, + "epoch": 7.596924857402662, + "grad_norm": 5.913734436035156, + "learning_rate": 2.738874597159767e-05, + "loss": 0.4153, "step": 551400 }, { - "epoch": 5.62, - "learning_rate": 4.3472122490965375e-05, - "loss": 0.55, + "epoch": 7.598302609462401, + "grad_norm": 3.1381280422210693, + "learning_rate": 2.738073060025269e-05, + "loss": 0.4191, "step": 551500 }, { - "epoch": 5.62, - "learning_rate": 4.346619506658615e-05, - "loss": 0.5514, + "epoch": 7.59968036152214, + "grad_norm": 3.892171859741211, + "learning_rate": 2.7372715239040254e-05, + "loss": 0.4157, "step": 551600 }, { - "epoch": 5.62, - "learning_rate": 4.346026698379034e-05, - "loss": 0.5638, + "epoch": 7.60105811358188, + "grad_norm": 4.3432416915893555, + "learning_rate": 2.7364699888641323e-05, + "loss": 0.401, "step": 551700 }, { - "epoch": 5.62, - "learning_rate": 4.3454338242867804e-05, - "loss": 0.5867, + "epoch": 7.602435865641619, + "grad_norm": 11.388246536254883, + "learning_rate": 2.735668454973682e-05, + "loss": 0.4001, "step": 551800 }, { - "epoch": 5.62, - "learning_rate": 4.3448408844108425e-05, - "loss": 0.6505, + "epoch": 7.603813617701358, + "grad_norm": 11.4810791015625, + "learning_rate": 2.7348669223007698e-05, + "loss": 0.4245, "step": 551900 }, { - "epoch": 5.62, - "learning_rate": 4.344247878780211e-05, - "loss": 0.671, + "epoch": 7.605191369761098, + "grad_norm": 3.2112677097320557, + "learning_rate": 2.7340653909134894e-05, + "loss": 0.4443, "step": 552000 }, { - "epoch": 5.62, - "learning_rate": 4.3436548074238793e-05, - "loss": 0.5807, + "epoch": 7.606569121820837, + "grad_norm": 13.221724510192871, + "learning_rate": 2.733263860879936e-05, + "loss": 0.4185, "step": 552100 }, { - "epoch": 5.63, - "learning_rate": 4.343061670370847e-05, - "loss": 0.5581, + "epoch": 7.607946873880577, + "grad_norm": 4.487495422363281, + "learning_rate": 2.7324623322682035e-05, + "loss": 0.371, "step": 552200 }, { - "epoch": 5.63, - "learning_rate": 4.342468467650114e-05, - "loss": 0.589, + "epoch": 7.609324625940316, + "grad_norm": 3.0562098026275635, + "learning_rate": 2.7316608051463862e-05, + "loss": 0.3934, "step": 552300 }, { - "epoch": 5.63, - "learning_rate": 4.341875199290684e-05, - "loss": 0.6767, + "epoch": 7.610702378000055, + "grad_norm": 0.7446732521057129, + "learning_rate": 2.7308592795825763e-05, + "loss": 0.4926, "step": 552400 }, { - "epoch": 5.63, - "learning_rate": 4.341281865321564e-05, - "loss": 0.5379, + "epoch": 7.612080130059795, + "grad_norm": 3.0428073406219482, + "learning_rate": 2.7300577556448684e-05, + "loss": 0.3953, "step": 552500 }, { - "epoch": 5.63, - "learning_rate": 4.340688465771766e-05, - "loss": 0.6394, + "epoch": 7.613457882119533, + "grad_norm": 3.3515334129333496, + "learning_rate": 2.7292562334013568e-05, + "loss": 0.4957, "step": 552600 }, { - "epoch": 5.63, - "learning_rate": 4.340095000670303e-05, - "loss": 0.4846, + "epoch": 7.614835634179273, + "grad_norm": 2.5777339935302734, + "learning_rate": 2.728454712920134e-05, + "loss": 0.4384, "step": 552700 }, { - "epoch": 5.63, - "learning_rate": 4.3395014700461904e-05, - "loss": 0.5949, + "epoch": 7.6162133862390125, + "grad_norm": 4.800641059875488, + "learning_rate": 2.7276531942692935e-05, + "loss": 0.4549, "step": 552800 }, { - "epoch": 5.63, - "learning_rate": 4.338907873928451e-05, - "loss": 0.725, + "epoch": 7.617591138298752, + "grad_norm": 20.748266220092773, + "learning_rate": 2.726851677516929e-05, + "loss": 0.4585, "step": 552900 }, { - "epoch": 5.63, - "learning_rate": 4.338320149285884e-05, - "loss": 0.568, + "epoch": 7.618968890358492, + "grad_norm": 3.864189863204956, + "learning_rate": 2.7260501627311324e-05, + "loss": 0.4663, "step": 553000 }, { - "epoch": 5.64, - "learning_rate": 4.3377264229221736e-05, - "loss": 0.5683, + "epoch": 7.62034664241823, + "grad_norm": 14.188593864440918, + "learning_rate": 2.725248649979997e-05, + "loss": 0.376, "step": 553100 }, { - "epoch": 5.64, - "learning_rate": 4.3371326311516235e-05, - "loss": 0.5702, + "epoch": 7.62172439447797, + "grad_norm": 5.0581817626953125, + "learning_rate": 2.7244471393316158e-05, + "loss": 0.457, "step": 553200 }, { - "epoch": 5.64, - "learning_rate": 4.336538774003268e-05, - "loss": 0.6097, + "epoch": 7.623102146537709, + "grad_norm": 1.201155424118042, + "learning_rate": 2.7236456308540806e-05, + "loss": 0.437, "step": 553300 }, { - "epoch": 5.64, - "learning_rate": 4.335944851506141e-05, - "loss": 0.6066, + "epoch": 7.624479898597448, + "grad_norm": 63.219810485839844, + "learning_rate": 2.7228441246154843e-05, + "loss": 0.5435, "step": 553400 }, { - "epoch": 5.64, - "learning_rate": 4.335350863689285e-05, - "loss": 0.6705, + "epoch": 7.625857650657188, + "grad_norm": 10.504203796386719, + "learning_rate": 2.7220426206839177e-05, + "loss": 0.4972, "step": 553500 }, { - "epoch": 5.64, - "learning_rate": 4.33475681058174e-05, - "loss": 0.5738, + "epoch": 7.627235402716927, + "grad_norm": 3.643195152282715, + "learning_rate": 2.7212411191274738e-05, + "loss": 0.4497, "step": 553600 }, { - "epoch": 5.64, - "learning_rate": 4.3341626922125525e-05, - "loss": 0.5466, + "epoch": 7.628613154776667, + "grad_norm": 3.3292970657348633, + "learning_rate": 2.7204396200142443e-05, + "loss": 0.4045, "step": 553700 }, { - "epoch": 5.64, - "learning_rate": 4.333568508610771e-05, - "loss": 0.4833, + "epoch": 7.629990906836405, + "grad_norm": 3.394129991531372, + "learning_rate": 2.71963812341232e-05, + "loss": 0.4129, "step": 553800 }, { - "epoch": 5.64, - "learning_rate": 4.332974259805448e-05, - "loss": 0.5848, + "epoch": 7.631368658896145, + "grad_norm": 2.786311626434326, + "learning_rate": 2.7188446443170265e-05, + "loss": 0.4085, "step": 553900 }, { - "epoch": 5.64, - "learning_rate": 4.3323858892879546e-05, - "loss": 0.61, + "epoch": 7.6327464109558845, + "grad_norm": 11.43811321258545, + "learning_rate": 2.7180431529151748e-05, + "loss": 0.4132, "step": 554000 }, { - "epoch": 5.65, - "learning_rate": 4.331791510814028e-05, - "loss": 0.6415, + "epoch": 7.634124163015624, + "grad_norm": 0.5038183927536011, + "learning_rate": 2.7172416642282212e-05, + "loss": 0.4673, "step": 554100 }, { - "epoch": 5.65, - "learning_rate": 4.331197067223442e-05, - "loss": 0.5616, + "epoch": 7.635501915075363, + "grad_norm": 8.87948989868164, + "learning_rate": 2.7164401783242547e-05, + "loss": 0.3787, "step": 554200 }, { - "epoch": 5.65, - "learning_rate": 4.3306025585452656e-05, - "loss": 0.6331, + "epoch": 7.636879667135102, + "grad_norm": 6.885680675506592, + "learning_rate": 2.7156386952713675e-05, + "loss": 0.4465, "step": 554300 }, { - "epoch": 5.65, - "learning_rate": 4.3300079848085645e-05, - "loss": 0.5442, + "epoch": 7.638257419194842, + "grad_norm": 2.42177677154541, + "learning_rate": 2.7148452299243125e-05, + "loss": 0.5471, "step": 554400 }, { - "epoch": 5.65, - "learning_rate": 4.3294133460424104e-05, - "loss": 0.6466, + "epoch": 7.639635171254581, + "grad_norm": 1.4351866245269775, + "learning_rate": 2.7140437527476432e-05, + "loss": 0.428, "step": 554500 }, { - "epoch": 5.65, - "learning_rate": 4.328818642275876e-05, - "loss": 0.5342, + "epoch": 7.64101292331432, + "grad_norm": 8.119945526123047, + "learning_rate": 2.7132422786256407e-05, + "loss": 0.4198, "step": 554600 }, { - "epoch": 5.65, - "learning_rate": 4.3282238735380406e-05, - "loss": 0.6022, + "epoch": 7.64239067537406, + "grad_norm": 2.7974629402160645, + "learning_rate": 2.7124408076263946e-05, + "loss": 0.3663, "step": 554700 }, { - "epoch": 5.65, - "learning_rate": 4.327629039857984e-05, - "loss": 0.6032, + "epoch": 7.643768427433799, + "grad_norm": 3.1003544330596924, + "learning_rate": 2.7116393398179934e-05, + "loss": 0.4467, "step": 554800 }, { - "epoch": 5.65, - "learning_rate": 4.32703414126479e-05, - "loss": 0.6293, + "epoch": 7.645146179493539, + "grad_norm": 1.658278226852417, + "learning_rate": 2.710837875268527e-05, + "loss": 0.4093, "step": 554900 }, { - "epoch": 5.65, - "learning_rate": 4.326439177787546e-05, - "loss": 0.5969, + "epoch": 7.646523931553277, + "grad_norm": 4.178915023803711, + "learning_rate": 2.7100364140460845e-05, + "loss": 0.4519, "step": 555000 }, { - "epoch": 5.66, - "learning_rate": 4.3258441494553425e-05, - "loss": 0.556, + "epoch": 7.647901683613017, + "grad_norm": 10.85798168182373, + "learning_rate": 2.7092349562187523e-05, + "loss": 0.404, "step": 555100 }, { - "epoch": 5.66, - "learning_rate": 4.3252490562972716e-05, - "loss": 0.6045, + "epoch": 7.6492794356727565, + "grad_norm": 3.015878915786743, + "learning_rate": 2.708433501854619e-05, + "loss": 0.4606, "step": 555200 }, { - "epoch": 5.66, - "learning_rate": 4.324653898342431e-05, - "loss": 0.5206, + "epoch": 7.650657187732496, + "grad_norm": 4.767267227172852, + "learning_rate": 2.7076320510217717e-05, + "loss": 0.4036, "step": 555300 }, { - "epoch": 5.66, - "learning_rate": 4.3240586756199196e-05, - "loss": 0.5951, + "epoch": 7.652034939792235, + "grad_norm": 2.2454137802124023, + "learning_rate": 2.706830603788299e-05, + "loss": 0.4091, "step": 555400 }, { - "epoch": 5.66, - "learning_rate": 4.3234633881588406e-05, - "loss": 0.6341, + "epoch": 7.653412691851974, + "grad_norm": 9.89771842956543, + "learning_rate": 2.7060291602222876e-05, + "loss": 0.4364, "step": 555500 }, { - "epoch": 5.66, - "learning_rate": 4.3228680359883e-05, - "loss": 0.5919, + "epoch": 7.654790443911714, + "grad_norm": 37.50621795654297, + "learning_rate": 2.7052277203918248e-05, + "loss": 0.4235, "step": 555600 }, { - "epoch": 5.66, - "learning_rate": 4.322272619137407e-05, - "loss": 0.7124, + "epoch": 7.656168195971453, + "grad_norm": 3.441382884979248, + "learning_rate": 2.704426284364995e-05, + "loss": 0.4475, "step": 555700 }, { - "epoch": 5.66, - "learning_rate": 4.321677137635274e-05, - "loss": 0.6143, + "epoch": 7.657545948031192, + "grad_norm": 1.5240055322647095, + "learning_rate": 2.7036248522098852e-05, + "loss": 0.3591, "step": 555800 }, { - "epoch": 5.66, - "learning_rate": 4.321081591511016e-05, - "loss": 0.6462, + "epoch": 7.658923700090932, + "grad_norm": 3.976015329360962, + "learning_rate": 2.7028234239945822e-05, + "loss": 0.4359, "step": 555900 }, { - "epoch": 5.66, - "learning_rate": 4.320485980793752e-05, - "loss": 0.5808, + "epoch": 7.660301452150671, + "grad_norm": 22.412111282348633, + "learning_rate": 2.7020219997871705e-05, + "loss": 0.4308, "step": 556000 }, { - "epoch": 5.67, - "learning_rate": 4.3198903055126035e-05, - "loss": 0.6107, + "epoch": 7.66167920421041, + "grad_norm": 1.1921263933181763, + "learning_rate": 2.701220579655736e-05, + "loss": 0.3562, "step": 556100 }, { - "epoch": 5.67, - "learning_rate": 4.319294565696696e-05, - "loss": 0.6008, + "epoch": 7.663056956270149, + "grad_norm": 2.0331079959869385, + "learning_rate": 2.700419163668362e-05, + "loss": 0.4328, "step": 556200 }, { - "epoch": 5.67, - "learning_rate": 4.318698761375158e-05, - "loss": 0.5931, + "epoch": 7.664434708329889, + "grad_norm": 2.5105929374694824, + "learning_rate": 2.6996177518931335e-05, + "loss": 0.4011, "step": 556300 }, { - "epoch": 5.67, - "learning_rate": 4.31810289257712e-05, - "loss": 0.6418, + "epoch": 7.6658124603896285, + "grad_norm": 42.577552795410156, + "learning_rate": 2.698816344398135e-05, + "loss": 0.4453, "step": 556400 }, { - "epoch": 5.67, - "learning_rate": 4.317506959331716e-05, - "loss": 0.4871, + "epoch": 7.667190212449368, + "grad_norm": 10.191869735717773, + "learning_rate": 2.6980149412514502e-05, + "loss": 0.4004, "step": 556500 }, { - "epoch": 5.67, - "learning_rate": 4.3169169219634954e-05, - "loss": 0.6527, + "epoch": 7.668567964509107, + "grad_norm": 2.487185001373291, + "learning_rate": 2.6972135425211618e-05, + "loss": 0.4382, "step": 556600 }, { - "epoch": 5.67, - "learning_rate": 4.3163208605545225e-05, - "loss": 0.6247, + "epoch": 7.669945716568846, + "grad_norm": 5.342384338378906, + "learning_rate": 2.6964121482753535e-05, + "loss": 0.4139, "step": 556700 }, { - "epoch": 5.67, - "learning_rate": 4.315724734785316e-05, - "loss": 0.6339, + "epoch": 7.671323468628586, + "grad_norm": 5.744577884674072, + "learning_rate": 2.6956107585821068e-05, + "loss": 0.4347, "step": 556800 }, { - "epoch": 5.67, - "learning_rate": 4.315128544685021e-05, - "loss": 0.5041, + "epoch": 7.6727012206883245, + "grad_norm": 2.852713108062744, + "learning_rate": 2.6948093735095044e-05, + "loss": 0.4401, "step": 556900 }, { - "epoch": 5.67, - "learning_rate": 4.314532290282789e-05, - "loss": 0.4832, + "epoch": 7.674078972748064, + "grad_norm": 1.303440809249878, + "learning_rate": 2.6940079931256285e-05, + "loss": 0.4303, "step": 557000 }, { - "epoch": 5.68, - "learning_rate": 4.3139359716077734e-05, - "loss": 0.5883, + "epoch": 7.675456724807804, + "grad_norm": 4.114048480987549, + "learning_rate": 2.69320661749856e-05, + "loss": 0.4402, "step": 557100 }, { - "epoch": 5.68, - "learning_rate": 4.313339588689129e-05, - "loss": 0.6008, + "epoch": 7.676834476867543, + "grad_norm": 19.123132705688477, + "learning_rate": 2.692405246696379e-05, + "loss": 0.3761, "step": 557200 }, { - "epoch": 5.68, - "learning_rate": 4.3127431415560166e-05, - "loss": 0.5565, + "epoch": 7.678212228927283, + "grad_norm": 3.209638833999634, + "learning_rate": 2.691603880787169e-05, + "loss": 0.4454, "step": 557300 }, { - "epoch": 5.68, - "learning_rate": 4.312146630237598e-05, - "loss": 0.6162, + "epoch": 7.679589980987021, + "grad_norm": 10.314963340759277, + "learning_rate": 2.690802519839006e-05, + "loss": 0.4055, "step": 557400 }, { - "epoch": 5.68, - "learning_rate": 4.31155005476304e-05, - "loss": 0.4892, + "epoch": 7.680967733046761, + "grad_norm": 2.643754720687866, + "learning_rate": 2.6900011639199726e-05, + "loss": 0.453, "step": 557500 }, { - "epoch": 5.68, - "learning_rate": 4.31095938187486e-05, - "loss": 0.6597, + "epoch": 7.6823454851065005, + "grad_norm": 9.098482131958008, + "learning_rate": 2.689199813098147e-05, + "loss": 0.3885, "step": 557600 }, { - "epoch": 5.68, - "learning_rate": 4.310362678816366e-05, - "loss": 0.6182, + "epoch": 7.683723237166239, + "grad_norm": 10.241753578186035, + "learning_rate": 2.688398467441608e-05, + "loss": 0.4361, "step": 557700 }, { - "epoch": 5.68, - "learning_rate": 4.309765911688957e-05, - "loss": 0.6434, + "epoch": 7.685100989225979, + "grad_norm": 2.877782106399536, + "learning_rate": 2.6875971270184363e-05, + "loss": 0.4063, "step": 557800 }, { - "epoch": 5.68, - "learning_rate": 4.309169080521812e-05, - "loss": 0.6423, + "epoch": 7.686478741285718, + "grad_norm": 7.93145751953125, + "learning_rate": 2.6867957918967058e-05, + "loss": 0.4785, "step": 557900 }, { - "epoch": 5.69, - "learning_rate": 4.308572185344111e-05, - "loss": 0.5865, + "epoch": 7.687856493345458, + "grad_norm": 1.1333529949188232, + "learning_rate": 2.685994462144497e-05, + "loss": 0.3646, "step": 558000 }, { - "epoch": 5.69, - "learning_rate": 4.3079752261850385e-05, - "loss": 0.6173, + "epoch": 7.6892342454051965, + "grad_norm": 1.2457891702651978, + "learning_rate": 2.6851931378298857e-05, + "loss": 0.4471, "step": 558100 }, { - "epoch": 5.69, - "learning_rate": 4.307378203073784e-05, - "loss": 0.5951, + "epoch": 7.690611997464936, + "grad_norm": 1.9103825092315674, + "learning_rate": 2.6843918190209486e-05, + "loss": 0.4184, "step": 558200 }, { - "epoch": 5.69, - "learning_rate": 4.3067811160395377e-05, - "loss": 0.5642, + "epoch": 7.691989749524676, + "grad_norm": 4.85763692855835, + "learning_rate": 2.6835905057857624e-05, + "loss": 0.4061, "step": 558300 }, { - "epoch": 5.69, - "learning_rate": 4.3061839651114926e-05, - "loss": 0.6104, + "epoch": 7.693367501584415, + "grad_norm": 2.5763587951660156, + "learning_rate": 2.6827891981924035e-05, + "loss": 0.3913, "step": 558400 }, { - "epoch": 5.69, - "learning_rate": 4.305586750318847e-05, - "loss": 0.752, + "epoch": 7.694745253644154, + "grad_norm": 8.854558944702148, + "learning_rate": 2.681987896308945e-05, + "loss": 0.4125, "step": 558500 }, { - "epoch": 5.69, - "learning_rate": 4.3049894716908005e-05, - "loss": 0.5335, + "epoch": 7.696123005703893, + "grad_norm": 3.6522865295410156, + "learning_rate": 2.681186600203463e-05, + "loss": 0.4602, "step": 558600 }, { - "epoch": 5.69, - "learning_rate": 4.304392129256556e-05, - "loss": 0.611, + "epoch": 7.697500757763633, + "grad_norm": 5.700541019439697, + "learning_rate": 2.680385309944031e-05, + "loss": 0.4814, "step": 558700 }, { - "epoch": 5.69, - "learning_rate": 4.3037947230453215e-05, - "loss": 0.6107, + "epoch": 7.6988785098233725, + "grad_norm": 3.4218716621398926, + "learning_rate": 2.6795840255987232e-05, + "loss": 0.4159, "step": 558800 }, { - "epoch": 5.69, - "learning_rate": 4.3031972530863054e-05, - "loss": 0.6909, + "epoch": 7.700256261883111, + "grad_norm": 3.5009915828704834, + "learning_rate": 2.6787827472356134e-05, + "loss": 0.4139, "step": 558900 }, { - "epoch": 5.7, - "learning_rate": 4.302599719408721e-05, - "loss": 0.5326, + "epoch": 7.701634013942851, + "grad_norm": 12.02125072479248, + "learning_rate": 2.6779814749227732e-05, + "loss": 0.43, "step": 559000 }, { - "epoch": 5.7, - "learning_rate": 4.302002122041783e-05, - "loss": 0.6143, + "epoch": 7.70301176600259, + "grad_norm": 4.383796215057373, + "learning_rate": 2.677180208728275e-05, + "loss": 0.3723, "step": 559100 }, { - "epoch": 5.7, - "learning_rate": 4.301404461014712e-05, - "loss": 0.5387, + "epoch": 7.70438951806233, + "grad_norm": 5.934932231903076, + "learning_rate": 2.67637894872019e-05, + "loss": 0.4077, "step": 559200 }, { - "epoch": 5.7, - "learning_rate": 4.300806736356728e-05, - "loss": 0.5312, + "epoch": 7.7057672701220685, + "grad_norm": 4.267889499664307, + "learning_rate": 2.6755776949665903e-05, + "loss": 0.427, "step": 559300 }, { - "epoch": 5.7, - "learning_rate": 4.300208948097058e-05, - "loss": 0.6137, + "epoch": 7.707145022181808, + "grad_norm": 9.591529846191406, + "learning_rate": 2.6747764475355462e-05, + "loss": 0.3654, "step": 559400 }, { - "epoch": 5.7, - "learning_rate": 4.2996110962649306e-05, - "loss": 0.7067, + "epoch": 7.708522774241548, + "grad_norm": 12.786456108093262, + "learning_rate": 2.673975206495129e-05, + "loss": 0.423, "step": 559500 }, { - "epoch": 5.7, - "learning_rate": 4.2990131808895755e-05, - "loss": 0.6202, + "epoch": 7.709900526301287, + "grad_norm": 12.553844451904297, + "learning_rate": 2.6731739719134056e-05, + "loss": 0.4766, "step": 559600 }, { - "epoch": 5.7, - "learning_rate": 4.298415202000228e-05, - "loss": 0.6079, + "epoch": 7.711278278361026, + "grad_norm": 9.227893829345703, + "learning_rate": 2.6723727438584458e-05, + "loss": 0.439, "step": 559700 }, { - "epoch": 5.7, - "learning_rate": 4.2978171596261256e-05, - "loss": 0.5986, + "epoch": 7.712656030420765, + "grad_norm": 4.103210926055908, + "learning_rate": 2.6715715223983192e-05, + "loss": 0.4315, "step": 559800 }, { - "epoch": 5.7, - "learning_rate": 4.297219053796509e-05, - "loss": 0.6878, + "epoch": 7.714033782480505, + "grad_norm": 5.083662033081055, + "learning_rate": 2.670770307601093e-05, + "loss": 0.498, "step": 559900 }, { - "epoch": 5.71, - "learning_rate": 4.296620884540622e-05, - "loss": 0.6901, + "epoch": 7.7154115345402445, + "grad_norm": 5.420502185821533, + "learning_rate": 2.6699690995348356e-05, + "loss": 0.446, "step": 560000 }, { - "epoch": 5.71, - "learning_rate": 4.2960226518877124e-05, - "loss": 0.5864, + "epoch": 7.716789286599983, + "grad_norm": 16.163969039916992, + "learning_rate": 2.6691678982676116e-05, + "loss": 0.4454, "step": 560100 }, { - "epoch": 5.71, - "learning_rate": 4.2954303391408095e-05, - "loss": 0.5956, + "epoch": 7.718167038659723, + "grad_norm": 2.8711178302764893, + "learning_rate": 2.6683667038674877e-05, + "loss": 0.4122, "step": 560200 }, { - "epoch": 5.71, - "learning_rate": 4.2948319804148464e-05, - "loss": 0.5423, + "epoch": 7.719544790719462, + "grad_norm": 2.779858350753784, + "learning_rate": 2.6675655164025304e-05, + "loss": 0.3653, "step": 560300 }, { - "epoch": 5.71, - "learning_rate": 4.294233558379326e-05, - "loss": 0.7795, + "epoch": 7.720922542779201, + "grad_norm": 2.6652016639709473, + "learning_rate": 2.6667643359408043e-05, + "loss": 0.4462, "step": 560400 }, { - "epoch": 5.71, - "learning_rate": 4.293635073063508e-05, - "loss": 0.6248, + "epoch": 7.7223002948389405, + "grad_norm": 13.430249214172363, + "learning_rate": 2.6659711742490513e-05, + "loss": 0.4056, "step": 560500 }, { - "epoch": 5.71, - "learning_rate": 4.293036524496655e-05, - "loss": 0.6419, + "epoch": 7.72367804689868, + "grad_norm": 0.8853716850280762, + "learning_rate": 2.6651700079262498e-05, + "loss": 0.3815, "step": 560600 }, { - "epoch": 5.71, - "learning_rate": 4.292437912708033e-05, - "loss": 0.6114, + "epoch": 7.72505579895842, + "grad_norm": 1.9806708097457886, + "learning_rate": 2.66436884881019e-05, + "loss": 0.4437, "step": 560700 }, { - "epoch": 5.71, - "learning_rate": 4.2918392377269095e-05, - "loss": 0.5693, + "epoch": 7.726433551018159, + "grad_norm": 4.61207914352417, + "learning_rate": 2.663567696968934e-05, + "loss": 0.4567, "step": 560800 }, { - "epoch": 5.71, - "learning_rate": 4.291240499582557e-05, - "loss": 0.613, + "epoch": 7.727811303077898, + "grad_norm": 24.115774154663086, + "learning_rate": 2.6627665524705445e-05, + "loss": 0.4012, "step": 560900 }, { - "epoch": 5.72, - "learning_rate": 4.2906416983042495e-05, - "loss": 0.6145, + "epoch": 7.729189055137637, + "grad_norm": 7.2109575271606445, + "learning_rate": 2.6619654153830825e-05, + "loss": 0.4484, "step": 561000 }, { - "epoch": 5.72, - "learning_rate": 4.290042833921265e-05, - "loss": 0.6099, + "epoch": 7.730566807197377, + "grad_norm": 1.6353265047073364, + "learning_rate": 2.66116428577461e-05, + "loss": 0.4777, "step": 561100 }, { - "epoch": 5.72, - "learning_rate": 4.2894439064628845e-05, - "loss": 0.6162, + "epoch": 7.731944559257116, + "grad_norm": 2.9727556705474854, + "learning_rate": 2.660363163713185e-05, + "loss": 0.4616, "step": 561200 }, { - "epoch": 5.72, - "learning_rate": 4.288844915958392e-05, - "loss": 0.5448, + "epoch": 7.733322311316855, + "grad_norm": 6.219762802124023, + "learning_rate": 2.6595700603734124e-05, + "loss": 0.3998, "step": 561300 }, { - "epoch": 5.72, - "learning_rate": 4.288245862437075e-05, - "loss": 0.6628, + "epoch": 7.734700063376595, + "grad_norm": 2.914759874343872, + "learning_rate": 2.6587689535330934e-05, + "loss": 0.4137, "step": 561400 }, { - "epoch": 5.72, - "learning_rate": 4.287646745928223e-05, - "loss": 0.6489, + "epoch": 7.736077815436334, + "grad_norm": 2.966567039489746, + "learning_rate": 2.6579678544433177e-05, + "loss": 0.3974, "step": 561500 }, { - "epoch": 5.72, - "learning_rate": 4.2870475664611284e-05, - "loss": 0.5763, + "epoch": 7.737455567496074, + "grad_norm": 4.464357376098633, + "learning_rate": 2.6571667631721446e-05, + "loss": 0.3672, "step": 561600 }, { - "epoch": 5.72, - "learning_rate": 4.28644832406509e-05, - "loss": 0.5843, + "epoch": 7.7388333195558126, + "grad_norm": 3.9366455078125, + "learning_rate": 2.6563656797876302e-05, + "loss": 0.4507, "step": 561700 }, { - "epoch": 5.72, - "learning_rate": 4.285849018769405e-05, - "loss": 0.5917, + "epoch": 7.740211071615552, + "grad_norm": 4.0575480461120605, + "learning_rate": 2.6555646043578296e-05, + "loss": 0.3838, "step": 561800 }, { - "epoch": 5.72, - "learning_rate": 4.285249650603375e-05, - "loss": 0.595, + "epoch": 7.741588823675292, + "grad_norm": 4.34018087387085, + "learning_rate": 2.6547635369507995e-05, + "loss": 0.4754, "step": 561900 }, { - "epoch": 5.73, - "learning_rate": 4.2846562142173464e-05, - "loss": 0.7103, + "epoch": 7.74296657573503, + "grad_norm": 0.7386291027069092, + "learning_rate": 2.6539624776345932e-05, + "loss": 0.3844, "step": 562000 }, { - "epoch": 5.73, - "learning_rate": 4.284056721026523e-05, - "loss": 0.6344, + "epoch": 7.74434432779477, + "grad_norm": 6.642742156982422, + "learning_rate": 2.6531614264772664e-05, + "loss": 0.4125, "step": 562100 }, { - "epoch": 5.73, - "learning_rate": 4.283457165052989e-05, - "loss": 0.6899, + "epoch": 7.7457220798545094, + "grad_norm": 6.9536566734313965, + "learning_rate": 2.652360383546872e-05, + "loss": 0.4391, "step": 562200 }, { - "epoch": 5.73, - "learning_rate": 4.282857546326058e-05, - "loss": 0.6086, + "epoch": 7.747099831914249, + "grad_norm": 10.2610445022583, + "learning_rate": 2.6515593489114627e-05, + "loss": 0.418, "step": 562300 }, { - "epoch": 5.73, - "learning_rate": 4.282257864875049e-05, - "loss": 0.4851, + "epoch": 7.748477583973988, + "grad_norm": 4.365941524505615, + "learning_rate": 2.65075832263909e-05, + "loss": 0.4079, "step": 562400 }, { - "epoch": 5.73, - "learning_rate": 4.281658120729283e-05, - "loss": 0.5682, + "epoch": 7.749855336033727, + "grad_norm": 20.58535385131836, + "learning_rate": 2.649957304797805e-05, + "loss": 0.4213, "step": 562500 }, { - "epoch": 5.73, - "learning_rate": 4.281058313918083e-05, - "loss": 0.6163, + "epoch": 7.751233088093467, + "grad_norm": 4.588970184326172, + "learning_rate": 2.6491562954556592e-05, + "loss": 0.4259, "step": 562600 }, { - "epoch": 5.73, - "learning_rate": 4.2804584444707764e-05, - "loss": 0.6508, + "epoch": 7.752610840153206, + "grad_norm": 4.208446979522705, + "learning_rate": 2.6483552946807022e-05, + "loss": 0.3596, "step": 562700 }, { - "epoch": 5.73, - "learning_rate": 4.279858512416693e-05, - "loss": 0.6689, + "epoch": 7.753988592212945, + "grad_norm": 52.8625602722168, + "learning_rate": 2.647554302540983e-05, + "loss": 0.395, "step": 562800 }, { - "epoch": 5.73, - "learning_rate": 4.2792585177851664e-05, - "loss": 0.6754, + "epoch": 7.755366344272685, + "grad_norm": 2.5685863494873047, + "learning_rate": 2.64675331910455e-05, + "loss": 0.4088, "step": 562900 }, { - "epoch": 5.74, - "learning_rate": 4.278658460605533e-05, - "loss": 0.5285, + "epoch": 7.756744096332424, + "grad_norm": 3.0562326908111572, + "learning_rate": 2.6459523444394497e-05, + "loss": 0.4425, "step": 563000 }, { - "epoch": 5.74, - "learning_rate": 4.278058340907131e-05, - "loss": 0.5393, + "epoch": 7.758121848392164, + "grad_norm": 7.920937538146973, + "learning_rate": 2.645151378613731e-05, + "loss": 0.4541, "step": 563100 }, { - "epoch": 5.74, - "learning_rate": 4.277458158719303e-05, - "loss": 0.6492, + "epoch": 7.759499600451902, + "grad_norm": 3.6052544116973877, + "learning_rate": 2.6443504216954387e-05, + "loss": 0.3607, "step": 563200 }, { - "epoch": 5.74, - "learning_rate": 4.276857914071395e-05, - "loss": 0.6301, + "epoch": 7.760877352511642, + "grad_norm": 1.7825300693511963, + "learning_rate": 2.6435494737526195e-05, + "loss": 0.3671, "step": 563300 }, { - "epoch": 5.74, - "learning_rate": 4.276257606992754e-05, - "loss": 0.5828, + "epoch": 7.7622551045713815, + "grad_norm": 7.826066970825195, + "learning_rate": 2.6427485348533166e-05, + "loss": 0.4822, "step": 563400 }, { - "epoch": 5.74, - "learning_rate": 4.275657237512734e-05, - "loss": 0.5016, + "epoch": 7.763632856631121, + "grad_norm": 1.5091921091079712, + "learning_rate": 2.6419476050655732e-05, + "loss": 0.434, "step": 563500 }, { - "epoch": 5.74, - "learning_rate": 4.275056805660686e-05, - "loss": 0.5719, + "epoch": 7.76501060869086, + "grad_norm": 1.339563012123108, + "learning_rate": 2.641146684457435e-05, + "loss": 0.4014, "step": 563600 }, { - "epoch": 5.74, - "learning_rate": 4.27445631146597e-05, - "loss": 0.6831, + "epoch": 7.766388360750599, + "grad_norm": 1.4326204061508179, + "learning_rate": 2.640345773096942e-05, + "loss": 0.4113, "step": 563700 }, { - "epoch": 5.74, - "learning_rate": 4.273855754957946e-05, - "loss": 0.5659, + "epoch": 7.767766112810339, + "grad_norm": 4.294501304626465, + "learning_rate": 2.6395448710521363e-05, + "loss": 0.4447, "step": 563800 }, { - "epoch": 5.75, - "learning_rate": 4.273255136165978e-05, - "loss": 0.6473, + "epoch": 7.769143864870078, + "grad_norm": 2.865030288696289, + "learning_rate": 2.6387439783910606e-05, + "loss": 0.4677, "step": 563900 }, { - "epoch": 5.75, - "learning_rate": 4.272654455119432e-05, - "loss": 0.667, + "epoch": 7.770521616929817, + "grad_norm": 1.9951144456863403, + "learning_rate": 2.6379430951817515e-05, + "loss": 0.4348, "step": 564000 }, { - "epoch": 5.75, - "learning_rate": 4.272053711847678e-05, - "loss": 0.7306, + "epoch": 7.771899368989557, + "grad_norm": 5.0759124755859375, + "learning_rate": 2.6371422214922504e-05, + "loss": 0.4425, "step": 564100 }, { - "epoch": 5.75, - "learning_rate": 4.2714529063800885e-05, - "loss": 0.582, + "epoch": 7.773277121049296, + "grad_norm": 4.322912216186523, + "learning_rate": 2.6363413573905945e-05, + "loss": 0.3546, "step": 564200 }, { - "epoch": 5.75, - "learning_rate": 4.2708520387460396e-05, - "loss": 0.6059, + "epoch": 7.774654873109036, + "grad_norm": 1.111991047859192, + "learning_rate": 2.6355405029448217e-05, + "loss": 0.4067, "step": 564300 }, { - "epoch": 5.75, - "learning_rate": 4.2702511089749094e-05, - "loss": 0.6016, + "epoch": 7.776032625168774, + "grad_norm": 34.97132873535156, + "learning_rate": 2.6347396582229703e-05, + "loss": 0.4227, "step": 564400 }, { - "epoch": 5.75, - "learning_rate": 4.269650117096082e-05, - "loss": 0.6232, + "epoch": 7.777410377228514, + "grad_norm": 5.639999866485596, + "learning_rate": 2.6339388232930728e-05, + "loss": 0.4211, "step": 564500 }, { - "epoch": 5.75, - "learning_rate": 4.269049063138939e-05, - "loss": 0.6248, + "epoch": 7.7787881292882535, + "grad_norm": 1.3731552362442017, + "learning_rate": 2.6331379982231666e-05, + "loss": 0.3848, "step": 564600 }, { - "epoch": 5.75, - "learning_rate": 4.268447947132871e-05, - "loss": 0.6167, + "epoch": 7.780165881347992, + "grad_norm": 2.370307445526123, + "learning_rate": 2.6323371830812856e-05, + "loss": 0.3922, "step": 564700 }, { - "epoch": 5.75, - "learning_rate": 4.267846769107268e-05, - "loss": 0.5898, + "epoch": 7.781543633407732, + "grad_norm": 2.8705644607543945, + "learning_rate": 2.631536377935462e-05, + "loss": 0.3729, "step": 564800 }, { - "epoch": 5.76, - "learning_rate": 4.267245529091524e-05, - "loss": 0.6361, + "epoch": 7.782921385467471, + "grad_norm": 2.0288171768188477, + "learning_rate": 2.6307355828537297e-05, + "loss": 0.3766, "step": 564900 }, { - "epoch": 5.76, - "learning_rate": 4.266644227115036e-05, - "loss": 0.5917, + "epoch": 7.784299137527211, + "grad_norm": 3.128899574279785, + "learning_rate": 2.6299347979041205e-05, + "loss": 0.4042, "step": 565000 }, { - "epoch": 5.76, - "learning_rate": 4.2660428632072044e-05, - "loss": 0.6089, + "epoch": 7.78567688958695, + "grad_norm": 0.427706778049469, + "learning_rate": 2.6291340231546637e-05, + "loss": 0.4259, "step": 565100 }, { - "epoch": 5.76, - "learning_rate": 4.2654414373974324e-05, - "loss": 0.6353, + "epoch": 7.787054641646689, + "grad_norm": 9.575339317321777, + "learning_rate": 2.6283332586733902e-05, + "loss": 0.4748, "step": 565200 }, { - "epoch": 5.76, - "learning_rate": 4.264839949715125e-05, - "loss": 0.5883, + "epoch": 7.788432393706429, + "grad_norm": 1.6544184684753418, + "learning_rate": 2.6275405120183912e-05, + "loss": 0.3711, "step": 565300 }, { - "epoch": 5.76, - "learning_rate": 4.2642384001896933e-05, - "loss": 0.5892, + "epoch": 7.789810145766168, + "grad_norm": 4.526662349700928, + "learning_rate": 2.626739768173191e-05, + "loss": 0.3737, "step": 565400 }, { - "epoch": 5.76, - "learning_rate": 4.263636788850548e-05, - "loss": 0.4948, + "epoch": 7.791187897825907, + "grad_norm": 2.490725517272949, + "learning_rate": 2.6259390347995786e-05, + "loss": 0.4355, "step": 565500 }, { - "epoch": 5.76, - "learning_rate": 4.263035115727105e-05, - "loss": 0.5148, + "epoch": 7.792565649885646, + "grad_norm": 11.295348167419434, + "learning_rate": 2.6251383119655785e-05, + "loss": 0.3842, "step": 565600 }, { - "epoch": 5.76, - "learning_rate": 4.262433380848781e-05, - "loss": 0.5126, + "epoch": 7.793943401945386, + "grad_norm": 14.879592895507812, + "learning_rate": 2.6243375997392183e-05, + "loss": 0.4518, "step": 565700 }, { - "epoch": 5.76, - "learning_rate": 4.261831584245e-05, - "loss": 0.6142, + "epoch": 7.7953211540051255, + "grad_norm": 3.0076754093170166, + "learning_rate": 2.623536898188522e-05, + "loss": 0.4581, "step": 565800 }, { - "epoch": 5.77, - "learning_rate": 4.261229725945185e-05, - "loss": 0.5729, + "epoch": 7.796698906064865, + "grad_norm": 2.590406656265259, + "learning_rate": 2.6227362073815132e-05, + "loss": 0.4213, "step": 565900 }, { - "epoch": 5.77, - "learning_rate": 4.260627805978762e-05, - "loss": 0.5198, + "epoch": 7.798076658124604, + "grad_norm": 0.42780813574790955, + "learning_rate": 2.621935527386214e-05, + "loss": 0.3608, "step": 566000 }, { - "epoch": 5.77, - "learning_rate": 4.2600258243751624e-05, - "loss": 0.5868, + "epoch": 7.799454410184343, + "grad_norm": 14.491691589355469, + "learning_rate": 2.6211348582706486e-05, + "loss": 0.409, "step": 566100 }, { - "epoch": 5.77, - "learning_rate": 4.259423781163819e-05, - "loss": 0.6401, + "epoch": 7.800832162244083, + "grad_norm": 5.128462791442871, + "learning_rate": 2.6203342001028344e-05, + "loss": 0.4283, "step": 566200 }, { - "epoch": 5.77, - "learning_rate": 4.258821676374169e-05, - "loss": 0.5718, + "epoch": 7.8022099143038215, + "grad_norm": 5.963847637176514, + "learning_rate": 2.619533552950794e-05, + "loss": 0.3778, "step": 566300 }, { - "epoch": 5.77, - "learning_rate": 4.258219510035651e-05, - "loss": 0.6175, + "epoch": 7.803587666363561, + "grad_norm": 4.169637203216553, + "learning_rate": 2.618732916882546e-05, + "loss": 0.3817, "step": 566400 }, { - "epoch": 5.77, - "learning_rate": 4.2576172821777085e-05, - "loss": 0.5458, + "epoch": 7.804965418423301, + "grad_norm": 2.175851345062256, + "learning_rate": 2.617932291966107e-05, + "loss": 0.3646, "step": 566500 }, { - "epoch": 5.77, - "learning_rate": 4.257014992829785e-05, - "loss": 0.5564, + "epoch": 7.80634317048304, + "grad_norm": 1.4738775491714478, + "learning_rate": 2.617131678269498e-05, + "loss": 0.4078, "step": 566600 }, { - "epoch": 5.77, - "learning_rate": 4.256412642021331e-05, - "loss": 0.6289, + "epoch": 7.807720922542779, + "grad_norm": 2.8327794075012207, + "learning_rate": 2.6163310758607303e-05, + "loss": 0.4243, "step": 566700 }, { - "epoch": 5.77, - "learning_rate": 4.255810229781796e-05, - "loss": 0.5675, + "epoch": 7.809098674602518, + "grad_norm": 1.4749609231948853, + "learning_rate": 2.6155304848078224e-05, + "loss": 0.3565, "step": 566800 }, { - "epoch": 5.78, - "learning_rate": 4.255207756140635e-05, - "loss": 0.5231, + "epoch": 7.810476426662258, + "grad_norm": 2.7822959423065186, + "learning_rate": 2.6147299051787876e-05, + "loss": 0.4418, "step": 566900 }, { - "epoch": 5.78, - "learning_rate": 4.254605221127305e-05, - "loss": 0.4503, + "epoch": 7.8118541787219975, + "grad_norm": 6.20667839050293, + "learning_rate": 2.613929337041639e-05, + "loss": 0.379, "step": 567000 }, { - "epoch": 5.78, - "learning_rate": 4.254002624771267e-05, - "loss": 0.5503, + "epoch": 7.813231930781736, + "grad_norm": 2.1793887615203857, + "learning_rate": 2.6131287804643898e-05, + "loss": 0.4167, "step": 567100 }, { - "epoch": 5.78, - "learning_rate": 4.253399967101984e-05, - "loss": 0.6196, + "epoch": 7.814609682841476, + "grad_norm": 8.690431594848633, + "learning_rate": 2.612336240906764e-05, + "loss": 0.397, "step": 567200 }, { - "epoch": 5.78, - "learning_rate": 4.252797248148924e-05, - "loss": 0.6626, + "epoch": 7.815987434901215, + "grad_norm": 4.355604648590088, + "learning_rate": 2.6115357075360494e-05, + "loss": 0.4604, "step": 567300 }, { - "epoch": 5.78, - "learning_rate": 4.252194467941553e-05, - "loss": 0.4948, + "epoch": 7.817365186960955, + "grad_norm": 15.770100593566895, + "learning_rate": 2.6107351859285854e-05, + "loss": 0.3565, "step": 567400 }, { - "epoch": 5.78, - "learning_rate": 4.251591626509346e-05, - "loss": 0.6289, + "epoch": 7.8187429390206935, + "grad_norm": 5.782956123352051, + "learning_rate": 2.60993467615238e-05, + "loss": 0.4724, "step": 567500 }, { - "epoch": 5.78, - "learning_rate": 4.250988723881778e-05, - "loss": 0.5575, + "epoch": 7.820120691080433, + "grad_norm": 3.9906046390533447, + "learning_rate": 2.6091341782754396e-05, + "loss": 0.4499, "step": 567600 }, { - "epoch": 5.78, - "learning_rate": 4.2503857600883256e-05, - "loss": 0.6365, + "epoch": 7.821498443140173, + "grad_norm": 3.61303448677063, + "learning_rate": 2.6083336923657736e-05, + "loss": 0.3671, "step": 567700 }, { - "epoch": 5.78, - "learning_rate": 4.249782735158473e-05, - "loss": 0.6386, + "epoch": 7.822876195199912, + "grad_norm": 2.9689767360687256, + "learning_rate": 2.6075332184913833e-05, + "loss": 0.4194, "step": 567800 }, { - "epoch": 5.79, - "learning_rate": 4.249179649121702e-05, - "loss": 0.5913, + "epoch": 7.824253947259651, + "grad_norm": 5.303683757781982, + "learning_rate": 2.6067327567202753e-05, + "loss": 0.4382, "step": 567900 }, { - "epoch": 5.79, - "learning_rate": 4.2485765020075016e-05, - "loss": 0.6071, + "epoch": 7.82563169931939, + "grad_norm": 6.710387706756592, + "learning_rate": 2.6059323071204526e-05, + "loss": 0.4151, "step": 568000 }, { - "epoch": 5.79, - "learning_rate": 4.2479732938453606e-05, - "loss": 0.6043, + "epoch": 7.82700945137913, + "grad_norm": 3.0356507301330566, + "learning_rate": 2.605131869759917e-05, + "loss": 0.3545, "step": 568100 }, { - "epoch": 5.79, - "learning_rate": 4.2473700246647734e-05, - "loss": 0.5723, + "epoch": 7.8283872034388695, + "grad_norm": 3.9841818809509277, + "learning_rate": 2.6043314447066707e-05, + "loss": 0.3851, "step": 568200 }, { - "epoch": 5.79, - "learning_rate": 4.2467666944952355e-05, - "loss": 0.5994, + "epoch": 7.829764955498608, + "grad_norm": 6.8464179039001465, + "learning_rate": 2.603531032028714e-05, + "loss": 0.3606, "step": 568300 }, { - "epoch": 5.79, - "learning_rate": 4.2461693375791894e-05, - "loss": 0.6128, + "epoch": 7.831142707558348, + "grad_norm": 4.464069843292236, + "learning_rate": 2.6027306317940448e-05, + "loss": 0.4169, "step": 568400 }, { - "epoch": 5.79, - "learning_rate": 4.2455658861294054e-05, - "loss": 0.5928, + "epoch": 7.832520459618087, + "grad_norm": 0.2582082450389862, + "learning_rate": 2.601930244070662e-05, + "loss": 0.3763, "step": 568500 }, { - "epoch": 5.79, - "learning_rate": 4.244962373778883e-05, - "loss": 0.6408, + "epoch": 7.833898211677827, + "grad_norm": 5.311912536621094, + "learning_rate": 2.601129868926562e-05, + "loss": 0.4158, "step": 568600 }, { - "epoch": 5.79, - "learning_rate": 4.24435880055713e-05, - "loss": 0.6415, + "epoch": 7.8352759637375655, + "grad_norm": 2.2017080783843994, + "learning_rate": 2.600329506429742e-05, + "loss": 0.426, "step": 568700 }, { - "epoch": 5.8, - "learning_rate": 4.243755166493657e-05, - "loss": 0.5418, + "epoch": 7.836653715797305, + "grad_norm": 1.7846885919570923, + "learning_rate": 2.599529156648196e-05, + "loss": 0.4042, "step": 568800 }, { - "epoch": 5.8, - "learning_rate": 4.243151471617979e-05, - "loss": 0.5966, + "epoch": 7.838031467857045, + "grad_norm": 2.7411811351776123, + "learning_rate": 2.598728819649918e-05, + "loss": 0.3834, "step": 568900 }, { - "epoch": 5.8, - "learning_rate": 4.2425477159596124e-05, - "loss": 0.6199, + "epoch": 7.839409219916783, + "grad_norm": 4.691423416137695, + "learning_rate": 2.5979284955029004e-05, + "loss": 0.4371, "step": 569000 }, { - "epoch": 5.8, - "learning_rate": 4.2419438995480784e-05, - "loss": 0.4995, + "epoch": 7.840786971976523, + "grad_norm": 1.355125904083252, + "learning_rate": 2.5971281842751346e-05, + "loss": 0.4117, "step": 569100 }, { - "epoch": 5.8, - "learning_rate": 4.2413400224129e-05, - "loss": 0.7143, + "epoch": 7.842164724036262, + "grad_norm": 4.330750942230225, + "learning_rate": 2.5963278860346122e-05, + "loss": 0.3715, "step": 569200 }, { - "epoch": 5.8, - "learning_rate": 4.2407360845836e-05, - "loss": 0.6713, + "epoch": 7.843542476096002, + "grad_norm": 1.6993210315704346, + "learning_rate": 2.5955276008493223e-05, + "loss": 0.3687, "step": 569300 }, { - "epoch": 5.8, - "learning_rate": 4.240132086089712e-05, - "loss": 0.5714, + "epoch": 7.8449202281557415, + "grad_norm": 5.218278884887695, + "learning_rate": 2.594727328787253e-05, + "loss": 0.445, "step": 569400 }, { - "epoch": 5.8, - "learning_rate": 4.239528026960765e-05, - "loss": 0.5984, + "epoch": 7.84629798021548, + "grad_norm": 7.775432586669922, + "learning_rate": 2.5939270699163914e-05, + "loss": 0.2981, "step": 569500 }, { - "epoch": 5.8, - "learning_rate": 4.238923907226294e-05, - "loss": 0.5193, + "epoch": 7.84767573227522, + "grad_norm": 9.965502738952637, + "learning_rate": 2.5931268243047227e-05, + "loss": 0.4132, "step": 569600 }, { - "epoch": 5.8, - "learning_rate": 4.238319726915838e-05, - "loss": 0.6532, + "epoch": 7.849053484334959, + "grad_norm": 2.4802777767181396, + "learning_rate": 2.5923265920202337e-05, + "loss": 0.3762, "step": 569700 }, { - "epoch": 5.81, - "learning_rate": 4.237715486058937e-05, - "loss": 0.6755, + "epoch": 7.850431236394698, + "grad_norm": 1.8184071779251099, + "learning_rate": 2.591526373130907e-05, + "loss": 0.4546, "step": 569800 }, { - "epoch": 5.81, - "learning_rate": 4.237111184685136e-05, - "loss": 0.6455, + "epoch": 7.8518089884544375, + "grad_norm": 5.638575077056885, + "learning_rate": 2.5907261677047264e-05, + "loss": 0.4105, "step": 569900 }, { - "epoch": 5.81, - "learning_rate": 4.23650682282398e-05, - "loss": 0.65, + "epoch": 7.853186740514177, + "grad_norm": 2.285266399383545, + "learning_rate": 2.5899259758096726e-05, + "loss": 0.3404, "step": 570000 }, { - "epoch": 5.81, - "learning_rate": 4.235902400505019e-05, - "loss": 0.5548, + "epoch": 7.854564492573917, + "grad_norm": 2.8196353912353516, + "learning_rate": 2.589125797513725e-05, + "loss": 0.4075, "step": 570100 }, { - "epoch": 5.81, - "learning_rate": 4.235297917757807e-05, - "loss": 0.5646, + "epoch": 7.855942244633656, + "grad_norm": 2.015794515609741, + "learning_rate": 2.588325632884865e-05, + "loss": 0.409, "step": 570200 }, { - "epoch": 5.81, - "learning_rate": 4.234693374611897e-05, - "loss": 0.5424, + "epoch": 7.857319996693395, + "grad_norm": 2.6167922019958496, + "learning_rate": 2.587525481991069e-05, + "loss": 0.4386, "step": 570300 }, { - "epoch": 5.81, - "learning_rate": 4.234088771096851e-05, - "loss": 0.6472, + "epoch": 7.858697748753134, + "grad_norm": 2.875387668609619, + "learning_rate": 2.5867253449003146e-05, + "loss": 0.413, "step": 570400 }, { - "epoch": 5.81, - "learning_rate": 4.2334841072422285e-05, - "loss": 0.6008, + "epoch": 7.860075500812874, + "grad_norm": 2.809980869293213, + "learning_rate": 2.5859252216805784e-05, + "loss": 0.4364, "step": 570500 }, { - "epoch": 5.81, - "learning_rate": 4.232879383077594e-05, - "loss": 0.6287, + "epoch": 7.861453252872613, + "grad_norm": 3.130037546157837, + "learning_rate": 2.585125112399833e-05, + "loss": 0.4008, "step": 570600 }, { - "epoch": 5.81, - "learning_rate": 4.2322745986325154e-05, - "loss": 0.6418, + "epoch": 7.862831004932352, + "grad_norm": 4.842129707336426, + "learning_rate": 2.5843250171260528e-05, + "loss": 0.441, "step": 570700 }, { - "epoch": 5.82, - "learning_rate": 4.2316697539365616e-05, - "loss": 0.6768, + "epoch": 7.864208756992092, + "grad_norm": 2.6613197326660156, + "learning_rate": 2.58352493592721e-05, + "loss": 0.3766, "step": 570800 }, { - "epoch": 5.82, - "learning_rate": 4.2310648490193083e-05, - "loss": 0.5594, + "epoch": 7.865586509051831, + "grad_norm": 3.213390827178955, + "learning_rate": 2.582724868871276e-05, + "loss": 0.3882, "step": 570900 }, { - "epoch": 5.82, - "learning_rate": 4.23045988391033e-05, - "loss": 0.596, + "epoch": 7.86696426111157, + "grad_norm": 5.0584516525268555, + "learning_rate": 2.581924816026219e-05, + "loss": 0.4301, "step": 571000 }, { - "epoch": 5.82, - "learning_rate": 4.2298548586392066e-05, - "loss": 0.6538, + "epoch": 7.8683420131713095, + "grad_norm": 3.562164783477783, + "learning_rate": 2.5811247774600107e-05, + "loss": 0.4083, "step": 571100 }, { - "epoch": 5.82, - "learning_rate": 4.22924977323552e-05, - "loss": 0.546, + "epoch": 7.869719765231049, + "grad_norm": 2.4599568843841553, + "learning_rate": 2.5803247532406156e-05, + "loss": 0.3868, "step": 571200 }, { - "epoch": 5.82, - "learning_rate": 4.228644627728855e-05, - "loss": 0.6101, + "epoch": 7.871097517290789, + "grad_norm": 2.2250401973724365, + "learning_rate": 2.5795247434360005e-05, + "loss": 0.3955, "step": 571300 }, { - "epoch": 5.82, - "learning_rate": 4.2280394221487995e-05, - "loss": 0.5492, + "epoch": 7.872475269350527, + "grad_norm": 2.9267094135284424, + "learning_rate": 2.5787327479954364e-05, + "loss": 0.438, "step": 571400 }, { - "epoch": 5.82, - "learning_rate": 4.2274341565249455e-05, - "loss": 0.6174, + "epoch": 7.873853021410267, + "grad_norm": 0.6323767900466919, + "learning_rate": 2.577932767078432e-05, + "loss": 0.4212, "step": 571500 }, { - "epoch": 5.82, - "learning_rate": 4.226828830886886e-05, - "loss": 0.5446, + "epoch": 7.875230773470006, + "grad_norm": 4.856153964996338, + "learning_rate": 2.5771328007794202e-05, + "loss": 0.3295, "step": 571600 }, { - "epoch": 5.82, - "learning_rate": 4.226223445264218e-05, - "loss": 0.593, + "epoch": 7.876608525529746, + "grad_norm": 1.9507824182510376, + "learning_rate": 2.5763328491663602e-05, + "loss": 0.4042, "step": 571700 }, { - "epoch": 5.83, - "learning_rate": 4.225617999686541e-05, - "loss": 0.4871, + "epoch": 7.877986277589485, + "grad_norm": 2.466931104660034, + "learning_rate": 2.5755329123072132e-05, + "loss": 0.4183, "step": 571800 }, { - "epoch": 5.83, - "learning_rate": 4.225012494183459e-05, - "loss": 0.6361, + "epoch": 7.879364029649224, + "grad_norm": 13.116840362548828, + "learning_rate": 2.574732990269938e-05, + "loss": 0.4507, "step": 571900 }, { - "epoch": 5.83, - "learning_rate": 4.224406928784575e-05, - "loss": 0.6074, + "epoch": 7.880741781708964, + "grad_norm": 4.8487868309021, + "learning_rate": 2.5739330831224922e-05, + "loss": 0.4412, "step": 572000 }, { - "epoch": 5.83, - "learning_rate": 4.2238013035195e-05, - "loss": 0.576, + "epoch": 7.882119533768703, + "grad_norm": 2.353994607925415, + "learning_rate": 2.5731411897804643e-05, + "loss": 0.4294, "step": 572100 }, { - "epoch": 5.83, - "learning_rate": 4.2231956184178435e-05, - "loss": 0.5063, + "epoch": 7.883497285828442, + "grad_norm": 34.3173713684082, + "learning_rate": 2.5723413124659503e-05, + "loss": 0.4385, "step": 572200 }, { - "epoch": 5.83, - "learning_rate": 4.2225898735092223e-05, - "loss": 0.496, + "epoch": 7.884875037888182, + "grad_norm": 2.1839373111724854, + "learning_rate": 2.5715414502444507e-05, + "loss": 0.4574, "step": 572300 }, { - "epoch": 5.83, - "learning_rate": 4.221984068823252e-05, - "loss": 0.5012, + "epoch": 7.886252789947921, + "grad_norm": 2.063659906387329, + "learning_rate": 2.570741603183917e-05, + "loss": 0.3899, "step": 572400 }, { - "epoch": 5.83, - "learning_rate": 4.221378204389552e-05, - "loss": 0.5535, + "epoch": 7.887630542007661, + "grad_norm": 3.1924850940704346, + "learning_rate": 2.569941771352302e-05, + "loss": 0.4293, "step": 572500 }, { - "epoch": 5.83, - "learning_rate": 4.220778339774773e-05, - "loss": 0.5549, + "epoch": 7.889008294067399, + "grad_norm": 3.378161907196045, + "learning_rate": 2.5691419548175547e-05, + "loss": 0.3575, "step": 572600 }, { - "epoch": 5.83, - "learning_rate": 4.220172356531228e-05, - "loss": 0.4943, + "epoch": 7.890386046127139, + "grad_norm": 2.4634101390838623, + "learning_rate": 2.5683421536476245e-05, + "loss": 0.4625, "step": 572700 }, { - "epoch": 5.84, - "learning_rate": 4.219566313628535e-05, - "loss": 0.5722, + "epoch": 7.8917637981868785, + "grad_norm": 7.387239456176758, + "learning_rate": 2.5675423679104567e-05, + "loss": 0.4136, "step": 572800 }, { - "epoch": 5.84, - "learning_rate": 4.218960211096328e-05, - "loss": 0.5007, + "epoch": 7.893141550246618, + "grad_norm": 2.0837934017181396, + "learning_rate": 2.5667425976739976e-05, + "loss": 0.4187, "step": 572900 }, { - "epoch": 5.84, - "learning_rate": 4.218354048964241e-05, - "loss": 0.5579, + "epoch": 7.894519302306357, + "grad_norm": 4.208126068115234, + "learning_rate": 2.565942843006193e-05, + "loss": 0.359, "step": 573000 }, { - "epoch": 5.84, - "learning_rate": 4.217747827261911e-05, - "loss": 0.51, + "epoch": 7.895897054366096, + "grad_norm": 1.845621109008789, + "learning_rate": 2.5651431039749858e-05, + "loss": 0.3958, "step": 573100 }, { - "epoch": 5.84, - "learning_rate": 4.217141546018981e-05, - "loss": 0.5708, + "epoch": 7.897274806425836, + "grad_norm": 17.89778709411621, + "learning_rate": 2.564343380648318e-05, + "loss": 0.4281, "step": 573200 }, { - "epoch": 5.84, - "learning_rate": 4.216535205265092e-05, - "loss": 0.5793, + "epoch": 7.8986525584855745, + "grad_norm": 256.0893249511719, + "learning_rate": 2.5635436730941292e-05, + "loss": 0.4628, "step": 573300 }, { - "epoch": 5.84, - "learning_rate": 4.2159288050298905e-05, - "loss": 0.5643, + "epoch": 7.900030310545314, + "grad_norm": 5.519166469573975, + "learning_rate": 2.562743981380359e-05, + "loss": 0.4253, "step": 573400 }, { - "epoch": 5.84, - "learning_rate": 4.2153223453430273e-05, - "loss": 0.5648, + "epoch": 7.901408062605054, + "grad_norm": 3.711449146270752, + "learning_rate": 2.5619443055749457e-05, + "loss": 0.3716, "step": 573500 }, { - "epoch": 5.84, - "learning_rate": 4.2147218917192853e-05, - "loss": 0.6121, + "epoch": 7.902785814664793, + "grad_norm": 6.972109794616699, + "learning_rate": 2.561144645745826e-05, + "loss": 0.4532, "step": 573600 }, { - "epoch": 5.84, - "learning_rate": 4.214115313811835e-05, - "loss": 0.632, + "epoch": 7.904163566724533, + "grad_norm": 8.208768844604492, + "learning_rate": 2.5603450019609344e-05, + "loss": 0.4242, "step": 573700 }, { - "epoch": 5.85, - "learning_rate": 4.21350867654139e-05, - "loss": 0.5347, + "epoch": 7.905541318784271, + "grad_norm": 1.4055252075195312, + "learning_rate": 2.5595453742882065e-05, + "loss": 0.4448, "step": 573800 }, { - "epoch": 5.85, - "learning_rate": 4.212901979937613e-05, - "loss": 0.614, + "epoch": 7.906919070844011, + "grad_norm": 2.9897499084472656, + "learning_rate": 2.5587457627955717e-05, + "loss": 0.446, "step": 573900 }, { - "epoch": 5.85, - "learning_rate": 4.212295224030168e-05, - "loss": 0.5957, + "epoch": 7.9082968229037505, + "grad_norm": 9.12283992767334, + "learning_rate": 2.5579461675509634e-05, + "loss": 0.4791, "step": 574000 }, { - "epoch": 5.85, - "learning_rate": 4.211688408848721e-05, - "loss": 0.5692, + "epoch": 7.909674574963489, + "grad_norm": 1.9095302820205688, + "learning_rate": 2.5571465886223107e-05, + "loss": 0.3901, "step": 574100 }, { - "epoch": 5.85, - "learning_rate": 4.2110815344229403e-05, - "loss": 0.5393, + "epoch": 7.911052327023229, + "grad_norm": 9.948441505432129, + "learning_rate": 2.556347026077542e-05, + "loss": 0.4153, "step": 574200 }, { - "epoch": 5.85, - "learning_rate": 4.210474600782501e-05, - "loss": 0.5866, + "epoch": 7.912430079082968, + "grad_norm": 2.4185192584991455, + "learning_rate": 2.5555474799845838e-05, + "loss": 0.3412, "step": 574300 }, { - "epoch": 5.85, - "learning_rate": 4.209867607957078e-05, - "loss": 0.5357, + "epoch": 7.913807831142708, + "grad_norm": 17.072826385498047, + "learning_rate": 2.5547479504113632e-05, + "loss": 0.4045, "step": 574400 }, { - "epoch": 5.85, - "learning_rate": 4.209260555976349e-05, - "loss": 0.5652, + "epoch": 7.915185583202447, + "grad_norm": 3.982893466949463, + "learning_rate": 2.553948437425802e-05, + "loss": 0.3965, "step": 574500 }, { - "epoch": 5.85, - "learning_rate": 4.2086534448699946e-05, - "loss": 0.5762, + "epoch": 7.916563335262186, + "grad_norm": 2.8802690505981445, + "learning_rate": 2.5531489410958242e-05, + "loss": 0.4122, "step": 574600 }, { - "epoch": 5.86, - "learning_rate": 4.2080462746677e-05, - "loss": 0.5948, + "epoch": 7.917941087321926, + "grad_norm": 10.4449462890625, + "learning_rate": 2.5523494614893513e-05, + "loss": 0.3842, "step": 574700 }, { - "epoch": 5.86, - "learning_rate": 4.207439045399152e-05, - "loss": 0.5051, + "epoch": 7.919318839381665, + "grad_norm": 0.7747752666473389, + "learning_rate": 2.551549998674302e-05, + "loss": 0.4521, "step": 574800 }, { - "epoch": 5.86, - "learning_rate": 4.2068317570940395e-05, - "loss": 0.558, + "epoch": 7.920696591441404, + "grad_norm": 19.949691772460938, + "learning_rate": 2.5507505527185974e-05, + "loss": 0.4399, "step": 574900 }, { - "epoch": 5.86, - "learning_rate": 4.206224409782057e-05, - "loss": 0.5288, + "epoch": 7.922074343501143, + "grad_norm": 4.304880142211914, + "learning_rate": 2.5499511236901507e-05, + "loss": 0.408, "step": 575000 }, { - "epoch": 5.86, - "learning_rate": 4.205617003492898e-05, - "loss": 0.5618, + "epoch": 7.923452095560883, + "grad_norm": 2.953263759613037, + "learning_rate": 2.5491517116568804e-05, + "loss": 0.4221, "step": 575100 }, { - "epoch": 5.86, - "learning_rate": 4.205009538256264e-05, - "loss": 0.4772, + "epoch": 7.9248298476206225, + "grad_norm": 0.3570718765258789, + "learning_rate": 2.5483523166866995e-05, + "loss": 0.415, "step": 575200 }, { - "epoch": 5.86, - "learning_rate": 4.204402014101854e-05, - "loss": 0.547, + "epoch": 7.926207599680361, + "grad_norm": 2.9194681644439697, + "learning_rate": 2.5475529388475205e-05, + "loss": 0.4056, "step": 575300 }, { - "epoch": 5.86, - "learning_rate": 4.203794431059373e-05, - "loss": 0.5448, + "epoch": 7.927585351740101, + "grad_norm": 2.4002156257629395, + "learning_rate": 2.5467535782072556e-05, + "loss": 0.3716, "step": 575400 }, { - "epoch": 5.86, - "learning_rate": 4.203186789158527e-05, - "loss": 0.5888, + "epoch": 7.92896310379984, + "grad_norm": 4.822518348693848, + "learning_rate": 2.5459542348338146e-05, + "loss": 0.3922, "step": 575500 }, { - "epoch": 5.86, - "learning_rate": 4.202579088429029e-05, - "loss": 0.6193, + "epoch": 7.93034085585958, + "grad_norm": 1.4808429479599, + "learning_rate": 2.5451549087951045e-05, + "loss": 0.393, "step": 575600 }, { - "epoch": 5.87, - "learning_rate": 4.201971328900589e-05, - "loss": 0.5517, + "epoch": 7.9317186079193185, + "grad_norm": 2.558567523956299, + "learning_rate": 2.5443556001590333e-05, + "loss": 0.4077, "step": 575700 }, { - "epoch": 5.87, - "learning_rate": 4.201363510602925e-05, - "loss": 0.601, + "epoch": 7.933096359979058, + "grad_norm": 5.497522354125977, + "learning_rate": 2.5435563089935048e-05, + "loss": 0.3509, "step": 575800 }, { - "epoch": 5.87, - "learning_rate": 4.200755633565754e-05, - "loss": 0.58, + "epoch": 7.934474112038798, + "grad_norm": 7.092094421386719, + "learning_rate": 2.5427570353664246e-05, + "loss": 0.3809, "step": 575900 }, { - "epoch": 5.87, - "learning_rate": 4.2001476978187985e-05, - "loss": 0.6291, + "epoch": 7.935851864098537, + "grad_norm": 2.1565871238708496, + "learning_rate": 2.5419577793456958e-05, + "loss": 0.4432, "step": 576000 }, { - "epoch": 5.87, - "learning_rate": 4.199539703391782e-05, - "loss": 0.6389, + "epoch": 7.937229616158276, + "grad_norm": 4.346516132354736, + "learning_rate": 2.5411585409992168e-05, + "loss": 0.3974, "step": 576100 }, { - "epoch": 5.87, - "learning_rate": 4.198931650314433e-05, - "loss": 0.5835, + "epoch": 7.938607368218015, + "grad_norm": 7.287683486938477, + "learning_rate": 2.5403593203948884e-05, + "loss": 0.3503, "step": 576200 }, { - "epoch": 5.87, - "learning_rate": 4.198323538616482e-05, - "loss": 0.6335, + "epoch": 7.939985120277755, + "grad_norm": 2.822115659713745, + "learning_rate": 2.5395601176006075e-05, + "loss": 0.4218, "step": 576300 }, { - "epoch": 5.87, - "learning_rate": 4.19771536832766e-05, - "loss": 0.5073, + "epoch": 7.9413628723374945, + "grad_norm": 1.4879916906356812, + "learning_rate": 2.5387609326842718e-05, + "loss": 0.3648, "step": 576400 }, { - "epoch": 5.87, - "learning_rate": 4.197107139477705e-05, - "loss": 0.6009, + "epoch": 7.942740624397233, + "grad_norm": 2.8131580352783203, + "learning_rate": 2.537961765713776e-05, + "loss": 0.3476, "step": 576500 }, { - "epoch": 5.87, - "learning_rate": 4.196498852096354e-05, - "loss": 0.4934, + "epoch": 7.944118376456973, + "grad_norm": 5.575896739959717, + "learning_rate": 2.5371626167570128e-05, + "loss": 0.4035, "step": 576600 }, { - "epoch": 5.88, - "learning_rate": 4.19589050621335e-05, - "loss": 0.6401, + "epoch": 7.945496128516712, + "grad_norm": 1.5800246000289917, + "learning_rate": 2.536363485881874e-05, + "loss": 0.3972, "step": 576700 }, { - "epoch": 5.88, - "learning_rate": 4.1952821018584364e-05, - "loss": 0.5703, + "epoch": 7.946873880576452, + "grad_norm": 3.6117801666259766, + "learning_rate": 2.5355643731562495e-05, + "loss": 0.4865, "step": 576800 }, { - "epoch": 5.88, - "learning_rate": 4.1946736390613616e-05, - "loss": 0.538, + "epoch": 7.9482516326361905, + "grad_norm": 2.568058967590332, + "learning_rate": 2.5347652786480293e-05, + "loss": 0.3146, "step": 576900 }, { - "epoch": 5.88, - "learning_rate": 4.194065117851875e-05, - "loss": 0.6146, + "epoch": 7.94962938469593, + "grad_norm": 19.48751449584961, + "learning_rate": 2.5339662024250994e-05, + "loss": 0.4294, "step": 577000 }, { - "epoch": 5.88, - "learning_rate": 4.19345653825973e-05, - "loss": 0.5697, + "epoch": 7.95100713675567, + "grad_norm": 2.744723320007324, + "learning_rate": 2.5331671445553463e-05, + "loss": 0.4209, "step": 577100 }, { - "epoch": 5.88, - "learning_rate": 4.1928479003146835e-05, - "loss": 0.588, + "epoch": 7.952384888815409, + "grad_norm": 3.562403440475464, + "learning_rate": 2.5323681051066536e-05, + "loss": 0.4274, "step": 577200 }, { - "epoch": 5.88, - "learning_rate": 4.192239204046492e-05, - "loss": 0.5262, + "epoch": 7.953762640875148, + "grad_norm": 3.140655279159546, + "learning_rate": 2.531569084146903e-05, + "loss": 0.3748, "step": 577300 }, { - "epoch": 5.88, - "learning_rate": 4.1916304494849185e-05, - "loss": 0.5905, + "epoch": 7.955140392934887, + "grad_norm": 2.7843120098114014, + "learning_rate": 2.5307700817439762e-05, + "loss": 0.4226, "step": 577400 }, { - "epoch": 5.88, - "learning_rate": 4.191021636659728e-05, - "loss": 0.5116, + "epoch": 7.956518144994627, + "grad_norm": 3.3314783573150635, + "learning_rate": 2.529971097965753e-05, + "loss": 0.4802, "step": 577500 }, { - "epoch": 5.88, - "learning_rate": 4.1904127656006856e-05, - "loss": 0.5436, + "epoch": 7.957895897054366, + "grad_norm": 3.069687604904175, + "learning_rate": 2.529180122438216e-05, + "loss": 0.4703, "step": 577600 }, { - "epoch": 5.89, - "learning_rate": 4.1898099259182075e-05, - "loss": 0.5829, + "epoch": 7.959273649114105, + "grad_norm": 9.834455490112305, + "learning_rate": 2.5283811759250905e-05, + "loss": 0.3704, "step": 577700 }, { - "epoch": 5.89, - "learning_rate": 4.189200939062374e-05, - "loss": 0.5838, + "epoch": 7.960651401173845, + "grad_norm": 3.5520596504211426, + "learning_rate": 2.5275822482396176e-05, + "loss": 0.3881, "step": 577800 }, { - "epoch": 5.89, - "learning_rate": 4.1885918940617126e-05, - "loss": 0.6344, + "epoch": 7.962029153233584, + "grad_norm": 3.569101572036743, + "learning_rate": 2.52678333944967e-05, + "loss": 0.3665, "step": 577900 }, { - "epoch": 5.89, - "learning_rate": 4.187982790945999e-05, - "loss": 0.6218, + "epoch": 7.963406905293324, + "grad_norm": 1.5755479335784912, + "learning_rate": 2.52598444962312e-05, + "loss": 0.411, "step": 578000 }, { - "epoch": 5.89, - "learning_rate": 4.187373629745017e-05, - "loss": 0.6308, + "epoch": 7.9647846573530625, + "grad_norm": 10.073369026184082, + "learning_rate": 2.5251855788278373e-05, + "loss": 0.4834, "step": 578100 }, { - "epoch": 5.89, - "learning_rate": 4.186764410488551e-05, - "loss": 0.5946, + "epoch": 7.966162409412802, + "grad_norm": 1.798141360282898, + "learning_rate": 2.5243867271316905e-05, + "loss": 0.3937, "step": 578200 }, { - "epoch": 5.89, - "learning_rate": 4.186155133206386e-05, - "loss": 0.652, + "epoch": 7.967540161472542, + "grad_norm": 7.872628688812256, + "learning_rate": 2.523587894602545e-05, + "loss": 0.4234, "step": 578300 }, { - "epoch": 5.89, - "learning_rate": 4.185545797928316e-05, - "loss": 0.6009, + "epoch": 7.96891791353228, + "grad_norm": 41.05575180053711, + "learning_rate": 2.522789081308265e-05, + "loss": 0.4066, "step": 578400 }, { - "epoch": 5.89, - "learning_rate": 4.18493640468413e-05, - "loss": 0.5138, + "epoch": 7.97029566559202, + "grad_norm": 1.5082037448883057, + "learning_rate": 2.521990287316716e-05, + "loss": 0.3792, "step": 578500 }, { - "epoch": 5.89, - "learning_rate": 4.184326953503625e-05, - "loss": 0.6374, + "epoch": 7.971673417651759, + "grad_norm": 4.020823001861572, + "learning_rate": 2.5211915126957587e-05, + "loss": 0.4582, "step": 578600 }, { - "epoch": 5.9, - "learning_rate": 4.1837174444165994e-05, - "loss": 0.6108, + "epoch": 7.973051169711499, + "grad_norm": 3.1719093322753906, + "learning_rate": 2.5203927575132523e-05, + "loss": 0.369, "step": 578700 }, { - "epoch": 5.9, - "learning_rate": 4.183107877452857e-05, - "loss": 0.5198, + "epoch": 7.9744289217712385, + "grad_norm": 3.79127836227417, + "learning_rate": 2.519594021837057e-05, + "loss": 0.3865, "step": 578800 }, { - "epoch": 5.9, - "learning_rate": 4.182504349176548e-05, - "loss": 0.5569, + "epoch": 7.975806673830977, + "grad_norm": 2.3954873085021973, + "learning_rate": 2.518795305735027e-05, + "loss": 0.4073, "step": 578900 }, { - "epoch": 5.9, - "learning_rate": 4.181894667126805e-05, - "loss": 0.5336, + "epoch": 7.977184425890717, + "grad_norm": 3.813998222351074, + "learning_rate": 2.5179966092750192e-05, + "loss": 0.4388, "step": 579000 }, { - "epoch": 5.9, - "learning_rate": 4.181284927289466e-05, - "loss": 0.5381, + "epoch": 7.978562177950456, + "grad_norm": 2.709373950958252, + "learning_rate": 2.5171979325248863e-05, + "loss": 0.4524, "step": 579100 }, { - "epoch": 5.9, - "learning_rate": 4.180675129694344e-05, - "loss": 0.6074, + "epoch": 7.979939930010195, + "grad_norm": 0.18899941444396973, + "learning_rate": 2.5163992755524803e-05, + "loss": 0.4042, "step": 579200 }, { - "epoch": 5.9, - "learning_rate": 4.180065274371253e-05, - "loss": 0.6188, + "epoch": 7.9813176820699345, + "grad_norm": 2.6865170001983643, + "learning_rate": 2.5156006384256514e-05, + "loss": 0.354, "step": 579300 }, { - "epoch": 5.9, - "learning_rate": 4.179455361350012e-05, - "loss": 0.6354, + "epoch": 7.982695434129674, + "grad_norm": 5.534639835357666, + "learning_rate": 2.514802021212246e-05, + "loss": 0.4178, "step": 579400 }, { - "epoch": 5.9, - "learning_rate": 4.178845390660444e-05, - "loss": 0.572, + "epoch": 7.984073186189414, + "grad_norm": 51.13286590576172, + "learning_rate": 2.514003423980113e-05, + "loss": 0.3789, "step": 579500 }, { - "epoch": 5.91, - "learning_rate": 4.178235362332371e-05, - "loss": 0.5296, + "epoch": 7.985450938249152, + "grad_norm": 5.714435577392578, + "learning_rate": 2.5132048467970962e-05, + "loss": 0.3794, "step": 579600 }, { - "epoch": 5.91, - "learning_rate": 4.17762527639562e-05, - "loss": 0.4643, + "epoch": 7.986828690308892, + "grad_norm": 1.4140841960906982, + "learning_rate": 2.5124062897310395e-05, + "loss": 0.4031, "step": 579700 }, { - "epoch": 5.91, - "learning_rate": 4.177015132880021e-05, - "loss": 0.45, + "epoch": 7.988206442368631, + "grad_norm": 16.32473373413086, + "learning_rate": 2.511607752849783e-05, + "loss": 0.4109, "step": 579800 }, { - "epoch": 5.91, - "learning_rate": 4.1764049318154055e-05, - "loss": 0.6342, + "epoch": 7.989584194428371, + "grad_norm": 5.832487106323242, + "learning_rate": 2.5108092362211687e-05, + "loss": 0.3748, "step": 579900 }, { - "epoch": 5.91, - "learning_rate": 4.175794673231608e-05, - "loss": 0.6484, + "epoch": 7.99096194648811, + "grad_norm": 0.9610350131988525, + "learning_rate": 2.5100107399130334e-05, + "loss": 0.4126, "step": 580000 }, { - "epoch": 5.91, - "learning_rate": 4.17518435715847e-05, - "loss": 0.5219, + "epoch": 7.992339698547849, + "grad_norm": 7.916874885559082, + "learning_rate": 2.509212263993213e-05, + "loss": 0.4233, "step": 580100 }, { - "epoch": 5.91, - "learning_rate": 4.174573983625829e-05, - "loss": 0.5191, + "epoch": 7.993717450607589, + "grad_norm": 2.643354654312134, + "learning_rate": 2.508413808529543e-05, + "loss": 0.3827, "step": 580200 }, { - "epoch": 5.91, - "learning_rate": 4.17396355266353e-05, - "loss": 0.6277, + "epoch": 7.995095202667328, + "grad_norm": 2.7436318397521973, + "learning_rate": 2.507615373589855e-05, + "loss": 0.4819, "step": 580300 }, { - "epoch": 5.91, - "learning_rate": 4.173353064301419e-05, - "loss": 0.5578, + "epoch": 7.996472954727067, + "grad_norm": 7.280269622802734, + "learning_rate": 2.506816959241983e-05, + "loss": 0.4383, "step": 580400 }, { - "epoch": 5.91, - "learning_rate": 4.172742518569345e-05, - "loss": 0.5549, + "epoch": 7.9978507067868065, + "grad_norm": 4.963311672210693, + "learning_rate": 2.5060185655537527e-05, + "loss": 0.3718, "step": 580500 }, { - "epoch": 5.92, - "learning_rate": 4.1721319154971605e-05, - "loss": 0.5566, + "epoch": 7.999228458846546, + "grad_norm": 0.560031533241272, + "learning_rate": 2.505220192592994e-05, + "loss": 0.4805, "step": 580600 }, { - "epoch": 5.92, - "learning_rate": 4.1715212551147204e-05, - "loss": 0.5945, + "epoch": 8.000606210906286, + "grad_norm": 6.957786560058594, + "learning_rate": 2.5044218404275323e-05, + "loss": 0.3501, "step": 580700 }, { - "epoch": 5.92, - "learning_rate": 4.170910537451883e-05, - "loss": 0.6283, + "epoch": 8.001983962966024, + "grad_norm": 2.446098566055298, + "learning_rate": 2.5036235091251908e-05, + "loss": 0.4419, "step": 580800 }, { - "epoch": 5.92, - "learning_rate": 4.1702997625385074e-05, - "loss": 0.5688, + "epoch": 8.003361715025765, + "grad_norm": 5.296990871429443, + "learning_rate": 2.5028251987537934e-05, + "loss": 0.2845, "step": 580900 }, { - "epoch": 5.92, - "learning_rate": 4.169688930404457e-05, - "loss": 0.6035, + "epoch": 8.004739467085503, + "grad_norm": 19.149066925048828, + "learning_rate": 2.5020269093811606e-05, + "loss": 0.3206, "step": 581000 }, { - "epoch": 5.92, - "learning_rate": 4.1690780410795986e-05, - "loss": 0.4726, + "epoch": 8.006117219145242, + "grad_norm": 19.77134895324707, + "learning_rate": 2.5012286410751097e-05, + "loss": 0.3589, "step": 581100 }, { - "epoch": 5.92, - "learning_rate": 4.1684670945938e-05, - "loss": 0.5327, + "epoch": 8.007494971204983, + "grad_norm": 0.6790552139282227, + "learning_rate": 2.5004303939034585e-05, + "loss": 0.4234, "step": 581200 }, { - "epoch": 5.92, - "learning_rate": 4.167862201295804e-05, - "loss": 0.6126, + "epoch": 8.008872723264721, + "grad_norm": 3.866525173187256, + "learning_rate": 2.4996401500885425e-05, + "loss": 0.3598, "step": 581300 }, { - "epoch": 5.92, - "learning_rate": 4.1672511411486086e-05, - "loss": 0.5975, + "epoch": 8.01025047532446, + "grad_norm": 2.6719462871551514, + "learning_rate": 2.498841945176098e-05, + "loss": 0.3573, "step": 581400 }, { - "epoch": 5.92, - "learning_rate": 4.166640023929798e-05, - "loss": 0.5948, + "epoch": 8.0116282273842, + "grad_norm": 3.52056622505188, + "learning_rate": 2.4980437616008164e-05, + "loss": 0.3626, "step": 581500 }, { - "epoch": 5.93, - "learning_rate": 4.166028849669253e-05, - "loss": 0.6736, + "epoch": 8.013005979443939, + "grad_norm": 1.1236474514007568, + "learning_rate": 2.497245599430505e-05, + "loss": 0.4041, "step": 581600 }, { - "epoch": 5.93, - "learning_rate": 4.1654176183968556e-05, - "loss": 0.6527, + "epoch": 8.01438373150368, + "grad_norm": 72.87816619873047, + "learning_rate": 2.4964474587329735e-05, + "loss": 0.4067, "step": 581700 }, { - "epoch": 5.93, - "learning_rate": 4.164806330142491e-05, - "loss": 0.7274, + "epoch": 8.015761483563418, + "grad_norm": 3.123290538787842, + "learning_rate": 2.4956493395760276e-05, + "loss": 0.364, "step": 581800 }, { - "epoch": 5.93, - "learning_rate": 4.164201098669927e-05, - "loss": 0.5922, + "epoch": 8.017139235623157, + "grad_norm": 7.390705108642578, + "learning_rate": 2.4948512420274705e-05, + "loss": 0.4034, "step": 581900 }, { - "epoch": 5.93, - "learning_rate": 4.163589697110371e-05, - "loss": 0.5286, + "epoch": 8.018516987682897, + "grad_norm": 5.756507396697998, + "learning_rate": 2.4940531661551066e-05, + "loss": 0.3837, "step": 582000 }, { - "epoch": 5.93, - "learning_rate": 4.1629782386582235e-05, - "loss": 0.5594, + "epoch": 8.019894739742636, + "grad_norm": 5.694328308105469, + "learning_rate": 2.4932551120267364e-05, + "loss": 0.3223, "step": 582100 }, { - "epoch": 5.93, - "learning_rate": 4.16236672334338e-05, - "loss": 0.6266, + "epoch": 8.021272491802375, + "grad_norm": 5.076456069946289, + "learning_rate": 2.492457079710157e-05, + "loss": 0.4219, "step": 582200 }, { - "epoch": 5.93, - "learning_rate": 4.16175515119574e-05, - "loss": 0.5597, + "epoch": 8.022650243862115, + "grad_norm": 3.5752804279327393, + "learning_rate": 2.491659069273167e-05, + "loss": 0.4212, "step": 582300 }, { - "epoch": 5.93, - "learning_rate": 4.161143522245206e-05, - "loss": 0.5293, + "epoch": 8.024027995921854, + "grad_norm": 13.525606155395508, + "learning_rate": 2.4908610807835594e-05, + "loss": 0.3778, "step": 582400 }, { - "epoch": 5.93, - "learning_rate": 4.160531836521683e-05, - "loss": 0.595, + "epoch": 8.025405747981594, + "grad_norm": 3.615927219390869, + "learning_rate": 2.4900631143091293e-05, + "loss": 0.3144, "step": 582500 }, { - "epoch": 5.94, - "learning_rate": 4.1599200940550785e-05, - "loss": 0.4925, + "epoch": 8.026783500041333, + "grad_norm": 3.405439615249634, + "learning_rate": 2.4892651699176684e-05, + "loss": 0.3821, "step": 582600 }, { - "epoch": 5.94, - "learning_rate": 4.159308294875305e-05, - "loss": 0.5489, + "epoch": 8.028161252101071, + "grad_norm": 0.8719682693481445, + "learning_rate": 2.4884672476769644e-05, + "loss": 0.3854, "step": 582700 }, { - "epoch": 5.94, - "learning_rate": 4.158696439012273e-05, - "loss": 0.531, + "epoch": 8.029539004160812, + "grad_norm": 2.0392138957977295, + "learning_rate": 2.487669347654806e-05, + "loss": 0.4037, "step": 582800 }, { - "epoch": 5.94, - "learning_rate": 4.1580845264959004e-05, - "loss": 0.4915, + "epoch": 8.03091675622055, + "grad_norm": 6.125339031219482, + "learning_rate": 2.486871469918978e-05, + "loss": 0.3863, "step": 582900 }, { - "epoch": 5.94, - "learning_rate": 4.157472557356106e-05, - "loss": 0.6102, + "epoch": 8.03229450828029, + "grad_norm": 3.3226945400238037, + "learning_rate": 2.486073614537265e-05, + "loss": 0.4577, "step": 583000 }, { - "epoch": 5.94, - "learning_rate": 4.15686053162281e-05, - "loss": 0.5712, + "epoch": 8.03367226034003, + "grad_norm": 6.521392822265625, + "learning_rate": 2.485275781577449e-05, + "loss": 0.3628, "step": 583100 }, { - "epoch": 5.94, - "learning_rate": 4.156248449325938e-05, - "loss": 0.5761, + "epoch": 8.035050012399768, + "grad_norm": 3.3724944591522217, + "learning_rate": 2.4844779711073103e-05, + "loss": 0.3712, "step": 583200 }, { - "epoch": 5.94, - "learning_rate": 4.155636310495417e-05, - "loss": 0.5345, + "epoch": 8.036427764459507, + "grad_norm": 4.434491157531738, + "learning_rate": 2.4836801831946256e-05, + "loss": 0.4095, "step": 583300 }, { - "epoch": 5.94, - "learning_rate": 4.155024115161177e-05, - "loss": 0.5068, + "epoch": 8.037805516519247, + "grad_norm": 1.3057156801223755, + "learning_rate": 2.482882417907171e-05, + "loss": 0.3733, "step": 583400 }, { - "epoch": 5.94, - "learning_rate": 4.154411863353151e-05, - "loss": 0.5088, + "epoch": 8.039183268578986, + "grad_norm": 3.5886263847351074, + "learning_rate": 2.4820846753127226e-05, + "loss": 0.3534, "step": 583500 }, { - "epoch": 5.95, - "learning_rate": 4.1537995551012734e-05, - "loss": 0.6086, + "epoch": 8.040561020638727, + "grad_norm": 4.770686626434326, + "learning_rate": 2.4812869554790517e-05, + "loss": 0.3746, "step": 583600 }, { - "epoch": 5.95, - "learning_rate": 4.153187190435484e-05, - "loss": 0.5429, + "epoch": 8.041938772698465, + "grad_norm": 4.717912197113037, + "learning_rate": 2.4804892584739283e-05, + "loss": 0.3566, "step": 583700 }, { - "epoch": 5.95, - "learning_rate": 4.152574769385722e-05, - "loss": 0.6396, + "epoch": 8.043316524758204, + "grad_norm": 4.722676753997803, + "learning_rate": 2.4796915843651216e-05, + "loss": 0.347, "step": 583800 }, { - "epoch": 5.95, - "learning_rate": 4.151962291981932e-05, - "loss": 0.6534, + "epoch": 8.044694276817944, + "grad_norm": 15.43338680267334, + "learning_rate": 2.478893933220397e-05, + "loss": 0.4241, "step": 583900 }, { - "epoch": 5.95, - "learning_rate": 4.15134975825406e-05, - "loss": 0.5257, + "epoch": 8.046072028877683, + "grad_norm": 1.5749702453613281, + "learning_rate": 2.4780963051075203e-05, + "loss": 0.4263, "step": 584000 }, { - "epoch": 5.95, - "learning_rate": 4.150737168232057e-05, - "loss": 0.5576, + "epoch": 8.047449780937422, + "grad_norm": 13.211424827575684, + "learning_rate": 2.4772987000942537e-05, + "loss": 0.3843, "step": 584100 }, { - "epoch": 5.95, - "learning_rate": 4.150124521945873e-05, - "loss": 0.5409, + "epoch": 8.048827532997162, + "grad_norm": 4.8888959884643555, + "learning_rate": 2.476501118248357e-05, + "loss": 0.3367, "step": 584200 }, { - "epoch": 5.95, - "learning_rate": 4.149511819425464e-05, - "loss": 0.6353, + "epoch": 8.0502052850569, + "grad_norm": 1.3789567947387695, + "learning_rate": 2.4757035596375908e-05, + "loss": 0.3575, "step": 584300 }, { - "epoch": 5.95, - "learning_rate": 4.148899060700787e-05, - "loss": 0.6322, + "epoch": 8.051583037116641, + "grad_norm": 0.9290852546691895, + "learning_rate": 2.4749139995672162e-05, + "loss": 0.3297, "step": 584400 }, { - "epoch": 5.95, - "learning_rate": 4.148286245801801e-05, - "loss": 0.4708, + "epoch": 8.05296078917638, + "grad_norm": 5.468562126159668, + "learning_rate": 2.474116487395934e-05, + "loss": 0.38, "step": 584500 }, { - "epoch": 5.96, - "learning_rate": 4.1476733747584705e-05, - "loss": 0.5462, + "epoch": 8.054338541236119, + "grad_norm": 2.3821027278900146, + "learning_rate": 2.4733189986623667e-05, + "loss": 0.353, "step": 584600 }, { - "epoch": 5.96, - "learning_rate": 4.1470604476007614e-05, - "loss": 0.6395, + "epoch": 8.055716293295859, + "grad_norm": 4.483600616455078, + "learning_rate": 2.4725215334342664e-05, + "loss": 0.3691, "step": 584700 }, { - "epoch": 5.96, - "learning_rate": 4.146447464358642e-05, - "loss": 0.6236, + "epoch": 8.057094045355598, + "grad_norm": 10.275177001953125, + "learning_rate": 2.4717240917793826e-05, + "loss": 0.3254, "step": 584800 }, { - "epoch": 5.96, - "learning_rate": 4.145834425062083e-05, - "loss": 0.5002, + "epoch": 8.058471797415336, + "grad_norm": 7.386829376220703, + "learning_rate": 2.4709266737654598e-05, + "loss": 0.3882, "step": 584900 }, { - "epoch": 5.96, - "learning_rate": 4.145221329741058e-05, - "loss": 0.4929, + "epoch": 8.059849549475077, + "grad_norm": 1.0556020736694336, + "learning_rate": 2.470129279460244e-05, + "loss": 0.3875, "step": 585000 }, { - "epoch": 5.96, - "learning_rate": 4.144608178425545e-05, - "loss": 0.5874, + "epoch": 8.061227301534815, + "grad_norm": 4.6821370124816895, + "learning_rate": 2.469331908931477e-05, + "loss": 0.4297, "step": 585100 }, { - "epoch": 5.96, - "learning_rate": 4.1439949711455224e-05, - "loss": 0.6709, + "epoch": 8.062605053594556, + "grad_norm": 20.469966888427734, + "learning_rate": 2.468534562246901e-05, + "loss": 0.3798, "step": 585200 }, { - "epoch": 5.96, - "learning_rate": 4.1433817079309724e-05, - "loss": 0.6451, + "epoch": 8.063982805654295, + "grad_norm": 1.6803171634674072, + "learning_rate": 2.4677372394742536e-05, + "loss": 0.3917, "step": 585300 }, { - "epoch": 5.96, - "learning_rate": 4.142768388811881e-05, - "loss": 0.4483, + "epoch": 8.065360557714033, + "grad_norm": 1.3143651485443115, + "learning_rate": 2.4669399406812725e-05, + "loss": 0.3587, "step": 585400 }, { - "epoch": 5.97, - "learning_rate": 4.1421550138182346e-05, - "loss": 0.6493, + "epoch": 8.066738309773774, + "grad_norm": 2.5498340129852295, + "learning_rate": 2.466142665935691e-05, + "loss": 0.391, "step": 585500 }, { - "epoch": 5.97, - "learning_rate": 4.141541582980024e-05, - "loss": 0.5465, + "epoch": 8.068116061833512, + "grad_norm": 3.9095711708068848, + "learning_rate": 2.4653454153052417e-05, + "loss": 0.3532, "step": 585600 }, { - "epoch": 5.97, - "learning_rate": 4.1409280963272426e-05, - "loss": 0.637, + "epoch": 8.069493813893251, + "grad_norm": 6.187689781188965, + "learning_rate": 2.464548188857656e-05, + "loss": 0.455, "step": 585700 }, { - "epoch": 5.97, - "learning_rate": 4.1403145538898854e-05, - "loss": 0.6471, + "epoch": 8.070871565952991, + "grad_norm": 5.546910285949707, + "learning_rate": 2.4637509866606622e-05, + "loss": 0.3863, "step": 585800 }, { - "epoch": 5.97, - "learning_rate": 4.1397009556979526e-05, - "loss": 0.6329, + "epoch": 8.07224931801273, + "grad_norm": 2.386507511138916, + "learning_rate": 2.4629538087819864e-05, + "loss": 0.3837, "step": 585900 }, { - "epoch": 5.97, - "learning_rate": 4.139087301781445e-05, - "loss": 0.6026, + "epoch": 8.07362707007247, + "grad_norm": 12.018856048583984, + "learning_rate": 2.4621566552893538e-05, + "loss": 0.4, "step": 586000 }, { - "epoch": 5.97, - "learning_rate": 4.1384735921703644e-05, - "loss": 0.5673, + "epoch": 8.07500482213221, + "grad_norm": 3.6749844551086426, + "learning_rate": 2.4613674974196055e-05, + "loss": 0.4033, "step": 586100 }, { - "epoch": 5.97, - "learning_rate": 4.1378598268947196e-05, - "loss": 0.5796, + "epoch": 8.076382574191948, + "grad_norm": 2.346867799758911, + "learning_rate": 2.4605703926566714e-05, + "loss": 0.3522, "step": 586200 }, { - "epoch": 5.97, - "learning_rate": 4.1372460059845197e-05, - "loss": 0.4758, + "epoch": 8.077760326251688, + "grad_norm": 3.628887176513672, + "learning_rate": 2.4597733124822634e-05, + "loss": 0.3212, "step": 586300 }, { - "epoch": 5.97, - "learning_rate": 4.1366321294697766e-05, - "loss": 0.5691, + "epoch": 8.079138078311427, + "grad_norm": 5.22951602935791, + "learning_rate": 2.4589762569640976e-05, + "loss": 0.4024, "step": 586400 }, { - "epoch": 5.98, - "learning_rate": 4.136018197380507e-05, - "loss": 0.6423, + "epoch": 8.080515830371166, + "grad_norm": 21.341642379760742, + "learning_rate": 2.4581792261698883e-05, + "loss": 0.3919, "step": 586500 }, { - "epoch": 5.98, - "learning_rate": 4.135404209746725e-05, - "loss": 0.4958, + "epoch": 8.081893582430906, + "grad_norm": 37.0136833190918, + "learning_rate": 2.4573822201673458e-05, + "loss": 0.3846, "step": 586600 }, { - "epoch": 5.98, - "learning_rate": 4.134790166598454e-05, - "loss": 0.6371, + "epoch": 8.083271334490645, + "grad_norm": 3.5850675106048584, + "learning_rate": 2.4565852390241804e-05, + "loss": 0.4317, "step": 586700 }, { - "epoch": 5.98, - "learning_rate": 4.134176067965715e-05, - "loss": 0.6296, + "epoch": 8.084649086550385, + "grad_norm": 2.767005443572998, + "learning_rate": 2.4557882828081003e-05, + "loss": 0.3224, "step": 586800 }, { - "epoch": 5.98, - "learning_rate": 4.1335619138785355e-05, - "loss": 0.5309, + "epoch": 8.086026838610124, + "grad_norm": 2.5002527236938477, + "learning_rate": 2.454991351586811e-05, + "loss": 0.3734, "step": 586900 }, { - "epoch": 5.98, - "learning_rate": 4.132947704366943e-05, - "loss": 0.5133, + "epoch": 8.087404590669863, + "grad_norm": 3.275029182434082, + "learning_rate": 2.4541944454280165e-05, + "loss": 0.3285, "step": 587000 }, { - "epoch": 5.98, - "learning_rate": 4.132333439460969e-05, - "loss": 0.6355, + "epoch": 8.088782342729603, + "grad_norm": 3.186555862426758, + "learning_rate": 2.453397564399416e-05, + "loss": 0.3757, "step": 587100 }, { - "epoch": 5.98, - "learning_rate": 4.131719119190647e-05, - "loss": 0.6336, + "epoch": 8.090160094789342, + "grad_norm": 1.81247878074646, + "learning_rate": 2.4526007085687098e-05, + "loss": 0.3433, "step": 587200 }, { - "epoch": 5.98, - "learning_rate": 4.1311047435860136e-05, - "loss": 0.6097, + "epoch": 8.09153784684908, + "grad_norm": 3.46832275390625, + "learning_rate": 2.4518038780035947e-05, + "loss": 0.4017, "step": 587300 }, { - "epoch": 5.98, - "learning_rate": 4.1304903126771084e-05, - "loss": 0.5597, + "epoch": 8.09291559890882, + "grad_norm": 2.4992687702178955, + "learning_rate": 2.4510070727717666e-05, + "loss": 0.4383, "step": 587400 }, { - "epoch": 5.99, - "learning_rate": 4.129875826493974e-05, - "loss": 0.5633, + "epoch": 8.09429335096856, + "grad_norm": 0.7022088170051575, + "learning_rate": 2.4502102929409167e-05, + "loss": 0.3616, "step": 587500 }, { - "epoch": 5.99, - "learning_rate": 4.1292612850666533e-05, - "loss": 0.68, + "epoch": 8.095671103028298, + "grad_norm": 3.1558876037597656, + "learning_rate": 2.449413538578737e-05, + "loss": 0.3719, "step": 587600 }, { - "epoch": 5.99, - "learning_rate": 4.128646688425195e-05, - "loss": 0.6008, + "epoch": 8.097048855088039, + "grad_norm": 4.710842132568359, + "learning_rate": 2.4486168097529137e-05, + "loss": 0.3555, "step": 587700 }, { - "epoch": 5.99, - "learning_rate": 4.12803203659965e-05, - "loss": 0.5101, + "epoch": 8.098426607147777, + "grad_norm": 2.684572219848633, + "learning_rate": 2.447820106531135e-05, + "loss": 0.3875, "step": 587800 }, { - "epoch": 5.99, - "learning_rate": 4.12741732962007e-05, - "loss": 0.4864, + "epoch": 8.099804359207518, + "grad_norm": 2.155186176300049, + "learning_rate": 2.4470234289810844e-05, + "loss": 0.3816, "step": 587900 }, { - "epoch": 5.99, - "learning_rate": 4.12680871541031e-05, - "loss": 0.5169, + "epoch": 8.101182111267256, + "grad_norm": 2.575976610183716, + "learning_rate": 2.446226777170443e-05, + "loss": 0.3729, "step": 588000 }, { - "epoch": 5.99, - "learning_rate": 4.12619389876362e-05, - "loss": 0.57, + "epoch": 8.102559863326995, + "grad_norm": 8.682147026062012, + "learning_rate": 2.4454301511668903e-05, + "loss": 0.4029, "step": 588100 }, { - "epoch": 5.99, - "learning_rate": 4.1255790270527685e-05, - "loss": 0.5923, + "epoch": 8.103937615386736, + "grad_norm": 2.202414035797119, + "learning_rate": 2.444633551038106e-05, + "loss": 0.3256, "step": 588200 }, { - "epoch": 5.99, - "learning_rate": 4.12496410030782e-05, - "loss": 0.5668, + "epoch": 8.105315367446474, + "grad_norm": 2.3452773094177246, + "learning_rate": 2.4438369768517633e-05, + "loss": 0.3766, "step": 588300 }, { - "epoch": 5.99, - "learning_rate": 4.1243491185588396e-05, - "loss": 0.6334, + "epoch": 8.106693119506213, + "grad_norm": 4.665926933288574, + "learning_rate": 2.4430404286755354e-05, + "loss": 0.3883, "step": 588400 }, { - "epoch": 6.0, - "learning_rate": 4.123734081835898e-05, - "loss": 0.5487, + "epoch": 8.108070871565953, + "grad_norm": 0.5225210785865784, + "learning_rate": 2.4422439065770938e-05, + "loss": 0.3915, "step": 588500 }, { - "epoch": 6.0, - "learning_rate": 4.1231189901690664e-05, - "loss": 0.5597, + "epoch": 8.109448623625692, + "grad_norm": 3.7652533054351807, + "learning_rate": 2.4414474106241063e-05, + "loss": 0.435, "step": 588600 }, { - "epoch": 6.0, - "learning_rate": 4.1225038435884174e-05, - "loss": 0.5646, + "epoch": 8.110826375685432, + "grad_norm": 10.29556941986084, + "learning_rate": 2.4406509408842406e-05, + "loss": 0.3668, "step": 588700 }, { - "epoch": 6.0, - "learning_rate": 4.1218886421240294e-05, - "loss": 0.6376, + "epoch": 8.112204127745171, + "grad_norm": 5.862832546234131, + "learning_rate": 2.4398544974251598e-05, + "loss": 0.4052, "step": 588800 }, { - "epoch": 6.0, - "learning_rate": 4.121273385805982e-05, - "loss": 0.612, + "epoch": 8.11358187980491, + "grad_norm": 3.3638527393341064, + "learning_rate": 2.4390580803145255e-05, + "loss": 0.4398, "step": 588900 }, { - "epoch": 6.0, - "learning_rate": 4.1206580746643576e-05, - "loss": 0.4853, + "epoch": 8.11495963186465, + "grad_norm": 4.01972770690918, + "learning_rate": 2.4382616896199987e-05, + "loss": 0.3262, "step": 589000 }, { - "epoch": 6.0, - "learning_rate": 4.120042708729241e-05, - "loss": 0.5164, + "epoch": 8.116337383924389, + "grad_norm": 8.807936668395996, + "learning_rate": 2.4374653254092354e-05, + "loss": 0.376, "step": 589100 }, { - "epoch": 6.0, - "learning_rate": 4.11942728803072e-05, - "loss": 0.4396, + "epoch": 8.117715135984128, + "grad_norm": 18.51058578491211, + "learning_rate": 2.436668987749892e-05, + "loss": 0.4229, "step": 589200 }, { - "epoch": 6.0, - "learning_rate": 4.1188118125988855e-05, - "loss": 0.5545, + "epoch": 8.119092888043868, + "grad_norm": 2.7492315769195557, + "learning_rate": 2.435880639688039e-05, + "loss": 0.3598, "step": 589300 }, { - "epoch": 6.0, - "learning_rate": 4.11819628246383e-05, - "loss": 0.5377, + "epoch": 8.120470640103607, + "grad_norm": 1.5246316194534302, + "learning_rate": 2.4350843550672885e-05, + "loss": 0.3344, "step": 589400 }, { - "epoch": 6.01, - "learning_rate": 4.1175806976556485e-05, - "loss": 0.4688, + "epoch": 8.121848392163347, + "grad_norm": 7.685235500335693, + "learning_rate": 2.434288097200234e-05, + "loss": 0.4201, "step": 589500 }, { - "epoch": 6.01, - "learning_rate": 4.11696505820444e-05, - "loss": 0.5219, + "epoch": 8.123226144223086, + "grad_norm": 3.045987606048584, + "learning_rate": 2.433491866154521e-05, + "loss": 0.3697, "step": 589600 }, { - "epoch": 6.01, - "learning_rate": 4.116349364140307e-05, - "loss": 0.6404, + "epoch": 8.124603896282824, + "grad_norm": 3.55100154876709, + "learning_rate": 2.4326956619977925e-05, + "loss": 0.3516, "step": 589700 }, { - "epoch": 6.01, - "learning_rate": 4.115733615493351e-05, - "loss": 0.4468, + "epoch": 8.125981648342565, + "grad_norm": 2.05771803855896, + "learning_rate": 2.4318994847976924e-05, + "loss": 0.3444, "step": 589800 }, { - "epoch": 6.01, - "learning_rate": 4.115117812293681e-05, - "loss": 0.5317, + "epoch": 8.127359400402304, + "grad_norm": 2.076603412628174, + "learning_rate": 2.431103334621857e-05, + "loss": 0.3839, "step": 589900 }, { - "epoch": 6.01, - "learning_rate": 4.1145019545714046e-05, - "loss": 0.5006, + "epoch": 8.128737152462042, + "grad_norm": 1.0720906257629395, + "learning_rate": 2.4303072115379255e-05, + "loss": 0.4236, "step": 590000 }, { - "epoch": 6.01, - "learning_rate": 4.1138860423566324e-05, - "loss": 0.5178, + "epoch": 8.130114904521783, + "grad_norm": 8.220071792602539, + "learning_rate": 2.4295111156135315e-05, + "loss": 0.3565, "step": 590100 }, { - "epoch": 6.01, - "learning_rate": 4.113270075679481e-05, - "loss": 0.4846, + "epoch": 8.131492656581521, + "grad_norm": 3.7118887901306152, + "learning_rate": 2.4287150469163072e-05, + "loss": 0.4404, "step": 590200 }, { - "epoch": 6.01, - "learning_rate": 4.1126540545700654e-05, - "loss": 0.5962, + "epoch": 8.132870408641262, + "grad_norm": 1.0506024360656738, + "learning_rate": 2.4279190055138845e-05, + "loss": 0.3967, "step": 590300 }, { - "epoch": 6.02, - "learning_rate": 4.1120379790585075e-05, - "loss": 0.4703, + "epoch": 8.134248160701, + "grad_norm": 2.0124876499176025, + "learning_rate": 2.42712299147389e-05, + "loss": 0.3737, "step": 590400 }, { - "epoch": 6.02, - "learning_rate": 4.111421849174928e-05, - "loss": 0.5624, + "epoch": 8.135625912760739, + "grad_norm": 2.834197998046875, + "learning_rate": 2.426327004863949e-05, + "loss": 0.3778, "step": 590500 }, { - "epoch": 6.02, - "learning_rate": 4.110805664949454e-05, - "loss": 0.5424, + "epoch": 8.13700366482048, + "grad_norm": 2.8162879943847656, + "learning_rate": 2.425531045751685e-05, + "loss": 0.3665, "step": 590600 }, { - "epoch": 6.02, - "learning_rate": 4.110195589066327e-05, - "loss": 0.6128, + "epoch": 8.138381416880218, + "grad_norm": 3.4018990993499756, + "learning_rate": 2.4247351142047178e-05, + "loss": 0.3366, "step": 590700 }, { - "epoch": 6.02, - "learning_rate": 4.109579296790114e-05, - "loss": 0.5603, + "epoch": 8.139759168939957, + "grad_norm": 1.353232502937317, + "learning_rate": 2.423939210290667e-05, + "loss": 0.3567, "step": 590800 }, { - "epoch": 6.02, - "learning_rate": 4.108962950262094e-05, - "loss": 0.5908, + "epoch": 8.141136920999697, + "grad_norm": 4.212244987487793, + "learning_rate": 2.4231433340771495e-05, + "loss": 0.3662, "step": 590900 }, { - "epoch": 6.02, - "learning_rate": 4.108346549512405e-05, - "loss": 0.5302, + "epoch": 8.142514673059436, + "grad_norm": 4.180209159851074, + "learning_rate": 2.4223474856317772e-05, + "loss": 0.3656, "step": 591000 }, { - "epoch": 6.02, - "learning_rate": 4.107730094571183e-05, - "loss": 0.5074, + "epoch": 8.143892425119176, + "grad_norm": 10.292398452758789, + "learning_rate": 2.421551665022162e-05, + "loss": 0.4106, "step": 591100 }, { - "epoch": 6.02, - "learning_rate": 4.10711358546857e-05, - "loss": 0.4752, + "epoch": 8.145270177178915, + "grad_norm": 3.6615183353424072, + "learning_rate": 2.4207558723159126e-05, + "loss": 0.3467, "step": 591200 }, { - "epoch": 6.02, - "learning_rate": 4.10649702223471e-05, - "loss": 0.4885, + "epoch": 8.146647929238654, + "grad_norm": 0.9376118779182434, + "learning_rate": 2.4199601075806373e-05, + "loss": 0.3996, "step": 591300 }, { - "epoch": 6.03, - "learning_rate": 4.105880404899749e-05, - "loss": 0.611, + "epoch": 8.148025681298394, + "grad_norm": 5.277390480041504, + "learning_rate": 2.4191643708839388e-05, + "loss": 0.3259, "step": 591400 }, { - "epoch": 6.03, - "learning_rate": 4.105263733493836e-05, - "loss": 0.4913, + "epoch": 8.149403433358133, + "grad_norm": 1.3508918285369873, + "learning_rate": 2.4183686622934205e-05, + "loss": 0.3492, "step": 591500 }, { - "epoch": 6.03, - "learning_rate": 4.104647008047121e-05, - "loss": 0.4679, + "epoch": 8.150781185417872, + "grad_norm": 18.680063247680664, + "learning_rate": 2.4175729818766803e-05, + "loss": 0.3493, "step": 591600 }, { - "epoch": 6.03, - "learning_rate": 4.104030228589761e-05, - "loss": 0.5721, + "epoch": 8.152158937477612, + "grad_norm": 8.195116996765137, + "learning_rate": 2.4167773297013152e-05, + "loss": 0.4073, "step": 591700 }, { - "epoch": 6.03, - "learning_rate": 4.10341339515191e-05, - "loss": 0.5539, + "epoch": 8.15353668953735, + "grad_norm": 9.060653686523438, + "learning_rate": 2.4159817058349215e-05, + "loss": 0.3619, "step": 591800 }, { - "epoch": 6.03, - "learning_rate": 4.10279650776373e-05, - "loss": 0.5891, + "epoch": 8.15491444159709, + "grad_norm": 1.9935379028320312, + "learning_rate": 2.4151861103450907e-05, + "loss": 0.4389, "step": 591900 }, { - "epoch": 6.03, - "learning_rate": 4.102179566455381e-05, - "loss": 0.5013, + "epoch": 8.15629219365683, + "grad_norm": 3.29073429107666, + "learning_rate": 2.414390543299413e-05, + "loss": 0.331, "step": 592000 }, { - "epoch": 6.03, - "learning_rate": 4.101568741475669e-05, - "loss": 0.5112, + "epoch": 8.157669945716568, + "grad_norm": 0.2610456943511963, + "learning_rate": 2.4135950047654755e-05, + "loss": 0.3759, "step": 592100 }, { - "epoch": 6.03, - "learning_rate": 4.10095169295593e-05, - "loss": 0.6183, + "epoch": 8.159047697776309, + "grad_norm": 3.4052512645721436, + "learning_rate": 2.4127994948108627e-05, + "loss": 0.3656, "step": 592200 }, { - "epoch": 6.03, - "learning_rate": 4.1003407618960795e-05, - "loss": 0.62, + "epoch": 8.160425449836048, + "grad_norm": 1.7754141092300415, + "learning_rate": 2.412004013503158e-05, + "loss": 0.4142, "step": 592300 }, { - "epoch": 6.04, - "learning_rate": 4.0997236062844254e-05, - "loss": 0.6609, + "epoch": 8.161803201895786, + "grad_norm": 0.057644303888082504, + "learning_rate": 2.4112085609099422e-05, + "loss": 0.3774, "step": 592400 }, { - "epoch": 6.04, - "learning_rate": 4.099106396902851e-05, - "loss": 0.5438, + "epoch": 8.163180953955527, + "grad_norm": 2.374054193496704, + "learning_rate": 2.4104131370987922e-05, + "loss": 0.4012, "step": 592500 }, { - "epoch": 6.04, - "learning_rate": 4.098489133781534e-05, - "loss": 0.6257, + "epoch": 8.164558706015265, + "grad_norm": 2.596203088760376, + "learning_rate": 2.409617742137284e-05, + "loss": 0.2971, "step": 592600 }, { - "epoch": 6.04, - "learning_rate": 4.0978718169506516e-05, - "loss": 0.511, + "epoch": 8.165936458075004, + "grad_norm": 2.750272035598755, + "learning_rate": 2.4088223760929887e-05, + "loss": 0.34, "step": 592700 }, { - "epoch": 6.04, - "learning_rate": 4.0972606204111065e-05, - "loss": 0.6321, + "epoch": 8.167314210134744, + "grad_norm": 2.6829633712768555, + "learning_rate": 2.4080270390334786e-05, + "loss": 0.3827, "step": 592800 }, { - "epoch": 6.04, - "learning_rate": 4.096643196787993e-05, - "loss": 0.5457, + "epoch": 8.168691962194483, + "grad_norm": 4.634260177612305, + "learning_rate": 2.4072317310263208e-05, + "loss": 0.3521, "step": 592900 }, { - "epoch": 6.04, - "learning_rate": 4.096025719545571e-05, - "loss": 0.5972, + "epoch": 8.170069714254224, + "grad_norm": 1.6457654237747192, + "learning_rate": 2.4064364521390817e-05, + "loss": 0.383, "step": 593000 }, { - "epoch": 6.04, - "learning_rate": 4.095408188714033e-05, - "loss": 0.5937, + "epoch": 8.171447466313962, + "grad_norm": 2.2018299102783203, + "learning_rate": 2.405641202439324e-05, + "loss": 0.3409, "step": 593100 }, { - "epoch": 6.04, - "learning_rate": 4.094790604323572e-05, - "loss": 0.6263, + "epoch": 8.172825218373701, + "grad_norm": 3.3153560161590576, + "learning_rate": 2.4048459819946067e-05, + "loss": 0.3608, "step": 593200 }, { - "epoch": 6.04, - "learning_rate": 4.0941729664043826e-05, - "loss": 0.6391, + "epoch": 8.174202970433441, + "grad_norm": 6.065871238708496, + "learning_rate": 2.4040507908724895e-05, + "loss": 0.3746, "step": 593300 }, { - "epoch": 6.05, - "learning_rate": 4.093555274986666e-05, - "loss": 0.5879, + "epoch": 8.17558072249318, + "grad_norm": 4.061732769012451, + "learning_rate": 2.4032556291405274e-05, + "loss": 0.38, "step": 593400 }, { - "epoch": 6.05, - "learning_rate": 4.0929375301006215e-05, - "loss": 0.4909, + "epoch": 8.176958474552919, + "grad_norm": 4.170194149017334, + "learning_rate": 2.402460496866274e-05, + "loss": 0.427, "step": 593500 }, { - "epoch": 6.05, - "learning_rate": 4.0923197317764546e-05, - "loss": 0.5866, + "epoch": 8.17833622661266, + "grad_norm": 4.86626672744751, + "learning_rate": 2.4016653941172783e-05, + "loss": 0.38, "step": 593600 }, { - "epoch": 6.05, - "learning_rate": 4.091701880044371e-05, - "loss": 0.5637, + "epoch": 8.179713978672398, + "grad_norm": 115.64444732666016, + "learning_rate": 2.4008703209610908e-05, + "loss": 0.3639, "step": 593700 }, { - "epoch": 6.05, - "learning_rate": 4.0910839749345804e-05, - "loss": 0.6098, + "epoch": 8.181091730732138, + "grad_norm": 2.123413562774658, + "learning_rate": 2.400075277465255e-05, + "loss": 0.3959, "step": 593800 }, { - "epoch": 6.05, - "learning_rate": 4.090466016477295e-05, - "loss": 0.5655, + "epoch": 8.182469482791877, + "grad_norm": 2.600405216217041, + "learning_rate": 2.3992882136876187e-05, + "loss": 0.3835, "step": 593900 }, { - "epoch": 6.05, - "learning_rate": 4.089848004702729e-05, - "loss": 0.517, + "epoch": 8.183847234851616, + "grad_norm": 2.9496586322784424, + "learning_rate": 2.3984932294168256e-05, + "loss": 0.4046, "step": 594000 }, { - "epoch": 6.05, - "learning_rate": 4.0892299396411e-05, - "loss": 0.5466, + "epoch": 8.185224986911356, + "grad_norm": 3.0139377117156982, + "learning_rate": 2.3976982750083313e-05, + "loss": 0.3847, "step": 594100 }, { - "epoch": 6.05, - "learning_rate": 4.0886118213226264e-05, - "loss": 0.4581, + "epoch": 8.186602738971095, + "grad_norm": 2.8484079837799072, + "learning_rate": 2.396903350529671e-05, + "loss": 0.3166, "step": 594200 }, { - "epoch": 6.05, - "learning_rate": 4.08799364977753e-05, - "loss": 0.5438, + "epoch": 8.187980491030833, + "grad_norm": 0.648325502872467, + "learning_rate": 2.396108456048377e-05, + "loss": 0.304, "step": 594300 }, { - "epoch": 6.06, - "learning_rate": 4.087375425036039e-05, - "loss": 0.5562, + "epoch": 8.189358243090574, + "grad_norm": 24.33765983581543, + "learning_rate": 2.3953135916319794e-05, + "loss": 0.3166, "step": 594400 }, { - "epoch": 6.06, - "learning_rate": 4.086757147128379e-05, - "loss": 0.6083, + "epoch": 8.190735995150312, + "grad_norm": 2.528599977493286, + "learning_rate": 2.3945187573480076e-05, + "loss": 0.3675, "step": 594500 }, { - "epoch": 6.06, - "learning_rate": 4.0861388160847784e-05, - "loss": 0.5024, + "epoch": 8.192113747210053, + "grad_norm": 3.2952067852020264, + "learning_rate": 2.393723953263986e-05, + "loss": 0.3656, "step": 594600 }, { - "epoch": 6.06, - "learning_rate": 4.085520431935472e-05, - "loss": 0.5311, + "epoch": 8.193491499269792, + "grad_norm": 2.1732401847839355, + "learning_rate": 2.3929291794474363e-05, + "loss": 0.4261, "step": 594700 }, { - "epoch": 6.06, - "learning_rate": 4.084901994710694e-05, - "loss": 0.518, + "epoch": 8.19486925132953, + "grad_norm": 4.165266990661621, + "learning_rate": 2.392134435965882e-05, + "loss": 0.3583, "step": 594800 }, { - "epoch": 6.06, - "learning_rate": 4.084283504440684e-05, - "loss": 0.502, + "epoch": 8.19624700338927, + "grad_norm": 3.733318328857422, + "learning_rate": 2.391339722886836e-05, + "loss": 0.4283, "step": 594900 }, { - "epoch": 6.06, - "learning_rate": 4.083664961155681e-05, - "loss": 0.562, + "epoch": 8.19762475544901, + "grad_norm": 0.897555410861969, + "learning_rate": 2.390545040277816e-05, + "loss": 0.3226, "step": 595000 }, { - "epoch": 6.06, - "learning_rate": 4.0830463648859276e-05, - "loss": 0.4815, + "epoch": 8.199002507508748, + "grad_norm": 3.5876872539520264, + "learning_rate": 2.3897503882063338e-05, + "loss": 0.3049, "step": 595100 }, { - "epoch": 6.06, - "learning_rate": 4.08242771566167e-05, - "loss": 0.6263, + "epoch": 8.200380259568488, + "grad_norm": 11.801627159118652, + "learning_rate": 2.388955766739899e-05, + "loss": 0.3764, "step": 595200 }, { - "epoch": 6.07, - "learning_rate": 4.081809013513158e-05, - "loss": 0.5333, + "epoch": 8.201758011628227, + "grad_norm": 0.23541490733623505, + "learning_rate": 2.3881611759460198e-05, + "loss": 0.3459, "step": 595300 }, { - "epoch": 6.07, - "learning_rate": 4.08119025847064e-05, - "loss": 0.542, + "epoch": 8.203135763687968, + "grad_norm": 3.1915454864501953, + "learning_rate": 2.3873666158921987e-05, + "loss": 0.3515, "step": 595400 }, { - "epoch": 6.07, - "learning_rate": 4.080571450564372e-05, - "loss": 0.5084, + "epoch": 8.204513515747706, + "grad_norm": 8.96133804321289, + "learning_rate": 2.386572086645939e-05, + "loss": 0.3607, "step": 595500 }, { - "epoch": 6.07, - "learning_rate": 4.079952589824607e-05, - "loss": 0.5497, + "epoch": 8.205891267807445, + "grad_norm": 4.7565741539001465, + "learning_rate": 2.3857775882747402e-05, + "loss": 0.3701, "step": 595600 }, { - "epoch": 6.07, - "learning_rate": 4.079333676281606e-05, - "loss": 0.4469, + "epoch": 8.207269019867185, + "grad_norm": 2.963245391845703, + "learning_rate": 2.384983120846098e-05, + "loss": 0.4234, "step": 595700 }, { - "epoch": 6.07, - "learning_rate": 4.0787147099656296e-05, - "loss": 0.5457, + "epoch": 8.208646771926924, + "grad_norm": 2.912383556365967, + "learning_rate": 2.3841886844275077e-05, + "loss": 0.3806, "step": 595800 }, { - "epoch": 6.07, - "learning_rate": 4.0780956909069404e-05, - "loss": 0.559, + "epoch": 8.210024523986663, + "grad_norm": 2.8053717613220215, + "learning_rate": 2.3833942790864604e-05, + "loss": 0.3714, "step": 595900 }, { - "epoch": 6.07, - "learning_rate": 4.0774766191358067e-05, - "loss": 0.5114, + "epoch": 8.211402276046403, + "grad_norm": 2.7469027042388916, + "learning_rate": 2.3825999048904446e-05, + "loss": 0.4079, "step": 596000 }, { - "epoch": 6.07, - "learning_rate": 4.076857494682495e-05, - "loss": 0.5013, + "epoch": 8.212780028106142, + "grad_norm": 3.2951292991638184, + "learning_rate": 2.3818055619069457e-05, + "loss": 0.4025, "step": 596100 }, { - "epoch": 6.07, - "learning_rate": 4.0762383175772796e-05, - "loss": 0.5116, + "epoch": 8.21415778016588, + "grad_norm": 13.589353561401367, + "learning_rate": 2.3810112502034483e-05, + "loss": 0.4093, "step": 596200 }, { - "epoch": 6.08, - "learning_rate": 4.075619087850432e-05, - "loss": 0.4689, + "epoch": 8.215535532225621, + "grad_norm": 12.381446838378906, + "learning_rate": 2.380216969847433e-05, + "loss": 0.3148, "step": 596300 }, { - "epoch": 6.08, - "learning_rate": 4.074999805532231e-05, - "loss": 0.5377, + "epoch": 8.21691328428536, + "grad_norm": 2.9471869468688965, + "learning_rate": 2.3794227209063778e-05, + "loss": 0.3429, "step": 596400 }, { - "epoch": 6.08, - "learning_rate": 4.074380470652954e-05, - "loss": 0.5886, + "epoch": 8.2182910363451, + "grad_norm": 2.6726393699645996, + "learning_rate": 2.3786285034477594e-05, + "loss": 0.3193, "step": 596500 }, { - "epoch": 6.08, - "learning_rate": 4.073761083242884e-05, - "loss": 0.4881, + "epoch": 8.219668788404839, + "grad_norm": 1.4599651098251343, + "learning_rate": 2.3778343175390484e-05, + "loss": 0.416, "step": 596600 }, { - "epoch": 6.08, - "learning_rate": 4.0731416433323046e-05, - "loss": 0.586, + "epoch": 8.221046540464577, + "grad_norm": 6.416299343109131, + "learning_rate": 2.377040163247716e-05, + "loss": 0.3648, "step": 596700 }, { - "epoch": 6.08, - "learning_rate": 4.072522150951503e-05, - "loss": 0.5998, + "epoch": 8.222424292524318, + "grad_norm": 6.702447414398193, + "learning_rate": 2.3762460406412302e-05, + "loss": 0.3936, "step": 596800 }, { - "epoch": 6.08, - "learning_rate": 4.0719026061307695e-05, - "loss": 0.5474, + "epoch": 8.223802044584057, + "grad_norm": 3.8319742679595947, + "learning_rate": 2.3754519497870553e-05, + "loss": 0.4281, "step": 596900 }, { - "epoch": 6.08, - "learning_rate": 4.071283008900394e-05, - "loss": 0.5887, + "epoch": 8.225179796643795, + "grad_norm": 49.12172317504883, + "learning_rate": 2.374657890752654e-05, + "loss": 0.4049, "step": 597000 }, { - "epoch": 6.08, - "learning_rate": 4.070663359290674e-05, - "loss": 0.5512, + "epoch": 8.226557548703536, + "grad_norm": 1.7983771562576294, + "learning_rate": 2.373863863605485e-05, + "loss": 0.3459, "step": 597100 }, { - "epoch": 6.08, - "learning_rate": 4.070043657331904e-05, - "loss": 0.4717, + "epoch": 8.227935300763274, + "grad_norm": 6.215121269226074, + "learning_rate": 2.3730698684130037e-05, + "loss": 0.3489, "step": 597200 }, { - "epoch": 6.09, - "learning_rate": 4.0694239030543846e-05, - "loss": 0.5564, + "epoch": 8.229313052823015, + "grad_norm": 3.391045093536377, + "learning_rate": 2.3722759052426664e-05, + "loss": 0.3825, "step": 597300 }, { - "epoch": 6.09, - "learning_rate": 4.068804096488419e-05, - "loss": 0.5988, + "epoch": 8.230690804882753, + "grad_norm": 5.365300178527832, + "learning_rate": 2.3714819741619237e-05, + "loss": 0.3795, "step": 597400 }, { - "epoch": 6.09, - "learning_rate": 4.06818423766431e-05, - "loss": 0.5198, + "epoch": 8.232068556942492, + "grad_norm": 1.8313648700714111, + "learning_rate": 2.3706880752382223e-05, + "loss": 0.4024, "step": 597500 }, { - "epoch": 6.09, - "learning_rate": 4.067570525981316e-05, - "loss": 0.5715, + "epoch": 8.233446309002233, + "grad_norm": 11.814016342163086, + "learning_rate": 2.369894208539012e-05, + "loss": 0.3888, "step": 597600 }, { - "epoch": 6.09, - "learning_rate": 4.066950563253674e-05, - "loss": 0.4577, + "epoch": 8.234824061061971, + "grad_norm": 6.280355453491211, + "learning_rate": 2.369108312315738e-05, + "loss": 0.3785, "step": 597700 }, { - "epoch": 6.09, - "learning_rate": 4.0663305483585165e-05, - "loss": 0.486, + "epoch": 8.23620181312171, + "grad_norm": 5.001255035400391, + "learning_rate": 2.3683145099439022e-05, + "loss": 0.3578, "step": 597800 }, { - "epoch": 6.09, - "learning_rate": 4.06571048132616e-05, - "loss": 0.4831, + "epoch": 8.23757956518145, + "grad_norm": 4.572300434112549, + "learning_rate": 2.3675207399982008e-05, + "loss": 0.381, "step": 597900 }, { - "epoch": 6.09, - "learning_rate": 4.065090362186919e-05, - "loss": 0.4965, + "epoch": 8.238957317241189, + "grad_norm": 6.731862545013428, + "learning_rate": 2.3667270025460703e-05, + "loss": 0.4538, "step": 598000 }, { - "epoch": 6.09, - "learning_rate": 4.064470190971116e-05, - "loss": 0.5462, + "epoch": 8.24033506930093, + "grad_norm": 2.721407651901245, + "learning_rate": 2.3659332976549418e-05, + "loss": 0.401, "step": 598100 }, { - "epoch": 6.09, - "learning_rate": 4.063849967709073e-05, - "loss": 0.4754, + "epoch": 8.241712821360668, + "grad_norm": 3.0862796306610107, + "learning_rate": 2.365139625392244e-05, + "loss": 0.4178, "step": 598200 }, { - "epoch": 6.1, - "learning_rate": 4.0632296924311154e-05, - "loss": 0.5617, + "epoch": 8.243090573420407, + "grad_norm": 1.7837305068969727, + "learning_rate": 2.3643539220590068e-05, + "loss": 0.3603, "step": 598300 }, { - "epoch": 6.1, - "learning_rate": 4.0626093651675716e-05, - "loss": 0.5968, + "epoch": 8.244468325480147, + "grad_norm": 3.3449065685272217, + "learning_rate": 2.3635603149274818e-05, + "loss": 0.389, "step": 598400 }, { - "epoch": 6.1, - "learning_rate": 4.061988985948771e-05, - "loss": 0.6003, + "epoch": 8.245846077539886, + "grad_norm": 1.9673396348953247, + "learning_rate": 2.3627667406259845e-05, + "loss": 0.3753, "step": 598500 }, { - "epoch": 6.1, - "learning_rate": 4.061368554805046e-05, - "loss": 0.5738, + "epoch": 8.247223829599625, + "grad_norm": 6.038936138153076, + "learning_rate": 2.3619731992219356e-05, + "loss": 0.3798, "step": 598600 }, { - "epoch": 6.1, - "learning_rate": 4.0607480717667325e-05, - "loss": 0.6421, + "epoch": 8.248601581659365, + "grad_norm": 0.7166121006011963, + "learning_rate": 2.3611796907827465e-05, + "loss": 0.3308, "step": 598700 }, { - "epoch": 6.1, - "learning_rate": 4.060127536864169e-05, - "loss": 0.4819, + "epoch": 8.249979333719104, + "grad_norm": 1.7720637321472168, + "learning_rate": 2.360386215375833e-05, + "loss": 0.3657, "step": 598800 }, { - "epoch": 6.1, - "learning_rate": 4.0595069501276946e-05, - "loss": 0.5702, + "epoch": 8.251357085778844, + "grad_norm": 3.3728113174438477, + "learning_rate": 2.3595927730686032e-05, + "loss": 0.2964, "step": 598900 }, { - "epoch": 6.1, - "learning_rate": 4.0588863115876546e-05, - "loss": 0.5597, + "epoch": 8.252734837838583, + "grad_norm": 1.96244478225708, + "learning_rate": 2.358799363928464e-05, + "loss": 0.3805, "step": 599000 }, { - "epoch": 6.1, - "learning_rate": 4.058265621274392e-05, - "loss": 0.4722, + "epoch": 8.254112589898321, + "grad_norm": 2.015028715133667, + "learning_rate": 2.3580059880228212e-05, + "loss": 0.3648, "step": 599100 }, { - "epoch": 6.1, - "learning_rate": 4.057644879218257e-05, - "loss": 0.5098, + "epoch": 8.255490341958062, + "grad_norm": 2.4776034355163574, + "learning_rate": 2.3572126454190757e-05, + "loss": 0.3487, "step": 599200 }, { - "epoch": 6.11, - "learning_rate": 4.057024085449597e-05, - "loss": 0.5347, + "epoch": 8.2568680940178, + "grad_norm": 3.41325044631958, + "learning_rate": 2.3564193361846246e-05, + "loss": 0.3302, "step": 599300 }, { - "epoch": 6.11, - "learning_rate": 4.056403239998769e-05, - "loss": 0.5183, + "epoch": 8.25824584607754, + "grad_norm": 1.0462127923965454, + "learning_rate": 2.3556260603868644e-05, + "loss": 0.3896, "step": 599400 }, { - "epoch": 6.11, - "learning_rate": 4.055782342896126e-05, - "loss": 0.5267, + "epoch": 8.25962359813728, + "grad_norm": 10.382911682128906, + "learning_rate": 2.3548328180931878e-05, + "loss": 0.3496, "step": 599500 }, { - "epoch": 6.11, - "learning_rate": 4.055161394172028e-05, - "loss": 0.5413, + "epoch": 8.261001350197018, + "grad_norm": 2.761988639831543, + "learning_rate": 2.3540396093709852e-05, + "loss": 0.3523, "step": 599600 }, { - "epoch": 6.11, - "learning_rate": 4.054540393856833e-05, - "loss": 0.4451, + "epoch": 8.262379102256759, + "grad_norm": 2.800248861312866, + "learning_rate": 2.3532464342876444e-05, + "loss": 0.338, "step": 599700 }, { - "epoch": 6.11, - "learning_rate": 4.053919341980908e-05, - "loss": 0.4851, + "epoch": 8.263756854316497, + "grad_norm": 23.189224243164062, + "learning_rate": 2.352453292910548e-05, + "loss": 0.3473, "step": 599800 }, { - "epoch": 6.11, - "learning_rate": 4.0532982385746154e-05, - "loss": 0.5158, + "epoch": 8.265134606376236, + "grad_norm": 5.876138687133789, + "learning_rate": 2.351660185307078e-05, + "loss": 0.4313, "step": 599900 }, { - "epoch": 6.11, - "learning_rate": 4.0526770836683246e-05, - "loss": 0.5769, + "epoch": 8.266512358435977, + "grad_norm": 3.2400267124176025, + "learning_rate": 2.3508671115446127e-05, + "loss": 0.3509, "step": 600000 }, { - "epoch": 6.11, - "learning_rate": 4.052055877292407e-05, - "loss": 0.5363, + "epoch": 8.266512358435977, + "eval_accuracy": 0.8889665328785155, + "eval_cer": 0.07646042258881632, + "eval_loss": 0.4270029664039612, + "eval_runtime": 8833.6312, + "eval_samples_per_second": 6.106, + "eval_steps_per_second": 0.382, + "eval_wer": 0.15148950089177773, + "step": 600000 + }, + { + "epoch": 8.267890110495715, + "grad_norm": 5.5132293701171875, + "learning_rate": 2.350074071690529e-05, + "loss": 0.3045, "step": 600100 }, { - "epoch": 6.11, - "learning_rate": 4.0514346194772345e-05, - "loss": 0.5936, + "epoch": 8.269267862555454, + "grad_norm": 5.565347194671631, + "learning_rate": 2.3492810658121985e-05, + "loss": 0.4019, "step": 600200 }, { - "epoch": 6.12, - "learning_rate": 4.0508133102531844e-05, - "loss": 0.594, + "epoch": 8.270645614615194, + "grad_norm": 5.544607639312744, + "learning_rate": 2.3484880939769924e-05, + "loss": 0.3741, "step": 600300 }, { - "epoch": 6.12, - "learning_rate": 4.0501919496506344e-05, - "loss": 0.5522, + "epoch": 8.272023366674933, + "grad_norm": 3.7324399948120117, + "learning_rate": 2.3476951562522764e-05, + "loss": 0.424, "step": 600400 }, { - "epoch": 6.12, - "learning_rate": 4.049570537699965e-05, - "loss": 0.5655, + "epoch": 8.273401118734672, + "grad_norm": 4.268611431121826, + "learning_rate": 2.3469022527054144e-05, + "loss": 0.3721, "step": 600500 }, { - "epoch": 6.12, - "learning_rate": 4.0489490744315605e-05, - "loss": 0.519, + "epoch": 8.274778870794412, + "grad_norm": 3.2757174968719482, + "learning_rate": 2.346109383403769e-05, + "loss": 0.3595, "step": 600600 }, { - "epoch": 6.12, - "learning_rate": 4.048327559875805e-05, - "loss": 0.5413, + "epoch": 8.27615662285415, + "grad_norm": 0.6043170094490051, + "learning_rate": 2.3453165484146972e-05, + "loss": 0.3292, "step": 600700 }, { - "epoch": 6.12, - "learning_rate": 4.047705994063089e-05, - "loss": 0.5333, + "epoch": 8.277534374913891, + "grad_norm": 47.83230972290039, + "learning_rate": 2.3445237478055555e-05, + "loss": 0.4445, "step": 600800 }, { - "epoch": 6.12, - "learning_rate": 4.047084377023802e-05, - "loss": 0.4624, + "epoch": 8.27891212697363, + "grad_norm": 1.1040006875991821, + "learning_rate": 2.3437309816436963e-05, + "loss": 0.3769, "step": 600900 }, { - "epoch": 6.12, - "learning_rate": 4.046462708788337e-05, - "loss": 0.551, + "epoch": 8.280289879033369, + "grad_norm": 4.363882064819336, + "learning_rate": 2.342938249996467e-05, + "loss": 0.3596, "step": 601000 }, { - "epoch": 6.12, - "learning_rate": 4.0458409893870905e-05, - "loss": 0.5785, + "epoch": 8.281667631093109, + "grad_norm": 6.074005126953125, + "learning_rate": 2.3421455529312172e-05, + "loss": 0.3651, "step": 601100 }, { - "epoch": 6.13, - "learning_rate": 4.045219218850461e-05, - "loss": 0.5354, + "epoch": 8.283045383152848, + "grad_norm": 3.6676464080810547, + "learning_rate": 2.3413528905152887e-05, + "loss": 0.4084, "step": 601200 }, { - "epoch": 6.13, - "learning_rate": 4.04459739720885e-05, - "loss": 0.5072, + "epoch": 8.284423135212586, + "grad_norm": 9.595353126525879, + "learning_rate": 2.340560262816023e-05, + "loss": 0.4027, "step": 601300 }, { - "epoch": 6.13, - "learning_rate": 4.043975524492659e-05, - "loss": 0.5157, + "epoch": 8.285800887272327, + "grad_norm": 3.8764138221740723, + "learning_rate": 2.3397676699007583e-05, + "loss": 0.445, "step": 601400 }, { - "epoch": 6.13, - "learning_rate": 4.043359820222468e-05, - "loss": 0.5498, + "epoch": 8.287178639332065, + "grad_norm": 2.0737035274505615, + "learning_rate": 2.3389751118368274e-05, + "loss": 0.3393, "step": 601500 }, { - "epoch": 6.13, - "learning_rate": 4.042737845958328e-05, - "loss": 0.5086, + "epoch": 8.288556391391806, + "grad_norm": 3.8336541652679443, + "learning_rate": 2.3381825886915645e-05, + "loss": 0.3761, "step": 601600 }, { - "epoch": 6.13, - "learning_rate": 4.042115820710529e-05, - "loss": 0.4798, + "epoch": 8.289934143451545, + "grad_norm": 4.329288482666016, + "learning_rate": 2.337390100532297e-05, + "loss": 0.3364, "step": 601700 }, { - "epoch": 6.13, - "learning_rate": 4.041493744509485e-05, - "loss": 0.5029, + "epoch": 8.291311895511283, + "grad_norm": 1.8989430665969849, + "learning_rate": 2.336597647426352e-05, + "loss": 0.4321, "step": 601800 }, { - "epoch": 6.13, - "learning_rate": 4.040871617385612e-05, - "loss": 0.5925, + "epoch": 8.292689647571024, + "grad_norm": 3.7484686374664307, + "learning_rate": 2.3358052294410507e-05, + "loss": 0.3013, "step": 601900 }, { - "epoch": 6.13, - "learning_rate": 4.04024943936933e-05, - "loss": 0.5896, + "epoch": 8.294067399630762, + "grad_norm": 131.56802368164062, + "learning_rate": 2.3350128466437156e-05, + "loss": 0.3319, "step": 602000 }, { - "epoch": 6.13, - "learning_rate": 4.0396272104910575e-05, - "loss": 0.4564, + "epoch": 8.295445151690501, + "grad_norm": 4.122030735015869, + "learning_rate": 2.3342204991016614e-05, + "loss": 0.3312, "step": 602100 }, { - "epoch": 6.14, - "learning_rate": 4.0390049307812185e-05, - "loss": 0.5879, + "epoch": 8.296822903750241, + "grad_norm": 2.953845262527466, + "learning_rate": 2.3334281868822032e-05, + "loss": 0.3149, "step": 602200 }, { - "epoch": 6.14, - "learning_rate": 4.038382600270239e-05, - "loss": 0.6472, + "epoch": 8.29820065580998, + "grad_norm": 2.1116340160369873, + "learning_rate": 2.3326359100526516e-05, + "loss": 0.3329, "step": 602300 }, { - "epoch": 6.14, - "learning_rate": 4.0377602189885464e-05, - "loss": 0.5262, + "epoch": 8.29957840786972, + "grad_norm": 4.048133373260498, + "learning_rate": 2.331843668680314e-05, + "loss": 0.4537, "step": 602400 }, { - "epoch": 6.14, - "learning_rate": 4.037137786966573e-05, - "loss": 0.5659, + "epoch": 8.30095615992946, + "grad_norm": 7.376530170440674, + "learning_rate": 2.3310514628324973e-05, + "loss": 0.4062, "step": 602500 }, { - "epoch": 6.14, - "learning_rate": 4.03651530423475e-05, - "loss": 0.5408, + "epoch": 8.302333911989198, + "grad_norm": 2.8816463947296143, + "learning_rate": 2.3302592925765013e-05, + "loss": 0.3593, "step": 602600 }, { - "epoch": 6.14, - "learning_rate": 4.035892770823516e-05, - "loss": 0.5425, + "epoch": 8.303711664048938, + "grad_norm": 9.513242721557617, + "learning_rate": 2.329467157979626e-05, + "loss": 0.3453, "step": 602700 }, { - "epoch": 6.14, - "learning_rate": 4.0352701867633067e-05, - "loss": 0.4688, + "epoch": 8.305089416108677, + "grad_norm": 0.8848949670791626, + "learning_rate": 2.328675059109167e-05, + "loss": 0.4427, "step": 602800 }, { - "epoch": 6.14, - "learning_rate": 4.034647552084563e-05, - "loss": 0.5114, + "epoch": 8.306467168168416, + "grad_norm": 1.0727472305297852, + "learning_rate": 2.3278829960324167e-05, + "loss": 0.3866, "step": 602900 }, { - "epoch": 6.14, - "learning_rate": 4.034024866817729e-05, - "loss": 0.6103, + "epoch": 8.307844920228156, + "grad_norm": 1.1733559370040894, + "learning_rate": 2.3270909688166662e-05, + "loss": 0.3644, "step": 603000 }, { - "epoch": 6.14, - "learning_rate": 4.033402130993249e-05, - "loss": 0.493, + "epoch": 8.309222672287895, + "grad_norm": 4.333801746368408, + "learning_rate": 2.326298977529202e-05, + "loss": 0.3817, "step": 603100 }, { - "epoch": 6.15, - "learning_rate": 4.032779344641572e-05, - "loss": 0.5265, + "epoch": 8.310600424347635, + "grad_norm": 3.5866081714630127, + "learning_rate": 2.3255070222373066e-05, + "loss": 0.36, "step": 603200 }, { - "epoch": 6.15, - "learning_rate": 4.0321565077931484e-05, - "loss": 0.5428, + "epoch": 8.311978176407374, + "grad_norm": 2.5693233013153076, + "learning_rate": 2.324715103008261e-05, + "loss": 0.4152, "step": 603300 }, { - "epoch": 6.15, - "learning_rate": 4.031533620478431e-05, - "loss": 0.4963, + "epoch": 8.313355928467113, + "grad_norm": 3.4720840454101562, + "learning_rate": 2.323923219909343e-05, + "loss": 0.4331, "step": 603400 }, { - "epoch": 6.15, - "learning_rate": 4.030910682727876e-05, - "loss": 0.5481, + "epoch": 8.314733680526853, + "grad_norm": 4.121750354766846, + "learning_rate": 2.3231313730078275e-05, + "loss": 0.3307, "step": 603500 }, { - "epoch": 6.15, - "learning_rate": 4.030287694571941e-05, - "loss": 0.5186, + "epoch": 8.316111432586592, + "grad_norm": 2.5358846187591553, + "learning_rate": 2.3223395623709866e-05, + "loss": 0.3363, "step": 603600 }, { - "epoch": 6.15, - "learning_rate": 4.029664656041085e-05, - "loss": 0.5113, + "epoch": 8.31748918464633, + "grad_norm": 4.643316268920898, + "learning_rate": 2.321547788066087e-05, + "loss": 0.4222, "step": 603700 }, { - "epoch": 6.15, - "learning_rate": 4.029041567165773e-05, - "loss": 0.513, + "epoch": 8.31886693670607, + "grad_norm": 15.085054397583008, + "learning_rate": 2.3207560501603948e-05, + "loss": 0.4, "step": 603800 }, { - "epoch": 6.15, - "learning_rate": 4.028418427976469e-05, - "loss": 0.5194, + "epoch": 8.32024468876581, + "grad_norm": 1.613295078277588, + "learning_rate": 2.319964348721172e-05, + "loss": 0.426, "step": 603900 }, { - "epoch": 6.15, - "learning_rate": 4.027795238503641e-05, - "loss": 0.5685, + "epoch": 8.32162244082555, + "grad_norm": 2.696157693862915, + "learning_rate": 2.3191726838156775e-05, + "loss": 0.3816, "step": 604000 }, { - "epoch": 6.15, - "learning_rate": 4.0271719987777597e-05, - "loss": 0.5974, + "epoch": 8.323000192885289, + "grad_norm": 4.008109092712402, + "learning_rate": 2.3183810555111682e-05, + "loss": 0.3973, "step": 604100 }, { - "epoch": 6.16, - "learning_rate": 4.026548708829297e-05, - "loss": 0.4914, + "epoch": 8.324377944945027, + "grad_norm": 4.119177341461182, + "learning_rate": 2.3175894638748968e-05, + "loss": 0.3812, "step": 604200 }, { - "epoch": 6.16, - "learning_rate": 4.0259253686887295e-05, - "loss": 0.4673, + "epoch": 8.325755697004768, + "grad_norm": 1.6911532878875732, + "learning_rate": 2.3167979089741123e-05, + "loss": 0.3343, "step": 604300 }, { - "epoch": 6.16, - "learning_rate": 4.025301978386533e-05, - "loss": 0.6079, + "epoch": 8.327133449064506, + "grad_norm": 40.78925323486328, + "learning_rate": 2.316006390876061e-05, + "loss": 0.3955, "step": 604400 }, { - "epoch": 6.16, - "learning_rate": 4.024678537953189e-05, - "loss": 0.6289, + "epoch": 8.328511201124245, + "grad_norm": 5.041555404663086, + "learning_rate": 2.3152149096479877e-05, + "loss": 0.4188, "step": 604500 }, { - "epoch": 6.16, - "learning_rate": 4.02405504741918e-05, - "loss": 0.5928, + "epoch": 8.329888953183985, + "grad_norm": 5.011321067810059, + "learning_rate": 2.314423465357132e-05, + "loss": 0.3411, "step": 604600 }, { - "epoch": 6.16, - "learning_rate": 4.023431506814989e-05, - "loss": 0.569, + "epoch": 8.331266705243724, + "grad_norm": 1.6941614151000977, + "learning_rate": 2.313632058070732e-05, + "loss": 0.4286, "step": 604700 }, { - "epoch": 6.16, - "learning_rate": 4.022807916171106e-05, - "loss": 0.5276, + "epoch": 8.332644457303463, + "grad_norm": 1.391391634941101, + "learning_rate": 2.3128406878560205e-05, + "loss": 0.3986, "step": 604800 }, { - "epoch": 6.16, - "learning_rate": 4.022184275518019e-05, - "loss": 0.5481, + "epoch": 8.334022209363203, + "grad_norm": 1.0014657974243164, + "learning_rate": 2.3120493547802283e-05, + "loss": 0.3414, "step": 604900 }, { - "epoch": 6.16, - "learning_rate": 4.0215605848862215e-05, - "loss": 0.6384, + "epoch": 8.335399961422942, + "grad_norm": 4.861555099487305, + "learning_rate": 2.311258058910585e-05, + "loss": 0.3396, "step": 605000 }, { - "epoch": 6.16, - "learning_rate": 4.020936844306207e-05, - "loss": 0.5264, + "epoch": 8.336777713482682, + "grad_norm": 1.4590474367141724, + "learning_rate": 2.3104668003143134e-05, + "loss": 0.3458, "step": 605100 }, { - "epoch": 6.17, - "learning_rate": 4.0203130538084744e-05, - "loss": 0.469, + "epoch": 8.338155465542421, + "grad_norm": 14.449016571044922, + "learning_rate": 2.309675579058636e-05, + "loss": 0.318, "step": 605200 }, { - "epoch": 6.17, - "learning_rate": 4.019689213423522e-05, - "loss": 0.6008, + "epoch": 8.33953321760216, + "grad_norm": 0.2966105043888092, + "learning_rate": 2.3088843952107726e-05, + "loss": 0.3762, "step": 605300 }, { - "epoch": 6.17, - "learning_rate": 4.019065323181852e-05, - "loss": 0.5136, + "epoch": 8.3409109696619, + "grad_norm": 2.887688636779785, + "learning_rate": 2.308093248837934e-05, + "loss": 0.3545, "step": 605400 }, { - "epoch": 6.17, - "learning_rate": 4.018441383113969e-05, - "loss": 0.6229, + "epoch": 8.342288721721639, + "grad_norm": 2.524355173110962, + "learning_rate": 2.307302140007336e-05, + "loss": 0.351, "step": 605500 }, { - "epoch": 6.17, - "learning_rate": 4.01781739325038e-05, - "loss": 0.5536, + "epoch": 8.343666473781377, + "grad_norm": 2.019190549850464, + "learning_rate": 2.3065189793120094e-05, + "loss": 0.3583, "step": 605600 }, { - "epoch": 6.17, - "learning_rate": 4.017193353621595e-05, - "loss": 0.5351, + "epoch": 8.345044225841118, + "grad_norm": 17.822757720947266, + "learning_rate": 2.3057279453904136e-05, + "loss": 0.4177, "step": 605700 }, { - "epoch": 6.17, - "learning_rate": 4.016569264258125e-05, - "loss": 0.4996, + "epoch": 8.346421977900857, + "grad_norm": 3.5091047286987305, + "learning_rate": 2.3049369492120022e-05, + "loss": 0.3025, "step": 605800 }, { - "epoch": 6.17, - "learning_rate": 4.0159451251904854e-05, - "loss": 0.468, + "epoch": 8.347799729960597, + "grad_norm": 4.332221984863281, + "learning_rate": 2.3041459908439737e-05, + "loss": 0.3338, "step": 605900 }, { - "epoch": 6.17, - "learning_rate": 4.015320936449191e-05, - "loss": 0.5723, + "epoch": 8.349177482020336, + "grad_norm": 4.5085577964782715, + "learning_rate": 2.303355070353523e-05, + "loss": 0.3733, "step": 606000 }, { - "epoch": 6.18, - "learning_rate": 4.0146966980647625e-05, - "loss": 0.5624, + "epoch": 8.350555234080074, + "grad_norm": 3.847532272338867, + "learning_rate": 2.302564187807845e-05, + "loss": 0.3917, "step": 606100 }, { - "epoch": 6.18, - "learning_rate": 4.014072410067721e-05, - "loss": 0.4742, + "epoch": 8.351932986139815, + "grad_norm": 3.193223476409912, + "learning_rate": 2.3017733432741284e-05, + "loss": 0.3147, "step": 606200 }, { - "epoch": 6.18, - "learning_rate": 4.0134480724885896e-05, - "loss": 0.5734, + "epoch": 8.353310738199553, + "grad_norm": 2.7339401245117188, + "learning_rate": 2.300982536819559e-05, + "loss": 0.3297, "step": 606300 }, { - "epoch": 6.18, - "learning_rate": 4.012823685357895e-05, - "loss": 0.5797, + "epoch": 8.354688490259292, + "grad_norm": 8.012553215026855, + "learning_rate": 2.3001917685113215e-05, + "loss": 0.3798, "step": 606400 }, { - "epoch": 6.18, - "learning_rate": 4.012199248706168e-05, - "loss": 0.4791, + "epoch": 8.356066242319033, + "grad_norm": 3.3491709232330322, + "learning_rate": 2.299401038416593e-05, + "loss": 0.3884, "step": 606500 }, { - "epoch": 6.18, - "learning_rate": 4.0115747625639366e-05, - "loss": 0.4535, + "epoch": 8.357443994378771, + "grad_norm": 3.142204761505127, + "learning_rate": 2.298610346602552e-05, + "loss": 0.3787, "step": 606600 }, { - "epoch": 6.18, - "learning_rate": 4.0109502269617375e-05, - "loss": 0.53, + "epoch": 8.358821746438512, + "grad_norm": 0.42889827489852905, + "learning_rate": 2.2978196931363716e-05, + "loss": 0.3027, "step": 606700 }, { - "epoch": 6.18, - "learning_rate": 4.0103256419301054e-05, - "loss": 0.5411, + "epoch": 8.36019949849825, + "grad_norm": 8.121647834777832, + "learning_rate": 2.297029078085221e-05, + "loss": 0.3942, "step": 606800 }, { - "epoch": 6.18, - "learning_rate": 4.0097010074995775e-05, - "loss": 0.5572, + "epoch": 8.361577250557989, + "grad_norm": 4.54153299331665, + "learning_rate": 2.296238501516269e-05, + "loss": 0.3688, "step": 606900 }, { - "epoch": 6.18, - "learning_rate": 4.009082570782958e-05, - "loss": 0.6621, + "epoch": 8.36295500261773, + "grad_norm": 3.75486159324646, + "learning_rate": 2.2954479634966764e-05, + "loss": 0.417, "step": 607000 }, { - "epoch": 6.19, - "learning_rate": 4.0084578381394933e-05, - "loss": 0.5928, + "epoch": 8.364332754677468, + "grad_norm": 2.48858904838562, + "learning_rate": 2.294657464093605e-05, + "loss": 0.3655, "step": 607100 }, { - "epoch": 6.19, - "learning_rate": 4.007833056188459e-05, - "loss": 0.5286, + "epoch": 8.365710506737207, + "grad_norm": 1.6995512247085571, + "learning_rate": 2.2938670033742116e-05, + "loss": 0.4368, "step": 607200 }, { - "epoch": 6.19, - "learning_rate": 4.0072082249604024e-05, - "loss": 0.5323, + "epoch": 8.367088258796947, + "grad_norm": 1.7572351694107056, + "learning_rate": 2.29307658140565e-05, + "loss": 0.3861, "step": 607300 }, { - "epoch": 6.19, - "learning_rate": 4.006583344485874e-05, - "loss": 0.5379, + "epoch": 8.368466010856686, + "grad_norm": 1.17812979221344, + "learning_rate": 2.2922861982550694e-05, + "loss": 0.3635, "step": 607400 }, { - "epoch": 6.19, - "learning_rate": 4.005958414795428e-05, - "loss": 0.4721, + "epoch": 8.369843762916426, + "grad_norm": 3.8669700622558594, + "learning_rate": 2.2914958539896197e-05, + "loss": 0.3535, "step": 607500 }, { - "epoch": 6.19, - "learning_rate": 4.005333435919618e-05, - "loss": 0.5848, + "epoch": 8.371221514976165, + "grad_norm": 2.132659912109375, + "learning_rate": 2.29071345153654e-05, + "loss": 0.34, "step": 607600 }, { - "epoch": 6.19, - "learning_rate": 4.004708407889002e-05, - "loss": 0.505, + "epoch": 8.372599267035904, + "grad_norm": 9.288592338562012, + "learning_rate": 2.2899231848522483e-05, + "loss": 0.422, "step": 607700 }, { - "epoch": 6.19, - "learning_rate": 4.0040833307341414e-05, - "loss": 0.5671, + "epoch": 8.373977019095644, + "grad_norm": 2.039140224456787, + "learning_rate": 2.289132957253837e-05, + "loss": 0.4281, "step": 607800 }, { - "epoch": 6.19, - "learning_rate": 4.003458204485598e-05, - "loss": 0.5476, + "epoch": 8.375354771155383, + "grad_norm": 4.187887668609619, + "learning_rate": 2.2883427688084385e-05, + "loss": 0.3237, "step": 607900 }, { - "epoch": 6.19, - "learning_rate": 4.002833029173936e-05, - "loss": 0.4678, + "epoch": 8.376732523215122, + "grad_norm": 6.897355556488037, + "learning_rate": 2.287552619583184e-05, + "loss": 0.4401, "step": 608000 }, { - "epoch": 6.2, - "learning_rate": 4.002207804829724e-05, - "loss": 0.5345, + "epoch": 8.378110275274862, + "grad_norm": 10.177162170410156, + "learning_rate": 2.2867625096452e-05, + "loss": 0.3406, "step": 608100 }, { - "epoch": 6.2, - "learning_rate": 4.001582531483531e-05, - "loss": 0.5251, + "epoch": 8.3794880273346, + "grad_norm": 1.845786452293396, + "learning_rate": 2.2859724390616104e-05, + "loss": 0.3308, "step": 608200 }, { - "epoch": 6.2, - "learning_rate": 4.00095720916593e-05, - "loss": 0.506, + "epoch": 8.380865779394341, + "grad_norm": 4.943540573120117, + "learning_rate": 2.285182407899536e-05, + "loss": 0.3152, "step": 608300 }, { - "epoch": 6.2, - "learning_rate": 4.0003318379074945e-05, - "loss": 0.5311, + "epoch": 8.38224353145408, + "grad_norm": 1.8014031648635864, + "learning_rate": 2.2843924162260944e-05, + "loss": 0.4084, "step": 608400 }, { - "epoch": 6.2, - "learning_rate": 3.999706417738801e-05, - "loss": 0.5478, + "epoch": 8.383621283513818, + "grad_norm": 6.4546003341674805, + "learning_rate": 2.2836024641083983e-05, + "loss": 0.3594, "step": 608500 }, { - "epoch": 6.2, - "learning_rate": 3.999080948690431e-05, - "loss": 0.5107, + "epoch": 8.384999035573559, + "grad_norm": 1.6401779651641846, + "learning_rate": 2.2828125516135607e-05, + "loss": 0.3357, "step": 608600 }, { - "epoch": 6.2, - "learning_rate": 3.998455430792964e-05, - "loss": 0.5349, + "epoch": 8.386376787633298, + "grad_norm": 6.374835014343262, + "learning_rate": 2.2820226788086854e-05, + "loss": 0.3954, "step": 608700 }, { - "epoch": 6.2, - "learning_rate": 3.997829864076985e-05, - "loss": 0.5177, + "epoch": 8.387754539693036, + "grad_norm": 2.8909759521484375, + "learning_rate": 2.2812328457608782e-05, + "loss": 0.3428, "step": 608800 }, { - "epoch": 6.2, - "learning_rate": 3.99720424857308e-05, - "loss": 0.6177, + "epoch": 8.389132291752777, + "grad_norm": 3.7436742782592773, + "learning_rate": 2.2804509502721254e-05, + "loss": 0.3669, "step": 608900 }, { - "epoch": 6.2, - "learning_rate": 3.996578584311838e-05, - "loss": 0.4951, + "epoch": 8.390510043812515, + "grad_norm": 3.7521841526031494, + "learning_rate": 2.279661196540506e-05, + "loss": 0.374, "step": 609000 }, { - "epoch": 6.21, - "learning_rate": 3.9959528713238515e-05, - "loss": 0.5466, + "epoch": 8.391887795872254, + "grad_norm": 3.0969252586364746, + "learning_rate": 2.278871482766575e-05, + "loss": 0.3472, "step": 609100 }, { - "epoch": 6.21, - "learning_rate": 3.995327109639713e-05, - "loss": 0.567, + "epoch": 8.393265547931994, + "grad_norm": 1.7246965169906616, + "learning_rate": 2.2780818090174216e-05, + "loss": 0.3478, "step": 609200 }, { - "epoch": 6.21, - "learning_rate": 3.994701299290018e-05, - "loss": 0.6196, + "epoch": 8.394643299991733, + "grad_norm": 2.6951217651367188, + "learning_rate": 2.2772921753601318e-05, + "loss": 0.3401, "step": 609300 }, { - "epoch": 6.21, - "learning_rate": 3.9940754403053665e-05, - "loss": 0.5509, + "epoch": 8.396021052051474, + "grad_norm": 4.763112545013428, + "learning_rate": 2.2765025818617907e-05, + "loss": 0.3571, "step": 609400 }, { - "epoch": 6.21, - "learning_rate": 3.993449532716357e-05, - "loss": 0.5109, + "epoch": 8.397398804111212, + "grad_norm": 1.2914090156555176, + "learning_rate": 2.275713028589478e-05, + "loss": 0.3724, "step": 609500 }, { - "epoch": 6.21, - "learning_rate": 3.992823576553595e-05, - "loss": 0.4559, + "epoch": 8.39877655617095, + "grad_norm": 4.179256439208984, + "learning_rate": 2.2749235156102695e-05, + "loss": 0.3408, "step": 609600 }, { - "epoch": 6.21, - "learning_rate": 3.9921975718476846e-05, - "loss": 0.5443, + "epoch": 8.400154308230691, + "grad_norm": 2.138127565383911, + "learning_rate": 2.2741340429912397e-05, + "loss": 0.41, "step": 609700 }, { - "epoch": 6.21, - "learning_rate": 3.9915715186292336e-05, - "loss": 0.5179, + "epoch": 8.40153206029043, + "grad_norm": 3.9688987731933594, + "learning_rate": 2.2733446107994564e-05, + "loss": 0.3795, "step": 609800 }, { - "epoch": 6.21, - "learning_rate": 3.990945416928854e-05, - "loss": 0.6347, + "epoch": 8.402909812350169, + "grad_norm": 4.312638759613037, + "learning_rate": 2.2725552191019872e-05, + "loss": 0.4125, "step": 609900 }, { - "epoch": 6.21, - "learning_rate": 3.990319266777157e-05, - "loss": 0.558, + "epoch": 8.404287564409909, + "grad_norm": 3.477618932723999, + "learning_rate": 2.2717658679658952e-05, + "loss": 0.3811, "step": 610000 }, { - "epoch": 6.22, - "learning_rate": 3.989693068204757e-05, - "loss": 0.5508, + "epoch": 8.405665316469648, + "grad_norm": 3.677065849304199, + "learning_rate": 2.2709765574582386e-05, + "loss": 0.3886, "step": 610100 }, { - "epoch": 6.22, - "learning_rate": 3.989066821242271e-05, - "loss": 0.5514, + "epoch": 8.407043068529388, + "grad_norm": 0.648706316947937, + "learning_rate": 2.2701872876460756e-05, + "loss": 0.3446, "step": 610200 }, { - "epoch": 6.22, - "learning_rate": 3.988440525920322e-05, - "loss": 0.5465, + "epoch": 8.408420820589127, + "grad_norm": 4.264386177062988, + "learning_rate": 2.2693980585964557e-05, + "loss": 0.3438, "step": 610300 }, { - "epoch": 6.22, - "learning_rate": 3.9878141822695284e-05, - "loss": 0.5104, + "epoch": 8.409798572648866, + "grad_norm": 1.4507111310958862, + "learning_rate": 2.2686167620563035e-05, + "loss": 0.336, "step": 610400 }, { - "epoch": 6.22, - "learning_rate": 3.987187790320518e-05, - "loss": 0.5531, + "epoch": 8.411176324708606, + "grad_norm": 4.321676254272461, + "learning_rate": 2.2678276143236182e-05, + "loss": 0.4352, "step": 610500 }, { - "epoch": 6.22, - "learning_rate": 3.986561350103915e-05, - "loss": 0.591, + "epoch": 8.412554076768345, + "grad_norm": 3.7936556339263916, + "learning_rate": 2.2670385075539443e-05, + "loss": 0.3565, "step": 610600 }, { - "epoch": 6.22, - "learning_rate": 3.9859348616503496e-05, - "loss": 0.6012, + "epoch": 8.413931828828083, + "grad_norm": 55.288944244384766, + "learning_rate": 2.2662494418143204e-05, + "loss": 0.3025, "step": 610700 }, { - "epoch": 6.22, - "learning_rate": 3.985308324990452e-05, - "loss": 0.5802, + "epoch": 8.415309580887824, + "grad_norm": 1.353340983390808, + "learning_rate": 2.2654604171717813e-05, + "loss": 0.2957, "step": 610800 }, { - "epoch": 6.22, - "learning_rate": 3.9846817401548585e-05, - "loss": 0.5808, + "epoch": 8.416687332947562, + "grad_norm": 2.9988954067230225, + "learning_rate": 2.2646714336933585e-05, + "loss": 0.3446, "step": 610900 }, { - "epoch": 6.22, - "learning_rate": 3.984055107174204e-05, - "loss": 0.4851, + "epoch": 8.418065085007303, + "grad_norm": 2.7173781394958496, + "learning_rate": 2.2638824914460795e-05, + "loss": 0.3954, "step": 611000 }, { - "epoch": 6.23, - "learning_rate": 3.9834284260791275e-05, - "loss": 0.5649, + "epoch": 8.419442837067042, + "grad_norm": 3.973426580429077, + "learning_rate": 2.2630935904969706e-05, + "loss": 0.3835, "step": 611100 }, { - "epoch": 6.23, - "learning_rate": 3.982801696900269e-05, - "loss": 0.5688, + "epoch": 8.42082058912678, + "grad_norm": 3.0652267932891846, + "learning_rate": 2.2623047309130517e-05, + "loss": 0.4002, "step": 611200 }, { - "epoch": 6.23, - "learning_rate": 3.982181187678356e-05, - "loss": 0.5012, + "epoch": 8.42219834118652, + "grad_norm": 14.96908950805664, + "learning_rate": 2.261515912761342e-05, + "loss": 0.3555, "step": 611300 }, { - "epoch": 6.23, - "learning_rate": 3.981554362903941e-05, - "loss": 0.5915, + "epoch": 8.42357609324626, + "grad_norm": 2.573018789291382, + "learning_rate": 2.260727136108854e-05, + "loss": 0.4071, "step": 611400 }, { - "epoch": 6.23, - "learning_rate": 3.980927490137375e-05, - "loss": 0.5049, + "epoch": 8.424953845305998, + "grad_norm": 2.935473680496216, + "learning_rate": 2.259938401022598e-05, + "loss": 0.3932, "step": 611500 }, { - "epoch": 6.23, - "learning_rate": 3.980300569409309e-05, - "loss": 0.4747, + "epoch": 8.426331597365738, + "grad_norm": 12.06583309173584, + "learning_rate": 2.2591497075695825e-05, + "loss": 0.3949, "step": 611600 }, { - "epoch": 6.23, - "learning_rate": 3.979673600750395e-05, - "loss": 0.5424, + "epoch": 8.427709349425477, + "grad_norm": 2.336740493774414, + "learning_rate": 2.2583610558168108e-05, + "loss": 0.3601, "step": 611700 }, { - "epoch": 6.23, - "learning_rate": 3.979046584191289e-05, - "loss": 0.4842, + "epoch": 8.429087101485218, + "grad_norm": 4.078133583068848, + "learning_rate": 2.257572445831282e-05, + "loss": 0.3446, "step": 611800 }, { - "epoch": 6.23, - "learning_rate": 3.978419519762648e-05, - "loss": 0.4898, + "epoch": 8.430464853544956, + "grad_norm": 4.701194763183594, + "learning_rate": 2.2567838776799945e-05, + "loss": 0.3829, "step": 611900 }, { - "epoch": 6.24, - "learning_rate": 3.97779240749513e-05, - "loss": 0.5528, + "epoch": 8.431842605604695, + "grad_norm": 0.9086661338806152, + "learning_rate": 2.2559953514299387e-05, + "loss": 0.4539, "step": 612000 }, { - "epoch": 6.24, - "learning_rate": 3.9771652474193993e-05, - "loss": 0.5704, + "epoch": 8.433220357664435, + "grad_norm": 2.3533055782318115, + "learning_rate": 2.2552068671481055e-05, + "loss": 0.3443, "step": 612100 }, { - "epoch": 6.24, - "learning_rate": 3.976538039566118e-05, - "loss": 0.5993, + "epoch": 8.434598109724174, + "grad_norm": 8.864639282226562, + "learning_rate": 2.25441842490148e-05, + "loss": 0.3849, "step": 612200 }, { - "epoch": 6.24, - "learning_rate": 3.975910783965956e-05, - "loss": 0.5526, + "epoch": 8.435975861783913, + "grad_norm": 0.09770727902650833, + "learning_rate": 2.2536300247570444e-05, + "loss": 0.3283, "step": 612300 }, { - "epoch": 6.24, - "learning_rate": 3.97528348064958e-05, - "loss": 0.517, + "epoch": 8.437353613843653, + "grad_norm": 4.248231887817383, + "learning_rate": 2.2528416667817797e-05, + "loss": 0.4212, "step": 612400 }, { - "epoch": 6.24, - "learning_rate": 3.9746561296476626e-05, - "loss": 0.5207, + "epoch": 8.438731365903392, + "grad_norm": 3.1666977405548096, + "learning_rate": 2.2520533510426566e-05, + "loss": 0.3853, "step": 612500 }, { - "epoch": 6.24, - "learning_rate": 3.9740287309908756e-05, - "loss": 0.5493, + "epoch": 8.440109117963132, + "grad_norm": 3.1559460163116455, + "learning_rate": 2.2512650776066496e-05, + "loss": 0.3534, "step": 612600 }, { - "epoch": 6.24, - "learning_rate": 3.973401284709897e-05, - "loss": 0.5485, + "epoch": 8.441486870022871, + "grad_norm": 17.667865753173828, + "learning_rate": 2.2504768465407255e-05, + "loss": 0.3432, "step": 612700 }, { - "epoch": 6.24, - "learning_rate": 3.9727737908354044e-05, - "loss": 0.4344, + "epoch": 8.44286462208261, + "grad_norm": 3.9279208183288574, + "learning_rate": 2.2496886579118486e-05, + "loss": 0.3363, "step": 612800 }, { - "epoch": 6.24, - "learning_rate": 3.97214624939808e-05, - "loss": 0.514, + "epoch": 8.44424237414235, + "grad_norm": 1.1642777919769287, + "learning_rate": 2.24890051178698e-05, + "loss": 0.3377, "step": 612900 }, { - "epoch": 6.25, - "learning_rate": 3.9715186604286034e-05, - "loss": 0.5442, + "epoch": 8.445620126202089, + "grad_norm": 2.8263070583343506, + "learning_rate": 2.2481124082330772e-05, + "loss": 0.3332, "step": 613000 }, { - "epoch": 6.25, - "learning_rate": 3.970891023957663e-05, - "loss": 0.5231, + "epoch": 8.446997878261827, + "grad_norm": 2.421978712081909, + "learning_rate": 2.2473243473170925e-05, + "loss": 0.3642, "step": 613100 }, { - "epoch": 6.25, - "learning_rate": 3.970263340015945e-05, - "loss": 0.531, + "epoch": 8.448375630321568, + "grad_norm": 3.852734088897705, + "learning_rate": 2.2465363291059764e-05, + "loss": 0.3992, "step": 613200 }, { - "epoch": 6.25, - "learning_rate": 3.9696356086341404e-05, - "loss": 0.4815, + "epoch": 8.449753382381306, + "grad_norm": 7.440618991851807, + "learning_rate": 2.245748353666674e-05, + "loss": 0.4098, "step": 613300 }, { - "epoch": 6.25, - "learning_rate": 3.969007829842941e-05, - "loss": 0.544, + "epoch": 8.451131134441045, + "grad_norm": 3.2082061767578125, + "learning_rate": 2.2449604210661292e-05, + "loss": 0.3467, "step": 613400 }, { - "epoch": 6.25, - "learning_rate": 3.968380003673042e-05, - "loss": 0.5307, + "epoch": 8.452508886500786, + "grad_norm": 4.258767127990723, + "learning_rate": 2.2441725313712805e-05, + "loss": 0.3394, "step": 613500 }, { - "epoch": 6.25, - "learning_rate": 3.9677584091245896e-05, - "loss": 0.7407, + "epoch": 8.453886638560524, + "grad_norm": 10.594532012939453, + "learning_rate": 2.2433846846490637e-05, + "loss": 0.3901, "step": 613600 }, { - "epoch": 6.25, - "learning_rate": 3.9671304887624045e-05, - "loss": 0.5101, + "epoch": 8.455264390620265, + "grad_norm": 1.9518663883209229, + "learning_rate": 2.242596880966409e-05, + "loss": 0.3417, "step": 613700 }, { - "epoch": 6.25, - "learning_rate": 3.966502521113309e-05, - "loss": 0.5707, + "epoch": 8.456642142680003, + "grad_norm": 7.226503372192383, + "learning_rate": 2.2418091203902447e-05, + "loss": 0.3096, "step": 613800 }, { - "epoch": 6.25, - "learning_rate": 3.965874506208009e-05, - "loss": 0.5631, + "epoch": 8.458019894739742, + "grad_norm": 2.3644461631774902, + "learning_rate": 2.241021402987496e-05, + "loss": 0.3249, "step": 613900 }, { - "epoch": 6.26, - "learning_rate": 3.965246444077209e-05, - "loss": 0.4977, + "epoch": 8.459397646799482, + "grad_norm": 3.722769021987915, + "learning_rate": 2.240233728825083e-05, + "loss": 0.3694, "step": 614000 }, { - "epoch": 6.26, - "learning_rate": 3.964618334751618e-05, - "loss": 0.5533, + "epoch": 8.460775398859221, + "grad_norm": 5.688827037811279, + "learning_rate": 2.2394460979699233e-05, + "loss": 0.3863, "step": 614100 }, { - "epoch": 6.26, - "learning_rate": 3.9639964600602055e-05, - "loss": 0.5658, + "epoch": 8.46215315091896, + "grad_norm": 5.0374755859375, + "learning_rate": 2.238658510488929e-05, + "loss": 0.4053, "step": 614200 }, { - "epoch": 6.26, - "learning_rate": 3.9633682569083494e-05, - "loss": 0.4703, + "epoch": 8.4635309029787, + "grad_norm": 1.7014131546020508, + "learning_rate": 2.2378709664490098e-05, + "loss": 0.3535, "step": 614300 }, { - "epoch": 6.26, - "learning_rate": 3.9627400066535345e-05, - "loss": 0.5128, + "epoch": 8.464908655038439, + "grad_norm": 4.276094436645508, + "learning_rate": 2.2370834659170725e-05, + "loss": 0.3563, "step": 614400 }, { - "epoch": 6.26, - "learning_rate": 3.962111709326479e-05, - "loss": 0.5575, + "epoch": 8.46628640709818, + "grad_norm": 58.399173736572266, + "learning_rate": 2.2362960089600185e-05, + "loss": 0.3891, "step": 614500 }, { - "epoch": 6.26, - "learning_rate": 3.9614833649579026e-05, - "loss": 0.5075, + "epoch": 8.467664159157918, + "grad_norm": 13.329980850219727, + "learning_rate": 2.2355085956447467e-05, + "loss": 0.4228, "step": 614600 }, { - "epoch": 6.26, - "learning_rate": 3.960854973578527e-05, - "loss": 0.553, + "epoch": 8.469041911217657, + "grad_norm": 0.040426105260849, + "learning_rate": 2.2347212260381534e-05, + "loss": 0.3723, "step": 614700 }, { - "epoch": 6.26, - "learning_rate": 3.960226535219077e-05, - "loss": 0.5185, + "epoch": 8.470419663277397, + "grad_norm": 1.624005913734436, + "learning_rate": 2.2339339002071258e-05, + "loss": 0.3514, "step": 614800 }, { - "epoch": 6.26, - "learning_rate": 3.9595980499102804e-05, - "loss": 0.5467, + "epoch": 8.471797415337136, + "grad_norm": 6.812719345092773, + "learning_rate": 2.2331466182185547e-05, + "loss": 0.3903, "step": 614900 }, { - "epoch": 6.27, - "learning_rate": 3.958969517682866e-05, - "loss": 0.5462, + "epoch": 8.473175167396874, + "grad_norm": 2.2937488555908203, + "learning_rate": 2.232359380139322e-05, + "loss": 0.3432, "step": 615000 }, { - "epoch": 6.27, - "learning_rate": 3.9583409385675645e-05, - "loss": 0.5287, + "epoch": 8.474552919456615, + "grad_norm": 3.268092155456543, + "learning_rate": 2.2315721860363085e-05, + "loss": 0.4469, "step": 615100 }, { - "epoch": 6.27, - "learning_rate": 3.957712312595111e-05, - "loss": 0.4678, + "epoch": 8.475930671516354, + "grad_norm": 2.450117826461792, + "learning_rate": 2.2307850359763912e-05, + "loss": 0.2871, "step": 615200 }, { - "epoch": 6.27, - "learning_rate": 3.957083639796239e-05, - "loss": 0.5079, + "epoch": 8.477308423576094, + "grad_norm": 10.77607536315918, + "learning_rate": 2.2299979300264397e-05, + "loss": 0.3628, "step": 615300 }, { - "epoch": 6.27, - "learning_rate": 3.95645492020169e-05, - "loss": 0.5142, + "epoch": 8.478686175635833, + "grad_norm": 3.631063461303711, + "learning_rate": 2.2292108682533248e-05, + "loss": 0.3261, "step": 615400 }, { - "epoch": 6.27, - "learning_rate": 3.9558261538422026e-05, - "loss": 0.5787, + "epoch": 8.480063927695571, + "grad_norm": 7.375083923339844, + "learning_rate": 2.2284238507239114e-05, + "loss": 0.3845, "step": 615500 }, { - "epoch": 6.27, - "learning_rate": 3.955197340748521e-05, - "loss": 0.5292, + "epoch": 8.481441679755312, + "grad_norm": 18.21994972229004, + "learning_rate": 2.2276368775050598e-05, + "loss": 0.396, "step": 615600 }, { - "epoch": 6.27, - "learning_rate": 3.954568480951389e-05, - "loss": 0.5801, + "epoch": 8.48281943181505, + "grad_norm": 8.315932273864746, + "learning_rate": 2.2268499486636277e-05, + "loss": 0.4236, "step": 615700 }, { - "epoch": 6.27, - "learning_rate": 3.9539395744815555e-05, - "loss": 0.5145, + "epoch": 8.48419718387479, + "grad_norm": 3.5208487510681152, + "learning_rate": 2.2260630642664697e-05, + "loss": 0.2986, "step": 615800 }, { - "epoch": 6.27, - "learning_rate": 3.95331062136977e-05, - "loss": 0.5964, + "epoch": 8.48557493593453, + "grad_norm": 4.82790994644165, + "learning_rate": 2.2252762243804343e-05, + "loss": 0.3374, "step": 615900 }, { - "epoch": 6.28, - "learning_rate": 3.9526816216467834e-05, - "loss": 0.5535, + "epoch": 8.486952687994268, + "grad_norm": 2.6233551502227783, + "learning_rate": 2.224489429072368e-05, + "loss": 0.3301, "step": 616000 }, { - "epoch": 6.28, - "learning_rate": 3.95205257534335e-05, - "loss": 0.533, + "epoch": 8.488330440054009, + "grad_norm": 4.033213138580322, + "learning_rate": 2.2237026784091137e-05, + "loss": 0.3587, "step": 616100 }, { - "epoch": 6.28, - "learning_rate": 3.951423482490228e-05, - "loss": 0.6197, + "epoch": 8.489708192113747, + "grad_norm": 0.020865071564912796, + "learning_rate": 2.2229159724575085e-05, + "loss": 0.4055, "step": 616200 }, { - "epoch": 6.28, - "learning_rate": 3.950794343118176e-05, - "loss": 0.5362, + "epoch": 8.491085944173486, + "grad_norm": 2.265803575515747, + "learning_rate": 2.222129311284389e-05, + "loss": 0.2873, "step": 616300 }, { - "epoch": 6.28, - "learning_rate": 3.950165157257954e-05, - "loss": 0.5345, + "epoch": 8.492463696233226, + "grad_norm": 4.1437296867370605, + "learning_rate": 2.2213426949565846e-05, + "loss": 0.3674, "step": 616400 }, { - "epoch": 6.28, - "learning_rate": 3.9495359249403263e-05, - "loss": 0.7021, + "epoch": 8.493841448292965, + "grad_norm": 1.2387611865997314, + "learning_rate": 2.220556123540923e-05, + "loss": 0.3699, "step": 616500 }, { - "epoch": 6.28, - "learning_rate": 3.94890664619606e-05, - "loss": 0.5909, + "epoch": 8.495219200352704, + "grad_norm": 4.634833812713623, + "learning_rate": 2.219769597104227e-05, + "loss": 0.341, "step": 616600 }, { - "epoch": 6.28, - "learning_rate": 3.948277321055919e-05, - "loss": 0.4945, + "epoch": 8.496596952412444, + "grad_norm": 2.196974515914917, + "learning_rate": 2.218983115713316e-05, + "loss": 0.335, "step": 616700 }, { - "epoch": 6.28, - "learning_rate": 3.947647949550677e-05, - "loss": 0.6002, + "epoch": 8.497974704472183, + "grad_norm": 6.876392364501953, + "learning_rate": 2.2181966794350065e-05, + "loss": 0.3516, "step": 616800 }, { - "epoch": 6.29, - "learning_rate": 3.9470185317111056e-05, - "loss": 0.5448, + "epoch": 8.499352456531923, + "grad_norm": 6.537124156951904, + "learning_rate": 2.2174102883361107e-05, + "loss": 0.3388, "step": 616900 }, { - "epoch": 6.29, - "learning_rate": 3.9463890675679794e-05, - "loss": 0.5949, + "epoch": 8.500730208591662, + "grad_norm": 3.6307976245880127, + "learning_rate": 2.2166239424834346e-05, + "loss": 0.3919, "step": 617000 }, { - "epoch": 6.29, - "learning_rate": 3.945765852485184e-05, - "loss": 0.4957, + "epoch": 8.5021079606514, + "grad_norm": 2.5189926624298096, + "learning_rate": 2.2158376419437836e-05, + "loss": 0.3275, "step": 617100 }, { - "epoch": 6.29, - "learning_rate": 3.94513629628955e-05, - "loss": 0.4775, + "epoch": 8.503485712711141, + "grad_norm": 6.779555797576904, + "learning_rate": 2.215059249110706e-05, + "loss": 0.3508, "step": 617200 }, { - "epoch": 6.29, - "learning_rate": 3.94450669388239e-05, - "loss": 0.5508, + "epoch": 8.50486346477088, + "grad_norm": 1.5058640241622925, + "learning_rate": 2.214273038942704e-05, + "loss": 0.3538, "step": 617300 }, { - "epoch": 6.29, - "learning_rate": 3.94387704529449e-05, - "loss": 0.5532, + "epoch": 8.506241216830619, + "grad_norm": 1.3997087478637695, + "learning_rate": 2.2134868742874495e-05, + "loss": 0.3169, "step": 617400 }, { - "epoch": 6.29, - "learning_rate": 3.943247350556635e-05, - "loss": 0.5209, + "epoch": 8.507618968890359, + "grad_norm": 3.7029576301574707, + "learning_rate": 2.2127007552117284e-05, + "loss": 0.3227, "step": 617500 }, { - "epoch": 6.29, - "learning_rate": 3.942617609699615e-05, - "loss": 0.5472, + "epoch": 8.508996720950098, + "grad_norm": 0.2418132722377777, + "learning_rate": 2.2119146817823276e-05, + "loss": 0.3398, "step": 617600 }, { - "epoch": 6.29, - "learning_rate": 3.941987822754218e-05, - "loss": 0.5319, + "epoch": 8.510374473009836, + "grad_norm": 3.9458911418914795, + "learning_rate": 2.2111286540660274e-05, + "loss": 0.3432, "step": 617700 }, { - "epoch": 6.29, - "learning_rate": 3.941357989751237e-05, - "loss": 0.5567, + "epoch": 8.511752225069577, + "grad_norm": 6.2931647300720215, + "learning_rate": 2.210342672129604e-05, + "loss": 0.3449, "step": 617800 }, { - "epoch": 6.3, - "learning_rate": 3.94072811072147e-05, - "loss": 0.4866, + "epoch": 8.513129977129315, + "grad_norm": 3.383282423019409, + "learning_rate": 2.209556736039832e-05, + "loss": 0.3166, "step": 617900 }, { - "epoch": 6.3, - "learning_rate": 3.940098185695711e-05, - "loss": 0.5557, + "epoch": 8.514507729189056, + "grad_norm": 1.0736831426620483, + "learning_rate": 2.2087708458634813e-05, + "loss": 0.3434, "step": 618000 }, { - "epoch": 6.3, - "learning_rate": 3.9394682147047605e-05, - "loss": 0.5382, + "epoch": 8.515885481248795, + "grad_norm": 13.528286933898926, + "learning_rate": 2.2079850016673154e-05, + "loss": 0.359, "step": 618100 }, { - "epoch": 6.3, - "learning_rate": 3.938838197779422e-05, - "loss": 0.55, + "epoch": 8.517263233308533, + "grad_norm": 2.126558542251587, + "learning_rate": 2.2071992035180964e-05, + "loss": 0.3862, "step": 618200 }, { - "epoch": 6.3, - "learning_rate": 3.938208134950498e-05, - "loss": 0.4971, + "epoch": 8.518640985368274, + "grad_norm": 10.538199424743652, + "learning_rate": 2.2064213087744546e-05, + "loss": 0.3365, "step": 618300 }, { - "epoch": 6.3, - "learning_rate": 3.937578026248796e-05, - "loss": 0.4625, + "epoch": 8.520018737428012, + "grad_norm": 3.284346103668213, + "learning_rate": 2.205635602457264e-05, + "loss": 0.3891, "step": 618400 }, { - "epoch": 6.3, - "learning_rate": 3.9369478717051225e-05, - "loss": 0.5343, + "epoch": 8.521396489487751, + "grad_norm": 4.785995006561279, + "learning_rate": 2.2048499423866137e-05, + "loss": 0.3935, "step": 618500 }, { - "epoch": 6.3, - "learning_rate": 3.936317671350289e-05, - "loss": 0.5644, + "epoch": 8.522774241547491, + "grad_norm": 2.2195351123809814, + "learning_rate": 2.2040643286292492e-05, + "loss": 0.3867, "step": 618600 }, { - "epoch": 6.3, - "learning_rate": 3.9356874252151095e-05, - "loss": 0.5344, + "epoch": 8.52415199360723, + "grad_norm": 10.841598510742188, + "learning_rate": 2.203278761251913e-05, + "loss": 0.3986, "step": 618700 }, { - "epoch": 6.3, - "learning_rate": 3.935057133330399e-05, - "loss": 0.5499, + "epoch": 8.52552974566697, + "grad_norm": 1.6979165077209473, + "learning_rate": 2.2024932403213433e-05, + "loss": 0.3768, "step": 618800 }, { - "epoch": 6.31, - "learning_rate": 3.9344267957269744e-05, - "loss": 0.5553, + "epoch": 8.52690749772671, + "grad_norm": 2.557459831237793, + "learning_rate": 2.201707765904274e-05, + "loss": 0.3505, "step": 618900 }, { - "epoch": 6.31, - "learning_rate": 3.933796412435657e-05, - "loss": 0.5984, + "epoch": 8.528285249786448, + "grad_norm": 5.050466060638428, + "learning_rate": 2.2009223380674342e-05, + "loss": 0.4186, "step": 619000 }, { - "epoch": 6.31, - "learning_rate": 3.9331659834872674e-05, - "loss": 0.5414, + "epoch": 8.529663001846188, + "grad_norm": 3.518502950668335, + "learning_rate": 2.2001369568775524e-05, + "loss": 0.3236, "step": 619100 }, { - "epoch": 6.31, - "learning_rate": 3.932535508912629e-05, - "loss": 0.5398, + "epoch": 8.531040753905927, + "grad_norm": 14.200884819030762, + "learning_rate": 2.199351622401348e-05, + "loss": 0.333, "step": 619200 }, { - "epoch": 6.31, - "learning_rate": 3.93190498874257e-05, - "loss": 0.6267, + "epoch": 8.532418505965666, + "grad_norm": 2.772815704345703, + "learning_rate": 2.198566334705541e-05, + "loss": 0.3971, "step": 619300 }, { - "epoch": 6.31, - "learning_rate": 3.931274423007918e-05, - "loss": 0.5489, + "epoch": 8.533796258025406, + "grad_norm": 3.06123948097229, + "learning_rate": 2.1977810938568446e-05, + "loss": 0.3708, "step": 619400 }, { - "epoch": 6.31, - "learning_rate": 3.9306438117395044e-05, - "loss": 0.5718, + "epoch": 8.535174010085145, + "grad_norm": 15.280471801757812, + "learning_rate": 2.1970037516288753e-05, + "loss": 0.3949, "step": 619500 }, { - "epoch": 6.31, - "learning_rate": 3.930013154968162e-05, - "loss": 0.5296, + "epoch": 8.536551762144885, + "grad_norm": 10.371110916137695, + "learning_rate": 2.196218604204393e-05, + "loss": 0.357, "step": 619600 }, { - "epoch": 6.31, - "learning_rate": 3.929382452724727e-05, - "loss": 0.6019, + "epoch": 8.537929514204624, + "grad_norm": 1.9435280561447144, + "learning_rate": 2.1954335038264723e-05, + "loss": 0.4076, "step": 619700 }, { - "epoch": 6.31, - "learning_rate": 3.928751705040036e-05, - "loss": 0.469, + "epoch": 8.539307266264363, + "grad_norm": 5.665966987609863, + "learning_rate": 2.1946484505618124e-05, + "loss": 0.3412, "step": 619800 }, { - "epoch": 6.32, - "learning_rate": 3.92812722010056e-05, - "loss": 0.5457, + "epoch": 8.540685018324103, + "grad_norm": 4.273377418518066, + "learning_rate": 2.1938634444771085e-05, + "loss": 0.3348, "step": 619900 }, { - "epoch": 6.32, - "learning_rate": 3.927496382079522e-05, - "loss": 0.4942, + "epoch": 8.542062770383842, + "grad_norm": 3.8269553184509277, + "learning_rate": 2.19307848563905e-05, + "loss": 0.4072, "step": 620000 }, { - "epoch": 6.32, - "learning_rate": 3.9268654987094466e-05, - "loss": 0.5044, + "epoch": 8.54344052244358, + "grad_norm": 3.528566837310791, + "learning_rate": 2.1922935741143226e-05, + "loss": 0.3975, "step": 620100 }, { - "epoch": 6.32, - "learning_rate": 3.926234570021179e-05, - "loss": 0.525, + "epoch": 8.54481827450332, + "grad_norm": 2.17942214012146, + "learning_rate": 2.191508709969611e-05, + "loss": 0.3837, "step": 620200 }, { - "epoch": 6.32, - "learning_rate": 3.925603596045569e-05, - "loss": 0.5621, + "epoch": 8.54619602656306, + "grad_norm": 1.3713845014572144, + "learning_rate": 2.1907238932715903e-05, + "loss": 0.3173, "step": 620300 }, { - "epoch": 6.32, - "learning_rate": 3.924972576813467e-05, - "loss": 0.4164, + "epoch": 8.547573778622798, + "grad_norm": 1.2039626836776733, + "learning_rate": 2.189939124086936e-05, + "loss": 0.408, "step": 620400 }, { - "epoch": 6.32, - "learning_rate": 3.924341512355725e-05, - "loss": 0.6039, + "epoch": 8.548951530682539, + "grad_norm": 8.209099769592285, + "learning_rate": 2.189154402482319e-05, + "loss": 0.3694, "step": 620500 }, { - "epoch": 6.32, - "learning_rate": 3.9237104027031994e-05, - "loss": 0.4845, + "epoch": 8.550329282742277, + "grad_norm": 11.325695037841797, + "learning_rate": 2.188369728524404e-05, + "loss": 0.3669, "step": 620600 }, { - "epoch": 6.32, - "learning_rate": 3.923079247886748e-05, - "loss": 0.6014, + "epoch": 8.551707034802018, + "grad_norm": 4.762801647186279, + "learning_rate": 2.1875851022798557e-05, + "loss": 0.4044, "step": 620700 }, { - "epoch": 6.32, - "learning_rate": 3.9224480479372295e-05, - "loss": 0.508, + "epoch": 8.553084786861756, + "grad_norm": 4.980169296264648, + "learning_rate": 2.1868005238153286e-05, + "loss": 0.3757, "step": 620800 }, { - "epoch": 6.33, - "learning_rate": 3.921816802885507e-05, - "loss": 0.4611, + "epoch": 8.554462538921495, + "grad_norm": 3.2319538593292236, + "learning_rate": 2.1860159931974786e-05, + "loss": 0.3736, "step": 620900 }, { - "epoch": 6.33, - "learning_rate": 3.921185512762443e-05, - "loss": 0.4722, + "epoch": 8.555840290981235, + "grad_norm": 3.6315629482269287, + "learning_rate": 2.1852315104929556e-05, + "loss": 0.4119, "step": 621000 }, { - "epoch": 6.33, - "learning_rate": 3.9205541775989056e-05, - "loss": 0.5601, + "epoch": 8.557218043040974, + "grad_norm": 5.15279483795166, + "learning_rate": 2.184447075768404e-05, + "loss": 0.4228, "step": 621100 }, { - "epoch": 6.33, - "learning_rate": 3.919922797425762e-05, - "loss": 0.4211, + "epoch": 8.558595795100715, + "grad_norm": 10.44373893737793, + "learning_rate": 2.1836626890904678e-05, + "loss": 0.3641, "step": 621200 }, { - "epoch": 6.33, - "learning_rate": 3.9192913722738836e-05, - "loss": 0.5072, + "epoch": 8.559973547160453, + "grad_norm": 3.743250846862793, + "learning_rate": 2.182878350525784e-05, + "loss": 0.3489, "step": 621300 }, { - "epoch": 6.33, - "learning_rate": 3.918666217097531e-05, - "loss": 0.5572, + "epoch": 8.561351299220192, + "grad_norm": 5.815445899963379, + "learning_rate": 2.1820940601409848e-05, + "loss": 0.4291, "step": 621400 }, { - "epoch": 6.33, - "learning_rate": 3.918034702529821e-05, - "loss": 0.5045, + "epoch": 8.562729051279932, + "grad_norm": 15.064278602600098, + "learning_rate": 2.1813098180027008e-05, + "loss": 0.4041, "step": 621500 }, { - "epoch": 6.33, - "learning_rate": 3.917403143075692e-05, - "loss": 0.5601, + "epoch": 8.564106803339671, + "grad_norm": 0.438976526260376, + "learning_rate": 2.1805256241775566e-05, + "loss": 0.3378, "step": 621600 }, { - "epoch": 6.33, - "learning_rate": 3.916771538766025e-05, - "loss": 0.4876, + "epoch": 8.56548455539941, + "grad_norm": 37.62239456176758, + "learning_rate": 2.1797414787321747e-05, + "loss": 0.4028, "step": 621700 }, { - "epoch": 6.34, - "learning_rate": 3.9161398896317e-05, - "loss": 0.5432, + "epoch": 8.56686230745915, + "grad_norm": 3.2186551094055176, + "learning_rate": 2.1789573817331722e-05, + "loss": 0.3689, "step": 621800 }, { - "epoch": 6.34, - "learning_rate": 3.915508195703601e-05, - "loss": 0.5413, + "epoch": 8.568240059518889, + "grad_norm": 3.1746091842651367, + "learning_rate": 2.1781733332471608e-05, + "loss": 0.3712, "step": 621900 }, { - "epoch": 6.34, - "learning_rate": 3.9148764570126156e-05, - "loss": 0.5793, + "epoch": 8.569617811578627, + "grad_norm": 1.5825210809707642, + "learning_rate": 2.1773893333407502e-05, + "loss": 0.4296, "step": 622000 }, { - "epoch": 6.34, - "learning_rate": 3.914244673589631e-05, - "loss": 0.6149, + "epoch": 8.570995563638368, + "grad_norm": 2.9672021865844727, + "learning_rate": 2.1766053820805446e-05, + "loss": 0.3918, "step": 622100 }, { - "epoch": 6.34, - "learning_rate": 3.913612845465538e-05, - "loss": 0.4908, + "epoch": 8.572373315698107, + "grad_norm": 2.572219133377075, + "learning_rate": 2.1758214795331458e-05, + "loss": 0.2776, "step": 622200 }, { - "epoch": 6.34, - "learning_rate": 3.912980972671228e-05, - "loss": 0.5388, + "epoch": 8.573751067757847, + "grad_norm": 3.83341646194458, + "learning_rate": 2.17503762576515e-05, + "loss": 0.3809, "step": 622300 }, { - "epoch": 6.34, - "learning_rate": 3.912349055237598e-05, - "loss": 0.5169, + "epoch": 8.575128819817586, + "grad_norm": 2.831357002258301, + "learning_rate": 2.17425382084315e-05, + "loss": 0.3602, "step": 622400 }, { - "epoch": 6.34, - "learning_rate": 3.911717093195543e-05, - "loss": 0.6151, + "epoch": 8.576506571877324, + "grad_norm": 3.027714490890503, + "learning_rate": 2.1734700648337324e-05, + "loss": 0.3878, "step": 622500 }, { - "epoch": 6.34, - "learning_rate": 3.9110850865759635e-05, - "loss": 0.5821, + "epoch": 8.577884323937065, + "grad_norm": 4.66804313659668, + "learning_rate": 2.172686357803482e-05, + "loss": 0.3284, "step": 622600 }, { - "epoch": 6.34, - "learning_rate": 3.910453035409761e-05, - "loss": 0.5559, + "epoch": 8.579262075996803, + "grad_norm": 23.84748077392578, + "learning_rate": 2.17190269981898e-05, + "loss": 0.3428, "step": 622700 }, { - "epoch": 6.35, - "learning_rate": 3.909820939727838e-05, - "loss": 0.5057, + "epoch": 8.580639828056542, + "grad_norm": 3.4912636280059814, + "learning_rate": 2.1711190909468002e-05, + "loss": 0.3501, "step": 622800 }, { - "epoch": 6.35, - "learning_rate": 3.9091887995611e-05, - "loss": 0.5898, + "epoch": 8.582017580116283, + "grad_norm": 4.160470962524414, + "learning_rate": 2.1703355312535167e-05, + "loss": 0.3844, "step": 622900 }, { - "epoch": 6.35, - "learning_rate": 3.9085566149404555e-05, - "loss": 0.532, + "epoch": 8.583395332176021, + "grad_norm": 1.3331727981567383, + "learning_rate": 2.1695598556661888e-05, + "loss": 0.3289, "step": 623000 }, { - "epoch": 6.35, - "learning_rate": 3.9079243858968144e-05, - "loss": 0.4956, + "epoch": 8.584773084235762, + "grad_norm": 4.610540866851807, + "learning_rate": 2.168776394036942e-05, + "loss": 0.3148, "step": 623100 }, { - "epoch": 6.35, - "learning_rate": 3.907292112461089e-05, - "loss": 0.5432, + "epoch": 8.5861508362955, + "grad_norm": 3.8662302494049072, + "learning_rate": 2.1679929817856143e-05, + "loss": 0.3659, "step": 623200 }, { - "epoch": 6.35, - "learning_rate": 3.906659794664194e-05, - "loss": 0.5047, + "epoch": 8.587528588355239, + "grad_norm": 2.9380335807800293, + "learning_rate": 2.1672096189787592e-05, + "loss": 0.3748, "step": 623300 }, { - "epoch": 6.35, - "learning_rate": 3.906027432537045e-05, - "loss": 0.5733, + "epoch": 8.58890634041498, + "grad_norm": 22.89092254638672, + "learning_rate": 2.1664263056829298e-05, + "loss": 0.3834, "step": 623400 }, { - "epoch": 6.35, - "learning_rate": 3.905395026110563e-05, - "loss": 0.5153, + "epoch": 8.590284092474718, + "grad_norm": 2.1078238487243652, + "learning_rate": 2.1656430419646716e-05, + "loss": 0.3908, "step": 623500 }, { - "epoch": 6.35, - "learning_rate": 3.9047625754156676e-05, - "loss": 0.5662, + "epoch": 8.591661844534457, + "grad_norm": 0.28540176153182983, + "learning_rate": 2.164859827890526e-05, + "loss": 0.3971, "step": 623600 }, { - "epoch": 6.35, - "learning_rate": 3.90413008048328e-05, - "loss": 0.5494, + "epoch": 8.593039596594197, + "grad_norm": 2.0287351608276367, + "learning_rate": 2.1640766635270314e-05, + "loss": 0.3485, "step": 623700 }, { - "epoch": 6.36, - "learning_rate": 3.9034975413443266e-05, - "loss": 0.5467, + "epoch": 8.594417348653936, + "grad_norm": 5.338911056518555, + "learning_rate": 2.1632935489407214e-05, + "loss": 0.3248, "step": 623800 }, { - "epoch": 6.36, - "learning_rate": 3.902864958029737e-05, - "loss": 0.4614, + "epoch": 8.595795100713676, + "grad_norm": 8.471325874328613, + "learning_rate": 2.1625104841981265e-05, + "loss": 0.3621, "step": 623900 }, { - "epoch": 6.36, - "learning_rate": 3.902232330570437e-05, - "loss": 0.4705, + "epoch": 8.597172852773415, + "grad_norm": 2.9211699962615967, + "learning_rate": 2.1617274693657723e-05, + "loss": 0.4189, "step": 624000 }, { - "epoch": 6.36, - "learning_rate": 3.901599658997362e-05, - "loss": 0.6193, + "epoch": 8.598550604833154, + "grad_norm": 9.724971771240234, + "learning_rate": 2.160944504510178e-05, + "loss": 0.3334, "step": 624100 }, { - "epoch": 6.36, - "learning_rate": 3.9009669433414435e-05, - "loss": 0.5019, + "epoch": 8.599928356892894, + "grad_norm": 20.8826904296875, + "learning_rate": 2.1601615896978625e-05, + "loss": 0.3576, "step": 624200 }, { - "epoch": 6.36, - "learning_rate": 3.900334183633619e-05, - "loss": 0.5182, + "epoch": 8.601306108952633, + "grad_norm": 1.1538742780685425, + "learning_rate": 2.1593787249953362e-05, + "loss": 0.3389, "step": 624300 }, { - "epoch": 6.36, - "learning_rate": 3.899701379904825e-05, - "loss": 0.5207, + "epoch": 8.602683861012371, + "grad_norm": 8.331433296203613, + "learning_rate": 2.1585959104691098e-05, + "loss": 0.3762, "step": 624400 }, { - "epoch": 6.36, - "learning_rate": 3.8990685321860016e-05, - "loss": 0.502, + "epoch": 8.604061613072112, + "grad_norm": 3.8170533180236816, + "learning_rate": 2.1578131461856862e-05, + "loss": 0.3505, "step": 624500 }, { - "epoch": 6.36, - "learning_rate": 3.8984356405080936e-05, - "loss": 0.4636, + "epoch": 8.60543936513185, + "grad_norm": 3.952625036239624, + "learning_rate": 2.1570304322115663e-05, + "loss": 0.3906, "step": 624600 }, { - "epoch": 6.36, - "learning_rate": 3.8978027049020445e-05, - "loss": 0.526, + "epoch": 8.60681711719159, + "grad_norm": 1.0608445405960083, + "learning_rate": 2.1562477686132438e-05, + "loss": 0.3441, "step": 624700 }, { - "epoch": 6.37, - "learning_rate": 3.8971697253988e-05, - "loss": 0.493, + "epoch": 8.60819486925133, + "grad_norm": 8.85407829284668, + "learning_rate": 2.1554651554572106e-05, + "loss": 0.3283, "step": 624800 }, { - "epoch": 6.37, - "learning_rate": 3.8965367020293115e-05, - "loss": 0.5652, + "epoch": 8.609572621311068, + "grad_norm": 3.8254077434539795, + "learning_rate": 2.154682592809955e-05, + "loss": 0.3561, "step": 624900 }, { - "epoch": 6.37, - "learning_rate": 3.895903634824527e-05, - "loss": 0.5161, + "epoch": 8.610950373370809, + "grad_norm": 0.8852051496505737, + "learning_rate": 2.1539000807379583e-05, + "loss": 0.4056, "step": 625000 }, { - "epoch": 6.37, - "learning_rate": 3.8952705238154024e-05, - "loss": 0.4711, + "epoch": 8.612328125430547, + "grad_norm": 1.8141647577285767, + "learning_rate": 2.1531176193077002e-05, + "loss": 0.3857, "step": 625100 }, { - "epoch": 6.37, - "learning_rate": 3.8946373690328914e-05, - "loss": 0.5535, + "epoch": 8.613705877490286, + "grad_norm": 4.894231796264648, + "learning_rate": 2.1523352085856533e-05, + "loss": 0.3728, "step": 625200 }, { - "epoch": 6.37, - "learning_rate": 3.894004170507951e-05, - "loss": 0.5852, + "epoch": 8.615083629550027, + "grad_norm": 3.1469619274139404, + "learning_rate": 2.1515528486382874e-05, + "loss": 0.3991, "step": 625300 }, { - "epoch": 6.37, - "learning_rate": 3.8933709282715435e-05, - "loss": 0.5104, + "epoch": 8.616461381609765, + "grad_norm": 5.235361099243164, + "learning_rate": 2.1507705395320693e-05, + "loss": 0.3484, "step": 625400 }, { - "epoch": 6.37, - "learning_rate": 3.8927376423546286e-05, - "loss": 0.568, + "epoch": 8.617839133669506, + "grad_norm": 0.24693188071250916, + "learning_rate": 2.1499882813334593e-05, + "loss": 0.3861, "step": 625500 }, { - "epoch": 6.37, - "learning_rate": 3.89210431278817e-05, - "loss": 0.5385, + "epoch": 8.619216885729244, + "grad_norm": 3.9979472160339355, + "learning_rate": 2.149206074108914e-05, + "loss": 0.357, "step": 625600 }, { - "epoch": 6.37, - "learning_rate": 3.8914772735507947e-05, - "loss": 0.5203, + "epoch": 8.620594637788983, + "grad_norm": 2.707336664199829, + "learning_rate": 2.1484239179248882e-05, + "loss": 0.373, "step": 625700 }, { - "epoch": 6.38, - "learning_rate": 3.8908438572138723e-05, - "loss": 0.4415, + "epoch": 8.621972389848723, + "grad_norm": 2.6941821575164795, + "learning_rate": 2.1476418128478265e-05, + "loss": 0.3757, "step": 625800 }, { - "epoch": 6.38, - "learning_rate": 3.890210397320003e-05, - "loss": 0.5888, + "epoch": 8.623350141908462, + "grad_norm": 9.870085716247559, + "learning_rate": 2.1468675792296846e-05, + "loss": 0.3871, "step": 625900 }, { - "epoch": 6.38, - "learning_rate": 3.889576893900157e-05, - "loss": 0.5675, + "epoch": 8.6247278939682, + "grad_norm": 1.317989468574524, + "learning_rate": 2.1460855760531547e-05, + "loss": 0.3981, "step": 626000 }, { - "epoch": 6.38, - "learning_rate": 3.888943346985309e-05, - "loss": 0.6172, + "epoch": 8.626105646027941, + "grad_norm": 0.3457280993461609, + "learning_rate": 2.145303624182245e-05, + "loss": 0.3748, "step": 626100 }, { - "epoch": 6.38, - "learning_rate": 3.888309756606438e-05, - "loss": 0.5383, + "epoch": 8.62748339808768, + "grad_norm": 6.618454933166504, + "learning_rate": 2.1445217236833864e-05, + "loss": 0.3637, "step": 626200 }, { - "epoch": 6.38, - "learning_rate": 3.887676122794519e-05, - "loss": 0.548, + "epoch": 8.628861150147419, + "grad_norm": 0.3231651782989502, + "learning_rate": 2.1437398746230043e-05, + "loss": 0.3317, "step": 626300 }, { - "epoch": 6.38, - "learning_rate": 3.887042445580536e-05, - "loss": 0.5542, + "epoch": 8.630238902207159, + "grad_norm": 123.69172668457031, + "learning_rate": 2.1429580770675208e-05, + "loss": 0.3121, "step": 626400 }, { - "epoch": 6.38, - "learning_rate": 3.886408724995471e-05, - "loss": 0.6445, + "epoch": 8.631616654266898, + "grad_norm": 13.843061447143555, + "learning_rate": 2.1421763310833554e-05, + "loss": 0.3702, "step": 626500 }, { - "epoch": 6.38, - "learning_rate": 3.885774961070309e-05, - "loss": 0.5071, + "epoch": 8.632994406326638, + "grad_norm": 4.132974147796631, + "learning_rate": 2.1413946367369198e-05, + "loss": 0.3185, "step": 626600 }, { - "epoch": 6.38, - "learning_rate": 3.885141153836038e-05, - "loss": 0.5566, + "epoch": 8.634372158386377, + "grad_norm": 2.850433111190796, + "learning_rate": 2.1406129940946235e-05, + "loss": 0.3682, "step": 626700 }, { - "epoch": 6.39, - "learning_rate": 3.8845073033236466e-05, - "loss": 0.499, + "epoch": 8.635749910446116, + "grad_norm": 3.6372785568237305, + "learning_rate": 2.1398392188751067e-05, + "loss": 0.3459, "step": 626800 }, { - "epoch": 6.39, - "learning_rate": 3.883873409564126e-05, - "loss": 0.4963, + "epoch": 8.637127662505856, + "grad_norm": 3.0738308429718018, + "learning_rate": 2.1390576793216e-05, + "loss": 0.368, "step": 626900 }, { - "epoch": 6.39, - "learning_rate": 3.8832394725884706e-05, - "loss": 0.5456, + "epoch": 8.638505414565595, + "grad_norm": 4.2001190185546875, + "learning_rate": 2.138276191670769e-05, + "loss": 0.3747, "step": 627000 }, { - "epoch": 6.39, - "learning_rate": 3.882605492427676e-05, - "loss": 0.5518, + "epoch": 8.639883166625333, + "grad_norm": 5.7996649742126465, + "learning_rate": 2.1374947559890045e-05, + "loss": 0.4183, "step": 627100 }, { - "epoch": 6.39, - "learning_rate": 3.8819714691127394e-05, - "loss": 0.6104, + "epoch": 8.641260918685074, + "grad_norm": 4.514593601226807, + "learning_rate": 2.1367133723426945e-05, + "loss": 0.3402, "step": 627200 }, { - "epoch": 6.39, - "learning_rate": 3.881337402674662e-05, - "loss": 0.6112, + "epoch": 8.642638670744812, + "grad_norm": 3.181171417236328, + "learning_rate": 2.1359320407982217e-05, + "loss": 0.347, "step": 627300 }, { - "epoch": 6.39, - "learning_rate": 3.8807032931444446e-05, - "loss": 0.4784, + "epoch": 8.644016422804553, + "grad_norm": 10.193471908569336, + "learning_rate": 2.135150761421963e-05, + "loss": 0.3796, "step": 627400 }, { - "epoch": 6.39, - "learning_rate": 3.8800691405530926e-05, - "loss": 0.5748, + "epoch": 8.645394174864292, + "grad_norm": 5.279388427734375, + "learning_rate": 2.1343695342802928e-05, + "loss": 0.3462, "step": 627500 }, { - "epoch": 6.39, - "learning_rate": 3.879434944931612e-05, - "loss": 0.4576, + "epoch": 8.64677192692403, + "grad_norm": 1.758195400238037, + "learning_rate": 2.1335883594395796e-05, + "loss": 0.3909, "step": 627600 }, { - "epoch": 6.4, - "learning_rate": 3.878800706311009e-05, - "loss": 0.5395, + "epoch": 8.64814967898377, + "grad_norm": 5.303006172180176, + "learning_rate": 2.1328072369661892e-05, + "loss": 0.3801, "step": 627700 }, { - "epoch": 6.4, - "learning_rate": 3.8781664247222974e-05, - "loss": 0.5237, + "epoch": 8.64952743104351, + "grad_norm": 10.079161643981934, + "learning_rate": 2.132026166926482e-05, + "loss": 0.3501, "step": 627800 }, { - "epoch": 6.4, - "learning_rate": 3.877532100196488e-05, - "loss": 0.473, + "epoch": 8.650905183103248, + "grad_norm": 4.683069705963135, + "learning_rate": 2.1312451493868136e-05, + "loss": 0.3903, "step": 627900 }, { - "epoch": 6.4, - "learning_rate": 3.8768977327645956e-05, - "loss": 0.4975, + "epoch": 8.652282935162988, + "grad_norm": 2.807865858078003, + "learning_rate": 2.1304641844135347e-05, + "loss": 0.395, "step": 628000 }, { - "epoch": 6.4, - "learning_rate": 3.876263322457637e-05, - "loss": 0.558, + "epoch": 8.653660687222727, + "grad_norm": 291.091064453125, + "learning_rate": 2.1296832720729923e-05, + "loss": 0.3458, "step": 628100 }, { - "epoch": 6.4, - "learning_rate": 3.8756288693066306e-05, - "loss": 0.5763, + "epoch": 8.655038439282468, + "grad_norm": 5.003805637359619, + "learning_rate": 2.1289024124315303e-05, + "loss": 0.3934, "step": 628200 }, { - "epoch": 6.4, - "learning_rate": 3.8749943733425985e-05, - "loss": 0.5168, + "epoch": 8.656416191342206, + "grad_norm": 2.5112297534942627, + "learning_rate": 2.1281216055554854e-05, + "loss": 0.3772, "step": 628300 }, { - "epoch": 6.4, - "learning_rate": 3.8743598345965634e-05, - "loss": 0.5838, + "epoch": 8.657793943401945, + "grad_norm": 3.257223129272461, + "learning_rate": 2.127340851511193e-05, + "loss": 0.4067, "step": 628400 }, { - "epoch": 6.4, - "learning_rate": 3.8737252530995496e-05, - "loss": 0.4094, + "epoch": 8.659171695461685, + "grad_norm": 2.134662389755249, + "learning_rate": 2.1265601503649793e-05, + "loss": 0.3785, "step": 628500 }, { - "epoch": 6.4, - "learning_rate": 3.8730906288825863e-05, - "loss": 0.5462, + "epoch": 8.660549447521424, + "grad_norm": 1.4923478364944458, + "learning_rate": 2.1257795021831706e-05, + "loss": 0.3921, "step": 628600 }, { - "epoch": 6.41, - "learning_rate": 3.8724559619767014e-05, - "loss": 0.4997, + "epoch": 8.661927199581163, + "grad_norm": 0.9694164991378784, + "learning_rate": 2.1249989070320866e-05, + "loss": 0.3845, "step": 628700 }, { - "epoch": 6.41, - "learning_rate": 3.8718212524129254e-05, - "loss": 0.5377, + "epoch": 8.663304951640903, + "grad_norm": 3.6847665309906006, + "learning_rate": 2.1242183649780432e-05, + "loss": 0.3627, "step": 628800 }, { - "epoch": 6.41, - "learning_rate": 3.8711865002222935e-05, - "loss": 0.422, + "epoch": 8.664682703700642, + "grad_norm": 5.224494457244873, + "learning_rate": 2.1234378760873506e-05, + "loss": 0.3502, "step": 628900 }, { - "epoch": 6.41, - "learning_rate": 3.8705517054358404e-05, - "loss": 0.5571, + "epoch": 8.66606045576038, + "grad_norm": 2.7181360721588135, + "learning_rate": 2.1226574404263174e-05, + "loss": 0.3584, "step": 629000 }, { - "epoch": 6.41, - "learning_rate": 3.869916868084605e-05, - "loss": 0.5243, + "epoch": 8.66743820782012, + "grad_norm": 4.30332612991333, + "learning_rate": 2.1218770580612427e-05, + "loss": 0.3758, "step": 629100 }, { - "epoch": 6.41, - "learning_rate": 3.869281988199626e-05, - "loss": 0.5476, + "epoch": 8.66881595987986, + "grad_norm": 10.797175407409668, + "learning_rate": 2.1210967290584255e-05, + "loss": 0.3691, "step": 629200 }, { - "epoch": 6.41, - "learning_rate": 3.868647065811944e-05, - "loss": 0.5078, + "epoch": 8.6701937119396, + "grad_norm": 1.8653578758239746, + "learning_rate": 2.120316453484159e-05, + "loss": 0.3064, "step": 629300 }, { - "epoch": 6.41, - "learning_rate": 3.868012100952606e-05, - "loss": 0.511, + "epoch": 8.671571463999339, + "grad_norm": 1.7396917343139648, + "learning_rate": 2.1195362314047304e-05, + "loss": 0.3993, "step": 629400 }, { - "epoch": 6.41, - "learning_rate": 3.867377093652655e-05, - "loss": 0.4887, + "epoch": 8.672949216059077, + "grad_norm": 6.089523792266846, + "learning_rate": 2.1187560628864264e-05, + "loss": 0.3728, "step": 629500 }, { - "epoch": 6.41, - "learning_rate": 3.8667420439431406e-05, - "loss": 0.4617, + "epoch": 8.674326968118818, + "grad_norm": 1.9020843505859375, + "learning_rate": 2.1179759479955222e-05, + "loss": 0.3878, "step": 629600 }, { - "epoch": 6.42, - "learning_rate": 3.8661069518551134e-05, - "loss": 0.4925, + "epoch": 8.675704720178556, + "grad_norm": 1.65755033493042, + "learning_rate": 2.1171958867982957e-05, + "loss": 0.3914, "step": 629700 }, { - "epoch": 6.42, - "learning_rate": 3.865471817419625e-05, - "loss": 0.4522, + "epoch": 8.677082472238297, + "grad_norm": 13.32901382446289, + "learning_rate": 2.1164158793610154e-05, + "loss": 0.3985, "step": 629800 }, { - "epoch": 6.42, - "learning_rate": 3.864842992644612e-05, - "loss": 0.5142, + "epoch": 8.678460224298036, + "grad_norm": 2.6107616424560547, + "learning_rate": 2.1156359257499474e-05, + "loss": 0.3577, "step": 629900 }, { - "epoch": 6.42, - "learning_rate": 3.864207774030066e-05, - "loss": 0.5012, + "epoch": 8.679837976357774, + "grad_norm": 4.511290073394775, + "learning_rate": 2.1148560260313533e-05, + "loss": 0.4, "step": 630000 }, { - "epoch": 6.42, - "learning_rate": 3.863572513160916e-05, - "loss": 0.5527, + "epoch": 8.681215728417515, + "grad_norm": 2.9307472705841064, + "learning_rate": 2.11407618027149e-05, + "loss": 0.3736, "step": 630100 }, { - "epoch": 6.42, - "learning_rate": 3.8629372100682236e-05, - "loss": 0.4647, + "epoch": 8.682593480477253, + "grad_norm": 4.101418972015381, + "learning_rate": 2.113296388536608e-05, + "loss": 0.364, "step": 630200 }, { - "epoch": 6.42, - "learning_rate": 3.8623018647830526e-05, - "loss": 0.5333, + "epoch": 8.683971232536992, + "grad_norm": 2.0894899368286133, + "learning_rate": 2.1125166508929544e-05, + "loss": 0.3727, "step": 630300 }, { - "epoch": 6.42, - "learning_rate": 3.861666477336465e-05, - "loss": 0.4803, + "epoch": 8.685348984596732, + "grad_norm": 5.536656856536865, + "learning_rate": 2.1117369674067726e-05, + "loss": 0.3735, "step": 630400 }, { - "epoch": 6.42, - "learning_rate": 3.86103104775953e-05, - "loss": 0.5362, + "epoch": 8.686726736656471, + "grad_norm": 3.605229139328003, + "learning_rate": 2.1109573381443014e-05, + "loss": 0.404, "step": 630500 }, { - "epoch": 6.42, - "learning_rate": 3.860395576083314e-05, - "loss": 0.5201, + "epoch": 8.68810448871621, + "grad_norm": 12.471243858337402, + "learning_rate": 2.1101777631717737e-05, + "loss": 0.3092, "step": 630600 }, { - "epoch": 6.43, - "learning_rate": 3.85976006233889e-05, - "loss": 0.4721, + "epoch": 8.68948224077595, + "grad_norm": 6.4673261642456055, + "learning_rate": 2.1093982425554187e-05, + "loss": 0.366, "step": 630700 }, { - "epoch": 6.43, - "learning_rate": 3.859124506557328e-05, - "loss": 0.5554, + "epoch": 8.690859992835689, + "grad_norm": 37.566261291503906, + "learning_rate": 2.1086187763614597e-05, + "loss": 0.3343, "step": 630800 }, { - "epoch": 6.43, - "learning_rate": 3.8584889087697055e-05, - "loss": 0.4375, + "epoch": 8.69223774489543, + "grad_norm": 73.8747329711914, + "learning_rate": 2.1078393646561162e-05, + "loss": 0.363, "step": 630900 }, { - "epoch": 6.43, - "learning_rate": 3.8578532690071e-05, - "loss": 0.6021, + "epoch": 8.693615496955168, + "grad_norm": 4.5704474449157715, + "learning_rate": 2.1070600075056048e-05, + "loss": 0.4258, "step": 631000 }, { - "epoch": 6.43, - "learning_rate": 3.857217587300587e-05, - "loss": 0.6005, + "epoch": 8.694993249014907, + "grad_norm": 5.659417629241943, + "learning_rate": 2.106280704976135e-05, + "loss": 0.3957, "step": 631100 }, { - "epoch": 6.43, - "learning_rate": 3.856581863681251e-05, - "loss": 0.5931, + "epoch": 8.696371001074647, + "grad_norm": 1.0288254022598267, + "learning_rate": 2.105501457133913e-05, + "loss": 0.34, "step": 631200 }, { - "epoch": 6.43, - "learning_rate": 3.855946098180173e-05, - "loss": 0.504, + "epoch": 8.697748753134386, + "grad_norm": 1.318321704864502, + "learning_rate": 2.1047222640451394e-05, + "loss": 0.3644, "step": 631300 }, { - "epoch": 6.43, - "learning_rate": 3.855310290828439e-05, - "loss": 0.5346, + "epoch": 8.699126505194124, + "grad_norm": 9.477001190185547, + "learning_rate": 2.1039431257760093e-05, + "loss": 0.3952, "step": 631400 }, { - "epoch": 6.43, - "learning_rate": 3.854680800355754e-05, - "loss": 0.5164, + "epoch": 8.700504257253865, + "grad_norm": 2.8804399967193604, + "learning_rate": 2.1031640423927173e-05, + "loss": 0.3663, "step": 631500 }, { - "epoch": 6.43, - "learning_rate": 3.8540449098137016e-05, - "loss": 0.4551, + "epoch": 8.701882009313604, + "grad_norm": 3.0058481693267822, + "learning_rate": 2.1023850139614483e-05, + "loss": 0.3694, "step": 631600 }, { - "epoch": 6.44, - "learning_rate": 3.8534089775139505e-05, - "loss": 0.5681, + "epoch": 8.703259761373344, + "grad_norm": 4.2868733406066895, + "learning_rate": 2.1016060405483855e-05, + "loss": 0.3723, "step": 631700 }, { - "epoch": 6.44, - "learning_rate": 3.852773003487593e-05, - "loss": 0.5253, + "epoch": 8.704637513433083, + "grad_norm": 3.027644157409668, + "learning_rate": 2.1008271222197082e-05, + "loss": 0.3173, "step": 631800 }, { - "epoch": 6.44, - "learning_rate": 3.852136987765725e-05, - "loss": 0.4507, + "epoch": 8.706015265492821, + "grad_norm": 7.656113147735596, + "learning_rate": 2.100048259041586e-05, + "loss": 0.3912, "step": 631900 }, { - "epoch": 6.44, - "learning_rate": 3.851500930379443e-05, - "loss": 0.5054, + "epoch": 8.707393017552562, + "grad_norm": 3.0780627727508545, + "learning_rate": 2.0992694510801908e-05, + "loss": 0.3499, "step": 632000 }, { - "epoch": 6.44, - "learning_rate": 3.8508648313598476e-05, - "loss": 0.518, + "epoch": 8.7087707696123, + "grad_norm": 2.0451438426971436, + "learning_rate": 2.098490698401684e-05, + "loss": 0.4192, "step": 632100 }, { - "epoch": 6.44, - "learning_rate": 3.8502286907380386e-05, - "loss": 0.5072, + "epoch": 8.71014852167204, + "grad_norm": 1.1024266481399536, + "learning_rate": 2.0977120010722256e-05, + "loss": 0.3635, "step": 632200 }, { - "epoch": 6.44, - "learning_rate": 3.849592508545121e-05, - "loss": 0.4719, + "epoch": 8.71152627373178, + "grad_norm": 5.2056660652160645, + "learning_rate": 2.0969333591579706e-05, + "loss": 0.3522, "step": 632300 }, { - "epoch": 6.44, - "learning_rate": 3.848956284812201e-05, - "loss": 0.5683, + "epoch": 8.712904025791518, + "grad_norm": 5.245372772216797, + "learning_rate": 2.0961547727250665e-05, + "loss": 0.3681, "step": 632400 }, { - "epoch": 6.44, - "learning_rate": 3.848320019570384e-05, - "loss": 0.5434, + "epoch": 8.714281777851259, + "grad_norm": 3.949108600616455, + "learning_rate": 2.09537624183966e-05, + "loss": 0.42, "step": 632500 }, { - "epoch": 6.45, - "learning_rate": 3.8476837128507805e-05, - "loss": 0.5273, + "epoch": 8.715659529910997, + "grad_norm": 31.30867576599121, + "learning_rate": 2.094597766567891e-05, + "loss": 0.3786, "step": 632600 }, { - "epoch": 6.45, - "learning_rate": 3.8470473646845035e-05, - "loss": 0.5009, + "epoch": 8.717037281970736, + "grad_norm": 1.5206201076507568, + "learning_rate": 2.0938193469758946e-05, + "loss": 0.3889, "step": 632700 }, { - "epoch": 6.45, - "learning_rate": 3.8464109751026636e-05, - "loss": 0.4959, + "epoch": 8.718415034030476, + "grad_norm": 4.9385809898376465, + "learning_rate": 2.0930409831298016e-05, + "loss": 0.3642, "step": 632800 }, { - "epoch": 6.45, - "learning_rate": 3.8457745441363775e-05, - "loss": 0.5014, + "epoch": 8.719792786090215, + "grad_norm": 2.5209903717041016, + "learning_rate": 2.0922626750957388e-05, + "loss": 0.4055, "step": 632900 }, { - "epoch": 6.45, - "learning_rate": 3.8451380718167646e-05, - "loss": 0.4881, + "epoch": 8.721170538149954, + "grad_norm": 102.56981658935547, + "learning_rate": 2.0914922051845713e-05, + "loss": 0.3687, "step": 633000 }, { - "epoch": 6.45, - "learning_rate": 3.844501558174944e-05, - "loss": 0.4613, + "epoch": 8.722548290209694, + "grad_norm": 1.3988274335861206, + "learning_rate": 2.0907217901037317e-05, + "loss": 0.4149, "step": 633100 }, { - "epoch": 6.45, - "learning_rate": 3.843865003242037e-05, - "loss": 0.5936, + "epoch": 8.723926042269433, + "grad_norm": 4.61720085144043, + "learning_rate": 2.0899436487816093e-05, + "loss": 0.3269, "step": 633200 }, { - "epoch": 6.45, - "learning_rate": 3.8432284070491665e-05, - "loss": 0.4976, + "epoch": 8.725303794329172, + "grad_norm": 2.69108510017395, + "learning_rate": 2.0891655635346514e-05, + "loss": 0.3574, "step": 633300 }, { - "epoch": 6.45, - "learning_rate": 3.8425917696274595e-05, - "loss": 0.5197, + "epoch": 8.726681546388912, + "grad_norm": 4.8942766189575195, + "learning_rate": 2.08838753442896e-05, + "loss": 0.4135, "step": 633400 }, { - "epoch": 6.45, - "learning_rate": 3.8419550910080426e-05, - "loss": 0.4852, + "epoch": 8.72805929844865, + "grad_norm": 2.1186954975128174, + "learning_rate": 2.0876095615306317e-05, + "loss": 0.3424, "step": 633500 }, { - "epoch": 6.46, - "learning_rate": 3.8413183712220464e-05, - "loss": 0.4654, + "epoch": 8.729437050508391, + "grad_norm": 1.4958524703979492, + "learning_rate": 2.0868316449057602e-05, + "loss": 0.3547, "step": 633600 }, { - "epoch": 6.46, - "learning_rate": 3.8406816103006034e-05, - "loss": 0.542, + "epoch": 8.73081480256813, + "grad_norm": 1.7743455171585083, + "learning_rate": 2.0860537846204325e-05, + "loss": 0.3844, "step": 633700 }, { - "epoch": 6.46, - "learning_rate": 3.840044808274846e-05, - "loss": 0.4455, + "epoch": 8.732192554627868, + "grad_norm": 4.088932514190674, + "learning_rate": 2.085275980740733e-05, + "loss": 0.2961, "step": 633800 }, { - "epoch": 6.46, - "learning_rate": 3.839407965175911e-05, - "loss": 0.5642, + "epoch": 8.733570306687609, + "grad_norm": 4.657464981079102, + "learning_rate": 2.0844982333327402e-05, + "loss": 0.3539, "step": 633900 }, { - "epoch": 6.46, - "learning_rate": 3.838771081034936e-05, - "loss": 0.536, + "epoch": 8.734948058747348, + "grad_norm": 3.937804698944092, + "learning_rate": 2.0837205424625268e-05, + "loss": 0.3732, "step": 634000 }, { - "epoch": 6.46, - "learning_rate": 3.838134155883059e-05, - "loss": 0.4027, + "epoch": 8.736325810807088, + "grad_norm": 1.4566566944122314, + "learning_rate": 2.082942908196162e-05, + "loss": 0.3736, "step": 634100 }, { - "epoch": 6.46, - "learning_rate": 3.837497189751425e-05, - "loss": 0.4923, + "epoch": 8.737703562866827, + "grad_norm": 4.650998592376709, + "learning_rate": 2.082165330599709e-05, + "loss": 0.3367, "step": 634200 }, { - "epoch": 6.46, - "learning_rate": 3.8368601826711766e-05, - "loss": 0.5543, + "epoch": 8.739081314926565, + "grad_norm": 1.7612947225570679, + "learning_rate": 2.0813878097392283e-05, + "loss": 0.3818, "step": 634300 }, { - "epoch": 6.46, - "learning_rate": 3.8362231346734585e-05, - "loss": 0.5271, + "epoch": 8.740459066986306, + "grad_norm": 6.239738464355469, + "learning_rate": 2.080610345680774e-05, + "loss": 0.4275, "step": 634400 }, { - "epoch": 6.46, - "learning_rate": 3.8355860457894194e-05, - "loss": 0.4994, + "epoch": 8.741836819046044, + "grad_norm": 2.9712233543395996, + "learning_rate": 2.0798329384903958e-05, + "loss": 0.3319, "step": 634500 }, { - "epoch": 6.47, - "learning_rate": 3.83494891605021e-05, - "loss": 0.585, + "epoch": 8.743214571105783, + "grad_norm": 3.7142488956451416, + "learning_rate": 2.0790555882341377e-05, + "loss": 0.3258, "step": 634600 }, { - "epoch": 6.47, - "learning_rate": 3.8343117454869814e-05, - "loss": 0.4882, + "epoch": 8.744592323165524, + "grad_norm": 0.22519977390766144, + "learning_rate": 2.0782782949780393e-05, + "loss": 0.3356, "step": 634700 }, { - "epoch": 6.47, - "learning_rate": 3.833674534130886e-05, - "loss": 0.4894, + "epoch": 8.745970075225262, + "grad_norm": 2.24440598487854, + "learning_rate": 2.0775010587881365e-05, + "loss": 0.3566, "step": 634800 }, { - "epoch": 6.47, - "learning_rate": 3.833037282013081e-05, - "loss": 0.4367, + "epoch": 8.747347827285001, + "grad_norm": 2.5549769401550293, + "learning_rate": 2.0767238797304592e-05, + "loss": 0.3664, "step": 634900 }, { - "epoch": 6.47, - "learning_rate": 3.8323999891647255e-05, - "loss": 0.4038, + "epoch": 8.748725579344741, + "grad_norm": 3.2478723526000977, + "learning_rate": 2.075946757871033e-05, + "loss": 0.3617, "step": 635000 }, { - "epoch": 6.47, - "learning_rate": 3.831762655616979e-05, - "loss": 0.4617, + "epoch": 8.75010333140448, + "grad_norm": 0.5323312282562256, + "learning_rate": 2.0751774636381554e-05, + "loss": 0.3847, "step": 635100 }, { - "epoch": 6.47, - "learning_rate": 3.831125281401e-05, - "loss": 0.4597, + "epoch": 8.75148108346422, + "grad_norm": 4.1103668212890625, + "learning_rate": 2.074400455799658e-05, + "loss": 0.3774, "step": 635200 }, { - "epoch": 6.47, - "learning_rate": 3.830487866547955e-05, - "loss": 0.56, + "epoch": 8.75285883552396, + "grad_norm": 3.3447136878967285, + "learning_rate": 2.073623505356798e-05, + "loss": 0.3973, "step": 635300 }, { - "epoch": 6.47, - "learning_rate": 3.829850411089011e-05, - "loss": 0.5664, + "epoch": 8.754236587583698, + "grad_norm": 31.8425350189209, + "learning_rate": 2.072846612375581e-05, + "loss": 0.3592, "step": 635400 }, { - "epoch": 6.47, - "learning_rate": 3.829212915055332e-05, - "loss": 0.5628, + "epoch": 8.755614339643438, + "grad_norm": 6.60775089263916, + "learning_rate": 2.0720697769220094e-05, + "loss": 0.3959, "step": 635500 }, { - "epoch": 6.48, - "learning_rate": 3.8285753784780915e-05, - "loss": 0.5534, + "epoch": 8.756992091703177, + "grad_norm": 3.9991989135742188, + "learning_rate": 2.0712929990620785e-05, + "loss": 0.3653, "step": 635600 }, { - "epoch": 6.48, - "learning_rate": 3.827937801388459e-05, - "loss": 0.4291, + "epoch": 8.758369843762916, + "grad_norm": 91.29729461669922, + "learning_rate": 2.0705162788617797e-05, + "loss": 0.3593, "step": 635700 }, { - "epoch": 6.48, - "learning_rate": 3.827300183817609e-05, - "loss": 0.5154, + "epoch": 8.759747595822656, + "grad_norm": 2.214317560195923, + "learning_rate": 2.0697396163870987e-05, + "loss": 0.2996, "step": 635800 }, { - "epoch": 6.48, - "learning_rate": 3.8266625257967176e-05, - "loss": 0.5357, + "epoch": 8.761125347882395, + "grad_norm": 5.3555474281311035, + "learning_rate": 2.0689630117040175e-05, + "loss": 0.3369, "step": 635900 }, { - "epoch": 6.48, - "learning_rate": 3.82603120454133e-05, - "loss": 0.5425, + "epoch": 8.762503099942135, + "grad_norm": 3.082096815109253, + "learning_rate": 2.0681864648785127e-05, + "loss": 0.3839, "step": 636000 }, { - "epoch": 6.48, - "learning_rate": 3.8253934661176113e-05, - "loss": 0.5378, + "epoch": 8.763880852001874, + "grad_norm": 3.0181756019592285, + "learning_rate": 2.0674099759765565e-05, + "loss": 0.3329, "step": 636100 }, { - "epoch": 6.48, - "learning_rate": 3.824755687337078e-05, - "loss": 0.6535, + "epoch": 8.765258604061613, + "grad_norm": 2.8812708854675293, + "learning_rate": 2.0666335450641147e-05, + "loss": 0.3516, "step": 636200 }, { - "epoch": 6.48, - "learning_rate": 3.8241178682309144e-05, - "loss": 0.6025, + "epoch": 8.766636356121353, + "grad_norm": 1.071608066558838, + "learning_rate": 2.0658571722071487e-05, + "loss": 0.408, "step": 636300 }, { - "epoch": 6.48, - "learning_rate": 3.823480008830305e-05, - "loss": 0.5056, + "epoch": 8.768014108181092, + "grad_norm": 31.29279327392578, + "learning_rate": 2.065080857471616e-05, + "loss": 0.3685, "step": 636400 }, { - "epoch": 6.48, - "learning_rate": 3.8228421091664366e-05, - "loss": 0.5638, + "epoch": 8.76939186024083, + "grad_norm": 6.259915351867676, + "learning_rate": 2.0643046009234683e-05, + "loss": 0.4008, "step": 636500 }, { - "epoch": 6.49, - "learning_rate": 3.8222041692705e-05, - "loss": 0.5694, + "epoch": 8.77076961230057, + "grad_norm": 2.4642298221588135, + "learning_rate": 2.0635284026286534e-05, + "loss": 0.3663, "step": 636600 }, { - "epoch": 6.49, - "learning_rate": 3.821566189173686e-05, - "loss": 0.536, + "epoch": 8.77214736436031, + "grad_norm": 9.889074325561523, + "learning_rate": 2.0627522626531134e-05, + "loss": 0.3251, "step": 636700 }, { - "epoch": 6.49, - "learning_rate": 3.8209281689071875e-05, - "loss": 0.5972, + "epoch": 8.77352511642005, + "grad_norm": 8.76566219329834, + "learning_rate": 2.0619761810627836e-05, + "loss": 0.3339, "step": 636800 }, { - "epoch": 6.49, - "learning_rate": 3.820290108502202e-05, - "loss": 0.5433, + "epoch": 8.774902868479789, + "grad_norm": 4.284940242767334, + "learning_rate": 2.0612001579235964e-05, + "loss": 0.4172, "step": 636900 }, { - "epoch": 6.49, - "learning_rate": 3.819652007989924e-05, - "loss": 0.4479, + "epoch": 8.776280620539527, + "grad_norm": 5.477663516998291, + "learning_rate": 2.0604241933014808e-05, + "loss": 0.3845, "step": 637000 }, { - "epoch": 6.49, - "learning_rate": 3.819013867401556e-05, - "loss": 0.5722, + "epoch": 8.777658372599268, + "grad_norm": 1.4679712057113647, + "learning_rate": 2.0596482872623574e-05, + "loss": 0.3244, "step": 637100 }, { - "epoch": 6.49, - "learning_rate": 3.8183756867682965e-05, - "loss": 0.5606, + "epoch": 8.779036124659006, + "grad_norm": 2.950423240661621, + "learning_rate": 2.0588724398721438e-05, + "loss": 0.2932, "step": 637200 }, { - "epoch": 6.49, - "learning_rate": 3.81773746612135e-05, - "loss": 0.5321, + "epoch": 8.780413876718745, + "grad_norm": 1.8477563858032227, + "learning_rate": 2.0580966511967535e-05, + "loss": 0.3989, "step": 637300 }, { - "epoch": 6.49, - "learning_rate": 3.8170992054919206e-05, - "loss": 0.5998, + "epoch": 8.781791628778485, + "grad_norm": 2.641486644744873, + "learning_rate": 2.0573209213020905e-05, + "loss": 0.3496, "step": 637400 }, { - "epoch": 6.49, - "learning_rate": 3.816460904911217e-05, - "loss": 0.5378, + "epoch": 8.783169380838224, + "grad_norm": 5.36055326461792, + "learning_rate": 2.05654525025406e-05, + "loss": 0.3255, "step": 637500 }, { - "epoch": 6.5, - "learning_rate": 3.815822564410448e-05, - "loss": 0.5933, + "epoch": 8.784547132897963, + "grad_norm": 3.5428483486175537, + "learning_rate": 2.055769638118558e-05, + "loss": 0.3172, "step": 637600 }, { - "epoch": 6.5, - "learning_rate": 3.8151841840208235e-05, - "loss": 0.6085, + "epoch": 8.785924884957703, + "grad_norm": 2.5028932094573975, + "learning_rate": 2.0549940849614762e-05, + "loss": 0.3935, "step": 637700 }, { - "epoch": 6.5, - "learning_rate": 3.8145457637735577e-05, - "loss": 0.5341, + "epoch": 8.787302637017442, + "grad_norm": 20.1326904296875, + "learning_rate": 2.054218590848704e-05, + "loss": 0.4184, "step": 637800 }, { - "epoch": 6.5, - "learning_rate": 3.813913688497639e-05, - "loss": 0.5314, + "epoch": 8.788680389077182, + "grad_norm": 3.7458786964416504, + "learning_rate": 2.053443155846121e-05, + "loss": 0.3261, "step": 637900 }, { - "epoch": 6.5, - "learning_rate": 3.813275189026533e-05, - "loss": 0.5003, + "epoch": 8.790058141136921, + "grad_norm": 2.482987403869629, + "learning_rate": 2.0526677800196055e-05, + "loss": 0.3527, "step": 638000 }, { - "epoch": 6.5, - "learning_rate": 3.812636649791124e-05, - "loss": 0.4893, + "epoch": 8.79143589319666, + "grad_norm": 4.323294162750244, + "learning_rate": 2.05189246343503e-05, + "loss": 0.359, "step": 638100 }, { - "epoch": 6.5, - "learning_rate": 3.811998070822633e-05, - "loss": 0.5263, + "epoch": 8.7928136452564, + "grad_norm": 6.655841827392578, + "learning_rate": 2.0511172061582603e-05, + "loss": 0.3961, "step": 638200 }, { - "epoch": 6.5, - "learning_rate": 3.811359452152281e-05, - "loss": 0.6136, + "epoch": 8.794191397316139, + "grad_norm": 3.3421835899353027, + "learning_rate": 2.0503420082551603e-05, + "loss": 0.3659, "step": 638300 }, { - "epoch": 6.5, - "learning_rate": 3.810720793811294e-05, - "loss": 0.4571, + "epoch": 8.79556914937588, + "grad_norm": 3.375495672225952, + "learning_rate": 2.0495668697915867e-05, + "loss": 0.3447, "step": 638400 }, { - "epoch": 6.51, - "learning_rate": 3.810082095830899e-05, - "loss": 0.4359, + "epoch": 8.796946901435618, + "grad_norm": 2.357154130935669, + "learning_rate": 2.04879179083339e-05, + "loss": 0.3615, "step": 638500 }, { - "epoch": 6.51, - "learning_rate": 3.809443358242323e-05, - "loss": 0.5481, + "epoch": 8.798324653495357, + "grad_norm": 0.9620901346206665, + "learning_rate": 2.048016771446418e-05, + "loss": 0.3136, "step": 638600 }, { - "epoch": 6.51, - "learning_rate": 3.808804581076797e-05, - "loss": 0.4375, + "epoch": 8.799702405555097, + "grad_norm": 3.7110488414764404, + "learning_rate": 2.0472418116965123e-05, + "loss": 0.3705, "step": 638700 }, { - "epoch": 6.51, - "learning_rate": 3.8081657643655546e-05, - "loss": 0.5736, + "epoch": 8.801080157614836, + "grad_norm": 3.417762041091919, + "learning_rate": 2.0464669116495107e-05, + "loss": 0.3978, "step": 638800 }, { - "epoch": 6.51, - "learning_rate": 3.8075269081398294e-05, - "loss": 0.4922, + "epoch": 8.802457909674574, + "grad_norm": 3.186450719833374, + "learning_rate": 2.0456920713712447e-05, + "loss": 0.3583, "step": 638900 }, { - "epoch": 6.51, - "learning_rate": 3.806888012430857e-05, - "loss": 0.4358, + "epoch": 8.803835661734315, + "grad_norm": 8.985251426696777, + "learning_rate": 2.04491729092754e-05, + "loss": 0.3596, "step": 639000 }, { - "epoch": 6.51, - "learning_rate": 3.806249077269877e-05, - "loss": 0.5477, + "epoch": 8.805213413794053, + "grad_norm": 2.134378671646118, + "learning_rate": 2.0441425703842182e-05, + "loss": 0.318, "step": 639100 }, { - "epoch": 6.51, - "learning_rate": 3.805610102688127e-05, - "loss": 0.5541, + "epoch": 8.806591165853792, + "grad_norm": 4.261523723602295, + "learning_rate": 2.0433679098070955e-05, + "loss": 0.357, "step": 639200 }, { - "epoch": 6.51, - "learning_rate": 3.80497108871685e-05, - "loss": 0.5283, + "epoch": 8.807968917913533, + "grad_norm": 1.3818753957748413, + "learning_rate": 2.0425933092619848e-05, + "loss": 0.3662, "step": 639300 }, { - "epoch": 6.51, - "learning_rate": 3.804332035387293e-05, - "loss": 0.5051, + "epoch": 8.809346669973271, + "grad_norm": 4.6248674392700195, + "learning_rate": 2.041818768814692e-05, + "loss": 0.3433, "step": 639400 }, { - "epoch": 6.52, - "learning_rate": 3.803692942730698e-05, - "loss": 0.568, + "epoch": 8.810724422033012, + "grad_norm": 3.4297780990600586, + "learning_rate": 2.0410442885310178e-05, + "loss": 0.3308, "step": 639500 }, { - "epoch": 6.52, - "learning_rate": 3.803053810778314e-05, - "loss": 0.547, + "epoch": 8.81210217409275, + "grad_norm": 1.6342157125473022, + "learning_rate": 2.0402698684767576e-05, + "loss": 0.3548, "step": 639600 }, { - "epoch": 6.52, - "learning_rate": 3.802414639561392e-05, - "loss": 0.5457, + "epoch": 8.813479926152489, + "grad_norm": 2.110055685043335, + "learning_rate": 2.0394955087177028e-05, + "loss": 0.348, "step": 639700 }, { - "epoch": 6.52, - "learning_rate": 3.801775429111182e-05, - "loss": 0.5237, + "epoch": 8.81485767821223, + "grad_norm": 1.732565999031067, + "learning_rate": 2.03872120931964e-05, + "loss": 0.3287, "step": 639800 }, { - "epoch": 6.52, - "learning_rate": 3.8011361794589383e-05, - "loss": 0.5376, + "epoch": 8.816235430271968, + "grad_norm": 14.913119316101074, + "learning_rate": 2.037946970348349e-05, + "loss": 0.3657, "step": 639900 }, { - "epoch": 6.52, - "learning_rate": 3.800496890635916e-05, - "loss": 0.5043, + "epoch": 8.817613182331707, + "grad_norm": 3.0401253700256348, + "learning_rate": 2.0371805333547403e-05, + "loss": 0.343, "step": 640000 }, { - "epoch": 6.52, - "learning_rate": 3.799857562673374e-05, - "loss": 0.5146, + "epoch": 8.818990934391447, + "grad_norm": 8.204343795776367, + "learning_rate": 2.0364064148284063e-05, + "loss": 0.386, "step": 640100 }, { - "epoch": 6.52, - "learning_rate": 3.7992245894667615e-05, - "loss": 0.4985, + "epoch": 8.820368686451186, + "grad_norm": 2.843414306640625, + "learning_rate": 2.0356323569254984e-05, + "loss": 0.403, "step": 640200 }, { - "epoch": 6.52, - "learning_rate": 3.7985851837095724e-05, - "loss": 0.5517, + "epoch": 8.821746438510926, + "grad_norm": 1.1177184581756592, + "learning_rate": 2.0348583597117767e-05, + "loss": 0.354, "step": 640300 }, { - "epoch": 6.52, - "learning_rate": 3.797945738906334e-05, - "loss": 0.5854, + "epoch": 8.823124190570665, + "grad_norm": 1.9453452825546265, + "learning_rate": 2.0340844232529952e-05, + "loss": 0.3325, "step": 640400 }, { - "epoch": 6.53, - "learning_rate": 3.79730625508831e-05, - "loss": 0.4775, + "epoch": 8.824501942630404, + "grad_norm": 1.9291720390319824, + "learning_rate": 2.033310547614905e-05, + "loss": 0.4021, "step": 640500 }, { - "epoch": 6.53, - "learning_rate": 3.796666732286768e-05, - "loss": 0.6324, + "epoch": 8.825879694690144, + "grad_norm": 18.277000427246094, + "learning_rate": 2.0325367328632514e-05, + "loss": 0.3942, "step": 640600 }, { - "epoch": 6.53, - "learning_rate": 3.796027170532978e-05, - "loss": 0.5578, + "epoch": 8.827257446749883, + "grad_norm": 3.3130598068237305, + "learning_rate": 2.031762979063772e-05, + "loss": 0.3686, "step": 640700 }, { - "epoch": 6.53, - "learning_rate": 3.795393966057513e-05, - "loss": 0.4782, + "epoch": 8.828635198809621, + "grad_norm": 2.889070510864258, + "learning_rate": 2.030989286282202e-05, + "loss": 0.3175, "step": 640800 }, { - "epoch": 6.53, - "learning_rate": 3.794754326881782e-05, - "loss": 0.4901, + "epoch": 8.830012950869362, + "grad_norm": 3.128248453140259, + "learning_rate": 2.0302156545842698e-05, + "loss": 0.3889, "step": 640900 }, { - "epoch": 6.53, - "learning_rate": 3.794114648847306e-05, - "loss": 0.4893, + "epoch": 8.8313907029291, + "grad_norm": 2.4471993446350098, + "learning_rate": 2.0294420840357008e-05, + "loss": 0.3585, "step": 641000 }, { - "epoch": 6.53, - "learning_rate": 3.793474931985363e-05, - "loss": 0.5765, + "epoch": 8.832768454988841, + "grad_norm": 2.7810728549957275, + "learning_rate": 2.028668574702214e-05, + "loss": 0.347, "step": 641100 }, { - "epoch": 6.53, - "learning_rate": 3.792835176327231e-05, - "loss": 0.5445, + "epoch": 8.83414620704858, + "grad_norm": 3.0514719486236572, + "learning_rate": 2.0278951266495213e-05, + "loss": 0.3781, "step": 641200 }, { - "epoch": 6.53, - "learning_rate": 3.7921953819041895e-05, - "loss": 0.5603, + "epoch": 8.835523959108318, + "grad_norm": 18.44270896911621, + "learning_rate": 2.0271217399433314e-05, + "loss": 0.3712, "step": 641300 }, { - "epoch": 6.53, - "learning_rate": 3.791555548747523e-05, - "loss": 0.5566, + "epoch": 8.836901711168059, + "grad_norm": 6.6126532554626465, + "learning_rate": 2.026348414649348e-05, + "loss": 0.383, "step": 641400 }, { - "epoch": 6.54, - "learning_rate": 3.790915676888514e-05, - "loss": 0.5261, + "epoch": 8.838279463227797, + "grad_norm": 2.161134719848633, + "learning_rate": 2.0255751508332694e-05, + "loss": 0.4232, "step": 641500 }, { - "epoch": 6.54, - "learning_rate": 3.790275766358447e-05, - "loss": 0.4711, + "epoch": 8.839657215287536, + "grad_norm": 0.09722273051738739, + "learning_rate": 2.0248019485607877e-05, + "loss": 0.3768, "step": 641600 }, { - "epoch": 6.54, - "learning_rate": 3.789635817188612e-05, - "loss": 0.4729, + "epoch": 8.841034967347277, + "grad_norm": 1.4188485145568848, + "learning_rate": 2.0240288078975913e-05, + "loss": 0.3404, "step": 641700 }, { - "epoch": 6.54, - "learning_rate": 3.788995829410297e-05, - "loss": 0.503, + "epoch": 8.842412719407015, + "grad_norm": 2.2287075519561768, + "learning_rate": 2.023255728909361e-05, + "loss": 0.3726, "step": 641800 }, { - "epoch": 6.54, - "learning_rate": 3.788355803054795e-05, - "loss": 0.5381, + "epoch": 8.843790471466754, + "grad_norm": 2.5462350845336914, + "learning_rate": 2.0224827116617738e-05, + "loss": 0.4319, "step": 641900 }, { - "epoch": 6.54, - "learning_rate": 3.787715738153399e-05, - "loss": 0.5422, + "epoch": 8.845168223526494, + "grad_norm": 17.24924659729004, + "learning_rate": 2.021709756220503e-05, + "loss": 0.3561, "step": 642000 }, { - "epoch": 6.54, - "learning_rate": 3.7870756347374036e-05, - "loss": 0.5047, + "epoch": 8.846545975586233, + "grad_norm": 2.118520736694336, + "learning_rate": 2.020936862651214e-05, + "loss": 0.3184, "step": 642100 }, { - "epoch": 6.54, - "learning_rate": 3.786435492838108e-05, - "loss": 0.56, + "epoch": 8.847923727645973, + "grad_norm": 2.4389121532440186, + "learning_rate": 2.0201640310195696e-05, + "loss": 0.3698, "step": 642200 }, { - "epoch": 6.54, - "learning_rate": 3.785795312486809e-05, - "loss": 0.4611, + "epoch": 8.849301479705712, + "grad_norm": 1.0755431652069092, + "learning_rate": 2.0193989887803747e-05, + "loss": 0.4169, "step": 642300 }, { - "epoch": 6.54, - "learning_rate": 3.785155093714808e-05, - "loss": 0.5796, + "epoch": 8.85067923176545, + "grad_norm": 0.5344855785369873, + "learning_rate": 2.0186262805999645e-05, + "loss": 0.385, "step": 642400 }, { - "epoch": 6.55, - "learning_rate": 3.784514836553409e-05, - "loss": 0.5225, + "epoch": 8.852056983825191, + "grad_norm": 1.9990497827529907, + "learning_rate": 2.0178536345534926e-05, + "loss": 0.3362, "step": 642500 }, { - "epoch": 6.55, - "learning_rate": 3.783874541033917e-05, - "loss": 0.46, + "epoch": 8.85343473588493, + "grad_norm": 3.2231595516204834, + "learning_rate": 2.0170810507066012e-05, + "loss": 0.3497, "step": 642600 }, { - "epoch": 6.55, - "learning_rate": 3.783234207187638e-05, - "loss": 0.6001, + "epoch": 8.85481248794467, + "grad_norm": 69.06319427490234, + "learning_rate": 2.0163085291249235e-05, + "loss": 0.3334, "step": 642700 }, { - "epoch": 6.55, - "learning_rate": 3.78259383504588e-05, - "loss": 0.5549, + "epoch": 8.856190240004409, + "grad_norm": 13.277430534362793, + "learning_rate": 2.0155360698740905e-05, + "loss": 0.3377, "step": 642800 }, { - "epoch": 6.55, - "learning_rate": 3.781953424639953e-05, - "loss": 0.5002, + "epoch": 8.857567992064148, + "grad_norm": 4.476804733276367, + "learning_rate": 2.0147636730197256e-05, + "loss": 0.3558, "step": 642900 }, { - "epoch": 6.55, - "learning_rate": 3.781312976001171e-05, - "loss": 0.4642, + "epoch": 8.858945744123888, + "grad_norm": 9.009331703186035, + "learning_rate": 2.013991338627447e-05, + "loss": 0.3262, "step": 643000 }, { - "epoch": 6.55, - "learning_rate": 3.780672489160846e-05, - "loss": 0.5741, + "epoch": 8.860323496183627, + "grad_norm": 2.8945515155792236, + "learning_rate": 2.0132267891717896e-05, + "loss": 0.3657, "step": 643100 }, { - "epoch": 6.55, - "learning_rate": 3.7800319641502954e-05, - "loss": 0.4785, + "epoch": 8.861701248243365, + "grad_norm": 3.7373850345611572, + "learning_rate": 2.0124545792742638e-05, + "loss": 0.355, "step": 643200 }, { - "epoch": 6.55, - "learning_rate": 3.7793914010008374e-05, - "loss": 0.5692, + "epoch": 8.863079000303106, + "grad_norm": 2.566220760345459, + "learning_rate": 2.011682432034997e-05, + "loss": 0.3767, "step": 643300 }, { - "epoch": 6.56, - "learning_rate": 3.778750799743791e-05, - "loss": 0.5426, + "epoch": 8.864456752362845, + "grad_norm": 9.89470386505127, + "learning_rate": 2.0109103475195833e-05, + "loss": 0.407, "step": 643400 }, { - "epoch": 6.56, - "learning_rate": 3.778110160410477e-05, - "loss": 0.5421, + "epoch": 8.865834504422583, + "grad_norm": 10.400753021240234, + "learning_rate": 2.010138325793617e-05, + "loss": 0.3621, "step": 643500 }, { - "epoch": 6.56, - "learning_rate": 3.77746948303222e-05, - "loss": 0.4852, + "epoch": 8.867212256482324, + "grad_norm": 3.108999729156494, + "learning_rate": 2.009366366922685e-05, + "loss": 0.3685, "step": 643600 }, { - "epoch": 6.56, - "learning_rate": 3.776835174982329e-05, - "loss": 0.4544, + "epoch": 8.868590008542062, + "grad_norm": 3.425283193588257, + "learning_rate": 2.0085944709723686e-05, + "loss": 0.3849, "step": 643700 }, { - "epoch": 6.56, - "learning_rate": 3.77619442198783e-05, - "loss": 0.499, + "epoch": 8.869967760601803, + "grad_norm": 1.8222777843475342, + "learning_rate": 2.007822638008246e-05, + "loss": 0.3713, "step": 643800 }, { - "epoch": 6.56, - "learning_rate": 3.775553631042057e-05, - "loss": 0.4793, + "epoch": 8.871345512661541, + "grad_norm": 2.6649415493011475, + "learning_rate": 2.0070508680958872e-05, + "loss": 0.353, "step": 643900 }, { - "epoch": 6.56, - "learning_rate": 3.77491280217634e-05, - "loss": 0.5337, + "epoch": 8.87272326472128, + "grad_norm": 28.03338623046875, + "learning_rate": 2.0062791613008575e-05, + "loss": 0.4077, "step": 644000 }, { - "epoch": 6.56, - "learning_rate": 3.774271935422011e-05, - "loss": 0.5972, + "epoch": 8.87410101678102, + "grad_norm": 5.776099681854248, + "learning_rate": 2.0055075176887183e-05, + "loss": 0.3818, "step": 644100 }, { - "epoch": 6.56, - "learning_rate": 3.7736310308104054e-05, - "loss": 0.4977, + "epoch": 8.87547876884076, + "grad_norm": 4.197676181793213, + "learning_rate": 2.0047359373250233e-05, + "loss": 0.3454, "step": 644200 }, { - "epoch": 6.56, - "learning_rate": 3.772990088372859e-05, - "loss": 0.6456, + "epoch": 8.876856520900498, + "grad_norm": 24.748563766479492, + "learning_rate": 2.003964420275324e-05, + "loss": 0.3467, "step": 644300 }, { - "epoch": 6.57, - "learning_rate": 3.77234910814071e-05, - "loss": 0.5815, + "epoch": 8.878234272960238, + "grad_norm": 22.444381713867188, + "learning_rate": 2.0031929666051646e-05, + "loss": 0.3601, "step": 644400 }, { - "epoch": 6.57, - "learning_rate": 3.771708090145298e-05, - "loss": 0.5494, + "epoch": 8.879612025019977, + "grad_norm": 4.238951683044434, + "learning_rate": 2.0024215763800825e-05, + "loss": 0.3603, "step": 644500 }, { - "epoch": 6.57, - "learning_rate": 3.771067034417967e-05, - "loss": 0.5426, + "epoch": 8.880989777079717, + "grad_norm": 3.1222634315490723, + "learning_rate": 2.001650249665612e-05, + "loss": 0.3436, "step": 644600 }, { - "epoch": 6.57, - "learning_rate": 3.7704259409900594e-05, - "loss": 0.6046, + "epoch": 8.882367529139456, + "grad_norm": 4.98201322555542, + "learning_rate": 2.000878986527281e-05, + "loss": 0.3781, "step": 644700 }, { - "epoch": 6.57, - "learning_rate": 3.76978480989292e-05, - "loss": 0.4872, + "epoch": 8.883745281199195, + "grad_norm": 4.639926910400391, + "learning_rate": 2.000107787030613e-05, + "loss": 0.3801, "step": 644800 }, { - "epoch": 6.57, - "learning_rate": 3.769143641157898e-05, - "loss": 0.4802, + "epoch": 8.885123033258935, + "grad_norm": 4.252802848815918, + "learning_rate": 1.9993366512411248e-05, + "loss": 0.3545, "step": 644900 }, { - "epoch": 6.57, - "learning_rate": 3.768502434816342e-05, - "loss": 0.5041, + "epoch": 8.886500785318674, + "grad_norm": 2.558443546295166, + "learning_rate": 1.9985655792243293e-05, + "loss": 0.405, "step": 645000 }, { - "epoch": 6.57, - "learning_rate": 3.767861190899603e-05, - "loss": 0.5784, + "epoch": 8.887878537378413, + "grad_norm": 1.008420467376709, + "learning_rate": 1.997794571045731e-05, + "loss": 0.3637, "step": 645100 }, { - "epoch": 6.57, - "learning_rate": 3.767219909439034e-05, - "loss": 0.5663, + "epoch": 8.889256289438153, + "grad_norm": 17.324047088623047, + "learning_rate": 1.9970236267708326e-05, + "loss": 0.3195, "step": 645200 }, { - "epoch": 6.57, - "learning_rate": 3.766578590465989e-05, - "loss": 0.5911, + "epoch": 8.890634041497892, + "grad_norm": 2.8452277183532715, + "learning_rate": 1.9962527464651288e-05, + "loss": 0.4106, "step": 645300 }, { - "epoch": 6.58, - "learning_rate": 3.765937234011827e-05, - "loss": 0.5324, + "epoch": 8.892011793557632, + "grad_norm": 2.7887914180755615, + "learning_rate": 1.9954819301941112e-05, + "loss": 0.3562, "step": 645400 }, { - "epoch": 6.58, - "learning_rate": 3.765295840107906e-05, - "loss": 0.4894, + "epoch": 8.89338954561737, + "grad_norm": 11.319156646728516, + "learning_rate": 1.9947111780232637e-05, + "loss": 0.3825, "step": 645500 }, { - "epoch": 6.58, - "learning_rate": 3.7646544087855836e-05, - "loss": 0.4647, + "epoch": 8.89476729767711, + "grad_norm": 1.8130007982254028, + "learning_rate": 1.993940490018065e-05, + "loss": 0.3958, "step": 645600 }, { - "epoch": 6.58, - "learning_rate": 3.764012940076224e-05, - "loss": 0.5204, + "epoch": 8.89614504973685, + "grad_norm": 5.498356819152832, + "learning_rate": 1.9931698662439896e-05, + "loss": 0.3622, "step": 645700 }, { - "epoch": 6.58, - "learning_rate": 3.763371434011191e-05, - "loss": 0.5444, + "epoch": 8.897522801796589, + "grad_norm": 2.856766939163208, + "learning_rate": 1.9923993067665063e-05, + "loss": 0.34, "step": 645800 }, { - "epoch": 6.58, - "learning_rate": 3.762729890621851e-05, - "loss": 0.5093, + "epoch": 8.898900553856327, + "grad_norm": 1.9779382944107056, + "learning_rate": 1.991628811651078e-05, + "loss": 0.3917, "step": 645900 }, { - "epoch": 6.58, - "learning_rate": 3.7620883099395716e-05, - "loss": 0.4852, + "epoch": 8.900278305916068, + "grad_norm": 2.144191265106201, + "learning_rate": 1.9908583809631612e-05, + "loss": 0.3565, "step": 646000 }, { - "epoch": 6.58, - "learning_rate": 3.761446691995721e-05, - "loss": 0.5114, + "epoch": 8.901656057975806, + "grad_norm": 4.26161003112793, + "learning_rate": 1.99008801476821e-05, + "loss": 0.3507, "step": 646100 }, { - "epoch": 6.58, - "learning_rate": 3.7608050368216715e-05, - "loss": 0.458, + "epoch": 8.903033810035547, + "grad_norm": 0.9047802090644836, + "learning_rate": 1.9893177131316685e-05, + "loss": 0.3669, "step": 646200 }, { - "epoch": 6.58, - "learning_rate": 3.760163344448795e-05, - "loss": 0.4893, + "epoch": 8.904411562095286, + "grad_norm": 2.501375675201416, + "learning_rate": 1.988547476118979e-05, + "loss": 0.3704, "step": 646300 }, { - "epoch": 6.59, - "learning_rate": 3.759521614908468e-05, - "loss": 0.5187, + "epoch": 8.905789314155024, + "grad_norm": 4.128477573394775, + "learning_rate": 1.987777303795577e-05, + "loss": 0.3601, "step": 646400 }, { - "epoch": 6.59, - "learning_rate": 3.7588798482320675e-05, - "loss": 0.5397, + "epoch": 8.907167066214765, + "grad_norm": 5.648437023162842, + "learning_rate": 1.9870071962268926e-05, + "loss": 0.3672, "step": 646500 }, { - "epoch": 6.59, - "learning_rate": 3.7582380444509704e-05, - "loss": 0.4794, + "epoch": 8.908544818274503, + "grad_norm": 3.3617026805877686, + "learning_rate": 1.9862371534783517e-05, + "loss": 0.3645, "step": 646600 }, { - "epoch": 6.59, - "learning_rate": 3.757596203596558e-05, - "loss": 0.6186, + "epoch": 8.909922570334242, + "grad_norm": 1.1747711896896362, + "learning_rate": 1.985474875072602e-05, + "loss": 0.3421, "step": 646700 }, { - "epoch": 6.59, - "learning_rate": 3.756954325700213e-05, - "loss": 0.4716, + "epoch": 8.911300322393982, + "grad_norm": 10.259430885314941, + "learning_rate": 1.9847126603246615e-05, + "loss": 0.4167, "step": 646800 }, { - "epoch": 6.59, - "learning_rate": 3.756312410793317e-05, - "loss": 0.5347, + "epoch": 8.912678074453721, + "grad_norm": 3.7096409797668457, + "learning_rate": 1.9839428111280693e-05, + "loss": 0.3456, "step": 646900 }, { - "epoch": 6.59, - "learning_rate": 3.755670458907259e-05, - "loss": 0.5501, + "epoch": 8.914055826513462, + "grad_norm": 4.699954032897949, + "learning_rate": 1.983173027011954e-05, + "loss": 0.3976, "step": 647000 }, { - "epoch": 6.59, - "learning_rate": 3.7550284700734245e-05, - "loss": 0.5767, + "epoch": 8.9154335785732, + "grad_norm": 1.1032865047454834, + "learning_rate": 1.982403308041714e-05, + "loss": 0.3781, "step": 647100 }, { - "epoch": 6.59, - "learning_rate": 3.754386444323203e-05, - "loss": 0.4323, + "epoch": 8.916811330632939, + "grad_norm": 2.5502681732177734, + "learning_rate": 1.98163365428274e-05, + "loss": 0.3834, "step": 647200 }, { - "epoch": 6.59, - "learning_rate": 3.753744381687988e-05, - "loss": 0.5423, + "epoch": 8.91818908269268, + "grad_norm": 4.178029537200928, + "learning_rate": 1.9808640658004177e-05, + "loss": 0.3739, "step": 647300 }, { - "epoch": 6.6, - "learning_rate": 3.7531022821991696e-05, - "loss": 0.5501, + "epoch": 8.919566834752418, + "grad_norm": 2.408040761947632, + "learning_rate": 1.980094542660128e-05, + "loss": 0.3741, "step": 647400 }, { - "epoch": 6.6, - "learning_rate": 3.752460145888145e-05, - "loss": 0.5642, + "epoch": 8.920944586812157, + "grad_norm": 3.5056746006011963, + "learning_rate": 1.979325084927245e-05, + "loss": 0.2945, "step": 647500 }, { - "epoch": 6.6, - "learning_rate": 3.751817972786309e-05, - "loss": 0.5593, + "epoch": 8.922322338871897, + "grad_norm": 15.244461059570312, + "learning_rate": 1.9785556926671394e-05, + "loss": 0.4315, "step": 647600 }, { - "epoch": 6.6, - "learning_rate": 3.7511757629250596e-05, - "loss": 0.5147, + "epoch": 8.923700090931636, + "grad_norm": 4.473372936248779, + "learning_rate": 1.977786365945175e-05, + "loss": 0.374, "step": 647700 }, { - "epoch": 6.6, - "learning_rate": 3.750533516335799e-05, - "loss": 0.5679, + "epoch": 8.925077842991374, + "grad_norm": 2.718208074569702, + "learning_rate": 1.9770171048267088e-05, + "loss": 0.3505, "step": 647800 }, { - "epoch": 6.6, - "learning_rate": 3.7498912330499294e-05, - "loss": 0.5182, + "epoch": 8.926455595051115, + "grad_norm": 1.6715748310089111, + "learning_rate": 1.976247909377094e-05, + "loss": 0.3496, "step": 647900 }, { - "epoch": 6.6, - "learning_rate": 3.7492489130988526e-05, - "loss": 0.4729, + "epoch": 8.927833347110854, + "grad_norm": 2.0373291969299316, + "learning_rate": 1.9754787796616774e-05, + "loss": 0.3915, "step": 648000 }, { - "epoch": 6.6, - "learning_rate": 3.748606556513976e-05, - "loss": 0.4973, + "epoch": 8.929211099170594, + "grad_norm": 0.7419623136520386, + "learning_rate": 1.9747097157458015e-05, + "loss": 0.3486, "step": 648100 }, { - "epoch": 6.6, - "learning_rate": 3.747964163326706e-05, - "loss": 0.5247, + "epoch": 8.930588851230333, + "grad_norm": 1.6946591138839722, + "learning_rate": 1.9739407176948014e-05, + "loss": 0.3539, "step": 648200 }, { - "epoch": 6.6, - "learning_rate": 3.7473217335684516e-05, - "loss": 0.6206, + "epoch": 8.931966603290071, + "grad_norm": 4.420561790466309, + "learning_rate": 1.973171785574008e-05, + "loss": 0.3821, "step": 648300 }, { - "epoch": 6.61, - "learning_rate": 3.74668569211437e-05, - "loss": 0.4371, + "epoch": 8.933344355349812, + "grad_norm": 0.7315015196800232, + "learning_rate": 1.9724029194487457e-05, + "loss": 0.357, "step": 648400 }, { - "epoch": 6.61, - "learning_rate": 3.746043189673309e-05, - "loss": 0.531, + "epoch": 8.93472210740955, + "grad_norm": 2.912536382675171, + "learning_rate": 1.9716341193843322e-05, + "loss": 0.3878, "step": 648500 }, { - "epoch": 6.61, - "learning_rate": 3.745400650755187e-05, - "loss": 0.5494, + "epoch": 8.936099859469289, + "grad_norm": 2.9838454723358154, + "learning_rate": 1.9708653854460838e-05, + "loss": 0.3609, "step": 648600 }, { - "epoch": 6.61, - "learning_rate": 3.744758075391424e-05, - "loss": 0.5041, + "epoch": 8.93747761152903, + "grad_norm": 16.680017471313477, + "learning_rate": 1.9700967176993063e-05, + "loss": 0.3641, "step": 648700 }, { - "epoch": 6.61, - "learning_rate": 3.744115463613433e-05, - "loss": 0.5318, + "epoch": 8.938855363588768, + "grad_norm": 0.606227695941925, + "learning_rate": 1.9693281162093034e-05, + "loss": 0.3477, "step": 648800 }, { - "epoch": 6.61, - "learning_rate": 3.7434728154526375e-05, - "loss": 0.5787, + "epoch": 8.940233115648509, + "grad_norm": 2.8133199214935303, + "learning_rate": 1.9685595810413703e-05, + "loss": 0.3109, "step": 648900 }, { - "epoch": 6.61, - "learning_rate": 3.742830130940457e-05, - "loss": 0.5133, + "epoch": 8.941610867708247, + "grad_norm": 2.7305634021759033, + "learning_rate": 1.9677911122607982e-05, + "loss": 0.4273, "step": 649000 }, { - "epoch": 6.61, - "learning_rate": 3.742187410108318e-05, - "loss": 0.4797, + "epoch": 8.942988619767986, + "grad_norm": 4.37584114074707, + "learning_rate": 1.9670227099328732e-05, + "loss": 0.3321, "step": 649100 }, { - "epoch": 6.61, - "learning_rate": 3.741544652987643e-05, - "loss": 0.5597, + "epoch": 8.944366371827726, + "grad_norm": 3.619100570678711, + "learning_rate": 1.966254374122875e-05, + "loss": 0.3497, "step": 649200 }, { - "epoch": 6.62, - "learning_rate": 3.74090185960986e-05, - "loss": 0.5025, + "epoch": 8.945744123887465, + "grad_norm": 1.2310850620269775, + "learning_rate": 1.9654861048960758e-05, + "loss": 0.339, "step": 649300 }, { - "epoch": 6.62, - "learning_rate": 3.7402590300063976e-05, - "loss": 0.5317, + "epoch": 8.947121875947204, + "grad_norm": 5.259525775909424, + "learning_rate": 1.9647179023177484e-05, + "loss": 0.3635, "step": 649400 }, { - "epoch": 6.62, - "learning_rate": 3.7396161642086854e-05, - "loss": 0.5667, + "epoch": 8.948499628006944, + "grad_norm": 2.097104787826538, + "learning_rate": 1.9639497664531505e-05, + "loss": 0.3272, "step": 649500 }, { - "epoch": 6.62, - "learning_rate": 3.7389732622481565e-05, - "loss": 0.4278, + "epoch": 8.949877380066683, + "grad_norm": 25.21429443359375, + "learning_rate": 1.963181697367542e-05, + "loss": 0.3521, "step": 649600 }, { - "epoch": 6.62, - "learning_rate": 3.738330324156246e-05, - "loss": 0.5121, + "epoch": 8.951255132126423, + "grad_norm": 4.1588921546936035, + "learning_rate": 1.9624136951261736e-05, + "loss": 0.3512, "step": 649700 }, { - "epoch": 6.62, - "learning_rate": 3.737687349964388e-05, - "loss": 0.4946, + "epoch": 8.952632884186162, + "grad_norm": 6.7349982261657715, + "learning_rate": 1.9616457597942906e-05, + "loss": 0.4332, "step": 649800 }, { - "epoch": 6.62, - "learning_rate": 3.737044339704022e-05, - "loss": 0.4955, + "epoch": 8.9540106362459, + "grad_norm": 2.080474376678467, + "learning_rate": 1.960877891437135e-05, + "loss": 0.3255, "step": 649900 }, { - "epoch": 6.62, - "learning_rate": 3.736401293406586e-05, - "loss": 0.5424, + "epoch": 8.955388388305641, + "grad_norm": 2.8107616901397705, + "learning_rate": 1.9601100901199377e-05, + "loss": 0.3393, "step": 650000 }, { - "epoch": 6.62, - "learning_rate": 3.7357582111035205e-05, - "loss": 0.4217, + "epoch": 8.95676614036538, + "grad_norm": 4.335209369659424, + "learning_rate": 1.9593423559079307e-05, + "loss": 0.2934, "step": 650100 }, { - "epoch": 6.62, - "learning_rate": 3.7351150928262694e-05, - "loss": 0.4857, + "epoch": 8.958143892425118, + "grad_norm": 2.77384090423584, + "learning_rate": 1.9585746888663346e-05, + "loss": 0.3469, "step": 650200 }, { - "epoch": 6.63, - "learning_rate": 3.734471938606278e-05, - "loss": 0.4527, + "epoch": 8.959521644484859, + "grad_norm": 4.639960289001465, + "learning_rate": 1.9578070890603674e-05, + "loss": 0.4201, "step": 650300 }, { - "epoch": 6.63, - "learning_rate": 3.733828748474991e-05, - "loss": 0.5265, + "epoch": 8.960899396544598, + "grad_norm": 1.8101547956466675, + "learning_rate": 1.9570395565552418e-05, + "loss": 0.3313, "step": 650400 }, { - "epoch": 6.63, - "learning_rate": 3.7331855224638583e-05, - "loss": 0.5047, + "epoch": 8.962277148604338, + "grad_norm": 4.467845439910889, + "learning_rate": 1.956272091416163e-05, + "loss": 0.3304, "step": 650500 }, { - "epoch": 6.63, - "learning_rate": 3.732542260604328e-05, - "loss": 0.4972, + "epoch": 8.963654900664077, + "grad_norm": 0.37650877237319946, + "learning_rate": 1.9555046937083298e-05, + "loss": 0.349, "step": 650600 }, { - "epoch": 6.63, - "learning_rate": 3.731898962927853e-05, - "loss": 0.4476, + "epoch": 8.965032652723815, + "grad_norm": 3.261247158050537, + "learning_rate": 1.9547373634969377e-05, + "loss": 0.3851, "step": 650700 }, { - "epoch": 6.63, - "learning_rate": 3.731262062977542e-05, - "loss": 0.5041, + "epoch": 8.966410404783556, + "grad_norm": 2.061296224594116, + "learning_rate": 1.9539701008471744e-05, + "loss": 0.3765, "step": 650800 }, { - "epoch": 6.63, - "learning_rate": 3.730618694118923e-05, - "loss": 0.5415, + "epoch": 8.967788156843294, + "grad_norm": 2.228896379470825, + "learning_rate": 1.953202905824224e-05, + "loss": 0.3305, "step": 650900 }, { - "epoch": 6.63, - "learning_rate": 3.729975289537411e-05, - "loss": 0.5246, + "epoch": 8.969165908903033, + "grad_norm": 8.218269348144531, + "learning_rate": 1.952435778493264e-05, + "loss": 0.3627, "step": 651000 }, { - "epoch": 6.63, - "learning_rate": 3.729331849264464e-05, - "loss": 0.5812, + "epoch": 8.970543660962774, + "grad_norm": 2.1343777179718018, + "learning_rate": 1.9516687189194645e-05, + "loss": 0.3877, "step": 651100 }, { - "epoch": 6.63, - "learning_rate": 3.728688373331541e-05, - "loss": 0.5537, + "epoch": 8.971921413022512, + "grad_norm": 40.226924896240234, + "learning_rate": 1.950901727167991e-05, + "loss": 0.3208, "step": 651200 }, { - "epoch": 6.64, - "learning_rate": 3.728044861770106e-05, - "loss": 0.5339, + "epoch": 8.973299165082253, + "grad_norm": 9.179095268249512, + "learning_rate": 1.9501348033040035e-05, + "loss": 0.3757, "step": 651300 }, { - "epoch": 6.64, - "learning_rate": 3.727401314611622e-05, - "loss": 0.5037, + "epoch": 8.974676917141991, + "grad_norm": 1.7589216232299805, + "learning_rate": 1.9493679473926574e-05, + "loss": 0.338, "step": 651400 }, { - "epoch": 6.64, - "learning_rate": 3.726757731887554e-05, - "loss": 0.5523, + "epoch": 8.97605466920173, + "grad_norm": 1.6015539169311523, + "learning_rate": 1.9486011594991e-05, + "loss": 0.3935, "step": 651500 }, { - "epoch": 6.64, - "learning_rate": 3.72611411362937e-05, - "loss": 0.5695, + "epoch": 8.97743242126147, + "grad_norm": 2.482551336288452, + "learning_rate": 1.9478344396884748e-05, + "loss": 0.3721, "step": 651600 }, { - "epoch": 6.64, - "learning_rate": 3.725470459868539e-05, - "loss": 0.4919, + "epoch": 8.97881017332121, + "grad_norm": 4.991109371185303, + "learning_rate": 1.947067788025917e-05, + "loss": 0.4536, "step": 651700 }, { - "epoch": 6.64, - "learning_rate": 3.724826770636532e-05, - "loss": 0.5418, + "epoch": 8.980187925380948, + "grad_norm": 31.218257904052734, + "learning_rate": 1.946301204576558e-05, + "loss": 0.3648, "step": 651800 }, { - "epoch": 6.64, - "learning_rate": 3.7241830459648224e-05, - "loss": 0.4592, + "epoch": 8.981565677440688, + "grad_norm": 4.4216437339782715, + "learning_rate": 1.9455346894055235e-05, + "loss": 0.3503, "step": 651900 }, { - "epoch": 6.64, - "learning_rate": 3.723545723660849e-05, - "loss": 0.483, + "epoch": 8.982943429500427, + "grad_norm": 12.041495323181152, + "learning_rate": 1.9447682425779333e-05, + "loss": 0.4132, "step": 652000 }, { - "epoch": 6.64, - "learning_rate": 3.7229019285577684e-05, - "loss": 0.6228, + "epoch": 8.984321181560166, + "grad_norm": 4.941658973693848, + "learning_rate": 1.9440018641589e-05, + "loss": 0.322, "step": 652100 }, { - "epoch": 6.64, - "learning_rate": 3.7222580981090965e-05, - "loss": 0.5042, + "epoch": 8.985698933619906, + "grad_norm": 1.1174205541610718, + "learning_rate": 1.9432355542135327e-05, + "loss": 0.3558, "step": 652200 }, { - "epoch": 6.65, - "learning_rate": 3.7216142323463145e-05, - "loss": 0.4903, + "epoch": 8.987076685679645, + "grad_norm": 9.361457824707031, + "learning_rate": 1.9424693128069313e-05, + "loss": 0.3712, "step": 652300 }, { - "epoch": 6.65, - "learning_rate": 3.720970331300903e-05, - "loss": 0.4873, + "epoch": 8.988454437739385, + "grad_norm": 3.7306151390075684, + "learning_rate": 1.9417031400041933e-05, + "loss": 0.3485, "step": 652400 }, { - "epoch": 6.65, - "learning_rate": 3.7203263950043445e-05, - "loss": 0.5102, + "epoch": 8.989832189799124, + "grad_norm": 0.7190465331077576, + "learning_rate": 1.9409370358704093e-05, + "loss": 0.3853, "step": 652500 }, { - "epoch": 6.65, - "learning_rate": 3.719682423488124e-05, - "loss": 0.5588, + "epoch": 8.991209941858862, + "grad_norm": 2.62778639793396, + "learning_rate": 1.940171000470663e-05, + "loss": 0.343, "step": 652600 }, { - "epoch": 6.65, - "learning_rate": 3.7190384167837287e-05, - "loss": 0.4271, + "epoch": 8.992587693918603, + "grad_norm": 2.3849852085113525, + "learning_rate": 1.9394050338700335e-05, + "loss": 0.3181, "step": 652700 }, { - "epoch": 6.65, - "learning_rate": 3.7183943749226445e-05, - "loss": 0.4695, + "epoch": 8.993965445978342, + "grad_norm": 3.9958038330078125, + "learning_rate": 1.9386391361335924e-05, + "loss": 0.3648, "step": 652800 }, { - "epoch": 6.65, - "learning_rate": 3.717750297936363e-05, - "loss": 0.5046, + "epoch": 8.99534319803808, + "grad_norm": 6.0911431312561035, + "learning_rate": 1.937880965273067e-05, + "loss": 0.3746, "step": 652900 }, { - "epoch": 6.65, - "learning_rate": 3.717106185856375e-05, - "loss": 0.5333, + "epoch": 8.99672095009782, + "grad_norm": 3.161404848098755, + "learning_rate": 1.9371152047699343e-05, + "loss": 0.2874, "step": 653000 }, { - "epoch": 6.65, - "learning_rate": 3.716462038714174e-05, - "loss": 0.4315, + "epoch": 8.99809870215756, + "grad_norm": 2.9368815422058105, + "learning_rate": 1.9363495133255242e-05, + "loss": 0.3651, "step": 653100 }, { - "epoch": 6.65, - "learning_rate": 3.7158178565412564e-05, - "loss": 0.5089, + "epoch": 8.9994764542173, + "grad_norm": 3.8970587253570557, + "learning_rate": 1.9355838910048867e-05, + "loss": 0.3118, "step": 653200 }, { - "epoch": 6.66, - "learning_rate": 3.715173639369115e-05, - "loss": 0.4676, + "epoch": 9.000854206277038, + "grad_norm": 2.086366891860962, + "learning_rate": 1.9348183378730632e-05, + "loss": 0.3165, "step": 653300 }, { - "epoch": 6.66, - "learning_rate": 3.714529387229252e-05, - "loss": 0.5052, + "epoch": 9.002231958336777, + "grad_norm": 14.260379791259766, + "learning_rate": 1.9340528539950922e-05, + "loss": 0.3473, "step": 653400 }, { - "epoch": 6.66, - "learning_rate": 3.7138851001531656e-05, - "loss": 0.5293, + "epoch": 9.003609710396518, + "grad_norm": 1.8470979928970337, + "learning_rate": 1.9332874394360053e-05, + "loss": 0.2843, "step": 653500 }, { - "epoch": 6.66, - "learning_rate": 3.7132407781723586e-05, - "loss": 0.5438, + "epoch": 9.004987462456256, + "grad_norm": 1.7738455533981323, + "learning_rate": 1.9325220942608287e-05, + "loss": 0.3101, "step": 653600 }, { - "epoch": 6.66, - "learning_rate": 3.712596421318333e-05, - "loss": 0.5439, + "epoch": 9.006365214515995, + "grad_norm": 2.6851699352264404, + "learning_rate": 1.931756818534583e-05, + "loss": 0.3478, "step": 653700 }, { - "epoch": 6.66, - "learning_rate": 3.711952029622596e-05, - "loss": 0.5865, + "epoch": 9.007742966575735, + "grad_norm": 1.751868486404419, + "learning_rate": 1.930991612322282e-05, + "loss": 0.2857, "step": 653800 }, { - "epoch": 6.66, - "learning_rate": 3.711307603116653e-05, - "loss": 0.4825, + "epoch": 9.009120718635474, + "grad_norm": 0.8391517996788025, + "learning_rate": 1.9302264756889334e-05, + "loss": 0.3333, "step": 653900 }, { - "epoch": 6.66, - "learning_rate": 3.7106631418320136e-05, - "loss": 0.5458, + "epoch": 9.010498470695214, + "grad_norm": 5.324915409088135, + "learning_rate": 1.929461408699538e-05, + "loss": 0.3862, "step": 654000 }, { - "epoch": 6.66, - "learning_rate": 3.7100186458001864e-05, - "loss": 0.5616, + "epoch": 9.011876222754953, + "grad_norm": 0.32781800627708435, + "learning_rate": 1.9286964114190953e-05, + "loss": 0.3942, "step": 654100 }, { - "epoch": 6.67, - "learning_rate": 3.709374115052685e-05, - "loss": 0.483, + "epoch": 9.013253974814692, + "grad_norm": 1.898881435394287, + "learning_rate": 1.927931483912594e-05, + "loss": 0.3558, "step": 654200 }, { - "epoch": 6.67, - "learning_rate": 3.7087295496210237e-05, - "loss": 0.487, + "epoch": 9.014631726874432, + "grad_norm": 7.0831451416015625, + "learning_rate": 1.927166626245018e-05, + "loss": 0.2918, "step": 654300 }, { - "epoch": 6.67, - "learning_rate": 3.7080849495367155e-05, - "loss": 0.4763, + "epoch": 9.016009478934171, + "grad_norm": 5.255655288696289, + "learning_rate": 1.9264018384813488e-05, + "loss": 0.3313, "step": 654400 }, { - "epoch": 6.67, - "learning_rate": 3.70744031483128e-05, - "loss": 0.4314, + "epoch": 9.01738723099391, + "grad_norm": 5.007075786590576, + "learning_rate": 1.9256371206865546e-05, + "loss": 0.3338, "step": 654500 }, { - "epoch": 6.67, - "learning_rate": 3.706795645536234e-05, - "loss": 0.432, + "epoch": 9.01876498305365, + "grad_norm": 5.827671527862549, + "learning_rate": 1.924872472925606e-05, + "loss": 0.3802, "step": 654600 }, { - "epoch": 6.67, - "learning_rate": 3.7061509416830994e-05, - "loss": 0.5061, + "epoch": 9.020142735113389, + "grad_norm": 2.536600112915039, + "learning_rate": 1.924107895263461e-05, + "loss": 0.3368, "step": 654700 }, { - "epoch": 6.67, - "learning_rate": 3.705506203303398e-05, - "loss": 0.4543, + "epoch": 9.02152048717313, + "grad_norm": 2.9947972297668457, + "learning_rate": 1.9233433877650757e-05, + "loss": 0.3148, "step": 654800 }, { - "epoch": 6.67, - "learning_rate": 3.7048614304286525e-05, - "loss": 0.4884, + "epoch": 9.022898239232868, + "grad_norm": 3.742250919342041, + "learning_rate": 1.9225789504954e-05, + "loss": 0.3346, "step": 654900 }, { - "epoch": 6.67, - "learning_rate": 3.7042230713342645e-05, - "loss": 0.5257, + "epoch": 9.024275991292606, + "grad_norm": 2.8474090099334717, + "learning_rate": 1.921814583519374e-05, + "loss": 0.3194, "step": 655000 }, { - "epoch": 6.67, - "learning_rate": 3.7035782299081756e-05, - "loss": 0.5248, + "epoch": 9.025653743352347, + "grad_norm": 0.39667606353759766, + "learning_rate": 1.921050286901937e-05, + "loss": 0.344, "step": 655100 }, { - "epoch": 6.68, - "learning_rate": 3.702933354081309e-05, - "loss": 0.5704, + "epoch": 9.027031495412086, + "grad_norm": 1.5153580904006958, + "learning_rate": 1.920293702621148e-05, + "loss": 0.3408, "step": 655200 }, { - "epoch": 6.68, - "learning_rate": 3.702288443885198e-05, - "loss": 0.5118, + "epoch": 9.028409247471824, + "grad_norm": 3.461282253265381, + "learning_rate": 1.9195295462104678e-05, + "loss": 0.3435, "step": 655300 }, { - "epoch": 6.68, - "learning_rate": 3.701643499351373e-05, - "loss": 0.5316, + "epoch": 9.029786999531565, + "grad_norm": 11.459829330444336, + "learning_rate": 1.918765460352502e-05, + "loss": 0.3204, "step": 655400 }, { - "epoch": 6.68, - "learning_rate": 3.7009985205113675e-05, - "loss": 0.5284, + "epoch": 9.031164751591303, + "grad_norm": 2.755305290222168, + "learning_rate": 1.918001445112162e-05, + "loss": 0.3978, "step": 655500 }, { - "epoch": 6.68, - "learning_rate": 3.7003535073967174e-05, - "loss": 0.5209, + "epoch": 9.032542503651044, + "grad_norm": 4.273592948913574, + "learning_rate": 1.9172375005543557e-05, + "loss": 0.3399, "step": 655600 }, { - "epoch": 6.68, - "learning_rate": 3.6997084600389606e-05, - "loss": 0.5208, + "epoch": 9.033920255710782, + "grad_norm": 1.006514072418213, + "learning_rate": 1.9164736267439843e-05, + "loss": 0.292, "step": 655700 }, { - "epoch": 6.68, - "learning_rate": 3.699063378469636e-05, - "loss": 0.3749, + "epoch": 9.035298007770521, + "grad_norm": 1.3249393701553345, + "learning_rate": 1.9157098237459427e-05, + "loss": 0.3759, "step": 655800 }, { - "epoch": 6.68, - "learning_rate": 3.698418262720284e-05, - "loss": 0.4537, + "epoch": 9.036675759830262, + "grad_norm": 6.889577865600586, + "learning_rate": 1.91494609162512e-05, + "loss": 0.3488, "step": 655900 }, { - "epoch": 6.68, - "learning_rate": 3.697773112822446e-05, - "loss": 0.5132, + "epoch": 9.03805351189, + "grad_norm": 5.249415397644043, + "learning_rate": 1.914182430446399e-05, + "loss": 0.3441, "step": 656000 }, { - "epoch": 6.68, - "learning_rate": 3.697127928807668e-05, - "loss": 0.461, + "epoch": 9.039431263949739, + "grad_norm": 5.642026424407959, + "learning_rate": 1.9134188402746553e-05, + "loss": 0.3226, "step": 656100 }, { - "epoch": 6.69, - "learning_rate": 3.696482710707493e-05, - "loss": 0.5357, + "epoch": 9.04080901600948, + "grad_norm": 1.900230884552002, + "learning_rate": 1.912655321174762e-05, + "loss": 0.3411, "step": 656200 }, { - "epoch": 6.69, - "learning_rate": 3.695837458553472e-05, - "loss": 0.5019, + "epoch": 9.042186768069218, + "grad_norm": 1.1722453832626343, + "learning_rate": 1.911891873211583e-05, + "loss": 0.3359, "step": 656300 }, { - "epoch": 6.69, - "learning_rate": 3.695192172377151e-05, - "loss": 0.5209, + "epoch": 9.043564520128957, + "grad_norm": 4.841881275177002, + "learning_rate": 1.9111284964499772e-05, + "loss": 0.2931, "step": 656400 }, { - "epoch": 6.69, - "learning_rate": 3.69454685221008e-05, - "loss": 0.5917, + "epoch": 9.044942272188697, + "grad_norm": 1.3248008489608765, + "learning_rate": 1.9103651909547967e-05, + "loss": 0.3582, "step": 656500 }, { - "epoch": 6.69, - "learning_rate": 3.6939014980838144e-05, - "loss": 0.54, + "epoch": 9.046320024248436, + "grad_norm": 2.876638412475586, + "learning_rate": 1.9096019567908903e-05, + "loss": 0.3125, "step": 656600 }, { - "epoch": 6.69, - "learning_rate": 3.693256110029906e-05, - "loss": 0.4647, + "epoch": 9.047697776308176, + "grad_norm": 2.9264562129974365, + "learning_rate": 1.9088387940230955e-05, + "loss": 0.3023, "step": 656700 }, { - "epoch": 6.69, - "learning_rate": 3.692610688079911e-05, - "loss": 0.5196, + "epoch": 9.049075528367915, + "grad_norm": 0.5946580767631531, + "learning_rate": 1.9080757027162488e-05, + "loss": 0.3055, "step": 656800 }, { - "epoch": 6.69, - "learning_rate": 3.6919652322653864e-05, - "loss": 0.5456, + "epoch": 9.050453280427654, + "grad_norm": 6.849308490753174, + "learning_rate": 1.9073126829351785e-05, + "loss": 0.3995, "step": 656900 }, { - "epoch": 6.69, - "learning_rate": 3.691319742617892e-05, - "loss": 0.4584, + "epoch": 9.051831032487394, + "grad_norm": 1.8169491291046143, + "learning_rate": 1.9065573638720256e-05, + "loss": 0.3333, "step": 657000 }, { - "epoch": 6.69, - "learning_rate": 3.690674219168987e-05, - "loss": 0.4391, + "epoch": 9.053208784547133, + "grad_norm": 6.851864337921143, + "learning_rate": 1.905794486620095e-05, + "loss": 0.3623, "step": 657100 }, { - "epoch": 6.7, - "learning_rate": 3.690028661950235e-05, - "loss": 0.5406, + "epoch": 9.054586536606871, + "grad_norm": 1.1169012784957886, + "learning_rate": 1.905031681087741e-05, + "loss": 0.3315, "step": 657200 }, { - "epoch": 6.7, - "learning_rate": 3.689383070993199e-05, - "loss": 0.5771, + "epoch": 9.055964288666612, + "grad_norm": 7.5937957763671875, + "learning_rate": 1.9042689473397678e-05, + "loss": 0.3001, "step": 657300 }, { - "epoch": 6.7, - "learning_rate": 3.6887374463294446e-05, - "loss": 0.4882, + "epoch": 9.05734204072635, + "grad_norm": 2.8620786666870117, + "learning_rate": 1.903506285440975e-05, + "loss": 0.3373, "step": 657400 }, { - "epoch": 6.7, - "learning_rate": 3.68809178799054e-05, - "loss": 0.5586, + "epoch": 9.058719792786091, + "grad_norm": 3.499518394470215, + "learning_rate": 1.902743695456154e-05, + "loss": 0.3202, "step": 657500 }, { - "epoch": 6.7, - "learning_rate": 3.687446096008053e-05, - "loss": 0.5308, + "epoch": 9.06009754484583, + "grad_norm": 1.1459527015686035, + "learning_rate": 1.9019811774500907e-05, + "loss": 0.3682, "step": 657600 }, { - "epoch": 6.7, - "learning_rate": 3.686800370413554e-05, - "loss": 0.4798, + "epoch": 9.061475296905568, + "grad_norm": 9.876166343688965, + "learning_rate": 1.9012187314875647e-05, + "loss": 0.2768, "step": 657700 }, { - "epoch": 6.7, - "learning_rate": 3.686154611238616e-05, - "loss": 0.5406, + "epoch": 9.062853048965309, + "grad_norm": 3.774568796157837, + "learning_rate": 1.900456357633349e-05, + "loss": 0.3148, "step": 657800 }, { - "epoch": 6.7, - "learning_rate": 3.685508818514812e-05, - "loss": 0.5328, + "epoch": 9.064230801025047, + "grad_norm": 5.276428699493408, + "learning_rate": 1.8996940559522125e-05, + "loss": 0.2929, "step": 657900 }, { - "epoch": 6.7, - "learning_rate": 3.684862992273718e-05, - "loss": 0.5241, + "epoch": 9.065608553084786, + "grad_norm": 12.878783226013184, + "learning_rate": 1.8989318265089166e-05, + "loss": 0.3364, "step": 658000 }, { - "epoch": 6.7, - "learning_rate": 3.68421713254691e-05, - "loss": 0.4499, + "epoch": 9.066986305144527, + "grad_norm": 2.5375170707702637, + "learning_rate": 1.8981696693682146e-05, + "loss": 0.3486, "step": 658100 }, { - "epoch": 6.71, - "learning_rate": 3.683571239365968e-05, - "loss": 0.47, + "epoch": 9.068364057204265, + "grad_norm": 42.197322845458984, + "learning_rate": 1.8974075845948593e-05, + "loss": 0.3415, "step": 658200 }, { - "epoch": 6.71, - "learning_rate": 3.682925312762472e-05, - "loss": 0.5805, + "epoch": 9.069741809264006, + "grad_norm": 1.193588376045227, + "learning_rate": 1.89664557225359e-05, + "loss": 0.329, "step": 658300 }, { - "epoch": 6.71, - "learning_rate": 3.682279352768003e-05, - "loss": 0.5134, + "epoch": 9.071119561323744, + "grad_norm": 3.3420767784118652, + "learning_rate": 1.8958836324091456e-05, + "loss": 0.2652, "step": 658400 }, { - "epoch": 6.71, - "learning_rate": 3.681633359414146e-05, - "loss": 0.4566, + "epoch": 9.072497313383483, + "grad_norm": 2.342928886413574, + "learning_rate": 1.895121765126256e-05, + "loss": 0.2867, "step": 658500 }, { - "epoch": 6.71, - "learning_rate": 3.6809873327324845e-05, - "loss": 0.5126, + "epoch": 9.073875065443223, + "grad_norm": 3.6669256687164307, + "learning_rate": 1.8943599704696457e-05, + "loss": 0.3036, "step": 658600 }, { - "epoch": 6.71, - "learning_rate": 3.680341272754606e-05, - "loss": 0.4936, + "epoch": 9.075252817502962, + "grad_norm": 4.23557710647583, + "learning_rate": 1.8935982485040336e-05, + "loss": 0.3305, "step": 658700 }, { - "epoch": 6.71, - "learning_rate": 3.679695179512101e-05, - "loss": 0.4423, + "epoch": 9.0766305695627, + "grad_norm": 2.9444212913513184, + "learning_rate": 1.8928365992941326e-05, + "loss": 0.3561, "step": 658800 }, { - "epoch": 6.71, - "learning_rate": 3.679049053036557e-05, - "loss": 0.5378, + "epoch": 9.078008321622441, + "grad_norm": 4.4864301681518555, + "learning_rate": 1.8920750229046467e-05, + "loss": 0.2939, "step": 658900 }, { - "epoch": 6.71, - "learning_rate": 3.678402893359567e-05, - "loss": 0.5358, + "epoch": 9.07938607368218, + "grad_norm": 25.02345085144043, + "learning_rate": 1.891313519400277e-05, + "loss": 0.3943, "step": 659000 }, { - "epoch": 6.72, - "learning_rate": 3.677756700512724e-05, - "loss": 0.4929, + "epoch": 9.08076382574192, + "grad_norm": 1.5346360206604004, + "learning_rate": 1.8905520888457163e-05, + "loss": 0.3065, "step": 659100 }, { - "epoch": 6.72, - "learning_rate": 3.6771104745276225e-05, - "loss": 0.414, + "epoch": 9.082141577801659, + "grad_norm": 3.7177698612213135, + "learning_rate": 1.889790731305653e-05, + "loss": 0.3285, "step": 659200 }, { - "epoch": 6.72, - "learning_rate": 3.6764642154358594e-05, - "loss": 0.5156, + "epoch": 9.083519329861398, + "grad_norm": 4.2940778732299805, + "learning_rate": 1.8890294468447683e-05, + "loss": 0.3337, "step": 659300 }, { - "epoch": 6.72, - "learning_rate": 3.6758179232690344e-05, - "loss": 0.468, + "epoch": 9.084897081921138, + "grad_norm": 2.8907344341278076, + "learning_rate": 1.8882682355277364e-05, + "loss": 0.3025, "step": 659400 }, { - "epoch": 6.72, - "learning_rate": 3.675171598058745e-05, - "loss": 0.5837, + "epoch": 9.086274833980877, + "grad_norm": 2.1692159175872803, + "learning_rate": 1.8875070974192264e-05, + "loss": 0.3567, "step": 659500 }, { - "epoch": 6.72, - "learning_rate": 3.6745252398365935e-05, - "loss": 0.4184, + "epoch": 9.087652586040615, + "grad_norm": 4.225934982299805, + "learning_rate": 1.8867460325839e-05, + "loss": 0.3404, "step": 659600 }, { - "epoch": 6.72, - "learning_rate": 3.673878848634184e-05, - "loss": 0.4631, + "epoch": 9.089030338100356, + "grad_norm": 2.248513698577881, + "learning_rate": 1.8859850410864158e-05, + "loss": 0.3381, "step": 659700 }, { - "epoch": 6.72, - "learning_rate": 3.6732324244831194e-05, - "loss": 0.5345, + "epoch": 9.090408090160095, + "grad_norm": 14.0415678024292, + "learning_rate": 1.8852241229914223e-05, + "loss": 0.2877, "step": 659800 }, { - "epoch": 6.72, - "learning_rate": 3.672592432148524e-05, - "loss": 0.4897, + "epoch": 9.091785842219835, + "grad_norm": 2.9047765731811523, + "learning_rate": 1.884463278363564e-05, + "loss": 0.304, "step": 659900 }, { - "epoch": 6.72, - "learning_rate": 3.67194594252367e-05, - "loss": 0.5565, + "epoch": 9.093163594279574, + "grad_norm": 3.4089772701263428, + "learning_rate": 1.8837025072674784e-05, + "loss": 0.3505, "step": 660000 }, { - "epoch": 6.73, - "learning_rate": 3.671299420044669e-05, - "loss": 0.4544, + "epoch": 9.094541346339312, + "grad_norm": 2.795149564743042, + "learning_rate": 1.8829418097677958e-05, + "loss": 0.3162, "step": 660100 }, { - "epoch": 6.73, - "learning_rate": 3.670652864743133e-05, - "loss": 0.4403, + "epoch": 9.095919098399053, + "grad_norm": 0.02902807481586933, + "learning_rate": 1.882181185929143e-05, + "loss": 0.3304, "step": 660200 }, { - "epoch": 6.73, - "learning_rate": 3.670006276650673e-05, - "loss": 0.4926, + "epoch": 9.097296850458791, + "grad_norm": 8.011312484741211, + "learning_rate": 1.8814206358161376e-05, + "loss": 0.3649, "step": 660300 }, { - "epoch": 6.73, - "learning_rate": 3.6693596557989066e-05, - "loss": 0.4669, + "epoch": 9.09867460251853, + "grad_norm": 3.283021926879883, + "learning_rate": 1.880660159493394e-05, + "loss": 0.3459, "step": 660400 }, { - "epoch": 6.73, - "learning_rate": 3.668713002219446e-05, - "loss": 0.4158, + "epoch": 9.10005235457827, + "grad_norm": 3.5296435356140137, + "learning_rate": 1.8798997570255166e-05, + "loss": 0.2651, "step": 660500 }, { - "epoch": 6.73, - "learning_rate": 3.66806631594391e-05, - "loss": 0.5083, + "epoch": 9.10143010663801, + "grad_norm": 3.820683479309082, + "learning_rate": 1.879139428477106e-05, + "loss": 0.3926, "step": 660600 }, { - "epoch": 6.73, - "learning_rate": 3.667419597003919e-05, - "loss": 0.5254, + "epoch": 9.102807858697748, + "grad_norm": 5.917936325073242, + "learning_rate": 1.8783791739127563e-05, + "loss": 0.3278, "step": 660700 }, { - "epoch": 6.73, - "learning_rate": 3.666772845431091e-05, - "loss": 0.5178, + "epoch": 9.104185610757488, + "grad_norm": 1.6316719055175781, + "learning_rate": 1.877618993397055e-05, + "loss": 0.3287, "step": 660800 }, { - "epoch": 6.73, - "learning_rate": 3.666126061257052e-05, - "loss": 0.4636, + "epoch": 9.105563362817227, + "grad_norm": 1.9378429651260376, + "learning_rate": 1.8768588869945838e-05, + "loss": 0.3004, "step": 660900 }, { - "epoch": 6.73, - "learning_rate": 3.665479244513423e-05, - "loss": 0.4893, + "epoch": 9.106941114876967, + "grad_norm": 3.2813785076141357, + "learning_rate": 1.8760988547699174e-05, + "loss": 0.3673, "step": 661000 }, { - "epoch": 6.74, - "learning_rate": 3.66483239523183e-05, - "loss": 0.4859, + "epoch": 9.108318866936706, + "grad_norm": 1.8384367227554321, + "learning_rate": 1.8753388967876227e-05, + "loss": 0.3135, "step": 661100 }, { - "epoch": 6.74, - "learning_rate": 3.6641855134439e-05, - "loss": 0.5485, + "epoch": 9.109696618996445, + "grad_norm": 3.2492265701293945, + "learning_rate": 1.8745790131122644e-05, + "loss": 0.3072, "step": 661200 }, { - "epoch": 6.74, - "learning_rate": 3.663538599181262e-05, - "loss": 0.4874, + "epoch": 9.111074371056185, + "grad_norm": 2.9144251346588135, + "learning_rate": 1.873819203808397e-05, + "loss": 0.307, "step": 661300 }, { - "epoch": 6.74, - "learning_rate": 3.662891652475548e-05, - "loss": 0.4899, + "epoch": 9.112452123115924, + "grad_norm": 18.725872039794922, + "learning_rate": 1.8730594689405713e-05, + "loss": 0.339, "step": 661400 }, { - "epoch": 6.74, - "learning_rate": 3.662244673358387e-05, - "loss": 0.4512, + "epoch": 9.113829875175663, + "grad_norm": 1.7501806020736694, + "learning_rate": 1.872299808573331e-05, + "loss": 0.3095, "step": 661500 }, { - "epoch": 6.74, - "learning_rate": 3.661597661861414e-05, - "loss": 0.5161, + "epoch": 9.115207627235403, + "grad_norm": 1.690932273864746, + "learning_rate": 1.87154022277121e-05, + "loss": 0.3457, "step": 661600 }, { - "epoch": 6.74, - "learning_rate": 3.660950618016263e-05, - "loss": 0.4732, + "epoch": 9.116585379295142, + "grad_norm": 1.1774755716323853, + "learning_rate": 1.8707807115987426e-05, + "loss": 0.3121, "step": 661700 }, { - "epoch": 6.74, - "learning_rate": 3.660303541854571e-05, - "loss": 0.5533, + "epoch": 9.117963131354882, + "grad_norm": 1.8819571733474731, + "learning_rate": 1.8700288691152862e-05, + "loss": 0.3147, "step": 661800 }, { - "epoch": 6.74, - "learning_rate": 3.659656433407976e-05, - "loss": 0.4281, + "epoch": 9.11934088341462, + "grad_norm": 9.761354446411133, + "learning_rate": 1.869269506647784e-05, + "loss": 0.3224, "step": 661900 }, { - "epoch": 6.74, - "learning_rate": 3.659009292708119e-05, - "loss": 0.4899, + "epoch": 9.12071863547436, + "grad_norm": 2.3311543464660645, + "learning_rate": 1.8685102190028427e-05, + "loss": 0.3367, "step": 662000 }, { - "epoch": 6.75, - "learning_rate": 3.658362119786639e-05, - "loss": 0.6039, + "epoch": 9.1220963875341, + "grad_norm": 2.0261716842651367, + "learning_rate": 1.8677510062449682e-05, + "loss": 0.3428, "step": 662100 }, { - "epoch": 6.75, - "learning_rate": 3.657714914675182e-05, - "loss": 0.5875, + "epoch": 9.123474139593839, + "grad_norm": 1.8166346549987793, + "learning_rate": 1.8669918684386587e-05, + "loss": 0.3158, "step": 662200 }, { - "epoch": 6.75, - "learning_rate": 3.65706767740539e-05, - "loss": 0.4893, + "epoch": 9.124851891653577, + "grad_norm": 1.3290455341339111, + "learning_rate": 1.8662328056484073e-05, + "loss": 0.3242, "step": 662300 }, { - "epoch": 6.75, - "learning_rate": 3.65642040800891e-05, - "loss": 0.4673, + "epoch": 9.126229643713318, + "grad_norm": 1.2308013439178467, + "learning_rate": 1.8654738179387006e-05, + "loss": 0.3231, "step": 662400 }, { - "epoch": 6.75, - "learning_rate": 3.655773106517388e-05, - "loss": 0.496, + "epoch": 9.127607395773056, + "grad_norm": 8.199753761291504, + "learning_rate": 1.8647149053740185e-05, + "loss": 0.355, "step": 662500 }, { - "epoch": 6.75, - "learning_rate": 3.655125772962476e-05, - "loss": 0.5655, + "epoch": 9.128985147832797, + "grad_norm": 3.0945870876312256, + "learning_rate": 1.8639560680188345e-05, + "loss": 0.3227, "step": 662600 }, { - "epoch": 6.75, - "learning_rate": 3.6544784073758225e-05, - "loss": 0.5621, + "epoch": 9.130362899892535, + "grad_norm": 5.005845069885254, + "learning_rate": 1.8631973059376156e-05, + "loss": 0.3159, "step": 662700 }, { - "epoch": 6.75, - "learning_rate": 3.6538310097890815e-05, - "loss": 0.4366, + "epoch": 9.131740651952274, + "grad_norm": 2.084890842437744, + "learning_rate": 1.8624386191948212e-05, + "loss": 0.3662, "step": 662800 }, { - "epoch": 6.75, - "learning_rate": 3.6531835802339054e-05, - "loss": 0.4584, + "epoch": 9.133118404012015, + "grad_norm": 3.2349278926849365, + "learning_rate": 1.8616800078549076e-05, + "loss": 0.3511, "step": 662900 }, { - "epoch": 6.75, - "learning_rate": 3.65253611874195e-05, - "loss": 0.5005, + "epoch": 9.134496156071753, + "grad_norm": 4.741024017333984, + "learning_rate": 1.860921471982322e-05, + "loss": 0.2891, "step": 663000 }, { - "epoch": 6.76, - "learning_rate": 3.6518886253448726e-05, - "loss": 0.5123, + "epoch": 9.135873908131492, + "grad_norm": 1.9143905639648438, + "learning_rate": 1.8601630116415053e-05, + "loss": 0.3318, "step": 663100 }, { - "epoch": 6.76, - "learning_rate": 3.651241100074332e-05, - "loss": 0.4999, + "epoch": 9.137251660191232, + "grad_norm": 0.8522217869758606, + "learning_rate": 1.8594046268968947e-05, + "loss": 0.2929, "step": 663200 }, { - "epoch": 6.76, - "learning_rate": 3.650593542961988e-05, - "loss": 0.4984, + "epoch": 9.138629412250971, + "grad_norm": 1.7559139728546143, + "learning_rate": 1.8586463178129156e-05, + "loss": 0.3288, "step": 663300 }, { - "epoch": 6.76, - "learning_rate": 3.649945954039502e-05, - "loss": 0.4432, + "epoch": 9.140007164310711, + "grad_norm": 13.857282638549805, + "learning_rate": 1.8578880844539927e-05, + "loss": 0.3344, "step": 663400 }, { - "epoch": 6.76, - "learning_rate": 3.6492983333385376e-05, - "loss": 0.4939, + "epoch": 9.14138491637045, + "grad_norm": 4.089639663696289, + "learning_rate": 1.857129926884541e-05, + "loss": 0.3391, "step": 663500 }, { - "epoch": 6.76, - "learning_rate": 3.64865068089076e-05, - "loss": 0.5484, + "epoch": 9.142762668430189, + "grad_norm": 7.112442493438721, + "learning_rate": 1.856371845168969e-05, + "loss": 0.3215, "step": 663600 }, { - "epoch": 6.76, - "learning_rate": 3.648002996727834e-05, - "loss": 0.5036, + "epoch": 9.14414042048993, + "grad_norm": 2.9991023540496826, + "learning_rate": 1.8556138393716822e-05, + "loss": 0.3308, "step": 663700 }, { - "epoch": 6.76, - "learning_rate": 3.647355280881429e-05, - "loss": 0.5832, + "epoch": 9.145518172549668, + "grad_norm": 4.503046035766602, + "learning_rate": 1.8548559095570743e-05, + "loss": 0.3489, "step": 663800 }, { - "epoch": 6.76, - "learning_rate": 3.6467075333832144e-05, - "loss": 0.5186, + "epoch": 9.146895924609407, + "grad_norm": 2.8440442085266113, + "learning_rate": 1.8540980557895367e-05, + "loss": 0.2849, "step": 663900 }, { - "epoch": 6.76, - "learning_rate": 3.64605975426486e-05, - "loss": 0.4676, + "epoch": 9.148273676669147, + "grad_norm": 2.5758185386657715, + "learning_rate": 1.8533402781334527e-05, + "loss": 0.3564, "step": 664000 }, { - "epoch": 6.77, - "learning_rate": 3.6454119435580404e-05, - "loss": 0.5172, + "epoch": 9.149651428728886, + "grad_norm": 96.88946533203125, + "learning_rate": 1.852582576653199e-05, + "loss": 0.3813, "step": 664100 }, { - "epoch": 6.77, - "learning_rate": 3.6447641012944275e-05, - "loss": 0.541, + "epoch": 9.151029180788626, + "grad_norm": 0.5691716074943542, + "learning_rate": 1.8518249514131464e-05, + "loss": 0.3976, "step": 664200 }, { - "epoch": 6.77, - "learning_rate": 3.6441162275056984e-05, - "loss": 0.5516, + "epoch": 9.152406932848365, + "grad_norm": 1.4628241062164307, + "learning_rate": 1.8510674024776602e-05, + "loss": 0.286, "step": 664300 }, { - "epoch": 6.77, - "learning_rate": 3.6434748014321386e-05, - "loss": 0.4715, + "epoch": 9.153784684908103, + "grad_norm": 1.9692130088806152, + "learning_rate": 1.8503099299110966e-05, + "loss": 0.3096, "step": 664400 }, { - "epoch": 6.77, - "learning_rate": 3.642826865002669e-05, - "loss": 0.6193, + "epoch": 9.155162436967844, + "grad_norm": 1.3080925941467285, + "learning_rate": 1.8495525337778072e-05, + "loss": 0.2746, "step": 664500 }, { - "epoch": 6.77, - "learning_rate": 3.642178897142802e-05, - "loss": 0.5233, + "epoch": 9.156540189027583, + "grad_norm": 1.6796101331710815, + "learning_rate": 1.8487952141421358e-05, + "loss": 0.3394, "step": 664600 }, { - "epoch": 6.77, - "learning_rate": 3.6415308978842196e-05, - "loss": 0.4734, + "epoch": 9.157917941087321, + "grad_norm": 3.575744152069092, + "learning_rate": 1.8480379710684223e-05, + "loss": 0.2725, "step": 664700 }, { - "epoch": 6.77, - "learning_rate": 3.6408828672586044e-05, - "loss": 0.4175, + "epoch": 9.159295693147062, + "grad_norm": 3.0670909881591797, + "learning_rate": 1.8472808046209986e-05, + "loss": 0.3052, "step": 664800 }, { - "epoch": 6.77, - "learning_rate": 3.640234805297642e-05, - "loss": 0.4571, + "epoch": 9.1606734452068, + "grad_norm": 3.4189727306365967, + "learning_rate": 1.8465237148641877e-05, + "loss": 0.3519, "step": 664900 }, { - "epoch": 6.78, - "learning_rate": 3.6395867120330176e-05, - "loss": 0.4764, + "epoch": 9.162051197266539, + "grad_norm": 2.5494418144226074, + "learning_rate": 1.8457667018623094e-05, + "loss": 0.337, "step": 665000 }, { - "epoch": 6.78, - "learning_rate": 3.6389385874964204e-05, - "loss": 0.447, + "epoch": 9.16342894932628, + "grad_norm": 1.8558762073516846, + "learning_rate": 1.8450097656796754e-05, + "loss": 0.3081, "step": 665100 }, { - "epoch": 6.78, - "learning_rate": 3.638290431719539e-05, - "loss": 0.4488, + "epoch": 9.164806701386018, + "grad_norm": 2.674231767654419, + "learning_rate": 1.8442529063805922e-05, + "loss": 0.2847, "step": 665200 }, { - "epoch": 6.78, - "learning_rate": 3.6376422447340644e-05, - "loss": 0.4287, + "epoch": 9.166184453445759, + "grad_norm": 1.8738274574279785, + "learning_rate": 1.8434961240293586e-05, + "loss": 0.2987, "step": 665300 }, { - "epoch": 6.78, - "learning_rate": 3.63699402657169e-05, - "loss": 0.4888, + "epoch": 9.167562205505497, + "grad_norm": 88.1220932006836, + "learning_rate": 1.8427394186902674e-05, + "loss": 0.3353, "step": 665400 }, { - "epoch": 6.78, - "learning_rate": 3.6363457772641094e-05, - "loss": 0.5018, + "epoch": 9.168939957565236, + "grad_norm": 1.8143916130065918, + "learning_rate": 1.8419827904276033e-05, + "loss": 0.2877, "step": 665500 }, { - "epoch": 6.78, - "learning_rate": 3.635697496843016e-05, - "loss": 0.537, + "epoch": 9.170317709624976, + "grad_norm": 4.368610382080078, + "learning_rate": 1.841226239305646e-05, + "loss": 0.407, "step": 665600 }, { - "epoch": 6.78, - "learning_rate": 3.63504918534011e-05, - "loss": 0.4665, + "epoch": 9.171695461684715, + "grad_norm": 4.862356185913086, + "learning_rate": 1.8404697653886696e-05, + "loss": 0.3121, "step": 665700 }, { - "epoch": 6.78, - "learning_rate": 3.634400842787089e-05, - "loss": 0.5256, + "epoch": 9.173073213744454, + "grad_norm": 3.308805227279663, + "learning_rate": 1.83971336874094e-05, + "loss": 0.2568, "step": 665800 }, { - "epoch": 6.78, - "learning_rate": 3.633752469215651e-05, - "loss": 0.5017, + "epoch": 9.174450965804194, + "grad_norm": 2.6034023761749268, + "learning_rate": 1.838957049426716e-05, + "loss": 0.3045, "step": 665900 }, { - "epoch": 6.79, - "learning_rate": 3.633104064657501e-05, - "loss": 0.5082, + "epoch": 9.175828717863933, + "grad_norm": 2.0631253719329834, + "learning_rate": 1.838200807510253e-05, + "loss": 0.3384, "step": 666000 }, { - "epoch": 6.79, - "learning_rate": 3.632455629144339e-05, - "loss": 0.5594, + "epoch": 9.177206469923673, + "grad_norm": 16.743928909301758, + "learning_rate": 1.8374446430557944e-05, + "loss": 0.3411, "step": 666100 }, { - "epoch": 6.79, - "learning_rate": 3.6318071627078726e-05, - "loss": 0.512, + "epoch": 9.178584221983412, + "grad_norm": 1.4728195667266846, + "learning_rate": 1.8366885561275826e-05, + "loss": 0.3629, "step": 666200 }, { - "epoch": 6.79, - "learning_rate": 3.6311586653798055e-05, - "loss": 0.4553, + "epoch": 9.17996197404315, + "grad_norm": 2.939405679702759, + "learning_rate": 1.8359325467898504e-05, + "loss": 0.289, "step": 666300 }, { - "epoch": 6.79, - "learning_rate": 3.630510137191846e-05, - "loss": 0.5456, + "epoch": 9.181339726102891, + "grad_norm": 13.283120155334473, + "learning_rate": 1.8351841740390532e-05, + "loss": 0.3518, "step": 666400 }, { - "epoch": 6.79, - "learning_rate": 3.629861578175704e-05, - "loss": 0.4726, + "epoch": 9.18271747816263, + "grad_norm": 13.674266815185547, + "learning_rate": 1.8344283192974478e-05, + "loss": 0.3232, "step": 666500 }, { - "epoch": 6.79, - "learning_rate": 3.629212988363089e-05, - "loss": 0.4826, + "epoch": 9.184095230222368, + "grad_norm": 2.1873505115509033, + "learning_rate": 1.83367254233834e-05, + "loss": 0.3638, "step": 666600 }, { - "epoch": 6.79, - "learning_rate": 3.6285643677857145e-05, - "loss": 0.5063, + "epoch": 9.185472982282109, + "grad_norm": 1.5052802562713623, + "learning_rate": 1.8329168432259378e-05, + "loss": 0.3584, "step": 666700 }, { - "epoch": 6.79, - "learning_rate": 3.6279157164752935e-05, - "loss": 0.4396, + "epoch": 9.186850734341848, + "grad_norm": 4.314074993133545, + "learning_rate": 1.832161222024441e-05, + "loss": 0.3115, "step": 666800 }, { - "epoch": 6.79, - "learning_rate": 3.6272670344635406e-05, - "loss": 0.4103, + "epoch": 9.188228486401588, + "grad_norm": 3.674027919769287, + "learning_rate": 1.8314056787980447e-05, + "loss": 0.3559, "step": 666900 }, { - "epoch": 6.8, - "learning_rate": 3.6266183217821734e-05, - "loss": 0.4805, + "epoch": 9.189606238461327, + "grad_norm": 5.653722763061523, + "learning_rate": 1.8306502136109355e-05, + "loss": 0.3865, "step": 667000 }, { - "epoch": 6.8, - "learning_rate": 3.62596957846291e-05, - "loss": 0.4982, + "epoch": 9.190983990521065, + "grad_norm": 1.302977204322815, + "learning_rate": 1.8298948265272938e-05, + "loss": 0.3282, "step": 667100 }, { - "epoch": 6.8, - "learning_rate": 3.625320804537468e-05, - "loss": 0.5055, + "epoch": 9.192361742580806, + "grad_norm": 7.035192012786865, + "learning_rate": 1.829139517611294e-05, + "loss": 0.3382, "step": 667200 }, { - "epoch": 6.8, - "learning_rate": 3.6246720000375725e-05, - "loss": 0.4847, + "epoch": 9.193739494640544, + "grad_norm": 1.8187967538833618, + "learning_rate": 1.8283842869271026e-05, + "loss": 0.361, "step": 667300 }, { - "epoch": 6.8, - "learning_rate": 3.6240231649949445e-05, - "loss": 0.4427, + "epoch": 9.195117246700283, + "grad_norm": 2.8751726150512695, + "learning_rate": 1.8276291345388813e-05, + "loss": 0.3543, "step": 667400 }, { - "epoch": 6.8, - "learning_rate": 3.6233742994413055e-05, - "loss": 0.4951, + "epoch": 9.196494998760024, + "grad_norm": 1.1306523084640503, + "learning_rate": 1.8268740605107838e-05, + "loss": 0.3185, "step": 667500 }, { - "epoch": 6.8, - "learning_rate": 3.6227254034083846e-05, - "loss": 0.4521, + "epoch": 9.197872750819762, + "grad_norm": 2.5310068130493164, + "learning_rate": 1.8261190649069584e-05, + "loss": 0.3017, "step": 667600 }, { - "epoch": 6.8, - "learning_rate": 3.6220764769279075e-05, - "loss": 0.4607, + "epoch": 9.199250502879503, + "grad_norm": 2.3662636280059814, + "learning_rate": 1.8253641477915443e-05, + "loss": 0.28, "step": 667700 }, { - "epoch": 6.8, - "learning_rate": 3.621427520031602e-05, - "loss": 0.539, + "epoch": 9.200628254939241, + "grad_norm": 4.916684150695801, + "learning_rate": 1.824609309228676e-05, + "loss": 0.3157, "step": 667800 }, { - "epoch": 6.8, - "learning_rate": 3.620778532751202e-05, - "loss": 0.5401, + "epoch": 9.20200600699898, + "grad_norm": 2.3282675743103027, + "learning_rate": 1.823854549282481e-05, + "loss": 0.2895, "step": 667900 }, { - "epoch": 6.81, - "learning_rate": 3.620136005444901e-05, - "loss": 0.4674, + "epoch": 9.20338375905872, + "grad_norm": 4.90512228012085, + "learning_rate": 1.823099868017081e-05, + "loss": 0.3449, "step": 668000 }, { - "epoch": 6.81, - "learning_rate": 3.619486957794552e-05, - "loss": 0.5241, + "epoch": 9.204761511118459, + "grad_norm": 2.478827714920044, + "learning_rate": 1.822345265496588e-05, + "loss": 0.353, "step": 668100 }, { - "epoch": 6.81, - "learning_rate": 3.618837879854985e-05, - "loss": 0.4471, + "epoch": 9.206139263178198, + "grad_norm": 2.5820541381835938, + "learning_rate": 1.8215907417851126e-05, + "loss": 0.2918, "step": 668200 }, { - "epoch": 6.81, - "learning_rate": 3.6181887716579395e-05, - "loss": 0.4803, + "epoch": 9.207517015237938, + "grad_norm": 2.040433645248413, + "learning_rate": 1.8208362969467514e-05, + "loss": 0.3412, "step": 668300 }, { - "epoch": 6.81, - "learning_rate": 3.617539633235153e-05, - "loss": 0.4539, + "epoch": 9.208894767297677, + "grad_norm": 2.73866605758667, + "learning_rate": 1.820089474313663e-05, + "loss": 0.3219, "step": 668400 }, { - "epoch": 6.81, - "learning_rate": 3.6168904646183624e-05, - "loss": 0.4693, + "epoch": 9.210272519357417, + "grad_norm": 0.9639937877655029, + "learning_rate": 1.8193351866234795e-05, + "loss": 0.3192, "step": 668500 }, { - "epoch": 6.81, - "learning_rate": 3.616241265839311e-05, - "loss": 0.4683, + "epoch": 9.211650271417156, + "grad_norm": 0.4384617805480957, + "learning_rate": 1.8185809779980334e-05, + "loss": 0.3682, "step": 668600 }, { - "epoch": 6.81, - "learning_rate": 3.615592036929739e-05, - "loss": 0.6487, + "epoch": 9.213028023476895, + "grad_norm": 4.066040992736816, + "learning_rate": 1.8178268485013994e-05, + "loss": 0.3221, "step": 668700 }, { - "epoch": 6.81, - "learning_rate": 3.614942777921389e-05, - "loss": 0.5398, + "epoch": 9.214405775536635, + "grad_norm": 4.149028301239014, + "learning_rate": 1.8170727981976425e-05, + "loss": 0.3076, "step": 668800 }, { - "epoch": 6.81, - "learning_rate": 3.614293488846007e-05, - "loss": 0.4632, + "epoch": 9.215783527596374, + "grad_norm": 3.9431777000427246, + "learning_rate": 1.816318827150824e-05, + "loss": 0.3431, "step": 668900 }, { - "epoch": 6.82, - "learning_rate": 3.6136441697353403e-05, - "loss": 0.4871, + "epoch": 9.217161279656112, + "grad_norm": 4.759339809417725, + "learning_rate": 1.815564935424998e-05, + "loss": 0.3543, "step": 669000 }, { - "epoch": 6.82, - "learning_rate": 3.6129948206211356e-05, - "loss": 0.5138, + "epoch": 9.218539031715853, + "grad_norm": 4.536906719207764, + "learning_rate": 1.8148111230842113e-05, + "loss": 0.2961, "step": 669100 }, { - "epoch": 6.82, - "learning_rate": 3.612345441535142e-05, - "loss": 0.5199, + "epoch": 9.219916783775592, + "grad_norm": 0.23387058079242706, + "learning_rate": 1.8140573901925043e-05, + "loss": 0.3275, "step": 669200 }, { - "epoch": 6.82, - "learning_rate": 3.61169603250911e-05, - "loss": 0.5318, + "epoch": 9.22129453583533, + "grad_norm": 19.41399574279785, + "learning_rate": 1.8133037368139108e-05, + "loss": 0.2992, "step": 669300 }, { - "epoch": 6.82, - "learning_rate": 3.6110465935747936e-05, - "loss": 0.5054, + "epoch": 9.22267228789507, + "grad_norm": 3.484515905380249, + "learning_rate": 1.8125501630124558e-05, + "loss": 0.3622, "step": 669400 }, { - "epoch": 6.82, - "learning_rate": 3.610397124763944e-05, - "loss": 0.4647, + "epoch": 9.22405003995481, + "grad_norm": 3.081313133239746, + "learning_rate": 1.811796668852161e-05, + "loss": 0.3477, "step": 669500 }, { - "epoch": 6.82, - "learning_rate": 3.609747626108319e-05, - "loss": 0.5289, + "epoch": 9.22542779201455, + "grad_norm": 4.139540672302246, + "learning_rate": 1.811043254397039e-05, + "loss": 0.363, "step": 669600 }, { - "epoch": 6.82, - "learning_rate": 3.609098097639673e-05, - "loss": 0.5634, + "epoch": 9.226805544074288, + "grad_norm": 10.904473304748535, + "learning_rate": 1.810289919711096e-05, + "loss": 0.2931, "step": 669700 }, { - "epoch": 6.82, - "learning_rate": 3.608448539389765e-05, - "loss": 0.5382, + "epoch": 9.228183296134027, + "grad_norm": 5.84497594833374, + "learning_rate": 1.8095366648583326e-05, + "loss": 0.3407, "step": 669800 }, { - "epoch": 6.83, - "learning_rate": 3.607798951390355e-05, - "loss": 0.4902, + "epoch": 9.229561048193768, + "grad_norm": 2.066365957260132, + "learning_rate": 1.8087834899027397e-05, + "loss": 0.3507, "step": 669900 }, { - "epoch": 6.83, - "learning_rate": 3.607149333673203e-05, - "loss": 0.4505, + "epoch": 9.230938800253506, + "grad_norm": 2.467874765396118, + "learning_rate": 1.808030394908305e-05, + "loss": 0.3029, "step": 670000 }, { - "epoch": 6.83, - "learning_rate": 3.6065061828909445e-05, - "loss": 0.4815, + "epoch": 9.232316552313245, + "grad_norm": 4.054433345794678, + "learning_rate": 1.8072773799390075e-05, + "loss": 0.3059, "step": 670100 }, { - "epoch": 6.83, - "learning_rate": 3.6058565061299834e-05, - "loss": 0.5913, + "epoch": 9.233694304372985, + "grad_norm": 6.117511749267578, + "learning_rate": 1.8065244450588197e-05, + "loss": 0.3228, "step": 670200 }, { - "epoch": 6.83, - "learning_rate": 3.605206799746254e-05, - "loss": 0.4183, + "epoch": 9.235072056432724, + "grad_norm": 2.0441293716430664, + "learning_rate": 1.805771590331706e-05, + "loss": 0.3137, "step": 670300 }, { - "epoch": 6.83, - "learning_rate": 3.604557063771524e-05, - "loss": 0.5186, + "epoch": 9.236449808492464, + "grad_norm": 1.0492639541625977, + "learning_rate": 1.8050188158216277e-05, + "loss": 0.2872, "step": 670400 }, { - "epoch": 6.83, - "learning_rate": 3.603907298237562e-05, - "loss": 0.4363, + "epoch": 9.237827560552203, + "grad_norm": 1.9644291400909424, + "learning_rate": 1.8042661215925354e-05, + "loss": 0.3598, "step": 670500 }, { - "epoch": 6.83, - "learning_rate": 3.603257503176137e-05, - "loss": 0.5108, + "epoch": 9.239205312611942, + "grad_norm": 6.718084335327148, + "learning_rate": 1.803513507708374e-05, + "loss": 0.2659, "step": 670600 }, { - "epoch": 6.83, - "learning_rate": 3.6026076786190196e-05, - "loss": 0.5005, + "epoch": 9.240583064671682, + "grad_norm": 1.1509180068969727, + "learning_rate": 1.8027609742330823e-05, + "loss": 0.3199, "step": 670700 }, { - "epoch": 6.83, - "learning_rate": 3.601957824597982e-05, - "loss": 0.4583, + "epoch": 9.241960816731421, + "grad_norm": 3.4187633991241455, + "learning_rate": 1.8020085212305914e-05, + "loss": 0.322, "step": 670800 }, { - "epoch": 6.84, - "learning_rate": 3.6013079411448e-05, - "loss": 0.5036, + "epoch": 9.24333856879116, + "grad_norm": 4.264100551605225, + "learning_rate": 1.8012561487648277e-05, + "loss": 0.2814, "step": 670900 }, { - "epoch": 6.84, - "learning_rate": 3.600658028291249e-05, - "loss": 0.4659, + "epoch": 9.2447163208509, + "grad_norm": 2.0538039207458496, + "learning_rate": 1.8005113794191756e-05, + "loss": 0.3257, "step": 671000 }, { - "epoch": 6.84, - "learning_rate": 3.600008086069104e-05, - "loss": 0.5261, + "epoch": 9.246094072910639, + "grad_norm": 3.604013442993164, + "learning_rate": 1.7997591674116478e-05, + "loss": 0.3224, "step": 671100 }, { - "epoch": 6.84, - "learning_rate": 3.599358114510145e-05, - "loss": 0.5056, + "epoch": 9.24747182497038, + "grad_norm": 3.001390218734741, + "learning_rate": 1.7990070361319404e-05, + "loss": 0.285, "step": 671200 }, { - "epoch": 6.84, - "learning_rate": 3.59870811364615e-05, - "loss": 0.4737, + "epoch": 9.248849577030118, + "grad_norm": 3.397099494934082, + "learning_rate": 1.798254985643951e-05, + "loss": 0.3398, "step": 671300 }, { - "epoch": 6.84, - "learning_rate": 3.5980580835089015e-05, - "loss": 0.4706, + "epoch": 9.250227329089856, + "grad_norm": 3.9213922023773193, + "learning_rate": 1.7975030160115684e-05, + "loss": 0.3565, "step": 671400 }, { - "epoch": 6.84, - "learning_rate": 3.597408024130182e-05, - "loss": 0.4643, + "epoch": 9.251605081149597, + "grad_norm": 2.9863834381103516, + "learning_rate": 1.7967511272986796e-05, + "loss": 0.3358, "step": 671500 }, { - "epoch": 6.84, - "learning_rate": 3.596757935541775e-05, - "loss": 0.5279, + "epoch": 9.252982833209336, + "grad_norm": 1.9003570079803467, + "learning_rate": 1.7959993195691575e-05, + "loss": 0.3503, "step": 671600 }, { - "epoch": 6.84, - "learning_rate": 3.596107817775466e-05, - "loss": 0.481, + "epoch": 9.254360585269074, + "grad_norm": 4.73380184173584, + "learning_rate": 1.7952475928868747e-05, + "loss": 0.3247, "step": 671700 }, { - "epoch": 6.84, - "learning_rate": 3.595457670863043e-05, - "loss": 0.4979, + "epoch": 9.255738337328815, + "grad_norm": 4.568268299102783, + "learning_rate": 1.7944959473156927e-05, + "loss": 0.3281, "step": 671800 }, { - "epoch": 6.85, - "learning_rate": 3.594807494836293e-05, - "loss": 0.4025, + "epoch": 9.257116089388553, + "grad_norm": 3.1382946968078613, + "learning_rate": 1.7937443829194687e-05, + "loss": 0.3268, "step": 671900 }, { - "epoch": 6.85, - "learning_rate": 3.5941572897270056e-05, - "loss": 0.5739, + "epoch": 9.258493841448294, + "grad_norm": 1.7915526628494263, + "learning_rate": 1.7929928997620527e-05, + "loss": 0.3586, "step": 672000 }, { - "epoch": 6.85, - "learning_rate": 3.59351355805227e-05, - "loss": 0.4507, + "epoch": 9.259871593508032, + "grad_norm": 2.7113542556762695, + "learning_rate": 1.7922414979072842e-05, + "loss": 0.3164, "step": 672100 }, { - "epoch": 6.85, - "learning_rate": 3.592863295163316e-05, - "loss": 0.496, + "epoch": 9.261249345567771, + "grad_norm": 2.5293684005737305, + "learning_rate": 1.791490177419001e-05, + "loss": 0.3133, "step": 672200 }, { - "epoch": 6.85, - "learning_rate": 3.5922130032868844e-05, - "loss": 0.47, + "epoch": 9.262627097627512, + "grad_norm": 1.965452790260315, + "learning_rate": 1.7907389383610306e-05, + "loss": 0.3511, "step": 672300 }, { - "epoch": 6.85, - "learning_rate": 3.591562682454772e-05, - "loss": 0.5211, + "epoch": 9.26400484968725, + "grad_norm": 2.111309051513672, + "learning_rate": 1.7899877807971947e-05, + "loss": 0.3235, "step": 672400 }, { - "epoch": 6.85, - "learning_rate": 3.5909123326987746e-05, - "loss": 0.4861, + "epoch": 9.265382601746989, + "grad_norm": 0.14067216217517853, + "learning_rate": 1.7892367047913088e-05, + "loss": 0.3647, "step": 672500 }, { - "epoch": 6.85, - "learning_rate": 3.5902619540506905e-05, - "loss": 0.4926, + "epoch": 9.26676035380673, + "grad_norm": 21.046375274658203, + "learning_rate": 1.7884857104071802e-05, + "loss": 0.3543, "step": 672600 }, { - "epoch": 6.85, - "learning_rate": 3.589618050760158e-05, - "loss": 0.4992, + "epoch": 9.268138105866468, + "grad_norm": 4.697961807250977, + "learning_rate": 1.7877347977086096e-05, + "loss": 0.3182, "step": 672700 }, { - "epoch": 6.85, - "learning_rate": 3.588974119214512e-05, - "loss": 0.5105, + "epoch": 9.269515857926208, + "grad_norm": 4.218894004821777, + "learning_rate": 1.7869839667593906e-05, + "loss": 0.3453, "step": 672800 }, { - "epoch": 6.86, - "learning_rate": 3.588323654656593e-05, - "loss": 0.4958, + "epoch": 9.270893609985947, + "grad_norm": 2.187321186065674, + "learning_rate": 1.7862332176233097e-05, + "loss": 0.3644, "step": 672900 }, { - "epoch": 6.86, - "learning_rate": 3.587673161333159e-05, - "loss": 0.543, + "epoch": 9.272271362045686, + "grad_norm": 3.0138494968414307, + "learning_rate": 1.785482550364148e-05, + "loss": 0.282, "step": 673000 }, { - "epoch": 6.86, - "learning_rate": 3.587022639276014e-05, - "loss": 0.4032, + "epoch": 9.273649114105426, + "grad_norm": 0.10651703178882599, + "learning_rate": 1.7847319650456793e-05, + "loss": 0.2964, "step": 673100 }, { - "epoch": 6.86, - "learning_rate": 3.586372088516965e-05, - "loss": 0.4216, + "epoch": 9.275026866165165, + "grad_norm": 1.2092101573944092, + "learning_rate": 1.7839889663586754e-05, + "loss": 0.3494, "step": 673200 }, { - "epoch": 6.86, - "learning_rate": 3.58572150908782e-05, - "loss": 0.5138, + "epoch": 9.276404618224904, + "grad_norm": 1.141021490097046, + "learning_rate": 1.7832385442918832e-05, + "loss": 0.307, "step": 673300 }, { - "epoch": 6.86, - "learning_rate": 3.585070901020389e-05, - "loss": 0.4552, + "epoch": 9.277782370284644, + "grad_norm": 1.4206140041351318, + "learning_rate": 1.782488204356422e-05, + "loss": 0.3453, "step": 673400 }, { - "epoch": 6.86, - "learning_rate": 3.584420264346482e-05, - "loss": 0.5489, + "epoch": 9.279160122344383, + "grad_norm": 11.561460494995117, + "learning_rate": 1.7817379466160374e-05, + "loss": 0.2967, "step": 673500 }, { - "epoch": 6.86, - "learning_rate": 3.583769599097911e-05, - "loss": 0.4855, + "epoch": 9.280537874404121, + "grad_norm": 16.791946411132812, + "learning_rate": 1.780987771134468e-05, + "loss": 0.3177, "step": 673600 }, { - "epoch": 6.86, - "learning_rate": 3.583118905306492e-05, - "loss": 0.5079, + "epoch": 9.281915626463862, + "grad_norm": 10.034136772155762, + "learning_rate": 1.780237677975446e-05, + "loss": 0.3102, "step": 673700 }, { - "epoch": 6.86, - "learning_rate": 3.582468183004037e-05, - "loss": 0.4652, + "epoch": 9.2832933785236, + "grad_norm": 1.6947712898254395, + "learning_rate": 1.779487667202693e-05, + "loss": 0.3548, "step": 673800 }, { - "epoch": 6.87, - "learning_rate": 3.581817432222365e-05, - "loss": 0.5611, + "epoch": 9.284671130583341, + "grad_norm": 1.68232262134552, + "learning_rate": 1.7787377388799282e-05, + "loss": 0.3516, "step": 673900 }, { - "epoch": 6.87, - "learning_rate": 3.5811666529932924e-05, - "loss": 0.4351, + "epoch": 9.28604888264308, + "grad_norm": 0.639404833316803, + "learning_rate": 1.7779878930708617e-05, + "loss": 0.3438, "step": 674000 }, { - "epoch": 6.87, - "learning_rate": 3.580515845348638e-05, - "loss": 0.4841, + "epoch": 9.287426634702818, + "grad_norm": 5.2984113693237305, + "learning_rate": 1.7772381298391958e-05, + "loss": 0.3559, "step": 674100 }, { - "epoch": 6.87, - "learning_rate": 3.579865009320223e-05, - "loss": 0.5385, + "epoch": 9.288804386762559, + "grad_norm": 0.9397193789482117, + "learning_rate": 1.776488449248629e-05, + "loss": 0.362, "step": 674200 }, { - "epoch": 6.87, - "learning_rate": 3.579214144939871e-05, - "loss": 0.4425, + "epoch": 9.290182138822297, + "grad_norm": 1.8699778318405151, + "learning_rate": 1.775738851362847e-05, + "loss": 0.316, "step": 674300 }, { - "epoch": 6.87, - "learning_rate": 3.5785632522394024e-05, - "loss": 0.4961, + "epoch": 9.291559890882036, + "grad_norm": 3.7849740982055664, + "learning_rate": 1.774989336245535e-05, + "loss": 0.33, "step": 674400 }, { - "epoch": 6.87, - "learning_rate": 3.5779123312506444e-05, - "loss": 0.5677, + "epoch": 9.292937642941776, + "grad_norm": 4.329867362976074, + "learning_rate": 1.7742399039603664e-05, + "loss": 0.3824, "step": 674500 }, { - "epoch": 6.87, - "learning_rate": 3.5772613820054215e-05, - "loss": 0.4267, + "epoch": 9.294315395001515, + "grad_norm": 3.1027746200561523, + "learning_rate": 1.7734905545710096e-05, + "loss": 0.3064, "step": 674600 }, { - "epoch": 6.87, - "learning_rate": 3.576610404535562e-05, - "loss": 0.4522, + "epoch": 9.295693147061256, + "grad_norm": 2.9407665729522705, + "learning_rate": 1.7727412881411265e-05, + "loss": 0.3151, "step": 674700 }, { - "epoch": 6.87, - "learning_rate": 3.575959398872895e-05, - "loss": 0.4864, + "epoch": 9.297070899120994, + "grad_norm": 6.45418119430542, + "learning_rate": 1.7719921047343714e-05, + "loss": 0.3519, "step": 674800 }, { - "epoch": 6.88, - "learning_rate": 3.5753083650492506e-05, - "loss": 0.4657, + "epoch": 9.298448651180733, + "grad_norm": 0.926025390625, + "learning_rate": 1.77124300441439e-05, + "loss": 0.3172, "step": 674900 }, { - "epoch": 6.88, - "learning_rate": 3.5746573030964604e-05, - "loss": 0.5481, + "epoch": 9.299826403240473, + "grad_norm": 1.4097199440002441, + "learning_rate": 1.7704939872448222e-05, + "loss": 0.3266, "step": 675000 }, { - "epoch": 6.88, - "learning_rate": 3.5740062130463575e-05, - "loss": 0.4292, + "epoch": 9.301204155300212, + "grad_norm": 2.1308705806732178, + "learning_rate": 1.769745053289301e-05, + "loss": 0.3102, "step": 675100 }, { - "epoch": 6.88, - "learning_rate": 3.573355094930776e-05, - "loss": 0.3947, + "epoch": 9.30258190735995, + "grad_norm": 2.81240177154541, + "learning_rate": 1.768996202611453e-05, + "loss": 0.3598, "step": 675200 }, { - "epoch": 6.88, - "learning_rate": 3.572703948781553e-05, - "loss": 0.4997, + "epoch": 9.303959659419691, + "grad_norm": 2.48207426071167, + "learning_rate": 1.7682474352748966e-05, + "loss": 0.3153, "step": 675300 }, { - "epoch": 6.88, - "learning_rate": 3.5720527746305224e-05, - "loss": 0.4819, + "epoch": 9.30533741147943, + "grad_norm": 6.205196380615234, + "learning_rate": 1.767498751343243e-05, + "loss": 0.3422, "step": 675400 }, { - "epoch": 6.88, - "learning_rate": 3.571401572509526e-05, - "loss": 0.4822, + "epoch": 9.30671516353917, + "grad_norm": 1.7789554595947266, + "learning_rate": 1.7667501508800964e-05, + "loss": 0.3088, "step": 675500 }, { - "epoch": 6.88, - "learning_rate": 3.570750342450404e-05, - "loss": 0.5081, + "epoch": 9.308092915598909, + "grad_norm": 0.8953598141670227, + "learning_rate": 1.7660091187046727e-05, + "loss": 0.3108, "step": 675600 }, { - "epoch": 6.88, - "learning_rate": 3.570099084484995e-05, - "loss": 0.4565, + "epoch": 9.309470667658648, + "grad_norm": 7.384527683258057, + "learning_rate": 1.7652681684607667e-05, + "loss": 0.3331, "step": 675700 }, { - "epoch": 6.89, - "learning_rate": 3.569447798645143e-05, - "loss": 0.496, + "epoch": 9.310848419718388, + "grad_norm": 20.89877700805664, + "learning_rate": 1.7645198171108912e-05, + "loss": 0.2974, "step": 675800 }, { - "epoch": 6.89, - "learning_rate": 3.568802998237233e-05, - "loss": 0.5476, + "epoch": 9.312226171778127, + "grad_norm": 24.380756378173828, + "learning_rate": 1.7637715494825982e-05, + "loss": 0.3273, "step": 675900 }, { - "epoch": 6.89, - "learning_rate": 3.568151657021978e-05, - "loss": 0.5375, + "epoch": 9.313603923837865, + "grad_norm": 2.0169479846954346, + "learning_rate": 1.763023365639458e-05, + "loss": 0.2865, "step": 676000 }, { - "epoch": 6.89, - "learning_rate": 3.567500288027498e-05, - "loss": 0.5519, + "epoch": 9.314981675897606, + "grad_norm": 2.934162139892578, + "learning_rate": 1.7622752656450316e-05, + "loss": 0.3153, "step": 676100 }, { - "epoch": 6.89, - "learning_rate": 3.56684889128564e-05, - "loss": 0.5217, + "epoch": 9.316359427957345, + "grad_norm": 4.041893005371094, + "learning_rate": 1.761527249562875e-05, + "loss": 0.3614, "step": 676200 }, { - "epoch": 6.89, - "learning_rate": 3.566197466828255e-05, - "loss": 0.5068, + "epoch": 9.317737180017085, + "grad_norm": 21.639204025268555, + "learning_rate": 1.7607793174565354e-05, + "loss": 0.3131, "step": 676300 }, { - "epoch": 6.89, - "learning_rate": 3.565546014687192e-05, - "loss": 0.477, + "epoch": 9.319114932076824, + "grad_norm": 4.907148838043213, + "learning_rate": 1.7600314693895543e-05, + "loss": 0.3674, "step": 676400 }, { - "epoch": 6.89, - "learning_rate": 3.5648945348943036e-05, - "loss": 0.5241, + "epoch": 9.320492684136562, + "grad_norm": 1.8368486166000366, + "learning_rate": 1.759283705425464e-05, + "loss": 0.355, "step": 676500 }, { - "epoch": 6.89, - "learning_rate": 3.5642430274814444e-05, - "loss": 0.5511, + "epoch": 9.321870436196303, + "grad_norm": 5.02472448348999, + "learning_rate": 1.7585360256277906e-05, + "loss": 0.302, "step": 676600 }, { - "epoch": 6.89, - "learning_rate": 3.563591492480468e-05, - "loss": 0.5716, + "epoch": 9.323248188256041, + "grad_norm": 2.847317695617676, + "learning_rate": 1.7577884300600547e-05, + "loss": 0.348, "step": 676700 }, { - "epoch": 6.9, - "learning_rate": 3.5629399299232304e-05, - "loss": 0.5128, + "epoch": 9.32462594031578, + "grad_norm": 11.740234375, + "learning_rate": 1.757040918785768e-05, + "loss": 0.3361, "step": 676800 }, { - "epoch": 6.9, - "learning_rate": 3.562288339841591e-05, - "loss": 0.4848, + "epoch": 9.32600369237552, + "grad_norm": 3.729403495788574, + "learning_rate": 1.756293491868436e-05, + "loss": 0.3309, "step": 676900 }, { - "epoch": 6.9, - "learning_rate": 3.5616367222674055e-05, - "loss": 0.4499, + "epoch": 9.32738144443526, + "grad_norm": 3.784461736679077, + "learning_rate": 1.7555461493715544e-05, + "loss": 0.2769, "step": 677000 }, { - "epoch": 6.9, - "learning_rate": 3.5609850772325376e-05, - "loss": 0.4991, + "epoch": 9.328759196495, + "grad_norm": 3.5498251914978027, + "learning_rate": 1.7547988913586148e-05, + "loss": 0.3702, "step": 677100 }, { - "epoch": 6.9, - "learning_rate": 3.560333404768847e-05, - "loss": 0.5605, + "epoch": 9.330136948554738, + "grad_norm": 2.397456645965576, + "learning_rate": 1.7540517178931013e-05, + "loss": 0.3027, "step": 677200 }, { - "epoch": 6.9, - "learning_rate": 3.5596817049081965e-05, - "loss": 0.5085, + "epoch": 9.331514700614477, + "grad_norm": 3.777019500732422, + "learning_rate": 1.75330462903849e-05, + "loss": 0.3347, "step": 677300 }, { - "epoch": 6.9, - "learning_rate": 3.5590299776824495e-05, - "loss": 0.5279, + "epoch": 9.332892452674217, + "grad_norm": 2.6081326007843018, + "learning_rate": 1.752565094480704e-05, + "loss": 0.3224, "step": 677400 }, { - "epoch": 6.9, - "learning_rate": 3.5583782231234743e-05, - "loss": 0.6079, + "epoch": 9.334270204733956, + "grad_norm": 2.033360242843628, + "learning_rate": 1.751818174190604e-05, + "loss": 0.364, "step": 677500 }, { - "epoch": 6.9, - "learning_rate": 3.557726441263136e-05, - "loss": 0.4581, + "epoch": 9.335647956793695, + "grad_norm": 3.4498722553253174, + "learning_rate": 1.7510713387011563e-05, + "loss": 0.3195, "step": 677600 }, { - "epoch": 6.9, - "learning_rate": 3.557074632133303e-05, - "loss": 0.4687, + "epoch": 9.337025708853435, + "grad_norm": 1.201979160308838, + "learning_rate": 1.750324588075808e-05, + "loss": 0.3146, "step": 677700 }, { - "epoch": 6.91, - "learning_rate": 3.556422795765844e-05, - "loss": 0.5262, + "epoch": 9.338403460913174, + "grad_norm": 5.784176349639893, + "learning_rate": 1.749577922378001e-05, + "loss": 0.3293, "step": 677800 }, { - "epoch": 6.91, - "learning_rate": 3.5557709321926315e-05, - "loss": 0.4798, + "epoch": 9.339781212972913, + "grad_norm": 7.811093807220459, + "learning_rate": 1.7488313416711677e-05, + "loss": 0.2953, "step": 677900 }, { - "epoch": 6.91, - "learning_rate": 3.5551255604874146e-05, - "loss": 0.4504, + "epoch": 9.341158965032653, + "grad_norm": 3.642213821411133, + "learning_rate": 1.748084846018734e-05, + "loss": 0.3078, "step": 678000 }, { - "epoch": 6.91, - "learning_rate": 3.554473642869573e-05, - "loss": 0.5968, + "epoch": 9.342536717092392, + "grad_norm": 3.5646255016326904, + "learning_rate": 1.7473384354841188e-05, + "loss": 0.2859, "step": 678100 }, { - "epoch": 6.91, - "learning_rate": 3.553821698141281e-05, - "loss": 0.5011, + "epoch": 9.343914469152132, + "grad_norm": 9.357978820800781, + "learning_rate": 1.7465921101307315e-05, + "loss": 0.329, "step": 678200 }, { - "epoch": 6.91, - "learning_rate": 3.5531697263344125e-05, - "loss": 0.4457, + "epoch": 9.34529222121187, + "grad_norm": 3.851332664489746, + "learning_rate": 1.7458458700219787e-05, + "loss": 0.3201, "step": 678300 }, { - "epoch": 6.91, - "learning_rate": 3.552517727480845e-05, - "loss": 0.5849, + "epoch": 9.34666997327161, + "grad_norm": 3.7411699295043945, + "learning_rate": 1.7450997152212564e-05, + "loss": 0.3757, "step": 678400 }, { - "epoch": 6.91, - "learning_rate": 3.551865701612459e-05, - "loss": 0.55, + "epoch": 9.34804772533135, + "grad_norm": 1.7756787538528442, + "learning_rate": 1.744353645791954e-05, + "loss": 0.3122, "step": 678500 }, { - "epoch": 6.91, - "learning_rate": 3.551213648761133e-05, - "loss": 0.544, + "epoch": 9.349425477391089, + "grad_norm": 1.9279903173446655, + "learning_rate": 1.743607661797456e-05, + "loss": 0.3409, "step": 678600 }, { - "epoch": 6.91, - "learning_rate": 3.550561568958748e-05, - "loss": 0.4558, + "epoch": 9.350803229450827, + "grad_norm": 1.9116443395614624, + "learning_rate": 1.742861763301134e-05, + "loss": 0.3362, "step": 678700 }, { - "epoch": 6.92, - "learning_rate": 3.549909462237189e-05, - "loss": 0.5014, + "epoch": 9.352180981510568, + "grad_norm": 2.4937658309936523, + "learning_rate": 1.742115950366358e-05, + "loss": 0.3468, "step": 678800 }, { - "epoch": 6.92, - "learning_rate": 3.549257328628338e-05, - "loss": 0.4973, + "epoch": 9.353558733570306, + "grad_norm": 1.7982778549194336, + "learning_rate": 1.7413702230564883e-05, + "loss": 0.3173, "step": 678900 }, { - "epoch": 6.92, - "learning_rate": 3.548605168164083e-05, - "loss": 0.5116, + "epoch": 9.354936485630047, + "grad_norm": 13.983357429504395, + "learning_rate": 1.7406245814348778e-05, + "loss": 0.3355, "step": 679000 }, { - "epoch": 6.92, - "learning_rate": 3.547952980876309e-05, - "loss": 0.5169, + "epoch": 9.356314237689785, + "grad_norm": 3.6085236072540283, + "learning_rate": 1.739879025564875e-05, + "loss": 0.2948, "step": 679100 }, { - "epoch": 6.92, - "learning_rate": 3.547300766796903e-05, - "loss": 0.4444, + "epoch": 9.357691989749524, + "grad_norm": 9.648759841918945, + "learning_rate": 1.7391335555098146e-05, + "loss": 0.3697, "step": 679200 }, { - "epoch": 6.92, - "learning_rate": 3.546648525957755e-05, - "loss": 0.4847, + "epoch": 9.359069741809265, + "grad_norm": 1.647913932800293, + "learning_rate": 1.7383881713330314e-05, + "loss": 0.2968, "step": 679300 }, { - "epoch": 6.92, - "learning_rate": 3.5459962583907575e-05, - "loss": 0.4595, + "epoch": 9.360447493869003, + "grad_norm": 2.4777965545654297, + "learning_rate": 1.737642873097848e-05, + "loss": 0.3126, "step": 679400 }, { - "epoch": 6.92, - "learning_rate": 3.5453439641278014e-05, - "loss": 0.5173, + "epoch": 9.361825245928742, + "grad_norm": 4.222289562225342, + "learning_rate": 1.736897660867581e-05, + "loss": 0.3261, "step": 679500 }, { - "epoch": 6.92, - "learning_rate": 3.54469164320078e-05, - "loss": 0.5262, + "epoch": 9.363202997988482, + "grad_norm": 20.759096145629883, + "learning_rate": 1.7361525347055417e-05, + "loss": 0.3089, "step": 679600 }, { - "epoch": 6.92, - "learning_rate": 3.5440392956415874e-05, - "loss": 0.5186, + "epoch": 9.364580750048221, + "grad_norm": 0.8587434887886047, + "learning_rate": 1.7354074946750317e-05, + "loss": 0.3204, "step": 679700 }, { - "epoch": 6.93, - "learning_rate": 3.543386921482121e-05, - "loss": 0.449, + "epoch": 9.365958502107961, + "grad_norm": 1.9344478845596313, + "learning_rate": 1.7346625408393452e-05, + "loss": 0.2952, "step": 679800 }, { - "epoch": 6.93, - "learning_rate": 3.542734520754275e-05, - "loss": 0.5298, + "epoch": 9.3673362541677, + "grad_norm": 3.457611560821533, + "learning_rate": 1.73391767326177e-05, + "loss": 0.3534, "step": 679900 }, { - "epoch": 6.93, - "learning_rate": 3.542082093489951e-05, - "loss": 0.4692, + "epoch": 9.368714006227439, + "grad_norm": 2.8440911769866943, + "learning_rate": 1.7331728920055863e-05, + "loss": 0.2572, "step": 680000 }, { - "epoch": 6.93, - "learning_rate": 3.5414296397210476e-05, - "loss": 0.5563, + "epoch": 9.37009175828718, + "grad_norm": 3.2267119884490967, + "learning_rate": 1.732428197134068e-05, + "loss": 0.3592, "step": 680100 }, { - "epoch": 6.93, - "learning_rate": 3.540777159479466e-05, - "loss": 0.488, + "epoch": 9.371469510346918, + "grad_norm": 4.819392204284668, + "learning_rate": 1.7316835887104808e-05, + "loss": 0.3647, "step": 680200 }, { - "epoch": 6.93, - "learning_rate": 3.5401246527971074e-05, - "loss": 0.5281, + "epoch": 9.372847262406657, + "grad_norm": 53.90210723876953, + "learning_rate": 1.730939066798082e-05, + "loss": 0.3705, "step": 680300 }, { - "epoch": 6.93, - "learning_rate": 3.539472119705878e-05, - "loss": 0.5031, + "epoch": 9.374225014466397, + "grad_norm": 2.236374855041504, + "learning_rate": 1.730194631460123e-05, + "loss": 0.26, "step": 680400 }, { - "epoch": 6.93, - "learning_rate": 3.5388195602376805e-05, - "loss": 0.4847, + "epoch": 9.375602766526136, + "grad_norm": 8.308146476745605, + "learning_rate": 1.7294502827598465e-05, + "loss": 0.3124, "step": 680500 }, { - "epoch": 6.93, - "learning_rate": 3.538166974424422e-05, - "loss": 0.4981, + "epoch": 9.376980518585876, + "grad_norm": 0.9520137906074524, + "learning_rate": 1.7287060207604905e-05, + "loss": 0.3051, "step": 680600 }, { - "epoch": 6.94, - "learning_rate": 3.5375143622980106e-05, - "loss": 0.4627, + "epoch": 9.378358270645615, + "grad_norm": 2.7053496837615967, + "learning_rate": 1.7279618455252825e-05, + "loss": 0.3135, "step": 680700 }, { - "epoch": 6.94, - "learning_rate": 3.536861723890354e-05, - "loss": 0.4835, + "epoch": 9.379736022705353, + "grad_norm": 2.6034014225006104, + "learning_rate": 1.7272177571174453e-05, + "loss": 0.3162, "step": 680800 }, { - "epoch": 6.94, - "learning_rate": 3.5362090592333634e-05, - "loss": 0.5308, + "epoch": 9.381113774765094, + "grad_norm": 2.3545210361480713, + "learning_rate": 1.7264737556001915e-05, + "loss": 0.3039, "step": 680900 }, { - "epoch": 6.94, - "learning_rate": 3.5355563683589486e-05, - "loss": 0.5092, + "epoch": 9.382491526824833, + "grad_norm": 24.03419303894043, + "learning_rate": 1.7257298410367276e-05, + "loss": 0.3631, "step": 681000 }, { - "epoch": 6.94, - "learning_rate": 3.5349036512990246e-05, - "loss": 0.4558, + "epoch": 9.383869278884571, + "grad_norm": 2.7791194915771484, + "learning_rate": 1.724986013490255e-05, + "loss": 0.3174, "step": 681100 }, { - "epoch": 6.94, - "learning_rate": 3.534250908085504e-05, - "loss": 0.4877, + "epoch": 9.385247030944312, + "grad_norm": 0.7346885800361633, + "learning_rate": 1.7242422730239643e-05, + "loss": 0.3408, "step": 681200 }, { - "epoch": 6.94, - "learning_rate": 3.533598138750302e-05, - "loss": 0.5427, + "epoch": 9.38662478300405, + "grad_norm": 4.430757522583008, + "learning_rate": 1.7234986197010402e-05, + "loss": 0.3224, "step": 681300 }, { - "epoch": 6.94, - "learning_rate": 3.532945343325336e-05, - "loss": 0.5351, + "epoch": 9.38800253506379, + "grad_norm": 5.133533000946045, + "learning_rate": 1.7227550535846606e-05, + "loss": 0.2908, "step": 681400 }, { - "epoch": 6.94, - "learning_rate": 3.532292521842523e-05, - "loss": 0.5174, + "epoch": 9.38938028712353, + "grad_norm": 2.1571388244628906, + "learning_rate": 1.722011574737993e-05, + "loss": 0.384, "step": 681500 }, { - "epoch": 6.94, - "learning_rate": 3.531639674333782e-05, - "loss": 0.418, + "epoch": 9.390758039183268, + "grad_norm": 2.6111044883728027, + "learning_rate": 1.721268183224202e-05, + "loss": 0.3019, "step": 681600 }, { - "epoch": 6.95, - "learning_rate": 3.530986800831034e-05, - "loss": 0.5384, + "epoch": 9.392135791243009, + "grad_norm": 2.909450054168701, + "learning_rate": 1.720524879106442e-05, + "loss": 0.3285, "step": 681700 }, { - "epoch": 6.95, - "learning_rate": 3.530333901366199e-05, - "loss": 0.4506, + "epoch": 9.393513543302747, + "grad_norm": 19.432626724243164, + "learning_rate": 1.71978166244786e-05, + "loss": 0.3627, "step": 681800 }, { - "epoch": 6.95, - "learning_rate": 3.529680975971201e-05, - "loss": 0.4526, + "epoch": 9.394891295362486, + "grad_norm": 3.875836133956909, + "learning_rate": 1.719038533311597e-05, + "loss": 0.3003, "step": 681900 }, { - "epoch": 6.95, - "learning_rate": 3.529028024677966e-05, - "loss": 0.5298, + "epoch": 9.396269047422226, + "grad_norm": 2.5819971561431885, + "learning_rate": 1.7182954917607846e-05, + "loss": 0.3676, "step": 682000 }, { - "epoch": 6.95, - "learning_rate": 3.528375047518416e-05, - "loss": 0.4294, + "epoch": 9.397646799481965, + "grad_norm": 2.9338810443878174, + "learning_rate": 1.7175525378585476e-05, + "loss": 0.3217, "step": 682100 }, { - "epoch": 6.95, - "learning_rate": 3.52772204452448e-05, - "loss": 0.443, + "epoch": 9.399024551541704, + "grad_norm": 3.7809653282165527, + "learning_rate": 1.716809671668005e-05, + "loss": 0.372, "step": 682200 }, { - "epoch": 6.95, - "learning_rate": 3.5270690157280866e-05, - "loss": 0.4039, + "epoch": 9.400402303601444, + "grad_norm": 1.086075782775879, + "learning_rate": 1.7160668932522667e-05, + "loss": 0.3311, "step": 682300 }, { - "epoch": 6.95, - "learning_rate": 3.526415961161162e-05, - "loss": 0.4581, + "epoch": 9.401780055661183, + "grad_norm": 2.3986122608184814, + "learning_rate": 1.7153316291452085e-05, + "loss": 0.3289, "step": 682400 }, { - "epoch": 6.95, - "learning_rate": 3.5257628808556383e-05, - "loss": 0.6104, + "epoch": 9.403157807720923, + "grad_norm": 0.9070461392402649, + "learning_rate": 1.714589025589058e-05, + "loss": 0.2922, "step": 682500 }, { - "epoch": 6.95, - "learning_rate": 3.525109774843448e-05, - "loss": 0.5208, + "epoch": 9.404535559780662, + "grad_norm": 2.0744144916534424, + "learning_rate": 1.713846509996366e-05, + "loss": 0.307, "step": 682600 }, { - "epoch": 6.96, - "learning_rate": 3.5244566431565236e-05, - "loss": 0.543, + "epoch": 9.4059133118404, + "grad_norm": 3.639514446258545, + "learning_rate": 1.7131040824302132e-05, + "loss": 0.2896, "step": 682700 }, { - "epoch": 6.96, - "learning_rate": 3.523803485826799e-05, - "loss": 0.5433, + "epoch": 9.407291063900141, + "grad_norm": 1.4293047189712524, + "learning_rate": 1.7123617429536743e-05, + "loss": 0.2998, "step": 682800 }, { - "epoch": 6.96, - "learning_rate": 3.52315030288621e-05, - "loss": 0.5217, + "epoch": 9.40866881595988, + "grad_norm": 4.762004375457764, + "learning_rate": 1.7116194916298136e-05, + "loss": 0.3372, "step": 682900 }, { - "epoch": 6.96, - "learning_rate": 3.5224970943666925e-05, - "loss": 0.501, + "epoch": 9.410046568019618, + "grad_norm": 5.242640972137451, + "learning_rate": 1.7108773285216895e-05, + "loss": 0.3574, "step": 683000 }, { - "epoch": 6.96, - "learning_rate": 3.521843860300185e-05, - "loss": 0.4783, + "epoch": 9.411424320079359, + "grad_norm": 0.8914658427238464, + "learning_rate": 1.710135253692353e-05, + "loss": 0.31, "step": 683100 }, { - "epoch": 6.96, - "learning_rate": 3.521190600718627e-05, - "loss": 0.4824, + "epoch": 9.412802072139097, + "grad_norm": 7.477059841156006, + "learning_rate": 1.709393267204845e-05, + "loss": 0.305, "step": 683200 }, { - "epoch": 6.96, - "learning_rate": 3.52053731565396e-05, - "loss": 0.4812, + "epoch": 9.414179824198838, + "grad_norm": 3.8547801971435547, + "learning_rate": 1.7086513691222038e-05, + "loss": 0.3114, "step": 683300 }, { - "epoch": 6.96, - "learning_rate": 3.5198840051381244e-05, - "loss": 0.5769, + "epoch": 9.415557576258577, + "grad_norm": 2.740771770477295, + "learning_rate": 1.707909559507456e-05, + "loss": 0.357, "step": 683400 }, { - "epoch": 6.96, - "learning_rate": 3.5192306692030636e-05, - "loss": 0.4308, + "epoch": 9.416935328318315, + "grad_norm": 5.158901214599609, + "learning_rate": 1.707167838423622e-05, + "loss": 0.3032, "step": 683500 }, { - "epoch": 6.96, - "learning_rate": 3.518577307880721e-05, - "loss": 0.471, + "epoch": 9.418313080378056, + "grad_norm": 1.5831928253173828, + "learning_rate": 1.706426205933717e-05, + "loss": 0.3429, "step": 683600 }, { - "epoch": 6.97, - "learning_rate": 3.517923921203043e-05, - "loss": 0.5129, + "epoch": 9.419690832437794, + "grad_norm": 5.148906707763672, + "learning_rate": 1.7056846621007428e-05, + "loss": 0.299, "step": 683700 }, { - "epoch": 6.97, - "learning_rate": 3.517270509201975e-05, - "loss": 0.5355, + "epoch": 9.421068584497533, + "grad_norm": 3.8267674446105957, + "learning_rate": 1.7049432069877003e-05, + "loss": 0.2365, "step": 683800 }, { - "epoch": 6.97, - "learning_rate": 3.516617071909468e-05, - "loss": 0.5171, + "epoch": 9.422446336557273, + "grad_norm": 4.152651786804199, + "learning_rate": 1.704201840657578e-05, + "loss": 0.2809, "step": 683900 }, { - "epoch": 6.97, - "learning_rate": 3.515963609357468e-05, - "loss": 0.472, + "epoch": 9.423824088617012, + "grad_norm": 2.201584577560425, + "learning_rate": 1.7034605631733596e-05, + "loss": 0.3479, "step": 684000 }, { - "epoch": 6.97, - "learning_rate": 3.515310121577927e-05, - "loss": 0.6151, + "epoch": 9.425201840676753, + "grad_norm": 3.6741957664489746, + "learning_rate": 1.702719374598022e-05, + "loss": 0.3312, "step": 684100 }, { - "epoch": 6.97, - "learning_rate": 3.5146566086027966e-05, - "loss": 0.4364, + "epoch": 9.426579592736491, + "grad_norm": 0.8137636184692383, + "learning_rate": 1.70197827499453e-05, + "loss": 0.3093, "step": 684200 }, { - "epoch": 6.97, - "learning_rate": 3.5140030704640286e-05, - "loss": 0.5133, + "epoch": 9.42795734479623, + "grad_norm": 1.0340723991394043, + "learning_rate": 1.7012446740906037e-05, + "loss": 0.3416, "step": 684300 }, { - "epoch": 6.97, - "learning_rate": 3.5133495071935775e-05, - "loss": 0.5401, + "epoch": 9.42933509685597, + "grad_norm": 2.266164541244507, + "learning_rate": 1.700503751728389e-05, + "loss": 0.3557, "step": 684400 }, { - "epoch": 6.97, - "learning_rate": 3.5126959188233996e-05, - "loss": 0.5363, + "epoch": 9.430712848915709, + "grad_norm": 4.826414108276367, + "learning_rate": 1.6997629185262508e-05, + "loss": 0.2965, "step": 684500 }, { - "epoch": 6.97, - "learning_rate": 3.512042305385451e-05, - "loss": 0.4922, + "epoch": 9.432090600975448, + "grad_norm": 3.8050553798675537, + "learning_rate": 1.699022174547125e-05, + "loss": 0.373, "step": 684600 }, { - "epoch": 6.98, - "learning_rate": 3.51138866691169e-05, - "loss": 0.3739, + "epoch": 9.433468353035188, + "grad_norm": 2.3289053440093994, + "learning_rate": 1.698281519853943e-05, + "loss": 0.3381, "step": 684700 }, { - "epoch": 6.98, - "learning_rate": 3.510735003434074e-05, - "loss": 0.5071, + "epoch": 9.434846105094927, + "grad_norm": 0.971208393573761, + "learning_rate": 1.6975409545096264e-05, + "loss": 0.3062, "step": 684800 }, { - "epoch": 6.98, - "learning_rate": 3.510081314984564e-05, - "loss": 0.4713, + "epoch": 9.436223857154667, + "grad_norm": 1.1102173328399658, + "learning_rate": 1.696800478577089e-05, + "loss": 0.3748, "step": 684900 }, { - "epoch": 6.98, - "learning_rate": 3.509427601595123e-05, - "loss": 0.5095, + "epoch": 9.437601609214406, + "grad_norm": 1.7782244682312012, + "learning_rate": 1.6960600921192398e-05, + "loss": 0.3018, "step": 685000 }, { - "epoch": 6.98, - "learning_rate": 3.5087738632977107e-05, - "loss": 0.4793, + "epoch": 9.438979361274145, + "grad_norm": 4.1507110595703125, + "learning_rate": 1.695319795198978e-05, + "loss": 0.3308, "step": 685100 }, { - "epoch": 6.98, - "learning_rate": 3.508120100124294e-05, - "loss": 0.5293, + "epoch": 9.440357113333885, + "grad_norm": 2.7041454315185547, + "learning_rate": 1.6945795878791956e-05, + "loss": 0.2937, "step": 685200 }, { - "epoch": 6.98, - "learning_rate": 3.507466312106837e-05, - "loss": 0.4923, + "epoch": 9.441734865393624, + "grad_norm": 3.111372470855713, + "learning_rate": 1.693839470222776e-05, + "loss": 0.3103, "step": 685300 }, { - "epoch": 6.98, - "learning_rate": 3.506819037528315e-05, - "loss": 0.5233, + "epoch": 9.443112617453362, + "grad_norm": 3.622607469558716, + "learning_rate": 1.693099442292596e-05, + "loss": 0.3264, "step": 685400 }, { - "epoch": 6.98, - "learning_rate": 3.506165200166319e-05, - "loss": 0.5402, + "epoch": 9.444490369513103, + "grad_norm": 1.2135097980499268, + "learning_rate": 1.6923595041515265e-05, + "loss": 0.2964, "step": 685500 }, { - "epoch": 6.99, - "learning_rate": 3.5055113380558654e-05, - "loss": 0.3898, + "epoch": 9.445868121572842, + "grad_norm": 1.7570555210113525, + "learning_rate": 1.6916196558624275e-05, + "loss": 0.3142, "step": 685600 }, { - "epoch": 6.99, - "learning_rate": 3.5048639902194344e-05, - "loss": 0.5548, + "epoch": 9.447245873632582, + "grad_norm": 2.5322115421295166, + "learning_rate": 1.6908798974881533e-05, + "loss": 0.3333, "step": 685700 }, { - "epoch": 6.99, - "learning_rate": 3.504210078954663e-05, - "loss": 0.4224, + "epoch": 9.44862362569232, + "grad_norm": 2.4079370498657227, + "learning_rate": 1.6901402290915515e-05, + "loss": 0.2382, "step": 685800 }, { - "epoch": 6.99, - "learning_rate": 3.503556143037029e-05, - "loss": 0.5932, + "epoch": 9.45000137775206, + "grad_norm": 3.14982271194458, + "learning_rate": 1.689400650735458e-05, + "loss": 0.2722, "step": 685900 }, { - "epoch": 6.99, - "learning_rate": 3.502902182498504e-05, - "loss": 0.4524, + "epoch": 9.4513791298118, + "grad_norm": 1.9461138248443604, + "learning_rate": 1.6886611624827056e-05, + "loss": 0.3227, "step": 686000 }, { - "epoch": 6.99, - "learning_rate": 3.5022481973710635e-05, - "loss": 0.4871, + "epoch": 9.452756881871538, + "grad_norm": 3.121630907058716, + "learning_rate": 1.6879217643961175e-05, + "loss": 0.3746, "step": 686100 }, { - "epoch": 6.99, - "learning_rate": 3.5015941876866834e-05, - "loss": 0.5449, + "epoch": 9.454134633931277, + "grad_norm": 1.5833219289779663, + "learning_rate": 1.6871824565385082e-05, + "loss": 0.2665, "step": 686200 }, { - "epoch": 6.99, - "learning_rate": 3.5009401534773404e-05, - "loss": 0.5129, + "epoch": 9.455512385991018, + "grad_norm": 2.4706714153289795, + "learning_rate": 1.686443238972688e-05, + "loss": 0.3622, "step": 686300 }, { - "epoch": 6.99, - "learning_rate": 3.500286094775014e-05, - "loss": 0.4619, + "epoch": 9.456890138050756, + "grad_norm": 1.2587929964065552, + "learning_rate": 1.685704111761455e-05, + "loss": 0.3371, "step": 686400 }, { - "epoch": 6.99, - "learning_rate": 3.499632011611683e-05, - "loss": 0.5103, + "epoch": 9.458267890110495, + "grad_norm": 4.239849090576172, + "learning_rate": 1.6849650749676023e-05, + "loss": 0.3447, "step": 686500 }, { - "epoch": 7.0, - "learning_rate": 3.4989844452160704e-05, - "loss": 0.5073, + "epoch": 9.459645642170235, + "grad_norm": 5.095478057861328, + "learning_rate": 1.6842261286539153e-05, + "loss": 0.3254, "step": 686600 }, { - "epoch": 7.0, - "learning_rate": 3.498330313470486e-05, - "loss": 0.4449, + "epoch": 9.461023394229974, + "grad_norm": 4.181543350219727, + "learning_rate": 1.6834872728831712e-05, + "loss": 0.3111, "step": 686700 }, { - "epoch": 7.0, - "learning_rate": 3.4976761573595235e-05, - "loss": 0.4858, + "epoch": 9.462401146289714, + "grad_norm": 2.247666835784912, + "learning_rate": 1.6827485077181395e-05, + "loss": 0.3168, "step": 686800 }, { - "epoch": 7.0, - "learning_rate": 3.497028518839955e-05, - "loss": 0.4881, + "epoch": 9.463778898349453, + "grad_norm": 1.4321414232254028, + "learning_rate": 1.6820172195175336e-05, + "loss": 0.3077, "step": 686900 }, { - "epoch": 7.0, - "learning_rate": 3.4963743143370446e-05, - "loss": 0.48, + "epoch": 9.465156650409192, + "grad_norm": 3.2892682552337646, + "learning_rate": 1.681278634844581e-05, + "loss": 0.2918, "step": 687000 }, { - "epoch": 7.0, - "learning_rate": 3.495720085564393e-05, - "loss": 0.3663, + "epoch": 9.466534402468932, + "grad_norm": 4.064759731292725, + "learning_rate": 1.680540140964977e-05, + "loss": 0.3004, "step": 687100 }, { - "epoch": 7.0, - "learning_rate": 3.495065832553987e-05, - "loss": 0.4919, + "epoch": 9.46791215452867, + "grad_norm": 3.0618879795074463, + "learning_rate": 1.6798017379414593e-05, + "loss": 0.2754, "step": 687200 }, { - "epoch": 7.0, - "learning_rate": 3.494411555337817e-05, - "loss": 0.4042, + "epoch": 9.46928990658841, + "grad_norm": 1.4956387281417847, + "learning_rate": 1.679063425836758e-05, + "loss": 0.2771, "step": 687300 }, { - "epoch": 7.0, - "learning_rate": 3.493757253947872e-05, - "loss": 0.4198, + "epoch": 9.47066765864815, + "grad_norm": 3.4466359615325928, + "learning_rate": 1.6783252047135995e-05, + "loss": 0.3947, "step": 687400 }, { - "epoch": 7.0, - "learning_rate": 3.493102928416144e-05, - "loss": 0.5168, + "epoch": 9.472045410707889, + "grad_norm": 2.664890766143799, + "learning_rate": 1.6775870746346962e-05, + "loss": 0.3361, "step": 687500 }, { - "epoch": 7.01, - "learning_rate": 3.492448578774626e-05, - "loss": 0.4357, + "epoch": 9.473423162767629, + "grad_norm": 2.4691033363342285, + "learning_rate": 1.676856415601292e-05, + "loss": 0.3701, "step": 687600 }, { - "epoch": 7.01, - "learning_rate": 3.49179420505531e-05, - "loss": 0.4583, + "epoch": 9.474800914827368, + "grad_norm": 2.4638006687164307, + "learning_rate": 1.67611846688701e-05, + "loss": 0.3707, "step": 687700 }, { - "epoch": 7.01, - "learning_rate": 3.491139807290193e-05, - "loss": 0.4146, + "epoch": 9.476178666887106, + "grad_norm": 55.51746368408203, + "learning_rate": 1.6753806094044596e-05, + "loss": 0.3542, "step": 687800 }, { - "epoch": 7.01, - "learning_rate": 3.49048538551127e-05, - "loss": 0.4159, + "epoch": 9.477556418946847, + "grad_norm": 3.844655752182007, + "learning_rate": 1.6746428432163236e-05, + "loss": 0.3211, "step": 687900 }, { - "epoch": 7.01, - "learning_rate": 3.489830939750539e-05, - "loss": 0.4622, + "epoch": 9.478934171006586, + "grad_norm": 11.643254280090332, + "learning_rate": 1.6739051683852805e-05, + "loss": 0.3182, "step": 688000 }, { - "epoch": 7.01, - "learning_rate": 3.489176470039999e-05, - "loss": 0.4825, + "epoch": 9.480311923066324, + "grad_norm": 2.1945297718048096, + "learning_rate": 1.6731675849739976e-05, + "loss": 0.3429, "step": 688100 }, { - "epoch": 7.01, - "learning_rate": 3.488521976411647e-05, - "loss": 0.439, + "epoch": 9.481689675126065, + "grad_norm": 16.415245056152344, + "learning_rate": 1.6724300930451373e-05, + "loss": 0.3414, "step": 688200 }, { - "epoch": 7.01, - "learning_rate": 3.487867458897488e-05, - "loss": 0.3803, + "epoch": 9.483067427185803, + "grad_norm": 1.9102420806884766, + "learning_rate": 1.6716926926613536e-05, + "loss": 0.3383, "step": 688300 }, { - "epoch": 7.01, - "learning_rate": 3.48721291752952e-05, - "loss": 0.466, + "epoch": 9.484445179245544, + "grad_norm": 3.0731265544891357, + "learning_rate": 1.670955383885292e-05, + "loss": 0.3558, "step": 688400 }, { - "epoch": 7.01, - "learning_rate": 3.486558352339748e-05, - "loss": 0.4282, + "epoch": 9.485822931305282, + "grad_norm": 1.145594596862793, + "learning_rate": 1.670218166779592e-05, + "loss": 0.3362, "step": 688500 }, { - "epoch": 7.02, - "learning_rate": 3.485903763360177e-05, - "loss": 0.4476, + "epoch": 9.487200683365021, + "grad_norm": 2.1142444610595703, + "learning_rate": 1.669481041406882e-05, + "loss": 0.3044, "step": 688600 }, { - "epoch": 7.02, - "learning_rate": 3.485249150622812e-05, - "loss": 0.4935, + "epoch": 9.488578435424762, + "grad_norm": 55.36131286621094, + "learning_rate": 1.6687440078297854e-05, + "loss": 0.3174, "step": 688700 }, { - "epoch": 7.02, - "learning_rate": 3.484594514159658e-05, - "loss": 0.4784, + "epoch": 9.4899561874845, + "grad_norm": 1.893140435218811, + "learning_rate": 1.668007066110917e-05, + "loss": 0.3003, "step": 688800 }, { - "epoch": 7.02, - "learning_rate": 3.483939854002725e-05, - "loss": 0.5261, + "epoch": 9.491333939544239, + "grad_norm": 1.9860502481460571, + "learning_rate": 1.6672702163128838e-05, + "loss": 0.3562, "step": 688900 }, { - "epoch": 7.02, - "learning_rate": 3.48328517018402e-05, - "loss": 0.4173, + "epoch": 9.49271169160398, + "grad_norm": 1.3350133895874023, + "learning_rate": 1.6665334584982845e-05, + "loss": 0.3237, "step": 689000 }, { - "epoch": 7.02, - "learning_rate": 3.4826304627355555e-05, - "loss": 0.5298, + "epoch": 9.494089443663718, + "grad_norm": 3.0858402252197266, + "learning_rate": 1.6657967927297127e-05, + "loss": 0.3581, "step": 689100 }, { - "epoch": 7.02, - "learning_rate": 3.4819757316893414e-05, - "loss": 0.4602, + "epoch": 9.495467195723458, + "grad_norm": 1.2162363529205322, + "learning_rate": 1.6650602190697482e-05, + "loss": 0.2917, "step": 689200 }, { - "epoch": 7.02, - "learning_rate": 3.481320977077389e-05, - "loss": 0.375, + "epoch": 9.496844947783197, + "grad_norm": 2.634979248046875, + "learning_rate": 1.6643237375809692e-05, + "loss": 0.3358, "step": 689300 }, { - "epoch": 7.02, - "learning_rate": 3.4806661989317155e-05, - "loss": 0.4102, + "epoch": 9.498222699842936, + "grad_norm": 2.258401870727539, + "learning_rate": 1.6635873483259433e-05, + "loss": 0.294, "step": 689400 }, { - "epoch": 7.02, - "learning_rate": 3.480011397284333e-05, - "loss": 0.4009, + "epoch": 9.499600451902676, + "grad_norm": 3.4919583797454834, + "learning_rate": 1.662851051367229e-05, + "loss": 0.3145, "step": 689500 }, { - "epoch": 7.03, - "learning_rate": 3.479356572167256e-05, - "loss": 0.4654, + "epoch": 9.500978203962415, + "grad_norm": 3.4044504165649414, + "learning_rate": 1.662114846767382e-05, + "loss": 0.3252, "step": 689600 }, { - "epoch": 7.03, - "learning_rate": 3.478701723612504e-05, - "loss": 0.5416, + "epoch": 9.502355956022154, + "grad_norm": 6.067110061645508, + "learning_rate": 1.6613787345889422e-05, + "loss": 0.3135, "step": 689700 }, { - "epoch": 7.03, - "learning_rate": 3.478046851652094e-05, - "loss": 0.4529, + "epoch": 9.503733708081894, + "grad_norm": 24.15770721435547, + "learning_rate": 1.6606427148944494e-05, + "loss": 0.3438, "step": 689800 }, { - "epoch": 7.03, - "learning_rate": 3.477391956318047e-05, - "loss": 0.5293, + "epoch": 9.505111460141633, + "grad_norm": 2.949833631515503, + "learning_rate": 1.6599067877464305e-05, + "loss": 0.3092, "step": 689900 }, { - "epoch": 7.03, - "learning_rate": 3.476737037642382e-05, - "loss": 0.4453, + "epoch": 9.506489212201373, + "grad_norm": 5.739774703979492, + "learning_rate": 1.6591709532074063e-05, + "loss": 0.3439, "step": 690000 }, { - "epoch": 7.03, - "learning_rate": 3.47608209565712e-05, - "loss": 0.4366, + "epoch": 9.507866964261112, + "grad_norm": 1.464240312576294, + "learning_rate": 1.6584352113398913e-05, + "loss": 0.3439, "step": 690100 }, { - "epoch": 7.03, - "learning_rate": 3.475427130394286e-05, - "loss": 0.4538, + "epoch": 9.50924471632085, + "grad_norm": 2.0904932022094727, + "learning_rate": 1.6576995622063897e-05, + "loss": 0.3141, "step": 690200 }, { - "epoch": 7.03, - "learning_rate": 3.4747721418859016e-05, - "loss": 0.4469, + "epoch": 9.51062246838059, + "grad_norm": 5.582279205322266, + "learning_rate": 1.6569640058693983e-05, + "loss": 0.2866, "step": 690300 }, { - "epoch": 7.03, - "learning_rate": 3.4741171301639924e-05, - "loss": 0.4821, + "epoch": 9.51200022044033, + "grad_norm": 2.8447728157043457, + "learning_rate": 1.656228542391406e-05, + "loss": 0.3404, "step": 690400 }, { - "epoch": 7.03, - "learning_rate": 3.473462095260585e-05, - "loss": 0.4512, + "epoch": 9.513377972500068, + "grad_norm": 4.412662506103516, + "learning_rate": 1.6554931718348945e-05, + "loss": 0.2327, "step": 690500 }, { - "epoch": 7.04, - "learning_rate": 3.472807037207707e-05, - "loss": 0.4911, + "epoch": 9.514755724559809, + "grad_norm": 1.2669757604599, + "learning_rate": 1.6547578942623377e-05, + "loss": 0.3176, "step": 690600 }, { - "epoch": 7.04, - "learning_rate": 3.472151956037387e-05, - "loss": 0.4301, + "epoch": 9.516133476619547, + "grad_norm": 2.0791256427764893, + "learning_rate": 1.6540227097362018e-05, + "loss": 0.356, "step": 690700 }, { - "epoch": 7.04, - "learning_rate": 3.471496851781654e-05, - "loss": 0.4448, + "epoch": 9.517511228679286, + "grad_norm": 4.004638195037842, + "learning_rate": 1.6532876183189435e-05, + "loss": 0.3339, "step": 690800 }, { - "epoch": 7.04, - "learning_rate": 3.470841724472538e-05, - "loss": 0.4552, + "epoch": 9.518888980739026, + "grad_norm": 1.1769605875015259, + "learning_rate": 1.6525526200730123e-05, + "loss": 0.3044, "step": 690900 }, { - "epoch": 7.04, - "learning_rate": 3.470186574142071e-05, - "loss": 0.4682, + "epoch": 9.520266732798765, + "grad_norm": 2.9707272052764893, + "learning_rate": 1.65181771506085e-05, + "loss": 0.2599, "step": 691000 }, { - "epoch": 7.04, - "learning_rate": 3.469531400822287e-05, - "loss": 0.463, + "epoch": 9.521644484858506, + "grad_norm": 4.949897766113281, + "learning_rate": 1.6510829033448918e-05, + "loss": 0.3621, "step": 691100 }, { - "epoch": 7.04, - "learning_rate": 3.4688762045452196e-05, - "loss": 0.4729, + "epoch": 9.523022236918244, + "grad_norm": 3.8138391971588135, + "learning_rate": 1.650348184987563e-05, + "loss": 0.3635, "step": 691200 }, { - "epoch": 7.04, - "learning_rate": 3.4682209853429045e-05, - "loss": 0.5005, + "epoch": 9.524399988977983, + "grad_norm": 7.296191692352295, + "learning_rate": 1.6496135600512822e-05, + "loss": 0.3191, "step": 691300 }, { - "epoch": 7.04, - "learning_rate": 3.4675657432473765e-05, - "loss": 0.4537, + "epoch": 9.525777741037723, + "grad_norm": 5.1465959548950195, + "learning_rate": 1.6488790285984584e-05, + "loss": 0.2777, "step": 691400 }, { - "epoch": 7.05, - "learning_rate": 3.4669104782906753e-05, - "loss": 0.4472, + "epoch": 9.527155493097462, + "grad_norm": 2.4102354049682617, + "learning_rate": 1.648144590691494e-05, + "loss": 0.2873, "step": 691500 }, { - "epoch": 7.05, - "learning_rate": 3.466255190504836e-05, - "loss": 0.4528, + "epoch": 9.5285332451572, + "grad_norm": 7.835136890411377, + "learning_rate": 1.6474102463927837e-05, + "loss": 0.3528, "step": 691600 }, { - "epoch": 7.05, - "learning_rate": 3.465599879921901e-05, - "loss": 0.4924, + "epoch": 9.529910997216941, + "grad_norm": 3.2894773483276367, + "learning_rate": 1.6466759957647143e-05, + "loss": 0.3366, "step": 691700 }, { - "epoch": 7.05, - "learning_rate": 3.464944546573912e-05, - "loss": 0.4711, + "epoch": 9.53128874927668, + "grad_norm": 2.3257899284362793, + "learning_rate": 1.645941838869664e-05, + "loss": 0.2946, "step": 691800 }, { - "epoch": 7.05, - "learning_rate": 3.464289190492908e-05, - "loss": 0.4731, + "epoch": 9.53266650133642, + "grad_norm": 0.7014309763908386, + "learning_rate": 1.6452077757700023e-05, + "loss": 0.2742, "step": 691900 }, { - "epoch": 7.05, - "learning_rate": 3.4636338117109335e-05, - "loss": 0.4543, + "epoch": 9.534044253396159, + "grad_norm": 5.086578369140625, + "learning_rate": 1.6444738065280917e-05, + "loss": 0.3439, "step": 692000 }, { - "epoch": 7.05, - "learning_rate": 3.462978410260032e-05, - "loss": 0.4796, + "epoch": 9.535422005455898, + "grad_norm": 4.228861331939697, + "learning_rate": 1.6437399312062876e-05, + "loss": 0.3527, "step": 692100 }, { - "epoch": 7.05, - "learning_rate": 3.46232298617225e-05, - "loss": 0.5059, + "epoch": 9.536799757515638, + "grad_norm": 0.7030498385429382, + "learning_rate": 1.6430061498669357e-05, + "loss": 0.35, "step": 692200 }, { - "epoch": 7.05, - "learning_rate": 3.461667539479632e-05, - "loss": 0.4197, + "epoch": 9.538177509575377, + "grad_norm": 2.3977200984954834, + "learning_rate": 1.6422724625723743e-05, + "loss": 0.2661, "step": 692300 }, { - "epoch": 7.05, - "learning_rate": 3.4610120702142274e-05, - "loss": 0.4649, + "epoch": 9.539555261635115, + "grad_norm": 2.7346768379211426, + "learning_rate": 1.641538869384936e-05, + "loss": 0.3489, "step": 692400 }, { - "epoch": 7.06, - "learning_rate": 3.460356578408083e-05, - "loss": 0.4655, + "epoch": 9.540933013694856, + "grad_norm": 1.3992122411727905, + "learning_rate": 1.6408053703669397e-05, + "loss": 0.3124, "step": 692500 }, { - "epoch": 7.06, - "learning_rate": 3.45970106409325e-05, - "loss": 0.4614, + "epoch": 9.542310765754594, + "grad_norm": 5.367378234863281, + "learning_rate": 1.6400719655807025e-05, + "loss": 0.3768, "step": 692600 }, { - "epoch": 7.06, - "learning_rate": 3.459052082780847e-05, - "loss": 0.4403, + "epoch": 9.543688517814335, + "grad_norm": 2.5158753395080566, + "learning_rate": 1.6393386550885302e-05, + "loss": 0.3369, "step": 692700 }, { - "epoch": 7.06, - "learning_rate": 3.4583965237690744e-05, - "loss": 0.449, + "epoch": 9.545066269874074, + "grad_norm": 0.03340692073106766, + "learning_rate": 1.638605438952721e-05, + "loss": 0.3301, "step": 692800 }, { - "epoch": 7.06, - "learning_rate": 3.457740942344448e-05, - "loss": 0.4312, + "epoch": 9.546444021933812, + "grad_norm": 2.227006435394287, + "learning_rate": 1.637872317235566e-05, + "loss": 0.3786, "step": 692900 }, { - "epoch": 7.06, - "learning_rate": 3.457085338539021e-05, - "loss": 0.447, + "epoch": 9.547821773993553, + "grad_norm": 3.787794589996338, + "learning_rate": 1.6371392899993474e-05, + "loss": 0.3269, "step": 693000 }, { - "epoch": 7.06, - "learning_rate": 3.456429712384849e-05, - "loss": 0.4493, + "epoch": 9.549199526053291, + "grad_norm": 2.7167136669158936, + "learning_rate": 1.6364063573063393e-05, + "loss": 0.3373, "step": 693100 }, { - "epoch": 7.06, - "learning_rate": 3.455774063913988e-05, - "loss": 0.3755, + "epoch": 9.55057727811303, + "grad_norm": 4.415078639984131, + "learning_rate": 1.635673519218808e-05, + "loss": 0.2869, "step": 693200 }, { - "epoch": 7.06, - "learning_rate": 3.455118393158495e-05, - "loss": 0.5208, + "epoch": 9.55195503017277, + "grad_norm": 3.1602120399475098, + "learning_rate": 1.6349407757990115e-05, + "loss": 0.3058, "step": 693300 }, { - "epoch": 7.06, - "learning_rate": 3.454462700150429e-05, - "loss": 0.4794, + "epoch": 9.55333278223251, + "grad_norm": 1.7955783605575562, + "learning_rate": 1.634208127109202e-05, + "loss": 0.3554, "step": 693400 }, { - "epoch": 7.07, - "learning_rate": 3.4538069849218484e-05, - "loss": 0.4975, + "epoch": 9.55471053429225, + "grad_norm": 2.6662306785583496, + "learning_rate": 1.6334755732116203e-05, + "loss": 0.304, "step": 693500 }, { - "epoch": 7.07, - "learning_rate": 3.453151247504815e-05, - "loss": 0.4223, + "epoch": 9.556088286351988, + "grad_norm": 3.034860610961914, + "learning_rate": 1.632750438289198e-05, + "loss": 0.3325, "step": 693600 }, { - "epoch": 7.07, - "learning_rate": 3.4524954879313894e-05, - "loss": 0.4829, + "epoch": 9.557466038411727, + "grad_norm": 2.304090976715088, + "learning_rate": 1.632018073213291e-05, + "loss": 0.3176, "step": 693700 }, { - "epoch": 7.07, - "learning_rate": 3.451839706233635e-05, - "loss": 0.4875, + "epoch": 9.558843790471467, + "grad_norm": 5.1975812911987305, + "learning_rate": 1.6312858031156687e-05, + "loss": 0.2813, "step": 693800 }, { - "epoch": 7.07, - "learning_rate": 3.451183902443616e-05, - "loss": 0.4503, + "epoch": 9.560221542531206, + "grad_norm": 4.145631313323975, + "learning_rate": 1.6305536280585407e-05, + "loss": 0.3323, "step": 693900 }, { - "epoch": 7.07, - "learning_rate": 3.450528076593396e-05, - "loss": 0.4839, + "epoch": 9.561599294590945, + "grad_norm": 2.6459829807281494, + "learning_rate": 1.6298215481041097e-05, + "loss": 0.3432, "step": 694000 }, { - "epoch": 7.07, - "learning_rate": 3.449872228715041e-05, - "loss": 0.3984, + "epoch": 9.562977046650685, + "grad_norm": 1.3821395635604858, + "learning_rate": 1.6290895633145683e-05, + "loss": 0.3033, "step": 694100 }, { - "epoch": 7.07, - "learning_rate": 3.4492163588406196e-05, - "loss": 0.4424, + "epoch": 9.564354798710424, + "grad_norm": 4.769979476928711, + "learning_rate": 1.6283576737521025e-05, + "loss": 0.3476, "step": 694200 }, { - "epoch": 7.07, - "learning_rate": 3.448560467002199e-05, - "loss": 0.4793, + "epoch": 9.565732550770164, + "grad_norm": 1.6830343008041382, + "learning_rate": 1.6276258794788914e-05, + "loss": 0.3083, "step": 694300 }, { - "epoch": 7.07, - "learning_rate": 3.447904553231848e-05, - "loss": 0.4926, + "epoch": 9.567110302829903, + "grad_norm": 5.125626087188721, + "learning_rate": 1.6268941805571035e-05, + "loss": 0.3219, "step": 694400 }, { - "epoch": 7.08, - "learning_rate": 3.447248617561638e-05, - "loss": 0.3824, + "epoch": 9.568488054889642, + "grad_norm": 2.8273632526397705, + "learning_rate": 1.6261625770489004e-05, + "loss": 0.3178, "step": 694500 }, { - "epoch": 7.08, - "learning_rate": 3.4465926600236396e-05, - "loss": 0.4499, + "epoch": 9.569865806949382, + "grad_norm": 4.085282802581787, + "learning_rate": 1.625431069016438e-05, + "loss": 0.2738, "step": 694600 }, { - "epoch": 7.08, - "learning_rate": 3.445936680649925e-05, - "loss": 0.478, + "epoch": 9.57124355900912, + "grad_norm": 1.0584511756896973, + "learning_rate": 1.6246996565218568e-05, + "loss": 0.3193, "step": 694700 }, { - "epoch": 7.08, - "learning_rate": 3.4452806794725674e-05, - "loss": 0.4477, + "epoch": 9.57262131106886, + "grad_norm": 1.325571894645691, + "learning_rate": 1.623968339627298e-05, + "loss": 0.3041, "step": 694800 }, { - "epoch": 7.08, - "learning_rate": 3.444624656523642e-05, - "loss": 0.444, + "epoch": 9.5739990631286, + "grad_norm": 2.0018537044525146, + "learning_rate": 1.6232371183948893e-05, + "loss": 0.2926, "step": 694900 }, { - "epoch": 7.08, - "learning_rate": 3.4439686118352244e-05, - "loss": 0.4918, + "epoch": 9.575376815188338, + "grad_norm": 109.8460464477539, + "learning_rate": 1.6225059928867514e-05, + "loss": 0.3145, "step": 695000 }, { - "epoch": 7.08, - "learning_rate": 3.443312545439392e-05, - "loss": 0.4157, + "epoch": 9.576754567248077, + "grad_norm": 7.8019561767578125, + "learning_rate": 1.6217749631649985e-05, + "loss": 0.3566, "step": 695100 }, { - "epoch": 7.08, - "learning_rate": 3.44265645736822e-05, - "loss": 0.3893, + "epoch": 9.578132319307818, + "grad_norm": 1.925970435142517, + "learning_rate": 1.621044029291735e-05, + "loss": 0.3246, "step": 695200 }, { - "epoch": 7.08, - "learning_rate": 3.4420003476537904e-05, - "loss": 0.4554, + "epoch": 9.579510071367556, + "grad_norm": 2.6698803901672363, + "learning_rate": 1.620313191329056e-05, + "loss": 0.3754, "step": 695300 }, { - "epoch": 7.08, - "learning_rate": 3.441344216328181e-05, - "loss": 0.3943, + "epoch": 9.580887823427297, + "grad_norm": 5.3387861251831055, + "learning_rate": 1.6195824493390506e-05, + "loss": 0.3424, "step": 695400 }, { - "epoch": 7.09, - "learning_rate": 3.440688063423473e-05, - "loss": 0.4832, + "epoch": 9.582265575487035, + "grad_norm": 0.6787081956863403, + "learning_rate": 1.618851803383799e-05, + "loss": 0.31, "step": 695500 }, { - "epoch": 7.09, - "learning_rate": 3.44003188897175e-05, - "loss": 0.4709, + "epoch": 9.583643327546774, + "grad_norm": 2.0670363903045654, + "learning_rate": 1.6181212535253746e-05, + "loss": 0.3114, "step": 695600 }, { - "epoch": 7.09, - "learning_rate": 3.439375693005091e-05, - "loss": 0.4348, + "epoch": 9.585021079606515, + "grad_norm": 3.8436789512634277, + "learning_rate": 1.61739079982584e-05, + "loss": 0.3358, "step": 695700 }, { - "epoch": 7.09, - "learning_rate": 3.438719475555585e-05, - "loss": 0.4345, + "epoch": 9.586398831666253, + "grad_norm": 0.8414889574050903, + "learning_rate": 1.6166604423472516e-05, + "loss": 0.3309, "step": 695800 }, { - "epoch": 7.09, - "learning_rate": 3.438063236655314e-05, - "loss": 0.5666, + "epoch": 9.587776583725992, + "grad_norm": 5.926618576049805, + "learning_rate": 1.6159301811516563e-05, + "loss": 0.3124, "step": 695900 }, { - "epoch": 7.09, - "learning_rate": 3.4374069763363645e-05, - "loss": 0.4231, + "epoch": 9.589154335785732, + "grad_norm": 3.80885910987854, + "learning_rate": 1.6152000163010936e-05, + "loss": 0.3386, "step": 696000 }, { - "epoch": 7.09, - "learning_rate": 3.436750694630824e-05, - "loss": 0.4404, + "epoch": 9.590532087845471, + "grad_norm": 6.9158034324646, + "learning_rate": 1.614469947857596e-05, + "loss": 0.3089, "step": 696100 }, { - "epoch": 7.09, - "learning_rate": 3.436094391570782e-05, - "loss": 0.4461, + "epoch": 9.591909839905211, + "grad_norm": 1.3664408922195435, + "learning_rate": 1.613739975883185e-05, + "loss": 0.2705, "step": 696200 }, { - "epoch": 7.09, - "learning_rate": 3.4354380671883254e-05, - "loss": 0.4784, + "epoch": 9.59328759196495, + "grad_norm": 1.1309088468551636, + "learning_rate": 1.6130101004398774e-05, + "loss": 0.2748, "step": 696300 }, { - "epoch": 7.1, - "learning_rate": 3.434781721515547e-05, - "loss": 0.3859, + "epoch": 9.594665344024689, + "grad_norm": 15.730361938476562, + "learning_rate": 1.6122803215896774e-05, + "loss": 0.311, "step": 696400 }, { - "epoch": 7.1, - "learning_rate": 3.4341253545845366e-05, - "loss": 0.4469, + "epoch": 9.59604309608443, + "grad_norm": 2.9531161785125732, + "learning_rate": 1.6115506393945843e-05, + "loss": 0.2845, "step": 696500 }, { - "epoch": 7.1, - "learning_rate": 3.433468966427387e-05, - "loss": 0.4637, + "epoch": 9.597420848144168, + "grad_norm": 3.1972222328186035, + "learning_rate": 1.610821053916589e-05, + "loss": 0.3344, "step": 696600 }, { - "epoch": 7.1, - "learning_rate": 3.432812557076192e-05, - "loss": 0.4849, + "epoch": 9.598798600203907, + "grad_norm": 2.167327404022217, + "learning_rate": 1.6100915652176736e-05, + "loss": 0.3334, "step": 696700 }, { - "epoch": 7.1, - "learning_rate": 3.4321561265630454e-05, - "loss": 0.4427, + "epoch": 9.600176352263647, + "grad_norm": 6.5683746337890625, + "learning_rate": 1.6093621733598117e-05, + "loss": 0.3285, "step": 696800 }, { - "epoch": 7.1, - "learning_rate": 3.431499674920044e-05, - "loss": 0.4595, + "epoch": 9.601554104323386, + "grad_norm": 2.083061933517456, + "learning_rate": 1.6086328784049683e-05, + "loss": 0.2731, "step": 696900 }, { - "epoch": 7.1, - "learning_rate": 3.4308432021792836e-05, - "loss": 0.5135, + "epoch": 9.602931856383126, + "grad_norm": 1.7082992792129517, + "learning_rate": 1.6079036804151005e-05, + "loss": 0.2815, "step": 697000 }, { - "epoch": 7.1, - "learning_rate": 3.4301932734150945e-05, - "loss": 0.4431, + "epoch": 9.604309608442865, + "grad_norm": 6.441465854644775, + "learning_rate": 1.6071818699813007e-05, + "loss": 0.3118, "step": 697100 }, { - "epoch": 7.1, - "learning_rate": 3.4295367587852866e-05, - "loss": 0.481, + "epoch": 9.605687360502603, + "grad_norm": 3.2115180492401123, + "learning_rate": 1.6064528651360295e-05, + "loss": 0.3486, "step": 697200 }, { - "epoch": 7.1, - "learning_rate": 3.428880223153693e-05, - "loss": 0.5014, + "epoch": 9.607065112562344, + "grad_norm": 6.019472122192383, + "learning_rate": 1.605723957440938e-05, + "loss": 0.3699, "step": 697300 }, { - "epoch": 7.11, - "learning_rate": 3.4282236665524175e-05, - "loss": 0.418, + "epoch": 9.608442864622083, + "grad_norm": 5.26919412612915, + "learning_rate": 1.6049951469579504e-05, + "loss": 0.3735, "step": 697400 }, { - "epoch": 7.11, - "learning_rate": 3.427573654892484e-05, - "loss": 0.4557, + "epoch": 9.609820616681821, + "grad_norm": 16.880033493041992, + "learning_rate": 1.6042664337489818e-05, + "loss": 0.2804, "step": 697500 }, { - "epoch": 7.11, - "learning_rate": 3.426917056657042e-05, - "loss": 0.4474, + "epoch": 9.611198368741562, + "grad_norm": 5.169192314147949, + "learning_rate": 1.6035378178759428e-05, + "loss": 0.2727, "step": 697600 }, { - "epoch": 7.11, - "learning_rate": 3.426260437547906e-05, - "loss": 0.4937, + "epoch": 9.6125761208013, + "grad_norm": 3.3329882621765137, + "learning_rate": 1.6028092994007304e-05, + "loss": 0.2687, "step": 697700 }, { - "epoch": 7.11, - "learning_rate": 3.425603797597176e-05, - "loss": 0.4615, + "epoch": 9.61395387286104, + "grad_norm": 6.606675624847412, + "learning_rate": 1.6020808783852363e-05, + "loss": 0.2946, "step": 697800 }, { - "epoch": 7.11, - "learning_rate": 3.424947136836961e-05, - "loss": 0.4478, + "epoch": 9.61533162492078, + "grad_norm": 2.520848512649536, + "learning_rate": 1.601352554891347e-05, + "loss": 0.3851, "step": 697900 }, { - "epoch": 7.11, - "learning_rate": 3.4242904552993674e-05, - "loss": 0.4965, + "epoch": 9.616709376980518, + "grad_norm": 4.3599958419799805, + "learning_rate": 1.600624328980932e-05, + "loss": 0.307, "step": 698000 }, { - "epoch": 7.11, - "learning_rate": 3.423633753016502e-05, - "loss": 0.4965, + "epoch": 9.618087129040259, + "grad_norm": 2.7149875164031982, + "learning_rate": 1.599896200715861e-05, + "loss": 0.3296, "step": 698100 }, { - "epoch": 7.11, - "learning_rate": 3.4229770300204746e-05, - "loss": 0.4529, + "epoch": 9.619464881099997, + "grad_norm": 9.638872146606445, + "learning_rate": 1.5991681701579924e-05, + "loss": 0.3016, "step": 698200 }, { - "epoch": 7.11, - "learning_rate": 3.4223202863433945e-05, - "loss": 0.4255, + "epoch": 9.620842633159736, + "grad_norm": 0.9224613308906555, + "learning_rate": 1.5984402373691744e-05, + "loss": 0.2632, "step": 698300 }, { - "epoch": 7.12, - "learning_rate": 3.421663522017372e-05, - "loss": 0.4588, + "epoch": 9.622220385219476, + "grad_norm": 4.34193754196167, + "learning_rate": 1.5977124024112493e-05, + "loss": 0.3456, "step": 698400 }, { - "epoch": 7.12, - "learning_rate": 3.421006737074521e-05, - "loss": 0.4824, + "epoch": 9.623598137279215, + "grad_norm": 4.808312892913818, + "learning_rate": 1.5969919422319324e-05, + "loss": 0.3476, "step": 698500 }, { - "epoch": 7.12, - "learning_rate": 3.420349931546952e-05, - "loss": 0.4586, + "epoch": 9.624975889338955, + "grad_norm": 6.366484642028809, + "learning_rate": 1.5962643021414338e-05, + "loss": 0.3375, "step": 698600 }, { - "epoch": 7.12, - "learning_rate": 3.41969310546678e-05, - "loss": 0.4795, + "epoch": 9.626353641398694, + "grad_norm": 2.1494252681732178, + "learning_rate": 1.5955367600666865e-05, + "loss": 0.3255, "step": 698700 }, { - "epoch": 7.12, - "learning_rate": 3.41903625886612e-05, - "loss": 0.5428, + "epoch": 9.627731393458433, + "grad_norm": 0.07877679169178009, + "learning_rate": 1.5948093160694962e-05, + "loss": 0.3152, "step": 698800 }, { - "epoch": 7.12, - "learning_rate": 3.418379391777087e-05, - "loss": 0.3829, + "epoch": 9.629109145518173, + "grad_norm": 3.3718981742858887, + "learning_rate": 1.5940892431842506e-05, + "loss": 0.3654, "step": 698900 }, { - "epoch": 7.12, - "learning_rate": 3.4177225042318e-05, - "loss": 0.4889, + "epoch": 9.630486897577912, + "grad_norm": 4.589846611022949, + "learning_rate": 1.593361994545253e-05, + "loss": 0.292, "step": 699000 }, { - "epoch": 7.12, - "learning_rate": 3.4170655962623744e-05, - "loss": 0.4927, + "epoch": 9.63186464963765, + "grad_norm": 3.9629225730895996, + "learning_rate": 1.5926348441685715e-05, + "loss": 0.293, "step": 699100 }, { - "epoch": 7.12, - "learning_rate": 3.41640866790093e-05, - "loss": 0.4363, + "epoch": 9.633242401697391, + "grad_norm": 0.20854812860488892, + "learning_rate": 1.5919077921159797e-05, + "loss": 0.3158, "step": 699200 }, { - "epoch": 7.12, - "learning_rate": 3.4157517191795865e-05, - "loss": 0.447, + "epoch": 9.63462015375713, + "grad_norm": 2.1737875938415527, + "learning_rate": 1.591180838449246e-05, + "loss": 0.3727, "step": 699300 }, { - "epoch": 7.13, - "learning_rate": 3.4150947501304646e-05, - "loss": 0.5036, + "epoch": 9.635997905816868, + "grad_norm": 2.6579370498657227, + "learning_rate": 1.5904539832301296e-05, + "loss": 0.323, "step": 699400 }, { - "epoch": 7.13, - "learning_rate": 3.414437760785688e-05, - "loss": 0.5019, + "epoch": 9.637375657876609, + "grad_norm": 0.4594109058380127, + "learning_rate": 1.589727226520379e-05, + "loss": 0.3268, "step": 699500 }, { - "epoch": 7.13, - "learning_rate": 3.413780751177377e-05, - "loss": 0.4529, + "epoch": 9.638753409936347, + "grad_norm": 3.778564691543579, + "learning_rate": 1.5890005683817358e-05, + "loss": 0.2986, "step": 699600 }, { - "epoch": 7.13, - "learning_rate": 3.4131237213376575e-05, - "loss": 0.4887, + "epoch": 9.640131161996088, + "grad_norm": 2.7593014240264893, + "learning_rate": 1.588274008875935e-05, + "loss": 0.3273, "step": 699700 }, { - "epoch": 7.13, - "learning_rate": 3.412466671298653e-05, - "loss": 0.4426, + "epoch": 9.641508914055827, + "grad_norm": 3.726316452026367, + "learning_rate": 1.587547548064699e-05, + "loss": 0.3043, "step": 699800 }, { - "epoch": 7.13, - "learning_rate": 3.411809601092489e-05, - "loss": 0.4296, + "epoch": 9.642886666115565, + "grad_norm": 6.5598225593566895, + "learning_rate": 1.5868211860097467e-05, + "loss": 0.3298, "step": 699900 }, { - "epoch": 7.13, - "learning_rate": 3.411152510751293e-05, - "loss": 0.4029, - "step": 700000 - }, - { - "epoch": 7.13, - "eval_cer": 0.08554024028639817, - "eval_loss": 0.5128045678138733, - "eval_runtime": 9250.4927, - "eval_samples_per_second": 5.914, - "eval_steps_per_second": 0.37, - "eval_wer": 0.1703941037529525, + "epoch": 9.644264418175306, + "grad_norm": 3.1059436798095703, + "learning_rate": 1.5860949227727857e-05, + "loss": 0.3183, "step": 700000 }, { - "epoch": 7.13, - "learning_rate": 3.4104954003071945e-05, - "loss": 0.5237, + "epoch": 9.645642170235044, + "grad_norm": 1.6653426885604858, + "learning_rate": 1.585368758415516e-05, + "loss": 0.2629, "step": 700100 }, { - "epoch": 7.13, - "learning_rate": 3.40983826979232e-05, - "loss": 0.3956, + "epoch": 9.647019922294783, + "grad_norm": 6.720180988311768, + "learning_rate": 1.5846426929996273e-05, + "loss": 0.2941, "step": 700200 }, { - "epoch": 7.13, - "learning_rate": 3.409181119238799e-05, - "loss": 0.4842, + "epoch": 9.648397674354523, + "grad_norm": 5.051051616668701, + "learning_rate": 1.583916726586804e-05, + "loss": 0.2755, "step": 700300 }, { - "epoch": 7.14, - "learning_rate": 3.408523948678764e-05, - "loss": 0.4793, + "epoch": 9.649775426414262, + "grad_norm": 3.5669496059417725, + "learning_rate": 1.5831908592387208e-05, + "loss": 0.3523, "step": 700400 }, { - "epoch": 7.14, - "learning_rate": 3.4078667581443455e-05, - "loss": 0.3795, + "epoch": 9.651153178474003, + "grad_norm": 3.3381452560424805, + "learning_rate": 1.582465091017043e-05, + "loss": 0.2955, "step": 700500 }, { - "epoch": 7.14, - "learning_rate": 3.4072095476676764e-05, - "loss": 0.4664, + "epoch": 9.652530930533741, + "grad_norm": 10.62447738647461, + "learning_rate": 1.5817394219834295e-05, + "loss": 0.4033, "step": 700600 }, { - "epoch": 7.14, - "learning_rate": 3.4065523172808915e-05, - "loss": 0.5034, + "epoch": 9.65390868259348, + "grad_norm": 4.705462455749512, + "learning_rate": 1.5810138521995292e-05, + "loss": 0.2642, "step": 700700 }, { - "epoch": 7.14, - "learning_rate": 3.405895067016123e-05, - "loss": 0.506, + "epoch": 9.65528643465322, + "grad_norm": 1.075913429260254, + "learning_rate": 1.5802883817269813e-05, + "loss": 0.3894, "step": 700800 }, { - "epoch": 7.14, - "learning_rate": 3.405237796905509e-05, - "loss": 0.427, + "epoch": 9.656664186712959, + "grad_norm": 4.889693260192871, + "learning_rate": 1.5795630106274195e-05, + "loss": 0.3047, "step": 700900 }, { - "epoch": 7.14, - "learning_rate": 3.4045870799784e-05, - "loss": 0.4575, + "epoch": 9.658041938772698, + "grad_norm": 3.811338186264038, + "learning_rate": 1.5788377389624693e-05, + "loss": 0.3193, "step": 701000 }, { - "epoch": 7.14, - "learning_rate": 3.4039297704701606e-05, - "loss": 0.4124, + "epoch": 9.659419690832438, + "grad_norm": 2.9069392681121826, + "learning_rate": 1.5781125667937435e-05, + "loss": 0.2691, "step": 701100 }, { - "epoch": 7.14, - "learning_rate": 3.403272441212165e-05, - "loss": 0.5877, + "epoch": 9.660797442892177, + "grad_norm": 3.4906105995178223, + "learning_rate": 1.5773874941828518e-05, + "loss": 0.3038, "step": 701200 }, { - "epoch": 7.14, - "learning_rate": 3.4026150922365536e-05, - "loss": 0.4166, + "epoch": 9.662175194951917, + "grad_norm": 2.4097721576690674, + "learning_rate": 1.5766625211913902e-05, + "loss": 0.3174, "step": 701300 }, { - "epoch": 7.15, - "learning_rate": 3.401957723575467e-05, - "loss": 0.4528, + "epoch": 9.663552947011656, + "grad_norm": 2.8913867473602295, + "learning_rate": 1.5759376478809512e-05, + "loss": 0.3832, "step": 701400 }, { - "epoch": 7.15, - "learning_rate": 3.4013003352610474e-05, - "loss": 0.5024, + "epoch": 9.664930699071395, + "grad_norm": 7.890986442565918, + "learning_rate": 1.5752128743131144e-05, + "loss": 0.3149, "step": 701500 }, { - "epoch": 7.15, - "learning_rate": 3.400642927325435e-05, - "loss": 0.5133, + "epoch": 9.666308451131135, + "grad_norm": 2.572356939315796, + "learning_rate": 1.5744882005494543e-05, + "loss": 0.3155, "step": 701600 }, { - "epoch": 7.15, - "learning_rate": 3.399985499800776e-05, - "loss": 0.4635, + "epoch": 9.667686203190874, + "grad_norm": 3.5634989738464355, + "learning_rate": 1.5737636266515364e-05, + "loss": 0.2649, "step": 701700 }, { - "epoch": 7.15, - "learning_rate": 3.399328052719211e-05, - "loss": 0.4744, + "epoch": 9.669063955250612, + "grad_norm": 2.391688585281372, + "learning_rate": 1.573039152680916e-05, + "loss": 0.3004, "step": 701800 }, { - "epoch": 7.15, - "learning_rate": 3.398670586112888e-05, - "loss": 0.3647, + "epoch": 9.670441707310353, + "grad_norm": 19.462615966796875, + "learning_rate": 1.5723147786991397e-05, + "loss": 0.2901, "step": 701900 }, { - "epoch": 7.15, - "learning_rate": 3.398013100013953e-05, - "loss": 0.4557, + "epoch": 9.671819459370091, + "grad_norm": 2.35721492767334, + "learning_rate": 1.5715905047677492e-05, + "loss": 0.2816, "step": 702000 }, { - "epoch": 7.15, - "learning_rate": 3.397355594454552e-05, - "loss": 0.4186, + "epoch": 9.673197211429832, + "grad_norm": 4.3905181884765625, + "learning_rate": 1.5708663309482728e-05, + "loss": 0.345, "step": 702100 }, { - "epoch": 7.15, - "learning_rate": 3.396698069466835e-05, - "loss": 0.444, + "epoch": 9.67457496348957, + "grad_norm": 3.691394090652466, + "learning_rate": 1.570142257302234e-05, + "loss": 0.3047, "step": 702200 }, { - "epoch": 7.16, - "learning_rate": 3.396040525082949e-05, - "loss": 0.4612, + "epoch": 9.67595271554931, + "grad_norm": 2.775655508041382, + "learning_rate": 1.5694182838911477e-05, + "loss": 0.3513, "step": 702300 }, { - "epoch": 7.16, - "learning_rate": 3.395382961335044e-05, - "loss": 0.411, + "epoch": 9.67733046760905, + "grad_norm": 3.659954309463501, + "learning_rate": 1.568694410776518e-05, + "loss": 0.2753, "step": 702400 }, { - "epoch": 7.16, - "learning_rate": 3.3947253782552724e-05, - "loss": 0.4276, + "epoch": 9.678708219668788, + "grad_norm": 3.304738759994507, + "learning_rate": 1.5679706380198407e-05, + "loss": 0.3199, "step": 702500 }, { - "epoch": 7.16, - "learning_rate": 3.394067775875786e-05, - "loss": 0.514, + "epoch": 9.680085971728527, + "grad_norm": 1.7630168199539185, + "learning_rate": 1.567246965682605e-05, + "loss": 0.3914, "step": 702600 }, { - "epoch": 7.16, - "learning_rate": 3.393410154228736e-05, - "loss": 0.4698, + "epoch": 9.681463723788267, + "grad_norm": 4.834746360778809, + "learning_rate": 1.566523393826291e-05, + "loss": 0.3158, "step": 702700 }, { - "epoch": 7.16, - "learning_rate": 3.392752513346278e-05, - "loss": 0.3696, + "epoch": 9.682841475848006, + "grad_norm": 1.9091852903366089, + "learning_rate": 1.5657999225123687e-05, + "loss": 0.2939, "step": 702800 }, { - "epoch": 7.16, - "learning_rate": 3.392094853260565e-05, - "loss": 0.4617, + "epoch": 9.684219227907747, + "grad_norm": 2.500666856765747, + "learning_rate": 1.5650765518023023e-05, + "loss": 0.3269, "step": 702900 }, { - "epoch": 7.16, - "learning_rate": 3.3914371740037544e-05, - "loss": 0.385, + "epoch": 9.685596979967485, + "grad_norm": 1.8422175645828247, + "learning_rate": 1.564353281757545e-05, + "loss": 0.3114, "step": 703000 }, { - "epoch": 7.16, - "learning_rate": 3.390779475608001e-05, - "loss": 0.4652, + "epoch": 9.686974732027224, + "grad_norm": 0.5250390768051147, + "learning_rate": 1.5636301124395405e-05, + "loss": 0.3565, "step": 703100 }, { - "epoch": 7.16, - "learning_rate": 3.390121758105464e-05, - "loss": 0.4706, + "epoch": 9.688352484086964, + "grad_norm": 11.935446739196777, + "learning_rate": 1.562907043909728e-05, + "loss": 0.3233, "step": 703200 }, { - "epoch": 7.17, - "learning_rate": 3.3894640215283014e-05, - "loss": 0.5484, + "epoch": 9.689730236146703, + "grad_norm": 3.246448516845703, + "learning_rate": 1.5621840762295357e-05, + "loss": 0.3015, "step": 703300 }, { - "epoch": 7.17, - "learning_rate": 3.3888062659086725e-05, - "loss": 0.4208, + "epoch": 9.691107988206442, + "grad_norm": 4.2183942794799805, + "learning_rate": 1.5614612094603837e-05, + "loss": 0.4, "step": 703400 }, { - "epoch": 7.17, - "learning_rate": 3.388148491278736e-05, - "loss": 0.4248, + "epoch": 9.692485740266182, + "grad_norm": 0.8239351511001587, + "learning_rate": 1.560738443663681e-05, + "loss": 0.3339, "step": 703500 }, { - "epoch": 7.17, - "learning_rate": 3.387490697670656e-05, - "loss": 0.4515, + "epoch": 9.69386349232592, + "grad_norm": 5.141870975494385, + "learning_rate": 1.560015778900832e-05, + "loss": 0.3047, "step": 703600 }, { - "epoch": 7.17, - "learning_rate": 3.386832885116593e-05, - "loss": 0.4564, + "epoch": 9.69524124438566, + "grad_norm": 2.0303468704223633, + "learning_rate": 1.559293215233231e-05, + "loss": 0.3216, "step": 703700 }, { - "epoch": 7.17, - "learning_rate": 3.38617505364871e-05, - "loss": 0.4001, + "epoch": 9.6966189964454, + "grad_norm": 4.558074951171875, + "learning_rate": 1.5585707527222622e-05, + "loss": 0.306, "step": 703800 }, { - "epoch": 7.17, - "learning_rate": 3.385517203299173e-05, - "loss": 0.426, + "epoch": 9.697996748505139, + "grad_norm": 6.400722503662109, + "learning_rate": 1.5578483914293035e-05, + "loss": 0.3079, "step": 703900 }, { - "epoch": 7.17, - "learning_rate": 3.384859334100143e-05, - "loss": 0.4317, + "epoch": 9.699374500564879, + "grad_norm": 4.560937881469727, + "learning_rate": 1.5571261314157247e-05, + "loss": 0.3149, "step": 704000 }, { - "epoch": 7.17, - "learning_rate": 3.38420144608379e-05, - "loss": 0.4805, + "epoch": 9.700752252624618, + "grad_norm": 4.5805487632751465, + "learning_rate": 1.556403972742882e-05, + "loss": 0.3337, "step": 704100 }, { - "epoch": 7.17, - "learning_rate": 3.383543539282279e-05, - "loss": 0.4748, + "epoch": 9.702130004684356, + "grad_norm": 38.73161697387695, + "learning_rate": 1.5556819154721283e-05, + "loss": 0.288, "step": 704200 }, { - "epoch": 7.18, - "learning_rate": 3.382885613727777e-05, - "loss": 0.4435, + "epoch": 9.703507756744097, + "grad_norm": 1.80768620967865, + "learning_rate": 1.5549671787204335e-05, + "loss": 0.3344, "step": 704300 }, { - "epoch": 7.18, - "learning_rate": 3.382227669452454e-05, - "loss": 0.4979, + "epoch": 9.704885508803835, + "grad_norm": 2.6821987628936768, + "learning_rate": 1.554245323422326e-05, + "loss": 0.311, "step": 704400 }, { - "epoch": 7.18, - "learning_rate": 3.381569706488478e-05, - "loss": 0.3693, + "epoch": 9.706263260863574, + "grad_norm": 2.034491539001465, + "learning_rate": 1.5535235697096957e-05, + "loss": 0.3513, "step": 704500 }, { - "epoch": 7.18, - "learning_rate": 3.380911724868022e-05, - "loss": 0.4247, + "epoch": 9.707641012923315, + "grad_norm": 5.47861385345459, + "learning_rate": 1.5528019176438582e-05, + "loss": 0.2697, "step": 704600 }, { - "epoch": 7.18, - "learning_rate": 3.380253724623255e-05, - "loss": 0.4368, + "epoch": 9.709018764983053, + "grad_norm": 15.920827865600586, + "learning_rate": 1.5520803672861232e-05, + "loss": 0.2776, "step": 704700 }, { - "epoch": 7.18, - "learning_rate": 3.37959570578635e-05, - "loss": 0.4472, + "epoch": 9.710396517042794, + "grad_norm": 3.699014902114868, + "learning_rate": 1.551358918697789e-05, + "loss": 0.3833, "step": 704800 }, { - "epoch": 7.18, - "learning_rate": 3.378937668389481e-05, - "loss": 0.4358, + "epoch": 9.711774269102532, + "grad_norm": 1.7043111324310303, + "learning_rate": 1.5506375719401473e-05, + "loss": 0.324, "step": 704900 }, { - "epoch": 7.18, - "learning_rate": 3.37827961246482e-05, - "loss": 0.4691, + "epoch": 9.713152021162271, + "grad_norm": 29.512975692749023, + "learning_rate": 1.54991632707448e-05, + "loss": 0.3091, "step": 705000 }, { - "epoch": 7.18, - "learning_rate": 3.377621538044544e-05, - "loss": 0.5104, + "epoch": 9.714529773222011, + "grad_norm": 7.502317428588867, + "learning_rate": 1.5491951841620606e-05, + "loss": 0.337, "step": 705100 }, { - "epoch": 7.18, - "learning_rate": 3.37696344516083e-05, - "loss": 0.5214, + "epoch": 9.71590752528175, + "grad_norm": 1.7898133993148804, + "learning_rate": 1.5484741432641525e-05, + "loss": 0.312, "step": 705200 }, { - "epoch": 7.19, - "learning_rate": 3.376305333845851e-05, - "loss": 0.5475, + "epoch": 9.717285277341489, + "grad_norm": 2.392545700073242, + "learning_rate": 1.547753204442013e-05, + "loss": 0.3264, "step": 705300 }, { - "epoch": 7.19, - "learning_rate": 3.375647204131787e-05, - "loss": 0.4678, + "epoch": 9.71866302940123, + "grad_norm": 8.011979103088379, + "learning_rate": 1.5470323677568905e-05, + "loss": 0.2837, "step": 705400 }, { - "epoch": 7.19, - "learning_rate": 3.3749890560508176e-05, - "loss": 0.499, + "epoch": 9.720040781460968, + "grad_norm": 1.7386094331741333, + "learning_rate": 1.546311633270021e-05, + "loss": 0.2984, "step": 705500 }, { - "epoch": 7.19, - "learning_rate": 3.3743308896351205e-05, - "loss": 0.4634, + "epoch": 9.721418533520708, + "grad_norm": 4.689943790435791, + "learning_rate": 1.5455910010426382e-05, + "loss": 0.3077, "step": 705600 }, { - "epoch": 7.19, - "learning_rate": 3.3736727049168764e-05, - "loss": 0.5355, + "epoch": 9.722796285580447, + "grad_norm": 2.996116876602173, + "learning_rate": 1.5448704711359615e-05, + "loss": 0.2759, "step": 705700 }, { - "epoch": 7.19, - "learning_rate": 3.373014501928268e-05, - "loss": 0.4342, + "epoch": 9.724174037640186, + "grad_norm": 8.044126510620117, + "learning_rate": 1.5441500436112024e-05, + "loss": 0.3215, "step": 705800 }, { - "epoch": 7.19, - "learning_rate": 3.372356280701475e-05, - "loss": 0.4234, + "epoch": 9.725551789699926, + "grad_norm": 2.8170111179351807, + "learning_rate": 1.543429718529566e-05, + "loss": 0.3509, "step": 705900 }, { - "epoch": 7.19, - "learning_rate": 3.371698041268684e-05, - "loss": 0.4503, + "epoch": 9.726929541759665, + "grad_norm": 2.577775239944458, + "learning_rate": 1.5427094959522494e-05, + "loss": 0.3055, "step": 706000 }, { - "epoch": 7.19, - "learning_rate": 3.371039783662076e-05, - "loss": 0.5151, + "epoch": 9.728307293819404, + "grad_norm": 5.114053249359131, + "learning_rate": 1.5419893759404355e-05, + "loss": 0.3041, "step": 706100 }, { - "epoch": 7.19, - "learning_rate": 3.370381507913837e-05, - "loss": 0.4758, + "epoch": 9.729685045879144, + "grad_norm": 4.764209747314453, + "learning_rate": 1.5412693585553063e-05, + "loss": 0.3564, "step": 706200 }, { - "epoch": 7.2, - "learning_rate": 3.369723214056151e-05, - "loss": 0.4164, + "epoch": 9.731062797938883, + "grad_norm": 3.8005058765411377, + "learning_rate": 1.5405494438580274e-05, + "loss": 0.2987, "step": 706300 }, { - "epoch": 7.2, - "learning_rate": 3.369064902121208e-05, - "loss": 0.4573, + "epoch": 9.732440549998623, + "grad_norm": 3.850269079208374, + "learning_rate": 1.539829631909762e-05, + "loss": 0.2657, "step": 706400 }, { - "epoch": 7.2, - "learning_rate": 3.368406572141193e-05, - "loss": 0.3828, + "epoch": 9.733818302058362, + "grad_norm": 3.09354567527771, + "learning_rate": 1.5391099227716595e-05, + "loss": 0.2989, "step": 706500 }, { - "epoch": 7.2, - "learning_rate": 3.367748224148296e-05, - "loss": 0.4883, + "epoch": 9.7351960541181, + "grad_norm": 4.967903137207031, + "learning_rate": 1.538390316504864e-05, + "loss": 0.2744, "step": 706600 }, { - "epoch": 7.2, - "learning_rate": 3.367089858174705e-05, - "loss": 0.5137, + "epoch": 9.73657380617784, + "grad_norm": 2.23189377784729, + "learning_rate": 1.5376708131705116e-05, + "loss": 0.2572, "step": 706700 }, { - "epoch": 7.2, - "learning_rate": 3.3664314742526104e-05, - "loss": 0.4469, + "epoch": 9.73795155823758, + "grad_norm": 3.1625421047210693, + "learning_rate": 1.5369514128297235e-05, + "loss": 0.2929, "step": 706800 }, { - "epoch": 7.2, - "learning_rate": 3.3657730724142036e-05, - "loss": 0.4437, + "epoch": 9.739329310297318, + "grad_norm": 4.154350757598877, + "learning_rate": 1.5362321155436188e-05, + "loss": 0.3265, "step": 706900 }, { - "epoch": 7.2, - "learning_rate": 3.365114652691676e-05, - "loss": 0.4959, + "epoch": 9.740707062357059, + "grad_norm": 0.32776451110839844, + "learning_rate": 1.535512921373307e-05, + "loss": 0.2713, "step": 707000 }, { - "epoch": 7.2, - "learning_rate": 3.364456215117221e-05, - "loss": 0.453, + "epoch": 9.742084814416797, + "grad_norm": 4.893360614776611, + "learning_rate": 1.5347938303798846e-05, + "loss": 0.3338, "step": 707100 }, { - "epoch": 7.21, - "learning_rate": 3.363797759723032e-05, - "loss": 0.521, + "epoch": 9.743462566476538, + "grad_norm": 4.980701923370361, + "learning_rate": 1.5340748426244433e-05, + "loss": 0.3142, "step": 707200 }, { - "epoch": 7.21, - "learning_rate": 3.363139286541304e-05, - "loss": 0.4771, + "epoch": 9.744840318536276, + "grad_norm": 2.2574617862701416, + "learning_rate": 1.5333559581680655e-05, + "loss": 0.3143, "step": 707300 }, { - "epoch": 7.21, - "learning_rate": 3.362480795604232e-05, - "loss": 0.4966, + "epoch": 9.746218070596015, + "grad_norm": 2.1633872985839844, + "learning_rate": 1.5326371770718237e-05, + "loss": 0.2726, "step": 707400 }, { - "epoch": 7.21, - "learning_rate": 3.361828872118238e-05, - "loss": 0.4583, + "epoch": 9.747595822655756, + "grad_norm": 4.102578639984131, + "learning_rate": 1.5319184993967804e-05, + "loss": 0.3183, "step": 707500 }, { - "epoch": 7.21, - "learning_rate": 3.361176931293028e-05, - "loss": 0.4402, + "epoch": 9.748973574715494, + "grad_norm": 14.609067916870117, + "learning_rate": 1.5311999252039924e-05, + "loss": 0.2999, "step": 707600 }, { - "epoch": 7.21, - "learning_rate": 3.3605183876359635e-05, - "loss": 0.4732, + "epoch": 9.750351326775233, + "grad_norm": 2.2417547702789307, + "learning_rate": 1.5304886387482626e-05, + "loss": 0.3263, "step": 707700 }, { - "epoch": 7.21, - "learning_rate": 3.359859826351703e-05, - "loss": 0.4188, + "epoch": 9.751729078834973, + "grad_norm": 4.8949079513549805, + "learning_rate": 1.5297702706667712e-05, + "loss": 0.3341, "step": 707800 }, { - "epoch": 7.21, - "learning_rate": 3.359201247472444e-05, - "loss": 0.4084, + "epoch": 9.753106830894712, + "grad_norm": 3.1890227794647217, + "learning_rate": 1.529052006250037e-05, + "loss": 0.302, "step": 707900 }, { - "epoch": 7.21, - "learning_rate": 3.3585426510303866e-05, - "loss": 0.4525, + "epoch": 9.75448458295445, + "grad_norm": 3.2141008377075195, + "learning_rate": 1.5283410266523478e-05, + "loss": 0.3177, "step": 708000 }, { - "epoch": 7.21, - "learning_rate": 3.357884037057735e-05, - "loss": 0.4459, + "epoch": 9.755862335014191, + "grad_norm": 0.39695999026298523, + "learning_rate": 1.527622968710012e-05, + "loss": 0.3236, "step": 708100 }, { - "epoch": 7.22, - "learning_rate": 3.357225405586688e-05, - "loss": 0.5033, + "epoch": 9.75724008707393, + "grad_norm": 1.4784473180770874, + "learning_rate": 1.5269050146148595e-05, + "loss": 0.2904, "step": 708200 }, { - "epoch": 7.22, - "learning_rate": 3.35656675664945e-05, - "loss": 0.4297, + "epoch": 9.75861783913367, + "grad_norm": 3.742617607116699, + "learning_rate": 1.5261871644278824e-05, + "loss": 0.3331, "step": 708300 }, { - "epoch": 7.22, - "learning_rate": 3.355908090278228e-05, - "loss": 0.4474, + "epoch": 9.759995591193409, + "grad_norm": 3.3070783615112305, + "learning_rate": 1.5254694182100678e-05, + "loss": 0.3384, "step": 708400 }, { - "epoch": 7.22, - "learning_rate": 3.3552494065052216e-05, - "loss": 0.5154, + "epoch": 9.761373343253148, + "grad_norm": 0.3331131339073181, + "learning_rate": 1.5247517760223906e-05, + "loss": 0.3308, "step": 708500 }, { - "epoch": 7.22, - "learning_rate": 3.3545907053626414e-05, - "loss": 0.4337, + "epoch": 9.762751095312888, + "grad_norm": 2.4163436889648438, + "learning_rate": 1.5240342379258174e-05, + "loss": 0.3483, "step": 708600 }, { - "epoch": 7.22, - "learning_rate": 3.353931986882691e-05, - "loss": 0.4766, + "epoch": 9.764128847372627, + "grad_norm": 0.8620812296867371, + "learning_rate": 1.5233168039813079e-05, + "loss": 0.3437, "step": 708700 }, { - "epoch": 7.22, - "learning_rate": 3.353273251097578e-05, - "loss": 0.4807, + "epoch": 9.765506599432365, + "grad_norm": 3.4354171752929688, + "learning_rate": 1.5225994742498122e-05, + "loss": 0.3456, "step": 708800 }, { - "epoch": 7.22, - "learning_rate": 3.3526144980395125e-05, - "loss": 0.5385, + "epoch": 9.766884351492106, + "grad_norm": 2.2740471363067627, + "learning_rate": 1.5218822487922708e-05, + "loss": 0.3948, "step": 708900 }, { - "epoch": 7.22, - "learning_rate": 3.3519557277407016e-05, - "loss": 0.4287, + "epoch": 9.768262103551844, + "grad_norm": 3.7027769088745117, + "learning_rate": 1.5211651276696141e-05, + "loss": 0.3068, "step": 709000 }, { - "epoch": 7.22, - "learning_rate": 3.351296940233355e-05, - "loss": 0.4038, + "epoch": 9.769639855611585, + "grad_norm": 3.6294806003570557, + "learning_rate": 1.5204481109427663e-05, + "loss": 0.3617, "step": 709100 }, { - "epoch": 7.23, - "learning_rate": 3.3506447236814397e-05, - "loss": 0.5092, + "epoch": 9.771017607671324, + "grad_norm": 2.9316868782043457, + "learning_rate": 1.5197311986726432e-05, + "loss": 0.307, "step": 709200 }, { - "epoch": 7.23, - "learning_rate": 3.3499859020249395e-05, - "loss": 0.4135, + "epoch": 9.772395359731062, + "grad_norm": 2.4531657695770264, + "learning_rate": 1.5190143909201477e-05, + "loss": 0.3562, "step": 709300 }, { - "epoch": 7.23, - "learning_rate": 3.349327063256218e-05, - "loss": 0.4604, + "epoch": 9.773773111790803, + "grad_norm": 2.2681167125701904, + "learning_rate": 1.5182976877461774e-05, + "loss": 0.3072, "step": 709400 }, { - "epoch": 7.23, - "learning_rate": 3.348668207407487e-05, - "loss": 0.4525, + "epoch": 9.775150863850541, + "grad_norm": 14.267127990722656, + "learning_rate": 1.517581089211622e-05, + "loss": 0.3428, "step": 709500 }, { - "epoch": 7.23, - "learning_rate": 3.3480093345109614e-05, - "loss": 0.4759, + "epoch": 9.77652861591028, + "grad_norm": 2.158982276916504, + "learning_rate": 1.5168645953773558e-05, + "loss": 0.3107, "step": 709600 }, { - "epoch": 7.23, - "learning_rate": 3.347350444598856e-05, - "loss": 0.39, + "epoch": 9.77790636797002, + "grad_norm": 4.477296352386475, + "learning_rate": 1.516148206304251e-05, + "loss": 0.3151, "step": 709700 }, { - "epoch": 7.23, - "learning_rate": 3.3466915377033864e-05, - "loss": 0.4728, + "epoch": 9.779284120029759, + "grad_norm": 2.5479581356048584, + "learning_rate": 1.5154319220531698e-05, + "loss": 0.3081, "step": 709800 }, { - "epoch": 7.23, - "learning_rate": 3.3460326138567705e-05, - "loss": 0.4434, + "epoch": 9.7806618720895, + "grad_norm": 3.171149730682373, + "learning_rate": 1.5147157426849613e-05, + "loss": 0.3362, "step": 709900 }, { - "epoch": 7.23, - "learning_rate": 3.345373673091225e-05, - "loss": 0.4323, + "epoch": 9.782039624149238, + "grad_norm": 6.223041534423828, + "learning_rate": 1.5139996682604716e-05, + "loss": 0.3002, "step": 710000 }, { - "epoch": 7.23, - "learning_rate": 3.3447147154389667e-05, - "loss": 0.4254, + "epoch": 9.783417376208977, + "grad_norm": 1.9789401292800903, + "learning_rate": 1.5132836988405318e-05, + "loss": 0.3222, "step": 710100 }, { - "epoch": 7.24, - "learning_rate": 3.344055740932217e-05, - "loss": 0.4392, + "epoch": 9.784795128268717, + "grad_norm": 3.9383902549743652, + "learning_rate": 1.5125678344859701e-05, + "loss": 0.3473, "step": 710200 }, { - "epoch": 7.24, - "learning_rate": 3.343396749603194e-05, - "loss": 0.4736, + "epoch": 9.786172880328456, + "grad_norm": 4.3298563957214355, + "learning_rate": 1.5118520752576006e-05, + "loss": 0.2954, "step": 710300 }, { - "epoch": 7.24, - "learning_rate": 3.34273774148412e-05, - "loss": 0.4429, + "epoch": 9.787550632388195, + "grad_norm": 3.706490993499756, + "learning_rate": 1.5111364212162314e-05, + "loss": 0.329, "step": 710400 }, { - "epoch": 7.24, - "learning_rate": 3.3420787166072154e-05, - "loss": 0.5264, + "epoch": 9.788928384447935, + "grad_norm": 2.9419243335723877, + "learning_rate": 1.5104208724226626e-05, + "loss": 0.361, "step": 710500 }, { - "epoch": 7.24, - "learning_rate": 3.341419675004703e-05, - "loss": 0.4354, + "epoch": 9.790306136507674, + "grad_norm": 82.68695068359375, + "learning_rate": 1.509705428937682e-05, + "loss": 0.3263, "step": 710600 }, { - "epoch": 7.24, - "learning_rate": 3.340760616708806e-05, - "loss": 0.4974, + "epoch": 9.791683888567414, + "grad_norm": 5.371208667755127, + "learning_rate": 1.5089900908220694e-05, + "loss": 0.3345, "step": 710700 }, { - "epoch": 7.24, - "learning_rate": 3.340101541751749e-05, - "loss": 0.4206, + "epoch": 9.793061640627153, + "grad_norm": 1.4671239852905273, + "learning_rate": 1.5082748581365988e-05, + "loss": 0.3247, "step": 710800 }, { - "epoch": 7.24, - "learning_rate": 3.339442450165755e-05, - "loss": 0.4174, + "epoch": 9.794439392686892, + "grad_norm": 2.2416317462921143, + "learning_rate": 1.507559730942031e-05, + "loss": 0.3345, "step": 710900 }, { - "epoch": 7.24, - "learning_rate": 3.3387833419830514e-05, - "loss": 0.4937, + "epoch": 9.795817144746632, + "grad_norm": 1.836391806602478, + "learning_rate": 1.5068447092991197e-05, + "loss": 0.2734, "step": 711000 }, { - "epoch": 7.24, - "learning_rate": 3.338124217235865e-05, - "loss": 0.5004, + "epoch": 9.79719489680637, + "grad_norm": 10.556929588317871, + "learning_rate": 1.5061297932686114e-05, + "loss": 0.3113, "step": 711100 }, { - "epoch": 7.25, - "learning_rate": 3.3374650759564206e-05, - "loss": 0.4407, + "epoch": 9.79857264886611, + "grad_norm": 2.046957492828369, + "learning_rate": 1.5054149829112408e-05, + "loss": 0.2577, "step": 711200 }, { - "epoch": 7.25, - "learning_rate": 3.336805918176949e-05, - "loss": 0.4816, + "epoch": 9.79995040092585, + "grad_norm": 2.323071002960205, + "learning_rate": 1.5047002782877337e-05, + "loss": 0.3219, "step": 711300 }, { - "epoch": 7.25, - "learning_rate": 3.336146743929678e-05, - "loss": 0.4715, + "epoch": 9.801328152985588, + "grad_norm": 0.9787241816520691, + "learning_rate": 1.5039856794588092e-05, + "loss": 0.3118, "step": 711400 }, { - "epoch": 7.25, - "learning_rate": 3.335487553246837e-05, - "loss": 0.431, + "epoch": 9.802705905045329, + "grad_norm": 2.4513192176818848, + "learning_rate": 1.5032711864851764e-05, + "loss": 0.3404, "step": 711500 }, { - "epoch": 7.25, - "learning_rate": 3.334828346160656e-05, - "loss": 0.3767, + "epoch": 9.804083657105068, + "grad_norm": 2.3759207725524902, + "learning_rate": 1.5025567994275336e-05, + "loss": 0.2763, "step": 711600 }, { - "epoch": 7.25, - "learning_rate": 3.334169122703368e-05, - "loss": 0.5239, + "epoch": 9.805461409164806, + "grad_norm": 6.9838786125183105, + "learning_rate": 1.5018425183465736e-05, + "loss": 0.3166, "step": 711700 }, { - "epoch": 7.25, - "learning_rate": 3.3335098829072034e-05, - "loss": 0.4847, + "epoch": 9.806839161224547, + "grad_norm": 2.0206189155578613, + "learning_rate": 1.501128343302977e-05, + "loss": 0.2958, "step": 711800 }, { - "epoch": 7.25, - "learning_rate": 3.332850626804397e-05, - "loss": 0.4398, + "epoch": 9.808216913284285, + "grad_norm": 3.627972364425659, + "learning_rate": 1.500414274357416e-05, + "loss": 0.2664, "step": 711900 }, { - "epoch": 7.25, - "learning_rate": 3.33219135442718e-05, - "loss": 0.4334, + "epoch": 9.809594665344024, + "grad_norm": 5.444515705108643, + "learning_rate": 1.4997003115705555e-05, + "loss": 0.3267, "step": 712000 }, { - "epoch": 7.25, - "learning_rate": 3.3315320658077885e-05, - "loss": 0.4497, + "epoch": 9.810972417403764, + "grad_norm": 3.542719841003418, + "learning_rate": 1.4989864550030507e-05, + "loss": 0.3198, "step": 712100 }, { - "epoch": 7.26, - "learning_rate": 3.330872760978458e-05, - "loss": 0.4365, + "epoch": 9.812350169463503, + "grad_norm": 2.6820003986358643, + "learning_rate": 1.4982727047155458e-05, + "loss": 0.3348, "step": 712200 }, { - "epoch": 7.26, - "learning_rate": 3.3302134399714225e-05, - "loss": 0.4006, + "epoch": 9.813727921523242, + "grad_norm": 2.5595321655273438, + "learning_rate": 1.497559060768679e-05, + "loss": 0.295, "step": 712300 }, { - "epoch": 7.26, - "learning_rate": 3.329554102818923e-05, - "loss": 0.5132, + "epoch": 9.815105673582982, + "grad_norm": 2.1263515949249268, + "learning_rate": 1.496845523223077e-05, + "loss": 0.3062, "step": 712400 }, { - "epoch": 7.26, - "learning_rate": 3.328894749553193e-05, - "loss": 0.3632, + "epoch": 9.816483425642721, + "grad_norm": 2.0316073894500732, + "learning_rate": 1.49613209213936e-05, + "loss": 0.2799, "step": 712500 }, { - "epoch": 7.26, - "learning_rate": 3.3282353802064723e-05, - "loss": 0.4345, + "epoch": 9.817861177702461, + "grad_norm": 4.8766608238220215, + "learning_rate": 1.4954187675781348e-05, + "loss": 0.2872, "step": 712600 }, { - "epoch": 7.26, - "learning_rate": 3.3275759948110004e-05, - "loss": 0.465, + "epoch": 9.8192389297622, + "grad_norm": 1.3855955600738525, + "learning_rate": 1.4947055496000042e-05, + "loss": 0.3232, "step": 712700 }, { - "epoch": 7.26, - "learning_rate": 3.326916593399018e-05, - "loss": 0.4819, + "epoch": 9.820616681821939, + "grad_norm": 2.3175644874572754, + "learning_rate": 1.4939924382655612e-05, + "loss": 0.3342, "step": 712800 }, { - "epoch": 7.26, - "learning_rate": 3.326257176002764e-05, - "loss": 0.4702, + "epoch": 9.82199443388168, + "grad_norm": 1.045776128768921, + "learning_rate": 1.4932794336353841e-05, + "loss": 0.3062, "step": 712900 }, { - "epoch": 7.26, - "learning_rate": 3.325597742654482e-05, - "loss": 0.4405, + "epoch": 9.823372185941418, + "grad_norm": 4.187001705169678, + "learning_rate": 1.4925665357700487e-05, + "loss": 0.2842, "step": 713000 }, { - "epoch": 7.27, - "learning_rate": 3.324938293386413e-05, - "loss": 0.4831, + "epoch": 9.824749938001156, + "grad_norm": 4.426389217376709, + "learning_rate": 1.4918537447301199e-05, + "loss": 0.3434, "step": 713100 }, { - "epoch": 7.27, - "learning_rate": 3.3242788282308004e-05, - "loss": 0.4246, + "epoch": 9.826127690060897, + "grad_norm": 8.635143280029297, + "learning_rate": 1.491141060576151e-05, + "loss": 0.337, "step": 713200 }, { - "epoch": 7.27, - "learning_rate": 3.3236193472198885e-05, - "loss": 0.4125, + "epoch": 9.827505442120636, + "grad_norm": 3.3043558597564697, + "learning_rate": 1.4904284833686893e-05, + "loss": 0.3429, "step": 713300 }, { - "epoch": 7.27, - "learning_rate": 3.322959850385922e-05, - "loss": 0.4668, + "epoch": 9.828883194180376, + "grad_norm": 4.820959568023682, + "learning_rate": 1.489723137340394e-05, + "loss": 0.3258, "step": 713400 }, { - "epoch": 7.27, - "learning_rate": 3.3223003377611454e-05, - "loss": 0.3906, + "epoch": 9.830260946240115, + "grad_norm": 2.582156181335449, + "learning_rate": 1.4890107731365744e-05, + "loss": 0.3299, "step": 713500 }, { - "epoch": 7.27, - "learning_rate": 3.3216408093778065e-05, - "loss": 0.4307, + "epoch": 9.831638698299853, + "grad_norm": 3.1191978454589844, + "learning_rate": 1.4882985160602402e-05, + "loss": 0.3117, "step": 713600 }, { - "epoch": 7.27, - "learning_rate": 3.3209812652681517e-05, - "loss": 0.5007, + "epoch": 9.833016450359594, + "grad_norm": 1.1854172945022583, + "learning_rate": 1.4875863661719016e-05, + "loss": 0.3368, "step": 713700 }, { - "epoch": 7.27, - "learning_rate": 3.3203217054644285e-05, - "loss": 0.4026, + "epoch": 9.834394202419332, + "grad_norm": 1.875817894935608, + "learning_rate": 1.4868743235320606e-05, + "loss": 0.2716, "step": 713800 }, { - "epoch": 7.27, - "learning_rate": 3.3196621299988845e-05, - "loss": 0.4598, + "epoch": 9.835771954479071, + "grad_norm": 3.114182472229004, + "learning_rate": 1.4861623882012082e-05, + "loss": 0.335, "step": 713900 }, { - "epoch": 7.27, - "learning_rate": 3.319002538903771e-05, - "loss": 0.4146, + "epoch": 9.837149706538812, + "grad_norm": 6.847827434539795, + "learning_rate": 1.4854505602398257e-05, + "loss": 0.3188, "step": 714000 }, { - "epoch": 7.28, - "learning_rate": 3.3183429322113366e-05, - "loss": 0.4003, + "epoch": 9.83852745859855, + "grad_norm": 2.2460103034973145, + "learning_rate": 1.4847388397083872e-05, + "loss": 0.346, "step": 714100 }, { - "epoch": 7.28, - "learning_rate": 3.317683309953832e-05, - "loss": 0.4651, + "epoch": 9.83990521065829, + "grad_norm": 1.74933660030365, + "learning_rate": 1.4840272266673586e-05, + "loss": 0.3253, "step": 714200 }, { - "epoch": 7.28, - "learning_rate": 3.317023672163511e-05, - "loss": 0.4559, + "epoch": 9.84128296271803, + "grad_norm": 4.5215277671813965, + "learning_rate": 1.4833157211771923e-05, + "loss": 0.2876, "step": 714300 }, { - "epoch": 7.28, - "learning_rate": 3.316364018872622e-05, - "loss": 0.5292, + "epoch": 9.842660714777768, + "grad_norm": 5.034599781036377, + "learning_rate": 1.4826114367442515e-05, + "loss": 0.2875, "step": 714400 }, { - "epoch": 7.28, - "learning_rate": 3.315704350113422e-05, - "loss": 0.4726, + "epoch": 9.844038466837508, + "grad_norm": 0.8060349822044373, + "learning_rate": 1.4819001454601254e-05, + "loss": 0.3501, "step": 714500 }, { - "epoch": 7.28, - "learning_rate": 3.315044665918163e-05, - "loss": 0.427, + "epoch": 9.845416218897247, + "grad_norm": 0.05606954172253609, + "learning_rate": 1.4811889619075706e-05, + "loss": 0.3047, "step": 714600 }, { - "epoch": 7.28, - "learning_rate": 3.3143849663191006e-05, - "loss": 0.4131, + "epoch": 9.846793970956986, + "grad_norm": 1.3229053020477295, + "learning_rate": 1.4804778861470035e-05, + "loss": 0.3594, "step": 714700 }, { - "epoch": 7.28, - "learning_rate": 3.3137252513484885e-05, - "loss": 0.4975, + "epoch": 9.848171723016726, + "grad_norm": 1.2164642810821533, + "learning_rate": 1.479766918238835e-05, + "loss": 0.3533, "step": 714800 }, { - "epoch": 7.28, - "learning_rate": 3.313065521038584e-05, - "loss": 0.518, + "epoch": 9.849549475076465, + "grad_norm": 2.3053691387176514, + "learning_rate": 1.4790560582434662e-05, + "loss": 0.3331, "step": 714900 }, { - "epoch": 7.28, - "learning_rate": 3.312405775421644e-05, - "loss": 0.4901, + "epoch": 9.850927227136205, + "grad_norm": 2.7995054721832275, + "learning_rate": 1.478345306221287e-05, + "loss": 0.3558, "step": 715000 }, { - "epoch": 7.29, - "learning_rate": 3.311746014529926e-05, - "loss": 0.4247, + "epoch": 9.852304979195944, + "grad_norm": 2.0508270263671875, + "learning_rate": 1.4776346622326792e-05, + "loss": 0.307, "step": 715100 }, { - "epoch": 7.29, - "learning_rate": 3.311086238395688e-05, - "loss": 0.4625, + "epoch": 9.853682731255683, + "grad_norm": 5.861050605773926, + "learning_rate": 1.4769241263380163e-05, + "loss": 0.2463, "step": 715200 }, { - "epoch": 7.29, - "learning_rate": 3.3104264470511904e-05, - "loss": 0.4399, + "epoch": 9.855060483315423, + "grad_norm": 2.458979845046997, + "learning_rate": 1.4762136985976626e-05, + "loss": 0.3354, "step": 715300 }, { - "epoch": 7.29, - "learning_rate": 3.309766640528691e-05, - "loss": 0.4969, + "epoch": 9.856438235375162, + "grad_norm": 1.6369178295135498, + "learning_rate": 1.4755033790719707e-05, + "loss": 0.2857, "step": 715400 }, { - "epoch": 7.29, - "learning_rate": 3.309106818860451e-05, - "loss": 0.4213, + "epoch": 9.8578159874349, + "grad_norm": 3.129263162612915, + "learning_rate": 1.4747931678212884e-05, + "loss": 0.3243, "step": 715500 }, { - "epoch": 7.29, - "learning_rate": 3.308446982078733e-05, - "loss": 0.4933, + "epoch": 9.859193739494641, + "grad_norm": 5.650224208831787, + "learning_rate": 1.4740830649059502e-05, + "loss": 0.2871, "step": 715600 }, { - "epoch": 7.29, - "learning_rate": 3.307787130215799e-05, - "loss": 0.4425, + "epoch": 9.86057149155438, + "grad_norm": 0.9417009353637695, + "learning_rate": 1.4733730703862825e-05, + "loss": 0.3099, "step": 715700 }, { - "epoch": 7.29, - "learning_rate": 3.30712726330391e-05, - "loss": 0.526, + "epoch": 9.86194924361412, + "grad_norm": 2.6635608673095703, + "learning_rate": 1.4726631843226031e-05, + "loss": 0.3388, "step": 715800 }, { - "epoch": 7.29, - "learning_rate": 3.3064673813753316e-05, - "loss": 0.5263, + "epoch": 9.863326995673859, + "grad_norm": 2.5292627811431885, + "learning_rate": 1.4719534067752224e-05, + "loss": 0.3035, "step": 715900 }, { - "epoch": 7.29, - "learning_rate": 3.3058074844623264e-05, - "loss": 0.4562, + "epoch": 9.864704747733597, + "grad_norm": 0.5057100057601929, + "learning_rate": 1.471243737804437e-05, + "loss": 0.3263, "step": 716000 }, { - "epoch": 7.3, - "learning_rate": 3.30514757259716e-05, - "loss": 0.5105, + "epoch": 9.866082499793338, + "grad_norm": 4.652040481567383, + "learning_rate": 1.4705341774705395e-05, + "loss": 0.311, "step": 716100 }, { - "epoch": 7.3, - "learning_rate": 3.304487645812099e-05, - "loss": 0.4353, + "epoch": 9.867460251853077, + "grad_norm": 2.5622265338897705, + "learning_rate": 1.4698247258338079e-05, + "loss": 0.3066, "step": 716200 }, { - "epoch": 7.3, - "learning_rate": 3.303827704139408e-05, - "loss": 0.471, + "epoch": 9.868838003912815, + "grad_norm": 8.261514663696289, + "learning_rate": 1.4691153829545165e-05, + "loss": 0.3083, "step": 716300 }, { - "epoch": 7.3, - "learning_rate": 3.303167747611357e-05, - "loss": 0.4247, + "epoch": 9.870215755972556, + "grad_norm": 2.6371586322784424, + "learning_rate": 1.4684061488929253e-05, + "loss": 0.3174, "step": 716400 }, { - "epoch": 7.3, - "learning_rate": 3.302507776260212e-05, - "loss": 0.4171, + "epoch": 9.871593508032294, + "grad_norm": 62.25011444091797, + "learning_rate": 1.467697023709289e-05, + "loss": 0.3197, "step": 716500 }, { - "epoch": 7.3, - "learning_rate": 3.3018477901182415e-05, - "loss": 0.4924, + "epoch": 9.872971260092033, + "grad_norm": 0.07412987947463989, + "learning_rate": 1.46698800746385e-05, + "loss": 0.3445, "step": 716600 }, { - "epoch": 7.3, - "learning_rate": 3.301187789217715e-05, - "loss": 0.3955, + "epoch": 9.874349012151773, + "grad_norm": 2.3381614685058594, + "learning_rate": 1.4662791002168447e-05, + "loss": 0.3293, "step": 716700 }, { - "epoch": 7.3, - "learning_rate": 3.300527773590903e-05, - "loss": 0.3796, + "epoch": 9.875726764211512, + "grad_norm": 9.242097854614258, + "learning_rate": 1.4655703020284961e-05, + "loss": 0.3575, "step": 716800 }, { - "epoch": 7.3, - "learning_rate": 3.299867743270078e-05, - "loss": 0.5173, + "epoch": 9.877104516271253, + "grad_norm": 3.0195701122283936, + "learning_rate": 1.4648616129590226e-05, + "loss": 0.3236, "step": 716900 }, { - "epoch": 7.3, - "learning_rate": 3.2992076982875083e-05, - "loss": 0.532, + "epoch": 9.878482268330991, + "grad_norm": 3.3999199867248535, + "learning_rate": 1.4641530330686286e-05, + "loss": 0.2787, "step": 717000 }, { - "epoch": 7.31, - "learning_rate": 3.298547638675468e-05, - "loss": 0.3563, + "epoch": 9.87986002039073, + "grad_norm": 1.4309579133987427, + "learning_rate": 1.4634445624175128e-05, + "loss": 0.3437, "step": 717100 }, { - "epoch": 7.31, - "learning_rate": 3.2978875644662304e-05, - "loss": 0.4047, + "epoch": 9.88123777245047, + "grad_norm": 3.080718517303467, + "learning_rate": 1.4627362010658647e-05, + "loss": 0.2883, "step": 717200 }, { - "epoch": 7.31, - "learning_rate": 3.297227475692068e-05, - "loss": 0.4877, + "epoch": 9.882615524510209, + "grad_norm": 1.9679582118988037, + "learning_rate": 1.4620279490738615e-05, + "loss": 0.3478, "step": 717300 }, { - "epoch": 7.31, - "learning_rate": 3.296567372385256e-05, - "loss": 0.4302, + "epoch": 9.88399327656995, + "grad_norm": 1.5996785163879395, + "learning_rate": 1.4613198065016727e-05, + "loss": 0.3943, "step": 717400 }, { - "epoch": 7.31, - "learning_rate": 3.29590725457807e-05, - "loss": 0.4902, + "epoch": 9.885371028629688, + "grad_norm": 3.8279483318328857, + "learning_rate": 1.4606117734094597e-05, + "loss": 0.3022, "step": 717500 }, { - "epoch": 7.31, - "learning_rate": 3.295247122302784e-05, - "loss": 0.4601, + "epoch": 9.886748780689427, + "grad_norm": 2.4598217010498047, + "learning_rate": 1.4599038498573724e-05, + "loss": 0.3554, "step": 717600 }, { - "epoch": 7.31, - "learning_rate": 3.294586975591677e-05, - "loss": 0.4622, + "epoch": 9.888126532749167, + "grad_norm": 1.9888883829116821, + "learning_rate": 1.4591960359055529e-05, + "loss": 0.2952, "step": 717700 }, { - "epoch": 7.31, - "learning_rate": 3.2939334161593616e-05, - "loss": 0.4232, + "epoch": 9.889504284808906, + "grad_norm": 1.9636355638504028, + "learning_rate": 1.4584883316141353e-05, + "loss": 0.311, "step": 717800 }, { - "epoch": 7.31, - "learning_rate": 3.293273240816994e-05, - "loss": 0.4487, + "epoch": 9.890882036868645, + "grad_norm": 1.9344828128814697, + "learning_rate": 1.4577807370432408e-05, + "loss": 0.3049, "step": 717900 }, { - "epoch": 7.32, - "learning_rate": 3.292613051135315e-05, - "loss": 0.4178, + "epoch": 9.892259788928385, + "grad_norm": 3.5512256622314453, + "learning_rate": 1.4570732522529828e-05, + "loss": 0.3654, "step": 718000 }, { - "epoch": 7.32, - "learning_rate": 3.2919528471466045e-05, - "loss": 0.4076, + "epoch": 9.893637540988124, + "grad_norm": 1.6024874448776245, + "learning_rate": 1.4563658773034663e-05, + "loss": 0.3393, "step": 718100 }, { - "epoch": 7.32, - "learning_rate": 3.2912926288831424e-05, - "loss": 0.4042, + "epoch": 9.895015293047862, + "grad_norm": 2.5506958961486816, + "learning_rate": 1.455658612254788e-05, + "loss": 0.3253, "step": 718200 }, { - "epoch": 7.32, - "learning_rate": 3.290632396377209e-05, - "loss": 0.4311, + "epoch": 9.896393045107603, + "grad_norm": 2.268040180206299, + "learning_rate": 1.4549514571670324e-05, + "loss": 0.3194, "step": 718300 }, { - "epoch": 7.32, - "learning_rate": 3.2899721496610853e-05, - "loss": 0.4534, + "epoch": 9.897770797167341, + "grad_norm": 1.7642951011657715, + "learning_rate": 1.454244412100275e-05, + "loss": 0.3306, "step": 718400 }, { - "epoch": 7.32, - "learning_rate": 3.289311888767055e-05, - "loss": 0.4882, + "epoch": 9.899148549227082, + "grad_norm": 3.9760279655456543, + "learning_rate": 1.4535374771145848e-05, + "loss": 0.291, "step": 718500 }, { - "epoch": 7.32, - "learning_rate": 3.2886516137273976e-05, - "loss": 0.4842, + "epoch": 9.90052630128682, + "grad_norm": 2.7596538066864014, + "learning_rate": 1.4528306522700175e-05, + "loss": 0.3369, "step": 718600 }, { - "epoch": 7.32, - "learning_rate": 3.2879913245744e-05, - "loss": 0.4497, + "epoch": 9.90190405334656, + "grad_norm": 1.5413869619369507, + "learning_rate": 1.4521239376266224e-05, + "loss": 0.3209, "step": 718700 }, { - "epoch": 7.32, - "learning_rate": 3.287331021340346e-05, - "loss": 0.427, + "epoch": 9.9032818054063, + "grad_norm": 2.7431859970092773, + "learning_rate": 1.45141733324444e-05, + "loss": 0.2955, "step": 718800 }, { - "epoch": 7.32, - "learning_rate": 3.286677307299783e-05, - "loss": 0.4375, + "epoch": 9.904659557466038, + "grad_norm": 1.5346322059631348, + "learning_rate": 1.4507108391834984e-05, + "loss": 0.255, "step": 718900 }, { - "epoch": 7.33, - "learning_rate": 3.286016976140475e-05, - "loss": 0.4674, + "epoch": 9.906037309525777, + "grad_norm": 2.869378089904785, + "learning_rate": 1.4500115187940302e-05, + "loss": 0.3162, "step": 719000 }, { - "epoch": 7.33, - "learning_rate": 3.285356630996644e-05, - "loss": 0.4905, + "epoch": 9.907415061585517, + "grad_norm": 13.489603042602539, + "learning_rate": 1.4493123066474644e-05, + "loss": 0.3316, "step": 719100 }, { - "epoch": 7.33, - "learning_rate": 3.2846962719005755e-05, - "loss": 0.5627, + "epoch": 9.908792813645256, + "grad_norm": 1.4707773923873901, + "learning_rate": 1.4486061416997158e-05, + "loss": 0.2865, "step": 719200 }, { - "epoch": 7.33, - "learning_rate": 3.2840358988845585e-05, - "loss": 0.4357, + "epoch": 9.910170565704997, + "grad_norm": 1.8340994119644165, + "learning_rate": 1.4479000873120344e-05, + "loss": 0.31, "step": 719300 }, { - "epoch": 7.33, - "learning_rate": 3.283375511980882e-05, - "loss": 0.464, + "epoch": 9.911548317764735, + "grad_norm": 1.2378714084625244, + "learning_rate": 1.447194143544402e-05, + "loss": 0.2684, "step": 719400 }, { - "epoch": 7.33, - "learning_rate": 3.2827151112218336e-05, - "loss": 0.4485, + "epoch": 9.912926069824474, + "grad_norm": 2.8528451919555664, + "learning_rate": 1.4464883104567913e-05, + "loss": 0.2904, "step": 719500 }, { - "epoch": 7.33, - "learning_rate": 3.2820546966397035e-05, - "loss": 0.4553, + "epoch": 9.914303821884214, + "grad_norm": 2.7626633644104004, + "learning_rate": 1.445782588109167e-05, + "loss": 0.3193, "step": 719600 }, { - "epoch": 7.33, - "learning_rate": 3.281394268266782e-05, - "loss": 0.4615, + "epoch": 9.915681573943953, + "grad_norm": 0.23233921825885773, + "learning_rate": 1.4450769765614857e-05, + "loss": 0.2861, "step": 719700 }, { - "epoch": 7.33, - "learning_rate": 3.280733826135359e-05, - "loss": 0.4348, + "epoch": 9.917059326003692, + "grad_norm": 4.531364917755127, + "learning_rate": 1.4443714758736898e-05, + "loss": 0.251, "step": 719800 }, { - "epoch": 7.33, - "learning_rate": 3.2800733702777276e-05, - "loss": 0.437, + "epoch": 9.918437078063432, + "grad_norm": 2.497335433959961, + "learning_rate": 1.4436660861057181e-05, + "loss": 0.3549, "step": 719900 }, { - "epoch": 7.34, - "learning_rate": 3.279412900726181e-05, - "loss": 0.4091, + "epoch": 9.91981483012317, + "grad_norm": 2.77644681930542, + "learning_rate": 1.4429608073174941e-05, + "loss": 0.3022, "step": 720000 }, { - "epoch": 7.34, - "learning_rate": 3.27875241751301e-05, - "loss": 0.407, + "epoch": 9.921192582182911, + "grad_norm": 1.5866984128952026, + "learning_rate": 1.4422556395689372e-05, + "loss": 0.321, "step": 720100 }, { - "epoch": 7.34, - "learning_rate": 3.27809192067051e-05, - "loss": 0.4499, + "epoch": 9.92257033424265, + "grad_norm": 1.859381914138794, + "learning_rate": 1.4415505829199532e-05, + "loss": 0.31, "step": 720200 }, { - "epoch": 7.34, - "learning_rate": 3.2774314102309736e-05, - "loss": 0.5122, + "epoch": 9.923948086302389, + "grad_norm": 3.298774242401123, + "learning_rate": 1.4408456374304424e-05, + "loss": 0.3041, "step": 720300 }, { - "epoch": 7.34, - "learning_rate": 3.276770886226698e-05, - "loss": 0.4187, + "epoch": 9.925325838362129, + "grad_norm": 2.7661261558532715, + "learning_rate": 1.440140803160291e-05, + "loss": 0.3219, "step": 720400 }, { - "epoch": 7.34, - "learning_rate": 3.2761103486899775e-05, - "loss": 0.3999, + "epoch": 9.926703590421868, + "grad_norm": 8.83280086517334, + "learning_rate": 1.4394360801693806e-05, + "loss": 0.3213, "step": 720500 }, { - "epoch": 7.34, - "learning_rate": 3.2754497976531076e-05, - "loss": 0.4975, + "epoch": 9.928081342481606, + "grad_norm": 2.0057129859924316, + "learning_rate": 1.438731468517579e-05, + "loss": 0.3385, "step": 720600 }, { - "epoch": 7.34, - "learning_rate": 3.2747892331483874e-05, - "loss": 0.4307, + "epoch": 9.929459094541347, + "grad_norm": 5.539236068725586, + "learning_rate": 1.4380269682647487e-05, + "loss": 0.3112, "step": 720700 }, { - "epoch": 7.34, - "learning_rate": 3.274128655208113e-05, - "loss": 0.4739, + "epoch": 9.930836846601085, + "grad_norm": 1.8977854251861572, + "learning_rate": 1.4373225794707385e-05, + "loss": 0.3497, "step": 720800 }, { - "epoch": 7.34, - "learning_rate": 3.273468063864584e-05, - "loss": 0.4882, + "epoch": 9.932214598660824, + "grad_norm": 1.5052602291107178, + "learning_rate": 1.4366183021953903e-05, + "loss": 0.2446, "step": 720900 }, { - "epoch": 7.35, - "learning_rate": 3.272807459150099e-05, - "loss": 0.4653, + "epoch": 9.933592350720565, + "grad_norm": 0.15553897619247437, + "learning_rate": 1.4359141364985382e-05, + "loss": 0.3405, "step": 721000 }, { - "epoch": 7.35, - "learning_rate": 3.272146841096956e-05, - "loss": 0.4065, + "epoch": 9.934970102780303, + "grad_norm": 5.117846965789795, + "learning_rate": 1.4352100824400026e-05, + "loss": 0.2837, "step": 721100 }, { - "epoch": 7.35, - "learning_rate": 3.271486209737457e-05, - "loss": 0.4249, + "epoch": 9.936347854840044, + "grad_norm": 2.749969959259033, + "learning_rate": 1.4345061400795961e-05, + "loss": 0.3189, "step": 721200 }, { - "epoch": 7.35, - "learning_rate": 3.270825565103903e-05, - "loss": 0.4704, + "epoch": 9.937725606899782, + "grad_norm": 8.230259895324707, + "learning_rate": 1.4338023094771239e-05, + "loss": 0.3192, "step": 721300 }, { - "epoch": 7.35, - "learning_rate": 3.270164907228594e-05, - "loss": 0.4776, + "epoch": 9.939103358959521, + "grad_norm": 6.327287197113037, + "learning_rate": 1.4330985906923782e-05, + "loss": 0.283, "step": 721400 }, { - "epoch": 7.35, - "learning_rate": 3.269504236143835e-05, - "loss": 0.4859, + "epoch": 9.940481111019261, + "grad_norm": 6.003288745880127, + "learning_rate": 1.4323949837851445e-05, + "loss": 0.3371, "step": 721500 }, { - "epoch": 7.35, - "learning_rate": 3.2688435518819253e-05, - "loss": 0.479, + "epoch": 9.941858863079, + "grad_norm": 3.5814056396484375, + "learning_rate": 1.4316914888151986e-05, + "loss": 0.3138, "step": 721600 }, { - "epoch": 7.35, - "learning_rate": 3.2681828544751716e-05, - "loss": 0.42, + "epoch": 9.94323661513874, + "grad_norm": 11.655320167541504, + "learning_rate": 1.4309881058423052e-05, + "loss": 0.314, "step": 721700 }, { - "epoch": 7.35, - "learning_rate": 3.267522143955876e-05, - "loss": 0.4384, + "epoch": 9.94461436719848, + "grad_norm": 3.9682466983795166, + "learning_rate": 1.430284834926219e-05, + "loss": 0.3669, "step": 721800 }, { - "epoch": 7.35, - "learning_rate": 3.2668614203563436e-05, - "loss": 0.4411, + "epoch": 9.945992119258218, + "grad_norm": 3.8927829265594482, + "learning_rate": 1.4295816761266882e-05, + "loss": 0.2976, "step": 721900 }, { - "epoch": 7.36, - "learning_rate": 3.266200683708882e-05, - "loss": 0.3853, + "epoch": 9.947369871317958, + "grad_norm": 3.0761945247650146, + "learning_rate": 1.4288786295034499e-05, + "loss": 0.3407, "step": 722000 }, { - "epoch": 7.36, - "learning_rate": 3.2655399340457944e-05, - "loss": 0.4367, + "epoch": 9.948747623377697, + "grad_norm": 4.82087516784668, + "learning_rate": 1.42817569511623e-05, + "loss": 0.2746, "step": 722100 }, { - "epoch": 7.36, - "learning_rate": 3.264879171399389e-05, - "loss": 0.3963, + "epoch": 9.950125375437436, + "grad_norm": 1.373894214630127, + "learning_rate": 1.427472873024748e-05, + "loss": 0.2938, "step": 722200 }, { - "epoch": 7.36, - "learning_rate": 3.2642183958019726e-05, - "loss": 0.4308, + "epoch": 9.951503127497176, + "grad_norm": 1.8402490615844727, + "learning_rate": 1.4267701632887118e-05, + "loss": 0.3042, "step": 722300 }, { - "epoch": 7.36, - "learning_rate": 3.263557607285853e-05, - "loss": 0.4543, + "epoch": 9.952880879556915, + "grad_norm": 4.238638877868652, + "learning_rate": 1.4260675659678182e-05, + "loss": 0.3074, "step": 722400 }, { - "epoch": 7.36, - "learning_rate": 3.26289680588334e-05, - "loss": 0.4167, + "epoch": 9.954258631616653, + "grad_norm": 1.7759859561920166, + "learning_rate": 1.4253650811217586e-05, + "loss": 0.3583, "step": 722500 }, { - "epoch": 7.36, - "learning_rate": 3.2622359916267424e-05, - "loss": 0.4283, + "epoch": 9.955636383676394, + "grad_norm": 1.4588953256607056, + "learning_rate": 1.424662708810213e-05, + "loss": 0.3156, "step": 722600 }, { - "epoch": 7.36, - "learning_rate": 3.2615751645483686e-05, - "loss": 0.4649, + "epoch": 9.957014135736133, + "grad_norm": 1.7758018970489502, + "learning_rate": 1.4239604490928506e-05, + "loss": 0.2542, "step": 722700 }, { - "epoch": 7.36, - "learning_rate": 3.260914324680531e-05, - "loss": 0.4988, + "epoch": 9.958391887795873, + "grad_norm": 6.296766757965088, + "learning_rate": 1.4232583020293314e-05, + "loss": 0.2991, "step": 722800 }, { - "epoch": 7.37, - "learning_rate": 3.260260080644832e-05, - "loss": 0.3935, + "epoch": 9.959769639855612, + "grad_norm": 3.489271879196167, + "learning_rate": 1.4225562676793065e-05, + "loss": 0.3329, "step": 722900 }, { - "epoch": 7.37, - "learning_rate": 3.259599215422088e-05, - "loss": 0.4048, + "epoch": 9.96114739191535, + "grad_norm": 4.678731441497803, + "learning_rate": 1.4218543461024189e-05, + "loss": 0.313, "step": 723000 }, { - "epoch": 7.37, - "learning_rate": 3.2589383375064905e-05, - "loss": 0.442, + "epoch": 9.96252514397509, + "grad_norm": 4.383260726928711, + "learning_rate": 1.4211525373582986e-05, + "loss": 0.2831, "step": 723100 }, { - "epoch": 7.37, - "learning_rate": 3.2582774469303536e-05, - "loss": 0.5406, + "epoch": 9.96390289603483, + "grad_norm": 2.5311295986175537, + "learning_rate": 1.4204578579060728e-05, + "loss": 0.2954, "step": 723200 }, { - "epoch": 7.37, - "learning_rate": 3.257623152820439e-05, - "loss": 0.4263, + "epoch": 9.965280648094568, + "grad_norm": 2.3203036785125732, + "learning_rate": 1.4197562738765315e-05, + "loss": 0.3161, "step": 723300 }, { - "epoch": 7.37, - "learning_rate": 3.2569622371459646e-05, - "loss": 0.4512, + "epoch": 9.966658400154309, + "grad_norm": 5.560645580291748, + "learning_rate": 1.4190548028580002e-05, + "loss": 0.2859, "step": 723400 }, { - "epoch": 7.37, - "learning_rate": 3.25630130890757e-05, - "loss": 0.5269, + "epoch": 9.968036152214047, + "grad_norm": 2.037083625793457, + "learning_rate": 1.4183534449100716e-05, + "loss": 0.3011, "step": 723500 }, { - "epoch": 7.37, - "learning_rate": 3.255640368137571e-05, - "loss": 0.3814, + "epoch": 9.969413904273788, + "grad_norm": 1.9922387599945068, + "learning_rate": 1.4176522000923302e-05, + "loss": 0.2708, "step": 723600 }, { - "epoch": 7.37, - "learning_rate": 3.254979414868283e-05, - "loss": 0.4156, + "epoch": 9.970791656333526, + "grad_norm": 3.6708500385284424, + "learning_rate": 1.4169510684643519e-05, + "loss": 0.3035, "step": 723700 }, { - "epoch": 7.37, - "learning_rate": 3.254318449132024e-05, - "loss": 0.4891, + "epoch": 9.972169408393265, + "grad_norm": 1.4355281591415405, + "learning_rate": 1.4162500500857003e-05, + "loss": 0.3347, "step": 723800 }, { - "epoch": 7.38, - "learning_rate": 3.2536574709611106e-05, - "loss": 0.4262, + "epoch": 9.973547160453005, + "grad_norm": 5.6878662109375, + "learning_rate": 1.4155491450159289e-05, + "loss": 0.3428, "step": 723900 }, { - "epoch": 7.38, - "learning_rate": 3.252996480387861e-05, - "loss": 0.4947, + "epoch": 9.974924912512744, + "grad_norm": 2.0668327808380127, + "learning_rate": 1.4148483533145851e-05, + "loss": 0.2941, "step": 724000 }, { - "epoch": 7.38, - "learning_rate": 3.2523354774445925e-05, - "loss": 0.4076, + "epoch": 9.976302664572483, + "grad_norm": 5.348653793334961, + "learning_rate": 1.414147675041205e-05, + "loss": 0.3552, "step": 724100 }, { - "epoch": 7.38, - "learning_rate": 3.251674462163626e-05, - "loss": 0.4013, + "epoch": 9.977680416632223, + "grad_norm": 3.0700888633728027, + "learning_rate": 1.4134471102553132e-05, + "loss": 0.3476, "step": 724200 }, { - "epoch": 7.38, - "learning_rate": 3.251013434577279e-05, - "loss": 0.4886, + "epoch": 9.979058168691962, + "grad_norm": 13.120688438415527, + "learning_rate": 1.4127466590164279e-05, + "loss": 0.3498, "step": 724300 }, { - "epoch": 7.38, - "learning_rate": 3.250352394717873e-05, - "loss": 0.4672, + "epoch": 9.980435920751702, + "grad_norm": 2.3017940521240234, + "learning_rate": 1.4120463213840556e-05, + "loss": 0.265, "step": 724400 }, { - "epoch": 7.38, - "learning_rate": 3.24969134261773e-05, - "loss": 0.4508, + "epoch": 9.981813672811441, + "grad_norm": 75.67904663085938, + "learning_rate": 1.411346097417692e-05, + "loss": 0.3503, "step": 724500 }, { - "epoch": 7.38, - "learning_rate": 3.249030278309169e-05, - "loss": 0.3874, + "epoch": 9.98319142487118, + "grad_norm": 8.453766822814941, + "learning_rate": 1.4106459871768263e-05, + "loss": 0.2894, "step": 724600 }, { - "epoch": 7.38, - "learning_rate": 3.248369201824515e-05, - "loss": 0.4552, + "epoch": 9.98456917693092, + "grad_norm": 3.7739968299865723, + "learning_rate": 1.4099459907209369e-05, + "loss": 0.2567, "step": 724700 }, { - "epoch": 7.38, - "learning_rate": 3.2477081131960885e-05, - "loss": 0.4548, + "epoch": 9.985946928990659, + "grad_norm": 6.567869663238525, + "learning_rate": 1.4092461081094898e-05, + "loss": 0.3133, "step": 724800 }, { - "epoch": 7.39, - "learning_rate": 3.2470470124562136e-05, - "loss": 0.4618, + "epoch": 9.987324681050398, + "grad_norm": 1.419635534286499, + "learning_rate": 1.4085463394019464e-05, + "loss": 0.3031, "step": 724900 }, { - "epoch": 7.39, - "learning_rate": 3.246392510825089e-05, - "loss": 0.4555, + "epoch": 9.988702433110138, + "grad_norm": 2.9086742401123047, + "learning_rate": 1.4078466846577533e-05, + "loss": 0.2775, "step": 725000 }, { - "epoch": 7.39, - "learning_rate": 3.245731386079597e-05, - "loss": 0.4919, + "epoch": 9.990080185169877, + "grad_norm": 3.0090432167053223, + "learning_rate": 1.4071471439363518e-05, + "loss": 0.276, "step": 725100 }, { - "epoch": 7.39, - "learning_rate": 3.245070249319308e-05, - "loss": 0.4294, + "epoch": 9.991457937229615, + "grad_norm": 1.9325841665267944, + "learning_rate": 1.4064477172971695e-05, + "loss": 0.2695, "step": 725200 }, { - "epoch": 7.39, - "learning_rate": 3.244409100576545e-05, - "loss": 0.4116, + "epoch": 9.992835689289356, + "grad_norm": 5.092696189880371, + "learning_rate": 1.4057484047996282e-05, + "loss": 0.2886, "step": 725300 }, { - "epoch": 7.39, - "learning_rate": 3.243747939883636e-05, - "loss": 0.4804, + "epoch": 9.994213441349094, + "grad_norm": 2.2072484493255615, + "learning_rate": 1.4050492065031358e-05, + "loss": 0.2941, "step": 725400 }, { - "epoch": 7.39, - "learning_rate": 3.2430867672729075e-05, - "loss": 0.4597, + "epoch": 9.995591193408835, + "grad_norm": 2.657590627670288, + "learning_rate": 1.4043501224670952e-05, + "loss": 0.359, "step": 725500 }, { - "epoch": 7.39, - "learning_rate": 3.242425582776686e-05, - "loss": 0.3916, + "epoch": 9.996968945468574, + "grad_norm": 14.954183578491211, + "learning_rate": 1.4036511527508947e-05, + "loss": 0.3251, "step": 725600 }, { - "epoch": 7.39, - "learning_rate": 3.2417643864273006e-05, - "loss": 0.4334, + "epoch": 9.998346697528312, + "grad_norm": 4.210543632507324, + "learning_rate": 1.4029522974139181e-05, + "loss": 0.3161, "step": 725700 }, { - "epoch": 7.39, - "learning_rate": 3.2411031782570796e-05, - "loss": 0.4799, + "epoch": 9.999724449588053, + "grad_norm": 9.302190780639648, + "learning_rate": 1.4022535565155338e-05, + "loss": 0.3349, "step": 725800 }, { - "epoch": 7.4, - "learning_rate": 3.240441958298352e-05, - "loss": 0.4739, + "epoch": 10.001102201647791, + "grad_norm": 3.400059223175049, + "learning_rate": 1.4015549301151056e-05, + "loss": 0.2936, "step": 725900 }, { - "epoch": 7.4, - "learning_rate": 3.2397807265834476e-05, - "loss": 0.4246, + "epoch": 10.00247995370753, + "grad_norm": 4.8409833908081055, + "learning_rate": 1.400856418271985e-05, + "loss": 0.2517, "step": 726000 }, { - "epoch": 7.4, - "learning_rate": 3.2391194831446964e-05, - "loss": 0.4076, + "epoch": 10.00385770576727, + "grad_norm": 5.489073276519775, + "learning_rate": 1.4001580210455143e-05, + "loss": 0.2631, "step": 726100 }, { - "epoch": 7.4, - "learning_rate": 3.23845822801443e-05, - "loss": 0.4704, + "epoch": 10.005235457827009, + "grad_norm": 6.839758396148682, + "learning_rate": 1.399459738495024e-05, + "loss": 0.3201, "step": 726200 }, { - "epoch": 7.4, - "learning_rate": 3.237796961224978e-05, - "loss": 0.468, + "epoch": 10.00661320988675, + "grad_norm": 2.8148086071014404, + "learning_rate": 1.3987615706798397e-05, + "loss": 0.2547, "step": 726300 }, { - "epoch": 7.4, - "learning_rate": 3.2371356828086744e-05, - "loss": 0.4758, + "epoch": 10.007990961946488, + "grad_norm": 2.114227771759033, + "learning_rate": 1.3980635176592716e-05, + "loss": 0.2726, "step": 726400 }, { - "epoch": 7.4, - "learning_rate": 3.236474392797852e-05, - "loss": 0.3783, + "epoch": 10.009368714006227, + "grad_norm": 5.301149845123291, + "learning_rate": 1.3973655794926242e-05, + "loss": 0.3044, "step": 726500 }, { - "epoch": 7.4, - "learning_rate": 3.235813091224842e-05, - "loss": 0.4618, + "epoch": 10.010746466065967, + "grad_norm": 0.5168023705482483, + "learning_rate": 1.3966677562391916e-05, + "loss": 0.2595, "step": 726600 }, { - "epoch": 7.4, - "learning_rate": 3.235151778121979e-05, - "loss": 0.4859, + "epoch": 10.012124218125706, + "grad_norm": 1.8498117923736572, + "learning_rate": 1.3959700479582572e-05, + "loss": 0.2763, "step": 726700 }, { - "epoch": 7.4, - "learning_rate": 3.234490453521598e-05, - "loss": 0.4638, + "epoch": 10.013501970185445, + "grad_norm": 1.5371010303497314, + "learning_rate": 1.3952724547090929e-05, + "loss": 0.2726, "step": 726800 }, { - "epoch": 7.41, - "learning_rate": 3.233829117456033e-05, - "loss": 0.4402, + "epoch": 10.014879722245185, + "grad_norm": 4.805660724639893, + "learning_rate": 1.3945749765509646e-05, + "loss": 0.3086, "step": 726900 }, { - "epoch": 7.41, - "learning_rate": 3.2331677699576184e-05, - "loss": 0.3863, + "epoch": 10.016257474304924, + "grad_norm": 2.421093463897705, + "learning_rate": 1.3938776135431275e-05, + "loss": 0.2814, "step": 727000 }, { - "epoch": 7.41, - "learning_rate": 3.2325064110586936e-05, - "loss": 0.3704, + "epoch": 10.017635226364664, + "grad_norm": 3.1444449424743652, + "learning_rate": 1.3931803657448238e-05, + "loss": 0.2762, "step": 727100 }, { - "epoch": 7.41, - "learning_rate": 3.231845040791591e-05, - "loss": 0.4645, + "epoch": 10.019012978424403, + "grad_norm": 5.2588372230529785, + "learning_rate": 1.3924832332152905e-05, + "loss": 0.2743, "step": 727200 }, { - "epoch": 7.41, - "learning_rate": 3.231183659188651e-05, - "loss": 0.3886, + "epoch": 10.020390730484142, + "grad_norm": 1.2897651195526123, + "learning_rate": 1.3917862160137517e-05, + "loss": 0.2888, "step": 727300 }, { - "epoch": 7.41, - "learning_rate": 3.230522266282209e-05, - "loss": 0.442, + "epoch": 10.021768482543882, + "grad_norm": 2.2728793621063232, + "learning_rate": 1.391089314199421e-05, + "loss": 0.2659, "step": 727400 }, { - "epoch": 7.41, - "learning_rate": 3.2298608621046025e-05, - "loss": 0.447, + "epoch": 10.02314623460362, + "grad_norm": 1.017311692237854, + "learning_rate": 1.3903925278315054e-05, + "loss": 0.2736, "step": 727500 }, { - "epoch": 7.41, - "learning_rate": 3.229199446688173e-05, - "loss": 0.4259, + "epoch": 10.02452398666336, + "grad_norm": 2.155237913131714, + "learning_rate": 1.3896958569692013e-05, + "loss": 0.2486, "step": 727600 }, { - "epoch": 7.41, - "learning_rate": 3.2285380200652584e-05, - "loss": 0.4432, + "epoch": 10.0259017387231, + "grad_norm": 4.465235710144043, + "learning_rate": 1.3889993016716934e-05, + "loss": 0.3012, "step": 727700 }, { - "epoch": 7.41, - "learning_rate": 3.2278765822681974e-05, - "loss": 0.4265, + "epoch": 10.027279490782838, + "grad_norm": 3.0036001205444336, + "learning_rate": 1.3883028619981569e-05, + "loss": 0.3288, "step": 727800 }, { - "epoch": 7.42, - "learning_rate": 3.227215133329333e-05, - "loss": 0.5108, + "epoch": 10.028657242842579, + "grad_norm": 2.107567071914673, + "learning_rate": 1.3876065380077582e-05, + "loss": 0.2845, "step": 727900 }, { - "epoch": 7.42, - "learning_rate": 3.226553673281003e-05, - "loss": 0.4478, + "epoch": 10.030034994902318, + "grad_norm": 3.331310272216797, + "learning_rate": 1.3869103297596553e-05, + "loss": 0.2522, "step": 728000 }, { - "epoch": 7.42, - "learning_rate": 3.225892202155551e-05, - "loss": 0.4618, + "epoch": 10.031412746962056, + "grad_norm": 8.626947402954102, + "learning_rate": 1.3862142373129925e-05, + "loss": 0.2874, "step": 728100 }, { - "epoch": 7.42, - "learning_rate": 3.225230719985319e-05, - "loss": 0.437, + "epoch": 10.032790499021797, + "grad_norm": 5.837547302246094, + "learning_rate": 1.3855182607269072e-05, + "loss": 0.2735, "step": 728200 }, { - "epoch": 7.42, - "learning_rate": 3.224569226802648e-05, - "loss": 0.4036, + "epoch": 10.034168251081535, + "grad_norm": 6.366975784301758, + "learning_rate": 1.3848224000605284e-05, + "loss": 0.2652, "step": 728300 }, { - "epoch": 7.42, - "learning_rate": 3.223907722639882e-05, - "loss": 0.3911, + "epoch": 10.035546003141274, + "grad_norm": 6.6881890296936035, + "learning_rate": 1.384126655372969e-05, + "loss": 0.2909, "step": 728400 }, { - "epoch": 7.42, - "learning_rate": 3.2232462075293656e-05, - "loss": 0.44, + "epoch": 10.036923755201014, + "grad_norm": 2.935879945755005, + "learning_rate": 1.3834310267233379e-05, + "loss": 0.3375, "step": 728500 }, { - "epoch": 7.42, - "learning_rate": 3.2225846815034406e-05, - "loss": 0.4357, + "epoch": 10.038301507260753, + "grad_norm": 0.49858301877975464, + "learning_rate": 1.3827355141707332e-05, + "loss": 0.2761, "step": 728600 }, { - "epoch": 7.42, - "learning_rate": 3.221923144594454e-05, - "loss": 0.4229, + "epoch": 10.039679259320494, + "grad_norm": 2.6344661712646484, + "learning_rate": 1.3820401177742405e-05, + "loss": 0.2603, "step": 728700 }, { - "epoch": 7.43, - "learning_rate": 3.221261596834749e-05, - "loss": 0.391, + "epoch": 10.041057011380232, + "grad_norm": 11.682106971740723, + "learning_rate": 1.3813448375929395e-05, + "loss": 0.3186, "step": 728800 }, { - "epoch": 7.43, - "learning_rate": 3.220600038256673e-05, - "loss": 0.4338, + "epoch": 10.04243476343997, + "grad_norm": 3.943646192550659, + "learning_rate": 1.3806496736858953e-05, + "loss": 0.2964, "step": 728900 }, { - "epoch": 7.43, - "learning_rate": 3.2199384688925715e-05, - "loss": 0.3893, + "epoch": 10.043812515499711, + "grad_norm": 3.1700892448425293, + "learning_rate": 1.3799546261121682e-05, + "loss": 0.2298, "step": 729000 }, { - "epoch": 7.43, - "learning_rate": 3.219276888774791e-05, - "loss": 0.4304, + "epoch": 10.04519026755945, + "grad_norm": 2.388519048690796, + "learning_rate": 1.3792596949308032e-05, + "loss": 0.2763, "step": 729100 }, { - "epoch": 7.43, - "learning_rate": 3.21861529793568e-05, - "loss": 0.4552, + "epoch": 10.046568019619189, + "grad_norm": 4.996067047119141, + "learning_rate": 1.37856488020084e-05, + "loss": 0.3055, "step": 729200 }, { - "epoch": 7.43, - "learning_rate": 3.2179536964075856e-05, - "loss": 0.3944, + "epoch": 10.047945771678929, + "grad_norm": 1.8150806427001953, + "learning_rate": 1.3778701819813073e-05, + "loss": 0.2841, "step": 729300 }, { - "epoch": 7.43, - "learning_rate": 3.217292084222855e-05, - "loss": 0.4667, + "epoch": 10.049323523738668, + "grad_norm": 3.8512775897979736, + "learning_rate": 1.3771756003312228e-05, + "loss": 0.3006, "step": 729400 }, { - "epoch": 7.43, - "learning_rate": 3.216630461413839e-05, - "loss": 0.4143, + "epoch": 10.050701275798406, + "grad_norm": 3.4396886825561523, + "learning_rate": 1.376481135309593e-05, + "loss": 0.3091, "step": 729500 }, { - "epoch": 7.43, - "learning_rate": 3.215968828012886e-05, - "loss": 0.4525, + "epoch": 10.052079027858147, + "grad_norm": 6.079519271850586, + "learning_rate": 1.375786786975419e-05, + "loss": 0.3078, "step": 729600 }, { - "epoch": 7.43, - "learning_rate": 3.215307184052347e-05, - "loss": 0.3993, + "epoch": 10.053456779917886, + "grad_norm": 0.27762430906295776, + "learning_rate": 1.3750925553876866e-05, + "loss": 0.2583, "step": 729700 }, { - "epoch": 7.44, - "learning_rate": 3.214645529564571e-05, - "loss": 0.475, + "epoch": 10.054834531977626, + "grad_norm": 3.54455304145813, + "learning_rate": 1.3743984406053755e-05, + "loss": 0.2898, "step": 729800 }, { - "epoch": 7.44, - "learning_rate": 3.213983864581909e-05, - "loss": 0.482, + "epoch": 10.056212284037365, + "grad_norm": 5.610214710235596, + "learning_rate": 1.3737113820879626e-05, + "loss": 0.2955, "step": 729900 }, { - "epoch": 7.44, - "learning_rate": 3.213322189136713e-05, - "loss": 0.4554, + "epoch": 10.057590036097103, + "grad_norm": 3.868779420852661, + "learning_rate": 1.373017499923865e-05, + "loss": 0.3041, "step": 730000 }, { - "epoch": 7.44, - "learning_rate": 3.2126605032613356e-05, - "loss": 0.4303, + "epoch": 10.058967788156844, + "grad_norm": 0.3032437264919281, + "learning_rate": 1.3723237347414767e-05, + "loss": 0.2272, "step": 730100 }, { - "epoch": 7.44, - "learning_rate": 3.211998806988128e-05, - "loss": 0.4397, + "epoch": 10.060345540216582, + "grad_norm": 2.3694279193878174, + "learning_rate": 1.3716300865997338e-05, + "loss": 0.291, "step": 730200 }, { - "epoch": 7.44, - "learning_rate": 3.211337100349445e-05, - "loss": 0.37, + "epoch": 10.061723292276321, + "grad_norm": 5.370370864868164, + "learning_rate": 1.370936555557568e-05, + "loss": 0.2935, "step": 730300 }, { - "epoch": 7.44, - "learning_rate": 3.2106753833776374e-05, - "loss": 0.4067, + "epoch": 10.063101044336062, + "grad_norm": 5.260372638702393, + "learning_rate": 1.3702431416738957e-05, + "loss": 0.3145, "step": 730400 }, { - "epoch": 7.44, - "learning_rate": 3.210013656105062e-05, - "loss": 0.3679, + "epoch": 10.0644787963958, + "grad_norm": 2.998494863510132, + "learning_rate": 1.3695498450076287e-05, + "loss": 0.2738, "step": 730500 }, { - "epoch": 7.44, - "learning_rate": 3.209351918564073e-05, - "loss": 0.4635, + "epoch": 10.06585654845554, + "grad_norm": 3.5449092388153076, + "learning_rate": 1.3688566656176636e-05, + "loss": 0.3368, "step": 730600 }, { - "epoch": 7.44, - "learning_rate": 3.208690170787023e-05, - "loss": 0.4427, + "epoch": 10.06723430051528, + "grad_norm": 3.2793149948120117, + "learning_rate": 1.3681636035628916e-05, + "loss": 0.3254, "step": 730700 }, { - "epoch": 7.45, - "learning_rate": 3.208028412806269e-05, - "loss": 0.3532, + "epoch": 10.068612052575018, + "grad_norm": 0.44621741771698, + "learning_rate": 1.3674706589021898e-05, + "loss": 0.2847, "step": 730800 }, { - "epoch": 7.45, - "learning_rate": 3.207366644654168e-05, - "loss": 0.4387, + "epoch": 10.069989804634758, + "grad_norm": 2.9050819873809814, + "learning_rate": 1.3667778316944287e-05, + "loss": 0.3426, "step": 730900 }, { - "epoch": 7.45, - "learning_rate": 3.2067048663630745e-05, - "loss": 0.4334, + "epoch": 10.071367556694497, + "grad_norm": 1.4298114776611328, + "learning_rate": 1.3660851219984683e-05, + "loss": 0.2409, "step": 731000 }, { - "epoch": 7.45, - "learning_rate": 3.206043077965347e-05, - "loss": 0.4808, + "epoch": 10.072745308754236, + "grad_norm": 1.2119760513305664, + "learning_rate": 1.3653925298731566e-05, + "loss": 0.2892, "step": 731100 }, { - "epoch": 7.45, - "learning_rate": 3.205381279493344e-05, - "loss": 0.4265, + "epoch": 10.074123060813976, + "grad_norm": 3.488784074783325, + "learning_rate": 1.3647000553773322e-05, + "loss": 0.3079, "step": 731200 }, { - "epoch": 7.45, - "learning_rate": 3.20471947097942e-05, - "loss": 0.4377, + "epoch": 10.075500812873715, + "grad_norm": 12.563276290893555, + "learning_rate": 1.3640076985698264e-05, + "loss": 0.2986, "step": 731300 }, { - "epoch": 7.45, - "learning_rate": 3.204057652455936e-05, - "loss": 0.4497, + "epoch": 10.076878564933455, + "grad_norm": 2.4049036502838135, + "learning_rate": 1.3633154595094564e-05, + "loss": 0.2963, "step": 731400 }, { - "epoch": 7.45, - "learning_rate": 3.203395823955251e-05, - "loss": 0.4512, + "epoch": 10.078256316993194, + "grad_norm": 2.8991074562072754, + "learning_rate": 1.362630258884244e-05, + "loss": 0.3828, "step": 731500 }, { - "epoch": 7.45, - "learning_rate": 3.202733985509724e-05, - "loss": 0.453, + "epoch": 10.079634069052933, + "grad_norm": 5.3424248695373535, + "learning_rate": 1.3619382543156274e-05, + "loss": 0.2668, "step": 731600 }, { - "epoch": 7.45, - "learning_rate": 3.202072137151715e-05, - "loss": 0.4162, + "epoch": 10.081011821112673, + "grad_norm": 1.4488914012908936, + "learning_rate": 1.3612463676699566e-05, + "loss": 0.3032, "step": 731700 }, { - "epoch": 7.46, - "learning_rate": 3.201410278913584e-05, - "loss": 0.4843, + "epoch": 10.082389573172412, + "grad_norm": 2.85404634475708, + "learning_rate": 1.3605545990060124e-05, + "loss": 0.3251, "step": 731800 }, { - "epoch": 7.46, - "learning_rate": 3.200748410827693e-05, - "loss": 0.4794, + "epoch": 10.08376732523215, + "grad_norm": 3.097522258758545, + "learning_rate": 1.3598629483825623e-05, + "loss": 0.2487, "step": 731900 }, { - "epoch": 7.46, - "learning_rate": 3.2000865329264026e-05, - "loss": 0.5307, + "epoch": 10.085145077291891, + "grad_norm": 3.816513776779175, + "learning_rate": 1.3591714158583669e-05, + "loss": 0.279, "step": 732000 }, { - "epoch": 7.46, - "learning_rate": 3.199431264167237e-05, - "loss": 0.4165, + "epoch": 10.08652282935163, + "grad_norm": 4.017214298248291, + "learning_rate": 1.358480001492177e-05, + "loss": 0.284, "step": 732100 }, { - "epoch": 7.46, - "learning_rate": 3.1987693668295815e-05, - "loss": 0.4332, + "epoch": 10.08790058141137, + "grad_norm": 3.5168819427490234, + "learning_rate": 1.3577887053427281e-05, + "loss": 0.2769, "step": 732200 }, { - "epoch": 7.46, - "learning_rate": 3.198107459773288e-05, - "loss": 0.4313, + "epoch": 10.089278333471109, + "grad_norm": 2.658149480819702, + "learning_rate": 1.3570975274687512e-05, + "loss": 0.2712, "step": 732300 }, { - "epoch": 7.46, - "learning_rate": 3.197445543030724e-05, - "loss": 0.3313, + "epoch": 10.090656085530847, + "grad_norm": 6.469781398773193, + "learning_rate": 1.356406467928966e-05, + "loss": 0.3032, "step": 732400 }, { - "epoch": 7.46, - "learning_rate": 3.1967836166342506e-05, - "loss": 0.4287, + "epoch": 10.092033837590588, + "grad_norm": 2.4033329486846924, + "learning_rate": 1.3557155267820805e-05, + "loss": 0.2852, "step": 732500 }, { - "epoch": 7.46, - "learning_rate": 3.196121680616232e-05, - "loss": 0.4063, + "epoch": 10.093411589650326, + "grad_norm": 2.9667887687683105, + "learning_rate": 1.3550247040867937e-05, + "loss": 0.319, "step": 732600 }, { - "epoch": 7.46, - "learning_rate": 3.195459735009034e-05, - "loss": 0.408, + "epoch": 10.094789341710065, + "grad_norm": 3.0170135498046875, + "learning_rate": 1.3543339999017959e-05, + "loss": 0.2911, "step": 732700 }, { - "epoch": 7.47, - "learning_rate": 3.194797779845021e-05, - "loss": 0.4181, + "epoch": 10.096167093769806, + "grad_norm": 6.031506061553955, + "learning_rate": 1.3536434142857654e-05, + "loss": 0.2968, "step": 732800 }, { - "epoch": 7.47, - "learning_rate": 3.194135815156559e-05, - "loss": 0.5025, + "epoch": 10.097544845829544, + "grad_norm": 1.1898102760314941, + "learning_rate": 1.352952947297369e-05, + "loss": 0.3323, "step": 732900 }, { - "epoch": 7.47, - "learning_rate": 3.193473840976015e-05, - "loss": 0.4057, + "epoch": 10.098922597889285, + "grad_norm": 1.4634368419647217, + "learning_rate": 1.3522625989952674e-05, + "loss": 0.2327, "step": 733000 }, { - "epoch": 7.47, - "learning_rate": 3.1928184772188764e-05, - "loss": 0.4441, + "epoch": 10.100300349949023, + "grad_norm": 26.642776489257812, + "learning_rate": 1.3515723694381098e-05, + "loss": 0.2999, "step": 733100 }, { - "epoch": 7.47, - "learning_rate": 3.1921564842453803e-05, - "loss": 0.4717, + "epoch": 10.101678102008762, + "grad_norm": 9.944792747497559, + "learning_rate": 1.3508822586845336e-05, + "loss": 0.2775, "step": 733200 }, { - "epoch": 7.47, - "learning_rate": 3.1914944818765786e-05, - "loss": 0.4437, + "epoch": 10.103055854068502, + "grad_norm": 34.712032318115234, + "learning_rate": 1.3501922667931665e-05, + "loss": 0.2859, "step": 733300 }, { - "epoch": 7.47, - "learning_rate": 3.1908324701448393e-05, - "loss": 0.4322, + "epoch": 10.104433606128241, + "grad_norm": 10.03830337524414, + "learning_rate": 1.3495023938226283e-05, + "loss": 0.3115, "step": 733400 }, { - "epoch": 7.47, - "learning_rate": 3.190170449082531e-05, - "loss": 0.4509, + "epoch": 10.10581135818798, + "grad_norm": 1.809373140335083, + "learning_rate": 1.348812639831526e-05, + "loss": 0.2655, "step": 733500 }, { - "epoch": 7.47, - "learning_rate": 3.189508418722022e-05, - "loss": 0.4946, + "epoch": 10.10718911024772, + "grad_norm": 1.4340425729751587, + "learning_rate": 1.348123004878458e-05, + "loss": 0.2798, "step": 733600 }, { - "epoch": 7.48, - "learning_rate": 3.1888463790956835e-05, - "loss": 0.3716, + "epoch": 10.108566862307459, + "grad_norm": 2.3071742057800293, + "learning_rate": 1.347433489022014e-05, + "loss": 0.289, "step": 733700 }, { - "epoch": 7.48, - "learning_rate": 3.188184330235884e-05, - "loss": 0.4428, + "epoch": 10.109944614367198, + "grad_norm": 4.753567218780518, + "learning_rate": 1.3467440923207708e-05, + "loss": 0.3061, "step": 733800 }, { - "epoch": 7.48, - "learning_rate": 3.187522272174993e-05, - "loss": 0.3937, + "epoch": 10.111322366426938, + "grad_norm": 2.845501661300659, + "learning_rate": 1.3460548148332947e-05, + "loss": 0.2425, "step": 733900 }, { - "epoch": 7.48, - "learning_rate": 3.186860204945382e-05, - "loss": 0.3972, + "epoch": 10.112700118486677, + "grad_norm": 1.8497999906539917, + "learning_rate": 1.3453656566181444e-05, + "loss": 0.2571, "step": 734000 }, { - "epoch": 7.48, - "learning_rate": 3.1861981285794224e-05, - "loss": 0.508, + "epoch": 10.114077870546417, + "grad_norm": 1.6104137897491455, + "learning_rate": 1.3446766177338689e-05, + "loss": 0.3211, "step": 734100 }, { - "epoch": 7.48, - "learning_rate": 3.1855360431094854e-05, - "loss": 0.4538, + "epoch": 10.115455622606156, + "grad_norm": 4.806046485900879, + "learning_rate": 1.3439876982390035e-05, + "loss": 0.2859, "step": 734200 }, { - "epoch": 7.48, - "learning_rate": 3.1848739485679434e-05, - "loss": 0.4172, + "epoch": 10.116833374665894, + "grad_norm": 3.6091084480285645, + "learning_rate": 1.343298898192077e-05, + "loss": 0.2629, "step": 734300 }, { - "epoch": 7.48, - "learning_rate": 3.184211844987168e-05, - "loss": 0.4152, + "epoch": 10.118211126725635, + "grad_norm": 3.761121988296509, + "learning_rate": 1.3426102176516056e-05, + "loss": 0.2801, "step": 734400 }, { - "epoch": 7.48, - "learning_rate": 3.183556353569887e-05, - "loss": 0.4482, + "epoch": 10.119588878785374, + "grad_norm": 8.344184875488281, + "learning_rate": 1.3419216566760955e-05, + "loss": 0.2581, "step": 734500 }, { - "epoch": 7.48, - "learning_rate": 3.1828942320973504e-05, - "loss": 0.4063, + "epoch": 10.120966630845112, + "grad_norm": 0.8365576863288879, + "learning_rate": 1.3412332153240445e-05, + "loss": 0.2947, "step": 734600 }, { - "epoch": 7.49, - "learning_rate": 3.182232101682377e-05, - "loss": 0.417, + "epoch": 10.122344382904853, + "grad_norm": 0.5754960775375366, + "learning_rate": 1.3405448936539398e-05, + "loss": 0.2758, "step": 734700 }, { - "epoch": 7.49, - "learning_rate": 3.181569962357341e-05, - "loss": 0.4747, + "epoch": 10.123722134964591, + "grad_norm": 2.604860782623291, + "learning_rate": 1.339856691724256e-05, + "loss": 0.2448, "step": 734800 }, { - "epoch": 7.49, - "learning_rate": 3.1809078141546185e-05, - "loss": 0.4781, + "epoch": 10.125099887024332, + "grad_norm": 3.7445244789123535, + "learning_rate": 1.3391686095934619e-05, + "loss": 0.2776, "step": 734900 }, { - "epoch": 7.49, - "learning_rate": 3.1802456571065824e-05, - "loss": 0.3714, + "epoch": 10.12647763908407, + "grad_norm": 1.6025595664978027, + "learning_rate": 1.338480647320011e-05, + "loss": 0.2502, "step": 735000 }, { - "epoch": 7.49, - "learning_rate": 3.1795834912456096e-05, - "loss": 0.4975, + "epoch": 10.12785539114381, + "grad_norm": 2.389474868774414, + "learning_rate": 1.3377928049623514e-05, + "loss": 0.2614, "step": 735100 }, { - "epoch": 7.49, - "learning_rate": 3.178921316604076e-05, - "loss": 0.4689, + "epoch": 10.12923314320355, + "grad_norm": 23.347993850708008, + "learning_rate": 1.3371050825789167e-05, + "loss": 0.3194, "step": 735200 }, { - "epoch": 7.49, - "learning_rate": 3.178259133214358e-05, - "loss": 0.3757, + "epoch": 10.130610895263288, + "grad_norm": 4.336905479431152, + "learning_rate": 1.3364174802281338e-05, + "loss": 0.3071, "step": 735300 }, { - "epoch": 7.49, - "learning_rate": 3.177596941108831e-05, - "loss": 0.4609, + "epoch": 10.131988647323027, + "grad_norm": 1.7863801717758179, + "learning_rate": 1.3357299979684198e-05, + "loss": 0.272, "step": 735400 }, { - "epoch": 7.49, - "learning_rate": 3.176934740319874e-05, - "loss": 0.4407, + "epoch": 10.133366399382767, + "grad_norm": 1.7998559474945068, + "learning_rate": 1.3350426358581755e-05, + "loss": 0.254, "step": 735500 }, { - "epoch": 7.49, - "learning_rate": 3.176272530879865e-05, - "loss": 0.4899, + "epoch": 10.134744151442506, + "grad_norm": 6.019591331481934, + "learning_rate": 1.3343622657796016e-05, + "loss": 0.3062, "step": 735600 }, { - "epoch": 7.5, - "learning_rate": 3.175610312821181e-05, - "loss": 0.4573, + "epoch": 10.136121903502247, + "grad_norm": 4.577052116394043, + "learning_rate": 1.3336751429405245e-05, + "loss": 0.2543, "step": 735700 }, { - "epoch": 7.5, - "learning_rate": 3.174948086176199e-05, - "loss": 0.4603, + "epoch": 10.137499655561985, + "grad_norm": 5.137942790985107, + "learning_rate": 1.3329881404254911e-05, + "loss": 0.3173, "step": 735800 }, { - "epoch": 7.5, - "learning_rate": 3.174285850977301e-05, - "loss": 0.454, + "epoch": 10.138877407621724, + "grad_norm": 2.155247926712036, + "learning_rate": 1.3323012582928636e-05, + "loss": 0.2948, "step": 735900 }, { - "epoch": 7.5, - "learning_rate": 3.173623607256865e-05, - "loss": 0.4728, + "epoch": 10.140255159681464, + "grad_norm": 3.973073959350586, + "learning_rate": 1.331614496600998e-05, + "loss": 0.273, "step": 736000 }, { - "epoch": 7.5, - "learning_rate": 3.1729613550472697e-05, - "loss": 0.4214, + "epoch": 10.141632911741203, + "grad_norm": 2.047118902206421, + "learning_rate": 1.3309278554082372e-05, + "loss": 0.3261, "step": 736100 }, { - "epoch": 7.5, - "learning_rate": 3.1722990943808976e-05, - "loss": 0.3795, + "epoch": 10.143010663800942, + "grad_norm": 0.5330919623374939, + "learning_rate": 1.330241334772914e-05, + "loss": 0.312, "step": 736200 }, { - "epoch": 7.5, - "learning_rate": 3.171636825290127e-05, - "loss": 0.4667, + "epoch": 10.144388415860682, + "grad_norm": 3.573786973953247, + "learning_rate": 1.3295549347533531e-05, + "loss": 0.2882, "step": 736300 }, { - "epoch": 7.5, - "learning_rate": 3.1709745478073414e-05, - "loss": 0.4821, + "epoch": 10.14576616792042, + "grad_norm": 4.5614094734191895, + "learning_rate": 1.3288686554078683e-05, + "loss": 0.3029, "step": 736400 }, { - "epoch": 7.5, - "learning_rate": 3.1703122619649194e-05, - "loss": 0.4754, + "epoch": 10.147143919980161, + "grad_norm": 3.086592197418213, + "learning_rate": 1.3281824967947616e-05, + "loss": 0.339, "step": 736500 }, { - "epoch": 7.5, - "learning_rate": 3.169649967795244e-05, - "loss": 0.4166, + "epoch": 10.1485216720399, + "grad_norm": 1.4972175359725952, + "learning_rate": 1.3275033187524442e-05, + "loss": 0.3316, "step": 736600 }, { - "epoch": 7.51, - "learning_rate": 3.168987665330698e-05, - "loss": 0.4351, + "epoch": 10.149899424099639, + "grad_norm": 6.534455299377441, + "learning_rate": 1.3268174005701828e-05, + "loss": 0.3234, "step": 736700 }, { - "epoch": 7.51, - "learning_rate": 3.168325354603663e-05, - "loss": 0.4827, + "epoch": 10.151277176159379, + "grad_norm": 1.3562685251235962, + "learning_rate": 1.3261316032945641e-05, + "loss": 0.2655, "step": 736800 }, { - "epoch": 7.51, - "learning_rate": 3.167663035646525e-05, - "loss": 0.4112, + "epoch": 10.152654928219118, + "grad_norm": 1.747830867767334, + "learning_rate": 1.3254459269838511e-05, + "loss": 0.2523, "step": 736900 }, { - "epoch": 7.51, - "learning_rate": 3.167000708491664e-05, - "loss": 0.4715, + "epoch": 10.154032680278856, + "grad_norm": 3.305440664291382, + "learning_rate": 1.3247603716962937e-05, + "loss": 0.2712, "step": 737000 }, { - "epoch": 7.51, - "learning_rate": 3.1663383731714635e-05, - "loss": 0.4351, + "epoch": 10.155410432338597, + "grad_norm": 0.5590237379074097, + "learning_rate": 1.3240749374901352e-05, + "loss": 0.2486, "step": 737100 }, { - "epoch": 7.51, - "learning_rate": 3.165676029718311e-05, - "loss": 0.4208, + "epoch": 10.156788184398335, + "grad_norm": 3.1297619342803955, + "learning_rate": 1.3233896244236058e-05, + "loss": 0.3334, "step": 737200 }, { - "epoch": 7.51, - "learning_rate": 3.165013678164588e-05, - "loss": 0.4629, + "epoch": 10.158165936458076, + "grad_norm": 2.03391170501709, + "learning_rate": 1.3227044325549249e-05, + "loss": 0.2675, "step": 737300 }, { - "epoch": 7.51, - "learning_rate": 3.164351318542681e-05, - "loss": 0.4762, + "epoch": 10.159543688517815, + "grad_norm": 10.185530662536621, + "learning_rate": 1.322019361942305e-05, + "loss": 0.2704, "step": 737400 }, { - "epoch": 7.51, - "learning_rate": 3.163688950884976e-05, - "loss": 0.4246, + "epoch": 10.160921440577553, + "grad_norm": 1.7408205270767212, + "learning_rate": 1.3213344126439466e-05, + "loss": 0.2709, "step": 737500 }, { - "epoch": 7.51, - "learning_rate": 3.1630265752238575e-05, - "loss": 0.4077, + "epoch": 10.162299192637294, + "grad_norm": 13.838605880737305, + "learning_rate": 1.3206495847180385e-05, + "loss": 0.2715, "step": 737600 }, { - "epoch": 7.52, - "learning_rate": 3.1623641915917135e-05, - "loss": 0.4031, + "epoch": 10.163676944697032, + "grad_norm": 0.7557150721549988, + "learning_rate": 1.3199648782227601e-05, + "loss": 0.2838, "step": 737700 }, { - "epoch": 7.52, - "learning_rate": 3.161701800020929e-05, - "loss": 0.4337, + "epoch": 10.165054696756771, + "grad_norm": 0.598028302192688, + "learning_rate": 1.3192802932162812e-05, + "loss": 0.2817, "step": 737800 }, { - "epoch": 7.52, - "learning_rate": 3.161039400543891e-05, - "loss": 0.4196, + "epoch": 10.166432448816511, + "grad_norm": 1.9788265228271484, + "learning_rate": 1.3185958297567617e-05, + "loss": 0.2724, "step": 737900 }, { - "epoch": 7.52, - "learning_rate": 3.1603769931929866e-05, - "loss": 0.4746, + "epoch": 10.16781020087625, + "grad_norm": 1.9298508167266846, + "learning_rate": 1.3179114879023487e-05, + "loss": 0.2627, "step": 738000 }, { - "epoch": 7.52, - "learning_rate": 3.159714578000606e-05, - "loss": 0.4703, + "epoch": 10.169187952935989, + "grad_norm": 15.164461135864258, + "learning_rate": 1.3172272677111813e-05, + "loss": 0.3314, "step": 738100 }, { - "epoch": 7.52, - "learning_rate": 3.1590521549991347e-05, - "loss": 0.3703, + "epoch": 10.17056570499573, + "grad_norm": 11.63528823852539, + "learning_rate": 1.3165431692413895e-05, + "loss": 0.2752, "step": 738200 }, { - "epoch": 7.52, - "learning_rate": 3.1583897242209626e-05, - "loss": 0.4371, + "epoch": 10.171943457055468, + "grad_norm": 3.259536027908325, + "learning_rate": 1.315859192551087e-05, + "loss": 0.2667, "step": 738300 }, { - "epoch": 7.52, - "learning_rate": 3.157727285698478e-05, - "loss": 0.4376, + "epoch": 10.173321209115208, + "grad_norm": 2.716491460800171, + "learning_rate": 1.3151753376983829e-05, + "loss": 0.2895, "step": 738400 }, { - "epoch": 7.52, - "learning_rate": 3.1570648394640704e-05, - "loss": 0.5279, + "epoch": 10.174698961174947, + "grad_norm": 1.2367470264434814, + "learning_rate": 1.3144916047413756e-05, + "loss": 0.2931, "step": 738500 }, { - "epoch": 7.52, - "learning_rate": 3.1564090101271765e-05, - "loss": 0.4325, + "epoch": 10.176076713234686, + "grad_norm": 3.94279408454895, + "learning_rate": 1.3138079937381494e-05, + "loss": 0.3295, "step": 738600 }, { - "epoch": 7.53, - "learning_rate": 3.1557465486424027e-05, - "loss": 0.4294, + "epoch": 10.177454465294426, + "grad_norm": 1.1917893886566162, + "learning_rate": 1.3131245047467828e-05, + "loss": 0.3154, "step": 738700 }, { - "epoch": 7.53, - "learning_rate": 3.155084079542553e-05, - "loss": 0.4708, + "epoch": 10.178832217354165, + "grad_norm": 3.9291369915008545, + "learning_rate": 1.3124411378253394e-05, + "loss": 0.2687, "step": 738800 }, { - "epoch": 7.53, - "learning_rate": 3.154421602860018e-05, - "loss": 0.4308, + "epoch": 10.180209969413903, + "grad_norm": 11.463781356811523, + "learning_rate": 1.311757893031877e-05, + "loss": 0.3287, "step": 738900 }, { - "epoch": 7.53, - "learning_rate": 3.1537591186271866e-05, - "loss": 0.411, + "epoch": 10.181587721473644, + "grad_norm": 3.051361560821533, + "learning_rate": 1.3110747704244383e-05, + "loss": 0.361, "step": 739000 }, { - "epoch": 7.53, - "learning_rate": 3.153096626876453e-05, - "loss": 0.3781, + "epoch": 10.182965473533383, + "grad_norm": 9.015741348266602, + "learning_rate": 1.3103917700610605e-05, + "loss": 0.3095, "step": 739100 }, { - "epoch": 7.53, - "learning_rate": 3.152434127640207e-05, - "loss": 0.3861, + "epoch": 10.184343225593123, + "grad_norm": 2.560722827911377, + "learning_rate": 1.3097088919997659e-05, + "loss": 0.3095, "step": 739200 }, { - "epoch": 7.53, - "learning_rate": 3.151771620950842e-05, - "loss": 0.333, + "epoch": 10.185720977652862, + "grad_norm": 2.2690556049346924, + "learning_rate": 1.3090261362985702e-05, + "loss": 0.2518, "step": 739300 }, { - "epoch": 7.53, - "learning_rate": 3.151109106840752e-05, - "loss": 0.4282, + "epoch": 10.1870987297126, + "grad_norm": 4.059639930725098, + "learning_rate": 1.3083435030154752e-05, + "loss": 0.295, "step": 739400 }, { - "epoch": 7.53, - "learning_rate": 3.150446585342326e-05, - "loss": 0.4822, + "epoch": 10.18847648177234, + "grad_norm": 2.6916866302490234, + "learning_rate": 1.3076609922084765e-05, + "loss": 0.3174, "step": 739500 }, { - "epoch": 7.54, - "learning_rate": 3.149784056487962e-05, - "loss": 0.4217, + "epoch": 10.18985423383208, + "grad_norm": 1.2301732301712036, + "learning_rate": 1.3069786039355541e-05, + "loss": 0.3035, "step": 739600 }, { - "epoch": 7.54, - "learning_rate": 3.149121520310051e-05, - "loss": 0.3608, + "epoch": 10.191231985891818, + "grad_norm": 3.22831392288208, + "learning_rate": 1.3062963382546819e-05, + "loss": 0.3063, "step": 739700 }, { - "epoch": 7.54, - "learning_rate": 3.148458976840987e-05, - "loss": 0.4601, + "epoch": 10.192609737951559, + "grad_norm": 4.637147903442383, + "learning_rate": 1.3056141952238225e-05, + "loss": 0.3091, "step": 739800 }, { - "epoch": 7.54, - "learning_rate": 3.1477964261131646e-05, - "loss": 0.4014, + "epoch": 10.193987490011297, + "grad_norm": 5.76689338684082, + "learning_rate": 1.3049321749009266e-05, + "loss": 0.2603, "step": 739900 }, { - "epoch": 7.54, - "learning_rate": 3.147140493774185e-05, - "loss": 0.4002, + "epoch": 10.195365242071038, + "grad_norm": 1.3951846361160278, + "learning_rate": 1.3042502773439347e-05, + "loss": 0.323, "step": 740000 }, { - "epoch": 7.54, - "learning_rate": 3.146477928697812e-05, - "loss": 0.3793, + "epoch": 10.196742994130776, + "grad_norm": 1.2893242835998535, + "learning_rate": 1.303568502610779e-05, + "loss": 0.302, "step": 740100 }, { - "epoch": 7.54, - "learning_rate": 3.1458153564595416e-05, - "loss": 0.4912, + "epoch": 10.198120746190515, + "grad_norm": 3.65417742729187, + "learning_rate": 1.3028868507593776e-05, + "loss": 0.2824, "step": 740200 }, { - "epoch": 7.54, - "learning_rate": 3.14515277709177e-05, - "loss": 0.391, + "epoch": 10.199498498250255, + "grad_norm": 4.525059223175049, + "learning_rate": 1.302205321847642e-05, + "loss": 0.3006, "step": 740300 }, { - "epoch": 7.54, - "learning_rate": 3.144490190626895e-05, - "loss": 0.4297, + "epoch": 10.200876250309994, + "grad_norm": 2.6735799312591553, + "learning_rate": 1.301523915933472e-05, + "loss": 0.281, "step": 740400 }, { - "epoch": 7.54, - "learning_rate": 3.1438275970973104e-05, - "loss": 0.477, + "epoch": 10.202254002369733, + "grad_norm": 1.3909764289855957, + "learning_rate": 1.3008426330747557e-05, + "loss": 0.2684, "step": 740500 }, { - "epoch": 7.55, - "learning_rate": 3.143164996535415e-05, - "loss": 0.3904, + "epoch": 10.203631754429473, + "grad_norm": 0.34083470702171326, + "learning_rate": 1.3001614733293706e-05, + "loss": 0.3036, "step": 740600 }, { - "epoch": 7.55, - "learning_rate": 3.142502388973607e-05, - "loss": 0.414, + "epoch": 10.205009506489212, + "grad_norm": 1.9238654375076294, + "learning_rate": 1.2994804367551855e-05, + "loss": 0.3122, "step": 740700 }, { - "epoch": 7.55, - "learning_rate": 3.1418397744442806e-05, - "loss": 0.4402, + "epoch": 10.206387258548952, + "grad_norm": 2.0558156967163086, + "learning_rate": 1.2987995234100596e-05, + "loss": 0.3425, "step": 740800 }, { - "epoch": 7.55, - "learning_rate": 3.141177152979838e-05, - "loss": 0.4457, + "epoch": 10.207765010608691, + "grad_norm": 2.3343918323516846, + "learning_rate": 1.2981187333518369e-05, + "loss": 0.2769, "step": 740900 }, { - "epoch": 7.55, - "learning_rate": 3.140514524612674e-05, - "loss": 0.4133, + "epoch": 10.20914276266843, + "grad_norm": 1.6474955081939697, + "learning_rate": 1.297438066638357e-05, + "loss": 0.3105, "step": 741000 }, { - "epoch": 7.55, - "learning_rate": 3.139851889375187e-05, - "loss": 0.396, + "epoch": 10.21052051472817, + "grad_norm": 2.036238670349121, + "learning_rate": 1.2967575233274445e-05, + "loss": 0.2899, "step": 741100 }, { - "epoch": 7.55, - "learning_rate": 3.1391892472997785e-05, - "loss": 0.4562, + "epoch": 10.211898266787909, + "grad_norm": 1.719221591949463, + "learning_rate": 1.296077103476914e-05, + "loss": 0.3463, "step": 741200 }, { - "epoch": 7.55, - "learning_rate": 3.138526598418846e-05, - "loss": 0.4794, + "epoch": 10.213276018847647, + "grad_norm": 2.2464818954467773, + "learning_rate": 1.2953968071445719e-05, + "loss": 0.2829, "step": 741300 }, { - "epoch": 7.55, - "learning_rate": 3.13786394276479e-05, - "loss": 0.4545, + "epoch": 10.214653770907388, + "grad_norm": 4.058666229248047, + "learning_rate": 1.2947166343882135e-05, + "loss": 0.2862, "step": 741400 }, { - "epoch": 7.55, - "learning_rate": 3.13720128037001e-05, - "loss": 0.4396, + "epoch": 10.216031522967127, + "grad_norm": 0.3164874017238617, + "learning_rate": 1.2940365852656223e-05, + "loss": 0.2515, "step": 741500 }, { - "epoch": 7.56, - "learning_rate": 3.1365386112669054e-05, - "loss": 0.3842, + "epoch": 10.217409275026867, + "grad_norm": 10.059370040893555, + "learning_rate": 1.293356659834571e-05, + "loss": 0.3382, "step": 741600 }, { - "epoch": 7.56, - "learning_rate": 3.135875935487879e-05, - "loss": 0.5155, + "epoch": 10.218787027086606, + "grad_norm": 1.440412163734436, + "learning_rate": 1.2926768581528233e-05, + "loss": 0.2707, "step": 741700 }, { - "epoch": 7.56, - "learning_rate": 3.135213253065328e-05, - "loss": 0.3741, + "epoch": 10.220164779146344, + "grad_norm": 5.088151454925537, + "learning_rate": 1.2919971802781333e-05, + "loss": 0.3139, "step": 741800 }, { - "epoch": 7.56, - "learning_rate": 3.134550564031657e-05, - "loss": 0.3812, + "epoch": 10.221542531206085, + "grad_norm": 2.0129246711730957, + "learning_rate": 1.2913176262682408e-05, + "loss": 0.2599, "step": 741900 }, { - "epoch": 7.56, - "learning_rate": 3.133887868419267e-05, - "loss": 0.3706, + "epoch": 10.222920283265823, + "grad_norm": 1.8397879600524902, + "learning_rate": 1.2906449898681477e-05, + "loss": 0.2954, "step": 742000 }, { - "epoch": 7.56, - "learning_rate": 3.133225166260559e-05, - "loss": 0.4918, + "epoch": 10.224298035325562, + "grad_norm": 3.9937195777893066, + "learning_rate": 1.2899656825209485e-05, + "loss": 0.2788, "step": 742100 }, { - "epoch": 7.56, - "learning_rate": 3.1325624575879353e-05, - "loss": 0.4312, + "epoch": 10.225675787385303, + "grad_norm": 3.7888314723968506, + "learning_rate": 1.2892864992111353e-05, + "loss": 0.2661, "step": 742200 }, { - "epoch": 7.56, - "learning_rate": 3.131899742433798e-05, - "loss": 0.3926, + "epoch": 10.227053539445041, + "grad_norm": 2.9476096630096436, + "learning_rate": 1.2886074399964057e-05, + "loss": 0.2453, "step": 742300 }, { - "epoch": 7.56, - "learning_rate": 3.131237020830551e-05, - "loss": 0.4316, + "epoch": 10.22843129150478, + "grad_norm": 4.9511799812316895, + "learning_rate": 1.2879285049344505e-05, + "loss": 0.3062, "step": 742400 }, { - "epoch": 7.56, - "learning_rate": 3.1305742928105956e-05, - "loss": 0.4191, + "epoch": 10.22980904356452, + "grad_norm": 1.0618526935577393, + "learning_rate": 1.28724969408295e-05, + "loss": 0.2955, "step": 742500 }, { - "epoch": 7.57, - "learning_rate": 3.1299115584063384e-05, - "loss": 0.4262, + "epoch": 10.231186795624259, + "grad_norm": 4.4451003074646, + "learning_rate": 1.2865710074995713e-05, + "loss": 0.2665, "step": 742600 }, { - "epoch": 7.57, - "learning_rate": 3.1292488176501804e-05, - "loss": 0.4204, + "epoch": 10.232564547684, + "grad_norm": 0.5552589893341064, + "learning_rate": 1.2858924452419712e-05, + "loss": 0.2429, "step": 742700 }, { - "epoch": 7.57, - "learning_rate": 3.128586070574526e-05, - "loss": 0.4876, + "epoch": 10.233942299743738, + "grad_norm": 2.503857135772705, + "learning_rate": 1.2852140073677981e-05, + "loss": 0.2642, "step": 742800 }, { - "epoch": 7.57, - "learning_rate": 3.1279233172117814e-05, - "loss": 0.481, + "epoch": 10.235320051803477, + "grad_norm": 0.1566755622625351, + "learning_rate": 1.2845356939346903e-05, + "loss": 0.2925, "step": 742900 }, { - "epoch": 7.57, - "learning_rate": 3.1272605575943484e-05, - "loss": 0.4077, + "epoch": 10.236697803863217, + "grad_norm": 4.694526195526123, + "learning_rate": 1.2838575050002712e-05, + "loss": 0.2714, "step": 743000 }, { - "epoch": 7.57, - "learning_rate": 3.1265977917546345e-05, - "loss": 0.478, + "epoch": 10.238075555922956, + "grad_norm": 37.567081451416016, + "learning_rate": 1.2831794406221579e-05, + "loss": 0.3262, "step": 743100 }, { - "epoch": 7.57, - "learning_rate": 3.125935019725044e-05, - "loss": 0.3906, + "epoch": 10.239453307982695, + "grad_norm": 1.7142767906188965, + "learning_rate": 1.2825015008579572e-05, + "loss": 0.2406, "step": 743200 }, { - "epoch": 7.57, - "learning_rate": 3.125272241537983e-05, - "loss": 0.4347, + "epoch": 10.240831060042435, + "grad_norm": 2.3837833404541016, + "learning_rate": 1.2818236857652598e-05, + "loss": 0.3072, "step": 743300 }, { - "epoch": 7.57, - "learning_rate": 3.1246094572258565e-05, - "loss": 0.4911, + "epoch": 10.242208812102174, + "grad_norm": 0.5881015658378601, + "learning_rate": 1.2811459954016516e-05, + "loss": 0.2713, "step": 743400 }, { - "epoch": 7.57, - "learning_rate": 3.1239466668210704e-05, - "loss": 0.4506, + "epoch": 10.243586564161914, + "grad_norm": 1.1353763341903687, + "learning_rate": 1.2804752048625922e-05, + "loss": 0.2909, "step": 743500 }, { - "epoch": 7.58, - "learning_rate": 3.123283870356033e-05, - "loss": 0.4666, + "epoch": 10.244964316221653, + "grad_norm": 4.626894950866699, + "learning_rate": 1.2797977628811448e-05, + "loss": 0.3014, "step": 743600 }, { - "epoch": 7.58, - "learning_rate": 3.12262106786315e-05, - "loss": 0.5144, + "epoch": 10.246342068281391, + "grad_norm": 6.538792610168457, + "learning_rate": 1.2791204458008993e-05, + "loss": 0.2946, "step": 743700 }, { - "epoch": 7.58, - "learning_rate": 3.121958259374828e-05, - "loss": 0.4945, + "epoch": 10.247719820341132, + "grad_norm": 1.1734533309936523, + "learning_rate": 1.2784432536793956e-05, + "loss": 0.3298, "step": 743800 }, { - "epoch": 7.58, - "learning_rate": 3.121295444923476e-05, - "loss": 0.4113, + "epoch": 10.24909757240087, + "grad_norm": 4.123232364654541, + "learning_rate": 1.277766186574167e-05, + "loss": 0.3029, "step": 743900 }, { - "epoch": 7.58, - "learning_rate": 3.1206326245415e-05, - "loss": 0.4156, + "epoch": 10.25047532446061, + "grad_norm": 2.223881244659424, + "learning_rate": 1.2770892445427317e-05, + "loss": 0.2839, "step": 744000 }, { - "epoch": 7.58, - "learning_rate": 3.1199697982613083e-05, - "loss": 0.4257, + "epoch": 10.25185307652035, + "grad_norm": 1.725261926651001, + "learning_rate": 1.2764124276426003e-05, + "loss": 0.3164, "step": 744100 }, { - "epoch": 7.58, - "learning_rate": 3.11930696611531e-05, - "loss": 0.472, + "epoch": 10.253230828580088, + "grad_norm": 0.7118489742279053, + "learning_rate": 1.2757357359312733e-05, + "loss": 0.3455, "step": 744200 }, { - "epoch": 7.58, - "learning_rate": 3.1186441281359126e-05, - "loss": 0.4421, + "epoch": 10.254608580639829, + "grad_norm": 3.9301257133483887, + "learning_rate": 1.2750591694662379e-05, + "loss": 0.2928, "step": 744300 }, { - "epoch": 7.58, - "learning_rate": 3.117981284355525e-05, - "loss": 0.4192, + "epoch": 10.255986332699567, + "grad_norm": 2.872012138366699, + "learning_rate": 1.2743827283049704e-05, + "loss": 0.2894, "step": 744400 }, { - "epoch": 7.59, - "learning_rate": 3.117325063330495e-05, - "loss": 0.4608, + "epoch": 10.257364084759306, + "grad_norm": 4.396772384643555, + "learning_rate": 1.2737064125049395e-05, + "loss": 0.2665, "step": 744500 }, { - "epoch": 7.59, - "learning_rate": 3.116662208102557e-05, - "loss": 0.4638, + "epoch": 10.258741836819047, + "grad_norm": 4.133399963378906, + "learning_rate": 1.2730302221236027e-05, + "loss": 0.2834, "step": 744600 }, { - "epoch": 7.59, - "learning_rate": 3.115999347170533e-05, - "loss": 0.3901, + "epoch": 10.260119588878785, + "grad_norm": 1.3539600372314453, + "learning_rate": 1.2723541572184036e-05, + "loss": 0.2854, "step": 744700 }, { - "epoch": 7.59, - "learning_rate": 3.1153364805668324e-05, - "loss": 0.3941, + "epoch": 10.261497340938524, + "grad_norm": 2.0183892250061035, + "learning_rate": 1.2716782178467793e-05, + "loss": 0.2574, "step": 744800 }, { - "epoch": 7.59, - "learning_rate": 3.114673608323868e-05, - "loss": 0.4426, + "epoch": 10.262875092998264, + "grad_norm": 0.10228355973958969, + "learning_rate": 1.2710024040661535e-05, + "loss": 0.2971, "step": 744900 }, { - "epoch": 7.59, - "learning_rate": 3.1140107304740475e-05, - "loss": 0.4065, + "epoch": 10.264252845058003, + "grad_norm": 0.18874327838420868, + "learning_rate": 1.2703267159339388e-05, + "loss": 0.2545, "step": 745000 }, { - "epoch": 7.59, - "learning_rate": 3.113347847049782e-05, - "loss": 0.4328, + "epoch": 10.265630597117744, + "grad_norm": 0.07201740890741348, + "learning_rate": 1.2696511535075398e-05, + "loss": 0.2964, "step": 745100 }, { - "epoch": 7.59, - "learning_rate": 3.112684958083485e-05, - "loss": 0.4396, + "epoch": 10.267008349177482, + "grad_norm": 2.653935432434082, + "learning_rate": 1.2689757168443495e-05, + "loss": 0.3243, "step": 745200 }, { - "epoch": 7.59, - "learning_rate": 3.1120286925794905e-05, - "loss": 0.5135, + "epoch": 10.26838610123722, + "grad_norm": 6.803628444671631, + "learning_rate": 1.2683004060017483e-05, + "loss": 0.2882, "step": 745300 }, { - "epoch": 7.59, - "learning_rate": 3.111365792680972e-05, - "loss": 0.443, + "epoch": 10.269763853296961, + "grad_norm": 3.905977964401245, + "learning_rate": 1.2676252210371084e-05, + "loss": 0.3046, "step": 745400 }, { - "epoch": 7.6, - "learning_rate": 3.11070288733733e-05, - "loss": 0.4254, + "epoch": 10.2711416053567, + "grad_norm": 3.8674843311309814, + "learning_rate": 1.2669501620077893e-05, + "loss": 0.2301, "step": 745500 }, { - "epoch": 7.6, - "learning_rate": 3.110039976580979e-05, - "loss": 0.4511, + "epoch": 10.272519357416439, + "grad_norm": 3.3966798782348633, + "learning_rate": 1.2662752289711418e-05, + "loss": 0.3044, "step": 745600 }, { - "epoch": 7.6, - "learning_rate": 3.109377060444328e-05, - "loss": 0.4012, + "epoch": 10.273897109476179, + "grad_norm": 2.188098192214966, + "learning_rate": 1.2656004219845033e-05, + "loss": 0.2905, "step": 745700 }, { - "epoch": 7.6, - "learning_rate": 3.108714138959793e-05, - "loss": 0.4863, + "epoch": 10.275274861535918, + "grad_norm": 7.0580644607543945, + "learning_rate": 1.2649257411052035e-05, + "loss": 0.2874, "step": 745800 }, { - "epoch": 7.6, - "learning_rate": 3.108051212159784e-05, - "loss": 0.4282, + "epoch": 10.276652613595658, + "grad_norm": 7.363040447235107, + "learning_rate": 1.2642511863905613e-05, + "loss": 0.2318, "step": 745900 }, { - "epoch": 7.6, - "learning_rate": 3.107388280076717e-05, - "loss": 0.5036, + "epoch": 10.278030365655397, + "grad_norm": 1.6303383111953735, + "learning_rate": 1.2635767578978797e-05, + "loss": 0.3022, "step": 746000 }, { - "epoch": 7.6, - "learning_rate": 3.1067253427430035e-05, - "loss": 0.46, + "epoch": 10.279408117715136, + "grad_norm": 4.955488681793213, + "learning_rate": 1.262902455684457e-05, + "loss": 0.2944, "step": 746100 }, { - "epoch": 7.6, - "learning_rate": 3.106062400191057e-05, - "loss": 0.4919, + "epoch": 10.280785869774876, + "grad_norm": 2.6587889194488525, + "learning_rate": 1.2622282798075798e-05, + "loss": 0.2932, "step": 746200 }, { - "epoch": 7.6, - "learning_rate": 3.1053994524532926e-05, - "loss": 0.3813, + "epoch": 10.282163621834615, + "grad_norm": 2.686431884765625, + "learning_rate": 1.2615542303245206e-05, + "loss": 0.2757, "step": 746300 }, { - "epoch": 7.6, - "learning_rate": 3.1047364995621234e-05, - "loss": 0.4038, + "epoch": 10.283541373894353, + "grad_norm": 1.9033284187316895, + "learning_rate": 1.2608803072925442e-05, + "loss": 0.2779, "step": 746400 }, { - "epoch": 7.61, - "learning_rate": 3.104073541549965e-05, - "loss": 0.5234, + "epoch": 10.284919125954094, + "grad_norm": 5.9717912673950195, + "learning_rate": 1.260206510768905e-05, + "loss": 0.2772, "step": 746500 }, { - "epoch": 7.61, - "learning_rate": 3.103410578449232e-05, - "loss": 0.4496, + "epoch": 10.286296878013832, + "grad_norm": 0.2728531062602997, + "learning_rate": 1.2595328408108447e-05, + "loss": 0.2913, "step": 746600 }, { - "epoch": 7.61, - "learning_rate": 3.102747610292338e-05, - "loss": 0.4163, + "epoch": 10.287674630073571, + "grad_norm": 2.6363723278045654, + "learning_rate": 1.2588592974755937e-05, + "loss": 0.2491, "step": 746700 }, { - "epoch": 7.61, - "learning_rate": 3.1020846371116995e-05, - "loss": 0.367, + "epoch": 10.289052382133312, + "grad_norm": 31.184005737304688, + "learning_rate": 1.2581858808203738e-05, + "loss": 0.289, "step": 746800 }, { - "epoch": 7.61, - "learning_rate": 3.101421658939731e-05, - "loss": 0.4736, + "epoch": 10.29043013419305, + "grad_norm": 1.7844102382659912, + "learning_rate": 1.2575125909023966e-05, + "loss": 0.2829, "step": 746900 }, { - "epoch": 7.61, - "learning_rate": 3.100758675808848e-05, - "loss": 0.377, + "epoch": 10.29180788625279, + "grad_norm": 1.8636345863342285, + "learning_rate": 1.2568394277788602e-05, + "loss": 0.2821, "step": 747000 }, { - "epoch": 7.61, - "learning_rate": 3.1000956877514685e-05, - "loss": 0.5069, + "epoch": 10.29318563831253, + "grad_norm": 2.7658934593200684, + "learning_rate": 1.2561663915069526e-05, + "loss": 0.3015, "step": 747100 }, { - "epoch": 7.61, - "learning_rate": 3.099432694800007e-05, - "loss": 0.4319, + "epoch": 10.294563390372268, + "grad_norm": 10.444513320922852, + "learning_rate": 1.2554934821438533e-05, + "loss": 0.3579, "step": 747200 }, { - "epoch": 7.61, - "learning_rate": 3.098769696986879e-05, - "loss": 0.3508, + "epoch": 10.295941142432008, + "grad_norm": 1.9455314874649048, + "learning_rate": 1.2548206997467274e-05, + "loss": 0.2568, "step": 747300 }, { - "epoch": 7.61, - "learning_rate": 3.098106694344503e-05, - "loss": 0.4593, + "epoch": 10.297318894491747, + "grad_norm": 1.7967429161071777, + "learning_rate": 1.2541480443727322e-05, + "loss": 0.2987, "step": 747400 }, { - "epoch": 7.62, - "learning_rate": 3.0974436869052956e-05, - "loss": 0.3391, + "epoch": 10.298696646551486, + "grad_norm": 2.6173243522644043, + "learning_rate": 1.2534822407327163e-05, + "loss": 0.2736, "step": 747500 }, { - "epoch": 7.62, - "learning_rate": 3.096780674701672e-05, - "loss": 0.4315, + "epoch": 10.300074398611226, + "grad_norm": 1.535183072090149, + "learning_rate": 1.2528098383047525e-05, + "loss": 0.2561, "step": 747600 }, { - "epoch": 7.62, - "learning_rate": 3.0961176577660524e-05, - "loss": 0.4582, + "epoch": 10.301452150670965, + "grad_norm": 7.821176052093506, + "learning_rate": 1.252137563070753e-05, + "loss": 0.2792, "step": 747700 }, { - "epoch": 7.62, - "learning_rate": 3.0954546361308526e-05, - "loss": 0.3671, + "epoch": 10.302829902730705, + "grad_norm": 0.6094760894775391, + "learning_rate": 1.2514654150878296e-05, + "loss": 0.2472, "step": 747800 }, { - "epoch": 7.62, - "learning_rate": 3.09479160982849e-05, - "loss": 0.4694, + "epoch": 10.304207654790444, + "grad_norm": 2.171246290206909, + "learning_rate": 1.2507933944130863e-05, + "loss": 0.2789, "step": 747900 }, { - "epoch": 7.62, - "learning_rate": 3.0941285788913836e-05, - "loss": 0.4077, + "epoch": 10.305585406850183, + "grad_norm": 9.747644424438477, + "learning_rate": 1.2501215011036132e-05, + "loss": 0.3167, "step": 748000 }, { - "epoch": 7.62, - "learning_rate": 3.0934655433519523e-05, - "loss": 0.4447, + "epoch": 10.306963158909923, + "grad_norm": 0.5684568285942078, + "learning_rate": 1.249449735216493e-05, + "loss": 0.2438, "step": 748100 }, { - "epoch": 7.62, - "learning_rate": 3.092802503242613e-05, - "loss": 0.4076, + "epoch": 10.308340910969662, + "grad_norm": 2.6274960041046143, + "learning_rate": 1.2487780968087934e-05, + "loss": 0.2968, "step": 748200 }, { - "epoch": 7.62, - "learning_rate": 3.0921394585957845e-05, - "loss": 0.3692, + "epoch": 10.3097186630294, + "grad_norm": 1.7250436544418335, + "learning_rate": 1.2481065859375756e-05, + "loss": 0.2701, "step": 748300 }, { - "epoch": 7.62, - "learning_rate": 3.0914764094438874e-05, - "loss": 0.4786, + "epoch": 10.31109641508914, + "grad_norm": 0.046784158796072006, + "learning_rate": 1.2474352026598861e-05, + "loss": 0.3154, "step": 748400 }, { - "epoch": 7.63, - "learning_rate": 3.09081335581934e-05, - "loss": 0.3866, + "epoch": 10.31247416714888, + "grad_norm": 2.8722221851348877, + "learning_rate": 1.2467639470327632e-05, + "loss": 0.2613, "step": 748500 }, { - "epoch": 7.63, - "learning_rate": 3.090150297754561e-05, - "loss": 0.4992, + "epoch": 10.31385191920862, + "grad_norm": 0.864767849445343, + "learning_rate": 1.2460928191132345e-05, + "loss": 0.2587, "step": 748600 }, { - "epoch": 7.63, - "learning_rate": 3.08948723528197e-05, - "loss": 0.3901, + "epoch": 10.315229671268359, + "grad_norm": 2.326507091522217, + "learning_rate": 1.2454218189583146e-05, + "loss": 0.2634, "step": 748700 }, { - "epoch": 7.63, - "learning_rate": 3.088824168433988e-05, - "loss": 0.3847, + "epoch": 10.316607423328097, + "grad_norm": 4.557476997375488, + "learning_rate": 1.2447509466250079e-05, + "loss": 0.2237, "step": 748800 }, { - "epoch": 7.63, - "learning_rate": 3.0881610972430334e-05, - "loss": 0.4889, + "epoch": 10.317985175387838, + "grad_norm": 2.7181835174560547, + "learning_rate": 1.2440869089816697e-05, + "loss": 0.3133, "step": 748900 }, { - "epoch": 7.63, - "learning_rate": 3.0874980217415275e-05, - "loss": 0.3674, + "epoch": 10.319362927447576, + "grad_norm": 1.1447982788085938, + "learning_rate": 1.2434162911829248e-05, + "loss": 0.2767, "step": 749000 }, { - "epoch": 7.63, - "learning_rate": 3.086834941961891e-05, - "loss": 0.3912, + "epoch": 10.320740679507315, + "grad_norm": 4.563024044036865, + "learning_rate": 1.2427458013761725e-05, + "loss": 0.2612, "step": 749100 }, { - "epoch": 7.63, - "learning_rate": 3.086171857936543e-05, - "loss": 0.4054, + "epoch": 10.322118431567056, + "grad_norm": 3.108729600906372, + "learning_rate": 1.2420754396183766e-05, + "loss": 0.2696, "step": 749200 }, { - "epoch": 7.63, - "learning_rate": 3.085508769697907e-05, - "loss": 0.3967, + "epoch": 10.323496183626794, + "grad_norm": 6.1577582359313965, + "learning_rate": 1.2414052059664856e-05, + "loss": 0.2363, "step": 749300 }, { - "epoch": 7.64, - "learning_rate": 3.084845677278401e-05, - "loss": 0.3654, + "epoch": 10.324873935686535, + "grad_norm": 52.30073928833008, + "learning_rate": 1.2407351004774411e-05, + "loss": 0.2866, "step": 749400 }, { - "epoch": 7.64, - "learning_rate": 3.084182580710448e-05, - "loss": 0.4439, + "epoch": 10.326251687746273, + "grad_norm": 4.92311429977417, + "learning_rate": 1.2400651232081702e-05, + "loss": 0.2829, "step": 749500 }, { - "epoch": 7.64, - "learning_rate": 3.08351948002647e-05, - "loss": 0.4097, + "epoch": 10.327629439806012, + "grad_norm": 0.09638363122940063, + "learning_rate": 1.2393952742155919e-05, + "loss": 0.2792, "step": 749600 }, { - "epoch": 7.64, - "learning_rate": 3.0828563752588865e-05, - "loss": 0.3982, + "epoch": 10.329007191865752, + "grad_norm": 0.4088597297668457, + "learning_rate": 1.2387255535566143e-05, + "loss": 0.2588, "step": 749700 }, { - "epoch": 7.64, - "learning_rate": 3.082193266440122e-05, - "loss": 0.4307, + "epoch": 10.330384943925491, + "grad_norm": 1.1428639888763428, + "learning_rate": 1.2380559612881325e-05, + "loss": 0.2564, "step": 749800 }, { - "epoch": 7.64, - "learning_rate": 3.081530153602596e-05, - "loss": 0.4433, + "epoch": 10.33176269598523, + "grad_norm": 2.0343477725982666, + "learning_rate": 1.237386497467031e-05, + "loss": 0.2187, "step": 749900 }, { - "epoch": 7.64, - "learning_rate": 3.080867036778733e-05, - "loss": 0.4471, + "epoch": 10.33314044804497, + "grad_norm": 1.163087248802185, + "learning_rate": 1.2367171621501858e-05, + "loss": 0.2757, "step": 750000 }, { - "epoch": 7.64, - "learning_rate": 3.080203916000954e-05, - "loss": 0.4519, + "epoch": 10.334518200104709, + "grad_norm": 0.33483877778053284, + "learning_rate": 1.2360479553944586e-05, + "loss": 0.2293, "step": 750100 }, { - "epoch": 7.64, - "learning_rate": 3.0795407913016826e-05, - "loss": 0.5162, + "epoch": 10.33589595216445, + "grad_norm": 49.04188537597656, + "learning_rate": 1.235378877256703e-05, + "loss": 0.2841, "step": 750200 }, { - "epoch": 7.64, - "learning_rate": 3.078877662713341e-05, - "loss": 0.4297, + "epoch": 10.337273704224188, + "grad_norm": 2.908079147338867, + "learning_rate": 1.2347099277937615e-05, + "loss": 0.2342, "step": 750300 }, { - "epoch": 7.65, - "learning_rate": 3.0782145302683525e-05, - "loss": 0.476, + "epoch": 10.338651456283927, + "grad_norm": 11.657766342163086, + "learning_rate": 1.2340411070624638e-05, + "loss": 0.3136, "step": 750400 }, { - "epoch": 7.65, - "learning_rate": 3.07755139399914e-05, - "loss": 0.4014, + "epoch": 10.340029208343667, + "grad_norm": 2.917191505432129, + "learning_rate": 1.2333724151196288e-05, + "loss": 0.304, "step": 750500 }, { - "epoch": 7.65, - "learning_rate": 3.076888253938128e-05, - "loss": 0.5183, + "epoch": 10.341406960403406, + "grad_norm": 1.011428952217102, + "learning_rate": 1.2327038520220658e-05, + "loss": 0.3094, "step": 750600 }, { - "epoch": 7.65, - "learning_rate": 3.076225110117738e-05, - "loss": 0.3984, + "epoch": 10.342784712463144, + "grad_norm": 1.028139352798462, + "learning_rate": 1.2320354178265743e-05, + "loss": 0.2623, "step": 750700 }, { - "epoch": 7.65, - "learning_rate": 3.075561962570395e-05, - "loss": 0.4935, + "epoch": 10.344162464522885, + "grad_norm": 3.5651354789733887, + "learning_rate": 1.2313671125899389e-05, + "loss": 0.3188, "step": 750800 }, { - "epoch": 7.65, - "learning_rate": 3.0748988113285236e-05, - "loss": 0.3687, + "epoch": 10.345540216582624, + "grad_norm": 2.692758798599243, + "learning_rate": 1.2306989363689372e-05, + "loss": 0.2815, "step": 750900 }, { - "epoch": 7.65, - "learning_rate": 3.074235656424547e-05, - "loss": 0.4868, + "epoch": 10.346917968642362, + "grad_norm": 1.5006853342056274, + "learning_rate": 1.2300308892203339e-05, + "loss": 0.3066, "step": 751000 }, { - "epoch": 7.65, - "learning_rate": 3.0735724978908906e-05, - "loss": 0.472, + "epoch": 10.348295720702103, + "grad_norm": 2.941396474838257, + "learning_rate": 1.2293629712008817e-05, + "loss": 0.2623, "step": 751100 }, { - "epoch": 7.65, - "learning_rate": 3.0729159673989854e-05, - "loss": 0.4299, + "epoch": 10.349673472761841, + "grad_norm": 2.8104448318481445, + "learning_rate": 1.2286951823673244e-05, + "loss": 0.2835, "step": 751200 }, { - "epoch": 7.65, - "learning_rate": 3.0722528017387295e-05, - "loss": 0.4645, + "epoch": 10.351051224821582, + "grad_norm": 4.43010139465332, + "learning_rate": 1.2280275227763954e-05, + "loss": 0.3457, "step": 751300 }, { - "epoch": 7.66, - "learning_rate": 3.071589632545742e-05, - "loss": 0.4098, + "epoch": 10.35242897688132, + "grad_norm": 2.877288818359375, + "learning_rate": 1.2273599924848147e-05, + "loss": 0.3042, "step": 751400 }, { - "epoch": 7.66, - "learning_rate": 3.070926459852448e-05, - "loss": 0.4357, + "epoch": 10.35380672894106, + "grad_norm": 1.2747961282730103, + "learning_rate": 1.2266925915492914e-05, + "loss": 0.2414, "step": 751500 }, { - "epoch": 7.66, - "learning_rate": 3.070263283691274e-05, - "loss": 0.3863, + "epoch": 10.3551844810008, + "grad_norm": 4.4095258712768555, + "learning_rate": 1.2260253200265258e-05, + "loss": 0.2994, "step": 751600 }, { - "epoch": 7.66, - "learning_rate": 3.0696001040946455e-05, - "loss": 0.4247, + "epoch": 10.356562233060538, + "grad_norm": 15.69942855834961, + "learning_rate": 1.2253581779732069e-05, + "loss": 0.2389, "step": 751700 }, { - "epoch": 7.66, - "learning_rate": 3.068936921094986e-05, - "loss": 0.4724, + "epoch": 10.357939985120277, + "grad_norm": 3.9615352153778076, + "learning_rate": 1.2246911654460096e-05, + "loss": 0.2768, "step": 751800 }, { - "epoch": 7.66, - "learning_rate": 3.068273734724724e-05, - "loss": 0.3932, + "epoch": 10.359317737180017, + "grad_norm": 1.496427059173584, + "learning_rate": 1.224024282501601e-05, + "loss": 0.2717, "step": 751900 }, { - "epoch": 7.66, - "learning_rate": 3.0676105450162825e-05, - "loss": 0.4434, + "epoch": 10.360695489239756, + "grad_norm": 111.40361022949219, + "learning_rate": 1.2233575291966388e-05, + "loss": 0.2973, "step": 752000 }, { - "epoch": 7.66, - "learning_rate": 3.066947352002088e-05, - "loss": 0.416, + "epoch": 10.362073241299496, + "grad_norm": 2.0507266521453857, + "learning_rate": 1.2226909055877626e-05, + "loss": 0.2757, "step": 752100 }, { - "epoch": 7.66, - "learning_rate": 3.066284155714569e-05, - "loss": 0.4615, + "epoch": 10.363450993359235, + "grad_norm": 3.4131627082824707, + "learning_rate": 1.2220244117316078e-05, + "loss": 0.2844, "step": 752200 }, { - "epoch": 7.66, - "learning_rate": 3.06562095618615e-05, - "loss": 0.4978, + "epoch": 10.364828745418974, + "grad_norm": 0.6258629560470581, + "learning_rate": 1.2213580476847972e-05, + "loss": 0.2986, "step": 752300 }, { - "epoch": 7.67, - "learning_rate": 3.064957753449258e-05, - "loss": 0.5006, + "epoch": 10.366206497478714, + "grad_norm": 3.588021993637085, + "learning_rate": 1.2206918135039399e-05, + "loss": 0.3108, "step": 752400 }, { - "epoch": 7.67, - "learning_rate": 3.06429454753632e-05, - "loss": 0.4498, + "epoch": 10.367584249538453, + "grad_norm": 1.8724514245986938, + "learning_rate": 1.2200257092456377e-05, + "loss": 0.2382, "step": 752500 }, { - "epoch": 7.67, - "learning_rate": 3.063631338479761e-05, - "loss": 0.3794, + "epoch": 10.368962001598192, + "grad_norm": 3.2880165576934814, + "learning_rate": 1.2193597349664782e-05, + "loss": 0.2265, "step": 752600 }, { - "epoch": 7.67, - "learning_rate": 3.0629681263120096e-05, - "loss": 0.4471, + "epoch": 10.370339753657932, + "grad_norm": 1.7806172370910645, + "learning_rate": 1.2186938907230407e-05, + "loss": 0.255, "step": 752700 }, { - "epoch": 7.67, - "learning_rate": 3.0623049110654936e-05, - "loss": 0.4869, + "epoch": 10.37171750571767, + "grad_norm": 4.624688148498535, + "learning_rate": 1.2180281765718901e-05, + "loss": 0.3445, "step": 752800 }, { - "epoch": 7.67, - "learning_rate": 3.061641692772638e-05, - "loss": 0.4507, + "epoch": 10.373095257777411, + "grad_norm": 2.98949933052063, + "learning_rate": 1.2173625925695838e-05, + "loss": 0.2988, "step": 752900 }, { - "epoch": 7.67, - "learning_rate": 3.060978471465872e-05, - "loss": 0.3959, + "epoch": 10.37447300983715, + "grad_norm": 3.386470317840576, + "learning_rate": 1.2166971387726671e-05, + "loss": 0.2419, "step": 753000 }, { - "epoch": 7.67, - "learning_rate": 3.0603152471776224e-05, - "loss": 0.481, + "epoch": 10.375850761896888, + "grad_norm": 2.0045390129089355, + "learning_rate": 1.2160318152376726e-05, + "loss": 0.2826, "step": 753100 }, { - "epoch": 7.67, - "learning_rate": 3.059652019940318e-05, - "loss": 0.4218, + "epoch": 10.377228513956629, + "grad_norm": 9.526384353637695, + "learning_rate": 1.2153666220211225e-05, + "loss": 0.4032, "step": 753200 }, { - "epoch": 7.67, - "learning_rate": 3.058988789786385e-05, - "loss": 0.4285, + "epoch": 10.378606266016368, + "grad_norm": 4.81276798248291, + "learning_rate": 1.2147015591795296e-05, + "loss": 0.3102, "step": 753300 }, { - "epoch": 7.68, - "learning_rate": 3.058325556748252e-05, - "loss": 0.4261, + "epoch": 10.379984018076106, + "grad_norm": 2.2522106170654297, + "learning_rate": 1.2140366267693926e-05, + "loss": 0.3012, "step": 753400 }, { - "epoch": 7.68, - "learning_rate": 3.057662320858348e-05, - "loss": 0.416, + "epoch": 10.381361770135847, + "grad_norm": 0.10712818056344986, + "learning_rate": 1.2133718248472026e-05, + "loss": 0.2639, "step": 753500 }, { - "epoch": 7.68, - "learning_rate": 3.0569990821491014e-05, - "loss": 0.4515, + "epoch": 10.382739522195585, + "grad_norm": 2.9620654582977295, + "learning_rate": 1.2127071534694381e-05, + "loss": 0.2425, "step": 753600 }, { - "epoch": 7.68, - "learning_rate": 3.056335840652938e-05, - "loss": 0.4142, + "epoch": 10.384117274255326, + "grad_norm": 2.1191298961639404, + "learning_rate": 1.2120426126925659e-05, + "loss": 0.2882, "step": 753700 }, { - "epoch": 7.68, - "learning_rate": 3.0556725964022895e-05, - "loss": 0.4506, + "epoch": 10.385495026315064, + "grad_norm": 0.9867560863494873, + "learning_rate": 1.2113782025730408e-05, + "loss": 0.319, "step": 753800 }, { - "epoch": 7.68, - "learning_rate": 3.0550093494295826e-05, - "loss": 0.4797, + "epoch": 10.386872778374803, + "grad_norm": 10.295557022094727, + "learning_rate": 1.21071392316731e-05, + "loss": 0.289, "step": 753900 }, { - "epoch": 7.68, - "learning_rate": 3.054346099767247e-05, - "loss": 0.4607, + "epoch": 10.388250530434544, + "grad_norm": 1.3176130056381226, + "learning_rate": 1.210049774531805e-05, + "loss": 0.3031, "step": 754000 }, { - "epoch": 7.68, - "learning_rate": 3.053682847447712e-05, - "loss": 0.4683, + "epoch": 10.389628282494282, + "grad_norm": 4.669096946716309, + "learning_rate": 1.2093923962532621e-05, + "loss": 0.3646, "step": 754100 }, { - "epoch": 7.68, - "learning_rate": 3.0530195925034044e-05, - "loss": 0.5217, + "epoch": 10.391006034554021, + "grad_norm": 2.310152530670166, + "learning_rate": 1.2087285080183599e-05, + "loss": 0.2546, "step": 754200 }, { - "epoch": 7.68, - "learning_rate": 3.052356334966757e-05, - "loss": 0.4532, + "epoch": 10.392383786613761, + "grad_norm": 1.4878326654434204, + "learning_rate": 1.2080647507223555e-05, + "loss": 0.2774, "step": 754300 }, { - "epoch": 7.69, - "learning_rate": 3.0516930748701968e-05, - "loss": 0.4148, + "epoch": 10.3937615386735, + "grad_norm": 4.410424709320068, + "learning_rate": 1.2074011244216399e-05, + "loss": 0.2829, "step": 754400 }, { - "epoch": 7.69, - "learning_rate": 3.0510364448847983e-05, - "loss": 0.3701, + "epoch": 10.39513929073324, + "grad_norm": 3.3908345699310303, + "learning_rate": 1.2067376291725896e-05, + "loss": 0.2398, "step": 754500 }, { - "epoch": 7.69, - "learning_rate": 3.050373179790492e-05, - "loss": 0.4666, + "epoch": 10.39651704279298, + "grad_norm": 2.1960129737854004, + "learning_rate": 1.2060742650315728e-05, + "loss": 0.2901, "step": 754600 }, { - "epoch": 7.69, - "learning_rate": 3.0497099122332375e-05, - "loss": 0.4665, + "epoch": 10.397894794852718, + "grad_norm": 5.312730312347412, + "learning_rate": 1.2054110320549481e-05, + "loss": 0.2523, "step": 754700 }, { - "epoch": 7.69, - "learning_rate": 3.0490466422454646e-05, - "loss": 0.4035, + "epoch": 10.399272546912458, + "grad_norm": 5.1576409339904785, + "learning_rate": 1.2047479302990563e-05, + "loss": 0.2979, "step": 754800 }, { - "epoch": 7.69, - "learning_rate": 3.0483833698596046e-05, - "loss": 0.3939, + "epoch": 10.400650298972197, + "grad_norm": 1.6031330823898315, + "learning_rate": 1.2040849598202332e-05, + "loss": 0.3266, "step": 754900 }, { - "epoch": 7.69, - "learning_rate": 3.0477200951080852e-05, - "loss": 0.4051, + "epoch": 10.402028051031936, + "grad_norm": 3.285905122756958, + "learning_rate": 1.2034221206748025e-05, + "loss": 0.2421, "step": 755000 }, { - "epoch": 7.69, - "learning_rate": 3.047056818023339e-05, - "loss": 0.3826, + "epoch": 10.403405803091676, + "grad_norm": 0.20130255818367004, + "learning_rate": 1.202759412919074e-05, + "loss": 0.3022, "step": 755100 }, { - "epoch": 7.69, - "learning_rate": 3.046393538637795e-05, - "loss": 0.3561, + "epoch": 10.404783555151415, + "grad_norm": 0.8661023378372192, + "learning_rate": 1.202096836609349e-05, + "loss": 0.2495, "step": 755200 }, { - "epoch": 7.7, - "learning_rate": 3.0457302569838833e-05, - "loss": 0.4, + "epoch": 10.406161307211153, + "grad_norm": 4.751830577850342, + "learning_rate": 1.2014343918019179e-05, + "loss": 0.2838, "step": 755300 }, { - "epoch": 7.7, - "learning_rate": 3.0450669730940358e-05, - "loss": 0.4267, + "epoch": 10.407539059270894, + "grad_norm": 0.5373607873916626, + "learning_rate": 1.2007720785530576e-05, + "loss": 0.2949, "step": 755400 }, { - "epoch": 7.7, - "learning_rate": 3.0444036870006812e-05, - "loss": 0.4194, + "epoch": 10.408916811330633, + "grad_norm": 1.6983964443206787, + "learning_rate": 1.2001098969190339e-05, + "loss": 0.288, "step": 755500 }, { - "epoch": 7.7, - "learning_rate": 3.043740398736252e-05, - "loss": 0.3832, + "epoch": 10.410294563390373, + "grad_norm": 3.3707869052886963, + "learning_rate": 1.1994478469561035e-05, + "loss": 0.2924, "step": 755600 }, { - "epoch": 7.7, - "learning_rate": 3.0430771083331787e-05, - "loss": 0.4045, + "epoch": 10.411672315450112, + "grad_norm": 4.937296390533447, + "learning_rate": 1.1987859287205124e-05, + "loss": 0.2731, "step": 755700 }, { - "epoch": 7.7, - "learning_rate": 3.042413815823891e-05, - "loss": 0.4191, + "epoch": 10.41305006750985, + "grad_norm": 1.05008864402771, + "learning_rate": 1.1981241422684913e-05, + "loss": 0.3511, "step": 755800 }, { - "epoch": 7.7, - "learning_rate": 3.0417505212408218e-05, - "loss": 0.4642, + "epoch": 10.41442781956959, + "grad_norm": 5.632351398468018, + "learning_rate": 1.197462487656265e-05, + "loss": 0.2902, "step": 755900 }, { - "epoch": 7.7, - "learning_rate": 3.0410938575926427e-05, - "loss": 0.3967, + "epoch": 10.41580557162933, + "grad_norm": 2.395057201385498, + "learning_rate": 1.1968009649400427e-05, + "loss": 0.2568, "step": 756000 }, { - "epoch": 7.7, - "learning_rate": 3.04043055897923e-05, - "loss": 0.4332, + "epoch": 10.417183323689068, + "grad_norm": 3.044116258621216, + "learning_rate": 1.1961395741760237e-05, + "loss": 0.2637, "step": 756100 }, { - "epoch": 7.7, - "learning_rate": 3.0397672583890045e-05, - "loss": 0.4371, + "epoch": 10.418561075748809, + "grad_norm": 4.785831928253174, + "learning_rate": 1.1954783154203976e-05, + "loss": 0.2885, "step": 756200 }, { - "epoch": 7.71, - "learning_rate": 3.0391039558543972e-05, - "loss": 0.4412, + "epoch": 10.419938827808547, + "grad_norm": 4.038703918457031, + "learning_rate": 1.1948171887293424e-05, + "loss": 0.3689, "step": 756300 }, { - "epoch": 7.71, - "learning_rate": 3.0384406514078397e-05, - "loss": 0.385, + "epoch": 10.421316579868288, + "grad_norm": 1.6830896139144897, + "learning_rate": 1.194156194159023e-05, + "loss": 0.2559, "step": 756400 }, { - "epoch": 7.71, - "learning_rate": 3.0377773450817645e-05, - "loss": 0.4886, + "epoch": 10.422694331928026, + "grad_norm": 2.0679242610931396, + "learning_rate": 1.1934953317655937e-05, + "loss": 0.2213, "step": 756500 }, { - "epoch": 7.71, - "learning_rate": 3.0371140369086022e-05, - "loss": 0.4576, + "epoch": 10.424072083987765, + "grad_norm": 3.4630651473999023, + "learning_rate": 1.1928346016051989e-05, + "loss": 0.2433, "step": 756600 }, { - "epoch": 7.71, - "learning_rate": 3.036450726920784e-05, - "loss": 0.4771, + "epoch": 10.425449836047505, + "grad_norm": 2.459794759750366, + "learning_rate": 1.1921740037339724e-05, + "loss": 0.2893, "step": 756700 }, { - "epoch": 7.71, - "learning_rate": 3.035787415150744e-05, - "loss": 0.4661, + "epoch": 10.426827588107244, + "grad_norm": 4.568902969360352, + "learning_rate": 1.1915135382080328e-05, + "loss": 0.2658, "step": 756800 }, { - "epoch": 7.71, - "learning_rate": 3.0351241016309125e-05, - "loss": 0.4859, + "epoch": 10.428205340166983, + "grad_norm": 6.512533187866211, + "learning_rate": 1.1908532050834913e-05, + "loss": 0.3044, "step": 756900 }, { - "epoch": 7.71, - "learning_rate": 3.034460786393722e-05, - "loss": 0.393, + "epoch": 10.429583092226723, + "grad_norm": 2.0240204334259033, + "learning_rate": 1.1901930044164488e-05, + "loss": 0.2797, "step": 757000 }, { - "epoch": 7.71, - "learning_rate": 3.033797469471605e-05, - "loss": 0.4309, + "epoch": 10.430960844286462, + "grad_norm": 1.690431833267212, + "learning_rate": 1.1895329362629882e-05, + "loss": 0.2829, "step": 757100 }, { - "epoch": 7.71, - "learning_rate": 3.0331341508969936e-05, - "loss": 0.4964, + "epoch": 10.432338596346202, + "grad_norm": 4.47620964050293, + "learning_rate": 1.1888730006791883e-05, + "loss": 0.2543, "step": 757200 }, { - "epoch": 7.72, - "learning_rate": 3.0324708307023192e-05, - "loss": 0.3874, + "epoch": 10.433716348405941, + "grad_norm": 3.4773223400115967, + "learning_rate": 1.1882131977211145e-05, + "loss": 0.226, "step": 757300 }, { - "epoch": 7.72, - "learning_rate": 3.031807508920015e-05, - "loss": 0.4138, + "epoch": 10.43509410046568, + "grad_norm": 3.734255313873291, + "learning_rate": 1.1875535274448185e-05, + "loss": 0.2784, "step": 757400 }, { - "epoch": 7.72, - "learning_rate": 3.031144185582514e-05, - "loss": 0.4556, + "epoch": 10.43647185252542, + "grad_norm": 4.417179584503174, + "learning_rate": 1.1868939899063448e-05, + "loss": 0.276, "step": 757500 }, { - "epoch": 7.72, - "learning_rate": 3.030480860722248e-05, - "loss": 0.4617, + "epoch": 10.437849604585159, + "grad_norm": 1.5073133707046509, + "learning_rate": 1.1862345851617225e-05, + "loss": 0.2577, "step": 757600 }, { - "epoch": 7.72, - "learning_rate": 3.0298175343716496e-05, - "loss": 0.4925, + "epoch": 10.439227356644897, + "grad_norm": 1.0989325046539307, + "learning_rate": 1.1855753132669734e-05, + "loss": 0.2756, "step": 757700 }, { - "epoch": 7.72, - "learning_rate": 3.0291542065631522e-05, - "loss": 0.4762, + "epoch": 10.440605108704638, + "grad_norm": 2.778719902038574, + "learning_rate": 1.1849161742781037e-05, + "loss": 0.2515, "step": 757800 }, { - "epoch": 7.72, - "learning_rate": 3.0284908773291876e-05, - "loss": 0.4309, + "epoch": 10.441982860764377, + "grad_norm": 1.7250019311904907, + "learning_rate": 1.1842571682511122e-05, + "loss": 0.3062, "step": 757900 }, { - "epoch": 7.72, - "learning_rate": 3.027827546702189e-05, - "loss": 0.4236, + "epoch": 10.443360612824117, + "grad_norm": 0.7426195740699768, + "learning_rate": 1.1835982952419855e-05, + "loss": 0.3213, "step": 758000 }, { - "epoch": 7.72, - "learning_rate": 3.0271642147145894e-05, - "loss": 0.4589, + "epoch": 10.444738364883856, + "grad_norm": 2.1860883235931396, + "learning_rate": 1.1829395553066973e-05, + "loss": 0.3038, "step": 758100 }, { - "epoch": 7.72, - "learning_rate": 3.0265008813988224e-05, - "loss": 0.4284, + "epoch": 10.446116116943594, + "grad_norm": 2.356360673904419, + "learning_rate": 1.1822809485012097e-05, + "loss": 0.2423, "step": 758200 }, { - "epoch": 7.73, - "learning_rate": 3.0258375467873204e-05, - "loss": 0.3579, + "epoch": 10.447493869003335, + "grad_norm": 2.5138392448425293, + "learning_rate": 1.1816224748814774e-05, + "loss": 0.3191, "step": 758300 }, { - "epoch": 7.73, - "learning_rate": 3.0251808442774106e-05, - "loss": 0.4153, + "epoch": 10.448871621063073, + "grad_norm": 1.3884849548339844, + "learning_rate": 1.1809641345034382e-05, + "loss": 0.2583, "step": 758400 }, { - "epoch": 7.73, - "learning_rate": 3.02452414056081e-05, - "loss": 0.4179, + "epoch": 10.450249373122812, + "grad_norm": 4.124725818634033, + "learning_rate": 1.180305927423023e-05, + "loss": 0.2689, "step": 758500 }, { - "epoch": 7.73, - "learning_rate": 3.023860802280352e-05, - "loss": 0.4191, + "epoch": 10.451627125182553, + "grad_norm": 13.507261276245117, + "learning_rate": 1.1796478536961507e-05, + "loss": 0.2307, "step": 758600 }, { - "epoch": 7.73, - "learning_rate": 3.0231974628332435e-05, - "loss": 0.3204, + "epoch": 10.453004877242291, + "grad_norm": 7.8579277992248535, + "learning_rate": 1.1789899133787273e-05, + "loss": 0.2265, "step": 758700 }, { - "epoch": 7.73, - "learning_rate": 3.0225341222519183e-05, - "loss": 0.423, + "epoch": 10.454382629302032, + "grad_norm": 3.59801983833313, + "learning_rate": 1.1783321065266467e-05, + "loss": 0.2886, "step": 758800 }, { - "epoch": 7.73, - "learning_rate": 3.0218707805688085e-05, - "loss": 0.4338, + "epoch": 10.45576038136177, + "grad_norm": 1.100722074508667, + "learning_rate": 1.1776744331957945e-05, + "loss": 0.2614, "step": 758900 }, { - "epoch": 7.73, - "learning_rate": 3.0212074378163477e-05, - "loss": 0.5026, + "epoch": 10.457138133421509, + "grad_norm": 2.635401487350464, + "learning_rate": 1.177016893442044e-05, + "loss": 0.3052, "step": 759000 }, { - "epoch": 7.73, - "learning_rate": 3.020544094026971e-05, - "loss": 0.4036, + "epoch": 10.45851588548125, + "grad_norm": 1.6851189136505127, + "learning_rate": 1.176359487321255e-05, + "loss": 0.2378, "step": 759100 }, { - "epoch": 7.73, - "learning_rate": 3.0198807492331093e-05, - "loss": 0.3537, + "epoch": 10.459893637540988, + "grad_norm": 2.566514253616333, + "learning_rate": 1.1757022148892788e-05, + "loss": 0.2898, "step": 759200 }, { - "epoch": 7.74, - "learning_rate": 3.019217403467198e-05, - "loss": 0.4651, + "epoch": 10.461271389600727, + "grad_norm": 2.4905130863189697, + "learning_rate": 1.1750450762019539e-05, + "loss": 0.2195, "step": 759300 }, { - "epoch": 7.74, - "learning_rate": 3.0185540567616704e-05, - "loss": 0.4638, + "epoch": 10.462649141660467, + "grad_norm": 3.3154752254486084, + "learning_rate": 1.174388071315106e-05, + "loss": 0.2952, "step": 759400 }, { - "epoch": 7.74, - "learning_rate": 3.0178907091489603e-05, - "loss": 0.4214, + "epoch": 10.464026893720206, + "grad_norm": 3.634671211242676, + "learning_rate": 1.1737312002845524e-05, + "loss": 0.266, "step": 759500 }, { - "epoch": 7.74, - "learning_rate": 3.0172273606615014e-05, - "loss": 0.3501, + "epoch": 10.465404645779945, + "grad_norm": 3.4203808307647705, + "learning_rate": 1.1730744631660983e-05, + "loss": 0.261, "step": 759600 }, { - "epoch": 7.74, - "learning_rate": 3.0165640113317275e-05, - "loss": 0.3865, + "epoch": 10.466782397839685, + "grad_norm": 2.1456093788146973, + "learning_rate": 1.172417860015536e-05, + "loss": 0.2862, "step": 759700 }, { - "epoch": 7.74, - "learning_rate": 3.0159006611920715e-05, - "loss": 0.5007, + "epoch": 10.468160149899424, + "grad_norm": 0.7388341426849365, + "learning_rate": 1.1717613908886463e-05, + "loss": 0.2782, "step": 759800 }, { - "epoch": 7.74, - "learning_rate": 3.015237310274968e-05, - "loss": 0.4444, + "epoch": 10.469537901959164, + "grad_norm": 1.9185842275619507, + "learning_rate": 1.1711050558412019e-05, + "loss": 0.2849, "step": 759900 }, { - "epoch": 7.74, - "learning_rate": 3.0145739586128512e-05, - "loss": 0.5021, + "epoch": 10.470915654018903, + "grad_norm": 2.684519052505493, + "learning_rate": 1.1704488549289596e-05, + "loss": 0.2788, "step": 760000 }, { - "epoch": 7.74, - "learning_rate": 3.0139106062381555e-05, - "loss": 0.4989, + "epoch": 10.472293406078641, + "grad_norm": 5.093529224395752, + "learning_rate": 1.1697927882076676e-05, + "loss": 0.3208, "step": 760100 }, { - "epoch": 7.75, - "learning_rate": 3.0132472531833143e-05, - "loss": 0.3362, + "epoch": 10.473671158138382, + "grad_norm": 8.305863380432129, + "learning_rate": 1.1691368557330639e-05, + "loss": 0.2597, "step": 760200 }, { - "epoch": 7.75, - "learning_rate": 3.0125838994807602e-05, - "loss": 0.4746, + "epoch": 10.47504891019812, + "grad_norm": 2.561605453491211, + "learning_rate": 1.1684810575608718e-05, + "loss": 0.2565, "step": 760300 }, { - "epoch": 7.75, - "learning_rate": 3.0119205451629295e-05, - "loss": 0.4278, + "epoch": 10.47642666225786, + "grad_norm": 8.820394515991211, + "learning_rate": 1.1678319497196891e-05, + "loss": 0.2822, "step": 760400 }, { - "epoch": 7.75, - "learning_rate": 3.0112571902622548e-05, - "loss": 0.4835, + "epoch": 10.4778044143176, + "grad_norm": 7.952504634857178, + "learning_rate": 1.1671764189750344e-05, + "loss": 0.2441, "step": 760500 }, { - "epoch": 7.75, - "learning_rate": 3.0105938348111707e-05, - "loss": 0.4286, + "epoch": 10.479182166377338, + "grad_norm": 1.5357391834259033, + "learning_rate": 1.1665210226993407e-05, + "loss": 0.2639, "step": 760600 }, { - "epoch": 7.75, - "learning_rate": 3.009930478842112e-05, - "loss": 0.3728, + "epoch": 10.480559918437079, + "grad_norm": 3.7912161350250244, + "learning_rate": 1.1658657609482879e-05, + "loss": 0.37, "step": 760700 }, { - "epoch": 7.75, - "learning_rate": 3.0092671223875112e-05, - "loss": 0.3697, + "epoch": 10.481937670496817, + "grad_norm": 1.7062973976135254, + "learning_rate": 1.1652106337775433e-05, + "loss": 0.2361, "step": 760800 }, { - "epoch": 7.75, - "learning_rate": 3.0086037654798045e-05, - "loss": 0.4391, + "epoch": 10.483315422556556, + "grad_norm": 3.3193728923797607, + "learning_rate": 1.164555641242762e-05, + "loss": 0.2882, "step": 760900 }, { - "epoch": 7.75, - "learning_rate": 3.007940408151425e-05, - "loss": 0.434, + "epoch": 10.484693174616297, + "grad_norm": 4.0004353523254395, + "learning_rate": 1.1639073313111158e-05, + "loss": 0.3063, "step": 761000 }, { - "epoch": 7.75, - "learning_rate": 3.0072770504348058e-05, - "loss": 0.4146, + "epoch": 10.486070926676035, + "grad_norm": 0.07798045128583908, + "learning_rate": 1.16325260686744e-05, + "loss": 0.2888, "step": 761100 }, { - "epoch": 7.76, - "learning_rate": 3.006613692362382e-05, - "loss": 0.4899, + "epoch": 10.487448678735774, + "grad_norm": 3.9397013187408447, + "learning_rate": 1.162598017226072e-05, + "loss": 0.265, "step": 761200 }, { - "epoch": 7.76, - "learning_rate": 3.0059503339665894e-05, - "loss": 0.4018, + "epoch": 10.488826430795514, + "grad_norm": 2.286264657974243, + "learning_rate": 1.1619435624426233e-05, + "loss": 0.3416, "step": 761300 }, { - "epoch": 7.76, - "learning_rate": 3.0052869752798597e-05, - "loss": 0.4563, + "epoch": 10.490204182855253, + "grad_norm": 1.174958348274231, + "learning_rate": 1.1612892425726945e-05, + "loss": 0.2919, "step": 761400 }, { - "epoch": 7.76, - "learning_rate": 3.00462361633463e-05, - "loss": 0.4224, + "epoch": 10.491581934914993, + "grad_norm": 1.3542767763137817, + "learning_rate": 1.1606350576718726e-05, + "loss": 0.2532, "step": 761500 }, { - "epoch": 7.76, - "learning_rate": 3.003960257163332e-05, - "loss": 0.4662, + "epoch": 10.492959686974732, + "grad_norm": 4.155183792114258, + "learning_rate": 1.1599810077957325e-05, + "loss": 0.2757, "step": 761600 }, { - "epoch": 7.76, - "learning_rate": 3.0032968977984015e-05, - "loss": 0.4046, + "epoch": 10.49433743903447, + "grad_norm": 5.336221694946289, + "learning_rate": 1.1593270929998414e-05, + "loss": 0.2772, "step": 761700 }, { - "epoch": 7.76, - "learning_rate": 3.002633538272272e-05, - "loss": 0.4397, + "epoch": 10.495715191094211, + "grad_norm": 1.6508089303970337, + "learning_rate": 1.1586733133397503e-05, + "loss": 0.2409, "step": 761800 }, { - "epoch": 7.76, - "learning_rate": 3.001970178617378e-05, - "loss": 0.3236, + "epoch": 10.49709294315395, + "grad_norm": 2.2604453563690186, + "learning_rate": 1.158026204646312e-05, + "loss": 0.2615, "step": 761900 }, { - "epoch": 7.76, - "learning_rate": 3.001306818866155e-05, - "loss": 0.4265, + "epoch": 10.498470695213689, + "grad_norm": 2.0784566402435303, + "learning_rate": 1.1573726940716941e-05, + "loss": 0.2886, "step": 762000 }, { - "epoch": 7.76, - "learning_rate": 3.000643459051036e-05, - "loss": 0.3938, + "epoch": 10.499848447273429, + "grad_norm": 4.2743916511535645, + "learning_rate": 1.156719318798914e-05, + "loss": 0.2743, "step": 762100 }, { - "epoch": 7.77, - "learning_rate": 2.9999800992044554e-05, - "loss": 0.4422, + "epoch": 10.501226199333168, + "grad_norm": 0.0776483416557312, + "learning_rate": 1.1560660788834797e-05, + "loss": 0.2677, "step": 762200 }, { - "epoch": 7.77, - "learning_rate": 2.9993167393588478e-05, - "loss": 0.4581, + "epoch": 10.502603951392908, + "grad_norm": 1.19465172290802, + "learning_rate": 1.1554129743808858e-05, + "loss": 0.2956, "step": 762300 }, { - "epoch": 7.77, - "learning_rate": 2.9986533795466474e-05, - "loss": 0.4401, + "epoch": 10.503981703452647, + "grad_norm": 1.1728020906448364, + "learning_rate": 1.1547600053466174e-05, + "loss": 0.2795, "step": 762400 }, { - "epoch": 7.77, - "learning_rate": 2.9979966533973197e-05, - "loss": 0.4503, + "epoch": 10.505359455512385, + "grad_norm": 1.5463505983352661, + "learning_rate": 1.1541071718361495e-05, + "loss": 0.2705, "step": 762500 }, { - "epoch": 7.77, - "learning_rate": 2.997333293748094e-05, - "loss": 0.3898, + "epoch": 10.506737207572126, + "grad_norm": 3.5012853145599365, + "learning_rate": 1.1534544739049402e-05, + "loss": 0.3235, "step": 762600 }, { - "epoch": 7.77, - "learning_rate": 2.996669934229255e-05, - "loss": 0.4769, + "epoch": 10.508114959631865, + "grad_norm": 2.580396890640259, + "learning_rate": 1.1528019116084414e-05, + "loss": 0.2739, "step": 762700 }, { - "epoch": 7.77, - "learning_rate": 2.9960065748732354e-05, - "loss": 0.4171, + "epoch": 10.509492711691603, + "grad_norm": 5.822792053222656, + "learning_rate": 1.152149485002092e-05, + "loss": 0.2574, "step": 762800 }, { - "epoch": 7.77, - "learning_rate": 2.99534321571247e-05, - "loss": 0.4162, + "epoch": 10.510870463751344, + "grad_norm": 2.132537603378296, + "learning_rate": 1.1514971941413176e-05, + "loss": 0.268, "step": 762900 }, { - "epoch": 7.77, - "learning_rate": 2.9946798567793947e-05, - "loss": 0.3888, + "epoch": 10.512248215811082, + "grad_norm": 1.9471551179885864, + "learning_rate": 1.1508450390815352e-05, + "loss": 0.2648, "step": 763000 }, { - "epoch": 7.77, - "learning_rate": 2.9940164981064422e-05, - "loss": 0.4745, + "epoch": 10.513625967870823, + "grad_norm": 3.351454019546509, + "learning_rate": 1.1501930198781473e-05, + "loss": 0.274, "step": 763100 }, { - "epoch": 7.78, - "learning_rate": 2.993353139726047e-05, - "loss": 0.4465, + "epoch": 10.515003719930561, + "grad_norm": 2.9792375564575195, + "learning_rate": 1.1495411365865477e-05, + "loss": 0.2832, "step": 763200 }, { - "epoch": 7.78, - "learning_rate": 2.992689781670645e-05, - "loss": 0.408, + "epoch": 10.5163814719903, + "grad_norm": 0.9721190929412842, + "learning_rate": 1.1488893892621157e-05, + "loss": 0.2185, "step": 763300 }, { - "epoch": 7.78, - "learning_rate": 2.9920264239726672e-05, - "loss": 0.4668, + "epoch": 10.51775922405004, + "grad_norm": 3.034794569015503, + "learning_rate": 1.1482377779602212e-05, + "loss": 0.2446, "step": 763400 }, { - "epoch": 7.78, - "learning_rate": 2.99136306666455e-05, - "loss": 0.4358, + "epoch": 10.51913697610978, + "grad_norm": 2.5384576320648193, + "learning_rate": 1.1475863027362227e-05, + "loss": 0.2745, "step": 763500 }, { - "epoch": 7.78, - "learning_rate": 2.9906997097787276e-05, - "loss": 0.409, + "epoch": 10.520514728169518, + "grad_norm": 3.1671626567840576, + "learning_rate": 1.146934963645466e-05, + "loss": 0.2794, "step": 763600 }, { - "epoch": 7.78, - "learning_rate": 2.9900363533476344e-05, - "loss": 0.4235, + "epoch": 10.521892480229258, + "grad_norm": 3.7602734565734863, + "learning_rate": 1.1462837607432845e-05, + "loss": 0.2817, "step": 763700 }, { - "epoch": 7.78, - "learning_rate": 2.9893729974037034e-05, - "loss": 0.4358, + "epoch": 10.523270232288997, + "grad_norm": 2.8570141792297363, + "learning_rate": 1.1456326940850027e-05, + "loss": 0.2683, "step": 763800 }, { - "epoch": 7.78, - "learning_rate": 2.9887096419793692e-05, - "loss": 0.4538, + "epoch": 10.524647984348736, + "grad_norm": 3.899150848388672, + "learning_rate": 1.1449817637259307e-05, + "loss": 0.312, "step": 763900 }, { - "epoch": 7.78, - "learning_rate": 2.9880462871070677e-05, - "loss": 0.3777, + "epoch": 10.526025736408476, + "grad_norm": 1.554233193397522, + "learning_rate": 1.144330969721369e-05, + "loss": 0.2748, "step": 764000 }, { - "epoch": 7.78, - "learning_rate": 2.98738293281923e-05, - "loss": 0.4248, + "epoch": 10.527403488468215, + "grad_norm": 1.1329936981201172, + "learning_rate": 1.143680312126607e-05, + "loss": 0.308, "step": 764100 }, { - "epoch": 7.79, - "learning_rate": 2.9867195791482914e-05, - "loss": 0.3158, + "epoch": 10.528781240527955, + "grad_norm": 1.8282331228256226, + "learning_rate": 1.1430297909969203e-05, + "loss": 0.2853, "step": 764200 }, { - "epoch": 7.79, - "learning_rate": 2.986056226126687e-05, - "loss": 0.4205, + "epoch": 10.530158992587694, + "grad_norm": 3.2350423336029053, + "learning_rate": 1.1423794063875728e-05, + "loss": 0.2603, "step": 764300 }, { - "epoch": 7.79, - "learning_rate": 2.9853928737868492e-05, - "loss": 0.4375, + "epoch": 10.531536744647433, + "grad_norm": 1.669077754020691, + "learning_rate": 1.1417291583538198e-05, + "loss": 0.3047, "step": 764400 }, { - "epoch": 7.79, - "learning_rate": 2.9847295221612128e-05, - "loss": 0.481, + "epoch": 10.532914496707173, + "grad_norm": 3.144928216934204, + "learning_rate": 1.1410855473884288e-05, + "loss": 0.2843, "step": 764500 }, { - "epoch": 7.79, - "learning_rate": 2.984066171282213e-05, - "loss": 0.3996, + "epoch": 10.534292248766912, + "grad_norm": 3.8136847019195557, + "learning_rate": 1.1404355713044435e-05, + "loss": 0.2771, "step": 764600 }, { - "epoch": 7.79, - "learning_rate": 2.9834028211822807e-05, - "loss": 0.4529, + "epoch": 10.535670000826652, + "grad_norm": 1.5378443002700806, + "learning_rate": 1.139785731961192e-05, + "loss": 0.2354, "step": 764700 }, { - "epoch": 7.79, - "learning_rate": 2.9827394718938518e-05, - "loss": 0.5103, + "epoch": 10.53704775288639, + "grad_norm": 0.5432489514350891, + "learning_rate": 1.1391360294138792e-05, + "loss": 0.247, "step": 764800 }, { - "epoch": 7.79, - "learning_rate": 2.98207612344936e-05, - "loss": 0.3829, + "epoch": 10.53842550494613, + "grad_norm": 9.730171203613281, + "learning_rate": 1.138486463717703e-05, + "loss": 0.3176, "step": 764900 }, { - "epoch": 7.79, - "learning_rate": 2.9814127758812382e-05, - "loss": 0.4805, + "epoch": 10.53980325700587, + "grad_norm": 0.02651713229715824, + "learning_rate": 1.137837034927845e-05, + "loss": 0.2837, "step": 765000 }, { - "epoch": 7.79, - "learning_rate": 2.980749429221921e-05, - "loss": 0.4785, + "epoch": 10.541181009065609, + "grad_norm": 3.5589966773986816, + "learning_rate": 1.1371877430994786e-05, + "loss": 0.2787, "step": 765100 }, { - "epoch": 7.8, - "learning_rate": 2.980086083503843e-05, - "loss": 0.4327, + "epoch": 10.542558761125347, + "grad_norm": 0.10133332014083862, + "learning_rate": 1.1365385882877657e-05, + "loss": 0.2651, "step": 765200 }, { - "epoch": 7.8, - "learning_rate": 2.9794227387594352e-05, - "loss": 0.3611, + "epoch": 10.543936513185088, + "grad_norm": 9.68149471282959, + "learning_rate": 1.1358895705478541e-05, + "loss": 0.255, "step": 765300 }, { - "epoch": 7.8, - "learning_rate": 2.978759395021133e-05, - "loss": 0.3893, + "epoch": 10.545314265244826, + "grad_norm": 0.5385342836380005, + "learning_rate": 1.1352406899348802e-05, + "loss": 0.2949, "step": 765400 }, { - "epoch": 7.8, - "learning_rate": 2.9780960523213694e-05, - "loss": 0.3991, + "epoch": 10.546692017304565, + "grad_norm": 3.5299580097198486, + "learning_rate": 1.134598433259048e-05, + "loss": 0.2727, "step": 765500 }, { - "epoch": 7.8, - "learning_rate": 2.977432710692579e-05, - "loss": 0.3756, + "epoch": 10.548069769364306, + "grad_norm": 7.250683784484863, + "learning_rate": 1.1339498256926735e-05, + "loss": 0.2468, "step": 765600 }, { - "epoch": 7.8, - "learning_rate": 2.9767693701671936e-05, - "loss": 0.3913, + "epoch": 10.549447521424044, + "grad_norm": 1.0019599199295044, + "learning_rate": 1.1333013554180278e-05, + "loss": 0.292, "step": 765700 }, { - "epoch": 7.8, - "learning_rate": 2.9761060307776475e-05, - "loss": 0.4296, + "epoch": 10.550825273483785, + "grad_norm": 2.9831621646881104, + "learning_rate": 1.1326530224902041e-05, + "loss": 0.2646, "step": 765800 }, { - "epoch": 7.8, - "learning_rate": 2.975442692556375e-05, - "loss": 0.3311, + "epoch": 10.552203025543523, + "grad_norm": 2.2045834064483643, + "learning_rate": 1.1320048269642794e-05, + "loss": 0.2564, "step": 765900 }, { - "epoch": 7.8, - "learning_rate": 2.974779355535807e-05, - "loss": 0.4821, + "epoch": 10.553580777603262, + "grad_norm": 1.5811694860458374, + "learning_rate": 1.1313567688953232e-05, + "loss": 0.2693, "step": 766000 }, { - "epoch": 7.81, - "learning_rate": 2.9741226531000425e-05, - "loss": 0.4509, + "epoch": 10.554958529663002, + "grad_norm": 10.051183700561523, + "learning_rate": 1.1307088483383894e-05, + "loss": 0.2753, "step": 766100 }, { - "epoch": 7.81, - "learning_rate": 2.9734593185653693e-05, - "loss": 0.3603, + "epoch": 10.556336281722741, + "grad_norm": 7.651517391204834, + "learning_rate": 1.1300610653485235e-05, + "loss": 0.3351, "step": 766200 }, { - "epoch": 7.81, - "learning_rate": 2.972795985328377e-05, - "loss": 0.4034, + "epoch": 10.55771403378248, + "grad_norm": 1.2165464162826538, + "learning_rate": 1.1294134199807593e-05, + "loss": 0.3092, "step": 766300 }, { - "epoch": 7.81, - "learning_rate": 2.9721326534214997e-05, - "loss": 0.4876, + "epoch": 10.55909178584222, + "grad_norm": 2.310483455657959, + "learning_rate": 1.1287659122901142e-05, + "loss": 0.2323, "step": 766400 }, { - "epoch": 7.81, - "learning_rate": 2.9714693228771686e-05, - "loss": 0.4139, + "epoch": 10.560469537901959, + "grad_norm": 0.7303256988525391, + "learning_rate": 1.1281185423315986e-05, + "loss": 0.3074, "step": 766500 }, { - "epoch": 7.81, - "learning_rate": 2.970805993727817e-05, - "loss": 0.4941, + "epoch": 10.5618472899617, + "grad_norm": 2.2746641635894775, + "learning_rate": 1.1274713101602117e-05, + "loss": 0.2492, "step": 766600 }, { - "epoch": 7.81, - "learning_rate": 2.9701426660058797e-05, - "loss": 0.5101, + "epoch": 10.563225042021438, + "grad_norm": 4.0299072265625, + "learning_rate": 1.1268242158309367e-05, + "loss": 0.2878, "step": 766700 }, { - "epoch": 7.81, - "learning_rate": 2.9694793397437867e-05, - "loss": 0.4774, + "epoch": 10.564602794081177, + "grad_norm": 2.308422803878784, + "learning_rate": 1.1261772593987485e-05, + "loss": 0.3317, "step": 766800 }, { - "epoch": 7.81, - "learning_rate": 2.9688160149739716e-05, - "loss": 0.4767, + "epoch": 10.565980546140917, + "grad_norm": 1.060150384902954, + "learning_rate": 1.1255304409186109e-05, + "loss": 0.2804, "step": 766900 }, { - "epoch": 7.81, - "learning_rate": 2.9681526917288677e-05, - "loss": 0.3823, + "epoch": 10.567358298200656, + "grad_norm": 4.13946008682251, + "learning_rate": 1.124883760445473e-05, + "loss": 0.2509, "step": 767000 }, { - "epoch": 7.82, - "learning_rate": 2.9674893700409068e-05, - "loss": 0.4467, + "epoch": 10.568736050260394, + "grad_norm": 1.7353650331497192, + "learning_rate": 1.1242372180342723e-05, + "loss": 0.2983, "step": 767100 }, { - "epoch": 7.82, - "learning_rate": 2.9668260499425214e-05, - "loss": 0.4203, + "epoch": 10.570113802320135, + "grad_norm": 2.8163986206054688, + "learning_rate": 1.1235908137399377e-05, + "loss": 0.2609, "step": 767200 }, { - "epoch": 7.82, - "learning_rate": 2.966162731466146e-05, - "loss": 0.3695, + "epoch": 10.571491554379874, + "grad_norm": 4.111927509307861, + "learning_rate": 1.122944547617385e-05, + "loss": 0.2775, "step": 767300 }, { - "epoch": 7.82, - "learning_rate": 2.9654994146442093e-05, - "loss": 0.4432, + "epoch": 10.572869306439614, + "grad_norm": 1.1802095174789429, + "learning_rate": 1.122298419721517e-05, + "loss": 0.2631, "step": 767400 }, { - "epoch": 7.82, - "learning_rate": 2.9648360995091457e-05, - "loss": 0.5392, + "epoch": 10.574247058499353, + "grad_norm": 2.2581400871276855, + "learning_rate": 1.1216524301072241e-05, + "loss": 0.281, "step": 767500 }, { - "epoch": 7.82, - "learning_rate": 2.9641727860933874e-05, - "loss": 0.3938, + "epoch": 10.575624810559091, + "grad_norm": 3.908735990524292, + "learning_rate": 1.1210065788293893e-05, + "loss": 0.2821, "step": 767600 }, { - "epoch": 7.82, - "learning_rate": 2.9635094744293655e-05, - "loss": 0.5172, + "epoch": 10.577002562618832, + "grad_norm": 4.9254302978515625, + "learning_rate": 1.1203608659428782e-05, + "loss": 0.2582, "step": 767700 }, { - "epoch": 7.82, - "learning_rate": 2.9628461645495124e-05, - "loss": 0.4691, + "epoch": 10.57838031467857, + "grad_norm": 2.7829370498657227, + "learning_rate": 1.1197152915025492e-05, + "loss": 0.2516, "step": 767800 }, { - "epoch": 7.82, - "learning_rate": 2.9621828564862605e-05, - "loss": 0.4172, + "epoch": 10.579758066738309, + "grad_norm": 5.179299354553223, + "learning_rate": 1.119069855563247e-05, + "loss": 0.2832, "step": 767900 }, { - "epoch": 7.82, - "learning_rate": 2.9615195502720417e-05, - "loss": 0.4765, + "epoch": 10.58113581879805, + "grad_norm": 52.03510665893555, + "learning_rate": 1.1184245581798048e-05, + "loss": 0.2683, "step": 768000 }, { - "epoch": 7.83, - "learning_rate": 2.960856245939287e-05, - "loss": 0.4873, + "epoch": 10.582513570857788, + "grad_norm": 4.16298770904541, + "learning_rate": 1.1177793994070426e-05, + "loss": 0.3071, "step": 768100 }, { - "epoch": 7.83, - "learning_rate": 2.9601929435204276e-05, - "loss": 0.4854, + "epoch": 10.583891322917527, + "grad_norm": 1.292060136795044, + "learning_rate": 1.1171343792997702e-05, + "loss": 0.3333, "step": 768200 }, { - "epoch": 7.83, - "learning_rate": 2.9595296430478966e-05, - "loss": 0.4274, + "epoch": 10.585269074977267, + "grad_norm": 0.9507945775985718, + "learning_rate": 1.1164894979127875e-05, + "loss": 0.2938, "step": 768300 }, { - "epoch": 7.83, - "learning_rate": 2.9588663445541237e-05, - "loss": 0.4945, + "epoch": 10.586646827037006, + "grad_norm": 0.2684326469898224, + "learning_rate": 1.1158447553008777e-05, + "loss": 0.2605, "step": 768400 }, { - "epoch": 7.83, - "learning_rate": 2.958203048071541e-05, - "loss": 0.4374, + "epoch": 10.588024579096746, + "grad_norm": 2.818380117416382, + "learning_rate": 1.1152001515188172e-05, + "loss": 0.3408, "step": 768500 }, { - "epoch": 7.83, - "learning_rate": 2.9575397536325816e-05, - "loss": 0.4128, + "epoch": 10.589402331156485, + "grad_norm": 3.7058634757995605, + "learning_rate": 1.1145556866213675e-05, + "loss": 0.2753, "step": 768600 }, { - "epoch": 7.83, - "learning_rate": 2.9568764612696734e-05, - "loss": 0.4845, + "epoch": 10.590780083216224, + "grad_norm": 5.835156440734863, + "learning_rate": 1.1139113606632781e-05, + "loss": 0.2879, "step": 768700 }, { - "epoch": 7.83, - "learning_rate": 2.9562131710152488e-05, - "loss": 0.5316, + "epoch": 10.592157835275964, + "grad_norm": 1.4286340475082397, + "learning_rate": 1.113267173699289e-05, + "loss": 0.2724, "step": 768800 }, { - "epoch": 7.83, - "learning_rate": 2.9555498829017396e-05, - "loss": 0.4748, + "epoch": 10.593535587335703, + "grad_norm": 2.8151721954345703, + "learning_rate": 1.1126231257841276e-05, + "loss": 0.2566, "step": 768900 }, { - "epoch": 7.83, - "learning_rate": 2.954886596961575e-05, - "loss": 0.487, + "epoch": 10.594913339395443, + "grad_norm": 5.139896869659424, + "learning_rate": 1.1119792169725075e-05, + "loss": 0.2791, "step": 769000 }, { - "epoch": 7.84, - "learning_rate": 2.9542233132271866e-05, - "loss": 0.4369, + "epoch": 10.596291091455182, + "grad_norm": 4.433551788330078, + "learning_rate": 1.111335447319134e-05, + "loss": 0.2404, "step": 769100 }, { - "epoch": 7.84, - "learning_rate": 2.9535600317310064e-05, - "loss": 0.4975, + "epoch": 10.59766884351492, + "grad_norm": 1.5295460224151611, + "learning_rate": 1.1106918168786967e-05, + "loss": 0.2794, "step": 769200 }, { - "epoch": 7.84, - "learning_rate": 2.9528967525054624e-05, - "loss": 0.4648, + "epoch": 10.599046595574661, + "grad_norm": 5.430045127868652, + "learning_rate": 1.110048325705877e-05, + "loss": 0.2423, "step": 769300 }, { - "epoch": 7.84, - "learning_rate": 2.9522334755829857e-05, - "loss": 0.4594, + "epoch": 10.6004243476344, + "grad_norm": 4.622608184814453, + "learning_rate": 1.1094049738553414e-05, + "loss": 0.3259, "step": 769400 }, { - "epoch": 7.84, - "learning_rate": 2.9515702009960072e-05, - "loss": 0.4362, + "epoch": 10.601802099694138, + "grad_norm": 4.558523654937744, + "learning_rate": 1.1087617613817459e-05, + "loss": 0.2754, "step": 769500 }, { - "epoch": 7.84, - "learning_rate": 2.9509069287769574e-05, - "loss": 0.5117, + "epoch": 10.603179851753879, + "grad_norm": 7.286407947540283, + "learning_rate": 1.1081186883397374e-05, + "loss": 0.2962, "step": 769600 }, { - "epoch": 7.84, - "learning_rate": 2.9502436589582652e-05, - "loss": 0.4575, + "epoch": 10.604557603813618, + "grad_norm": 8.92766284942627, + "learning_rate": 1.1074757547839447e-05, + "loss": 0.3157, "step": 769700 }, { - "epoch": 7.84, - "learning_rate": 2.9495803915723614e-05, - "loss": 0.4871, + "epoch": 10.605935355873356, + "grad_norm": 1.6710656881332397, + "learning_rate": 1.1068329607689891e-05, + "loss": 0.2983, "step": 769800 }, { - "epoch": 7.84, - "learning_rate": 2.9489171266516765e-05, - "loss": 0.4235, + "epoch": 10.607313107933097, + "grad_norm": 2.8927485942840576, + "learning_rate": 1.1061903063494814e-05, + "loss": 0.295, "step": 769900 }, { - "epoch": 7.84, - "learning_rate": 2.9482538642286383e-05, - "loss": 0.4833, + "epoch": 10.608690859992835, + "grad_norm": 3.3385496139526367, + "learning_rate": 1.1055477915800154e-05, + "loss": 0.2927, "step": 770000 }, { - "epoch": 7.85, - "learning_rate": 2.947590604335677e-05, - "loss": 0.3927, + "epoch": 10.610068612052576, + "grad_norm": 3.53165864944458, + "learning_rate": 1.1049054165151777e-05, + "loss": 0.3037, "step": 770100 }, { - "epoch": 7.85, - "learning_rate": 2.9469273470052232e-05, - "loss": 0.3888, + "epoch": 10.611446364112314, + "grad_norm": 7.7597784996032715, + "learning_rate": 1.1042631812095421e-05, + "loss": 0.2645, "step": 770200 }, { - "epoch": 7.85, - "learning_rate": 2.946264092269705e-05, - "loss": 0.4553, + "epoch": 10.612824116172053, + "grad_norm": 1.823333740234375, + "learning_rate": 1.1036210857176687e-05, + "loss": 0.2731, "step": 770300 }, { - "epoch": 7.85, - "learning_rate": 2.9456008401615514e-05, - "loss": 0.5253, + "epoch": 10.614201868231794, + "grad_norm": 2.3267180919647217, + "learning_rate": 1.102979130094106e-05, + "loss": 0.2912, "step": 770400 }, { - "epoch": 7.85, - "learning_rate": 2.944937590713194e-05, - "loss": 0.4941, + "epoch": 10.615579620291532, + "grad_norm": 3.1392085552215576, + "learning_rate": 1.1023373143933932e-05, + "loss": 0.2996, "step": 770500 }, { - "epoch": 7.85, - "learning_rate": 2.944274343957058e-05, - "loss": 0.4307, + "epoch": 10.616957372351271, + "grad_norm": 14.297091484069824, + "learning_rate": 1.1016956386700537e-05, + "loss": 0.3103, "step": 770600 }, { - "epoch": 7.85, - "learning_rate": 2.9436110999255744e-05, - "loss": 0.4545, + "epoch": 10.618335124411011, + "grad_norm": 2.082451820373535, + "learning_rate": 1.1010541029786028e-05, + "loss": 0.2963, "step": 770700 }, { - "epoch": 7.85, - "learning_rate": 2.942947858651172e-05, - "loss": 0.4316, + "epoch": 10.61971287647075, + "grad_norm": 5.52354097366333, + "learning_rate": 1.1004127073735427e-05, + "loss": 0.2985, "step": 770800 }, { - "epoch": 7.85, - "learning_rate": 2.942284620166278e-05, - "loss": 0.4079, + "epoch": 10.62109062853049, + "grad_norm": 2.6013951301574707, + "learning_rate": 1.0997714519093621e-05, + "loss": 0.3285, "step": 770900 }, { - "epoch": 7.86, - "learning_rate": 2.941621384503322e-05, - "loss": 0.3977, + "epoch": 10.62246838059023, + "grad_norm": 1.1924774646759033, + "learning_rate": 1.0991303366405385e-05, + "loss": 0.2757, "step": 771000 }, { - "epoch": 7.86, - "learning_rate": 2.940958151694733e-05, - "loss": 0.468, + "epoch": 10.623846132649968, + "grad_norm": 5.271380424499512, + "learning_rate": 1.0984893616215383e-05, + "loss": 0.3092, "step": 771100 }, { - "epoch": 7.86, - "learning_rate": 2.9402949217729372e-05, - "loss": 0.4052, + "epoch": 10.625223884709708, + "grad_norm": 3.838233232498169, + "learning_rate": 1.097848526906817e-05, + "loss": 0.251, "step": 771200 }, { - "epoch": 7.86, - "learning_rate": 2.939631694770363e-05, - "loss": 0.3703, + "epoch": 10.626601636769447, + "grad_norm": 20.113243103027344, + "learning_rate": 1.0972078325508158e-05, + "loss": 0.288, "step": 771300 }, { - "epoch": 7.86, - "learning_rate": 2.9389684707194388e-05, - "loss": 0.413, + "epoch": 10.627979388829186, + "grad_norm": 3.4025511741638184, + "learning_rate": 1.0965672786079636e-05, + "loss": 0.2736, "step": 771400 }, { - "epoch": 7.86, - "learning_rate": 2.9383118818483826e-05, - "loss": 0.4476, + "epoch": 10.629357140888926, + "grad_norm": 2.321033477783203, + "learning_rate": 1.0959268651326814e-05, + "loss": 0.2741, "step": 771500 }, { - "epoch": 7.86, - "learning_rate": 2.9376486637677168e-05, - "loss": 0.4474, + "epoch": 10.630734892948665, + "grad_norm": 2.4975409507751465, + "learning_rate": 1.0952865921793728e-05, + "loss": 0.2762, "step": 771600 }, { - "epoch": 7.86, - "learning_rate": 2.936985448735658e-05, - "loss": 0.5292, + "epoch": 10.632112645008405, + "grad_norm": 7.202080726623535, + "learning_rate": 1.0946464598024338e-05, + "loss": 0.3082, "step": 771700 }, { - "epoch": 7.86, - "learning_rate": 2.9363222367846345e-05, - "loss": 0.4571, + "epoch": 10.633490397068144, + "grad_norm": 6.662028789520264, + "learning_rate": 1.0940128672774091e-05, + "loss": 0.2679, "step": 771800 }, { - "epoch": 7.86, - "learning_rate": 2.9356590279470737e-05, - "loss": 0.4925, + "epoch": 10.634868149127882, + "grad_norm": 3.882417917251587, + "learning_rate": 1.0933730148092242e-05, + "loss": 0.2894, "step": 771900 }, { - "epoch": 7.87, - "learning_rate": 2.9349958222554035e-05, - "loss": 0.4041, + "epoch": 10.636245901187623, + "grad_norm": 2.4470911026000977, + "learning_rate": 1.0927333030799781e-05, + "loss": 0.3134, "step": 772000 }, { - "epoch": 7.87, - "learning_rate": 2.934332619742048e-05, - "loss": 0.4627, + "epoch": 10.637623653247362, + "grad_norm": 3.0687241554260254, + "learning_rate": 1.0920937321440154e-05, + "loss": 0.2108, "step": 772100 }, { - "epoch": 7.87, - "learning_rate": 2.9336694204394363e-05, - "loss": 0.3544, + "epoch": 10.6390014053071, + "grad_norm": 3.4485580921173096, + "learning_rate": 1.0914543020556718e-05, + "loss": 0.3533, "step": 772200 }, { - "epoch": 7.87, - "learning_rate": 2.9330062243799957e-05, - "loss": 0.4459, + "epoch": 10.64037915736684, + "grad_norm": 2.846353054046631, + "learning_rate": 1.0908150128692714e-05, + "loss": 0.2601, "step": 772300 }, { - "epoch": 7.87, - "learning_rate": 2.932343031596149e-05, - "loss": 0.4848, + "epoch": 10.64175690942658, + "grad_norm": 1.0572539567947388, + "learning_rate": 1.090175864639124e-05, + "loss": 0.2527, "step": 772400 }, { - "epoch": 7.87, - "learning_rate": 2.9316798421203245e-05, - "loss": 0.4471, + "epoch": 10.643134661486318, + "grad_norm": 2.260268211364746, + "learning_rate": 1.0895368574195275e-05, + "loss": 0.2597, "step": 772500 }, { - "epoch": 7.87, - "learning_rate": 2.931016655984949e-05, - "loss": 0.4639, + "epoch": 10.644512413546058, + "grad_norm": 2.93265438079834, + "learning_rate": 1.088897991264771e-05, + "loss": 0.2727, "step": 772600 }, { - "epoch": 7.87, - "learning_rate": 2.9303534732224456e-05, - "loss": 0.3954, + "epoch": 10.645890165605797, + "grad_norm": 5.983810901641846, + "learning_rate": 1.0882592662291271e-05, + "loss": 0.2436, "step": 772700 }, { - "epoch": 7.87, - "learning_rate": 2.9296902938652425e-05, - "loss": 0.5144, + "epoch": 10.647267917665538, + "grad_norm": 1.459702730178833, + "learning_rate": 1.0876206823668598e-05, + "loss": 0.2735, "step": 772800 }, { - "epoch": 7.87, - "learning_rate": 2.929027117945766e-05, - "loss": 0.4035, + "epoch": 10.648645669725276, + "grad_norm": 2.6378164291381836, + "learning_rate": 1.0869822397322213e-05, + "loss": 0.2779, "step": 772900 }, { - "epoch": 7.88, - "learning_rate": 2.9283639454964383e-05, - "loss": 0.4171, + "epoch": 10.650023421785015, + "grad_norm": 1.8068956136703491, + "learning_rate": 1.0863439383794498e-05, + "loss": 0.3083, "step": 773000 }, { - "epoch": 7.88, - "learning_rate": 2.9277007765496858e-05, - "loss": 0.4425, + "epoch": 10.651401173844755, + "grad_norm": 2.5236852169036865, + "learning_rate": 1.0857057783627707e-05, + "loss": 0.3019, "step": 773100 }, { - "epoch": 7.88, - "learning_rate": 2.927037611137936e-05, - "loss": 0.3869, + "epoch": 10.652778925904494, + "grad_norm": 3.3121347427368164, + "learning_rate": 1.0850677597364e-05, + "loss": 0.2628, "step": 773200 }, { - "epoch": 7.88, - "learning_rate": 2.9263744492936088e-05, - "loss": 0.3901, + "epoch": 10.654156677964234, + "grad_norm": 2.2889928817749023, + "learning_rate": 1.084429882554542e-05, + "loss": 0.2627, "step": 773300 }, { - "epoch": 7.88, - "learning_rate": 2.9257112910491325e-05, - "loss": 0.4184, + "epoch": 10.655534430023973, + "grad_norm": 1.271621823310852, + "learning_rate": 1.0837921468713854e-05, + "loss": 0.2799, "step": 773400 }, { - "epoch": 7.88, - "learning_rate": 2.925048136436931e-05, - "loss": 0.3735, + "epoch": 10.656912182083712, + "grad_norm": 0.14077383279800415, + "learning_rate": 1.0831545527411113e-05, + "loss": 0.2806, "step": 773500 }, { - "epoch": 7.88, - "learning_rate": 2.9243849854894297e-05, - "loss": 0.3871, + "epoch": 10.658289934143452, + "grad_norm": 2.4750192165374756, + "learning_rate": 1.0825171002178857e-05, + "loss": 0.3032, "step": 773600 }, { - "epoch": 7.88, - "learning_rate": 2.9237218382390498e-05, - "loss": 0.409, + "epoch": 10.659667686203191, + "grad_norm": 4.034747123718262, + "learning_rate": 1.0818797893558618e-05, + "loss": 0.2859, "step": 773700 }, { - "epoch": 7.88, - "learning_rate": 2.923058694718217e-05, - "loss": 0.4137, + "epoch": 10.66104543826293, + "grad_norm": 4.517995357513428, + "learning_rate": 1.081242620209184e-05, + "loss": 0.288, "step": 773800 }, { - "epoch": 7.88, - "learning_rate": 2.9223955549593554e-05, - "loss": 0.4442, + "epoch": 10.66242319032267, + "grad_norm": 1.7797316312789917, + "learning_rate": 1.0806055928319841e-05, + "loss": 0.2438, "step": 773900 }, { - "epoch": 7.89, - "learning_rate": 2.9217324189948878e-05, - "loss": 0.4641, + "epoch": 10.663800942382409, + "grad_norm": 4.498326301574707, + "learning_rate": 1.0799687072783785e-05, + "loss": 0.2748, "step": 774000 }, { - "epoch": 7.89, - "learning_rate": 2.921069286857237e-05, - "loss": 0.4573, + "epoch": 10.665178694442147, + "grad_norm": 4.289167404174805, + "learning_rate": 1.079331963602476e-05, + "loss": 0.3089, "step": 774100 }, { - "epoch": 7.89, - "learning_rate": 2.920406158578829e-05, - "loss": 0.3942, + "epoch": 10.666556446501888, + "grad_norm": 1.539554476737976, + "learning_rate": 1.078695361858369e-05, + "loss": 0.2936, "step": 774200 }, { - "epoch": 7.89, - "learning_rate": 2.9197430341920836e-05, - "loss": 0.4317, + "epoch": 10.667934198561627, + "grad_norm": 4.27036714553833, + "learning_rate": 1.0780589021001428e-05, + "loss": 0.2893, "step": 774300 }, { - "epoch": 7.89, - "learning_rate": 2.919093176100012e-05, - "loss": 0.3596, + "epoch": 10.669311950621367, + "grad_norm": 3.4730494022369385, + "learning_rate": 1.0774225843818655e-05, + "loss": 0.2613, "step": 774400 }, { - "epoch": 7.89, - "learning_rate": 2.9184300595144157e-05, - "loss": 0.4186, + "epoch": 10.670689702681106, + "grad_norm": 3.011395215988159, + "learning_rate": 1.0767864087575963e-05, + "loss": 0.2605, "step": 774500 }, { - "epoch": 7.89, - "learning_rate": 2.9177669469171013e-05, - "loss": 0.4761, + "epoch": 10.672067454740844, + "grad_norm": 1.6344833374023438, + "learning_rate": 1.0761503752813835e-05, + "loss": 0.2884, "step": 774600 }, { - "epoch": 7.89, - "learning_rate": 2.9171038383404927e-05, - "loss": 0.4271, + "epoch": 10.673445206800585, + "grad_norm": 1.7804741859436035, + "learning_rate": 1.0755144840072583e-05, + "loss": 0.2662, "step": 774700 }, { - "epoch": 7.89, - "learning_rate": 2.9164407338170112e-05, - "loss": 0.4048, + "epoch": 10.674822958860323, + "grad_norm": 5.178990364074707, + "learning_rate": 1.0748787349892438e-05, + "loss": 0.2516, "step": 774800 }, { - "epoch": 7.89, - "learning_rate": 2.9157776333790807e-05, - "loss": 0.3611, + "epoch": 10.676200710920062, + "grad_norm": 1.7916109561920166, + "learning_rate": 1.0742431282813519e-05, + "loss": 0.2946, "step": 774900 }, { - "epoch": 7.9, - "learning_rate": 2.9151145370591198e-05, - "loss": 0.5185, + "epoch": 10.677578462979803, + "grad_norm": 2.858278512954712, + "learning_rate": 1.0736076639375783e-05, + "loss": 0.2635, "step": 775000 }, { - "epoch": 7.9, - "learning_rate": 2.914451444889552e-05, - "loss": 0.337, + "epoch": 10.678956215039541, + "grad_norm": 0.6872262358665466, + "learning_rate": 1.0729723420119097e-05, + "loss": 0.2692, "step": 775100 }, { - "epoch": 7.9, - "learning_rate": 2.9137883569027994e-05, - "loss": 0.4029, + "epoch": 10.680333967099282, + "grad_norm": 7.852147102355957, + "learning_rate": 1.0723435136474438e-05, + "loss": 0.2918, "step": 775200 }, { - "epoch": 7.9, - "learning_rate": 2.91312527313128e-05, - "loss": 0.471, + "epoch": 10.68171171915902, + "grad_norm": 3.1291286945343018, + "learning_rate": 1.0717084752943691e-05, + "loss": 0.2944, "step": 775300 }, { - "epoch": 7.9, - "learning_rate": 2.9124621936074162e-05, - "loss": 0.3473, + "epoch": 10.683089471218759, + "grad_norm": 0.6081647276878357, + "learning_rate": 1.0710735795207444e-05, + "loss": 0.2385, "step": 775400 }, { - "epoch": 7.9, - "learning_rate": 2.9117991183636296e-05, - "loss": 0.3919, + "epoch": 10.6844672232785, + "grad_norm": 4.6381988525390625, + "learning_rate": 1.0704451732056988e-05, + "loss": 0.3411, "step": 775500 }, { - "epoch": 7.9, - "learning_rate": 2.911136047432339e-05, - "loss": 0.4196, + "epoch": 10.685844975338238, + "grad_norm": 1.4564452171325684, + "learning_rate": 1.0698169067379613e-05, + "loss": 0.2644, "step": 775600 }, { - "epoch": 7.9, - "learning_rate": 2.910472980845966e-05, - "loss": 0.3698, + "epoch": 10.687222727397977, + "grad_norm": 4.635112285614014, + "learning_rate": 1.0691824361709158e-05, + "loss": 0.3095, "step": 775700 }, { - "epoch": 7.9, - "learning_rate": 2.9098099186369304e-05, - "loss": 0.3723, + "epoch": 10.688600479457717, + "grad_norm": 5.736720561981201, + "learning_rate": 1.06854810839792e-05, + "loss": 0.2858, "step": 775800 }, { - "epoch": 7.91, - "learning_rate": 2.909146860837651e-05, - "loss": 0.5216, + "epoch": 10.689978231517456, + "grad_norm": 1.1003854274749756, + "learning_rate": 1.0679139234728655e-05, + "loss": 0.2512, "step": 775900 }, { - "epoch": 7.91, - "learning_rate": 2.908483807480547e-05, - "loss": 0.4367, + "epoch": 10.691355983577196, + "grad_norm": 0.9752110838890076, + "learning_rate": 1.067279881449628e-05, + "loss": 0.273, "step": 776000 }, { - "epoch": 7.91, - "learning_rate": 2.90782075859804e-05, - "loss": 0.3781, + "epoch": 10.692733735636935, + "grad_norm": 9.139801025390625, + "learning_rate": 1.0666459823820744e-05, + "loss": 0.2612, "step": 776100 }, { - "epoch": 7.91, - "learning_rate": 2.907157714222547e-05, - "loss": 0.3742, + "epoch": 10.694111487696674, + "grad_norm": 4.557865619659424, + "learning_rate": 1.0660122263240564e-05, + "loss": 0.278, "step": 776200 }, { - "epoch": 7.91, - "learning_rate": 2.906494674386488e-05, - "loss": 0.4354, + "epoch": 10.695489239756414, + "grad_norm": 1.6658694744110107, + "learning_rate": 1.0653786133294139e-05, + "loss": 0.2566, "step": 776300 }, { - "epoch": 7.91, - "learning_rate": 2.9058316391222824e-05, - "loss": 0.4891, + "epoch": 10.696866991816153, + "grad_norm": 5.797301292419434, + "learning_rate": 1.0647451434519769e-05, + "loss": 0.242, "step": 776400 }, { - "epoch": 7.91, - "learning_rate": 2.9051686084623464e-05, - "loss": 0.3558, + "epoch": 10.698244743875891, + "grad_norm": 1.4015822410583496, + "learning_rate": 1.0641118167455627e-05, + "loss": 0.2775, "step": 776500 }, { - "epoch": 7.91, - "learning_rate": 2.9045055824390992e-05, - "loss": 0.5089, + "epoch": 10.699622495935632, + "grad_norm": 0.024202750995755196, + "learning_rate": 1.0634786332639732e-05, + "loss": 0.2327, "step": 776600 }, { - "epoch": 7.91, - "learning_rate": 2.90384256108496e-05, - "loss": 0.4876, + "epoch": 10.70100024799537, + "grad_norm": 8.05154800415039, + "learning_rate": 1.0628455930610022e-05, + "loss": 0.299, "step": 776700 }, { - "epoch": 7.91, - "learning_rate": 2.9031795444323454e-05, - "loss": 0.4362, + "epoch": 10.70237800005511, + "grad_norm": 12.252554893493652, + "learning_rate": 1.0622126961904307e-05, + "loss": 0.2668, "step": 776800 }, { - "epoch": 7.92, - "learning_rate": 2.9025165325136733e-05, - "loss": 0.4833, + "epoch": 10.70375575211485, + "grad_norm": 3.6365504264831543, + "learning_rate": 1.061579942706025e-05, + "loss": 0.3129, "step": 776900 }, { - "epoch": 7.92, - "learning_rate": 2.9018535253613607e-05, - "loss": 0.4, + "epoch": 10.705133504174588, + "grad_norm": 2.3534231185913086, + "learning_rate": 1.0609473326615403e-05, + "loss": 0.2734, "step": 777000 }, { - "epoch": 7.92, - "learning_rate": 2.901190523007826e-05, - "loss": 0.4158, + "epoch": 10.706511256234329, + "grad_norm": 6.118051528930664, + "learning_rate": 1.0603148661107218e-05, + "loss": 0.311, "step": 777100 }, { - "epoch": 7.92, - "learning_rate": 2.9005275254854846e-05, - "loss": 0.4164, + "epoch": 10.707889008294067, + "grad_norm": 16.952945709228516, + "learning_rate": 1.0596825431072987e-05, + "loss": 0.2963, "step": 777200 }, { - "epoch": 7.92, - "learning_rate": 2.899864532826753e-05, - "loss": 0.4077, + "epoch": 10.709266760353806, + "grad_norm": 1.9015147686004639, + "learning_rate": 1.0590503637049922e-05, + "loss": 0.2367, "step": 777300 }, { - "epoch": 7.92, - "learning_rate": 2.899201545064049e-05, - "loss": 0.4443, + "epoch": 10.710644512413547, + "grad_norm": 2.550863265991211, + "learning_rate": 1.058418327957507e-05, + "loss": 0.3152, "step": 777400 }, { - "epoch": 7.92, - "learning_rate": 2.8985385622297872e-05, - "loss": 0.4285, + "epoch": 10.712022264473285, + "grad_norm": 1.7357666492462158, + "learning_rate": 1.0577864359185401e-05, + "loss": 0.2795, "step": 777500 }, { - "epoch": 7.92, - "learning_rate": 2.897882214110456e-05, - "loss": 0.4609, + "epoch": 10.713400016533026, + "grad_norm": 3.509672164916992, + "learning_rate": 1.0571546876417719e-05, + "loss": 0.297, "step": 777600 }, { - "epoch": 7.92, - "learning_rate": 2.8972192411802345e-05, - "loss": 0.3812, + "epoch": 10.714777768592764, + "grad_norm": 4.266869068145752, + "learning_rate": 1.0565230831808733e-05, + "loss": 0.2736, "step": 777700 }, { - "epoch": 7.92, - "learning_rate": 2.8965562732753785e-05, - "loss": 0.5075, + "epoch": 10.716155520652503, + "grad_norm": 3.8890371322631836, + "learning_rate": 1.0558916225895037e-05, + "loss": 0.3083, "step": 777800 }, { - "epoch": 7.93, - "learning_rate": 2.8958933104283042e-05, - "loss": 0.4293, + "epoch": 10.717533272712243, + "grad_norm": 2.2083747386932373, + "learning_rate": 1.0552603059213076e-05, + "loss": 0.273, "step": 777900 }, { - "epoch": 7.93, - "learning_rate": 2.895230352671425e-05, - "loss": 0.404, + "epoch": 10.718911024771982, + "grad_norm": 3.997001886367798, + "learning_rate": 1.0546291332299178e-05, + "loss": 0.2594, "step": 778000 }, { - "epoch": 7.93, - "learning_rate": 2.8945674000371565e-05, - "loss": 0.4797, + "epoch": 10.72028877683172, + "grad_norm": 1.5380661487579346, + "learning_rate": 1.0539981045689576e-05, + "loss": 0.313, "step": 778100 }, { - "epoch": 7.93, - "learning_rate": 2.893904452557914e-05, - "loss": 0.4554, + "epoch": 10.721666528891461, + "grad_norm": 3.687814235687256, + "learning_rate": 1.0533672199920342e-05, + "loss": 0.263, "step": 778200 }, { - "epoch": 7.93, - "learning_rate": 2.8932415102661093e-05, - "loss": 0.3975, + "epoch": 10.7230442809512, + "grad_norm": 3.641199827194214, + "learning_rate": 1.0527364795527454e-05, + "loss": 0.2477, "step": 778300 }, { - "epoch": 7.93, - "learning_rate": 2.8925785731941575e-05, - "loss": 0.3456, + "epoch": 10.724422033010939, + "grad_norm": 3.2585480213165283, + "learning_rate": 1.0521058833046766e-05, + "loss": 0.2532, "step": 778400 }, { - "epoch": 7.93, - "learning_rate": 2.8919156413744738e-05, - "loss": 0.4081, + "epoch": 10.725799785070679, + "grad_norm": 3.120032787322998, + "learning_rate": 1.0514754313013996e-05, + "loss": 0.2623, "step": 778500 }, { - "epoch": 7.93, - "learning_rate": 2.891252714839469e-05, - "loss": 0.451, + "epoch": 10.727177537130418, + "grad_norm": 2.5921339988708496, + "learning_rate": 1.0508451235964731e-05, + "loss": 0.2749, "step": 778600 }, { - "epoch": 7.93, - "learning_rate": 2.8905897936215575e-05, - "loss": 0.4636, + "epoch": 10.728555289190158, + "grad_norm": 5.440725326538086, + "learning_rate": 1.0502149602434463e-05, + "loss": 0.2984, "step": 778700 }, { - "epoch": 7.93, - "learning_rate": 2.8899268777531535e-05, - "loss": 0.4531, + "epoch": 10.729933041249897, + "grad_norm": 3.4707186222076416, + "learning_rate": 1.0495849412958556e-05, + "loss": 0.2831, "step": 778800 }, { - "epoch": 7.94, - "learning_rate": 2.8892639672666673e-05, - "loss": 0.4069, + "epoch": 10.731310793309635, + "grad_norm": 0.2926226556301117, + "learning_rate": 1.0489550668072223e-05, + "loss": 0.287, "step": 778900 }, { - "epoch": 7.94, - "learning_rate": 2.8886010621945117e-05, - "loss": 0.5336, + "epoch": 10.732688545369376, + "grad_norm": 6.388592720031738, + "learning_rate": 1.0483253368310598e-05, + "loss": 0.3076, "step": 779000 }, { - "epoch": 7.94, - "learning_rate": 2.8879381625690995e-05, - "loss": 0.4668, + "epoch": 10.734066297429115, + "grad_norm": 1.4608204364776611, + "learning_rate": 1.0476957514208656e-05, + "loss": 0.2912, "step": 779100 }, { - "epoch": 7.94, - "learning_rate": 2.8872752684228432e-05, - "loss": 0.4883, + "epoch": 10.735444049488853, + "grad_norm": 12.02379322052002, + "learning_rate": 1.047066310630125e-05, + "loss": 0.2722, "step": 779200 }, { - "epoch": 7.94, - "learning_rate": 2.886612379788153e-05, - "loss": 0.406, + "epoch": 10.736821801548594, + "grad_norm": 9.220056533813477, + "learning_rate": 1.0464370145123138e-05, + "loss": 0.231, "step": 779300 }, { - "epoch": 7.94, - "learning_rate": 2.8859494966974406e-05, - "loss": 0.3491, + "epoch": 10.738199553608332, + "grad_norm": 2.7719051837921143, + "learning_rate": 1.0458078631208944e-05, + "loss": 0.2743, "step": 779400 }, { - "epoch": 7.94, - "learning_rate": 2.885286619183119e-05, - "loss": 0.5367, + "epoch": 10.739577305668073, + "grad_norm": 2.0675292015075684, + "learning_rate": 1.0451788565093148e-05, + "loss": 0.2602, "step": 779500 }, { - "epoch": 7.94, - "learning_rate": 2.8846237472775952e-05, - "loss": 0.4691, + "epoch": 10.740955057727811, + "grad_norm": 1.8196871280670166, + "learning_rate": 1.0445499947310138e-05, + "loss": 0.3196, "step": 779600 }, { - "epoch": 7.94, - "learning_rate": 2.883960881013282e-05, - "loss": 0.4947, + "epoch": 10.74233280978755, + "grad_norm": 1.4246639013290405, + "learning_rate": 1.0439212778394152e-05, + "loss": 0.2636, "step": 779700 }, { - "epoch": 7.94, - "learning_rate": 2.8832980204225897e-05, - "loss": 0.4024, + "epoch": 10.74371056184729, + "grad_norm": 0.8489853143692017, + "learning_rate": 1.0432927058879329e-05, + "loss": 0.2927, "step": 779800 }, { - "epoch": 7.95, - "learning_rate": 2.882635165537927e-05, - "loss": 0.4028, + "epoch": 10.74508831390703, + "grad_norm": 4.6068243980407715, + "learning_rate": 1.0426642789299657e-05, + "loss": 0.2591, "step": 779900 }, { - "epoch": 7.95, - "learning_rate": 2.8819723163917048e-05, - "loss": 0.4522, + "epoch": 10.746466065966768, + "grad_norm": 2.7164809703826904, + "learning_rate": 1.042035997018903e-05, + "loss": 0.317, "step": 780000 }, { - "epoch": 7.95, - "learning_rate": 2.881309473016333e-05, - "loss": 0.5615, + "epoch": 10.747843818026508, + "grad_norm": 0.564778208732605, + "learning_rate": 1.041407860208122e-05, + "loss": 0.2914, "step": 780100 }, { - "epoch": 7.95, - "learning_rate": 2.880646635444218e-05, - "loss": 0.3804, + "epoch": 10.749221570086247, + "grad_norm": 2.1687119007110596, + "learning_rate": 1.0407798685509824e-05, + "loss": 0.2432, "step": 780200 }, { - "epoch": 7.95, - "learning_rate": 2.8799838037077707e-05, - "loss": 0.443, + "epoch": 10.750599322145987, + "grad_norm": 3.0310451984405518, + "learning_rate": 1.0401520221008372e-05, + "loss": 0.3147, "step": 780300 }, { - "epoch": 7.95, - "learning_rate": 2.8793209778393997e-05, - "loss": 0.3814, + "epoch": 10.751977074205726, + "grad_norm": 1.4888153076171875, + "learning_rate": 1.0395243209110265e-05, + "loss": 0.2278, "step": 780400 }, { - "epoch": 7.95, - "learning_rate": 2.8786581578715117e-05, - "loss": 0.4789, + "epoch": 10.753354826265465, + "grad_norm": 2.801438570022583, + "learning_rate": 1.0388967650348742e-05, + "loss": 0.2947, "step": 780500 }, { - "epoch": 7.95, - "learning_rate": 2.877995343836516e-05, - "loss": 0.4762, + "epoch": 10.754732578325205, + "grad_norm": 5.014444828033447, + "learning_rate": 1.0382693545256959e-05, + "loss": 0.3015, "step": 780600 }, { - "epoch": 7.95, - "learning_rate": 2.87733253576682e-05, - "loss": 0.4461, + "epoch": 10.756110330384944, + "grad_norm": 2.237200975418091, + "learning_rate": 1.037642089436794e-05, + "loss": 0.2948, "step": 780700 }, { - "epoch": 7.95, - "learning_rate": 2.8766697336948323e-05, - "loss": 0.4569, + "epoch": 10.757488082444683, + "grad_norm": 1.9687060117721558, + "learning_rate": 1.0370212402973403e-05, + "loss": 0.1908, "step": 780800 }, { - "epoch": 7.96, - "learning_rate": 2.8760069376529572e-05, - "loss": 0.4403, + "epoch": 10.758865834504423, + "grad_norm": 0.17617133259773254, + "learning_rate": 1.0363942647533131e-05, + "loss": 0.2616, "step": 780900 }, { - "epoch": 7.96, - "learning_rate": 2.875344147673603e-05, - "loss": 0.4716, + "epoch": 10.760243586564162, + "grad_norm": 20.812875747680664, + "learning_rate": 1.0357674347888586e-05, + "loss": 0.2735, "step": 781000 }, { - "epoch": 7.96, - "learning_rate": 2.8746813637891764e-05, - "loss": 0.4344, + "epoch": 10.7616213386239, + "grad_norm": 3.0998921394348145, + "learning_rate": 1.0351470165794902e-05, + "loss": 0.2434, "step": 781100 }, { - "epoch": 7.96, - "learning_rate": 2.874018586032083e-05, - "loss": 0.3955, + "epoch": 10.76299909068364, + "grad_norm": 2.067453145980835, + "learning_rate": 1.0345204764768053e-05, + "loss": 0.3096, "step": 781200 }, { - "epoch": 7.96, - "learning_rate": 2.8733558144347286e-05, - "loss": 0.4326, + "epoch": 10.76437684274338, + "grad_norm": 2.0949366092681885, + "learning_rate": 1.0338940821128823e-05, + "loss": 0.3119, "step": 781300 }, { - "epoch": 7.96, - "learning_rate": 2.8726930490295206e-05, - "loss": 0.4348, + "epoch": 10.76575459480312, + "grad_norm": 2.4511618614196777, + "learning_rate": 1.0332678335409358e-05, + "loss": 0.2608, "step": 781400 }, { - "epoch": 7.96, - "learning_rate": 2.8720302898488612e-05, - "loss": 0.4184, + "epoch": 10.767132346862859, + "grad_norm": 4.615516185760498, + "learning_rate": 1.0326417308141697e-05, + "loss": 0.2034, "step": 781500 }, { - "epoch": 7.96, - "learning_rate": 2.8713675369251573e-05, - "loss": 0.428, + "epoch": 10.768510098922597, + "grad_norm": 2.0676310062408447, + "learning_rate": 1.0320157739857754e-05, + "loss": 0.2587, "step": 781600 }, { - "epoch": 7.96, - "learning_rate": 2.8707047902908136e-05, - "loss": 0.4951, + "epoch": 10.769887850982338, + "grad_norm": 1.017842173576355, + "learning_rate": 1.0313899631089299e-05, + "loss": 0.2753, "step": 781700 }, { - "epoch": 7.97, - "learning_rate": 2.8700486773499604e-05, - "loss": 0.4341, + "epoch": 10.771265603042076, + "grad_norm": 2.830345630645752, + "learning_rate": 1.0307642982367988e-05, + "loss": 0.2845, "step": 781800 }, { - "epoch": 7.97, - "learning_rate": 2.869385943327847e-05, - "loss": 0.4145, + "epoch": 10.772643355101817, + "grad_norm": 3.0426549911499023, + "learning_rate": 1.0301387794225368e-05, + "loss": 0.2454, "step": 781900 }, { - "epoch": 7.97, - "learning_rate": 2.868723215691981e-05, - "loss": 0.3761, + "epoch": 10.774021107161555, + "grad_norm": 1.7366609573364258, + "learning_rate": 1.0295134067192834e-05, + "loss": 0.2683, "step": 782000 }, { - "epoch": 7.97, - "learning_rate": 2.8680604944747663e-05, - "loss": 0.4414, + "epoch": 10.775398859221294, + "grad_norm": 3.190941333770752, + "learning_rate": 1.0288881801801681e-05, + "loss": 0.288, "step": 782100 }, { - "epoch": 7.97, - "learning_rate": 2.867397779708607e-05, - "loss": 0.397, + "epoch": 10.776776611281035, + "grad_norm": 5.287817001342773, + "learning_rate": 1.0282630998583082e-05, + "loss": 0.3225, "step": 782200 }, { - "epoch": 7.97, - "learning_rate": 2.866735071425904e-05, - "loss": 0.4612, + "epoch": 10.778154363340773, + "grad_norm": 0.6237702369689941, + "learning_rate": 1.027638165806806e-05, + "loss": 0.2933, "step": 782300 }, { - "epoch": 7.97, - "learning_rate": 2.866072369659061e-05, - "loss": 0.4474, + "epoch": 10.779532115400512, + "grad_norm": 1.593192219734192, + "learning_rate": 1.0270133780787522e-05, + "loss": 0.2773, "step": 782400 }, { - "epoch": 7.97, - "learning_rate": 2.8654096744404802e-05, - "loss": 0.4279, + "epoch": 10.780909867460252, + "grad_norm": 6.648743152618408, + "learning_rate": 1.0263887367272266e-05, + "loss": 0.3082, "step": 782500 }, { - "epoch": 7.97, - "learning_rate": 2.8647469858025624e-05, - "loss": 0.4784, + "epoch": 10.782287619519991, + "grad_norm": 2.956794500350952, + "learning_rate": 1.0257642418052969e-05, + "loss": 0.2647, "step": 782600 }, { - "epoch": 7.97, - "learning_rate": 2.86408430377771e-05, - "loss": 0.459, + "epoch": 10.78366537157973, + "grad_norm": 3.487407684326172, + "learning_rate": 1.025139893366015e-05, + "loss": 0.2942, "step": 782700 }, { - "epoch": 7.98, - "learning_rate": 2.8634216283983254e-05, - "loss": 0.3945, + "epoch": 10.78504312363947, + "grad_norm": 8.754105567932129, + "learning_rate": 1.0245156914624236e-05, + "loss": 0.3385, "step": 782800 }, { - "epoch": 7.98, - "learning_rate": 2.8627589596968066e-05, - "loss": 0.5079, + "epoch": 10.786420875699209, + "grad_norm": 0.9119150638580322, + "learning_rate": 1.0238916361475533e-05, + "loss": 0.2473, "step": 782900 }, { - "epoch": 7.98, - "learning_rate": 2.862096297705556e-05, - "loss": 0.4698, + "epoch": 10.78779862775895, + "grad_norm": 5.8906569480896, + "learning_rate": 1.0232677274744178e-05, + "loss": 0.2508, "step": 783000 }, { - "epoch": 7.98, - "learning_rate": 2.861433642456973e-05, - "loss": 0.409, + "epoch": 10.789176379818688, + "grad_norm": 4.298729419708252, + "learning_rate": 1.0226439654960225e-05, + "loss": 0.264, "step": 783100 }, { - "epoch": 7.98, - "learning_rate": 2.8607709939834587e-05, - "loss": 0.4775, + "epoch": 10.790554131878427, + "grad_norm": 2.2790465354919434, + "learning_rate": 1.0220203502653602e-05, + "loss": 0.2989, "step": 783200 }, { - "epoch": 7.98, - "learning_rate": 2.8601083523174115e-05, - "loss": 0.466, + "epoch": 10.791931883938167, + "grad_norm": 5.4851861000061035, + "learning_rate": 1.0213968818354087e-05, + "loss": 0.2796, "step": 783300 }, { - "epoch": 7.98, - "learning_rate": 2.8594457174912308e-05, - "loss": 0.4365, + "epoch": 10.793309635997906, + "grad_norm": 1.391113519668579, + "learning_rate": 1.0207735602591366e-05, + "loss": 0.2437, "step": 783400 }, { - "epoch": 7.98, - "learning_rate": 2.8587830895373175e-05, - "loss": 0.3488, + "epoch": 10.794687388057644, + "grad_norm": 6.2586870193481445, + "learning_rate": 1.0201503855894959e-05, + "loss": 0.2422, "step": 783500 }, { - "epoch": 7.98, - "learning_rate": 2.8581204684880667e-05, - "loss": 0.4743, + "epoch": 10.796065140117385, + "grad_norm": 2.0227832794189453, + "learning_rate": 1.0195273578794311e-05, + "loss": 0.286, "step": 783600 }, { - "epoch": 7.98, - "learning_rate": 2.8574578543758786e-05, - "loss": 0.4358, + "epoch": 10.797442892177123, + "grad_norm": 1.4082139730453491, + "learning_rate": 1.0189044771818692e-05, + "loss": 0.2485, "step": 783700 }, { - "epoch": 7.99, - "learning_rate": 2.856795247233151e-05, - "loss": 0.4459, + "epoch": 10.798820644236864, + "grad_norm": 2.4410436153411865, + "learning_rate": 1.018281743549728e-05, + "loss": 0.2863, "step": 783800 }, { - "epoch": 7.99, - "learning_rate": 2.856132647092281e-05, - "loss": 0.3703, + "epoch": 10.800198396296603, + "grad_norm": 4.288768291473389, + "learning_rate": 1.0176591570359134e-05, + "loss": 0.2856, "step": 783900 }, { - "epoch": 7.99, - "learning_rate": 2.855470053985666e-05, - "loss": 0.3864, + "epoch": 10.801576148356341, + "grad_norm": 3.8041419982910156, + "learning_rate": 1.0170367176933156e-05, + "loss": 0.3127, "step": 784000 }, { - "epoch": 7.99, - "learning_rate": 2.8548074679457037e-05, - "loss": 0.3994, + "epoch": 10.802953900416082, + "grad_norm": 6.286481857299805, + "learning_rate": 1.016414425574814e-05, + "loss": 0.2277, "step": 784100 }, { - "epoch": 7.99, - "learning_rate": 2.8541448890047885e-05, - "loss": 0.433, + "epoch": 10.80433165247582, + "grad_norm": 2.96346116065979, + "learning_rate": 1.0157922807332765e-05, + "loss": 0.286, "step": 784200 }, { - "epoch": 7.99, - "learning_rate": 2.853482317195318e-05, - "loss": 0.4733, + "epoch": 10.805709404535559, + "grad_norm": 12.562845230102539, + "learning_rate": 1.015170283221556e-05, + "loss": 0.3062, "step": 784300 }, { - "epoch": 7.99, - "learning_rate": 2.8528197525496887e-05, - "loss": 0.4019, + "epoch": 10.8070871565953, + "grad_norm": 1.5047742128372192, + "learning_rate": 1.0145484330924953e-05, + "loss": 0.2456, "step": 784400 }, { - "epoch": 7.99, - "learning_rate": 2.852157195100294e-05, - "loss": 0.5195, + "epoch": 10.808464908655038, + "grad_norm": 2.896204948425293, + "learning_rate": 1.0139267303989247e-05, + "loss": 0.2627, "step": 784500 }, { - "epoch": 7.99, - "learning_rate": 2.8514946448795304e-05, - "loss": 0.3601, + "epoch": 10.809842660714779, + "grad_norm": 3.08923602104187, + "learning_rate": 1.01330517519366e-05, + "loss": 0.2397, "step": 784600 }, { - "epoch": 7.99, - "learning_rate": 2.850832101919794e-05, - "loss": 0.488, + "epoch": 10.811220412774517, + "grad_norm": 1.4122380018234253, + "learning_rate": 1.0126837675295044e-05, + "loss": 0.2702, "step": 784700 }, { - "epoch": 8.0, - "learning_rate": 2.850169566253476e-05, - "loss": 0.3672, + "epoch": 10.812598164834256, + "grad_norm": 2.1778883934020996, + "learning_rate": 1.0120625074592509e-05, + "loss": 0.2598, "step": 784800 }, { - "epoch": 8.0, - "learning_rate": 2.849507037912972e-05, - "loss": 0.3626, + "epoch": 10.813975916893996, + "grad_norm": 2.1174709796905518, + "learning_rate": 1.0114413950356797e-05, + "loss": 0.2657, "step": 784900 }, { - "epoch": 8.0, - "learning_rate": 2.848844516930676e-05, - "loss": 0.3349, + "epoch": 10.815353668953735, + "grad_norm": 1.0692293643951416, + "learning_rate": 1.0108204303115555e-05, + "loss": 0.2591, "step": 785000 }, { - "epoch": 8.0, - "learning_rate": 2.8481820033389815e-05, - "loss": 0.4162, + "epoch": 10.816731421013474, + "grad_norm": 6.256170272827148, + "learning_rate": 1.010199613339634e-05, + "loss": 0.2901, "step": 785100 }, { - "epoch": 8.0, - "learning_rate": 2.847519497170281e-05, - "loss": 0.4089, + "epoch": 10.818109173073214, + "grad_norm": 4.349870681762695, + "learning_rate": 1.0095789441726566e-05, + "loss": 0.2875, "step": 785200 }, { - "epoch": 8.0, - "learning_rate": 2.8468569984569667e-05, - "loss": 0.3878, + "epoch": 10.819486925132953, + "grad_norm": 2.953732967376709, + "learning_rate": 1.0089584228633505e-05, + "loss": 0.2792, "step": 785300 }, { - "epoch": 8.0, - "learning_rate": 2.8461945072314322e-05, - "loss": 0.3886, + "epoch": 10.820864677192692, + "grad_norm": 0.0522027425467968, + "learning_rate": 1.0083380494644337e-05, + "loss": 0.2931, "step": 785400 }, { - "epoch": 8.0, - "learning_rate": 2.8455320235260677e-05, - "loss": 0.4415, + "epoch": 10.822242429252432, + "grad_norm": 2.148622989654541, + "learning_rate": 1.0077178240286104e-05, + "loss": 0.3317, "step": 785500 }, { - "epoch": 8.0, - "learning_rate": 2.8448695473732653e-05, - "loss": 0.4555, + "epoch": 10.82362018131217, + "grad_norm": 4.769834041595459, + "learning_rate": 1.0070977466085716e-05, + "loss": 0.2721, "step": 785600 }, { - "epoch": 8.0, - "learning_rate": 2.8442070788054166e-05, - "loss": 0.3589, + "epoch": 10.824997933371911, + "grad_norm": 2.5272176265716553, + "learning_rate": 1.0064778172569951e-05, + "loss": 0.2646, "step": 785700 }, { - "epoch": 8.01, - "learning_rate": 2.8435446178549115e-05, - "loss": 0.3716, + "epoch": 10.82637568543165, + "grad_norm": 2.9587318897247314, + "learning_rate": 1.0058580360265478e-05, + "loss": 0.2996, "step": 785800 }, { - "epoch": 8.01, - "learning_rate": 2.8428821645541413e-05, - "loss": 0.3017, + "epoch": 10.827753437491388, + "grad_norm": 1.2316334247589111, + "learning_rate": 1.0052384029698842e-05, + "loss": 0.2998, "step": 785900 }, { - "epoch": 8.01, - "learning_rate": 2.8422263433535494e-05, - "loss": 0.3841, + "epoch": 10.829131189551129, + "grad_norm": 6.3303542137146, + "learning_rate": 1.0046189181396433e-05, + "loss": 0.2833, "step": 786000 }, { - "epoch": 8.01, - "learning_rate": 2.8415639053721123e-05, - "loss": 0.3499, + "epoch": 10.830508941610868, + "grad_norm": 1.4202358722686768, + "learning_rate": 1.0039995815884547e-05, + "loss": 0.2599, "step": 786100 }, { - "epoch": 8.01, - "learning_rate": 2.840901475137255e-05, - "loss": 0.3639, + "epoch": 10.831886693670608, + "grad_norm": 6.6583147048950195, + "learning_rate": 1.0033803933689363e-05, + "loss": 0.2541, "step": 786200 }, { - "epoch": 8.01, - "learning_rate": 2.8402390526813656e-05, - "loss": 0.3889, + "epoch": 10.833264445730347, + "grad_norm": 1.2129735946655273, + "learning_rate": 1.0027613535336868e-05, + "loss": 0.2253, "step": 786300 }, { - "epoch": 8.01, - "learning_rate": 2.8395766380368335e-05, - "loss": 0.3633, + "epoch": 10.834642197790085, + "grad_norm": 2.8143773078918457, + "learning_rate": 1.0021424621352993e-05, + "loss": 0.2644, "step": 786400 }, { - "epoch": 8.01, - "learning_rate": 2.8389142312360473e-05, - "loss": 0.405, + "epoch": 10.836019949849826, + "grad_norm": 2.1632111072540283, + "learning_rate": 1.0015237192263528e-05, + "loss": 0.296, "step": 786500 }, { - "epoch": 8.01, - "learning_rate": 2.8382518323113925e-05, - "loss": 0.4351, + "epoch": 10.837397701909564, + "grad_norm": 1.8440625667572021, + "learning_rate": 1.0009051248594102e-05, + "loss": 0.2441, "step": 786600 }, { - "epoch": 8.02, - "learning_rate": 2.837589441295258e-05, - "loss": 0.3936, + "epoch": 10.838775453969303, + "grad_norm": 4.708512306213379, + "learning_rate": 1.0002866790870267e-05, + "loss": 0.2372, "step": 786700 }, { - "epoch": 8.02, - "learning_rate": 2.83693368201137e-05, - "loss": 0.4673, + "epoch": 10.840153206029044, + "grad_norm": 1.7876691818237305, + "learning_rate": 9.996683819617402e-06, + "loss": 0.2759, "step": 786800 }, { - "epoch": 8.02, - "learning_rate": 2.836271306829543e-05, - "loss": 0.3995, + "epoch": 10.841530958088782, + "grad_norm": 3.8953144550323486, + "learning_rate": 9.990564142841016e-06, + "loss": 0.2666, "step": 786900 }, { - "epoch": 8.02, - "learning_rate": 2.8356089396530727e-05, - "loss": 0.3968, + "epoch": 10.84290871014852, + "grad_norm": 30.061588287353516, + "learning_rate": 9.984384131227999e-06, + "loss": 0.2214, "step": 787000 }, { - "epoch": 8.02, - "learning_rate": 2.8349465805143435e-05, - "loss": 0.367, + "epoch": 10.844286462208261, + "grad_norm": 0.29400742053985596, + "learning_rate": 9.978205607656158e-06, + "loss": 0.2554, "step": 787100 }, { - "epoch": 8.02, - "learning_rate": 2.834284229445742e-05, - "loss": 0.3448, + "epoch": 10.845664214268, + "grad_norm": 1.9202631711959839, + "learning_rate": 9.972028572650404e-06, + "loss": 0.3088, "step": 787200 }, { - "epoch": 8.02, - "learning_rate": 2.8336218864796537e-05, - "loss": 0.3847, + "epoch": 10.84704196632774, + "grad_norm": 4.585844993591309, + "learning_rate": 9.965853026735492e-06, + "loss": 0.3102, "step": 787300 }, { - "epoch": 8.02, - "learning_rate": 2.8329595516484608e-05, - "loss": 0.3208, + "epoch": 10.848419718387479, + "grad_norm": 2.103652238845825, + "learning_rate": 9.959678970436065e-06, + "loss": 0.262, "step": 787400 }, { - "epoch": 8.02, - "learning_rate": 2.832297224984549e-05, - "loss": 0.3983, + "epoch": 10.849797470447218, + "grad_norm": 3.472585916519165, + "learning_rate": 9.953506404276639e-06, + "loss": 0.2783, "step": 787500 }, { - "epoch": 8.02, - "learning_rate": 2.831634906520303e-05, - "loss": 0.4542, + "epoch": 10.851175222506958, + "grad_norm": 3.5521347522735596, + "learning_rate": 9.947335328781625e-06, + "loss": 0.2862, "step": 787600 }, { - "epoch": 8.03, - "learning_rate": 2.8309725962881047e-05, - "loss": 0.3175, + "epoch": 10.852552974566697, + "grad_norm": 4.6136250495910645, + "learning_rate": 9.941165744475263e-06, + "loss": 0.2973, "step": 787700 }, { - "epoch": 8.03, - "learning_rate": 2.8303102943203373e-05, - "loss": 0.4236, + "epoch": 10.853930726626436, + "grad_norm": 2.368006944656372, + "learning_rate": 9.934997651881709e-06, + "loss": 0.3062, "step": 787800 }, { - "epoch": 8.03, - "learning_rate": 2.829648000649384e-05, - "loss": 0.4052, + "epoch": 10.855308478686176, + "grad_norm": 3.4752609729766846, + "learning_rate": 9.928831051524967e-06, + "loss": 0.2935, "step": 787900 }, { - "epoch": 8.03, - "learning_rate": 2.8289857153076286e-05, - "loss": 0.4774, + "epoch": 10.856686230745915, + "grad_norm": 3.9899001121520996, + "learning_rate": 9.922665943928915e-06, + "loss": 0.2509, "step": 788000 }, { - "epoch": 8.03, - "learning_rate": 2.8283234383274494e-05, - "loss": 0.3554, + "epoch": 10.858063982805655, + "grad_norm": 5.404050350189209, + "learning_rate": 9.916502329617313e-06, + "loss": 0.2766, "step": 788100 }, { - "epoch": 8.03, - "learning_rate": 2.82766116974123e-05, - "loss": 0.4006, + "epoch": 10.859441734865394, + "grad_norm": 2.512906789779663, + "learning_rate": 9.910340209113805e-06, + "loss": 0.2563, "step": 788200 }, { - "epoch": 8.03, - "learning_rate": 2.826998909581351e-05, - "loss": 0.4178, + "epoch": 10.860819486925132, + "grad_norm": 2.6941845417022705, + "learning_rate": 9.904179582941874e-06, + "loss": 0.2486, "step": 788300 }, { - "epoch": 8.03, - "learning_rate": 2.8263366578801924e-05, - "loss": 0.3635, + "epoch": 10.862197238984873, + "grad_norm": 1.6795685291290283, + "learning_rate": 9.898020451624915e-06, + "loss": 0.3124, "step": 788400 }, { - "epoch": 8.03, - "learning_rate": 2.8256744146701353e-05, - "loss": 0.3836, + "epoch": 10.863574991044612, + "grad_norm": 0.1793992817401886, + "learning_rate": 9.891862815686164e-06, + "loss": 0.3055, "step": 788500 }, { - "epoch": 8.03, - "learning_rate": 2.82501217998356e-05, - "loss": 0.3945, + "epoch": 10.86495274310435, + "grad_norm": 3.1668787002563477, + "learning_rate": 9.885706675648757e-06, + "loss": 0.2696, "step": 788600 }, { - "epoch": 8.04, - "learning_rate": 2.8243499538528436e-05, - "loss": 0.3058, + "epoch": 10.86633049516409, + "grad_norm": 3.6055996417999268, + "learning_rate": 9.87955203203567e-06, + "loss": 0.2773, "step": 788700 }, { - "epoch": 8.04, - "learning_rate": 2.823687736310366e-05, - "loss": 0.3898, + "epoch": 10.86770824722383, + "grad_norm": 2.612092971801758, + "learning_rate": 9.873398885369788e-06, + "loss": 0.2717, "step": 788800 }, { - "epoch": 8.04, - "learning_rate": 2.823025527388507e-05, - "loss": 0.3929, + "epoch": 10.86908599928357, + "grad_norm": 2.8825790882110596, + "learning_rate": 9.867247236173865e-06, + "loss": 0.2737, "step": 788900 }, { - "epoch": 8.04, - "learning_rate": 2.8223633271196425e-05, - "loss": 0.4114, + "epoch": 10.870463751343308, + "grad_norm": 1.5203964710235596, + "learning_rate": 9.861097084970477e-06, + "loss": 0.3036, "step": 789000 }, { - "epoch": 8.04, - "learning_rate": 2.8217011355361514e-05, - "loss": 0.3514, + "epoch": 10.871841503403047, + "grad_norm": 2.303419351577759, + "learning_rate": 9.854948432282134e-06, + "loss": 0.2877, "step": 789100 }, { - "epoch": 8.04, - "learning_rate": 2.821038952670412e-05, - "loss": 0.366, + "epoch": 10.873219255462788, + "grad_norm": 2.1686339378356934, + "learning_rate": 9.848801278631202e-06, + "loss": 0.3271, "step": 789200 }, { - "epoch": 8.04, - "learning_rate": 2.8203767785547983e-05, - "loss": 0.3841, + "epoch": 10.874597007522526, + "grad_norm": 3.583930730819702, + "learning_rate": 9.842655624539894e-06, + "loss": 0.3394, "step": 789300 }, { - "epoch": 8.04, - "learning_rate": 2.819721234831441e-05, - "loss": 0.4348, + "epoch": 10.875974759582265, + "grad_norm": 5.633982181549072, + "learning_rate": 9.836511470530327e-06, + "loss": 0.3057, "step": 789400 }, { - "epoch": 8.04, - "learning_rate": 2.8190590782249008e-05, - "loss": 0.4019, + "epoch": 10.877352511642005, + "grad_norm": 1.0355987548828125, + "learning_rate": 9.830368817124482e-06, + "loss": 0.2814, "step": 789500 }, { - "epoch": 8.04, - "learning_rate": 2.8183969304652928e-05, - "loss": 0.4502, + "epoch": 10.878730263701744, + "grad_norm": 4.735191822052002, + "learning_rate": 9.82428906893472e-06, + "loss": 0.2501, "step": 789600 }, { - "epoch": 8.05, - "learning_rate": 2.8177347915849923e-05, - "loss": 0.4055, + "epoch": 10.880108015761483, + "grad_norm": 3.952105760574341, + "learning_rate": 9.818149403282677e-06, + "loss": 0.2682, "step": 789700 }, { - "epoch": 8.05, - "learning_rate": 2.8170726616163718e-05, - "loss": 0.4407, + "epoch": 10.881485767821223, + "grad_norm": 3.350972890853882, + "learning_rate": 9.812011239794292e-06, + "loss": 0.2955, "step": 789800 }, { - "epoch": 8.05, - "learning_rate": 2.816410540591806e-05, - "loss": 0.333, + "epoch": 10.882863519880962, + "grad_norm": 2.655555248260498, + "learning_rate": 9.805874578991054e-06, + "loss": 0.2557, "step": 789900 }, { - "epoch": 8.05, - "learning_rate": 2.8157484285436705e-05, - "loss": 0.3544, + "epoch": 10.884241271940702, + "grad_norm": 1.6831485033035278, + "learning_rate": 9.79973942139429e-06, + "loss": 0.2866, "step": 790000 }, { - "epoch": 8.05, - "learning_rate": 2.8150863255043356e-05, - "loss": 0.3477, + "epoch": 10.885619024000441, + "grad_norm": 0.10608773678541183, + "learning_rate": 9.793605767525213e-06, + "loss": 0.2722, "step": 790100 }, { - "epoch": 8.05, - "learning_rate": 2.8144242315061764e-05, - "loss": 0.502, + "epoch": 10.88699677606018, + "grad_norm": 0.6877732872962952, + "learning_rate": 9.787473617904908e-06, + "loss": 0.3316, "step": 790200 }, { - "epoch": 8.05, - "learning_rate": 2.8137621465815653e-05, - "loss": 0.3864, + "epoch": 10.88837452811992, + "grad_norm": 28.814584732055664, + "learning_rate": 9.781342973054344e-06, + "loss": 0.2958, "step": 790300 }, { - "epoch": 8.05, - "learning_rate": 2.8131000707628745e-05, - "loss": 0.442, + "epoch": 10.889752280179659, + "grad_norm": 1.559552550315857, + "learning_rate": 9.775213833494333e-06, + "loss": 0.2552, "step": 790400 }, { - "epoch": 8.05, - "learning_rate": 2.8124380040824738e-05, - "loss": 0.3383, + "epoch": 10.8911300322394, + "grad_norm": 1.3686186075210571, + "learning_rate": 9.769086199745588e-06, + "loss": 0.3365, "step": 790500 }, { - "epoch": 8.05, - "learning_rate": 2.8117759465727352e-05, - "loss": 0.3649, + "epoch": 10.892507784299138, + "grad_norm": 2.9944419860839844, + "learning_rate": 9.762960072328687e-06, + "loss": 0.2485, "step": 790600 }, { - "epoch": 8.06, - "learning_rate": 2.8111138982660306e-05, - "loss": 0.3887, + "epoch": 10.893885536358876, + "grad_norm": 1.8911799192428589, + "learning_rate": 9.756835451764074e-06, + "loss": 0.2793, "step": 790700 }, { - "epoch": 8.06, - "learning_rate": 2.8104518591947282e-05, - "loss": 0.3882, + "epoch": 10.895263288418617, + "grad_norm": 3.6825525760650635, + "learning_rate": 9.750712338572053e-06, + "loss": 0.3408, "step": 790800 }, { - "epoch": 8.06, - "learning_rate": 2.8097898293911988e-05, - "loss": 0.4001, + "epoch": 10.896641040478356, + "grad_norm": 2.4583678245544434, + "learning_rate": 9.744590733272832e-06, + "loss": 0.2775, "step": 790900 }, { - "epoch": 8.06, - "learning_rate": 2.809127808887813e-05, - "loss": 0.3981, + "epoch": 10.898018792538094, + "grad_norm": 5.033134937286377, + "learning_rate": 9.738470636386454e-06, + "loss": 0.2926, "step": 791000 }, { - "epoch": 8.06, - "learning_rate": 2.808465797716937e-05, - "loss": 0.31, + "epoch": 10.899396544597835, + "grad_norm": 3.261211633682251, + "learning_rate": 9.732352048432874e-06, + "loss": 0.3007, "step": 791100 }, { - "epoch": 8.06, - "learning_rate": 2.80780379591094e-05, - "loss": 0.3819, + "epoch": 10.900774296657573, + "grad_norm": 1.8147833347320557, + "learning_rate": 9.726234969931876e-06, + "loss": 0.2786, "step": 791200 }, { - "epoch": 8.06, - "learning_rate": 2.8071418035021925e-05, - "loss": 0.4158, + "epoch": 10.902152048717312, + "grad_norm": 2.4565181732177734, + "learning_rate": 9.720119401403155e-06, + "loss": 0.2951, "step": 791300 }, { - "epoch": 8.06, - "learning_rate": 2.8064798205230575e-05, - "loss": 0.3727, + "epoch": 10.903529800777052, + "grad_norm": 0.3112247884273529, + "learning_rate": 9.714005343366244e-06, + "loss": 0.2408, "step": 791400 }, { - "epoch": 8.06, - "learning_rate": 2.805817847005905e-05, - "loss": 0.4263, + "epoch": 10.904907552836791, + "grad_norm": 4.672830581665039, + "learning_rate": 9.707892796340574e-06, + "loss": 0.2718, "step": 791500 }, { - "epoch": 8.06, - "learning_rate": 2.8051558829831025e-05, - "loss": 0.4474, + "epoch": 10.906285304896532, + "grad_norm": 4.682251930236816, + "learning_rate": 9.70178176084544e-06, + "loss": 0.2391, "step": 791600 }, { - "epoch": 8.07, - "learning_rate": 2.804493928487013e-05, - "loss": 0.3959, + "epoch": 10.90766305695627, + "grad_norm": 1.8366261720657349, + "learning_rate": 9.695672237400002e-06, + "loss": 0.2814, "step": 791700 }, { - "epoch": 8.07, - "learning_rate": 2.803831983550004e-05, - "loss": 0.3244, + "epoch": 10.909040809016009, + "grad_norm": 1.409472942352295, + "learning_rate": 9.689564226523286e-06, + "loss": 0.2598, "step": 791800 }, { - "epoch": 8.07, - "learning_rate": 2.8031700482044412e-05, - "loss": 0.398, + "epoch": 10.91041856107575, + "grad_norm": 2.2234275341033936, + "learning_rate": 9.683457728734215e-06, + "loss": 0.2685, "step": 791900 }, { - "epoch": 8.07, - "learning_rate": 2.8025081224826866e-05, - "loss": 0.3517, + "epoch": 10.911796313135488, + "grad_norm": 0.5442966222763062, + "learning_rate": 9.67735274455155e-06, + "loss": 0.2757, "step": 792000 }, { - "epoch": 8.07, - "learning_rate": 2.801846206417107e-05, - "loss": 0.398, + "epoch": 10.913174065195227, + "grad_norm": 4.018232822418213, + "learning_rate": 9.671249274493948e-06, + "loss": 0.3243, "step": 792100 }, { - "epoch": 8.07, - "learning_rate": 2.8011843000400656e-05, - "loss": 0.3343, + "epoch": 10.914551817254967, + "grad_norm": 0.061273328959941864, + "learning_rate": 9.665147319079942e-06, + "loss": 0.3161, "step": 792200 }, { - "epoch": 8.07, - "learning_rate": 2.8005224033839264e-05, - "loss": 0.3486, + "epoch": 10.915929569314706, + "grad_norm": 2.9252161979675293, + "learning_rate": 9.659046878827912e-06, + "loss": 0.2236, "step": 792300 }, { - "epoch": 8.07, - "learning_rate": 2.79986051648105e-05, - "loss": 0.3729, + "epoch": 10.917307321374446, + "grad_norm": 1.966133952140808, + "learning_rate": 9.652947954256116e-06, + "loss": 0.2435, "step": 792400 }, { - "epoch": 8.07, - "learning_rate": 2.7992052580864283e-05, - "loss": 0.3993, + "epoch": 10.918685073434185, + "grad_norm": 0.2597993314266205, + "learning_rate": 9.646850545882694e-06, + "loss": 0.2444, "step": 792500 }, { - "epoch": 8.08, - "learning_rate": 2.7985433906888265e-05, - "loss": 0.4252, + "epoch": 10.920062825493924, + "grad_norm": 3.497936487197876, + "learning_rate": 9.640754654225667e-06, + "loss": 0.2782, "step": 792600 }, { - "epoch": 8.08, - "learning_rate": 2.7978815331412508e-05, - "loss": 0.4147, + "epoch": 10.921440577553664, + "grad_norm": 0.7135214805603027, + "learning_rate": 9.634660279802889e-06, + "loss": 0.3057, "step": 792700 }, { - "epoch": 8.08, - "learning_rate": 2.7972196854760624e-05, - "loss": 0.3548, + "epoch": 10.922818329613403, + "grad_norm": 2.791632890701294, + "learning_rate": 9.628567423132123e-06, + "loss": 0.2883, "step": 792800 }, { - "epoch": 8.08, - "learning_rate": 2.7965578477256206e-05, - "loss": 0.4039, + "epoch": 10.924196081673141, + "grad_norm": 0.2594141364097595, + "learning_rate": 9.622536990597859e-06, + "loss": 0.3237, "step": 792900 }, { - "epoch": 8.08, - "learning_rate": 2.7958960199222862e-05, - "loss": 0.3318, + "epoch": 10.925573833732882, + "grad_norm": 0.4906339943408966, + "learning_rate": 9.61644715579341e-06, + "loss": 0.274, "step": 793000 }, { - "epoch": 8.08, - "learning_rate": 2.7952342020984193e-05, - "loss": 0.3849, + "epoch": 10.92695158579262, + "grad_norm": 5.79037618637085, + "learning_rate": 9.610358840288256e-06, + "loss": 0.3077, "step": 793100 }, { - "epoch": 8.08, - "learning_rate": 2.794572394286377e-05, - "loss": 0.3709, + "epoch": 10.928329337852361, + "grad_norm": 1.7678356170654297, + "learning_rate": 9.604272044599641e-06, + "loss": 0.2919, "step": 793200 }, { - "epoch": 8.08, - "learning_rate": 2.793910596518519e-05, - "loss": 0.3574, + "epoch": 10.9297070899121, + "grad_norm": 1.488146185874939, + "learning_rate": 9.598186769244681e-06, + "loss": 0.2493, "step": 793300 }, { - "epoch": 8.08, - "learning_rate": 2.793248808827203e-05, - "loss": 0.3248, + "epoch": 10.931084841971838, + "grad_norm": 3.3995895385742188, + "learning_rate": 9.592163844755469e-06, + "loss": 0.289, "step": 793400 }, { - "epoch": 8.08, - "learning_rate": 2.792587031244786e-05, - "loss": 0.4135, + "epoch": 10.932462594031579, + "grad_norm": 2.4592740535736084, + "learning_rate": 9.586081596402357e-06, + "loss": 0.2593, "step": 793500 }, { - "epoch": 8.09, - "learning_rate": 2.7919318814277317e-05, - "loss": 0.3915, + "epoch": 10.933840346091317, + "grad_norm": 2.7111661434173584, + "learning_rate": 9.580000869928254e-06, + "loss": 0.2668, "step": 793600 }, { - "epoch": 8.09, - "learning_rate": 2.7912701240582877e-05, - "loss": 0.4585, + "epoch": 10.935218098151056, + "grad_norm": 0.9519830942153931, + "learning_rate": 9.573921665849765e-06, + "loss": 0.2518, "step": 793700 }, { - "epoch": 8.09, - "learning_rate": 2.790608376894489e-05, - "loss": 0.3571, + "epoch": 10.936595850210796, + "grad_norm": 1.3351515531539917, + "learning_rate": 9.567843984683329e-06, + "loss": 0.2927, "step": 793800 }, { - "epoch": 8.09, - "learning_rate": 2.7899466399686923e-05, - "loss": 0.3812, + "epoch": 10.937973602270535, + "grad_norm": 1.2917457818984985, + "learning_rate": 9.561767826945295e-06, + "loss": 0.3068, "step": 793900 }, { - "epoch": 8.09, - "learning_rate": 2.78928491331325e-05, - "loss": 0.4111, + "epoch": 10.939351354330274, + "grad_norm": 5.182137489318848, + "learning_rate": 9.555693193151848e-06, + "loss": 0.2685, "step": 794000 }, { - "epoch": 8.09, - "learning_rate": 2.7886231969605183e-05, - "loss": 0.3425, + "epoch": 10.940729106390014, + "grad_norm": 2.707472324371338, + "learning_rate": 9.549620083819076e-06, + "loss": 0.3139, "step": 794100 }, { - "epoch": 8.09, - "learning_rate": 2.787961490942851e-05, - "loss": 0.354, + "epoch": 10.942106858449753, + "grad_norm": 2.3209433555603027, + "learning_rate": 9.5435484994629e-06, + "loss": 0.3056, "step": 794200 }, { - "epoch": 8.09, - "learning_rate": 2.7872997952926012e-05, - "loss": 0.4097, + "epoch": 10.943484610509493, + "grad_norm": 1.4986072778701782, + "learning_rate": 9.537478440599146e-06, + "loss": 0.2547, "step": 794300 }, { - "epoch": 8.09, - "learning_rate": 2.7866381100421217e-05, - "loss": 0.415, + "epoch": 10.944862362569232, + "grad_norm": 1.9973628520965576, + "learning_rate": 9.531409907743513e-06, + "loss": 0.252, "step": 794400 }, { - "epoch": 8.09, - "learning_rate": 2.785976435223767e-05, - "loss": 0.3637, + "epoch": 10.94624011462897, + "grad_norm": 2.1416170597076416, + "learning_rate": 9.525342901411515e-06, + "loss": 0.2691, "step": 794500 }, { - "epoch": 8.1, - "learning_rate": 2.7853147708698855e-05, - "loss": 0.3777, + "epoch": 10.947617866688711, + "grad_norm": 1.4314957857131958, + "learning_rate": 9.519277422118596e-06, + "loss": 0.264, "step": 794600 }, { - "epoch": 8.1, - "learning_rate": 2.7846531170128312e-05, - "loss": 0.4248, + "epoch": 10.94899561874845, + "grad_norm": 2.565685749053955, + "learning_rate": 9.513213470380055e-06, + "loss": 0.3419, "step": 794700 }, { - "epoch": 8.1, - "learning_rate": 2.783991473684955e-05, - "loss": 0.4323, + "epoch": 10.95037337080819, + "grad_norm": 0.5604676008224487, + "learning_rate": 9.507151046711043e-06, + "loss": 0.2747, "step": 794800 }, { - "epoch": 8.1, - "learning_rate": 2.7833298409186053e-05, - "loss": 0.354, + "epoch": 10.951751122867929, + "grad_norm": 1.9582836627960205, + "learning_rate": 9.501090151626603e-06, + "loss": 0.2715, "step": 794900 }, { - "epoch": 8.1, - "learning_rate": 2.782668218746134e-05, - "loss": 0.4063, + "epoch": 10.953128874927668, + "grad_norm": 1.1105514764785767, + "learning_rate": 9.495030785641641e-06, + "loss": 0.2651, "step": 795000 }, { - "epoch": 8.1, - "learning_rate": 2.7820066071998897e-05, - "loss": 0.371, + "epoch": 10.954506626987408, + "grad_norm": 1.4833425283432007, + "learning_rate": 9.488972949270929e-06, + "loss": 0.2687, "step": 795100 }, { - "epoch": 8.1, - "learning_rate": 2.7813450063122224e-05, - "loss": 0.3024, + "epoch": 10.955884379047147, + "grad_norm": 2.4590578079223633, + "learning_rate": 9.4829166430291e-06, + "loss": 0.2663, "step": 795200 }, { - "epoch": 8.1, - "learning_rate": 2.7806834161154787e-05, - "loss": 0.4223, + "epoch": 10.957262131106885, + "grad_norm": 1.5963034629821777, + "learning_rate": 9.476861867430675e-06, + "loss": 0.2504, "step": 795300 }, { - "epoch": 8.1, - "learning_rate": 2.7800218366420068e-05, - "loss": 0.4384, + "epoch": 10.958639883166626, + "grad_norm": 2.6099188327789307, + "learning_rate": 9.470808622990049e-06, + "loss": 0.2456, "step": 795400 }, { - "epoch": 8.1, - "learning_rate": 2.779360267924155e-05, - "loss": 0.405, + "epoch": 10.960017635226365, + "grad_norm": 3.3358867168426514, + "learning_rate": 9.464756910221469e-06, + "loss": 0.3019, "step": 795500 }, { - "epoch": 8.11, - "learning_rate": 2.7786987099942686e-05, - "loss": 0.4108, + "epoch": 10.961395387286103, + "grad_norm": 1.5220539569854736, + "learning_rate": 9.458706729639049e-06, + "loss": 0.286, "step": 795600 }, { - "epoch": 8.11, - "learning_rate": 2.7780371628846947e-05, - "loss": 0.3609, + "epoch": 10.962773139345844, + "grad_norm": 2.2715752124786377, + "learning_rate": 9.452658081756798e-06, + "loss": 0.2799, "step": 795700 }, { - "epoch": 8.11, - "learning_rate": 2.7773756266277805e-05, - "loss": 0.402, + "epoch": 10.964150891405582, + "grad_norm": 0.5971372127532959, + "learning_rate": 9.446610967088564e-06, + "loss": 0.2653, "step": 795800 }, { - "epoch": 8.11, - "learning_rate": 2.776714101255868e-05, - "loss": 0.3637, + "epoch": 10.965528643465323, + "grad_norm": 2.510394334793091, + "learning_rate": 9.440565386148089e-06, + "loss": 0.3195, "step": 795900 }, { - "epoch": 8.11, - "learning_rate": 2.7760525868013037e-05, - "loss": 0.3973, + "epoch": 10.966906395525061, + "grad_norm": 1.0859014987945557, + "learning_rate": 9.434521339448982e-06, + "loss": 0.2384, "step": 796000 }, { - "epoch": 8.11, - "learning_rate": 2.7753910832964316e-05, - "loss": 0.3844, + "epoch": 10.9682841475848, + "grad_norm": 2.482124090194702, + "learning_rate": 9.428478827504712e-06, + "loss": 0.2625, "step": 796100 }, { - "epoch": 8.11, - "learning_rate": 2.774729590773595e-05, - "loss": 0.3676, + "epoch": 10.96966189964454, + "grad_norm": 1.537619948387146, + "learning_rate": 9.422437850828611e-06, + "loss": 0.255, "step": 796200 }, { - "epoch": 8.11, - "learning_rate": 2.774068109265137e-05, - "loss": 0.3675, + "epoch": 10.97103965170428, + "grad_norm": 3.8805816173553467, + "learning_rate": 9.4163984099339e-06, + "loss": 0.3077, "step": 796300 }, { - "epoch": 8.11, - "learning_rate": 2.7734066388034012e-05, - "loss": 0.2973, + "epoch": 10.972417403764018, + "grad_norm": 2.1127543449401855, + "learning_rate": 9.410360505333664e-06, + "loss": 0.2404, "step": 796400 }, { - "epoch": 8.11, - "learning_rate": 2.7727451794207274e-05, - "loss": 0.3723, + "epoch": 10.973795155823758, + "grad_norm": 1.609237790107727, + "learning_rate": 9.404324137540845e-06, + "loss": 0.3018, "step": 796500 }, { - "epoch": 8.12, - "learning_rate": 2.7720837311494583e-05, - "loss": 0.4179, + "epoch": 10.975172907883497, + "grad_norm": 1.9624664783477783, + "learning_rate": 9.39828930706826e-06, + "loss": 0.2487, "step": 796600 }, { - "epoch": 8.12, - "learning_rate": 2.771422294021934e-05, - "loss": 0.3517, + "epoch": 10.976550659943237, + "grad_norm": 4.536030292510986, + "learning_rate": 9.392256014428631e-06, + "loss": 0.2436, "step": 796700 }, { - "epoch": 8.12, - "learning_rate": 2.7707608680704968e-05, - "loss": 0.3568, + "epoch": 10.977928412002976, + "grad_norm": 3.4404239654541016, + "learning_rate": 9.386284570060922e-06, + "loss": 0.2802, "step": 796800 }, { - "epoch": 8.12, - "learning_rate": 2.770106067419327e-05, - "loss": 0.4073, + "epoch": 10.979306164062715, + "grad_norm": 2.6929402351379395, + "learning_rate": 9.380254339233568e-06, + "loss": 0.2337, "step": 796900 }, { - "epoch": 8.12, - "learning_rate": 2.7694446638045118e-05, - "loss": 0.3817, + "epoch": 10.980683916122455, + "grad_norm": 1.7988147735595703, + "learning_rate": 9.374225647771303e-06, + "loss": 0.2414, "step": 797000 }, { - "epoch": 8.12, - "learning_rate": 2.768783271462476e-05, - "loss": 0.404, + "epoch": 10.982061668182194, + "grad_norm": 2.566760301589966, + "learning_rate": 9.368198496186316e-06, + "loss": 0.2462, "step": 797100 }, { - "epoch": 8.12, - "learning_rate": 2.768121890425559e-05, - "loss": 0.4116, + "epoch": 10.983439420241933, + "grad_norm": 2.2484383583068848, + "learning_rate": 9.362172884990623e-06, + "loss": 0.2922, "step": 797200 }, { - "epoch": 8.12, - "learning_rate": 2.7674605207260978e-05, - "loss": 0.367, + "epoch": 10.984817172301673, + "grad_norm": 5.80052375793457, + "learning_rate": 9.35614881469613e-06, + "loss": 0.2992, "step": 797300 }, { - "epoch": 8.12, - "learning_rate": 2.766799162396429e-05, - "loss": 0.3202, + "epoch": 10.986194924361412, + "grad_norm": 2.1852049827575684, + "learning_rate": 9.350126285814615e-06, + "loss": 0.2221, "step": 797400 }, { - "epoch": 8.13, - "learning_rate": 2.7661378154688896e-05, - "loss": 0.3981, + "epoch": 10.987572676421152, + "grad_norm": 0.43161359429359436, + "learning_rate": 9.344105298857736e-06, + "loss": 0.2653, "step": 797500 }, { - "epoch": 8.13, - "learning_rate": 2.7654764799758168e-05, - "loss": 0.4123, + "epoch": 10.98895042848089, + "grad_norm": 5.528257369995117, + "learning_rate": 9.338085854336989e-06, + "loss": 0.2699, "step": 797600 }, { - "epoch": 8.13, - "learning_rate": 2.764815155949543e-05, - "loss": 0.3657, + "epoch": 10.99032818054063, + "grad_norm": 0.2190031260251999, + "learning_rate": 9.332067952763775e-06, + "loss": 0.222, "step": 797700 }, { - "epoch": 8.13, - "learning_rate": 2.7641538434224048e-05, - "loss": 0.4607, + "epoch": 10.99170593260037, + "grad_norm": 2.633159637451172, + "learning_rate": 9.326051594649331e-06, + "loss": 0.3166, "step": 797800 }, { - "epoch": 8.13, - "learning_rate": 2.763492542426737e-05, - "loss": 0.4206, + "epoch": 10.993083684660109, + "grad_norm": 4.198328018188477, + "learning_rate": 9.320036780504775e-06, + "loss": 0.2449, "step": 797900 }, { - "epoch": 8.13, - "learning_rate": 2.7628312529948712e-05, - "loss": 0.4538, + "epoch": 10.994461436719847, + "grad_norm": 1.6569182872772217, + "learning_rate": 9.314023510841102e-06, + "loss": 0.2913, "step": 798000 }, { - "epoch": 8.13, - "learning_rate": 2.7621699751591424e-05, - "loss": 0.3981, + "epoch": 10.995839188779588, + "grad_norm": 5.501071453094482, + "learning_rate": 9.308011786169178e-06, + "loss": 0.2805, "step": 798100 }, { - "epoch": 8.13, - "learning_rate": 2.7615087089518833e-05, - "loss": 0.3815, + "epoch": 10.997216940839326, + "grad_norm": 3.474391222000122, + "learning_rate": 9.302001606999717e-06, + "loss": 0.2498, "step": 798200 }, { - "epoch": 8.13, - "learning_rate": 2.7608474544054243e-05, - "loss": 0.401, + "epoch": 10.998594692899065, + "grad_norm": 2.8189375400543213, + "learning_rate": 9.295992973843326e-06, + "loss": 0.2577, "step": 798300 }, { - "epoch": 8.13, - "learning_rate": 2.7601862115520973e-05, - "loss": 0.4987, + "epoch": 10.999972444958805, + "grad_norm": 9.539854049682617, + "learning_rate": 9.289985887210456e-06, + "loss": 0.2943, "step": 798400 }, { - "epoch": 8.14, - "learning_rate": 2.759524980424234e-05, - "loss": 0.3782, + "epoch": 11.001350197018544, + "grad_norm": 0.051772598177194595, + "learning_rate": 9.283980347611454e-06, + "loss": 0.3072, "step": 798500 }, { - "epoch": 8.14, - "learning_rate": 2.758863761054164e-05, - "loss": 0.4511, + "epoch": 11.002727949078285, + "grad_norm": 30.267906188964844, + "learning_rate": 9.277976355556504e-06, + "loss": 0.2881, "step": 798600 }, { - "epoch": 8.14, - "learning_rate": 2.7582025534742162e-05, - "loss": 0.4578, + "epoch": 11.004105701138023, + "grad_norm": 2.5710878372192383, + "learning_rate": 9.271973911555697e-06, + "loss": 0.2722, "step": 798700 }, { - "epoch": 8.14, - "learning_rate": 2.7575413577167223e-05, - "loss": 0.417, + "epoch": 11.005483453197762, + "grad_norm": 0.3158681392669678, + "learning_rate": 9.26597301611895e-06, + "loss": 0.2993, "step": 798800 }, { - "epoch": 8.14, - "learning_rate": 2.7568801738140078e-05, - "loss": 0.3012, + "epoch": 11.006861205257502, + "grad_norm": 2.6168432235717773, + "learning_rate": 9.259973669756089e-06, + "loss": 0.3254, "step": 798900 }, { - "epoch": 8.14, - "learning_rate": 2.7562190017984022e-05, - "loss": 0.3443, + "epoch": 11.008238957317241, + "grad_norm": 2.0278635025024414, + "learning_rate": 9.253975872976772e-06, + "loss": 0.2269, "step": 799000 }, { - "epoch": 8.14, - "learning_rate": 2.7555578417022324e-05, - "loss": 0.4042, + "epoch": 11.00961670937698, + "grad_norm": 5.680697441101074, + "learning_rate": 9.247979626290558e-06, + "loss": 0.248, "step": 799100 }, { - "epoch": 8.14, - "learning_rate": 2.754896693557826e-05, - "loss": 0.4254, + "epoch": 11.01099446143672, + "grad_norm": 3.3901679515838623, + "learning_rate": 9.241984930206845e-06, + "loss": 0.2131, "step": 799200 }, { - "epoch": 8.14, - "learning_rate": 2.7542355573975074e-05, - "loss": 0.382, + "epoch": 11.012372213496459, + "grad_norm": 1.2839778661727905, + "learning_rate": 9.235991785234916e-06, + "loss": 0.2889, "step": 799300 }, { - "epoch": 8.14, - "learning_rate": 2.753574433253604e-05, - "loss": 0.4017, + "epoch": 11.0137499655562, + "grad_norm": 1.7533848285675049, + "learning_rate": 9.230000191883935e-06, + "loss": 0.2235, "step": 799400 }, { - "epoch": 8.15, - "learning_rate": 2.7529133211584413e-05, - "loss": 0.3609, + "epoch": 11.015127717615938, + "grad_norm": 1.2543329000473022, + "learning_rate": 9.224010150662905e-06, + "loss": 0.243, "step": 799500 }, { - "epoch": 8.15, - "learning_rate": 2.7522522211443415e-05, - "loss": 0.3628, + "epoch": 11.016505469675677, + "grad_norm": 1.1958026885986328, + "learning_rate": 9.218021662080702e-06, + "loss": 0.263, "step": 799600 }, { - "epoch": 8.15, - "learning_rate": 2.7515911332436297e-05, - "loss": 0.3011, + "epoch": 11.017883221735417, + "grad_norm": 1.521730661392212, + "learning_rate": 9.212034726646102e-06, + "loss": 0.2509, "step": 799700 }, { - "epoch": 8.15, - "learning_rate": 2.7509300574886297e-05, - "loss": 0.3527, + "epoch": 11.019260973795156, + "grad_norm": 3.23846435546875, + "learning_rate": 9.2060493448677e-06, + "loss": 0.2206, "step": 799800 }, { - "epoch": 8.15, - "learning_rate": 2.7502689939116623e-05, - "loss": 0.3675, + "epoch": 11.020638725854894, + "grad_norm": 2.9404137134552, + "learning_rate": 9.200065517253996e-06, + "loss": 0.2636, "step": 799900 }, { - "epoch": 8.15, - "learning_rate": 2.7496079425450513e-05, - "loss": 0.4093, + "epoch": 11.022016477914635, + "grad_norm": 2.66621994972229, + "learning_rate": 9.19408324431336e-06, + "loss": 0.1668, "step": 800000 }, { - "epoch": 8.15, - "learning_rate": 2.748946903421119e-05, - "loss": 0.3257, + "epoch": 11.023394229974373, + "grad_norm": 2.9230761528015137, + "learning_rate": 9.188102526554003e-06, + "loss": 0.2811, "step": 800100 }, { - "epoch": 8.15, - "learning_rate": 2.7482858765721836e-05, - "loss": 0.4685, + "epoch": 11.024771982034114, + "grad_norm": 2.5457375049591064, + "learning_rate": 9.182123364484014e-06, + "loss": 0.2705, "step": 800200 }, { - "epoch": 8.15, - "learning_rate": 2.7476248620305664e-05, - "loss": 0.3679, + "epoch": 11.026149734093853, + "grad_norm": 1.7223994731903076, + "learning_rate": 9.176145758611354e-06, + "loss": 0.2996, "step": 800300 }, { - "epoch": 8.15, - "learning_rate": 2.7469638598285885e-05, - "loss": 0.3379, + "epoch": 11.027527486153591, + "grad_norm": 2.6430580615997314, + "learning_rate": 9.170169709443869e-06, + "loss": 0.2233, "step": 800400 }, { - "epoch": 8.16, - "learning_rate": 2.7463028699985664e-05, - "loss": 0.4475, + "epoch": 11.028905238213332, + "grad_norm": 2.9103152751922607, + "learning_rate": 9.164195217489239e-06, + "loss": 0.2395, "step": 800500 }, { - "epoch": 8.16, - "learning_rate": 2.7456418925728206e-05, - "loss": 0.421, + "epoch": 11.03028299027307, + "grad_norm": 1.6545778512954712, + "learning_rate": 9.158222283255024e-06, + "loss": 0.2305, "step": 800600 }, { - "epoch": 8.16, - "learning_rate": 2.7449809275836697e-05, - "loss": 0.4038, + "epoch": 11.031660742332809, + "grad_norm": 1.0206098556518555, + "learning_rate": 9.15225090724867e-06, + "loss": 0.3106, "step": 800700 }, { - "epoch": 8.16, - "learning_rate": 2.7443199750634278e-05, - "loss": 0.3655, + "epoch": 11.03303849439255, + "grad_norm": 2.5714898109436035, + "learning_rate": 9.14628108997746e-06, + "loss": 0.3489, "step": 800800 }, { - "epoch": 8.16, - "learning_rate": 2.7436590350444134e-05, - "loss": 0.4012, + "epoch": 11.034416246452288, + "grad_norm": 3.59450626373291, + "learning_rate": 9.140312831948563e-06, + "loss": 0.2657, "step": 800900 }, { - "epoch": 8.16, - "learning_rate": 2.7429981075589425e-05, - "loss": 0.326, + "epoch": 11.035793998512029, + "grad_norm": 1.8729352951049805, + "learning_rate": 9.134346133669034e-06, + "loss": 0.207, "step": 801000 }, { - "epoch": 8.16, - "learning_rate": 2.742337192639331e-05, - "loss": 0.3347, + "epoch": 11.037171750571767, + "grad_norm": 6.837676048278809, + "learning_rate": 9.128380995645758e-06, + "loss": 0.2125, "step": 801100 }, { - "epoch": 8.16, - "learning_rate": 2.741676290317893e-05, - "loss": 0.386, + "epoch": 11.038549502631506, + "grad_norm": 11.230830192565918, + "learning_rate": 9.122417418385493e-06, + "loss": 0.2479, "step": 801200 }, { - "epoch": 8.16, - "learning_rate": 2.7410154006269424e-05, - "loss": 0.4057, + "epoch": 11.039927254691246, + "grad_norm": 3.2756309509277344, + "learning_rate": 9.116455402394887e-06, + "loss": 0.2557, "step": 801300 }, { - "epoch": 8.16, - "learning_rate": 2.7403545235987942e-05, - "loss": 0.4109, + "epoch": 11.041305006750985, + "grad_norm": 2.231621265411377, + "learning_rate": 9.110494948180457e-06, + "loss": 0.2321, "step": 801400 }, { - "epoch": 8.17, - "learning_rate": 2.739700267846142e-05, - "loss": 0.4051, + "epoch": 11.042682758810724, + "grad_norm": 1.2976126670837402, + "learning_rate": 9.10453605624855e-06, + "loss": 0.2535, "step": 801500 }, { - "epoch": 8.17, - "learning_rate": 2.7390394161131008e-05, - "loss": 0.3354, + "epoch": 11.044060510870464, + "grad_norm": 2.2005112171173096, + "learning_rate": 9.098578727105424e-06, + "loss": 0.2508, "step": 801600 }, { - "epoch": 8.17, - "learning_rate": 2.7383785771394737e-05, - "loss": 0.3916, + "epoch": 11.045438262930203, + "grad_norm": 1.8822119235992432, + "learning_rate": 9.092622961257178e-06, + "loss": 0.2127, "step": 801700 }, { - "epoch": 8.17, - "learning_rate": 2.737717750957573e-05, - "loss": 0.4673, + "epoch": 11.046816014989943, + "grad_norm": 0.2824857234954834, + "learning_rate": 9.086668759209774e-06, + "loss": 0.2373, "step": 801800 }, { - "epoch": 8.17, - "learning_rate": 2.7370569375997092e-05, - "loss": 0.4042, + "epoch": 11.048193767049682, + "grad_norm": 5.193145275115967, + "learning_rate": 9.08071612146906e-06, + "loss": 0.2958, "step": 801900 }, { - "epoch": 8.17, - "learning_rate": 2.7363961370981912e-05, - "loss": 0.376, + "epoch": 11.04957151910942, + "grad_norm": 1.5914549827575684, + "learning_rate": 9.074765048540752e-06, + "loss": 0.2585, "step": 802000 }, { - "epoch": 8.17, - "learning_rate": 2.73573534948533e-05, - "loss": 0.4333, + "epoch": 11.050949271169161, + "grad_norm": 3.134385108947754, + "learning_rate": 9.068815540930408e-06, + "loss": 0.265, "step": 802100 }, { - "epoch": 8.17, - "learning_rate": 2.7350745747934336e-05, - "loss": 0.4299, + "epoch": 11.0523270232289, + "grad_norm": 2.1289002895355225, + "learning_rate": 9.062867599143484e-06, + "loss": 0.2643, "step": 802200 }, { - "epoch": 8.17, - "learning_rate": 2.7344138130548084e-05, - "loss": 0.3573, + "epoch": 11.053704775288638, + "grad_norm": 4.590718746185303, + "learning_rate": 9.056921223685274e-06, + "loss": 0.2631, "step": 802300 }, { - "epoch": 8.17, - "learning_rate": 2.7337530643017627e-05, - "loss": 0.3713, + "epoch": 11.055082527348379, + "grad_norm": 1.478545904159546, + "learning_rate": 9.050976415060969e-06, + "loss": 0.2742, "step": 802400 }, { - "epoch": 8.18, - "learning_rate": 2.7330923285666038e-05, - "loss": 0.3894, + "epoch": 11.056460279408117, + "grad_norm": 1.5437530279159546, + "learning_rate": 9.045033173775595e-06, + "loss": 0.2155, "step": 802500 }, { - "epoch": 8.18, - "learning_rate": 2.732431605881637e-05, - "loss": 0.3556, + "epoch": 11.057838031467856, + "grad_norm": 4.8874101638793945, + "learning_rate": 9.039091500334066e-06, + "loss": 0.3035, "step": 802600 }, { - "epoch": 8.18, - "learning_rate": 2.7317708962791678e-05, - "loss": 0.4571, + "epoch": 11.059215783527597, + "grad_norm": 4.0117034912109375, + "learning_rate": 9.033151395241177e-06, + "loss": 0.2683, "step": 802700 }, { - "epoch": 8.18, - "learning_rate": 2.7311101997915024e-05, - "loss": 0.3544, + "epoch": 11.060593535587335, + "grad_norm": 2.4324307441711426, + "learning_rate": 9.027212859001535e-06, + "loss": 0.2695, "step": 802800 }, { - "epoch": 8.18, - "learning_rate": 2.730449516450942e-05, - "loss": 0.3864, + "epoch": 11.061971287647076, + "grad_norm": 1.3751418590545654, + "learning_rate": 9.021275892119669e-06, + "loss": 0.2259, "step": 802900 }, { - "epoch": 8.18, - "learning_rate": 2.7297888462897917e-05, - "loss": 0.4564, + "epoch": 11.063349039706814, + "grad_norm": 4.446757793426514, + "learning_rate": 9.015340495099959e-06, + "loss": 0.2446, "step": 803000 }, { - "epoch": 8.18, - "learning_rate": 2.729128189340355e-05, - "loss": 0.352, + "epoch": 11.064726791766553, + "grad_norm": 2.4575319290161133, + "learning_rate": 9.009406668446632e-06, + "loss": 0.2142, "step": 803100 }, { - "epoch": 8.18, - "learning_rate": 2.728467545634933e-05, - "loss": 0.4731, + "epoch": 11.066104543826293, + "grad_norm": 0.7464675903320312, + "learning_rate": 9.003474412663805e-06, + "loss": 0.3184, "step": 803200 }, { - "epoch": 8.18, - "learning_rate": 2.7278069152058275e-05, - "loss": 0.3897, + "epoch": 11.067482295886032, + "grad_norm": 3.1375722885131836, + "learning_rate": 8.997543728255466e-06, + "loss": 0.2399, "step": 803300 }, { - "epoch": 8.19, - "learning_rate": 2.7271462980853393e-05, - "loss": 0.3655, + "epoch": 11.06886004794577, + "grad_norm": 3.3817145824432373, + "learning_rate": 8.991614615725443e-06, + "loss": 0.2403, "step": 803400 }, { - "epoch": 8.19, - "learning_rate": 2.72648569430577e-05, - "loss": 0.3987, + "epoch": 11.070237800005511, + "grad_norm": 2.955003499984741, + "learning_rate": 8.985687075577438e-06, + "loss": 0.2406, "step": 803500 }, { - "epoch": 8.19, - "learning_rate": 2.725825103899417e-05, - "loss": 0.3869, + "epoch": 11.07161555206525, + "grad_norm": 0.18885573744773865, + "learning_rate": 8.979761108315036e-06, + "loss": 0.254, "step": 803600 }, { - "epoch": 8.19, - "learning_rate": 2.72516452689858e-05, - "loss": 0.4443, + "epoch": 11.07299330412499, + "grad_norm": 3.5578806400299072, + "learning_rate": 8.973836714441683e-06, + "loss": 0.2276, "step": 803700 }, { - "epoch": 8.19, - "learning_rate": 2.7245039633355582e-05, - "loss": 0.3532, + "epoch": 11.074371056184729, + "grad_norm": 5.592334747314453, + "learning_rate": 8.967913894460673e-06, + "loss": 0.2655, "step": 803800 }, { - "epoch": 8.19, - "learning_rate": 2.723843413242648e-05, - "loss": 0.3901, + "epoch": 11.075748808244468, + "grad_norm": 2.156681537628174, + "learning_rate": 8.961992648875198e-06, + "loss": 0.2558, "step": 803900 }, { - "epoch": 8.19, - "learning_rate": 2.723182876652147e-05, - "loss": 0.432, + "epoch": 11.077126560304208, + "grad_norm": 14.878314971923828, + "learning_rate": 8.956072978188283e-06, + "loss": 0.2464, "step": 804000 }, { - "epoch": 8.19, - "learning_rate": 2.7225223535963524e-05, - "loss": 0.3776, + "epoch": 11.078504312363947, + "grad_norm": 1.813043475151062, + "learning_rate": 8.950154882902835e-06, + "loss": 0.2349, "step": 804100 }, { - "epoch": 8.19, - "learning_rate": 2.7218618441075575e-05, - "loss": 0.4286, + "epoch": 11.079882064423686, + "grad_norm": 2.362821578979492, + "learning_rate": 8.944238363521623e-06, + "loss": 0.2504, "step": 804200 }, { - "epoch": 8.19, - "learning_rate": 2.7212013482180585e-05, - "loss": 0.4136, + "epoch": 11.081259816483426, + "grad_norm": 1.6859230995178223, + "learning_rate": 8.938323420547308e-06, + "loss": 0.2555, "step": 804300 }, { - "epoch": 8.2, - "learning_rate": 2.7205408659601508e-05, - "loss": 0.4645, + "epoch": 11.082637568543165, + "grad_norm": 1.8618556261062622, + "learning_rate": 8.932410054482376e-06, + "loss": 0.2499, "step": 804400 }, { - "epoch": 8.2, - "learning_rate": 2.7198803973661262e-05, - "loss": 0.4313, + "epoch": 11.084015320602905, + "grad_norm": 2.7331643104553223, + "learning_rate": 8.926498265829193e-06, + "loss": 0.2535, "step": 804500 }, { - "epoch": 8.2, - "learning_rate": 2.7192199424682785e-05, - "loss": 0.4228, + "epoch": 11.085393072662644, + "grad_norm": 2.594290256500244, + "learning_rate": 8.920588055090004e-06, + "loss": 0.3183, "step": 804600 }, { - "epoch": 8.2, - "learning_rate": 2.7185595012989012e-05, - "loss": 0.3905, + "epoch": 11.086770824722382, + "grad_norm": 2.9784297943115234, + "learning_rate": 8.91467942276692e-06, + "loss": 0.2438, "step": 804700 }, { - "epoch": 8.2, - "learning_rate": 2.7178990738902837e-05, - "loss": 0.3991, + "epoch": 11.088148576782123, + "grad_norm": 1.6363935470581055, + "learning_rate": 8.908772369361893e-06, + "loss": 0.2563, "step": 804800 }, { - "epoch": 8.2, - "learning_rate": 2.7172386602747174e-05, - "loss": 0.4426, + "epoch": 11.089526328841862, + "grad_norm": 2.9799437522888184, + "learning_rate": 8.902866895376763e-06, + "loss": 0.2468, "step": 804900 }, { - "epoch": 8.2, - "learning_rate": 2.7165782604844937e-05, - "loss": 0.3756, + "epoch": 11.0909040809016, + "grad_norm": 0.405773401260376, + "learning_rate": 8.896963001313254e-06, + "loss": 0.255, "step": 805000 }, { - "epoch": 8.2, - "learning_rate": 2.7159178745519022e-05, - "loss": 0.4078, + "epoch": 11.09228183296134, + "grad_norm": 3.7427282333374023, + "learning_rate": 8.891119702984551e-06, + "loss": 0.2647, "step": 805100 }, { - "epoch": 8.2, - "learning_rate": 2.7152641061607973e-05, - "loss": 0.4015, + "epoch": 11.09365958502108, + "grad_norm": 0.04064955562353134, + "learning_rate": 8.885218954457059e-06, + "loss": 0.2292, "step": 805200 }, { - "epoch": 8.2, - "learning_rate": 2.7146037479009524e-05, - "loss": 0.4056, + "epoch": 11.09503733708082, + "grad_norm": 1.1493785381317139, + "learning_rate": 8.879319787350447e-06, + "loss": 0.2376, "step": 805300 }, { - "epoch": 8.21, - "learning_rate": 2.7139434035952814e-05, - "loss": 0.3864, + "epoch": 11.096415089140558, + "grad_norm": 2.099971294403076, + "learning_rate": 8.873422202165893e-06, + "loss": 0.2452, "step": 805400 }, { - "epoch": 8.21, - "learning_rate": 2.713283073276072e-05, - "loss": 0.3798, + "epoch": 11.097792841200297, + "grad_norm": 2.8384177684783936, + "learning_rate": 8.867526199404411e-06, + "loss": 0.2156, "step": 805500 }, { - "epoch": 8.21, - "learning_rate": 2.712622756975608e-05, - "loss": 0.42, + "epoch": 11.099170593260038, + "grad_norm": 0.3695179522037506, + "learning_rate": 8.861631779566896e-06, + "loss": 0.2054, "step": 805600 }, { - "epoch": 8.21, - "learning_rate": 2.7119624547261763e-05, - "loss": 0.37, + "epoch": 11.100548345319776, + "grad_norm": 4.0854997634887695, + "learning_rate": 8.855738943154122e-06, + "loss": 0.2159, "step": 805700 }, { - "epoch": 8.21, - "learning_rate": 2.7113021665600618e-05, - "loss": 0.349, + "epoch": 11.101926097379515, + "grad_norm": 1.8802731037139893, + "learning_rate": 8.849847690666703e-06, + "loss": 0.2298, "step": 805800 }, { - "epoch": 8.21, - "learning_rate": 2.7106418925095498e-05, - "loss": 0.385, + "epoch": 11.103303849439255, + "grad_norm": 3.3940842151641846, + "learning_rate": 8.843958022605131e-06, + "loss": 0.3119, "step": 805900 }, { - "epoch": 8.21, - "learning_rate": 2.709981632606921e-05, - "loss": 0.3937, + "epoch": 11.104681601498994, + "grad_norm": 2.429543972015381, + "learning_rate": 8.838069939469783e-06, + "loss": 0.315, "step": 806000 }, { - "epoch": 8.21, - "learning_rate": 2.7093213868844597e-05, - "loss": 0.437, + "epoch": 11.106059353558734, + "grad_norm": 1.094051480293274, + "learning_rate": 8.832183441760864e-06, + "loss": 0.3217, "step": 806100 }, { - "epoch": 8.21, - "learning_rate": 2.708661155374449e-05, - "loss": 0.3217, + "epoch": 11.107437105618473, + "grad_norm": 1.3612397909164429, + "learning_rate": 8.826298529978456e-06, + "loss": 0.2853, "step": 806200 }, { - "epoch": 8.21, - "learning_rate": 2.7080009381091676e-05, - "loss": 0.3391, + "epoch": 11.108814857678212, + "grad_norm": 1.6259535551071167, + "learning_rate": 8.820415204622522e-06, + "loss": 0.236, "step": 806300 }, { - "epoch": 8.22, - "learning_rate": 2.7073407351208984e-05, - "loss": 0.3901, + "epoch": 11.110192609737952, + "grad_norm": 4.023580551147461, + "learning_rate": 8.814533466192894e-06, + "loss": 0.2455, "step": 806400 }, { - "epoch": 8.22, - "learning_rate": 2.706680546441922e-05, - "loss": 0.4509, + "epoch": 11.11157036179769, + "grad_norm": 0.9507949948310852, + "learning_rate": 8.808653315189227e-06, + "loss": 0.2696, "step": 806500 }, { - "epoch": 8.22, - "learning_rate": 2.7060203721045152e-05, - "loss": 0.306, + "epoch": 11.11294811385743, + "grad_norm": 1.7639344930648804, + "learning_rate": 8.802774752111098e-06, + "loss": 0.2289, "step": 806600 }, { - "epoch": 8.22, - "learning_rate": 2.7053602121409573e-05, - "loss": 0.3588, + "epoch": 11.11432586591717, + "grad_norm": 4.252608299255371, + "learning_rate": 8.796897777457905e-06, + "loss": 0.2456, "step": 806700 }, { - "epoch": 8.22, - "learning_rate": 2.704700066583528e-05, - "loss": 0.4371, + "epoch": 11.115703617976909, + "grad_norm": 3.606848955154419, + "learning_rate": 8.791022391728926e-06, + "loss": 0.2544, "step": 806800 }, { - "epoch": 8.22, - "learning_rate": 2.704039935464502e-05, - "loss": 0.4054, + "epoch": 11.117081370036647, + "grad_norm": 5.053679943084717, + "learning_rate": 8.785148595423306e-06, + "loss": 0.2693, "step": 806900 }, { - "epoch": 8.22, - "learning_rate": 2.7033798188161572e-05, - "loss": 0.4252, + "epoch": 11.118459122096388, + "grad_norm": 0.5052111148834229, + "learning_rate": 8.779276389040066e-06, + "loss": 0.2511, "step": 807000 }, { - "epoch": 8.22, - "learning_rate": 2.7027197166707708e-05, - "loss": 0.3887, + "epoch": 11.119836874156126, + "grad_norm": 2.9068005084991455, + "learning_rate": 8.773464471363464e-06, + "loss": 0.2321, "step": 807100 }, { - "epoch": 8.22, - "learning_rate": 2.7020596290606144e-05, - "loss": 0.3881, + "epoch": 11.121214626215867, + "grad_norm": 4.298202991485596, + "learning_rate": 8.767595430409787e-06, + "loss": 0.2916, "step": 807200 }, { - "epoch": 8.22, - "learning_rate": 2.701399556017964e-05, - "loss": 0.4711, + "epoch": 11.122592378275606, + "grad_norm": 1.2209360599517822, + "learning_rate": 8.7617279808697e-06, + "loss": 0.2222, "step": 807300 }, { - "epoch": 8.23, - "learning_rate": 2.700739497575094e-05, - "loss": 0.4083, + "epoch": 11.123970130335344, + "grad_norm": 2.6864845752716064, + "learning_rate": 8.75586212324169e-06, + "loss": 0.2031, "step": 807400 }, { - "epoch": 8.23, - "learning_rate": 2.700079453764277e-05, - "loss": 0.319, + "epoch": 11.125347882395085, + "grad_norm": 8.401939392089844, + "learning_rate": 8.749997858024092e-06, + "loss": 0.2732, "step": 807500 }, { - "epoch": 8.23, - "learning_rate": 2.699419424617784e-05, - "loss": 0.3657, + "epoch": 11.126725634454823, + "grad_norm": 1.0111336708068848, + "learning_rate": 8.744135185715095e-06, + "loss": 0.2857, "step": 807600 }, { - "epoch": 8.23, - "learning_rate": 2.6987594101678874e-05, - "loss": 0.3477, + "epoch": 11.128103386514562, + "grad_norm": 2.792137384414673, + "learning_rate": 8.738274106812775e-06, + "loss": 0.2324, "step": 807700 }, { - "epoch": 8.23, - "learning_rate": 2.6980994104468597e-05, - "loss": 0.3729, + "epoch": 11.129481138574302, + "grad_norm": 1.0686736106872559, + "learning_rate": 8.732414621815057e-06, + "loss": 0.2457, "step": 807800 }, { - "epoch": 8.23, - "learning_rate": 2.6974460252633937e-05, - "loss": 0.3645, + "epoch": 11.130858890634041, + "grad_norm": 1.4294456243515015, + "learning_rate": 8.726556731219716e-06, + "loss": 0.2718, "step": 807900 }, { - "epoch": 8.23, - "learning_rate": 2.696786054948814e-05, - "loss": 0.4134, + "epoch": 11.132236642693782, + "grad_norm": 1.3688405752182007, + "learning_rate": 8.720700435524432e-06, + "loss": 0.2688, "step": 808000 }, { - "epoch": 8.23, - "learning_rate": 2.696126099459587e-05, - "loss": 0.3765, + "epoch": 11.13361439475352, + "grad_norm": 3.928964853286743, + "learning_rate": 8.714845735226724e-06, + "loss": 0.2611, "step": 808100 }, { - "epoch": 8.23, - "learning_rate": 2.6954727581606454e-05, - "loss": 0.4042, + "epoch": 11.134992146813259, + "grad_norm": 5.94156551361084, + "learning_rate": 8.70899263082397e-06, + "loss": 0.2849, "step": 808200 }, { - "epoch": 8.24, - "learning_rate": 2.6948128322698676e-05, - "loss": 0.412, + "epoch": 11.136369898873, + "grad_norm": 2.9743258953094482, + "learning_rate": 8.703199629989768e-06, + "loss": 0.3191, "step": 808300 }, { - "epoch": 8.24, - "learning_rate": 2.6941529213009222e-05, - "loss": 0.3908, + "epoch": 11.137747650932738, + "grad_norm": 0.4662185311317444, + "learning_rate": 8.697349702897192e-06, + "loss": 0.2271, "step": 808400 }, { - "epoch": 8.24, - "learning_rate": 2.6934930252860725e-05, - "loss": 0.4179, + "epoch": 11.139125402992477, + "grad_norm": 0.9455997943878174, + "learning_rate": 8.691501373185965e-06, + "loss": 0.2527, "step": 808500 }, { - "epoch": 8.24, - "learning_rate": 2.692833144257585e-05, - "loss": 0.4404, + "epoch": 11.140503155052217, + "grad_norm": 1.784304141998291, + "learning_rate": 8.685654641352912e-06, + "loss": 0.2657, "step": 808600 }, { - "epoch": 8.24, - "learning_rate": 2.692173278247723e-05, - "loss": 0.3944, + "epoch": 11.141880907111956, + "grad_norm": 1.861839771270752, + "learning_rate": 8.679809507894747e-06, + "loss": 0.2373, "step": 808700 }, { - "epoch": 8.24, - "learning_rate": 2.691513427288752e-05, - "loss": 0.3863, + "epoch": 11.143258659171696, + "grad_norm": 2.105329990386963, + "learning_rate": 8.67396597330806e-06, + "loss": 0.2243, "step": 808800 }, { - "epoch": 8.24, - "learning_rate": 2.6908535914129325e-05, - "loss": 0.4, + "epoch": 11.144636411231435, + "grad_norm": 2.089622974395752, + "learning_rate": 8.668124038089272e-06, + "loss": 0.2589, "step": 808900 }, { - "epoch": 8.24, - "learning_rate": 2.6901937706525277e-05, - "loss": 0.365, + "epoch": 11.146014163291174, + "grad_norm": 2.660186290740967, + "learning_rate": 8.662283702734688e-06, + "loss": 0.2309, "step": 809000 }, { - "epoch": 8.24, - "learning_rate": 2.6895339650398e-05, - "loss": 0.4313, + "epoch": 11.147391915350914, + "grad_norm": 5.6004414558410645, + "learning_rate": 8.656444967740473e-06, + "loss": 0.2496, "step": 809100 }, { - "epoch": 8.24, - "learning_rate": 2.6888741746070074e-05, - "loss": 0.3198, + "epoch": 11.148769667410653, + "grad_norm": 2.58648419380188, + "learning_rate": 8.650607833602668e-06, + "loss": 0.2646, "step": 809200 }, { - "epoch": 8.25, - "learning_rate": 2.6882143993864105e-05, - "loss": 0.3701, + "epoch": 11.150147419470391, + "grad_norm": 2.803346872329712, + "learning_rate": 8.644772300817155e-06, + "loss": 0.2604, "step": 809300 }, { - "epoch": 8.25, - "learning_rate": 2.6875546394102705e-05, - "loss": 0.4001, + "epoch": 11.151525171530132, + "grad_norm": 1.6983903646469116, + "learning_rate": 8.638938369879705e-06, + "loss": 0.2919, "step": 809400 }, { - "epoch": 8.25, - "learning_rate": 2.6868948947108417e-05, - "loss": 0.3048, + "epoch": 11.15290292358987, + "grad_norm": 3.894742488861084, + "learning_rate": 8.633106041285934e-06, + "loss": 0.2834, "step": 809500 }, { - "epoch": 8.25, - "learning_rate": 2.686235165320385e-05, - "loss": 0.3634, + "epoch": 11.154280675649611, + "grad_norm": 4.846644401550293, + "learning_rate": 8.627275315531317e-06, + "loss": 0.2678, "step": 809600 }, { - "epoch": 8.25, - "learning_rate": 2.685575451271158e-05, - "loss": 0.4628, + "epoch": 11.15565842770935, + "grad_norm": 3.635613203048706, + "learning_rate": 8.621446193111214e-06, + "loss": 0.2639, "step": 809700 }, { - "epoch": 8.25, - "learning_rate": 2.6849157525954134e-05, - "loss": 0.3683, + "epoch": 11.157036179769088, + "grad_norm": 2.513417959213257, + "learning_rate": 8.615618674520847e-06, + "loss": 0.3041, "step": 809800 }, { - "epoch": 8.25, - "learning_rate": 2.684256069325408e-05, - "loss": 0.3695, + "epoch": 11.158413931828829, + "grad_norm": 11.026405334472656, + "learning_rate": 8.609792760255289e-06, + "loss": 0.2554, "step": 809900 }, { - "epoch": 8.25, - "learning_rate": 2.683596401493398e-05, - "loss": 0.3536, + "epoch": 11.159791683888567, + "grad_norm": 0.08315371721982956, + "learning_rate": 8.603968450809468e-06, + "loss": 0.2605, "step": 810000 }, { - "epoch": 8.25, - "learning_rate": 2.6829367491316336e-05, - "loss": 0.3876, + "epoch": 11.161169435948306, + "grad_norm": 2.526890516281128, + "learning_rate": 8.598145746678196e-06, + "loss": 0.2653, "step": 810100 }, { - "epoch": 8.25, - "learning_rate": 2.682277112272371e-05, - "loss": 0.4164, + "epoch": 11.162547188008046, + "grad_norm": 3.762650489807129, + "learning_rate": 8.592324648356153e-06, + "loss": 0.3039, "step": 810200 }, { - "epoch": 8.26, - "learning_rate": 2.6816174909478626e-05, - "loss": 0.3812, + "epoch": 11.163924940067785, + "grad_norm": 1.738736629486084, + "learning_rate": 8.586505156337854e-06, + "loss": 0.2277, "step": 810300 }, { - "epoch": 8.26, - "learning_rate": 2.6809578851903574e-05, - "loss": 0.3931, + "epoch": 11.165302692127526, + "grad_norm": 2.326447010040283, + "learning_rate": 8.580687271117703e-06, + "loss": 0.3003, "step": 810400 }, { - "epoch": 8.26, - "learning_rate": 2.680298295032108e-05, - "loss": 0.4033, + "epoch": 11.166680444187264, + "grad_norm": 0.4140963554382324, + "learning_rate": 8.574870993189978e-06, + "loss": 0.2757, "step": 810500 }, { - "epoch": 8.26, - "learning_rate": 2.679638720505364e-05, - "loss": 0.3362, + "epoch": 11.168058196247003, + "grad_norm": 4.045792579650879, + "learning_rate": 8.569056323048763e-06, + "loss": 0.2509, "step": 810600 }, { - "epoch": 8.26, - "learning_rate": 2.6789791616423756e-05, - "loss": 0.374, + "epoch": 11.169435948306743, + "grad_norm": 2.8864753246307373, + "learning_rate": 8.563243261188063e-06, + "loss": 0.279, "step": 810700 }, { - "epoch": 8.26, - "learning_rate": 2.67831961847539e-05, - "loss": 0.4119, + "epoch": 11.170813700366482, + "grad_norm": 0.9622142314910889, + "learning_rate": 8.557489914667541e-06, + "loss": 0.2455, "step": 810800 }, { - "epoch": 8.26, - "learning_rate": 2.6776600910366546e-05, - "loss": 0.5145, + "epoch": 11.17219145242622, + "grad_norm": 3.372331380844116, + "learning_rate": 8.551680054754168e-06, + "loss": 0.2429, "step": 810900 }, { - "epoch": 8.26, - "learning_rate": 2.677000579358419e-05, - "loss": 0.3856, + "epoch": 11.173569204485961, + "grad_norm": 2.4155023097991943, + "learning_rate": 8.545871804597516e-06, + "loss": 0.2758, "step": 811000 }, { - "epoch": 8.26, - "learning_rate": 2.6763410834729258e-05, - "loss": 0.4167, + "epoch": 11.1749469565457, + "grad_norm": 1.3167738914489746, + "learning_rate": 8.540065164691009e-06, + "loss": 0.2344, "step": 811100 }, { - "epoch": 8.26, - "learning_rate": 2.6756816034124225e-05, - "loss": 0.4414, + "epoch": 11.176324708605438, + "grad_norm": 0.4571068584918976, + "learning_rate": 8.53426013552797e-06, + "loss": 0.2495, "step": 811200 }, { - "epoch": 8.27, - "learning_rate": 2.6750221392091545e-05, - "loss": 0.4286, + "epoch": 11.177702460665179, + "grad_norm": 2.1056313514709473, + "learning_rate": 8.528456717601566e-06, + "loss": 0.2238, "step": 811300 }, { - "epoch": 8.27, - "learning_rate": 2.6743626908953624e-05, - "loss": 0.4037, + "epoch": 11.179080212724918, + "grad_norm": 4.007205009460449, + "learning_rate": 8.522654911404813e-06, + "loss": 0.2452, "step": 811400 }, { - "epoch": 8.27, - "learning_rate": 2.6737032585032923e-05, - "loss": 0.4283, + "epoch": 11.180457964784658, + "grad_norm": 3.4492440223693848, + "learning_rate": 8.516854717430615e-06, + "loss": 0.2214, "step": 811500 }, { - "epoch": 8.27, - "learning_rate": 2.673043842065187e-05, - "loss": 0.3777, + "epoch": 11.181835716844397, + "grad_norm": 6.595731258392334, + "learning_rate": 8.51105613617174e-06, + "loss": 0.2465, "step": 811600 }, { - "epoch": 8.27, - "learning_rate": 2.6723844416132854e-05, - "loss": 0.3557, + "epoch": 11.183213468904135, + "grad_norm": 1.945192813873291, + "learning_rate": 8.505259168120779e-06, + "loss": 0.2729, "step": 811700 }, { - "epoch": 8.27, - "learning_rate": 2.671731650944767e-05, - "loss": 0.381, + "epoch": 11.184591220963876, + "grad_norm": 0.3217388391494751, + "learning_rate": 8.499463813770227e-06, + "loss": 0.2178, "step": 811800 }, { - "epoch": 8.27, - "learning_rate": 2.67107228240133e-05, - "loss": 0.342, + "epoch": 11.185968973023614, + "grad_norm": 3.6789467334747314, + "learning_rate": 8.493670073612438e-06, + "loss": 0.2311, "step": 811900 }, { - "epoch": 8.27, - "learning_rate": 2.6704129299404957e-05, - "loss": 0.3632, + "epoch": 11.187346725083353, + "grad_norm": 1.3070392608642578, + "learning_rate": 8.487877948139603e-06, + "loss": 0.2507, "step": 812000 }, { - "epoch": 8.27, - "learning_rate": 2.669753593594503e-05, - "loss": 0.3779, + "epoch": 11.188724477143094, + "grad_norm": 4.377415657043457, + "learning_rate": 8.482087437843811e-06, + "loss": 0.2734, "step": 812100 }, { - "epoch": 8.27, - "learning_rate": 2.669094273395588e-05, - "loss": 0.3675, + "epoch": 11.190102229202832, + "grad_norm": 2.521705389022827, + "learning_rate": 8.47629854321698e-06, + "loss": 0.2785, "step": 812200 }, { - "epoch": 8.28, - "learning_rate": 2.668434969375988e-05, - "loss": 0.4647, + "epoch": 11.191479981262573, + "grad_norm": 2.87312912940979, + "learning_rate": 8.470511264750915e-06, + "loss": 0.2135, "step": 812300 }, { - "epoch": 8.28, - "learning_rate": 2.667775681567941e-05, - "loss": 0.3769, + "epoch": 11.192857733322311, + "grad_norm": 2.738273859024048, + "learning_rate": 8.46472560293727e-06, + "loss": 0.2837, "step": 812400 }, { - "epoch": 8.28, - "learning_rate": 2.667116410003679e-05, - "loss": 0.3933, + "epoch": 11.19423548538205, + "grad_norm": 0.4938458800315857, + "learning_rate": 8.458941558267574e-06, + "loss": 0.2707, "step": 812500 }, { - "epoch": 8.28, - "learning_rate": 2.666457154715438e-05, - "loss": 0.3458, + "epoch": 11.19561323744179, + "grad_norm": 3.2724947929382324, + "learning_rate": 8.453216947494634e-06, + "loss": 0.2319, "step": 812600 }, { - "epoch": 8.28, - "learning_rate": 2.6657979157354524e-05, - "loss": 0.3704, + "epoch": 11.19699098950153, + "grad_norm": 3.5377845764160156, + "learning_rate": 8.447436122403146e-06, + "loss": 0.2661, "step": 812700 }, { - "epoch": 8.28, - "learning_rate": 2.6651386930959535e-05, - "loss": 0.445, + "epoch": 11.198368741561268, + "grad_norm": 3.653688907623291, + "learning_rate": 8.441656915924423e-06, + "loss": 0.2337, "step": 812800 }, { - "epoch": 8.28, - "learning_rate": 2.6644794868291743e-05, - "loss": 0.4052, + "epoch": 11.199746493621008, + "grad_norm": 6.7691216468811035, + "learning_rate": 8.435879328549444e-06, + "loss": 0.2487, "step": 812900 }, { - "epoch": 8.28, - "learning_rate": 2.663820296967346e-05, - "loss": 0.4391, + "epoch": 11.201124245680747, + "grad_norm": 2.1774661540985107, + "learning_rate": 8.430103360769058e-06, + "loss": 0.2838, "step": 813000 }, { - "epoch": 8.28, - "learning_rate": 2.6631611235427e-05, - "loss": 0.4144, + "epoch": 11.202501997740487, + "grad_norm": 0.24400591850280762, + "learning_rate": 8.424329013073941e-06, + "loss": 0.2455, "step": 813100 }, { - "epoch": 8.29, - "learning_rate": 2.662501966587464e-05, - "loss": 0.3706, + "epoch": 11.203879749800226, + "grad_norm": 2.814746379852295, + "learning_rate": 8.418556285954676e-06, + "loss": 0.2258, "step": 813200 }, { - "epoch": 8.29, - "learning_rate": 2.6618428261338676e-05, - "loss": 0.4063, + "epoch": 11.205257501859965, + "grad_norm": 4.687884330749512, + "learning_rate": 8.412785179901674e-06, + "loss": 0.27, "step": 813300 }, { - "epoch": 8.29, - "learning_rate": 2.6611837022141397e-05, - "loss": 0.3801, + "epoch": 11.206635253919705, + "grad_norm": 1.1854617595672607, + "learning_rate": 8.407073382221865e-06, + "loss": 0.2543, "step": 813400 }, { - "epoch": 8.29, - "learning_rate": 2.660524594860506e-05, - "loss": 0.3702, + "epoch": 11.208013005979444, + "grad_norm": 2.406607151031494, + "learning_rate": 8.401305503549206e-06, + "loss": 0.2375, "step": 813500 }, { - "epoch": 8.29, - "learning_rate": 2.6598655041051937e-05, - "loss": 0.3654, + "epoch": 11.209390758039183, + "grad_norm": 5.085211753845215, + "learning_rate": 8.395539247408354e-06, + "loss": 0.236, "step": 813600 }, { - "epoch": 8.29, - "learning_rate": 2.6592064299804297e-05, - "loss": 0.4254, + "epoch": 11.210768510098923, + "grad_norm": 1.9845657348632812, + "learning_rate": 8.389774614289187e-06, + "loss": 0.2448, "step": 813700 }, { - "epoch": 8.29, - "learning_rate": 2.6585473725184365e-05, - "loss": 0.3777, + "epoch": 11.212146262158662, + "grad_norm": 3.0627851486206055, + "learning_rate": 8.384011604681435e-06, + "loss": 0.253, "step": 813800 }, { - "epoch": 8.29, - "learning_rate": 2.6578949220763635e-05, - "loss": 0.4033, + "epoch": 11.213524014218402, + "grad_norm": 1.4288195371627808, + "learning_rate": 8.378250219074684e-06, + "loss": 0.207, "step": 813900 }, { - "epoch": 8.29, - "learning_rate": 2.6572358978691532e-05, - "loss": 0.3838, + "epoch": 11.21490176627814, + "grad_norm": 0.9665579199790955, + "learning_rate": 8.372490457958405e-06, + "loss": 0.259, "step": 814000 }, { - "epoch": 8.29, - "learning_rate": 2.656576890421062e-05, - "loss": 0.3772, + "epoch": 11.21627951833788, + "grad_norm": 0.6886402368545532, + "learning_rate": 8.366732321821925e-06, + "loss": 0.2641, "step": 814100 }, { - "epoch": 8.3, - "learning_rate": 2.655917899764312e-05, - "loss": 0.3294, + "epoch": 11.21765727039762, + "grad_norm": 2.7925777435302734, + "learning_rate": 8.36097581115441e-06, + "loss": 0.2834, "step": 814200 }, { - "epoch": 8.3, - "learning_rate": 2.6552589259311226e-05, - "loss": 0.3966, + "epoch": 11.219035022457359, + "grad_norm": 0.9384520649909973, + "learning_rate": 8.355220926444926e-06, + "loss": 0.2398, "step": 814300 }, { - "epoch": 8.3, - "learning_rate": 2.6545999689537146e-05, - "loss": 0.3856, + "epoch": 11.220412774517097, + "grad_norm": 4.041644096374512, + "learning_rate": 8.349467668182366e-06, + "loss": 0.2637, "step": 814400 }, { - "epoch": 8.3, - "learning_rate": 2.653941028864308e-05, - "loss": 0.398, + "epoch": 11.221790526576838, + "grad_norm": 0.6133745908737183, + "learning_rate": 8.343716036855491e-06, + "loss": 0.2538, "step": 814500 }, { - "epoch": 8.3, - "learning_rate": 2.653282105695119e-05, - "loss": 0.4041, + "epoch": 11.223168278636576, + "grad_norm": 1.0587536096572876, + "learning_rate": 8.337966032952943e-06, + "loss": 0.2363, "step": 814600 }, { - "epoch": 8.3, - "learning_rate": 2.6526231994783654e-05, - "loss": 0.3772, + "epoch": 11.224546030696317, + "grad_norm": 0.3194536566734314, + "learning_rate": 8.332217656963222e-06, + "loss": 0.2388, "step": 814700 }, { - "epoch": 8.3, - "learning_rate": 2.6519643102462655e-05, - "loss": 0.4001, + "epoch": 11.225923782756055, + "grad_norm": 1.4994847774505615, + "learning_rate": 8.326470909374672e-06, + "loss": 0.2857, "step": 814800 }, { - "epoch": 8.3, - "learning_rate": 2.6513054380310325e-05, - "loss": 0.2917, + "epoch": 11.227301534815794, + "grad_norm": 2.0027263164520264, + "learning_rate": 8.320725790675505e-06, + "loss": 0.2446, "step": 814900 }, { - "epoch": 8.3, - "learning_rate": 2.6506465828648834e-05, - "loss": 0.3559, + "epoch": 11.228679286875535, + "grad_norm": 0.6959890127182007, + "learning_rate": 8.314982301353797e-06, + "loss": 0.2217, "step": 815000 }, { - "epoch": 8.3, - "learning_rate": 2.6499877447800313e-05, - "loss": 0.4071, + "epoch": 11.230057038935273, + "grad_norm": 1.6134371757507324, + "learning_rate": 8.309240441897503e-06, + "loss": 0.2418, "step": 815100 }, { - "epoch": 8.31, - "learning_rate": 2.649328923808691e-05, - "loss": 0.4038, + "epoch": 11.231434790995012, + "grad_norm": 0.4799920320510864, + "learning_rate": 8.303500212794405e-06, + "loss": 0.2415, "step": 815200 }, { - "epoch": 8.31, - "learning_rate": 2.6486701199830723e-05, - "loss": 0.3452, + "epoch": 11.232812543054752, + "grad_norm": 0.3603655695915222, + "learning_rate": 8.297761614532177e-06, + "loss": 0.3008, "step": 815300 }, { - "epoch": 8.31, - "learning_rate": 2.6480113333353882e-05, - "loss": 0.3623, + "epoch": 11.234190295114491, + "grad_norm": 3.2173988819122314, + "learning_rate": 8.292024647598342e-06, + "loss": 0.2451, "step": 815400 }, { - "epoch": 8.31, - "learning_rate": 2.64735256389785e-05, - "loss": 0.3731, + "epoch": 11.23556804717423, + "grad_norm": 2.5884618759155273, + "learning_rate": 8.286289312480272e-06, + "loss": 0.2684, "step": 815500 }, { - "epoch": 8.31, - "learning_rate": 2.646693811702666e-05, - "loss": 0.3631, + "epoch": 11.23694579923397, + "grad_norm": 1.267899990081787, + "learning_rate": 8.28055560966522e-06, + "loss": 0.2404, "step": 815600 }, { - "epoch": 8.31, - "learning_rate": 2.6460350767820464e-05, - "loss": 0.363, + "epoch": 11.238323551293709, + "grad_norm": 1.1178793907165527, + "learning_rate": 8.274823539640303e-06, + "loss": 0.251, "step": 815700 }, { - "epoch": 8.31, - "learning_rate": 2.6453763591682007e-05, - "loss": 0.363, + "epoch": 11.23970130335345, + "grad_norm": 0.8813108801841736, + "learning_rate": 8.269093102892476e-06, + "loss": 0.2574, "step": 815800 }, { - "epoch": 8.31, - "learning_rate": 2.6447176588933332e-05, - "loss": 0.3863, + "epoch": 11.241079055413188, + "grad_norm": 1.0528873205184937, + "learning_rate": 8.263364299908572e-06, + "loss": 0.2211, "step": 815900 }, { - "epoch": 8.31, - "learning_rate": 2.644058975989652e-05, - "loss": 0.3876, + "epoch": 11.242456807472927, + "grad_norm": 3.4537768363952637, + "learning_rate": 8.257637131175293e-06, + "loss": 0.2969, "step": 816000 }, { - "epoch": 8.31, - "learning_rate": 2.6434003104893632e-05, - "loss": 0.418, + "epoch": 11.243834559532667, + "grad_norm": 2.411853313446045, + "learning_rate": 8.251911597179185e-06, + "loss": 0.2534, "step": 816100 }, { - "epoch": 8.32, - "learning_rate": 2.6427416624246705e-05, - "loss": 0.3698, + "epoch": 11.245212311592406, + "grad_norm": 2.665461778640747, + "learning_rate": 8.246187698406648e-06, + "loss": 0.201, "step": 816200 }, { - "epoch": 8.32, - "learning_rate": 2.6420830318277788e-05, - "loss": 0.3431, + "epoch": 11.246590063652144, + "grad_norm": 3.1475048065185547, + "learning_rate": 8.240465435343977e-06, + "loss": 0.2649, "step": 816300 }, { - "epoch": 8.32, - "learning_rate": 2.641424418730892e-05, - "loss": 0.3325, + "epoch": 11.247967815711885, + "grad_norm": 4.453071117401123, + "learning_rate": 8.234744808477291e-06, + "loss": 0.2324, "step": 816400 }, { - "epoch": 8.32, - "learning_rate": 2.64076582316621e-05, - "loss": 0.4131, + "epoch": 11.249345567771623, + "grad_norm": 1.4761182069778442, + "learning_rate": 8.229025818292605e-06, + "loss": 0.2241, "step": 816500 }, { - "epoch": 8.32, - "learning_rate": 2.6401072451659357e-05, - "loss": 0.3672, + "epoch": 11.250723319831364, + "grad_norm": 9.515789031982422, + "learning_rate": 8.223308465275752e-06, + "loss": 0.2405, "step": 816600 }, { - "epoch": 8.32, - "learning_rate": 2.63944868476227e-05, - "loss": 0.4261, + "epoch": 11.252101071891103, + "grad_norm": 4.889324188232422, + "learning_rate": 8.217592749912477e-06, + "loss": 0.3041, "step": 816700 }, { - "epoch": 8.32, - "learning_rate": 2.638790141987411e-05, - "loss": 0.4085, + "epoch": 11.253478823950841, + "grad_norm": 0.2434859573841095, + "learning_rate": 8.211878672688332e-06, + "loss": 0.2368, "step": 816800 }, { - "epoch": 8.32, - "learning_rate": 2.6381316168735585e-05, - "loss": 0.3531, + "epoch": 11.254856576010582, + "grad_norm": 2.01082706451416, + "learning_rate": 8.206166234088775e-06, + "loss": 0.1933, "step": 816900 }, { - "epoch": 8.32, - "learning_rate": 2.6374731094529105e-05, - "loss": 0.411, + "epoch": 11.25623432807032, + "grad_norm": 2.4082260131835938, + "learning_rate": 8.20045543459911e-06, + "loss": 0.2033, "step": 817000 }, { - "epoch": 8.32, - "learning_rate": 2.6368146197576657e-05, - "loss": 0.359, + "epoch": 11.257612080130059, + "grad_norm": 1.6180120706558228, + "learning_rate": 8.194746274704495e-06, + "loss": 0.2716, "step": 817100 }, { - "epoch": 8.33, - "learning_rate": 2.6361561478200167e-05, - "loss": 0.3749, + "epoch": 11.2589898321898, + "grad_norm": 2.4339616298675537, + "learning_rate": 8.189038754889935e-06, + "loss": 0.2198, "step": 817200 }, { - "epoch": 8.33, - "learning_rate": 2.635497693672161e-05, - "loss": 0.3829, + "epoch": 11.260367584249538, + "grad_norm": 4.213840007781982, + "learning_rate": 8.183332875640339e-06, + "loss": 0.2347, "step": 817300 }, { - "epoch": 8.33, - "learning_rate": 2.6348392573462933e-05, - "loss": 0.3949, + "epoch": 11.261745336309279, + "grad_norm": 4.162495136260986, + "learning_rate": 8.177628637440429e-06, + "loss": 0.2333, "step": 817400 }, { - "epoch": 8.33, - "learning_rate": 2.6341808388746064e-05, - "loss": 0.4639, + "epoch": 11.263123088369017, + "grad_norm": 4.897585868835449, + "learning_rate": 8.17192604077482e-06, + "loss": 0.3174, "step": 817500 }, { - "epoch": 8.33, - "learning_rate": 2.6335224382892933e-05, - "loss": 0.397, + "epoch": 11.264500840428756, + "grad_norm": 2.8180994987487793, + "learning_rate": 8.166225086127982e-06, + "loss": 0.2626, "step": 817600 }, { - "epoch": 8.33, - "learning_rate": 2.6328640556225472e-05, - "loss": 0.4063, + "epoch": 11.265878592488496, + "grad_norm": 2.5467939376831055, + "learning_rate": 8.160525773984237e-06, + "loss": 0.2976, "step": 817700 }, { - "epoch": 8.33, - "learning_rate": 2.6322056909065567e-05, - "loss": 0.4633, + "epoch": 11.267256344548235, + "grad_norm": 2.797877311706543, + "learning_rate": 8.154828104827758e-06, + "loss": 0.2855, "step": 817800 }, { - "epoch": 8.33, - "learning_rate": 2.631547344173513e-05, - "loss": 0.4132, + "epoch": 11.268634096607974, + "grad_norm": 1.1890180110931396, + "learning_rate": 8.149132079142598e-06, + "loss": 0.3223, "step": 817900 }, { - "epoch": 8.33, - "learning_rate": 2.6308890154556056e-05, - "loss": 0.4191, + "epoch": 11.270011848667714, + "grad_norm": 2.9433889389038086, + "learning_rate": 8.143437697412677e-06, + "loss": 0.2795, "step": 818000 }, { - "epoch": 8.33, - "learning_rate": 2.6302307047850222e-05, - "loss": 0.4026, + "epoch": 11.271389600727453, + "grad_norm": 2.9260828495025635, + "learning_rate": 8.137744960121743e-06, + "loss": 0.2693, "step": 818100 }, { - "epoch": 8.34, - "learning_rate": 2.629572412193951e-05, - "loss": 0.3468, + "epoch": 11.272767352787193, + "grad_norm": 0.6027050614356995, + "learning_rate": 8.132053867753438e-06, + "loss": 0.2451, "step": 818200 }, { - "epoch": 8.34, - "learning_rate": 2.6289141377145795e-05, - "loss": 0.3166, + "epoch": 11.274145104846932, + "grad_norm": 5.5438666343688965, + "learning_rate": 8.126364420791244e-06, + "loss": 0.2571, "step": 818300 }, { - "epoch": 8.34, - "learning_rate": 2.6282558813790905e-05, - "loss": 0.4374, + "epoch": 11.27552285690667, + "grad_norm": 0.6320498585700989, + "learning_rate": 8.120676619718499e-06, + "loss": 0.2789, "step": 818400 }, { - "epoch": 8.34, - "learning_rate": 2.6275976432196703e-05, - "loss": 0.4317, + "epoch": 11.276900608966411, + "grad_norm": 2.2426869869232178, + "learning_rate": 8.114990465018417e-06, + "loss": 0.2221, "step": 818500 }, { - "epoch": 8.34, - "learning_rate": 2.6269394232685033e-05, - "loss": 0.3569, + "epoch": 11.27827836102615, + "grad_norm": 2.597557306289673, + "learning_rate": 8.10930595717408e-06, + "loss": 0.2575, "step": 818600 }, { - "epoch": 8.34, - "learning_rate": 2.6262812215577727e-05, - "loss": 0.3371, + "epoch": 11.279656113085888, + "grad_norm": 2.977400779724121, + "learning_rate": 8.103623096668404e-06, + "loss": 0.2387, "step": 818700 }, { - "epoch": 8.34, - "learning_rate": 2.6256230381196597e-05, - "loss": 0.3212, + "epoch": 11.281033865145629, + "grad_norm": 1.482456088066101, + "learning_rate": 8.097941883984165e-06, + "loss": 0.2356, "step": 818800 }, { - "epoch": 8.34, - "learning_rate": 2.6249648729863457e-05, - "loss": 0.3947, + "epoch": 11.282411617205367, + "grad_norm": 2.108266592025757, + "learning_rate": 8.092262319604025e-06, + "loss": 0.2447, "step": 818900 }, { - "epoch": 8.34, - "learning_rate": 2.6243067261900127e-05, - "loss": 0.41, + "epoch": 11.283789369265108, + "grad_norm": 3.8023102283477783, + "learning_rate": 8.0865844040105e-06, + "loss": 0.2554, "step": 819000 }, { - "epoch": 8.35, - "learning_rate": 2.6236485977628376e-05, - "loss": 0.4051, + "epoch": 11.285167121324847, + "grad_norm": 1.9457556009292603, + "learning_rate": 8.080908137685937e-06, + "loss": 0.2658, "step": 819100 }, { - "epoch": 8.35, - "learning_rate": 2.622990487737e-05, - "loss": 0.4169, + "epoch": 11.286544873384585, + "grad_norm": 63.96593475341797, + "learning_rate": 8.075233521112578e-06, + "loss": 0.2808, "step": 819200 }, { - "epoch": 8.35, - "learning_rate": 2.6223323961446783e-05, - "loss": 0.3598, + "epoch": 11.287922625444326, + "grad_norm": 2.547879934310913, + "learning_rate": 8.069560554772525e-06, + "loss": 0.2544, "step": 819300 }, { - "epoch": 8.35, - "learning_rate": 2.621674323018048e-05, - "loss": 0.3422, + "epoch": 11.289300377504064, + "grad_norm": 1.9845930337905884, + "learning_rate": 8.063889239147687e-06, + "loss": 0.2184, "step": 819400 }, { - "epoch": 8.35, - "learning_rate": 2.6210162683892855e-05, - "loss": 0.3323, + "epoch": 11.290678129563803, + "grad_norm": 0.7359409332275391, + "learning_rate": 8.058219574719897e-06, + "loss": 0.2235, "step": 819500 }, { - "epoch": 8.35, - "learning_rate": 2.6203582322905675e-05, - "loss": 0.4301, + "epoch": 11.292055881623543, + "grad_norm": 3.082850217819214, + "learning_rate": 8.052608233920918e-06, + "loss": 0.2599, "step": 819600 }, { - "epoch": 8.35, - "learning_rate": 2.6197002147540646e-05, - "loss": 0.3941, + "epoch": 11.293433633683282, + "grad_norm": 3.8624794483184814, + "learning_rate": 8.046941856808098e-06, + "loss": 0.2968, "step": 819700 }, { - "epoch": 8.35, - "learning_rate": 2.619042215811952e-05, - "loss": 0.3936, + "epoch": 11.29481138574302, + "grad_norm": 1.931824803352356, + "learning_rate": 8.041277132332085e-06, + "loss": 0.2476, "step": 819800 }, { - "epoch": 8.35, - "learning_rate": 2.618384235496402e-05, - "loss": 0.3335, + "epoch": 11.296189137802761, + "grad_norm": 0.8910874128341675, + "learning_rate": 8.035614060974126e-06, + "loss": 0.2725, "step": 819900 }, { - "epoch": 8.35, - "learning_rate": 2.617732853363686e-05, - "loss": 0.4094, + "epoch": 11.2975668898625, + "grad_norm": 2.8127384185791016, + "learning_rate": 8.02995264321533e-06, + "loss": 0.2901, "step": 820000 }, { - "epoch": 8.36, - "learning_rate": 2.617074910210706e-05, - "loss": 0.3468, + "epoch": 11.29894464192224, + "grad_norm": 3.815598249435425, + "learning_rate": 8.024292879536673e-06, + "loss": 0.2368, "step": 820100 }, { - "epoch": 8.36, - "learning_rate": 2.6164169857804765e-05, - "loss": 0.3371, + "epoch": 11.300322393981979, + "grad_norm": 0.46387845277786255, + "learning_rate": 8.018634770418964e-06, + "loss": 0.3128, "step": 820200 }, { - "epoch": 8.36, - "learning_rate": 2.615765659068978e-05, - "loss": 0.4077, + "epoch": 11.301700146041718, + "grad_norm": 2.069627285003662, + "learning_rate": 8.012978316342898e-06, + "loss": 0.2631, "step": 820300 }, { - "epoch": 8.36, - "learning_rate": 2.6151077719927266e-05, - "loss": 0.3025, + "epoch": 11.303077898101458, + "grad_norm": 4.214239120483398, + "learning_rate": 8.007323517789038e-06, + "loss": 0.238, "step": 820400 }, { - "epoch": 8.36, - "learning_rate": 2.6144499037354083e-05, - "loss": 0.4115, + "epoch": 11.304455650161197, + "grad_norm": 2.893728256225586, + "learning_rate": 8.001670375237748e-06, + "loss": 0.2818, "step": 820500 }, { - "epoch": 8.36, - "learning_rate": 2.6137920543291893e-05, - "loss": 0.3197, + "epoch": 11.305833402220935, + "grad_norm": 3.32974910736084, + "learning_rate": 7.996018889169315e-06, + "loss": 0.3177, "step": 820600 }, { - "epoch": 8.36, - "learning_rate": 2.6131342238062333e-05, - "loss": 0.382, + "epoch": 11.307211154280676, + "grad_norm": 6.075284481048584, + "learning_rate": 7.990369060063866e-06, + "loss": 0.2842, "step": 820700 }, { - "epoch": 8.36, - "learning_rate": 2.612476412198705e-05, - "loss": 0.4047, + "epoch": 11.308588906340415, + "grad_norm": 2.7968854904174805, + "learning_rate": 7.984720888401369e-06, + "loss": 0.2269, "step": 820800 }, { - "epoch": 8.36, - "learning_rate": 2.6118186195387694e-05, - "loss": 0.3783, + "epoch": 11.309966658400155, + "grad_norm": 4.132541179656982, + "learning_rate": 7.979074374661682e-06, + "loss": 0.2761, "step": 820900 }, { - "epoch": 8.36, - "learning_rate": 2.611160845858585e-05, - "loss": 0.3644, + "epoch": 11.311344410459894, + "grad_norm": 1.629943609237671, + "learning_rate": 7.973429519324482e-06, + "loss": 0.2532, "step": 821000 }, { - "epoch": 8.37, - "learning_rate": 2.610503091190315e-05, - "loss": 0.3731, + "epoch": 11.312722162519632, + "grad_norm": 1.9705862998962402, + "learning_rate": 7.967786322869353e-06, + "loss": 0.2355, "step": 821100 }, { - "epoch": 8.37, - "learning_rate": 2.60984535556612e-05, - "loss": 0.438, + "epoch": 11.314099914579373, + "grad_norm": 1.30427086353302, + "learning_rate": 7.962144785775694e-06, + "loss": 0.2754, "step": 821200 }, { - "epoch": 8.37, - "learning_rate": 2.609187639018158e-05, - "loss": 0.3298, + "epoch": 11.315477666639111, + "grad_norm": 5.001206398010254, + "learning_rate": 7.956504908522798e-06, + "loss": 0.2215, "step": 821300 }, { - "epoch": 8.37, - "learning_rate": 2.6085299415785885e-05, - "loss": 0.3654, + "epoch": 11.31685541869885, + "grad_norm": 3.0664825439453125, + "learning_rate": 7.950866691589785e-06, + "loss": 0.2376, "step": 821400 }, { - "epoch": 8.37, - "learning_rate": 2.6078722632795694e-05, - "loss": 0.4221, + "epoch": 11.31823317075859, + "grad_norm": 1.475354552268982, + "learning_rate": 7.94523013545567e-06, + "loss": 0.2293, "step": 821500 }, { - "epoch": 8.37, - "learning_rate": 2.6072146041532556e-05, - "loss": 0.3866, + "epoch": 11.31961092281833, + "grad_norm": 1.8670979738235474, + "learning_rate": 7.939651581322954e-06, + "loss": 0.2205, "step": 821600 }, { - "epoch": 8.37, - "learning_rate": 2.606556964231804e-05, - "loss": 0.4139, + "epoch": 11.32098867487807, + "grad_norm": 4.623377799987793, + "learning_rate": 7.934018331603092e-06, + "loss": 0.2757, "step": 821700 }, { - "epoch": 8.37, - "learning_rate": 2.6058993435473685e-05, - "loss": 0.4018, + "epoch": 11.322366426937808, + "grad_norm": 1.1972743272781372, + "learning_rate": 7.928386744113479e-06, + "loss": 0.2226, "step": 821800 }, { - "epoch": 8.37, - "learning_rate": 2.6052417421321037e-05, - "loss": 0.3903, + "epoch": 11.323744178997547, + "grad_norm": 6.778616428375244, + "learning_rate": 7.922756819332532e-06, + "loss": 0.2771, "step": 821900 }, { - "epoch": 8.37, - "learning_rate": 2.6045841600181616e-05, - "loss": 0.3648, + "epoch": 11.325121931057287, + "grad_norm": 2.5903711318969727, + "learning_rate": 7.917128557738557e-06, + "loss": 0.2705, "step": 822000 }, { - "epoch": 8.38, - "learning_rate": 2.6039265972376945e-05, - "loss": 0.3833, + "epoch": 11.326499683117026, + "grad_norm": 7.197052955627441, + "learning_rate": 7.911501959809697e-06, + "loss": 0.2325, "step": 822100 }, { - "epoch": 8.38, - "learning_rate": 2.603269053822854e-05, - "loss": 0.3732, + "epoch": 11.327877435176765, + "grad_norm": 1.4442648887634277, + "learning_rate": 7.905877026023951e-06, + "loss": 0.2209, "step": 822200 }, { - "epoch": 8.38, - "learning_rate": 2.6026115298057883e-05, - "loss": 0.4303, + "epoch": 11.329255187236505, + "grad_norm": 1.5552010536193848, + "learning_rate": 7.900253756859196e-06, + "loss": 0.2312, "step": 822300 }, { - "epoch": 8.38, - "learning_rate": 2.601954025218647e-05, - "loss": 0.4423, + "epoch": 11.330632939296244, + "grad_norm": 3.9230003356933594, + "learning_rate": 7.894632152793163e-06, + "loss": 0.2728, "step": 822400 }, { - "epoch": 8.38, - "learning_rate": 2.6012965400935792e-05, - "loss": 0.392, + "epoch": 11.332010691355984, + "grad_norm": 2.773829698562622, + "learning_rate": 7.889012214303421e-06, + "loss": 0.2302, "step": 822500 }, { - "epoch": 8.38, - "learning_rate": 2.600639074462731e-05, - "loss": 0.4075, + "epoch": 11.333388443415723, + "grad_norm": 3.3008341789245605, + "learning_rate": 7.883393941867428e-06, + "loss": 0.2351, "step": 822600 }, { - "epoch": 8.38, - "learning_rate": 2.5999816283582482e-05, - "loss": 0.3736, + "epoch": 11.334766195475462, + "grad_norm": 3.915314197540283, + "learning_rate": 7.877777335962469e-06, + "loss": 0.227, "step": 822700 }, { - "epoch": 8.38, - "learning_rate": 2.5993242018122777e-05, - "loss": 0.3871, + "epoch": 11.336143947535202, + "grad_norm": 2.4626643657684326, + "learning_rate": 7.872162397065723e-06, + "loss": 0.1891, "step": 822800 }, { - "epoch": 8.38, - "learning_rate": 2.5986667948569622e-05, - "loss": 0.3345, + "epoch": 11.33752169959494, + "grad_norm": 2.7219760417938232, + "learning_rate": 7.866549125654187e-06, + "loss": 0.2518, "step": 822900 }, { - "epoch": 8.38, - "learning_rate": 2.598009407524445e-05, - "loss": 0.3735, + "epoch": 11.33889945165468, + "grad_norm": 2.2657501697540283, + "learning_rate": 7.860937522204746e-06, + "loss": 0.2352, "step": 823000 }, { - "epoch": 8.39, - "learning_rate": 2.5973520398468692e-05, - "loss": 0.4066, + "epoch": 11.34027720371442, + "grad_norm": 3.819279432296753, + "learning_rate": 7.855327587194152e-06, + "loss": 0.2787, "step": 823100 }, { - "epoch": 8.39, - "learning_rate": 2.596694691856375e-05, - "loss": 0.3382, + "epoch": 11.341654955774159, + "grad_norm": 0.12675903737545013, + "learning_rate": 7.849719321098963e-06, + "loss": 0.2518, "step": 823200 }, { - "epoch": 8.39, - "learning_rate": 2.596037363585104e-05, - "loss": 0.3773, + "epoch": 11.343032707833899, + "grad_norm": 1.066680908203125, + "learning_rate": 7.844112724395649e-06, + "loss": 0.275, "step": 823300 }, { - "epoch": 8.39, - "learning_rate": 2.5953800550651954e-05, - "loss": 0.2968, + "epoch": 11.344410459893638, + "grad_norm": 0.3959878981113434, + "learning_rate": 7.838507797560528e-06, + "loss": 0.2252, "step": 823400 }, { - "epoch": 8.39, - "learning_rate": 2.5947227663287884e-05, - "loss": 0.4196, + "epoch": 11.345788211953376, + "grad_norm": 3.517610788345337, + "learning_rate": 7.832904541069748e-06, + "loss": 0.2338, "step": 823500 }, { - "epoch": 8.39, - "learning_rate": 2.5940654974080183e-05, - "loss": 0.4093, + "epoch": 11.347165964013117, + "grad_norm": 2.9037115573883057, + "learning_rate": 7.827302955399343e-06, + "loss": 0.2456, "step": 823600 }, { - "epoch": 8.39, - "learning_rate": 2.5934082483350227e-05, - "loss": 0.4187, + "epoch": 11.348543716072856, + "grad_norm": 3.2053000926971436, + "learning_rate": 7.821703041025206e-06, + "loss": 0.2672, "step": 823700 }, { - "epoch": 8.39, - "learning_rate": 2.5927510191419386e-05, - "loss": 0.3528, + "epoch": 11.349921468132594, + "grad_norm": 2.894637107849121, + "learning_rate": 7.816104798423065e-06, + "loss": 0.2373, "step": 823800 }, { - "epoch": 8.39, - "learning_rate": 2.5920938098608982e-05, - "loss": 0.4095, + "epoch": 11.351299220192335, + "grad_norm": 3.433753252029419, + "learning_rate": 7.810508228068518e-06, + "loss": 0.2369, "step": 823900 }, { - "epoch": 8.4, - "learning_rate": 2.5914366205240363e-05, - "loss": 0.4034, + "epoch": 11.352676972252073, + "grad_norm": 1.3889623880386353, + "learning_rate": 7.804913330437029e-06, + "loss": 0.2429, "step": 824000 }, { - "epoch": 8.4, - "learning_rate": 2.590779451163487e-05, - "loss": 0.4511, + "epoch": 11.354054724311812, + "grad_norm": 1.1524457931518555, + "learning_rate": 7.799320106003922e-06, + "loss": 0.2301, "step": 824100 }, { - "epoch": 8.4, - "learning_rate": 2.5901223018113784e-05, - "loss": 0.374, + "epoch": 11.355432476371552, + "grad_norm": 3.7813222408294678, + "learning_rate": 7.793728555244349e-06, + "loss": 0.2977, "step": 824200 }, { - "epoch": 8.4, - "learning_rate": 2.5894651724998438e-05, - "loss": 0.4259, + "epoch": 11.356810228431291, + "grad_norm": 2.7795779705047607, + "learning_rate": 7.78813867863336e-06, + "loss": 0.3137, "step": 824300 }, { - "epoch": 8.4, - "learning_rate": 2.588808063261013e-05, - "loss": 0.3929, + "epoch": 11.358187980491032, + "grad_norm": 0.5353711843490601, + "learning_rate": 7.782550476645833e-06, + "loss": 0.2466, "step": 824400 }, { - "epoch": 8.4, - "learning_rate": 2.5881509741270134e-05, - "loss": 0.3305, + "epoch": 11.35956573255077, + "grad_norm": 7.459709167480469, + "learning_rate": 7.777019806732113e-06, + "loss": 0.1837, "step": 824500 }, { - "epoch": 8.4, - "learning_rate": 2.5874939051299732e-05, - "loss": 0.359, + "epoch": 11.360943484610509, + "grad_norm": 2.112063407897949, + "learning_rate": 7.771434938657527e-06, + "loss": 0.2484, "step": 824600 }, { - "epoch": 8.4, - "learning_rate": 2.5868368563020205e-05, - "loss": 0.3497, + "epoch": 11.36232123667025, + "grad_norm": 2.9148638248443604, + "learning_rate": 7.765851746625476e-06, + "loss": 0.2424, "step": 824700 }, { - "epoch": 8.4, - "learning_rate": 2.586179827675279e-05, - "loss": 0.4083, + "epoch": 11.363698988729988, + "grad_norm": 4.938729286193848, + "learning_rate": 7.760270231110287e-06, + "loss": 0.2741, "step": 824800 }, { - "epoch": 8.4, - "learning_rate": 2.5855228192818742e-05, - "loss": 0.356, + "epoch": 11.365076740789727, + "grad_norm": 4.208714008331299, + "learning_rate": 7.75469039258613e-06, + "loss": 0.225, "step": 824900 }, { - "epoch": 8.41, - "learning_rate": 2.5848658311539305e-05, - "loss": 0.3516, + "epoch": 11.366454492849467, + "grad_norm": 3.2872002124786377, + "learning_rate": 7.74911223152703e-06, + "loss": 0.2465, "step": 825000 }, { - "epoch": 8.41, - "learning_rate": 2.58420886332357e-05, - "loss": 0.3894, + "epoch": 11.367832244909206, + "grad_norm": 1.6545212268829346, + "learning_rate": 7.7435357484069e-06, + "loss": 0.2494, "step": 825100 }, { - "epoch": 8.41, - "learning_rate": 2.583565054573489e-05, - "loss": 0.3765, + "epoch": 11.369209996968946, + "grad_norm": 1.3026200532913208, + "learning_rate": 7.737960943699476e-06, + "loss": 0.2847, "step": 825200 }, { - "epoch": 8.41, - "learning_rate": 2.5829081270271084e-05, - "loss": 0.4601, + "epoch": 11.370587749028685, + "grad_norm": 0.009530747309327126, + "learning_rate": 7.732387817878377e-06, + "loss": 0.2516, "step": 825300 }, { - "epoch": 8.41, - "learning_rate": 2.5822512198740325e-05, - "loss": 0.3807, + "epoch": 11.371965501088424, + "grad_norm": 3.345026969909668, + "learning_rate": 7.726816371417053e-06, + "loss": 0.2219, "step": 825400 }, { - "epoch": 8.41, - "learning_rate": 2.5815943331463778e-05, - "loss": 0.4226, + "epoch": 11.373343253148164, + "grad_norm": 7.327831745147705, + "learning_rate": 7.72124660478885e-06, + "loss": 0.2598, "step": 825500 }, { - "epoch": 8.41, - "learning_rate": 2.5809374668762632e-05, - "loss": 0.3577, + "epoch": 11.374721005207903, + "grad_norm": 2.048129081726074, + "learning_rate": 7.715678518466923e-06, + "loss": 0.248, "step": 825600 }, { - "epoch": 8.41, - "learning_rate": 2.5802806210958065e-05, - "loss": 0.3851, + "epoch": 11.376098757267641, + "grad_norm": 1.9174838066101074, + "learning_rate": 7.710112112924326e-06, + "loss": 0.2183, "step": 825700 }, { - "epoch": 8.41, - "learning_rate": 2.5796237958371217e-05, - "loss": 0.4374, + "epoch": 11.377476509327382, + "grad_norm": 5.567663669586182, + "learning_rate": 7.704547388633953e-06, + "loss": 0.2857, "step": 825800 }, { - "epoch": 8.41, - "learning_rate": 2.5789669911323246e-05, - "loss": 0.3843, + "epoch": 11.37885426138712, + "grad_norm": 1.9314908981323242, + "learning_rate": 7.698984346068558e-06, + "loss": 0.2518, "step": 825900 }, { - "epoch": 8.42, - "learning_rate": 2.578316774752712e-05, - "loss": 0.4252, + "epoch": 11.38023201344686, + "grad_norm": 2.685676097869873, + "learning_rate": 7.693422985700731e-06, + "loss": 0.2404, "step": 826000 }, { - "epoch": 8.42, - "learning_rate": 2.5776600110456908e-05, - "loss": 0.4374, + "epoch": 11.3816097655066, + "grad_norm": 0.1806098371744156, + "learning_rate": 7.68786330800296e-06, + "loss": 0.2852, "step": 826100 }, { - "epoch": 8.42, - "learning_rate": 2.5770032679885757e-05, - "loss": 0.4129, + "epoch": 11.382987517566338, + "grad_norm": 2.8371596336364746, + "learning_rate": 7.682305313447552e-06, + "loss": 0.3025, "step": 826200 }, { - "epoch": 8.42, - "learning_rate": 2.576346545613475e-05, - "loss": 0.4307, + "epoch": 11.384365269626079, + "grad_norm": 1.0369031429290771, + "learning_rate": 7.676749002506695e-06, + "loss": 0.252, "step": 826300 }, { - "epoch": 8.42, - "learning_rate": 2.5756898439525005e-05, - "loss": 0.4115, + "epoch": 11.385743021685817, + "grad_norm": 2.0922703742980957, + "learning_rate": 7.671194375652431e-06, + "loss": 0.2278, "step": 826400 }, { - "epoch": 8.42, - "learning_rate": 2.5750331630377618e-05, - "loss": 0.3378, + "epoch": 11.387120773745556, + "grad_norm": 3.1735498905181885, + "learning_rate": 7.66564143335665e-06, + "loss": 0.2324, "step": 826500 }, { - "epoch": 8.42, - "learning_rate": 2.5743765029013643e-05, - "loss": 0.3292, + "epoch": 11.388498525805296, + "grad_norm": 2.1867239475250244, + "learning_rate": 7.660090176091093e-06, + "loss": 0.2322, "step": 826600 }, { - "epoch": 8.42, - "learning_rate": 2.5737198635754154e-05, - "loss": 0.3697, + "epoch": 11.389876277865035, + "grad_norm": 5.179723262786865, + "learning_rate": 7.654540604327376e-06, + "loss": 0.2584, "step": 826700 }, { - "epoch": 8.42, - "learning_rate": 2.5730632450920227e-05, - "loss": 0.4742, + "epoch": 11.391254029924776, + "grad_norm": 2.861086845397949, + "learning_rate": 7.648992718536972e-06, + "loss": 0.212, "step": 826800 }, { - "epoch": 8.42, - "learning_rate": 2.5724066474832877e-05, - "loss": 0.3555, + "epoch": 11.392631781984514, + "grad_norm": 3.130150556564331, + "learning_rate": 7.643446519191186e-06, + "loss": 0.2519, "step": 826900 }, { - "epoch": 8.43, - "learning_rate": 2.5717500707813164e-05, - "loss": 0.3648, + "epoch": 11.394009534044253, + "grad_norm": 1.7339131832122803, + "learning_rate": 7.637902006761209e-06, + "loss": 0.2844, "step": 827000 }, { - "epoch": 8.43, - "learning_rate": 2.571093515018211e-05, - "loss": 0.3602, + "epoch": 11.395387286103993, + "grad_norm": 3.3964924812316895, + "learning_rate": 7.632359181718072e-06, + "loss": 0.2686, "step": 827100 }, { - "epoch": 8.43, - "learning_rate": 2.5704369802260743e-05, - "loss": 0.3672, + "epoch": 11.396765038163732, + "grad_norm": 1.8778835535049438, + "learning_rate": 7.626818044532655e-06, + "loss": 0.2757, "step": 827200 }, { - "epoch": 8.43, - "learning_rate": 2.5697804664370046e-05, - "loss": 0.4047, + "epoch": 11.39814279022347, + "grad_norm": 3.3743250370025635, + "learning_rate": 7.621278595675717e-06, + "loss": 0.2364, "step": 827300 }, { - "epoch": 8.43, - "learning_rate": 2.5691239736831026e-05, - "loss": 0.378, + "epoch": 11.399520542283211, + "grad_norm": 2.3929708003997803, + "learning_rate": 7.6157408356178705e-06, + "loss": 0.3007, "step": 827400 }, { - "epoch": 8.43, - "learning_rate": 2.5684675019964678e-05, - "loss": 0.3654, + "epoch": 11.40089829434295, + "grad_norm": 4.376112461090088, + "learning_rate": 7.610204764829557e-06, + "loss": 0.2602, "step": 827500 }, { - "epoch": 8.43, - "learning_rate": 2.567811051409196e-05, - "loss": 0.3638, + "epoch": 11.40227604640269, + "grad_norm": 13.557110786437988, + "learning_rate": 7.604670383781113e-06, + "loss": 0.2618, "step": 827600 }, { - "epoch": 8.43, - "learning_rate": 2.567154621953385e-05, - "loss": 0.362, + "epoch": 11.403653798462429, + "grad_norm": 4.27698278427124, + "learning_rate": 7.599137692942695e-06, + "loss": 0.238, "step": 827700 }, { - "epoch": 8.43, - "learning_rate": 2.5664982136611307e-05, - "loss": 0.3511, + "epoch": 11.405031550522168, + "grad_norm": 3.7122349739074707, + "learning_rate": 7.59366199441552e-06, + "loss": 0.2832, "step": 827800 }, { - "epoch": 8.43, - "learning_rate": 2.5658418265645257e-05, - "loss": 0.3793, + "epoch": 11.406409302581908, + "grad_norm": 3.6863527297973633, + "learning_rate": 7.588132668493303e-06, + "loss": 0.2471, "step": 827900 }, { - "epoch": 8.44, - "learning_rate": 2.5651854606956646e-05, - "loss": 0.3702, + "epoch": 11.407787054641647, + "grad_norm": 3.14986515045166, + "learning_rate": 7.582605034186076e-06, + "loss": 0.2478, "step": 828000 }, { - "epoch": 8.44, - "learning_rate": 2.564529116086641e-05, - "loss": 0.4241, + "epoch": 11.409164806701385, + "grad_norm": 2.0508217811584473, + "learning_rate": 7.577079091963455e-06, + "loss": 0.2425, "step": 828100 }, { - "epoch": 8.44, - "learning_rate": 2.5638727927695423e-05, - "loss": 0.4239, + "epoch": 11.410542558761126, + "grad_norm": 0.10330581665039062, + "learning_rate": 7.571554842294894e-06, + "loss": 0.2787, "step": 828200 }, { - "epoch": 8.44, - "learning_rate": 2.5632164907764627e-05, - "loss": 0.3703, + "epoch": 11.411920310820864, + "grad_norm": 2.3158676624298096, + "learning_rate": 7.566032285649689e-06, + "loss": 0.2445, "step": 828300 }, { - "epoch": 8.44, - "learning_rate": 2.5625602101394912e-05, - "loss": 0.3697, + "epoch": 11.413298062880603, + "grad_norm": 3.709444999694824, + "learning_rate": 7.560511422497025e-06, + "loss": 0.223, "step": 828400 }, { - "epoch": 8.44, - "learning_rate": 2.5619039508907132e-05, - "loss": 0.3987, + "epoch": 11.414675814940344, + "grad_norm": 3.366015911102295, + "learning_rate": 7.554992253305936e-06, + "loss": 0.206, "step": 828500 }, { - "epoch": 8.44, - "learning_rate": 2.561254275334368e-05, - "loss": 0.4274, + "epoch": 11.416053567000082, + "grad_norm": 3.258861541748047, + "learning_rate": 7.54947477854528e-06, + "loss": 0.2309, "step": 828600 }, { - "epoch": 8.44, - "learning_rate": 2.5605980587435584e-05, - "loss": 0.3799, + "epoch": 11.417431319059823, + "grad_norm": 1.229088306427002, + "learning_rate": 7.5439589986838205e-06, + "loss": 0.2624, "step": 828700 }, { - "epoch": 8.44, - "learning_rate": 2.559941863636882e-05, - "loss": 0.4027, + "epoch": 11.418809071119561, + "grad_norm": 1.9554978609085083, + "learning_rate": 7.538444914190128e-06, + "loss": 0.2266, "step": 828800 }, { - "epoch": 8.44, - "learning_rate": 2.559285690046423e-05, - "loss": 0.4281, + "epoch": 11.4201868231793, + "grad_norm": 3.9325079917907715, + "learning_rate": 7.532932525532672e-06, + "loss": 0.25, "step": 828900 }, { - "epoch": 8.45, - "learning_rate": 2.5586295380042625e-05, - "loss": 0.415, + "epoch": 11.42156457523904, + "grad_norm": 1.9586195945739746, + "learning_rate": 7.52742183317974e-06, + "loss": 0.251, "step": 829000 }, { - "epoch": 8.45, - "learning_rate": 2.557973407542484e-05, - "loss": 0.3991, + "epoch": 11.42294232729878, + "grad_norm": 3.803563117980957, + "learning_rate": 7.52191283759951e-06, + "loss": 0.3028, "step": 829100 }, { - "epoch": 8.45, - "learning_rate": 2.5573172986931695e-05, - "loss": 0.3228, + "epoch": 11.424320079358518, + "grad_norm": 0.4155932068824768, + "learning_rate": 7.516405539259993e-06, + "loss": 0.2765, "step": 829200 }, { - "epoch": 8.45, - "learning_rate": 2.556661211488396e-05, - "loss": 0.3241, + "epoch": 11.425697831418258, + "grad_norm": 1.5774508714675903, + "learning_rate": 7.5108999386290515e-06, + "loss": 0.2887, "step": 829300 }, { - "epoch": 8.45, - "learning_rate": 2.5560051459602434e-05, - "loss": 0.4066, + "epoch": 11.427075583477997, + "grad_norm": 2.342172622680664, + "learning_rate": 7.505396036174424e-06, + "loss": 0.2473, "step": 829400 }, { - "epoch": 8.45, - "learning_rate": 2.5553491021407894e-05, - "loss": 0.4635, + "epoch": 11.428453335537737, + "grad_norm": 1.164090871810913, + "learning_rate": 7.4998938323637e-06, + "loss": 0.2701, "step": 829500 }, { - "epoch": 8.45, - "learning_rate": 2.5546930800621115e-05, - "loss": 0.3895, + "epoch": 11.429831087597476, + "grad_norm": 5.180591106414795, + "learning_rate": 7.494393327664313e-06, + "loss": 0.2647, "step": 829600 }, { - "epoch": 8.45, - "learning_rate": 2.554037079756284e-05, - "loss": 0.419, + "epoch": 11.431208839657215, + "grad_norm": 2.7870445251464844, + "learning_rate": 7.488894522543556e-06, + "loss": 0.2297, "step": 829700 }, { - "epoch": 8.45, - "learning_rate": 2.5533811012553817e-05, - "loss": 0.3383, + "epoch": 11.432586591716955, + "grad_norm": 3.5560641288757324, + "learning_rate": 7.483397417468594e-06, + "loss": 0.2615, "step": 829800 }, { - "epoch": 8.46, - "learning_rate": 2.5527251445914798e-05, - "loss": 0.3552, + "epoch": 11.433964343776694, + "grad_norm": 1.4392467737197876, + "learning_rate": 7.477902012906427e-06, + "loss": 0.273, "step": 829900 }, { - "epoch": 8.46, - "learning_rate": 2.5520692097966475e-05, - "loss": 0.3996, + "epoch": 11.435342095836432, + "grad_norm": 0.049318090081214905, + "learning_rate": 7.472408309323909e-06, + "loss": 0.2499, "step": 830000 }, { - "epoch": 8.46, - "learning_rate": 2.5514132969029582e-05, - "loss": 0.4659, + "epoch": 11.436719847896173, + "grad_norm": 2.0783803462982178, + "learning_rate": 7.4669163071877715e-06, + "loss": 0.2269, "step": 830100 }, { - "epoch": 8.46, - "learning_rate": 2.5507574059424818e-05, - "loss": 0.3715, + "epoch": 11.438097599955912, + "grad_norm": 0.8156190514564514, + "learning_rate": 7.461426006964577e-06, + "loss": 0.2097, "step": 830200 }, { - "epoch": 8.46, - "learning_rate": 2.550101536947287e-05, - "loss": 0.3717, + "epoch": 11.439475352015652, + "grad_norm": 0.13967573642730713, + "learning_rate": 7.455992286670897e-06, + "loss": 0.3045, "step": 830300 }, { - "epoch": 8.46, - "learning_rate": 2.549445689949442e-05, - "loss": 0.3539, + "epoch": 11.44085310407539, + "grad_norm": 5.60945987701416, + "learning_rate": 7.450505374641975e-06, + "loss": 0.2743, "step": 830400 }, { - "epoch": 8.46, - "learning_rate": 2.5487898649810157e-05, - "loss": 0.4722, + "epoch": 11.44223085613513, + "grad_norm": 3.027225971221924, + "learning_rate": 7.445020165920197e-06, + "loss": 0.2571, "step": 830500 }, { - "epoch": 8.46, - "learning_rate": 2.5481340620740705e-05, - "loss": 0.4086, + "epoch": 11.44360860819487, + "grad_norm": 3.76334810256958, + "learning_rate": 7.439536660971562e-06, + "loss": 0.2436, "step": 830600 }, { - "epoch": 8.46, - "learning_rate": 2.5474782812606733e-05, - "loss": 0.3772, + "epoch": 11.444986360254608, + "grad_norm": 1.4432705640792847, + "learning_rate": 7.434054860261907e-06, + "loss": 0.2469, "step": 830700 }, { - "epoch": 8.46, - "learning_rate": 2.5468225225728883e-05, - "loss": 0.3444, + "epoch": 11.446364112314347, + "grad_norm": 2.087167501449585, + "learning_rate": 7.428574764256951e-06, + "loss": 0.2584, "step": 830800 }, { - "epoch": 8.47, - "learning_rate": 2.5461667860427767e-05, - "loss": 0.375, + "epoch": 11.447741864374088, + "grad_norm": 1.2185357809066772, + "learning_rate": 7.423096373422268e-06, + "loss": 0.2636, "step": 830900 }, { - "epoch": 8.47, - "learning_rate": 2.5455110717024015e-05, - "loss": 0.3737, + "epoch": 11.449119616433826, + "grad_norm": 3.5740129947662354, + "learning_rate": 7.417619688223244e-06, + "loss": 0.2329, "step": 831000 }, { - "epoch": 8.47, - "learning_rate": 2.5448553795838232e-05, - "loss": 0.3984, + "epoch": 11.450497368493567, + "grad_norm": 1.6161437034606934, + "learning_rate": 7.41214470912517e-06, + "loss": 0.2437, "step": 831100 }, { - "epoch": 8.47, - "learning_rate": 2.5441997097191e-05, - "loss": 0.4175, + "epoch": 11.451875120553305, + "grad_norm": 1.40609872341156, + "learning_rate": 7.40667143659318e-06, + "loss": 0.3074, "step": 831200 }, { - "epoch": 8.47, - "learning_rate": 2.5435440621402906e-05, - "loss": 0.3895, + "epoch": 11.453252872613044, + "grad_norm": 2.3260626792907715, + "learning_rate": 7.401199871092239e-06, + "loss": 0.2335, "step": 831300 }, { - "epoch": 8.47, - "learning_rate": 2.5428884368794526e-05, - "loss": 0.354, + "epoch": 11.454630624672784, + "grad_norm": 0.19453950226306915, + "learning_rate": 7.395730013087202e-06, + "loss": 0.2505, "step": 831400 }, { - "epoch": 8.47, - "learning_rate": 2.5422328339686428e-05, - "loss": 0.3669, + "epoch": 11.456008376732523, + "grad_norm": 2.5065441131591797, + "learning_rate": 7.390261863042747e-06, + "loss": 0.3053, "step": 831500 }, { - "epoch": 8.47, - "learning_rate": 2.5415772534399146e-05, - "loss": 0.4207, + "epoch": 11.457386128792262, + "grad_norm": 0.7695389986038208, + "learning_rate": 7.384795421423431e-06, + "loss": 0.2472, "step": 831600 }, { - "epoch": 8.47, - "learning_rate": 2.5409216953253235e-05, - "loss": 0.3863, + "epoch": 11.458763880852002, + "grad_norm": 5.614015579223633, + "learning_rate": 7.379330688693648e-06, + "loss": 0.2897, "step": 831700 }, { - "epoch": 8.47, - "learning_rate": 2.540266159656923e-05, - "loss": 0.407, + "epoch": 11.460141632911741, + "grad_norm": 1.5453665256500244, + "learning_rate": 7.373867665317661e-06, + "loss": 0.2391, "step": 831800 }, { - "epoch": 8.48, - "learning_rate": 2.5396106464667627e-05, - "loss": 0.4362, + "epoch": 11.461519384971481, + "grad_norm": 2.757171392440796, + "learning_rate": 7.368406351759584e-06, + "loss": 0.23, "step": 831900 }, { - "epoch": 8.48, - "learning_rate": 2.538955155786894e-05, - "loss": 0.3383, + "epoch": 11.46289713703122, + "grad_norm": 6.970754623413086, + "learning_rate": 7.362946748483385e-06, + "loss": 0.2459, "step": 832000 }, { - "epoch": 8.48, - "learning_rate": 2.5382996876493675e-05, - "loss": 0.448, + "epoch": 11.464274889090959, + "grad_norm": 4.505046367645264, + "learning_rate": 7.357488855952867e-06, + "loss": 0.2707, "step": 832100 }, { - "epoch": 8.48, - "learning_rate": 2.5376442420862304e-05, - "loss": 0.3632, + "epoch": 11.4656526411507, + "grad_norm": 1.1355433464050293, + "learning_rate": 7.3520326746317305e-06, + "loss": 0.2396, "step": 832200 }, { - "epoch": 8.48, - "learning_rate": 2.5369888191295306e-05, - "loss": 0.4123, + "epoch": 11.467030393210438, + "grad_norm": 2.8249053955078125, + "learning_rate": 7.346578204983487e-06, + "loss": 0.2348, "step": 832300 }, { - "epoch": 8.48, - "learning_rate": 2.5363334188113158e-05, - "loss": 0.3583, + "epoch": 11.468408145270176, + "grad_norm": 3.498345375061035, + "learning_rate": 7.3411254474715276e-06, + "loss": 0.2403, "step": 832400 }, { - "epoch": 8.48, - "learning_rate": 2.5356780411636284e-05, - "loss": 0.4055, + "epoch": 11.469785897329917, + "grad_norm": 3.023056983947754, + "learning_rate": 7.3356744025591e-06, + "loss": 0.2341, "step": 832500 }, { - "epoch": 8.48, - "learning_rate": 2.5350226862185138e-05, - "loss": 0.3471, + "epoch": 11.471163649389656, + "grad_norm": 3.470332384109497, + "learning_rate": 7.330225070709296e-06, + "loss": 0.3224, "step": 832600 }, { - "epoch": 8.48, - "learning_rate": 2.534367354008016e-05, - "loss": 0.35, + "epoch": 11.472541401449394, + "grad_norm": 0.06458701938390732, + "learning_rate": 7.324777452385049e-06, + "loss": 0.2624, "step": 832700 }, { - "epoch": 8.48, - "learning_rate": 2.533712044564175e-05, - "loss": 0.4121, + "epoch": 11.473919153509135, + "grad_norm": 1.161067008972168, + "learning_rate": 7.319331548049174e-06, + "loss": 0.2452, "step": 832800 }, { - "epoch": 8.49, - "learning_rate": 2.5330633106725255e-05, - "loss": 0.3793, + "epoch": 11.475296905568873, + "grad_norm": 1.7118459939956665, + "learning_rate": 7.313887358164337e-06, + "loss": 0.2769, "step": 832900 }, { - "epoch": 8.49, - "learning_rate": 2.532408046629655e-05, - "loss": 0.3512, + "epoch": 11.476674657628614, + "grad_norm": 0.36733904480934143, + "learning_rate": 7.308444883193029e-06, + "loss": 0.2156, "step": 833000 }, { - "epoch": 8.49, - "learning_rate": 2.5317528054492404e-05, - "loss": 0.3417, + "epoch": 11.478052409688352, + "grad_norm": 4.318186283111572, + "learning_rate": 7.303004123597639e-06, + "loss": 0.2413, "step": 833100 }, { - "epoch": 8.49, - "learning_rate": 2.5310975871633207e-05, - "loss": 0.399, + "epoch": 11.479430161748091, + "grad_norm": 3.4880118370056152, + "learning_rate": 7.297565079840373e-06, + "loss": 0.2868, "step": 833200 }, { - "epoch": 8.49, - "learning_rate": 2.530442391803929e-05, - "loss": 0.4513, + "epoch": 11.480807913807832, + "grad_norm": 3.008613348007202, + "learning_rate": 7.292127752383302e-06, + "loss": 0.2551, "step": 833300 }, { - "epoch": 8.49, - "learning_rate": 2.5297872194031024e-05, - "loss": 0.3194, + "epoch": 11.48218566586757, + "grad_norm": 1.0480797290802002, + "learning_rate": 7.286692141688362e-06, + "loss": 0.263, "step": 833400 }, { - "epoch": 8.49, - "learning_rate": 2.5291320699928757e-05, - "loss": 0.3695, + "epoch": 11.483563417927309, + "grad_norm": 4.988347053527832, + "learning_rate": 7.281258248217341e-06, + "loss": 0.2553, "step": 833500 }, { - "epoch": 8.49, - "learning_rate": 2.5284769436052796e-05, - "loss": 0.4037, + "epoch": 11.48494116998705, + "grad_norm": 1.2590678930282593, + "learning_rate": 7.275826072431872e-06, + "loss": 0.2465, "step": 833600 }, { - "epoch": 8.49, - "learning_rate": 2.5278218402723476e-05, - "loss": 0.3444, + "epoch": 11.486318922046788, + "grad_norm": 1.4309666156768799, + "learning_rate": 7.270395614793436e-06, + "loss": 0.2601, "step": 833700 }, { - "epoch": 8.49, - "learning_rate": 2.5271667600261096e-05, - "loss": 0.3373, + "epoch": 11.487696674106529, + "grad_norm": 2.491604804992676, + "learning_rate": 7.2650211546450615e-06, + "loss": 0.2371, "step": 833800 }, { - "epoch": 8.5, - "learning_rate": 2.526511702898596e-05, - "loss": 0.3818, + "epoch": 11.489074426166267, + "grad_norm": 1.3212556838989258, + "learning_rate": 7.259594117491629e-06, + "loss": 0.2475, "step": 833900 }, { - "epoch": 8.5, - "learning_rate": 2.5258566689218342e-05, - "loss": 0.3999, + "epoch": 11.490452178226006, + "grad_norm": 4.518353462219238, + "learning_rate": 7.254168799864214e-06, + "loss": 0.2192, "step": 834000 }, { - "epoch": 8.5, - "learning_rate": 2.5252016581278507e-05, - "loss": 0.411, + "epoch": 11.491829930285746, + "grad_norm": 5.0000786781311035, + "learning_rate": 7.248745202223739e-06, + "loss": 0.2575, "step": 834100 }, { - "epoch": 8.5, - "learning_rate": 2.5245466705486743e-05, - "loss": 0.3549, + "epoch": 11.493207682345485, + "grad_norm": 3.223984956741333, + "learning_rate": 7.2433233250309705e-06, + "loss": 0.2803, "step": 834200 }, { - "epoch": 8.5, - "learning_rate": 2.523891706216327e-05, - "loss": 0.344, + "epoch": 11.494585434405224, + "grad_norm": 0.8992794156074524, + "learning_rate": 7.2379031687465175e-06, + "loss": 0.2776, "step": 834300 }, { - "epoch": 8.5, - "learning_rate": 2.5232367651628344e-05, - "loss": 0.4131, + "epoch": 11.495963186464964, + "grad_norm": 0.08151814341545105, + "learning_rate": 7.232484733830844e-06, + "loss": 0.2626, "step": 834400 }, { - "epoch": 8.5, - "learning_rate": 2.52258184742022e-05, - "loss": 0.3825, + "epoch": 11.497340938524703, + "grad_norm": 4.147632122039795, + "learning_rate": 7.227068020744279e-06, + "loss": 0.2282, "step": 834500 }, { - "epoch": 8.5, - "learning_rate": 2.521926953020503e-05, - "loss": 0.419, + "epoch": 11.498718690584443, + "grad_norm": 0.3707927167415619, + "learning_rate": 7.2216530299470095e-06, + "loss": 0.2473, "step": 834600 }, { - "epoch": 8.5, - "learning_rate": 2.5212720819957046e-05, - "loss": 0.4012, + "epoch": 11.500096442644182, + "grad_norm": 3.010037899017334, + "learning_rate": 7.216239761899059e-06, + "loss": 0.2504, "step": 834700 }, { - "epoch": 8.51, - "learning_rate": 2.5206172343778455e-05, - "loss": 0.4012, + "epoch": 11.50147419470392, + "grad_norm": 3.460237503051758, + "learning_rate": 7.210828217060303e-06, + "loss": 0.2582, "step": 834800 }, { - "epoch": 8.51, - "learning_rate": 2.5199624101989422e-05, - "loss": 0.4201, + "epoch": 11.502851946763661, + "grad_norm": 2.639714479446411, + "learning_rate": 7.205418395890496e-06, + "loss": 0.2535, "step": 834900 }, { - "epoch": 8.51, - "learning_rate": 2.5193076094910122e-05, - "loss": 0.3052, + "epoch": 11.5042296988234, + "grad_norm": 2.8627631664276123, + "learning_rate": 7.200010298849218e-06, + "loss": 0.2206, "step": 835000 }, { - "epoch": 8.51, - "learning_rate": 2.5186528322860726e-05, - "loss": 0.3954, + "epoch": 11.505607450883138, + "grad_norm": 1.9577405452728271, + "learning_rate": 7.194603926395916e-06, + "loss": 0.2695, "step": 835100 }, { - "epoch": 8.51, - "learning_rate": 2.517998078616136e-05, - "loss": 0.353, + "epoch": 11.506985202942879, + "grad_norm": 6.856706619262695, + "learning_rate": 7.189199278989902e-06, + "loss": 0.2888, "step": 835200 }, { - "epoch": 8.51, - "learning_rate": 2.517343348513217e-05, - "loss": 0.3584, + "epoch": 11.508362955002617, + "grad_norm": 1.630034327507019, + "learning_rate": 7.183796357090317e-06, + "loss": 0.2348, "step": 835300 }, { - "epoch": 8.51, - "learning_rate": 2.516695188957446e-05, - "loss": 0.3826, + "epoch": 11.509740707062358, + "grad_norm": 2.0960500240325928, + "learning_rate": 7.178395161156159e-06, + "loss": 0.228, "step": 835400 }, { - "epoch": 8.51, - "learning_rate": 2.516040505848129e-05, - "loss": 0.3807, + "epoch": 11.511118459122097, + "grad_norm": 3.3258602619171143, + "learning_rate": 7.172995691646301e-06, + "loss": 0.2986, "step": 835500 }, { - "epoch": 8.51, - "learning_rate": 2.5153858464015445e-05, - "loss": 0.3492, + "epoch": 11.512496211181835, + "grad_norm": 2.4146888256073, + "learning_rate": 7.167597949019458e-06, + "loss": 0.2644, "step": 835600 }, { - "epoch": 8.51, - "learning_rate": 2.5147312106496987e-05, - "loss": 0.343, + "epoch": 11.513873963241576, + "grad_norm": 0.9140627384185791, + "learning_rate": 7.1622019337341784e-06, + "loss": 0.2552, "step": 835700 }, { - "epoch": 8.52, - "learning_rate": 2.514076598624601e-05, - "loss": 0.4204, + "epoch": 11.515251715301314, + "grad_norm": 1.3609150648117065, + "learning_rate": 7.156807646248902e-06, + "loss": 0.2445, "step": 835800 }, { - "epoch": 8.52, - "learning_rate": 2.5134220103582574e-05, - "loss": 0.398, + "epoch": 11.516629467361053, + "grad_norm": 2.699533462524414, + "learning_rate": 7.151415087021891e-06, + "loss": 0.2059, "step": 835900 }, { - "epoch": 8.52, - "learning_rate": 2.5127674458826745e-05, - "loss": 0.36, + "epoch": 11.518007219420793, + "grad_norm": 2.312948226928711, + "learning_rate": 7.146024256511262e-06, + "loss": 0.2191, "step": 836000 }, { - "epoch": 8.52, - "learning_rate": 2.5121194505183565e-05, - "loss": 0.3538, + "epoch": 11.519384971480532, + "grad_norm": 2.9952449798583984, + "learning_rate": 7.140635155175005e-06, + "loss": 0.2373, "step": 836100 }, { - "epoch": 8.52, - "learning_rate": 2.5114649334815982e-05, - "loss": 0.3591, + "epoch": 11.520762723540273, + "grad_norm": 2.5041937828063965, + "learning_rate": 7.1352477834709555e-06, + "loss": 0.2313, "step": 836200 }, { - "epoch": 8.52, - "learning_rate": 2.5108104403312897e-05, - "loss": 0.4186, + "epoch": 11.522140475600011, + "grad_norm": 3.4156904220581055, + "learning_rate": 7.129862141856784e-06, + "loss": 0.2752, "step": 836300 }, { - "epoch": 8.52, - "learning_rate": 2.5101559710994328e-05, - "loss": 0.3692, + "epoch": 11.52351822765975, + "grad_norm": 3.0293500423431396, + "learning_rate": 7.1244782307900456e-06, + "loss": 0.2655, "step": 836400 }, { - "epoch": 8.52, - "learning_rate": 2.509501525818024e-05, - "loss": 0.3875, + "epoch": 11.52489597971949, + "grad_norm": 19.489761352539062, + "learning_rate": 7.119096050728113e-06, + "loss": 0.2432, "step": 836500 }, { - "epoch": 8.52, - "learning_rate": 2.5088471045190634e-05, - "loss": 0.4127, + "epoch": 11.526273731779229, + "grad_norm": 2.130206823348999, + "learning_rate": 7.113769398042e-06, + "loss": 0.2462, "step": 836600 }, { - "epoch": 8.52, - "learning_rate": 2.5081927072345486e-05, - "loss": 0.3646, + "epoch": 11.527651483838968, + "grad_norm": 3.5766313076019287, + "learning_rate": 7.108390664039837e-06, + "loss": 0.2663, "step": 836700 }, { - "epoch": 8.53, - "learning_rate": 2.5075383339964758e-05, - "loss": 0.373, + "epoch": 11.529029235898708, + "grad_norm": 2.1663081645965576, + "learning_rate": 7.103013662409202e-06, + "loss": 0.2895, "step": 836800 }, { - "epoch": 8.53, - "learning_rate": 2.5068839848368396e-05, - "loss": 0.3717, + "epoch": 11.530406987958447, + "grad_norm": 1.807091474533081, + "learning_rate": 7.097638393606913e-06, + "loss": 0.2139, "step": 836900 }, { - "epoch": 8.53, - "learning_rate": 2.506229659787633e-05, - "loss": 0.3989, + "epoch": 11.531784740018185, + "grad_norm": 2.310610055923462, + "learning_rate": 7.0922648580896195e-06, + "loss": 0.2193, "step": 837000 }, { - "epoch": 8.53, - "learning_rate": 2.5055753588808512e-05, - "loss": 0.4239, + "epoch": 11.533162492077926, + "grad_norm": 1.3277782201766968, + "learning_rate": 7.086893056313826e-06, + "loss": 0.2319, "step": 837100 }, { - "epoch": 8.53, - "learning_rate": 2.5049210821484822e-05, - "loss": 0.3465, + "epoch": 11.534540244137665, + "grad_norm": 7.29790735244751, + "learning_rate": 7.0815766808259e-06, + "loss": 0.2355, "step": 837200 }, { - "epoch": 8.53, - "learning_rate": 2.5042668296225178e-05, - "loss": 0.372, + "epoch": 11.535917996197405, + "grad_norm": 5.29612398147583, + "learning_rate": 7.076208330553256e-06, + "loss": 0.2602, "step": 837300 }, { - "epoch": 8.53, - "learning_rate": 2.5036126013349477e-05, - "loss": 0.3368, + "epoch": 11.537295748257144, + "grad_norm": 0.4412059187889099, + "learning_rate": 7.07084171538621e-06, + "loss": 0.2582, "step": 837400 }, { - "epoch": 8.53, - "learning_rate": 2.5029583973177585e-05, - "loss": 0.2711, + "epoch": 11.538673500316882, + "grad_norm": 1.5155586004257202, + "learning_rate": 7.065476835780676e-06, + "loss": 0.2515, "step": 837500 }, { - "epoch": 8.53, - "learning_rate": 2.502304217602938e-05, - "loss": 0.4348, + "epoch": 11.540051252376623, + "grad_norm": 0.13803496956825256, + "learning_rate": 7.060113692192419e-06, + "loss": 0.246, "step": 837600 }, { - "epoch": 8.53, - "learning_rate": 2.5016500622224718e-05, - "loss": 0.4055, + "epoch": 11.541429004436361, + "grad_norm": 1.4031720161437988, + "learning_rate": 7.054752285077082e-06, + "loss": 0.2418, "step": 837700 }, { - "epoch": 8.54, - "learning_rate": 2.5009959312083422e-05, - "loss": 0.4398, + "epoch": 11.542806756496102, + "grad_norm": 2.182058572769165, + "learning_rate": 7.049392614890126e-06, + "loss": 0.2996, "step": 837800 }, { - "epoch": 8.54, - "learning_rate": 2.500341824592534e-05, - "loss": 0.4162, + "epoch": 11.54418450855584, + "grad_norm": 2.9265987873077393, + "learning_rate": 7.04403468208689e-06, + "loss": 0.2314, "step": 837900 }, { - "epoch": 8.54, - "learning_rate": 2.4996877424070292e-05, - "loss": 0.4295, + "epoch": 11.54556226061558, + "grad_norm": 3.079571008682251, + "learning_rate": 7.038678487122573e-06, + "loss": 0.2492, "step": 838000 }, { - "epoch": 8.54, - "learning_rate": 2.4990336846838078e-05, - "loss": 0.338, + "epoch": 11.54694001267532, + "grad_norm": 4.0816497802734375, + "learning_rate": 7.033324030452195e-06, + "loss": 0.2177, "step": 838100 }, { - "epoch": 8.54, - "learning_rate": 2.4983796514548496e-05, - "loss": 0.4273, + "epoch": 11.548317764735058, + "grad_norm": 3.5425825119018555, + "learning_rate": 7.027971312530638e-06, + "loss": 0.2991, "step": 838200 }, { - "epoch": 8.54, - "learning_rate": 2.4977256427521325e-05, - "loss": 0.3346, + "epoch": 11.549695516794797, + "grad_norm": 3.13627552986145, + "learning_rate": 7.0226203338126556e-06, + "loss": 0.2338, "step": 838300 }, { - "epoch": 8.54, - "learning_rate": 2.4970716586076355e-05, - "loss": 0.3765, + "epoch": 11.551073268854537, + "grad_norm": 0.2931537926197052, + "learning_rate": 7.017271094752844e-06, + "loss": 0.2394, "step": 838400 }, { - "epoch": 8.54, - "learning_rate": 2.4964176990533317e-05, - "loss": 0.4378, + "epoch": 11.552451020914276, + "grad_norm": 1.6076018810272217, + "learning_rate": 7.0119235958056315e-06, + "loss": 0.235, "step": 838500 }, { - "epoch": 8.54, - "learning_rate": 2.4957637641211975e-05, - "loss": 0.3292, + "epoch": 11.553828772974015, + "grad_norm": 2.1508982181549072, + "learning_rate": 7.006577837425336e-06, + "loss": 0.2503, "step": 838600 }, { - "epoch": 8.54, - "learning_rate": 2.495109853843207e-05, - "loss": 0.3692, + "epoch": 11.555206525033755, + "grad_norm": 3.9645638465881348, + "learning_rate": 7.001233820066097e-06, + "loss": 0.2565, "step": 838700 }, { - "epoch": 8.55, - "learning_rate": 2.4944559682513304e-05, - "loss": 0.4078, + "epoch": 11.556584277093494, + "grad_norm": 1.8535536527633667, + "learning_rate": 6.995891544181909e-06, + "loss": 0.2275, "step": 838800 }, { - "epoch": 8.55, - "learning_rate": 2.49380210737754e-05, - "loss": 0.3623, + "epoch": 11.557962029153234, + "grad_norm": 1.0841606855392456, + "learning_rate": 6.990551010226628e-06, + "loss": 0.2394, "step": 838900 }, { - "epoch": 8.55, - "learning_rate": 2.4931482712538075e-05, - "loss": 0.4284, + "epoch": 11.559339781212973, + "grad_norm": 1.7006827592849731, + "learning_rate": 6.985212218653977e-06, + "loss": 0.2384, "step": 839000 }, { - "epoch": 8.55, - "learning_rate": 2.492494459912098e-05, - "loss": 0.3763, + "epoch": 11.560717533272712, + "grad_norm": 2.692349433898926, + "learning_rate": 6.9798751699175e-06, + "loss": 0.2523, "step": 839100 }, { - "epoch": 8.55, - "learning_rate": 2.491840673384381e-05, - "loss": 0.4238, + "epoch": 11.562095285332452, + "grad_norm": 0.5705708861351013, + "learning_rate": 6.9745398644706e-06, + "loss": 0.2699, "step": 839200 }, { - "epoch": 8.55, - "learning_rate": 2.491186911702623e-05, - "loss": 0.3712, + "epoch": 11.56347303739219, + "grad_norm": 1.5722116231918335, + "learning_rate": 6.969206302766546e-06, + "loss": 0.2228, "step": 839300 }, { - "epoch": 8.55, - "learning_rate": 2.4905331748987882e-05, - "loss": 0.4037, + "epoch": 11.56485078945193, + "grad_norm": 1.0157722234725952, + "learning_rate": 6.963874485258458e-06, + "loss": 0.2048, "step": 839400 }, { - "epoch": 8.55, - "learning_rate": 2.489879463004841e-05, - "loss": 0.3426, + "epoch": 11.56622854151167, + "grad_norm": 0.03590843081474304, + "learning_rate": 6.9585444123992855e-06, + "loss": 0.2562, "step": 839500 }, { - "epoch": 8.55, - "learning_rate": 2.4892257760527445e-05, - "loss": 0.3279, + "epoch": 11.567606293571409, + "grad_norm": 2.249206066131592, + "learning_rate": 6.953216084641856e-06, + "loss": 0.2291, "step": 839600 }, { - "epoch": 8.56, - "learning_rate": 2.4885721140744582e-05, - "loss": 0.3978, + "epoch": 11.568984045631149, + "grad_norm": 2.757840156555176, + "learning_rate": 6.947889502438856e-06, + "loss": 0.2386, "step": 839700 }, { - "epoch": 8.56, - "learning_rate": 2.4879184771019438e-05, - "loss": 0.3805, + "epoch": 11.570361797690888, + "grad_norm": 0.055385906249284744, + "learning_rate": 6.942564666242767e-06, + "loss": 0.2459, "step": 839800 }, { - "epoch": 8.56, - "learning_rate": 2.487264865167161e-05, - "loss": 0.392, + "epoch": 11.571739549750626, + "grad_norm": 4.499365329742432, + "learning_rate": 6.937241576505979e-06, + "loss": 0.2494, "step": 839900 }, { - "epoch": 8.56, - "learning_rate": 2.486611278302065e-05, - "loss": 0.3517, + "epoch": 11.573117301810367, + "grad_norm": 2.5777533054351807, + "learning_rate": 6.931920233680729e-06, + "loss": 0.3055, "step": 840000 }, { - "epoch": 8.56, - "learning_rate": 2.4859577165386143e-05, - "loss": 0.3113, + "epoch": 11.574495053870105, + "grad_norm": 3.3405919075012207, + "learning_rate": 6.926600638219072e-06, + "loss": 0.2401, "step": 840100 }, { - "epoch": 8.56, - "learning_rate": 2.4853041799087633e-05, - "loss": 0.3593, + "epoch": 11.575872805929844, + "grad_norm": 2.5483813285827637, + "learning_rate": 6.921282790572949e-06, + "loss": 0.2528, "step": 840200 }, { - "epoch": 8.56, - "learning_rate": 2.4846506684444678e-05, - "loss": 0.3953, + "epoch": 11.577250557989585, + "grad_norm": 0.49346649646759033, + "learning_rate": 6.915966691194129e-06, + "loss": 0.2241, "step": 840300 }, { - "epoch": 8.56, - "learning_rate": 2.4839971821776776e-05, - "loss": 0.4712, + "epoch": 11.578628310049323, + "grad_norm": 4.195450782775879, + "learning_rate": 6.9106523405342526e-06, + "loss": 0.2292, "step": 840400 }, { - "epoch": 8.56, - "learning_rate": 2.4833437211403462e-05, - "loss": 0.3553, + "epoch": 11.580006062109064, + "grad_norm": 0.4413321912288666, + "learning_rate": 6.905339739044784e-06, + "loss": 0.2374, "step": 840500 }, { - "epoch": 8.56, - "learning_rate": 2.482690285364424e-05, - "loss": 0.3651, + "epoch": 11.581383814168802, + "grad_norm": 0.779982328414917, + "learning_rate": 6.900028887177068e-06, + "loss": 0.2414, "step": 840600 }, { - "epoch": 8.57, - "learning_rate": 2.4820368748818592e-05, - "loss": 0.4076, + "epoch": 11.582761566228541, + "grad_norm": 2.2243194580078125, + "learning_rate": 6.894719785382293e-06, + "loss": 0.2511, "step": 840700 }, { - "epoch": 8.57, - "learning_rate": 2.4813834897246e-05, - "loss": 0.3867, + "epoch": 11.584139318288281, + "grad_norm": 47.015743255615234, + "learning_rate": 6.889412434111491e-06, + "loss": 0.2608, "step": 840800 }, { - "epoch": 8.57, - "learning_rate": 2.4807301299245947e-05, - "loss": 0.3883, + "epoch": 11.58551707034802, + "grad_norm": 3.3263680934906006, + "learning_rate": 6.884106833815538e-06, + "loss": 0.2373, "step": 840900 }, { - "epoch": 8.57, - "learning_rate": 2.480076795513786e-05, - "loss": 0.423, + "epoch": 11.586894822407759, + "grad_norm": 2.4658260345458984, + "learning_rate": 6.878802984945184e-06, + "loss": 0.1951, "step": 841000 }, { - "epoch": 8.57, - "learning_rate": 2.479423486524119e-05, - "loss": 0.3962, + "epoch": 11.5882725744675, + "grad_norm": 0.18568913638591766, + "learning_rate": 6.873500887951007e-06, + "loss": 0.2802, "step": 841100 }, { - "epoch": 8.57, - "learning_rate": 2.4787702029875376e-05, - "loss": 0.3681, + "epoch": 11.589650326527238, + "grad_norm": 4.120938301086426, + "learning_rate": 6.868200543283455e-06, + "loss": 0.2541, "step": 841200 }, { - "epoch": 8.57, - "learning_rate": 2.478116944935982e-05, - "loss": 0.3009, + "epoch": 11.591028078586977, + "grad_norm": 1.1207767724990845, + "learning_rate": 6.8629019513928284e-06, + "loss": 0.2878, "step": 841300 }, { - "epoch": 8.57, - "learning_rate": 2.4774637124013934e-05, - "loss": 0.3684, + "epoch": 11.592405830646717, + "grad_norm": 0.6157251596450806, + "learning_rate": 6.8576051127292565e-06, + "loss": 0.2254, "step": 841400 }, { - "epoch": 8.57, - "learning_rate": 2.4768105054157123e-05, - "loss": 0.4103, + "epoch": 11.593783582706456, + "grad_norm": 3.177783489227295, + "learning_rate": 6.852310027742729e-06, + "loss": 0.259, "step": 841500 }, { - "epoch": 8.57, - "learning_rate": 2.4761573240108738e-05, - "loss": 0.4381, + "epoch": 11.595161334766196, + "grad_norm": 15.110915184020996, + "learning_rate": 6.847016696883097e-06, + "loss": 0.2834, "step": 841600 }, { - "epoch": 8.58, - "learning_rate": 2.4755041682188164e-05, - "loss": 0.4026, + "epoch": 11.596539086825935, + "grad_norm": 5.06250524520874, + "learning_rate": 6.841725120600067e-06, + "loss": 0.2575, "step": 841700 }, { - "epoch": 8.58, - "learning_rate": 2.474851038071475e-05, - "loss": 0.3927, + "epoch": 11.597916838885673, + "grad_norm": 3.6861960887908936, + "learning_rate": 6.836435299343162e-06, + "loss": 0.2741, "step": 841800 }, { - "epoch": 8.58, - "learning_rate": 2.4741979336007844e-05, - "loss": 0.4146, + "epoch": 11.599294590945414, + "grad_norm": 2.7153666019439697, + "learning_rate": 6.831147233561806e-06, + "loss": 0.2765, "step": 841900 }, { - "epoch": 8.58, - "learning_rate": 2.4735448548386764e-05, - "loss": 0.3842, + "epoch": 11.600672343005153, + "grad_norm": 3.024301767349243, + "learning_rate": 6.8258609237052294e-06, + "loss": 0.2263, "step": 842000 }, { - "epoch": 8.58, - "learning_rate": 2.4728918018170834e-05, - "loss": 0.3957, + "epoch": 11.602050095064893, + "grad_norm": 1.231520652770996, + "learning_rate": 6.82057637022253e-06, + "loss": 0.2378, "step": 842100 }, { - "epoch": 8.58, - "learning_rate": 2.4722387745679367e-05, - "loss": 0.3555, + "epoch": 11.603427847124632, + "grad_norm": 1.4079076051712036, + "learning_rate": 6.8153463928315165e-06, + "loss": 0.2738, "step": 842200 }, { - "epoch": 8.58, - "learning_rate": 2.4715857731231635e-05, - "loss": 0.4289, + "epoch": 11.60480559918437, + "grad_norm": 1.1549965143203735, + "learning_rate": 6.810065335868344e-06, + "loss": 0.253, "step": 842300 }, { - "epoch": 8.58, - "learning_rate": 2.4709393271427827e-05, - "loss": 0.268, + "epoch": 11.60618335124411, + "grad_norm": 2.221769094467163, + "learning_rate": 6.80478603662098e-06, + "loss": 0.2604, "step": 842400 }, { - "epoch": 8.58, - "learning_rate": 2.4702863771437e-05, - "loss": 0.3139, + "epoch": 11.60756110330385, + "grad_norm": 3.4098496437072754, + "learning_rate": 6.799508495537916e-06, + "loss": 0.208, "step": 842500 }, { - "epoch": 8.58, - "learning_rate": 2.4696334530444523e-05, - "loss": 0.4254, + "epoch": 11.608938855363588, + "grad_norm": 6.142827987670898, + "learning_rate": 6.794232713067504e-06, + "loss": 0.2786, "step": 842600 }, { - "epoch": 8.59, - "learning_rate": 2.4689805548769654e-05, - "loss": 0.4048, + "epoch": 11.610316607423329, + "grad_norm": 2.446105718612671, + "learning_rate": 6.788958689657957e-06, + "loss": 0.2241, "step": 842700 }, { - "epoch": 8.59, - "learning_rate": 2.468327682673159e-05, - "loss": 0.3038, + "epoch": 11.611694359483067, + "grad_norm": 2.647216320037842, + "learning_rate": 6.783686425757321e-06, + "loss": 0.2446, "step": 842800 }, { - "epoch": 8.59, - "learning_rate": 2.4676748364649556e-05, - "loss": 0.4015, + "epoch": 11.613072111542806, + "grad_norm": 0.5367299914360046, + "learning_rate": 6.778415921813505e-06, + "loss": 0.3103, "step": 842900 }, { - "epoch": 8.59, - "learning_rate": 2.4670220162842773e-05, - "loss": 0.426, + "epoch": 11.614449863602546, + "grad_norm": 3.217264413833618, + "learning_rate": 6.773147178274276e-06, + "loss": 0.3026, "step": 843000 }, { - "epoch": 8.59, - "learning_rate": 2.4663692221630406e-05, - "loss": 0.4436, + "epoch": 11.615827615662285, + "grad_norm": 2.834268808364868, + "learning_rate": 6.767880195587232e-06, + "loss": 0.2602, "step": 843100 }, { - "epoch": 8.59, - "learning_rate": 2.465716454133164e-05, - "loss": 0.4208, + "epoch": 11.617205367722025, + "grad_norm": 10.983319282531738, + "learning_rate": 6.762614974199824e-06, + "loss": 0.295, "step": 843200 }, { - "epoch": 8.59, - "learning_rate": 2.4650637122265667e-05, - "loss": 0.3984, + "epoch": 11.618583119781764, + "grad_norm": 1.2028474807739258, + "learning_rate": 6.7573515145593644e-06, + "loss": 0.2698, "step": 843300 }, { - "epoch": 8.59, - "learning_rate": 2.4644109964751595e-05, - "loss": 0.3664, + "epoch": 11.619960871841503, + "grad_norm": 5.578560829162598, + "learning_rate": 6.7520898171130195e-06, + "loss": 0.2524, "step": 843400 }, { - "epoch": 8.59, - "learning_rate": 2.4637583069108588e-05, - "loss": 0.3739, + "epoch": 11.621338623901243, + "grad_norm": 2.5727131366729736, + "learning_rate": 6.746829882307782e-06, + "loss": 0.2366, "step": 843500 }, { - "epoch": 8.59, - "learning_rate": 2.463112170069141e-05, - "loss": 0.3658, + "epoch": 11.622716375960982, + "grad_norm": 2.489309310913086, + "learning_rate": 6.741571710590527e-06, + "loss": 0.2462, "step": 843600 }, { - "epoch": 8.6, - "learning_rate": 2.4624595327121215e-05, - "loss": 0.3545, + "epoch": 11.62409412802072, + "grad_norm": 1.6929024457931519, + "learning_rate": 6.736315302407955e-06, + "loss": 0.2186, "step": 843700 }, { - "epoch": 8.6, - "learning_rate": 2.4618069216376236e-05, - "loss": 0.3572, + "epoch": 11.625471880080461, + "grad_norm": 2.097480297088623, + "learning_rate": 6.731060658206617e-06, + "loss": 0.2461, "step": 843800 }, { - "epoch": 8.6, - "learning_rate": 2.4611543368775544e-05, - "loss": 0.3912, + "epoch": 11.6268496321402, + "grad_norm": 2.3578040599823, + "learning_rate": 6.725807778432927e-06, + "loss": 0.2533, "step": 843900 }, { - "epoch": 8.6, - "learning_rate": 2.460501778463823e-05, - "loss": 0.3803, + "epoch": 11.62822738419994, + "grad_norm": 2.406205177307129, + "learning_rate": 6.720556663533157e-06, + "loss": 0.2551, "step": 844000 }, { - "epoch": 8.6, - "learning_rate": 2.4598492464283363e-05, - "loss": 0.4127, + "epoch": 11.629605136259679, + "grad_norm": 1.4332845211029053, + "learning_rate": 6.715307313953405e-06, + "loss": 0.2927, "step": 844100 }, { - "epoch": 8.6, - "learning_rate": 2.4591967408029968e-05, - "loss": 0.3454, + "epoch": 11.630982888319418, + "grad_norm": 2.657992362976074, + "learning_rate": 6.710059730139621e-06, + "loss": 0.225, "step": 844200 }, { - "epoch": 8.6, - "learning_rate": 2.458544261619709e-05, - "loss": 0.3731, + "epoch": 11.632360640379158, + "grad_norm": 3.3074700832366943, + "learning_rate": 6.704813912537625e-06, + "loss": 0.2334, "step": 844300 }, { - "epoch": 8.6, - "learning_rate": 2.4578918089103765e-05, - "loss": 0.3481, + "epoch": 11.633738392438897, + "grad_norm": 0.8223738670349121, + "learning_rate": 6.6995698615930815e-06, + "loss": 0.2664, "step": 844400 }, { - "epoch": 8.6, - "learning_rate": 2.457239382706899e-05, - "loss": 0.3904, + "epoch": 11.635116144498635, + "grad_norm": 2.2995352745056152, + "learning_rate": 6.6943275777514825e-06, + "loss": 0.2291, "step": 844500 }, { - "epoch": 8.6, - "learning_rate": 2.456586983041176e-05, - "loss": 0.3711, + "epoch": 11.636493896558376, + "grad_norm": 4.352041244506836, + "learning_rate": 6.6890870614581995e-06, + "loss": 0.2339, "step": 844600 }, { - "epoch": 8.61, - "learning_rate": 2.4559346099451083e-05, - "loss": 0.3356, + "epoch": 11.637871648618114, + "grad_norm": 1.7202414274215698, + "learning_rate": 6.683848313158454e-06, + "loss": 0.2454, "step": 844700 }, { - "epoch": 8.61, - "learning_rate": 2.45528226345059e-05, - "loss": 0.4755, + "epoch": 11.639249400677855, + "grad_norm": 1.3922858238220215, + "learning_rate": 6.678663694340655e-06, + "loss": 0.2758, "step": 844800 }, { - "epoch": 8.61, - "learning_rate": 2.4546299435895182e-05, - "loss": 0.3877, + "epoch": 11.640627152737594, + "grad_norm": 3.378896474838257, + "learning_rate": 6.673428465671927e-06, + "loss": 0.2689, "step": 844900 }, { - "epoch": 8.61, - "learning_rate": 2.4539776503937877e-05, - "loss": 0.3691, + "epoch": 11.642004904797332, + "grad_norm": 2.1404786109924316, + "learning_rate": 6.668195006326998e-06, + "loss": 0.2037, "step": 845000 }, { - "epoch": 8.61, - "learning_rate": 2.453325383895292e-05, - "loss": 0.4066, + "epoch": 11.643382656857073, + "grad_norm": 3.217641830444336, + "learning_rate": 6.662963316750487e-06, + "loss": 0.2665, "step": 845100 }, { - "epoch": 8.61, - "learning_rate": 2.452679666391203e-05, - "loss": 0.4694, + "epoch": 11.644760408916811, + "grad_norm": 2.824876070022583, + "learning_rate": 6.657733397386842e-06, + "loss": 0.2418, "step": 845200 }, { - "epoch": 8.61, - "learning_rate": 2.452027453115082e-05, - "loss": 0.3818, + "epoch": 11.64613816097655, + "grad_norm": 1.7568999528884888, + "learning_rate": 6.6525052486803655e-06, + "loss": 0.249, "step": 845300 }, { - "epoch": 8.61, - "learning_rate": 2.4513752666315484e-05, - "loss": 0.3189, + "epoch": 11.64751591303629, + "grad_norm": 2.377793073654175, + "learning_rate": 6.647278871075225e-06, + "loss": 0.2878, "step": 845400 }, { - "epoch": 8.61, - "learning_rate": 2.4507231069724906e-05, - "loss": 0.3506, + "epoch": 11.648893665096029, + "grad_norm": 3.0749120712280273, + "learning_rate": 6.6420542650154345e-06, + "loss": 0.2163, "step": 845500 }, { - "epoch": 8.62, - "learning_rate": 2.4500709741697955e-05, - "loss": 0.3871, + "epoch": 11.650271417155768, + "grad_norm": 2.069171190261841, + "learning_rate": 6.636831430944832e-06, + "loss": 0.2063, "step": 845600 }, { - "epoch": 8.62, - "learning_rate": 2.449418868255347e-05, - "loss": 0.3776, + "epoch": 11.651649169215508, + "grad_norm": 4.620945453643799, + "learning_rate": 6.631610369307137e-06, + "loss": 0.2982, "step": 845700 }, { - "epoch": 8.62, - "learning_rate": 2.4487667892610305e-05, - "loss": 0.4138, + "epoch": 11.653026921275247, + "grad_norm": 3.1854896545410156, + "learning_rate": 6.6264432646563294e-06, + "loss": 0.2258, "step": 845800 }, { - "epoch": 8.62, - "learning_rate": 2.4481147372187297e-05, - "loss": 0.3187, + "epoch": 11.654404673334987, + "grad_norm": 2.1610448360443115, + "learning_rate": 6.621225731479576e-06, + "loss": 0.2812, "step": 845900 }, { - "epoch": 8.62, - "learning_rate": 2.4474627121603233e-05, - "loss": 0.3914, + "epoch": 11.655782425394726, + "grad_norm": 2.504307985305786, + "learning_rate": 6.616009972061503e-06, + "loss": 0.2451, "step": 846000 }, { - "epoch": 8.62, - "learning_rate": 2.4468107141176932e-05, - "loss": 0.3118, + "epoch": 11.657160177454465, + "grad_norm": 1.9328135251998901, + "learning_rate": 6.6107959868452245e-06, + "loss": 0.2721, "step": 846100 }, { - "epoch": 8.62, - "learning_rate": 2.446158743122718e-05, - "loss": 0.3505, + "epoch": 11.658537929514205, + "grad_norm": 0.039619311690330505, + "learning_rate": 6.605583776273706e-06, + "loss": 0.2269, "step": 846200 }, { - "epoch": 8.62, - "learning_rate": 2.445506799207275e-05, - "loss": 0.4134, + "epoch": 11.659915681573944, + "grad_norm": 2.4206550121307373, + "learning_rate": 6.600373340789739e-06, + "loss": 0.2651, "step": 846300 }, { - "epoch": 8.62, - "learning_rate": 2.4448548824032404e-05, - "loss": 0.4096, + "epoch": 11.661293433633684, + "grad_norm": 2.9655678272247314, + "learning_rate": 6.595164680835969e-06, + "loss": 0.2417, "step": 846400 }, { - "epoch": 8.62, - "learning_rate": 2.4442029927424902e-05, - "loss": 0.3713, + "epoch": 11.662671185693423, + "grad_norm": 1.7894355058670044, + "learning_rate": 6.5899577968549165e-06, + "loss": 0.2761, "step": 846500 }, { - "epoch": 8.63, - "learning_rate": 2.4435511302568954e-05, - "loss": 0.3785, + "epoch": 11.664048937753162, + "grad_norm": 0.5256152749061584, + "learning_rate": 6.584752689288916e-06, + "loss": 0.2438, "step": 846600 }, { - "epoch": 8.63, - "learning_rate": 2.44289929497833e-05, - "loss": 0.3656, + "epoch": 11.665426689812902, + "grad_norm": 0.6700682640075684, + "learning_rate": 6.5795493585801786e-06, + "loss": 0.2662, "step": 846700 }, { - "epoch": 8.63, - "learning_rate": 2.4422474869386656e-05, - "loss": 0.3956, + "epoch": 11.66680444187264, + "grad_norm": 1.5879675149917603, + "learning_rate": 6.574347805170759e-06, + "loss": 0.2817, "step": 846800 }, { - "epoch": 8.63, - "learning_rate": 2.441595706169769e-05, - "loss": 0.4015, + "epoch": 11.66818219393238, + "grad_norm": 20.59263038635254, + "learning_rate": 6.569148029502546e-06, + "loss": 0.3195, "step": 846900 }, { - "epoch": 8.63, - "learning_rate": 2.44094395270351e-05, - "loss": 0.3579, + "epoch": 11.66955994599212, + "grad_norm": 2.663388967514038, + "learning_rate": 6.563950032017284e-06, + "loss": 0.243, "step": 847000 }, { - "epoch": 8.63, - "learning_rate": 2.440292226571757e-05, - "loss": 0.4449, + "epoch": 11.670937698051858, + "grad_norm": 2.23234486579895, + "learning_rate": 6.558753813156573e-06, + "loss": 0.2536, "step": 847100 }, { - "epoch": 8.63, - "learning_rate": 2.4396405278063728e-05, - "loss": 0.4126, + "epoch": 11.672315450111597, + "grad_norm": 2.4206724166870117, + "learning_rate": 6.553559373361867e-06, + "loss": 0.2948, "step": 847200 }, { - "epoch": 8.63, - "learning_rate": 2.4389888564392224e-05, - "loss": 0.4421, + "epoch": 11.673693202171338, + "grad_norm": 2.317769765853882, + "learning_rate": 6.548366713074447e-06, + "loss": 0.2916, "step": 847300 }, { - "epoch": 8.63, - "learning_rate": 2.4383372125021695e-05, - "loss": 0.3349, + "epoch": 11.675070954231076, + "grad_norm": 4.524205207824707, + "learning_rate": 6.543175832735471e-06, + "loss": 0.2382, "step": 847400 }, { - "epoch": 8.63, - "learning_rate": 2.4376855960270753e-05, - "loss": 0.3259, + "epoch": 11.676448706290817, + "grad_norm": 1.031415581703186, + "learning_rate": 6.537986732785923e-06, + "loss": 0.1966, "step": 847500 }, { - "epoch": 8.64, - "learning_rate": 2.4370340070457995e-05, - "loss": 0.3161, + "epoch": 11.677826458350555, + "grad_norm": 1.2295082807540894, + "learning_rate": 6.532799413666634e-06, + "loss": 0.2782, "step": 847600 }, { - "epoch": 8.64, - "learning_rate": 2.4363824455902007e-05, - "loss": 0.4078, + "epoch": 11.679204210410294, + "grad_norm": 2.6194474697113037, + "learning_rate": 6.527613875818305e-06, + "loss": 0.2493, "step": 847700 }, { - "epoch": 8.64, - "learning_rate": 2.435730911692138e-05, - "loss": 0.4066, + "epoch": 11.680581962470034, + "grad_norm": 1.2957565784454346, + "learning_rate": 6.522430119681476e-06, + "loss": 0.2252, "step": 847800 }, { - "epoch": 8.64, - "learning_rate": 2.4350794053834654e-05, - "loss": 0.3384, + "epoch": 11.681959714529773, + "grad_norm": 2.196861982345581, + "learning_rate": 6.5172481456965225e-06, + "loss": 0.2161, "step": 847900 }, { - "epoch": 8.64, - "learning_rate": 2.4344279266960386e-05, - "loss": 0.4505, + "epoch": 11.683337466589512, + "grad_norm": 3.391505002975464, + "learning_rate": 6.512067954303693e-06, + "loss": 0.2171, "step": 848000 }, { - "epoch": 8.64, - "learning_rate": 2.4337764756617117e-05, - "loss": 0.4436, + "epoch": 11.684715218649252, + "grad_norm": 1.6934423446655273, + "learning_rate": 6.506889545943055e-06, + "loss": 0.2348, "step": 848100 }, { - "epoch": 8.64, - "learning_rate": 2.4331250523123356e-05, - "loss": 0.359, + "epoch": 11.686092970708991, + "grad_norm": 4.701761245727539, + "learning_rate": 6.501712921054559e-06, + "loss": 0.2143, "step": 848200 }, { - "epoch": 8.64, - "learning_rate": 2.432480170498785e-05, - "loss": 0.3258, + "epoch": 11.687470722768731, + "grad_norm": 10.194208145141602, + "learning_rate": 6.496538080077969e-06, + "loss": 0.2064, "step": 848300 }, { - "epoch": 8.64, - "learning_rate": 2.4318288023372174e-05, - "loss": 0.3435, + "epoch": 11.68884847482847, + "grad_norm": 2.013437509536743, + "learning_rate": 6.491365023452925e-06, + "loss": 0.2317, "step": 848400 }, { - "epoch": 8.64, - "learning_rate": 2.4311774619558314e-05, - "loss": 0.3454, + "epoch": 11.690226226888209, + "grad_norm": 2.2102787494659424, + "learning_rate": 6.486193751618916e-06, + "loss": 0.2331, "step": 848500 }, { - "epoch": 8.65, - "learning_rate": 2.4305261493864744e-05, - "loss": 0.3235, + "epoch": 11.69160397894795, + "grad_norm": 2.1643428802490234, + "learning_rate": 6.481024265015235e-06, + "loss": 0.2533, "step": 848600 }, { - "epoch": 8.65, - "learning_rate": 2.4298748646609885e-05, - "loss": 0.3534, + "epoch": 11.692981731007688, + "grad_norm": 1.741525411605835, + "learning_rate": 6.475856564081078e-06, + "loss": 0.2438, "step": 848700 }, { - "epoch": 8.65, - "learning_rate": 2.4292236078112192e-05, - "loss": 0.352, + "epoch": 11.694359483067426, + "grad_norm": 1.0578083992004395, + "learning_rate": 6.470690649255471e-06, + "loss": 0.2508, "step": 848800 }, { - "epoch": 8.65, - "learning_rate": 2.4285723788690104e-05, - "loss": 0.4259, + "epoch": 11.695737235127167, + "grad_norm": 2.697371244430542, + "learning_rate": 6.4655265209772726e-06, + "loss": 0.2536, "step": 848900 }, { - "epoch": 8.65, - "learning_rate": 2.4279211778662013e-05, - "loss": 0.4238, + "epoch": 11.697114987186906, + "grad_norm": 1.634110689163208, + "learning_rate": 6.460364179685206e-06, + "loss": 0.2694, "step": 849000 }, { - "epoch": 8.65, - "learning_rate": 2.4272700048346322e-05, - "loss": 0.3218, + "epoch": 11.698492739246646, + "grad_norm": 2.7399866580963135, + "learning_rate": 6.455203625817849e-06, + "loss": 0.3063, "step": 849100 }, { - "epoch": 8.65, - "learning_rate": 2.426618859806144e-05, - "loss": 0.4105, + "epoch": 11.699870491306385, + "grad_norm": 2.803243637084961, + "learning_rate": 6.4500448598136085e-06, + "loss": 0.2526, "step": 849200 }, { - "epoch": 8.65, - "learning_rate": 2.4259677428125703e-05, - "loss": 0.3747, + "epoch": 11.701248243366123, + "grad_norm": 1.7457787990570068, + "learning_rate": 6.444887882110738e-06, + "loss": 0.2406, "step": 849300 }, { - "epoch": 8.65, - "learning_rate": 2.4253166538857482e-05, - "loss": 0.3516, + "epoch": 11.702625995425864, + "grad_norm": 1.6382092237472534, + "learning_rate": 6.439732693147364e-06, + "loss": 0.2248, "step": 849400 }, { - "epoch": 8.65, - "learning_rate": 2.424665593057513e-05, - "loss": 0.381, + "epoch": 11.704003747485602, + "grad_norm": 1.9540066719055176, + "learning_rate": 6.434579293361444e-06, + "loss": 0.2177, "step": 849500 }, { - "epoch": 8.66, - "learning_rate": 2.424014560359696e-05, - "loss": 0.4997, + "epoch": 11.705381499545341, + "grad_norm": 2.8187649250030518, + "learning_rate": 6.429427683190786e-06, + "loss": 0.2725, "step": 849600 }, { - "epoch": 8.66, - "learning_rate": 2.42336355582413e-05, - "loss": 0.4258, + "epoch": 11.706759251605082, + "grad_norm": 0.25920233130455017, + "learning_rate": 6.424277863073033e-06, + "loss": 0.2198, "step": 849700 }, { - "epoch": 8.66, - "learning_rate": 2.4227125794826447e-05, - "loss": 0.3508, + "epoch": 11.70813700366482, + "grad_norm": 1.2644122838974, + "learning_rate": 6.419129833445709e-06, + "loss": 0.251, "step": 849800 }, { - "epoch": 8.66, - "learning_rate": 2.4220616313670704e-05, - "loss": 0.3723, + "epoch": 11.709514755724559, + "grad_norm": 2.229199171066284, + "learning_rate": 6.413983594746143e-06, + "loss": 0.2385, "step": 849900 }, { - "epoch": 8.66, - "learning_rate": 2.421410711509232e-05, - "loss": 0.3587, + "epoch": 11.7108925077843, + "grad_norm": 6.176631927490234, + "learning_rate": 6.408839147411549e-06, + "loss": 0.2272, "step": 850000 }, { - "epoch": 8.66, - "learning_rate": 2.4207598199409574e-05, - "loss": 0.3808, + "epoch": 11.712270259844038, + "grad_norm": 4.0549139976501465, + "learning_rate": 6.403696491878979e-06, + "loss": 0.2202, "step": 850100 }, { - "epoch": 8.66, - "learning_rate": 2.4201089566940715e-05, - "loss": 0.369, + "epoch": 11.713648011903778, + "grad_norm": 0.3421748876571655, + "learning_rate": 6.398555628585318e-06, + "loss": 0.2301, "step": 850200 }, { - "epoch": 8.66, - "learning_rate": 2.4194581218003958e-05, - "loss": 0.4437, + "epoch": 11.715025763963517, + "grad_norm": 8.835271835327148, + "learning_rate": 6.393416557967303e-06, + "loss": 0.2232, "step": 850300 }, { - "epoch": 8.66, - "learning_rate": 2.4188073152917537e-05, - "loss": 0.395, + "epoch": 11.716403516023256, + "grad_norm": 6.385168075561523, + "learning_rate": 6.3882792804615335e-06, + "loss": 0.246, "step": 850400 }, { - "epoch": 8.67, - "learning_rate": 2.4181565371999672e-05, - "loss": 0.3717, + "epoch": 11.717781268082996, + "grad_norm": 8.807048797607422, + "learning_rate": 6.383143796504454e-06, + "loss": 0.2604, "step": 850500 }, { - "epoch": 8.67, - "learning_rate": 2.417505787556852e-05, - "loss": 0.4071, + "epoch": 11.719159020142735, + "grad_norm": 3.4053125381469727, + "learning_rate": 6.378010106532331e-06, + "loss": 0.2763, "step": 850600 }, { - "epoch": 8.67, - "learning_rate": 2.416855066394228e-05, - "loss": 0.412, + "epoch": 11.720536772202475, + "grad_norm": 2.7771785259246826, + "learning_rate": 6.372878210981319e-06, + "loss": 0.2316, "step": 850700 }, { - "epoch": 8.67, - "learning_rate": 2.416204373743912e-05, - "loss": 0.3478, + "epoch": 11.721914524262214, + "grad_norm": 2.2014052867889404, + "learning_rate": 6.367748110287389e-06, + "loss": 0.2479, "step": 850800 }, { - "epoch": 8.67, - "learning_rate": 2.4155602161373812e-05, - "loss": 0.4224, + "epoch": 11.723292276321953, + "grad_norm": 1.3412858247756958, + "learning_rate": 6.362619804886361e-06, + "loss": 0.2076, "step": 850900 }, { - "epoch": 8.67, - "learning_rate": 2.4149095803212062e-05, - "loss": 0.4397, + "epoch": 11.724670028381693, + "grad_norm": 4.020803928375244, + "learning_rate": 6.357493295213918e-06, + "loss": 0.2481, "step": 851000 }, { - "epoch": 8.67, - "learning_rate": 2.41425897311246e-05, - "loss": 0.3413, + "epoch": 11.726047780441432, + "grad_norm": 3.667405843734741, + "learning_rate": 6.352368581705591e-06, + "loss": 0.2889, "step": 851100 }, { - "epoch": 8.67, - "learning_rate": 2.413608394542955e-05, - "loss": 0.3791, + "epoch": 11.72742553250117, + "grad_norm": 2.3711185455322266, + "learning_rate": 6.347245664796738e-06, + "loss": 0.2992, "step": 851200 }, { - "epoch": 8.67, - "learning_rate": 2.4129578446445012e-05, - "loss": 0.3796, + "epoch": 11.728803284560911, + "grad_norm": 2.894505500793457, + "learning_rate": 6.342124544922587e-06, + "loss": 0.2308, "step": 851300 }, { - "epoch": 8.67, - "learning_rate": 2.4123073234489043e-05, - "loss": 0.3877, + "epoch": 11.73018103662065, + "grad_norm": 1.6895906925201416, + "learning_rate": 6.337005222518194e-06, + "loss": 0.2841, "step": 851400 }, { - "epoch": 8.68, - "learning_rate": 2.411656830987972e-05, - "loss": 0.3652, + "epoch": 11.731558788680388, + "grad_norm": 0.23754964768886566, + "learning_rate": 6.331887698018483e-06, + "loss": 0.2685, "step": 851500 }, { - "epoch": 8.68, - "learning_rate": 2.4110063672935103e-05, - "loss": 0.3182, + "epoch": 11.732936540740129, + "grad_norm": 3.2579987049102783, + "learning_rate": 6.3267719718582004e-06, + "loss": 0.2747, "step": 851600 }, { - "epoch": 8.68, - "learning_rate": 2.4103559323973218e-05, - "loss": 0.4186, + "epoch": 11.734314292799867, + "grad_norm": 6.439496994018555, + "learning_rate": 6.321658044471959e-06, + "loss": 0.2848, "step": 851700 }, { - "epoch": 8.68, - "learning_rate": 2.4097055263312094e-05, - "loss": 0.3629, + "epoch": 11.735692044859608, + "grad_norm": 2.8947651386260986, + "learning_rate": 6.316597028668494e-06, + "loss": 0.2633, "step": 851800 }, { - "epoch": 8.68, - "learning_rate": 2.409055149126975e-05, - "loss": 0.3527, + "epoch": 11.737069796919346, + "grad_norm": 2.745131731033325, + "learning_rate": 6.311486682134979e-06, + "loss": 0.2302, "step": 851900 }, { - "epoch": 8.68, - "learning_rate": 2.4084048008164164e-05, - "loss": 0.467, + "epoch": 11.738447548979085, + "grad_norm": 0.8721083402633667, + "learning_rate": 6.306378135674061e-06, + "loss": 0.2778, "step": 852000 }, { - "epoch": 8.68, - "learning_rate": 2.4077544814313323e-05, - "loss": 0.3594, + "epoch": 11.739825301038826, + "grad_norm": 1.388418436050415, + "learning_rate": 6.301271389719749e-06, + "loss": 0.253, "step": 852100 }, { - "epoch": 8.68, - "learning_rate": 2.40710419100352e-05, - "loss": 0.4084, + "epoch": 11.741203053098564, + "grad_norm": 3.2740159034729004, + "learning_rate": 6.29616644470589e-06, + "loss": 0.2729, "step": 852200 }, { - "epoch": 8.68, - "learning_rate": 2.4064539295647746e-05, - "loss": 0.4436, + "epoch": 11.742580805158303, + "grad_norm": 2.527209520339966, + "learning_rate": 6.291063301066161e-06, + "loss": 0.2941, "step": 852300 }, { - "epoch": 8.68, - "learning_rate": 2.4058036971468898e-05, - "loss": 0.3908, + "epoch": 11.743958557218043, + "grad_norm": 4.9052414894104, + "learning_rate": 6.285961959234119e-06, + "loss": 0.2582, "step": 852400 }, { - "epoch": 8.69, - "learning_rate": 2.405153493781658e-05, - "loss": 0.445, + "epoch": 11.745336309277782, + "grad_norm": 1.8431191444396973, + "learning_rate": 6.280862419643139e-06, + "loss": 0.2782, "step": 852500 }, { - "epoch": 8.69, - "learning_rate": 2.404503319500872e-05, - "loss": 0.3772, + "epoch": 11.746714061337522, + "grad_norm": 2.012159585952759, + "learning_rate": 6.275764682726441e-06, + "loss": 0.2934, "step": 852600 }, { - "epoch": 8.69, - "learning_rate": 2.4038531743363187e-05, - "loss": 0.4613, + "epoch": 11.748091813397261, + "grad_norm": 5.73352575302124, + "learning_rate": 6.27066874891712e-06, + "loss": 0.2461, "step": 852700 }, { - "epoch": 8.69, - "learning_rate": 2.4032030583197873e-05, - "loss": 0.4279, + "epoch": 11.749469565457, + "grad_norm": 4.38605260848999, + "learning_rate": 6.265574618648104e-06, + "loss": 0.2805, "step": 852800 }, { - "epoch": 8.69, - "learning_rate": 2.402552971483066e-05, - "loss": 0.317, + "epoch": 11.75084731751674, + "grad_norm": 0.7146219611167908, + "learning_rate": 6.260482292352158e-06, + "loss": 0.2547, "step": 852900 }, { - "epoch": 8.69, - "learning_rate": 2.4019029138579375e-05, - "loss": 0.3509, + "epoch": 11.752225069576479, + "grad_norm": 3.161679983139038, + "learning_rate": 6.2553917704618946e-06, + "loss": 0.2342, "step": 853000 }, { - "epoch": 8.69, - "learning_rate": 2.401252885476188e-05, - "loss": 0.3642, + "epoch": 11.753602821636218, + "grad_norm": 1.6239672899246216, + "learning_rate": 6.250303053409787e-06, + "loss": 0.2584, "step": 853100 }, { - "epoch": 8.69, - "learning_rate": 2.4006028863696e-05, - "loss": 0.4467, + "epoch": 11.754980573695958, + "grad_norm": 1.5768834352493286, + "learning_rate": 6.245216141628156e-06, + "loss": 0.271, "step": 853200 }, { - "epoch": 8.69, - "learning_rate": 2.399952916569953e-05, - "loss": 0.3865, + "epoch": 11.756358325755697, + "grad_norm": 1.0062748193740845, + "learning_rate": 6.240181877670289e-06, + "loss": 0.2344, "step": 853300 }, { - "epoch": 8.69, - "learning_rate": 2.3993029761090275e-05, - "loss": 0.3215, + "epoch": 11.757736077815437, + "grad_norm": 4.526578426361084, + "learning_rate": 6.235098559662425e-06, + "loss": 0.2503, "step": 853400 }, { - "epoch": 8.7, - "learning_rate": 2.398653065018602e-05, - "loss": 0.4535, + "epoch": 11.759113829875176, + "grad_norm": 2.852250337600708, + "learning_rate": 6.230017048216739e-06, + "loss": 0.2317, "step": 853500 }, { - "epoch": 8.7, - "learning_rate": 2.3980031833304528e-05, - "loss": 0.3422, + "epoch": 11.760491581934915, + "grad_norm": 2.481154680252075, + "learning_rate": 6.224937343764918e-06, + "loss": 0.2559, "step": 853600 }, { - "epoch": 8.7, - "learning_rate": 2.3973533310763548e-05, - "loss": 0.3153, + "epoch": 11.761869333994655, + "grad_norm": 2.323820114135742, + "learning_rate": 6.219859446738507e-06, + "loss": 0.2722, "step": 853700 }, { - "epoch": 8.7, - "learning_rate": 2.396703508288083e-05, - "loss": 0.3948, + "epoch": 11.763247086054394, + "grad_norm": 19.35616683959961, + "learning_rate": 6.214783357568904e-06, + "loss": 0.2672, "step": 853800 }, { - "epoch": 8.7, - "learning_rate": 2.3960537149974102e-05, - "loss": 0.3586, + "epoch": 11.764624838114132, + "grad_norm": 2.4237706661224365, + "learning_rate": 6.209709076687358e-06, + "loss": 0.2722, "step": 853900 }, { - "epoch": 8.7, - "learning_rate": 2.3954039512361056e-05, - "loss": 0.3999, + "epoch": 11.766002590173873, + "grad_norm": 1.6251882314682007, + "learning_rate": 6.204636604524946e-06, + "loss": 0.1929, "step": 854000 }, { - "epoch": 8.7, - "learning_rate": 2.3947542170359395e-05, - "loss": 0.3659, + "epoch": 11.767380342233611, + "grad_norm": 2.4958372116088867, + "learning_rate": 6.199565941512592e-06, + "loss": 0.2408, "step": 854100 }, { - "epoch": 8.7, - "learning_rate": 2.394104512428681e-05, - "loss": 0.2989, + "epoch": 11.76875809429335, + "grad_norm": 0.703154444694519, + "learning_rate": 6.194497088081081e-06, + "loss": 0.2558, "step": 854200 }, { - "epoch": 8.7, - "learning_rate": 2.393454837446096e-05, - "loss": 0.3735, + "epoch": 11.77013584635309, + "grad_norm": 1.5437828302383423, + "learning_rate": 6.189430044661052e-06, + "loss": 0.2491, "step": 854300 }, { - "epoch": 8.7, - "learning_rate": 2.392805192119949e-05, - "loss": 0.3472, + "epoch": 11.77151359841283, + "grad_norm": 1.4526002407073975, + "learning_rate": 6.184364811682955e-06, + "loss": 0.2851, "step": 854400 }, { - "epoch": 8.71, - "learning_rate": 2.3921555764820063e-05, - "loss": 0.4072, + "epoch": 11.77289135047257, + "grad_norm": 2.455385684967041, + "learning_rate": 6.1793013895771286e-06, + "loss": 0.2481, "step": 854500 }, { - "epoch": 8.71, - "learning_rate": 2.391505990564027e-05, - "loss": 0.3424, + "epoch": 11.774269102532308, + "grad_norm": 0.01590082049369812, + "learning_rate": 6.1742397787737224e-06, + "loss": 0.246, "step": 854600 }, { - "epoch": 8.71, - "learning_rate": 2.3908564343977734e-05, - "loss": 0.4638, + "epoch": 11.775646854592047, + "grad_norm": 0.9369410276412964, + "learning_rate": 6.169179979702741e-06, + "loss": 0.2364, "step": 854700 }, { - "epoch": 8.71, - "learning_rate": 2.3902069080150057e-05, - "loss": 0.4096, + "epoch": 11.777024606651787, + "grad_norm": 3.834155321121216, + "learning_rate": 6.164121992794048e-06, + "loss": 0.2143, "step": 854800 }, { - "epoch": 8.71, - "learning_rate": 2.3895574114474803e-05, - "loss": 0.3484, + "epoch": 11.778402358711526, + "grad_norm": 0.04187556728720665, + "learning_rate": 6.159065818477354e-06, + "loss": 0.2607, "step": 854900 }, { - "epoch": 8.71, - "learning_rate": 2.388907944726955e-05, - "loss": 0.4059, + "epoch": 11.779780110771267, + "grad_norm": 2.9726696014404297, + "learning_rate": 6.154011457182189e-06, + "loss": 0.2266, "step": 855000 }, { - "epoch": 8.71, - "learning_rate": 2.3882585078851855e-05, - "loss": 0.4194, + "epoch": 11.781157862831005, + "grad_norm": 2.342405319213867, + "learning_rate": 6.148958909337964e-06, + "loss": 0.2263, "step": 855100 }, { - "epoch": 8.71, - "learning_rate": 2.3876091009539226e-05, - "loss": 0.3678, + "epoch": 11.782535614890744, + "grad_norm": 0.011447679251432419, + "learning_rate": 6.143908175373906e-06, + "loss": 0.2763, "step": 855200 }, { - "epoch": 8.71, - "learning_rate": 2.3869597239649207e-05, - "loss": 0.3274, + "epoch": 11.783913366950484, + "grad_norm": 2.4221973419189453, + "learning_rate": 6.13885925571911e-06, + "loss": 0.251, "step": 855300 }, { - "epoch": 8.71, - "learning_rate": 2.3863103769499302e-05, - "loss": 0.3594, + "epoch": 11.785291119010223, + "grad_norm": 1.5310330390930176, + "learning_rate": 6.133812150802497e-06, + "loss": 0.2434, "step": 855400 }, { - "epoch": 8.72, - "learning_rate": 2.3856610599406994e-05, - "loss": 0.4245, + "epoch": 11.786668871069962, + "grad_norm": 4.3105950355529785, + "learning_rate": 6.1287668610528596e-06, + "loss": 0.2476, "step": 855500 }, { - "epoch": 8.72, - "learning_rate": 2.3850117729689763e-05, - "loss": 0.4125, + "epoch": 11.788046623129702, + "grad_norm": 1.146614909172058, + "learning_rate": 6.123723386898799e-06, + "loss": 0.2231, "step": 855600 }, { - "epoch": 8.72, - "learning_rate": 2.384362516066508e-05, - "loss": 0.3751, + "epoch": 11.78942437518944, + "grad_norm": 0.026918886229395866, + "learning_rate": 6.118681728768809e-06, + "loss": 0.2525, "step": 855700 }, { - "epoch": 8.72, - "learning_rate": 2.3837132892650395e-05, - "loss": 0.3692, + "epoch": 11.79080212724918, + "grad_norm": 3.294219970703125, + "learning_rate": 6.113641887091185e-06, + "loss": 0.275, "step": 855800 }, { - "epoch": 8.72, - "learning_rate": 2.3830640925963124e-05, - "loss": 0.3596, + "epoch": 11.79217987930892, + "grad_norm": 1.6708319187164307, + "learning_rate": 6.108603862294099e-06, + "loss": 0.2368, "step": 855900 }, { - "epoch": 8.72, - "learning_rate": 2.3824214176076933e-05, - "loss": 0.3382, + "epoch": 11.793557631368659, + "grad_norm": 3.0290560722351074, + "learning_rate": 6.103567654805547e-06, + "loss": 0.2509, "step": 856000 }, { - "epoch": 8.72, - "learning_rate": 2.3817722809975555e-05, - "loss": 0.4534, + "epoch": 11.794935383428399, + "grad_norm": 1.4740827083587646, + "learning_rate": 6.098533265053388e-06, + "loss": 0.2361, "step": 856100 }, { - "epoch": 8.72, - "learning_rate": 2.3811231746150637e-05, - "loss": 0.3445, + "epoch": 11.796313135488138, + "grad_norm": 1.8212511539459229, + "learning_rate": 6.093500693465319e-06, + "loss": 0.2402, "step": 856200 }, { - "epoch": 8.72, - "learning_rate": 2.3804740984919566e-05, - "loss": 0.4049, + "epoch": 11.797690887547876, + "grad_norm": 1.5977269411087036, + "learning_rate": 6.088469940468885e-06, + "loss": 0.2547, "step": 856300 }, { - "epoch": 8.73, - "learning_rate": 2.3798250526599677e-05, - "loss": 0.3716, + "epoch": 11.799068639607617, + "grad_norm": 1.766414761543274, + "learning_rate": 6.083441006491464e-06, + "loss": 0.2545, "step": 856400 }, { - "epoch": 8.73, - "learning_rate": 2.3791760371508327e-05, - "loss": 0.3565, + "epoch": 11.800446391667355, + "grad_norm": 0.024048594757914543, + "learning_rate": 6.0784138919603005e-06, + "loss": 0.2307, "step": 856500 }, { - "epoch": 8.73, - "learning_rate": 2.3785270519962856e-05, - "loss": 0.3954, + "epoch": 11.801824143727094, + "grad_norm": 5.119098663330078, + "learning_rate": 6.073388597302463e-06, + "loss": 0.2675, "step": 856600 }, { - "epoch": 8.73, - "learning_rate": 2.3778780972280557e-05, - "loss": 0.4018, + "epoch": 11.803201895786835, + "grad_norm": 7.258779048919678, + "learning_rate": 6.068365122944882e-06, + "loss": 0.2393, "step": 856700 }, { - "epoch": 8.73, - "learning_rate": 2.3772291728778747e-05, - "loss": 0.4143, + "epoch": 11.804579647846573, + "grad_norm": 2.4528822898864746, + "learning_rate": 6.063343469314335e-06, + "loss": 0.2634, "step": 856800 }, { - "epoch": 8.73, - "learning_rate": 2.3765802789774717e-05, - "loss": 0.3814, + "epoch": 11.805957399906314, + "grad_norm": 2.5590145587921143, + "learning_rate": 6.058323636837429e-06, + "loss": 0.2004, "step": 856900 }, { - "epoch": 8.73, - "learning_rate": 2.3759314155585715e-05, - "loss": 0.3421, + "epoch": 11.807335151966052, + "grad_norm": 1.577837347984314, + "learning_rate": 6.053305625940616e-06, + "loss": 0.1994, "step": 857000 }, { - "epoch": 8.73, - "learning_rate": 2.3752825826529006e-05, - "loss": 0.3872, + "epoch": 11.808712904025791, + "grad_norm": 2.923936367034912, + "learning_rate": 6.048289437050213e-06, + "loss": 0.2318, "step": 857100 }, { - "epoch": 8.73, - "learning_rate": 2.374633780292184e-05, - "loss": 0.3791, + "epoch": 11.810090656085531, + "grad_norm": 5.290594100952148, + "learning_rate": 6.043275070592373e-06, + "loss": 0.251, "step": 857200 }, { - "epoch": 8.73, - "learning_rate": 2.373985008508143e-05, - "loss": 0.352, + "epoch": 11.81146840814527, + "grad_norm": 0.627910315990448, + "learning_rate": 6.038262526993083e-06, + "loss": 0.2148, "step": 857300 }, { - "epoch": 8.74, - "learning_rate": 2.3733362673324994e-05, - "loss": 0.3969, + "epoch": 11.812846160205009, + "grad_norm": 1.210464358329773, + "learning_rate": 6.033251806678197e-06, + "loss": 0.2952, "step": 857400 }, { - "epoch": 8.74, - "learning_rate": 2.372687556796974e-05, - "loss": 0.3291, + "epoch": 11.81422391226475, + "grad_norm": 4.6392903327941895, + "learning_rate": 6.028242910073391e-06, + "loss": 0.2146, "step": 857500 }, { - "epoch": 8.74, - "learning_rate": 2.3720388769332817e-05, - "loss": 0.3487, + "epoch": 11.815601664324488, + "grad_norm": 2.447746992111206, + "learning_rate": 6.023235837604192e-06, + "loss": 0.2665, "step": 857600 }, { - "epoch": 8.74, - "learning_rate": 2.371390227773141e-05, - "loss": 0.349, + "epoch": 11.816979416384228, + "grad_norm": 0.88821941614151, + "learning_rate": 6.018280633142093e-06, + "loss": 0.1949, "step": 857700 }, { - "epoch": 8.74, - "learning_rate": 2.3707416093482666e-05, - "loss": 0.3632, + "epoch": 11.818357168443967, + "grad_norm": 2.402385711669922, + "learning_rate": 6.0132771919681295e-06, + "loss": 0.2509, "step": 857800 }, { - "epoch": 8.74, - "learning_rate": 2.3700930216903732e-05, - "loss": 0.3661, + "epoch": 11.819734920503706, + "grad_norm": 2.473785877227783, + "learning_rate": 6.0082755762012e-06, + "loss": 0.2573, "step": 857900 }, { - "epoch": 8.74, - "learning_rate": 2.369444464831171e-05, - "loss": 0.3857, + "epoch": 11.821112672563446, + "grad_norm": 2.7061538696289062, + "learning_rate": 6.00327578626621e-06, + "loss": 0.2898, "step": 858000 }, { - "epoch": 8.74, - "learning_rate": 2.368795938802372e-05, - "loss": 0.2929, + "epoch": 11.822490424623185, + "grad_norm": 2.047415256500244, + "learning_rate": 5.998277822587914e-06, + "loss": 0.2523, "step": 858100 }, { - "epoch": 8.74, - "learning_rate": 2.3681474436356862e-05, - "loss": 0.3742, + "epoch": 11.823868176682923, + "grad_norm": 2.0770041942596436, + "learning_rate": 5.993331637917423e-06, + "loss": 0.2451, "step": 858200 }, { - "epoch": 8.74, - "learning_rate": 2.367498979362818e-05, - "loss": 0.4031, + "epoch": 11.825245928742664, + "grad_norm": 3.0573275089263916, + "learning_rate": 5.9883373097530255e-06, + "loss": 0.2033, "step": 858300 }, { - "epoch": 8.75, - "learning_rate": 2.366850546015475e-05, - "loss": 0.3961, + "epoch": 11.826623680802403, + "grad_norm": 1.6069756746292114, + "learning_rate": 5.983344809114425e-06, + "loss": 0.2668, "step": 858400 }, { - "epoch": 8.75, - "learning_rate": 2.366202143625363e-05, - "loss": 0.4409, + "epoch": 11.828001432862141, + "grad_norm": 2.2553911209106445, + "learning_rate": 5.978354136425756e-06, + "loss": 0.2654, "step": 858500 }, { - "epoch": 8.75, - "learning_rate": 2.3655537722241828e-05, - "loss": 0.3471, + "epoch": 11.829379184921882, + "grad_norm": 4.689764499664307, + "learning_rate": 5.973365292111005e-06, + "loss": 0.2621, "step": 858600 }, { - "epoch": 8.75, - "learning_rate": 2.364905431843637e-05, - "loss": 0.3734, + "epoch": 11.83075693698162, + "grad_norm": 0.35373467206954956, + "learning_rate": 5.96837827659401e-06, + "loss": 0.2149, "step": 858700 }, { - "epoch": 8.75, - "learning_rate": 2.364257122515427e-05, - "loss": 0.4246, + "epoch": 11.83213468904136, + "grad_norm": 0.8575272560119629, + "learning_rate": 5.963393090298426e-06, + "loss": 0.1969, "step": 858800 }, { - "epoch": 8.75, - "learning_rate": 2.3636088442712483e-05, - "loss": 0.4379, + "epoch": 11.8335124411011, + "grad_norm": 2.2419016361236572, + "learning_rate": 5.958409733647782e-06, + "loss": 0.2443, "step": 858900 }, { - "epoch": 8.75, - "learning_rate": 2.3629605971427994e-05, - "loss": 0.383, + "epoch": 11.834890193160838, + "grad_norm": 3.074385404586792, + "learning_rate": 5.9534282070654434e-06, + "loss": 0.2682, "step": 859000 }, { - "epoch": 8.75, - "learning_rate": 2.3623123811617765e-05, - "loss": 0.4016, + "epoch": 11.836267945220579, + "grad_norm": 2.369394063949585, + "learning_rate": 5.948448510974612e-06, + "loss": 0.3274, "step": 859100 }, { - "epoch": 8.75, - "learning_rate": 2.3616641963598717e-05, - "loss": 0.3784, + "epoch": 11.837645697280317, + "grad_norm": 13.421640396118164, + "learning_rate": 5.9434706457983295e-06, + "loss": 0.275, "step": 859200 }, { - "epoch": 8.75, - "learning_rate": 2.3610160427687786e-05, - "loss": 0.4102, + "epoch": 11.839023449340058, + "grad_norm": 2.7182819843292236, + "learning_rate": 5.938494611959504e-06, + "loss": 0.279, "step": 859300 }, { - "epoch": 8.76, - "learning_rate": 2.3603679204201888e-05, - "loss": 0.3736, + "epoch": 11.840401201399796, + "grad_norm": 2.0081160068511963, + "learning_rate": 5.93352040988086e-06, + "loss": 0.3023, "step": 859400 }, { - "epoch": 8.76, - "learning_rate": 2.3597198293457895e-05, - "loss": 0.3562, + "epoch": 11.841778953459535, + "grad_norm": 3.94633150100708, + "learning_rate": 5.9285480399850014e-06, + "loss": 0.2488, "step": 859500 }, { - "epoch": 8.76, - "learning_rate": 2.359071769577269e-05, - "loss": 0.3579, + "epoch": 11.843156705519275, + "grad_norm": 10.242376327514648, + "learning_rate": 5.923577502694336e-06, + "loss": 0.2418, "step": 859600 }, { - "epoch": 8.76, - "learning_rate": 2.358423741146315e-05, - "loss": 0.3902, + "epoch": 11.844534457579014, + "grad_norm": 0.5839519500732422, + "learning_rate": 5.918608798431154e-06, + "loss": 0.2283, "step": 859700 }, { - "epoch": 8.76, - "learning_rate": 2.3577757440846113e-05, - "loss": 0.4738, + "epoch": 11.845912209638753, + "grad_norm": 4.316906452178955, + "learning_rate": 5.913641927617556e-06, + "loss": 0.2945, "step": 859800 }, { - "epoch": 8.76, - "learning_rate": 2.3571277784238407e-05, - "loss": 0.3674, + "epoch": 11.847289961698493, + "grad_norm": 3.4589645862579346, + "learning_rate": 5.9086768906755135e-06, + "loss": 0.2747, "step": 859900 }, { - "epoch": 8.76, - "learning_rate": 2.3564798441956855e-05, - "loss": 0.3031, + "epoch": 11.848667713758232, + "grad_norm": 5.400808811187744, + "learning_rate": 5.903713688026836e-06, + "loss": 0.2716, "step": 860000 }, { - "epoch": 8.76, - "learning_rate": 2.3558319414318265e-05, - "loss": 0.3128, + "epoch": 11.85004546581797, + "grad_norm": 2.687760829925537, + "learning_rate": 5.898752320093165e-06, + "loss": 0.2363, "step": 860100 }, { - "epoch": 8.76, - "learning_rate": 2.3551840701639403e-05, - "loss": 0.4049, + "epoch": 11.851423217877711, + "grad_norm": 1.7053422927856445, + "learning_rate": 5.8937927872959885e-06, + "loss": 0.2538, "step": 860200 }, { - "epoch": 8.76, - "learning_rate": 2.3545362304237046e-05, - "loss": 0.3726, + "epoch": 11.85280096993745, + "grad_norm": 1.6721010208129883, + "learning_rate": 5.888835090056662e-06, + "loss": 0.235, "step": 860300 }, { - "epoch": 8.77, - "learning_rate": 2.3538884222427964e-05, - "loss": 0.2949, + "epoch": 11.85417872199719, + "grad_norm": 0.1034838855266571, + "learning_rate": 5.883879228796346e-06, + "loss": 0.2095, "step": 860400 }, { - "epoch": 8.77, - "learning_rate": 2.3532406456528876e-05, - "loss": 0.3964, + "epoch": 11.855556474056929, + "grad_norm": 1.4383562803268433, + "learning_rate": 5.878925203936077e-06, + "loss": 0.2646, "step": 860500 }, { - "epoch": 8.77, - "learning_rate": 2.352592900685652e-05, - "loss": 0.3615, + "epoch": 11.856934226116667, + "grad_norm": 2.245439052581787, + "learning_rate": 5.873973015896733e-06, + "loss": 0.2436, "step": 860600 }, { - "epoch": 8.77, - "learning_rate": 2.351945187372761e-05, - "loss": 0.3582, + "epoch": 11.858311978176408, + "grad_norm": 0.015083450824022293, + "learning_rate": 5.869022665099018e-06, + "loss": 0.2509, "step": 860700 }, { - "epoch": 8.77, - "learning_rate": 2.351297505745882e-05, - "loss": 0.3898, + "epoch": 11.859689730236147, + "grad_norm": 1.7889741659164429, + "learning_rate": 5.864123627997033e-06, + "loss": 0.2687, "step": 860800 }, { - "epoch": 8.77, - "learning_rate": 2.3506498558366833e-05, - "loss": 0.428, + "epoch": 11.861067482295885, + "grad_norm": 1.2468163967132568, + "learning_rate": 5.859176934561183e-06, + "loss": 0.2137, "step": 860900 }, { - "epoch": 8.77, - "learning_rate": 2.350002237676832e-05, - "loss": 0.4382, + "epoch": 11.862445234355626, + "grad_norm": 2.873322010040283, + "learning_rate": 5.854232079623966e-06, + "loss": 0.2442, "step": 861000 }, { - "epoch": 8.77, - "learning_rate": 2.349354651297992e-05, - "loss": 0.4487, + "epoch": 11.863822986415364, + "grad_norm": 6.619929313659668, + "learning_rate": 5.84928906360548e-06, + "loss": 0.2811, "step": 861100 }, { - "epoch": 8.77, - "learning_rate": 2.3487070967318272e-05, - "loss": 0.3897, + "epoch": 11.865200738475105, + "grad_norm": 2.808436632156372, + "learning_rate": 5.844347886925654e-06, + "loss": 0.2339, "step": 861200 }, { - "epoch": 8.78, - "learning_rate": 2.3480660490794838e-05, - "loss": 0.3818, + "epoch": 11.866578490534843, + "grad_norm": 0.7952606678009033, + "learning_rate": 5.839408550004253e-06, + "loss": 0.2548, "step": 861300 }, { - "epoch": 8.78, - "learning_rate": 2.3474185579147338e-05, - "loss": 0.404, + "epoch": 11.867956242594582, + "grad_norm": 2.291318655014038, + "learning_rate": 5.834471053260921e-06, + "loss": 0.2271, "step": 861400 }, { - "epoch": 8.78, - "learning_rate": 2.346771098657324e-05, - "loss": 0.3735, + "epoch": 11.869333994654323, + "grad_norm": 0.4325435757637024, + "learning_rate": 5.829535397115104e-06, + "loss": 0.261, "step": 861500 }, { - "epoch": 8.78, - "learning_rate": 2.3461301454538894e-05, - "loss": 0.3872, + "epoch": 11.870711746714061, + "grad_norm": 2.2044060230255127, + "learning_rate": 5.824601581986122e-06, + "loss": 0.221, "step": 861600 }, { - "epoch": 8.78, - "learning_rate": 2.34548922358458e-05, - "loss": 0.3779, + "epoch": 11.8720894987738, + "grad_norm": 0.16177278757095337, + "learning_rate": 5.819669608293133e-06, + "loss": 0.2211, "step": 861700 }, { - "epoch": 8.78, - "learning_rate": 2.3448418595987615e-05, - "loss": 0.3781, + "epoch": 11.87346725083354, + "grad_norm": 1.851431131362915, + "learning_rate": 5.814739476455127e-06, + "loss": 0.2219, "step": 861800 }, { - "epoch": 8.78, - "learning_rate": 2.3441945276462677e-05, - "loss": 0.3861, + "epoch": 11.874845002893279, + "grad_norm": 2.610363721847534, + "learning_rate": 5.809860460665949e-06, + "loss": 0.2337, "step": 861900 }, { - "epoch": 8.78, - "learning_rate": 2.3435472277587465e-05, - "loss": 0.4113, + "epoch": 11.87622275495302, + "grad_norm": 2.0685129165649414, + "learning_rate": 5.804933995365266e-06, + "loss": 0.2659, "step": 862000 }, { - "epoch": 8.78, - "learning_rate": 2.3428999599678486e-05, - "loss": 0.3263, + "epoch": 11.877600507012758, + "grad_norm": 3.0322415828704834, + "learning_rate": 5.800009373171441e-06, + "loss": 0.242, "step": 862100 }, { - "epoch": 8.78, - "learning_rate": 2.3422527243052227e-05, - "loss": 0.3562, + "epoch": 11.878978259072497, + "grad_norm": 2.206204652786255, + "learning_rate": 5.795086594502827e-06, + "loss": 0.2253, "step": 862200 }, { - "epoch": 8.79, - "learning_rate": 2.3416055208025117e-05, - "loss": 0.3379, + "epoch": 11.880356011132237, + "grad_norm": 0.14909206330776215, + "learning_rate": 5.790165659777657e-06, + "loss": 0.2298, "step": 862300 }, { - "epoch": 8.79, - "learning_rate": 2.340958349491362e-05, - "loss": 0.3817, + "epoch": 11.881733763191976, + "grad_norm": 3.1101622581481934, + "learning_rate": 5.78524656941397e-06, + "loss": 0.2383, "step": 862400 }, { - "epoch": 8.79, - "learning_rate": 2.3403112104034168e-05, - "loss": 0.3887, + "epoch": 11.883111515251715, + "grad_norm": 3.2435362339019775, + "learning_rate": 5.780329323829688e-06, + "loss": 0.2325, "step": 862500 }, { - "epoch": 8.79, - "learning_rate": 2.3396641035703155e-05, - "loss": 0.4708, + "epoch": 11.884489267311455, + "grad_norm": 1.3852076530456543, + "learning_rate": 5.775413923442543e-06, + "loss": 0.2124, "step": 862600 }, { - "epoch": 8.79, - "learning_rate": 2.339017029023699e-05, - "loss": 0.3445, + "epoch": 11.885867019371194, + "grad_norm": 5.233317852020264, + "learning_rate": 5.770500368670125e-06, + "loss": 0.198, "step": 862700 }, { - "epoch": 8.79, - "learning_rate": 2.3383699867952067e-05, - "loss": 0.3657, + "epoch": 11.887244771430932, + "grad_norm": 2.648132562637329, + "learning_rate": 5.765588659929881e-06, + "loss": 0.2392, "step": 862800 }, { - "epoch": 8.79, - "learning_rate": 2.3377229769164724e-05, - "loss": 0.3361, + "epoch": 11.888622523490673, + "grad_norm": 5.508492946624756, + "learning_rate": 5.760678797639062e-06, + "loss": 0.2509, "step": 862900 }, { - "epoch": 8.79, - "learning_rate": 2.337075999419132e-05, - "loss": 0.3158, + "epoch": 11.890000275550412, + "grad_norm": 2.671232223510742, + "learning_rate": 5.7557707822147924e-06, + "loss": 0.2147, "step": 863000 }, { - "epoch": 8.79, - "learning_rate": 2.33642905433482e-05, - "loss": 0.382, + "epoch": 11.891378027610152, + "grad_norm": 0.22083638608455658, + "learning_rate": 5.750864614074047e-06, + "loss": 0.2501, "step": 863100 }, { - "epoch": 8.79, - "learning_rate": 2.3357821416951666e-05, - "loss": 0.4576, + "epoch": 11.89275577966989, + "grad_norm": 0.7079687118530273, + "learning_rate": 5.745960293633614e-06, + "loss": 0.2703, "step": 863200 }, { - "epoch": 8.8, - "learning_rate": 2.3351352615318025e-05, - "loss": 0.3747, + "epoch": 11.89413353172963, + "grad_norm": 0.8027519583702087, + "learning_rate": 5.7410578213101455e-06, + "loss": 0.2168, "step": 863300 }, { - "epoch": 8.8, - "learning_rate": 2.3344884138763567e-05, - "loss": 0.3458, + "epoch": 11.89551128378937, + "grad_norm": 2.065736770629883, + "learning_rate": 5.7361571975201425e-06, + "loss": 0.2071, "step": 863400 }, { - "epoch": 8.8, - "learning_rate": 2.3338415987604568e-05, - "loss": 0.4122, + "epoch": 11.896889035849108, + "grad_norm": 1.3039445877075195, + "learning_rate": 5.731258422679926e-06, + "loss": 0.2674, "step": 863500 }, { - "epoch": 8.8, - "learning_rate": 2.3331948162157262e-05, - "loss": 0.3068, + "epoch": 11.898266787908849, + "grad_norm": 1.9713029861450195, + "learning_rate": 5.726361497205667e-06, + "loss": 0.2245, "step": 863600 }, { - "epoch": 8.8, - "learning_rate": 2.3325480662737895e-05, - "loss": 0.3605, + "epoch": 11.899644539968588, + "grad_norm": 2.251127004623413, + "learning_rate": 5.721466421513394e-06, + "loss": 0.252, "step": 863700 }, { - "epoch": 8.8, - "learning_rate": 2.3319013489662703e-05, - "loss": 0.4243, + "epoch": 11.901022292028326, + "grad_norm": 5.01297664642334, + "learning_rate": 5.7165731960189754e-06, + "loss": 0.3057, "step": 863800 }, { - "epoch": 8.8, - "learning_rate": 2.3312546643247872e-05, - "loss": 0.4094, + "epoch": 11.902400044088067, + "grad_norm": 0.8853189945220947, + "learning_rate": 5.711681821138104e-06, + "loss": 0.2352, "step": 863900 }, { - "epoch": 8.8, - "learning_rate": 2.3306080123809602e-05, - "loss": 0.3784, + "epoch": 11.903777796147805, + "grad_norm": 2.0619289875030518, + "learning_rate": 5.706792297286325e-06, + "loss": 0.2204, "step": 864000 }, { - "epoch": 8.8, - "learning_rate": 2.329961393166408e-05, - "loss": 0.3239, + "epoch": 11.905155548207544, + "grad_norm": 2.3923721313476562, + "learning_rate": 5.7019046248790375e-06, + "loss": 0.2724, "step": 864100 }, { - "epoch": 8.8, - "learning_rate": 2.329314806712744e-05, - "loss": 0.3502, + "epoch": 11.906533300267284, + "grad_norm": 2.575901746749878, + "learning_rate": 5.697018804331466e-06, + "loss": 0.2564, "step": 864200 }, { - "epoch": 8.81, - "learning_rate": 2.3286682530515833e-05, - "loss": 0.3298, + "epoch": 11.907911052327023, + "grad_norm": 3.117515802383423, + "learning_rate": 5.69213483605869e-06, + "loss": 0.2141, "step": 864300 }, { - "epoch": 8.81, - "learning_rate": 2.3280217322145394e-05, - "loss": 0.4014, + "epoch": 11.909288804386762, + "grad_norm": 2.222916603088379, + "learning_rate": 5.687252720475637e-06, + "loss": 0.2245, "step": 864400 }, { - "epoch": 8.81, - "learning_rate": 2.3273752442332216e-05, - "loss": 0.3276, + "epoch": 11.910666556446502, + "grad_norm": 2.944580316543579, + "learning_rate": 5.682372457997056e-06, + "loss": 0.2625, "step": 864500 }, { - "epoch": 8.81, - "learning_rate": 2.326728789139241e-05, - "loss": 0.3383, + "epoch": 11.91204430850624, + "grad_norm": 1.9862767457962036, + "learning_rate": 5.677494049037547e-06, + "loss": 0.1844, "step": 864600 }, { - "epoch": 8.81, - "learning_rate": 2.3260823669642055e-05, - "loss": 0.367, + "epoch": 11.913422060565981, + "grad_norm": 0.038270335644483566, + "learning_rate": 5.672617494011561e-06, + "loss": 0.2479, "step": 864700 }, { - "epoch": 8.81, - "learning_rate": 2.32543597773972e-05, - "loss": 0.3276, + "epoch": 11.91479981262572, + "grad_norm": 1.7927380800247192, + "learning_rate": 5.667742793333395e-06, + "loss": 0.2493, "step": 864800 }, { - "epoch": 8.81, - "learning_rate": 2.324789621497389e-05, - "loss": 0.4795, + "epoch": 11.916177564685459, + "grad_norm": 7.083962440490723, + "learning_rate": 5.6628699474171645e-06, + "loss": 0.2163, "step": 864900 }, { - "epoch": 8.81, - "learning_rate": 2.324143298268816e-05, - "loss": 0.3384, + "epoch": 11.917555316745199, + "grad_norm": 2.262510061264038, + "learning_rate": 5.657998956676849e-06, + "loss": 0.281, "step": 865000 }, { - "epoch": 8.81, - "learning_rate": 2.3234970080856032e-05, - "loss": 0.3837, + "epoch": 11.918933068804938, + "grad_norm": 1.6414446830749512, + "learning_rate": 5.6531298215262774e-06, + "loss": 0.3048, "step": 865100 }, { - "epoch": 8.81, - "learning_rate": 2.322850750979349e-05, - "loss": 0.3614, + "epoch": 11.920310820864676, + "grad_norm": 2.964630365371704, + "learning_rate": 5.648262542379086e-06, + "loss": 0.2005, "step": 865200 }, { - "epoch": 8.82, - "learning_rate": 2.3222045269816516e-05, - "loss": 0.3658, + "epoch": 11.921688572924417, + "grad_norm": 1.1724342107772827, + "learning_rate": 5.643397119648781e-06, + "loss": 0.2459, "step": 865300 }, { - "epoch": 8.82, - "learning_rate": 2.32155833612411e-05, - "loss": 0.3504, + "epoch": 11.923066324984156, + "grad_norm": 1.5080740451812744, + "learning_rate": 5.638533553748716e-06, + "loss": 0.2525, "step": 865400 }, { - "epoch": 8.82, - "learning_rate": 2.3209121784383153e-05, - "loss": 0.3241, + "epoch": 11.924444077043896, + "grad_norm": 4.028144836425781, + "learning_rate": 5.633671845092061e-06, + "loss": 0.2629, "step": 865500 }, { - "epoch": 8.82, - "learning_rate": 2.3202660539558627e-05, - "loss": 0.3082, + "epoch": 11.925821829103635, + "grad_norm": 0.2799889147281647, + "learning_rate": 5.628811994091857e-06, + "loss": 0.2503, "step": 865600 }, { - "epoch": 8.82, - "learning_rate": 2.319619962708344e-05, - "loss": 0.3549, + "epoch": 11.927199581163373, + "grad_norm": 5.697725772857666, + "learning_rate": 5.623954001160961e-06, + "loss": 0.2125, "step": 865700 }, { - "epoch": 8.82, - "learning_rate": 2.3189739047273484e-05, - "loss": 0.3427, + "epoch": 11.928577333223114, + "grad_norm": 0.8435084223747253, + "learning_rate": 5.6190978667120945e-06, + "loss": 0.263, "step": 865800 }, { - "epoch": 8.82, - "learning_rate": 2.3183278800444643e-05, - "loss": 0.2942, + "epoch": 11.929955085282852, + "grad_norm": 3.621826171875, + "learning_rate": 5.614243591157801e-06, + "loss": 0.2493, "step": 865900 }, { - "epoch": 8.82, - "learning_rate": 2.3176818886912804e-05, - "loss": 0.4499, + "epoch": 11.931332837342591, + "grad_norm": 0.9587473273277283, + "learning_rate": 5.609391174910479e-06, + "loss": 0.2625, "step": 866000 }, { - "epoch": 8.82, - "learning_rate": 2.3170359306993792e-05, - "loss": 0.4341, + "epoch": 11.932710589402332, + "grad_norm": 3.8569228649139404, + "learning_rate": 5.604589114740699e-06, + "loss": 0.2509, "step": 866100 }, { - "epoch": 8.82, - "learning_rate": 2.3163900061003452e-05, - "loss": 0.3517, + "epoch": 11.93408834146207, + "grad_norm": 2.1629412174224854, + "learning_rate": 5.5997403997405334e-06, + "loss": 0.2198, "step": 866200 }, { - "epoch": 8.83, - "learning_rate": 2.315744114925761e-05, - "loss": 0.3898, + "epoch": 11.93546609352181, + "grad_norm": 3.3584821224212646, + "learning_rate": 5.594893545279458e-06, + "loss": 0.2788, "step": 866300 }, { - "epoch": 8.83, - "learning_rate": 2.3150982572072056e-05, - "loss": 0.3587, + "epoch": 11.93684384558155, + "grad_norm": 2.0743823051452637, + "learning_rate": 5.590048551769239e-06, + "loss": 0.1929, "step": 866400 }, { - "epoch": 8.83, - "learning_rate": 2.314452432976258e-05, - "loss": 0.351, + "epoch": 11.938221597641288, + "grad_norm": 0.39749419689178467, + "learning_rate": 5.585205419621496e-06, + "loss": 0.2537, "step": 866500 }, { - "epoch": 8.83, - "learning_rate": 2.313806642264496e-05, - "loss": 0.3449, + "epoch": 11.939599349701028, + "grad_norm": 0.5560060143470764, + "learning_rate": 5.58036414924766e-06, + "loss": 0.2549, "step": 866600 }, { - "epoch": 8.83, - "learning_rate": 2.3131608851034936e-05, - "loss": 0.3623, + "epoch": 11.940977101760767, + "grad_norm": 2.6576650142669678, + "learning_rate": 5.575524741059037e-06, + "loss": 0.2328, "step": 866700 }, { - "epoch": 8.83, - "learning_rate": 2.312515161524825e-05, - "loss": 0.3404, + "epoch": 11.942354853820506, + "grad_norm": 2.8591365814208984, + "learning_rate": 5.5706871954667415e-06, + "loss": 0.2104, "step": 866800 }, { - "epoch": 8.83, - "learning_rate": 2.3118694715600623e-05, - "loss": 0.3106, + "epoch": 11.943732605880246, + "grad_norm": 4.699713706970215, + "learning_rate": 5.565851512881769e-06, + "loss": 0.2423, "step": 866900 }, { - "epoch": 8.83, - "learning_rate": 2.3112238152407766e-05, - "loss": 0.341, + "epoch": 11.945110357939985, + "grad_norm": 3.336887836456299, + "learning_rate": 5.561017693714917e-06, + "loss": 0.2518, "step": 867000 }, { - "epoch": 8.83, - "learning_rate": 2.3105781925985352e-05, - "loss": 0.4352, + "epoch": 11.946488109999724, + "grad_norm": 1.633097529411316, + "learning_rate": 5.556185738376859e-06, + "loss": 0.2308, "step": 867100 }, { - "epoch": 8.84, - "learning_rate": 2.3099326036649058e-05, - "loss": 0.3826, + "epoch": 11.947865862059464, + "grad_norm": 1.0202280282974243, + "learning_rate": 5.551355647278079e-06, + "loss": 0.2672, "step": 867200 }, { - "epoch": 8.84, - "learning_rate": 2.309287048471455e-05, - "loss": 0.3706, + "epoch": 11.949243614119203, + "grad_norm": 1.6913769245147705, + "learning_rate": 5.54652742082893e-06, + "loss": 0.2413, "step": 867300 }, { - "epoch": 8.84, - "learning_rate": 2.3086479820966876e-05, - "loss": 0.3864, + "epoch": 11.950621366178943, + "grad_norm": 3.2953696250915527, + "learning_rate": 5.541701059439583e-06, + "loss": 0.2349, "step": 867400 }, { - "epoch": 8.84, - "learning_rate": 2.308002494140092e-05, - "loss": 0.3543, + "epoch": 11.951999118238682, + "grad_norm": 1.0987346172332764, + "learning_rate": 5.536876563520078e-06, + "loss": 0.2648, "step": 867500 }, { - "epoch": 8.84, - "learning_rate": 2.3073570400180445e-05, - "loss": 0.3551, + "epoch": 11.95337687029842, + "grad_norm": 1.3132703304290771, + "learning_rate": 5.532053933480265e-06, + "loss": 0.2708, "step": 867600 }, { - "epoch": 8.84, - "learning_rate": 2.306711619762104e-05, - "loss": 0.3865, + "epoch": 11.95475462235816, + "grad_norm": 3.4657421112060547, + "learning_rate": 5.527233169729855e-06, + "loss": 0.2191, "step": 867700 }, { - "epoch": 8.84, - "learning_rate": 2.3060662334038295e-05, - "loss": 0.3415, + "epoch": 11.9561323744179, + "grad_norm": 1.6055585145950317, + "learning_rate": 5.522414272678409e-06, + "loss": 0.2337, "step": 867800 }, { - "epoch": 8.84, - "learning_rate": 2.305420880974773e-05, - "loss": 0.368, + "epoch": 11.95751012647764, + "grad_norm": 0.8866155743598938, + "learning_rate": 5.517597242735306e-06, + "loss": 0.2758, "step": 867900 }, { - "epoch": 8.84, - "learning_rate": 2.304775562506491e-05, - "loss": 0.3873, + "epoch": 11.958887878537379, + "grad_norm": 10.017451286315918, + "learning_rate": 5.51278208030977e-06, + "loss": 0.2741, "step": 868000 }, { - "epoch": 8.84, - "learning_rate": 2.304130278030536e-05, - "loss": 0.3782, + "epoch": 11.960265630597117, + "grad_norm": 0.23922261595726013, + "learning_rate": 5.507968785810889e-06, + "loss": 0.2453, "step": 868100 }, { - "epoch": 8.85, - "learning_rate": 2.3034850275784565e-05, - "loss": 0.3734, + "epoch": 11.961643382656858, + "grad_norm": 2.668151378631592, + "learning_rate": 5.503205464659598e-06, + "loss": 0.264, "step": 868200 }, { - "epoch": 8.85, - "learning_rate": 2.3028398111818028e-05, - "loss": 0.3852, + "epoch": 11.963021134716596, + "grad_norm": 0.8464562296867371, + "learning_rate": 5.498395888551132e-06, + "loss": 0.2098, "step": 868300 }, { - "epoch": 8.85, - "learning_rate": 2.3021946288721232e-05, - "loss": 0.3559, + "epoch": 11.964398886776335, + "grad_norm": 0.1544121652841568, + "learning_rate": 5.493588181591486e-06, + "loss": 0.2496, "step": 868400 }, { - "epoch": 8.85, - "learning_rate": 2.3015494806809606e-05, - "loss": 0.3234, + "epoch": 11.965776638836076, + "grad_norm": 0.04807332530617714, + "learning_rate": 5.48878234418911e-06, + "loss": 0.2393, "step": 868500 }, { - "epoch": 8.85, - "learning_rate": 2.3009043666398604e-05, - "loss": 0.3559, + "epoch": 11.967154390895814, + "grad_norm": 2.580390691757202, + "learning_rate": 5.4839783767522905e-06, + "loss": 0.2847, "step": 868600 }, { - "epoch": 8.85, - "learning_rate": 2.3002592867803652e-05, - "loss": 0.4089, + "epoch": 11.968532142955553, + "grad_norm": 3.217088222503662, + "learning_rate": 5.479176279689132e-06, + "loss": 0.2727, "step": 868700 }, { - "epoch": 8.85, - "learning_rate": 2.2996142411340148e-05, - "loss": 0.3633, + "epoch": 11.969909895015293, + "grad_norm": 3.766310214996338, + "learning_rate": 5.474424046408715e-06, + "loss": 0.229, "step": 868800 }, { - "epoch": 8.85, - "learning_rate": 2.2989692297323475e-05, - "loss": 0.3026, + "epoch": 11.971287647075032, + "grad_norm": 1.7429208755493164, + "learning_rate": 5.46962567260272e-06, + "loss": 0.2284, "step": 868900 }, { - "epoch": 8.85, - "learning_rate": 2.298324252606903e-05, - "loss": 0.3558, + "epoch": 11.972665399134772, + "grad_norm": 1.832765817642212, + "learning_rate": 5.464829170389732e-06, + "loss": 0.2363, "step": 869000 }, { - "epoch": 8.85, - "learning_rate": 2.297679309789214e-05, - "loss": 0.4046, + "epoch": 11.974043151194511, + "grad_norm": 2.1697936058044434, + "learning_rate": 5.460034540177231e-06, + "loss": 0.2654, "step": 869100 }, { - "epoch": 8.86, - "learning_rate": 2.2970344013108156e-05, - "loss": 0.3959, + "epoch": 11.97542090325425, + "grad_norm": 1.1668633222579956, + "learning_rate": 5.45524178237255e-06, + "loss": 0.2486, "step": 869200 }, { - "epoch": 8.86, - "learning_rate": 2.29638952720324e-05, - "loss": 0.3386, + "epoch": 11.97679865531399, + "grad_norm": 1.0695405006408691, + "learning_rate": 5.450450897382867e-06, + "loss": 0.2484, "step": 869300 }, { - "epoch": 8.86, - "learning_rate": 2.2957446874980177e-05, - "loss": 0.3771, + "epoch": 11.978176407373729, + "grad_norm": 3.3183772563934326, + "learning_rate": 5.445661885615186e-06, + "loss": 0.2399, "step": 869400 }, { - "epoch": 8.86, - "learning_rate": 2.2950998822266773e-05, - "loss": 0.3176, + "epoch": 11.979554159433468, + "grad_norm": 1.032438039779663, + "learning_rate": 5.44087474747635e-06, + "loss": 0.2586, "step": 869500 }, { - "epoch": 8.86, - "learning_rate": 2.2944551114207454e-05, - "loss": 0.3724, + "epoch": 11.980931911493208, + "grad_norm": 3.3048338890075684, + "learning_rate": 5.436089483373056e-06, + "loss": 0.2623, "step": 869600 }, { - "epoch": 8.86, - "learning_rate": 2.29381037511175e-05, - "loss": 0.3508, + "epoch": 11.982309663552947, + "grad_norm": 3.9166269302368164, + "learning_rate": 5.431306093711846e-06, + "loss": 0.2211, "step": 869700 }, { - "epoch": 8.86, - "learning_rate": 2.293172120177997e-05, - "loss": 0.4425, + "epoch": 11.983687415612687, + "grad_norm": 1.294677734375, + "learning_rate": 5.426524578899079e-06, + "loss": 0.2232, "step": 869800 }, { - "epoch": 8.86, - "learning_rate": 2.292527452611684e-05, - "loss": 0.435, + "epoch": 11.985065167672426, + "grad_norm": 3.3069231510162354, + "learning_rate": 5.421744939340986e-06, + "loss": 0.253, "step": 869900 }, { - "epoch": 8.86, - "learning_rate": 2.2918828196365556e-05, - "loss": 0.3845, + "epoch": 11.986442919732164, + "grad_norm": 2.4778196811676025, + "learning_rate": 5.416967175443611e-06, + "loss": 0.286, "step": 870000 }, { - "epoch": 8.86, - "learning_rate": 2.2912382212841316e-05, - "loss": 0.3656, + "epoch": 11.987820671791905, + "grad_norm": 2.4469211101531982, + "learning_rate": 5.41219128761284e-06, + "loss": 0.2347, "step": 870100 }, { - "epoch": 8.87, - "learning_rate": 2.2905936575859292e-05, - "loss": 0.3998, + "epoch": 11.989198423851644, + "grad_norm": 0.7883899807929993, + "learning_rate": 5.407417276254422e-06, + "loss": 0.2491, "step": 870200 }, { - "epoch": 8.87, - "learning_rate": 2.289949128573463e-05, - "loss": 0.4382, + "epoch": 11.990576175911382, + "grad_norm": 1.967146873474121, + "learning_rate": 5.402645141773938e-06, + "loss": 0.2675, "step": 870300 }, { - "epoch": 8.87, - "learning_rate": 2.2893046342782472e-05, - "loss": 0.3756, + "epoch": 11.991953927971123, + "grad_norm": 8.64643669128418, + "learning_rate": 5.397874884576792e-06, + "loss": 0.2785, "step": 870400 }, { - "epoch": 8.87, - "learning_rate": 2.288660174731795e-05, - "loss": 0.3815, + "epoch": 11.993331680030861, + "grad_norm": 2.2792370319366455, + "learning_rate": 5.393106505068254e-06, + "loss": 0.2357, "step": 870500 }, { - "epoch": 8.87, - "learning_rate": 2.2880157499656137e-05, - "loss": 0.476, + "epoch": 11.994709432090602, + "grad_norm": 1.4582953453063965, + "learning_rate": 5.388340003653418e-06, + "loss": 0.291, "step": 870600 }, { - "epoch": 8.87, - "learning_rate": 2.287371360011213e-05, - "loss": 0.324, + "epoch": 11.99608718415034, + "grad_norm": 1.8107428550720215, + "learning_rate": 5.383575380737216e-06, + "loss": 0.2461, "step": 870700 }, { - "epoch": 8.87, - "learning_rate": 2.286727004900101e-05, - "loss": 0.3781, + "epoch": 11.99746493621008, + "grad_norm": 1.6535526514053345, + "learning_rate": 5.378812636724431e-06, + "loss": 0.2305, "step": 870800 }, { - "epoch": 8.87, - "learning_rate": 2.286082684663781e-05, - "loss": 0.3932, + "epoch": 11.99884268826982, + "grad_norm": 5.824621677398682, + "learning_rate": 5.374051772019689e-06, + "loss": 0.2651, "step": 870900 }, { - "epoch": 8.87, - "learning_rate": 2.2854383993337576e-05, - "loss": 0.35, + "epoch": 12.000220440329558, + "grad_norm": 0.4002041518688202, + "learning_rate": 5.369292787027438e-06, + "loss": 0.2277, "step": 871000 }, { - "epoch": 8.87, - "learning_rate": 2.2847941489415336e-05, - "loss": 0.3491, + "epoch": 12.001598192389297, + "grad_norm": 2.512850522994995, + "learning_rate": 5.364535682151991e-06, + "loss": 0.2099, "step": 871100 }, { - "epoch": 8.88, - "learning_rate": 2.284149933518606e-05, - "loss": 0.2841, + "epoch": 12.002975944449037, + "grad_norm": 5.065108776092529, + "learning_rate": 5.359780457797477e-06, + "loss": 0.2162, "step": 871200 }, { - "epoch": 8.88, - "learning_rate": 2.2835057530964744e-05, - "loss": 0.4027, + "epoch": 12.004353696508776, + "grad_norm": 0.08953443169593811, + "learning_rate": 5.355027114367887e-06, + "loss": 0.1947, "step": 871300 }, { - "epoch": 8.88, - "learning_rate": 2.282861607706637e-05, - "loss": 0.3563, + "epoch": 12.005731448568516, + "grad_norm": 1.594722867012024, + "learning_rate": 5.350275652267031e-06, + "loss": 0.1853, "step": 871400 }, { - "epoch": 8.88, - "learning_rate": 2.2822174973805863e-05, - "loss": 0.4229, + "epoch": 12.007109200628255, + "grad_norm": 0.008780485019087791, + "learning_rate": 5.345526071898575e-06, + "loss": 0.1885, "step": 871500 }, { - "epoch": 8.88, - "learning_rate": 2.2815734221498166e-05, - "loss": 0.36, + "epoch": 12.008486952687994, + "grad_norm": 1.3999711275100708, + "learning_rate": 5.340778373666026e-06, + "loss": 0.223, "step": 871600 }, { - "epoch": 8.88, - "learning_rate": 2.280929382045819e-05, - "loss": 0.3585, + "epoch": 12.009864704747734, + "grad_norm": 3.716535806655884, + "learning_rate": 5.336032557972723e-06, + "loss": 0.2287, "step": 871700 }, { - "epoch": 8.88, - "learning_rate": 2.2802853771000847e-05, - "loss": 0.2945, + "epoch": 12.011242456807473, + "grad_norm": 2.367682695388794, + "learning_rate": 5.331288625221834e-06, + "loss": 0.1967, "step": 871800 }, { - "epoch": 8.88, - "learning_rate": 2.2796414073440995e-05, - "loss": 0.3245, + "epoch": 12.012620208867212, + "grad_norm": 1.9853510856628418, + "learning_rate": 5.3265465758164e-06, + "loss": 0.2365, "step": 871900 }, { - "epoch": 8.88, - "learning_rate": 2.27899747280935e-05, - "loss": 0.3865, + "epoch": 12.013997960926952, + "grad_norm": 2.095587730407715, + "learning_rate": 5.321806410159267e-06, + "loss": 0.266, "step": 872000 }, { - "epoch": 8.89, - "learning_rate": 2.2783535735273227e-05, - "loss": 0.3873, + "epoch": 12.01537571298669, + "grad_norm": 2.4229087829589844, + "learning_rate": 5.31706812865314e-06, + "loss": 0.27, "step": 872100 }, { - "epoch": 8.89, - "learning_rate": 2.2777097095294982e-05, - "loss": 0.3004, + "epoch": 12.01675346504643, + "grad_norm": 4.114567279815674, + "learning_rate": 5.312331731700572e-06, + "loss": 0.2225, "step": 872200 }, { - "epoch": 8.89, - "learning_rate": 2.2770658808473582e-05, - "loss": 0.2668, + "epoch": 12.01813121710617, + "grad_norm": 2.40048885345459, + "learning_rate": 5.307597219703931e-06, + "loss": 0.2504, "step": 872300 }, { - "epoch": 8.89, - "learning_rate": 2.276422087512384e-05, - "loss": 0.3826, + "epoch": 12.019508969165908, + "grad_norm": 3.690300464630127, + "learning_rate": 5.302864593065439e-06, + "loss": 0.2213, "step": 872400 }, { - "epoch": 8.89, - "learning_rate": 2.2757783295560503e-05, - "loss": 0.3585, + "epoch": 12.020886721225649, + "grad_norm": 4.3351664543151855, + "learning_rate": 5.298133852187156e-06, + "loss": 0.265, "step": 872500 }, { - "epoch": 8.89, - "learning_rate": 2.2751346070098345e-05, - "loss": 0.3303, + "epoch": 12.022264473285388, + "grad_norm": 0.5285466909408569, + "learning_rate": 5.293404997470995e-06, + "loss": 0.2106, "step": 872600 }, { - "epoch": 8.89, - "learning_rate": 2.274490919905211e-05, - "loss": 0.3826, + "epoch": 12.023642225345126, + "grad_norm": 2.2210826873779297, + "learning_rate": 5.28867802931868e-06, + "loss": 0.195, "step": 872700 }, { - "epoch": 8.89, - "learning_rate": 2.2738472682736514e-05, - "loss": 0.3798, + "epoch": 12.025019977404867, + "grad_norm": 2.008704662322998, + "learning_rate": 5.283952948131807e-06, + "loss": 0.2497, "step": 872800 }, { - "epoch": 8.89, - "learning_rate": 2.2732036521466266e-05, - "loss": 0.4135, + "epoch": 12.026397729464605, + "grad_norm": 0.09192755073308945, + "learning_rate": 5.279229754311786e-06, + "loss": 0.2486, "step": 872900 }, { - "epoch": 8.89, - "learning_rate": 2.2725600715556074e-05, - "loss": 0.3373, + "epoch": 12.027775481524344, + "grad_norm": 3.7384469509124756, + "learning_rate": 5.274508448259871e-06, + "loss": 0.2452, "step": 873000 }, { - "epoch": 8.9, - "learning_rate": 2.2719165265320584e-05, - "loss": 0.3993, + "epoch": 12.029153233584085, + "grad_norm": 6.8460540771484375, + "learning_rate": 5.2697890303771695e-06, + "loss": 0.2496, "step": 873100 }, { - "epoch": 8.9, - "learning_rate": 2.2712730171074464e-05, - "loss": 0.3426, + "epoch": 12.030530985643823, + "grad_norm": 2.203611373901367, + "learning_rate": 5.265071501064626e-06, + "loss": 0.2551, "step": 873200 }, { - "epoch": 8.9, - "learning_rate": 2.270629543313235e-05, - "loss": 0.3983, + "epoch": 12.031908737703564, + "grad_norm": 4.169670581817627, + "learning_rate": 5.260355860723012e-06, + "loss": 0.2495, "step": 873300 }, { - "epoch": 8.9, - "learning_rate": 2.2699861051808864e-05, - "loss": 0.3553, + "epoch": 12.033286489763302, + "grad_norm": 0.34469807147979736, + "learning_rate": 5.255642109752935e-06, + "loss": 0.2493, "step": 873400 }, { - "epoch": 8.9, - "learning_rate": 2.2693427027418608e-05, - "loss": 0.3724, + "epoch": 12.034664241823041, + "grad_norm": 0.015197351574897766, + "learning_rate": 5.250930248554864e-06, + "loss": 0.2417, "step": 873500 }, { - "epoch": 8.9, - "learning_rate": 2.2686993360276166e-05, - "loss": 0.315, + "epoch": 12.036041993882781, + "grad_norm": 1.5760308504104614, + "learning_rate": 5.246220277529103e-06, + "loss": 0.2191, "step": 873600 }, { - "epoch": 8.9, - "learning_rate": 2.2680560050696116e-05, - "loss": 0.4486, + "epoch": 12.03741974594252, + "grad_norm": 9.653153419494629, + "learning_rate": 5.241512197075768e-06, + "loss": 0.2308, "step": 873700 }, { - "epoch": 8.9, - "learning_rate": 2.267412709899299e-05, - "loss": 0.3247, + "epoch": 12.038797498002259, + "grad_norm": 2.7352209091186523, + "learning_rate": 5.236806007594847e-06, + "loss": 0.2294, "step": 873800 }, { - "epoch": 8.9, - "learning_rate": 2.2667694505481328e-05, - "loss": 0.4153, + "epoch": 12.040175250062, + "grad_norm": 2.580380916595459, + "learning_rate": 5.232101709486168e-06, + "loss": 0.2309, "step": 873900 }, { - "epoch": 8.9, - "learning_rate": 2.2661326591050074e-05, - "loss": 0.4166, + "epoch": 12.041553002121738, + "grad_norm": 1.1028344631195068, + "learning_rate": 5.227399303149357e-06, + "loss": 0.2563, "step": 874000 }, { - "epoch": 8.91, - "learning_rate": 2.2654894711275113e-05, - "loss": 0.4076, + "epoch": 12.042930754181478, + "grad_norm": 1.87930428981781, + "learning_rate": 5.222698788983922e-06, + "loss": 0.2322, "step": 874100 }, { - "epoch": 8.91, - "learning_rate": 2.2648463190631975e-05, - "loss": 0.3584, + "epoch": 12.044308506241217, + "grad_norm": NaN, + "learning_rate": 5.218047144235605e-06, + "loss": 0.2192, "step": 874200 }, { - "epoch": 8.91, - "learning_rate": 2.264203202943511e-05, - "loss": 0.441, + "epoch": 12.045686258300956, + "grad_norm": 3.269401788711548, + "learning_rate": 5.213350396679088e-06, + "loss": 0.2469, "step": 874300 }, { - "epoch": 8.91, - "learning_rate": 2.2635601227998965e-05, - "loss": 0.2954, + "epoch": 12.047064010360696, + "grad_norm": 4.109653949737549, + "learning_rate": 5.208655542487473e-06, + "loss": 0.1925, "step": 874400 }, { - "epoch": 8.91, - "learning_rate": 2.2629170786637985e-05, - "loss": 0.3917, + "epoch": 12.048441762420435, + "grad_norm": 2.3858397006988525, + "learning_rate": 5.203962582059605e-06, + "loss": 0.1866, "step": 874500 }, { - "epoch": 8.91, - "learning_rate": 2.2622740705666556e-05, - "loss": 0.3529, + "epoch": 12.049819514480173, + "grad_norm": 2.3483970165252686, + "learning_rate": 5.199271515794183e-06, + "loss": 0.2412, "step": 874600 }, { - "epoch": 8.91, - "learning_rate": 2.2616310985399075e-05, - "loss": 0.3282, + "epoch": 12.051197266539914, + "grad_norm": 2.6427602767944336, + "learning_rate": 5.1945823440897394e-06, + "loss": 0.187, "step": 874700 }, { - "epoch": 8.91, - "learning_rate": 2.2609881626149936e-05, - "loss": 0.3399, + "epoch": 12.052575018599653, + "grad_norm": 1.9800199270248413, + "learning_rate": 5.189895067344634e-06, + "loss": 0.2479, "step": 874800 }, { - "epoch": 8.91, - "learning_rate": 2.2603452628233478e-05, - "loss": 0.3534, + "epoch": 12.053952770659393, + "grad_norm": 3.604522466659546, + "learning_rate": 5.185209685957087e-06, + "loss": 0.2122, "step": 874900 }, { - "epoch": 8.91, - "learning_rate": 2.2597023991964042e-05, - "loss": 0.3037, + "epoch": 12.055330522719132, + "grad_norm": 0.6902385950088501, + "learning_rate": 5.1805262003251395e-06, + "loss": 0.2121, "step": 875000 }, { - "epoch": 8.92, - "learning_rate": 2.259059571765597e-05, - "loss": 0.4174, + "epoch": 12.05670827477887, + "grad_norm": 4.640689849853516, + "learning_rate": 5.175844610846669e-06, + "loss": 0.2789, "step": 875100 }, { - "epoch": 8.92, - "learning_rate": 2.2584232082949558e-05, - "loss": 0.3525, + "epoch": 12.05808602683861, + "grad_norm": 5.568762302398682, + "learning_rate": 5.171164917919408e-06, + "loss": 0.2276, "step": 875200 }, { - "epoch": 8.92, - "learning_rate": 2.2577804529879614e-05, - "loss": 0.3346, + "epoch": 12.05946377889835, + "grad_norm": 1.4291658401489258, + "learning_rate": 5.166487121940927e-06, + "loss": 0.2379, "step": 875300 }, { - "epoch": 8.92, - "learning_rate": 2.257137733971073e-05, - "loss": 0.3791, + "epoch": 12.060841530958088, + "grad_norm": 4.2890825271606445, + "learning_rate": 5.16181122330862e-06, + "loss": 0.1896, "step": 875400 }, { - "epoch": 8.92, - "learning_rate": 2.2564950512757163e-05, - "loss": 0.3454, + "epoch": 12.062219283017829, + "grad_norm": 1.5138558149337769, + "learning_rate": 5.157137222419735e-06, + "loss": 0.1939, "step": 875500 }, { - "epoch": 8.92, - "learning_rate": 2.2558524049333157e-05, - "loss": 0.3183, + "epoch": 12.063597035077567, + "grad_norm": 1.2165333032608032, + "learning_rate": 5.15246511967134e-06, + "loss": 0.2266, "step": 875600 }, { - "epoch": 8.92, - "learning_rate": 2.2552162208946643e-05, - "loss": 0.3629, + "epoch": 12.064974787137308, + "grad_norm": 2.8036587238311768, + "learning_rate": 5.1477949154603696e-06, + "loss": 0.2747, "step": 875700 }, { - "epoch": 8.92, - "learning_rate": 2.254573646988123e-05, - "loss": 0.3673, + "epoch": 12.066352539197046, + "grad_norm": 1.6879605054855347, + "learning_rate": 5.14312661018357e-06, + "loss": 0.2572, "step": 875800 }, { - "epoch": 8.92, - "learning_rate": 2.253931109528481e-05, - "loss": 0.4075, + "epoch": 12.067730291256785, + "grad_norm": 2.993696689605713, + "learning_rate": 5.138460204237547e-06, + "loss": 0.223, "step": 875900 }, { - "epoch": 8.92, - "learning_rate": 2.253288608547155e-05, - "loss": 0.3859, + "epoch": 12.069108043316525, + "grad_norm": 4.333001613616943, + "learning_rate": 5.133795698018722e-06, + "loss": 0.2698, "step": 876000 }, { - "epoch": 8.93, - "learning_rate": 2.252646144075561e-05, - "loss": 0.4102, + "epoch": 12.070485795376264, + "grad_norm": 1.8231858015060425, + "learning_rate": 5.1291330919233875e-06, + "loss": 0.2684, "step": 876100 }, { - "epoch": 8.93, - "learning_rate": 2.2520037161451096e-05, - "loss": 0.3128, + "epoch": 12.071863547436003, + "grad_norm": 7.267999649047852, + "learning_rate": 5.124472386347634e-06, + "loss": 0.2417, "step": 876200 }, { - "epoch": 8.93, - "learning_rate": 2.2513613247872126e-05, - "loss": 0.3682, + "epoch": 12.073241299495743, + "grad_norm": 3.1360344886779785, + "learning_rate": 5.119813581687435e-06, + "loss": 0.211, "step": 876300 }, { - "epoch": 8.93, - "learning_rate": 2.2507189700332804e-05, - "loss": 0.3214, + "epoch": 12.074619051555482, + "grad_norm": 1.487412691116333, + "learning_rate": 5.1151566783385594e-06, + "loss": 0.2242, "step": 876400 }, { - "epoch": 8.93, - "learning_rate": 2.250076651914718e-05, - "loss": 0.3917, + "epoch": 12.07599680361522, + "grad_norm": 1.644063115119934, + "learning_rate": 5.110501676696643e-06, + "loss": 0.2221, "step": 876500 }, { - "epoch": 8.93, - "learning_rate": 2.249434370462932e-05, - "loss": 0.3817, + "epoch": 12.077374555674961, + "grad_norm": 2.2688229084014893, + "learning_rate": 5.105848577157169e-06, + "loss": 0.2127, "step": 876600 }, { - "epoch": 8.93, - "learning_rate": 2.2487921257093273e-05, - "loss": 0.4182, + "epoch": 12.0787523077347, + "grad_norm": 3.4036290645599365, + "learning_rate": 5.101197380115413e-06, + "loss": 0.2438, "step": 876700 }, { - "epoch": 8.93, - "learning_rate": 2.2481499176853034e-05, - "loss": 0.3712, + "epoch": 12.08013005979444, + "grad_norm": 1.6512223482131958, + "learning_rate": 5.096548085966529e-06, + "loss": 0.2217, "step": 876800 }, { - "epoch": 8.93, - "learning_rate": 2.247507746422262e-05, - "loss": 0.4004, + "epoch": 12.081507811854179, + "grad_norm": 0.5855816602706909, + "learning_rate": 5.091900695105509e-06, + "loss": 0.1952, "step": 876900 }, { - "epoch": 8.94, - "learning_rate": 2.246865611951601e-05, - "loss": 0.3938, + "epoch": 12.082885563913917, + "grad_norm": 0.8231651186943054, + "learning_rate": 5.087255207927157e-06, + "loss": 0.2064, "step": 877000 }, { - "epoch": 8.94, - "learning_rate": 2.2462235143047184e-05, - "loss": 0.345, + "epoch": 12.084263315973658, + "grad_norm": 3.6263163089752197, + "learning_rate": 5.082611624826135e-06, + "loss": 0.2598, "step": 877100 }, { - "epoch": 8.94, - "learning_rate": 2.2455814535130064e-05, - "loss": 0.3507, + "epoch": 12.085641068033397, + "grad_norm": 2.1534066200256348, + "learning_rate": 5.077969946196951e-06, + "loss": 0.2222, "step": 877200 }, { - "epoch": 8.94, - "learning_rate": 2.2449394296078594e-05, - "loss": 0.3857, + "epoch": 12.087018820093135, + "grad_norm": 1.4784257411956787, + "learning_rate": 5.0733301724339315e-06, + "loss": 0.2467, "step": 877300 }, { - "epoch": 8.94, - "learning_rate": 2.2442974426206685e-05, - "loss": 0.3696, + "epoch": 12.088396572152876, + "grad_norm": 2.992199659347534, + "learning_rate": 5.068692303931241e-06, + "loss": 0.2829, "step": 877400 }, { - "epoch": 8.94, - "learning_rate": 2.2436554925828223e-05, - "loss": 0.3706, + "epoch": 12.089774324212614, + "grad_norm": 0.5653468370437622, + "learning_rate": 5.064056341082895e-06, + "loss": 0.2068, "step": 877500 }, { - "epoch": 8.94, - "learning_rate": 2.2430135795257092e-05, - "loss": 0.3789, + "epoch": 12.091152076272355, + "grad_norm": 1.0328912734985352, + "learning_rate": 5.0594222842827525e-06, + "loss": 0.1839, "step": 877600 }, { - "epoch": 8.94, - "learning_rate": 2.242371703480716e-05, - "loss": 0.3475, + "epoch": 12.092529828332093, + "grad_norm": 2.5018107891082764, + "learning_rate": 5.054790133924492e-06, + "loss": 0.2418, "step": 877700 }, { - "epoch": 8.94, - "learning_rate": 2.241729864479223e-05, - "loss": 0.3374, + "epoch": 12.093907580391832, + "grad_norm": 6.622915744781494, + "learning_rate": 5.050159890401631e-06, + "loss": 0.2373, "step": 877800 }, { - "epoch": 8.94, - "learning_rate": 2.241088062552616e-05, - "loss": 0.3478, + "epoch": 12.095285332451573, + "grad_norm": 3.7919318675994873, + "learning_rate": 5.045531554107548e-06, + "loss": 0.2533, "step": 877900 }, { - "epoch": 8.95, - "learning_rate": 2.240446297732273e-05, - "loss": 0.3472, + "epoch": 12.096663084511311, + "grad_norm": 2.1864354610443115, + "learning_rate": 5.040905125435428e-06, + "loss": 0.2453, "step": 878000 }, { - "epoch": 8.95, - "learning_rate": 2.2398045700495734e-05, - "loss": 0.3454, + "epoch": 12.09804083657105, + "grad_norm": 1.983436107635498, + "learning_rate": 5.0362806047783165e-06, + "loss": 0.1929, "step": 878100 }, { - "epoch": 8.95, - "learning_rate": 2.2391628795358934e-05, - "loss": 0.3301, + "epoch": 12.09941858863079, + "grad_norm": 2.7104105949401855, + "learning_rate": 5.031657992529101e-06, + "loss": 0.2468, "step": 878200 }, { - "epoch": 8.95, - "learning_rate": 2.2385212262226096e-05, - "loss": 0.3804, + "epoch": 12.100796340690529, + "grad_norm": 0.5343512892723083, + "learning_rate": 5.0270372890804854e-06, + "loss": 0.2232, "step": 878300 }, { - "epoch": 8.95, - "learning_rate": 2.2378796101410924e-05, - "loss": 0.3561, + "epoch": 12.10217409275027, + "grad_norm": 2.8182480335235596, + "learning_rate": 5.022418494825014e-06, + "loss": 0.2684, "step": 878400 }, { - "epoch": 8.95, - "learning_rate": 2.2372380313227137e-05, - "loss": 0.3418, + "epoch": 12.103551844810008, + "grad_norm": 2.4777164459228516, + "learning_rate": 5.0178016101550885e-06, + "loss": 0.2103, "step": 878500 }, { - "epoch": 8.95, - "learning_rate": 2.236596489798844e-05, - "loss": 0.4083, + "epoch": 12.104929596869747, + "grad_norm": 1.2576487064361572, + "learning_rate": 5.0131866354629395e-06, + "loss": 0.2323, "step": 878600 }, { - "epoch": 8.95, - "learning_rate": 2.2359549856008507e-05, - "loss": 0.3715, + "epoch": 12.106307348929487, + "grad_norm": 7.165399551391602, + "learning_rate": 5.008573571140623e-06, + "loss": 0.2498, "step": 878700 }, { - "epoch": 8.95, - "learning_rate": 2.235313518760098e-05, - "loss": 0.3934, + "epoch": 12.107685100989226, + "grad_norm": 2.007286310195923, + "learning_rate": 5.0039624175800425e-06, + "loss": 0.1984, "step": 878800 }, { - "epoch": 8.95, - "learning_rate": 2.234672089307951e-05, - "loss": 0.3033, + "epoch": 12.109062853048965, + "grad_norm": 2.9858615398406982, + "learning_rate": 4.9993531751729625e-06, + "loss": 0.2142, "step": 878900 }, { - "epoch": 8.96, - "learning_rate": 2.2340306972757732e-05, - "loss": 0.3762, + "epoch": 12.110440605108705, + "grad_norm": 15.602959632873535, + "learning_rate": 4.994745844310928e-06, + "loss": 0.2963, "step": 879000 }, { - "epoch": 8.96, - "learning_rate": 2.2333893426949214e-05, - "loss": 0.3874, + "epoch": 12.111818357168444, + "grad_norm": 3.1856021881103516, + "learning_rate": 4.990140425385366e-06, + "loss": 0.2316, "step": 879100 }, { - "epoch": 8.96, - "learning_rate": 2.232748025596756e-05, - "loss": 0.3433, + "epoch": 12.113196109228184, + "grad_norm": 13.943312644958496, + "learning_rate": 4.985582944386211e-06, + "loss": 0.2195, "step": 879200 }, { - "epoch": 8.96, - "learning_rate": 2.2321067460126344e-05, - "loss": 0.3843, + "epoch": 12.114573861287923, + "grad_norm": 3.4007277488708496, + "learning_rate": 4.980981331378088e-06, + "loss": 0.246, "step": 879300 }, { - "epoch": 8.96, - "learning_rate": 2.2314655039739095e-05, - "loss": 0.3974, + "epoch": 12.115951613347661, + "grad_norm": 2.927015542984009, + "learning_rate": 4.976381631475808e-06, + "loss": 0.239, "step": 879400 }, { - "epoch": 8.96, - "learning_rate": 2.230824299511935e-05, - "loss": 0.3449, + "epoch": 12.117329365407402, + "grad_norm": 1.080040693283081, + "learning_rate": 4.971783845070132e-06, + "loss": 0.2341, "step": 879500 }, { - "epoch": 8.96, - "learning_rate": 2.2301831326580633e-05, - "loss": 0.3849, + "epoch": 12.11870711746714, + "grad_norm": 2.457324981689453, + "learning_rate": 4.96718797255167e-06, + "loss": 0.2177, "step": 879600 }, { - "epoch": 8.96, - "learning_rate": 2.2295420034436417e-05, - "loss": 0.3229, + "epoch": 12.12008486952688, + "grad_norm": 2.072075366973877, + "learning_rate": 4.962594014310875e-06, + "loss": 0.2554, "step": 879700 }, { - "epoch": 8.96, - "learning_rate": 2.2289009119000175e-05, - "loss": 0.3901, + "epoch": 12.12146262158662, + "grad_norm": 2.2446069717407227, + "learning_rate": 4.95800197073801e-06, + "loss": 0.2392, "step": 879800 }, { - "epoch": 8.96, - "learning_rate": 2.2282598580585378e-05, - "loss": 0.3185, + "epoch": 12.122840373646358, + "grad_norm": 0.6934237480163574, + "learning_rate": 4.9534118422232095e-06, + "loss": 0.2147, "step": 879900 }, { - "epoch": 8.97, - "learning_rate": 2.227618841950545e-05, - "loss": 0.3859, + "epoch": 12.124218125706099, + "grad_norm": 0.6362463235855103, + "learning_rate": 4.948823629156422e-06, + "loss": 0.2125, "step": 880000 }, { - "epoch": 8.97, - "learning_rate": 2.2269778636073816e-05, - "loss": 0.349, + "epoch": 12.125595877765837, + "grad_norm": 1.511073350906372, + "learning_rate": 4.94423733192743e-06, + "loss": 0.2167, "step": 880100 }, { - "epoch": 8.97, - "learning_rate": 2.2263369230603893e-05, - "loss": 0.3515, + "epoch": 12.126973629825576, + "grad_norm": 3.7890899181365967, + "learning_rate": 4.939652950925873e-06, + "loss": 0.2275, "step": 880200 }, { - "epoch": 8.97, - "learning_rate": 2.2256960203409024e-05, - "loss": 0.3549, + "epoch": 12.128351381885317, + "grad_norm": 2.48779559135437, + "learning_rate": 4.93507048654122e-06, + "loss": 0.2282, "step": 880300 }, { - "epoch": 8.97, - "learning_rate": 2.2250551554802595e-05, - "loss": 0.3362, + "epoch": 12.129729133945055, + "grad_norm": 2.0339746475219727, + "learning_rate": 4.930489939162764e-06, + "loss": 0.225, "step": 880400 }, { - "epoch": 8.97, - "learning_rate": 2.2244143285097948e-05, - "loss": 0.3947, + "epoch": 12.131106886004794, + "grad_norm": 3.99558687210083, + "learning_rate": 4.925911309179661e-06, + "loss": 0.2301, "step": 880500 }, { - "epoch": 8.97, - "learning_rate": 2.223773539460842e-05, - "loss": 0.3221, + "epoch": 12.132484638064534, + "grad_norm": 3.0762410163879395, + "learning_rate": 4.9213345969808755e-06, + "loss": 0.2474, "step": 880600 }, { - "epoch": 8.97, - "learning_rate": 2.223139195687722e-05, - "loss": 0.4539, + "epoch": 12.133862390124273, + "grad_norm": 9.100939750671387, + "learning_rate": 4.916759802955232e-06, + "loss": 0.2379, "step": 880700 }, { - "epoch": 8.97, - "learning_rate": 2.2224984821957842e-05, - "loss": 0.3804, + "epoch": 12.135240142184012, + "grad_norm": 2.1262528896331787, + "learning_rate": 4.91218692749137e-06, + "loss": 0.2263, "step": 880800 }, { - "epoch": 8.97, - "learning_rate": 2.22185780671903e-05, - "loss": 0.3907, + "epoch": 12.136617894243752, + "grad_norm": 2.44498872756958, + "learning_rate": 4.9076159709777966e-06, + "loss": 0.2089, "step": 880900 }, { - "epoch": 8.98, - "learning_rate": 2.2212171692887856e-05, - "loss": 0.4091, + "epoch": 12.13799564630349, + "grad_norm": 1.489201307296753, + "learning_rate": 4.903046933802819e-06, + "loss": 0.2558, "step": 881000 }, { - "epoch": 8.98, - "learning_rate": 2.220576569936375e-05, - "loss": 0.3566, + "epoch": 12.139373398363231, + "grad_norm": 3.68166446685791, + "learning_rate": 4.8984798163546165e-06, + "loss": 0.2055, "step": 881100 }, { - "epoch": 8.98, - "learning_rate": 2.2199360086931175e-05, - "loss": 0.3679, + "epoch": 12.14075115042297, + "grad_norm": 3.419917583465576, + "learning_rate": 4.893914619021177e-06, + "loss": 0.2586, "step": 881200 }, { - "epoch": 8.98, - "learning_rate": 2.2192954855903344e-05, - "loss": 0.3753, + "epoch": 12.142128902482709, + "grad_norm": 2.3484318256378174, + "learning_rate": 4.889396965450891e-06, + "loss": 0.243, "step": 881300 }, { - "epoch": 8.98, - "learning_rate": 2.2186550006593445e-05, - "loss": 0.3357, + "epoch": 12.143506654542449, + "grad_norm": 4.913581371307373, + "learning_rate": 4.88483559029952e-06, + "loss": 0.2082, "step": 881400 }, { - "epoch": 8.98, - "learning_rate": 2.2180145539314606e-05, - "loss": 0.3901, + "epoch": 12.144884406602188, + "grad_norm": 0.4809964895248413, + "learning_rate": 4.880276136422061e-06, + "loss": 0.2868, "step": 881500 }, { - "epoch": 8.98, - "learning_rate": 2.2173741454379982e-05, - "loss": 0.4165, + "epoch": 12.146262158661926, + "grad_norm": 3.337538242340088, + "learning_rate": 4.875718604205867e-06, + "loss": 0.2334, "step": 881600 }, { - "epoch": 8.98, - "learning_rate": 2.216733775210271e-05, - "loss": 0.4139, + "epoch": 12.147639910721667, + "grad_norm": 1.8974077701568604, + "learning_rate": 4.871162994038128e-06, + "loss": 0.2057, "step": 881700 }, { - "epoch": 8.98, - "learning_rate": 2.2160934432795865e-05, - "loss": 0.2995, + "epoch": 12.149017662781405, + "grad_norm": 3.166837215423584, + "learning_rate": 4.86660930630585e-06, + "loss": 0.2001, "step": 881800 }, { - "epoch": 8.98, - "learning_rate": 2.215459552423451e-05, - "loss": 0.4039, + "epoch": 12.150395414841146, + "grad_norm": 9.224626541137695, + "learning_rate": 4.862057541395904e-06, + "loss": 0.1922, "step": 881900 }, { - "epoch": 8.99, - "learning_rate": 2.214819296797027e-05, - "loss": 0.3238, + "epoch": 12.151773166900885, + "grad_norm": 9.962299346923828, + "learning_rate": 4.857507699694992e-06, + "loss": 0.2057, "step": 882000 }, { - "epoch": 8.99, - "learning_rate": 2.2141790795612533e-05, - "loss": 0.3972, + "epoch": 12.153150918960623, + "grad_norm": 4.584589958190918, + "learning_rate": 4.852959781589634e-06, + "loss": 0.2164, "step": 882100 }, { - "epoch": 8.99, - "learning_rate": 2.2135389007474346e-05, - "loss": 0.3677, + "epoch": 12.154528671020364, + "grad_norm": 2.1607563495635986, + "learning_rate": 4.848413787466205e-06, + "loss": 0.2065, "step": 882200 }, { - "epoch": 8.99, - "learning_rate": 2.212898760386869e-05, - "loss": 0.288, + "epoch": 12.155906423080102, + "grad_norm": 0.4329064190387726, + "learning_rate": 4.843869717710909e-06, + "loss": 0.2134, "step": 882300 }, { - "epoch": 8.99, - "learning_rate": 2.212258658510857e-05, - "loss": 0.3738, + "epoch": 12.157284175139841, + "grad_norm": 3.1414289474487305, + "learning_rate": 4.839327572709792e-06, + "loss": 0.2389, "step": 882400 }, { - "epoch": 8.99, - "learning_rate": 2.2116185951506962e-05, - "loss": 0.3134, + "epoch": 12.158661927199581, + "grad_norm": 1.0964374542236328, + "learning_rate": 4.834787352848724e-06, + "loss": 0.2629, "step": 882500 }, { - "epoch": 8.99, - "learning_rate": 2.2109785703376805e-05, - "loss": 0.4154, + "epoch": 12.16003967925932, + "grad_norm": 1.6723058223724365, + "learning_rate": 4.830249058513425e-06, + "loss": 0.2363, "step": 882600 }, { - "epoch": 8.99, - "learning_rate": 2.210338584103104e-05, - "loss": 0.3371, + "epoch": 12.16141743131906, + "grad_norm": 3.7809197902679443, + "learning_rate": 4.825712690089459e-06, + "loss": 0.228, "step": 882700 }, { - "epoch": 8.99, - "learning_rate": 2.2096986364782596e-05, - "loss": 0.36, + "epoch": 12.1627951833788, + "grad_norm": 2.670841693878174, + "learning_rate": 4.821178247962187e-06, + "loss": 0.2436, "step": 882800 }, { - "epoch": 9.0, - "learning_rate": 2.2090587274944346e-05, - "loss": 0.3375, + "epoch": 12.164172935438538, + "grad_norm": 3.943086624145508, + "learning_rate": 4.816645732516846e-06, + "loss": 0.2013, "step": 882900 }, { - "epoch": 9.0, - "learning_rate": 2.2084188571829176e-05, - "loss": 0.3393, + "epoch": 12.165550687498278, + "grad_norm": 2.231783151626587, + "learning_rate": 4.812115144138505e-06, + "loss": 0.2307, "step": 883000 }, { - "epoch": 9.0, - "learning_rate": 2.2077790255749953e-05, - "loss": 0.4107, + "epoch": 12.166928439558017, + "grad_norm": 1.2964115142822266, + "learning_rate": 4.807586483212047e-06, + "loss": 0.244, "step": 883100 }, { - "epoch": 9.0, - "learning_rate": 2.20713923270195e-05, - "loss": 0.3377, + "epoch": 12.168306191617756, + "grad_norm": 5.346635818481445, + "learning_rate": 4.80305975012221e-06, + "loss": 0.2537, "step": 883200 }, { - "epoch": 9.0, - "learning_rate": 2.2064994785950647e-05, - "loss": 0.2958, + "epoch": 12.169683943677496, + "grad_norm": 2.408895254135132, + "learning_rate": 4.798534945253569e-06, + "loss": 0.236, "step": 883300 }, { - "epoch": 9.0, - "learning_rate": 2.2058597632856195e-05, - "loss": 0.3808, + "epoch": 12.171061695737235, + "grad_norm": 0.8133533596992493, + "learning_rate": 4.7940120689905274e-06, + "loss": 0.2125, "step": 883400 }, { - "epoch": 9.0, - "learning_rate": 2.2052200868048945e-05, - "loss": 0.3802, + "epoch": 12.172439447796975, + "grad_norm": 1.6550309658050537, + "learning_rate": 4.789491121717313e-06, + "loss": 0.2146, "step": 883500 }, { - "epoch": 9.0, - "learning_rate": 2.204580449184162e-05, - "loss": 0.3254, + "epoch": 12.173817199856714, + "grad_norm": 1.2474266290664673, + "learning_rate": 4.784972103818018e-06, + "loss": 0.2861, "step": 883600 }, { - "epoch": 9.0, - "learning_rate": 2.2039408504546992e-05, - "loss": 0.3308, + "epoch": 12.175194951916453, + "grad_norm": 1.8229119777679443, + "learning_rate": 4.780500177004414e-06, + "loss": 0.2229, "step": 883700 }, { - "epoch": 9.0, - "learning_rate": 2.203301290647779e-05, - "loss": 0.3309, + "epoch": 12.176572703976193, + "grad_norm": 2.0488181114196777, + "learning_rate": 4.775984999701215e-06, + "loss": 0.2405, "step": 883800 }, { - "epoch": 9.01, - "learning_rate": 2.2026617697946712e-05, - "loss": 0.3307, + "epoch": 12.177950456035932, + "grad_norm": 2.033342123031616, + "learning_rate": 4.771471752919341e-06, + "loss": 0.2396, "step": 883900 }, { - "epoch": 9.01, - "learning_rate": 2.2020222879266437e-05, - "loss": 0.2735, + "epoch": 12.17932820809567, + "grad_norm": 1.845656156539917, + "learning_rate": 4.766960437042222e-06, + "loss": 0.2318, "step": 884000 }, { - "epoch": 9.01, - "learning_rate": 2.2013828450749656e-05, - "loss": 0.389, + "epoch": 12.18070596015541, + "grad_norm": 8.718055725097656, + "learning_rate": 4.762451052453121e-06, + "loss": 0.2059, "step": 884100 }, { - "epoch": 9.01, - "learning_rate": 2.2007434412708992e-05, - "loss": 0.2833, + "epoch": 12.18208371221515, + "grad_norm": 2.8062326908111572, + "learning_rate": 4.757943599535119e-06, + "loss": 0.233, "step": 884200 }, { - "epoch": 9.01, - "learning_rate": 2.2001040765457092e-05, - "loss": 0.4388, + "epoch": 12.18346146427489, + "grad_norm": 1.0413531064987183, + "learning_rate": 4.753438078671156e-06, + "loss": 0.1812, "step": 884300 }, { - "epoch": 9.01, - "learning_rate": 2.199464750930657e-05, - "loss": 0.3243, + "epoch": 12.184839216334629, + "grad_norm": 0.8240475654602051, + "learning_rate": 4.748934490244007e-06, + "loss": 0.2503, "step": 884400 }, { - "epoch": 9.01, - "learning_rate": 2.198825464457e-05, - "loss": 0.3244, + "epoch": 12.186216968394367, + "grad_norm": 3.3376643657684326, + "learning_rate": 4.744432834636266e-06, + "loss": 0.2409, "step": 884500 }, { - "epoch": 9.01, - "learning_rate": 2.198186217155997e-05, - "loss": 0.3891, + "epoch": 12.187594720454108, + "grad_norm": 1.5429868698120117, + "learning_rate": 4.739933112230367e-06, + "loss": 0.2177, "step": 884600 }, { - "epoch": 9.01, - "learning_rate": 2.197547009058904e-05, - "loss": 0.3369, + "epoch": 12.188972472513846, + "grad_norm": 1.6091680526733398, + "learning_rate": 4.735435323408594e-06, + "loss": 0.1963, "step": 884700 }, { - "epoch": 9.01, - "learning_rate": 2.1969078401969723e-05, - "loss": 0.4376, + "epoch": 12.190350224573585, + "grad_norm": 1.2839714288711548, + "learning_rate": 4.730939468553048e-06, + "loss": 0.2544, "step": 884800 }, { - "epoch": 9.02, - "learning_rate": 2.1962687106014546e-05, - "loss": 0.4019, + "epoch": 12.191727976633326, + "grad_norm": 1.3931279182434082, + "learning_rate": 4.726445548045685e-06, + "loss": 0.2169, "step": 884900 }, { - "epoch": 9.02, - "learning_rate": 2.1956296203036014e-05, - "loss": 0.3237, + "epoch": 12.193105728693064, + "grad_norm": 1.524802803993225, + "learning_rate": 4.721953562268275e-06, + "loss": 0.245, "step": 885000 }, { - "epoch": 9.02, - "learning_rate": 2.1949905693346586e-05, - "loss": 0.3623, + "epoch": 12.194483480752803, + "grad_norm": 11.86330509185791, + "learning_rate": 4.717463511602451e-06, + "loss": 0.2022, "step": 885100 }, { - "epoch": 9.02, - "learning_rate": 2.194351557725873e-05, - "loss": 0.3407, + "epoch": 12.195861232812543, + "grad_norm": 2.1775686740875244, + "learning_rate": 4.712975396429649e-06, + "loss": 0.2351, "step": 885200 }, { - "epoch": 9.02, - "learning_rate": 2.1937189750355728e-05, - "loss": 0.3451, + "epoch": 12.197238984872282, + "grad_norm": 2.8879382610321045, + "learning_rate": 4.708489217131163e-06, + "loss": 0.2248, "step": 885300 }, { - "epoch": 9.02, - "learning_rate": 2.1930800418464498e-05, - "loss": 0.3438, + "epoch": 12.198616736932022, + "grad_norm": 2.509336471557617, + "learning_rate": 4.704004974088124e-06, + "loss": 0.2066, "step": 885400 }, { - "epoch": 9.02, - "learning_rate": 2.192441148110899e-05, - "loss": 0.3366, + "epoch": 12.199994488991761, + "grad_norm": 3.2225842475891113, + "learning_rate": 4.69952266768149e-06, + "loss": 0.2198, "step": 885500 }, { - "epoch": 9.02, - "learning_rate": 2.1918022938601545e-05, - "loss": 0.3374, + "epoch": 12.2013722410515, + "grad_norm": 2.8396997451782227, + "learning_rate": 4.6950422982920416e-06, + "loss": 0.2396, "step": 885600 }, { - "epoch": 9.02, - "learning_rate": 2.1911634791254556e-05, - "loss": 0.3125, + "epoch": 12.20274999311124, + "grad_norm": 4.007664203643799, + "learning_rate": 4.690563866300429e-06, + "loss": 0.2499, "step": 885700 }, { - "epoch": 9.02, - "learning_rate": 2.1905247039380358e-05, - "loss": 0.3958, + "epoch": 12.204127745170979, + "grad_norm": 2.9423089027404785, + "learning_rate": 4.686087372087101e-06, + "loss": 0.2396, "step": 885800 }, { - "epoch": 9.03, - "learning_rate": 2.1898859683291283e-05, - "loss": 0.3175, + "epoch": 12.205505497230718, + "grad_norm": 3.4315288066864014, + "learning_rate": 4.681612816032364e-06, + "loss": 0.2249, "step": 885900 }, { - "epoch": 9.03, - "learning_rate": 2.1892472723299612e-05, - "loss": 0.3339, + "epoch": 12.206883249290458, + "grad_norm": 3.3260977268218994, + "learning_rate": 4.677140198516363e-06, + "loss": 0.2616, "step": 886000 }, { - "epoch": 9.03, - "learning_rate": 2.1886086159717643e-05, - "loss": 0.2993, + "epoch": 12.208261001350197, + "grad_norm": 0.037134140729904175, + "learning_rate": 4.672669519919064e-06, + "loss": 0.2274, "step": 886100 }, { - "epoch": 9.03, - "learning_rate": 2.1879699992857645e-05, - "loss": 0.3254, + "epoch": 12.209638753409937, + "grad_norm": 1.937679648399353, + "learning_rate": 4.6682007806202665e-06, + "loss": 0.2588, "step": 886200 }, { - "epoch": 9.03, - "learning_rate": 2.187331422303185e-05, - "loss": 0.3495, + "epoch": 12.211016505469676, + "grad_norm": 3.2251245975494385, + "learning_rate": 4.663733980999618e-06, + "loss": 0.2281, "step": 886300 }, { - "epoch": 9.03, - "learning_rate": 2.1866928850552497e-05, - "loss": 0.2787, + "epoch": 12.212394257529414, + "grad_norm": 0.41416215896606445, + "learning_rate": 4.659269121436607e-06, + "loss": 0.2491, "step": 886400 }, { - "epoch": 9.03, - "learning_rate": 2.1860543875731795e-05, - "loss": 0.2362, + "epoch": 12.213772009589155, + "grad_norm": 2.800989866256714, + "learning_rate": 4.654806202310528e-06, + "loss": 0.1746, "step": 886500 }, { - "epoch": 9.03, - "learning_rate": 2.1854159298881917e-05, - "loss": 0.284, + "epoch": 12.215149761648894, + "grad_norm": 2.3401637077331543, + "learning_rate": 4.650345224000545e-06, + "loss": 0.2385, "step": 886600 }, { - "epoch": 9.03, - "learning_rate": 2.1847775120315033e-05, - "loss": 0.2854, + "epoch": 12.216527513708632, + "grad_norm": 1.9223545789718628, + "learning_rate": 4.645886186885631e-06, + "loss": 0.2076, "step": 886700 }, { - "epoch": 9.03, - "learning_rate": 2.18413913403433e-05, - "loss": 0.3182, + "epoch": 12.217905265768373, + "grad_norm": 1.6307142972946167, + "learning_rate": 4.641429091344601e-06, + "loss": 0.2321, "step": 886800 }, { - "epoch": 9.04, - "learning_rate": 2.183500795927884e-05, - "loss": 0.3598, + "epoch": 12.219283017828111, + "grad_norm": 2.2302122116088867, + "learning_rate": 4.636973937756115e-06, + "loss": 0.2296, "step": 886900 }, { - "epoch": 9.04, - "learning_rate": 2.1828624977433763e-05, - "loss": 0.3338, + "epoch": 12.220660769887852, + "grad_norm": 0.6077303290367126, + "learning_rate": 4.632520726498663e-06, + "loss": 0.1791, "step": 887000 }, { - "epoch": 9.04, - "learning_rate": 2.1822242395120176e-05, - "loss": 0.3546, + "epoch": 12.22203852194759, + "grad_norm": 3.4959311485290527, + "learning_rate": 4.628069457950566e-06, + "loss": 0.2605, "step": 887100 }, { - "epoch": 9.04, - "learning_rate": 2.1815860212650118e-05, - "loss": 0.3045, + "epoch": 12.223416274007329, + "grad_norm": 2.5908241271972656, + "learning_rate": 4.6236201324899745e-06, + "loss": 0.2163, "step": 887200 }, { - "epoch": 9.04, - "learning_rate": 2.1809478430335653e-05, - "loss": 0.3488, + "epoch": 12.22479402606707, + "grad_norm": 2.3392996788024902, + "learning_rate": 4.619172750494888e-06, + "loss": 0.2211, "step": 887300 }, { - "epoch": 9.04, - "learning_rate": 2.180309704848882e-05, - "loss": 0.2899, + "epoch": 12.226171778126808, + "grad_norm": 1.9364537000656128, + "learning_rate": 4.614771757101388e-06, + "loss": 0.2107, "step": 887400 }, { - "epoch": 9.04, - "learning_rate": 2.1796716067421625e-05, - "loss": 0.3058, + "epoch": 12.227549530186547, + "grad_norm": 1.2026662826538086, + "learning_rate": 4.610328243726554e-06, + "loss": 0.2417, "step": 887500 }, { - "epoch": 9.04, - "learning_rate": 2.1790335487446054e-05, - "loss": 0.3558, + "epoch": 12.228927282246287, + "grad_norm": 3.282721757888794, + "learning_rate": 4.605886674946436e-06, + "loss": 0.231, "step": 887600 }, { - "epoch": 9.04, - "learning_rate": 2.178395530887409e-05, - "loss": 0.2944, + "epoch": 12.230305034306026, + "grad_norm": 2.1140096187591553, + "learning_rate": 4.601447051138372e-06, + "loss": 0.2234, "step": 887700 }, { - "epoch": 9.05, - "learning_rate": 2.1777575532017692e-05, - "loss": 0.3304, + "epoch": 12.231682786365766, + "grad_norm": 0.3606036901473999, + "learning_rate": 4.597009372679533e-06, + "loss": 0.2164, "step": 887800 }, { - "epoch": 9.05, - "learning_rate": 2.1771196157188768e-05, - "loss": 0.345, + "epoch": 12.233060538425505, + "grad_norm": 2.3173654079437256, + "learning_rate": 4.592573639946912e-06, + "loss": 0.2653, "step": 887900 }, { - "epoch": 9.05, - "learning_rate": 2.1764817184699243e-05, - "loss": 0.3127, + "epoch": 12.234438290485244, + "grad_norm": 2.7412893772125244, + "learning_rate": 4.5881398533173525e-06, + "loss": 0.2733, "step": 888000 }, { - "epoch": 9.05, - "learning_rate": 2.175843861486102e-05, - "loss": 0.2735, + "epoch": 12.235816042544984, + "grad_norm": 1.1821815967559814, + "learning_rate": 4.583708013167536e-06, + "loss": 0.1725, "step": 888100 }, { - "epoch": 9.05, - "learning_rate": 2.175206044798596e-05, - "loss": 0.3153, + "epoch": 12.237193794604723, + "grad_norm": 3.445291757583618, + "learning_rate": 4.579278119873964e-06, + "loss": 0.3038, "step": 888200 }, { - "epoch": 9.05, - "learning_rate": 2.1745682684385923e-05, - "loss": 0.2883, + "epoch": 12.238571546664462, + "grad_norm": 1.0347822904586792, + "learning_rate": 4.5748501738129666e-06, + "loss": 0.2278, "step": 888300 }, { - "epoch": 9.05, - "learning_rate": 2.1739305324372755e-05, - "loss": 0.2833, + "epoch": 12.239949298724202, + "grad_norm": 1.1081361770629883, + "learning_rate": 4.570424175360736e-06, + "loss": 0.2082, "step": 888400 }, { - "epoch": 9.05, - "learning_rate": 2.1732928368258246e-05, - "loss": 0.2982, + "epoch": 12.24132705078394, + "grad_norm": 0.13742347061634064, + "learning_rate": 4.566000124893283e-06, + "loss": 0.199, "step": 888500 }, { - "epoch": 9.05, - "learning_rate": 2.172655181635421e-05, - "loss": 0.3608, + "epoch": 12.242704802843681, + "grad_norm": 3.644533395767212, + "learning_rate": 4.561578022786443e-06, + "loss": 0.1849, "step": 888600 }, { - "epoch": 9.05, - "learning_rate": 2.172017566897242e-05, - "loss": 0.3856, + "epoch": 12.24408255490342, + "grad_norm": 2.0544638633728027, + "learning_rate": 4.557157869415907e-06, + "loss": 0.2204, "step": 888700 }, { - "epoch": 9.06, - "learning_rate": 2.171379992642462e-05, - "loss": 0.3354, + "epoch": 12.245460306963158, + "grad_norm": 1.9735850095748901, + "learning_rate": 4.5527396651571854e-06, + "loss": 0.2315, "step": 888800 }, { - "epoch": 9.06, - "learning_rate": 2.1707424589022555e-05, - "loss": 0.351, + "epoch": 12.246838059022899, + "grad_norm": 1.8074002265930176, + "learning_rate": 4.548323410385621e-06, + "loss": 0.2105, "step": 888900 }, { - "epoch": 9.06, - "learning_rate": 2.170104965707795e-05, - "loss": 0.295, + "epoch": 12.248215811082638, + "grad_norm": 2.581815004348755, + "learning_rate": 4.543909105476401e-06, + "loss": 0.2099, "step": 889000 }, { - "epoch": 9.06, - "learning_rate": 2.1694675130902487e-05, - "loss": 0.3501, + "epoch": 12.249593563142376, + "grad_norm": 4.641282081604004, + "learning_rate": 4.539496750804555e-06, + "loss": 0.2868, "step": 889100 }, { - "epoch": 9.06, - "learning_rate": 2.168830101080784e-05, - "loss": 0.3283, + "epoch": 12.250971315202117, + "grad_norm": 3.4576730728149414, + "learning_rate": 4.5350863467449174e-06, + "loss": 0.2512, "step": 889200 }, { - "epoch": 9.06, - "learning_rate": 2.1681927297105673e-05, - "loss": 0.3277, + "epoch": 12.252349067261855, + "grad_norm": 0.7717374563217163, + "learning_rate": 4.5306778936721865e-06, + "loss": 0.2304, "step": 889300 }, { - "epoch": 9.06, - "learning_rate": 2.1675553990107625e-05, - "loss": 0.3587, + "epoch": 12.253726819321596, + "grad_norm": 2.422650098800659, + "learning_rate": 4.526271391960874e-06, + "loss": 0.2352, "step": 889400 }, { - "epoch": 9.06, - "learning_rate": 2.166918109012531e-05, - "loss": 0.3034, + "epoch": 12.255104571381334, + "grad_norm": 1.3010914325714111, + "learning_rate": 4.521866841985348e-06, + "loss": 0.256, "step": 889500 }, { - "epoch": 9.06, - "learning_rate": 2.166280859747032e-05, - "loss": 0.3152, + "epoch": 12.256482323441073, + "grad_norm": 2.878810167312622, + "learning_rate": 4.51746424411978e-06, + "loss": 0.2366, "step": 889600 }, { - "epoch": 9.06, - "learning_rate": 2.1656436512454245e-05, - "loss": 0.3185, + "epoch": 12.257860075500814, + "grad_norm": 2.018059253692627, + "learning_rate": 4.513063598738207e-06, + "loss": 0.2004, "step": 889700 }, { - "epoch": 9.07, - "learning_rate": 2.1650064835388626e-05, - "loss": 0.3244, + "epoch": 12.259237827560552, + "grad_norm": 3.264045238494873, + "learning_rate": 4.508664906214478e-06, + "loss": 0.2386, "step": 889800 }, { - "epoch": 9.07, - "learning_rate": 2.1643693566585006e-05, - "loss": 0.3147, + "epoch": 12.260615579620291, + "grad_norm": 2.5459461212158203, + "learning_rate": 4.504268166922295e-06, + "loss": 0.187, "step": 889900 }, { - "epoch": 9.07, - "learning_rate": 2.1637322706354907e-05, - "loss": 0.3203, + "epoch": 12.261993331680031, + "grad_norm": 5.724930763244629, + "learning_rate": 4.499873381235169e-06, + "loss": 0.2518, "step": 890000 }, { - "epoch": 9.07, - "learning_rate": 2.1630952255009815e-05, - "loss": 0.3263, + "epoch": 12.26337108373977, + "grad_norm": 2.125060558319092, + "learning_rate": 4.495480549526476e-06, + "loss": 0.2456, "step": 890100 }, { - "epoch": 9.07, - "learning_rate": 2.1624582212861213e-05, - "loss": 0.3388, + "epoch": 12.264748835799509, + "grad_norm": 3.596890926361084, + "learning_rate": 4.491133571267698e-06, + "loss": 0.2602, "step": 890200 }, { - "epoch": 9.07, - "learning_rate": 2.1618212580220573e-05, - "loss": 0.3537, + "epoch": 12.26612658785925, + "grad_norm": 5.890937805175781, + "learning_rate": 4.486744629086165e-06, + "loss": 0.2585, "step": 890300 }, { - "epoch": 9.07, - "learning_rate": 2.1611843357399307e-05, - "loss": 0.2945, + "epoch": 12.267504339918988, + "grad_norm": 1.3215614557266235, + "learning_rate": 4.482357641998419e-06, + "loss": 0.2216, "step": 890400 }, { - "epoch": 9.07, - "learning_rate": 2.160547454470884e-05, - "loss": 0.3448, + "epoch": 12.268882091978728, + "grad_norm": 3.3703508377075195, + "learning_rate": 4.477972610377151e-06, + "loss": 0.2246, "step": 890500 }, { - "epoch": 9.07, - "learning_rate": 2.1599106142460582e-05, - "loss": 0.3481, + "epoch": 12.270259844038467, + "grad_norm": 3.8755743503570557, + "learning_rate": 4.473589534594881e-06, + "loss": 0.2704, "step": 890600 }, { - "epoch": 9.07, - "learning_rate": 2.159273815096589e-05, - "loss": 0.4116, + "epoch": 12.271637596098206, + "grad_norm": 1.4406187534332275, + "learning_rate": 4.46920841502398e-06, + "loss": 0.2184, "step": 890700 }, { - "epoch": 9.08, - "learning_rate": 2.158637057053613e-05, - "loss": 0.3605, + "epoch": 12.273015348157946, + "grad_norm": 2.0199484825134277, + "learning_rate": 4.464829252036659e-06, + "loss": 0.2556, "step": 890800 }, { - "epoch": 9.08, - "learning_rate": 2.1580003401482655e-05, - "loss": 0.3257, + "epoch": 12.274393100217685, + "grad_norm": 6.4199538230896, + "learning_rate": 4.460452046004932e-06, + "loss": 0.2037, "step": 890900 }, { - "epoch": 9.08, - "learning_rate": 2.157363664411675e-05, - "loss": 0.2823, + "epoch": 12.275770852277423, + "grad_norm": 2.7342798709869385, + "learning_rate": 4.45607679730068e-06, + "loss": 0.2416, "step": 891000 }, { - "epoch": 9.08, - "learning_rate": 2.156727029874973e-05, - "loss": 0.3674, + "epoch": 12.277148604337164, + "grad_norm": 0.003172761993482709, + "learning_rate": 4.451747229513811e-06, + "loss": 0.2181, "step": 891100 }, { - "epoch": 9.08, - "learning_rate": 2.1560904365692868e-05, - "loss": 0.3899, + "epoch": 12.278526356396902, + "grad_norm": 3.498115062713623, + "learning_rate": 4.447375876996878e-06, + "loss": 0.2134, "step": 891200 }, { - "epoch": 9.08, - "learning_rate": 2.1554538845257426e-05, - "loss": 0.3276, + "epoch": 12.279904108456643, + "grad_norm": 0.88975590467453, + "learning_rate": 4.443006482918307e-06, + "loss": 0.2293, "step": 891300 }, { - "epoch": 9.08, - "learning_rate": 2.154817373775463e-05, - "loss": 0.3276, + "epoch": 12.281281860516382, + "grad_norm": 2.969186544418335, + "learning_rate": 4.4386390476492855e-06, + "loss": 0.2407, "step": 891400 }, { - "epoch": 9.08, - "learning_rate": 2.1541809043495696e-05, - "loss": 0.3679, + "epoch": 12.28265961257612, + "grad_norm": 1.2038480043411255, + "learning_rate": 4.434273571560864e-06, + "loss": 0.2118, "step": 891500 }, { - "epoch": 9.08, - "learning_rate": 2.153544476279184e-05, - "loss": 0.4239, + "epoch": 12.28403736463586, + "grad_norm": 1.5316039323806763, + "learning_rate": 4.4299100550239095e-06, + "loss": 0.2167, "step": 891600 }, { - "epoch": 9.08, - "learning_rate": 2.1529080895954208e-05, - "loss": 0.3851, + "epoch": 12.2854151166956, + "grad_norm": 5.393610954284668, + "learning_rate": 4.42554849840911e-06, + "loss": 0.2733, "step": 891700 }, { - "epoch": 9.09, - "learning_rate": 2.1522717443293967e-05, - "loss": 0.3356, + "epoch": 12.286792868755338, + "grad_norm": 2.724255323410034, + "learning_rate": 4.421188902087019e-06, + "loss": 0.2297, "step": 891800 }, { - "epoch": 9.09, - "learning_rate": 2.1516418033451237e-05, - "loss": 0.3124, + "epoch": 12.288170620815078, + "grad_norm": 1.8789863586425781, + "learning_rate": 4.416831266428005e-06, + "loss": 0.229, "step": 891900 }, { - "epoch": 9.09, - "learning_rate": 2.1510055405929633e-05, - "loss": 0.3814, + "epoch": 12.289548372874817, + "grad_norm": 1.1820642948150635, + "learning_rate": 4.412475591802261e-06, + "loss": 0.2215, "step": 892000 }, { - "epoch": 9.09, - "learning_rate": 2.1503693193515657e-05, - "loss": 0.2813, + "epoch": 12.290926124934558, + "grad_norm": 2.520380973815918, + "learning_rate": 4.4081218785798366e-06, + "loss": 0.1738, "step": 892100 }, { - "epoch": 9.09, - "learning_rate": 2.1497331396520378e-05, - "loss": 0.3414, + "epoch": 12.292303876994296, + "grad_norm": 0.02531392127275467, + "learning_rate": 4.403770127130591e-06, + "loss": 0.2548, "step": 892200 }, { - "epoch": 9.09, - "learning_rate": 2.1490970015254846e-05, - "loss": 0.3077, + "epoch": 12.293681629054035, + "grad_norm": 2.480377674102783, + "learning_rate": 4.399420337824237e-06, + "loss": 0.2467, "step": 892300 }, { - "epoch": 9.09, - "learning_rate": 2.148460905003012e-05, - "loss": 0.3334, + "epoch": 12.295059381113775, + "grad_norm": 2.417511224746704, + "learning_rate": 4.395072511030302e-06, + "loss": 0.1968, "step": 892400 }, { - "epoch": 9.09, - "learning_rate": 2.147824850115718e-05, - "loss": 0.3069, + "epoch": 12.296437133173514, + "grad_norm": 2.6826090812683105, + "learning_rate": 4.390726647118168e-06, + "loss": 0.2393, "step": 892500 }, { - "epoch": 9.09, - "learning_rate": 2.1471888368947027e-05, - "loss": 0.3017, + "epoch": 12.297814885233253, + "grad_norm": 11.318028450012207, + "learning_rate": 4.386382746457031e-06, + "loss": 0.2612, "step": 892600 }, { - "epoch": 9.09, - "learning_rate": 2.1465528653710653e-05, - "loss": 0.3992, + "epoch": 12.299192637292993, + "grad_norm": 3.5370519161224365, + "learning_rate": 4.382040809415922e-06, + "loss": 0.2466, "step": 892700 }, { - "epoch": 9.1, - "learning_rate": 2.1459169355758986e-05, - "loss": 0.2886, + "epoch": 12.300570389352732, + "grad_norm": 1.5823495388031006, + "learning_rate": 4.377700836363717e-06, + "loss": 0.2041, "step": 892800 }, { - "epoch": 9.1, - "learning_rate": 2.1452810475402974e-05, - "loss": 0.3876, + "epoch": 12.30194814141247, + "grad_norm": 1.4828613996505737, + "learning_rate": 4.373362827669124e-06, + "loss": 0.2203, "step": 892900 }, { - "epoch": 9.1, - "learning_rate": 2.144645201295353e-05, - "loss": 0.3238, + "epoch": 12.303325893472211, + "grad_norm": 2.4236152172088623, + "learning_rate": 4.3690267837006685e-06, + "loss": 0.2482, "step": 893000 }, { - "epoch": 9.1, - "learning_rate": 2.1440093968721528e-05, - "loss": 0.3607, + "epoch": 12.30470364553195, + "grad_norm": 1.5310337543487549, + "learning_rate": 4.364692704826726e-06, + "loss": 0.2862, "step": 893100 }, { - "epoch": 9.1, - "learning_rate": 2.1433736343017843e-05, - "loss": 0.3252, + "epoch": 12.30608139759169, + "grad_norm": 3.797976016998291, + "learning_rate": 4.3603605914155015e-06, + "loss": 0.2368, "step": 893200 }, { - "epoch": 9.1, - "learning_rate": 2.1427379136153343e-05, - "loss": 0.3228, + "epoch": 12.307459149651429, + "grad_norm": 2.7901999950408936, + "learning_rate": 4.356030443835029e-06, + "loss": 0.216, "step": 893300 }, { - "epoch": 9.1, - "learning_rate": 2.1421022348438835e-05, - "loss": 0.3175, + "epoch": 12.308836901711167, + "grad_norm": 0.10551207512617111, + "learning_rate": 4.351702262453166e-06, + "loss": 0.2539, "step": 893400 }, { - "epoch": 9.1, - "learning_rate": 2.141466598018514e-05, - "loss": 0.357, + "epoch": 12.310214653770908, + "grad_norm": 4.287848472595215, + "learning_rate": 4.347376047637629e-06, + "loss": 0.272, "step": 893500 }, { - "epoch": 9.1, - "learning_rate": 2.140831003170304e-05, - "loss": 0.3736, + "epoch": 12.311592405830647, + "grad_norm": 8.090595245361328, + "learning_rate": 4.343051799755937e-06, + "loss": 0.2153, "step": 893600 }, { - "epoch": 9.11, - "learning_rate": 2.1401954503303325e-05, - "loss": 0.3691, + "epoch": 12.312970157890387, + "grad_norm": 3.8656468391418457, + "learning_rate": 4.3387295191754736e-06, + "loss": 0.2262, "step": 893700 }, { - "epoch": 9.11, - "learning_rate": 2.1395599395296707e-05, - "loss": 0.3348, + "epoch": 12.314347909950126, + "grad_norm": 1.2874608039855957, + "learning_rate": 4.3344523996513814e-06, + "loss": 0.1788, "step": 893800 }, { - "epoch": 9.11, - "learning_rate": 2.138924470799393e-05, - "loss": 0.3403, + "epoch": 12.315725662009864, + "grad_norm": 1.7229597568511963, + "learning_rate": 4.33013403509261e-06, + "loss": 0.2682, "step": 893900 }, { - "epoch": 9.11, - "learning_rate": 2.138289044170571e-05, - "loss": 0.3537, + "epoch": 12.317103414069605, + "grad_norm": 0.9462223052978516, + "learning_rate": 4.325817638932496e-06, + "loss": 0.2544, "step": 894000 }, { - "epoch": 9.11, - "learning_rate": 2.1376536596742717e-05, - "loss": 0.3755, + "epoch": 12.318481166129343, + "grad_norm": 2.1751644611358643, + "learning_rate": 4.321503211537727e-06, + "loss": 0.1984, "step": 894100 }, { - "epoch": 9.11, - "learning_rate": 2.137018317341562e-05, - "loss": 0.3867, + "epoch": 12.319858918189082, + "grad_norm": 2.7860021591186523, + "learning_rate": 4.317190753274837e-06, + "loss": 0.265, "step": 894200 }, { - "epoch": 9.11, - "learning_rate": 2.1363830172035077e-05, - "loss": 0.294, + "epoch": 12.321236670248823, + "grad_norm": 1.043503761291504, + "learning_rate": 4.312880264510211e-06, + "loss": 0.235, "step": 894300 }, { - "epoch": 9.11, - "learning_rate": 2.1357477592911693e-05, - "loss": 0.3694, + "epoch": 12.322614422308561, + "grad_norm": 2.568237781524658, + "learning_rate": 4.308571745610018e-06, + "loss": 0.2363, "step": 894400 }, { - "epoch": 9.11, - "learning_rate": 2.1351125436356075e-05, - "loss": 0.3327, + "epoch": 12.3239921743683, + "grad_norm": 1.396524429321289, + "learning_rate": 4.3042651969403006e-06, + "loss": 0.2025, "step": 894500 }, { - "epoch": 9.11, - "learning_rate": 2.1344773702678816e-05, - "loss": 0.3944, + "epoch": 12.32536992642804, + "grad_norm": 1.2532455921173096, + "learning_rate": 4.2999606188669285e-06, + "loss": 0.2292, "step": 894600 }, { - "epoch": 9.12, - "learning_rate": 2.1338422392190467e-05, - "loss": 0.389, + "epoch": 12.326747678487779, + "grad_norm": 3.6814794540405273, + "learning_rate": 4.295658011755589e-06, + "loss": 0.223, "step": 894700 }, { - "epoch": 9.12, - "learning_rate": 2.133207150520157e-05, - "loss": 0.3491, + "epoch": 12.32812543054752, + "grad_norm": 3.2660319805145264, + "learning_rate": 4.291357375971819e-06, + "loss": 0.2176, "step": 894800 }, { - "epoch": 9.12, - "learning_rate": 2.1325721042022665e-05, - "loss": 0.366, + "epoch": 12.329503182607258, + "grad_norm": 2.4927618503570557, + "learning_rate": 4.287058711880967e-06, + "loss": 0.1855, "step": 894900 }, { - "epoch": 9.12, - "learning_rate": 2.131937100296422e-05, - "loss": 0.3215, + "epoch": 12.330880934666997, + "grad_norm": 0.12143804877996445, + "learning_rate": 4.282762019848241e-06, + "loss": 0.1919, "step": 895000 }, { - "epoch": 9.12, - "learning_rate": 2.1313021388336734e-05, - "loss": 0.3427, + "epoch": 12.332258686726737, + "grad_norm": 3.500173568725586, + "learning_rate": 4.278467300238656e-06, + "loss": 0.2131, "step": 895100 }, { - "epoch": 9.12, - "learning_rate": 2.1306672198450655e-05, - "loss": 0.3093, + "epoch": 12.333636438786476, + "grad_norm": 0.8549736738204956, + "learning_rate": 4.274174553417072e-06, + "loss": 0.2439, "step": 895200 }, { - "epoch": 9.12, - "learning_rate": 2.1300323433616436e-05, - "loss": 0.2929, + "epoch": 12.335014190846215, + "grad_norm": 0.2323756068944931, + "learning_rate": 4.269883779748191e-06, + "loss": 0.2316, "step": 895300 }, { - "epoch": 9.12, - "learning_rate": 2.1293975094144474e-05, - "loss": 0.3055, + "epoch": 12.336391942905955, + "grad_norm": 2.947979688644409, + "learning_rate": 4.265594979596523e-06, + "loss": 0.203, "step": 895400 }, { - "epoch": 9.12, - "learning_rate": 2.1287627180345183e-05, - "loss": 0.3795, + "epoch": 12.337769694965694, + "grad_norm": 3.2621753215789795, + "learning_rate": 4.261308153326424e-06, + "loss": 0.2663, "step": 895500 }, { - "epoch": 9.12, - "learning_rate": 2.128127969252894e-05, - "loss": 0.3204, + "epoch": 12.339147447025434, + "grad_norm": 2.5823898315429688, + "learning_rate": 4.257023301302094e-06, + "loss": 0.2162, "step": 895600 }, { - "epoch": 9.13, - "learning_rate": 2.1274932631006074e-05, - "loss": 0.3132, + "epoch": 12.340525199085173, + "grad_norm": 4.950196266174316, + "learning_rate": 4.252740423887534e-06, + "loss": 0.1999, "step": 895700 }, { - "epoch": 9.13, - "learning_rate": 2.1268649460323425e-05, - "loss": 0.3137, + "epoch": 12.341902951144911, + "grad_norm": 1.231783151626587, + "learning_rate": 4.248459521446612e-06, + "loss": 0.1752, "step": 895800 }, { - "epoch": 9.13, - "learning_rate": 2.126230324804765e-05, - "loss": 0.342, + "epoch": 12.343280703204652, + "grad_norm": 0.034924864768981934, + "learning_rate": 4.24418059434301e-06, + "loss": 0.2263, "step": 895900 }, { - "epoch": 9.13, - "learning_rate": 2.1256020918727882e-05, - "loss": 0.346, + "epoch": 12.34465845526439, + "grad_norm": 2.1649532318115234, + "learning_rate": 4.239903642940245e-06, + "loss": 0.2122, "step": 896000 }, { - "epoch": 9.13, - "learning_rate": 2.1249675556927977e-05, - "loss": 0.2847, + "epoch": 12.34603620732413, + "grad_norm": 1.5345295667648315, + "learning_rate": 4.235628667601659e-06, + "loss": 0.1807, "step": 896100 }, { - "epoch": 9.13, - "learning_rate": 2.1243330622966738e-05, - "loss": 0.2751, + "epoch": 12.34741395938387, + "grad_norm": 0.6377744078636169, + "learning_rate": 4.231355668690435e-06, + "loss": 0.1801, "step": 896200 }, { - "epoch": 9.13, - "learning_rate": 2.1236986117154352e-05, - "loss": 0.3403, + "epoch": 12.348791711443608, + "grad_norm": 2.3109936714172363, + "learning_rate": 4.227084646569593e-06, + "loss": 0.2436, "step": 896300 }, { - "epoch": 9.13, - "learning_rate": 2.123064203980106e-05, - "loss": 0.3621, + "epoch": 12.350169463503349, + "grad_norm": 2.32199764251709, + "learning_rate": 4.22281560160197e-06, + "loss": 0.242, "step": 896400 }, { - "epoch": 9.13, - "learning_rate": 2.1224298391217034e-05, - "loss": 0.3389, + "epoch": 12.351547215563087, + "grad_norm": 2.498436212539673, + "learning_rate": 4.21854853415025e-06, + "loss": 0.265, "step": 896500 }, { - "epoch": 9.13, - "learning_rate": 2.1217955171712453e-05, - "loss": 0.3517, + "epoch": 12.352924967622826, + "grad_norm": 2.88193941116333, + "learning_rate": 4.214283444576942e-06, + "loss": 0.2731, "step": 896600 }, { - "epoch": 9.14, - "learning_rate": 2.1211612381597448e-05, - "loss": 0.3359, + "epoch": 12.354302719682567, + "grad_norm": 0.9335037469863892, + "learning_rate": 4.210020333244378e-06, + "loss": 0.2004, "step": 896700 }, { - "epoch": 9.14, - "learning_rate": 2.1205270021182152e-05, - "loss": 0.3247, + "epoch": 12.355680471742305, + "grad_norm": 1.1421068906784058, + "learning_rate": 4.2057592005147375e-06, + "loss": 0.2313, "step": 896800 }, { - "epoch": 9.14, - "learning_rate": 2.119892809077668e-05, - "loss": 0.3571, + "epoch": 12.357058223802044, + "grad_norm": 2.845714569091797, + "learning_rate": 4.20150004675003e-06, + "loss": 0.2327, "step": 896900 }, { - "epoch": 9.14, - "learning_rate": 2.1192586590691097e-05, - "loss": 0.3163, + "epoch": 12.358435975861784, + "grad_norm": 2.658376932144165, + "learning_rate": 4.197242872312088e-06, + "loss": 0.2203, "step": 897000 }, { - "epoch": 9.14, - "learning_rate": 2.118624552123546e-05, - "loss": 0.2878, + "epoch": 12.359813727921523, + "grad_norm": 9.237821578979492, + "learning_rate": 4.192987677562576e-06, + "loss": 0.2354, "step": 897100 }, { - "epoch": 9.14, - "learning_rate": 2.117990488271983e-05, - "loss": 0.3187, + "epoch": 12.361191479981262, + "grad_norm": 1.3623300790786743, + "learning_rate": 4.188734462863e-06, + "loss": 0.2281, "step": 897200 }, { - "epoch": 9.14, - "learning_rate": 2.1173564675454208e-05, - "loss": 0.3617, + "epoch": 12.362569232041002, + "grad_norm": 2.9656741619110107, + "learning_rate": 4.184483228574694e-06, + "loss": 0.2365, "step": 897300 }, { - "epoch": 9.14, - "learning_rate": 2.11672248997486e-05, - "loss": 0.3062, + "epoch": 12.36394698410074, + "grad_norm": 1.0035396814346313, + "learning_rate": 4.180233975058813e-06, + "loss": 0.2448, "step": 897400 }, { - "epoch": 9.14, - "learning_rate": 2.1160885555912997e-05, - "loss": 0.3277, + "epoch": 12.365324736160481, + "grad_norm": 3.20845365524292, + "learning_rate": 4.175986702676366e-06, + "loss": 0.2278, "step": 897500 }, { - "epoch": 9.14, - "learning_rate": 2.115454664425733e-05, - "loss": 0.3038, + "epoch": 12.36670248822022, + "grad_norm": 0.6244051456451416, + "learning_rate": 4.1717838548874674e-06, + "loss": 0.2104, "step": 897600 }, { - "epoch": 9.15, - "learning_rate": 2.114820816509154e-05, - "loss": 0.2859, + "epoch": 12.368080240279959, + "grad_norm": 2.60233211517334, + "learning_rate": 4.167540526033856e-06, + "loss": 0.2557, "step": 897700 }, { - "epoch": 9.15, - "learning_rate": 2.114187011872556e-05, - "loss": 0.3145, + "epoch": 12.369457992339699, + "grad_norm": 1.045715570449829, + "learning_rate": 4.163299179392036e-06, + "loss": 0.1759, "step": 897800 }, { - "epoch": 9.15, - "learning_rate": 2.1135532505469258e-05, - "loss": 0.3544, + "epoch": 12.370835744399438, + "grad_norm": 3.569092273712158, + "learning_rate": 4.159059815322343e-06, + "loss": 0.2475, "step": 897900 }, { - "epoch": 9.15, - "learning_rate": 2.1129195325632513e-05, - "loss": 0.3418, + "epoch": 12.372213496459178, + "grad_norm": 1.6751595735549927, + "learning_rate": 4.1548224341849355e-06, + "loss": 0.2556, "step": 898000 }, { - "epoch": 9.15, - "learning_rate": 2.1122858579525183e-05, - "loss": 0.4104, + "epoch": 12.373591248518917, + "grad_norm": 1.4152687788009644, + "learning_rate": 4.150587036339794e-06, + "loss": 0.233, "step": 898100 }, { - "epoch": 9.15, - "learning_rate": 2.11165222674571e-05, - "loss": 0.3477, + "epoch": 12.374969000578655, + "grad_norm": 0.4961412250995636, + "learning_rate": 4.1463536221467336e-06, + "loss": 0.2386, "step": 898200 }, { - "epoch": 9.15, - "learning_rate": 2.111018638973806e-05, - "loss": 0.289, + "epoch": 12.376346752638396, + "grad_norm": 6.542469501495361, + "learning_rate": 4.142122191965414e-06, + "loss": 0.229, "step": 898300 }, { - "epoch": 9.15, - "learning_rate": 2.1103850946677845e-05, - "loss": 0.3994, + "epoch": 12.377724504698135, + "grad_norm": 0.7540145516395569, + "learning_rate": 4.137892746155309e-06, + "loss": 0.2058, "step": 898400 }, { - "epoch": 9.15, - "learning_rate": 2.1097515938586242e-05, - "loss": 0.418, + "epoch": 12.379102256757873, + "grad_norm": 1.9386975765228271, + "learning_rate": 4.133665285075733e-06, + "loss": 0.2001, "step": 898500 }, { - "epoch": 9.16, - "learning_rate": 2.109124470934547e-05, - "loss": 0.3169, + "epoch": 12.380480008817614, + "grad_norm": 0.07088484615087509, + "learning_rate": 4.129439809085837e-06, + "loss": 0.2456, "step": 898600 }, { - "epoch": 9.16, - "learning_rate": 2.1084910567762853e-05, - "loss": 0.3689, + "epoch": 12.381857760877352, + "grad_norm": 1.2496007680892944, + "learning_rate": 4.125216318544596e-06, + "loss": 0.2389, "step": 898700 }, { - "epoch": 9.16, - "learning_rate": 2.1078576862074907e-05, - "loss": 0.3391, + "epoch": 12.383235512937091, + "grad_norm": 2.4799609184265137, + "learning_rate": 4.120994813810806e-06, + "loss": 0.2254, "step": 898800 }, { - "epoch": 9.16, - "learning_rate": 2.1072243592591313e-05, - "loss": 0.2758, + "epoch": 12.384613264996831, + "grad_norm": 1.1736985445022583, + "learning_rate": 4.1167752952431094e-06, + "loss": 0.2178, "step": 898900 }, { - "epoch": 9.16, - "learning_rate": 2.106591075962174e-05, - "loss": 0.2883, + "epoch": 12.38599101705657, + "grad_norm": 1.794600248336792, + "learning_rate": 4.112557763199989e-06, + "loss": 0.2218, "step": 899000 }, { - "epoch": 9.16, - "learning_rate": 2.1059578363475804e-05, - "loss": 0.3156, + "epoch": 12.38736876911631, + "grad_norm": 3.3393659591674805, + "learning_rate": 4.10834221803973e-06, + "loss": 0.2787, "step": 899100 }, { - "epoch": 9.16, - "learning_rate": 2.1053246404463134e-05, - "loss": 0.3305, + "epoch": 12.38874652117605, + "grad_norm": 1.3142123222351074, + "learning_rate": 4.104128660120468e-06, + "loss": 0.2515, "step": 899200 }, { - "epoch": 9.16, - "learning_rate": 2.1046914882893332e-05, - "loss": 0.3461, + "epoch": 12.390124273235788, + "grad_norm": 2.613729238510132, + "learning_rate": 4.099917089800181e-06, + "loss": 0.2493, "step": 899300 }, { - "epoch": 9.16, - "learning_rate": 2.104058379907595e-05, - "loss": 0.3643, + "epoch": 12.391502025295528, + "grad_norm": 1.1995364427566528, + "learning_rate": 4.095707507436643e-06, + "loss": 0.2546, "step": 899400 }, { - "epoch": 9.16, - "learning_rate": 2.1034253153320554e-05, - "loss": 0.3445, + "epoch": 12.392879777355267, + "grad_norm": 1.927012324333191, + "learning_rate": 4.091499913387485e-06, + "loss": 0.2654, "step": 899500 }, { - "epoch": 9.17, - "learning_rate": 2.1027922945936694e-05, - "loss": 0.3085, + "epoch": 12.394257529415006, + "grad_norm": 4.1164326667785645, + "learning_rate": 4.087294308010172e-06, + "loss": 0.2524, "step": 899600 }, { - "epoch": 9.17, - "learning_rate": 2.1021593177233838e-05, - "loss": 0.2875, + "epoch": 12.395635281474746, + "grad_norm": 2.5458950996398926, + "learning_rate": 4.08309069166198e-06, + "loss": 0.1971, "step": 899700 }, { - "epoch": 9.17, - "learning_rate": 2.1015263847521503e-05, - "loss": 0.2734, + "epoch": 12.397013033534485, + "grad_norm": 3.189155101776123, + "learning_rate": 4.078889064700041e-06, + "loss": 0.2665, "step": 899800 }, { - "epoch": 9.17, - "learning_rate": 2.1008934957109145e-05, - "loss": 0.3414, + "epoch": 12.398390785594225, + "grad_norm": 2.174677848815918, + "learning_rate": 4.074689427481289e-06, + "loss": 0.1797, "step": 899900 }, { - "epoch": 9.17, - "learning_rate": 2.100260650630621e-05, - "loss": 0.334, + "epoch": 12.399768537653964, + "grad_norm": 0.23012645542621613, + "learning_rate": 4.070491780362524e-06, + "loss": 0.2148, + "step": 900000 + }, + { + "epoch": 12.399768537653964, + "eval_accuracy": 0.8890996644481934, + "eval_cer": 0.05624914639098662, + "eval_loss": 0.321673184633255, + "eval_runtime": 8874.9383, + "eval_samples_per_second": 6.078, + "eval_steps_per_second": 0.38, + "eval_wer": 0.1209215257687301, "step": 900000 }, { - "epoch": 9.17, - "learning_rate": 2.0996278495422124e-05, - "loss": 0.3778, + "epoch": 12.401146289713703, + "grad_norm": 3.005718231201172, + "learning_rate": 4.066296123700337e-06, + "loss": 0.2209, "step": 900100 }, { - "epoch": 9.17, - "learning_rate": 2.0989950924766303e-05, - "loss": 0.3103, + "epoch": 12.402524041773443, + "grad_norm": 5.2009687423706055, + "learning_rate": 4.06210245785118e-06, + "loss": 0.2225, "step": 900200 }, { - "epoch": 9.17, - "learning_rate": 2.0983623794648096e-05, - "loss": 0.3198, + "epoch": 12.403901793833182, + "grad_norm": 1.7844475507736206, + "learning_rate": 4.057952690060678e-06, + "loss": 0.2415, "step": 900300 }, { - "epoch": 9.17, - "learning_rate": 2.0977297105376884e-05, - "loss": 0.2932, + "epoch": 12.40527954589292, + "grad_norm": 4.088395595550537, + "learning_rate": 4.053762986989224e-06, + "loss": 0.2257, "step": 900400 }, { - "epoch": 9.17, - "learning_rate": 2.0970970857261992e-05, - "loss": 0.3217, + "epoch": 12.40665729795266, + "grad_norm": 2.0080440044403076, + "learning_rate": 4.049575275795548e-06, + "loss": 0.1994, "step": 900500 }, { - "epoch": 9.18, - "learning_rate": 2.0964645050612752e-05, - "loss": 0.3642, + "epoch": 12.4080350500124, + "grad_norm": 1.5811798572540283, + "learning_rate": 4.045389556835425e-06, + "loss": 0.2094, "step": 900600 }, { - "epoch": 9.18, - "learning_rate": 2.095831968573844e-05, - "loss": 0.3612, + "epoch": 12.40941280207214, + "grad_norm": 1.148227572441101, + "learning_rate": 4.041205830464457e-06, + "loss": 0.259, "step": 900700 }, { - "epoch": 9.18, - "learning_rate": 2.095199476294834e-05, - "loss": 0.3448, + "epoch": 12.410790554131879, + "grad_norm": 2.7475385665893555, + "learning_rate": 4.037024097038062e-06, + "loss": 0.2099, "step": 900800 }, { - "epoch": 9.18, - "learning_rate": 2.0945670282551715e-05, - "loss": 0.3051, + "epoch": 12.412168306191617, + "grad_norm": 4.51393461227417, + "learning_rate": 4.0328443569115135e-06, + "loss": 0.229, "step": 900900 }, { - "epoch": 9.18, - "learning_rate": 2.0939346244857764e-05, - "loss": 0.3763, + "epoch": 12.413546058251358, + "grad_norm": 1.3130977153778076, + "learning_rate": 4.028666610439884e-06, + "loss": 0.2277, "step": 901000 }, { - "epoch": 9.18, - "learning_rate": 2.0933022650175712e-05, - "loss": 0.3157, + "epoch": 12.414923810311096, + "grad_norm": 1.9018644094467163, + "learning_rate": 4.024490857978115e-06, + "loss": 0.2268, "step": 901100 }, { - "epoch": 9.18, - "learning_rate": 2.092669949881475e-05, - "loss": 0.3653, + "epoch": 12.416301562370835, + "grad_norm": 1.2803382873535156, + "learning_rate": 4.020317099880938e-06, + "loss": 0.2331, "step": 901200 }, { - "epoch": 9.18, - "learning_rate": 2.0920376791084027e-05, - "loss": 0.3389, + "epoch": 12.417679314430575, + "grad_norm": 6.235776424407959, + "learning_rate": 4.016145336502953e-06, + "loss": 0.2287, "step": 901300 }, { - "epoch": 9.18, - "learning_rate": 2.0914054527292695e-05, - "loss": 0.2971, + "epoch": 12.419057066490314, + "grad_norm": 4.757741451263428, + "learning_rate": 4.011975568198557e-06, + "loss": 0.2029, "step": 901400 }, { - "epoch": 9.18, - "learning_rate": 2.090773270774989e-05, - "loss": 0.3478, + "epoch": 12.420434818550053, + "grad_norm": 2.5834996700286865, + "learning_rate": 4.007807795322006e-06, + "loss": 0.2198, "step": 901500 }, { - "epoch": 9.19, - "learning_rate": 2.0901411332764682e-05, - "loss": 0.2938, + "epoch": 12.421812570609793, + "grad_norm": 4.512655735015869, + "learning_rate": 4.003642018227363e-06, + "loss": 0.2549, "step": 901600 }, { - "epoch": 9.19, - "learning_rate": 2.0895090402646163e-05, - "loss": 0.2879, + "epoch": 12.423190322669532, + "grad_norm": 1.9069846868515015, + "learning_rate": 3.999478237268544e-06, + "loss": 0.253, "step": 901700 }, { - "epoch": 9.19, - "learning_rate": 2.0888769917703394e-05, - "loss": 0.3268, + "epoch": 12.424568074729272, + "grad_norm": 1.8341480493545532, + "learning_rate": 3.995316452799269e-06, + "loss": 0.2333, "step": 901800 }, { - "epoch": 9.19, - "learning_rate": 2.0882449878245396e-05, - "loss": 0.3666, + "epoch": 12.425945826789011, + "grad_norm": 2.6564137935638428, + "learning_rate": 3.99115666517311e-06, + "loss": 0.1904, "step": 901900 }, { - "epoch": 9.19, - "learning_rate": 2.087613028458119e-05, - "loss": 0.2863, + "epoch": 12.42732357884875, + "grad_norm": 1.4518638849258423, + "learning_rate": 3.986998874743479e-06, + "loss": 0.2384, "step": 902000 }, { - "epoch": 9.19, - "learning_rate": 2.086981113701977e-05, - "loss": 0.3884, + "epoch": 12.42870133090849, + "grad_norm": 0.8784197568893433, + "learning_rate": 3.98284308186357e-06, + "loss": 0.2003, "step": 902100 }, { - "epoch": 9.19, - "learning_rate": 2.0863492435870096e-05, - "loss": 0.3236, + "epoch": 12.430079082968229, + "grad_norm": 3.6955509185791016, + "learning_rate": 3.978689286886453e-06, + "loss": 0.2796, "step": 902200 }, { - "epoch": 9.19, - "learning_rate": 2.085717418144111e-05, - "loss": 0.3201, + "epoch": 12.43145683502797, + "grad_norm": 3.228609800338745, + "learning_rate": 3.974537490165021e-06, + "loss": 0.2464, "step": 902300 }, { - "epoch": 9.19, - "learning_rate": 2.0850856374041754e-05, - "loss": 0.3157, + "epoch": 12.432834587087708, + "grad_norm": 1.7909231185913086, + "learning_rate": 3.970429180138843e-06, + "loss": 0.2529, "step": 902400 }, { - "epoch": 9.19, - "learning_rate": 2.0844539013980923e-05, - "loss": 0.2792, + "epoch": 12.434212339147447, + "grad_norm": 1.9172004461288452, + "learning_rate": 3.9662813609953965e-06, + "loss": 0.2404, "step": 902500 }, { - "epoch": 9.2, - "learning_rate": 2.083822210156749e-05, - "loss": 0.3213, + "epoch": 12.435590091207187, + "grad_norm": 0.07181792706251144, + "learning_rate": 3.96213554116174e-06, + "loss": 0.2176, "step": 902600 }, { - "epoch": 9.2, - "learning_rate": 2.0831905637110322e-05, - "loss": 0.2996, + "epoch": 12.436967843266926, + "grad_norm": 4.632223129272461, + "learning_rate": 3.95799172099009e-06, + "loss": 0.222, "step": 902700 }, { - "epoch": 9.2, - "learning_rate": 2.0825589620918274e-05, - "loss": 0.3459, + "epoch": 12.438345595326664, + "grad_norm": 1.0618491172790527, + "learning_rate": 3.953849900832488e-06, + "loss": 0.2516, "step": 902800 }, { - "epoch": 9.2, - "learning_rate": 2.0819274053300125e-05, - "loss": 0.3258, + "epoch": 12.439723347386405, + "grad_norm": 2.681830644607544, + "learning_rate": 3.94971008104079e-06, + "loss": 0.2139, "step": 902900 }, { - "epoch": 9.2, - "learning_rate": 2.081295893456469e-05, - "loss": 0.3304, + "epoch": 12.441101099446144, + "grad_norm": 4.660548210144043, + "learning_rate": 3.945572261966706e-06, + "loss": 0.1928, "step": 903000 }, { - "epoch": 9.2, - "learning_rate": 2.080664426502074e-05, - "loss": 0.3157, + "epoch": 12.442478851505882, + "grad_norm": 1.443814992904663, + "learning_rate": 3.9414364439617714e-06, + "loss": 0.2347, "step": 903100 }, { - "epoch": 9.2, - "learning_rate": 2.0800330044977023e-05, - "loss": 0.3705, + "epoch": 12.443856603565623, + "grad_norm": 2.1228506565093994, + "learning_rate": 3.937302627377326e-06, + "loss": 0.2138, "step": 903200 }, { - "epoch": 9.2, - "learning_rate": 2.0794016274742263e-05, - "loss": 0.331, + "epoch": 12.445234355625361, + "grad_norm": 4.037171840667725, + "learning_rate": 3.933170812564566e-06, + "loss": 0.2817, "step": 903300 }, { - "epoch": 9.2, - "learning_rate": 2.078770295462518e-05, - "loss": 0.3201, + "epoch": 12.446612107685102, + "grad_norm": 0.15107671916484833, + "learning_rate": 3.929040999874521e-06, + "loss": 0.1826, "step": 903400 }, { - "epoch": 9.21, - "learning_rate": 2.078139008493443e-05, - "loss": 0.3381, + "epoch": 12.44798985974484, + "grad_norm": 2.934523105621338, + "learning_rate": 3.924913189658024e-06, + "loss": 0.2109, "step": 903500 }, { - "epoch": 9.21, - "learning_rate": 2.07750776659787e-05, - "loss": 0.2666, + "epoch": 12.449367611804579, + "grad_norm": 8.833013534545898, + "learning_rate": 3.920787382265762e-06, + "loss": 0.2436, "step": 903600 }, { - "epoch": 9.21, - "learning_rate": 2.076876569806662e-05, - "loss": 0.3046, + "epoch": 12.45074536386432, + "grad_norm": 0.7497681975364685, + "learning_rate": 3.9166635780482495e-06, + "loss": 0.1859, "step": 903700 }, { - "epoch": 9.21, - "learning_rate": 2.07624541815068e-05, - "loss": 0.3493, + "epoch": 12.452123115924058, + "grad_norm": 1.4005178213119507, + "learning_rate": 3.912541777355814e-06, + "loss": 0.2303, "step": 903800 }, { - "epoch": 9.21, - "learning_rate": 2.075614311660785e-05, - "loss": 0.3168, + "epoch": 12.453500867983797, + "grad_norm": 2.5171241760253906, + "learning_rate": 3.9084219805386226e-06, + "loss": 0.2566, "step": 903900 }, { - "epoch": 9.21, - "learning_rate": 2.0749832503678333e-05, - "loss": 0.3286, + "epoch": 12.454878620043537, + "grad_norm": 3.2477614879608154, + "learning_rate": 3.904304187946678e-06, + "loss": 0.2437, "step": 904000 }, { - "epoch": 9.21, - "learning_rate": 2.074352234302682e-05, - "loss": 0.3193, + "epoch": 12.456256372103276, + "grad_norm": 4.606788158416748, + "learning_rate": 3.900188399929813e-06, + "loss": 0.2321, "step": 904100 }, { - "epoch": 9.21, - "learning_rate": 2.0737212634961815e-05, - "loss": 0.3513, + "epoch": 12.457634124163016, + "grad_norm": 4.003744602203369, + "learning_rate": 3.896074616837682e-06, + "loss": 0.2302, "step": 904200 }, { - "epoch": 9.21, - "learning_rate": 2.0730903379791826e-05, - "loss": 0.335, + "epoch": 12.459011876222755, + "grad_norm": 2.9413046836853027, + "learning_rate": 3.8919628390197585e-06, + "loss": 0.2074, "step": 904300 }, { - "epoch": 9.21, - "learning_rate": 2.072459457782536e-05, - "loss": 0.2932, + "epoch": 12.460389628282494, + "grad_norm": 2.080765724182129, + "learning_rate": 3.887853066825378e-06, + "loss": 0.2469, "step": 904400 }, { - "epoch": 9.22, - "learning_rate": 2.0718286229370863e-05, - "loss": 0.3319, + "epoch": 12.461767380342234, + "grad_norm": 2.9030494689941406, + "learning_rate": 3.883745300603672e-06, + "loss": 0.2506, "step": 904500 }, { - "epoch": 9.22, - "learning_rate": 2.071197833473677e-05, - "loss": 0.2998, + "epoch": 12.463145132401973, + "grad_norm": 4.502427577972412, + "learning_rate": 3.879639540703623e-06, + "loss": 0.2383, "step": 904600 }, { - "epoch": 9.22, - "learning_rate": 2.070567089423153e-05, - "loss": 0.2895, + "epoch": 12.464522884461712, + "grad_norm": 0.03270488604903221, + "learning_rate": 3.875535787474043e-06, + "loss": 0.2046, "step": 904700 }, { - "epoch": 9.22, - "learning_rate": 2.0699363908163497e-05, - "loss": 0.3352, + "epoch": 12.465900636521452, + "grad_norm": 1.1360697746276855, + "learning_rate": 3.8714340412635586e-06, + "loss": 0.2155, "step": 904800 }, { - "epoch": 9.22, - "learning_rate": 2.0693057376841064e-05, - "loss": 0.3524, + "epoch": 12.46727838858119, + "grad_norm": 4.122684001922607, + "learning_rate": 3.867334302420629e-06, + "loss": 0.2237, "step": 904900 }, { - "epoch": 9.22, - "learning_rate": 2.06867513005726e-05, - "loss": 0.3333, + "epoch": 12.468656140640931, + "grad_norm": 3.7630555629730225, + "learning_rate": 3.863236571293553e-06, + "loss": 0.2542, "step": 905000 }, { - "epoch": 9.22, - "learning_rate": 2.0680445679666405e-05, - "loss": 0.3793, + "epoch": 12.47003389270067, + "grad_norm": 2.492401599884033, + "learning_rate": 3.859140848230464e-06, + "loss": 0.2326, "step": 905100 }, { - "epoch": 9.22, - "learning_rate": 2.0674140514430805e-05, - "loss": 0.3032, + "epoch": 12.471411644760408, + "grad_norm": 1.852944254875183, + "learning_rate": 3.8550471335792995e-06, + "loss": 0.2512, "step": 905200 }, { - "epoch": 9.22, - "learning_rate": 2.066783580517409e-05, - "loss": 0.3082, + "epoch": 12.472789396820149, + "grad_norm": 2.078153133392334, + "learning_rate": 3.850955427687853e-06, + "loss": 0.206, "step": 905300 }, { - "epoch": 9.22, - "learning_rate": 2.0661531552204502e-05, - "loss": 0.3306, + "epoch": 12.474167148879888, + "grad_norm": 1.2404680252075195, + "learning_rate": 3.846865730903731e-06, + "loss": 0.2035, "step": 905400 }, { - "epoch": 9.23, - "learning_rate": 2.065522775583029e-05, - "loss": 0.3505, + "epoch": 12.475544900939626, + "grad_norm": 1.0016200542449951, + "learning_rate": 3.842778043574367e-06, + "loss": 0.2042, "step": 905500 }, { - "epoch": 9.23, - "learning_rate": 2.064892441635968e-05, - "loss": 0.3569, + "epoch": 12.476922652999367, + "grad_norm": 1.049678087234497, + "learning_rate": 3.838692366047036e-06, + "loss": 0.2212, "step": 905600 }, { - "epoch": 9.23, - "learning_rate": 2.064262153410086e-05, - "loss": 0.344, + "epoch": 12.478300405059105, + "grad_norm": 3.5079498291015625, + "learning_rate": 3.834608698668847e-06, + "loss": 0.1877, "step": 905700 }, { - "epoch": 9.23, - "learning_rate": 2.0636319109362e-05, - "loss": 0.3577, + "epoch": 12.479678157118844, + "grad_norm": 3.895480155944824, + "learning_rate": 3.830527041786716e-06, + "loss": 0.2373, "step": 905800 }, { - "epoch": 9.23, - "learning_rate": 2.0630017142451254e-05, - "loss": 0.3228, + "epoch": 12.481055909178584, + "grad_norm": 2.989802122116089, + "learning_rate": 3.82644739574741e-06, + "loss": 0.2498, "step": 905900 }, { - "epoch": 9.23, - "learning_rate": 2.062371563367677e-05, - "loss": 0.3035, + "epoch": 12.482433661238323, + "grad_norm": 13.865406036376953, + "learning_rate": 3.822369760897504e-06, + "loss": 0.2434, "step": 906000 }, { - "epoch": 9.23, - "learning_rate": 2.0617414583346622e-05, - "loss": 0.3536, + "epoch": 12.483811413298064, + "grad_norm": 0.863244354724884, + "learning_rate": 3.818294137583426e-06, + "loss": 0.204, "step": 906100 }, { - "epoch": 9.23, - "learning_rate": 2.0611113991768908e-05, - "loss": 0.2919, + "epoch": 12.485189165357802, + "grad_norm": 3.711275100708008, + "learning_rate": 3.8142205261514113e-06, + "loss": 0.2441, "step": 906200 }, { - "epoch": 9.23, - "learning_rate": 2.06048138592517e-05, - "loss": 0.3205, + "epoch": 12.48656691741754, + "grad_norm": 0.5521007180213928, + "learning_rate": 3.8101489269475444e-06, + "loss": 0.2277, "step": 906300 }, { - "epoch": 9.23, - "learning_rate": 2.059851418610302e-05, - "loss": 0.3169, + "epoch": 12.487944669477281, + "grad_norm": 3.017240285873413, + "learning_rate": 3.806120026220639e-06, + "loss": 0.226, "step": 906400 }, { - "epoch": 9.24, - "learning_rate": 2.059221497263089e-05, - "loss": 0.3428, + "epoch": 12.48932242153702, + "grad_norm": 3.55344295501709, + "learning_rate": 3.802052432379686e-06, + "loss": 0.2306, "step": 906500 }, { - "epoch": 9.24, - "learning_rate": 2.058597920440025e-05, - "loss": 0.3203, + "epoch": 12.49070017359676, + "grad_norm": 5.871715068817139, + "learning_rate": 3.7979868518006098e-06, + "loss": 0.2471, "step": 906600 }, { - "epoch": 9.24, - "learning_rate": 2.0579743887299284e-05, - "loss": 0.3546, + "epoch": 12.492077925656499, + "grad_norm": 2.2820990085601807, + "learning_rate": 3.7939232848288086e-06, + "loss": 0.2522, "step": 906700 }, { - "epoch": 9.24, - "learning_rate": 2.0573446045489658e-05, - "loss": 0.3498, + "epoch": 12.493455677716238, + "grad_norm": 2.5023770332336426, + "learning_rate": 3.7898617318095103e-06, + "loss": 0.2637, "step": 906800 }, { - "epoch": 9.24, - "learning_rate": 2.0567148664582263e-05, - "loss": 0.3282, + "epoch": 12.494833429775978, + "grad_norm": 2.548797369003296, + "learning_rate": 3.785802193087753e-06, + "loss": 0.2174, "step": 906900 }, { - "epoch": 9.24, - "learning_rate": 2.0560851744885007e-05, - "loss": 0.3148, + "epoch": 12.496211181835717, + "grad_norm": 3.1944901943206787, + "learning_rate": 3.781744669008426e-06, + "loss": 0.1894, "step": 907000 }, { - "epoch": 9.24, - "learning_rate": 2.0554555286705763e-05, - "loss": 0.3111, + "epoch": 12.497588933895456, + "grad_norm": 1.6214133501052856, + "learning_rate": 3.777689159916232e-06, + "loss": 0.2956, "step": 907100 }, { - "epoch": 9.24, - "learning_rate": 2.0548259290352404e-05, - "loss": 0.3779, + "epoch": 12.498966685955196, + "grad_norm": 1.4016005992889404, + "learning_rate": 3.773635666155699e-06, + "loss": 0.2662, "step": 907200 }, { - "epoch": 9.24, - "learning_rate": 2.0541963756132776e-05, - "loss": 0.3257, + "epoch": 12.500344438014935, + "grad_norm": 4.037769317626953, + "learning_rate": 3.7695841880712e-06, + "loss": 0.2751, "step": 907300 }, { - "epoch": 9.24, - "learning_rate": 2.053566868435466e-05, - "loss": 0.3249, + "epoch": 12.501722190074673, + "grad_norm": 4.5331854820251465, + "learning_rate": 3.7655347260069344e-06, + "loss": 0.2395, "step": 907400 }, { - "epoch": 9.25, - "learning_rate": 2.0529374075325867e-05, - "loss": 0.3524, + "epoch": 12.503099942134414, + "grad_norm": 1.1057956218719482, + "learning_rate": 3.76148728030692e-06, + "loss": 0.2236, "step": 907500 }, { - "epoch": 9.25, - "learning_rate": 2.0523079929354168e-05, - "loss": 0.3021, + "epoch": 12.504477694194152, + "grad_norm": 0.8608847856521606, + "learning_rate": 3.757441851314999e-06, + "loss": 0.2605, "step": 907600 }, { - "epoch": 9.25, - "learning_rate": 2.0516786246747303e-05, - "loss": 0.2624, + "epoch": 12.505855446253893, + "grad_norm": 0.7966085076332092, + "learning_rate": 3.7533984393748573e-06, + "loss": 0.2164, "step": 907700 }, { - "epoch": 9.25, - "learning_rate": 2.0510493027812995e-05, - "loss": 0.2991, + "epoch": 12.507233198313632, + "grad_norm": 4.589109897613525, + "learning_rate": 3.749357044830013e-06, + "loss": 0.2551, "step": 907800 }, { - "epoch": 9.25, - "learning_rate": 2.0504200272858958e-05, - "loss": 0.2984, + "epoch": 12.50861095037337, + "grad_norm": 2.9132936000823975, + "learning_rate": 3.745317668023788e-06, + "loss": 0.2292, "step": 907900 }, { - "epoch": 9.25, - "learning_rate": 2.049790798219285e-05, - "loss": 0.3312, + "epoch": 12.50998870243311, + "grad_norm": 2.2123827934265137, + "learning_rate": 3.7412803092993564e-06, + "loss": 0.2141, "step": 908000 }, { - "epoch": 9.25, - "learning_rate": 2.0491616156122337e-05, - "loss": 0.3694, + "epoch": 12.51136645449285, + "grad_norm": 0.7828893661499023, + "learning_rate": 3.7372449689997225e-06, + "loss": 0.2494, "step": 908100 }, { - "epoch": 9.25, - "learning_rate": 2.0485324794955055e-05, - "loss": 0.2578, + "epoch": 12.512744206552588, + "grad_norm": 1.6209080219268799, + "learning_rate": 3.733211647467687e-06, + "loss": 0.1986, "step": 908200 }, { - "epoch": 9.25, - "learning_rate": 2.0479033898998603e-05, - "loss": 0.305, + "epoch": 12.514121958612328, + "grad_norm": 3.1257684230804443, + "learning_rate": 3.7291803450459125e-06, + "loss": 0.2128, "step": 908300 }, { - "epoch": 9.25, - "learning_rate": 2.0472743468560577e-05, - "loss": 0.4065, + "epoch": 12.515499710672067, + "grad_norm": 3.1367995738983154, + "learning_rate": 3.7251510620768827e-06, + "loss": 0.247, "step": 908400 }, { - "epoch": 9.26, - "learning_rate": 2.0466453503948553e-05, - "loss": 0.3144, + "epoch": 12.516877462731808, + "grad_norm": 0.22655609250068665, + "learning_rate": 3.721164061535527e-06, + "loss": 0.2375, "step": 908500 }, { - "epoch": 9.26, - "learning_rate": 2.0460164005470043e-05, - "loss": 0.3224, + "epoch": 12.518255214791546, + "grad_norm": 3.3604624271392822, + "learning_rate": 3.7171387982956686e-06, + "loss": 0.2511, "step": 908600 }, { - "epoch": 9.26, - "learning_rate": 2.0453937861443067e-05, - "loss": 0.3921, + "epoch": 12.519632966851285, + "grad_norm": 2.938551425933838, + "learning_rate": 3.7131155555315338e-06, + "loss": 0.2493, "step": 908700 }, { - "epoch": 9.26, - "learning_rate": 2.0447649291485146e-05, - "loss": 0.3147, + "epoch": 12.521010718911025, + "grad_norm": 0.12210047990083694, + "learning_rate": 3.7090943335849244e-06, + "loss": 0.2302, "step": 908800 }, { - "epoch": 9.26, - "learning_rate": 2.0441361188580173e-05, - "loss": 0.3601, + "epoch": 12.522388470970764, + "grad_norm": 4.032182216644287, + "learning_rate": 3.7050751327974725e-06, + "loss": 0.226, "step": 908900 }, { - "epoch": 9.26, - "learning_rate": 2.0435073553035604e-05, - "loss": 0.3046, + "epoch": 12.523766223030503, + "grad_norm": 3.6658716201782227, + "learning_rate": 3.7010579535106163e-06, + "loss": 0.2752, "step": 909000 }, { - "epoch": 9.26, - "learning_rate": 2.0428786385158846e-05, - "loss": 0.3536, + "epoch": 12.525143975090243, + "grad_norm": 0.4287591576576233, + "learning_rate": 3.6970427960656452e-06, + "loss": 0.2131, "step": 909100 }, { - "epoch": 9.26, - "learning_rate": 2.0422499685257315e-05, - "loss": 0.3932, + "epoch": 12.526521727149982, + "grad_norm": 4.468510150909424, + "learning_rate": 3.693029660803665e-06, + "loss": 0.2594, "step": 909200 }, { - "epoch": 9.26, - "learning_rate": 2.04162134536384e-05, - "loss": 0.3, + "epoch": 12.527899479209722, + "grad_norm": 2.376677989959717, + "learning_rate": 3.6890185480656045e-06, + "loss": 0.219, "step": 909300 }, { - "epoch": 9.27, - "learning_rate": 2.0409927690609448e-05, - "loss": 0.3164, + "epoch": 12.529277231269461, + "grad_norm": 4.313483238220215, + "learning_rate": 3.6850094581922346e-06, + "loss": 0.1837, "step": 909400 }, { - "epoch": 9.27, - "learning_rate": 2.0403642396477803e-05, - "loss": 0.3422, + "epoch": 12.5306549833292, + "grad_norm": 0.854306161403656, + "learning_rate": 3.681002391524154e-06, + "loss": 0.2229, "step": 909500 }, { - "epoch": 9.27, - "learning_rate": 2.0397357571550773e-05, - "loss": 0.3163, + "epoch": 12.53203273538894, + "grad_norm": 0.8100183606147766, + "learning_rate": 3.676997348401773e-06, + "loss": 0.2147, "step": 909600 }, { - "epoch": 9.27, - "learning_rate": 2.0391073216135667e-05, - "loss": 0.3678, + "epoch": 12.533410487448679, + "grad_norm": 2.148639678955078, + "learning_rate": 3.6729943291653483e-06, + "loss": 0.2059, "step": 909700 }, { - "epoch": 9.27, - "learning_rate": 2.038478933053973e-05, - "loss": 0.288, + "epoch": 12.534788239508417, + "grad_norm": 1.154579758644104, + "learning_rate": 3.6689933341549483e-06, + "loss": 0.2261, "step": 909800 }, { - "epoch": 9.27, - "learning_rate": 2.0378505915070207e-05, - "loss": 0.3197, + "epoch": 12.536165991568158, + "grad_norm": 2.6155924797058105, + "learning_rate": 3.6649943637104913e-06, + "loss": 0.1948, "step": 909900 }, { - "epoch": 9.27, - "learning_rate": 2.0372222970034337e-05, - "loss": 0.336, + "epoch": 12.537543743627896, + "grad_norm": 2.6509344577789307, + "learning_rate": 3.660997418171694e-06, + "loss": 0.2297, "step": 910000 }, { - "epoch": 9.27, - "learning_rate": 2.0365940495739297e-05, - "loss": 0.3721, + "epoch": 12.538921495687635, + "grad_norm": 2.1501147747039795, + "learning_rate": 3.65700249787813e-06, + "loss": 0.2085, "step": 910100 }, { - "epoch": 9.27, - "learning_rate": 2.0359658492492277e-05, - "loss": 0.3335, + "epoch": 12.540299247747376, + "grad_norm": 1.3811872005462646, + "learning_rate": 3.6530096031691802e-06, + "loss": 0.2425, "step": 910200 }, { - "epoch": 9.27, - "learning_rate": 2.0353376960600443e-05, - "loss": 0.3519, + "epoch": 12.541676999807114, + "grad_norm": 0.125303253531456, + "learning_rate": 3.64901873438407e-06, + "loss": 0.2037, "step": 910300 }, { - "epoch": 9.28, - "learning_rate": 2.034709590037089e-05, - "loss": 0.3604, + "epoch": 12.543054751866855, + "grad_norm": 1.7733478546142578, + "learning_rate": 3.645029891861835e-06, + "loss": 0.1853, "step": 910400 }, { - "epoch": 9.28, - "learning_rate": 2.0340815312110744e-05, - "loss": 0.2631, + "epoch": 12.544432503926593, + "grad_norm": 1.8176164627075195, + "learning_rate": 3.6410829340677655e-06, + "loss": 0.1872, "step": 910500 }, { - "epoch": 9.28, - "learning_rate": 2.033453519612709e-05, - "loss": 0.3429, + "epoch": 12.545810255986332, + "grad_norm": 2.1111767292022705, + "learning_rate": 3.6370981248166578e-06, + "loss": 0.2447, "step": 910600 }, { - "epoch": 9.28, - "learning_rate": 2.032825555272698e-05, - "loss": 0.2839, + "epoch": 12.547188008046072, + "grad_norm": 0.6519777774810791, + "learning_rate": 3.633115342841143e-06, + "loss": 0.2328, "step": 910700 }, { - "epoch": 9.28, - "learning_rate": 2.0321976382217455e-05, - "loss": 0.3275, + "epoch": 12.548565760105811, + "grad_norm": 2.3024773597717285, + "learning_rate": 3.6291345884795865e-06, + "loss": 0.225, "step": 910800 }, { - "epoch": 9.28, - "learning_rate": 2.0315697684905543e-05, - "loss": 0.3996, + "epoch": 12.549943512165552, + "grad_norm": 12.88027572631836, + "learning_rate": 3.625155862070168e-06, + "loss": 0.2381, "step": 910900 }, { - "epoch": 9.28, - "learning_rate": 2.0309419461098205e-05, - "loss": 0.3707, + "epoch": 12.55132126422529, + "grad_norm": 2.284512519836426, + "learning_rate": 3.6211791639508958e-06, + "loss": 0.2086, "step": 911000 }, { - "epoch": 9.28, - "learning_rate": 2.030314171110243e-05, - "loss": 0.3409, + "epoch": 12.552699016285029, + "grad_norm": 0.705697774887085, + "learning_rate": 3.6172044944596177e-06, + "loss": 0.2481, "step": 911100 }, { - "epoch": 9.28, - "learning_rate": 2.0296864435225153e-05, - "loss": 0.3432, + "epoch": 12.55407676834477, + "grad_norm": 1.4522850513458252, + "learning_rate": 3.6132318539340078e-06, + "loss": 0.2004, "step": 911200 }, { - "epoch": 9.28, - "learning_rate": 2.029058763377331e-05, - "loss": 0.2944, + "epoch": 12.555454520404508, + "grad_norm": 9.470004081726074, + "learning_rate": 3.609261242711549e-06, + "loss": 0.2399, "step": 911300 }, { - "epoch": 9.29, - "learning_rate": 2.028431130705378e-05, - "loss": 0.3277, + "epoch": 12.556832272464247, + "grad_norm": 1.3187912702560425, + "learning_rate": 3.6052926611295787e-06, + "loss": 0.2248, "step": 911400 }, { - "epoch": 9.29, - "learning_rate": 2.027803545537345e-05, - "loss": 0.4145, + "epoch": 12.558210024523987, + "grad_norm": 0.30397436022758484, + "learning_rate": 3.6013261095252344e-06, + "loss": 0.2389, "step": 911500 }, { - "epoch": 9.29, - "learning_rate": 2.0271760079039177e-05, - "loss": 0.3353, + "epoch": 12.559587776583726, + "grad_norm": 2.7222912311553955, + "learning_rate": 3.59736158823551e-06, + "loss": 0.2245, "step": 911600 }, { - "epoch": 9.29, - "learning_rate": 2.026548517835777e-05, - "loss": 0.3333, + "epoch": 12.560965528643464, + "grad_norm": 2.3611936569213867, + "learning_rate": 3.5933990975971936e-06, + "loss": 0.2144, "step": 911700 }, { - "epoch": 9.29, - "learning_rate": 2.0259210753636047e-05, - "loss": 0.3308, + "epoch": 12.562343280703205, + "grad_norm": 2.5766351222991943, + "learning_rate": 3.5894386379469313e-06, + "loss": 0.2337, "step": 911800 }, { - "epoch": 9.29, - "learning_rate": 2.0252936805180794e-05, - "loss": 0.3806, + "epoch": 12.563721032762944, + "grad_norm": 2.4747798442840576, + "learning_rate": 3.5854802096211974e-06, + "loss": 0.2302, "step": 911900 }, { - "epoch": 9.29, - "learning_rate": 2.024666333329876e-05, - "loss": 0.3543, + "epoch": 12.565098784822684, + "grad_norm": 0.3169412612915039, + "learning_rate": 3.5815238129562493e-06, + "loss": 0.1844, "step": 912000 }, { - "epoch": 9.29, - "learning_rate": 2.0240390338296675e-05, - "loss": 0.2934, + "epoch": 12.566476536882423, + "grad_norm": 3.081221580505371, + "learning_rate": 3.577569448288223e-06, + "loss": 0.2151, "step": 912100 }, { - "epoch": 9.29, - "learning_rate": 2.0234117820481283e-05, - "loss": 0.2882, + "epoch": 12.567854288942161, + "grad_norm": 6.372676849365234, + "learning_rate": 3.573617115953061e-06, + "loss": 0.2274, "step": 912200 }, { - "epoch": 9.29, - "learning_rate": 2.0227845780159235e-05, - "loss": 0.3608, + "epoch": 12.569232041001902, + "grad_norm": 9.849974632263184, + "learning_rate": 3.569666816286527e-06, + "loss": 0.2364, "step": 912300 }, { - "epoch": 9.3, - "learning_rate": 2.0221636930896315e-05, - "loss": 0.3361, + "epoch": 12.57060979306164, + "grad_norm": 1.6972519159317017, + "learning_rate": 3.5657185496242225e-06, + "loss": 0.2398, "step": 912400 }, { - "epoch": 9.3, - "learning_rate": 2.021536584169837e-05, - "loss": 0.3523, + "epoch": 12.57198754512138, + "grad_norm": 1.846554160118103, + "learning_rate": 3.561772316301583e-06, + "loss": 0.2424, "step": 912500 }, { - "epoch": 9.3, - "learning_rate": 2.0209095230910643e-05, - "loss": 0.312, + "epoch": 12.57336529718112, + "grad_norm": 2.0066514015197754, + "learning_rate": 3.5578281166538505e-06, + "loss": 0.2034, "step": 912600 }, { - "epoch": 9.3, - "learning_rate": 2.020282509883974e-05, - "loss": 0.4134, + "epoch": 12.574743049240858, + "grad_norm": 9.855225563049316, + "learning_rate": 3.5538859510161003e-06, + "loss": 0.2016, "step": 912700 }, { - "epoch": 9.3, - "learning_rate": 2.0196555445792216e-05, - "loss": 0.2895, + "epoch": 12.576120801300599, + "grad_norm": 5.288704872131348, + "learning_rate": 3.5499458197232475e-06, + "loss": 0.2447, "step": 912800 }, { - "epoch": 9.3, - "learning_rate": 2.0190286272074623e-05, - "loss": 0.3297, + "epoch": 12.577498553360337, + "grad_norm": 6.687885761260986, + "learning_rate": 3.5460077231100285e-06, + "loss": 0.1968, "step": 912900 }, { - "epoch": 9.3, - "learning_rate": 2.0184017577993506e-05, - "loss": 0.3511, + "epoch": 12.578876305420076, + "grad_norm": 4.185986042022705, + "learning_rate": 3.5420716615110043e-06, + "loss": 0.2282, "step": 913000 }, { - "epoch": 9.3, - "learning_rate": 2.0177749363855338e-05, - "loss": 0.303, + "epoch": 12.580254057479817, + "grad_norm": 2.8870296478271484, + "learning_rate": 3.538137635260554e-06, + "loss": 0.2503, "step": 913100 }, { - "epoch": 9.3, - "learning_rate": 2.017148162996661e-05, - "loss": 0.3839, + "epoch": 12.581631809539555, + "grad_norm": 1.2261135578155518, + "learning_rate": 3.5342056446929027e-06, + "loss": 0.187, "step": 913200 }, { - "epoch": 9.3, - "learning_rate": 2.0165214376633782e-05, - "loss": 0.34, + "epoch": 12.583009561599294, + "grad_norm": 2.374559164047241, + "learning_rate": 3.530275690142085e-06, + "loss": 0.2245, "step": 913300 }, { - "epoch": 9.31, - "learning_rate": 2.0158947604163275e-05, - "loss": 0.3327, + "epoch": 12.584387313659034, + "grad_norm": 0.7190760374069214, + "learning_rate": 3.5263477719419775e-06, + "loss": 0.242, "step": 913400 }, { - "epoch": 9.31, - "learning_rate": 2.01526813128615e-05, - "loss": 0.2973, + "epoch": 12.585765065718773, + "grad_norm": 0.626494288444519, + "learning_rate": 3.5224218904262824e-06, + "loss": 0.2191, "step": 913500 }, { - "epoch": 9.31, - "learning_rate": 2.0146415503034844e-05, - "loss": 0.2784, + "epoch": 12.587142817778513, + "grad_norm": 3.52441668510437, + "learning_rate": 3.5184980459285123e-06, + "loss": 0.1824, "step": 913600 }, { - "epoch": 9.31, - "learning_rate": 2.01402128258843e-05, - "loss": 0.3444, + "epoch": 12.588520569838252, + "grad_norm": 2.043318510055542, + "learning_rate": 3.5145762387820183e-06, + "loss": 0.2007, "step": 913700 }, { - "epoch": 9.31, - "learning_rate": 2.0133947975104555e-05, - "loss": 0.3038, + "epoch": 12.58989832189799, + "grad_norm": 3.781277656555176, + "learning_rate": 3.5106564693199807e-06, + "loss": 0.2322, "step": 913800 }, { - "epoch": 9.31, - "learning_rate": 2.012768360671587e-05, - "loss": 0.2758, + "epoch": 12.591276073957731, + "grad_norm": 2.291917085647583, + "learning_rate": 3.506738737875412e-06, + "loss": 0.2408, "step": 913900 }, { - "epoch": 9.31, - "learning_rate": 2.0121419721024537e-05, - "loss": 0.3848, + "epoch": 12.59265382601747, + "grad_norm": 2.0271263122558594, + "learning_rate": 3.502823044781129e-06, + "loss": 0.2152, "step": 914000 }, { - "epoch": 9.31, - "learning_rate": 2.011515631833683e-05, - "loss": 0.2813, + "epoch": 12.594031578077209, + "grad_norm": 4.473174095153809, + "learning_rate": 3.4989093903698015e-06, + "loss": 0.257, "step": 914100 }, { - "epoch": 9.31, - "learning_rate": 2.010889339895899e-05, - "loss": 0.3729, + "epoch": 12.595409330136949, + "grad_norm": 2.161417007446289, + "learning_rate": 3.4949977749739106e-06, + "loss": 0.2172, "step": 914200 }, { - "epoch": 9.32, - "learning_rate": 2.0102630963197236e-05, - "loss": 0.3408, + "epoch": 12.596787082196688, + "grad_norm": 4.593111038208008, + "learning_rate": 3.491088198925764e-06, + "loss": 0.1643, "step": 914300 }, { - "epoch": 9.32, - "learning_rate": 2.0096369011357754e-05, - "loss": 0.435, + "epoch": 12.598164834256426, + "grad_norm": 4.854642868041992, + "learning_rate": 3.4871806625575025e-06, + "loss": 0.2242, "step": 914400 }, { - "epoch": 9.32, - "learning_rate": 2.0090107543746748e-05, - "loss": 0.321, + "epoch": 12.599542586316167, + "grad_norm": 14.07918643951416, + "learning_rate": 3.4832751662010984e-06, + "loss": 0.2112, "step": 914500 }, { - "epoch": 9.32, - "learning_rate": 2.0083846560670327e-05, - "loss": 0.3769, + "epoch": 12.600920338375905, + "grad_norm": 0.7469869256019592, + "learning_rate": 3.47937171018833e-06, + "loss": 0.2229, "step": 914600 }, { - "epoch": 9.32, - "learning_rate": 2.0077586062434637e-05, - "loss": 0.36, + "epoch": 12.602298090435646, + "grad_norm": 3.4395291805267334, + "learning_rate": 3.4754702948508324e-06, + "loss": 0.2401, "step": 914700 }, { - "epoch": 9.32, - "learning_rate": 2.0071326049345778e-05, - "loss": 0.3714, + "epoch": 12.603675842495385, + "grad_norm": 1.2302637100219727, + "learning_rate": 3.47160990415927e-06, + "loss": 0.2078, "step": 914800 }, { - "epoch": 9.32, - "learning_rate": 2.0065066521709817e-05, - "loss": 0.3316, + "epoch": 12.605053594555123, + "grad_norm": 4.362504005432129, + "learning_rate": 3.467712550751439e-06, + "loss": 0.2112, "step": 914900 }, { - "epoch": 9.32, - "learning_rate": 2.0058807479832815e-05, - "loss": 0.3098, + "epoch": 12.606431346614864, + "grad_norm": 1.5781856775283813, + "learning_rate": 3.4638172390093675e-06, + "loss": 0.1686, "step": 915000 }, { - "epoch": 9.32, - "learning_rate": 2.005254892402081e-05, - "loss": 0.3316, + "epoch": 12.607809098674602, + "grad_norm": 0.9203447103500366, + "learning_rate": 3.459923969263991e-06, + "loss": 0.2644, "step": 915100 }, { - "epoch": 9.32, - "learning_rate": 2.004629085457979e-05, - "loss": 0.3342, + "epoch": 12.609186850734343, + "grad_norm": 3.0363094806671143, + "learning_rate": 3.4560327418460683e-06, + "loss": 0.2001, "step": 915200 }, { - "epoch": 9.33, - "learning_rate": 2.0040033271815747e-05, - "loss": 0.2706, + "epoch": 12.610564602794081, + "grad_norm": 2.0067203044891357, + "learning_rate": 3.4521435570861694e-06, + "loss": 0.2253, "step": 915300 }, { - "epoch": 9.33, - "learning_rate": 2.0033776176034648e-05, - "loss": 0.3341, + "epoch": 12.61194235485382, + "grad_norm": 3.2417232990264893, + "learning_rate": 3.4482564153146995e-06, + "loss": 0.2113, "step": 915400 }, { - "epoch": 9.33, - "learning_rate": 2.0027519567542408e-05, - "loss": 0.3202, + "epoch": 12.61332010691356, + "grad_norm": 2.598731279373169, + "learning_rate": 3.444371316861895e-06, + "loss": 0.2063, "step": 915500 }, { - "epoch": 9.33, - "learning_rate": 2.0021263446644952e-05, - "loss": 0.3026, + "epoch": 12.6146978589733, + "grad_norm": 3.812410354614258, + "learning_rate": 3.4404882620578192e-06, + "loss": 0.2644, "step": 915600 }, { - "epoch": 9.33, - "learning_rate": 2.0015007813648177e-05, - "loss": 0.3208, + "epoch": 12.616075611033038, + "grad_norm": 4.084008693695068, + "learning_rate": 3.4366460512218275e-06, + "loss": 0.205, "step": 915700 }, { - "epoch": 9.33, - "learning_rate": 2.0008752668857916e-05, - "loss": 0.2898, + "epoch": 12.617453363092778, + "grad_norm": 2.0450711250305176, + "learning_rate": 3.4327670642599675e-06, + "loss": 0.2303, "step": 915800 }, { - "epoch": 9.33, - "learning_rate": 2.0002498012580023e-05, - "loss": 0.288, + "epoch": 12.618831115152517, + "grad_norm": 2.5086967945098877, + "learning_rate": 3.4288901219326684e-06, + "loss": 0.185, "step": 915900 }, { - "epoch": 9.33, - "learning_rate": 1.999624384512031e-05, - "loss": 0.3118, + "epoch": 12.620208867212256, + "grad_norm": 4.411999702453613, + "learning_rate": 3.425015224569309e-06, + "loss": 0.2069, "step": 916000 }, { - "epoch": 9.33, - "learning_rate": 1.9989990166784598e-05, - "loss": 0.3159, + "epoch": 12.621586619271996, + "grad_norm": 2.2588372230529785, + "learning_rate": 3.4211423724990635e-06, + "loss": 0.2363, "step": 916100 }, { - "epoch": 9.33, - "learning_rate": 1.9983736977878614e-05, - "loss": 0.318, + "epoch": 12.622964371331735, + "grad_norm": 2.837456464767456, + "learning_rate": 3.41727156605096e-06, + "loss": 0.2485, "step": 916200 }, { - "epoch": 9.34, - "learning_rate": 1.997748427870812e-05, - "loss": 0.3758, + "epoch": 12.624342123391475, + "grad_norm": 1.386113166809082, + "learning_rate": 3.413402805553852e-06, + "loss": 0.2264, "step": 916300 }, { - "epoch": 9.34, - "learning_rate": 1.9971232069578844e-05, - "loss": 0.3612, + "epoch": 12.625719875451214, + "grad_norm": 1.0794447660446167, + "learning_rate": 3.409536091336399e-06, + "loss": 0.227, "step": 916400 }, { - "epoch": 9.34, - "learning_rate": 1.9964980350796457e-05, - "loss": 0.3794, + "epoch": 12.627097627510953, + "grad_norm": 5.324759483337402, + "learning_rate": 3.4056714237270965e-06, + "loss": 0.228, "step": 916500 }, { - "epoch": 9.34, - "learning_rate": 1.9958729122666642e-05, - "loss": 0.2223, + "epoch": 12.628475379570693, + "grad_norm": 2.1033549308776855, + "learning_rate": 3.4018088030542707e-06, + "loss": 0.2354, "step": 916600 }, { - "epoch": 9.34, - "learning_rate": 1.9952478385495064e-05, - "loss": 0.3282, + "epoch": 12.629853131630432, + "grad_norm": 0.15262453258037567, + "learning_rate": 3.3979482296460795e-06, + "loss": 0.2475, "step": 916700 }, { - "epoch": 9.34, - "learning_rate": 1.9946228139587325e-05, - "loss": 0.2717, + "epoch": 12.63123088369017, + "grad_norm": 3.1930205821990967, + "learning_rate": 3.394089703830488e-06, + "loss": 0.2416, "step": 916800 }, { - "epoch": 9.34, - "learning_rate": 1.993997838524903e-05, - "loss": 0.3397, + "epoch": 12.63260863574991, + "grad_norm": 3.10556960105896, + "learning_rate": 3.390233225935304e-06, + "loss": 0.2634, "step": 916900 }, { - "epoch": 9.34, - "learning_rate": 1.9933729122785772e-05, - "loss": 0.3636, + "epoch": 12.63398638780965, + "grad_norm": 0.5082910060882568, + "learning_rate": 3.386378796288152e-06, + "loss": 0.2421, "step": 917000 }, { - "epoch": 9.34, - "learning_rate": 1.992748035250307e-05, - "loss": 0.2541, + "epoch": 12.63536413986939, + "grad_norm": 0.2184629589319229, + "learning_rate": 3.382526415216483e-06, + "loss": 0.2201, "step": 917100 }, { - "epoch": 9.34, - "learning_rate": 1.9921232074706472e-05, - "loss": 0.352, + "epoch": 12.636741891929129, + "grad_norm": 5.219517230987549, + "learning_rate": 3.3786760830475754e-06, + "loss": 0.2477, "step": 917200 }, { - "epoch": 9.35, - "learning_rate": 1.9914984289701488e-05, - "loss": 0.3271, + "epoch": 12.638119643988867, + "grad_norm": 4.889597415924072, + "learning_rate": 3.374827800108548e-06, + "loss": 0.2021, "step": 917300 }, { - "epoch": 9.35, - "learning_rate": 1.9908736997793577e-05, - "loss": 0.3089, + "epoch": 12.639497396048608, + "grad_norm": 2.4116554260253906, + "learning_rate": 3.37098156672632e-06, + "loss": 0.2139, "step": 917400 }, { - "epoch": 9.35, - "learning_rate": 1.9902490199288208e-05, - "loss": 0.3281, + "epoch": 12.640875148108346, + "grad_norm": 6.757554054260254, + "learning_rate": 3.367137383227644e-06, + "loss": 0.2159, "step": 917500 }, { - "epoch": 9.35, - "learning_rate": 1.9896243894490816e-05, - "loss": 0.3559, + "epoch": 12.642252900168085, + "grad_norm": 2.021207332611084, + "learning_rate": 3.3632952499391123e-06, + "loss": 0.2143, "step": 917600 }, { - "epoch": 9.35, - "learning_rate": 1.9889998083706797e-05, - "loss": 0.3359, + "epoch": 12.643630652227825, + "grad_norm": 1.2735322713851929, + "learning_rate": 3.3594551671871387e-06, + "loss": 0.2334, "step": 917700 }, { - "epoch": 9.35, - "learning_rate": 1.9883752767241535e-05, - "loss": 0.4028, + "epoch": 12.645008404287564, + "grad_norm": 0.10791590809822083, + "learning_rate": 3.3556171352979427e-06, + "loss": 0.2308, "step": 917800 }, { - "epoch": 9.35, - "learning_rate": 1.9877507945400384e-05, - "loss": 0.3127, + "epoch": 12.646386156347305, + "grad_norm": 0.05612052232027054, + "learning_rate": 3.351857854107844e-06, + "loss": 0.2477, "step": 917900 }, { - "epoch": 9.35, - "learning_rate": 1.987126361848871e-05, - "loss": 0.3184, + "epoch": 12.647763908407043, + "grad_norm": 1.6612247228622437, + "learning_rate": 3.348023883888748e-06, + "loss": 0.2338, "step": 918000 }, { - "epoch": 9.35, - "learning_rate": 1.9865019786811784e-05, - "loss": 0.3315, + "epoch": 12.649141660466782, + "grad_norm": 3.7704150676727295, + "learning_rate": 3.344191965503576e-06, + "loss": 0.1596, "step": 918100 }, { - "epoch": 9.35, - "learning_rate": 1.9858776450674906e-05, - "loss": 0.3147, + "epoch": 12.650519412526522, + "grad_norm": 0.6422763466835022, + "learning_rate": 3.3403620992778815e-06, + "loss": 0.2067, "step": 918200 }, { - "epoch": 9.36, - "learning_rate": 1.985253361038336e-05, - "loss": 0.3388, + "epoch": 12.651897164586261, + "grad_norm": 3.9612650871276855, + "learning_rate": 3.336534285537022e-06, + "loss": 0.2644, "step": 918300 }, { - "epoch": 9.36, - "learning_rate": 1.98463536872268e-05, - "loss": 0.3196, + "epoch": 12.653274916646, + "grad_norm": 1.3923494815826416, + "learning_rate": 3.3327085246061947e-06, + "loss": 0.2099, "step": 918400 }, { - "epoch": 9.36, - "learning_rate": 1.9840111834575488e-05, - "loss": 0.298, + "epoch": 12.65465266870574, + "grad_norm": 3.3482720851898193, + "learning_rate": 3.328884816810427e-06, + "loss": 0.2368, "step": 918500 }, { - "epoch": 9.36, - "learning_rate": 1.983387047868207e-05, - "loss": 0.2837, + "epoch": 12.656030420765479, + "grad_norm": 3.072597026824951, + "learning_rate": 3.325063162474543e-06, + "loss": 0.2576, "step": 918600 }, { - "epoch": 9.36, - "learning_rate": 1.9827629619851713e-05, - "loss": 0.2821, + "epoch": 12.657408172825217, + "grad_norm": 1.5982545614242554, + "learning_rate": 3.3212435619232206e-06, + "loss": 0.239, "step": 918700 }, { - "epoch": 9.36, - "learning_rate": 1.982138925838958e-05, - "loss": 0.3031, + "epoch": 12.658785924884958, + "grad_norm": 3.929764747619629, + "learning_rate": 3.317426015480962e-06, + "loss": 0.234, "step": 918800 }, { - "epoch": 9.36, - "learning_rate": 1.9815149394600754e-05, - "loss": 0.344, + "epoch": 12.660163676944697, + "grad_norm": 4.8823561668396, + "learning_rate": 3.3136105234720753e-06, + "loss": 0.188, "step": 918900 }, { - "epoch": 9.36, - "learning_rate": 1.980891002879034e-05, - "loss": 0.3065, + "epoch": 12.661541429004437, + "grad_norm": 2.1174111366271973, + "learning_rate": 3.30979708622071e-06, + "loss": 0.2065, "step": 919000 }, { - "epoch": 9.36, - "learning_rate": 1.9802671161263425e-05, - "loss": 0.3203, + "epoch": 12.662919181064176, + "grad_norm": 1.4198808670043945, + "learning_rate": 3.3059857040508436e-06, + "loss": 0.1962, "step": 919100 }, { - "epoch": 9.36, - "learning_rate": 1.9796432792325017e-05, - "loss": 0.3317, + "epoch": 12.664296933123914, + "grad_norm": 2.373792886734009, + "learning_rate": 3.3021763772862684e-06, + "loss": 0.2206, "step": 919200 }, { - "epoch": 9.37, - "learning_rate": 1.9790194922280162e-05, - "loss": 0.4298, + "epoch": 12.665674685183655, + "grad_norm": 2.4396631717681885, + "learning_rate": 3.298369106250599e-06, + "loss": 0.2255, "step": 919300 }, { - "epoch": 9.37, - "learning_rate": 1.9783957551433856e-05, - "loss": 0.3787, + "epoch": 12.667052437243393, + "grad_norm": 0.6654471158981323, + "learning_rate": 3.294563891267291e-06, + "loss": 0.2217, "step": 919400 }, { - "epoch": 9.37, - "learning_rate": 1.9777720680091044e-05, - "loss": 0.3159, + "epoch": 12.668430189303134, + "grad_norm": 2.4668378829956055, + "learning_rate": 3.2907607326596178e-06, + "loss": 0.219, "step": 919500 }, { - "epoch": 9.37, - "learning_rate": 1.9771484308556693e-05, - "loss": 0.2785, + "epoch": 12.669807941362873, + "grad_norm": 2.653829336166382, + "learning_rate": 3.2869596307506744e-06, + "loss": 0.252, "step": 919600 }, { - "epoch": 9.37, - "learning_rate": 1.9765248437135723e-05, - "loss": 0.3456, + "epoch": 12.671185693422611, + "grad_norm": 1.5821677446365356, + "learning_rate": 3.283160585863377e-06, + "loss": 0.1857, "step": 919700 }, { - "epoch": 9.37, - "learning_rate": 1.975901306613302e-05, - "loss": 0.3292, + "epoch": 12.672563445482352, + "grad_norm": 1.2112700939178467, + "learning_rate": 3.2793635983204882e-06, + "loss": 0.2284, "step": 919800 }, { - "epoch": 9.37, - "learning_rate": 1.975277819585346e-05, - "loss": 0.2588, + "epoch": 12.67394119754209, + "grad_norm": 2.146636486053467, + "learning_rate": 3.275568668444567e-06, + "loss": 0.2127, "step": 919900 }, { - "epoch": 9.37, - "learning_rate": 1.9746543826601902e-05, - "loss": 0.3248, + "epoch": 12.675318949601829, + "grad_norm": 0.02897159568965435, + "learning_rate": 3.271775796558018e-06, + "loss": 0.1937, "step": 920000 }, { - "epoch": 9.37, - "learning_rate": 1.9740309958683165e-05, - "loss": 0.2978, + "epoch": 12.67669670166157, + "grad_norm": 4.477741718292236, + "learning_rate": 3.267984982983074e-06, + "loss": 0.2606, "step": 920100 }, { - "epoch": 9.38, - "learning_rate": 1.9734076592402035e-05, - "loss": 0.3046, + "epoch": 12.678074453721308, + "grad_norm": 1.341055154800415, + "learning_rate": 3.2641962280417734e-06, + "loss": 0.2119, "step": 920200 }, { - "epoch": 9.38, - "learning_rate": 1.97278437280633e-05, - "loss": 0.3603, + "epoch": 12.679452205781047, + "grad_norm": 3.609816551208496, + "learning_rate": 3.260409532055988e-06, + "loss": 0.1916, "step": 920300 }, { - "epoch": 9.38, - "learning_rate": 1.972161136597171e-05, - "loss": 0.339, + "epoch": 12.680829957840787, + "grad_norm": 2.864924669265747, + "learning_rate": 3.2566248953474217e-06, + "loss": 0.2361, "step": 920400 }, { - "epoch": 9.38, - "learning_rate": 1.9715379506431973e-05, - "loss": 0.3629, + "epoch": 12.682207709900526, + "grad_norm": 1.8209729194641113, + "learning_rate": 3.2528423182376037e-06, + "loss": 0.2382, "step": 920500 }, { - "epoch": 9.38, - "learning_rate": 1.970914814974881e-05, - "loss": 0.3389, + "epoch": 12.683585461960266, + "grad_norm": 3.0421018600463867, + "learning_rate": 3.2490618010478736e-06, + "loss": 0.1922, "step": 920600 }, { - "epoch": 9.38, - "learning_rate": 1.9702979602270465e-05, - "loss": 0.3156, + "epoch": 12.684963214020005, + "grad_norm": 1.7290154695510864, + "learning_rate": 3.2452833440994135e-06, + "loss": 0.1992, "step": 920700 }, { - "epoch": 9.38, - "learning_rate": 1.969674924717827e-05, - "loss": 0.3194, + "epoch": 12.686340966079744, + "grad_norm": 2.664268732070923, + "learning_rate": 3.2415069477132218e-06, + "loss": 0.2129, "step": 920800 }, { - "epoch": 9.38, - "learning_rate": 1.9690519395853563e-05, - "loss": 0.2644, + "epoch": 12.687718718139484, + "grad_norm": 1.547699213027954, + "learning_rate": 3.237732612210116e-06, + "loss": 0.2012, "step": 920900 }, { - "epoch": 9.38, - "learning_rate": 1.968429004860092e-05, - "loss": 0.3638, + "epoch": 12.689096470199223, + "grad_norm": 1.4671459197998047, + "learning_rate": 3.233960337910749e-06, + "loss": 0.2738, "step": 921000 }, { - "epoch": 9.38, - "learning_rate": 1.9678061205724937e-05, - "loss": 0.3224, + "epoch": 12.690474222258961, + "grad_norm": 1.930884838104248, + "learning_rate": 3.2301901251356016e-06, + "loss": 0.234, "step": 921100 }, { - "epoch": 9.39, - "learning_rate": 1.9671832867530167e-05, - "loss": 0.3336, + "epoch": 12.691851974318702, + "grad_norm": 0.25040969252586365, + "learning_rate": 3.2264219742049587e-06, + "loss": 0.2381, "step": 921200 }, { - "epoch": 9.39, - "learning_rate": 1.966560503432113e-05, - "loss": 0.2763, + "epoch": 12.69322972637844, + "grad_norm": 14.818979263305664, + "learning_rate": 3.222655885438959e-06, + "loss": 0.2421, "step": 921300 }, { - "epoch": 9.39, - "learning_rate": 1.9659377706402328e-05, - "loss": 0.4111, + "epoch": 12.694607478438181, + "grad_norm": 3.110816240310669, + "learning_rate": 3.2188918591575435e-06, + "loss": 0.2065, "step": 921400 }, { - "epoch": 9.39, - "learning_rate": 1.9653150884078262e-05, - "loss": 0.3353, + "epoch": 12.69598523049792, + "grad_norm": 3.54752516746521, + "learning_rate": 3.2151298956804805e-06, + "loss": 0.2343, "step": 921500 }, { - "epoch": 9.39, - "learning_rate": 1.964692456765336e-05, - "loss": 0.2812, + "epoch": 12.697362982557658, + "grad_norm": 1.3571783304214478, + "learning_rate": 3.2113699953273727e-06, + "loss": 0.1953, "step": 921600 }, { - "epoch": 9.39, - "learning_rate": 1.9640698757432065e-05, - "loss": 0.3167, + "epoch": 12.698740734617399, + "grad_norm": 1.5337263345718384, + "learning_rate": 3.2076121584176523e-06, + "loss": 0.2161, "step": 921700 }, { - "epoch": 9.39, - "learning_rate": 1.963447345371878e-05, - "loss": 0.3123, + "epoch": 12.700118486677137, + "grad_norm": 1.7826619148254395, + "learning_rate": 3.203856385270556e-06, + "loss": 0.2171, "step": 921800 }, { - "epoch": 9.39, - "learning_rate": 1.9628248656817883e-05, - "loss": 0.3657, + "epoch": 12.701496238736876, + "grad_norm": 2.599698066711426, + "learning_rate": 3.200102676205152e-06, + "loss": 0.2214, "step": 921900 }, { - "epoch": 9.39, - "learning_rate": 1.962202436703373e-05, - "loss": 0.3143, + "epoch": 12.702873990796617, + "grad_norm": 1.8907020092010498, + "learning_rate": 3.196351031540344e-06, + "loss": 0.1873, "step": 922000 }, { - "epoch": 9.39, - "learning_rate": 1.9615800584670664e-05, - "loss": 0.3026, + "epoch": 12.704251742856355, + "grad_norm": 3.0847809314727783, + "learning_rate": 3.192601451594856e-06, + "loss": 0.2382, "step": 922100 }, { - "epoch": 9.4, - "learning_rate": 1.960957731003297e-05, - "loss": 0.35, + "epoch": 12.705629494916096, + "grad_norm": 0.013093199580907822, + "learning_rate": 3.1888539366872256e-06, + "loss": 0.1631, "step": 922200 }, { - "epoch": 9.4, - "learning_rate": 1.9603354543424938e-05, - "loss": 0.306, + "epoch": 12.707007246975834, + "grad_norm": 1.5631515979766846, + "learning_rate": 3.1851084871358312e-06, + "loss": 0.2269, "step": 922300 }, { - "epoch": 9.4, - "learning_rate": 1.9597132285150823e-05, - "loss": 0.3539, + "epoch": 12.708384999035573, + "grad_norm": 1.1403919458389282, + "learning_rate": 3.1813651032588714e-06, + "loss": 0.2208, "step": 922400 }, { - "epoch": 9.4, - "learning_rate": 1.9590910535514866e-05, - "loss": 0.364, + "epoch": 12.709762751095314, + "grad_norm": 1.0077670812606812, + "learning_rate": 3.177623785374352e-06, + "loss": 0.25, "step": 922500 }, { - "epoch": 9.4, - "learning_rate": 1.958468929482126e-05, - "loss": 0.3137, + "epoch": 12.711140503155052, + "grad_norm": 4.851685523986816, + "learning_rate": 3.173884533800124e-06, + "loss": 0.2044, "step": 922600 }, { - "epoch": 9.4, - "learning_rate": 1.95785307681669e-05, - "loss": 0.3578, + "epoch": 12.71251825521479, + "grad_norm": 4.300386428833008, + "learning_rate": 3.1701473488538606e-06, + "loss": 0.2626, "step": 922700 }, { - "epoch": 9.4, - "learning_rate": 1.9572310541173514e-05, - "loss": 0.3345, + "epoch": 12.713896007274531, + "grad_norm": 4.452045440673828, + "learning_rate": 3.166412230853046e-06, + "loss": 0.2153, "step": 922800 }, { - "epoch": 9.4, - "learning_rate": 1.9566090824031918e-05, - "loss": 0.3182, + "epoch": 12.71527375933427, + "grad_norm": 2.243765354156494, + "learning_rate": 3.1627165003883963e-06, + "loss": 0.2166, "step": 922900 }, { - "epoch": 9.4, - "learning_rate": 1.955987161704621e-05, - "loss": 0.3324, + "epoch": 12.716651511394009, + "grad_norm": 1.0505355596542358, + "learning_rate": 3.1589854965528974e-06, + "loss": 0.1921, "step": 923000 }, { - "epoch": 9.4, - "learning_rate": 1.9553652920520476e-05, - "loss": 0.3403, + "epoch": 12.718029263453749, + "grad_norm": 0.570681631565094, + "learning_rate": 3.155256560611107e-06, + "loss": 0.2383, "step": 923100 }, { - "epoch": 9.41, - "learning_rate": 1.9547434734758772e-05, - "loss": 0.2897, + "epoch": 12.719407015513488, + "grad_norm": 0.11060261726379395, + "learning_rate": 3.151529692879823e-06, + "loss": 0.2283, "step": 923200 }, { - "epoch": 9.41, - "learning_rate": 1.9541217060065148e-05, - "loss": 0.2998, + "epoch": 12.720784767573228, + "grad_norm": 5.472596168518066, + "learning_rate": 3.147804893675653e-06, + "loss": 0.2118, "step": 923300 }, { - "epoch": 9.41, - "learning_rate": 1.953499989674358e-05, - "loss": 0.3266, + "epoch": 12.722162519632967, + "grad_norm": 1.9719280004501343, + "learning_rate": 3.1440821633150428e-06, + "loss": 0.2351, "step": 923400 }, { - "epoch": 9.41, - "learning_rate": 1.9528783245098065e-05, - "loss": 0.3917, + "epoch": 12.723540271692706, + "grad_norm": 4.066338539123535, + "learning_rate": 3.140361502114268e-06, + "loss": 0.2319, "step": 923500 }, { - "epoch": 9.41, - "learning_rate": 1.952256710543257e-05, - "loss": 0.3807, + "epoch": 12.724918023752446, + "grad_norm": 0.9245819449424744, + "learning_rate": 3.136642910389396e-06, + "loss": 0.2482, "step": 923600 }, { - "epoch": 9.41, - "learning_rate": 1.9516351478051007e-05, - "loss": 0.2952, + "epoch": 12.726295775812185, + "grad_norm": 0.9695680737495422, + "learning_rate": 3.13292638845635e-06, + "loss": 0.1953, "step": 923700 }, { - "epoch": 9.41, - "learning_rate": 1.9510136363257293e-05, - "loss": 0.366, + "epoch": 12.727673527871925, + "grad_norm": 2.442044258117676, + "learning_rate": 3.129211936630875e-06, + "loss": 0.2758, "step": 923800 }, { - "epoch": 9.41, - "learning_rate": 1.9503921761355326e-05, - "loss": 0.2897, + "epoch": 12.729051279931664, + "grad_norm": 10.180858612060547, + "learning_rate": 3.1254995552285186e-06, + "loss": 0.2707, "step": 923900 }, { - "epoch": 9.41, - "learning_rate": 1.9497707672648932e-05, - "loss": 0.3665, + "epoch": 12.730429031991402, + "grad_norm": 1.1940913200378418, + "learning_rate": 3.1217892445646795e-06, + "loss": 0.2352, "step": 924000 }, { - "epoch": 9.41, - "learning_rate": 1.9491494097441964e-05, - "loss": 0.3481, + "epoch": 12.731806784051143, + "grad_norm": 2.8272998332977295, + "learning_rate": 3.1180810049545587e-06, + "loss": 0.1894, "step": 924100 }, { - "epoch": 9.42, - "learning_rate": 1.9485281036038222e-05, - "loss": 0.3293, + "epoch": 12.733184536110882, + "grad_norm": 3.493452310562134, + "learning_rate": 3.114374836713197e-06, + "loss": 0.1851, "step": 924200 }, { - "epoch": 9.42, - "learning_rate": 1.9479068488741487e-05, - "loss": 0.264, + "epoch": 12.73456228817062, + "grad_norm": 2.7276203632354736, + "learning_rate": 3.1106707401554478e-06, + "loss": 0.2224, "step": 924300 }, { - "epoch": 9.42, - "learning_rate": 1.9472918573637055e-05, - "loss": 0.37, + "epoch": 12.73594004023036, + "grad_norm": 2.1621506214141846, + "learning_rate": 3.1069687155959882e-06, + "loss": 0.187, "step": 924400 }, { - "epoch": 9.42, - "learning_rate": 1.946670705031693e-05, - "loss": 0.2919, + "epoch": 12.7373177922901, + "grad_norm": 1.2508805990219116, + "learning_rate": 3.1032687633493387e-06, + "loss": 0.22, "step": 924500 }, { - "epoch": 9.42, - "learning_rate": 1.9460496042011974e-05, - "loss": 0.3493, + "epoch": 12.738695544349838, + "grad_norm": 2.0268301963806152, + "learning_rate": 3.099570883729823e-06, + "loss": 0.2047, "step": 924600 }, { - "epoch": 9.42, - "learning_rate": 1.9454285549025882e-05, - "loss": 0.3264, + "epoch": 12.740073296409578, + "grad_norm": 3.811948776245117, + "learning_rate": 3.0958750770515818e-06, + "loss": 0.265, "step": 924700 }, { - "epoch": 9.42, - "learning_rate": 1.944807557166227e-05, - "loss": 0.3204, + "epoch": 12.741451048469317, + "grad_norm": 2.4735653400421143, + "learning_rate": 3.092181343628613e-06, + "loss": 0.2221, "step": 924800 }, { - "epoch": 9.42, - "learning_rate": 1.9441866110224802e-05, - "loss": 0.3042, + "epoch": 12.742828800529058, + "grad_norm": 2.6457033157348633, + "learning_rate": 3.0885265901080436e-06, + "loss": 0.2125, "step": 924900 }, { - "epoch": 9.42, - "learning_rate": 1.9435657165017075e-05, - "loss": 0.3913, + "epoch": 12.744206552588796, + "grad_norm": 1.1905360221862793, + "learning_rate": 3.084836983396445e-06, + "loss": 0.2397, "step": 925000 }, { - "epoch": 9.43, - "learning_rate": 1.9429448736342663e-05, - "loss": 0.3397, + "epoch": 12.745584304648535, + "grad_norm": 2.320091485977173, + "learning_rate": 3.0811494508778544e-06, + "loss": 0.1946, "step": 925100 }, { - "epoch": 9.43, - "learning_rate": 1.9423240824505124e-05, - "loss": 0.3346, + "epoch": 12.746962056708275, + "grad_norm": 0.17135082185268402, + "learning_rate": 3.0774639928655466e-06, + "loss": 0.2581, "step": 925200 }, { - "epoch": 9.43, - "learning_rate": 1.9417033429808005e-05, - "loss": 0.3326, + "epoch": 12.748339808768014, + "grad_norm": 0.7217452526092529, + "learning_rate": 3.0737806096726115e-06, + "loss": 0.247, "step": 925300 }, { - "epoch": 9.43, - "learning_rate": 1.9410826552554782e-05, - "loss": 0.2725, + "epoch": 12.749717560827753, + "grad_norm": 0.8664499521255493, + "learning_rate": 3.0700993016119753e-06, + "loss": 0.1523, "step": 925400 }, { - "epoch": 9.43, - "learning_rate": 1.9404620193048945e-05, - "loss": 0.2656, + "epoch": 12.751095312887493, + "grad_norm": 3.5013489723205566, + "learning_rate": 3.0664200689963927e-06, + "loss": 0.26, "step": 925500 }, { - "epoch": 9.43, - "learning_rate": 1.939841435159396e-05, - "loss": 0.3593, + "epoch": 12.752473064947232, + "grad_norm": 4.590202331542969, + "learning_rate": 3.0627429121384197e-06, + "loss": 0.224, "step": 925600 }, { - "epoch": 9.43, - "learning_rate": 1.939220902849324e-05, - "loss": 0.3433, + "epoch": 12.753850817006972, + "grad_norm": 0.34800752997398376, + "learning_rate": 3.059067831350464e-06, + "loss": 0.1885, "step": 925700 }, { - "epoch": 9.43, - "learning_rate": 1.9386004224050194e-05, - "loss": 0.3171, + "epoch": 12.75522856906671, + "grad_norm": 2.6511430740356445, + "learning_rate": 3.05539482694473e-06, + "loss": 0.2265, "step": 925800 }, { - "epoch": 9.43, - "learning_rate": 1.9379799938568204e-05, - "loss": 0.3885, + "epoch": 12.75660632112645, + "grad_norm": 6.315014839172363, + "learning_rate": 3.0517238992332643e-06, + "loss": 0.2249, "step": 925900 }, { - "epoch": 9.43, - "learning_rate": 1.9373596172350625e-05, - "loss": 0.3114, + "epoch": 12.75798407318619, + "grad_norm": 0.6136699318885803, + "learning_rate": 3.0480550485279278e-06, + "loss": 0.2069, "step": 926000 }, { - "epoch": 9.44, - "learning_rate": 1.9367392925700768e-05, - "loss": 0.3563, + "epoch": 12.759361825245929, + "grad_norm": 181.0465087890625, + "learning_rate": 3.044388275140412e-06, + "loss": 0.2215, "step": 926100 }, { - "epoch": 9.44, - "learning_rate": 1.9361190198921947e-05, - "loss": 0.3192, + "epoch": 12.760739577305667, + "grad_norm": 1.8276182413101196, + "learning_rate": 3.040723579382236e-06, + "loss": 0.2215, "step": 926200 }, { - "epoch": 9.44, - "learning_rate": 1.935498799231744e-05, - "loss": 0.2337, + "epoch": 12.762117329365408, + "grad_norm": 3.4303765296936035, + "learning_rate": 3.037060961564713e-06, + "loss": 0.2435, "step": 926300 }, { - "epoch": 9.44, - "learning_rate": 1.9348786306190486e-05, - "loss": 0.3357, + "epoch": 12.763495081425146, + "grad_norm": 2.020491600036621, + "learning_rate": 3.03340042199901e-06, + "loss": 0.1892, "step": 926400 }, { - "epoch": 9.44, - "learning_rate": 1.9342585140844322e-05, - "loss": 0.3395, + "epoch": 12.764872833484887, + "grad_norm": 7.699017524719238, + "learning_rate": 3.0297419609961206e-06, + "loss": 0.2643, "step": 926500 }, { - "epoch": 9.44, - "learning_rate": 1.9336384496582155e-05, - "loss": 0.2727, + "epoch": 12.766250585544626, + "grad_norm": 2.8943793773651123, + "learning_rate": 3.0260855788668337e-06, + "loss": 0.2231, "step": 926600 }, { - "epoch": 9.44, - "learning_rate": 1.9330184373707138e-05, - "loss": 0.2971, + "epoch": 12.767628337604364, + "grad_norm": 2.7637150287628174, + "learning_rate": 3.022431275921785e-06, + "loss": 0.2547, "step": 926700 }, { - "epoch": 9.44, - "learning_rate": 1.9323984772522433e-05, - "loss": 0.301, + "epoch": 12.769006089664105, + "grad_norm": 0.6149296760559082, + "learning_rate": 3.0187790524714327e-06, + "loss": 0.2699, "step": 926800 }, { - "epoch": 9.44, - "learning_rate": 1.9317785693331167e-05, - "loss": 0.2801, + "epoch": 12.770383841723843, + "grad_norm": 3.2291924953460693, + "learning_rate": 3.0151289088260443e-06, + "loss": 0.194, "step": 926900 }, { - "epoch": 9.44, - "learning_rate": 1.9311587136436423e-05, - "loss": 0.3746, + "epoch": 12.771761593783582, + "grad_norm": 1.6513943672180176, + "learning_rate": 3.011480845295712e-06, + "loss": 0.2346, "step": 927000 }, { - "epoch": 9.45, - "learning_rate": 1.9305389102141287e-05, - "loss": 0.2438, + "epoch": 12.773139345843322, + "grad_norm": 2.3989291191101074, + "learning_rate": 3.0078348621903633e-06, + "loss": 0.2043, "step": 927100 }, { - "epoch": 9.45, - "learning_rate": 1.9299191590748796e-05, - "loss": 0.3846, + "epoch": 12.774517097903061, + "grad_norm": 2.5076005458831787, + "learning_rate": 3.004190959819752e-06, + "loss": 0.2165, "step": 927200 }, { - "epoch": 9.45, - "learning_rate": 1.9292994602561993e-05, - "loss": 0.4086, + "epoch": 12.7758948499628, + "grad_norm": 1.2464557886123657, + "learning_rate": 3.000549138493433e-06, + "loss": 0.2054, "step": 927300 }, { - "epoch": 9.45, - "learning_rate": 1.9286798137883846e-05, - "loss": 0.3257, + "epoch": 12.77727260202254, + "grad_norm": 0.9744536280632019, + "learning_rate": 2.9969093985208093e-06, + "loss": 0.2291, "step": 927400 }, { - "epoch": 9.45, - "learning_rate": 1.9280602197017334e-05, - "loss": 0.3081, + "epoch": 12.778650354082279, + "grad_norm": 2.2464873790740967, + "learning_rate": 2.993271740211087e-06, + "loss": 0.2311, "step": 927500 }, { - "epoch": 9.45, - "learning_rate": 1.9274406780265415e-05, - "loss": 0.2826, + "epoch": 12.78002810614202, + "grad_norm": 2.430288791656494, + "learning_rate": 2.9896361638733004e-06, + "loss": 0.229, "step": 927600 }, { - "epoch": 9.45, - "learning_rate": 1.926821188793099e-05, - "loss": 0.3484, + "epoch": 12.781405858201758, + "grad_norm": 1.9284194707870483, + "learning_rate": 2.9860026698163167e-06, + "loss": 0.2161, "step": 927700 }, { - "epoch": 9.45, - "learning_rate": 1.9262017520316957e-05, - "loss": 0.322, + "epoch": 12.782783610261497, + "grad_norm": 2.662445545196533, + "learning_rate": 2.982371258348825e-06, + "loss": 0.213, "step": 927800 }, { - "epoch": 9.45, - "learning_rate": 1.9255823677726202e-05, - "loss": 0.3212, + "epoch": 12.784161362321237, + "grad_norm": 3.4701831340789795, + "learning_rate": 2.9787419297793257e-06, + "loss": 0.2138, "step": 927900 }, { - "epoch": 9.45, - "learning_rate": 1.9249630360461537e-05, - "loss": 0.3369, + "epoch": 12.785539114380976, + "grad_norm": 0.5050792098045349, + "learning_rate": 2.9751146844161422e-06, + "loss": 0.2462, "step": 928000 }, { - "epoch": 9.46, - "learning_rate": 1.9243437568825793e-05, - "loss": 0.2335, + "epoch": 12.786916866440716, + "grad_norm": 2.81074595451355, + "learning_rate": 2.971489522567436e-06, + "loss": 0.229, "step": 928100 }, { - "epoch": 9.46, - "learning_rate": 1.9237245303121768e-05, - "loss": 0.2942, + "epoch": 12.788294618500455, + "grad_norm": 3.9507157802581787, + "learning_rate": 2.967866444541188e-06, + "loss": 0.1931, "step": 928200 }, { - "epoch": 9.46, - "learning_rate": 1.9231053563652208e-05, - "loss": 0.3172, + "epoch": 12.789672370560194, + "grad_norm": 0.31623491644859314, + "learning_rate": 2.9642454506451874e-06, + "loss": 0.2449, "step": 928300 }, { - "epoch": 9.46, - "learning_rate": 1.9224862350719866e-05, - "loss": 0.3546, + "epoch": 12.791050122619934, + "grad_norm": 2.87270188331604, + "learning_rate": 2.9606265411870546e-06, + "loss": 0.254, "step": 928400 }, { - "epoch": 9.46, - "learning_rate": 1.9218671664627465e-05, - "loss": 0.3159, + "epoch": 12.792427874679673, + "grad_norm": 3.3676488399505615, + "learning_rate": 2.9570097164742566e-06, + "loss": 0.2609, "step": 928500 }, { - "epoch": 9.46, - "learning_rate": 1.921248150567767e-05, - "loss": 0.3635, + "epoch": 12.793805626739411, + "grad_norm": 1.3202564716339111, + "learning_rate": 2.953394976814031e-06, + "loss": 0.2308, "step": 928600 }, { - "epoch": 9.46, - "learning_rate": 1.9206291874173148e-05, - "loss": 0.3394, + "epoch": 12.795183378799152, + "grad_norm": 0.3617525100708008, + "learning_rate": 2.9497823225134804e-06, + "loss": 0.2109, "step": 928700 }, { - "epoch": 9.46, - "learning_rate": 1.920010277041655e-05, - "loss": 0.3483, + "epoch": 12.79656113085889, + "grad_norm": 1.5670219659805298, + "learning_rate": 2.946171753879526e-06, + "loss": 0.2324, "step": 928800 }, { - "epoch": 9.46, - "learning_rate": 1.919397607785268e-05, - "loss": 0.3327, + "epoch": 12.79793888291863, + "grad_norm": 5.255171298980713, + "learning_rate": 2.94256327121889e-06, + "loss": 0.252, "step": 928900 }, { - "epoch": 9.46, - "learning_rate": 1.9187788025214687e-05, - "loss": 0.315, + "epoch": 12.79931663497837, + "grad_norm": 1.8571407794952393, + "learning_rate": 2.938992928473866e-06, + "loss": 0.2673, "step": 929000 }, { - "epoch": 9.47, - "learning_rate": 1.9181662373851353e-05, - "loss": 0.3735, + "epoch": 12.800694387038108, + "grad_norm": 2.7903871536254883, + "learning_rate": 2.9354246307890307e-06, + "loss": 0.2503, "step": 929100 }, { - "epoch": 9.47, - "learning_rate": 1.917547537353013e-05, - "loss": 0.2762, + "epoch": 12.802072139097849, + "grad_norm": 1.9524593353271484, + "learning_rate": 2.9318223661461725e-06, + "loss": 0.2133, "step": 929200 }, { - "epoch": 9.47, - "learning_rate": 1.9169288902463553e-05, - "loss": 0.3147, + "epoch": 12.803449891157587, + "grad_norm": 3.577744722366333, + "learning_rate": 2.9282221886956976e-06, + "loss": 0.2361, "step": 929300 }, { - "epoch": 9.47, - "learning_rate": 1.916310296095411e-05, - "loss": 0.3477, + "epoch": 12.804827643217326, + "grad_norm": 2.7197766304016113, + "learning_rate": 2.9246240987434517e-06, + "loss": 0.2062, "step": 929400 }, { - "epoch": 9.47, - "learning_rate": 1.915691754930427e-05, - "loss": 0.3936, + "epoch": 12.806205395277066, + "grad_norm": 0.5694272518157959, + "learning_rate": 2.921028096595122e-06, + "loss": 0.2517, "step": 929500 }, { - "epoch": 9.47, - "learning_rate": 1.9150732667816453e-05, - "loss": 0.3668, + "epoch": 12.807583147336805, + "grad_norm": 3.6552529335021973, + "learning_rate": 2.917434182556193e-06, + "loss": 0.195, "step": 929600 }, { - "epoch": 9.47, - "learning_rate": 1.9144548316793057e-05, - "loss": 0.3583, + "epoch": 12.808960899396544, + "grad_norm": 1.040052890777588, + "learning_rate": 2.9138423569319983e-06, + "loss": 0.2327, "step": 929700 }, { - "epoch": 9.47, - "learning_rate": 1.9138364496536478e-05, - "loss": 0.2812, + "epoch": 12.810338651456284, + "grad_norm": 1.8768765926361084, + "learning_rate": 2.9102526200276742e-06, + "loss": 0.2158, "step": 929800 }, { - "epoch": 9.47, - "learning_rate": 1.9132181207349052e-05, - "loss": 0.3011, + "epoch": 12.811716403516023, + "grad_norm": 0.9721648097038269, + "learning_rate": 2.9066649721481914e-06, + "loss": 0.2361, "step": 929900 }, { - "epoch": 9.48, - "learning_rate": 1.9125998449533098e-05, - "loss": 0.3456, + "epoch": 12.813094155575763, + "grad_norm": 0.9108976721763611, + "learning_rate": 2.9030794135983415e-06, + "loss": 0.1929, "step": 930000 }, { - "epoch": 9.48, - "learning_rate": 1.9119816223390943e-05, - "loss": 0.3027, + "epoch": 12.814471907635502, + "grad_norm": 1.3948016166687012, + "learning_rate": 2.89949594468273e-06, + "loss": 0.1867, "step": 930100 }, { - "epoch": 9.48, - "learning_rate": 1.911363452922483e-05, - "loss": 0.3307, + "epoch": 12.81584965969524, + "grad_norm": 4.907284736633301, + "learning_rate": 2.895914565705793e-06, + "loss": 0.1979, "step": 930200 }, { - "epoch": 9.48, - "learning_rate": 1.910745336733703e-05, - "loss": 0.3546, + "epoch": 12.817227411754981, + "grad_norm": 0.08066895604133606, + "learning_rate": 2.892335276971788e-06, + "loss": 0.2316, "step": 930300 }, { - "epoch": 9.48, - "learning_rate": 1.9101272738029754e-05, - "loss": 0.2646, + "epoch": 12.81860516381472, + "grad_norm": 2.7154369354248047, + "learning_rate": 2.88875807878479e-06, + "loss": 0.2536, "step": 930400 }, { - "epoch": 9.48, - "learning_rate": 1.9095092641605212e-05, - "loss": 0.3663, + "epoch": 12.819982915874458, + "grad_norm": 1.7118443250656128, + "learning_rate": 2.8852187121713565e-06, + "loss": 0.1956, "step": 930500 }, { - "epoch": 9.48, - "learning_rate": 1.9088913078365552e-05, - "loss": 0.3289, + "epoch": 12.821360667934199, + "grad_norm": 0.07313413172960281, + "learning_rate": 2.8816456750768617e-06, + "loss": 0.1884, "step": 930600 }, { - "epoch": 9.48, - "learning_rate": 1.9082734048612928e-05, - "loss": 0.3208, + "epoch": 12.822738419993938, + "grad_norm": 1.630340337753296, + "learning_rate": 2.8780747294375075e-06, + "loss": 0.1979, "step": 930700 }, { - "epoch": 9.48, - "learning_rate": 1.9076555552649462e-05, - "loss": 0.3703, + "epoch": 12.824116172053678, + "grad_norm": 5.865341663360596, + "learning_rate": 2.874505875556675e-06, + "loss": 0.2635, "step": 930800 }, { - "epoch": 9.48, - "learning_rate": 1.907037759077724e-05, - "loss": 0.3086, + "epoch": 12.825493924113417, + "grad_norm": 1.1222562789916992, + "learning_rate": 2.8709391137375482e-06, + "loss": 0.1927, "step": 930900 }, { - "epoch": 9.49, - "learning_rate": 1.906420016329833e-05, - "loss": 0.3045, + "epoch": 12.826871676173155, + "grad_norm": 2.5637333393096924, + "learning_rate": 2.867374444283142e-06, + "loss": 0.2584, "step": 931000 }, { - "epoch": 9.49, - "learning_rate": 1.9058023270514774e-05, - "loss": 0.3831, + "epoch": 12.828249428232896, + "grad_norm": 0.18940003216266632, + "learning_rate": 2.8638118674963037e-06, + "loss": 0.2199, "step": 931100 }, { - "epoch": 9.49, - "learning_rate": 1.905184691272858e-05, - "loss": 0.3018, + "epoch": 12.829627180292634, + "grad_norm": 2.914417266845703, + "learning_rate": 2.8602513836796854e-06, + "loss": 0.2029, "step": 931200 }, { - "epoch": 9.49, - "learning_rate": 1.9045671090241732e-05, - "loss": 0.3416, + "epoch": 12.831004932352373, + "grad_norm": 5.160257816314697, + "learning_rate": 2.8566929931357624e-06, + "loss": 0.2475, "step": 931300 }, { - "epoch": 9.49, - "learning_rate": 1.90394958033562e-05, - "loss": 0.3218, + "epoch": 12.832382684412114, + "grad_norm": 3.4218485355377197, + "learning_rate": 2.853136696166848e-06, + "loss": 0.2021, "step": 931400 }, { - "epoch": 9.49, - "learning_rate": 1.903332105237391e-05, - "loss": 0.2794, + "epoch": 12.833760436471852, + "grad_norm": 1.1191118955612183, + "learning_rate": 2.8495824930750595e-06, + "loss": 0.207, "step": 931500 }, { - "epoch": 9.49, - "learning_rate": 1.902714683759678e-05, - "loss": 0.325, + "epoch": 12.835138188531591, + "grad_norm": 1.8268364667892456, + "learning_rate": 2.84603038416235e-06, + "loss": 0.2199, "step": 931600 }, { - "epoch": 9.49, - "learning_rate": 1.9020973159326693e-05, - "loss": 0.3508, + "epoch": 12.836515940591331, + "grad_norm": 0.5255799293518066, + "learning_rate": 2.8424803697304924e-06, + "loss": 0.2533, "step": 931700 }, { - "epoch": 9.49, - "learning_rate": 1.9014800017865486e-05, - "loss": 0.2985, + "epoch": 12.83789369265107, + "grad_norm": 7.88160514831543, + "learning_rate": 2.838932450081073e-06, + "loss": 0.2927, "step": 931800 }, { - "epoch": 9.49, - "learning_rate": 1.9008627413515003e-05, - "loss": 0.3053, + "epoch": 12.83927144471081, + "grad_norm": 1.5226892232894897, + "learning_rate": 2.8353866255155047e-06, + "loss": 0.2365, "step": 931900 }, { - "epoch": 9.5, - "learning_rate": 1.900245534657705e-05, - "loss": 0.3645, + "epoch": 12.84064919677055, + "grad_norm": 5.4155988693237305, + "learning_rate": 2.831842896335029e-06, + "loss": 0.2479, "step": 932000 }, { - "epoch": 9.5, - "learning_rate": 1.8996283817353407e-05, - "loss": 0.3499, + "epoch": 12.842026948830288, + "grad_norm": 0.7205844521522522, + "learning_rate": 2.8283012628406974e-06, + "loss": 0.2546, "step": 932100 }, { - "epoch": 9.5, - "learning_rate": 1.8990112826145805e-05, - "loss": 0.3113, + "epoch": 12.843404700890028, + "grad_norm": 2.0160913467407227, + "learning_rate": 2.8247617253333918e-06, + "loss": 0.2207, "step": 932200 }, { - "epoch": 9.5, - "learning_rate": 1.898394237325599e-05, - "loss": 0.3031, + "epoch": 12.844782452949767, + "grad_norm": 4.22540807723999, + "learning_rate": 2.8212242841138194e-06, + "loss": 0.2313, "step": 932300 }, { - "epoch": 9.5, - "learning_rate": 1.897777245898566e-05, - "loss": 0.2854, + "epoch": 12.846160205009507, + "grad_norm": 0.6966512203216553, + "learning_rate": 2.8176889394825012e-06, + "loss": 0.2136, "step": 932400 }, { - "epoch": 9.5, - "learning_rate": 1.8971603083636468e-05, - "loss": 0.3474, + "epoch": 12.847537957069246, + "grad_norm": 2.6707377433776855, + "learning_rate": 2.814155691739775e-06, + "loss": 0.2458, "step": 932500 }, { - "epoch": 9.5, - "learning_rate": 1.8965434247510074e-05, - "loss": 0.31, + "epoch": 12.848915709128985, + "grad_norm": 1.7527315616607666, + "learning_rate": 2.8106245411858164e-06, + "loss": 0.2839, "step": 932600 }, { - "epoch": 9.5, - "learning_rate": 1.8959265950908102e-05, - "loss": 0.326, + "epoch": 12.850293461188725, + "grad_norm": 2.891397476196289, + "learning_rate": 2.8070954881206187e-06, + "loss": 0.2075, "step": 932700 }, { - "epoch": 9.5, - "learning_rate": 1.8953098194132123e-05, - "loss": 0.3732, + "epoch": 12.851671213248464, + "grad_norm": 1.4294129610061646, + "learning_rate": 2.8035685328439852e-06, + "loss": 0.1966, "step": 932800 }, { - "epoch": 9.5, - "learning_rate": 1.8946930977483724e-05, - "loss": 0.3472, + "epoch": 12.853048965308203, + "grad_norm": 1.9177244901657104, + "learning_rate": 2.8000436756555455e-06, + "loss": 0.228, "step": 932900 }, { - "epoch": 9.51, - "learning_rate": 1.894076430126445e-05, - "loss": 0.3234, + "epoch": 12.854426717367943, + "grad_norm": 0.9147353768348694, + "learning_rate": 2.7965209168547675e-06, + "loss": 0.2048, "step": 933000 }, { - "epoch": 9.51, - "learning_rate": 1.893459816577579e-05, - "loss": 0.3579, + "epoch": 12.855804469427682, + "grad_norm": 1.4930061101913452, + "learning_rate": 2.7930002567409135e-06, + "loss": 0.2296, "step": 933100 }, { - "epoch": 9.51, - "learning_rate": 1.8928432571319248e-05, - "loss": 0.3296, + "epoch": 12.85718222148742, + "grad_norm": 0.21898534893989563, + "learning_rate": 2.7894816956130858e-06, + "loss": 0.2954, "step": 933200 }, { - "epoch": 9.51, - "learning_rate": 1.8922267518196288e-05, - "loss": 0.3194, + "epoch": 12.85855997354716, + "grad_norm": 1.4664571285247803, + "learning_rate": 2.785965233770214e-06, + "loss": 0.2613, "step": 933300 }, { - "epoch": 9.51, - "learning_rate": 1.891610300670833e-05, - "loss": 0.3488, + "epoch": 12.8599377256069, + "grad_norm": 1.0566760301589966, + "learning_rate": 2.78245087151103e-06, + "loss": 0.2428, "step": 933400 }, { - "epoch": 9.51, - "learning_rate": 1.890993903715679e-05, - "loss": 0.3599, + "epoch": 12.86131547766664, + "grad_norm": 2.049891710281372, + "learning_rate": 2.778938609134095e-06, + "loss": 0.1929, "step": 933500 }, { - "epoch": 9.51, - "learning_rate": 1.8903775609843063e-05, - "loss": 0.3465, + "epoch": 12.862693229726379, + "grad_norm": 3.200713872909546, + "learning_rate": 2.775428446937796e-06, + "loss": 0.2296, "step": 933600 }, { - "epoch": 9.51, - "learning_rate": 1.8897612725068473e-05, - "loss": 0.2989, + "epoch": 12.864070981786117, + "grad_norm": 0.13515597581863403, + "learning_rate": 2.7719203852203487e-06, + "loss": 0.2412, "step": 933700 }, { - "epoch": 9.51, - "learning_rate": 1.8891450383134364e-05, - "loss": 0.371, + "epoch": 12.865448733845858, + "grad_norm": 5.488831996917725, + "learning_rate": 2.7684144242797642e-06, + "loss": 0.2238, "step": 933800 }, { - "epoch": 9.51, - "learning_rate": 1.8885288584342044e-05, - "loss": 0.3601, + "epoch": 12.866826485905596, + "grad_norm": 4.567749500274658, + "learning_rate": 2.7649105644139016e-06, + "loss": 0.2156, "step": 933900 }, { - "epoch": 9.52, - "learning_rate": 1.8879127328992788e-05, - "loss": 0.3255, + "epoch": 12.868204237965335, + "grad_norm": 1.842706561088562, + "learning_rate": 2.7614088059204394e-06, + "loss": 0.2435, "step": 934000 }, { - "epoch": 9.52, - "learning_rate": 1.8872966617387832e-05, - "loss": 0.3259, + "epoch": 12.869581990025075, + "grad_norm": 3.7605953216552734, + "learning_rate": 2.757909149096851e-06, + "loss": 0.2333, "step": 934100 }, { - "epoch": 9.52, - "learning_rate": 1.886680644982841e-05, - "loss": 0.3435, + "epoch": 12.870959742084814, + "grad_norm": 1.7115031480789185, + "learning_rate": 2.7544115942404614e-06, + "loss": 0.2246, "step": 934200 }, { - "epoch": 9.52, - "learning_rate": 1.8860646826615722e-05, - "loss": 0.3123, + "epoch": 12.872337494144555, + "grad_norm": 0.9463909864425659, + "learning_rate": 2.750916141648409e-06, + "loss": 0.1825, "step": 934300 }, { - "epoch": 9.52, - "learning_rate": 1.8854487748050918e-05, - "loss": 0.3671, + "epoch": 12.873715246204293, + "grad_norm": 3.504456043243408, + "learning_rate": 2.7474227916176386e-06, + "loss": 0.1957, "step": 934400 }, { - "epoch": 9.52, - "learning_rate": 1.884832921443515e-05, - "loss": 0.3207, + "epoch": 12.875092998264032, + "grad_norm": 3.7877490520477295, + "learning_rate": 2.743931544444944e-06, + "loss": 0.2352, "step": 934500 }, { - "epoch": 9.52, - "learning_rate": 1.8842171226069534e-05, - "loss": 0.3347, + "epoch": 12.876470750323772, + "grad_norm": 5.8775315284729, + "learning_rate": 2.740442400426908e-06, + "loss": 0.2593, "step": 934600 }, { - "epoch": 9.52, - "learning_rate": 1.8836013783255157e-05, - "loss": 0.3057, + "epoch": 12.877848502383511, + "grad_norm": 1.6690183877944946, + "learning_rate": 2.736955359859966e-06, + "loss": 0.2463, "step": 934700 }, { - "epoch": 9.52, - "learning_rate": 1.8829856886293087e-05, - "loss": 0.3647, + "epoch": 12.87922625444325, + "grad_norm": 5.3186726570129395, + "learning_rate": 2.7334704230403464e-06, + "loss": 0.2403, "step": 934800 }, { - "epoch": 9.52, - "learning_rate": 1.882370053548436e-05, - "loss": 0.2993, + "epoch": 12.88060400650299, + "grad_norm": 2.5305628776550293, + "learning_rate": 2.7299875902641216e-06, + "loss": 0.2154, "step": 934900 }, { - "epoch": 9.53, - "learning_rate": 1.881754473112997e-05, - "loss": 0.3453, + "epoch": 12.881981758562729, + "grad_norm": 1.443049669265747, + "learning_rate": 2.7265068618271754e-06, + "loss": 0.2262, "step": 935000 }, { - "epoch": 9.53, - "learning_rate": 1.8811389473530914e-05, - "loss": 0.381, + "epoch": 12.88335951062247, + "grad_norm": 1.4582191705703735, + "learning_rate": 2.7230630138443222e-06, + "loss": 0.1933, "step": 935100 }, { - "epoch": 9.53, - "learning_rate": 1.8805296307384655e-05, - "loss": 0.3264, + "epoch": 12.884737262682208, + "grad_norm": 4.619357585906982, + "learning_rate": 2.7195864739220946e-06, + "loss": 0.2521, "step": 935200 }, { - "epoch": 9.53, - "learning_rate": 1.8799142138724033e-05, - "loss": 0.339, + "epoch": 12.886115014741947, + "grad_norm": 3.4242618083953857, + "learning_rate": 2.7161120392227746e-06, + "loss": 0.2651, "step": 935300 }, { - "epoch": 9.53, - "learning_rate": 1.879298851771852e-05, - "loss": 0.3389, + "epoch": 12.887492766801687, + "grad_norm": 0.3394777476787567, + "learning_rate": 2.7126397100415338e-06, + "loss": 0.2234, "step": 935400 }, { - "epoch": 9.53, - "learning_rate": 1.878683544466898e-05, - "loss": 0.3199, + "epoch": 12.888870518861426, + "grad_norm": 1.9548254013061523, + "learning_rate": 2.709169486673358e-06, + "loss": 0.2577, "step": 935500 }, { - "epoch": 9.53, - "learning_rate": 1.878068291987628e-05, - "loss": 0.2764, + "epoch": 12.890248270921164, + "grad_norm": 3.070148229598999, + "learning_rate": 2.705701369413068e-06, + "loss": 0.2371, "step": 935600 }, { - "epoch": 9.53, - "learning_rate": 1.8774530943641236e-05, - "loss": 0.2999, + "epoch": 12.891626022980905, + "grad_norm": 3.368596076965332, + "learning_rate": 2.7022353585552863e-06, + "loss": 0.2194, "step": 935700 }, { - "epoch": 9.53, - "learning_rate": 1.876837951626463e-05, - "loss": 0.283, + "epoch": 12.893003775040643, + "grad_norm": 0.44243016839027405, + "learning_rate": 2.6987714543944826e-06, + "loss": 0.1999, "step": 935800 }, { - "epoch": 9.54, - "learning_rate": 1.8762228638047237e-05, - "loss": 0.3211, + "epoch": 12.894381527100382, + "grad_norm": 1.993676781654358, + "learning_rate": 2.695309657224919e-06, + "loss": 0.2388, "step": 935900 }, { - "epoch": 9.54, - "learning_rate": 1.8756078309289806e-05, - "loss": 0.3113, + "epoch": 12.895759279160123, + "grad_norm": 2.328325033187866, + "learning_rate": 2.691849967340702e-06, + "loss": 0.2731, "step": 936000 }, { - "epoch": 9.54, - "learning_rate": 1.8749928530293042e-05, - "loss": 0.3389, + "epoch": 12.897137031219861, + "grad_norm": 0.3272148072719574, + "learning_rate": 2.6883923850357547e-06, + "loss": 0.2333, "step": 936100 }, { - "epoch": 9.54, - "learning_rate": 1.8743779301357634e-05, - "loss": 0.3582, + "epoch": 12.898514783279602, + "grad_norm": 2.8061811923980713, + "learning_rate": 2.684936910603805e-06, + "loss": 0.1965, "step": 936200 }, { - "epoch": 9.54, - "learning_rate": 1.8737692106844705e-05, - "loss": 0.3761, + "epoch": 12.89989253533934, + "grad_norm": 1.714197039604187, + "learning_rate": 2.6814835443384156e-06, + "loss": 0.2244, "step": 936300 }, { - "epoch": 9.54, - "learning_rate": 1.8731543973425855e-05, - "loss": 0.3622, + "epoch": 12.901270287399079, + "grad_norm": 0.4128607511520386, + "learning_rate": 2.6780322865329756e-06, + "loss": 0.2103, "step": 936400 }, { - "epoch": 9.54, - "learning_rate": 1.872539639096726e-05, - "loss": 0.3108, + "epoch": 12.90264803945882, + "grad_norm": 1.9874459505081177, + "learning_rate": 2.6745831374806725e-06, + "loss": 0.2009, "step": 936500 }, { - "epoch": 9.54, - "learning_rate": 1.87192493597695e-05, - "loss": 0.3457, + "epoch": 12.904025791518558, + "grad_norm": 3.0516269207000732, + "learning_rate": 2.671136097474541e-06, + "loss": 0.2346, "step": 936600 }, { - "epoch": 9.54, - "learning_rate": 1.871310288013312e-05, - "loss": 0.2999, + "epoch": 12.905403543578299, + "grad_norm": 2.2331130504608154, + "learning_rate": 2.6676911668074268e-06, + "loss": 0.2312, "step": 936700 }, { - "epoch": 9.54, - "learning_rate": 1.8706956952358654e-05, - "loss": 0.2616, + "epoch": 12.906781295638037, + "grad_norm": 59.183021545410156, + "learning_rate": 2.6642483457719856e-06, + "loss": 0.2966, "step": 936800 }, { - "epoch": 9.55, - "learning_rate": 1.870081157674661e-05, - "loss": 0.2915, + "epoch": 12.908159047697776, + "grad_norm": 2.3145411014556885, + "learning_rate": 2.660807634660706e-06, + "loss": 0.2227, "step": 936900 }, { - "epoch": 9.55, - "learning_rate": 1.869466675359744e-05, - "loss": 0.3232, + "epoch": 12.909536799757516, + "grad_norm": 4.065418720245361, + "learning_rate": 2.6573690337658897e-06, + "loss": 0.241, "step": 937000 }, { - "epoch": 9.55, - "learning_rate": 1.8688522483211598e-05, - "loss": 0.2869, + "epoch": 12.910914551817255, + "grad_norm": 0.19847795367240906, + "learning_rate": 2.653932543379677e-06, + "loss": 0.2636, "step": 937100 }, { - "epoch": 9.55, - "learning_rate": 1.8682378765889507e-05, - "loss": 0.334, + "epoch": 12.912292303876994, + "grad_norm": 1.0193102359771729, + "learning_rate": 2.6504981637939978e-06, + "loss": 0.223, "step": 937200 }, { - "epoch": 9.55, - "learning_rate": 1.867623560193155e-05, - "loss": 0.3291, + "epoch": 12.913670055936734, + "grad_norm": 1.5910488367080688, + "learning_rate": 2.647065895300638e-06, + "loss": 0.2265, "step": 937300 }, { - "epoch": 9.55, - "learning_rate": 1.8670092991638098e-05, - "loss": 0.3268, + "epoch": 12.915047807996473, + "grad_norm": 3.2115275859832764, + "learning_rate": 2.6436357381911726e-06, + "loss": 0.2152, "step": 937400 }, { - "epoch": 9.55, - "learning_rate": 1.8663950935309494e-05, - "loss": 0.3465, + "epoch": 12.916425560056211, + "grad_norm": 2.2084672451019287, + "learning_rate": 2.640207692757016e-06, + "loss": 0.2171, "step": 937500 }, { - "epoch": 9.55, - "learning_rate": 1.865780943324603e-05, - "loss": 0.3628, + "epoch": 12.917803312115952, + "grad_norm": 1.376097321510315, + "learning_rate": 2.636781759289394e-06, + "loss": 0.2162, "step": 937600 }, { - "epoch": 9.55, - "learning_rate": 1.8651668485747993e-05, - "loss": 0.3472, + "epoch": 12.91918106417569, + "grad_norm": 3.4237771034240723, + "learning_rate": 2.6333579380793646e-06, + "loss": 0.2232, "step": 937700 }, { - "epoch": 9.55, - "learning_rate": 1.8645528093115656e-05, - "loss": 0.2972, + "epoch": 12.920558816235431, + "grad_norm": 5.032687187194824, + "learning_rate": 2.629936229417797e-06, + "loss": 0.2732, "step": 937800 }, { - "epoch": 9.56, - "learning_rate": 1.863938825564923e-05, - "loss": 0.3047, + "epoch": 12.92193656829517, + "grad_norm": 0.3038492798805237, + "learning_rate": 2.6265166335953785e-06, + "loss": 0.2202, "step": 937900 }, { - "epoch": 9.56, - "learning_rate": 1.8633248973648917e-05, - "loss": 0.3088, + "epoch": 12.923314320354908, + "grad_norm": 0.8990733623504639, + "learning_rate": 2.6230991509026246e-06, + "loss": 0.2226, "step": 938000 }, { - "epoch": 9.56, - "learning_rate": 1.8627110247414913e-05, - "loss": 0.3036, + "epoch": 12.924692072414649, + "grad_norm": 2.5408823490142822, + "learning_rate": 2.619683781629865e-06, + "loss": 0.2241, "step": 938100 }, { - "epoch": 9.56, - "learning_rate": 1.8620972077247328e-05, - "loss": 0.3116, + "epoch": 12.926069824474387, + "grad_norm": 2.3964927196502686, + "learning_rate": 2.616270526067253e-06, + "loss": 0.2076, "step": 938200 }, { - "epoch": 9.56, - "learning_rate": 1.8614834463446308e-05, - "loss": 0.3692, + "epoch": 12.927447576534126, + "grad_norm": 2.4010443687438965, + "learning_rate": 2.61285938450477e-06, + "loss": 0.2193, "step": 938300 }, { - "epoch": 9.56, - "learning_rate": 1.8608697406311942e-05, - "loss": 0.2802, + "epoch": 12.928825328593867, + "grad_norm": 1.8050990104675293, + "learning_rate": 2.6094503572322047e-06, + "loss": 0.2464, "step": 938400 }, { - "epoch": 9.56, - "learning_rate": 1.8602560906144288e-05, - "loss": 0.2593, + "epoch": 12.930203080653605, + "grad_norm": 1.831526517868042, + "learning_rate": 2.606043444539167e-06, + "loss": 0.25, "step": 938500 }, { - "epoch": 9.56, - "learning_rate": 1.8596424963243394e-05, - "loss": 0.3279, + "epoch": 12.931580832713346, + "grad_norm": 0.004637610632926226, + "learning_rate": 2.6026386467150916e-06, + "loss": 0.2155, "step": 938600 }, { - "epoch": 9.56, - "learning_rate": 1.8590289577909262e-05, - "loss": 0.3254, + "epoch": 12.932958584773084, + "grad_norm": 3.2540037631988525, + "learning_rate": 2.599235964049246e-06, + "loss": 0.2097, "step": 938700 }, { - "epoch": 9.56, - "learning_rate": 1.8584154750441892e-05, - "loss": 0.3673, + "epoch": 12.934336336832823, + "grad_norm": 1.7191284894943237, + "learning_rate": 2.5958353968306897e-06, + "loss": 0.2256, "step": 938800 }, { - "epoch": 9.57, - "learning_rate": 1.8578020481141215e-05, - "loss": 0.3439, + "epoch": 12.935714088892563, + "grad_norm": 3.144895553588867, + "learning_rate": 2.5924369453483296e-06, + "loss": 0.2248, "step": 938900 }, { - "epoch": 9.57, - "learning_rate": 1.8571886770307177e-05, - "loss": 0.2918, + "epoch": 12.937091840952302, + "grad_norm": 4.147371768951416, + "learning_rate": 2.5890406098908774e-06, + "loss": 0.2142, "step": 939000 }, { - "epoch": 9.57, - "learning_rate": 1.8565753618239684e-05, - "loss": 0.2621, + "epoch": 12.93846959301204, + "grad_norm": 1.9305498600006104, + "learning_rate": 2.5856463907468673e-06, + "loss": 0.2392, "step": 939100 }, { - "epoch": 9.57, - "learning_rate": 1.8559621025238595e-05, - "loss": 0.3566, + "epoch": 12.939847345071781, + "grad_norm": 0.2543761134147644, + "learning_rate": 2.5822542882046538e-06, + "loss": 0.1918, "step": 939200 }, { - "epoch": 9.57, - "learning_rate": 1.8553550309170273e-05, - "loss": 0.3257, + "epoch": 12.94122509713152, + "grad_norm": 1.9301526546478271, + "learning_rate": 2.5788643025524233e-06, + "loss": 0.2186, "step": 939300 }, { - "epoch": 9.57, - "learning_rate": 1.8547418829603375e-05, - "loss": 0.318, + "epoch": 12.94260284919126, + "grad_norm": 0.950579047203064, + "learning_rate": 2.575476434078161e-06, + "loss": 0.2142, "step": 939400 }, { - "epoch": 9.57, - "learning_rate": 1.8541287909999353e-05, - "loss": 0.3259, + "epoch": 12.943980601250999, + "grad_norm": 0.687894344329834, + "learning_rate": 2.5720906830696927e-06, + "loss": 0.2301, "step": 939500 }, { - "epoch": 9.57, - "learning_rate": 1.8535157550657983e-05, - "loss": 0.3354, + "epoch": 12.945358353310738, + "grad_norm": 5.440469741821289, + "learning_rate": 2.568707049814646e-06, + "loss": 0.1948, "step": 939600 }, { - "epoch": 9.57, - "learning_rate": 1.8529027751878976e-05, - "loss": 0.3221, + "epoch": 12.946736105370478, + "grad_norm": 3.061732053756714, + "learning_rate": 2.565393144146111e-06, + "loss": 0.2262, "step": 939700 }, { - "epoch": 9.57, - "learning_rate": 1.852289851396206e-05, - "loss": 0.3656, + "epoch": 12.948113857430217, + "grad_norm": 1.799528956413269, + "learning_rate": 2.5620137048907313e-06, + "loss": 0.2535, "step": 939800 }, { - "epoch": 9.58, - "learning_rate": 1.851676983720692e-05, - "loss": 0.3728, + "epoch": 12.949491609489955, + "grad_norm": 2.2766990661621094, + "learning_rate": 2.558636384244876e-06, + "loss": 0.2397, "step": 939900 }, { - "epoch": 9.58, - "learning_rate": 1.8510641721913195e-05, - "loss": 0.3119, + "epoch": 12.950869361549696, + "grad_norm": 3.5763628482818604, + "learning_rate": 2.555261182495458e-06, + "loss": 0.1948, "step": 940000 }, { - "epoch": 9.58, - "learning_rate": 1.850451416838053e-05, - "loss": 0.3523, + "epoch": 12.952247113609435, + "grad_norm": 3.525674343109131, + "learning_rate": 2.551888099929213e-06, + "loss": 0.227, "step": 940100 }, { - "epoch": 9.58, - "learning_rate": 1.8498387176908537e-05, - "loss": 0.3259, + "epoch": 12.953624865669173, + "grad_norm": 1.8106833696365356, + "learning_rate": 2.5485171368327127e-06, + "loss": 0.2047, "step": 940200 }, { - "epoch": 9.58, - "learning_rate": 1.8492260747796752e-05, - "loss": 0.3156, + "epoch": 12.955002617728914, + "grad_norm": 10.527534484863281, + "learning_rate": 2.5451482934923304e-06, + "loss": 0.2699, "step": 940300 }, { - "epoch": 9.58, - "learning_rate": 1.848613488134475e-05, - "loss": 0.26, + "epoch": 12.956380369788652, + "grad_norm": 6.805759429931641, + "learning_rate": 2.5417815701942643e-06, + "loss": 0.2258, "step": 940400 }, { - "epoch": 9.58, - "learning_rate": 1.848000957785204e-05, - "loss": 0.2989, + "epoch": 12.957758121848393, + "grad_norm": 5.308080673217773, + "learning_rate": 2.538416967224549e-06, + "loss": 0.2215, "step": 940500 }, { - "epoch": 9.58, - "learning_rate": 1.8473884837618114e-05, - "loss": 0.2945, + "epoch": 12.959135873908131, + "grad_norm": 1.2921286821365356, + "learning_rate": 2.5350544848690128e-06, + "loss": 0.2433, "step": 940600 }, { - "epoch": 9.58, - "learning_rate": 1.846776066094243e-05, - "loss": 0.3647, + "epoch": 12.96051362596787, + "grad_norm": 5.0306396484375, + "learning_rate": 2.531694123413312e-06, + "loss": 0.2281, "step": 940700 }, { - "epoch": 9.59, - "learning_rate": 1.8461637048124432e-05, - "loss": 0.2589, + "epoch": 12.96189137802761, + "grad_norm": 1.7793551683425903, + "learning_rate": 2.528335883142937e-06, + "loss": 0.1787, "step": 940800 }, { - "epoch": 9.59, - "learning_rate": 1.8455513999463535e-05, - "loss": 0.2857, + "epoch": 12.96326913008735, + "grad_norm": 0.9906152486801147, + "learning_rate": 2.52497976434318e-06, + "loss": 0.1993, "step": 940900 }, { - "epoch": 9.59, - "learning_rate": 1.84493915152591e-05, - "loss": 0.3096, + "epoch": 12.96464688214709, + "grad_norm": 2.697572946548462, + "learning_rate": 2.5216257672991616e-06, + "loss": 0.1932, "step": 941000 }, { - "epoch": 9.59, - "learning_rate": 1.844326959581049e-05, - "loss": 0.3388, + "epoch": 12.966024634206828, + "grad_norm": 0.06797800213098526, + "learning_rate": 2.5182738922958295e-06, + "loss": 0.2159, "step": 941100 }, { - "epoch": 9.59, - "learning_rate": 1.843714824141703e-05, - "loss": 0.345, + "epoch": 12.967402386266567, + "grad_norm": 1.9671000242233276, + "learning_rate": 2.5149241396179345e-06, + "loss": 0.2189, "step": 941200 }, { - "epoch": 9.59, - "learning_rate": 1.8431027452378018e-05, - "loss": 0.2894, + "epoch": 12.968780138326307, + "grad_norm": 1.4161914587020874, + "learning_rate": 2.5115765095500522e-06, + "loss": 0.2081, "step": 941300 }, { - "epoch": 9.59, - "learning_rate": 1.842490722899272e-05, - "loss": 0.3153, + "epoch": 12.970157890386046, + "grad_norm": 1.1342270374298096, + "learning_rate": 2.5082310023765853e-06, + "loss": 0.2168, "step": 941400 }, { - "epoch": 9.59, - "learning_rate": 1.8418787571560396e-05, - "loss": 0.3543, + "epoch": 12.971535642445785, + "grad_norm": 0.3216767907142639, + "learning_rate": 2.504887618381755e-06, + "loss": 0.2311, "step": 941500 }, { - "epoch": 9.59, - "learning_rate": 1.8412668480380235e-05, - "loss": 0.3022, + "epoch": 12.972913394505525, + "grad_norm": 2.3021531105041504, + "learning_rate": 2.501546357849595e-06, + "loss": 0.2276, "step": 941600 }, { - "epoch": 9.59, - "learning_rate": 1.840654995575143e-05, - "loss": 0.3695, + "epoch": 12.974291146565264, + "grad_norm": 4.159495830535889, + "learning_rate": 2.4982072210639653e-06, + "loss": 0.1994, "step": 941700 }, { - "epoch": 9.6, - "learning_rate": 1.840043199797316e-05, - "loss": 0.2953, + "epoch": 12.975668898625003, + "grad_norm": 3.3270316123962402, + "learning_rate": 2.49487020830854e-06, + "loss": 0.2704, "step": 941800 }, { - "epoch": 9.6, - "learning_rate": 1.8394375778442438e-05, - "loss": 0.3445, + "epoch": 12.977046650684743, + "grad_norm": 2.290621519088745, + "learning_rate": 2.4915353198668104e-06, + "loss": 0.2257, "step": 941900 }, { - "epoch": 9.6, - "learning_rate": 1.8388258949586614e-05, - "loss": 0.2999, + "epoch": 12.978424402744482, + "grad_norm": 2.0975193977355957, + "learning_rate": 2.4882025560220988e-06, + "loss": 0.2188, "step": 942000 }, { - "epoch": 9.6, - "learning_rate": 1.8382142688475616e-05, - "loss": 0.3378, + "epoch": 12.979802154804222, + "grad_norm": 3.7833104133605957, + "learning_rate": 2.4848719170575447e-06, + "loss": 0.2107, "step": 942100 }, { - "epoch": 9.6, - "learning_rate": 1.837602699540851e-05, - "loss": 0.2987, + "epoch": 12.98117990686396, + "grad_norm": 5.014673709869385, + "learning_rate": 2.481543403256099e-06, + "loss": 0.2462, "step": 942200 }, { - "epoch": 9.6, - "learning_rate": 1.8369911870684325e-05, - "loss": 0.3371, + "epoch": 12.9825576589237, + "grad_norm": 4.888100624084473, + "learning_rate": 2.478217014900528e-06, + "loss": 0.2307, "step": 942300 }, { - "epoch": 9.6, - "learning_rate": 1.8363797314602033e-05, - "loss": 0.3431, + "epoch": 12.98393541098344, + "grad_norm": 1.8661445379257202, + "learning_rate": 2.4748927522734313e-06, + "loss": 0.2354, "step": 942400 }, { - "epoch": 9.6, - "learning_rate": 1.8357683327460615e-05, - "loss": 0.2755, + "epoch": 12.985313163043179, + "grad_norm": 1.4494556188583374, + "learning_rate": 2.471570615657231e-06, + "loss": 0.1723, "step": 942500 }, { - "epoch": 9.6, - "learning_rate": 1.8351569909559016e-05, - "loss": 0.3518, + "epoch": 12.986690915102917, + "grad_norm": 1.0541070699691772, + "learning_rate": 2.468250605334145e-06, + "loss": 0.2193, "step": 942600 }, { - "epoch": 9.6, - "learning_rate": 1.8345457061196123e-05, - "loss": 0.3358, + "epoch": 12.988068667162658, + "grad_norm": 1.1592589616775513, + "learning_rate": 2.4649327215862323e-06, + "loss": 0.1988, "step": 942700 }, { - "epoch": 9.61, - "learning_rate": 1.8339344782670826e-05, - "loss": 0.2931, + "epoch": 12.989446419222396, + "grad_norm": 0.9725094437599182, + "learning_rate": 2.461616964695371e-06, + "loss": 0.2019, "step": 942800 }, { - "epoch": 9.61, - "learning_rate": 1.8333233074281994e-05, - "loss": 0.3903, + "epoch": 12.990824171282137, + "grad_norm": 0.08320147544145584, + "learning_rate": 2.458303334943235e-06, + "loss": 0.2232, "step": 942900 }, { - "epoch": 9.61, - "learning_rate": 1.8327121936328428e-05, - "loss": 0.2877, + "epoch": 12.992201923341876, + "grad_norm": 3.3967432975769043, + "learning_rate": 2.4549918326113433e-06, + "loss": 0.2343, "step": 943000 }, { - "epoch": 9.61, - "learning_rate": 1.832101136910894e-05, - "loss": 0.3325, + "epoch": 12.993579675401614, + "grad_norm": 1.8226722478866577, + "learning_rate": 2.451682457981031e-06, + "loss": 0.2472, "step": 943100 }, { - "epoch": 9.61, - "learning_rate": 1.8314901372922304e-05, - "loss": 0.3296, + "epoch": 12.994957427461355, + "grad_norm": 0.3414977192878723, + "learning_rate": 2.448375211333435e-06, + "loss": 0.2276, "step": 943200 }, { - "epoch": 9.61, - "learning_rate": 1.8308791948067266e-05, - "loss": 0.3776, + "epoch": 12.996335179521093, + "grad_norm": 3.5121781826019287, + "learning_rate": 2.445070092949533e-06, + "loss": 0.2186, "step": 943300 }, { - "epoch": 9.61, - "learning_rate": 1.8302683094842523e-05, - "loss": 0.3226, + "epoch": 12.997712931580832, + "grad_norm": 5.230090141296387, + "learning_rate": 2.4417671031101014e-06, + "loss": 0.2229, "step": 943400 }, { - "epoch": 9.61, - "learning_rate": 1.8296574813546773e-05, - "loss": 0.3434, + "epoch": 12.999090683640572, + "grad_norm": 2.0974578857421875, + "learning_rate": 2.4384662420957556e-06, + "loss": 0.2212, "step": 943500 }, { - "epoch": 9.61, - "learning_rate": 1.8290467104478675e-05, - "loss": 0.2968, + "epoch": 13.000468435700311, + "grad_norm": 1.4741615056991577, + "learning_rate": 2.4351675101869115e-06, + "loss": 0.2435, "step": 943600 }, { - "epoch": 9.61, - "learning_rate": 1.828435996793685e-05, - "loss": 0.3417, + "epoch": 13.001846187760052, + "grad_norm": 3.1395866870880127, + "learning_rate": 2.43187090766382e-06, + "loss": 0.1824, "step": 943700 }, { - "epoch": 9.62, - "learning_rate": 1.8278253404219907e-05, - "loss": 0.2782, + "epoch": 13.00322393981979, + "grad_norm": 1.5867871046066284, + "learning_rate": 2.428576434806547e-06, + "loss": 0.2141, "step": 943800 }, { - "epoch": 9.62, - "learning_rate": 1.827214741362643e-05, - "loss": 0.3648, + "epoch": 13.004601691879529, + "grad_norm": 1.372540831565857, + "learning_rate": 2.4252840918949677e-06, + "loss": 0.207, "step": 943900 }, { - "epoch": 9.62, - "learning_rate": 1.826604199645495e-05, - "loss": 0.257, + "epoch": 13.00597944393927, + "grad_norm": 1.4043537378311157, + "learning_rate": 2.4219938792087845e-06, + "loss": 0.2296, "step": 944000 }, { - "epoch": 9.62, - "learning_rate": 1.8259937153003992e-05, - "loss": 0.3411, + "epoch": 13.007357195999008, + "grad_norm": 0.22934818267822266, + "learning_rate": 2.4187057970275247e-06, + "loss": 0.1613, "step": 944100 }, { - "epoch": 9.62, - "learning_rate": 1.825383288357205e-05, - "loss": 0.3366, + "epoch": 13.008734948058747, + "grad_norm": 0.3385646641254425, + "learning_rate": 2.415419845630515e-06, + "loss": 0.2279, "step": 944200 }, { - "epoch": 9.62, - "learning_rate": 1.8247729188457574e-05, - "loss": 0.345, + "epoch": 13.010112700118487, + "grad_norm": 2.983510971069336, + "learning_rate": 2.4121360252969226e-06, + "loss": 0.1874, "step": 944300 }, { - "epoch": 9.62, - "learning_rate": 1.824162606795901e-05, - "loss": 0.3212, + "epoch": 13.011490452178226, + "grad_norm": 2.225717067718506, + "learning_rate": 2.408854336305729e-06, + "loss": 0.2004, "step": 944400 }, { - "epoch": 9.62, - "learning_rate": 1.8235523522374775e-05, - "loss": 0.3291, + "epoch": 13.012868204237966, + "grad_norm": 2.111985921859741, + "learning_rate": 2.4055747789357225e-06, + "loss": 0.1925, "step": 944500 }, { - "epoch": 9.62, - "learning_rate": 1.8229421552003216e-05, - "loss": 0.2982, + "epoch": 13.014245956297705, + "grad_norm": 1.034456729888916, + "learning_rate": 2.4022973534655186e-06, + "loss": 0.188, "step": 944600 }, { - "epoch": 9.62, - "learning_rate": 1.82233201571427e-05, - "loss": 0.3682, + "epoch": 13.015623708357444, + "grad_norm": 3.1166813373565674, + "learning_rate": 2.3990220601735526e-06, + "loss": 0.1915, "step": 944700 }, { - "epoch": 9.63, - "learning_rate": 1.8217219338091562e-05, - "loss": 0.3242, + "epoch": 13.017001460417184, + "grad_norm": 3.778953790664673, + "learning_rate": 2.3957816203898616e-06, + "loss": 0.2181, "step": 944800 }, { - "epoch": 9.63, - "learning_rate": 1.821111909514807e-05, - "loss": 0.3214, + "epoch": 13.018379212476923, + "grad_norm": 2.072683334350586, + "learning_rate": 2.392510570960234e-06, + "loss": 0.2229, "step": 944900 }, { - "epoch": 9.63, - "learning_rate": 1.82050194286105e-05, - "loss": 0.355, + "epoch": 13.019756964536661, + "grad_norm": 1.4086980819702148, + "learning_rate": 2.3892416545402825e-06, + "loss": 0.2564, "step": 945000 }, { - "epoch": 9.63, - "learning_rate": 1.8198920338777096e-05, - "loss": 0.2785, + "epoch": 13.021134716596402, + "grad_norm": 0.9210970997810364, + "learning_rate": 2.3859748714077122e-06, + "loss": 0.27, "step": 945100 }, { - "epoch": 9.63, - "learning_rate": 1.8192882808217236e-05, - "loss": 0.3414, + "epoch": 13.02251246865614, + "grad_norm": 0.13293832540512085, + "learning_rate": 2.382710221840067e-06, + "loss": 0.1868, "step": 945200 }, { - "epoch": 9.63, - "learning_rate": 1.8186784866912285e-05, - "loss": 0.3257, + "epoch": 13.023890220715879, + "grad_norm": 4.4459614753723145, + "learning_rate": 2.379447706114679e-06, + "loss": 0.1813, "step": 945300 }, { - "epoch": 9.63, - "learning_rate": 1.818068750320304e-05, - "loss": 0.3437, + "epoch": 13.02526797277562, + "grad_norm": 0.6108028292655945, + "learning_rate": 2.3761873245087284e-06, + "loss": 0.2208, "step": 945400 }, { - "epoch": 9.63, - "learning_rate": 1.8174590717387643e-05, - "loss": 0.2897, + "epoch": 13.026645724835358, + "grad_norm": 0.9959761500358582, + "learning_rate": 2.3729290772991997e-06, + "loss": 0.2104, "step": 945500 }, { - "epoch": 9.63, - "learning_rate": 1.816849450976419e-05, - "loss": 0.4203, + "epoch": 13.028023476895099, + "grad_norm": 0.8236709833145142, + "learning_rate": 2.369672964762894e-06, + "loss": 0.2571, "step": 945600 }, { - "epoch": 9.63, - "learning_rate": 1.816239888063076e-05, - "loss": 0.3522, + "epoch": 13.029401228954837, + "grad_norm": 4.525708198547363, + "learning_rate": 2.3664189871764326e-06, + "loss": 0.2243, "step": 945700 }, { - "epoch": 9.64, - "learning_rate": 1.815630383028537e-05, - "loss": 0.2626, + "epoch": 13.030778981014576, + "grad_norm": 1.7330988645553589, + "learning_rate": 2.36316714481626e-06, + "loss": 0.2646, "step": 945800 }, { - "epoch": 9.64, - "learning_rate": 1.815020935902604e-05, - "loss": 0.2832, + "epoch": 13.032156733074316, + "grad_norm": 0.5445881485939026, + "learning_rate": 2.359917437958643e-06, + "loss": 0.2063, "step": 945900 }, { - "epoch": 9.64, - "learning_rate": 1.8144115467150775e-05, - "loss": 0.2541, + "epoch": 13.033534485134055, + "grad_norm": 1.733644723892212, + "learning_rate": 2.35666986687965e-06, + "loss": 0.1692, "step": 946000 }, { - "epoch": 9.64, - "learning_rate": 1.8138022154957497e-05, - "loss": 0.3358, + "epoch": 13.034912237193794, + "grad_norm": 4.179257392883301, + "learning_rate": 2.3534244318551886e-06, + "loss": 0.2677, "step": 946100 }, { - "epoch": 9.64, - "learning_rate": 1.813192942274415e-05, - "loss": 0.2665, + "epoch": 13.036289989253534, + "grad_norm": 1.500658392906189, + "learning_rate": 2.3501811331609723e-06, + "loss": 0.2066, "step": 946200 }, { - "epoch": 9.64, - "learning_rate": 1.812583727080863e-05, - "loss": 0.3268, + "epoch": 13.037667741313273, + "grad_norm": 1.2771934270858765, + "learning_rate": 2.3469399710725295e-06, + "loss": 0.1976, "step": 946300 }, { - "epoch": 9.64, - "learning_rate": 1.81197456994488e-05, - "loss": 0.284, + "epoch": 13.039045493373013, + "grad_norm": 3.5511856079101562, + "learning_rate": 2.343700945865218e-06, + "loss": 0.238, "step": 946400 }, { - "epoch": 9.64, - "learning_rate": 1.8113654708962514e-05, - "loss": 0.2967, + "epoch": 13.040423245432752, + "grad_norm": 3.867316246032715, + "learning_rate": 2.340464057814214e-06, + "loss": 0.2391, "step": 946500 }, { - "epoch": 9.64, - "learning_rate": 1.810756429964759e-05, - "loss": 0.3049, + "epoch": 13.04180099749249, + "grad_norm": 3.279280662536621, + "learning_rate": 2.3372293071944973e-06, + "loss": 0.2014, "step": 946600 }, { - "epoch": 9.65, - "learning_rate": 1.810147447180178e-05, - "loss": 0.3393, + "epoch": 13.043178749552231, + "grad_norm": 2.9935896396636963, + "learning_rate": 2.3339966942808898e-06, + "loss": 0.2268, "step": 946700 }, { - "epoch": 9.65, - "learning_rate": 1.809538522572287e-05, - "loss": 0.3331, + "epoch": 13.04455650161197, + "grad_norm": 2.1359305381774902, + "learning_rate": 2.330766219348014e-06, + "loss": 0.2052, "step": 946800 }, { - "epoch": 9.65, - "learning_rate": 1.8089296561708583e-05, - "loss": 0.3136, + "epoch": 13.045934253671708, + "grad_norm": 0.7234707474708557, + "learning_rate": 2.3275378826703035e-06, + "loss": 0.2043, "step": 946900 }, { - "epoch": 9.65, - "learning_rate": 1.8083208480056612e-05, - "loss": 0.3812, + "epoch": 13.047312005731449, + "grad_norm": 2.754171133041382, + "learning_rate": 2.324311684522033e-06, + "loss": 0.2088, "step": 947000 }, { - "epoch": 9.65, - "learning_rate": 1.8077120981064627e-05, - "loss": 0.2568, + "epoch": 13.048689757791188, + "grad_norm": 3.9752023220062256, + "learning_rate": 2.3210876251772865e-06, + "loss": 0.1975, "step": 947100 }, { - "epoch": 9.65, - "learning_rate": 1.8071034065030284e-05, - "loss": 0.3142, + "epoch": 13.050067509850928, + "grad_norm": 2.000788688659668, + "learning_rate": 2.3178657049099615e-06, + "loss": 0.1886, "step": 947200 }, { - "epoch": 9.65, - "learning_rate": 1.806494773225118e-05, - "loss": 0.2968, + "epoch": 13.051445261910667, + "grad_norm": 0.27197399735450745, + "learning_rate": 2.31467811121225e-06, + "loss": 0.2077, "step": 947300 }, { - "epoch": 9.65, - "learning_rate": 1.8058922837627606e-05, - "loss": 0.3202, + "epoch": 13.052823013970405, + "grad_norm": 4.119304656982422, + "learning_rate": 2.3114604485231368e-06, + "loss": 0.2077, "step": 947400 }, { - "epoch": 9.65, - "learning_rate": 1.805283766641174e-05, - "loss": 0.2442, + "epoch": 13.054200766030146, + "grad_norm": 1.2764922380447388, + "learning_rate": 2.308244925729319e-06, + "loss": 0.2202, "step": 947500 }, { - "epoch": 9.65, - "learning_rate": 1.804675307934082e-05, - "loss": 0.2768, + "epoch": 13.055578518089884, + "grad_norm": 5.638263702392578, + "learning_rate": 2.305031543103984e-06, + "loss": 0.2564, "step": 947600 }, { - "epoch": 9.66, - "learning_rate": 1.8040669076712345e-05, - "loss": 0.3627, + "epoch": 13.056956270149623, + "grad_norm": 0.6445918679237366, + "learning_rate": 2.301820300920113e-06, + "loss": 0.2113, "step": 947700 }, { - "epoch": 9.66, - "learning_rate": 1.803458565882378e-05, - "loss": 0.3062, + "epoch": 13.058334022209364, + "grad_norm": 3.727322816848755, + "learning_rate": 2.2986111994505165e-06, + "loss": 0.2043, "step": 947800 }, { - "epoch": 9.66, - "learning_rate": 1.8028502825972565e-05, - "loss": 0.3172, + "epoch": 13.059711774269102, + "grad_norm": 1.9680469036102295, + "learning_rate": 2.295404238967832e-06, + "loss": 0.2156, "step": 947900 }, { - "epoch": 9.66, - "learning_rate": 1.8022420578456123e-05, - "loss": 0.3759, + "epoch": 13.061089526328843, + "grad_norm": 2.260833501815796, + "learning_rate": 2.2921994197445017e-06, + "loss": 0.2213, "step": 948000 }, { - "epoch": 9.66, - "learning_rate": 1.801633891657184e-05, - "loss": 0.3859, + "epoch": 13.062467278388581, + "grad_norm": 7.015494346618652, + "learning_rate": 2.2889967420527886e-06, + "loss": 0.1871, "step": 948100 }, { - "epoch": 9.66, - "learning_rate": 1.8010257840617067e-05, - "loss": 0.2892, + "epoch": 13.06384503044832, + "grad_norm": 1.0621708631515503, + "learning_rate": 2.285796206164787e-06, + "loss": 0.2027, "step": 948200 }, { - "epoch": 9.66, - "learning_rate": 1.800417735088913e-05, - "loss": 0.384, + "epoch": 13.06522278250806, + "grad_norm": 4.520688056945801, + "learning_rate": 2.2825978123523946e-06, + "loss": 0.2413, "step": 948300 }, { - "epoch": 9.66, - "learning_rate": 1.7998097447685344e-05, - "loss": 0.373, + "epoch": 13.0666005345678, + "grad_norm": 3.529108762741089, + "learning_rate": 2.2794015608873236e-06, + "loss": 0.209, "step": 948400 }, { - "epoch": 9.66, - "learning_rate": 1.7992018131302964e-05, - "loss": 0.2987, + "epoch": 13.067978286627538, + "grad_norm": 0.6948244571685791, + "learning_rate": 2.2762074520411178e-06, + "loss": 0.242, "step": 948500 }, { - "epoch": 9.66, - "learning_rate": 1.7985939402039228e-05, - "loss": 0.3173, + "epoch": 13.069356038687278, + "grad_norm": 1.436937689781189, + "learning_rate": 2.2730154860851375e-06, + "loss": 0.2235, "step": 948600 }, { - "epoch": 9.67, - "learning_rate": 1.797986126019137e-05, - "loss": 0.345, + "epoch": 13.070733790747017, + "grad_norm": 3.428218364715576, + "learning_rate": 2.2698256632905457e-06, + "loss": 0.2318, "step": 948700 }, { - "epoch": 9.67, - "learning_rate": 1.797378370605656e-05, - "loss": 0.331, + "epoch": 13.072111542806757, + "grad_norm": 2.1471517086029053, + "learning_rate": 2.266637983928342e-06, + "loss": 0.2314, "step": 948800 }, { - "epoch": 9.67, - "learning_rate": 1.7967706739931957e-05, - "loss": 0.3334, + "epoch": 13.073489294866496, + "grad_norm": 2.222318172454834, + "learning_rate": 2.2634524482693445e-06, + "loss": 0.2157, "step": 948900 }, { - "epoch": 9.67, - "learning_rate": 1.79616303621147e-05, - "loss": 0.3022, + "epoch": 13.074867046926235, + "grad_norm": 2.199751853942871, + "learning_rate": 2.2602690565841624e-06, + "loss": 0.2463, "step": 949000 }, { - "epoch": 9.67, - "learning_rate": 1.795555457290187e-05, - "loss": 0.2796, + "epoch": 13.076244798985975, + "grad_norm": 2.260749340057373, + "learning_rate": 2.257087809143246e-06, + "loss": 0.2114, "step": 949100 }, { - "epoch": 9.67, - "learning_rate": 1.794947937259054e-05, - "loss": 0.3351, + "epoch": 13.077622551045714, + "grad_norm": 0.5413200855255127, + "learning_rate": 2.2539087062168702e-06, + "loss": 0.1575, "step": 949200 }, { - "epoch": 9.67, - "learning_rate": 1.7943404761477757e-05, - "loss": 0.3258, + "epoch": 13.079000303105452, + "grad_norm": 3.971824884414673, + "learning_rate": 2.2507317480751008e-06, + "loss": 0.2287, "step": 949300 }, { - "epoch": 9.67, - "learning_rate": 1.793733073986053e-05, - "loss": 0.3273, + "epoch": 13.080378055165193, + "grad_norm": 5.059444904327393, + "learning_rate": 2.247556934987851e-06, + "loss": 0.2036, "step": 949400 }, { - "epoch": 9.67, - "learning_rate": 1.793125730803584e-05, - "loss": 0.2412, + "epoch": 13.081755807224932, + "grad_norm": 2.146378755569458, + "learning_rate": 2.244384267224824e-06, + "loss": 0.2179, "step": 949500 }, { - "epoch": 9.67, - "learning_rate": 1.792518446630066e-05, - "loss": 0.3301, + "epoch": 13.08313355928467, + "grad_norm": 3.3127827644348145, + "learning_rate": 2.241213745055564e-06, + "loss": 0.2128, "step": 949600 }, { - "epoch": 9.68, - "learning_rate": 1.791911221495189e-05, - "loss": 0.3519, + "epoch": 13.08451131134441, + "grad_norm": 0.6094133853912354, + "learning_rate": 2.2380453687494165e-06, + "loss": 0.2573, "step": 949700 }, { - "epoch": 9.68, - "learning_rate": 1.7913040554286437e-05, - "loss": 0.3109, + "epoch": 13.08588906340415, + "grad_norm": 15.943194389343262, + "learning_rate": 2.234879138575556e-06, + "loss": 0.209, "step": 949800 }, { - "epoch": 9.68, - "learning_rate": 1.7906969484601164e-05, - "loss": 0.3209, + "epoch": 13.08726681546389, + "grad_norm": 0.180172860622406, + "learning_rate": 2.2317150548029677e-06, + "loss": 0.2386, "step": 949900 }, { - "epoch": 9.68, - "learning_rate": 1.790089900619293e-05, - "loss": 0.3012, + "epoch": 13.088644567523628, + "grad_norm": 1.3473585844039917, + "learning_rate": 2.2285531177004605e-06, + "loss": 0.1988, "step": 950000 }, { - "epoch": 9.68, - "learning_rate": 1.7894829119358522e-05, - "loss": 0.3074, + "epoch": 13.090022319583367, + "grad_norm": 2.9297616481781006, + "learning_rate": 2.2253933275366504e-06, + "loss": 0.189, "step": 950100 }, { - "epoch": 9.68, - "learning_rate": 1.7888759824394733e-05, - "loss": 0.3183, + "epoch": 13.091400071643108, + "grad_norm": 1.48695707321167, + "learning_rate": 2.222235684579982e-06, + "loss": 0.205, "step": 950200 }, { - "epoch": 9.68, - "learning_rate": 1.7882751805694074e-05, - "loss": 0.3269, + "epoch": 13.092777823702846, + "grad_norm": 0.36406728625297546, + "learning_rate": 2.2190801890987108e-06, + "loss": 0.206, "step": 950300 }, { - "epoch": 9.68, - "learning_rate": 1.7876683689435628e-05, - "loss": 0.3758, + "epoch": 13.094155575762585, + "grad_norm": 1.1817207336425781, + "learning_rate": 2.215926841360913e-06, + "loss": 0.1814, "step": 950400 }, { - "epoch": 9.68, - "learning_rate": 1.7870616165935023e-05, - "loss": 0.2989, + "epoch": 13.095533327822325, + "grad_norm": 2.4070940017700195, + "learning_rate": 2.212775641634487e-06, + "loss": 0.1978, "step": 950500 }, { - "epoch": 9.68, - "learning_rate": 1.7864549235488883e-05, - "loss": 0.3228, + "epoch": 13.096911079882064, + "grad_norm": 1.542210340499878, + "learning_rate": 2.209626590187138e-06, + "loss": 0.2053, "step": 950600 }, { - "epoch": 9.69, - "learning_rate": 1.785848289839387e-05, - "loss": 0.315, + "epoch": 13.098288831941804, + "grad_norm": 0.3750985264778137, + "learning_rate": 2.20647968728639e-06, + "loss": 0.2066, "step": 950700 }, { - "epoch": 9.69, - "learning_rate": 1.7852417154946587e-05, - "loss": 0.3129, + "epoch": 13.099666584001543, + "grad_norm": 0.5737673044204712, + "learning_rate": 2.2033349331995946e-06, + "loss": 0.2242, "step": 950800 }, { - "epoch": 9.69, - "learning_rate": 1.7846352005443614e-05, - "loss": 0.3209, + "epoch": 13.101044336061282, + "grad_norm": 1.8188323974609375, + "learning_rate": 2.2001923281939174e-06, + "loss": 0.2257, "step": 950900 }, { - "epoch": 9.69, - "learning_rate": 1.784028745018149e-05, - "loss": 0.3411, + "epoch": 13.102422088121022, + "grad_norm": 4.118083953857422, + "learning_rate": 2.197051872536332e-06, + "loss": 0.2651, "step": 951000 }, { - "epoch": 9.69, - "learning_rate": 1.783422348945676e-05, - "loss": 0.2458, + "epoch": 13.103799840180761, + "grad_norm": 2.199143648147583, + "learning_rate": 2.193913566493641e-06, + "loss": 0.1719, "step": 951100 }, { - "epoch": 9.69, - "learning_rate": 1.782816012356589e-05, - "loss": 0.3247, + "epoch": 13.1051775922405, + "grad_norm": 2.603848695755005, + "learning_rate": 2.19077741033246e-06, + "loss": 0.21, "step": 951200 }, { - "epoch": 9.69, - "learning_rate": 1.7822097352805352e-05, - "loss": 0.3981, + "epoch": 13.10655534430024, + "grad_norm": 2.3072054386138916, + "learning_rate": 2.187643404319213e-06, + "loss": 0.2331, "step": 951300 }, { - "epoch": 9.69, - "learning_rate": 1.7816035177471582e-05, - "loss": 0.3162, + "epoch": 13.107933096359979, + "grad_norm": 3.031574010848999, + "learning_rate": 2.1845115487201563e-06, + "loss": 0.2345, "step": 951400 }, { - "epoch": 9.69, - "learning_rate": 1.780997359786098e-05, - "loss": 0.3335, + "epoch": 13.10931084841972, + "grad_norm": 3.446669101715088, + "learning_rate": 2.1813818438013625e-06, + "loss": 0.2338, "step": 951500 }, { - "epoch": 9.7, - "learning_rate": 1.780391261426992e-05, - "loss": 0.2945, + "epoch": 13.110688600479458, + "grad_norm": 0.2870500981807709, + "learning_rate": 2.1782542898287087e-06, + "loss": 0.2383, "step": 951600 }, { - "epoch": 9.7, - "learning_rate": 1.7797852226994764e-05, - "loss": 0.2774, + "epoch": 13.112066352539197, + "grad_norm": 3.003225088119507, + "learning_rate": 2.175128887067895e-06, + "loss": 0.2094, "step": 951700 }, { - "epoch": 9.7, - "learning_rate": 1.77917924363318e-05, - "loss": 0.2643, + "epoch": 13.113444104598937, + "grad_norm": 3.1960954666137695, + "learning_rate": 2.1720056357844422e-06, + "loss": 0.2479, "step": 951800 }, { - "epoch": 9.7, - "learning_rate": 1.7785733242577336e-05, - "loss": 0.3485, + "epoch": 13.114821856658676, + "grad_norm": 0.12802423536777496, + "learning_rate": 2.1688845362436926e-06, + "loss": 0.2587, "step": 951900 }, { - "epoch": 9.7, - "learning_rate": 1.7779674646027632e-05, - "loss": 0.3083, + "epoch": 13.116199608718414, + "grad_norm": 20.819339752197266, + "learning_rate": 2.165765588710788e-06, + "loss": 0.2292, "step": 952000 }, { - "epoch": 9.7, - "learning_rate": 1.7773616646978902e-05, - "loss": 0.3619, + "epoch": 13.117577360778155, + "grad_norm": 2.5628113746643066, + "learning_rate": 2.162648793450714e-06, + "loss": 0.2212, "step": 952100 }, { - "epoch": 9.7, - "learning_rate": 1.7767559245727348e-05, - "loss": 0.352, + "epoch": 13.118955112837893, + "grad_norm": 3.513842821121216, + "learning_rate": 2.159534150728249e-06, + "loss": 0.2552, "step": 952200 }, { - "epoch": 9.7, - "learning_rate": 1.776150244256916e-05, - "loss": 0.3251, + "epoch": 13.120332864897634, + "grad_norm": 2.1069796085357666, + "learning_rate": 2.156421660807994e-06, + "loss": 0.1862, "step": 952300 }, { - "epoch": 9.7, - "learning_rate": 1.775544623780047e-05, - "loss": 0.3868, + "epoch": 13.121710616957373, + "grad_norm": 2.3737189769744873, + "learning_rate": 2.1533113239543735e-06, + "loss": 0.1936, "step": 952400 }, { - "epoch": 9.7, - "learning_rate": 1.774939063171738e-05, - "loss": 0.3201, + "epoch": 13.123088369017111, + "grad_norm": 1.7955330610275269, + "learning_rate": 2.150203140431637e-06, + "loss": 0.2481, "step": 952500 }, { - "epoch": 9.71, - "learning_rate": 1.7743335624615976e-05, - "loss": 0.2968, + "epoch": 13.124466121076852, + "grad_norm": 0.1346665918827057, + "learning_rate": 2.1471281601419422e-06, + "loss": 0.2082, "step": 952600 }, { - "epoch": 9.71, - "learning_rate": 1.7737281216792328e-05, - "loss": 0.3359, + "epoch": 13.12584387313659, + "grad_norm": 1.8241961002349854, + "learning_rate": 2.144024262533048e-06, + "loss": 0.1907, "step": 952700 }, { - "epoch": 9.71, - "learning_rate": 1.7731227408542437e-05, - "loss": 0.2829, + "epoch": 13.127221625196329, + "grad_norm": 1.0534404516220093, + "learning_rate": 2.1409225190440078e-06, + "loss": 0.2172, "step": 952800 }, { - "epoch": 9.71, - "learning_rate": 1.7725174200162315e-05, - "loss": 0.2845, + "epoch": 13.12859937725607, + "grad_norm": 2.357387065887451, + "learning_rate": 2.1378539151643304e-06, + "loss": 0.2096, "step": 952900 }, { - "epoch": 9.71, - "learning_rate": 1.771912159194793e-05, - "loss": 0.315, + "epoch": 13.129977129315808, + "grad_norm": 2.7878684997558594, + "learning_rate": 2.1347564591575875e-06, + "loss": 0.1871, "step": 953000 }, { - "epoch": 9.71, - "learning_rate": 1.7713069584195202e-05, - "loss": 0.3863, + "epoch": 13.131354881375549, + "grad_norm": 0.9517192840576172, + "learning_rate": 2.1316611580580478e-06, + "loss": 0.2152, "step": 953100 }, { - "epoch": 9.71, - "learning_rate": 1.770701817720004e-05, - "loss": 0.2756, + "epoch": 13.132732633435287, + "grad_norm": 1.9640759229660034, + "learning_rate": 2.1285680121286735e-06, + "loss": 0.1884, "step": 953200 }, { - "epoch": 9.71, - "learning_rate": 1.770096737125834e-05, - "loss": 0.362, + "epoch": 13.134110385495026, + "grad_norm": 1.5480353832244873, + "learning_rate": 2.1254770216322544e-06, + "loss": 0.2142, "step": 953300 }, { - "epoch": 9.71, - "learning_rate": 1.76949776657342e-05, - "loss": 0.3517, + "epoch": 13.135488137554766, + "grad_norm": 2.0094008445739746, + "learning_rate": 2.122388186831374e-06, + "loss": 0.2851, "step": 953400 }, { - "epoch": 9.71, - "learning_rate": 1.7688928056769005e-05, - "loss": 0.3375, + "epoch": 13.136865889614505, + "grad_norm": 18.14215087890625, + "learning_rate": 2.119301507988445e-06, + "loss": 0.2144, "step": 953500 }, { - "epoch": 9.72, - "learning_rate": 1.7682879049741748e-05, - "loss": 0.2857, + "epoch": 13.138243641674244, + "grad_norm": 3.405684471130371, + "learning_rate": 2.1162169853656987e-06, + "loss": 0.2239, "step": 953600 }, { - "epoch": 9.72, - "learning_rate": 1.7676830644948205e-05, - "loss": 0.3049, + "epoch": 13.139621393733984, + "grad_norm": 3.234578847885132, + "learning_rate": 2.1131346192251863e-06, + "loss": 0.2252, "step": 953700 }, { - "epoch": 9.72, - "learning_rate": 1.767078284268411e-05, - "loss": 0.2927, + "epoch": 13.140999145793723, + "grad_norm": 0.4654708504676819, + "learning_rate": 2.110054409828768e-06, + "loss": 0.2141, "step": 953800 }, { - "epoch": 9.72, - "learning_rate": 1.7664735643245148e-05, - "loss": 0.3209, + "epoch": 13.142376897853463, + "grad_norm": 5.575819969177246, + "learning_rate": 2.1069763574381168e-06, + "loss": 0.2165, "step": 953900 }, { - "epoch": 9.72, - "learning_rate": 1.7658689046926998e-05, - "loss": 0.3286, + "epoch": 13.143754649913202, + "grad_norm": 2.115338087081909, + "learning_rate": 2.103900462314738e-06, + "loss": 0.1974, "step": 954000 }, { - "epoch": 9.72, - "learning_rate": 1.765264305402531e-05, - "loss": 0.3487, + "epoch": 13.14513240197294, + "grad_norm": 0.2580846846103668, + "learning_rate": 2.100826724719935e-06, + "loss": 0.1772, "step": 954100 }, { - "epoch": 9.72, - "learning_rate": 1.7646597664835686e-05, - "loss": 0.3281, + "epoch": 13.146510154032681, + "grad_norm": 4.814138889312744, + "learning_rate": 2.0977551449148443e-06, + "loss": 0.1916, "step": 954200 }, { - "epoch": 9.72, - "learning_rate": 1.764055287965371e-05, - "loss": 0.3062, + "epoch": 13.14788790609242, + "grad_norm": 2.7448670864105225, + "learning_rate": 2.0946857231604145e-06, + "loss": 0.1971, "step": 954300 }, { - "epoch": 9.72, - "learning_rate": 1.7634508698774962e-05, - "loss": 0.3668, + "epoch": 13.149265658152158, + "grad_norm": 0.6680305600166321, + "learning_rate": 2.091618459717407e-06, + "loss": 0.1799, "step": 954400 }, { - "epoch": 9.72, - "learning_rate": 1.7628465122494927e-05, - "loss": 0.3369, + "epoch": 13.150643410211899, + "grad_norm": 0.2847000062465668, + "learning_rate": 2.0885533548463924e-06, + "loss": 0.1846, "step": 954500 }, { - "epoch": 9.73, - "learning_rate": 1.7622422151109122e-05, - "loss": 0.329, + "epoch": 13.152021162271637, + "grad_norm": 1.1524426937103271, + "learning_rate": 2.0854904088077765e-06, + "loss": 0.174, "step": 954600 }, { - "epoch": 9.73, - "learning_rate": 1.7616379784912996e-05, - "loss": 0.2533, + "epoch": 13.153398914331376, + "grad_norm": 1.1759401559829712, + "learning_rate": 2.0824296218617734e-06, + "loss": 0.2072, "step": 954700 }, { - "epoch": 9.73, - "learning_rate": 1.7610338024202022e-05, - "loss": 0.3386, + "epoch": 13.154776666391117, + "grad_norm": 1.0690033435821533, + "learning_rate": 2.0793709942684073e-06, + "loss": 0.2452, "step": 954800 }, { - "epoch": 9.73, - "learning_rate": 1.7604296869271566e-05, - "loss": 0.3683, + "epoch": 13.156154418450855, + "grad_norm": 2.0779855251312256, + "learning_rate": 2.0763145262875295e-06, + "loss": 0.2358, "step": 954900 }, { - "epoch": 9.73, - "learning_rate": 1.759825632041702e-05, - "loss": 0.289, + "epoch": 13.157532170510596, + "grad_norm": 2.2275679111480713, + "learning_rate": 2.0732602181787972e-06, + "loss": 0.1873, "step": 955000 }, { - "epoch": 9.73, - "learning_rate": 1.7592216377933742e-05, - "loss": 0.3211, + "epoch": 13.158909922570334, + "grad_norm": 2.71479868888855, + "learning_rate": 2.07023858098796e-06, + "loss": 0.2167, "step": 955100 }, { - "epoch": 9.73, - "learning_rate": 1.758617704211702e-05, - "loss": 0.2747, + "epoch": 13.160287674630073, + "grad_norm": 2.918341875076294, + "learning_rate": 2.067188571796582e-06, + "loss": 0.2021, "step": 955200 }, { - "epoch": 9.73, - "learning_rate": 1.7580138313262164e-05, - "loss": 0.3396, + "epoch": 13.161665426689813, + "grad_norm": 1.2662862539291382, + "learning_rate": 2.0641407232526478e-06, + "loss": 0.187, "step": 955300 }, { - "epoch": 9.73, - "learning_rate": 1.7574100191664435e-05, - "loss": 0.3423, + "epoch": 13.163043178749552, + "grad_norm": 2.664578676223755, + "learning_rate": 2.061095035615093e-06, + "loss": 0.1996, "step": 955400 }, { - "epoch": 9.73, - "learning_rate": 1.7568062677619044e-05, - "loss": 0.287, + "epoch": 13.16442093080929, + "grad_norm": 1.7334948778152466, + "learning_rate": 2.058051509142659e-06, + "loss": 0.193, "step": 955500 }, { - "epoch": 9.74, - "learning_rate": 1.7562025771421203e-05, - "loss": 0.2905, + "epoch": 13.165798682869031, + "grad_norm": 2.573725700378418, + "learning_rate": 2.055010144093905e-06, + "loss": 0.2287, "step": 955600 }, { - "epoch": 9.74, - "learning_rate": 1.755598947336608e-05, - "loss": 0.3421, + "epoch": 13.16717643492877, + "grad_norm": 3.56535005569458, + "learning_rate": 2.051970940727222e-06, + "loss": 0.251, "step": 955700 }, { - "epoch": 9.74, - "learning_rate": 1.755001413763223e-05, - "loss": 0.3286, + "epoch": 13.16855418698851, + "grad_norm": 0.7944051027297974, + "learning_rate": 2.0489338993007905e-06, + "loss": 0.1961, "step": 955800 }, { - "epoch": 9.74, - "learning_rate": 1.7543979050659132e-05, - "loss": 0.3017, + "epoch": 13.169931939048249, + "grad_norm": 1.9641187191009521, + "learning_rate": 2.0458990200726322e-06, + "loss": 0.2402, "step": 955900 }, { - "epoch": 9.74, - "learning_rate": 1.7538004914474955e-05, - "loss": 0.333, + "epoch": 13.171309691107988, + "grad_norm": 2.6603341102600098, + "learning_rate": 2.04286630330058e-06, + "loss": 0.2279, "step": 956000 }, { - "epoch": 9.74, - "learning_rate": 1.753197103975242e-05, - "loss": 0.2482, + "epoch": 13.172687443167728, + "grad_norm": 2.809593915939331, + "learning_rate": 2.039835749242273e-06, + "loss": 0.1981, "step": 956100 }, { - "epoch": 9.74, - "learning_rate": 1.7525937774642095e-05, - "loss": 0.3185, + "epoch": 13.174065195227467, + "grad_norm": 1.3900868892669678, + "learning_rate": 2.0368073581551683e-06, + "loss": 0.1711, "step": 956200 }, { - "epoch": 9.74, - "learning_rate": 1.751990511943897e-05, - "loss": 0.304, + "epoch": 13.175442947287205, + "grad_norm": 0.32162657380104065, + "learning_rate": 2.033781130296546e-06, + "loss": 0.1836, "step": 956300 }, { - "epoch": 9.74, - "learning_rate": 1.751387307443801e-05, - "loss": 0.3216, + "epoch": 13.176820699346946, + "grad_norm": 1.0811519622802734, + "learning_rate": 2.0307570659235053e-06, + "loss": 0.2027, "step": 956400 }, { - "epoch": 9.74, - "learning_rate": 1.7507841639934145e-05, - "loss": 0.333, + "epoch": 13.178198451406685, + "grad_norm": 2.0750279426574707, + "learning_rate": 2.0277351652929506e-06, + "loss": 0.2139, "step": 956500 }, { - "epoch": 9.75, - "learning_rate": 1.750181081622227e-05, - "loss": 0.336, + "epoch": 13.179576203466425, + "grad_norm": 1.5830872058868408, + "learning_rate": 2.0247154286616093e-06, + "loss": 0.165, "step": 956600 }, { - "epoch": 9.75, - "learning_rate": 1.749578060359727e-05, - "loss": 0.3124, + "epoch": 13.180953955526164, + "grad_norm": 2.5654876232147217, + "learning_rate": 2.021697856286025e-06, + "loss": 0.2213, "step": 956700 }, { - "epoch": 9.75, - "learning_rate": 1.7489751002353968e-05, - "loss": 0.2965, + "epoch": 13.182331707585902, + "grad_norm": 3.174516439437866, + "learning_rate": 2.0186824484225493e-06, + "loss": 0.2213, "step": 956800 }, { - "epoch": 9.75, - "learning_rate": 1.7483722012787186e-05, - "loss": 0.3397, + "epoch": 13.183709459645643, + "grad_norm": 2.442420721054077, + "learning_rate": 2.01566920532736e-06, + "loss": 0.2125, "step": 956900 }, { - "epoch": 9.75, - "learning_rate": 1.7477693635191713e-05, - "loss": 0.3522, + "epoch": 13.185087211705381, + "grad_norm": 3.190927743911743, + "learning_rate": 2.0126581272564545e-06, + "loss": 0.1767, "step": 957000 }, { - "epoch": 9.75, - "learning_rate": 1.7471665869862283e-05, - "loss": 0.3062, + "epoch": 13.18646496376512, + "grad_norm": 0.8228018879890442, + "learning_rate": 2.0096492144656316e-06, + "loss": 0.2207, "step": 957100 }, { - "epoch": 9.75, - "learning_rate": 1.7465638717093622e-05, - "loss": 0.3099, + "epoch": 13.18784271582486, + "grad_norm": 2.233006238937378, + "learning_rate": 2.0066424672105135e-06, + "loss": 0.2723, "step": 957200 }, { - "epoch": 9.75, - "learning_rate": 1.7459612177180445e-05, - "loss": 0.2801, + "epoch": 13.1892204678846, + "grad_norm": 2.4445199966430664, + "learning_rate": 2.0036378857465387e-06, + "loss": 0.1895, "step": 957300 }, { - "epoch": 9.75, - "learning_rate": 1.7453586250417383e-05, - "loss": 0.3369, + "epoch": 13.19059821994434, + "grad_norm": 1.401798129081726, + "learning_rate": 2.000635470328972e-06, + "loss": 0.2085, "step": 957400 }, { - "epoch": 9.76, - "learning_rate": 1.7447560937099078e-05, - "loss": 0.2762, + "epoch": 13.191975972004078, + "grad_norm": 2.0793230533599854, + "learning_rate": 1.997665212980002e-06, + "loss": 0.2387, "step": 957500 }, { - "epoch": 9.76, - "learning_rate": 1.744153623752013e-05, - "loss": 0.3651, + "epoch": 13.193353724063817, + "grad_norm": 0.5570358037948608, + "learning_rate": 1.9946671087534297e-06, + "loss": 0.2172, "step": 957600 }, { - "epoch": 9.76, - "learning_rate": 1.743551215197512e-05, - "loss": 0.2883, + "epoch": 13.194731476123557, + "grad_norm": 1.3516061305999756, + "learning_rate": 1.991671171335378e-06, + "loss": 0.1906, "step": 957700 }, { - "epoch": 9.76, - "learning_rate": 1.7429488680758584e-05, - "loss": 0.2659, + "epoch": 13.196109228183296, + "grad_norm": 1.9212381839752197, + "learning_rate": 1.98867740098036e-06, + "loss": 0.2601, "step": 957800 }, { - "epoch": 9.76, - "learning_rate": 1.7423465824165037e-05, - "loss": 0.2809, + "epoch": 13.197486980243035, + "grad_norm": 3.625758171081543, + "learning_rate": 1.985685797942706e-06, + "loss": 0.2119, "step": 957900 }, { - "epoch": 9.76, - "learning_rate": 1.7417443582488967e-05, - "loss": 0.3626, + "epoch": 13.198864732302775, + "grad_norm": 0.3776724338531494, + "learning_rate": 1.9826963624765757e-06, + "loss": 0.2288, "step": 958000 }, { - "epoch": 9.76, - "learning_rate": 1.7411421956024808e-05, - "loss": 0.3193, + "epoch": 13.200242484362514, + "grad_norm": 1.0738612413406372, + "learning_rate": 1.979709094835936e-06, + "loss": 0.1992, "step": 958100 }, { - "epoch": 9.76, - "learning_rate": 1.740540094506699e-05, - "loss": 0.2949, + "epoch": 13.201620236422254, + "grad_norm": 0.25447648763656616, + "learning_rate": 1.9767239952745746e-06, + "loss": 0.23, "step": 958200 }, { - "epoch": 9.76, - "learning_rate": 1.7399380549909914e-05, - "loss": 0.3369, + "epoch": 13.202997988481993, + "grad_norm": 0.29381680488586426, + "learning_rate": 1.973741064046079e-06, + "loss": 0.2077, "step": 958300 }, { - "epoch": 9.76, - "learning_rate": 1.7393360770847923e-05, - "loss": 0.3442, + "epoch": 13.204375740541732, + "grad_norm": 1.5760165452957153, + "learning_rate": 1.9707603014038735e-06, + "loss": 0.2302, "step": 958400 }, { - "epoch": 9.77, - "learning_rate": 1.738734160817536e-05, - "loss": 0.3093, + "epoch": 13.205753492601472, + "grad_norm": 1.596274971961975, + "learning_rate": 1.9677817076011866e-06, + "loss": 0.2719, "step": 958500 }, { - "epoch": 9.77, - "learning_rate": 1.7381323062186532e-05, - "loss": 0.2892, + "epoch": 13.20713124466121, + "grad_norm": 0.08021339029073715, + "learning_rate": 1.9648052828910627e-06, + "loss": 0.213, "step": 958600 }, { - "epoch": 9.77, - "learning_rate": 1.7375305133175693e-05, - "loss": 0.3051, + "epoch": 13.20850899672095, + "grad_norm": 0.735159158706665, + "learning_rate": 1.9618310275263734e-06, + "loss": 0.1986, "step": 958700 }, { - "epoch": 9.77, - "learning_rate": 1.7369287821437086e-05, - "loss": 0.3391, + "epoch": 13.20988674878069, + "grad_norm": 5.046648025512695, + "learning_rate": 1.9588589417597907e-06, + "loss": 0.2363, "step": 958800 }, { - "epoch": 9.77, - "learning_rate": 1.7363271127264933e-05, - "loss": 0.3334, + "epoch": 13.211264500840429, + "grad_norm": 3.062516927719116, + "learning_rate": 1.9558890258438046e-06, + "loss": 0.269, "step": 958900 }, { - "epoch": 9.77, - "learning_rate": 1.735725505095341e-05, - "loss": 0.3476, + "epoch": 13.212642252900167, + "grad_norm": 2.159027099609375, + "learning_rate": 1.9529212800307275e-06, + "loss": 0.1784, "step": 959000 }, { - "epoch": 9.77, - "learning_rate": 1.735123959279667e-05, - "loss": 0.2868, + "epoch": 13.214020004959908, + "grad_norm": 3.4646830558776855, + "learning_rate": 1.949955704572688e-06, + "loss": 0.2355, "step": 959100 }, { - "epoch": 9.77, - "learning_rate": 1.734522475308884e-05, - "loss": 0.3282, + "epoch": 13.215397757019646, + "grad_norm": 6.0226311683654785, + "learning_rate": 1.9469922997216237e-06, + "loss": 0.2051, "step": 959200 }, { - "epoch": 9.77, - "learning_rate": 1.7339210532123982e-05, - "loss": 0.2685, + "epoch": 13.216775509079387, + "grad_norm": 1.7110134363174438, + "learning_rate": 1.9440310657292935e-06, + "loss": 0.2245, "step": 959300 }, { - "epoch": 9.77, - "learning_rate": 1.733319693019618e-05, - "loss": 0.3464, + "epoch": 13.218153261139125, + "grad_norm": 3.0870511531829834, + "learning_rate": 1.941072002847271e-06, + "loss": 0.2163, "step": 959400 }, { - "epoch": 9.78, - "learning_rate": 1.7327183947599457e-05, - "loss": 0.2802, + "epoch": 13.219531013198864, + "grad_norm": 2.9375531673431396, + "learning_rate": 1.938115111326935e-06, + "loss": 0.235, "step": 959500 }, { - "epoch": 9.78, - "learning_rate": 1.7321171584627822e-05, - "loss": 0.2892, + "epoch": 13.220908765258605, + "grad_norm": 1.4334698915481567, + "learning_rate": 1.935160391419492e-06, + "loss": 0.2279, "step": 959600 }, { - "epoch": 9.78, - "learning_rate": 1.731515984157522e-05, - "loss": 0.3621, + "epoch": 13.222286517318343, + "grad_norm": 0.6312920451164246, + "learning_rate": 1.9322078433759688e-06, + "loss": 0.2083, "step": 959700 }, { - "epoch": 9.78, - "learning_rate": 1.7309148718735602e-05, - "loss": 0.257, + "epoch": 13.223664269378082, + "grad_norm": 2.858332872390747, + "learning_rate": 1.9292574674471884e-06, + "loss": 0.2322, "step": 959800 }, { - "epoch": 9.78, - "learning_rate": 1.730313821640289e-05, - "loss": 0.2846, + "epoch": 13.225042021437822, + "grad_norm": 1.0100257396697998, + "learning_rate": 1.926309263883808e-06, + "loss": 0.2033, "step": 959900 }, { - "epoch": 9.78, - "learning_rate": 1.7297128334870936e-05, - "loss": 0.3072, + "epoch": 13.226419773497561, + "grad_norm": 3.617982864379883, + "learning_rate": 1.923392682490494e-06, + "loss": 0.1817, "step": 960000 }, { - "epoch": 9.78, - "learning_rate": 1.7291119074433603e-05, - "loss": 0.3312, + "epoch": 13.227797525557301, + "grad_norm": 4.050951957702637, + "learning_rate": 1.9204488026792146e-06, + "loss": 0.1905, "step": 960100 }, { - "epoch": 9.78, - "learning_rate": 1.72851104353847e-05, - "loss": 0.3154, + "epoch": 13.22917527761704, + "grad_norm": 3.6145012378692627, + "learning_rate": 1.917507095981678e-06, + "loss": 0.1909, "step": 960200 }, { - "epoch": 9.78, - "learning_rate": 1.7279102418018018e-05, - "loss": 0.2991, + "epoch": 13.230553029676779, + "grad_norm": 0.3611353039741516, + "learning_rate": 1.9145675626477906e-06, + "loss": 0.2711, "step": 960300 }, { - "epoch": 9.78, - "learning_rate": 1.7273095022627315e-05, - "loss": 0.3472, + "epoch": 13.23193078173652, + "grad_norm": 1.8225626945495605, + "learning_rate": 1.9116302029272848e-06, + "loss": 0.2352, "step": 960400 }, { - "epoch": 9.79, - "learning_rate": 1.7267088249506328e-05, - "loss": 0.2988, + "epoch": 13.233308533796258, + "grad_norm": 3.953374147415161, + "learning_rate": 1.9086950170697037e-06, + "loss": 0.2131, "step": 960500 }, { - "epoch": 9.79, - "learning_rate": 1.7261082098948726e-05, - "loss": 0.3408, + "epoch": 13.234686285855997, + "grad_norm": 1.9514292478561401, + "learning_rate": 1.9057620053244007e-06, + "loss": 0.2047, "step": 960600 }, { - "epoch": 9.79, - "learning_rate": 1.7255076571248186e-05, - "loss": 0.3137, + "epoch": 13.236064037915737, + "grad_norm": 1.420623779296875, + "learning_rate": 1.9028311679405533e-06, + "loss": 0.1969, "step": 960700 }, { - "epoch": 9.79, - "learning_rate": 1.724907166669836e-05, - "loss": 0.3279, + "epoch": 13.237441789975476, + "grad_norm": 12.847402572631836, + "learning_rate": 1.8999025051671578e-06, + "loss": 0.2108, "step": 960800 }, { - "epoch": 9.79, - "learning_rate": 1.7243067385592827e-05, - "loss": 0.3174, + "epoch": 13.238819542035216, + "grad_norm": 1.4847880601882935, + "learning_rate": 1.8969760172530095e-06, + "loss": 0.2143, "step": 960900 }, { - "epoch": 9.79, - "learning_rate": 1.7237063728225165e-05, - "loss": 0.3038, + "epoch": 13.240197294094955, + "grad_norm": 2.621593475341797, + "learning_rate": 1.8940517044467387e-06, + "loss": 0.1947, "step": 961000 }, { - "epoch": 9.79, - "learning_rate": 1.7231060694888915e-05, - "loss": 0.3558, + "epoch": 13.241575046154693, + "grad_norm": 1.2573065757751465, + "learning_rate": 1.891129566996768e-06, + "loss": 0.1817, "step": 961100 }, { - "epoch": 9.79, - "learning_rate": 1.7225058285877618e-05, - "loss": 0.3511, + "epoch": 13.242952798214434, + "grad_norm": 2.4884815216064453, + "learning_rate": 1.8882096051513612e-06, + "loss": 0.2152, "step": 961200 }, { - "epoch": 9.79, - "learning_rate": 1.7219056501484717e-05, - "loss": 0.2429, + "epoch": 13.244330550274173, + "grad_norm": 0.638947606086731, + "learning_rate": 1.8852918191585716e-06, + "loss": 0.2189, "step": 961300 }, { - "epoch": 9.79, - "learning_rate": 1.7213055342003684e-05, - "loss": 0.3099, + "epoch": 13.245708302333911, + "grad_norm": 1.6240088939666748, + "learning_rate": 1.8823762092662847e-06, + "loss": 0.195, "step": 961400 }, { - "epoch": 9.8, - "learning_rate": 1.7207054807727933e-05, - "loss": 0.2822, + "epoch": 13.247086054393652, + "grad_norm": 1.3978030681610107, + "learning_rate": 1.8794627757222086e-06, + "loss": 0.1933, "step": 961500 }, { - "epoch": 9.8, - "learning_rate": 1.7201054898950857e-05, - "loss": 0.3304, + "epoch": 13.24846380645339, + "grad_norm": 1.0554022789001465, + "learning_rate": 1.8765515187738347e-06, + "loss": 0.1826, "step": 961600 }, { - "epoch": 9.8, - "learning_rate": 1.719505561596582e-05, - "loss": 0.3695, + "epoch": 13.24984155851313, + "grad_norm": 2.806983709335327, + "learning_rate": 1.873642438668496e-06, + "loss": 0.243, "step": 961700 }, { - "epoch": 9.8, - "learning_rate": 1.7189056959066157e-05, - "loss": 0.3645, + "epoch": 13.25121931057287, + "grad_norm": 0.32036328315734863, + "learning_rate": 1.8707355356533415e-06, + "loss": 0.1708, "step": 961800 }, { - "epoch": 9.8, - "learning_rate": 1.7183058928545144e-05, - "loss": 0.355, + "epoch": 13.252597062632608, + "grad_norm": 3.681459903717041, + "learning_rate": 1.8678308099753197e-06, + "loss": 0.2217, "step": 961900 }, { - "epoch": 9.8, - "learning_rate": 1.7177061524696062e-05, - "loss": 0.3278, + "epoch": 13.253974814692349, + "grad_norm": 1.3223588466644287, + "learning_rate": 1.8649282618811983e-06, + "loss": 0.2476, "step": 962000 }, { - "epoch": 9.8, - "learning_rate": 1.717106474781216e-05, - "loss": 0.3164, + "epoch": 13.255352566752087, + "grad_norm": 7.37537956237793, + "learning_rate": 1.8620278916175774e-06, + "loss": 0.2365, "step": 962100 }, { - "epoch": 9.8, - "learning_rate": 1.7165068598186623e-05, - "loss": 0.3258, + "epoch": 13.256730318811826, + "grad_norm": 2.0773894786834717, + "learning_rate": 1.859129699430849e-06, + "loss": 0.2179, "step": 962200 }, { - "epoch": 9.8, - "learning_rate": 1.715907307611264e-05, - "loss": 0.2344, + "epoch": 13.258108070871566, + "grad_norm": 7.191392421722412, + "learning_rate": 1.8562336855672288e-06, + "loss": 0.1976, "step": 962300 }, { - "epoch": 9.81, - "learning_rate": 1.7153078181883358e-05, - "loss": 0.2856, + "epoch": 13.259485822931305, + "grad_norm": 5.158681869506836, + "learning_rate": 1.8533398502727455e-06, + "loss": 0.2421, "step": 962400 }, { - "epoch": 9.81, - "learning_rate": 1.7147083915791874e-05, - "loss": 0.2957, + "epoch": 13.260863574991046, + "grad_norm": 4.292149066925049, + "learning_rate": 1.8504481937932543e-06, + "loss": 0.2222, "step": 962500 }, { - "epoch": 9.81, - "learning_rate": 1.7141090278131288e-05, - "loss": 0.3144, + "epoch": 13.262241327050784, + "grad_norm": 1.4448665380477905, + "learning_rate": 1.8475587163744149e-06, + "loss": 0.169, "step": 962600 }, { - "epoch": 9.81, - "learning_rate": 1.713509726919465e-05, - "loss": 0.3048, + "epoch": 13.263619079110523, + "grad_norm": 2.8384995460510254, + "learning_rate": 1.8446714182616912e-06, + "loss": 0.2924, "step": 962700 }, { - "epoch": 9.81, - "learning_rate": 1.7129104889274978e-05, - "loss": 0.3243, + "epoch": 13.264996831170263, + "grad_norm": 1.0152028799057007, + "learning_rate": 1.841815140096413e-06, + "loss": 0.2406, "step": 962800 }, { - "epoch": 9.81, - "learning_rate": 1.712317305305533e-05, - "loss": 0.3445, + "epoch": 13.266374583230002, + "grad_norm": 0.9772372841835022, + "learning_rate": 1.8389321795324526e-06, + "loss": 0.2323, "step": 962900 }, { - "epoch": 9.81, - "learning_rate": 1.7117181925751064e-05, - "loss": 0.303, + "epoch": 13.26775233528974, + "grad_norm": 2.675663948059082, + "learning_rate": 1.8360513990074804e-06, + "loss": 0.2386, "step": 963000 }, { - "epoch": 9.81, - "learning_rate": 1.711119142833972e-05, - "loss": 0.3156, + "epoch": 13.269130087349481, + "grad_norm": 0.9422595500946045, + "learning_rate": 1.8331727987662369e-06, + "loss": 0.2058, "step": 963100 }, { - "epoch": 9.81, - "learning_rate": 1.7105201561114214e-05, - "loss": 0.27, + "epoch": 13.27050783940922, + "grad_norm": 2.2564454078674316, + "learning_rate": 1.830296379053286e-06, + "loss": 0.2187, "step": 963200 }, { - "epoch": 9.81, - "learning_rate": 1.7099212324367388e-05, - "loss": 0.3537, + "epoch": 13.271885591468958, + "grad_norm": 4.06761360168457, + "learning_rate": 1.827422140112971e-06, + "loss": 0.2481, "step": 963300 }, { - "epoch": 9.82, - "learning_rate": 1.709322371839209e-05, - "loss": 0.3466, + "epoch": 13.273263343528699, + "grad_norm": 4.00928258895874, + "learning_rate": 1.8245500821894862e-06, + "loss": 0.2098, "step": 963400 }, { - "epoch": 9.82, - "learning_rate": 1.7087235743481127e-05, - "loss": 0.3857, + "epoch": 13.274641095588438, + "grad_norm": 1.5838277339935303, + "learning_rate": 1.8216802055268333e-06, + "loss": 0.2395, "step": 963500 }, { - "epoch": 9.82, - "learning_rate": 1.708124839992729e-05, - "loss": 0.3252, + "epoch": 13.276018847648178, + "grad_norm": 2.5445423126220703, + "learning_rate": 1.8188125103688097e-06, + "loss": 0.255, "step": 963600 }, { - "epoch": 9.82, - "learning_rate": 1.7075261688023303e-05, - "loss": 0.2562, + "epoch": 13.277396599707917, + "grad_norm": 0.20837311446666718, + "learning_rate": 1.8159469969590506e-06, + "loss": 0.1614, "step": 963700 }, { - "epoch": 9.82, - "learning_rate": 1.7069275608061884e-05, - "loss": 0.3116, + "epoch": 13.278774351767655, + "grad_norm": 0.6879075169563293, + "learning_rate": 1.81308366554099e-06, + "loss": 0.2176, "step": 963800 }, { - "epoch": 9.82, - "learning_rate": 1.7063290160335723e-05, - "loss": 0.2645, + "epoch": 13.280152103827396, + "grad_norm": 1.2926067113876343, + "learning_rate": 1.8102225163578907e-06, + "loss": 0.206, "step": 963900 }, { - "epoch": 9.82, - "learning_rate": 1.705730534513747e-05, - "loss": 0.3129, + "epoch": 13.281529855887134, + "grad_norm": 0.3677089810371399, + "learning_rate": 1.807363549652811e-06, + "loss": 0.2267, "step": 964000 }, { - "epoch": 9.82, - "learning_rate": 1.7051321162759747e-05, - "loss": 0.359, + "epoch": 13.282907607946873, + "grad_norm": 0.8048468232154846, + "learning_rate": 1.8045067656686416e-06, + "loss": 0.1896, "step": 964100 }, { - "epoch": 9.82, - "learning_rate": 1.704539744585293e-05, - "loss": 0.2894, + "epoch": 13.284285360006614, + "grad_norm": 2.8021392822265625, + "learning_rate": 1.8016521646480833e-06, + "loss": 0.1946, "step": 964200 }, { - "epoch": 9.82, - "learning_rate": 1.7039414523658502e-05, - "loss": 0.2653, + "epoch": 13.285663112066352, + "grad_norm": 3.0912606716156006, + "learning_rate": 1.7987997468336451e-06, + "loss": 0.2375, "step": 964300 }, { - "epoch": 9.83, - "learning_rate": 1.7033432235159375e-05, - "loss": 0.2812, + "epoch": 13.287040864126093, + "grad_norm": 0.29730942845344543, + "learning_rate": 1.7959495124676495e-06, + "loss": 0.2421, "step": 964400 }, { - "epoch": 9.83, - "learning_rate": 1.7027510394053937e-05, - "loss": 0.3099, + "epoch": 13.288418616185831, + "grad_norm": 0.2854321002960205, + "learning_rate": 1.7931014617922512e-06, + "loss": 0.1812, "step": 964500 }, { - "epoch": 9.83, - "learning_rate": 1.7021529367478582e-05, - "loss": 0.371, + "epoch": 13.28979636824557, + "grad_norm": 3.0549330711364746, + "learning_rate": 1.790255595049394e-06, + "loss": 0.259, "step": 964600 }, { - "epoch": 9.83, - "learning_rate": 1.7015548975472993e-05, - "loss": 0.2287, + "epoch": 13.29117412030531, + "grad_norm": 3.5647294521331787, + "learning_rate": 1.7874119124808511e-06, + "loss": 0.2054, "step": 964700 }, { - "epoch": 9.83, - "learning_rate": 1.700956921832957e-05, - "loss": 0.3162, + "epoch": 13.292551872365049, + "grad_norm": 2.9089515209198, + "learning_rate": 1.7845704143282181e-06, + "loss": 0.2317, "step": 964800 }, { - "epoch": 9.83, - "learning_rate": 1.7003649884415603e-05, - "loss": 0.2911, + "epoch": 13.293929624424788, + "grad_norm": 0.3399277627468109, + "learning_rate": 1.7817594831529938e-06, + "loss": 0.2213, "step": 964900 }, { - "epoch": 9.83, - "learning_rate": 1.6997671391517702e-05, - "loss": 0.4066, + "epoch": 13.295307376484528, + "grad_norm": 2.450773239135742, + "learning_rate": 1.7789223327059987e-06, + "loss": 0.2322, "step": 965000 }, { - "epoch": 9.83, - "learning_rate": 1.6991693534356064e-05, - "loss": 0.3096, + "epoch": 13.296685128544267, + "grad_norm": 1.8029911518096924, + "learning_rate": 1.7760873673961383e-06, + "loss": 0.1924, "step": 965100 }, { - "epoch": 9.83, - "learning_rate": 1.6985716313222985e-05, - "loss": 0.32, + "epoch": 13.298062880604007, + "grad_norm": 0.9154857993125916, + "learning_rate": 1.7732545874642578e-06, + "loss": 0.2255, "step": 965200 }, { - "epoch": 9.83, - "learning_rate": 1.6979739728410712e-05, - "loss": 0.368, + "epoch": 13.299440632663746, + "grad_norm": 3.634655714035034, + "learning_rate": 1.7704239931510217e-06, + "loss": 0.203, "step": 965300 }, { - "epoch": 9.84, - "learning_rate": 1.6973763780211462e-05, - "loss": 0.3208, + "epoch": 13.300818384723485, + "grad_norm": 2.506727933883667, + "learning_rate": 1.7675955846968992e-06, + "loss": 0.195, "step": 965400 }, { - "epoch": 9.84, - "learning_rate": 1.696778846891743e-05, - "loss": 0.2783, + "epoch": 13.302196136783225, + "grad_norm": 2.1380488872528076, + "learning_rate": 1.7647693623421734e-06, + "loss": 0.2426, "step": 965500 }, { - "epoch": 9.84, - "learning_rate": 1.696181379482077e-05, - "loss": 0.3401, + "epoch": 13.303573888842964, + "grad_norm": 2.5635151863098145, + "learning_rate": 1.7619453263269567e-06, + "loss": 0.2361, "step": 965600 }, { - "epoch": 9.84, - "learning_rate": 1.695583975821362e-05, - "loss": 0.3067, + "epoch": 13.304951640902702, + "grad_norm": 0.0536142997443676, + "learning_rate": 1.7591234768911533e-06, + "loss": 0.2229, "step": 965700 }, { - "epoch": 9.84, - "learning_rate": 1.694986635938805e-05, - "loss": 0.3021, + "epoch": 13.306329392962443, + "grad_norm": 1.680090308189392, + "learning_rate": 1.7563038142744998e-06, + "loss": 0.2149, "step": 965800 }, { - "epoch": 9.84, - "learning_rate": 1.6943893598636134e-05, - "loss": 0.291, + "epoch": 13.307707145022182, + "grad_norm": 1.297087550163269, + "learning_rate": 1.7534863387165465e-06, + "loss": 0.2408, "step": 965900 }, { - "epoch": 9.84, - "learning_rate": 1.693792147624992e-05, - "loss": 0.3209, + "epoch": 13.309084897081922, + "grad_norm": 4.679940223693848, + "learning_rate": 1.7506710504566452e-06, + "loss": 0.2246, "step": 966000 }, { - "epoch": 9.84, - "learning_rate": 1.693194999252138e-05, - "loss": 0.3106, + "epoch": 13.31046264914166, + "grad_norm": 0.42619386315345764, + "learning_rate": 1.747857949733968e-06, + "loss": 0.1815, "step": 966100 }, { - "epoch": 9.84, - "learning_rate": 1.69259791477425e-05, - "loss": 0.2939, + "epoch": 13.3118404012014, + "grad_norm": 2.690516471862793, + "learning_rate": 1.7450470367875094e-06, + "loss": 0.2053, "step": 966200 }, { - "epoch": 9.84, - "learning_rate": 1.6920008942205233e-05, - "loss": 0.3537, + "epoch": 13.31321815326114, + "grad_norm": 3.1921546459198, + "learning_rate": 1.7422383118560623e-06, + "loss": 0.2479, "step": 966300 }, { - "epoch": 9.85, - "learning_rate": 1.6914039376201455e-05, - "loss": 0.3198, + "epoch": 13.314595905320878, + "grad_norm": 3.8973655700683594, + "learning_rate": 1.739431775178246e-06, + "loss": 0.2306, "step": 966400 }, { - "epoch": 9.85, - "learning_rate": 1.6908070450023065e-05, - "loss": 0.3086, + "epoch": 13.315973657380617, + "grad_norm": 2.7223026752471924, + "learning_rate": 1.7366274269924964e-06, + "loss": 0.2305, "step": 966500 }, { - "epoch": 9.85, - "learning_rate": 1.6902102163961896e-05, - "loss": 0.3634, + "epoch": 13.317351409440358, + "grad_norm": 2.297441005706787, + "learning_rate": 1.733825267537048e-06, + "loss": 0.2017, "step": 966600 }, { - "epoch": 9.85, - "learning_rate": 1.689613451830977e-05, - "loss": 0.3076, + "epoch": 13.318729161500096, + "grad_norm": 1.2714729309082031, + "learning_rate": 1.7310252970499614e-06, + "loss": 0.2593, "step": 966700 }, { - "epoch": 9.85, - "learning_rate": 1.6890167513358466e-05, - "loss": 0.3054, + "epoch": 13.320106913559837, + "grad_norm": 2.4037818908691406, + "learning_rate": 1.7282275157691072e-06, + "loss": 0.23, "step": 966800 }, { - "epoch": 9.85, - "learning_rate": 1.6884201149399745e-05, - "loss": 0.3128, + "epoch": 13.321484665619575, + "grad_norm": 1.806141972541809, + "learning_rate": 1.7254319239321768e-06, + "loss": 0.2197, "step": 966900 }, { - "epoch": 9.85, - "learning_rate": 1.687823542672531e-05, - "loss": 0.3169, + "epoch": 13.322862417679314, + "grad_norm": 0.31874725222587585, + "learning_rate": 1.7226385217766686e-06, + "loss": 0.2088, "step": 967000 }, { - "epoch": 9.85, - "learning_rate": 1.6872270345626853e-05, - "loss": 0.3464, + "epoch": 13.324240169739054, + "grad_norm": 1.0153437852859497, + "learning_rate": 1.719847309539886e-06, + "loss": 0.1966, "step": 967100 }, { - "epoch": 9.85, - "learning_rate": 1.686630590639603e-05, - "loss": 0.2938, + "epoch": 13.325617921798793, + "grad_norm": 2.779550313949585, + "learning_rate": 1.7170582874589701e-06, + "loss": 0.2138, "step": 967200 }, { - "epoch": 9.86, - "learning_rate": 1.6860342109324486e-05, - "loss": 0.2432, + "epoch": 13.326995673858532, + "grad_norm": 0.6179196238517761, + "learning_rate": 1.7142714557708552e-06, + "loss": 0.1844, "step": 967300 }, { - "epoch": 9.86, - "learning_rate": 1.6854378954703787e-05, - "loss": 0.2634, + "epoch": 13.328373425918272, + "grad_norm": 0.23961561918258667, + "learning_rate": 1.7114868147122978e-06, + "loss": 0.1959, "step": 967400 }, { - "epoch": 9.86, - "learning_rate": 1.6848416442825508e-05, - "loss": 0.3411, + "epoch": 13.329751177978011, + "grad_norm": 1.4346020221710205, + "learning_rate": 1.7087043645198743e-06, + "loss": 0.2056, "step": 967500 }, { - "epoch": 9.86, - "learning_rate": 1.6842454573981195e-05, - "loss": 0.2726, + "epoch": 13.33112893003775, + "grad_norm": 1.4697591066360474, + "learning_rate": 1.7059241054299602e-06, + "loss": 0.2327, "step": 967600 }, { - "epoch": 9.86, - "learning_rate": 1.6836493348462322e-05, - "loss": 0.2996, + "epoch": 13.33250668209749, + "grad_norm": 0.1433299332857132, + "learning_rate": 1.703146037678753e-06, + "loss": 0.2196, "step": 967700 }, { - "epoch": 9.86, - "learning_rate": 1.683053276656036e-05, - "loss": 0.3057, + "epoch": 13.333884434157229, + "grad_norm": 2.8740367889404297, + "learning_rate": 1.7003701615022648e-06, + "loss": 0.1792, "step": 967800 }, { - "epoch": 9.86, - "learning_rate": 1.682457282856676e-05, - "loss": 0.3355, + "epoch": 13.33526218621697, + "grad_norm": 1.9139193296432495, + "learning_rate": 1.6975964771363243e-06, + "loss": 0.1985, "step": 967900 }, { - "epoch": 9.86, - "learning_rate": 1.6818613534772924e-05, - "loss": 0.3784, + "epoch": 13.336639938276708, + "grad_norm": 1.7368520498275757, + "learning_rate": 1.6948249848165642e-06, + "loss": 0.1736, "step": 968000 }, { - "epoch": 9.86, - "learning_rate": 1.681265488547022e-05, - "loss": 0.3397, + "epoch": 13.338017690336446, + "grad_norm": 1.6556569337844849, + "learning_rate": 1.692055684778438e-06, + "loss": 0.202, "step": 968100 }, { - "epoch": 9.86, - "learning_rate": 1.6806756457802574e-05, - "loss": 0.2966, + "epoch": 13.339395442396187, + "grad_norm": 3.377519130706787, + "learning_rate": 1.6892885772572245e-06, + "loss": 0.2611, "step": 968200 }, { - "epoch": 9.87, - "learning_rate": 1.6800799091903963e-05, - "loss": 0.2685, + "epoch": 13.340773194455926, + "grad_norm": 6.001559257507324, + "learning_rate": 1.686523662487986e-06, + "loss": 0.2483, "step": 968300 }, { - "epoch": 9.87, - "learning_rate": 1.6794842371367513e-05, - "loss": 0.2549, + "epoch": 13.342150946515664, + "grad_norm": 0.0038635018281638622, + "learning_rate": 1.6837609407056198e-06, + "loss": 0.2409, "step": 968400 }, { - "epoch": 9.87, - "learning_rate": 1.678888629648446e-05, - "loss": 0.2952, + "epoch": 13.343528698575405, + "grad_norm": 2.0864150524139404, + "learning_rate": 1.6810004121448434e-06, + "loss": 0.2225, "step": 968500 }, { - "epoch": 9.87, - "learning_rate": 1.6782930867546022e-05, - "loss": 0.2872, + "epoch": 13.344906450635143, + "grad_norm": 3.923978567123413, + "learning_rate": 1.678242077040166e-06, + "loss": 0.2032, "step": 968600 }, { - "epoch": 9.87, - "learning_rate": 1.6776976084843404e-05, - "loss": 0.3113, + "epoch": 13.346284202694884, + "grad_norm": 2.1420841217041016, + "learning_rate": 1.6754859356259328e-06, + "loss": 0.1897, "step": 968700 }, { - "epoch": 9.87, - "learning_rate": 1.6771021948667736e-05, - "loss": 0.2226, + "epoch": 13.347661954754622, + "grad_norm": 0.8581820130348206, + "learning_rate": 1.6727319881362807e-06, + "loss": 0.1966, "step": 968800 }, { - "epoch": 9.87, - "learning_rate": 1.676506845931014e-05, - "loss": 0.3066, + "epoch": 13.349039706814361, + "grad_norm": 1.9391624927520752, + "learning_rate": 1.669980234805182e-06, + "loss": 0.2227, "step": 968900 }, { - "epoch": 9.87, - "learning_rate": 1.675911561706173e-05, - "loss": 0.3617, + "epoch": 13.350417458874102, + "grad_norm": NaN, + "learning_rate": 1.6672581605927847e-06, + "loss": 0.2526, "step": 969000 }, { - "epoch": 9.87, - "learning_rate": 1.6753163422213543e-05, - "loss": 0.3817, + "epoch": 13.35179521093384, + "grad_norm": 2.4621922969818115, + "learning_rate": 1.6645382373309544e-06, + "loss": 0.2089, "step": 969100 }, { - "epoch": 9.87, - "learning_rate": 1.6747211875056607e-05, - "loss": 0.3065, + "epoch": 13.353172962993579, + "grad_norm": 2.3735692501068115, + "learning_rate": 1.6617930239779405e-06, + "loss": 0.1911, "step": 969200 }, { - "epoch": 9.88, - "learning_rate": 1.674126097588192e-05, - "loss": 0.2769, + "epoch": 13.35455071505332, + "grad_norm": 2.3573076725006104, + "learning_rate": 1.6590500057127898e-06, + "loss": 0.2427, "step": 969300 }, { - "epoch": 9.88, - "learning_rate": 1.673531072498045e-05, - "loss": 0.3657, + "epoch": 13.355928467113058, + "grad_norm": 0.6580946445465088, + "learning_rate": 1.6563091827685462e-06, + "loss": 0.2079, "step": 969400 }, { - "epoch": 9.88, - "learning_rate": 1.6729361122643126e-05, - "loss": 0.3369, + "epoch": 13.357306219172798, + "grad_norm": 1.3953064680099487, + "learning_rate": 1.6535705553780494e-06, + "loss": 0.2252, "step": 969500 }, { - "epoch": 9.88, - "learning_rate": 1.6723412169160844e-05, - "loss": 0.3616, + "epoch": 13.358683971232537, + "grad_norm": 1.5804858207702637, + "learning_rate": 1.6508341237739653e-06, + "loss": 0.185, "step": 969600 }, { - "epoch": 9.88, - "learning_rate": 1.671746386482449e-05, - "loss": 0.3032, + "epoch": 13.360061723292276, + "grad_norm": 2.270874500274658, + "learning_rate": 1.6480998881887665e-06, + "loss": 0.1747, "step": 969700 }, { - "epoch": 9.88, - "learning_rate": 1.6711516209924876e-05, - "loss": 0.3673, + "epoch": 13.361439475352016, + "grad_norm": 0.686627984046936, + "learning_rate": 1.6453678488547405e-06, + "loss": 0.2254, "step": 969800 }, { - "epoch": 9.88, - "learning_rate": 1.6705569204752817e-05, - "loss": 0.2822, + "epoch": 13.362817227411755, + "grad_norm": 0.15113112330436707, + "learning_rate": 1.6426380060039848e-06, + "loss": 0.2203, "step": 969900 }, { - "epoch": 9.88, - "learning_rate": 1.6699622849599103e-05, - "loss": 0.3379, + "epoch": 13.364194979471494, + "grad_norm": 2.3919641971588135, + "learning_rate": 1.6399103598684198e-06, + "loss": 0.1706, "step": 970000 }, { - "epoch": 9.88, - "learning_rate": 1.6693677144754444e-05, - "loss": 0.2835, + "epoch": 13.365572731531234, + "grad_norm": 6.637493133544922, + "learning_rate": 1.6371849106797648e-06, + "loss": 0.2116, "step": 970100 }, { - "epoch": 9.88, - "learning_rate": 1.6687732090509567e-05, - "loss": 0.3514, + "epoch": 13.366950483590973, + "grad_norm": 2.3451459407806396, + "learning_rate": 1.6344888803128737e-06, + "loss": 0.1782, "step": 970200 }, { - "epoch": 9.89, - "learning_rate": 1.6681787687155158e-05, - "loss": 0.2895, + "epoch": 13.368328235650713, + "grad_norm": 1.4470887184143066, + "learning_rate": 1.6317678037372435e-06, + "loss": 0.2063, "step": 970300 }, { - "epoch": 9.89, - "learning_rate": 1.6675843934981833e-05, - "loss": 0.3096, + "epoch": 13.369705987710452, + "grad_norm": 1.908995270729065, + "learning_rate": 1.6290489248002763e-06, + "loss": 0.2095, "step": 970400 }, { - "epoch": 9.89, - "learning_rate": 1.6669900834280234e-05, - "loss": 0.2796, + "epoch": 13.37108373977019, + "grad_norm": 1.9326345920562744, + "learning_rate": 1.6263322437329615e-06, + "loss": 0.1824, "step": 970500 }, { - "epoch": 9.89, - "learning_rate": 1.6663958385340933e-05, - "loss": 0.2901, + "epoch": 13.372461491829931, + "grad_norm": 4.54675817489624, + "learning_rate": 1.6236177607660867e-06, + "loss": 0.2222, "step": 970600 }, { - "epoch": 9.89, - "learning_rate": 1.6658016588454482e-05, - "loss": 0.3148, + "epoch": 13.37383924388967, + "grad_norm": 8.290882110595703, + "learning_rate": 1.6209054761302664e-06, + "loss": 0.2496, "step": 970700 }, { - "epoch": 9.89, - "learning_rate": 1.6652075443911395e-05, - "loss": 0.373, + "epoch": 13.375216995949408, + "grad_norm": 0.9323617219924927, + "learning_rate": 1.6181953900559245e-06, + "loss": 0.2073, "step": 970800 }, { - "epoch": 9.89, - "learning_rate": 1.664613495200218e-05, - "loss": 0.306, + "epoch": 13.376594748009149, + "grad_norm": 2.27738356590271, + "learning_rate": 1.6154875027732968e-06, + "loss": 0.1709, "step": 970900 }, { - "epoch": 9.89, - "learning_rate": 1.6640195113017255e-05, - "loss": 0.3155, + "epoch": 13.377972500068887, + "grad_norm": 10.14094066619873, + "learning_rate": 1.612781814512426e-06, + "loss": 0.1959, "step": 971000 }, { - "epoch": 9.89, - "learning_rate": 1.6634255927247067e-05, - "loss": 0.31, + "epoch": 13.379350252128628, + "grad_norm": 2.2707149982452393, + "learning_rate": 1.610078325503184e-06, + "loss": 0.1881, "step": 971100 }, { - "epoch": 9.89, - "learning_rate": 1.6628317394982e-05, - "loss": 0.3228, + "epoch": 13.380728004188366, + "grad_norm": 1.2219228744506836, + "learning_rate": 1.607377035975232e-06, + "loss": 0.2055, "step": 971200 }, { - "epoch": 9.9, - "learning_rate": 1.662237951651243e-05, - "loss": 0.3089, + "epoch": 13.382105756248105, + "grad_norm": 2.8613808155059814, + "learning_rate": 1.60467794615807e-06, + "loss": 0.209, "step": 971300 }, { - "epoch": 9.9, - "learning_rate": 1.661644229212865e-05, - "loss": 0.3738, + "epoch": 13.383483508307846, + "grad_norm": 1.4485281705856323, + "learning_rate": 1.6019810562809983e-06, + "loss": 0.2442, "step": 971400 }, { - "epoch": 9.9, - "learning_rate": 1.6610505722120978e-05, - "loss": 0.345, + "epoch": 13.384861260367584, + "grad_norm": 2.4416680335998535, + "learning_rate": 1.5992863665731291e-06, + "loss": 0.196, "step": 971500 }, { - "epoch": 9.9, - "learning_rate": 1.660456980677968e-05, - "loss": 0.3154, + "epoch": 13.386239012427323, + "grad_norm": 3.0236451625823975, + "learning_rate": 1.5965938772633879e-06, + "loss": 0.2033, "step": 971600 }, { - "epoch": 9.9, - "learning_rate": 1.659863454639497e-05, - "loss": 0.3149, + "epoch": 13.387616764487063, + "grad_norm": 4.0460052490234375, + "learning_rate": 1.5939035885805168e-06, + "loss": 0.1949, "step": 971700 }, { - "epoch": 9.9, - "learning_rate": 1.659269994125704e-05, - "loss": 0.3704, + "epoch": 13.388994516546802, + "grad_norm": 3.5044283866882324, + "learning_rate": 1.5912155007530718e-06, + "loss": 0.1987, "step": 971800 }, { - "epoch": 9.9, - "learning_rate": 1.65867659916561e-05, - "loss": 0.3407, + "epoch": 13.39037226860654, + "grad_norm": 0.32671087980270386, + "learning_rate": 1.5885296140094169e-06, + "loss": 0.215, "step": 971900 }, { - "epoch": 9.9, - "learning_rate": 1.6580832697882242e-05, - "loss": 0.3011, + "epoch": 13.391750020666281, + "grad_norm": 0.9594303369522095, + "learning_rate": 1.5858459285777386e-06, + "loss": 0.2355, "step": 972000 }, { - "epoch": 9.9, - "learning_rate": 1.6574900060225584e-05, - "loss": 0.3437, + "epoch": 13.39312777272602, + "grad_norm": 0.8852246403694153, + "learning_rate": 1.583164444686025e-06, + "loss": 0.2103, "step": 972100 }, { - "epoch": 9.9, - "learning_rate": 1.656896807897621e-05, - "loss": 0.2981, + "epoch": 13.39450552478576, + "grad_norm": 1.6586456298828125, + "learning_rate": 1.580485162562075e-06, + "loss": 0.2076, "step": 972200 }, { - "epoch": 9.91, - "learning_rate": 1.6563036754424134e-05, - "loss": 0.2986, + "epoch": 13.395883276845499, + "grad_norm": 3.041682481765747, + "learning_rate": 1.5778080824335136e-06, + "loss": 0.2326, "step": 972300 }, { - "epoch": 9.91, - "learning_rate": 1.6557106086859374e-05, - "loss": 0.3135, + "epoch": 13.397261028905238, + "grad_norm": 5.47319221496582, + "learning_rate": 1.5751332045277794e-06, + "loss": 0.2231, "step": 972400 }, { - "epoch": 9.91, - "learning_rate": 1.6551235373420303e-05, - "loss": 0.2921, + "epoch": 13.398638780964978, + "grad_norm": 1.3653414249420166, + "learning_rate": 1.5724605290721094e-06, + "loss": 0.2244, "step": 972500 }, { - "epoch": 9.91, - "learning_rate": 1.6545306014122952e-05, - "loss": 0.2743, + "epoch": 13.400016533024717, + "grad_norm": 2.543056011199951, + "learning_rate": 1.5697900562935546e-06, + "loss": 0.1928, "step": 972600 }, { - "epoch": 9.91, - "learning_rate": 1.6539377312679858e-05, - "loss": 0.2732, + "epoch": 13.401394285084455, + "grad_norm": 4.055856704711914, + "learning_rate": 1.5671217864189948e-06, + "loss": 0.207, "step": 972700 }, { - "epoch": 9.91, - "learning_rate": 1.6533449269380872e-05, - "loss": 0.2474, + "epoch": 13.402772037144196, + "grad_norm": 1.5557111501693726, + "learning_rate": 1.5644557196751177e-06, + "loss": 0.2174, "step": 972800 }, { - "epoch": 9.91, - "learning_rate": 1.6527521884515854e-05, - "loss": 0.2414, + "epoch": 13.404149789203935, + "grad_norm": 0.6921003460884094, + "learning_rate": 1.5617918562884062e-06, + "loss": 0.2233, "step": 972900 }, { - "epoch": 9.91, - "learning_rate": 1.6521595158374616e-05, - "loss": 0.2974, + "epoch": 13.405527541263675, + "grad_norm": 2.644334554672241, + "learning_rate": 1.5591568021747277e-06, + "loss": 0.2124, "step": 973000 }, { - "epoch": 9.91, - "learning_rate": 1.6515669091246933e-05, - "loss": 0.2977, + "epoch": 13.406905293323414, + "grad_norm": 2.2516720294952393, + "learning_rate": 1.5564973241418872e-06, + "loss": 0.2312, "step": 973100 }, { - "epoch": 9.92, - "learning_rate": 1.650974368342257e-05, - "loss": 0.2877, + "epoch": 13.408283045383152, + "grad_norm": 3.5470683574676514, + "learning_rate": 1.5538400501423274e-06, + "loss": 0.2675, "step": 973200 }, { - "epoch": 9.92, - "learning_rate": 1.650381893519124e-05, - "loss": 0.3629, + "epoch": 13.409660797442893, + "grad_norm": 9.808758735656738, + "learning_rate": 1.5511849804017895e-06, + "loss": 0.3152, "step": 973300 }, { - "epoch": 9.92, - "learning_rate": 1.649789484684262e-05, - "loss": 0.3576, + "epoch": 13.411038549502631, + "grad_norm": 1.2279080152511597, + "learning_rate": 1.54853211514584e-06, + "loss": 0.2358, "step": 973400 }, { - "epoch": 9.92, - "learning_rate": 1.649197141866636e-05, - "loss": 0.311, + "epoch": 13.41241630156237, + "grad_norm": 5.738792419433594, + "learning_rate": 1.5458814545998604e-06, + "loss": 0.2255, "step": 973500 }, { - "epoch": 9.92, - "learning_rate": 1.6486048650952083e-05, - "loss": 0.3238, + "epoch": 13.41379405362211, + "grad_norm": 1.5541588068008423, + "learning_rate": 1.5432329989890295e-06, + "loss": 0.2018, "step": 973600 }, { - "epoch": 9.92, - "learning_rate": 1.6480126543989397e-05, - "loss": 0.2817, + "epoch": 13.41517180568185, + "grad_norm": 5.201451778411865, + "learning_rate": 1.5405867485383467e-06, + "loss": 0.2686, "step": 973700 }, { - "epoch": 9.92, - "learning_rate": 1.6474205098067827e-05, - "loss": 0.306, + "epoch": 13.41654955774159, + "grad_norm": 1.6326016187667847, + "learning_rate": 1.5379427034726312e-06, + "loss": 0.1861, "step": 973800 }, { - "epoch": 9.92, - "learning_rate": 1.646828431347691e-05, - "loss": 0.3195, + "epoch": 13.417927309801328, + "grad_norm": 3.761275053024292, + "learning_rate": 1.5353008640165003e-06, + "loss": 0.2297, "step": 973900 }, { - "epoch": 9.92, - "learning_rate": 1.646236419050615e-05, - "loss": 0.3032, + "epoch": 13.419305061861067, + "grad_norm": 1.3702479600906372, + "learning_rate": 1.5326612303943946e-06, + "loss": 0.2145, "step": 974000 }, { - "epoch": 9.92, - "learning_rate": 1.6456444729444978e-05, - "loss": 0.2901, + "epoch": 13.420682813920807, + "grad_norm": 2.6068058013916016, + "learning_rate": 1.5300238028305682e-06, + "loss": 0.2391, "step": 974100 }, { - "epoch": 9.93, - "learning_rate": 1.6450525930582837e-05, - "loss": 0.336, + "epoch": 13.422060565980546, + "grad_norm": 1.8667982816696167, + "learning_rate": 1.5273885815490831e-06, + "loss": 0.2306, "step": 974200 }, { - "epoch": 9.93, - "learning_rate": 1.644460779420912e-05, - "loss": 0.2921, + "epoch": 13.423438318040285, + "grad_norm": 0.05423307791352272, + "learning_rate": 1.5247555667738088e-06, + "loss": 0.2275, "step": 974300 }, { - "epoch": 9.93, - "learning_rate": 1.6438690320613187e-05, - "loss": 0.3396, + "epoch": 13.424816070100025, + "grad_norm": 1.0515514612197876, + "learning_rate": 1.522124758728438e-06, + "loss": 0.2424, "step": 974400 }, { - "epoch": 9.93, - "learning_rate": 1.6432773510084368e-05, - "loss": 0.2329, + "epoch": 13.426193822159764, + "grad_norm": 2.653064489364624, + "learning_rate": 1.5194961576364732e-06, + "loss": 0.167, "step": 974500 }, { - "epoch": 9.93, - "learning_rate": 1.642685736291197e-05, - "loss": 0.2667, + "epoch": 13.427571574219504, + "grad_norm": 4.222901821136475, + "learning_rate": 1.5168697637212229e-06, + "loss": 0.1886, "step": 974600 }, { - "epoch": 9.93, - "learning_rate": 1.642094187938523e-05, - "loss": 0.2805, + "epoch": 13.428949326279243, + "grad_norm": 0.6469887495040894, + "learning_rate": 1.5142455772058172e-06, + "loss": 0.2236, "step": 974700 }, { - "epoch": 9.93, - "learning_rate": 1.6415027059793403e-05, - "loss": 0.3168, + "epoch": 13.430327078338982, + "grad_norm": 0.8529631495475769, + "learning_rate": 1.5116235983131948e-06, + "loss": 0.2271, "step": 974800 }, { - "epoch": 9.93, - "learning_rate": 1.6409112904425695e-05, - "loss": 0.3657, + "epoch": 13.431704830398722, + "grad_norm": 0.9263003468513489, + "learning_rate": 1.5090038272660955e-06, + "loss": 0.1843, "step": 974900 }, { - "epoch": 9.93, - "learning_rate": 1.6403199413571243e-05, - "loss": 0.3018, + "epoch": 13.43308258245846, + "grad_norm": 0.8160406351089478, + "learning_rate": 1.5063862642870942e-06, + "loss": 0.2261, "step": 975000 }, { - "epoch": 9.93, - "learning_rate": 1.63972865875192e-05, - "loss": 0.3193, + "epoch": 13.4344603345182, + "grad_norm": 0.4991409182548523, + "learning_rate": 1.5037709095985647e-06, + "loss": 0.229, "step": 975100 }, { - "epoch": 9.94, - "learning_rate": 1.639137442655868e-05, - "loss": 0.2272, + "epoch": 13.43583808657794, + "grad_norm": 3.1712472438812256, + "learning_rate": 1.5011577634226878e-06, + "loss": 0.215, "step": 975200 }, { - "epoch": 9.94, - "learning_rate": 1.6385462930978724e-05, - "loss": 0.3226, + "epoch": 13.437215838637679, + "grad_norm": 2.931771993637085, + "learning_rate": 1.498546825981471e-06, + "loss": 0.1871, "step": 975300 }, { - "epoch": 9.94, - "learning_rate": 1.6379552101068385e-05, - "loss": 0.3661, + "epoch": 13.438593590697419, + "grad_norm": 2.5258891582489014, + "learning_rate": 1.4959380974967194e-06, + "loss": 0.2022, "step": 975400 }, { - "epoch": 9.94, - "learning_rate": 1.6373641937116667e-05, - "loss": 0.2919, + "epoch": 13.439971342757158, + "grad_norm": 2.1388068199157715, + "learning_rate": 1.493331578190068e-06, + "loss": 0.2185, "step": 975500 }, { - "epoch": 9.94, - "learning_rate": 1.6367732439412538e-05, - "loss": 0.2695, + "epoch": 13.441349094816896, + "grad_norm": 0.913186252117157, + "learning_rate": 1.490727268282944e-06, + "loss": 0.2461, "step": 975600 }, { - "epoch": 9.94, - "learning_rate": 1.6361882693256315e-05, - "loss": 0.3266, + "epoch": 13.442726846876637, + "grad_norm": 1.7823710441589355, + "learning_rate": 1.4881251679966002e-06, + "loss": 0.2024, "step": 975700 }, { - "epoch": 9.94, - "learning_rate": 1.635597452224447e-05, - "loss": 0.3197, + "epoch": 13.444104598936375, + "grad_norm": 1.3149183988571167, + "learning_rate": 1.4855252775521064e-06, + "loss": 0.1734, "step": 975800 }, { - "epoch": 9.94, - "learning_rate": 1.6350067018344044e-05, - "loss": 0.3561, + "epoch": 13.445482350996114, + "grad_norm": 1.0991275310516357, + "learning_rate": 1.4829275971703248e-06, + "loss": 0.1982, "step": 975900 }, { - "epoch": 9.94, - "learning_rate": 1.634416018184389e-05, - "loss": 0.3026, + "epoch": 13.446860103055855, + "grad_norm": 2.2895333766937256, + "learning_rate": 1.480332127071947e-06, + "loss": 0.1653, "step": 976000 }, { - "epoch": 9.94, - "learning_rate": 1.6338254013032815e-05, - "loss": 0.317, + "epoch": 13.448237855115593, + "grad_norm": 3.155470848083496, + "learning_rate": 1.4777388674774713e-06, + "loss": 0.2139, "step": 976100 }, { - "epoch": 9.95, - "learning_rate": 1.633234851219958e-05, - "loss": 0.2814, + "epoch": 13.449615607175332, + "grad_norm": 2.3851096630096436, + "learning_rate": 1.4751478186072048e-06, + "loss": 0.2386, "step": 976200 }, { - "epoch": 9.95, - "learning_rate": 1.632644367963294e-05, - "loss": 0.2904, + "epoch": 13.450993359235072, + "grad_norm": 3.124310255050659, + "learning_rate": 1.4725589806812736e-06, + "loss": 0.2577, "step": 976300 }, { - "epoch": 9.95, - "learning_rate": 1.6320539515621614e-05, - "loss": 0.3409, + "epoch": 13.452371111294811, + "grad_norm": 1.242732048034668, + "learning_rate": 1.469972353919615e-06, + "loss": 0.2735, "step": 976400 }, { - "epoch": 9.95, - "learning_rate": 1.631463602045426e-05, - "loss": 0.3202, + "epoch": 13.453748863354551, + "grad_norm": 2.21231746673584, + "learning_rate": 1.4673879385419768e-06, + "loss": 0.2592, "step": 976500 }, { - "epoch": 9.95, - "learning_rate": 1.6308733194419536e-05, - "loss": 0.2994, + "epoch": 13.45512661541429, + "grad_norm": 4.025476455688477, + "learning_rate": 1.4648057347679082e-06, + "loss": 0.2109, "step": 976600 }, { - "epoch": 9.95, - "learning_rate": 1.630283103780606e-05, - "loss": 0.4042, + "epoch": 13.456504367474029, + "grad_norm": 0.8984253406524658, + "learning_rate": 1.4622257428167877e-06, + "loss": 0.2303, "step": 976700 }, { - "epoch": 9.95, - "learning_rate": 1.6296929550902398e-05, - "loss": 0.3344, + "epoch": 13.45788211953377, + "grad_norm": 1.9153467416763306, + "learning_rate": 1.4596479629078045e-06, + "loss": 0.1826, "step": 976800 }, { - "epoch": 9.95, - "learning_rate": 1.6291028733997105e-05, - "loss": 0.297, + "epoch": 13.459259871593508, + "grad_norm": 1.0118606090545654, + "learning_rate": 1.4570723952599429e-06, + "loss": 0.2025, "step": 976900 }, { - "epoch": 9.95, - "learning_rate": 1.6285128587378698e-05, - "loss": 0.3386, + "epoch": 13.460637623653247, + "grad_norm": 5.3036274909973145, + "learning_rate": 1.454499040092023e-06, + "loss": 0.2601, "step": 977000 }, { - "epoch": 9.95, - "learning_rate": 1.627922911133565e-05, - "loss": 0.3284, + "epoch": 13.462015375712987, + "grad_norm": 4.049126625061035, + "learning_rate": 1.451953598093768e-06, + "loss": 0.2297, "step": 977100 }, { - "epoch": 9.96, - "learning_rate": 1.6273330306156428e-05, - "loss": 0.2862, + "epoch": 13.463393127772726, + "grad_norm": 2.906615972518921, + "learning_rate": 1.4493846464111365e-06, + "loss": 0.2111, "step": 977200 }, { - "epoch": 9.96, - "learning_rate": 1.626743217212944e-05, - "loss": 0.2907, + "epoch": 13.464770879832466, + "grad_norm": 0.4944547116756439, + "learning_rate": 1.4468179078615502e-06, + "loss": 0.2222, "step": 977300 }, { - "epoch": 9.96, - "learning_rate": 1.6261534709543057e-05, - "loss": 0.2829, + "epoch": 13.466148631892205, + "grad_norm": 2.989488124847412, + "learning_rate": 1.4442533826630684e-06, + "loss": 0.236, "step": 977400 }, { - "epoch": 9.96, - "learning_rate": 1.6255637918685645e-05, - "loss": 0.3092, + "epoch": 13.467526383951943, + "grad_norm": 2.149967908859253, + "learning_rate": 1.4416910710335643e-06, + "loss": 0.2114, "step": 977500 }, { - "epoch": 9.96, - "learning_rate": 1.6249741799845525e-05, - "loss": 0.3296, + "epoch": 13.468904136011684, + "grad_norm": 3.3817970752716064, + "learning_rate": 1.4391565632101897e-06, + "loss": 0.1915, "step": 977600 }, { - "epoch": 9.96, - "learning_rate": 1.6243846353310962e-05, - "loss": 0.2976, + "epoch": 13.470281888071423, + "grad_norm": 3.9822964668273926, + "learning_rate": 1.436598657230375e-06, + "loss": 0.212, "step": 977700 }, { - "epoch": 9.96, - "learning_rate": 1.623795157937022e-05, - "loss": 0.304, + "epoch": 13.471659640131161, + "grad_norm": 2.298999547958374, + "learning_rate": 1.434042965469842e-06, + "loss": 0.1999, "step": 977800 }, { - "epoch": 9.96, - "learning_rate": 1.623205747831152e-05, - "loss": 0.3195, + "epoch": 13.473037392190902, + "grad_norm": 2.3098762035369873, + "learning_rate": 1.4314894881457131e-06, + "loss": 0.2194, "step": 977900 }, { - "epoch": 9.96, - "learning_rate": 1.6226164050423054e-05, - "loss": 0.2916, + "epoch": 13.47441514425064, + "grad_norm": 0.9896675944328308, + "learning_rate": 1.4289382254749208e-06, + "loss": 0.1758, "step": 978000 }, { - "epoch": 9.97, - "learning_rate": 1.622027129599296e-05, - "loss": 0.3167, + "epoch": 13.47579289631038, + "grad_norm": 2.371516227722168, + "learning_rate": 1.4263891776742e-06, + "loss": 0.2214, "step": 978100 }, { - "epoch": 9.97, - "learning_rate": 1.6214379215309366e-05, - "loss": 0.2819, + "epoch": 13.47717064837012, + "grad_norm": 2.395801305770874, + "learning_rate": 1.4238423449601107e-06, + "loss": 0.2366, "step": 978200 }, { - "epoch": 9.97, - "learning_rate": 1.620848780866036e-05, - "loss": 0.2663, + "epoch": 13.478548400429858, + "grad_norm": 3.5755250453948975, + "learning_rate": 1.421297727549024e-06, + "loss": 0.2622, "step": 978300 }, { - "epoch": 9.97, - "learning_rate": 1.6202597076333997e-05, - "loss": 0.3281, + "epoch": 13.479926152489599, + "grad_norm": 3.4193718433380127, + "learning_rate": 1.4187553256571067e-06, + "loss": 0.1903, "step": 978400 }, { - "epoch": 9.97, - "learning_rate": 1.6196707018618292e-05, - "loss": 0.3317, + "epoch": 13.481303904549337, + "grad_norm": 0.9844844341278076, + "learning_rate": 1.4162151395003572e-06, + "loss": 0.1894, "step": 978500 }, { - "epoch": 9.97, - "learning_rate": 1.619081763580126e-05, - "loss": 0.3404, + "epoch": 13.482681656609076, + "grad_norm": 2.724477767944336, + "learning_rate": 1.413677169294582e-06, + "loss": 0.262, "step": 978600 }, { - "epoch": 9.97, - "learning_rate": 1.618492892817082e-05, - "loss": 0.2943, + "epoch": 13.484059408668816, + "grad_norm": 0.596457302570343, + "learning_rate": 1.4111414152553823e-06, + "loss": 0.2137, "step": 978700 }, { - "epoch": 9.97, - "learning_rate": 1.6179040896014916e-05, - "loss": 0.3872, + "epoch": 13.485437160728555, + "grad_norm": 4.129903316497803, + "learning_rate": 1.4086078775981861e-06, + "loss": 0.2445, "step": 978800 }, { - "epoch": 9.97, - "learning_rate": 1.6173153539621442e-05, - "loss": 0.3008, + "epoch": 13.486814912788295, + "grad_norm": 0.13706065714359283, + "learning_rate": 1.406076556538238e-06, + "loss": 0.2244, "step": 978900 }, { - "epoch": 9.97, - "learning_rate": 1.6167266859278232e-05, - "loss": 0.2887, + "epoch": 13.488192664848034, + "grad_norm": 2.330181837081909, + "learning_rate": 1.4035474522905778e-06, + "loss": 0.2507, "step": 979000 }, { - "epoch": 9.98, - "learning_rate": 1.6161380855273127e-05, - "loss": 0.2822, + "epoch": 13.489570416907773, + "grad_norm": 3.4639618396759033, + "learning_rate": 1.4010205650700744e-06, + "loss": 0.1634, "step": 979100 }, { - "epoch": 9.98, - "learning_rate": 1.6155495527893926e-05, - "loss": 0.3184, + "epoch": 13.490948168967513, + "grad_norm": 6.897192478179932, + "learning_rate": 1.3984958950913924e-06, + "loss": 0.2109, "step": 979200 }, { - "epoch": 9.98, - "learning_rate": 1.6149610877428355e-05, - "loss": 0.3062, + "epoch": 13.492325921027252, + "grad_norm": 4.98928165435791, + "learning_rate": 1.3959734425690218e-06, + "loss": 0.1964, "step": 979300 }, { - "epoch": 9.98, - "learning_rate": 1.6143726904164165e-05, - "loss": 0.303, + "epoch": 13.49370367308699, + "grad_norm": 1.832289695739746, + "learning_rate": 1.393453207717252e-06, + "loss": 0.2792, "step": 979400 }, { - "epoch": 9.98, - "learning_rate": 1.6137843608389035e-05, - "loss": 0.3176, + "epoch": 13.495081425146731, + "grad_norm": 0.4960339367389679, + "learning_rate": 1.390935190750191e-06, + "loss": 0.2485, "step": 979500 }, { - "epoch": 9.98, - "learning_rate": 1.6131960990390632e-05, - "loss": 0.2602, + "epoch": 13.49645917720647, + "grad_norm": 3.3870530128479004, + "learning_rate": 1.388419391881762e-06, + "loss": 0.2613, "step": 979600 }, { - "epoch": 9.98, - "learning_rate": 1.6126079050456574e-05, - "loss": 0.3141, + "epoch": 13.49783692926621, + "grad_norm": 0.7139763832092285, + "learning_rate": 1.3859058113256947e-06, + "loss": 0.197, "step": 979700 }, { - "epoch": 9.98, - "learning_rate": 1.6120197788874456e-05, - "loss": 0.3205, + "epoch": 13.499214681325949, + "grad_norm": 4.498809337615967, + "learning_rate": 1.3833944492955212e-06, + "loss": 0.2105, "step": 979800 }, { - "epoch": 9.98, - "learning_rate": 1.6114317205931845e-05, - "loss": 0.2979, + "epoch": 13.500592433385687, + "grad_norm": 1.9053908586502075, + "learning_rate": 1.3808853060046047e-06, + "loss": 0.251, "step": 979900 }, { - "epoch": 9.98, - "learning_rate": 1.6108437301916245e-05, - "loss": 0.3681, + "epoch": 13.501970185445428, + "grad_norm": 0.7259538173675537, + "learning_rate": 1.3783783816661019e-06, + "loss": 0.21, "step": 980000 }, { - "epoch": 9.99, - "learning_rate": 1.6102558077115163e-05, - "loss": 0.3203, + "epoch": 13.503347937505167, + "grad_norm": 4.234583377838135, + "learning_rate": 1.3758736764929916e-06, + "loss": 0.206, "step": 980100 }, { - "epoch": 9.99, - "learning_rate": 1.6096679531816067e-05, - "loss": 0.274, + "epoch": 13.504725689564905, + "grad_norm": 0.6961900591850281, + "learning_rate": 1.3733711906980699e-06, + "loss": 0.2574, "step": 980200 }, { - "epoch": 9.99, - "learning_rate": 1.609080166630636e-05, - "loss": 0.2613, + "epoch": 13.506103441624646, + "grad_norm": 2.5326108932495117, + "learning_rate": 1.3708709244939247e-06, + "loss": 0.1808, "step": 980300 }, { - "epoch": 9.99, - "learning_rate": 1.6084924480873443e-05, - "loss": 0.3276, + "epoch": 13.507481193684384, + "grad_norm": 2.8855934143066406, + "learning_rate": 1.3683728780929705e-06, + "loss": 0.2214, "step": 980400 }, { - "epoch": 9.99, - "learning_rate": 1.607904797580469e-05, - "loss": 0.3026, + "epoch": 13.508858945744123, + "grad_norm": 0.2484571486711502, + "learning_rate": 1.3658770517074262e-06, + "loss": 0.1636, "step": 980500 }, { - "epoch": 9.99, - "learning_rate": 1.6073172151387407e-05, - "loss": 0.2725, + "epoch": 13.510236697803863, + "grad_norm": 1.1764426231384277, + "learning_rate": 1.3633834455493304e-06, + "loss": 0.1869, "step": 980600 }, { - "epoch": 9.99, - "learning_rate": 1.6067297007908892e-05, - "loss": 0.3273, + "epoch": 13.511614449863602, + "grad_norm": 2.2267537117004395, + "learning_rate": 1.3608920598305235e-06, + "loss": 0.2249, "step": 980700 }, { - "epoch": 9.99, - "learning_rate": 1.6061422545656407e-05, - "loss": 0.3502, + "epoch": 13.512992201923343, + "grad_norm": 2.6020455360412598, + "learning_rate": 1.3584028947626656e-06, + "loss": 0.1859, "step": 980800 }, { - "epoch": 9.99, - "learning_rate": 1.6055548764917183e-05, - "loss": 0.3256, + "epoch": 13.514369953983081, + "grad_norm": 0.9432273507118225, + "learning_rate": 1.3559159505572213e-06, + "loss": 0.1966, "step": 980900 }, { - "epoch": 9.99, - "learning_rate": 1.6049675665978406e-05, - "loss": 0.2735, + "epoch": 13.51574770604282, + "grad_norm": 0.07814761251211166, + "learning_rate": 1.3534312274254663e-06, + "loss": 0.2162, "step": 981000 }, { - "epoch": 10.0, - "learning_rate": 1.6043803249127232e-05, - "loss": 0.335, + "epoch": 13.51712545810256, + "grad_norm": 1.8752347230911255, + "learning_rate": 1.3509487255784925e-06, + "loss": 0.2358, "step": 981100 }, { - "epoch": 10.0, - "learning_rate": 1.6037931514650808e-05, - "loss": 0.279, + "epoch": 13.518503210162299, + "grad_norm": 0.3885021209716797, + "learning_rate": 1.348468445227206e-06, + "loss": 0.197, "step": 981200 }, { - "epoch": 10.0, - "learning_rate": 1.6032119169974234e-05, - "loss": 0.2971, + "epoch": 13.519880962222038, + "grad_norm": 4.643098831176758, + "learning_rate": 1.3459903865823118e-06, + "loss": 0.1794, "step": 981300 }, { - "epoch": 10.0, - "learning_rate": 1.6026248794277608e-05, - "loss": 0.3959, + "epoch": 13.521258714281778, + "grad_norm": 2.188894748687744, + "learning_rate": 1.3435145498543427e-06, + "loss": 0.2237, "step": 981400 }, { - "epoch": 10.0, - "learning_rate": 1.6020379101814024e-05, - "loss": 0.3691, + "epoch": 13.522636466341517, + "grad_norm": 2.5729386806488037, + "learning_rate": 1.341040935253622e-06, + "loss": 0.2553, "step": 981500 }, { - "epoch": 10.0, - "learning_rate": 1.6014510092870483e-05, - "loss": 0.3122, + "epoch": 13.524014218401257, + "grad_norm": 2.813927173614502, + "learning_rate": 1.3385695429903078e-06, + "loss": 0.2104, "step": 981600 }, { - "epoch": 10.0, - "learning_rate": 1.6008641767733926e-05, - "loss": 0.2939, + "epoch": 13.525391970460996, + "grad_norm": 4.45389461517334, + "learning_rate": 1.3361003732743472e-06, + "loss": 0.2118, "step": 981700 }, { - "epoch": 10.0, - "learning_rate": 1.600277412669129e-05, - "loss": 0.2935, + "epoch": 13.526769722520735, + "grad_norm": 2.084127426147461, + "learning_rate": 1.3336334263155134e-06, + "loss": 0.2364, "step": 981800 }, { - "epoch": 10.0, - "learning_rate": 1.5996907170029472e-05, - "loss": 0.3251, + "epoch": 13.528147474580475, + "grad_norm": 1.1524569988250732, + "learning_rate": 1.3311687023233938e-06, + "loss": 0.1987, "step": 981900 }, { - "epoch": 10.0, - "learning_rate": 1.5991040898035328e-05, - "loss": 0.3105, + "epoch": 13.529525226640214, + "grad_norm": 2.9172592163085938, + "learning_rate": 1.3287062015073645e-06, + "loss": 0.2349, "step": 982000 }, { - "epoch": 10.01, - "learning_rate": 1.5985175310995677e-05, - "loss": 0.2986, + "epoch": 13.530902978699952, + "grad_norm": 1.9078935384750366, + "learning_rate": 1.3262459240766343e-06, + "loss": 0.1774, "step": 982100 }, { - "epoch": 10.01, - "learning_rate": 1.5979310409197323e-05, - "loss": 0.2763, + "epoch": 13.532280730759693, + "grad_norm": 1.2338684797286987, + "learning_rate": 1.3237878702402221e-06, + "loss": 0.2046, "step": 982200 }, { - "epoch": 10.01, - "learning_rate": 1.597350483169542e-05, - "loss": 0.2218, + "epoch": 13.533658482819432, + "grad_norm": 1.4152686595916748, + "learning_rate": 1.3213320402069396e-06, + "loss": 0.1813, "step": 982300 }, { - "epoch": 10.01, - "learning_rate": 1.596764129438032e-05, - "loss": 0.2732, + "epoch": 13.535036234879172, + "grad_norm": 4.223006248474121, + "learning_rate": 1.318902959236107e-06, + "loss": 0.2005, "step": 982400 }, { - "epoch": 10.01, - "learning_rate": 1.5961778443163833e-05, - "loss": 0.2277, + "epoch": 13.53641398693891, + "grad_norm": 0.9802374243736267, + "learning_rate": 1.3164515551915884e-06, + "loss": 0.1695, "step": 982500 }, { - "epoch": 10.01, - "learning_rate": 1.5955916278332595e-05, - "loss": 0.3058, + "epoch": 13.53779173899865, + "grad_norm": 0.04625668004155159, + "learning_rate": 1.3140023755734621e-06, + "loss": 0.2189, "step": 982600 }, { - "epoch": 10.01, - "learning_rate": 1.5950054800173237e-05, - "loss": 0.2583, + "epoch": 13.53916949105839, + "grad_norm": 8.70906925201416, + "learning_rate": 1.3115554205898006e-06, + "loss": 0.2712, "step": 982700 }, { - "epoch": 10.01, - "learning_rate": 1.594419400897237e-05, - "loss": 0.3665, + "epoch": 13.540547243118128, + "grad_norm": 0.3222040832042694, + "learning_rate": 1.3091106904484846e-06, + "loss": 0.1882, "step": 982800 }, { - "epoch": 10.01, - "learning_rate": 1.593833390501652e-05, - "loss": 0.3212, + "epoch": 13.541924995177867, + "grad_norm": 1.4261754751205444, + "learning_rate": 1.306668185357208e-06, + "loss": 0.1873, "step": 982900 }, { - "epoch": 10.01, - "learning_rate": 1.5932474488592225e-05, - "loss": 0.3234, + "epoch": 13.543302747237608, + "grad_norm": 2.110771417617798, + "learning_rate": 1.3042279055234757e-06, + "loss": 0.1871, "step": 983000 }, { - "epoch": 10.02, - "learning_rate": 1.5926615759985992e-05, - "loss": 0.2754, + "epoch": 13.544680499297346, + "grad_norm": 0.22240127623081207, + "learning_rate": 1.3017898511545972e-06, + "loss": 0.2234, "step": 983100 }, { - "epoch": 10.02, - "learning_rate": 1.592075771948425e-05, - "loss": 0.3154, + "epoch": 13.546058251357085, + "grad_norm": 2.1917483806610107, + "learning_rate": 1.2993540224576958e-06, + "loss": 0.2447, "step": 983200 }, { - "epoch": 10.02, - "learning_rate": 1.591490036737343e-05, - "loss": 0.2552, + "epoch": 13.547436003416825, + "grad_norm": 0.8603069186210632, + "learning_rate": 1.2969204196397204e-06, + "loss": 0.2014, "step": 983300 }, { - "epoch": 10.02, - "learning_rate": 1.5909043703939935e-05, - "loss": 0.3053, + "epoch": 13.548813755476564, + "grad_norm": 0.83849036693573, + "learning_rate": 1.2944890429074009e-06, + "loss": 0.2598, "step": 983400 }, { - "epoch": 10.02, - "learning_rate": 1.5903187729470105e-05, - "loss": 0.3005, + "epoch": 13.550191507536304, + "grad_norm": 1.582377552986145, + "learning_rate": 1.292059892467307e-06, + "loss": 0.2073, "step": 983500 }, { - "epoch": 10.02, - "learning_rate": 1.5897332444250277e-05, - "loss": 0.2663, + "epoch": 13.551569259596043, + "grad_norm": 1.7436193227767944, + "learning_rate": 1.2896329685258119e-06, + "loss": 0.2114, "step": 983600 }, { - "epoch": 10.02, - "learning_rate": 1.5891477848566737e-05, - "loss": 0.2669, + "epoch": 13.552947011655782, + "grad_norm": 1.3982709646224976, + "learning_rate": 1.2872082712890762e-06, + "loss": 0.231, "step": 983700 }, { - "epoch": 10.02, - "learning_rate": 1.5885623942705724e-05, - "loss": 0.2834, + "epoch": 13.554324763715522, + "grad_norm": 0.29163679480552673, + "learning_rate": 1.284785800963106e-06, + "loss": 0.1872, "step": 983800 }, { - "epoch": 10.02, - "learning_rate": 1.5879770726953475e-05, - "loss": 0.3085, + "epoch": 13.55570251577526, + "grad_norm": 0.5178783535957336, + "learning_rate": 1.2823655577536998e-06, + "loss": 0.2617, "step": 983900 }, { - "epoch": 10.03, - "learning_rate": 1.587391820159618e-05, - "loss": 0.2767, + "epoch": 13.557080267835001, + "grad_norm": 3.3753626346588135, + "learning_rate": 1.2799475418664628e-06, + "loss": 0.2044, "step": 984000 }, { - "epoch": 10.03, - "learning_rate": 1.5868066366919973e-05, - "loss": 0.277, + "epoch": 13.55845801989474, + "grad_norm": 2.0288450717926025, + "learning_rate": 1.2775317535068301e-06, + "loss": 0.1904, "step": 984100 }, { - "epoch": 10.03, - "learning_rate": 1.586221522321099e-05, - "loss": 0.2924, + "epoch": 13.559835771954479, + "grad_norm": 6.1607537269592285, + "learning_rate": 1.275118192880023e-06, + "loss": 0.1998, "step": 984200 }, { - "epoch": 10.03, - "learning_rate": 1.5856364770755307e-05, - "loss": 0.2547, + "epoch": 13.561213524014219, + "grad_norm": 0.929041862487793, + "learning_rate": 1.2727068601910942e-06, + "loss": 0.221, "step": 984300 }, { - "epoch": 10.03, - "learning_rate": 1.5850515009838997e-05, - "loss": 0.3107, + "epoch": 13.562591276073958, + "grad_norm": 1.6923526525497437, + "learning_rate": 1.27029775564489e-06, + "loss": 0.2305, "step": 984400 }, { - "epoch": 10.03, - "learning_rate": 1.5844665940748054e-05, - "loss": 0.2904, + "epoch": 13.563969028133696, + "grad_norm": 2.6941702365875244, + "learning_rate": 1.2678908794460844e-06, + "loss": 0.2258, "step": 984500 }, { - "epoch": 10.03, - "learning_rate": 1.5838817563768465e-05, - "loss": 0.2946, + "epoch": 13.565346780193437, + "grad_norm": 3.6822738647460938, + "learning_rate": 1.265486231799154e-06, + "loss": 0.201, "step": 984600 }, { - "epoch": 10.03, - "learning_rate": 1.5832969879186193e-05, - "loss": 0.2867, + "epoch": 13.566724532253176, + "grad_norm": 1.1845561265945435, + "learning_rate": 1.2630838129083852e-06, + "loss": 0.2061, "step": 984700 }, { - "epoch": 10.03, - "learning_rate": 1.5827122887287148e-05, - "loss": 0.2729, + "epoch": 13.568102284312914, + "grad_norm": 2.522339105606079, + "learning_rate": 1.2606836229778669e-06, + "loss": 0.2448, "step": 984800 }, { - "epoch": 10.03, - "learning_rate": 1.5821276588357215e-05, - "loss": 0.2389, + "epoch": 13.569480036372655, + "grad_norm": 4.268677234649658, + "learning_rate": 1.2582856622115191e-06, + "loss": 0.2089, "step": 984900 }, { - "epoch": 10.04, - "learning_rate": 1.581543098268225e-05, - "loss": 0.3456, + "epoch": 13.570857788432393, + "grad_norm": 2.647533416748047, + "learning_rate": 1.255889930813052e-06, + "loss": 0.2057, "step": 985000 }, { - "epoch": 10.04, - "learning_rate": 1.5809586070548056e-05, - "loss": 0.2962, + "epoch": 13.572235540492134, + "grad_norm": 0.933820366859436, + "learning_rate": 1.2534964289860013e-06, + "loss": 0.2017, "step": 985100 }, { - "epoch": 10.04, - "learning_rate": 1.5803741852240416e-05, - "loss": 0.2721, + "epoch": 13.573613292551872, + "grad_norm": 3.9596025943756104, + "learning_rate": 1.2511051569337073e-06, + "loss": 0.2423, "step": 985200 }, { - "epoch": 10.04, - "learning_rate": 1.579789832804509e-05, - "loss": 0.2853, + "epoch": 13.574991044611611, + "grad_norm": 0.7913817763328552, + "learning_rate": 1.2487161148593181e-06, + "loss": 0.1956, "step": 985300 }, { - "epoch": 10.04, - "learning_rate": 1.5792055498247776e-05, - "loss": 0.2383, + "epoch": 13.576368796671352, + "grad_norm": 3.467693328857422, + "learning_rate": 1.2463293029657927e-06, + "loss": 0.219, "step": 985400 }, { - "epoch": 10.04, - "learning_rate": 1.5786213363134155e-05, - "loss": 0.2945, + "epoch": 13.57774654873109, + "grad_norm": 3.1986021995544434, + "learning_rate": 1.2439447214559032e-06, + "loss": 0.1895, "step": 985500 }, { - "epoch": 10.04, - "learning_rate": 1.5780371922989895e-05, - "loss": 0.256, + "epoch": 13.579124300790829, + "grad_norm": 0.7990707159042358, + "learning_rate": 1.2415623705322424e-06, + "loss": 0.1707, "step": 985600 }, { - "epoch": 10.04, - "learning_rate": 1.577453117810057e-05, - "loss": 0.217, + "epoch": 13.58050205285057, + "grad_norm": 1.7855857610702515, + "learning_rate": 1.2391822503971887e-06, + "loss": 0.2125, "step": 985700 }, { - "epoch": 10.04, - "learning_rate": 1.576869112875178e-05, - "loss": 0.3075, + "epoch": 13.581879804910308, + "grad_norm": 8.833712577819824, + "learning_rate": 1.236804361252959e-06, + "loss": 0.196, "step": 985800 }, { - "epoch": 10.04, - "learning_rate": 1.5762851775229067e-05, - "loss": 0.2215, + "epoch": 13.583257556970048, + "grad_norm": 1.7128103971481323, + "learning_rate": 1.2344287033015594e-06, + "loss": 0.2051, "step": 985900 }, { - "epoch": 10.05, - "learning_rate": 1.5757013117817937e-05, - "loss": 0.2744, + "epoch": 13.584635309029787, + "grad_norm": 3.971196174621582, + "learning_rate": 1.23205527674481e-06, + "loss": 0.192, "step": 986000 }, { - "epoch": 10.05, - "learning_rate": 1.5751175156803868e-05, - "loss": 0.207, + "epoch": 13.586013061089526, + "grad_norm": 1.8333990573883057, + "learning_rate": 1.2296840817843508e-06, + "loss": 0.2049, "step": 986100 }, { - "epoch": 10.05, - "learning_rate": 1.5745337892472302e-05, - "loss": 0.2779, + "epoch": 13.587390813149266, + "grad_norm": 2.9046077728271484, + "learning_rate": 1.2273151186216318e-06, + "loss": 0.1688, "step": 986200 }, { - "epoch": 10.05, - "learning_rate": 1.5739501325108657e-05, - "loss": 0.2733, + "epoch": 13.588768565209005, + "grad_norm": 2.5722877979278564, + "learning_rate": 1.2249483874579026e-06, + "loss": 0.2952, "step": 986300 }, { - "epoch": 10.05, - "learning_rate": 1.5733665454998285e-05, - "loss": 0.2797, + "epoch": 13.590146317268744, + "grad_norm": 0.1348728984594345, + "learning_rate": 1.2225838884942286e-06, + "loss": 0.1694, "step": 986400 }, { - "epoch": 10.05, - "learning_rate": 1.572783028242654e-05, - "loss": 0.3152, + "epoch": 13.591524069328484, + "grad_norm": 1.7274844646453857, + "learning_rate": 1.2202216219314836e-06, + "loss": 0.2174, "step": 986500 }, { - "epoch": 10.05, - "learning_rate": 1.5721995807678725e-05, - "loss": 0.2526, + "epoch": 13.592901821388223, + "grad_norm": 1.9118367433547974, + "learning_rate": 1.2178615879703668e-06, + "loss": 0.164, "step": 986600 }, { - "epoch": 10.05, - "learning_rate": 1.57161620310401e-05, - "loss": 0.3135, + "epoch": 13.594279573447963, + "grad_norm": 1.7584048509597778, + "learning_rate": 1.2155037868113608e-06, + "loss": 0.1618, "step": 986700 }, { - "epoch": 10.05, - "learning_rate": 1.5710387280120368e-05, - "loss": 0.2638, + "epoch": 13.595657325507702, + "grad_norm": 3.2685728073120117, + "learning_rate": 1.2131717632823233e-06, + "loss": 0.2098, "step": 986800 }, { - "epoch": 10.05, - "learning_rate": 1.5704554893567612e-05, - "loss": 0.2701, + "epoch": 13.59703507756744, + "grad_norm": 1.5389854907989502, + "learning_rate": 1.2108184059952712e-06, + "loss": 0.2153, "step": 986900 }, { - "epoch": 10.06, - "learning_rate": 1.5698723205976806e-05, - "loss": 0.2445, + "epoch": 13.598412829627181, + "grad_norm": 1.4094879627227783, + "learning_rate": 1.2084672821086915e-06, + "loss": 0.2123, "step": 987000 }, { - "epoch": 10.06, - "learning_rate": 1.569289221763309e-05, - "loss": 0.2108, + "epoch": 13.59979058168692, + "grad_norm": 2.0222158432006836, + "learning_rate": 1.2061183918223156e-06, + "loss": 0.2132, "step": 987100 }, { - "epoch": 10.06, - "learning_rate": 1.5687061928821565e-05, - "loss": 0.2852, + "epoch": 13.601168333746658, + "grad_norm": 3.6543211936950684, + "learning_rate": 1.2037717353357044e-06, + "loss": 0.2079, "step": 987200 }, { - "epoch": 10.06, - "learning_rate": 1.5681232339827297e-05, - "loss": 0.2912, + "epoch": 13.602546085806399, + "grad_norm": 1.1921967267990112, + "learning_rate": 1.201427312848214e-06, + "loss": 0.2015, "step": 987300 }, { - "epoch": 10.06, - "learning_rate": 1.5675403450935327e-05, - "loss": 0.2703, + "epoch": 13.603923837866137, + "grad_norm": 0.08158989250659943, + "learning_rate": 1.19908512455902e-06, + "loss": 0.2636, "step": 987400 }, { - "epoch": 10.06, - "learning_rate": 1.5669575262430636e-05, - "loss": 0.308, + "epoch": 13.605301589925876, + "grad_norm": 1.0798386335372925, + "learning_rate": 1.1967451706670942e-06, + "loss": 0.2592, "step": 987500 }, { - "epoch": 10.06, - "learning_rate": 1.566380604600726e-05, - "loss": 0.375, + "epoch": 13.606679341985616, + "grad_norm": 4.925256729125977, + "learning_rate": 1.194407451371237e-06, + "loss": 0.2021, "step": 987600 }, { - "epoch": 10.06, - "learning_rate": 1.5657979252121014e-05, - "loss": 0.2807, + "epoch": 13.608057094045355, + "grad_norm": 0.1371840387582779, + "learning_rate": 1.1920719668700408e-06, + "loss": 0.2036, "step": 987700 }, { - "epoch": 10.06, - "learning_rate": 1.5652153159473994e-05, - "loss": 0.2775, + "epoch": 13.609434846105096, + "grad_norm": 1.224448561668396, + "learning_rate": 1.1897387173619217e-06, + "loss": 0.2313, "step": 987800 }, { - "epoch": 10.06, - "learning_rate": 1.5646327768351074e-05, - "loss": 0.2998, + "epoch": 13.610812598164834, + "grad_norm": 2.2813880443573, + "learning_rate": 1.187431002123421e-06, + "loss": 0.2392, "step": 987900 }, { - "epoch": 10.07, - "learning_rate": 1.5640503079037053e-05, - "loss": 0.2323, + "epoch": 13.612190350224573, + "grad_norm": 1.0893574953079224, + "learning_rate": 1.1851022008410625e-06, + "loss": 0.1959, "step": 988000 }, { - "epoch": 10.07, - "learning_rate": 1.5634679091816742e-05, - "loss": 0.3269, + "epoch": 13.613568102284313, + "grad_norm": 0.4824713468551636, + "learning_rate": 1.1827756351438956e-06, + "loss": 0.2337, "step": 988100 }, { - "epoch": 10.07, - "learning_rate": 1.5628855806974905e-05, - "loss": 0.27, + "epoch": 13.614945854344052, + "grad_norm": 2.2070438861846924, + "learning_rate": 1.1804513052295729e-06, + "loss": 0.234, "step": 988200 }, { - "epoch": 10.07, - "learning_rate": 1.5623033224796245e-05, - "loss": 0.3193, + "epoch": 13.616323606403792, + "grad_norm": 2.84987211227417, + "learning_rate": 1.1781524211661543e-06, + "loss": 0.249, "step": 988300 }, { - "epoch": 10.07, - "learning_rate": 1.5617211345565465e-05, - "loss": 0.2666, + "epoch": 13.617701358463531, + "grad_norm": 2.451464891433716, + "learning_rate": 1.175832541046981e-06, + "loss": 0.2389, "step": 988400 }, { - "epoch": 10.07, - "learning_rate": 1.5611390169567214e-05, - "loss": 0.2633, + "epoch": 13.61907911052327, + "grad_norm": 0.5440343022346497, + "learning_rate": 1.173514897300507e-06, + "loss": 0.1602, "step": 988500 }, { - "epoch": 10.07, - "learning_rate": 1.5605569697086124e-05, - "loss": 0.2576, + "epoch": 13.62045686258301, + "grad_norm": 3.420504093170166, + "learning_rate": 1.1711994901236248e-06, + "loss": 0.2052, "step": 988600 }, { - "epoch": 10.07, - "learning_rate": 1.559974992840677e-05, - "loss": 0.2532, + "epoch": 13.621834614642749, + "grad_norm": 3.50215220451355, + "learning_rate": 1.1688863197130433e-06, + "loss": 0.2116, "step": 988700 }, { - "epoch": 10.07, - "learning_rate": 1.559393086381372e-05, - "loss": 0.3164, + "epoch": 13.623212366702488, + "grad_norm": 1.4052149057388306, + "learning_rate": 1.1665753862652795e-06, + "loss": 0.231, "step": 988800 }, { - "epoch": 10.08, - "learning_rate": 1.558811250359147e-05, - "loss": 0.2757, + "epoch": 13.624590118762228, + "grad_norm": 0.8020164370536804, + "learning_rate": 1.1642666899766544e-06, + "loss": 0.2174, "step": 988900 }, { - "epoch": 10.08, - "learning_rate": 1.5582294848024513e-05, - "loss": 0.2995, + "epoch": 13.625967870821967, + "grad_norm": 2.215566635131836, + "learning_rate": 1.1619602310433091e-06, + "loss": 0.2302, "step": 989000 }, { - "epoch": 10.08, - "learning_rate": 1.5576477897397292e-05, - "loss": 0.2972, + "epoch": 13.627345622881705, + "grad_norm": 1.5796433687210083, + "learning_rate": 1.1596560096611898e-06, + "loss": 0.2336, "step": 989100 }, { - "epoch": 10.08, - "learning_rate": 1.5570661651994246e-05, - "loss": 0.3842, + "epoch": 13.628723374941446, + "grad_norm": 1.0190300941467285, + "learning_rate": 1.1573540260260463e-06, + "loss": 0.2074, "step": 989200 }, { - "epoch": 10.08, - "learning_rate": 1.5564846112099714e-05, - "loss": 0.2577, + "epoch": 13.630101127001184, + "grad_norm": 3.5776174068450928, + "learning_rate": 1.155054280333443e-06, + "loss": 0.2182, "step": 989300 }, { - "epoch": 10.08, - "learning_rate": 1.555903127799807e-05, - "loss": 0.2734, + "epoch": 13.631478879060925, + "grad_norm": 10.981512069702148, + "learning_rate": 1.1527567727787639e-06, + "loss": 0.2357, "step": 989400 }, { - "epoch": 10.08, - "learning_rate": 1.555321714997363e-05, - "loss": 0.3683, + "epoch": 13.632856631120664, + "grad_norm": 3.514577865600586, + "learning_rate": 1.1504615035571853e-06, + "loss": 0.2244, "step": 989500 }, { - "epoch": 10.08, - "learning_rate": 1.5547403728310642e-05, - "loss": 0.2567, + "epoch": 13.634234383180402, + "grad_norm": 3.2168173789978027, + "learning_rate": 1.1481684728637064e-06, + "loss": 0.2155, "step": 989600 }, { - "epoch": 10.08, - "learning_rate": 1.5541591013293365e-05, - "loss": 0.2533, + "epoch": 13.635612135240143, + "grad_norm": 5.93967866897583, + "learning_rate": 1.1458776808931315e-06, + "loss": 0.2627, "step": 989700 }, { - "epoch": 10.08, - "learning_rate": 1.5535779005206004e-05, - "loss": 0.2926, + "epoch": 13.636989887299881, + "grad_norm": 1.6903858184814453, + "learning_rate": 1.1435891278400748e-06, + "loss": 0.2069, "step": 989800 }, { - "epoch": 10.09, - "learning_rate": 1.5529967704332736e-05, - "loss": 0.3219, + "epoch": 13.63836763935962, + "grad_norm": 2.683765411376953, + "learning_rate": 1.1413028138989556e-06, + "loss": 0.1935, "step": 989900 }, { - "epoch": 10.09, - "learning_rate": 1.5524157110957686e-05, - "loss": 0.3513, + "epoch": 13.63974539141936, + "grad_norm": 2.301539659500122, + "learning_rate": 1.139018739264013e-06, + "loss": 0.1958, "step": 990000 }, { - "epoch": 10.09, - "learning_rate": 1.5518347225364983e-05, - "loss": 0.2537, + "epoch": 13.6411231434791, + "grad_norm": 1.6838740110397339, + "learning_rate": 1.136736904129297e-06, + "loss": 0.1662, "step": 990100 }, { - "epoch": 10.09, - "learning_rate": 1.5512538047838667e-05, - "loss": 0.2954, + "epoch": 13.64250089553884, + "grad_norm": 3.7607204914093018, + "learning_rate": 1.1344573086886522e-06, + "loss": 0.1705, "step": 990200 }, { - "epoch": 10.09, - "learning_rate": 1.550672957866278e-05, - "loss": 0.3591, + "epoch": 13.643878647598578, + "grad_norm": 3.1095991134643555, + "learning_rate": 1.1321799531357412e-06, + "loss": 0.1986, "step": 990300 }, { - "epoch": 10.09, - "learning_rate": 1.550092181812134e-05, - "loss": 0.2868, + "epoch": 13.645256399658317, + "grad_norm": 1.0303610563278198, + "learning_rate": 1.1299048376640428e-06, + "loss": 0.2119, "step": 990400 }, { - "epoch": 10.09, - "learning_rate": 1.5495114766498283e-05, - "loss": 0.2792, + "epoch": 13.646634151718057, + "grad_norm": 3.0687148571014404, + "learning_rate": 1.127631962466834e-06, + "loss": 0.179, "step": 990500 }, { - "epoch": 10.09, - "learning_rate": 1.5489308424077558e-05, - "loss": 0.318, + "epoch": 13.648011903777796, + "grad_norm": 4.130395412445068, + "learning_rate": 1.1253613277372093e-06, + "loss": 0.2061, "step": 990600 }, { - "epoch": 10.09, - "learning_rate": 1.5483502791143054e-05, - "loss": 0.2978, + "epoch": 13.649389655837535, + "grad_norm": 2.410235643386841, + "learning_rate": 1.1230929336680765e-06, + "loss": 0.2268, "step": 990700 }, { - "epoch": 10.09, - "learning_rate": 1.5477697867978643e-05, - "loss": 0.2333, + "epoch": 13.650767407897275, + "grad_norm": 0.22902274131774902, + "learning_rate": 1.120849430891443e-06, + "loss": 0.2119, "step": 990800 }, { - "epoch": 10.1, - "learning_rate": 1.5471893654868132e-05, - "loss": 0.2195, + "epoch": 13.652145159957014, + "grad_norm": 2.778848171234131, + "learning_rate": 1.118585496309818e-06, + "loss": 0.2246, "step": 990900 }, { - "epoch": 10.1, - "learning_rate": 1.546609015209532e-05, - "loss": 0.3341, + "epoch": 13.653522912016754, + "grad_norm": 8.463459968566895, + "learning_rate": 1.1163238029643188e-06, + "loss": 0.2076, "step": 991000 }, { - "epoch": 10.1, - "learning_rate": 1.546028735994397e-05, - "loss": 0.2692, + "epoch": 13.654900664076493, + "grad_norm": 0.40749770402908325, + "learning_rate": 1.1140643510470902e-06, + "loss": 0.2262, "step": 991100 }, { - "epoch": 10.1, - "learning_rate": 1.5454485278697795e-05, - "loss": 0.2784, + "epoch": 13.656278416136232, + "grad_norm": 1.7824000120162964, + "learning_rate": 1.1118071407500874e-06, + "loss": 0.2231, "step": 991200 }, { - "epoch": 10.1, - "learning_rate": 1.544868390864049e-05, - "loss": 0.2951, + "epoch": 13.657656168195972, + "grad_norm": 2.1625523567199707, + "learning_rate": 1.1095521722650704e-06, + "loss": 0.2067, "step": 991300 }, { - "epoch": 10.1, - "learning_rate": 1.544288325005571e-05, - "loss": 0.2697, + "epoch": 13.65903392025571, + "grad_norm": 0.9409769177436829, + "learning_rate": 1.1072994457836037e-06, + "loss": 0.1769, "step": 991400 }, { - "epoch": 10.1, - "learning_rate": 1.5437083303227063e-05, - "loss": 0.2839, + "epoch": 13.66041167231545, + "grad_norm": 5.067215442657471, + "learning_rate": 1.1050489614970777e-06, + "loss": 0.2467, "step": 991500 }, { - "epoch": 10.1, - "learning_rate": 1.5431284068438133e-05, - "loss": 0.2644, + "epoch": 13.66178942437519, + "grad_norm": 1.7076265811920166, + "learning_rate": 1.1028007195966754e-06, + "loss": 0.2279, "step": 991600 }, { - "epoch": 10.1, - "learning_rate": 1.5425485545972483e-05, - "loss": 0.2783, + "epoch": 13.663167176434929, + "grad_norm": 2.7417099475860596, + "learning_rate": 1.1005547202733997e-06, + "loss": 0.1992, "step": 991700 }, { - "epoch": 10.1, - "learning_rate": 1.54196877361136e-05, - "loss": 0.3005, + "epoch": 13.664544928494667, + "grad_norm": 0.27878376841545105, + "learning_rate": 1.0983109637180612e-06, + "loss": 0.2397, "step": 991800 }, { - "epoch": 10.11, - "learning_rate": 1.5413890639144974e-05, - "loss": 0.3097, + "epoch": 13.665922680554408, + "grad_norm": 1.955369472503662, + "learning_rate": 1.0960694501212743e-06, + "loss": 0.2357, "step": 991900 }, { - "epoch": 10.11, - "learning_rate": 1.5408094255350065e-05, - "loss": 0.2648, + "epoch": 13.667300432614146, + "grad_norm": 1.3347461223602295, + "learning_rate": 1.0938301796734682e-06, + "loss": 0.2288, "step": 992000 }, { - "epoch": 10.11, - "learning_rate": 1.540229858501225e-05, - "loss": 0.2843, + "epoch": 13.668678184673887, + "grad_norm": 2.3524982929229736, + "learning_rate": 1.0915931525648822e-06, + "loss": 0.1911, "step": 992100 }, { - "epoch": 10.11, - "learning_rate": 1.539650362841493e-05, - "loss": 0.3322, + "epoch": 13.670055936733625, + "grad_norm": 2.027970314025879, + "learning_rate": 1.0893583689855638e-06, + "loss": 0.2098, "step": 992200 }, { - "epoch": 10.11, - "learning_rate": 1.5390709385841427e-05, - "loss": 0.26, + "epoch": 13.671433688793364, + "grad_norm": 0.11719391494989395, + "learning_rate": 1.0871258291253674e-06, + "loss": 0.2284, "step": 992300 }, { - "epoch": 10.11, - "learning_rate": 1.538491585757505e-05, - "loss": 0.2946, + "epoch": 13.672811440853105, + "grad_norm": 2.669922351837158, + "learning_rate": 1.0848955331739587e-06, + "loss": 0.2456, "step": 992400 }, { - "epoch": 10.11, - "learning_rate": 1.5379123043899068e-05, - "loss": 0.286, + "epoch": 13.674189192912843, + "grad_norm": 1.5742334127426147, + "learning_rate": 1.082667481320817e-06, + "loss": 0.1785, "step": 992500 }, { - "epoch": 10.11, - "learning_rate": 1.537333094509672e-05, - "loss": 0.314, + "epoch": 13.675566944972584, + "grad_norm": 4.7714738845825195, + "learning_rate": 1.080441673755217e-06, + "loss": 0.1965, "step": 992600 }, { - "epoch": 10.11, - "learning_rate": 1.5367539561451213e-05, - "loss": 0.3357, + "epoch": 13.676944697032322, + "grad_norm": 4.0279645919799805, + "learning_rate": 1.0782181106662591e-06, + "loss": 0.1919, "step": 992700 }, { - "epoch": 10.11, - "learning_rate": 1.536174889324569e-05, - "loss": 0.3149, + "epoch": 13.678322449092061, + "grad_norm": 1.4492229223251343, + "learning_rate": 1.075996792242849e-06, + "loss": 0.1907, "step": 992800 }, { - "epoch": 10.12, - "learning_rate": 1.5355958940763284e-05, - "loss": 0.3232, + "epoch": 13.679700201151801, + "grad_norm": 1.688649296760559, + "learning_rate": 1.0737777186736964e-06, + "loss": 0.2021, "step": 992900 }, { - "epoch": 10.12, - "learning_rate": 1.5350169704287112e-05, - "loss": 0.2927, + "epoch": 13.68107795321154, + "grad_norm": 4.979525089263916, + "learning_rate": 1.0715608901473188e-06, + "loss": 0.2204, "step": 993000 }, { - "epoch": 10.12, - "learning_rate": 1.5344381184100202e-05, - "loss": 0.271, + "epoch": 13.682455705271279, + "grad_norm": 2.755791664123535, + "learning_rate": 1.0693463068520505e-06, + "loss": 0.1683, "step": 993100 }, { - "epoch": 10.12, - "learning_rate": 1.533859338048559e-05, - "loss": 0.331, + "epoch": 13.68383345733102, + "grad_norm": 1.8954098224639893, + "learning_rate": 1.0671339689760334e-06, + "loss": 0.254, "step": 993200 }, { - "epoch": 10.12, - "learning_rate": 1.5332864161044506e-05, - "loss": 0.2794, + "epoch": 13.685211209390758, + "grad_norm": 2.886615753173828, + "learning_rate": 1.0649238767072145e-06, + "loss": 0.2435, "step": 993300 }, { - "epoch": 10.12, - "learning_rate": 1.5327077784250646e-05, - "loss": 0.2842, + "epoch": 13.686588961450497, + "grad_norm": 3.025954246520996, + "learning_rate": 1.062716030233354e-06, + "loss": 0.2532, "step": 993400 }, { - "epoch": 10.12, - "learning_rate": 1.5321292124875126e-05, - "loss": 0.2823, + "epoch": 13.687966713510237, + "grad_norm": 2.97761607170105, + "learning_rate": 1.0605104297420196e-06, + "loss": 0.226, "step": 993500 }, { - "epoch": 10.12, - "learning_rate": 1.5315507183200818e-05, - "loss": 0.2254, + "epoch": 13.689344465569976, + "grad_norm": 0.6922687888145447, + "learning_rate": 1.0583070754205843e-06, + "loss": 0.1939, "step": 993600 }, { - "epoch": 10.12, - "learning_rate": 1.5309722959510558e-05, - "loss": 0.3056, + "epoch": 13.690722217629716, + "grad_norm": 1.5756398439407349, + "learning_rate": 1.0561059674562404e-06, + "loss": 0.2093, "step": 993700 }, { - "epoch": 10.13, - "learning_rate": 1.5303939454087208e-05, - "loss": 0.2531, + "epoch": 13.692099969689455, + "grad_norm": 2.02323579788208, + "learning_rate": 1.053907106035982e-06, + "loss": 0.1889, "step": 993800 }, { - "epoch": 10.13, - "learning_rate": 1.5298156667213502e-05, - "loss": 0.3364, + "epoch": 13.693477721749193, + "grad_norm": 4.709471225738525, + "learning_rate": 1.0517104913466076e-06, + "loss": 0.2284, "step": 993900 }, { - "epoch": 10.13, - "learning_rate": 1.5292374599172196e-05, - "loss": 0.2686, + "epoch": 13.694855473808934, + "grad_norm": 1.2264903783798218, + "learning_rate": 1.0495161235747392e-06, + "loss": 0.2226, "step": 994000 }, { - "epoch": 10.13, - "learning_rate": 1.5286593250246016e-05, - "loss": 0.3001, + "epoch": 13.696233225868673, + "grad_norm": 3.4645321369171143, + "learning_rate": 1.0473240029067932e-06, + "loss": 0.221, "step": 994100 }, { - "epoch": 10.13, - "learning_rate": 1.5280812620717613e-05, - "loss": 0.2476, + "epoch": 13.697610977928411, + "grad_norm": 12.041801452636719, + "learning_rate": 1.0451341295290098e-06, + "loss": 0.202, "step": 994200 }, { - "epoch": 10.13, - "learning_rate": 1.527503271086963e-05, - "loss": 0.3258, + "epoch": 13.698988729988152, + "grad_norm": 3.8063607215881348, + "learning_rate": 1.0429683687608186e-06, + "loss": 0.2177, "step": 994300 }, { - "epoch": 10.13, - "learning_rate": 1.5269253520984685e-05, - "loss": 0.2895, + "epoch": 13.70036648204789, + "grad_norm": 0.010017875581979752, + "learning_rate": 1.0407829680437347e-06, + "loss": 0.2172, "step": 994400 }, { - "epoch": 10.13, - "learning_rate": 1.5263475051345327e-05, - "loss": 0.2941, + "epoch": 13.70174423410763, + "grad_norm": 1.863673448562622, + "learning_rate": 1.0385998151725046e-06, + "loss": 0.2285, "step": 994500 }, { - "epoch": 10.13, - "learning_rate": 1.525769730223409e-05, - "loss": 0.3051, + "epoch": 13.70312198616737, + "grad_norm": 2.777113914489746, + "learning_rate": 1.0364189103325986e-06, + "loss": 0.2135, "step": 994600 }, { - "epoch": 10.13, - "learning_rate": 1.5251920273933491e-05, - "loss": 0.3001, + "epoch": 13.704499738227108, + "grad_norm": 9.070704460144043, + "learning_rate": 1.0342402537092889e-06, + "loss": 0.233, "step": 994700 }, { - "epoch": 10.14, - "learning_rate": 1.5246143966725967e-05, - "loss": 0.2115, + "epoch": 13.705877490286849, + "grad_norm": 1.3899420499801636, + "learning_rate": 1.0320638454876672e-06, + "loss": 0.2038, "step": 994800 }, { - "epoch": 10.14, - "learning_rate": 1.5240368380893957e-05, - "loss": 0.3273, + "epoch": 13.707255242346587, + "grad_norm": 1.8444868326187134, + "learning_rate": 1.0298896858526334e-06, + "loss": 0.1944, "step": 994900 }, { - "epoch": 10.14, - "learning_rate": 1.523459351671984e-05, - "loss": 0.2714, + "epoch": 13.708632994406326, + "grad_norm": 2.5521552562713623, + "learning_rate": 1.0277177749888883e-06, + "loss": 0.2217, "step": 995000 }, { - "epoch": 10.14, - "learning_rate": 1.522881937448601e-05, - "loss": 0.355, + "epoch": 13.710010746466066, + "grad_norm": 1.6306613683700562, + "learning_rate": 1.0255481130809532e-06, + "loss": 0.2268, "step": 995100 }, { - "epoch": 10.14, - "learning_rate": 1.5223045954474748e-05, - "loss": 0.3331, + "epoch": 13.711388498525805, + "grad_norm": 1.5699774026870728, + "learning_rate": 1.0233807003131475e-06, + "loss": 0.2458, "step": 995200 }, { - "epoch": 10.14, - "learning_rate": 1.5217273256968356e-05, - "loss": 0.2924, + "epoch": 13.712766250585545, + "grad_norm": 4.638448715209961, + "learning_rate": 1.0212155368696048e-06, + "loss": 0.2135, "step": 995300 }, { - "epoch": 10.14, - "learning_rate": 1.521155899841756e-05, - "loss": 0.3031, + "epoch": 13.714144002645284, + "grad_norm": 3.931145191192627, + "learning_rate": 1.0190526229342623e-06, + "loss": 0.2232, "step": 995400 }, { - "epoch": 10.14, - "learning_rate": 1.5205787739535532e-05, - "loss": 0.2933, + "epoch": 13.715521754705023, + "grad_norm": 4.689014434814453, + "learning_rate": 1.0168919586908812e-06, + "loss": 0.2145, "step": 995500 }, { - "epoch": 10.14, - "learning_rate": 1.5200017204002205e-05, - "loss": 0.2439, + "epoch": 13.716899506764763, + "grad_norm": 3.678219795227051, + "learning_rate": 1.0147335443230115e-06, + "loss": 0.1754, "step": 995600 }, { - "epoch": 10.14, - "learning_rate": 1.5194247392099704e-05, - "loss": 0.2544, + "epoch": 13.718277258824502, + "grad_norm": 2.0498316287994385, + "learning_rate": 1.0125773800140264e-06, + "loss": 0.2468, "step": 995700 }, { - "epoch": 10.15, - "learning_rate": 1.5188478304110151e-05, - "loss": 0.3055, + "epoch": 13.71965501088424, + "grad_norm": 1.4617855548858643, + "learning_rate": 1.0104449939484708e-06, + "loss": 0.234, "step": 995800 }, { - "epoch": 10.15, - "learning_rate": 1.5182709940315633e-05, - "loss": 0.2707, + "epoch": 13.721032762943981, + "grad_norm": 4.918128490447998, + "learning_rate": 1.0082933078014345e-06, + "loss": 0.2375, "step": 995900 }, { - "epoch": 10.15, - "learning_rate": 1.5176942300998157e-05, - "loss": 0.2763, + "epoch": 13.72241051500372, + "grad_norm": 1.2750688791275024, + "learning_rate": 1.0061438722604128e-06, + "loss": 0.1992, "step": 996000 }, { - "epoch": 10.15, - "learning_rate": 1.5171175386439746e-05, - "loss": 0.3561, + "epoch": 13.723788267063458, + "grad_norm": 3.687493085861206, + "learning_rate": 1.0039966875080122e-06, + "loss": 0.2197, "step": 996100 }, { - "epoch": 10.15, - "learning_rate": 1.5165409196922365e-05, - "loss": 0.2769, + "epoch": 13.725166019123199, + "grad_norm": 3.3317105770111084, + "learning_rate": 1.0018517537266474e-06, + "loss": 0.2298, "step": 996200 }, { - "epoch": 10.15, - "learning_rate": 1.5159643732727946e-05, - "loss": 0.2655, + "epoch": 13.726543771182937, + "grad_norm": 0.7477369904518127, + "learning_rate": 9.997090710985374e-07, + "loss": 0.232, "step": 996300 }, { - "epoch": 10.15, - "learning_rate": 1.5153878994138386e-05, - "loss": 0.3115, + "epoch": 13.727921523242678, + "grad_norm": 5.2168288230896, + "learning_rate": 9.975686398057147e-07, + "loss": 0.2517, "step": 996400 }, { - "epoch": 10.15, - "learning_rate": 1.5148114981435556e-05, - "loss": 0.3356, + "epoch": 13.729299275302417, + "grad_norm": 11.2040376663208, + "learning_rate": 9.954304600300226e-07, + "loss": 0.2599, "step": 996500 }, { - "epoch": 10.15, - "learning_rate": 1.514235169490126e-05, - "loss": 0.2953, + "epoch": 13.730677027362155, + "grad_norm": 3.6550655364990234, + "learning_rate": 9.932945319531126e-07, + "loss": 0.1728, "step": 996600 }, { - "epoch": 10.15, - "learning_rate": 1.5136589134817303e-05, - "loss": 0.2695, + "epoch": 13.732054779421896, + "grad_norm": 3.051327705383301, + "learning_rate": 9.91160855756434e-07, + "loss": 0.1992, "step": 996700 }, { - "epoch": 10.16, - "learning_rate": 1.5130827301465446e-05, - "loss": 0.3138, + "epoch": 13.733432531481634, + "grad_norm": 1.389105200767517, + "learning_rate": 9.890294316212653e-07, + "loss": 0.2389, "step": 996800 }, { - "epoch": 10.16, - "learning_rate": 1.512506619512739e-05, - "loss": 0.273, + "epoch": 13.734810283541375, + "grad_norm": 1.5840896368026733, + "learning_rate": 9.869002597286686e-07, + "loss": 0.1737, "step": 996900 }, { - "epoch": 10.16, - "learning_rate": 1.5119305816084826e-05, - "loss": 0.3209, + "epoch": 13.736188035601113, + "grad_norm": 1.3473501205444336, + "learning_rate": 9.847733402595408e-07, + "loss": 0.2269, "step": 997000 }, { - "epoch": 10.16, - "learning_rate": 1.5113546164619418e-05, - "loss": 0.2925, + "epoch": 13.737565787660852, + "grad_norm": 1.070029616355896, + "learning_rate": 9.82648673394565e-07, + "loss": 0.2188, "step": 997100 }, { - "epoch": 10.16, - "learning_rate": 1.5107787241012753e-05, - "loss": 0.2819, + "epoch": 13.738943539720593, + "grad_norm": 10.101550102233887, + "learning_rate": 9.805262593142505e-07, + "loss": 0.2078, "step": 997200 }, { - "epoch": 10.16, - "learning_rate": 1.5102029045546417e-05, - "loss": 0.2901, + "epoch": 13.740321291780331, + "grad_norm": 3.133347511291504, + "learning_rate": 9.78406098198902e-07, + "loss": 0.2422, "step": 997300 }, { - "epoch": 10.16, - "learning_rate": 1.5096271578501957e-05, - "loss": 0.2541, + "epoch": 13.74169904384007, + "grad_norm": 3.3848319053649902, + "learning_rate": 9.76288190228635e-07, + "loss": 0.1921, "step": 997400 }, { - "epoch": 10.16, - "learning_rate": 1.5090514840160875e-05, - "loss": 0.2015, + "epoch": 13.74307679589981, + "grad_norm": 0.38242432475090027, + "learning_rate": 9.741725355833814e-07, + "loss": 0.2518, "step": 997500 }, { - "epoch": 10.16, - "learning_rate": 1.5084758830804643e-05, - "loss": 0.3325, + "epoch": 13.744454547959549, + "grad_norm": 1.6492681503295898, + "learning_rate": 9.72059134442881e-07, + "loss": 0.1939, "step": 997600 }, { - "epoch": 10.16, - "learning_rate": 1.5079003550714694e-05, - "loss": 0.2414, + "epoch": 13.745832300019288, + "grad_norm": 1.232793927192688, + "learning_rate": 9.699479869866666e-07, + "loss": 0.2167, "step": 997700 }, { - "epoch": 10.17, - "learning_rate": 1.507324900017244e-05, - "loss": 0.3616, + "epoch": 13.747210052079028, + "grad_norm": 0.5539990663528442, + "learning_rate": 9.678390933941017e-07, + "loss": 0.2003, "step": 997800 }, { - "epoch": 10.17, - "learning_rate": 1.506749517945922e-05, - "loss": 0.284, + "epoch": 13.748587804138767, + "grad_norm": 1.1356955766677856, + "learning_rate": 9.657324538443434e-07, + "loss": 0.2069, "step": 997900 }, { - "epoch": 10.17, - "learning_rate": 1.5061742088856372e-05, - "loss": 0.2425, + "epoch": 13.749965556198507, + "grad_norm": 2.537792682647705, + "learning_rate": 9.636280685163622e-07, + "loss": 0.2454, "step": 998000 }, { - "epoch": 10.17, - "learning_rate": 1.5055989728645201e-05, - "loss": 0.2997, + "epoch": 13.751343308258246, + "grad_norm": 0.834906280040741, + "learning_rate": 9.615259375889333e-07, + "loss": 0.2072, "step": 998100 }, { - "epoch": 10.17, - "learning_rate": 1.5050238099106938e-05, - "loss": 0.2713, + "epoch": 13.752721060317985, + "grad_norm": 2.8623859882354736, + "learning_rate": 9.59426061240648e-07, + "loss": 0.2287, "step": 998200 }, { - "epoch": 10.17, - "learning_rate": 1.504448720052282e-05, - "loss": 0.3018, + "epoch": 13.754098812377725, + "grad_norm": 1.4833942651748657, + "learning_rate": 9.57328439649897e-07, + "loss": 0.2205, "step": 998300 }, { - "epoch": 10.17, - "learning_rate": 1.5038737033174039e-05, - "loss": 0.3064, + "epoch": 13.755476564437464, + "grad_norm": 19.024581909179688, + "learning_rate": 9.552330729948877e-07, + "loss": 0.1983, "step": 998400 }, { - "epoch": 10.17, - "learning_rate": 1.5032987597341716e-05, - "loss": 0.3314, + "epoch": 13.756854316497202, + "grad_norm": 2.09698224067688, + "learning_rate": 9.531399614536315e-07, + "loss": 0.1883, "step": 998500 }, { - "epoch": 10.17, - "learning_rate": 1.5027238893306987e-05, - "loss": 0.3292, + "epoch": 13.758232068556943, + "grad_norm": 2.571943998336792, + "learning_rate": 9.510700026021655e-07, + "loss": 0.2129, "step": 998600 }, { - "epoch": 10.17, - "learning_rate": 1.5021490921350923e-05, - "loss": 0.2814, + "epoch": 13.759609820616681, + "grad_norm": 0.8022386431694031, + "learning_rate": 9.48981379266117e-07, + "loss": 0.1529, "step": 998700 }, { - "epoch": 10.18, - "learning_rate": 1.5015743681754563e-05, - "loss": 0.2881, + "epoch": 13.760987572676422, + "grad_norm": 0.9395287036895752, + "learning_rate": 9.468950115749292e-07, + "loss": 0.2552, "step": 998800 }, { - "epoch": 10.18, - "learning_rate": 1.500999717479892e-05, - "loss": 0.3107, + "epoch": 13.76236532473616, + "grad_norm": 1.4781537055969238, + "learning_rate": 9.448108997058536e-07, + "loss": 0.2411, "step": 998900 }, { - "epoch": 10.18, - "learning_rate": 1.5004251400764952e-05, - "loss": 0.2446, + "epoch": 13.7637430767959, + "grad_norm": 8.282505989074707, + "learning_rate": 9.427290438359553e-07, + "loss": 0.1979, "step": 999000 }, { - "epoch": 10.18, - "learning_rate": 1.4998506359933617e-05, - "loss": 0.2392, + "epoch": 13.76512082885564, + "grad_norm": 6.674598693847656, + "learning_rate": 9.406494441420862e-07, + "loss": 0.2053, "step": 999100 }, { - "epoch": 10.18, - "learning_rate": 1.4992762052585785e-05, - "loss": 0.2992, + "epoch": 13.766498580915378, + "grad_norm": 1.7220368385314941, + "learning_rate": 9.385721008009207e-07, + "loss": 0.2095, "step": 999200 }, { - "epoch": 10.18, - "learning_rate": 1.498701847900233e-05, - "loss": 0.3003, + "epoch": 13.767876332975117, + "grad_norm": 0.21435314416885376, + "learning_rate": 9.36497013988947e-07, + "loss": 0.1936, "step": 999300 }, { - "epoch": 10.18, - "learning_rate": 1.4981275639464093e-05, - "loss": 0.2858, + "epoch": 13.769254085034857, + "grad_norm": 3.895564556121826, + "learning_rate": 9.344241838824458e-07, + "loss": 0.2279, "step": 999400 }, { - "epoch": 10.18, - "learning_rate": 1.4975533534251835e-05, - "loss": 0.2691, + "epoch": 13.770631837094596, + "grad_norm": 4.139699459075928, + "learning_rate": 9.323536106575236e-07, + "loss": 0.2265, "step": 999500 }, { - "epoch": 10.18, - "learning_rate": 1.4969792163646327e-05, - "loss": 0.2896, + "epoch": 13.772009589154337, + "grad_norm": 3.963125228881836, + "learning_rate": 9.302852944900767e-07, + "loss": 0.2422, "step": 999600 }, { - "epoch": 10.19, - "learning_rate": 1.4964108930646857e-05, - "loss": 0.318, + "epoch": 13.773387341214075, + "grad_norm": 6.898953914642334, + "learning_rate": 9.282192355558267e-07, + "loss": 0.207, "step": 999700 }, { - "epoch": 10.19, - "learning_rate": 1.4958369022743894e-05, - "loss": 0.2662, + "epoch": 13.774765093273814, + "grad_norm": 2.6803648471832275, + "learning_rate": 9.261554340302911e-07, + "loss": 0.1944, "step": 999800 }, { - "epoch": 10.19, - "learning_rate": 1.4952629850286935e-05, - "loss": 0.2526, + "epoch": 13.776142845333554, + "grad_norm": 2.988354206085205, + "learning_rate": 9.240938900888011e-07, + "loss": 0.2193, "step": 999900 }, { - "epoch": 10.19, - "learning_rate": 1.4946891413556568e-05, - "loss": 0.2716, + "epoch": 13.777520597393293, + "grad_norm": 5.72231388092041, + "learning_rate": 9.220346039065014e-07, + "loss": 0.2411, "step": 1000000 }, { - "epoch": 10.19, - "learning_rate": 1.494121108619645e-05, - "loss": 0.3408, + "epoch": 13.778898349453032, + "grad_norm": 1.2038748264312744, + "learning_rate": 9.199775756583326e-07, + "loss": 0.2343, "step": 1000100 }, { - "epoch": 10.19, - "learning_rate": 1.493547411439671e-05, - "loss": 0.3107, + "epoch": 13.780276101512772, + "grad_norm": 0.06246373429894447, + "learning_rate": 9.179228055190486e-07, + "loss": 0.2434, "step": 1000200 }, { - "epoch": 10.19, - "learning_rate": 1.4929737879162386e-05, - "loss": 0.2623, + "epoch": 13.78165385357251, + "grad_norm": 0.20930127799510956, + "learning_rate": 9.158702936632174e-07, + "loss": 0.1959, "step": 1000300 }, { - "epoch": 10.19, - "learning_rate": 1.4924002380773956e-05, - "loss": 0.2901, + "epoch": 13.78303160563225, + "grad_norm": 1.3775873184204102, + "learning_rate": 9.138200402652053e-07, + "loss": 0.2168, "step": 1000400 }, { - "epoch": 10.19, - "learning_rate": 1.4918267619511831e-05, - "loss": 0.2981, + "epoch": 13.78440935769199, + "grad_norm": 1.4312326908111572, + "learning_rate": 9.117720454991954e-07, + "loss": 0.2183, "step": 1000500 }, { - "epoch": 10.19, - "learning_rate": 1.4912533595656413e-05, - "loss": 0.3036, + "epoch": 13.785787109751729, + "grad_norm": 1.8419893980026245, + "learning_rate": 9.097263095391786e-07, + "loss": 0.2076, "step": 1000600 }, { - "epoch": 10.2, - "learning_rate": 1.4906800309488081e-05, - "loss": 0.2691, + "epoch": 13.787164861811469, + "grad_norm": 1.177031397819519, + "learning_rate": 9.076828325589473e-07, + "loss": 0.2087, "step": 1000700 }, { - "epoch": 10.2, - "learning_rate": 1.490106776128713e-05, - "loss": 0.227, + "epoch": 13.788542613871208, + "grad_norm": 1.5196750164031982, + "learning_rate": 9.056416147320984e-07, + "loss": 0.1741, "step": 1000800 }, { - "epoch": 10.2, - "learning_rate": 1.4895335951333857e-05, - "loss": 0.3355, + "epoch": 13.789920365930946, + "grad_norm": 0.05920976772904396, + "learning_rate": 9.036026562320518e-07, + "loss": 0.1747, "step": 1000900 }, { - "epoch": 10.2, - "learning_rate": 1.488960487990853e-05, - "loss": 0.337, + "epoch": 13.791298117990687, + "grad_norm": 1.096441626548767, + "learning_rate": 9.01565957232032e-07, + "loss": 0.2272, "step": 1001000 }, { - "epoch": 10.2, - "learning_rate": 1.4883874547291335e-05, - "loss": 0.3396, + "epoch": 13.792675870050426, + "grad_norm": 1.5265923738479614, + "learning_rate": 8.995315179050528e-07, + "loss": 0.2286, "step": 1001100 }, { - "epoch": 10.2, - "learning_rate": 1.4878144953762471e-05, - "loss": 0.2745, + "epoch": 13.794053622110166, + "grad_norm": 0.43982061743736267, + "learning_rate": 8.974993384239633e-07, + "loss": 0.2073, "step": 1001200 }, { - "epoch": 10.2, - "learning_rate": 1.4872416099602075e-05, - "loss": 0.2923, + "epoch": 13.795431374169905, + "grad_norm": 1.9879732131958008, + "learning_rate": 8.954694189614046e-07, + "loss": 0.2374, "step": 1001300 }, { - "epoch": 10.2, - "learning_rate": 1.4866687985090253e-05, - "loss": 0.2976, + "epoch": 13.796809126229643, + "grad_norm": 0.34522032737731934, + "learning_rate": 8.934417596898198e-07, + "loss": 0.1905, "step": 1001400 }, { - "epoch": 10.2, - "learning_rate": 1.4860960610507079e-05, - "loss": 0.2592, + "epoch": 13.798186878289384, + "grad_norm": 2.9486215114593506, + "learning_rate": 8.914163607814777e-07, + "loss": 0.2642, "step": 1001500 }, { - "epoch": 10.2, - "learning_rate": 1.4855233976132598e-05, - "loss": 0.2883, + "epoch": 13.799564630349122, + "grad_norm": 1.0944488048553467, + "learning_rate": 8.893932224084486e-07, + "loss": 0.2213, "step": 1001600 }, { - "epoch": 10.21, - "learning_rate": 1.4849508082246785e-05, - "loss": 0.2998, + "epoch": 13.800942382408861, + "grad_norm": 1.9349406957626343, + "learning_rate": 8.873723447425986e-07, + "loss": 0.2633, "step": 1001700 }, { - "epoch": 10.21, - "learning_rate": 1.4843782929129612e-05, - "loss": 0.2677, + "epoch": 13.802320134468602, + "grad_norm": 0.47075796127319336, + "learning_rate": 8.853537279556223e-07, + "loss": 0.168, "step": 1001800 }, { - "epoch": 10.21, - "learning_rate": 1.4838058517061008e-05, - "loss": 0.2596, + "epoch": 13.80369788652834, + "grad_norm": 2.7314095497131348, + "learning_rate": 8.833373722190011e-07, + "loss": 0.2101, "step": 1001900 }, { - "epoch": 10.21, - "learning_rate": 1.483233484632087e-05, - "loss": 0.299, + "epoch": 13.805075638588079, + "grad_norm": 8.204957962036133, + "learning_rate": 8.813232777040451e-07, + "loss": 0.2512, "step": 1002000 }, { - "epoch": 10.21, - "learning_rate": 1.4826611917189032e-05, - "loss": 0.298, + "epoch": 13.80645339064782, + "grad_norm": 0.17167074978351593, + "learning_rate": 8.793114445818537e-07, + "loss": 0.2208, "step": 1002100 }, { - "epoch": 10.21, - "learning_rate": 1.4820889729945322e-05, - "loss": 0.3756, + "epoch": 13.807831142707558, + "grad_norm": 3.848662853240967, + "learning_rate": 8.773018730233465e-07, + "loss": 0.1929, "step": 1002200 }, { - "epoch": 10.21, - "learning_rate": 1.481516828486953e-05, - "loss": 0.2905, + "epoch": 13.809208894767298, + "grad_norm": 1.5230530500411987, + "learning_rate": 8.752945631992473e-07, + "loss": 0.182, "step": 1002300 }, { - "epoch": 10.21, - "learning_rate": 1.480944758224138e-05, - "loss": 0.3306, + "epoch": 13.810586646827037, + "grad_norm": 3.1771936416625977, + "learning_rate": 8.732895152800849e-07, + "loss": 0.219, "step": 1002400 }, { - "epoch": 10.21, - "learning_rate": 1.4803727622340591e-05, - "loss": 0.2646, + "epoch": 13.811964398886776, + "grad_norm": 2.234166383743286, + "learning_rate": 8.712867294361984e-07, + "loss": 0.2348, "step": 1002500 }, { - "epoch": 10.21, - "learning_rate": 1.4798065593936974e-05, - "loss": 0.3013, + "epoch": 13.813342150946516, + "grad_norm": 3.100041151046753, + "learning_rate": 8.692862058377379e-07, + "loss": 0.2369, "step": 1002600 }, { - "epoch": 10.22, - "learning_rate": 1.4792347112895637e-05, - "loss": 0.2957, + "epoch": 13.814719903006255, + "grad_norm": 0.47130268812179565, + "learning_rate": 8.67287944654652e-07, + "loss": 0.2623, "step": 1002700 }, { - "epoch": 10.22, - "learning_rate": 1.4786629375417766e-05, - "loss": 0.3258, + "epoch": 13.816097655065994, + "grad_norm": 1.7812787294387817, + "learning_rate": 8.652919460567089e-07, + "loss": 0.1906, "step": 1002800 }, { - "epoch": 10.22, - "learning_rate": 1.4780912381782936e-05, - "loss": 0.2585, + "epoch": 13.817475407125734, + "grad_norm": 3.329913854598999, + "learning_rate": 8.632982102134814e-07, + "loss": 0.1965, "step": 1002900 }, { - "epoch": 10.22, - "learning_rate": 1.4775196132270664e-05, - "loss": 0.2778, + "epoch": 13.818853159185473, + "grad_norm": 0.4932354986667633, + "learning_rate": 8.613067372943415e-07, + "loss": 0.1987, "step": 1003000 }, { - "epoch": 10.22, - "learning_rate": 1.4769480627160457e-05, - "loss": 0.2376, + "epoch": 13.820230911245213, + "grad_norm": 1.7606476545333862, + "learning_rate": 8.593175274684739e-07, + "loss": 0.209, "step": 1003100 }, { - "epoch": 10.22, - "learning_rate": 1.4763765866731744e-05, - "loss": 0.205, + "epoch": 13.821608663304952, + "grad_norm": 0.14409102499485016, + "learning_rate": 8.57330580904881e-07, + "loss": 0.2578, "step": 1003200 }, { - "epoch": 10.22, - "learning_rate": 1.4758051851263956e-05, - "loss": 0.3115, + "epoch": 13.82298641536469, + "grad_norm": 1.501898169517517, + "learning_rate": 8.553458977723508e-07, + "loss": 0.2093, "step": 1003300 }, { - "epoch": 10.22, - "learning_rate": 1.4752338581036482e-05, - "loss": 0.3018, + "epoch": 13.82436416742443, + "grad_norm": 3.4398627281188965, + "learning_rate": 8.53363478239507e-07, + "loss": 0.1864, "step": 1003400 }, { - "epoch": 10.22, - "learning_rate": 1.474662605632865e-05, - "loss": 0.2891, + "epoch": 13.82574191948417, + "grad_norm": 2.122560739517212, + "learning_rate": 8.51383322474753e-07, + "loss": 0.1831, "step": 1003500 }, { - "epoch": 10.22, - "learning_rate": 1.4740914277419775e-05, - "loss": 0.2489, + "epoch": 13.827119671543908, + "grad_norm": 2.6567516326904297, + "learning_rate": 8.494054306463217e-07, + "loss": 0.2242, "step": 1003600 }, { - "epoch": 10.23, - "learning_rate": 1.4735203244589141e-05, - "loss": 0.2955, + "epoch": 13.828497423603649, + "grad_norm": 2.4729018211364746, + "learning_rate": 8.47429802922241e-07, + "loss": 0.2249, "step": 1003700 }, { - "epoch": 10.23, - "learning_rate": 1.4729492958115965e-05, - "loss": 0.2232, + "epoch": 13.829875175663387, + "grad_norm": 0.2673816978931427, + "learning_rate": 8.454564394703529e-07, + "loss": 0.213, "step": 1003800 }, { - "epoch": 10.23, - "learning_rate": 1.472378341827945e-05, - "loss": 0.2541, + "epoch": 13.831252927723128, + "grad_norm": 3.0601062774658203, + "learning_rate": 8.434853404583068e-07, + "loss": 0.1995, "step": 1003900 }, { - "epoch": 10.23, - "learning_rate": 1.4718074625358765e-05, - "loss": 0.2894, + "epoch": 13.832630679782866, + "grad_norm": 0.12234704941511154, + "learning_rate": 8.415165060535537e-07, + "loss": 0.2038, "step": 1004000 }, { - "epoch": 10.23, - "learning_rate": 1.4712366579633032e-05, - "loss": 0.3067, + "epoch": 13.834008431842605, + "grad_norm": 1.2257379293441772, + "learning_rate": 8.395499364233556e-07, + "loss": 0.2476, "step": 1004100 }, { - "epoch": 10.23, - "learning_rate": 1.470665928138134e-05, - "loss": 0.271, + "epoch": 13.835386183902346, + "grad_norm": 0.32526853680610657, + "learning_rate": 8.375856317347847e-07, + "loss": 0.1825, "step": 1004200 }, { - "epoch": 10.23, - "learning_rate": 1.4700952730882744e-05, - "loss": 0.3345, + "epoch": 13.836763935962084, + "grad_norm": 1.6427152156829834, + "learning_rate": 8.356235921547181e-07, + "loss": 0.2387, "step": 1004300 }, { - "epoch": 10.23, - "learning_rate": 1.4695246928416271e-05, - "loss": 0.2724, + "epoch": 13.838141688021823, + "grad_norm": 1.9942232370376587, + "learning_rate": 8.336638178498407e-07, + "loss": 0.2403, "step": 1004400 }, { - "epoch": 10.23, - "learning_rate": 1.4689541874260877e-05, - "loss": 0.3322, + "epoch": 13.839519440081563, + "grad_norm": 4.2254767417907715, + "learning_rate": 8.317063089866478e-07, + "loss": 0.2026, "step": 1004500 }, { - "epoch": 10.24, - "learning_rate": 1.4683837568695518e-05, - "loss": 0.301, + "epoch": 13.840897192141302, + "grad_norm": 3.479304790496826, + "learning_rate": 8.297510657314394e-07, + "loss": 0.1911, "step": 1004600 }, { - "epoch": 10.24, - "learning_rate": 1.4678134011999113e-05, - "loss": 0.2814, + "epoch": 13.84227494420104, + "grad_norm": 0.5361469984054565, + "learning_rate": 8.277980882503202e-07, + "loss": 0.201, "step": 1004700 }, { - "epoch": 10.24, - "learning_rate": 1.4672431204450506e-05, - "loss": 0.3129, + "epoch": 13.843652696260781, + "grad_norm": 2.803344249725342, + "learning_rate": 8.258473767092057e-07, + "loss": 0.2316, "step": 1004800 }, { - "epoch": 10.24, - "learning_rate": 1.4666729146328542e-05, - "loss": 0.2399, + "epoch": 13.84503044832052, + "grad_norm": 4.163804054260254, + "learning_rate": 8.238989312738246e-07, + "loss": 0.2546, "step": 1004900 }, { - "epoch": 10.24, - "learning_rate": 1.4661084847284234e-05, - "loss": 0.2578, + "epoch": 13.84640820038026, + "grad_norm": 0.056751832365989685, + "learning_rate": 8.219527521096987e-07, + "loss": 0.2349, "step": 1005000 }, { - "epoch": 10.24, - "learning_rate": 1.4655384281350689e-05, - "loss": 0.3071, + "epoch": 13.847785952439999, + "grad_norm": 0.1597357541322708, + "learning_rate": 8.200088393821753e-07, + "loss": 0.2149, "step": 1005100 }, { - "epoch": 10.24, - "learning_rate": 1.4649684465677282e-05, - "loss": 0.2754, + "epoch": 13.849163704499738, + "grad_norm": 2.5986826419830322, + "learning_rate": 8.180865984974265e-07, + "loss": 0.2138, "step": 1005200 }, { - "epoch": 10.24, - "learning_rate": 1.4643985400542697e-05, - "loss": 0.3098, + "epoch": 13.850541456559478, + "grad_norm": 3.167032480239868, + "learning_rate": 8.161471964698605e-07, + "loss": 0.1619, "step": 1005300 }, { - "epoch": 10.24, - "learning_rate": 1.4638287086225589e-05, - "loss": 0.3784, + "epoch": 13.851919208619217, + "grad_norm": 4.284632682800293, + "learning_rate": 8.142100613720971e-07, + "loss": 0.2072, "step": 1005400 }, { - "epoch": 10.24, - "learning_rate": 1.4632589523004577e-05, - "loss": 0.2742, + "epoch": 13.853296960678957, + "grad_norm": 1.47388756275177, + "learning_rate": 8.122751933687142e-07, + "loss": 0.2735, "step": 1005500 }, { - "epoch": 10.25, - "learning_rate": 1.462689271115822e-05, - "loss": 0.2835, + "epoch": 13.854674712738696, + "grad_norm": 0.350028932094574, + "learning_rate": 8.103425926240885e-07, + "loss": 0.1751, "step": 1005600 }, { - "epoch": 10.25, - "learning_rate": 1.4621196650965068e-05, - "loss": 0.2826, + "epoch": 13.856052464798434, + "grad_norm": 2.342237710952759, + "learning_rate": 8.084122593024041e-07, + "loss": 0.226, "step": 1005700 }, { - "epoch": 10.25, - "learning_rate": 1.4615501342703634e-05, - "loss": 0.2995, + "epoch": 13.857430216858175, + "grad_norm": 2.5162465572357178, + "learning_rate": 8.064841935676498e-07, + "loss": 0.2213, "step": 1005800 }, { - "epoch": 10.25, - "learning_rate": 1.4609806786652363e-05, - "loss": 0.2898, + "epoch": 13.858807968917914, + "grad_norm": 1.6593633890151978, + "learning_rate": 8.045583955836252e-07, + "loss": 0.1827, "step": 1005900 }, { - "epoch": 10.25, - "learning_rate": 1.4604112983089697e-05, - "loss": 0.3088, + "epoch": 13.860185720977652, + "grad_norm": 0.9531621932983398, + "learning_rate": 8.026348655139406e-07, + "loss": 0.2231, "step": 1006000 }, { - "epoch": 10.25, - "learning_rate": 1.459841993229404e-05, - "loss": 0.28, + "epoch": 13.861563473037393, + "grad_norm": 2.44447660446167, + "learning_rate": 8.007136035220047e-07, + "loss": 0.2264, "step": 1006100 }, { - "epoch": 10.25, - "learning_rate": 1.4592727634543723e-05, - "loss": 0.2632, + "epoch": 13.862941225097131, + "grad_norm": 1.9165898561477661, + "learning_rate": 7.98794609771043e-07, + "loss": 0.1865, "step": 1006200 }, { - "epoch": 10.25, - "learning_rate": 1.458703609011708e-05, - "loss": 0.2627, + "epoch": 13.86431897715687, + "grad_norm": 0.11040189117193222, + "learning_rate": 7.968778844240826e-07, + "loss": 0.1818, "step": 1006300 }, { - "epoch": 10.25, - "learning_rate": 1.4581345299292392e-05, - "loss": 0.2809, + "epoch": 13.86569672921661, + "grad_norm": 0.032923463732004166, + "learning_rate": 7.949634276439552e-07, + "loss": 0.2265, "step": 1006400 }, { - "epoch": 10.25, - "learning_rate": 1.4575655262347908e-05, - "loss": 0.3289, + "epoch": 13.86707448127635, + "grad_norm": 4.999039649963379, + "learning_rate": 7.930512395933063e-07, + "loss": 0.2294, "step": 1006500 }, { - "epoch": 10.26, - "learning_rate": 1.4569965979561832e-05, - "loss": 0.2821, + "epoch": 13.86845223333609, + "grad_norm": 2.618626594543457, + "learning_rate": 7.911413204345888e-07, + "loss": 0.1847, "step": 1006600 }, { - "epoch": 10.26, - "learning_rate": 1.4564277451212336e-05, - "loss": 0.3254, + "epoch": 13.869829985395828, + "grad_norm": 2.1467878818511963, + "learning_rate": 7.892527355987517e-07, + "loss": 0.2381, "step": 1006700 }, { - "epoch": 10.26, - "learning_rate": 1.4558589677577574e-05, - "loss": 0.2879, + "epoch": 13.871207737455567, + "grad_norm": 1.197317361831665, + "learning_rate": 7.873473320175051e-07, + "loss": 0.2774, "step": 1006800 }, { - "epoch": 10.26, - "learning_rate": 1.455290265893561e-05, - "loss": 0.2809, + "epoch": 13.872585489515307, + "grad_norm": 3.0321991443634033, + "learning_rate": 7.854441978127649e-07, + "loss": 0.245, "step": 1006900 }, { - "epoch": 10.26, - "learning_rate": 1.4547216395564524e-05, - "loss": 0.3273, + "epoch": 13.873963241575046, + "grad_norm": 3.3284738063812256, + "learning_rate": 7.835433331462119e-07, + "loss": 0.2107, "step": 1007000 }, { - "epoch": 10.26, - "learning_rate": 1.4541530887742352e-05, - "loss": 0.3139, + "epoch": 13.875340993634785, + "grad_norm": 2.685920476913452, + "learning_rate": 7.816447381793312e-07, + "loss": 0.2048, "step": 1007100 }, { - "epoch": 10.26, - "learning_rate": 1.4535846135747057e-05, - "loss": 0.3257, + "epoch": 13.876718745694525, + "grad_norm": 2.6791586875915527, + "learning_rate": 7.797484130734189e-07, + "loss": 0.2281, "step": 1007200 }, { - "epoch": 10.26, - "learning_rate": 1.4530162139856596e-05, - "loss": 0.2878, + "epoch": 13.878096497754264, + "grad_norm": 0.05113573744893074, + "learning_rate": 7.778543579895847e-07, + "loss": 0.1838, "step": 1007300 }, { - "epoch": 10.26, - "learning_rate": 1.4524478900348904e-05, - "loss": 0.2488, + "epoch": 13.879474249814004, + "grad_norm": 0.7338482141494751, + "learning_rate": 7.759625730887277e-07, + "loss": 0.2053, "step": 1007400 }, { - "epoch": 10.26, - "learning_rate": 1.4518796417501828e-05, - "loss": 0.2924, + "epoch": 13.880852001873743, + "grad_norm": 2.5004451274871826, + "learning_rate": 7.740730585315697e-07, + "loss": 0.1948, "step": 1007500 }, { - "epoch": 10.27, - "learning_rate": 1.4513114691593216e-05, - "loss": 0.2851, + "epoch": 13.882229753933482, + "grad_norm": 0.45705825090408325, + "learning_rate": 7.721858144786345e-07, + "loss": 0.2043, "step": 1007600 }, { - "epoch": 10.27, - "learning_rate": 1.4507433722900878e-05, - "loss": 0.2275, + "epoch": 13.883607505993222, + "grad_norm": 2.3425233364105225, + "learning_rate": 7.703008410902563e-07, + "loss": 0.1871, "step": 1007700 }, { - "epoch": 10.27, - "learning_rate": 1.4501753511702579e-05, - "loss": 0.2354, + "epoch": 13.88498525805296, + "grad_norm": 2.4338724613189697, + "learning_rate": 7.684181385265676e-07, + "loss": 0.2233, "step": 1007800 }, { - "epoch": 10.27, - "learning_rate": 1.449607405827604e-05, - "loss": 0.335, + "epoch": 13.8863630101127, + "grad_norm": 1.2534384727478027, + "learning_rate": 7.665377069475121e-07, + "loss": 0.2093, "step": 1007900 }, { - "epoch": 10.27, - "learning_rate": 1.4490395362898973e-05, - "loss": 0.2356, + "epoch": 13.88774076217244, + "grad_norm": 2.705702543258667, + "learning_rate": 7.646595465128531e-07, + "loss": 0.2155, "step": 1008000 }, { - "epoch": 10.27, - "learning_rate": 1.4484717425849003e-05, - "loss": 0.2337, + "epoch": 13.889118514232178, + "grad_norm": 0.4062190651893616, + "learning_rate": 7.62783657382137e-07, + "loss": 0.2067, "step": 1008100 }, { - "epoch": 10.27, - "learning_rate": 1.4479040247403758e-05, - "loss": 0.279, + "epoch": 13.890496266291919, + "grad_norm": 0.5641285181045532, + "learning_rate": 7.609100397147364e-07, + "loss": 0.2015, "step": 1008200 }, { - "epoch": 10.27, - "learning_rate": 1.4473363827840834e-05, - "loss": 0.2913, + "epoch": 13.891874018351658, + "grad_norm": 0.33847561478614807, + "learning_rate": 7.590386936698254e-07, + "loss": 0.2376, "step": 1008300 }, { - "epoch": 10.27, - "learning_rate": 1.446768816743775e-05, - "loss": 0.2791, + "epoch": 13.893251770411396, + "grad_norm": 1.2037060260772705, + "learning_rate": 7.571696194063857e-07, + "loss": 0.2193, "step": 1008400 }, { - "epoch": 10.27, - "learning_rate": 1.446201326647202e-05, - "loss": 0.2633, + "epoch": 13.894629522471137, + "grad_norm": 1.8664073944091797, + "learning_rate": 7.553028170831974e-07, + "loss": 0.2396, "step": 1008500 }, { - "epoch": 10.28, - "learning_rate": 1.4456339125221118e-05, - "loss": 0.2324, + "epoch": 13.896007274530875, + "grad_norm": 2.2433054447174072, + "learning_rate": 7.534382868588607e-07, + "loss": 0.1571, "step": 1008600 }, { - "epoch": 10.28, - "learning_rate": 1.4450665743962483e-05, - "loss": 0.2764, + "epoch": 13.897385026590614, + "grad_norm": 5.6116766929626465, + "learning_rate": 7.515760288917803e-07, + "loss": 0.2249, "step": 1008700 }, { - "epoch": 10.28, - "learning_rate": 1.444499312297349e-05, - "loss": 0.2692, + "epoch": 13.898762778650354, + "grad_norm": 3.0396549701690674, + "learning_rate": 7.497160433401531e-07, + "loss": 0.2012, "step": 1008800 }, { - "epoch": 10.28, - "learning_rate": 1.4439321262531502e-05, - "loss": 0.2472, + "epoch": 13.900140530710093, + "grad_norm": 0.0740162581205368, + "learning_rate": 7.478583303620084e-07, + "loss": 0.1661, "step": 1008900 }, { - "epoch": 10.28, - "learning_rate": 1.4433650162913847e-05, - "loss": 0.2268, + "epoch": 13.901518282769832, + "grad_norm": 1.833280324935913, + "learning_rate": 7.460028901151617e-07, + "loss": 0.183, "step": 1009000 }, { - "epoch": 10.28, - "learning_rate": 1.4428036524014607e-05, - "loss": 0.2616, + "epoch": 13.902896034829572, + "grad_norm": 2.4394686222076416, + "learning_rate": 7.441497227572359e-07, + "loss": 0.1971, "step": 1009100 }, { - "epoch": 10.28, - "learning_rate": 1.442236693926226e-05, - "loss": 0.2654, + "epoch": 13.904273786889311, + "grad_norm": 2.9945454597473145, + "learning_rate": 7.422988284456769e-07, + "loss": 0.2536, "step": 1009200 }, { - "epoch": 10.28, - "learning_rate": 1.4416698116163203e-05, - "loss": 0.3265, + "epoch": 13.905651538949051, + "grad_norm": 3.1405553817749023, + "learning_rate": 7.404686822959275e-07, + "loss": 0.2015, "step": 1009300 }, { - "epoch": 10.28, - "learning_rate": 1.4411086731833844e-05, - "loss": 0.3136, + "epoch": 13.90702929100879, + "grad_norm": 1.5918724536895752, + "learning_rate": 7.38622311814249e-07, + "loss": 0.2287, "step": 1009400 }, { - "epoch": 10.28, - "learning_rate": 1.4405419425249408e-05, - "loss": 0.2876, + "epoch": 13.908407043068529, + "grad_norm": 1.7790374755859375, + "learning_rate": 7.367782148485125e-07, + "loss": 0.2501, "step": 1009500 }, { - "epoch": 10.29, - "learning_rate": 1.4399752881146909e-05, - "loss": 0.2668, + "epoch": 13.90978479512827, + "grad_norm": 8.950652122497559, + "learning_rate": 7.349363915553883e-07, + "loss": 0.2294, "step": 1009600 }, { - "epoch": 10.29, - "learning_rate": 1.4394087099803386e-05, - "loss": 0.2245, + "epoch": 13.911162547188008, + "grad_norm": 3.0867457389831543, + "learning_rate": 7.330968420913425e-07, + "loss": 0.2601, "step": 1009700 }, { - "epoch": 10.29, - "learning_rate": 1.4388422081495875e-05, - "loss": 0.2939, + "epoch": 13.912540299247748, + "grad_norm": 0.4622548818588257, + "learning_rate": 7.31259566612664e-07, + "loss": 0.2155, "step": 1009800 }, { - "epoch": 10.29, - "learning_rate": 1.4382757826501358e-05, - "loss": 0.2816, + "epoch": 13.913918051307487, + "grad_norm": 0.2727733552455902, + "learning_rate": 7.294245652754278e-07, + "loss": 0.211, "step": 1009900 }, { - "epoch": 10.29, - "learning_rate": 1.4377094335096797e-05, - "loss": 0.3088, + "epoch": 13.915295803367226, + "grad_norm": 2.8583593368530273, + "learning_rate": 7.275918382355381e-07, + "loss": 0.1938, "step": 1010000 }, { - "epoch": 10.29, - "learning_rate": 1.4371431607559075e-05, - "loss": 0.2335, + "epoch": 13.916673555426966, + "grad_norm": 3.9037747383117676, + "learning_rate": 7.257613856486856e-07, + "loss": 0.2063, "step": 1010100 }, { - "epoch": 10.29, - "learning_rate": 1.4365769644165086e-05, - "loss": 0.3214, + "epoch": 13.918051307486705, + "grad_norm": 2.1687731742858887, + "learning_rate": 7.239332076703773e-07, + "loss": 0.2308, "step": 1010200 }, { - "epoch": 10.29, - "learning_rate": 1.436010844519167e-05, - "loss": 0.2468, + "epoch": 13.919429059546443, + "grad_norm": 3.1149818897247314, + "learning_rate": 7.221073044559281e-07, + "loss": 0.1956, "step": 1010300 }, { - "epoch": 10.29, - "learning_rate": 1.4354448010915608e-05, - "loss": 0.3226, + "epoch": 13.920806811606184, + "grad_norm": 1.0580086708068848, + "learning_rate": 7.202836761604639e-07, + "loss": 0.197, "step": 1010400 }, { - "epoch": 10.3, - "learning_rate": 1.4348788341613654e-05, - "loss": 0.2676, + "epoch": 13.922184563665922, + "grad_norm": 0.49563419818878174, + "learning_rate": 7.184623229388995e-07, + "loss": 0.1805, "step": 1010500 }, { - "epoch": 10.3, - "learning_rate": 1.4343129437562575e-05, - "loss": 0.3115, + "epoch": 13.923562315725661, + "grad_norm": 1.5151976346969604, + "learning_rate": 7.166432449459789e-07, + "loss": 0.1768, "step": 1010600 }, { - "epoch": 10.3, - "learning_rate": 1.4337471299039019e-05, - "loss": 0.2609, + "epoch": 13.924940067785402, + "grad_norm": 4.352602958679199, + "learning_rate": 7.148264423362325e-07, + "loss": 0.2123, "step": 1010700 }, { - "epoch": 10.3, - "learning_rate": 1.4331813926319647e-05, - "loss": 0.2781, + "epoch": 13.92631781984514, + "grad_norm": 1.7908936738967896, + "learning_rate": 7.130119152640164e-07, + "loss": 0.196, "step": 1010800 }, { - "epoch": 10.3, - "learning_rate": 1.4326157319681084e-05, - "loss": 0.2394, + "epoch": 13.92769557190488, + "grad_norm": 1.6136554479599, + "learning_rate": 7.111996638834761e-07, + "loss": 0.1815, "step": 1010900 }, { - "epoch": 10.3, - "learning_rate": 1.4320501479399878e-05, - "loss": 0.2736, + "epoch": 13.92907332396462, + "grad_norm": 0.3812917470932007, + "learning_rate": 7.093896883485774e-07, + "loss": 0.2002, "step": 1011000 }, { - "epoch": 10.3, - "learning_rate": 1.4314846405752581e-05, - "loss": 0.3235, + "epoch": 13.930451076024358, + "grad_norm": 1.072643518447876, + "learning_rate": 7.075819888130869e-07, + "loss": 0.199, "step": 1011100 }, { - "epoch": 10.3, - "learning_rate": 1.4309192099015702e-05, - "loss": 0.2278, + "epoch": 13.931828828084099, + "grad_norm": 1.3197944164276123, + "learning_rate": 7.057765654305703e-07, + "loss": 0.2064, "step": 1011200 }, { - "epoch": 10.3, - "learning_rate": 1.430353855946568e-05, - "loss": 0.2628, + "epoch": 13.933206580143837, + "grad_norm": 2.436164617538452, + "learning_rate": 7.039734183544131e-07, + "loss": 0.1635, "step": 1011300 }, { - "epoch": 10.3, - "learning_rate": 1.429788578737895e-05, - "loss": 0.3476, + "epoch": 13.934584332203576, + "grad_norm": 2.967456579208374, + "learning_rate": 7.021725477378049e-07, + "loss": 0.2232, "step": 1011400 }, { - "epoch": 10.31, - "learning_rate": 1.4292233783031902e-05, - "loss": 0.2903, + "epoch": 13.935962084263316, + "grad_norm": 3.6786534786224365, + "learning_rate": 7.003739537337314e-07, + "loss": 0.1966, "step": 1011500 }, { - "epoch": 10.31, - "learning_rate": 1.428658254670089e-05, - "loss": 0.2653, + "epoch": 13.937339836323055, + "grad_norm": 2.5281527042388916, + "learning_rate": 6.985776364949977e-07, + "loss": 0.2103, "step": 1011600 }, { - "epoch": 10.31, - "learning_rate": 1.4280932078662208e-05, - "loss": 0.3061, + "epoch": 13.938717588382795, + "grad_norm": 2.496382236480713, + "learning_rate": 6.967835961742108e-07, + "loss": 0.2035, "step": 1011700 }, { - "epoch": 10.31, - "learning_rate": 1.4275282379192133e-05, - "loss": 0.2899, + "epoch": 13.940095340442534, + "grad_norm": 21.985082626342773, + "learning_rate": 6.94991832923782e-07, + "loss": 0.2613, "step": 1011800 }, { - "epoch": 10.31, - "learning_rate": 1.4269633448566931e-05, - "loss": 0.3433, + "epoch": 13.941473092502273, + "grad_norm": 3.655707359313965, + "learning_rate": 6.932023468959303e-07, + "loss": 0.2185, "step": 1011900 }, { - "epoch": 10.31, - "learning_rate": 1.4263985287062766e-05, - "loss": 0.2535, + "epoch": 13.942850844562013, + "grad_norm": 3.2711689472198486, + "learning_rate": 6.914151382426826e-07, + "loss": 0.2301, "step": 1012000 }, { - "epoch": 10.31, - "learning_rate": 1.4258337894955818e-05, - "loss": 0.2514, + "epoch": 13.944228596621752, + "grad_norm": 1.7792415618896484, + "learning_rate": 6.896302071158669e-07, + "loss": 0.195, "step": 1012100 }, { - "epoch": 10.31, - "learning_rate": 1.425269127252221e-05, - "loss": 0.298, + "epoch": 13.94560634868149, + "grad_norm": 1.6625088453292847, + "learning_rate": 6.878475536671255e-07, + "loss": 0.2309, "step": 1012200 }, { - "epoch": 10.31, - "learning_rate": 1.4247045420038017e-05, - "loss": 0.2794, + "epoch": 13.946984100741231, + "grad_norm": 3.4740829467773438, + "learning_rate": 6.860671780479111e-07, + "loss": 0.1751, "step": 1012300 }, { - "epoch": 10.31, - "learning_rate": 1.4241400337779294e-05, - "loss": 0.274, + "epoch": 13.94836185280097, + "grad_norm": 1.5779759883880615, + "learning_rate": 6.843068501093472e-07, + "loss": 0.3096, "step": 1012400 }, { - "epoch": 10.32, - "learning_rate": 1.4235756026022061e-05, - "loss": 0.3269, + "epoch": 13.94973960486071, + "grad_norm": 0.8223806023597717, + "learning_rate": 6.825310078206668e-07, + "loss": 0.1736, "step": 1012500 }, { - "epoch": 10.32, - "learning_rate": 1.423016891663581e-05, - "loss": 0.3077, + "epoch": 13.951117356920449, + "grad_norm": 1.7619963884353638, + "learning_rate": 6.807574438131756e-07, + "loss": 0.1928, "step": 1012600 }, { - "epoch": 10.32, - "learning_rate": 1.4224526138997522e-05, - "loss": 0.2878, + "epoch": 13.952495108980187, + "grad_norm": 2.0087437629699707, + "learning_rate": 6.789861582375418e-07, + "loss": 0.2031, "step": 1012700 }, { - "epoch": 10.32, - "learning_rate": 1.4218884132685753e-05, - "loss": 0.3027, + "epoch": 13.953872861039928, + "grad_norm": 2.0862858295440674, + "learning_rate": 6.772171512442563e-07, + "loss": 0.1802, "step": 1012800 }, { - "epoch": 10.32, - "learning_rate": 1.4213242897976373e-05, - "loss": 0.2861, + "epoch": 13.955250613099667, + "grad_norm": 2.0899112224578857, + "learning_rate": 6.754504229835934e-07, + "loss": 0.2334, "step": 1012900 }, { - "epoch": 10.32, - "learning_rate": 1.4207602435145204e-05, - "loss": 0.2575, + "epoch": 13.956628365159405, + "grad_norm": 2.539127826690674, + "learning_rate": 6.736859736056503e-07, + "loss": 0.1871, "step": 1013000 }, { - "epoch": 10.32, - "learning_rate": 1.4201962744468032e-05, - "loss": 0.2585, + "epoch": 13.958006117219146, + "grad_norm": 2.8806445598602295, + "learning_rate": 6.719238032603286e-07, + "loss": 0.2357, "step": 1013100 }, { - "epoch": 10.32, - "learning_rate": 1.4196323826220604e-05, - "loss": 0.2759, + "epoch": 13.959383869278884, + "grad_norm": 2.864682197570801, + "learning_rate": 6.701639120973288e-07, + "loss": 0.21, "step": 1013200 }, { - "epoch": 10.32, - "learning_rate": 1.419068568067864e-05, - "loss": 0.285, + "epoch": 13.960761621338623, + "grad_norm": 2.4140005111694336, + "learning_rate": 6.684063002661678e-07, + "loss": 0.1821, "step": 1013300 }, { - "epoch": 10.32, - "learning_rate": 1.418504830811779e-05, - "loss": 0.3083, + "epoch": 13.962139373398363, + "grad_norm": 5.064892768859863, + "learning_rate": 6.666509679161581e-07, + "loss": 0.2281, "step": 1013400 }, { - "epoch": 10.33, - "learning_rate": 1.4179411708813703e-05, - "loss": 0.2934, + "epoch": 13.963517125458102, + "grad_norm": 2.8160386085510254, + "learning_rate": 6.648979151964291e-07, + "loss": 0.1904, "step": 1013500 }, { - "epoch": 10.33, - "learning_rate": 1.4173775883041976e-05, - "loss": 0.3112, + "epoch": 13.964894877517843, + "grad_norm": 1.712924838066101, + "learning_rate": 6.631471422559085e-07, + "loss": 0.2553, "step": 1013600 }, { - "epoch": 10.33, - "learning_rate": 1.4168140831078155e-05, - "loss": 0.3182, + "epoch": 13.966272629577581, + "grad_norm": 5.080384254455566, + "learning_rate": 6.613986492433349e-07, + "loss": 0.1726, "step": 1013700 }, { - "epoch": 10.33, - "learning_rate": 1.4162506553197767e-05, - "loss": 0.2306, + "epoch": 13.96765038163732, + "grad_norm": 2.5360894203186035, + "learning_rate": 6.596524363072515e-07, + "loss": 0.1918, "step": 1013800 }, { - "epoch": 10.33, - "learning_rate": 1.4156873049676293e-05, - "loss": 0.2309, + "epoch": 13.96902813369706, + "grad_norm": 1.59514582157135, + "learning_rate": 6.57908503596012e-07, + "loss": 0.2174, "step": 1013900 }, { - "epoch": 10.33, - "learning_rate": 1.4151240320789196e-05, - "loss": 0.2198, + "epoch": 13.970405885756799, + "grad_norm": 1.158673882484436, + "learning_rate": 6.561668512577626e-07, + "loss": 0.226, "step": 1014000 }, { - "epoch": 10.33, - "learning_rate": 1.4145608366811852e-05, - "loss": 0.302, + "epoch": 13.97178363781654, + "grad_norm": 3.0452091693878174, + "learning_rate": 6.544274794404726e-07, + "loss": 0.1834, "step": 1014100 }, { - "epoch": 10.33, - "learning_rate": 1.4139977188019648e-05, - "loss": 0.3323, + "epoch": 13.973161389876278, + "grad_norm": 3.675121784210205, + "learning_rate": 6.526903882919095e-07, + "loss": 0.2099, "step": 1014200 }, { - "epoch": 10.33, - "learning_rate": 1.4134346784687907e-05, - "loss": 0.2781, + "epoch": 13.974539141936017, + "grad_norm": 1.7934209108352661, + "learning_rate": 6.509555779596485e-07, + "loss": 0.2266, "step": 1014300 }, { - "epoch": 10.33, - "learning_rate": 1.4128717157091932e-05, - "loss": 0.3646, + "epoch": 13.975916893995757, + "grad_norm": 3.093475580215454, + "learning_rate": 6.492230485910697e-07, + "loss": 0.2081, "step": 1014400 }, { - "epoch": 10.34, - "learning_rate": 1.412308830550697e-05, - "loss": 0.2902, + "epoch": 13.977294646055496, + "grad_norm": 0.9700685143470764, + "learning_rate": 6.474928003333636e-07, + "loss": 0.2426, "step": 1014500 }, { - "epoch": 10.34, - "learning_rate": 1.4117460230208251e-05, - "loss": 0.2753, + "epoch": 13.978672398115235, + "grad_norm": 4.574120998382568, + "learning_rate": 6.457648333335162e-07, + "loss": 0.24, "step": 1014600 }, { - "epoch": 10.34, - "learning_rate": 1.4111832931470938e-05, - "loss": 0.2284, + "epoch": 13.980050150174975, + "grad_norm": 2.2313599586486816, + "learning_rate": 6.440391477383304e-07, + "loss": 0.2406, "step": 1014700 }, { - "epoch": 10.34, - "learning_rate": 1.4106206409570178e-05, - "loss": 0.2779, + "epoch": 13.981427902234714, + "grad_norm": 1.7652546167373657, + "learning_rate": 6.423157436944166e-07, + "loss": 0.2368, "step": 1014800 }, { - "epoch": 10.34, - "learning_rate": 1.4100580664781085e-05, - "loss": 0.2778, + "epoch": 13.982805654294452, + "grad_norm": 0.9916251301765442, + "learning_rate": 6.405946213481809e-07, + "loss": 0.1892, "step": 1014900 }, { - "epoch": 10.34, - "learning_rate": 1.4094955697378702e-05, - "loss": 0.2788, + "epoch": 13.984183406354193, + "grad_norm": 4.213309288024902, + "learning_rate": 6.38875780845849e-07, + "loss": 0.2334, "step": 1015000 }, { - "epoch": 10.34, - "learning_rate": 1.408933150763807e-05, - "loss": 0.3298, + "epoch": 13.985561158413931, + "grad_norm": 1.217786431312561, + "learning_rate": 6.37159222333436e-07, + "loss": 0.1665, "step": 1015100 }, { - "epoch": 10.34, - "learning_rate": 1.4083708095834193e-05, - "loss": 0.2597, + "epoch": 13.986938910473672, + "grad_norm": 0.12633667886257172, + "learning_rate": 6.354449459567772e-07, + "loss": 0.1934, "step": 1015200 }, { - "epoch": 10.34, - "learning_rate": 1.4078141684724859e-05, - "loss": 0.3054, + "epoch": 13.98831666253341, + "grad_norm": 5.602499961853027, + "learning_rate": 6.337329518615056e-07, + "loss": 0.2053, "step": 1015300 }, { - "epoch": 10.35, - "learning_rate": 1.4072519821833044e-05, - "loss": 0.2876, + "epoch": 13.98969441459315, + "grad_norm": 4.697601318359375, + "learning_rate": 6.32023240193072e-07, + "loss": 0.2348, "step": 1015400 }, { - "epoch": 10.35, - "learning_rate": 1.406689873769995e-05, - "loss": 0.2735, + "epoch": 13.99107216665289, + "grad_norm": 2.557914972305298, + "learning_rate": 6.303158110967158e-07, + "loss": 0.1661, "step": 1015500 }, { - "epoch": 10.35, - "learning_rate": 1.4061278432600426e-05, - "loss": 0.2667, + "epoch": 13.992449918712628, + "grad_norm": 0.2315978705883026, + "learning_rate": 6.286106647174967e-07, + "loss": 0.22, "step": 1015600 }, { - "epoch": 10.35, - "learning_rate": 1.4055715098208696e-05, - "loss": 0.2679, + "epoch": 13.993827670772367, + "grad_norm": 3.4364569187164307, + "learning_rate": 6.269078012002754e-07, + "loss": 0.2358, "step": 1015700 }, { - "epoch": 10.35, - "learning_rate": 1.4050096344203473e-05, - "loss": 0.2615, + "epoch": 13.995205422832107, + "grad_norm": 5.167016983032227, + "learning_rate": 6.252072206897181e-07, + "loss": 0.2474, "step": 1015800 }, { - "epoch": 10.35, - "learning_rate": 1.4044478370053372e-05, - "loss": 0.2905, + "epoch": 13.996583174891846, + "grad_norm": 4.080760955810547, + "learning_rate": 6.235089233302977e-07, + "loss": 0.3004, "step": 1015900 }, { - "epoch": 10.35, - "learning_rate": 1.4038861176033044e-05, - "loss": 0.3114, + "epoch": 13.997960926951587, + "grad_norm": 2.131598711013794, + "learning_rate": 6.218129092662922e-07, + "loss": 0.2469, "step": 1016000 }, { - "epoch": 10.35, - "learning_rate": 1.4033244762417158e-05, - "loss": 0.2336, + "epoch": 13.999338679011325, + "grad_norm": 3.778364419937134, + "learning_rate": 6.201191786417934e-07, + "loss": 0.1964, "step": 1016100 }, { - "epoch": 10.35, - "learning_rate": 1.4027629129480326e-05, - "loss": 0.2921, + "epoch": 14.000716431071064, + "grad_norm": 1.4112939834594727, + "learning_rate": 6.184277316006823e-07, + "loss": 0.2074, "step": 1016200 }, { - "epoch": 10.35, - "learning_rate": 1.4022014277497103e-05, - "loss": 0.309, + "epoch": 14.002094183130804, + "grad_norm": 1.1307002305984497, + "learning_rate": 6.167385682866597e-07, + "loss": 0.1817, "step": 1016300 }, { - "epoch": 10.36, - "learning_rate": 1.4016400206742028e-05, - "loss": 0.2655, + "epoch": 14.003471935190543, + "grad_norm": 2.6714022159576416, + "learning_rate": 6.150516888432312e-07, + "loss": 0.2157, "step": 1016400 }, { - "epoch": 10.36, - "learning_rate": 1.401078691748961e-05, - "loss": 0.2981, + "epoch": 14.004849687250282, + "grad_norm": 1.5242327451705933, + "learning_rate": 6.133670934137037e-07, + "loss": 0.2606, "step": 1016500 }, { - "epoch": 10.36, - "learning_rate": 1.400517441001428e-05, - "loss": 0.3011, + "epoch": 14.006227439310022, + "grad_norm": 3.3772196769714355, + "learning_rate": 6.117015939468687e-07, + "loss": 0.2208, "step": 1016600 }, { - "epoch": 10.36, - "learning_rate": 1.3999562684590473e-05, - "loss": 0.2545, + "epoch": 14.00760519136976, + "grad_norm": 12.760124206542969, + "learning_rate": 6.100215441305898e-07, + "loss": 0.244, "step": 1016700 }, { - "epoch": 10.36, - "learning_rate": 1.3993951741492562e-05, - "loss": 0.2682, + "epoch": 14.008982943429501, + "grad_norm": 0.41571125388145447, + "learning_rate": 6.083437787555502e-07, + "loss": 0.2474, "step": 1016800 }, { - "epoch": 10.36, - "learning_rate": 1.3988341580994891e-05, - "loss": 0.2627, + "epoch": 14.01036069548924, + "grad_norm": 1.3224139213562012, + "learning_rate": 6.066682979642813e-07, + "loss": 0.2156, "step": 1016900 }, { - "epoch": 10.36, - "learning_rate": 1.3982732203371767e-05, - "loss": 0.2615, + "epoch": 14.011738447548979, + "grad_norm": 1.5913996696472168, + "learning_rate": 6.049951018991254e-07, + "loss": 0.1787, "step": 1017000 }, { - "epoch": 10.36, - "learning_rate": 1.397712360889745e-05, - "loss": 0.2143, + "epoch": 14.013116199608719, + "grad_norm": 2.211167335510254, + "learning_rate": 6.033241907022294e-07, + "loss": 0.2308, "step": 1017100 }, { - "epoch": 10.36, - "learning_rate": 1.3971515797846182e-05, - "loss": 0.2843, + "epoch": 14.014493951668458, + "grad_norm": 0.6107139587402344, + "learning_rate": 6.016722394661467e-07, + "loss": 0.2251, "step": 1017200 }, { - "epoch": 10.36, - "learning_rate": 1.3965908770492126e-05, - "loss": 0.2962, + "epoch": 14.015871703728196, + "grad_norm": 0.005240975879132748, + "learning_rate": 6.000058755792151e-07, + "loss": 0.2176, "step": 1017300 }, { - "epoch": 10.37, - "learning_rate": 1.3960302527109445e-05, - "loss": 0.3491, + "epoch": 14.017249455787937, + "grad_norm": 3.663726806640625, + "learning_rate": 5.98341796984401e-07, + "loss": 0.2221, "step": 1017400 }, { - "epoch": 10.37, - "learning_rate": 1.3954697067972263e-05, - "loss": 0.2462, + "epoch": 14.018627207847675, + "grad_norm": 0.14198751747608185, + "learning_rate": 5.966800038230819e-07, + "loss": 0.2232, "step": 1017500 }, { - "epoch": 10.37, - "learning_rate": 1.3949148436216528e-05, - "loss": 0.2718, + "epoch": 14.020004959907416, + "grad_norm": 2.2149317264556885, + "learning_rate": 5.950204962364336e-07, + "loss": 0.1956, "step": 1017600 }, { - "epoch": 10.37, - "learning_rate": 1.3943544538543207e-05, - "loss": 0.2526, + "epoch": 14.021382711967155, + "grad_norm": 0.46175140142440796, + "learning_rate": 5.933632743654369e-07, + "loss": 0.1594, "step": 1017700 }, { - "epoch": 10.37, - "learning_rate": 1.3937941425934729e-05, - "loss": 0.2897, + "epoch": 14.022760464026893, + "grad_norm": 4.762292861938477, + "learning_rate": 5.917083383508828e-07, + "loss": 0.2216, "step": 1017800 }, { - "epoch": 10.37, - "learning_rate": 1.393233909866506e-05, - "loss": 0.2456, + "epoch": 14.024138216086634, + "grad_norm": 0.09483277052640915, + "learning_rate": 5.900556883333705e-07, + "loss": 0.2253, "step": 1017900 }, { - "epoch": 10.37, - "learning_rate": 1.3926737557008134e-05, - "loss": 0.2567, + "epoch": 14.025515968146372, + "grad_norm": 1.2487127780914307, + "learning_rate": 5.884053244532941e-07, + "loss": 0.215, "step": 1018000 }, { - "epoch": 10.37, - "learning_rate": 1.392113680123781e-05, - "loss": 0.2849, + "epoch": 14.026893720206111, + "grad_norm": 2.1889774799346924, + "learning_rate": 5.86757246850865e-07, + "loss": 0.2072, "step": 1018100 }, { - "epoch": 10.37, - "learning_rate": 1.3915536831627945e-05, - "loss": 0.3188, + "epoch": 14.028271472265851, + "grad_norm": 0.42818793654441833, + "learning_rate": 5.851114556660958e-07, + "loss": 0.1606, "step": 1018200 }, { - "epoch": 10.37, - "learning_rate": 1.3909937648452339e-05, - "loss": 0.3197, + "epoch": 14.02964922432559, + "grad_norm": 0.7637725472450256, + "learning_rate": 5.834679510388069e-07, + "loss": 0.24, "step": 1018300 }, { - "epoch": 10.38, - "learning_rate": 1.3904339251984762e-05, - "loss": 0.2359, + "epoch": 14.03102697638533, + "grad_norm": 2.0872621536254883, + "learning_rate": 5.818267331086142e-07, + "loss": 0.1759, "step": 1018400 }, { - "epoch": 10.38, - "learning_rate": 1.3898741642498943e-05, - "loss": 0.2664, + "epoch": 14.03240472844507, + "grad_norm": 1.8280794620513916, + "learning_rate": 5.801878020149562e-07, + "loss": 0.1676, "step": 1018500 }, { - "epoch": 10.38, - "learning_rate": 1.389314482026858e-05, - "loss": 0.2717, + "epoch": 14.033782480504808, + "grad_norm": 3.9696295261383057, + "learning_rate": 5.785511578970642e-07, + "loss": 0.1973, "step": 1018600 }, { - "epoch": 10.38, - "learning_rate": 1.3887548785567302e-05, - "loss": 0.3299, + "epoch": 14.035160232564548, + "grad_norm": 1.1307194232940674, + "learning_rate": 5.769168008939802e-07, + "loss": 0.2402, "step": 1018700 }, { - "epoch": 10.38, - "learning_rate": 1.3881953538668738e-05, - "loss": 0.3645, + "epoch": 14.036537984624287, + "grad_norm": 2.9659740924835205, + "learning_rate": 5.752847311445533e-07, + "loss": 0.1905, "step": 1018800 }, { - "epoch": 10.38, - "learning_rate": 1.3876359079846468e-05, - "loss": 0.2912, + "epoch": 14.037915736684026, + "grad_norm": 0.9227334260940552, + "learning_rate": 5.736549487874348e-07, + "loss": 0.1929, "step": 1018900 }, { - "epoch": 10.38, - "learning_rate": 1.3870765409374007e-05, - "loss": 0.3089, + "epoch": 14.039293488743766, + "grad_norm": 3.533618450164795, + "learning_rate": 5.720274539610804e-07, + "loss": 0.2527, "step": 1019000 }, { - "epoch": 10.38, - "learning_rate": 1.3865172527524861e-05, - "loss": 0.2536, + "epoch": 14.040671240803505, + "grad_norm": 2.9905810356140137, + "learning_rate": 5.704022468037562e-07, + "loss": 0.2228, "step": 1019100 }, { - "epoch": 10.38, - "learning_rate": 1.3859580434572506e-05, - "loss": 0.312, + "epoch": 14.042048992863243, + "grad_norm": 3.513597249984741, + "learning_rate": 5.687793274535332e-07, + "loss": 0.2106, "step": 1019200 }, { - "epoch": 10.38, - "learning_rate": 1.3853989130790335e-05, - "loss": 0.2893, + "epoch": 14.043426744922984, + "grad_norm": 0.49585723876953125, + "learning_rate": 5.671586960482841e-07, + "loss": 0.2263, "step": 1019300 }, { - "epoch": 10.39, - "learning_rate": 1.3848398616451737e-05, - "loss": 0.2817, + "epoch": 14.044804496982723, + "grad_norm": 1.862160086631775, + "learning_rate": 5.65540352725695e-07, + "loss": 0.2132, "step": 1019400 }, { - "epoch": 10.39, - "learning_rate": 1.3842808891830063e-05, - "loss": 0.2938, + "epoch": 14.046182249042463, + "grad_norm": 4.592262268066406, + "learning_rate": 5.639404468471296e-07, + "loss": 0.2217, "step": 1019500 }, { - "epoch": 10.39, - "learning_rate": 1.3837219957198614e-05, - "loss": 0.2705, + "epoch": 14.047560001102202, + "grad_norm": 4.919544219970703, + "learning_rate": 5.623266572178626e-07, + "loss": 0.2057, "step": 1019600 }, { - "epoch": 10.39, - "learning_rate": 1.383163181283065e-05, - "loss": 0.2773, + "epoch": 14.04893775316194, + "grad_norm": 2.0715248584747314, + "learning_rate": 5.607151560817589e-07, + "loss": 0.1741, "step": 1019700 }, { - "epoch": 10.39, - "learning_rate": 1.3826044458999405e-05, - "loss": 0.2924, + "epoch": 14.05031550522168, + "grad_norm": 2.83178448677063, + "learning_rate": 5.591059435757201e-07, + "loss": 0.214, "step": 1019800 }, { - "epoch": 10.39, - "learning_rate": 1.3820457895978079e-05, - "loss": 0.3082, + "epoch": 14.05169325728142, + "grad_norm": 0.12600405514240265, + "learning_rate": 5.574990198364646e-07, + "loss": 0.1995, "step": 1019900 }, { - "epoch": 10.39, - "learning_rate": 1.3814872124039793e-05, - "loss": 0.2714, + "epoch": 14.053071009341158, + "grad_norm": 3.3065154552459717, + "learning_rate": 5.558943850005033e-07, + "loss": 0.2187, "step": 1020000 }, { - "epoch": 10.39, - "learning_rate": 1.3809287143457674e-05, - "loss": 0.2994, + "epoch": 14.054448761400899, + "grad_norm": 0.8885064721107483, + "learning_rate": 5.542920392041545e-07, + "loss": 0.1924, "step": 1020100 }, { - "epoch": 10.39, - "learning_rate": 1.3803702954504801e-05, - "loss": 0.2513, + "epoch": 14.055826513460637, + "grad_norm": 2.5127980709075928, + "learning_rate": 5.526919825835536e-07, + "loss": 0.1906, "step": 1020200 }, { - "epoch": 10.39, - "learning_rate": 1.379811955745419e-05, - "loss": 0.2874, + "epoch": 14.057204265520378, + "grad_norm": 0.5123410224914551, + "learning_rate": 5.510942152746313e-07, + "loss": 0.1973, "step": 1020300 }, { - "epoch": 10.4, - "learning_rate": 1.3792536952578844e-05, - "loss": 0.253, + "epoch": 14.058582017580116, + "grad_norm": 0.9705925583839417, + "learning_rate": 5.494987374131227e-07, + "loss": 0.2141, "step": 1020400 }, { - "epoch": 10.4, - "learning_rate": 1.378695514015173e-05, - "loss": 0.2595, + "epoch": 14.059959769639855, + "grad_norm": 2.7959299087524414, + "learning_rate": 5.479055491345772e-07, + "loss": 0.1816, "step": 1020500 }, { - "epoch": 10.4, - "learning_rate": 1.3781374120445744e-05, - "loss": 0.2544, + "epoch": 14.061337521699595, + "grad_norm": 1.8931347131729126, + "learning_rate": 5.463146505743391e-07, + "loss": 0.1953, "step": 1020600 }, { - "epoch": 10.4, - "learning_rate": 1.3775793893733779e-05, - "loss": 0.3081, + "epoch": 14.062715273759334, + "grad_norm": 1.8882026672363281, + "learning_rate": 5.447260418675668e-07, + "loss": 0.2321, "step": 1020700 }, { - "epoch": 10.4, - "learning_rate": 1.377021446028867e-05, - "loss": 0.258, + "epoch": 14.064093025819073, + "grad_norm": 5.866182327270508, + "learning_rate": 5.431397231492169e-07, + "loss": 0.2262, "step": 1020800 }, { - "epoch": 10.4, - "learning_rate": 1.376463582038322e-05, - "loss": 0.308, + "epoch": 14.065470777878813, + "grad_norm": 0.38003942370414734, + "learning_rate": 5.415556945540601e-07, + "loss": 0.2101, "step": 1020900 }, { - "epoch": 10.4, - "learning_rate": 1.3759057974290189e-05, - "loss": 0.2972, + "epoch": 14.066848529938552, + "grad_norm": 1.1763534545898438, + "learning_rate": 5.399739562166653e-07, + "loss": 0.2222, "step": 1021000 }, { - "epoch": 10.4, - "learning_rate": 1.3753480922282306e-05, - "loss": 0.3601, + "epoch": 14.068226281998292, + "grad_norm": 1.1725133657455444, + "learning_rate": 5.383945082714092e-07, + "loss": 0.215, "step": 1021100 }, { - "epoch": 10.4, - "learning_rate": 1.3747904664632258e-05, - "loss": 0.2819, + "epoch": 14.069604034058031, + "grad_norm": 1.883790373802185, + "learning_rate": 5.368173508524762e-07, + "loss": 0.2252, "step": 1021200 }, { - "epoch": 10.41, - "learning_rate": 1.3742329201612676e-05, - "loss": 0.2776, + "epoch": 14.07098178611777, + "grad_norm": 1.001843810081482, + "learning_rate": 5.352424840938492e-07, + "loss": 0.1866, "step": 1021300 }, { - "epoch": 10.41, - "learning_rate": 1.3736754533496176e-05, - "loss": 0.2636, + "epoch": 14.07235953817751, + "grad_norm": 4.79423189163208, + "learning_rate": 5.336699081293245e-07, + "loss": 0.2031, "step": 1021400 }, { - "epoch": 10.41, - "learning_rate": 1.3731180660555336e-05, - "loss": 0.3103, + "epoch": 14.073737290237249, + "grad_norm": 2.1388587951660156, + "learning_rate": 5.320996230925004e-07, + "loss": 0.2071, "step": 1021500 }, { - "epoch": 10.41, - "learning_rate": 1.3725607583062663e-05, - "loss": 0.3004, + "epoch": 14.075115042296988, + "grad_norm": 7.936727046966553, + "learning_rate": 5.305316291167828e-07, + "loss": 0.1927, "step": 1021600 }, { - "epoch": 10.41, - "learning_rate": 1.372003530129066e-05, - "loss": 0.2535, + "epoch": 14.076492794356728, + "grad_norm": 3.701247215270996, + "learning_rate": 5.289659263353758e-07, + "loss": 0.2305, "step": 1021700 }, { - "epoch": 10.41, - "learning_rate": 1.3714463815511783e-05, - "loss": 0.2895, + "epoch": 14.077870546416467, + "grad_norm": 6.462217807769775, + "learning_rate": 5.274025148812944e-07, + "loss": 0.2144, "step": 1021800 }, { - "epoch": 10.41, - "learning_rate": 1.3708893125998434e-05, - "loss": 0.2502, + "epoch": 14.079248298476207, + "grad_norm": 4.1912007331848145, + "learning_rate": 5.258413948873614e-07, + "loss": 0.2159, "step": 1021900 }, { - "epoch": 10.41, - "learning_rate": 1.3703323233022988e-05, - "loss": 0.27, + "epoch": 14.080626050535946, + "grad_norm": 1.2235301733016968, + "learning_rate": 5.242825664862008e-07, + "loss": 0.2348, "step": 1022000 }, { - "epoch": 10.41, - "learning_rate": 1.3697754136857782e-05, - "loss": 0.3237, + "epoch": 14.082003802595684, + "grad_norm": 1.829254388809204, + "learning_rate": 5.227260298102415e-07, + "loss": 0.1844, "step": 1022100 }, { - "epoch": 10.41, - "learning_rate": 1.3692185837775109e-05, - "loss": 0.2404, + "epoch": 14.083381554655425, + "grad_norm": 2.716498374938965, + "learning_rate": 5.211717849917261e-07, + "loss": 0.2225, "step": 1022200 }, { - "epoch": 10.42, - "learning_rate": 1.3686618336047233e-05, - "loss": 0.2656, + "epoch": 14.084759306715164, + "grad_norm": 2.048232078552246, + "learning_rate": 5.196198321626864e-07, + "loss": 0.2164, "step": 1022300 }, { - "epoch": 10.42, - "learning_rate": 1.3681051631946377e-05, - "loss": 0.3085, + "epoch": 14.086137058774902, + "grad_norm": 1.251952052116394, + "learning_rate": 5.18070171454971e-07, + "loss": 0.2067, "step": 1022400 }, { - "epoch": 10.42, - "learning_rate": 1.3675485725744699e-05, - "loss": 0.3039, + "epoch": 14.087514810834643, + "grad_norm": 1.424296259880066, + "learning_rate": 5.165228030002364e-07, + "loss": 0.2389, "step": 1022500 }, { - "epoch": 10.42, - "learning_rate": 1.366992061771435e-05, - "loss": 0.2641, + "epoch": 14.088892562894381, + "grad_norm": 3.7526659965515137, + "learning_rate": 5.149777269299344e-07, + "loss": 0.2216, "step": 1022600 }, { - "epoch": 10.42, - "learning_rate": 1.366435630812743e-05, - "loss": 0.2702, + "epoch": 14.090270314954122, + "grad_norm": 0.20451049506664276, + "learning_rate": 5.134349433753305e-07, + "loss": 0.2002, "step": 1022700 }, { - "epoch": 10.42, - "learning_rate": 1.365879279725601e-05, - "loss": 0.3275, + "epoch": 14.09164806701386, + "grad_norm": 3.32766056060791, + "learning_rate": 5.118944524674887e-07, + "loss": 0.1986, "step": 1022800 }, { - "epoch": 10.42, - "learning_rate": 1.3653230085372094e-05, - "loss": 0.2841, + "epoch": 14.093025819073599, + "grad_norm": 2.618166446685791, + "learning_rate": 5.10356254337287e-07, + "loss": 0.223, "step": 1022900 }, { - "epoch": 10.42, - "learning_rate": 1.3647668172747674e-05, - "loss": 0.2551, + "epoch": 14.09440357113334, + "grad_norm": 0.6933869123458862, + "learning_rate": 5.088203491153953e-07, + "loss": 0.1785, "step": 1023000 }, { - "epoch": 10.42, - "learning_rate": 1.3642107059654704e-05, - "loss": 0.3274, + "epoch": 14.095781323193078, + "grad_norm": 2.1524736881256104, + "learning_rate": 5.072867369323008e-07, + "loss": 0.2094, "step": 1023100 }, { - "epoch": 10.42, - "learning_rate": 1.363654674636507e-05, - "loss": 0.3192, + "epoch": 14.097159075252817, + "grad_norm": 1.6353240013122559, + "learning_rate": 5.057554179182953e-07, + "loss": 0.226, "step": 1023200 }, { - "epoch": 10.43, - "learning_rate": 1.3630987233150647e-05, - "loss": 0.2678, + "epoch": 14.098536827312557, + "grad_norm": 1.0612972974777222, + "learning_rate": 5.042263922034684e-07, + "loss": 0.203, "step": 1023300 }, { - "epoch": 10.43, - "learning_rate": 1.3625428520283262e-05, - "loss": 0.2892, + "epoch": 14.099914579372296, + "grad_norm": 3.8534348011016846, + "learning_rate": 5.026996599177153e-07, + "loss": 0.1985, "step": 1023400 }, { - "epoch": 10.43, - "learning_rate": 1.361987060803471e-05, - "loss": 0.2651, + "epoch": 14.101292331432035, + "grad_norm": 3.1799464225769043, + "learning_rate": 5.011752211907442e-07, + "loss": 0.2081, "step": 1023500 }, { - "epoch": 10.43, - "learning_rate": 1.3614313496676729e-05, - "loss": 0.272, + "epoch": 14.102670083491775, + "grad_norm": 5.872501373291016, + "learning_rate": 4.996530761520652e-07, + "loss": 0.1871, "step": 1023600 }, { - "epoch": 10.43, - "learning_rate": 1.3608757186481046e-05, - "loss": 0.2925, + "epoch": 14.104047835551514, + "grad_norm": 4.943983554840088, + "learning_rate": 4.981332249309898e-07, + "loss": 0.2084, "step": 1023700 }, { - "epoch": 10.43, - "learning_rate": 1.3603201677719306e-05, - "loss": 0.2509, + "epoch": 14.105425587611254, + "grad_norm": 0.810875415802002, + "learning_rate": 4.966156676566372e-07, + "loss": 0.1928, "step": 1023800 }, { - "epoch": 10.43, - "learning_rate": 1.3597646970663155e-05, - "loss": 0.2421, + "epoch": 14.106803339670993, + "grad_norm": 3.209590196609497, + "learning_rate": 4.951155457338216e-07, + "loss": 0.2027, "step": 1023900 }, { - "epoch": 10.43, - "learning_rate": 1.3592093065584191e-05, - "loss": 0.2687, + "epoch": 14.108181091730732, + "grad_norm": 3.388915538787842, + "learning_rate": 4.936025537968158e-07, + "loss": 0.1498, "step": 1024000 }, { - "epoch": 10.43, - "learning_rate": 1.3586539962753953e-05, - "loss": 0.2712, + "epoch": 14.109558843790472, + "grad_norm": 2.952042579650879, + "learning_rate": 4.920918561914329e-07, + "loss": 0.2354, "step": 1024100 }, { - "epoch": 10.43, - "learning_rate": 1.3580987662443957e-05, - "loss": 0.2586, + "epoch": 14.11093659585021, + "grad_norm": 0.910190224647522, + "learning_rate": 4.905834530460163e-07, + "loss": 0.1662, "step": 1024200 }, { - "epoch": 10.44, - "learning_rate": 1.3575436164925691e-05, - "loss": 0.2506, + "epoch": 14.11231434790995, + "grad_norm": 1.0930262804031372, + "learning_rate": 4.890773444887206e-07, + "loss": 0.17, "step": 1024300 }, { - "epoch": 10.44, - "learning_rate": 1.356988547047057e-05, - "loss": 0.2864, + "epoch": 14.11369209996969, + "grad_norm": 1.4029775857925415, + "learning_rate": 4.875735306474896e-07, + "loss": 0.2043, "step": 1024400 }, { - "epoch": 10.44, - "learning_rate": 1.3564391074283802e-05, - "loss": 0.2512, + "epoch": 14.115069852029428, + "grad_norm": 4.97995662689209, + "learning_rate": 4.860720116500778e-07, + "loss": 0.232, "step": 1024500 }, { - "epoch": 10.44, - "learning_rate": 1.3558841978731736e-05, - "loss": 0.2833, + "epoch": 14.116447604089169, + "grad_norm": 2.449876546859741, + "learning_rate": 4.845727876240563e-07, + "loss": 0.2092, "step": 1024600 }, { - "epoch": 10.44, - "learning_rate": 1.3553293687054182e-05, - "loss": 0.2969, + "epoch": 14.117825356148908, + "grad_norm": 0.9248343110084534, + "learning_rate": 4.830758586967829e-07, + "loss": 0.2432, "step": 1024700 }, { - "epoch": 10.44, - "learning_rate": 1.3547746199522429e-05, - "loss": 0.2711, + "epoch": 14.119203108208646, + "grad_norm": 1.7604528665542603, + "learning_rate": 4.815812249954318e-07, + "loss": 0.18, "step": 1024800 }, { - "epoch": 10.44, - "learning_rate": 1.3542199516407697e-05, - "loss": 0.2126, + "epoch": 14.120580860268387, + "grad_norm": 1.6153812408447266, + "learning_rate": 4.800888866469852e-07, + "loss": 0.2024, "step": 1024900 }, { - "epoch": 10.44, - "learning_rate": 1.353665363798119e-05, - "loss": 0.24, + "epoch": 14.121958612328125, + "grad_norm": 1.0927902460098267, + "learning_rate": 4.785988437782204e-07, + "loss": 0.171, "step": 1025000 }, { - "epoch": 10.44, - "learning_rate": 1.3531108564514071e-05, - "loss": 0.2585, + "epoch": 14.123336364387864, + "grad_norm": 0.6424771547317505, + "learning_rate": 4.771110965157197e-07, + "loss": 0.1928, "step": 1025100 }, { - "epoch": 10.44, - "learning_rate": 1.3525564296277474e-05, - "loss": 0.3302, + "epoch": 14.124714116447604, + "grad_norm": 3.5146942138671875, + "learning_rate": 4.756256449858817e-07, + "loss": 0.2516, "step": 1025200 }, { - "epoch": 10.45, - "learning_rate": 1.3520020833542461e-05, - "loss": 0.2986, + "epoch": 14.126091868507343, + "grad_norm": 2.9304721355438232, + "learning_rate": 4.7414248931490404e-07, + "loss": 0.2025, "step": 1025300 }, { - "epoch": 10.45, - "learning_rate": 1.351447817658008e-05, - "loss": 0.257, + "epoch": 14.127469620567084, + "grad_norm": 3.652332067489624, + "learning_rate": 4.726616296287795e-07, + "loss": 0.2029, "step": 1025400 }, { - "epoch": 10.45, - "learning_rate": 1.3508936325661343e-05, - "loss": 0.2964, + "epoch": 14.128847372626822, + "grad_norm": 0.7800684571266174, + "learning_rate": 4.711830660533267e-07, + "loss": 0.2122, "step": 1025500 }, { - "epoch": 10.45, - "learning_rate": 1.3503395281057199e-05, - "loss": 0.2673, + "epoch": 14.130225124686561, + "grad_norm": 2.5150647163391113, + "learning_rate": 4.697067987141479e-07, + "loss": 0.2148, "step": 1025600 }, { - "epoch": 10.45, - "learning_rate": 1.3497855043038578e-05, - "loss": 0.2207, + "epoch": 14.131602876746301, + "grad_norm": 0.49390849471092224, + "learning_rate": 4.6823282773666184e-07, + "loss": 0.1895, "step": 1025700 }, { - "epoch": 10.45, - "learning_rate": 1.3492315611876366e-05, - "loss": 0.3436, + "epoch": 14.13298062880604, + "grad_norm": 0.7920058369636536, + "learning_rate": 4.667611532460919e-07, + "loss": 0.179, "step": 1025800 }, { - "epoch": 10.45, - "learning_rate": 1.3486776987841408e-05, - "loss": 0.2608, + "epoch": 14.134358380865779, + "grad_norm": 0.6743540167808533, + "learning_rate": 4.652917753674632e-07, + "loss": 0.2115, "step": 1025900 }, { - "epoch": 10.45, - "learning_rate": 1.3481239171204509e-05, - "loss": 0.2775, + "epoch": 14.13573613292552, + "grad_norm": 1.0219744443893433, + "learning_rate": 4.638246942256083e-07, + "loss": 0.2658, "step": 1026000 }, { - "epoch": 10.45, - "learning_rate": 1.3475702162236446e-05, - "loss": 0.2963, + "epoch": 14.137113884985258, + "grad_norm": 2.968315601348877, + "learning_rate": 4.623599099451584e-07, + "loss": 0.1751, "step": 1026100 }, { - "epoch": 10.46, - "learning_rate": 1.3470165961207926e-05, - "loss": 0.3303, + "epoch": 14.138491637044998, + "grad_norm": 2.6218507289886475, + "learning_rate": 4.608974226505583e-07, + "loss": 0.2254, "step": 1026200 }, { - "epoch": 10.46, - "learning_rate": 1.3464630568389647e-05, - "loss": 0.2665, + "epoch": 14.139869389104737, + "grad_norm": 1.8848381042480469, + "learning_rate": 4.5943723246605453e-07, + "loss": 0.1708, "step": 1026300 }, { - "epoch": 10.46, - "learning_rate": 1.3459095984052264e-05, - "loss": 0.2331, + "epoch": 14.141247141164476, + "grad_norm": 1.5641224384307861, + "learning_rate": 4.5797933951569506e-07, + "loss": 0.2044, "step": 1026400 }, { - "epoch": 10.46, - "learning_rate": 1.3453562208466366e-05, - "loss": 0.2721, + "epoch": 14.142624893224216, + "grad_norm": 1.3623244762420654, + "learning_rate": 4.565237439233355e-07, + "loss": 0.2322, "step": 1026500 }, { - "epoch": 10.46, - "learning_rate": 1.344802924190253e-05, - "loss": 0.3123, + "epoch": 14.144002645283955, + "grad_norm": 1.5846881866455078, + "learning_rate": 4.5507044581264535e-07, + "loss": 0.1639, "step": 1026600 }, { - "epoch": 10.46, - "learning_rate": 1.3442497084631303e-05, - "loss": 0.2714, + "epoch": 14.145380397343693, + "grad_norm": 2.354372024536133, + "learning_rate": 4.536194453070741e-07, + "loss": 0.2222, "step": 1026700 }, { - "epoch": 10.46, - "learning_rate": 1.3436965736923143e-05, - "loss": 0.3093, + "epoch": 14.146758149403434, + "grad_norm": 1.9994189739227295, + "learning_rate": 4.521707425299004e-07, + "loss": 0.2047, "step": 1026800 }, { - "epoch": 10.46, - "learning_rate": 1.3431435199048515e-05, - "loss": 0.2504, + "epoch": 14.148135901463172, + "grad_norm": 0.7828368544578552, + "learning_rate": 4.5072433760419835e-07, + "loss": 0.2253, "step": 1026900 }, { - "epoch": 10.46, - "learning_rate": 1.3425905471277834e-05, - "loss": 0.2519, + "epoch": 14.149513653522913, + "grad_norm": 1.2019603252410889, + "learning_rate": 4.492802306528465e-07, + "loss": 0.178, "step": 1027000 }, { - "epoch": 10.46, - "learning_rate": 1.3420376553881463e-05, - "loss": 0.2558, + "epoch": 14.150891405582652, + "grad_norm": 1.1777639389038086, + "learning_rate": 4.478384217985343e-07, + "loss": 0.198, "step": 1027100 }, { - "epoch": 10.47, - "learning_rate": 1.3414848447129738e-05, - "loss": 0.277, + "epoch": 14.15226915764239, + "grad_norm": 0.26375922560691833, + "learning_rate": 4.4639891116374036e-07, + "loss": 0.2648, "step": 1027200 }, { - "epoch": 10.47, - "learning_rate": 1.3409321151292949e-05, - "loss": 0.2347, + "epoch": 14.15364690970213, + "grad_norm": 2.4622080326080322, + "learning_rate": 4.449616988707664e-07, + "loss": 0.1996, "step": 1027300 }, { - "epoch": 10.47, - "learning_rate": 1.3403794666641357e-05, - "loss": 0.2802, + "epoch": 14.15502466176187, + "grad_norm": 3.7776124477386475, + "learning_rate": 4.4352678504170937e-07, + "loss": 0.2053, "step": 1027400 }, { - "epoch": 10.47, - "learning_rate": 1.3398324246159519e-05, - "loss": 0.2539, + "epoch": 14.156402413821608, + "grad_norm": 1.59080970287323, + "learning_rate": 4.420941697984679e-07, + "loss": 0.2273, "step": 1027500 }, { - "epoch": 10.47, - "learning_rate": 1.3392799376570308e-05, - "loss": 0.3249, + "epoch": 14.157780165881348, + "grad_norm": 3.6957881450653076, + "learning_rate": 4.406638532627605e-07, + "loss": 0.2413, "step": 1027600 }, { - "epoch": 10.47, - "learning_rate": 1.338727531897409e-05, - "loss": 0.2917, + "epoch": 14.159157917941087, + "grad_norm": 0.9523907899856567, + "learning_rate": 4.392358355560888e-07, + "loss": 0.225, "step": 1027700 }, { - "epoch": 10.47, - "learning_rate": 1.338175207364097e-05, - "loss": 0.3196, + "epoch": 14.160535670000826, + "grad_norm": 2.3711841106414795, + "learning_rate": 4.3781011679977753e-07, + "loss": 0.209, "step": 1027800 }, { - "epoch": 10.47, - "learning_rate": 1.337622964084101e-05, - "loss": 0.2767, + "epoch": 14.161913422060566, + "grad_norm": 2.091735363006592, + "learning_rate": 4.363866971149436e-07, + "loss": 0.2291, "step": 1027900 }, { - "epoch": 10.47, - "learning_rate": 1.3370708020844203e-05, - "loss": 0.2957, + "epoch": 14.163291174120305, + "grad_norm": 7.050431251525879, + "learning_rate": 4.3496557662251773e-07, + "loss": 0.2145, "step": 1028000 }, { - "epoch": 10.47, - "learning_rate": 1.3365187213920537e-05, - "loss": 0.2711, + "epoch": 14.164668926180045, + "grad_norm": 0.635335385799408, + "learning_rate": 4.3356093227302526e-07, + "loss": 0.2004, "step": 1028100 }, { - "epoch": 10.48, - "learning_rate": 1.3359667220339956e-05, - "loss": 0.3122, + "epoch": 14.166046678239784, + "grad_norm": 2.8612728118896484, + "learning_rate": 4.3214438753247904e-07, + "loss": 0.1923, "step": 1028200 }, { - "epoch": 10.48, - "learning_rate": 1.3354148040372332e-05, - "loss": 0.3273, + "epoch": 14.167424430299523, + "grad_norm": 1.9540824890136719, + "learning_rate": 4.307442734134412e-07, + "loss": 0.1946, "step": 1028300 }, { - "epoch": 10.48, - "learning_rate": 1.3348629674287534e-05, - "loss": 0.271, + "epoch": 14.168802182359263, + "grad_norm": 1.2414122819900513, + "learning_rate": 4.293464132026928e-07, + "loss": 0.2225, "step": 1028400 }, { - "epoch": 10.48, - "learning_rate": 1.3343112122355373e-05, - "loss": 0.3193, + "epoch": 14.170179934419002, + "grad_norm": 52.50362777709961, + "learning_rate": 4.2793672148380147e-07, + "loss": 0.2644, "step": 1028500 }, { - "epoch": 10.48, - "learning_rate": 1.3337595384845625e-05, - "loss": 0.2864, + "epoch": 14.17155768647874, + "grad_norm": 1.7245001792907715, + "learning_rate": 4.265293296751785e-07, + "loss": 0.2321, "step": 1028600 }, { - "epoch": 10.48, - "learning_rate": 1.333207946202803e-05, - "loss": 0.2699, + "epoch": 14.172935438538481, + "grad_norm": 2.8156352043151855, + "learning_rate": 4.2512423789639903e-07, + "loss": 0.1948, "step": 1028700 }, { - "epoch": 10.48, - "learning_rate": 1.3326564354172285e-05, - "loss": 0.2543, + "epoch": 14.17431319059822, + "grad_norm": 4.342079162597656, + "learning_rate": 4.2372144626683067e-07, + "loss": 0.2536, "step": 1028800 }, { - "epoch": 10.48, - "learning_rate": 1.3321050061548036e-05, - "loss": 0.2413, + "epoch": 14.17569094265796, + "grad_norm": 2.5000643730163574, + "learning_rate": 4.223209549056456e-07, + "loss": 0.2202, "step": 1028900 }, { - "epoch": 10.48, - "learning_rate": 1.3315536584424902e-05, - "loss": 0.3171, + "epoch": 14.177068694717699, + "grad_norm": 0.9814404249191284, + "learning_rate": 4.209227639318236e-07, + "loss": 0.2067, "step": 1029000 }, { - "epoch": 10.48, - "learning_rate": 1.3310023923072473e-05, - "loss": 0.2681, + "epoch": 14.178446446777437, + "grad_norm": 0.12182557582855225, + "learning_rate": 4.1952687346414916e-07, + "loss": 0.1907, "step": 1029100 }, { - "epoch": 10.49, - "learning_rate": 1.3304512077760263e-05, - "loss": 0.2935, + "epoch": 14.179824198837178, + "grad_norm": 3.392672538757324, + "learning_rate": 4.181332836212082e-07, + "loss": 0.1708, "step": 1029200 }, { - "epoch": 10.49, - "learning_rate": 1.329905615500619e-05, - "loss": 0.2852, + "epoch": 14.181201950896916, + "grad_norm": 0.6692277789115906, + "learning_rate": 4.1674199452139737e-07, + "loss": 0.2452, "step": 1029300 }, { - "epoch": 10.49, - "learning_rate": 1.3293545934415763e-05, - "loss": 0.2537, + "epoch": 14.182579702956655, + "grad_norm": 2.9966447353363037, + "learning_rate": 4.153530062829089e-07, + "loss": 0.205, "step": 1029400 }, { - "epoch": 10.49, - "learning_rate": 1.328803653067124e-05, - "loss": 0.2641, + "epoch": 14.183957455016396, + "grad_norm": 1.4956156015396118, + "learning_rate": 4.139663190237486e-07, + "loss": 0.199, "step": 1029500 }, { - "epoch": 10.49, - "learning_rate": 1.328252794404201e-05, - "loss": 0.2946, + "epoch": 14.185335207076134, + "grad_norm": 1.746561884880066, + "learning_rate": 4.125819328617178e-07, + "loss": 0.2582, "step": 1029600 }, { - "epoch": 10.49, - "learning_rate": 1.327702017479739e-05, - "loss": 0.2822, + "epoch": 14.186712959135875, + "grad_norm": 1.776915192604065, + "learning_rate": 4.1119984791443163e-07, + "loss": 0.1487, "step": 1029700 }, { - "epoch": 10.49, - "learning_rate": 1.327151322320667e-05, - "loss": 0.282, + "epoch": 14.188090711195613, + "grad_norm": 2.4574389457702637, + "learning_rate": 4.0982006429930356e-07, + "loss": 0.2156, "step": 1029800 }, { - "epoch": 10.49, - "learning_rate": 1.3266007089539137e-05, - "loss": 0.335, + "epoch": 14.189468463255352, + "grad_norm": 0.12091220915317535, + "learning_rate": 4.0844258213355493e-07, + "loss": 0.1912, "step": 1029900 }, { - "epoch": 10.49, - "learning_rate": 1.3260501774064004e-05, - "loss": 0.2721, + "epoch": 14.190846215315092, + "grad_norm": 2.144056797027588, + "learning_rate": 4.0706740153420854e-07, + "loss": 0.2134, "step": 1030000 }, { - "epoch": 10.49, - "learning_rate": 1.3254997277050431e-05, - "loss": 0.2471, + "epoch": 14.192223967374831, + "grad_norm": 1.8108185529708862, + "learning_rate": 4.056945226180886e-07, + "loss": 0.1911, "step": 1030100 }, { - "epoch": 10.5, - "learning_rate": 1.324949359876756e-05, - "loss": 0.2721, + "epoch": 14.19360171943457, + "grad_norm": 1.456079363822937, + "learning_rate": 4.0432394550183634e-07, + "loss": 0.2564, "step": 1030200 }, { - "epoch": 10.5, - "learning_rate": 1.32439907394845e-05, - "loss": 0.3018, + "epoch": 14.19497947149431, + "grad_norm": 0.5807664394378662, + "learning_rate": 4.0295567030188526e-07, + "loss": 0.2304, "step": 1030300 }, { - "epoch": 10.5, - "learning_rate": 1.3238488699470286e-05, - "loss": 0.2618, + "epoch": 14.196357223554049, + "grad_norm": 1.4189187288284302, + "learning_rate": 4.0158969713447654e-07, + "loss": 0.2176, "step": 1030400 }, { - "epoch": 10.5, - "learning_rate": 1.323298747899395e-05, - "loss": 0.2444, + "epoch": 14.19773497561379, + "grad_norm": 0.04461994394659996, + "learning_rate": 4.0022602611565903e-07, + "loss": 0.2117, "step": 1030500 }, { - "epoch": 10.5, - "learning_rate": 1.3227487078324472e-05, - "loss": 0.4154, + "epoch": 14.199112727673528, + "grad_norm": 2.271803855895996, + "learning_rate": 3.9886465736128006e-07, + "loss": 0.2387, "step": 1030600 }, { - "epoch": 10.5, - "learning_rate": 1.3221987497730776e-05, - "loss": 0.2795, + "epoch": 14.200490479733267, + "grad_norm": 0.8087264895439148, + "learning_rate": 3.9750559098699763e-07, + "loss": 0.1875, "step": 1030700 }, { - "epoch": 10.5, - "learning_rate": 1.3216488737481765e-05, - "loss": 0.2556, + "epoch": 14.201868231793007, + "grad_norm": 2.196885347366333, + "learning_rate": 3.961488271082714e-07, + "loss": 0.212, "step": 1030800 }, { - "epoch": 10.5, - "learning_rate": 1.3210990797846309e-05, - "loss": 0.2928, + "epoch": 14.203245983852746, + "grad_norm": 4.261157035827637, + "learning_rate": 3.9479436584036247e-07, + "loss": 0.2407, "step": 1030900 }, { - "epoch": 10.5, - "learning_rate": 1.3205493679093197e-05, - "loss": 0.3417, + "epoch": 14.204623735912485, + "grad_norm": 0.02784602902829647, + "learning_rate": 3.9344220729834567e-07, + "loss": 0.1892, "step": 1031000 }, { - "epoch": 10.51, - "learning_rate": 1.3199997381491213e-05, - "loss": 0.3086, + "epoch": 14.206001487972225, + "grad_norm": 3.5351369380950928, + "learning_rate": 3.9209235159708535e-07, + "loss": 0.2243, "step": 1031100 }, { - "epoch": 10.51, - "learning_rate": 1.3194501905309117e-05, - "loss": 0.2956, + "epoch": 14.207379240031964, + "grad_norm": 3.901085376739502, + "learning_rate": 3.907447988512655e-07, + "loss": 0.2107, "step": 1031200 }, { - "epoch": 10.51, - "learning_rate": 1.3189007250815578e-05, - "loss": 0.2879, + "epoch": 14.208756992091704, + "grad_norm": 1.521775722503662, + "learning_rate": 3.893995491753627e-07, + "loss": 0.1966, "step": 1031300 }, { - "epoch": 10.51, - "learning_rate": 1.3183513418279261e-05, - "loss": 0.3078, + "epoch": 14.210134744151443, + "grad_norm": 1.7670669555664062, + "learning_rate": 3.8805660268367015e-07, + "loss": 0.1676, "step": 1031400 }, { - "epoch": 10.51, - "learning_rate": 1.3178020407968796e-05, - "loss": 0.2236, + "epoch": 14.211512496211181, + "grad_norm": 2.405217409133911, + "learning_rate": 3.867159594902675e-07, + "loss": 0.218, "step": 1031500 }, { - "epoch": 10.51, - "learning_rate": 1.3172528220152733e-05, - "loss": 0.2434, + "epoch": 14.212890248270922, + "grad_norm": 3.611844301223755, + "learning_rate": 3.8537761970905723e-07, + "loss": 0.218, "step": 1031600 }, { - "epoch": 10.51, - "learning_rate": 1.3167036855099618e-05, - "loss": 0.2131, + "epoch": 14.21426800033066, + "grad_norm": 2.2119975090026855, + "learning_rate": 3.840415834537342e-07, + "loss": 0.2458, "step": 1031700 }, { - "epoch": 10.51, - "learning_rate": 1.316154631307795e-05, - "loss": 0.3165, + "epoch": 14.2156457523904, + "grad_norm": 2.266141891479492, + "learning_rate": 3.8270785083780405e-07, + "loss": 0.1986, "step": 1031800 }, { - "epoch": 10.51, - "learning_rate": 1.3156056594356186e-05, - "loss": 0.2376, + "epoch": 14.21702350445014, + "grad_norm": 3.746082305908203, + "learning_rate": 3.813764219745708e-07, + "loss": 0.205, "step": 1031900 }, { - "epoch": 10.51, - "learning_rate": 1.3150567699202729e-05, - "loss": 0.2577, + "epoch": 14.218401256509878, + "grad_norm": 1.9543709754943848, + "learning_rate": 3.800472969771493e-07, + "loss": 0.1745, "step": 1032000 }, { - "epoch": 10.52, - "learning_rate": 1.3145079627885957e-05, - "loss": 0.2425, + "epoch": 14.219779008569617, + "grad_norm": 0.3729618191719055, + "learning_rate": 3.7872047595845894e-07, + "loss": 0.2688, "step": 1032100 }, { - "epoch": 10.52, - "learning_rate": 1.3139592380674216e-05, - "loss": 0.2437, + "epoch": 14.221156760629357, + "grad_norm": 3.6639039516448975, + "learning_rate": 3.773959590312115e-07, + "loss": 0.1647, "step": 1032200 }, { - "epoch": 10.52, - "learning_rate": 1.313410595783578e-05, - "loss": 0.2647, + "epoch": 14.222534512689096, + "grad_norm": 2.0133955478668213, + "learning_rate": 3.760737463079386e-07, + "loss": 0.2309, "step": 1032300 }, { - "epoch": 10.52, - "learning_rate": 1.3128620359638902e-05, - "loss": 0.2701, + "epoch": 14.223912264748837, + "grad_norm": 2.9937479496002197, + "learning_rate": 3.7475383790096437e-07, + "loss": 0.127, "step": 1032400 }, { - "epoch": 10.52, - "learning_rate": 1.312313558635183e-05, - "loss": 0.283, + "epoch": 14.225290016808575, + "grad_norm": 1.7002424001693726, + "learning_rate": 3.734493985549198e-07, + "loss": 0.1855, "step": 1032500 }, { - "epoch": 10.52, - "learning_rate": 1.3117651638242698e-05, - "loss": 0.2777, + "epoch": 14.226667768868314, + "grad_norm": 0.47065451741218567, + "learning_rate": 3.721340760707928e-07, + "loss": 0.1921, "step": 1032600 }, { - "epoch": 10.52, - "learning_rate": 1.3112168515579652e-05, - "loss": 0.2281, + "epoch": 14.228045520928054, + "grad_norm": 2.343005895614624, + "learning_rate": 3.708210582376589e-07, + "loss": 0.2018, "step": 1032700 }, { - "epoch": 10.52, - "learning_rate": 1.3106686218630794e-05, - "loss": 0.2804, + "epoch": 14.229423272987793, + "grad_norm": 1.2562779188156128, + "learning_rate": 3.695103451670698e-07, + "loss": 0.2436, "step": 1032800 }, { - "epoch": 10.52, - "learning_rate": 1.3101204747664156e-05, - "loss": 0.3569, + "epoch": 14.230801025047532, + "grad_norm": 4.102870941162109, + "learning_rate": 3.682019369703754e-07, + "loss": 0.1779, "step": 1032900 }, { - "epoch": 10.52, - "learning_rate": 1.3095724102947762e-05, - "loss": 0.2745, + "epoch": 14.232178777107272, + "grad_norm": 2.85564923286438, + "learning_rate": 3.6689583375873056e-07, + "loss": 0.1897, "step": 1033000 }, { - "epoch": 10.53, - "learning_rate": 1.3090244284749586e-05, - "loss": 0.2451, + "epoch": 14.23355652916701, + "grad_norm": 4.5034403800964355, + "learning_rate": 3.6559203564309754e-07, + "loss": 0.2206, "step": 1033100 }, { - "epoch": 10.53, - "learning_rate": 1.3084765293337547e-05, - "loss": 0.2882, + "epoch": 14.234934281226751, + "grad_norm": 2.7022995948791504, + "learning_rate": 3.642905427342433e-07, + "loss": 0.1476, "step": 1033200 }, { - "epoch": 10.53, - "learning_rate": 1.3079287128979534e-05, - "loss": 0.286, + "epoch": 14.23631203328649, + "grad_norm": 2.3958001136779785, + "learning_rate": 3.629913551427272e-07, + "loss": 0.2131, "step": 1033300 }, { - "epoch": 10.53, - "learning_rate": 1.3073809791943408e-05, - "loss": 0.285, + "epoch": 14.237689785346229, + "grad_norm": 2.8164851665496826, + "learning_rate": 3.6169447297892845e-07, + "loss": 0.2254, "step": 1033400 }, { - "epoch": 10.53, - "learning_rate": 1.3068333282496984e-05, - "loss": 0.2538, + "epoch": 14.239067537405969, + "grad_norm": 0.8474321961402893, + "learning_rate": 3.603998963530247e-07, + "loss": 0.2243, "step": 1033500 }, { - "epoch": 10.53, - "learning_rate": 1.3062857600907992e-05, - "loss": 0.305, + "epoch": 14.240445289465708, + "grad_norm": 2.588263988494873, + "learning_rate": 3.591076253749953e-07, + "loss": 0.2303, "step": 1033600 }, { - "epoch": 10.53, - "learning_rate": 1.3057382747444203e-05, - "loss": 0.3515, + "epoch": 14.241823041525446, + "grad_norm": 3.9786689281463623, + "learning_rate": 3.5783054839296976e-07, + "loss": 0.2251, "step": 1033700 }, { - "epoch": 10.53, - "learning_rate": 1.30519087223733e-05, - "loss": 0.2378, + "epoch": 14.243200793585187, + "grad_norm": 0.8122533559799194, + "learning_rate": 3.565428659806297e-07, + "loss": 0.1943, "step": 1033800 }, { - "epoch": 10.53, - "learning_rate": 1.304643552596291e-05, - "loss": 0.3107, + "epoch": 14.244578545644925, + "grad_norm": 1.5650519132614136, + "learning_rate": 3.552574895438393e-07, + "loss": 0.2648, "step": 1033900 }, { - "epoch": 10.53, - "learning_rate": 1.3040963158480647e-05, - "loss": 0.2763, + "epoch": 14.245956297704666, + "grad_norm": 1.2846943140029907, + "learning_rate": 3.5397441919180233e-07, + "loss": 0.2161, "step": 1034000 }, { - "epoch": 10.54, - "learning_rate": 1.3035491620194091e-05, - "loss": 0.2253, + "epoch": 14.247334049764405, + "grad_norm": 3.9933454990386963, + "learning_rate": 3.526936550335089e-07, + "loss": 0.2523, "step": 1034100 }, { - "epoch": 10.54, - "learning_rate": 1.3030020911370746e-05, - "loss": 0.2658, + "epoch": 14.248711801824143, + "grad_norm": 1.4556498527526855, + "learning_rate": 3.51415197177781e-07, + "loss": 0.1964, "step": 1034200 }, { - "epoch": 10.54, - "learning_rate": 1.3024551032278105e-05, - "loss": 0.2419, + "epoch": 14.250089553883884, + "grad_norm": 2.9921977519989014, + "learning_rate": 3.5013904573322096e-07, + "loss": 0.2148, "step": 1034300 }, { - "epoch": 10.54, - "learning_rate": 1.3019081983183624e-05, - "loss": 0.2059, + "epoch": 14.251467305943622, + "grad_norm": 0.990747332572937, + "learning_rate": 3.488652008082449e-07, + "loss": 0.1995, "step": 1034400 }, { - "epoch": 10.54, - "learning_rate": 1.3013613764354689e-05, - "loss": 0.2757, + "epoch": 14.252845058003361, + "grad_norm": 1.2467118501663208, + "learning_rate": 3.475936625110734e-07, + "loss": 0.2419, "step": 1034500 }, { - "epoch": 10.54, - "learning_rate": 1.300814637605867e-05, - "loss": 0.2753, + "epoch": 14.254222810063101, + "grad_norm": 0.8108887672424316, + "learning_rate": 3.4632443094973487e-07, + "loss": 0.1927, "step": 1034600 }, { - "epoch": 10.54, - "learning_rate": 1.3002679818562904e-05, - "loss": 0.2589, + "epoch": 14.25560056212284, + "grad_norm": 1.928206205368042, + "learning_rate": 3.450575062320499e-07, + "loss": 0.2018, "step": 1034700 }, { - "epoch": 10.54, - "learning_rate": 1.2997214092134648e-05, - "loss": 0.3521, + "epoch": 14.25697831418258, + "grad_norm": 1.2115625143051147, + "learning_rate": 3.43792888465656e-07, + "loss": 0.1865, "step": 1034800 }, { - "epoch": 10.54, - "learning_rate": 1.2991749197041145e-05, - "loss": 0.2238, + "epoch": 14.25835606624232, + "grad_norm": 0.04710285738110542, + "learning_rate": 3.425305777579893e-07, + "loss": 0.1874, "step": 1034900 }, { - "epoch": 10.54, - "learning_rate": 1.2986285133549627e-05, - "loss": 0.2856, + "epoch": 14.259733818302058, + "grad_norm": 5.471766948699951, + "learning_rate": 3.4127057421628113e-07, + "loss": 0.2167, "step": 1035000 }, { - "epoch": 10.55, - "learning_rate": 1.2980821901927223e-05, - "loss": 0.2649, + "epoch": 14.261111570361798, + "grad_norm": 1.6544393301010132, + "learning_rate": 3.400128779475828e-07, + "loss": 0.1954, "step": 1035100 }, { - "epoch": 10.55, - "learning_rate": 1.2975359502441063e-05, - "loss": 0.3332, + "epoch": 14.262489322421537, + "grad_norm": 3.087660074234009, + "learning_rate": 3.38757489058738e-07, + "loss": 0.2379, "step": 1035200 }, { - "epoch": 10.55, - "learning_rate": 1.2969897935358225e-05, - "loss": 0.2993, + "epoch": 14.263867074481276, + "grad_norm": 5.120940208435059, + "learning_rate": 3.375044076564043e-07, + "loss": 0.2025, "step": 1035300 }, { - "epoch": 10.55, - "learning_rate": 1.2964437200945762e-05, - "loss": 0.2289, + "epoch": 14.265244826541016, + "grad_norm": 2.240692138671875, + "learning_rate": 3.362536338470315e-07, + "loss": 0.1794, "step": 1035400 }, { - "epoch": 10.55, - "learning_rate": 1.2958977299470643e-05, - "loss": 0.2643, + "epoch": 14.266622578600755, + "grad_norm": 3.7346484661102295, + "learning_rate": 3.350051677368801e-07, + "loss": 0.2205, "step": 1035500 }, { - "epoch": 10.55, - "learning_rate": 1.2953518231199838e-05, - "loss": 0.3464, + "epoch": 14.268000330660495, + "grad_norm": 0.34074950218200684, + "learning_rate": 3.3375900943201835e-07, + "loss": 0.2057, "step": 1035600 }, { - "epoch": 10.55, - "learning_rate": 1.2948059996400274e-05, - "loss": 0.2811, + "epoch": 14.269378082720234, + "grad_norm": 2.1143715381622314, + "learning_rate": 3.325151590383069e-07, + "loss": 0.215, "step": 1035700 }, { - "epoch": 10.55, - "learning_rate": 1.2942602595338809e-05, - "loss": 0.2458, + "epoch": 14.270755834779973, + "grad_norm": 3.158684015274048, + "learning_rate": 3.3127361666142e-07, + "loss": 0.1385, "step": 1035800 }, { - "epoch": 10.55, - "learning_rate": 1.293714602828228e-05, - "loss": 0.2886, + "epoch": 14.272133586839713, + "grad_norm": 2.260822057723999, + "learning_rate": 3.300343824068397e-07, + "loss": 0.1932, "step": 1035900 }, { - "epoch": 10.55, - "learning_rate": 1.2931690295497494e-05, - "loss": 0.2732, + "epoch": 14.273511338899452, + "grad_norm": 11.548611640930176, + "learning_rate": 3.287974563798343e-07, + "loss": 0.2551, "step": 1036000 }, { - "epoch": 10.56, - "learning_rate": 1.2926235397251187e-05, - "loss": 0.2798, + "epoch": 14.27488909095919, + "grad_norm": 3.603050470352173, + "learning_rate": 3.2756283868549204e-07, + "loss": 0.2327, "step": 1036100 }, { - "epoch": 10.56, - "learning_rate": 1.2920781333810065e-05, - "loss": 0.3005, + "epoch": 14.27626684301893, + "grad_norm": 2.7603049278259277, + "learning_rate": 3.2633052942869947e-07, + "loss": 0.2235, "step": 1036200 }, { - "epoch": 10.56, - "learning_rate": 1.2915328105440833e-05, - "loss": 0.2721, + "epoch": 14.27764459507867, + "grad_norm": 1.892252802848816, + "learning_rate": 3.2510052871414785e-07, + "loss": 0.1934, "step": 1036300 }, { - "epoch": 10.56, - "learning_rate": 1.2909930232204593e-05, - "loss": 0.2797, + "epoch": 14.279022347138408, + "grad_norm": 0.1928088665008545, + "learning_rate": 3.238728366463331e-07, + "loss": 0.174, "step": 1036400 }, { - "epoch": 10.56, - "learning_rate": 1.290447866642158e-05, - "loss": 0.2698, + "epoch": 14.280400099198149, + "grad_norm": 0.5428099632263184, + "learning_rate": 3.2264745332955566e-07, + "loss": 0.2188, "step": 1036500 }, { - "epoch": 10.56, - "learning_rate": 1.2899027936507533e-05, - "loss": 0.2406, + "epoch": 14.281777851257887, + "grad_norm": 1.455741047859192, + "learning_rate": 3.214243788679114e-07, + "loss": 0.2832, "step": 1036600 }, { - "epoch": 10.56, - "learning_rate": 1.2893578042728961e-05, - "loss": 0.3171, + "epoch": 14.283155603317628, + "grad_norm": 3.8896660804748535, + "learning_rate": 3.2020361336531303e-07, + "loss": 0.2049, "step": 1036700 }, { - "epoch": 10.56, - "learning_rate": 1.2888128985352353e-05, - "loss": 0.2934, + "epoch": 14.284533355377366, + "grad_norm": 2.817736864089966, + "learning_rate": 3.1898515692546563e-07, + "loss": 0.1822, "step": 1036800 }, { - "epoch": 10.56, - "learning_rate": 1.2882680764644104e-05, - "loss": 0.2967, + "epoch": 14.285911107437105, + "grad_norm": 2.8720290660858154, + "learning_rate": 3.177690096518912e-07, + "loss": 0.2856, "step": 1036900 }, { - "epoch": 10.57, - "learning_rate": 1.2877233380870617e-05, - "loss": 0.2378, + "epoch": 14.287288859496845, + "grad_norm": 1.3023791313171387, + "learning_rate": 3.1655517164790093e-07, + "loss": 0.2144, "step": 1037000 }, { - "epoch": 10.57, - "learning_rate": 1.2871786834298245e-05, - "loss": 0.2542, + "epoch": 14.288666611556584, + "grad_norm": 1.185239553451538, + "learning_rate": 3.1534364301661677e-07, + "loss": 0.1773, "step": 1037100 }, { - "epoch": 10.57, - "learning_rate": 1.2866341125193268e-05, - "loss": 0.318, + "epoch": 14.290044363616323, + "grad_norm": 5.254380702972412, + "learning_rate": 3.1413442386096836e-07, + "loss": 0.2329, "step": 1037200 }, { - "epoch": 10.57, - "learning_rate": 1.2860896253821964e-05, - "loss": 0.2745, + "epoch": 14.291422115676063, + "grad_norm": 1.3584585189819336, + "learning_rate": 3.129275142836807e-07, + "loss": 0.2291, "step": 1037300 }, { - "epoch": 10.57, - "learning_rate": 1.2855452220450553e-05, - "loss": 0.2525, + "epoch": 14.292799867735802, + "grad_norm": 7.368195533752441, + "learning_rate": 3.1172291438728657e-07, + "loss": 0.2296, "step": 1037400 }, { - "epoch": 10.57, - "learning_rate": 1.2850009025345211e-05, - "loss": 0.2986, + "epoch": 14.294177619795542, + "grad_norm": 2.3495419025421143, + "learning_rate": 3.105206242741293e-07, + "loss": 0.1709, "step": 1037500 }, { - "epoch": 10.57, - "learning_rate": 1.2844566668772088e-05, - "loss": 0.2589, + "epoch": 14.295555371855281, + "grad_norm": 0.49491262435913086, + "learning_rate": 3.0932064404634475e-07, + "loss": 0.232, "step": 1037600 }, { - "epoch": 10.57, - "learning_rate": 1.283912515099727e-05, - "loss": 0.3451, + "epoch": 14.29693312391502, + "grad_norm": 2.5167791843414307, + "learning_rate": 3.081229738058763e-07, + "loss": 0.1803, "step": 1037700 }, { - "epoch": 10.57, - "learning_rate": 1.2833684472286838e-05, - "loss": 0.2608, + "epoch": 14.29831087597476, + "grad_norm": 1.661225438117981, + "learning_rate": 3.069276136544752e-07, + "loss": 0.2078, "step": 1037800 }, { - "epoch": 10.57, - "learning_rate": 1.2828299027145025e-05, - "loss": 0.2631, + "epoch": 14.299688628034499, + "grad_norm": 0.7844659686088562, + "learning_rate": 3.0573456369369093e-07, + "loss": 0.1832, "step": 1037900 }, { - "epoch": 10.58, - "learning_rate": 1.2822860018964047e-05, - "loss": 0.2084, + "epoch": 14.301066380094237, + "grad_norm": 1.6307048797607422, + "learning_rate": 3.0454382402488075e-07, + "loss": 0.1902, "step": 1038000 }, { - "epoch": 10.58, - "learning_rate": 1.2817421850642708e-05, - "loss": 0.2737, + "epoch": 14.302444132153978, + "grad_norm": 4.161242485046387, + "learning_rate": 3.033553947492035e-07, + "loss": 0.1761, "step": 1038100 }, { - "epoch": 10.58, - "learning_rate": 1.28119845224469e-05, - "loss": 0.2389, + "epoch": 14.303821884213717, + "grad_norm": 0.62209552526474, + "learning_rate": 3.0216927596762564e-07, + "loss": 0.1998, "step": 1038200 }, { - "epoch": 10.58, - "learning_rate": 1.2806548034642485e-05, - "loss": 0.3165, + "epoch": 14.305199636273457, + "grad_norm": 1.5336649417877197, + "learning_rate": 3.00985467780909e-07, + "loss": 0.1986, "step": 1038300 }, { - "epoch": 10.58, - "learning_rate": 1.2801112387495257e-05, - "loss": 0.2492, + "epoch": 14.306577388333196, + "grad_norm": 2.5388731956481934, + "learning_rate": 2.9980397028962325e-07, + "loss": 0.1673, "step": 1038400 }, { - "epoch": 10.58, - "learning_rate": 1.2795677581270992e-05, - "loss": 0.2584, + "epoch": 14.307955140392934, + "grad_norm": 1.5935280323028564, + "learning_rate": 2.9862478359414856e-07, + "loss": 0.229, "step": 1038500 }, { - "epoch": 10.58, - "learning_rate": 1.2790243616235439e-05, - "loss": 0.2842, + "epoch": 14.309332892452675, + "grad_norm": 3.3074448108673096, + "learning_rate": 2.9744790779466067e-07, + "loss": 0.2502, "step": 1038600 }, { - "epoch": 10.58, - "learning_rate": 1.2784810492654255e-05, - "loss": 0.2721, + "epoch": 14.310710644512413, + "grad_norm": 4.7681450843811035, + "learning_rate": 2.9627334299113685e-07, + "loss": 0.239, "step": 1038700 }, { - "epoch": 10.58, - "learning_rate": 1.2779378210793105e-05, - "loss": 0.2377, + "epoch": 14.312088396572152, + "grad_norm": 1.3199881315231323, + "learning_rate": 2.9510108928336806e-07, + "loss": 0.2078, "step": 1038800 }, { - "epoch": 10.58, - "learning_rate": 1.2773946770917597e-05, - "loss": 0.3191, + "epoch": 14.313466148631893, + "grad_norm": 2.629986047744751, + "learning_rate": 2.9393114677094076e-07, + "loss": 0.2086, "step": 1038900 }, { - "epoch": 10.59, - "learning_rate": 1.2768516173293287e-05, - "loss": 0.3363, + "epoch": 14.314843900691631, + "grad_norm": 2.506103515625, + "learning_rate": 2.92763515553249e-07, + "loss": 0.231, "step": 1039000 }, { - "epoch": 10.59, - "learning_rate": 1.2763086418185707e-05, - "loss": 0.279, + "epoch": 14.316221652751372, + "grad_norm": 2.377462148666382, + "learning_rate": 2.9159819572948553e-07, + "loss": 0.2545, "step": 1039100 }, { - "epoch": 10.59, - "learning_rate": 1.2757657505860348e-05, - "loss": 0.3198, + "epoch": 14.31759940481111, + "grad_norm": 2.1929492950439453, + "learning_rate": 2.904351873986535e-07, + "loss": 0.1833, "step": 1039200 }, { - "epoch": 10.59, - "learning_rate": 1.2752229436582628e-05, - "loss": 0.2981, + "epoch": 14.318977156870849, + "grad_norm": 2.1182632446289062, + "learning_rate": 2.892744906595579e-07, + "loss": 0.1721, "step": 1039300 }, { - "epoch": 10.59, - "learning_rate": 1.2746802210617956e-05, - "loss": 0.2473, + "epoch": 14.32035490893059, + "grad_norm": 1.0325742959976196, + "learning_rate": 2.88116105610802e-07, + "loss": 0.2236, "step": 1039400 }, { - "epoch": 10.59, - "learning_rate": 1.2741375828231709e-05, - "loss": 0.3092, + "epoch": 14.321732660990328, + "grad_norm": 1.8284990787506104, + "learning_rate": 2.8696003235079677e-07, + "loss": 0.2339, "step": 1039500 }, { - "epoch": 10.59, - "learning_rate": 1.2735950289689179e-05, - "loss": 0.2804, + "epoch": 14.323110413050067, + "grad_norm": 1.4432164430618286, + "learning_rate": 2.858062709777579e-07, + "loss": 0.1798, "step": 1039600 }, { - "epoch": 10.59, - "learning_rate": 1.2730525595255652e-05, - "loss": 0.2842, + "epoch": 14.324488165109807, + "grad_norm": 0.15447640419006348, + "learning_rate": 2.846548215897024e-07, + "loss": 0.1717, "step": 1039700 }, { - "epoch": 10.59, - "learning_rate": 1.2725101745196376e-05, - "loss": 0.3003, + "epoch": 14.325865917169546, + "grad_norm": 3.6791911125183105, + "learning_rate": 2.8351716421237823e-07, + "loss": 0.1807, "step": 1039800 }, { - "epoch": 10.59, - "learning_rate": 1.2719678739776522e-05, - "loss": 0.2434, + "epoch": 14.327243669229286, + "grad_norm": 3.0506296157836914, + "learning_rate": 2.8237031596527296e-07, + "loss": 0.1826, "step": 1039900 }, { - "epoch": 10.6, - "learning_rate": 1.2714256579261256e-05, - "loss": 0.2634, + "epoch": 14.328621421289025, + "grad_norm": 2.111396074295044, + "learning_rate": 2.8122577999505325e-07, + "loss": 0.2454, "step": 1040000 }, { - "epoch": 10.6, - "learning_rate": 1.2708835263915687e-05, - "loss": 0.272, + "epoch": 14.329999173348764, + "grad_norm": 3.7517457008361816, + "learning_rate": 2.8008355639895457e-07, + "loss": 0.211, "step": 1040100 }, { - "epoch": 10.6, - "learning_rate": 1.2703414794004884e-05, - "loss": 0.2914, + "epoch": 14.331376925408504, + "grad_norm": 2.1078410148620605, + "learning_rate": 2.7894364527401714e-07, + "loss": 0.2764, "step": 1040200 }, { - "epoch": 10.6, - "learning_rate": 1.2697995169793876e-05, - "loss": 0.2863, + "epoch": 14.332754677468243, + "grad_norm": 1.7874021530151367, + "learning_rate": 2.7780604671707345e-07, + "loss": 0.1847, "step": 1040300 }, { - "epoch": 10.6, - "learning_rate": 1.2692576391547652e-05, - "loss": 0.2631, + "epoch": 14.334132429527982, + "grad_norm": 0.3600746989250183, + "learning_rate": 2.76670760824779e-07, + "loss": 0.2282, "step": 1040400 }, { - "epoch": 10.6, - "learning_rate": 1.2687158459531169e-05, - "loss": 0.2966, + "epoch": 14.335510181587722, + "grad_norm": 2.2421083450317383, + "learning_rate": 2.7553778769357245e-07, + "loss": 0.2402, "step": 1040500 }, { - "epoch": 10.6, - "learning_rate": 1.2681741374009308e-05, - "loss": 0.3026, + "epoch": 14.33688793364746, + "grad_norm": 4.3449015617370605, + "learning_rate": 2.7440712741971545e-07, + "loss": 0.2527, "step": 1040600 }, { - "epoch": 10.6, - "learning_rate": 1.2676325135246943e-05, - "loss": 0.2804, + "epoch": 14.3382656857072, + "grad_norm": 0.47316887974739075, + "learning_rate": 2.732900521230247e-07, + "loss": 0.1971, "step": 1040700 }, { - "epoch": 10.6, - "learning_rate": 1.267090974350891e-05, - "loss": 0.2797, + "epoch": 14.33964343776694, + "grad_norm": 1.0531458854675293, + "learning_rate": 2.721639947208546e-07, + "loss": 0.2036, "step": 1040800 }, { - "epoch": 10.6, - "learning_rate": 1.2665549340309496e-05, - "loss": 0.3039, + "epoch": 14.341021189826678, + "grad_norm": 3.1056787967681885, + "learning_rate": 2.7104025046265414e-07, + "loss": 0.2267, "step": 1040900 }, { - "epoch": 10.61, - "learning_rate": 1.2660135634937549e-05, - "loss": 0.2177, + "epoch": 14.342398941886419, + "grad_norm": 4.312679767608643, + "learning_rate": 2.699188194438851e-07, + "loss": 0.2364, "step": 1041000 }, { - "epoch": 10.61, - "learning_rate": 1.2654722777381482e-05, - "loss": 0.2859, + "epoch": 14.343776693946158, + "grad_norm": 2.8823482990264893, + "learning_rate": 2.6879970175981984e-07, + "loss": 0.1825, "step": 1041100 }, { - "epoch": 10.61, - "learning_rate": 1.2649310767905958e-05, - "loss": 0.307, + "epoch": 14.345154446005896, + "grad_norm": 0.7732188701629639, + "learning_rate": 2.6768289750553234e-07, + "loss": 0.2046, "step": 1041200 }, { - "epoch": 10.61, - "learning_rate": 1.2643899606775592e-05, - "loss": 0.2716, + "epoch": 14.346532198065637, + "grad_norm": 2.2512500286102295, + "learning_rate": 2.665684067759042e-07, + "loss": 0.2477, "step": 1041300 }, { - "epoch": 10.61, - "learning_rate": 1.2638489294254964e-05, - "loss": 0.3091, + "epoch": 14.347909950125375, + "grad_norm": 0.3430367708206177, + "learning_rate": 2.6545622966561546e-07, + "loss": 0.2026, "step": 1041400 }, { - "epoch": 10.61, - "learning_rate": 1.2633079830608598e-05, - "loss": 0.3225, + "epoch": 14.349287702185114, + "grad_norm": 3.5744383335113525, + "learning_rate": 2.6434636626915087e-07, + "loss": 0.2179, "step": 1041500 }, { - "epoch": 10.61, - "learning_rate": 1.2627671216101002e-05, - "loss": 0.2548, + "epoch": 14.350665454244854, + "grad_norm": 1.2464557886123657, + "learning_rate": 2.632388166807967e-07, + "loss": 0.2235, "step": 1041600 }, { - "epoch": 10.61, - "learning_rate": 1.2622263450996597e-05, - "loss": 0.2766, + "epoch": 14.352043206304593, + "grad_norm": 3.767289638519287, + "learning_rate": 2.621335809946499e-07, + "loss": 0.1569, "step": 1041700 }, { - "epoch": 10.61, - "learning_rate": 1.2616856535559806e-05, - "loss": 0.2809, + "epoch": 14.353420958364334, + "grad_norm": 1.6704542636871338, + "learning_rate": 2.6103065930460275e-07, + "loss": 0.1786, "step": 1041800 }, { - "epoch": 10.62, - "learning_rate": 1.2611450470055e-05, - "loss": 0.2533, + "epoch": 14.354798710424072, + "grad_norm": 4.37132453918457, + "learning_rate": 2.5993005170435236e-07, + "loss": 0.2047, "step": 1041900 }, { - "epoch": 10.62, - "learning_rate": 1.2606045254746491e-05, - "loss": 0.2307, + "epoch": 14.35617646248381, + "grad_norm": 2.1292221546173096, + "learning_rate": 2.5883175828740947e-07, + "loss": 0.2634, "step": 1042000 }, { - "epoch": 10.62, - "learning_rate": 1.2600640889898566e-05, - "loss": 0.3126, + "epoch": 14.357554214543551, + "grad_norm": 3.1232669353485107, + "learning_rate": 2.57735779147068e-07, + "loss": 0.2209, "step": 1042100 }, { - "epoch": 10.62, - "learning_rate": 1.2595237375775478e-05, - "loss": 0.2625, + "epoch": 14.35893196660329, + "grad_norm": 0.2823559641838074, + "learning_rate": 2.56642114376445e-07, + "loss": 0.2517, "step": 1042200 }, { - "epoch": 10.62, - "learning_rate": 1.2589834712641406e-05, - "loss": 0.2565, + "epoch": 14.360309718663029, + "grad_norm": 5.655496120452881, + "learning_rate": 2.555507640684496e-07, + "loss": 0.1845, "step": 1042300 }, { - "epoch": 10.62, - "learning_rate": 1.2584432900760515e-05, - "loss": 0.2231, + "epoch": 14.361687470722769, + "grad_norm": 1.1234709024429321, + "learning_rate": 2.544617283157988e-07, + "loss": 0.2384, "step": 1042400 }, { - "epoch": 10.62, - "learning_rate": 1.2579031940396928e-05, - "loss": 0.3016, + "epoch": 14.363065222782508, + "grad_norm": 1.0338001251220703, + "learning_rate": 2.533750072110111e-07, + "loss": 0.19, "step": 1042500 }, { - "epoch": 10.62, - "learning_rate": 1.2573631831814715e-05, - "loss": 0.2425, + "epoch": 14.364442974842248, + "grad_norm": 0.4390679895877838, + "learning_rate": 2.522906008464096e-07, + "loss": 0.2171, "step": 1042600 }, { - "epoch": 10.62, - "learning_rate": 1.2568232575277912e-05, - "loss": 0.3709, + "epoch": 14.365820726901987, + "grad_norm": 3.9069883823394775, + "learning_rate": 2.51208509314122e-07, + "loss": 0.2064, "step": 1042700 }, { - "epoch": 10.62, - "learning_rate": 1.2562834171050507e-05, - "loss": 0.2491, + "epoch": 14.367198478961726, + "grad_norm": 1.9762722253799438, + "learning_rate": 2.501287327060714e-07, + "loss": 0.2292, "step": 1042800 }, { - "epoch": 10.63, - "learning_rate": 1.255743661939646e-05, - "loss": 0.3117, + "epoch": 14.368576231021466, + "grad_norm": 2.0985963344573975, + "learning_rate": 2.490512711139978e-07, + "loss": 0.216, "step": 1042900 }, { - "epoch": 10.63, - "learning_rate": 1.2552039920579662e-05, - "loss": 0.2427, + "epoch": 14.369953983081205, + "grad_norm": 0.6314952969551086, + "learning_rate": 2.4797612462943044e-07, + "loss": 0.1884, "step": 1043000 }, { - "epoch": 10.63, - "learning_rate": 1.254664407486399e-05, - "loss": 0.3099, + "epoch": 14.371331735140943, + "grad_norm": 3.535656452178955, + "learning_rate": 2.4690329334371544e-07, + "loss": 0.2505, "step": 1043100 }, { - "epoch": 10.63, - "learning_rate": 1.2541249082513273e-05, - "loss": 0.3007, + "epoch": 14.372709487200684, + "grad_norm": 1.6125764846801758, + "learning_rate": 2.4583277734798823e-07, + "loss": 0.1939, "step": 1043200 }, { - "epoch": 10.63, - "learning_rate": 1.253585494379128e-05, - "loss": 0.3353, + "epoch": 14.374087239260422, + "grad_norm": 2.146817445755005, + "learning_rate": 2.4476457673320104e-07, + "loss": 0.1761, "step": 1043300 }, { - "epoch": 10.63, - "learning_rate": 1.2530461658961758e-05, - "loss": 0.2218, + "epoch": 14.375464991320163, + "grad_norm": 1.5963634252548218, + "learning_rate": 2.4369869159009546e-07, + "loss": 0.2002, "step": 1043400 }, { - "epoch": 10.63, - "learning_rate": 1.2525069228288422e-05, - "loss": 0.2991, + "epoch": 14.376842743379902, + "grad_norm": 0.5893296003341675, + "learning_rate": 2.426351220092329e-07, + "loss": 0.2026, "step": 1043500 }, { - "epoch": 10.63, - "learning_rate": 1.25196776520349e-05, - "loss": 0.2639, + "epoch": 14.37822049543964, + "grad_norm": 4.40700101852417, + "learning_rate": 2.415738680809612e-07, + "loss": 0.247, "step": 1043600 }, { - "epoch": 10.63, - "learning_rate": 1.2514286930464825e-05, - "loss": 0.2918, + "epoch": 14.37959824749938, + "grad_norm": 2.344078540802002, + "learning_rate": 2.405149298954448e-07, + "loss": 0.2117, "step": 1043700 }, { - "epoch": 10.63, - "learning_rate": 1.2508897063841774e-05, - "loss": 0.2688, + "epoch": 14.38097599955912, + "grad_norm": 4.357010364532471, + "learning_rate": 2.3945830754264377e-07, + "loss": 0.231, "step": 1043800 }, { - "epoch": 10.64, - "learning_rate": 1.2503508052429273e-05, - "loss": 0.3198, + "epoch": 14.382353751618858, + "grad_norm": 1.6458518505096436, + "learning_rate": 2.384145327125134e-07, + "loss": 0.2255, "step": 1043900 }, { - "epoch": 10.64, - "learning_rate": 1.2498119896490812e-05, - "loss": 0.3172, + "epoch": 14.383731503678598, + "grad_norm": 1.5913604497909546, + "learning_rate": 2.373625191336809e-07, + "loss": 0.1868, "step": 1044000 }, { - "epoch": 10.64, - "learning_rate": 1.2492732596289845e-05, - "loss": 0.2465, + "epoch": 14.385109255738337, + "grad_norm": 1.3978865146636963, + "learning_rate": 2.3631282165537366e-07, + "loss": 0.1935, "step": 1044100 }, { - "epoch": 10.64, - "learning_rate": 1.2487346152089786e-05, - "loss": 0.2734, + "epoch": 14.386487007798078, + "grad_norm": 0.16621717810630798, + "learning_rate": 2.3526544036677312e-07, + "loss": 0.215, "step": 1044200 }, { - "epoch": 10.64, - "learning_rate": 1.248196056415398e-05, - "loss": 0.2866, + "epoch": 14.387864759857816, + "grad_norm": 1.7409483194351196, + "learning_rate": 2.342203753568592e-07, + "loss": 0.2474, "step": 1044300 }, { - "epoch": 10.64, - "learning_rate": 1.2476575832745758e-05, - "loss": 0.2983, + "epoch": 14.389242511917555, + "grad_norm": 1.9124393463134766, + "learning_rate": 2.3317762671441646e-07, + "loss": 0.1718, "step": 1044400 }, { - "epoch": 10.64, - "learning_rate": 1.2471191958128415e-05, - "loss": 0.2287, + "epoch": 14.390620263977295, + "grad_norm": 7.344611644744873, + "learning_rate": 2.321371945280279e-07, + "loss": 0.2719, "step": 1044500 }, { - "epoch": 10.64, - "learning_rate": 1.246580894056517e-05, - "loss": 0.2757, + "epoch": 14.391998016037034, + "grad_norm": 2.652233839035034, + "learning_rate": 2.3109907888608727e-07, + "loss": 0.2683, "step": 1044600 }, { - "epoch": 10.64, - "learning_rate": 1.246042678031923e-05, - "loss": 0.297, + "epoch": 14.393375768096773, + "grad_norm": 1.747808575630188, + "learning_rate": 2.3006327987678683e-07, + "loss": 0.2501, "step": 1044700 }, { - "epoch": 10.64, - "learning_rate": 1.2455045477653758e-05, - "loss": 0.278, + "epoch": 14.394753520156513, + "grad_norm": 1.9243394136428833, + "learning_rate": 2.2902979758812336e-07, + "loss": 0.1941, "step": 1044800 }, { - "epoch": 10.65, - "learning_rate": 1.2449772633319718e-05, - "loss": 0.196, + "epoch": 14.396131272216252, + "grad_norm": 11.358366966247559, + "learning_rate": 2.2799863210789529e-07, + "loss": 0.209, "step": 1044900 }, { - "epoch": 10.65, - "learning_rate": 1.2444393029439743e-05, - "loss": 0.3136, + "epoch": 14.39750902427599, + "grad_norm": 1.5777690410614014, + "learning_rate": 2.2696978352370558e-07, + "loss": 0.2328, "step": 1045000 }, { - "epoch": 10.65, - "learning_rate": 1.2439014283924182e-05, - "loss": 0.3101, + "epoch": 14.39888677633573, + "grad_norm": 1.2036610841751099, + "learning_rate": 2.2594325192295874e-07, + "loss": 0.1965, "step": 1045100 }, { - "epoch": 10.65, - "learning_rate": 1.2433636397036026e-05, - "loss": 0.2677, + "epoch": 14.40026452839547, + "grad_norm": 2.236189842224121, + "learning_rate": 2.2491903739287e-07, + "loss": 0.2193, "step": 1045200 }, { - "epoch": 10.65, - "learning_rate": 1.2428259369038217e-05, - "loss": 0.2532, + "epoch": 14.40164228045521, + "grad_norm": 3.3195059299468994, + "learning_rate": 2.2389714002044398e-07, + "loss": 0.1785, "step": 1045300 }, { - "epoch": 10.65, - "learning_rate": 1.2422883200193665e-05, - "loss": 0.203, + "epoch": 14.403020032514949, + "grad_norm": 2.5880942344665527, + "learning_rate": 2.2287755989250203e-07, + "loss": 0.221, "step": 1045400 }, { - "epoch": 10.65, - "learning_rate": 1.241750789076524e-05, - "loss": 0.2797, + "epoch": 14.404397784574687, + "grad_norm": 2.9236528873443604, + "learning_rate": 2.2186029709565791e-07, + "loss": 0.2898, "step": 1045500 }, { - "epoch": 10.65, - "learning_rate": 1.241213344101574e-05, - "loss": 0.2525, + "epoch": 14.405775536634428, + "grad_norm": 1.9024989604949951, + "learning_rate": 2.2084535171633613e-07, + "loss": 0.2406, "step": 1045600 }, { - "epoch": 10.65, - "learning_rate": 1.2406759851207956e-05, - "loss": 0.2852, + "epoch": 14.407153288694166, + "grad_norm": 2.324446439743042, + "learning_rate": 2.198327238407627e-07, + "loss": 0.2347, "step": 1045700 }, { - "epoch": 10.65, - "learning_rate": 1.2401387121604636e-05, - "loss": 0.3302, + "epoch": 14.408531040753905, + "grad_norm": 0.6329560875892639, + "learning_rate": 2.1882241355496213e-07, + "loss": 0.1909, "step": 1045800 }, { - "epoch": 10.66, - "learning_rate": 1.2396015252468452e-05, - "loss": 0.2541, + "epoch": 14.409908792813646, + "grad_norm": 1.1651264429092407, + "learning_rate": 2.1781442094476968e-07, + "loss": 0.216, "step": 1045900 }, { - "epoch": 10.66, - "learning_rate": 1.2390644244062066e-05, - "loss": 0.2347, + "epoch": 14.411286544873384, + "grad_norm": 3.119415283203125, + "learning_rate": 2.1681879137110339e-07, + "loss": 0.1995, "step": 1046000 }, { - "epoch": 10.66, - "learning_rate": 1.2385274096648101e-05, - "loss": 0.2502, + "epoch": 14.412664296933125, + "grad_norm": 2.7042925357818604, + "learning_rate": 2.1581541118993852e-07, + "loss": 0.1916, "step": 1046100 }, { - "epoch": 10.66, - "learning_rate": 1.2379904810489102e-05, - "loss": 0.2737, + "epoch": 14.414042048992863, + "grad_norm": 0.9542060494422913, + "learning_rate": 2.1481434893983576e-07, + "loss": 0.1834, "step": 1046200 }, { - "epoch": 10.66, - "learning_rate": 1.2374536385847607e-05, - "loss": 0.2928, + "epoch": 14.415419801052602, + "grad_norm": 1.2856370210647583, + "learning_rate": 2.1381560470584566e-07, + "loss": 0.1932, "step": 1046300 }, { - "epoch": 10.66, - "learning_rate": 1.2369168822986101e-05, - "loss": 0.2496, + "epoch": 14.416797553112342, + "grad_norm": 3.630920648574829, + "learning_rate": 2.128291313592698e-07, + "loss": 0.2054, "step": 1046400 }, { - "epoch": 10.66, - "learning_rate": 1.2363802122167022e-05, - "loss": 0.2656, + "epoch": 14.418175305172081, + "grad_norm": 1.5007926225662231, + "learning_rate": 2.1183500022957657e-07, + "loss": 0.1866, "step": 1046500 }, { - "epoch": 10.66, - "learning_rate": 1.2358436283652772e-05, - "loss": 0.264, + "epoch": 14.41955305723182, + "grad_norm": 3.446928024291992, + "learning_rate": 2.108431873691022e-07, + "loss": 0.2541, "step": 1046600 }, { - "epoch": 10.66, - "learning_rate": 1.2353071307705719e-05, - "loss": 0.3014, + "epoch": 14.42093080929156, + "grad_norm": 10.052985191345215, + "learning_rate": 2.0985369286211253e-07, + "loss": 0.1924, "step": 1046700 }, { - "epoch": 10.66, - "learning_rate": 1.2347707194588159e-05, - "loss": 0.2846, + "epoch": 14.422308561351299, + "grad_norm": 1.9225763082504272, + "learning_rate": 2.0886651679266593e-07, + "loss": 0.196, "step": 1046800 }, { - "epoch": 10.67, - "learning_rate": 1.234234394456237e-05, - "loss": 0.2465, + "epoch": 14.42368631341104, + "grad_norm": 1.3549712896347046, + "learning_rate": 2.0788165924463144e-07, + "loss": 0.2095, "step": 1046900 }, { - "epoch": 10.67, - "learning_rate": 1.233698155789059e-05, - "loss": 0.228, + "epoch": 14.425064065470778, + "grad_norm": 1.2966724634170532, + "learning_rate": 2.0689912030167347e-07, + "loss": 0.1784, "step": 1047000 }, { - "epoch": 10.67, - "learning_rate": 1.2331620034835016e-05, - "loss": 0.343, + "epoch": 14.426441817530517, + "grad_norm": 1.7539267539978027, + "learning_rate": 2.0591890004726416e-07, + "loss": 0.169, "step": 1047100 }, { - "epoch": 10.67, - "learning_rate": 1.232625937565777e-05, - "loss": 0.3441, + "epoch": 14.427819569590257, + "grad_norm": 2.6548092365264893, + "learning_rate": 2.049409985646833e-07, + "loss": 0.1441, "step": 1047200 }, { - "epoch": 10.67, - "learning_rate": 1.232089958062097e-05, - "loss": 0.2887, + "epoch": 14.429197321649996, + "grad_norm": 0.853512167930603, + "learning_rate": 2.0396541593700302e-07, + "loss": 0.1988, "step": 1047300 }, { - "epoch": 10.67, - "learning_rate": 1.2315540649986685e-05, - "loss": 0.3047, + "epoch": 14.430575073709734, + "grad_norm": 1.3353285789489746, + "learning_rate": 2.029921522471062e-07, + "loss": 0.1963, "step": 1047400 }, { - "epoch": 10.67, - "learning_rate": 1.2310182584016922e-05, - "loss": 0.2157, + "epoch": 14.431952825769475, + "grad_norm": 1.4992989301681519, + "learning_rate": 2.0202120757768032e-07, + "loss": 0.1771, "step": 1047500 }, { - "epoch": 10.67, - "learning_rate": 1.230482538297366e-05, - "loss": 0.2479, + "epoch": 14.433330577829214, + "grad_norm": 3.35964298248291, + "learning_rate": 2.0105258201120525e-07, + "loss": 0.1991, "step": 1047600 }, { - "epoch": 10.67, - "learning_rate": 1.2299469047118837e-05, - "loss": 0.2763, + "epoch": 14.434708329888954, + "grad_norm": 1.370593547821045, + "learning_rate": 2.0008627562997456e-07, + "loss": 0.207, "step": 1047700 }, { - "epoch": 10.68, - "learning_rate": 1.229411357671435e-05, - "loss": 0.3059, + "epoch": 14.436086081948693, + "grad_norm": 3.285459518432617, + "learning_rate": 1.9912228851608038e-07, + "loss": 0.216, "step": 1047800 }, { - "epoch": 10.68, - "learning_rate": 1.2288758972022039e-05, - "loss": 0.2955, + "epoch": 14.437463834008431, + "grad_norm": 2.591484546661377, + "learning_rate": 1.981606207514164e-07, + "loss": 0.2149, "step": 1047900 }, { - "epoch": 10.68, - "learning_rate": 1.2283405233303737e-05, - "loss": 0.2784, + "epoch": 14.438841586068172, + "grad_norm": 0.5036186575889587, + "learning_rate": 1.9720127241768081e-07, + "loss": 0.2712, "step": 1048000 }, { - "epoch": 10.68, - "learning_rate": 1.2278052360821176e-05, - "loss": 0.2677, + "epoch": 14.44021933812791, + "grad_norm": 2.6519598960876465, + "learning_rate": 1.9624424359638266e-07, + "loss": 0.249, "step": 1048100 }, { - "epoch": 10.68, - "learning_rate": 1.2272700354836097e-05, - "loss": 0.2926, + "epoch": 14.44159709018765, + "grad_norm": 1.8656333684921265, + "learning_rate": 1.952895343688202e-07, + "loss": 0.1891, "step": 1048200 }, { - "epoch": 10.68, - "learning_rate": 1.2267349215610191e-05, - "loss": 0.2836, + "epoch": 14.44297484224739, + "grad_norm": 4.056741237640381, + "learning_rate": 1.9433714481609938e-07, + "loss": 0.2325, "step": 1048300 }, { - "epoch": 10.68, - "learning_rate": 1.2261998943405075e-05, - "loss": 0.3303, + "epoch": 14.444352594307128, + "grad_norm": 1.381493091583252, + "learning_rate": 1.9338707501913383e-07, + "loss": 0.2016, "step": 1048400 }, { - "epoch": 10.68, - "learning_rate": 1.2256649538482356e-05, - "loss": 0.2773, + "epoch": 14.445730346366869, + "grad_norm": 3.2945094108581543, + "learning_rate": 1.9243932505863565e-07, + "loss": 0.1888, "step": 1048500 }, { - "epoch": 10.68, - "learning_rate": 1.22513010011036e-05, - "loss": 0.2628, + "epoch": 14.447108098426607, + "grad_norm": 4.610701084136963, + "learning_rate": 1.9149389501512155e-07, + "loss": 0.2216, "step": 1048600 }, { - "epoch": 10.68, - "learning_rate": 1.2245953331530292e-05, - "loss": 0.3077, + "epoch": 14.448485850486346, + "grad_norm": 2.909604787826538, + "learning_rate": 1.9055078496891282e-07, + "loss": 0.1726, "step": 1048700 }, { - "epoch": 10.69, - "learning_rate": 1.2240606530023917e-05, - "loss": 0.2776, + "epoch": 14.449863602546086, + "grad_norm": 1.805124282836914, + "learning_rate": 1.896099950001262e-07, + "loss": 0.1931, "step": 1048800 }, { - "epoch": 10.69, - "learning_rate": 1.2235260596845898e-05, - "loss": 0.2588, + "epoch": 14.451241354605825, + "grad_norm": 3.58864688873291, + "learning_rate": 1.886715251886892e-07, + "loss": 0.2032, "step": 1048900 }, { - "epoch": 10.69, - "learning_rate": 1.2229915532257626e-05, - "loss": 0.2881, + "epoch": 14.452619106665564, + "grad_norm": 3.1392288208007812, + "learning_rate": 1.8773537561433075e-07, + "loss": 0.2271, "step": 1049000 }, { - "epoch": 10.69, - "learning_rate": 1.2224571336520434e-05, - "loss": 0.2468, + "epoch": 14.453996858725304, + "grad_norm": 0.464388370513916, + "learning_rate": 1.8680154635658142e-07, + "loss": 0.2278, "step": 1049100 }, { - "epoch": 10.69, - "learning_rate": 1.2219228009895628e-05, - "loss": 0.3087, + "epoch": 14.455374610785043, + "grad_norm": 1.773710012435913, + "learning_rate": 1.858700374947764e-07, + "loss": 0.1921, "step": 1049200 }, { - "epoch": 10.69, - "learning_rate": 1.2213885552644472e-05, - "loss": 0.2883, + "epoch": 14.456752362844782, + "grad_norm": 1.4253270626068115, + "learning_rate": 1.849408491080462e-07, + "loss": 0.2238, "step": 1049300 }, { - "epoch": 10.69, - "learning_rate": 1.220854396502816e-05, - "loss": 0.3225, + "epoch": 14.458130114904522, + "grad_norm": 0.6480837464332581, + "learning_rate": 1.840139812753322e-07, + "loss": 0.1597, "step": 1049400 }, { - "epoch": 10.69, - "learning_rate": 1.2203203247307873e-05, - "loss": 0.3172, + "epoch": 14.45950786696426, + "grad_norm": 0.861703097820282, + "learning_rate": 1.830894340753833e-07, + "loss": 0.2195, "step": 1049500 }, { - "epoch": 10.69, - "learning_rate": 1.2197863399744752e-05, - "loss": 0.2876, + "epoch": 14.460885619024001, + "grad_norm": 2.280820846557617, + "learning_rate": 1.8216720758673478e-07, + "loss": 0.2362, "step": 1049600 }, { - "epoch": 10.69, - "learning_rate": 1.2192524422599865e-05, - "loss": 0.3468, + "epoch": 14.46226337108374, + "grad_norm": 0.08329978585243225, + "learning_rate": 1.8124730188774168e-07, + "loss": 0.2407, "step": 1049700 }, { - "epoch": 10.7, - "learning_rate": 1.2187186316134264e-05, - "loss": 0.2775, + "epoch": 14.463641123143478, + "grad_norm": 1.8552747964859009, + "learning_rate": 1.8032971705655155e-07, + "loss": 0.1436, "step": 1049800 }, { - "epoch": 10.7, - "learning_rate": 1.218184908060896e-05, - "loss": 0.2877, + "epoch": 14.465018875203219, + "grad_norm": 0.7528455257415771, + "learning_rate": 1.794144531711195e-07, + "loss": 0.1894, "step": 1049900 }, { - "epoch": 10.7, - "learning_rate": 1.2176512716284893e-05, - "loss": 0.3316, - "step": 1050000 - }, - { - "epoch": 10.7, - "eval_cer": 0.061191920985045785, - "eval_loss": 0.3567371368408203, - "eval_runtime": 9398.6411, - "eval_samples_per_second": 5.821, - "eval_steps_per_second": 0.364, - "eval_wer": 0.1293959408625667, + "epoch": 14.466396627262958, + "grad_norm": 1.3289313316345215, + "learning_rate": 1.7850151030919915e-07, + "loss": 0.2204, "step": 1050000 }, { - "epoch": 10.7, - "learning_rate": 1.2171177223422988e-05, - "loss": 0.3312, + "epoch": 14.467774379322696, + "grad_norm": 0.7229489088058472, + "learning_rate": 1.7759088854835183e-07, + "loss": 0.1797, "step": 1050100 }, { - "epoch": 10.7, - "learning_rate": 1.2165842602284122e-05, - "loss": 0.2976, + "epoch": 14.469152131382437, + "grad_norm": 4.091299057006836, + "learning_rate": 1.766825879659373e-07, + "loss": 0.1949, "step": 1050200 }, { - "epoch": 10.7, - "learning_rate": 1.2160508853129123e-05, - "loss": 0.3116, + "epoch": 14.470529883442175, + "grad_norm": 3.66206955909729, + "learning_rate": 1.7577660863912604e-07, + "loss": 0.1907, "step": 1050300 }, { - "epoch": 10.7, - "learning_rate": 1.2155175976218777e-05, - "loss": 0.2894, + "epoch": 14.471907635501916, + "grad_norm": 1.943058967590332, + "learning_rate": 1.748729506448779e-07, + "loss": 0.1852, "step": 1050400 }, { - "epoch": 10.7, - "learning_rate": 1.2149843971813839e-05, - "loss": 0.2531, + "epoch": 14.473285387561655, + "grad_norm": 5.444952487945557, + "learning_rate": 1.7397161405996953e-07, + "loss": 0.2327, "step": 1050500 }, { - "epoch": 10.7, - "learning_rate": 1.2144512840175014e-05, - "loss": 0.2489, + "epoch": 14.474663139621393, + "grad_norm": 2.428382396697998, + "learning_rate": 1.7307259896096994e-07, + "loss": 0.2187, "step": 1050600 }, { - "epoch": 10.7, - "learning_rate": 1.2139182581562948e-05, - "loss": 0.3093, + "epoch": 14.476040891681134, + "grad_norm": 1.6242905855178833, + "learning_rate": 1.7217590542425582e-07, + "loss": 0.1826, "step": 1050700 }, { - "epoch": 10.71, - "learning_rate": 1.2133853196238263e-05, - "loss": 0.298, + "epoch": 14.477418643740872, + "grad_norm": 2.304988384246826, + "learning_rate": 1.712815335260115e-07, + "loss": 0.2175, "step": 1050800 }, { - "epoch": 10.71, - "learning_rate": 1.2128524684461548e-05, - "loss": 0.3105, + "epoch": 14.478796395800611, + "grad_norm": 12.82276439666748, + "learning_rate": 1.7038948334221066e-07, + "loss": 0.213, "step": 1050900 }, { - "epoch": 10.71, - "learning_rate": 1.2123197046493313e-05, - "loss": 0.2771, + "epoch": 14.480174147860351, + "grad_norm": 0.9803490042686462, + "learning_rate": 1.6950864073946554e-07, + "loss": 0.1998, "step": 1051000 }, { - "epoch": 10.71, - "learning_rate": 1.2117870282594062e-05, - "loss": 0.2568, + "epoch": 14.48155189992009, + "grad_norm": 2.679659128189087, + "learning_rate": 1.686212109926788e-07, + "loss": 0.1887, "step": 1051100 }, { - "epoch": 10.71, - "learning_rate": 1.2112544393024248e-05, - "loss": 0.3078, + "epoch": 14.48292965197983, + "grad_norm": 0.6335778832435608, + "learning_rate": 1.6773610318635142e-07, + "loss": 0.2328, "step": 1051200 }, { - "epoch": 10.71, - "learning_rate": 1.2107219378044257e-05, - "loss": 0.3364, + "epoch": 14.48430740403957, + "grad_norm": 3.6239070892333984, + "learning_rate": 1.6685331739566934e-07, + "loss": 0.1979, "step": 1051300 }, { - "epoch": 10.71, - "learning_rate": 1.2101895237914456e-05, - "loss": 0.2345, + "epoch": 14.485685156099308, + "grad_norm": 4.630417346954346, + "learning_rate": 1.6597285369563536e-07, + "loss": 0.2133, "step": 1051400 }, { - "epoch": 10.71, - "learning_rate": 1.2096571972895171e-05, - "loss": 0.3483, + "epoch": 14.487062908159048, + "grad_norm": 2.279193878173828, + "learning_rate": 1.6509471216104772e-07, + "loss": 0.1993, "step": 1051500 }, { - "epoch": 10.71, - "learning_rate": 1.2091249583246673e-05, - "loss": 0.2517, + "epoch": 14.488440660218787, + "grad_norm": 1.0107131004333496, + "learning_rate": 1.6421889286650616e-07, + "loss": 0.2402, "step": 1051600 }, { - "epoch": 10.71, - "learning_rate": 1.2085928069229198e-05, - "loss": 0.2499, + "epoch": 14.489818412278526, + "grad_norm": 6.315639972686768, + "learning_rate": 1.6334539588642116e-07, + "loss": 0.2363, "step": 1051700 }, { - "epoch": 10.72, - "learning_rate": 1.2080607431102946e-05, - "loss": 0.2603, + "epoch": 14.491196164338266, + "grad_norm": 0.8674686551094055, + "learning_rate": 1.6247422129499562e-07, + "loss": 0.2261, "step": 1051800 }, { - "epoch": 10.72, - "learning_rate": 1.2075287669128041e-05, - "loss": 0.2684, + "epoch": 14.492573916398005, + "grad_norm": 2.2753660678863525, + "learning_rate": 1.6160536916624614e-07, + "loss": 0.2249, "step": 1051900 }, { - "epoch": 10.72, - "learning_rate": 1.20699687835646e-05, - "loss": 0.2777, + "epoch": 14.493951668457745, + "grad_norm": 0.1291486620903015, + "learning_rate": 1.6073883957397866e-07, + "loss": 0.1907, "step": 1052000 }, { - "epoch": 10.72, - "learning_rate": 1.2064650774672698e-05, - "loss": 0.2891, + "epoch": 14.495329420517484, + "grad_norm": 2.8889153003692627, + "learning_rate": 1.59874632591816e-07, + "loss": 0.242, "step": 1052100 }, { - "epoch": 10.72, - "learning_rate": 1.2059333642712326e-05, - "loss": 0.3044, + "epoch": 14.496707172577223, + "grad_norm": 2.4183170795440674, + "learning_rate": 1.5901274829317635e-07, + "loss": 0.1841, "step": 1052200 }, { - "epoch": 10.72, - "learning_rate": 1.2054017387943477e-05, - "loss": 0.2769, + "epoch": 14.498084924636963, + "grad_norm": 2.2250900268554688, + "learning_rate": 1.5815318675127644e-07, + "loss": 0.2223, "step": 1052300 }, { - "epoch": 10.72, - "learning_rate": 1.204870201062608e-05, - "loss": 0.3124, + "epoch": 14.499462676696702, + "grad_norm": 1.2737767696380615, + "learning_rate": 1.5729594803914365e-07, + "loss": 0.2176, "step": 1052400 }, { - "epoch": 10.72, - "learning_rate": 1.2043387511020039e-05, - "loss": 0.2743, + "epoch": 14.50084042875644, + "grad_norm": 1.689103126525879, + "learning_rate": 1.56441032229607e-07, + "loss": 0.216, "step": 1052500 }, { - "epoch": 10.72, - "learning_rate": 1.2038127021254721e-05, - "loss": 0.2733, + "epoch": 14.50221818081618, + "grad_norm": 0.18839338421821594, + "learning_rate": 1.5558843939529083e-07, + "loss": 0.2649, "step": 1052600 }, { - "epoch": 10.73, - "learning_rate": 1.203281426906726e-05, - "loss": 0.3063, + "epoch": 14.50359593287592, + "grad_norm": 4.971494674682617, + "learning_rate": 1.5473816960862724e-07, + "loss": 0.1975, "step": 1052700 }, { - "epoch": 10.73, - "learning_rate": 1.2027502395367958e-05, - "loss": 0.2884, + "epoch": 14.50497368493566, + "grad_norm": 1.2953412532806396, + "learning_rate": 1.53890222941859e-07, + "loss": 0.2142, "step": 1052800 }, { - "epoch": 10.73, - "learning_rate": 1.2022191400416539e-05, - "loss": 0.2914, + "epoch": 14.506351436995399, + "grad_norm": 4.013678073883057, + "learning_rate": 1.5304459946701517e-07, + "loss": 0.1891, "step": 1052900 }, { - "epoch": 10.73, - "learning_rate": 1.2016881284472668e-05, - "loss": 0.2463, + "epoch": 14.507729189055137, + "grad_norm": 1.1028567552566528, + "learning_rate": 1.5220129925593852e-07, + "loss": 0.2217, "step": 1053000 }, { - "epoch": 10.73, - "learning_rate": 1.201157204779599e-05, - "loss": 0.2284, + "epoch": 14.509106941114878, + "grad_norm": 1.6715927124023438, + "learning_rate": 1.5136872064828445e-07, + "loss": 0.223, "step": 1053100 }, { - "epoch": 10.73, - "learning_rate": 1.2006263690646085e-05, - "loss": 0.2898, + "epoch": 14.510484693174616, + "grad_norm": 0.015158869326114655, + "learning_rate": 1.5053004394504939e-07, + "loss": 0.22, "step": 1053200 }, { - "epoch": 10.73, - "learning_rate": 1.2000956213282515e-05, - "loss": 0.29, + "epoch": 14.511862445234355, + "grad_norm": 3.99090313911438, + "learning_rate": 1.4969369071921091e-07, + "loss": 0.207, "step": 1053300 }, { - "epoch": 10.73, - "learning_rate": 1.1995649615964765e-05, - "loss": 0.3346, + "epoch": 14.513240197294095, + "grad_norm": 3.285388231277466, + "learning_rate": 1.4885966104181191e-07, + "loss": 0.2281, "step": 1053400 }, { - "epoch": 10.73, - "learning_rate": 1.1990343898952308e-05, - "loss": 0.2832, + "epoch": 14.514617949353834, + "grad_norm": 3.254018783569336, + "learning_rate": 1.4802795498371519e-07, + "loss": 0.2487, "step": 1053500 }, { - "epoch": 10.73, - "learning_rate": 1.1985039062504563e-05, - "loss": 0.2972, + "epoch": 14.515995701413573, + "grad_norm": 1.1891891956329346, + "learning_rate": 1.4719857261557585e-07, + "loss": 0.2147, "step": 1053600 }, { - "epoch": 10.74, - "learning_rate": 1.197973510688089e-05, - "loss": 0.2529, + "epoch": 14.517373453473313, + "grad_norm": 1.6442031860351562, + "learning_rate": 1.4637151400785366e-07, + "loss": 0.2143, "step": 1053700 }, { - "epoch": 10.74, - "learning_rate": 1.1974432032340634e-05, - "loss": 0.2802, + "epoch": 14.518751205533052, + "grad_norm": 1.2956016063690186, + "learning_rate": 1.4554677923080988e-07, + "loss": 0.161, "step": 1053800 }, { - "epoch": 10.74, - "learning_rate": 1.196912983914309e-05, - "loss": 0.3207, + "epoch": 14.520128957592792, + "grad_norm": 0.8181973099708557, + "learning_rate": 1.4472436835451352e-07, + "loss": 0.2314, "step": 1053900 }, { - "epoch": 10.74, - "learning_rate": 1.1963828527547485e-05, - "loss": 0.2685, + "epoch": 14.521506709652531, + "grad_norm": 3.5142598152160645, + "learning_rate": 1.4390428144882894e-07, + "loss": 0.1941, "step": 1054000 }, { - "epoch": 10.74, - "learning_rate": 1.1958528097813028e-05, - "loss": 0.2447, + "epoch": 14.52288446171227, + "grad_norm": 3.131718873977661, + "learning_rate": 1.4308651858343125e-07, + "loss": 0.21, "step": 1054100 }, { - "epoch": 10.74, - "learning_rate": 1.1953228550198883e-05, - "loss": 0.3094, + "epoch": 14.52426221377201, + "grad_norm": 0.9682480692863464, + "learning_rate": 1.4227107982778792e-07, + "loss": 0.255, "step": 1054200 }, { - "epoch": 10.74, - "learning_rate": 1.1947929884964166e-05, - "loss": 0.1863, + "epoch": 14.525639965831749, + "grad_norm": 0.4797624945640564, + "learning_rate": 1.4145796525117717e-07, + "loss": 0.2381, "step": 1054300 }, { - "epoch": 10.74, - "learning_rate": 1.1942632102367946e-05, - "loss": 0.2305, + "epoch": 14.527017717891487, + "grad_norm": 3.029003143310547, + "learning_rate": 1.406471749226787e-07, + "loss": 0.2386, "step": 1054400 }, { - "epoch": 10.74, - "learning_rate": 1.1937335202669266e-05, - "loss": 0.2776, + "epoch": 14.528395469951228, + "grad_norm": 0.8405803442001343, + "learning_rate": 1.3983870891117384e-07, + "loss": 0.2399, "step": 1054500 }, { - "epoch": 10.74, - "learning_rate": 1.1932039186127091e-05, - "loss": 0.301, + "epoch": 14.529773222010967, + "grad_norm": 3.6884028911590576, + "learning_rate": 1.3903256728534236e-07, + "loss": 0.1855, "step": 1054600 }, { - "epoch": 10.75, - "learning_rate": 1.1926744053000379e-05, - "loss": 0.2865, + "epoch": 14.531150974070707, + "grad_norm": 1.092498779296875, + "learning_rate": 1.3822875011367474e-07, + "loss": 0.1926, "step": 1054700 }, { - "epoch": 10.75, - "learning_rate": 1.1921449803548024e-05, - "loss": 0.3265, + "epoch": 14.532528726130446, + "grad_norm": 0.5491926074028015, + "learning_rate": 1.374272574644539e-07, + "loss": 0.1999, "step": 1054800 }, { - "epoch": 10.75, - "learning_rate": 1.1916156438028898e-05, - "loss": 0.2762, + "epoch": 14.533906478190184, + "grad_norm": 3.4490442276000977, + "learning_rate": 1.3662808940577336e-07, + "loss": 0.2043, "step": 1054900 }, { - "epoch": 10.75, - "learning_rate": 1.1910863956701795e-05, - "loss": 0.297, + "epoch": 14.535284230249925, + "grad_norm": 3.400589942932129, + "learning_rate": 1.3583124600552833e-07, + "loss": 0.1838, "step": 1055000 }, { - "epoch": 10.75, - "learning_rate": 1.1905625271415379e-05, - "loss": 0.2922, + "epoch": 14.536661982309663, + "grad_norm": 2.4204816818237305, + "learning_rate": 1.3503672733141237e-07, + "loss": 0.2028, "step": 1055100 }, { - "epoch": 10.75, - "learning_rate": 1.1900334550400229e-05, - "loss": 0.3215, + "epoch": 14.538039734369402, + "grad_norm": 0.10947739332914352, + "learning_rate": 1.3424453345092376e-07, + "loss": 0.2289, "step": 1055200 }, { - "epoch": 10.75, - "learning_rate": 1.1895044714350709e-05, - "loss": 0.2783, + "epoch": 14.539417486429143, + "grad_norm": 1.800245761871338, + "learning_rate": 1.334546644313653e-07, + "loss": 0.2524, "step": 1055300 }, { - "epoch": 10.75, - "learning_rate": 1.1889755763525464e-05, - "loss": 0.304, + "epoch": 14.540795238488881, + "grad_norm": 3.0236077308654785, + "learning_rate": 1.326671203398383e-07, + "loss": 0.2177, "step": 1055400 }, { - "epoch": 10.75, - "learning_rate": 1.1884467698183069e-05, - "loss": 0.2784, + "epoch": 14.542172990548622, + "grad_norm": 1.1610027551651, + "learning_rate": 1.3188190124325177e-07, + "loss": 0.1819, "step": 1055500 }, { - "epoch": 10.75, - "learning_rate": 1.1879180518582107e-05, - "loss": 0.3194, + "epoch": 14.54355074260836, + "grad_norm": 2.971428871154785, + "learning_rate": 1.3109900720831015e-07, + "loss": 0.2543, "step": 1055600 }, { - "epoch": 10.76, - "learning_rate": 1.1873894224981084e-05, - "loss": 0.3031, + "epoch": 14.544928494668099, + "grad_norm": 2.1985690593719482, + "learning_rate": 1.303262324809934e-07, + "loss": 0.2089, "step": 1055700 }, { - "epoch": 10.76, - "learning_rate": 1.1868608817638455e-05, - "loss": 0.2789, + "epoch": 14.54630624672784, + "grad_norm": 1.9219588041305542, + "learning_rate": 1.2954796551640946e-07, + "loss": 0.2119, "step": 1055800 }, { - "epoch": 10.76, - "learning_rate": 1.1863324296812653e-05, - "loss": 0.2452, + "epoch": 14.547683998787578, + "grad_norm": 1.1498929262161255, + "learning_rate": 1.287720238117518e-07, + "loss": 0.1835, "step": 1055900 }, { - "epoch": 10.76, - "learning_rate": 1.1858040662762068e-05, - "loss": 0.3012, + "epoch": 14.549061750847317, + "grad_norm": 0.21879388391971588, + "learning_rate": 1.2799840743294016e-07, + "loss": 0.213, "step": 1056000 }, { - "epoch": 10.76, - "learning_rate": 1.1852757915745019e-05, - "loss": 0.2687, + "epoch": 14.550439502907057, + "grad_norm": 0.8429954648017883, + "learning_rate": 1.2722711644569895e-07, + "loss": 0.2225, "step": 1056100 }, { - "epoch": 10.76, - "learning_rate": 1.1847476056019812e-05, - "loss": 0.3049, + "epoch": 14.551817254966796, + "grad_norm": 1.203163504600525, + "learning_rate": 1.2645815091555103e-07, + "loss": 0.2079, "step": 1056200 }, { - "epoch": 10.76, - "learning_rate": 1.1842195083844706e-05, - "loss": 0.2425, + "epoch": 14.553195007026535, + "grad_norm": 2.9745748043060303, + "learning_rate": 1.256915109078269e-07, + "loss": 0.2331, "step": 1056300 }, { - "epoch": 10.76, - "learning_rate": 1.1836914999477892e-05, - "loss": 0.2592, + "epoch": 14.554572759086275, + "grad_norm": 1.7408654689788818, + "learning_rate": 1.2492719648765554e-07, + "loss": 0.2146, "step": 1056400 }, { - "epoch": 10.76, - "learning_rate": 1.183163580317754e-05, - "loss": 0.3253, + "epoch": 14.555950511146014, + "grad_norm": 2.820443868637085, + "learning_rate": 1.2417281609545278e-07, + "loss": 0.2372, "step": 1056500 }, { - "epoch": 10.76, - "learning_rate": 1.1826357495201785e-05, - "loss": 0.2794, + "epoch": 14.557328263205754, + "grad_norm": 1.4155102968215942, + "learning_rate": 1.2341312978748964e-07, + "loss": 0.2082, "step": 1056600 }, { - "epoch": 10.77, - "learning_rate": 1.1821080075808684e-05, - "loss": 0.2835, + "epoch": 14.558706015265493, + "grad_norm": 4.878104209899902, + "learning_rate": 1.2265576926064365e-07, + "loss": 0.19, "step": 1056700 }, { - "epoch": 10.77, - "learning_rate": 1.1815803545256272e-05, - "loss": 0.3021, + "epoch": 14.560083767325231, + "grad_norm": 5.549022197723389, + "learning_rate": 1.2190073457924694e-07, + "loss": 0.2395, "step": 1056800 }, { - "epoch": 10.77, - "learning_rate": 1.1810527903802568e-05, - "loss": 0.2741, + "epoch": 14.561461519384972, + "grad_norm": 4.503925323486328, + "learning_rate": 1.2114802580745155e-07, + "loss": 0.2035, "step": 1056900 }, { - "epoch": 10.77, - "learning_rate": 1.1805253151705491e-05, - "loss": 0.2585, + "epoch": 14.56283927144471, + "grad_norm": 1.402661681175232, + "learning_rate": 1.2039764300920187e-07, + "loss": 0.1996, "step": 1057000 }, { - "epoch": 10.77, - "learning_rate": 1.1799979289222954e-05, - "loss": 0.2507, + "epoch": 14.564217023504451, + "grad_norm": 5.117333889007568, + "learning_rate": 1.1964958624824386e-07, + "loss": 0.2313, "step": 1057100 }, { - "epoch": 10.77, - "learning_rate": 1.1794706316612819e-05, - "loss": 0.2366, + "epoch": 14.56559477556419, + "grad_norm": 4.271954536437988, + "learning_rate": 1.1890385558812806e-07, + "loss": 0.1967, "step": 1057200 }, { - "epoch": 10.77, - "learning_rate": 1.178943423413291e-05, - "loss": 0.3218, + "epoch": 14.566972527623928, + "grad_norm": 2.0071210861206055, + "learning_rate": 1.1816045109221268e-07, + "loss": 0.19, "step": 1057300 }, { - "epoch": 10.77, - "learning_rate": 1.1784163042040983e-05, - "loss": 0.3204, + "epoch": 14.568350279683669, + "grad_norm": 1.0216478109359741, + "learning_rate": 1.174193728236514e-07, + "loss": 0.2271, "step": 1057400 }, { - "epoch": 10.77, - "learning_rate": 1.1778892740594775e-05, - "loss": 0.294, + "epoch": 14.569728031743407, + "grad_norm": 3.0407822132110596, + "learning_rate": 1.1668062084540243e-07, + "loss": 0.2184, "step": 1057500 }, { - "epoch": 10.78, - "learning_rate": 1.1773623330051987e-05, - "loss": 0.2625, + "epoch": 14.571105783803146, + "grad_norm": 2.05863356590271, + "learning_rate": 1.1594419522022865e-07, + "loss": 0.2326, "step": 1057600 }, { - "epoch": 10.78, - "learning_rate": 1.1768354810670234e-05, - "loss": 0.3092, + "epoch": 14.572483535862887, + "grad_norm": 1.6558059453964233, + "learning_rate": 1.1521009601068833e-07, + "loss": 0.2138, "step": 1057700 }, { - "epoch": 10.78, - "learning_rate": 1.1763087182707135e-05, - "loss": 0.2805, + "epoch": 14.573861287922625, + "grad_norm": 2.2125537395477295, + "learning_rate": 1.1447832327915048e-07, + "loss": 0.2411, "step": 1057800 }, { - "epoch": 10.78, - "learning_rate": 1.1757925772405894e-05, - "loss": 0.2903, + "epoch": 14.575239039982364, + "grad_norm": 1.699074149131775, + "learning_rate": 1.1374887708778565e-07, + "loss": 0.1932, "step": 1057900 }, { - "epoch": 10.78, - "learning_rate": 1.1752659910211517e-05, - "loss": 0.314, + "epoch": 14.576616792042104, + "grad_norm": 2.9789435863494873, + "learning_rate": 1.1302175749855676e-07, + "loss": 0.1811, "step": 1058000 }, { - "epoch": 10.78, - "learning_rate": 1.1747394940203177e-05, - "loss": 0.2116, + "epoch": 14.577994544101843, + "grad_norm": 2.0149011611938477, + "learning_rate": 1.1229696457324357e-07, + "loss": 0.1888, "step": 1058100 }, { - "epoch": 10.78, - "learning_rate": 1.1742130862638295e-05, - "loss": 0.3067, + "epoch": 14.579372296161583, + "grad_norm": 1.9105106592178345, + "learning_rate": 1.1157449837341516e-07, + "loss": 0.194, "step": 1058200 }, { - "epoch": 10.78, - "learning_rate": 1.173686767777426e-05, - "loss": 0.2975, + "epoch": 14.580750048221322, + "grad_norm": 1.6746035814285278, + "learning_rate": 1.1085435896045437e-07, + "loss": 0.2688, "step": 1058300 }, { - "epoch": 10.78, - "learning_rate": 1.1731605385868405e-05, - "loss": 0.2672, + "epoch": 14.58212780028106, + "grad_norm": 2.158827066421509, + "learning_rate": 1.1013654639553644e-07, + "loss": 0.2634, "step": 1058400 }, { - "epoch": 10.78, - "learning_rate": 1.1726343987178027e-05, - "loss": 0.3345, + "epoch": 14.583505552340801, + "grad_norm": 0.6203633546829224, + "learning_rate": 1.0942106073964425e-07, + "loss": 0.1886, "step": 1058500 }, { - "epoch": 10.79, - "learning_rate": 1.1721083481960385e-05, - "loss": 0.3027, + "epoch": 14.58488330440054, + "grad_norm": 5.5968217849731445, + "learning_rate": 1.0870790205356223e-07, + "loss": 0.1952, "step": 1058600 }, { - "epoch": 10.79, - "learning_rate": 1.171582387047267e-05, - "loss": 0.2517, + "epoch": 14.586261056460279, + "grad_norm": 0.3262626826763153, + "learning_rate": 1.0799707039787638e-07, + "loss": 0.2004, "step": 1058700 }, { - "epoch": 10.79, - "learning_rate": 1.1710565152972053e-05, - "loss": 0.2893, + "epoch": 14.587638808520019, + "grad_norm": 3.674304723739624, + "learning_rate": 1.072885658329742e-07, + "loss": 0.2524, "step": 1058800 }, { - "epoch": 10.79, - "learning_rate": 1.1705307329715666e-05, - "loss": 0.2599, + "epoch": 14.589016560579758, + "grad_norm": 1.5658506155014038, + "learning_rate": 1.0658238841905088e-07, + "loss": 0.2123, "step": 1058900 }, { - "epoch": 10.79, - "learning_rate": 1.1700050400960562e-05, - "loss": 0.2761, + "epoch": 14.590394312639498, + "grad_norm": 4.069824695587158, + "learning_rate": 1.0587853821609705e-07, + "loss": 0.2105, "step": 1059000 }, { - "epoch": 10.79, - "learning_rate": 1.1694794366963786e-05, - "loss": 0.2171, + "epoch": 14.591772064699237, + "grad_norm": 1.7818678617477417, + "learning_rate": 1.05177015283911e-07, + "loss": 0.1986, "step": 1059100 }, { - "epoch": 10.79, - "learning_rate": 1.1689539227982333e-05, - "loss": 0.3005, + "epoch": 14.593149816758975, + "grad_norm": 10.689050674438477, + "learning_rate": 1.0447781968208336e-07, + "loss": 0.1888, "step": 1059200 }, { - "epoch": 10.79, - "learning_rate": 1.1684284984273132e-05, - "loss": 0.2295, + "epoch": 14.594527568818716, + "grad_norm": 0.26548296213150024, + "learning_rate": 1.0378095147002164e-07, + "loss": 0.1931, "step": 1059300 }, { - "epoch": 10.79, - "learning_rate": 1.1679031636093094e-05, - "loss": 0.2274, + "epoch": 14.595905320878455, + "grad_norm": 1.739495873451233, + "learning_rate": 1.0308641070692875e-07, + "loss": 0.1732, "step": 1059400 }, { - "epoch": 10.79, - "learning_rate": 1.1673779183699072e-05, - "loss": 0.2898, + "epoch": 14.597283072938193, + "grad_norm": 1.6456894874572754, + "learning_rate": 1.0239419745180611e-07, + "loss": 0.1993, "step": 1059500 }, { - "epoch": 10.8, - "learning_rate": 1.1668527627347887e-05, - "loss": 0.2617, + "epoch": 14.598660824997934, + "grad_norm": 3.7128641605377197, + "learning_rate": 1.0170431176345974e-07, + "loss": 0.2487, "step": 1059600 }, { - "epoch": 10.8, - "learning_rate": 1.1663276967296303e-05, - "loss": 0.2826, + "epoch": 14.600038577057672, + "grad_norm": 2.0437982082366943, + "learning_rate": 1.0101675370050329e-07, + "loss": 0.1728, "step": 1059700 }, { - "epoch": 10.8, - "learning_rate": 1.1658027203801057e-05, - "loss": 0.269, + "epoch": 14.601416329117413, + "grad_norm": 5.03501033782959, + "learning_rate": 1.0033152332134282e-07, + "loss": 0.2589, "step": 1059800 }, { - "epoch": 10.8, - "learning_rate": 1.1652778337118813e-05, - "loss": 0.2225, + "epoch": 14.602794081177151, + "grad_norm": 1.3488279581069946, + "learning_rate": 9.964862068419512e-08, + "loss": 0.2037, "step": 1059900 }, { - "epoch": 10.8, - "learning_rate": 1.1647530367506214e-05, - "loss": 0.252, + "epoch": 14.60417183323689, + "grad_norm": 2.6241307258605957, + "learning_rate": 9.897484007264951e-08, + "loss": 0.1876, "step": 1060000 }, { - "epoch": 10.8, - "learning_rate": 1.164228329521986e-05, - "loss": 0.2755, + "epoch": 14.60554958529663, + "grad_norm": 1.7825360298156738, + "learning_rate": 9.829656981451524e-08, + "loss": 0.2385, "step": 1060100 }, { - "epoch": 10.8, - "learning_rate": 1.1637037120516312e-05, - "loss": 0.2118, + "epoch": 14.60692733735637, + "grad_norm": 1.813906192779541, + "learning_rate": 9.762062747127626e-08, + "loss": 0.2377, "step": 1060200 }, { - "epoch": 10.8, - "learning_rate": 1.1631791843652054e-05, - "loss": 0.3242, + "epoch": 14.608305089416108, + "grad_norm": 2.9254839420318604, + "learning_rate": 9.694701310035248e-08, + "loss": 0.1574, "step": 1060300 }, { - "epoch": 10.8, - "learning_rate": 1.1626547464883555e-05, - "loss": 0.2567, + "epoch": 14.609682841475848, + "grad_norm": 1.04872727394104, + "learning_rate": 9.627572675897145e-08, + "loss": 0.1972, "step": 1060400 }, { - "epoch": 10.8, - "learning_rate": 1.1621303984467251e-05, - "loss": 0.3587, + "epoch": 14.611060593535587, + "grad_norm": 0.01565225049853325, + "learning_rate": 9.560676850416534e-08, + "loss": 0.1872, "step": 1060500 }, { - "epoch": 10.81, - "learning_rate": 1.1616061402659494e-05, - "loss": 0.3384, + "epoch": 14.612438345595326, + "grad_norm": 1.5857715606689453, + "learning_rate": 9.494013839276173e-08, + "loss": 0.245, "step": 1060600 }, { - "epoch": 10.81, - "learning_rate": 1.1610819719716625e-05, - "loss": 0.259, + "epoch": 14.613816097655066, + "grad_norm": 12.168656349182129, + "learning_rate": 9.427583648140198e-08, + "loss": 0.1845, "step": 1060700 }, { - "epoch": 10.81, - "learning_rate": 1.1605578935894927e-05, - "loss": 0.3, + "epoch": 14.615193849714805, + "grad_norm": 4.860556125640869, + "learning_rate": 9.361386282651374e-08, + "loss": 0.1731, "step": 1060800 }, { - "epoch": 10.81, - "learning_rate": 1.1600339051450653e-05, - "loss": 0.3417, + "epoch": 14.616571601774545, + "grad_norm": 3.238278388977051, + "learning_rate": 9.295421748433536e-08, + "loss": 0.2143, "step": 1060900 }, { - "epoch": 10.81, - "learning_rate": 1.1595100066639994e-05, - "loss": 0.2429, + "epoch": 14.617949353834284, + "grad_norm": 2.28961443901062, + "learning_rate": 9.229690051091283e-08, + "loss": 0.1696, "step": 1061000 }, { - "epoch": 10.81, - "learning_rate": 1.1589861981719115e-05, - "loss": 0.269, + "epoch": 14.619327105894023, + "grad_norm": 1.147296667098999, + "learning_rate": 9.164191196208455e-08, + "loss": 0.218, "step": 1061100 }, { - "epoch": 10.81, - "learning_rate": 1.158462479694411e-05, - "loss": 0.265, + "epoch": 14.620704857953763, + "grad_norm": 0.628308117389679, + "learning_rate": 9.098925189349655e-08, + "loss": 0.1883, "step": 1061200 }, { - "epoch": 10.81, - "learning_rate": 1.1579388512571055e-05, - "loss": 0.2421, + "epoch": 14.622082610013502, + "grad_norm": 2.5557363033294678, + "learning_rate": 9.033892036059643e-08, + "loss": 0.1671, "step": 1061300 }, { - "epoch": 10.81, - "learning_rate": 1.1574153128855983e-05, - "loss": 0.2823, + "epoch": 14.623460362073242, + "grad_norm": 2.456542491912842, + "learning_rate": 8.969091741863028e-08, + "loss": 0.2625, "step": 1061400 }, { - "epoch": 10.81, - "learning_rate": 1.1568918646054854e-05, - "loss": 0.284, + "epoch": 14.62483811413298, + "grad_norm": 2.364161491394043, + "learning_rate": 8.904524312265184e-08, + "loss": 0.1918, "step": 1061500 }, { - "epoch": 10.82, - "learning_rate": 1.1563685064423615e-05, - "loss": 0.3135, + "epoch": 14.62621586619272, + "grad_norm": 1.5944273471832275, + "learning_rate": 8.840189752751027e-08, + "loss": 0.1826, "step": 1061600 }, { - "epoch": 10.82, - "learning_rate": 1.1558452384218158e-05, - "loss": 0.3103, + "epoch": 14.62759361825246, + "grad_norm": 1.6017801761627197, + "learning_rate": 8.776088068786852e-08, + "loss": 0.2363, "step": 1061700 }, { - "epoch": 10.82, - "learning_rate": 1.1553220605694322e-05, - "loss": 0.2525, + "epoch": 14.628971370312199, + "grad_norm": 2.457019329071045, + "learning_rate": 8.712219265817889e-08, + "loss": 0.2035, "step": 1061800 }, { - "epoch": 10.82, - "learning_rate": 1.1547989729107911e-05, - "loss": 0.2729, + "epoch": 14.630349122371937, + "grad_norm": 0.2548597455024719, + "learning_rate": 8.648583349270128e-08, + "loss": 0.2163, "step": 1061900 }, { - "epoch": 10.82, - "learning_rate": 1.1542759754714686e-05, - "loss": 0.266, + "epoch": 14.631726874431678, + "grad_norm": 1.0474525690078735, + "learning_rate": 8.585180324549718e-08, + "loss": 0.225, "step": 1062000 }, { - "epoch": 10.82, - "learning_rate": 1.1537530682770363e-05, - "loss": 0.251, + "epoch": 14.633104626491416, + "grad_norm": 0.396475225687027, + "learning_rate": 8.522010197043572e-08, + "loss": 0.2023, "step": 1062100 }, { - "epoch": 10.82, - "learning_rate": 1.1532302513530611e-05, - "loss": 0.2831, + "epoch": 14.634482378551155, + "grad_norm": 4.0248703956604, + "learning_rate": 8.459072972117843e-08, + "loss": 0.2102, "step": 1062200 }, { - "epoch": 10.82, - "learning_rate": 1.1527075247251057e-05, - "loss": 0.3287, + "epoch": 14.635860130610896, + "grad_norm": 4.017994403839111, + "learning_rate": 8.396368655119446e-08, + "loss": 0.1701, "step": 1062300 }, { - "epoch": 10.82, - "learning_rate": 1.152184888418729e-05, - "loss": 0.2754, + "epoch": 14.637237882670634, + "grad_norm": 4.064123153686523, + "learning_rate": 8.333897251375455e-08, + "loss": 0.1793, "step": 1062400 }, { - "epoch": 10.82, - "learning_rate": 1.1516623424594835e-05, - "loss": 0.3436, + "epoch": 14.638615634730375, + "grad_norm": 1.98574960231781, + "learning_rate": 8.271658766193402e-08, + "loss": 0.208, "step": 1062500 }, { - "epoch": 10.83, - "learning_rate": 1.1511398868729187e-05, - "loss": 0.2508, + "epoch": 14.639993386790113, + "grad_norm": 0.863537073135376, + "learning_rate": 8.209653204860057e-08, + "loss": 0.2334, "step": 1062600 }, { - "epoch": 10.83, - "learning_rate": 1.1506175216845815e-05, - "loss": 0.2733, + "epoch": 14.641371138849852, + "grad_norm": 0.8978185653686523, + "learning_rate": 8.147880572644178e-08, + "loss": 0.1951, "step": 1062700 }, { - "epoch": 10.83, - "learning_rate": 1.1500952469200097e-05, - "loss": 0.2791, + "epoch": 14.642748890909592, + "grad_norm": 1.044740080833435, + "learning_rate": 8.086340874792542e-08, + "loss": 0.2432, "step": 1062800 }, { - "epoch": 10.83, - "learning_rate": 1.1495730626047406e-05, - "loss": 0.2727, + "epoch": 14.644126642969331, + "grad_norm": 2.7601523399353027, + "learning_rate": 8.025034116533908e-08, + "loss": 0.2094, "step": 1062900 }, { - "epoch": 10.83, - "learning_rate": 1.1490509687643072e-05, - "loss": 0.2692, + "epoch": 14.64550439502907, + "grad_norm": 1.0072929859161377, + "learning_rate": 7.963960303076584e-08, + "loss": 0.1743, "step": 1063000 }, { - "epoch": 10.83, - "learning_rate": 1.1485289654242343e-05, - "loss": 0.2941, + "epoch": 14.64688214708881, + "grad_norm": 1.8595679998397827, + "learning_rate": 7.90311943960903e-08, + "loss": 0.2207, "step": 1063100 }, { - "epoch": 10.83, - "learning_rate": 1.1480070526100461e-05, - "loss": 0.2291, + "epoch": 14.648259899148549, + "grad_norm": 7.076990604400635, + "learning_rate": 7.842511531299862e-08, + "loss": 0.2471, "step": 1063200 }, { - "epoch": 10.83, - "learning_rate": 1.1474852303472607e-05, - "loss": 0.3021, + "epoch": 14.64963765120829, + "grad_norm": 1.6304078102111816, + "learning_rate": 7.782136583298155e-08, + "loss": 0.2289, "step": 1063300 }, { - "epoch": 10.83, - "learning_rate": 1.1469634986613927e-05, - "loss": 0.2798, + "epoch": 14.651015403268028, + "grad_norm": 2.103257417678833, + "learning_rate": 7.72199460073314e-08, + "loss": 0.1702, "step": 1063400 }, { - "epoch": 10.84, - "learning_rate": 1.146441857577951e-05, - "loss": 0.299, + "epoch": 14.652393155327767, + "grad_norm": 1.096548318862915, + "learning_rate": 7.662085588713896e-08, + "loss": 0.2147, "step": 1063500 }, { - "epoch": 10.84, - "learning_rate": 1.145920307122441e-05, - "loss": 0.2369, + "epoch": 14.653770907387507, + "grad_norm": 0.15656167268753052, + "learning_rate": 7.602409552330269e-08, + "loss": 0.1845, "step": 1063600 }, { - "epoch": 10.84, - "learning_rate": 1.1453988473203648e-05, - "loss": 0.243, + "epoch": 14.655148659447246, + "grad_norm": 1.7596184015274048, + "learning_rate": 7.542966496651954e-08, + "loss": 0.2288, "step": 1063700 }, { - "epoch": 10.84, - "learning_rate": 1.1448826914395034e-05, - "loss": 0.2869, + "epoch": 14.656526411506984, + "grad_norm": 1.2369574308395386, + "learning_rate": 7.4837564267288e-08, + "loss": 0.19, "step": 1063800 }, { - "epoch": 10.84, - "learning_rate": 1.1443614121136047e-05, - "loss": 0.275, + "epoch": 14.657904163566725, + "grad_norm": 1.3500478267669678, + "learning_rate": 7.42477934759142e-08, + "loss": 0.2014, "step": 1063900 }, { - "epoch": 10.84, - "learning_rate": 1.1438402235173591e-05, - "loss": 0.2783, + "epoch": 14.659281915626464, + "grad_norm": 1.9070484638214111, + "learning_rate": 7.36603526424967e-08, + "loss": 0.2189, "step": 1064000 }, { - "epoch": 10.84, - "learning_rate": 1.1433191256762498e-05, - "loss": 0.3269, + "epoch": 14.660659667686204, + "grad_norm": 2.2372629642486572, + "learning_rate": 7.30752418169478e-08, + "loss": 0.2175, "step": 1064100 }, { - "epoch": 10.84, - "learning_rate": 1.1427981186157556e-05, - "loss": 0.2247, + "epoch": 14.662037419745943, + "grad_norm": 2.9730851650238037, + "learning_rate": 7.249246104896912e-08, + "loss": 0.1969, "step": 1064200 }, { - "epoch": 10.84, - "learning_rate": 1.1422772023613497e-05, - "loss": 0.2494, + "epoch": 14.663415171805681, + "grad_norm": 9.116437911987305, + "learning_rate": 7.191201038807604e-08, + "loss": 0.1958, "step": 1064300 }, { - "epoch": 10.84, - "learning_rate": 1.1417563769385021e-05, - "loss": 0.3307, + "epoch": 14.664792923865422, + "grad_norm": 0.8251709938049316, + "learning_rate": 7.133388988357942e-08, + "loss": 0.2244, "step": 1064400 }, { - "epoch": 10.85, - "learning_rate": 1.1412408492685114e-05, - "loss": 0.2266, + "epoch": 14.66617067592516, + "grad_norm": 0.7052600979804993, + "learning_rate": 7.075809958459161e-08, + "loss": 0.1703, "step": 1064500 }, { - "epoch": 10.85, - "learning_rate": 1.140725410672193e-05, - "loss": 0.2585, + "epoch": 14.667548427984899, + "grad_norm": 2.356926679611206, + "learning_rate": 7.018463954002963e-08, + "loss": 0.1958, "step": 1064600 }, { - "epoch": 10.85, - "learning_rate": 1.1402048560783865e-05, - "loss": 0.2668, + "epoch": 14.66892618004464, + "grad_norm": 2.4160759449005127, + "learning_rate": 6.961350979861503e-08, + "loss": 0.2274, "step": 1064700 }, { - "epoch": 10.85, - "learning_rate": 1.1396843924174647e-05, - "loss": 0.271, + "epoch": 14.670303932104378, + "grad_norm": 3.823357343673706, + "learning_rate": 6.904471040886484e-08, + "loss": 0.1798, "step": 1064800 }, { - "epoch": 10.85, - "learning_rate": 1.1391640197148735e-05, - "loss": 0.2935, + "epoch": 14.671681684164117, + "grad_norm": 7.0631303787231445, + "learning_rate": 6.847824141910069e-08, + "loss": 0.2236, "step": 1064900 }, { - "epoch": 10.85, - "learning_rate": 1.1386437379960565e-05, - "loss": 0.2748, + "epoch": 14.673059436223857, + "grad_norm": 0.5727594494819641, + "learning_rate": 6.791410287744881e-08, + "loss": 0.2236, "step": 1065000 }, { - "epoch": 10.85, - "learning_rate": 1.138123547286454e-05, - "loss": 0.1948, + "epoch": 14.674437188283596, + "grad_norm": 1.1596064567565918, + "learning_rate": 6.735229483183697e-08, + "loss": 0.1897, "step": 1065100 }, { - "epoch": 10.85, - "learning_rate": 1.1376034476114979e-05, - "loss": 0.3112, + "epoch": 14.675814940343336, + "grad_norm": 3.9859416484832764, + "learning_rate": 6.679281732999143e-08, + "loss": 0.2029, "step": 1065200 }, { - "epoch": 10.85, - "learning_rate": 1.137083438996619e-05, - "loss": 0.2922, + "epoch": 14.677192692403075, + "grad_norm": 0.9898383021354675, + "learning_rate": 6.624123035196606e-08, + "loss": 0.2259, "step": 1065300 }, { - "epoch": 10.85, - "learning_rate": 1.1365635214672426e-05, - "loss": 0.2692, + "epoch": 14.678570444462814, + "grad_norm": 1.6187522411346436, + "learning_rate": 6.568639077342897e-08, + "loss": 0.2377, "step": 1065400 }, { - "epoch": 10.86, - "learning_rate": 1.136043695048791e-05, - "loss": 0.333, + "epoch": 14.679948196522554, + "grad_norm": 1.5964347124099731, + "learning_rate": 6.513388188018526e-08, + "loss": 0.2121, "step": 1065500 }, { - "epoch": 10.86, - "learning_rate": 1.1355239597666776e-05, - "loss": 0.2755, + "epoch": 14.681325948582293, + "grad_norm": 0.2603021264076233, + "learning_rate": 6.45837037191735e-08, + "loss": 0.1998, "step": 1065600 }, { - "epoch": 10.86, - "learning_rate": 1.1350043156463163e-05, - "loss": 0.2815, + "epoch": 14.682703700642033, + "grad_norm": 1.0766738653182983, + "learning_rate": 6.403585633713682e-08, + "loss": 0.2192, "step": 1065700 }, { - "epoch": 10.86, - "learning_rate": 1.1344847627131141e-05, - "loss": 0.2943, + "epoch": 14.684081452701772, + "grad_norm": 1.3574103116989136, + "learning_rate": 6.349033978061386e-08, + "loss": 0.2341, "step": 1065800 }, { - "epoch": 10.86, - "learning_rate": 1.1339653009924745e-05, - "loss": 0.2631, + "epoch": 14.68545920476151, + "grad_norm": 2.3115594387054443, + "learning_rate": 6.294715409595086e-08, + "loss": 0.204, "step": 1065900 }, { - "epoch": 10.86, - "learning_rate": 1.1334459305097953e-05, - "loss": 0.3429, + "epoch": 14.686836956821251, + "grad_norm": 1.500071406364441, + "learning_rate": 6.240629932929259e-08, + "loss": 0.2029, "step": 1066000 }, { - "epoch": 10.86, - "learning_rate": 1.1329266512904723e-05, - "loss": 0.2423, + "epoch": 14.68821470888099, + "grad_norm": 4.5576324462890625, + "learning_rate": 6.186777552659145e-08, + "loss": 0.1847, "step": 1066100 }, { - "epoch": 10.86, - "learning_rate": 1.132407463359893e-05, - "loss": 0.2732, + "epoch": 14.689592460940728, + "grad_norm": 2.3135814666748047, + "learning_rate": 6.133693312287581e-08, + "loss": 0.1796, "step": 1066200 }, { - "epoch": 10.86, - "learning_rate": 1.1318883667434435e-05, - "loss": 0.262, + "epoch": 14.690970213000469, + "grad_norm": 1.7553790807724, + "learning_rate": 6.080304807436043e-08, + "loss": 0.1912, "step": 1066300 }, { - "epoch": 10.86, - "learning_rate": 1.1313693614665058e-05, - "loss": 0.2419, + "epoch": 14.692347965060208, + "grad_norm": 1.0685458183288574, + "learning_rate": 6.027149412600158e-08, + "loss": 0.1607, "step": 1066400 }, { - "epoch": 10.87, - "learning_rate": 1.1308504475544535e-05, - "loss": 0.2457, + "epoch": 14.693725717119946, + "grad_norm": 0.9714058041572571, + "learning_rate": 5.97422713229609e-08, + "loss": 0.2066, "step": 1066500 }, { - "epoch": 10.87, - "learning_rate": 1.1303316250326597e-05, - "loss": 0.2388, + "epoch": 14.695103469179687, + "grad_norm": 2.1571431159973145, + "learning_rate": 5.921537971019852e-08, + "loss": 0.1889, "step": 1066600 }, { - "epoch": 10.87, - "learning_rate": 1.1298128939264933e-05, - "loss": 0.332, + "epoch": 14.696481221239425, + "grad_norm": 0.6031253337860107, + "learning_rate": 5.869081933247311e-08, + "loss": 0.2272, "step": 1066700 }, { - "epoch": 10.87, - "learning_rate": 1.1292942542613145e-05, - "loss": 0.2874, + "epoch": 14.697858973299166, + "grad_norm": 3.8849663734436035, + "learning_rate": 5.8168590234354003e-08, + "loss": 0.2067, "step": 1066800 }, { - "epoch": 10.87, - "learning_rate": 1.128775706062483e-05, - "loss": 0.2773, + "epoch": 14.699236725358904, + "grad_norm": 2.919374465942383, + "learning_rate": 5.7648692460202936e-08, + "loss": 0.2238, "step": 1066900 }, { - "epoch": 10.87, - "learning_rate": 1.1282572493553523e-05, - "loss": 0.2743, + "epoch": 14.700614477418643, + "grad_norm": 3.165018320083618, + "learning_rate": 5.71311260541893e-08, + "loss": 0.1995, "step": 1067000 }, { - "epoch": 10.87, - "learning_rate": 1.1277388841652723e-05, - "loss": 0.308, + "epoch": 14.701992229478384, + "grad_norm": 3.9404499530792236, + "learning_rate": 5.6615891060284034e-08, + "loss": 0.2541, "step": 1067100 }, { - "epoch": 10.87, - "learning_rate": 1.1272206105175878e-05, - "loss": 0.2355, + "epoch": 14.703369981538122, + "grad_norm": 9.033873558044434, + "learning_rate": 5.610298752225962e-08, + "loss": 0.2389, "step": 1067200 }, { - "epoch": 10.87, - "learning_rate": 1.1267024284376395e-05, - "loss": 0.2861, + "epoch": 14.704747733597861, + "grad_norm": 4.983729839324951, + "learning_rate": 5.559241548368704e-08, + "loss": 0.205, "step": 1067300 }, { - "epoch": 10.87, - "learning_rate": 1.1261843379507637e-05, - "loss": 0.2815, + "epoch": 14.706125485657601, + "grad_norm": 1.9892619848251343, + "learning_rate": 5.508417498794493e-08, + "loss": 0.1708, "step": 1067400 }, { - "epoch": 10.88, - "learning_rate": 1.1256663390822908e-05, - "loss": 0.26, + "epoch": 14.70750323771734, + "grad_norm": 2.351402759552002, + "learning_rate": 5.457826607820737e-08, + "loss": 0.222, "step": 1067500 }, { - "epoch": 10.88, - "learning_rate": 1.1251484318575485e-05, - "loss": 0.2703, + "epoch": 14.70888098977708, + "grad_norm": 4.581531524658203, + "learning_rate": 5.407468879745608e-08, + "loss": 0.2354, "step": 1067600 }, { - "epoch": 10.88, - "learning_rate": 1.1246306163018602e-05, - "loss": 0.2284, + "epoch": 14.71025874183682, + "grad_norm": 1.7210137844085693, + "learning_rate": 5.3573443188474345e-08, + "loss": 0.2035, "step": 1067700 }, { - "epoch": 10.88, - "learning_rate": 1.1241128924405422e-05, - "loss": 0.2831, + "epoch": 14.711636493896558, + "grad_norm": 1.5514737367630005, + "learning_rate": 5.307452929384393e-08, + "loss": 0.2351, "step": 1067800 }, { - "epoch": 10.88, - "learning_rate": 1.1235952602989088e-05, - "loss": 0.3114, + "epoch": 14.713014245956298, + "grad_norm": 0.25403597950935364, + "learning_rate": 5.257794715594816e-08, + "loss": 0.2048, "step": 1067900 }, { - "epoch": 10.88, - "learning_rate": 1.1230777199022708e-05, - "loss": 0.2777, + "epoch": 14.714391998016037, + "grad_norm": 2.524069309234619, + "learning_rate": 5.2083696816974945e-08, + "loss": 0.2079, "step": 1068000 }, { - "epoch": 10.88, - "learning_rate": 1.1225602712759301e-05, - "loss": 0.2936, + "epoch": 14.715769750075776, + "grad_norm": 3.748978853225708, + "learning_rate": 5.1591778318913766e-08, + "loss": 0.2187, "step": 1068100 }, { - "epoch": 10.88, - "learning_rate": 1.1220429144451885e-05, - "loss": 0.2651, + "epoch": 14.717147502135516, + "grad_norm": 3.0733766555786133, + "learning_rate": 5.1102191703558676e-08, + "loss": 0.2276, "step": 1068200 }, { - "epoch": 10.88, - "learning_rate": 1.1215256494353411e-05, - "loss": 0.3062, + "epoch": 14.718525254195255, + "grad_norm": 0.0589502677321434, + "learning_rate": 5.06149370124992e-08, + "loss": 0.1545, "step": 1068300 }, { - "epoch": 10.89, - "learning_rate": 1.121008476271679e-05, - "loss": 0.2919, + "epoch": 14.719903006254995, + "grad_norm": 0.9088813066482544, + "learning_rate": 5.0130014287129436e-08, + "loss": 0.2433, "step": 1068400 }, { - "epoch": 10.89, - "learning_rate": 1.1204913949794897e-05, - "loss": 0.2939, + "epoch": 14.721280758314734, + "grad_norm": 1.7251689434051514, + "learning_rate": 4.964742356865115e-08, + "loss": 0.2171, "step": 1068500 }, { - "epoch": 10.89, - "learning_rate": 1.1199744055840556e-05, - "loss": 0.2576, + "epoch": 14.722658510374472, + "grad_norm": 0.9921254515647888, + "learning_rate": 4.916716489805545e-08, + "loss": 0.1837, "step": 1068600 }, { - "epoch": 10.89, - "learning_rate": 1.1194575081106525e-05, - "loss": 0.2934, + "epoch": 14.724036262434213, + "grad_norm": 1.2840731143951416, + "learning_rate": 4.868923831614719e-08, + "loss": 0.2241, "step": 1068700 }, { - "epoch": 10.89, - "learning_rate": 1.1189458701845944e-05, - "loss": 0.3137, + "epoch": 14.725414014493952, + "grad_norm": 1.6657763719558716, + "learning_rate": 4.821364386352972e-08, + "loss": 0.2346, "step": 1068800 }, { - "epoch": 10.89, - "learning_rate": 1.11842915571122e-05, - "loss": 0.2576, + "epoch": 14.72679176655369, + "grad_norm": 1.3147339820861816, + "learning_rate": 4.7740381580604896e-08, + "loss": 0.2284, "step": 1068900 }, { - "epoch": 10.89, - "learning_rate": 1.1179125332354317e-05, - "loss": 0.3034, + "epoch": 14.72816951861343, + "grad_norm": 24.405963897705078, + "learning_rate": 4.7269451507579166e-08, + "loss": 0.2302, "step": 1069000 }, { - "epoch": 10.89, - "learning_rate": 1.1173960027824892e-05, - "loss": 0.2962, + "epoch": 14.72954727067317, + "grad_norm": 0.21909376978874207, + "learning_rate": 4.6800853684460523e-08, + "loss": 0.1833, "step": 1069100 }, { - "epoch": 10.89, - "learning_rate": 1.116879564377647e-05, - "loss": 0.2792, + "epoch": 14.730925022732908, + "grad_norm": 1.447287678718567, + "learning_rate": 4.6339239261428966e-08, + "loss": 0.2394, "step": 1069200 }, { - "epoch": 10.89, - "learning_rate": 1.1163632180461555e-05, - "loss": 0.2989, + "epoch": 14.732302774792648, + "grad_norm": 0.07940816879272461, + "learning_rate": 4.587528273386793e-08, + "loss": 0.1938, "step": 1069300 }, { - "epoch": 10.9, - "learning_rate": 1.1158469638132634e-05, - "loss": 0.308, + "epoch": 14.733680526852387, + "grad_norm": 0.9355390071868896, + "learning_rate": 4.5413658574656456e-08, + "loss": 0.2453, "step": 1069400 }, { - "epoch": 10.9, - "learning_rate": 1.1153308017042093e-05, - "loss": 0.2874, + "epoch": 14.735058278912128, + "grad_norm": 4.140065670013428, + "learning_rate": 4.495436682301179e-08, + "loss": 0.213, "step": 1069500 }, { - "epoch": 10.9, - "learning_rate": 1.114814731744232e-05, - "loss": 0.2469, + "epoch": 14.736436030971866, + "grad_norm": 2.872631311416626, + "learning_rate": 4.4497407517952725e-08, + "loss": 0.2331, "step": 1069600 }, { - "epoch": 10.9, - "learning_rate": 1.1142987539585638e-05, - "loss": 0.2508, + "epoch": 14.737813783031605, + "grad_norm": 2.5901694297790527, + "learning_rate": 4.404278069829654e-08, + "loss": 0.2099, "step": 1069700 }, { - "epoch": 10.9, - "learning_rate": 1.1137828683724336e-05, - "loss": 0.3061, + "epoch": 14.739191535091345, + "grad_norm": 1.375916600227356, + "learning_rate": 4.3590486402674266e-08, + "loss": 0.2176, "step": 1069800 }, { - "epoch": 10.9, - "learning_rate": 1.1132670750110647e-05, - "loss": 0.2286, + "epoch": 14.740569287151084, + "grad_norm": 0.6927363276481628, + "learning_rate": 4.314052466950325e-08, + "loss": 0.1963, "step": 1069900 }, { - "epoch": 10.9, - "learning_rate": 1.1127513738996767e-05, - "loss": 0.2077, + "epoch": 14.741947039210824, + "grad_norm": 1.8821014165878296, + "learning_rate": 4.2692895537014554e-08, + "loss": 0.1848, "step": 1070000 }, { - "epoch": 10.9, - "learning_rate": 1.1122357650634846e-05, - "loss": 0.2853, + "epoch": 14.743324791270563, + "grad_norm": 2.1611859798431396, + "learning_rate": 4.224759904323472e-08, + "loss": 0.2128, "step": 1070100 }, { - "epoch": 10.9, - "learning_rate": 1.1117202485276971e-05, - "loss": 0.2585, + "epoch": 14.744702543330302, + "grad_norm": 1.9220291376113892, + "learning_rate": 4.180463522599487e-08, + "loss": 0.1986, "step": 1070200 }, { - "epoch": 10.9, - "learning_rate": 1.111204824317521e-05, - "loss": 0.3103, + "epoch": 14.746080295390042, + "grad_norm": 1.5371798276901245, + "learning_rate": 4.136400412292768e-08, + "loss": 0.1789, "step": 1070300 }, { - "epoch": 10.91, - "learning_rate": 1.110689492458158e-05, - "loss": 0.3086, + "epoch": 14.747458047449781, + "grad_norm": 0.46057751774787903, + "learning_rate": 4.093007720773368e-08, + "loss": 0.1826, "step": 1070400 }, { - "epoch": 10.91, - "learning_rate": 1.1101742529748031e-05, - "loss": 0.3598, + "epoch": 14.74883579950952, + "grad_norm": 1.2110824584960938, + "learning_rate": 4.0494088317040435e-08, + "loss": 0.2037, "step": 1070500 }, { - "epoch": 10.91, - "learning_rate": 1.1096591058926493e-05, - "loss": 0.3084, + "epoch": 14.75021355156926, + "grad_norm": 3.3303277492523193, + "learning_rate": 4.006043225185469e-08, + "loss": 0.2121, "step": 1070600 }, { - "epoch": 10.91, - "learning_rate": 1.1091440512368854e-05, - "loss": 0.311, + "epoch": 14.751591303628999, + "grad_norm": 1.4010446071624756, + "learning_rate": 3.962910904902139e-08, + "loss": 0.1754, "step": 1070700 }, { - "epoch": 10.91, - "learning_rate": 1.108629089032692e-05, - "loss": 0.33, + "epoch": 14.752969055688737, + "grad_norm": 1.2539904117584229, + "learning_rate": 3.920011874517793e-08, + "loss": 0.1949, "step": 1070800 }, { - "epoch": 10.91, - "learning_rate": 1.1081142193052495e-05, - "loss": 0.2534, + "epoch": 14.754346807748478, + "grad_norm": 1.4284390211105347, + "learning_rate": 3.87734613767754e-08, + "loss": 0.1975, "step": 1070900 }, { - "epoch": 10.91, - "learning_rate": 1.107599442079731e-05, - "loss": 0.2796, + "epoch": 14.755724559808217, + "grad_norm": 1.6033798456192017, + "learning_rate": 3.834913698005732e-08, + "loss": 0.2314, "step": 1071000 }, { - "epoch": 10.91, - "learning_rate": 1.1070847573813067e-05, - "loss": 0.2555, + "epoch": 14.757102311867957, + "grad_norm": 0.05855090916156769, + "learning_rate": 3.79271455910718e-08, + "loss": 0.1851, "step": 1071100 }, { - "epoch": 10.91, - "learning_rate": 1.1065701652351417e-05, - "loss": 0.2869, + "epoch": 14.758480063927696, + "grad_norm": 1.4979583024978638, + "learning_rate": 3.750748724567154e-08, + "loss": 0.179, "step": 1071200 }, { - "epoch": 10.91, - "learning_rate": 1.1060556656663961e-05, - "loss": 0.2588, + "epoch": 14.759857815987434, + "grad_norm": 0.3084957003593445, + "learning_rate": 3.709016197950777e-08, + "loss": 0.2041, "step": 1071300 }, { - "epoch": 10.92, - "learning_rate": 1.1055412587002268e-05, - "loss": 0.2879, + "epoch": 14.761235568047175, + "grad_norm": 3.2187559604644775, + "learning_rate": 3.6675169828033204e-08, + "loss": 0.1783, "step": 1071400 }, { - "epoch": 10.92, - "learning_rate": 1.1050269443617836e-05, - "loss": 0.3026, + "epoch": 14.762613320106913, + "grad_norm": 0.8990810513496399, + "learning_rate": 3.6262510826505224e-08, + "loss": 0.2162, "step": 1071500 }, { - "epoch": 10.92, - "learning_rate": 1.1045127226762142e-05, - "loss": 0.2603, + "epoch": 14.763991072166652, + "grad_norm": 0.5769587159156799, + "learning_rate": 3.585218500997967e-08, + "loss": 0.1756, "step": 1071600 }, { - "epoch": 10.92, - "learning_rate": 1.103998593668662e-05, - "loss": 0.3057, + "epoch": 14.765368824226393, + "grad_norm": 2.4321980476379395, + "learning_rate": 3.544419241331698e-08, + "loss": 0.1889, "step": 1071700 }, { - "epoch": 10.92, - "learning_rate": 1.103489697268344e-05, - "loss": 0.2814, + "epoch": 14.766746576286131, + "grad_norm": 2.0992283821105957, + "learning_rate": 3.5038533071176106e-08, + "loss": 0.1913, "step": 1071800 }, { - "epoch": 10.92, - "learning_rate": 1.1029757527648247e-05, - "loss": 0.2707, + "epoch": 14.768124328345872, + "grad_norm": 3.6120078563690186, + "learning_rate": 3.463520701802364e-08, + "loss": 0.2296, "step": 1071900 }, { - "epoch": 10.92, - "learning_rate": 1.1024619010144705e-05, - "loss": 0.2808, + "epoch": 14.76950208040561, + "grad_norm": 2.59912109375, + "learning_rate": 3.423421428812162e-08, + "loss": 0.2362, "step": 1072000 }, { - "epoch": 10.92, - "learning_rate": 1.1019481420424051e-05, - "loss": 0.3063, + "epoch": 14.770879832465349, + "grad_norm": 2.28425669670105, + "learning_rate": 3.3835554915536693e-08, + "loss": 0.2052, "step": 1072100 }, { - "epoch": 10.92, - "learning_rate": 1.1014344758737499e-05, - "loss": 0.2732, + "epoch": 14.77225758452509, + "grad_norm": 7.000552177429199, + "learning_rate": 3.343922893414009e-08, + "loss": 0.2059, "step": 1072200 }, { - "epoch": 10.92, - "learning_rate": 1.1009209025336166e-05, - "loss": 0.2387, + "epoch": 14.773635336584828, + "grad_norm": 11.770084381103516, + "learning_rate": 3.3045236377595434e-08, + "loss": 0.2147, "step": 1072300 }, { - "epoch": 10.93, - "learning_rate": 1.100407422047119e-05, - "loss": 0.2773, + "epoch": 14.775013088644567, + "grad_norm": 1.35153329372406, + "learning_rate": 3.265357727938012e-08, + "loss": 0.2329, "step": 1072400 }, { - "epoch": 10.93, - "learning_rate": 1.0998940344393634e-05, - "loss": 0.2122, + "epoch": 14.776390840704307, + "grad_norm": 1.5613043308258057, + "learning_rate": 3.226425167276392e-08, + "loss": 0.2304, "step": 1072500 }, { - "epoch": 10.93, - "learning_rate": 1.0993807397354493e-05, - "loss": 0.263, + "epoch": 14.777768592764046, + "grad_norm": 2.1376333236694336, + "learning_rate": 3.187725959082427e-08, + "loss": 0.2047, "step": 1072600 }, { - "epoch": 10.93, - "learning_rate": 1.0988675379604748e-05, - "loss": 0.2922, + "epoch": 14.779146344823786, + "grad_norm": 3.5426976680755615, + "learning_rate": 3.149260106643709e-08, + "loss": 0.2647, "step": 1072700 }, { - "epoch": 10.93, - "learning_rate": 1.098354429139533e-05, - "loss": 0.2611, + "epoch": 14.780524096883525, + "grad_norm": 2.5625240802764893, + "learning_rate": 3.11102761322829e-08, + "loss": 0.199, "step": 1072800 }, { - "epoch": 10.93, - "learning_rate": 1.0978414132977105e-05, - "loss": 0.3001, + "epoch": 14.781901848943264, + "grad_norm": 2.60762095451355, + "learning_rate": 3.0730284820837675e-08, + "loss": 0.2011, "step": 1072900 }, { - "epoch": 10.93, - "learning_rate": 1.0973284904600911e-05, + "epoch": 14.783279601003004, + "grad_norm": 2.644148349761963, + "learning_rate": 3.035262716438808e-08, "loss": 0.2481, "step": 1073000 }, { - "epoch": 10.93, - "learning_rate": 1.0968156606517555e-05, - "loss": 0.3033, + "epoch": 14.784657353062743, + "grad_norm": 7.524131774902344, + "learning_rate": 2.997730319501624e-08, + "loss": 0.2187, "step": 1073100 }, { - "epoch": 10.93, - "learning_rate": 1.0963029238977752e-05, - "loss": 0.3337, + "epoch": 14.786035105122481, + "grad_norm": 2.492258310317993, + "learning_rate": 2.9604312944608858e-08, + "loss": 0.1927, "step": 1073200 }, { - "epoch": 10.93, - "learning_rate": 1.0957902802232212e-05, - "loss": 0.3331, + "epoch": 14.787412857182222, + "grad_norm": 2.18353009223938, + "learning_rate": 2.9233656444854208e-08, + "loss": 0.1852, "step": 1073300 }, { - "epoch": 10.94, - "learning_rate": 1.0952777296531596e-05, - "loss": 0.2766, + "epoch": 14.78879060924196, + "grad_norm": 0.9161109328269958, + "learning_rate": 2.886533372723904e-08, + "loss": 0.2131, "step": 1073400 }, { - "epoch": 10.94, - "learning_rate": 1.0947652722126493e-05, - "loss": 0.2874, + "epoch": 14.7901683613017, + "grad_norm": 2.2895407676696777, + "learning_rate": 2.8499344823051664e-08, + "loss": 0.2557, "step": 1073500 }, { - "epoch": 10.94, - "learning_rate": 1.0942529079267464e-05, - "loss": 0.3068, + "epoch": 14.79154611336144, + "grad_norm": 0.7073013186454773, + "learning_rate": 2.8135689763394146e-08, + "loss": 0.2226, "step": 1073600 }, { - "epoch": 10.94, - "learning_rate": 1.0937406368205044e-05, - "loss": 0.221, + "epoch": 14.792923865421178, + "grad_norm": 1.5413271188735962, + "learning_rate": 2.7774368579148725e-08, + "loss": 0.2377, "step": 1073700 }, { - "epoch": 10.94, - "learning_rate": 1.0932284589189706e-05, - "loss": 0.243, + "epoch": 14.794301617480919, + "grad_norm": 0.024904552847146988, + "learning_rate": 2.741538130102056e-08, + "loss": 0.2036, "step": 1073800 }, { - "epoch": 10.94, - "learning_rate": 1.0927163742471847e-05, - "loss": 0.2689, + "epoch": 14.795679369540657, + "grad_norm": 4.637572765350342, + "learning_rate": 2.7058727959507203e-08, + "loss": 0.188, "step": 1073900 }, { - "epoch": 10.94, - "learning_rate": 1.0922043828301861e-05, - "loss": 0.3436, + "epoch": 14.797057121600396, + "grad_norm": 0.07714895904064178, + "learning_rate": 2.670440858490164e-08, + "loss": 0.2196, "step": 1074000 }, { - "epoch": 10.94, - "learning_rate": 1.0916924846930093e-05, - "loss": 0.3128, + "epoch": 14.798434873660137, + "grad_norm": 2.255117416381836, + "learning_rate": 2.6352423207310628e-08, + "loss": 0.2038, "step": 1074100 }, { - "epoch": 10.94, - "learning_rate": 1.0911806798606807e-05, - "loss": 0.2527, + "epoch": 14.799812625719875, + "grad_norm": 7.185609817504883, + "learning_rate": 2.6002771856633302e-08, + "loss": 0.2429, "step": 1074200 }, { - "epoch": 10.95, - "learning_rate": 1.0906689683582256e-05, - "loss": 0.2803, + "epoch": 14.801190377779616, + "grad_norm": 3.932537794113159, + "learning_rate": 2.5655454562579507e-08, + "loss": 0.2382, "step": 1074300 }, { - "epoch": 10.95, - "learning_rate": 1.0901573502106647e-05, - "loss": 0.2724, + "epoch": 14.802568129839354, + "grad_norm": 5.433434009552002, + "learning_rate": 2.5310471354651478e-08, + "loss": 0.2214, "step": 1074400 }, { - "epoch": 10.95, - "learning_rate": 1.0896458254430108e-05, - "loss": 0.2911, + "epoch": 14.803945881899093, + "grad_norm": 1.2009291648864746, + "learning_rate": 2.49678222621591e-08, + "loss": 0.2264, "step": 1074500 }, { - "epoch": 10.95, - "learning_rate": 1.0891343940802758e-05, - "loss": 0.2565, + "epoch": 14.805323633958833, + "grad_norm": 2.134117364883423, + "learning_rate": 2.4627507314210762e-08, + "loss": 0.2095, "step": 1074600 }, { - "epoch": 10.95, - "learning_rate": 1.0886230561474666e-05, - "loss": 0.2822, + "epoch": 14.806701386018572, + "grad_norm": 0.10432631522417068, + "learning_rate": 2.4289526539719444e-08, + "loss": 0.2169, "step": 1074700 }, { - "epoch": 10.95, - "learning_rate": 1.0881118116695811e-05, - "loss": 0.2482, + "epoch": 14.80807913807831, + "grad_norm": 0.342690110206604, + "learning_rate": 2.395387996739662e-08, + "loss": 0.2105, "step": 1074800 }, { - "epoch": 10.95, - "learning_rate": 1.0876006606716199e-05, - "loss": 0.2617, + "epoch": 14.809456890138051, + "grad_norm": 1.82905113697052, + "learning_rate": 2.3620567625758373e-08, + "loss": 0.1896, "step": 1074900 }, { - "epoch": 10.95, - "learning_rate": 1.0870947132905734e-05, - "loss": 0.2806, + "epoch": 14.81083464219779, + "grad_norm": 3.019597053527832, + "learning_rate": 2.3289589543122324e-08, + "loss": 0.1681, "step": 1075000 }, { - "epoch": 10.95, - "learning_rate": 1.086583748392007e-05, - "loss": 0.2568, + "epoch": 14.812212394257529, + "grad_norm": 2.151141405105591, + "learning_rate": 2.2960945747604602e-08, + "loss": 0.1567, "step": 1075100 }, { - "epoch": 10.95, - "learning_rate": 1.0860728770480775e-05, - "loss": 0.2325, + "epoch": 14.813590146317269, + "grad_norm": 5.149314880371094, + "learning_rate": 2.263463626712592e-08, + "loss": 0.2242, "step": 1075200 }, { - "epoch": 10.96, - "learning_rate": 1.0855620992837619e-05, - "loss": 0.2636, + "epoch": 14.814967898377008, + "grad_norm": 1.173034906387329, + "learning_rate": 2.231066112940855e-08, + "loss": 0.2111, "step": 1075300 }, { - "epoch": 10.96, - "learning_rate": 1.085051415124035e-05, - "loss": 0.26, + "epoch": 14.816345650436748, + "grad_norm": 6.555375576019287, + "learning_rate": 2.1989020361973256e-08, + "loss": 0.2075, "step": 1075400 }, { - "epoch": 10.96, - "learning_rate": 1.0845408245938669e-05, - "loss": 0.2873, + "epoch": 14.817723402496487, + "grad_norm": 1.99422287940979, + "learning_rate": 2.1669713992151508e-08, + "loss": 0.2053, "step": 1075500 }, { - "epoch": 10.96, - "learning_rate": 1.0840303277182206e-05, - "loss": 0.2296, + "epoch": 14.819101154556225, + "grad_norm": 3.53043270111084, + "learning_rate": 2.1352742047061058e-08, + "loss": 0.1721, "step": 1075600 }, { - "epoch": 10.96, - "learning_rate": 1.0835199245220575e-05, - "loss": 0.2318, + "epoch": 14.820478906615966, + "grad_norm": 0.13173238933086395, + "learning_rate": 2.1038104553639524e-08, + "loss": 0.1758, "step": 1075700 }, { - "epoch": 10.96, - "learning_rate": 1.0830096150303339e-05, - "loss": 0.205, + "epoch": 14.821856658675705, + "grad_norm": 1.1103326082229614, + "learning_rate": 2.072580153860776e-08, + "loss": 0.2443, "step": 1075800 }, { - "epoch": 10.96, - "learning_rate": 1.0824993992679993e-05, - "loss": 0.2774, + "epoch": 14.823234410735443, + "grad_norm": 1.9461760520935059, + "learning_rate": 2.041583302850647e-08, + "loss": 0.2006, "step": 1075900 }, { - "epoch": 10.96, - "learning_rate": 1.0819892772600011e-05, - "loss": 0.3017, + "epoch": 14.824612162795184, + "grad_norm": 2.5116820335388184, + "learning_rate": 2.0108199049662658e-08, + "loss": 0.2304, "step": 1076000 }, { - "epoch": 10.96, - "learning_rate": 1.081479249031281e-05, - "loss": 0.3272, + "epoch": 14.825989914854922, + "grad_norm": 1.13321852684021, + "learning_rate": 1.9802899628214022e-08, + "loss": 0.2242, "step": 1076100 }, { - "epoch": 10.96, - "learning_rate": 1.0809693146067766e-05, - "loss": 0.2773, + "epoch": 14.827367666914663, + "grad_norm": 3.4111907482147217, + "learning_rate": 1.9499934790096764e-08, + "loss": 0.2218, "step": 1076200 }, { - "epoch": 10.97, - "learning_rate": 1.0804594740114203e-05, - "loss": 0.2825, + "epoch": 14.828745418974401, + "grad_norm": 4.788069248199463, + "learning_rate": 1.9199304561048624e-08, + "loss": 0.2265, "step": 1076300 }, { - "epoch": 10.97, - "learning_rate": 1.079949727270141e-05, - "loss": 0.2769, + "epoch": 14.83012317103414, + "grad_norm": 1.3524816036224365, + "learning_rate": 1.8901008966611954e-08, + "loss": 0.2136, "step": 1076400 }, { - "epoch": 10.97, - "learning_rate": 1.0794400744078621e-05, - "loss": 0.2197, + "epoch": 14.83150092309388, + "grad_norm": 1.0137866735458374, + "learning_rate": 1.8605048032124537e-08, + "loss": 0.2123, "step": 1076500 }, { - "epoch": 10.97, - "learning_rate": 1.0789305154495017e-05, - "loss": 0.2071, + "epoch": 14.83287867515362, + "grad_norm": 1.0612092018127441, + "learning_rate": 1.831142178273487e-08, + "loss": 0.2048, "step": 1076600 }, { - "epoch": 10.97, - "learning_rate": 1.0784210504199746e-05, - "loss": 0.2886, + "epoch": 14.834256427213358, + "grad_norm": 2.682030200958252, + "learning_rate": 1.8020130243383837e-08, + "loss": 0.2071, "step": 1076700 }, { - "epoch": 10.97, - "learning_rate": 1.077911679344192e-05, - "loss": 0.2845, + "epoch": 14.835634179273098, + "grad_norm": 0.1536763608455658, + "learning_rate": 1.773117343881997e-08, + "loss": 0.1803, "step": 1076800 }, { - "epoch": 10.97, - "learning_rate": 1.0774024022470566e-05, - "loss": 0.2775, + "epoch": 14.837011931332837, + "grad_norm": 2.6570160388946533, + "learning_rate": 1.74445513935903e-08, + "loss": 0.1824, "step": 1076900 }, { - "epoch": 10.97, - "learning_rate": 1.0768932191534704e-05, - "loss": 0.2425, + "epoch": 14.838389683392577, + "grad_norm": 0.8415933847427368, + "learning_rate": 1.7160264132046465e-08, + "loss": 0.2261, "step": 1077000 }, { - "epoch": 10.97, - "learning_rate": 1.0763841300883302e-05, - "loss": 0.2594, + "epoch": 14.839767435452316, + "grad_norm": 1.7778992652893066, + "learning_rate": 1.6878311678338587e-08, + "loss": 0.1959, "step": 1077100 }, { - "epoch": 10.97, - "learning_rate": 1.0758751350765256e-05, - "loss": 0.2068, + "epoch": 14.841145187512055, + "grad_norm": 3.090742349624634, + "learning_rate": 1.6598694056421394e-08, + "loss": 0.2064, "step": 1077200 }, { - "epoch": 10.98, - "learning_rate": 1.0753662341429441e-05, - "loss": 0.2967, + "epoch": 14.842522939571795, + "grad_norm": 1.2722197771072388, + "learning_rate": 1.632141129005116e-08, + "loss": 0.2408, "step": 1077300 }, { - "epoch": 10.98, - "learning_rate": 1.0748574273124682e-05, - "loss": 0.2913, + "epoch": 14.843900691631534, + "grad_norm": 7.700870037078857, + "learning_rate": 1.6046463402779597e-08, + "loss": 0.2148, "step": 1077400 }, { - "epoch": 10.98, - "learning_rate": 1.0743487146099755e-05, - "loss": 0.3363, + "epoch": 14.845278443691273, + "grad_norm": 1.0658072233200073, + "learning_rate": 1.577385041797219e-08, + "loss": 0.1818, "step": 1077500 }, { - "epoch": 10.98, - "learning_rate": 1.0738400960603393e-05, - "loss": 0.2453, + "epoch": 14.846656195751013, + "grad_norm": 7.456512451171875, + "learning_rate": 1.5506263581414047e-08, + "loss": 0.214, "step": 1077600 }, { - "epoch": 10.98, - "learning_rate": 1.0733315716884284e-05, - "loss": 0.198, + "epoch": 14.848033947810752, + "grad_norm": 9.465802192687988, + "learning_rate": 1.5238297121207003e-08, + "loss": 0.2303, "step": 1077700 }, { - "epoch": 10.98, - "learning_rate": 1.0728231415191048e-05, - "loss": 0.2987, + "epoch": 14.84941169987049, + "grad_norm": 2.7150468826293945, + "learning_rate": 1.4972665632116413e-08, + "loss": 0.2153, "step": 1077800 }, { - "epoch": 10.98, - "learning_rate": 1.0723148055772287e-05, - "loss": 0.2397, + "epoch": 14.85078945193023, + "grad_norm": 1.6926552057266235, + "learning_rate": 1.4709369136710892e-08, + "loss": 0.2293, "step": 1077900 }, { - "epoch": 10.98, - "learning_rate": 1.071806563887655e-05, - "loss": 0.3058, + "epoch": 14.85216720398997, + "grad_norm": 0.7267380952835083, + "learning_rate": 1.4448407657357555e-08, + "loss": 0.1856, "step": 1078000 }, { - "epoch": 10.98, - "learning_rate": 1.0712984164752347e-05, - "loss": 0.2543, + "epoch": 14.85354495604971, + "grad_norm": 2.963726282119751, + "learning_rate": 1.4189781216228104e-08, + "loss": 0.2041, "step": 1078100 }, { - "epoch": 10.98, - "learning_rate": 1.0707954434290388e-05, - "loss": 0.3364, + "epoch": 14.854922708109449, + "grad_norm": 2.1079416275024414, + "learning_rate": 1.3933489835292745e-08, + "loss": 0.195, "step": 1078200 }, { - "epoch": 10.99, - "learning_rate": 1.0702874837020616e-05, - "loss": 0.302, + "epoch": 14.856300460169187, + "grad_norm": 3.451512336730957, + "learning_rate": 1.3679533536326283e-08, + "loss": 0.1698, "step": 1078300 }, { - "epoch": 10.99, - "learning_rate": 1.0697796183265109e-05, - "loss": 0.2379, + "epoch": 14.857678212228928, + "grad_norm": 3.796050786972046, + "learning_rate": 1.3427912340902016e-08, + "loss": 0.2202, "step": 1078400 }, { - "epoch": 10.99, - "learning_rate": 1.0692718473272174e-05, - "loss": 0.2963, + "epoch": 14.859055964288666, + "grad_norm": 2.5348920822143555, + "learning_rate": 1.3178626270394789e-08, + "loss": 0.1946, "step": 1078500 }, { - "epoch": 10.99, - "learning_rate": 1.0687641707290102e-05, - "loss": 0.22, + "epoch": 14.860433716348407, + "grad_norm": 3.499793767929077, + "learning_rate": 1.2931675345987105e-08, + "loss": 0.2607, "step": 1078600 }, { - "epoch": 10.99, - "learning_rate": 1.0682565885567084e-05, - "loss": 0.2839, + "epoch": 14.861811468408145, + "grad_norm": 4.89334774017334, + "learning_rate": 1.2687059588653849e-08, + "loss": 0.195, "step": 1078700 }, { - "epoch": 10.99, - "learning_rate": 1.0677491008351317e-05, - "loss": 0.2741, + "epoch": 14.863189220467884, + "grad_norm": 1.0662612915039062, + "learning_rate": 1.2444779019183672e-08, + "loss": 0.2004, "step": 1078800 }, { - "epoch": 10.99, - "learning_rate": 1.0672417075890928e-05, - "loss": 0.243, + "epoch": 14.864566972527625, + "grad_norm": 1.6833910942077637, + "learning_rate": 1.2204833658151504e-08, + "loss": 0.2387, "step": 1078900 }, { - "epoch": 10.99, - "learning_rate": 1.0667344088434003e-05, - "loss": 0.2969, + "epoch": 14.865944724587363, + "grad_norm": 0.4727188050746918, + "learning_rate": 1.1967223525946036e-08, + "loss": 0.1838, "step": 1079000 }, { - "epoch": 10.99, - "learning_rate": 1.0662272046228583e-05, - "loss": 0.3048, + "epoch": 14.867322476647102, + "grad_norm": 3.2705109119415283, + "learning_rate": 1.17319486427514e-08, + "loss": 0.2367, "step": 1079100 }, { - "epoch": 11.0, - "learning_rate": 1.0657200949522667e-05, - "loss": 0.2868, + "epoch": 14.868700228706842, + "grad_norm": 1.2091560363769531, + "learning_rate": 1.1499009028559382e-08, + "loss": 0.2003, "step": 1079200 }, { - "epoch": 11.0, - "learning_rate": 1.0652130798564182e-05, - "loss": 0.2892, + "epoch": 14.870077980766581, + "grad_norm": 4.0432353019714355, + "learning_rate": 1.1268404703154156e-08, + "loss": 0.2124, "step": 1079300 }, { - "epoch": 11.0, - "learning_rate": 1.064706159360104e-05, - "loss": 0.2874, + "epoch": 14.87145573282632, + "grad_norm": 3.511845827102661, + "learning_rate": 1.1040135686130603e-08, + "loss": 0.2148, "step": 1079400 }, { - "epoch": 11.0, - "learning_rate": 1.0641993334881105e-05, - "loss": 0.2933, + "epoch": 14.87283348488606, + "grad_norm": 2.3320748805999756, + "learning_rate": 1.08142019968821e-08, + "loss": 0.2155, "step": 1079500 }, { - "epoch": 11.0, - "learning_rate": 1.063692602265216e-05, - "loss": 0.2796, + "epoch": 14.874211236945799, + "grad_norm": 3.4457144737243652, + "learning_rate": 1.0590603654597464e-08, + "loss": 0.1825, "step": 1079600 }, { - "epoch": 11.0, - "learning_rate": 1.0631859657161981e-05, - "loss": 0.2958, + "epoch": 14.87558898900554, + "grad_norm": 2.767977714538574, + "learning_rate": 1.036934067827927e-08, + "loss": 0.19, "step": 1079700 }, { - "epoch": 11.0, - "learning_rate": 1.0626794238658294e-05, - "loss": 0.2668, + "epoch": 14.876966741065278, + "grad_norm": 7.836250305175781, + "learning_rate": 1.0150413086719434e-08, + "loss": 0.1909, "step": 1079800 }, { - "epoch": 11.0, - "learning_rate": 1.0621729767388744e-05, - "loss": 0.2655, + "epoch": 14.878344493125017, + "grad_norm": 2.2359414100646973, + "learning_rate": 9.933820898520573e-09, + "loss": 0.2338, "step": 1079900 }, { - "epoch": 11.0, - "learning_rate": 1.0616666243600965e-05, - "loss": 0.3121, + "epoch": 14.879722245184757, + "grad_norm": 0.8423241376876831, + "learning_rate": 9.719564132077696e-09, + "loss": 0.2579, "step": 1080000 }, { - "epoch": 11.0, - "learning_rate": 1.061160366754253e-05, - "loss": 0.2478, + "epoch": 14.881099997244496, + "grad_norm": 5.39343786239624, + "learning_rate": 9.507642805602623e-09, + "loss": 0.165, "step": 1080100 }, { - "epoch": 11.01, - "learning_rate": 1.0606542039460977e-05, - "loss": 0.2302, + "epoch": 14.882477749304234, + "grad_norm": 2.855870246887207, + "learning_rate": 9.298056937090405e-09, + "loss": 0.1553, "step": 1080200 }, { - "epoch": 11.01, - "learning_rate": 1.0601481359603782e-05, - "loss": 0.1871, + "epoch": 14.883855501363975, + "grad_norm": 0.983640193939209, + "learning_rate": 9.09080654434985e-09, + "loss": 0.1924, "step": 1080300 }, { - "epoch": 11.01, - "learning_rate": 1.0596421628218382e-05, - "loss": 0.2569, + "epoch": 14.885233253423714, + "grad_norm": 3.0346145629882812, + "learning_rate": 8.885891644988265e-09, + "loss": 0.1918, "step": 1080400 }, { - "epoch": 11.01, - "learning_rate": 1.059136284555218e-05, - "loss": 0.2927, + "epoch": 14.886611005483454, + "grad_norm": 1.5431973934173584, + "learning_rate": 8.6833122564145e-09, + "loss": 0.199, "step": 1080500 }, { - "epoch": 11.01, - "learning_rate": 1.0586305011852506e-05, - "loss": 0.2609, + "epoch": 14.887988757543193, + "grad_norm": 0.37619274854660034, + "learning_rate": 8.483068395838955e-09, + "loss": 0.2262, "step": 1080600 }, { - "epoch": 11.01, - "learning_rate": 1.0581248127366655e-05, - "loss": 0.2269, + "epoch": 14.889366509602931, + "grad_norm": 1.83773672580719, + "learning_rate": 8.287127602424338e-09, + "loss": 0.2508, "step": 1080700 }, { - "epoch": 11.01, - "learning_rate": 1.0576192192341899e-05, - "loss": 0.2605, + "epoch": 14.890744261662672, + "grad_norm": 3.050055503845215, + "learning_rate": 8.091531492976368e-09, + "loss": 0.194, "step": 1080800 }, { - "epoch": 11.01, - "learning_rate": 1.057113720702542e-05, - "loss": 0.3192, + "epoch": 14.89212201372241, + "grad_norm": 3.382828950881958, + "learning_rate": 7.898270961802235e-09, + "loss": 0.1849, "step": 1080900 }, { - "epoch": 11.01, - "learning_rate": 1.0566083171664385e-05, - "loss": 0.2275, + "epoch": 14.893499765782149, + "grad_norm": 1.7832273244857788, + "learning_rate": 7.70734602531853e-09, + "loss": 0.2015, "step": 1081000 }, { - "epoch": 11.01, - "learning_rate": 1.0561030086505916e-05, - "loss": 0.2833, + "epoch": 14.89487751784189, + "grad_norm": 2.680617570877075, + "learning_rate": 7.518756699746442e-09, + "loss": 0.2336, "step": 1081100 }, { - "epoch": 11.02, - "learning_rate": 1.0555977951797063e-05, - "loss": 0.2413, + "epoch": 14.896255269901628, + "grad_norm": 1.1461893320083618, + "learning_rate": 7.3325030011056615e-09, + "loss": 0.2221, "step": 1081200 }, { - "epoch": 11.02, - "learning_rate": 1.0550926767784848e-05, - "loss": 0.2779, + "epoch": 14.897633021961369, + "grad_norm": 0.8372901082038879, + "learning_rate": 7.148584945220471e-09, + "loss": 0.2151, "step": 1081300 }, { - "epoch": 11.02, - "learning_rate": 1.0545876534716251e-05, - "loss": 0.2448, + "epoch": 14.899010774021107, + "grad_norm": 6.631724834442139, + "learning_rate": 6.968806810128403e-09, + "loss": 0.2686, "step": 1081400 }, { - "epoch": 11.02, - "learning_rate": 1.0540827252838196e-05, - "loss": 0.292, + "epoch": 14.900388526080846, + "grad_norm": 2.7890655994415283, + "learning_rate": 6.789536729615853e-09, + "loss": 0.2132, "step": 1081500 }, { - "epoch": 11.02, - "learning_rate": 1.0535778922397561e-05, - "loss": 0.2421, + "epoch": 14.901766278140586, + "grad_norm": 1.8802226781845093, + "learning_rate": 6.61260233798533e-09, + "loss": 0.1949, "step": 1081600 }, { - "epoch": 11.02, - "learning_rate": 1.0530731543641189e-05, - "loss": 0.2143, + "epoch": 14.903144030200325, + "grad_norm": 4.693185806274414, + "learning_rate": 6.438003650273416e-09, + "loss": 0.2887, "step": 1081700 }, { - "epoch": 11.02, - "learning_rate": 1.0525685116815852e-05, - "loss": 0.2427, + "epoch": 14.904521782260064, + "grad_norm": 3.1848788261413574, + "learning_rate": 6.265740681306031e-09, + "loss": 0.2087, "step": 1081800 }, { - "epoch": 11.02, - "learning_rate": 1.0520639642168292e-05, - "loss": 0.2288, + "epoch": 14.905899534319804, + "grad_norm": 4.509172439575195, + "learning_rate": 6.0958134457198e-09, + "loss": 0.2162, "step": 1081900 }, { - "epoch": 11.02, - "learning_rate": 1.0515645560452136e-05, - "loss": 0.2725, + "epoch": 14.907277286379543, + "grad_norm": 2.687525987625122, + "learning_rate": 5.9282219579528956e-09, + "loss": 0.2687, "step": 1082000 }, { - "epoch": 11.02, - "learning_rate": 1.0510601981372242e-05, - "loss": 0.229, + "epoch": 14.908655038439282, + "grad_norm": 1.9521030187606812, + "learning_rate": 5.762966232245037e-09, + "loss": 0.1885, "step": 1082100 }, { - "epoch": 11.03, - "learning_rate": 1.0505559355207614e-05, - "loss": 0.2837, + "epoch": 14.910032790499022, + "grad_norm": 3.8227665424346924, + "learning_rate": 5.600046282628335e-09, + "loss": 0.215, "step": 1082200 }, { - "epoch": 11.03, - "learning_rate": 1.0500517682204786e-05, - "loss": 0.2693, + "epoch": 14.91141054255876, + "grad_norm": 1.545579433441162, + "learning_rate": 5.439462122951711e-09, + "loss": 0.1997, "step": 1082300 }, { - "epoch": 11.03, - "learning_rate": 1.049547696261028e-05, - "loss": 0.2187, + "epoch": 14.912788294618501, + "grad_norm": 1.51990807056427, + "learning_rate": 5.281213766853421e-09, + "loss": 0.2156, "step": 1082400 }, { - "epoch": 11.03, - "learning_rate": 1.049043719667055e-05, - "loss": 0.2427, + "epoch": 14.91416604667824, + "grad_norm": 0.6883875131607056, + "learning_rate": 5.125301227776324e-09, + "loss": 0.1959, "step": 1082500 }, { - "epoch": 11.03, - "learning_rate": 1.0485398384632018e-05, - "loss": 0.2386, + "epoch": 14.915543798737978, + "grad_norm": 0.03276563435792923, + "learning_rate": 4.97172451896788e-09, + "loss": 0.2152, "step": 1082600 }, { - "epoch": 11.03, - "learning_rate": 1.0480360526741046e-05, - "loss": 0.229, + "epoch": 14.916921550797719, + "grad_norm": 3.164340019226074, + "learning_rate": 4.820483653477092e-09, + "loss": 0.2196, "step": 1082700 }, { - "epoch": 11.03, - "learning_rate": 1.0475323623243955e-05, - "loss": 0.2368, + "epoch": 14.918299302857458, + "grad_norm": 0.6809417605400085, + "learning_rate": 4.671578644148411e-09, + "loss": 0.2363, "step": 1082800 }, { - "epoch": 11.03, - "learning_rate": 1.0470287674387034e-05, - "loss": 0.1841, + "epoch": 14.919677054917198, + "grad_norm": 1.171447515487671, + "learning_rate": 4.52500950363699e-09, + "loss": 0.246, "step": 1082900 }, { - "epoch": 11.03, - "learning_rate": 1.0465252680416488e-05, - "loss": 0.1819, + "epoch": 14.921054806976937, + "grad_norm": 0.0383504256606102, + "learning_rate": 4.380776244390372e-09, + "loss": 0.1976, "step": 1083000 }, { - "epoch": 11.03, - "learning_rate": 1.0460218641578509e-05, - "loss": 0.2825, + "epoch": 14.922432559036675, + "grad_norm": 2.2809674739837646, + "learning_rate": 4.238878878660702e-09, + "loss": 0.187, "step": 1083100 }, { - "epoch": 11.04, - "learning_rate": 1.0455185558119242e-05, - "loss": 0.2501, + "epoch": 14.923810311096416, + "grad_norm": 4.4233598709106445, + "learning_rate": 4.099317418507775e-09, + "loss": 0.1974, "step": 1083200 }, { - "epoch": 11.04, - "learning_rate": 1.0450153430284753e-05, - "loss": 0.1888, + "epoch": 14.925188063156154, + "grad_norm": 1.016135334968567, + "learning_rate": 3.962091875786833e-09, + "loss": 0.2372, "step": 1083300 }, { - "epoch": 11.04, - "learning_rate": 1.0445122258321093e-05, - "loss": 0.3063, + "epoch": 14.926565815215893, + "grad_norm": 2.259232759475708, + "learning_rate": 3.827202262154661e-09, + "loss": 0.1914, "step": 1083400 }, { - "epoch": 11.04, - "learning_rate": 1.0440092042474266e-05, - "loss": 0.1888, + "epoch": 14.927943567275634, + "grad_norm": 0.9471726417541504, + "learning_rate": 3.6946485890726466e-09, + "loss": 0.2094, "step": 1083500 }, { - "epoch": 11.04, - "learning_rate": 1.0435062782990203e-05, - "loss": 0.216, + "epoch": 14.929321319335372, + "grad_norm": 0.5536746382713318, + "learning_rate": 3.564430867797619e-09, + "loss": 0.1755, "step": 1083600 }, { - "epoch": 11.04, - "learning_rate": 1.043003448011481e-05, - "loss": 0.2816, + "epoch": 14.93069907139511, + "grad_norm": 7.622779369354248, + "learning_rate": 3.4365491093971136e-09, + "loss": 0.2402, "step": 1083700 }, { - "epoch": 11.04, - "learning_rate": 1.0425007134093947e-05, - "loss": 0.2314, + "epoch": 14.932076823454851, + "grad_norm": 2.0714480876922607, + "learning_rate": 3.3110033247310547e-09, + "loss": 0.2251, "step": 1083800 }, { - "epoch": 11.04, - "learning_rate": 1.0419980745173416e-05, - "loss": 0.3227, + "epoch": 14.93345457551459, + "grad_norm": 5.540711879730225, + "learning_rate": 3.187793524470073e-09, + "loss": 0.2017, "step": 1083900 }, { - "epoch": 11.04, - "learning_rate": 1.041495531359898e-05, - "loss": 0.2708, + "epoch": 14.93483232757433, + "grad_norm": 2.172900915145874, + "learning_rate": 3.066919719077188e-09, + "loss": 0.2436, "step": 1084000 }, { - "epoch": 11.05, - "learning_rate": 1.040993083961636e-05, - "loss": 0.2744, + "epoch": 14.936210079634069, + "grad_norm": 2.606396436691284, + "learning_rate": 2.9483819188230733e-09, + "loss": 0.1817, "step": 1084100 }, { - "epoch": 11.05, - "learning_rate": 1.0404907323471205e-05, - "loss": 0.2308, + "epoch": 14.937587831693808, + "grad_norm": 1.8836302757263184, + "learning_rate": 2.832180133776896e-09, + "loss": 0.2187, "step": 1084200 }, { - "epoch": 11.05, - "learning_rate": 1.0399884765409144e-05, - "loss": 0.2296, + "epoch": 14.938965583753548, + "grad_norm": 1.4190055131912231, + "learning_rate": 2.718314373812425e-09, + "loss": 0.2018, "step": 1084300 }, { - "epoch": 11.05, - "learning_rate": 1.0394863165675753e-05, - "loss": 0.2314, + "epoch": 14.940343335813287, + "grad_norm": 0.7984558939933777, + "learning_rate": 2.606784648601923e-09, + "loss": 0.2236, "step": 1084400 }, { - "epoch": 11.05, - "learning_rate": 1.0389842524516562e-05, - "loss": 0.2277, + "epoch": 14.941721087873026, + "grad_norm": 0.8938072323799133, + "learning_rate": 2.497590967619201e-09, + "loss": 0.2266, "step": 1084500 }, { - "epoch": 11.05, - "learning_rate": 1.0384822842177037e-05, - "loss": 0.2134, + "epoch": 14.943098839932766, + "grad_norm": 0.49862140417099, + "learning_rate": 2.3907333401457232e-09, + "loss": 0.2348, "step": 1084600 }, { - "epoch": 11.05, - "learning_rate": 1.0379804118902617e-05, - "loss": 0.2359, + "epoch": 14.944476591992505, + "grad_norm": 2.713656187057495, + "learning_rate": 2.2862117752553426e-09, + "loss": 0.1814, "step": 1084700 }, { - "epoch": 11.05, - "learning_rate": 1.0374786354938702e-05, - "loss": 0.2351, + "epoch": 14.945854344052245, + "grad_norm": 2.6186256408691406, + "learning_rate": 2.1840262818265123e-09, + "loss": 0.1899, "step": 1084800 }, { - "epoch": 11.05, - "learning_rate": 1.03697695505306e-05, - "loss": 0.2613, + "epoch": 14.947232096111984, + "grad_norm": 2.87308931350708, + "learning_rate": 2.08417686854534e-09, + "loss": 0.1576, "step": 1084900 }, { - "epoch": 11.05, - "learning_rate": 1.036480385961788e-05, - "loss": 0.2573, + "epoch": 14.948609848171722, + "grad_norm": 2.1665875911712646, + "learning_rate": 1.9866635438903213e-09, + "loss": 0.2265, "step": 1085000 }, { - "epoch": 11.06, - "learning_rate": 1.0359788965455593e-05, - "loss": 0.3162, + "epoch": 14.949987600231463, + "grad_norm": 0.17462141811847687, + "learning_rate": 1.891486316147606e-09, + "loss": 0.1973, "step": 1085100 }, { - "epoch": 11.06, - "learning_rate": 1.0354775031582417e-05, - "loss": 0.3106, + "epoch": 14.951365352291202, + "grad_norm": 4.120203495025635, + "learning_rate": 1.7986451934018378e-09, + "loss": 0.1834, "step": 1085200 }, { - "epoch": 11.06, - "learning_rate": 1.0349762058243508e-05, - "loss": 0.2271, + "epoch": 14.95274310435094, + "grad_norm": 1.6549655199050903, + "learning_rate": 1.7081401835422616e-09, + "loss": 0.2599, "step": 1085300 }, { - "epoch": 11.06, - "learning_rate": 1.0344750045683968e-05, - "loss": 0.2894, + "epoch": 14.95412085641068, + "grad_norm": 2.00540828704834, + "learning_rate": 1.6199712942535638e-09, + "loss": 0.2118, "step": 1085400 }, { - "epoch": 11.06, - "learning_rate": 1.0339738994148853e-05, - "loss": 0.2174, + "epoch": 14.95549860847042, + "grad_norm": 4.009399890899658, + "learning_rate": 1.5349852967871948e-09, + "loss": 0.2142, "step": 1085500 }, { - "epoch": 11.06, - "learning_rate": 1.0334728903883186e-05, - "loss": 0.2483, + "epoch": 14.95687636053016, + "grad_norm": 3.2442891597747803, + "learning_rate": 1.4514653095307195e-09, + "loss": 0.2155, "step": 1085600 }, { - "epoch": 11.06, - "learning_rate": 1.0329719775131909e-05, - "loss": 0.2722, + "epoch": 14.958254112589898, + "grad_norm": 0.10502032190561295, + "learning_rate": 1.3702814646565643e-09, + "loss": 0.1879, "step": 1085700 }, { - "epoch": 11.06, - "learning_rate": 1.0324711608139947e-05, - "loss": 0.2459, + "epoch": 14.959631864649637, + "grad_norm": 0.705955445766449, + "learning_rate": 1.2914337690556056e-09, + "loss": 0.1655, "step": 1085800 }, { - "epoch": 11.06, - "learning_rate": 1.0319704403152183e-05, - "loss": 0.271, + "epoch": 14.961009616709378, + "grad_norm": 2.398799419403076, + "learning_rate": 1.2149222294324803e-09, + "loss": 0.2199, "step": 1085900 }, { - "epoch": 11.06, - "learning_rate": 1.0314698160413418e-05, - "loss": 0.2736, + "epoch": 14.962387368769116, + "grad_norm": 1.5619040727615356, + "learning_rate": 1.1407468522842136e-09, + "loss": 0.2179, "step": 1086000 }, { - "epoch": 11.07, - "learning_rate": 1.0309742928205745e-05, - "loss": 0.23, + "epoch": 14.963765120828855, + "grad_norm": 0.13245314359664917, + "learning_rate": 1.0689076439124313e-09, + "loss": 0.1967, "step": 1086100 }, { - "epoch": 11.07, - "learning_rate": 1.0304738601070678e-05, - "loss": 0.2054, + "epoch": 14.965142872888595, + "grad_norm": 3.4771838188171387, + "learning_rate": 9.99404610420307e-10, + "loss": 0.1999, "step": 1086200 }, { - "epoch": 11.07, - "learning_rate": 1.0299735236916362e-05, - "loss": 0.2435, + "epoch": 14.966520624948334, + "grad_norm": 1.209195852279663, + "learning_rate": 9.32237757712562e-10, + "loss": 0.1733, "step": 1086300 }, { - "epoch": 11.07, - "learning_rate": 1.0294732835987427e-05, - "loss": 0.3107, + "epoch": 14.967898377008073, + "grad_norm": 1.1353882551193237, + "learning_rate": 8.674070914954647e-10, + "loss": 0.191, "step": 1086400 }, { - "epoch": 11.07, - "learning_rate": 1.0289731398528462e-05, - "loss": 0.3021, + "epoch": 14.969276129067813, + "grad_norm": 3.6483943462371826, + "learning_rate": 8.049126172768317e-10, + "loss": 0.1901, "step": 1086500 }, { - "epoch": 11.07, - "learning_rate": 1.0284730924784004e-05, - "loss": 0.2802, + "epoch": 14.970653881127552, + "grad_norm": 1.3389338254928589, + "learning_rate": 7.447543403660274e-10, + "loss": 0.2129, "step": 1086600 }, { - "epoch": 11.07, - "learning_rate": 1.0279731414998562e-05, - "loss": 0.2751, + "epoch": 14.972031633187292, + "grad_norm": 2.003077983856201, + "learning_rate": 6.869322658739629e-10, + "loss": 0.2038, "step": 1086700 }, { - "epoch": 11.07, - "learning_rate": 1.0274732869416558e-05, - "loss": 0.228, + "epoch": 14.973409385247031, + "grad_norm": 0.5058888792991638, + "learning_rate": 6.314463987130981e-10, + "loss": 0.2021, "step": 1086800 }, { - "epoch": 11.07, - "learning_rate": 1.0269735288282402e-05, - "loss": 0.2323, + "epoch": 14.97478713730677, + "grad_norm": 3.1928555965423584, + "learning_rate": 5.782967435943865e-10, + "loss": 0.1892, "step": 1086900 }, { - "epoch": 11.07, - "learning_rate": 1.0264738671840456e-05, - "loss": 0.3224, + "epoch": 14.97616488936651, + "grad_norm": 1.9641062021255493, + "learning_rate": 5.274833050364358e-10, + "loss": 0.1995, "step": 1087000 }, { - "epoch": 11.08, - "learning_rate": 1.0259743020335008e-05, - "loss": 0.2442, + "epoch": 14.977542641426249, + "grad_norm": 0.17440913617610931, + "learning_rate": 4.790060873563485e-10, + "loss": 0.15, "step": 1087100 }, { - "epoch": 11.08, - "learning_rate": 1.0254748334010319e-05, - "loss": 0.1885, + "epoch": 14.97892039348599, + "grad_norm": 4.006913185119629, + "learning_rate": 4.3286509466972104e-10, + "loss": 0.1976, "step": 1087200 }, { - "epoch": 11.08, - "learning_rate": 1.0249754613110607e-05, - "loss": 0.22, + "epoch": 14.980298145545728, + "grad_norm": 0.13254722952842712, + "learning_rate": 3.890603308998042e-10, + "loss": 0.2745, "step": 1087300 }, { - "epoch": 11.08, - "learning_rate": 1.0244761857880036e-05, - "loss": 0.2203, + "epoch": 14.981675897605466, + "grad_norm": 2.0417821407318115, + "learning_rate": 3.4759179976528997e-10, + "loss": 0.1987, "step": 1087400 }, { - "epoch": 11.08, - "learning_rate": 1.0239770068562712e-05, - "loss": 0.2236, + "epoch": 14.983053649665207, + "grad_norm": 1.5335716009140015, + "learning_rate": 3.0845950478947117e-10, + "loss": 0.2327, "step": 1087500 }, { - "epoch": 11.08, - "learning_rate": 1.0234779245402707e-05, - "loss": 0.229, + "epoch": 14.984431401724946, + "grad_norm": 0.9508352279663086, + "learning_rate": 2.716634492971881e-10, + "loss": 0.2147, "step": 1087600 }, { - "epoch": 11.08, - "learning_rate": 1.0229789388644044e-05, - "loss": 0.2967, + "epoch": 14.985809153784684, + "grad_norm": 1.4289687871932983, + "learning_rate": 2.37203636417882e-10, + "loss": 0.164, "step": 1087700 }, { - "epoch": 11.08, - "learning_rate": 1.0224800498530695e-05, - "loss": 0.2588, + "epoch": 14.987186905844425, + "grad_norm": 0.03259569779038429, + "learning_rate": 2.0508006907338228e-10, + "loss": 0.215, "step": 1087800 }, { - "epoch": 11.08, - "learning_rate": 1.0219812575306592e-05, - "loss": 0.2188, + "epoch": 14.988564657904163, + "grad_norm": 5.39491081237793, + "learning_rate": 1.7529274999622536e-10, + "loss": 0.2425, "step": 1087900 }, { - "epoch": 11.08, - "learning_rate": 1.0214825619215622e-05, - "loss": 0.2873, + "epoch": 14.989942409963902, + "grad_norm": 1.7730236053466797, + "learning_rate": 1.4810462794806468e-10, + "loss": 0.1963, "step": 1088000 }, { - "epoch": 11.09, - "learning_rate": 1.0209839630501597e-05, - "loss": 0.258, + "epoch": 14.991320162023642, + "grad_norm": 1.2670625448226929, + "learning_rate": 1.2296645025500207e-10, + "loss": 0.2181, "step": 1088100 }, { - "epoch": 11.09, - "learning_rate": 1.0204854609408313e-05, - "loss": 0.2368, + "epoch": 14.992697914083381, + "grad_norm": 0.2452702820301056, + "learning_rate": 1.0016452780259311e-10, + "loss": 0.218, "step": 1088200 }, { - "epoch": 11.09, - "learning_rate": 1.0199870556179517e-05, - "loss": 0.2656, + "epoch": 14.994075666143122, + "grad_norm": 2.5829875469207764, + "learning_rate": 7.969886252956471e-11, + "loss": 0.2539, "step": 1088300 }, { - "epoch": 11.09, - "learning_rate": 1.0194887471058882e-05, - "loss": 0.3304, + "epoch": 14.99545341820286, + "grad_norm": 0.038913942873477936, + "learning_rate": 6.156945617619147e-11, + "loss": 0.2639, "step": 1088400 }, { - "epoch": 11.09, - "learning_rate": 1.0189905354290054e-05, - "loss": 0.2654, + "epoch": 14.996831170262599, + "grad_norm": 2.4882044792175293, + "learning_rate": 4.577631028124252e-11, + "loss": 0.2202, "step": 1088500 }, { - "epoch": 11.09, - "learning_rate": 1.0184924206116649e-05, - "loss": 0.2574, + "epoch": 14.99820892232234, + "grad_norm": 0.9612501263618469, + "learning_rate": 3.23194261880877e-11, + "loss": 0.2061, "step": 1088600 }, { - "epoch": 11.09, - "learning_rate": 1.0179944026782188e-05, - "loss": 0.2319, + "epoch": 14.999586674382078, + "grad_norm": 1.4983134269714355, + "learning_rate": 2.1198805035538284e-11, + "loss": 0.1861, "step": 1088700 - }, - { - "epoch": 11.09, - "learning_rate": 1.0174964816530184e-05, - "loss": 0.2388, - "step": 1088800 - }, - { - "epoch": 11.09, - "learning_rate": 1.0169986575604091e-05, - "loss": 0.314, - "step": 1088900 - }, - { - "epoch": 11.09, - "learning_rate": 1.0165009304247312e-05, - "loss": 0.1714, - "step": 1089000 - }, - { - "epoch": 11.1, - "learning_rate": 1.0160033002703214e-05, - "loss": 0.2221, - "step": 1089100 - }, - { - "epoch": 11.1, - "learning_rate": 1.01550576712151e-05, - "loss": 0.2334, - "step": 1089200 - }, - { - "epoch": 11.1, - "learning_rate": 1.0150083310026249e-05, - "loss": 0.2556, - "step": 1089300 - }, - { - "epoch": 11.1, - "learning_rate": 1.0145109919379858e-05, - "loss": 0.2601, - "step": 1089400 - }, - { - "epoch": 11.1, - "learning_rate": 1.0140187218911526e-05, - "loss": 0.2681, - "step": 1089500 - }, - { - "epoch": 11.1, - "learning_rate": 1.0135215760368038e-05, - "loss": 0.253, - "step": 1089600 - }, - { - "epoch": 11.1, - "learning_rate": 1.0130245273093952e-05, - "loss": 0.2239, - "step": 1089700 - }, - { - "epoch": 11.1, - "learning_rate": 1.0125275757332307e-05, - "loss": 0.244, - "step": 1089800 - }, - { - "epoch": 11.1, - "learning_rate": 1.012030721332606e-05, - "loss": 0.2634, - "step": 1089900 - }, - { - "epoch": 11.11, - "learning_rate": 1.0115339641318155e-05, - "loss": 0.2386, - "step": 1090000 - }, - { - "epoch": 11.11, - "learning_rate": 1.0110373041551484e-05, - "loss": 0.2098, - "step": 1090100 - }, - { - "epoch": 11.11, - "learning_rate": 1.0105407414268872e-05, - "loss": 0.2611, - "step": 1090200 - }, - { - "epoch": 11.11, - "learning_rate": 1.0100442759713107e-05, - "loss": 0.2266, - "step": 1090300 - }, - { - "epoch": 11.11, - "learning_rate": 1.0095479078126937e-05, - "loss": 0.2761, - "step": 1090400 - }, - { - "epoch": 11.11, - "learning_rate": 1.0090516369753058e-05, - "loss": 0.2043, - "step": 1090500 - }, - { - "epoch": 11.11, - "learning_rate": 1.0085554634834116e-05, - "loss": 0.2103, - "step": 1090600 - }, - { - "epoch": 11.11, - "learning_rate": 1.0080593873612716e-05, - "loss": 0.27, - "step": 1090700 - }, - { - "epoch": 11.11, - "learning_rate": 1.0075634086331394e-05, - "loss": 0.2141, - "step": 1090800 - }, - { - "epoch": 11.11, - "learning_rate": 1.0070675273232664e-05, - "loss": 0.2306, - "step": 1090900 - }, - { - "epoch": 11.12, - "learning_rate": 1.0065717434558992e-05, - "loss": 0.2496, - "step": 1091000 - }, - { - "epoch": 11.12, - "learning_rate": 1.0060760570552763e-05, - "loss": 0.2689, - "step": 1091100 - }, - { - "epoch": 11.12, - "learning_rate": 1.0055804681456353e-05, - "loss": 0.2644, - "step": 1091200 - }, - { - "epoch": 11.12, - "learning_rate": 1.0050849767512082e-05, - "loss": 0.252, - "step": 1091300 - }, - { - "epoch": 11.12, - "learning_rate": 1.0045895828962197e-05, - "loss": 0.2269, - "step": 1091400 - }, - { - "epoch": 11.12, - "learning_rate": 1.0040942866048927e-05, - "loss": 0.2624, - "step": 1091500 - }, - { - "epoch": 11.12, - "learning_rate": 1.003599087901444e-05, - "loss": 0.2052, - "step": 1091600 - }, - { - "epoch": 11.12, - "learning_rate": 1.0031039868100862e-05, - "loss": 0.2326, - "step": 1091700 - }, - { - "epoch": 11.12, - "learning_rate": 1.0026089833550267e-05, - "loss": 0.2682, - "step": 1091800 - }, - { - "epoch": 11.12, - "learning_rate": 1.0021140775604679e-05, - "loss": 0.2513, - "step": 1091900 - }, - { - "epoch": 11.13, - "learning_rate": 1.0016192694506089e-05, - "loss": 0.2548, - "step": 1092000 - }, - { - "epoch": 11.13, - "learning_rate": 1.001124559049641e-05, - "loss": 0.2903, - "step": 1092100 - }, - { - "epoch": 11.13, - "learning_rate": 1.0006299463817534e-05, - "loss": 0.2184, - "step": 1092200 - }, - { - "epoch": 11.13, - "learning_rate": 1.0001354314711308e-05, - "loss": 0.2194, - "step": 1092300 - }, - { - "epoch": 11.13, - "learning_rate": 9.996410143419502e-06, - "loss": 0.2702, - "step": 1092400 - }, - { - "epoch": 11.13, - "learning_rate": 9.991466950183865e-06, - "loss": 0.1782, - "step": 1092500 - }, - { - "epoch": 11.13, - "learning_rate": 9.986524735246098e-06, - "loss": 0.2684, - "step": 1092600 - }, - { - "epoch": 11.13, - "learning_rate": 9.981583498847826e-06, - "loss": 0.2679, - "step": 1092700 - }, - { - "epoch": 11.13, - "learning_rate": 9.976643241230661e-06, - "loss": 0.2351, - "step": 1092800 - }, - { - "epoch": 11.13, - "learning_rate": 9.971703962636146e-06, - "loss": 0.2435, - "step": 1092900 - }, - { - "epoch": 11.14, - "learning_rate": 9.966765663305785e-06, - "loss": 0.2564, - "step": 1093000 - }, - { - "epoch": 11.14, - "learning_rate": 9.96182834348103e-06, - "loss": 0.2589, - "step": 1093100 - }, - { - "epoch": 11.14, - "learning_rate": 9.956892003403288e-06, - "loss": 0.2782, - "step": 1093200 - }, - { - "epoch": 11.14, - "learning_rate": 9.951956643313926e-06, - "loss": 0.2291, - "step": 1093300 - }, - { - "epoch": 11.14, - "learning_rate": 9.947022263454235e-06, - "loss": 0.2733, - "step": 1093400 - }, - { - "epoch": 11.14, - "learning_rate": 9.942088864065484e-06, - "loss": 0.286, - "step": 1093500 - }, - { - "epoch": 11.14, - "learning_rate": 9.937156445388899e-06, - "loss": 0.2805, - "step": 1093600 - }, - { - "epoch": 11.14, - "learning_rate": 9.932274317186351e-06, - "loss": 0.2811, - "step": 1093700 - }, - { - "epoch": 11.14, - "learning_rate": 9.927343850844377e-06, - "loss": 0.2573, - "step": 1093800 - }, - { - "epoch": 11.14, - "learning_rate": 9.922414365935504e-06, - "loss": 0.219, - "step": 1093900 - }, - { - "epoch": 11.15, - "learning_rate": 9.91748586270075e-06, - "loss": 0.2118, - "step": 1094000 - }, - { - "epoch": 11.15, - "learning_rate": 9.9125583413811e-06, - "loss": 0.2598, - "step": 1094100 - }, - { - "epoch": 11.15, - "learning_rate": 9.907631802217452e-06, - "loss": 0.2825, - "step": 1094200 - }, - { - "epoch": 11.15, - "learning_rate": 9.902706245450716e-06, - "loss": 0.2593, - "step": 1094300 - }, - { - "epoch": 11.15, - "learning_rate": 9.897781671321715e-06, - "loss": 0.224, - "step": 1094400 - }, - { - "epoch": 11.15, - "learning_rate": 9.892858080071224e-06, - "loss": 0.191, - "step": 1094500 - }, - { - "epoch": 11.15, - "learning_rate": 9.887935471939975e-06, - "loss": 0.2395, - "step": 1094600 - }, - { - "epoch": 11.15, - "learning_rate": 9.88301384716867e-06, - "loss": 0.3252, - "step": 1094700 - }, - { - "epoch": 11.15, - "learning_rate": 9.878093205997925e-06, - "loss": 0.2023, - "step": 1094800 - }, - { - "epoch": 11.16, - "learning_rate": 9.873173548668341e-06, - "loss": 0.2434, - "step": 1094900 - }, - { - "epoch": 11.16, - "learning_rate": 9.868254875420471e-06, - "loss": 0.2056, - "step": 1095000 - }, - { - "epoch": 11.16, - "learning_rate": 9.863337186494788e-06, - "loss": 0.2409, - "step": 1095100 - }, - { - "epoch": 11.16, - "learning_rate": 9.858420482131747e-06, - "loss": 0.2046, - "step": 1095200 - }, - { - "epoch": 11.16, - "learning_rate": 9.853504762571754e-06, - "loss": 0.2685, - "step": 1095300 - }, - { - "epoch": 11.16, - "learning_rate": 9.848590028055132e-06, - "loss": 0.2885, - "step": 1095400 - }, - { - "epoch": 11.16, - "learning_rate": 9.843676278822207e-06, - "loss": 0.2538, - "step": 1095500 - }, - { - "epoch": 11.16, - "learning_rate": 9.83876351511323e-06, - "loss": 0.2163, - "step": 1095600 - }, - { - "epoch": 11.16, - "learning_rate": 9.833851737168409e-06, - "loss": 0.282, - "step": 1095700 - }, - { - "epoch": 11.16, - "learning_rate": 9.828940945227884e-06, - "loss": 0.245, - "step": 1095800 - }, - { - "epoch": 11.17, - "learning_rate": 9.82403113953177e-06, - "loss": 0.2793, - "step": 1095900 - }, - { - "epoch": 11.17, - "learning_rate": 9.819122320320141e-06, - "loss": 0.2225, - "step": 1096000 - }, - { - "epoch": 11.17, - "learning_rate": 9.814214487832986e-06, - "loss": 0.2277, - "step": 1096100 - }, - { - "epoch": 11.17, - "learning_rate": 9.809307642310282e-06, - "loss": 0.3204, - "step": 1096200 - }, - { - "epoch": 11.17, - "learning_rate": 9.80440178399195e-06, - "loss": 0.2549, - "step": 1096300 - }, - { - "epoch": 11.17, - "learning_rate": 9.79949691311784e-06, - "loss": 0.2146, - "step": 1096400 - }, - { - "epoch": 11.17, - "learning_rate": 9.794593029927782e-06, - "loss": 0.2491, - "step": 1096500 - }, - { - "epoch": 11.17, - "learning_rate": 9.789690134661557e-06, - "loss": 0.2345, - "step": 1096600 - }, - { - "epoch": 11.17, - "learning_rate": 9.784788227558853e-06, - "loss": 0.2521, - "step": 1096700 - }, - { - "epoch": 11.17, - "learning_rate": 9.779887308859377e-06, - "loss": 0.2429, - "step": 1096800 - }, - { - "epoch": 11.18, - "learning_rate": 9.774987378802756e-06, - "loss": 0.2547, - "step": 1096900 - }, - { - "epoch": 11.18, - "learning_rate": 9.770088437628544e-06, - "loss": 0.2894, - "step": 1097000 - }, - { - "epoch": 11.18, - "learning_rate": 9.765190485576279e-06, - "loss": 0.2417, - "step": 1097100 - }, - { - "epoch": 11.18, - "learning_rate": 9.760293522885459e-06, - "loss": 0.2676, - "step": 1097200 - }, - { - "epoch": 11.18, - "learning_rate": 9.755397549795488e-06, - "loss": 0.2874, - "step": 1097300 - }, - { - "epoch": 11.18, - "learning_rate": 9.750502566545764e-06, - "loss": 0.2726, - "step": 1097400 - }, - { - "epoch": 11.18, - "learning_rate": 9.745608573375624e-06, - "loss": 0.2193, - "step": 1097500 - }, - { - "epoch": 11.18, - "learning_rate": 9.740715570524362e-06, - "loss": 0.2308, - "step": 1097600 - }, - { - "epoch": 11.18, - "learning_rate": 9.735823558231194e-06, - "loss": 0.2367, - "step": 1097700 - }, - { - "epoch": 11.18, - "learning_rate": 9.730981442045062e-06, - "loss": 0.2638, - "step": 1097800 - }, - { - "epoch": 11.19, - "learning_rate": 9.726091401674084e-06, - "loss": 0.2513, - "step": 1097900 - }, - { - "epoch": 11.19, - "learning_rate": 9.721202352576251e-06, - "loss": 0.2133, - "step": 1098000 - }, - { - "epoch": 11.19, - "learning_rate": 9.716314294990605e-06, - "loss": 0.2448, - "step": 1098100 - }, - { - "epoch": 11.19, - "learning_rate": 9.711427229156142e-06, - "loss": 0.2527, - "step": 1098200 - }, - { - "epoch": 11.19, - "learning_rate": 9.706541155311812e-06, - "loss": 0.2798, - "step": 1098300 - }, - { - "epoch": 11.19, - "learning_rate": 9.701656073696527e-06, - "loss": 0.1893, - "step": 1098400 - }, - { - "epoch": 11.19, - "learning_rate": 9.696771984549115e-06, - "loss": 0.2699, - "step": 1098500 - }, - { - "epoch": 11.19, - "learning_rate": 9.691888888108387e-06, - "loss": 0.2361, - "step": 1098600 - }, - { - "epoch": 11.19, - "learning_rate": 9.687006784613111e-06, - "loss": 0.2325, - "step": 1098700 - }, - { - "epoch": 11.19, - "learning_rate": 9.682125674301972e-06, - "loss": 0.2585, - "step": 1098800 - }, - { - "epoch": 11.2, - "learning_rate": 9.677245557413637e-06, - "loss": 0.2605, - "step": 1098900 - }, - { - "epoch": 11.2, - "learning_rate": 9.672366434186725e-06, - "loss": 0.2529, - "step": 1099000 - }, - { - "epoch": 11.2, - "learning_rate": 9.667488304859776e-06, - "loss": 0.2174, - "step": 1099100 - }, - { - "epoch": 11.2, - "learning_rate": 9.662611169671307e-06, - "loss": 0.2868, - "step": 1099200 - }, - { - "epoch": 11.2, - "learning_rate": 9.657735028859786e-06, - "loss": 0.3691, - "step": 1099300 - }, - { - "epoch": 11.2, - "learning_rate": 9.652859882663629e-06, - "loss": 0.2901, - "step": 1099400 - }, - { - "epoch": 11.2, - "learning_rate": 9.647985731321195e-06, - "loss": 0.2485, - "step": 1099500 - }, - { - "epoch": 11.2, - "learning_rate": 9.643112575070816e-06, - "loss": 0.2401, - "step": 1099600 - }, - { - "epoch": 11.2, - "learning_rate": 9.638240414150739e-06, - "loss": 0.2749, - "step": 1099700 - }, - { - "epoch": 11.2, - "learning_rate": 9.633369248799193e-06, - "loss": 0.2583, - "step": 1099800 - }, - { - "epoch": 11.21, - "learning_rate": 9.628499079254345e-06, - "loss": 0.2556, - "step": 1099900 - }, - { - "epoch": 11.21, - "learning_rate": 9.623629905754336e-06, - "loss": 0.2313, - "step": 1100000 - }, - { - "epoch": 11.21, - "learning_rate": 9.618761728537214e-06, - "loss": 0.2232, - "step": 1100100 - }, - { - "epoch": 11.21, - "learning_rate": 9.613894547841012e-06, - "loss": 0.2703, - "step": 1100200 - }, - { - "epoch": 11.21, - "learning_rate": 9.60902836390372e-06, - "loss": 0.2167, - "step": 1100300 - }, - { - "epoch": 11.21, - "learning_rate": 9.604163176963244e-06, - "loss": 0.2469, - "step": 1100400 - }, - { - "epoch": 11.21, - "learning_rate": 9.599298987257474e-06, - "loss": 0.2512, - "step": 1100500 - }, - { - "epoch": 11.21, - "learning_rate": 9.594435795024237e-06, - "loss": 0.2867, - "step": 1100600 - }, - { - "epoch": 11.21, - "learning_rate": 9.589573600501316e-06, - "loss": 0.2228, - "step": 1100700 - }, - { - "epoch": 11.22, - "learning_rate": 9.584712403926444e-06, - "loss": 0.3096, - "step": 1100800 - }, - { - "epoch": 11.22, - "learning_rate": 9.579852205537313e-06, - "loss": 0.226, - "step": 1100900 - }, - { - "epoch": 11.22, - "learning_rate": 9.574993005571539e-06, - "loss": 0.2858, - "step": 1101000 - }, - { - "epoch": 11.22, - "learning_rate": 9.570134804266714e-06, - "loss": 0.2318, - "step": 1101100 - }, - { - "epoch": 11.22, - "learning_rate": 9.565277601860389e-06, - "loss": 0.2638, - "step": 1101200 - }, - { - "epoch": 11.22, - "learning_rate": 9.560421398590032e-06, - "loss": 0.2363, - "step": 1101300 - }, - { - "epoch": 11.22, - "learning_rate": 9.55556619469309e-06, - "loss": 0.3175, - "step": 1101400 - }, - { - "epoch": 11.22, - "learning_rate": 9.55071199040696e-06, - "loss": 0.2233, - "step": 1101500 - }, - { - "epoch": 11.22, - "learning_rate": 9.545858785968983e-06, - "loss": 0.1778, - "step": 1101600 - }, - { - "epoch": 11.22, - "learning_rate": 9.54100658161644e-06, - "loss": 0.2173, - "step": 1101700 - }, - { - "epoch": 11.23, - "learning_rate": 9.536155377586584e-06, - "loss": 0.25, - "step": 1101800 - }, - { - "epoch": 11.23, - "learning_rate": 9.531402168378981e-06, - "loss": 0.2577, - "step": 1101900 - }, - { - "epoch": 11.23, - "learning_rate": 9.526552945687767e-06, - "loss": 0.2201, - "step": 1102000 - }, - { - "epoch": 11.23, - "learning_rate": 9.521704724025933e-06, - "loss": 0.2554, - "step": 1102100 - }, - { - "epoch": 11.23, - "learning_rate": 9.51685750363054e-06, - "loss": 0.2544, - "step": 1102200 - }, - { - "epoch": 11.23, - "learning_rate": 9.512011284738565e-06, - "loss": 0.2718, - "step": 1102300 - }, - { - "epoch": 11.23, - "learning_rate": 9.507166067586978e-06, - "loss": 0.2633, - "step": 1102400 - }, - { - "epoch": 11.23, - "learning_rate": 9.50232185241268e-06, - "loss": 0.2468, - "step": 1102500 - }, - { - "epoch": 11.23, - "learning_rate": 9.497478639452512e-06, - "loss": 0.3042, - "step": 1102600 - }, - { - "epoch": 11.23, - "learning_rate": 9.492636428943286e-06, - "loss": 0.2437, - "step": 1102700 - }, - { - "epoch": 11.24, - "learning_rate": 9.48779522112176e-06, - "loss": 0.2314, - "step": 1102800 - }, - { - "epoch": 11.24, - "learning_rate": 9.482955016224644e-06, - "loss": 0.2235, - "step": 1102900 - }, - { - "epoch": 11.24, - "learning_rate": 9.478115814488579e-06, - "loss": 0.2859, - "step": 1103000 - }, - { - "epoch": 11.24, - "learning_rate": 9.47327761615018e-06, - "loss": 0.2305, - "step": 1103100 - }, - { - "epoch": 11.24, - "learning_rate": 9.468440421446013e-06, - "loss": 0.2415, - "step": 1103200 - }, - { - "epoch": 11.24, - "learning_rate": 9.463604230612582e-06, - "loss": 0.2918, - "step": 1103300 - }, - { - "epoch": 11.24, - "learning_rate": 9.458769043886348e-06, - "loss": 0.2818, - "step": 1103400 - }, - { - "epoch": 11.24, - "learning_rate": 9.453934861503735e-06, - "loss": 0.2354, - "step": 1103500 - }, - { - "epoch": 11.24, - "learning_rate": 9.449150010505656e-06, - "loss": 0.3007, - "step": 1103600 - }, - { - "epoch": 11.24, - "learning_rate": 9.444317827469971e-06, - "loss": 0.2248, - "step": 1103700 - }, - { - "epoch": 11.25, - "learning_rate": 9.439486649484464e-06, - "loss": 0.3142, - "step": 1103800 - }, - { - "epoch": 11.25, - "learning_rate": 9.43465647678536e-06, - "loss": 0.25, - "step": 1103900 - }, - { - "epoch": 11.25, - "learning_rate": 9.429827309608834e-06, - "loss": 0.2178, - "step": 1104000 - }, - { - "epoch": 11.25, - "learning_rate": 9.424999148190988e-06, - "loss": 0.2025, - "step": 1104100 - }, - { - "epoch": 11.25, - "learning_rate": 9.420171992767899e-06, - "loss": 0.2574, - "step": 1104200 - }, - { - "epoch": 11.25, - "learning_rate": 9.415345843575597e-06, - "loss": 0.2503, - "step": 1104300 - }, - { - "epoch": 11.25, - "learning_rate": 9.410520700850026e-06, - "loss": 0.2247, - "step": 1104400 - }, - { - "epoch": 11.25, - "learning_rate": 9.405696564827125e-06, - "loss": 0.2393, - "step": 1104500 - }, - { - "epoch": 11.25, - "learning_rate": 9.400873435742764e-06, - "loss": 0.2436, - "step": 1104600 - }, - { - "epoch": 11.25, - "learning_rate": 9.396051313832758e-06, - "loss": 0.2417, - "step": 1104700 - }, - { - "epoch": 11.26, - "learning_rate": 9.391230199332888e-06, - "loss": 0.2083, - "step": 1104800 - }, - { - "epoch": 11.26, - "learning_rate": 9.38641009247888e-06, - "loss": 0.2711, - "step": 1104900 - }, - { - "epoch": 11.26, - "learning_rate": 9.381590993506399e-06, - "loss": 0.2685, - "step": 1105000 - }, - { - "epoch": 11.26, - "learning_rate": 9.37677290265107e-06, - "loss": 0.2438, - "step": 1105100 - }, - { - "epoch": 11.26, - "learning_rate": 9.371955820148473e-06, - "loss": 0.2705, - "step": 1105200 - }, - { - "epoch": 11.26, - "learning_rate": 9.367139746234145e-06, - "loss": 0.2962, - "step": 1105300 - }, - { - "epoch": 11.26, - "learning_rate": 9.362324681143546e-06, - "loss": 0.1828, - "step": 1105400 - }, - { - "epoch": 11.26, - "learning_rate": 9.35751062511211e-06, - "loss": 0.2308, - "step": 1105500 - }, - { - "epoch": 11.26, - "learning_rate": 9.352697578375223e-06, - "loss": 0.3031, - "step": 1105600 - }, - { - "epoch": 11.27, - "learning_rate": 9.347885541168201e-06, - "loss": 0.2791, - "step": 1105700 - }, - { - "epoch": 11.27, - "learning_rate": 9.343074513726329e-06, - "loss": 0.2563, - "step": 1105800 - }, - { - "epoch": 11.27, - "learning_rate": 9.338264496284842e-06, - "loss": 0.2278, - "step": 1105900 - }, - { - "epoch": 11.27, - "learning_rate": 9.333455489078916e-06, - "loss": 0.2247, - "step": 1106000 - }, - { - "epoch": 11.27, - "learning_rate": 9.328647492343683e-06, - "loss": 0.2185, - "step": 1106100 - }, - { - "epoch": 11.27, - "learning_rate": 9.32384050631424e-06, - "loss": 0.2492, - "step": 1106200 - }, - { - "epoch": 11.27, - "learning_rate": 9.319034531225598e-06, - "loss": 0.2573, - "step": 1106300 - }, - { - "epoch": 11.27, - "learning_rate": 9.31422956731275e-06, - "loss": 0.2292, - "step": 1106400 - }, - { - "epoch": 11.27, - "learning_rate": 9.30942561481064e-06, - "loss": 0.278, - "step": 1106500 - }, - { - "epoch": 11.27, - "learning_rate": 9.30462267395413e-06, - "loss": 0.2508, - "step": 1106600 - }, - { - "epoch": 11.28, - "learning_rate": 9.299868759258255e-06, - "loss": 0.2503, - "step": 1106700 - }, - { - "epoch": 11.28, - "learning_rate": 9.295067832275112e-06, - "loss": 0.254, - "step": 1106800 - }, - { - "epoch": 11.28, - "learning_rate": 9.290267917639594e-06, - "loss": 0.2278, - "step": 1106900 - }, - { - "epoch": 11.28, - "learning_rate": 9.285469015586385e-06, - "loss": 0.2312, - "step": 1107000 - }, - { - "epoch": 11.28, - "learning_rate": 9.28067112635012e-06, - "loss": 0.3294, - "step": 1107100 - }, - { - "epoch": 11.28, - "learning_rate": 9.275874250165397e-06, - "loss": 0.2234, - "step": 1107200 - }, - { - "epoch": 11.28, - "learning_rate": 9.271078387266753e-06, - "loss": 0.2337, - "step": 1107300 - }, - { - "epoch": 11.28, - "learning_rate": 9.266283537888664e-06, - "loss": 0.2904, - "step": 1107400 - }, - { - "epoch": 11.28, - "learning_rate": 9.261489702265575e-06, - "loss": 0.2549, - "step": 1107500 - }, - { - "epoch": 11.28, - "learning_rate": 9.256696880631881e-06, - "loss": 0.2348, - "step": 1107600 - }, - { - "epoch": 11.29, - "learning_rate": 9.25190507322193e-06, - "loss": 0.2742, - "step": 1107700 - }, - { - "epoch": 11.29, - "learning_rate": 9.247114280269992e-06, - "loss": 0.254, - "step": 1107800 - }, - { - "epoch": 11.29, - "learning_rate": 9.242324502010317e-06, - "loss": 0.2302, - "step": 1107900 - }, - { - "epoch": 11.29, - "learning_rate": 9.23753573867711e-06, - "loss": 0.2125, - "step": 1108000 - }, - { - "epoch": 11.29, - "learning_rate": 9.232747990504495e-06, - "loss": 0.2703, - "step": 1108100 - }, - { - "epoch": 11.29, - "learning_rate": 9.227961257726568e-06, - "loss": 0.2733, - "step": 1108200 - }, - { - "epoch": 11.29, - "learning_rate": 9.22317554057737e-06, - "loss": 0.2538, - "step": 1108300 - }, - { - "epoch": 11.29, - "learning_rate": 9.218390839290906e-06, - "loss": 0.2074, - "step": 1108400 - }, - { - "epoch": 11.29, - "learning_rate": 9.213607154101107e-06, - "loss": 0.2511, - "step": 1108500 - }, - { - "epoch": 11.29, - "learning_rate": 9.208824485241885e-06, - "loss": 0.2174, - "step": 1108600 - }, - { - "epoch": 11.3, - "learning_rate": 9.204042832947059e-06, - "loss": 0.2279, - "step": 1108700 - }, - { - "epoch": 11.3, - "learning_rate": 9.19926219745043e-06, - "loss": 0.2482, - "step": 1108800 - }, - { - "epoch": 11.3, - "learning_rate": 9.194482578985762e-06, - "loss": 0.2284, - "step": 1108900 - }, - { - "epoch": 11.3, - "learning_rate": 9.189703977786726e-06, - "loss": 0.2609, - "step": 1109000 - }, - { - "epoch": 11.3, - "learning_rate": 9.184926394086973e-06, - "loss": 0.2266, - "step": 1109100 - }, - { - "epoch": 11.3, - "learning_rate": 9.180149828120105e-06, - "loss": 0.2748, - "step": 1109200 - }, - { - "epoch": 11.3, - "learning_rate": 9.175374280119673e-06, - "loss": 0.1941, - "step": 1109300 - }, - { - "epoch": 11.3, - "learning_rate": 9.170599750319151e-06, - "loss": 0.2377, - "step": 1109400 - }, - { - "epoch": 11.3, - "learning_rate": 9.165826238952e-06, - "loss": 0.233, - "step": 1109500 - }, - { - "epoch": 11.3, - "learning_rate": 9.161053746251613e-06, - "loss": 0.2503, - "step": 1109600 - }, - { - "epoch": 11.31, - "learning_rate": 9.15628227245134e-06, - "loss": 0.2699, - "step": 1109700 - }, - { - "epoch": 11.31, - "learning_rate": 9.151511817784473e-06, - "loss": 0.2711, - "step": 1109800 - }, - { - "epoch": 11.31, - "learning_rate": 9.146742382484268e-06, - "loss": 0.27, - "step": 1109900 - }, - { - "epoch": 11.31, - "learning_rate": 9.14197396678391e-06, - "loss": 0.301, - "step": 1110000 - }, - { - "epoch": 11.31, - "learning_rate": 9.137206570916548e-06, - "loss": 0.2497, - "step": 1110100 - }, - { - "epoch": 11.31, - "learning_rate": 9.132440195115293e-06, - "loss": 0.2357, - "step": 1110200 - }, - { - "epoch": 11.31, - "learning_rate": 9.127674839613172e-06, - "loss": 0.2449, - "step": 1110300 - }, - { - "epoch": 11.31, - "learning_rate": 9.12291050464319e-06, - "loss": 0.2163, - "step": 1110400 - }, - { - "epoch": 11.31, - "learning_rate": 9.11814719043831e-06, - "loss": 0.2634, - "step": 1110500 - }, - { - "epoch": 11.31, - "learning_rate": 9.113384897231402e-06, - "loss": 0.3037, - "step": 1110600 - }, - { - "epoch": 11.32, - "learning_rate": 9.10862362525533e-06, - "loss": 0.2274, - "step": 1110700 - }, - { - "epoch": 11.32, - "learning_rate": 9.103863374742885e-06, - "loss": 0.2484, - "step": 1110800 - }, - { - "epoch": 11.32, - "learning_rate": 9.099104145926826e-06, - "loss": 0.2354, - "step": 1110900 - }, - { - "epoch": 11.32, - "learning_rate": 9.094345939039841e-06, - "loss": 0.28, - "step": 1111000 - }, - { - "epoch": 11.32, - "learning_rate": 9.08958875431458e-06, - "loss": 0.2382, - "step": 1111100 - }, - { - "epoch": 11.32, - "learning_rate": 9.084832591983653e-06, - "loss": 0.2432, - "step": 1111200 - }, - { - "epoch": 11.32, - "learning_rate": 9.08007745227959e-06, - "loss": 0.1926, - "step": 1111300 - }, - { - "epoch": 11.32, - "learning_rate": 9.075323335434894e-06, - "loss": 0.2333, - "step": 1111400 - }, - { - "epoch": 11.32, - "learning_rate": 9.07057024168202e-06, - "loss": 0.2106, - "step": 1111500 - }, - { - "epoch": 11.33, - "learning_rate": 9.065865686891423e-06, - "loss": 0.3099, - "step": 1111600 - }, - { - "epoch": 11.33, - "learning_rate": 9.061114629782614e-06, - "loss": 0.2983, - "step": 1111700 - }, - { - "epoch": 11.33, - "learning_rate": 9.056364596460335e-06, - "loss": 0.2427, - "step": 1111800 - }, - { - "epoch": 11.33, - "learning_rate": 9.051615587156841e-06, - "loss": 0.2421, - "step": 1111900 - }, - { - "epoch": 11.33, - "learning_rate": 9.046867602104342e-06, - "loss": 0.2486, - "step": 1112000 - }, - { - "epoch": 11.33, - "learning_rate": 9.042120641534966e-06, - "loss": 0.2288, - "step": 1112100 - }, - { - "epoch": 11.33, - "learning_rate": 9.03737470568082e-06, - "loss": 0.2508, - "step": 1112200 - }, - { - "epoch": 11.33, - "learning_rate": 9.032629794773953e-06, - "loss": 0.2838, - "step": 1112300 - }, - { - "epoch": 11.33, - "learning_rate": 9.02788590904636e-06, - "loss": 0.3441, - "step": 1112400 - }, - { - "epoch": 11.33, - "learning_rate": 9.023143048729995e-06, - "loss": 0.2789, - "step": 1112500 - }, - { - "epoch": 11.34, - "learning_rate": 9.018401214056757e-06, - "loss": 0.2803, - "step": 1112600 - }, - { - "epoch": 11.34, - "learning_rate": 9.013660405258482e-06, - "loss": 0.1871, - "step": 1112700 - }, - { - "epoch": 11.34, - "learning_rate": 9.00892062256697e-06, - "loss": 0.2903, - "step": 1112800 - }, - { - "epoch": 11.34, - "learning_rate": 9.004181866213984e-06, - "loss": 0.2762, - "step": 1112900 - }, - { - "epoch": 11.34, - "learning_rate": 8.999444136431198e-06, - "loss": 0.2408, - "step": 1113000 - }, - { - "epoch": 11.34, - "learning_rate": 8.994707433450266e-06, - "loss": 0.2688, - "step": 1113100 - }, - { - "epoch": 11.34, - "learning_rate": 8.989971757502802e-06, - "loss": 0.27, - "step": 1113200 - }, - { - "epoch": 11.34, - "learning_rate": 8.985237108820325e-06, - "loss": 0.2623, - "step": 1113300 - }, - { - "epoch": 11.34, - "learning_rate": 8.980503487634342e-06, - "loss": 0.2144, - "step": 1113400 - }, - { - "epoch": 11.34, - "learning_rate": 8.975770894176305e-06, - "loss": 0.3118, - "step": 1113500 - }, - { - "epoch": 11.35, - "learning_rate": 8.971039328677602e-06, - "loss": 0.2141, - "step": 1113600 - }, - { - "epoch": 11.35, - "learning_rate": 8.966308791369584e-06, - "loss": 0.2264, - "step": 1113700 - }, - { - "epoch": 11.35, - "learning_rate": 8.961579282483541e-06, - "loss": 0.3177, - "step": 1113800 - }, - { - "epoch": 11.35, - "learning_rate": 8.956850802250731e-06, - "loss": 0.2223, - "step": 1113900 - }, - { - "epoch": 11.35, - "learning_rate": 8.95212335090233e-06, - "loss": 0.2289, - "step": 1114000 - }, - { - "epoch": 11.35, - "learning_rate": 8.947396928669487e-06, - "loss": 0.2775, - "step": 1114100 - }, - { - "epoch": 11.35, - "learning_rate": 8.942671535783312e-06, - "loss": 0.2603, - "step": 1114200 - }, - { - "epoch": 11.35, - "learning_rate": 8.93794717247482e-06, - "loss": 0.2497, - "step": 1114300 - }, - { - "epoch": 11.35, - "learning_rate": 8.933223838975024e-06, - "loss": 0.2598, - "step": 1114400 - }, - { - "epoch": 11.35, - "learning_rate": 8.928501535514872e-06, - "loss": 0.2459, - "step": 1114500 - }, - { - "epoch": 11.36, - "learning_rate": 8.923780262325238e-06, - "loss": 0.2689, - "step": 1114600 - }, - { - "epoch": 11.36, - "learning_rate": 8.919060019636971e-06, - "loss": 0.2829, - "step": 1114700 - }, - { - "epoch": 11.36, - "learning_rate": 8.914340807680864e-06, - "loss": 0.2499, - "step": 1114800 - }, - { - "epoch": 11.36, - "learning_rate": 8.909622626687659e-06, - "loss": 0.2415, - "step": 1114900 - }, - { - "epoch": 11.36, - "learning_rate": 8.904905476888047e-06, - "loss": 0.2401, - "step": 1115000 - }, - { - "epoch": 11.36, - "learning_rate": 8.90018935851267e-06, - "loss": 0.2214, - "step": 1115100 - }, - { - "epoch": 11.36, - "learning_rate": 8.895474271792123e-06, - "loss": 0.1966, - "step": 1115200 - }, - { - "epoch": 11.36, - "learning_rate": 8.890760216956932e-06, - "loss": 0.2463, - "step": 1115300 - }, - { - "epoch": 11.36, - "learning_rate": 8.886047194237592e-06, - "loss": 0.2686, - "step": 1115400 - }, - { - "epoch": 11.36, - "learning_rate": 8.88133520386455e-06, - "loss": 0.2695, - "step": 1115500 - }, - { - "epoch": 11.37, - "learning_rate": 8.876624246068182e-06, - "loss": 0.2358, - "step": 1115600 - }, - { - "epoch": 11.37, - "learning_rate": 8.871914321078829e-06, - "loss": 0.2154, - "step": 1115700 - }, - { - "epoch": 11.37, - "learning_rate": 8.867205429126785e-06, - "loss": 0.1999, - "step": 1115800 - }, - { - "epoch": 11.37, - "learning_rate": 8.862544643913687e-06, - "loss": 0.2084, - "step": 1115900 - }, - { - "epoch": 11.37, - "learning_rate": 8.857837808390787e-06, - "loss": 0.2649, - "step": 1116000 - }, - { - "epoch": 11.37, - "learning_rate": 8.853132006593451e-06, - "loss": 0.3073, - "step": 1116100 - }, - { - "epoch": 11.37, - "learning_rate": 8.848427238751758e-06, - "loss": 0.2839, - "step": 1116200 - }, - { - "epoch": 11.37, - "learning_rate": 8.843770537312341e-06, - "loss": 0.2202, - "step": 1116300 - }, - { - "epoch": 11.37, - "learning_rate": 8.839067827726704e-06, - "loss": 0.2431, - "step": 1116400 - }, - { - "epoch": 11.38, - "learning_rate": 8.83436615278438e-06, - "loss": 0.2187, - "step": 1116500 - }, - { - "epoch": 11.38, - "learning_rate": 8.829665512715227e-06, - "loss": 0.2599, - "step": 1116600 - }, - { - "epoch": 11.38, - "learning_rate": 8.824965907749096e-06, - "loss": 0.2829, - "step": 1116700 - }, - { - "epoch": 11.38, - "learning_rate": 8.820267338115773e-06, - "loss": 0.2389, - "step": 1116800 - }, - { - "epoch": 11.38, - "learning_rate": 8.815569804044976e-06, - "loss": 0.2879, - "step": 1116900 - }, - { - "epoch": 11.38, - "learning_rate": 8.810873305766392e-06, - "loss": 0.2317, - "step": 1117000 - }, - { - "epoch": 11.38, - "learning_rate": 8.806177843509665e-06, - "loss": 0.2996, - "step": 1117100 - }, - { - "epoch": 11.38, - "learning_rate": 8.80148341750435e-06, - "loss": 0.257, - "step": 1117200 - }, - { - "epoch": 11.38, - "learning_rate": 8.796790027979993e-06, - "loss": 0.3114, - "step": 1117300 - }, - { - "epoch": 11.38, - "learning_rate": 8.79209767516607e-06, - "loss": 0.3328, - "step": 1117400 - }, - { - "epoch": 11.39, - "learning_rate": 8.787406359292008e-06, - "loss": 0.198, - "step": 1117500 - }, - { - "epoch": 11.39, - "learning_rate": 8.782716080587182e-06, - "loss": 0.276, - "step": 1117600 - }, - { - "epoch": 11.39, - "learning_rate": 8.778026839280933e-06, - "loss": 0.2459, - "step": 1117700 - }, - { - "epoch": 11.39, - "learning_rate": 8.773338635602516e-06, - "loss": 0.2008, - "step": 1117800 - }, - { - "epoch": 11.39, - "learning_rate": 8.768651469781165e-06, - "loss": 0.2947, - "step": 1117900 - }, - { - "epoch": 11.39, - "learning_rate": 8.763965342046063e-06, - "loss": 0.3186, - "step": 1118000 - }, - { - "epoch": 11.39, - "learning_rate": 8.75928025262632e-06, - "loss": 0.2611, - "step": 1118100 - }, - { - "epoch": 11.39, - "learning_rate": 8.754596201751013e-06, - "loss": 0.25, - "step": 1118200 - }, - { - "epoch": 11.39, - "learning_rate": 8.749913189649169e-06, - "loss": 0.3154, - "step": 1118300 - }, - { - "epoch": 11.39, - "learning_rate": 8.74523121654976e-06, - "loss": 0.2813, - "step": 1118400 - }, - { - "epoch": 11.4, - "learning_rate": 8.740550282681695e-06, - "loss": 0.2386, - "step": 1118500 - }, - { - "epoch": 11.4, - "learning_rate": 8.735870388273852e-06, - "loss": 0.2897, - "step": 1118600 - }, - { - "epoch": 11.4, - "learning_rate": 8.731191533555051e-06, - "loss": 0.2677, - "step": 1118700 - }, - { - "epoch": 11.4, - "learning_rate": 8.726513718754055e-06, - "loss": 0.2415, - "step": 1118800 - }, - { - "epoch": 11.4, - "learning_rate": 8.721836944099588e-06, - "loss": 0.2088, - "step": 1118900 - }, - { - "epoch": 11.4, - "learning_rate": 8.717161209820319e-06, - "loss": 0.2467, - "step": 1119000 - }, - { - "epoch": 11.4, - "learning_rate": 8.712486516144851e-06, - "loss": 0.2554, - "step": 1119100 - }, - { - "epoch": 11.4, - "learning_rate": 8.707812863301753e-06, - "loss": 0.2665, - "step": 1119200 - }, - { - "epoch": 11.4, - "learning_rate": 8.703140251519553e-06, - "loss": 0.2061, - "step": 1119300 - }, - { - "epoch": 11.4, - "learning_rate": 8.698515391576482e-06, - "loss": 0.2945, - "step": 1119400 - }, - { - "epoch": 11.41, - "learning_rate": 8.69384485218508e-06, - "loss": 0.2303, - "step": 1119500 - }, - { - "epoch": 11.41, - "learning_rate": 8.68917535453751e-06, - "loss": 0.2162, - "step": 1119600 - }, - { - "epoch": 11.41, - "learning_rate": 8.684506898862085e-06, - "loss": 0.24, - "step": 1119700 - }, - { - "epoch": 11.41, - "learning_rate": 8.679839485387082e-06, - "loss": 0.2839, - "step": 1119800 - }, - { - "epoch": 11.41, - "learning_rate": 8.675173114340683e-06, - "loss": 0.2601, - "step": 1119900 - }, - { - "epoch": 11.41, - "learning_rate": 8.670507785951062e-06, - "loss": 0.2738, - "step": 1120000 - }, - { - "epoch": 11.41, - "learning_rate": 8.665843500446319e-06, - "loss": 0.2245, - "step": 1120100 - }, - { - "epoch": 11.41, - "learning_rate": 8.661180258054515e-06, - "loss": 0.2389, - "step": 1120200 - }, - { - "epoch": 11.41, - "learning_rate": 8.656518059003655e-06, - "loss": 0.2317, - "step": 1120300 - }, - { - "epoch": 11.41, - "learning_rate": 8.651856903521697e-06, - "loss": 0.242, - "step": 1120400 - }, - { - "epoch": 11.42, - "learning_rate": 8.64719679183653e-06, - "loss": 0.3016, - "step": 1120500 - }, - { - "epoch": 11.42, - "learning_rate": 8.64253772417601e-06, - "loss": 0.2405, - "step": 1120600 - }, - { - "epoch": 11.42, - "learning_rate": 8.637879700767945e-06, - "loss": 0.2584, - "step": 1120700 - }, - { - "epoch": 11.42, - "learning_rate": 8.633222721840086e-06, - "loss": 0.2246, - "step": 1120800 - }, - { - "epoch": 11.42, - "learning_rate": 8.62856678762012e-06, - "loss": 0.2455, - "step": 1120900 - }, - { - "epoch": 11.42, - "learning_rate": 8.623911898335696e-06, - "loss": 0.2441, - "step": 1121000 - }, - { - "epoch": 11.42, - "learning_rate": 8.619258054214428e-06, - "loss": 0.2857, - "step": 1121100 - }, - { - "epoch": 11.42, - "learning_rate": 8.614605255483833e-06, - "loss": 0.2593, - "step": 1121200 - }, - { - "epoch": 11.42, - "learning_rate": 8.609953502371423e-06, - "loss": 0.2909, - "step": 1121300 - }, - { - "epoch": 11.43, - "learning_rate": 8.605302795104638e-06, - "loss": 0.2012, - "step": 1121400 - }, - { - "epoch": 11.43, - "learning_rate": 8.600653133910868e-06, - "loss": 0.2331, - "step": 1121500 - }, - { - "epoch": 11.43, - "learning_rate": 8.596004519017453e-06, - "loss": 0.2169, - "step": 1121600 - }, - { - "epoch": 11.43, - "learning_rate": 8.591356950651696e-06, - "loss": 0.2651, - "step": 1121700 - }, - { - "epoch": 11.43, - "learning_rate": 8.58675688907474e-06, - "loss": 0.211, - "step": 1121800 - }, - { - "epoch": 11.43, - "learning_rate": 8.582157853642692e-06, - "loss": 0.262, - "step": 1121900 - }, - { - "epoch": 11.43, - "learning_rate": 8.57751340527668e-06, - "loss": 0.1897, - "step": 1122000 - }, - { - "epoch": 11.43, - "learning_rate": 8.57287000434243e-06, - "loss": 0.2229, - "step": 1122100 - }, - { - "epoch": 11.43, - "learning_rate": 8.56822765106696e-06, - "loss": 0.225, - "step": 1122200 - }, - { - "epoch": 11.43, - "learning_rate": 8.563586345677258e-06, - "loss": 0.1998, - "step": 1122300 - }, - { - "epoch": 11.44, - "learning_rate": 8.558946088400266e-06, - "loss": 0.2715, - "step": 1122400 - }, - { - "epoch": 11.44, - "learning_rate": 8.554306879462849e-06, - "loss": 0.221, - "step": 1122500 - }, - { - "epoch": 11.44, - "learning_rate": 8.549668719091842e-06, - "loss": 0.237, - "step": 1122600 - }, - { - "epoch": 11.44, - "learning_rate": 8.545031607514027e-06, - "loss": 0.2734, - "step": 1122700 - }, - { - "epoch": 11.44, - "learning_rate": 8.540395544956131e-06, - "loss": 0.2586, - "step": 1122800 - }, - { - "epoch": 11.44, - "learning_rate": 8.535760531644826e-06, - "loss": 0.2386, - "step": 1122900 - }, - { - "epoch": 11.44, - "learning_rate": 8.531126567806752e-06, - "loss": 0.2444, - "step": 1123000 - }, - { - "epoch": 11.44, - "learning_rate": 8.526493653668457e-06, - "loss": 0.3106, - "step": 1123100 - }, - { - "epoch": 11.44, - "learning_rate": 8.521861789456478e-06, - "loss": 0.2626, - "step": 1123200 - }, - { - "epoch": 11.44, - "learning_rate": 8.51723097539729e-06, - "loss": 0.2389, - "step": 1123300 - }, - { - "epoch": 11.45, - "learning_rate": 8.512601211717296e-06, - "loss": 0.2402, - "step": 1123400 - }, - { - "epoch": 11.45, - "learning_rate": 8.50797249864287e-06, - "loss": 0.232, - "step": 1123500 - }, - { - "epoch": 11.45, - "learning_rate": 8.503344836400338e-06, - "loss": 0.256, - "step": 1123600 - }, - { - "epoch": 11.45, - "learning_rate": 8.498718225215962e-06, - "loss": 0.2544, - "step": 1123700 - }, - { - "epoch": 11.45, - "learning_rate": 8.49409266531594e-06, - "loss": 0.2962, - "step": 1123800 - }, - { - "epoch": 11.45, - "learning_rate": 8.489468156926453e-06, - "loss": 0.2529, - "step": 1123900 - }, - { - "epoch": 11.45, - "learning_rate": 8.484844700273599e-06, - "loss": 0.1944, - "step": 1124000 - }, - { - "epoch": 11.45, - "learning_rate": 8.480222295583449e-06, - "loss": 0.2254, - "step": 1124100 - }, - { - "epoch": 11.45, - "learning_rate": 8.475600943082003e-06, - "loss": 0.2691, - "step": 1124200 - }, - { - "epoch": 11.45, - "learning_rate": 8.470980642995227e-06, - "loss": 0.2943, - "step": 1124300 - }, - { - "epoch": 11.46, - "learning_rate": 8.466407582812158e-06, - "loss": 0.276, - "step": 1124400 - }, - { - "epoch": 11.46, - "learning_rate": 8.461789377702592e-06, - "loss": 0.2427, - "step": 1124500 - }, - { - "epoch": 11.46, - "learning_rate": 8.457172225682983e-06, - "loss": 0.2672, - "step": 1124600 - }, - { - "epoch": 11.46, - "learning_rate": 8.45255612697909e-06, - "loss": 0.2455, - "step": 1124700 - }, - { - "epoch": 11.46, - "learning_rate": 8.447941081816623e-06, - "loss": 0.2612, - "step": 1124800 - }, - { - "epoch": 11.46, - "learning_rate": 8.443327090421214e-06, - "loss": 0.2799, - "step": 1124900 - }, - { - "epoch": 11.46, - "learning_rate": 8.438714153018465e-06, - "loss": 0.2344, - "step": 1125000 - }, - { - "epoch": 11.46, - "learning_rate": 8.434102269833923e-06, - "loss": 0.2661, - "step": 1125100 - }, - { - "epoch": 11.46, - "learning_rate": 8.429491441093081e-06, - "loss": 0.2576, - "step": 1125200 - }, - { - "epoch": 11.46, - "learning_rate": 8.424881667021383e-06, - "loss": 0.2211, - "step": 1125300 - }, - { - "epoch": 11.47, - "learning_rate": 8.420272947844228e-06, - "loss": 0.2409, - "step": 1125400 - }, - { - "epoch": 11.47, - "learning_rate": 8.415665283786933e-06, - "loss": 0.2915, - "step": 1125500 - }, - { - "epoch": 11.47, - "learning_rate": 8.411058675074798e-06, - "loss": 0.2499, - "step": 1125600 - }, - { - "epoch": 11.47, - "learning_rate": 8.406453121933069e-06, - "loss": 0.2381, - "step": 1125700 - }, - { - "epoch": 11.47, - "learning_rate": 8.401848624586906e-06, - "loss": 0.2692, - "step": 1125800 - }, - { - "epoch": 11.47, - "learning_rate": 8.397245183261458e-06, - "loss": 0.201, - "step": 1125900 - }, - { - "epoch": 11.47, - "learning_rate": 8.392642798181797e-06, - "loss": 0.2577, - "step": 1126000 - }, - { - "epoch": 11.47, - "learning_rate": 8.38804146957297e-06, - "loss": 0.2479, - "step": 1126100 - }, - { - "epoch": 11.47, - "learning_rate": 8.383441197659933e-06, - "loss": 0.2651, - "step": 1126200 - }, - { - "epoch": 11.47, - "learning_rate": 8.37884198266762e-06, - "loss": 0.2586, - "step": 1126300 - }, - { - "epoch": 11.48, - "learning_rate": 8.374243824820904e-06, - "loss": 0.2045, - "step": 1126400 - }, - { - "epoch": 11.48, - "learning_rate": 8.369646724344612e-06, - "loss": 0.2677, - "step": 1126500 - }, - { - "epoch": 11.48, - "learning_rate": 8.36505068146351e-06, - "loss": 0.2193, - "step": 1126600 - }, - { - "epoch": 11.48, - "learning_rate": 8.360455696402328e-06, - "loss": 0.241, - "step": 1126700 - }, - { - "epoch": 11.48, - "learning_rate": 8.355861769385714e-06, - "loss": 0.2096, - "step": 1126800 - }, - { - "epoch": 11.48, - "learning_rate": 8.3512689006383e-06, - "loss": 0.2663, - "step": 1126900 - }, - { - "epoch": 11.48, - "learning_rate": 8.346677090384648e-06, - "loss": 0.215, - "step": 1127000 - }, - { - "epoch": 11.48, - "learning_rate": 8.342086338849257e-06, - "loss": 0.1918, - "step": 1127100 - }, - { - "epoch": 11.48, - "learning_rate": 8.337496646256594e-06, - "loss": 0.2514, - "step": 1127200 - }, - { - "epoch": 11.49, - "learning_rate": 8.332908012831083e-06, - "loss": 0.2513, - "step": 1127300 - }, - { - "epoch": 11.49, - "learning_rate": 8.328320438797059e-06, - "loss": 0.2373, - "step": 1127400 - }, - { - "epoch": 11.49, - "learning_rate": 8.323733924378834e-06, - "loss": 0.264, - "step": 1127500 - }, - { - "epoch": 11.49, - "learning_rate": 8.319148469800664e-06, - "loss": 0.217, - "step": 1127600 - }, - { - "epoch": 11.49, - "learning_rate": 8.31456407528675e-06, - "loss": 0.2766, - "step": 1127700 - }, - { - "epoch": 11.49, - "learning_rate": 8.309980741061237e-06, - "loss": 0.3015, - "step": 1127800 - }, - { - "epoch": 11.49, - "learning_rate": 8.30539846734823e-06, - "loss": 0.2212, - "step": 1127900 - }, - { - "epoch": 11.49, - "learning_rate": 8.300817254371778e-06, - "loss": 0.2584, - "step": 1128000 - }, - { - "epoch": 11.49, - "learning_rate": 8.296237102355861e-06, - "loss": 0.2558, - "step": 1128100 - }, - { - "epoch": 11.49, - "learning_rate": 8.291658011524428e-06, - "loss": 0.2327, - "step": 1128200 - }, - { - "epoch": 11.5, - "learning_rate": 8.287079982101378e-06, - "loss": 0.2482, - "step": 1128300 - }, - { - "epoch": 11.5, - "learning_rate": 8.282503014310536e-06, - "loss": 0.2373, - "step": 1128400 - }, - { - "epoch": 11.5, - "learning_rate": 8.277927108375688e-06, - "loss": 0.2513, - "step": 1128500 - }, - { - "epoch": 11.5, - "learning_rate": 8.273398007701102e-06, - "loss": 0.2712, - "step": 1128600 - }, - { - "epoch": 11.5, - "learning_rate": 8.268824215525265e-06, - "loss": 0.2207, - "step": 1128700 - }, - { - "epoch": 11.5, - "learning_rate": 8.264297207910522e-06, - "loss": 0.2049, - "step": 1128800 - }, - { - "epoch": 11.5, - "learning_rate": 8.259725530379295e-06, - "loss": 0.2752, - "step": 1128900 - }, - { - "epoch": 11.5, - "learning_rate": 8.255154915817761e-06, - "loss": 0.2713, - "step": 1129000 - }, - { - "epoch": 11.5, - "learning_rate": 8.250585364449367e-06, - "loss": 0.3167, - "step": 1129100 - }, - { - "epoch": 11.5, - "learning_rate": 8.246016876497563e-06, - "loss": 0.3313, - "step": 1129200 - }, - { - "epoch": 11.51, - "learning_rate": 8.241449452185721e-06, - "loss": 0.1974, - "step": 1129300 - }, - { - "epoch": 11.51, - "learning_rate": 8.236883091737145e-06, - "loss": 0.2334, - "step": 1129400 - }, - { - "epoch": 11.51, - "learning_rate": 8.232317795375112e-06, - "loss": 0.2275, - "step": 1129500 - }, - { - "epoch": 11.51, - "learning_rate": 8.227753563322842e-06, - "loss": 0.2314, - "step": 1129600 - }, - { - "epoch": 11.51, - "learning_rate": 8.223190395803484e-06, - "loss": 0.218, - "step": 1129700 - }, - { - "epoch": 11.51, - "learning_rate": 8.218628293040158e-06, - "loss": 0.2637, - "step": 1129800 - }, - { - "epoch": 11.51, - "learning_rate": 8.214067255255936e-06, - "loss": 0.226, - "step": 1129900 - }, - { - "epoch": 11.51, - "learning_rate": 8.209507282673803e-06, - "loss": 0.3224, - "step": 1130000 - }, - { - "epoch": 11.51, - "learning_rate": 8.204948375516724e-06, - "loss": 0.2724, - "step": 1130100 - }, - { - "epoch": 11.51, - "learning_rate": 8.200390534007613e-06, - "loss": 0.2374, - "step": 1130200 - }, - { - "epoch": 11.52, - "learning_rate": 8.195833758369291e-06, - "loss": 0.2553, - "step": 1130300 - }, - { - "epoch": 11.52, - "learning_rate": 8.191278048824592e-06, - "loss": 0.2486, - "step": 1130400 - }, - { - "epoch": 11.52, - "learning_rate": 8.186723405596257e-06, - "loss": 0.2825, - "step": 1130500 - }, - { - "epoch": 11.52, - "learning_rate": 8.182169828906958e-06, - "loss": 0.2323, - "step": 1130600 - }, - { - "epoch": 11.52, - "learning_rate": 8.17761731897936e-06, - "loss": 0.2573, - "step": 1130700 - }, - { - "epoch": 11.52, - "learning_rate": 8.173065876036042e-06, - "loss": 0.2436, - "step": 1130800 - }, - { - "epoch": 11.52, - "learning_rate": 8.168515500299554e-06, - "loss": 0.2826, - "step": 1130900 - }, - { - "epoch": 11.52, - "learning_rate": 8.163966191992368e-06, - "loss": 0.2274, - "step": 1131000 - }, - { - "epoch": 11.52, - "learning_rate": 8.159417951336923e-06, - "loss": 0.2045, - "step": 1131100 - }, - { - "epoch": 11.52, - "learning_rate": 8.154870778555614e-06, - "loss": 0.2427, - "step": 1131200 - }, - { - "epoch": 11.53, - "learning_rate": 8.150324673870747e-06, - "loss": 0.2535, - "step": 1131300 - }, - { - "epoch": 11.53, - "learning_rate": 8.145779637504611e-06, - "loss": 0.2387, - "step": 1131400 - }, - { - "epoch": 11.53, - "learning_rate": 8.141235669679444e-06, - "loss": 0.2663, - "step": 1131500 - }, - { - "epoch": 11.53, - "learning_rate": 8.136692770617385e-06, - "loss": 0.2248, - "step": 1131600 - }, - { - "epoch": 11.53, - "learning_rate": 8.132150940540587e-06, - "loss": 0.2392, - "step": 1131700 - }, - { - "epoch": 11.53, - "learning_rate": 8.127610179671115e-06, - "loss": 0.2513, - "step": 1131800 - }, - { - "epoch": 11.53, - "learning_rate": 8.12307048823097e-06, - "loss": 0.2371, - "step": 1131900 - }, - { - "epoch": 11.53, - "learning_rate": 8.118531866442125e-06, - "loss": 0.2454, - "step": 1132000 - }, - { - "epoch": 11.53, - "learning_rate": 8.113994314526497e-06, - "loss": 0.2625, - "step": 1132100 - }, - { - "epoch": 11.54, - "learning_rate": 8.10945783270593e-06, - "loss": 0.2332, - "step": 1132200 - }, - { - "epoch": 11.54, - "learning_rate": 8.104922421202238e-06, - "loss": 0.2195, - "step": 1132300 - }, - { - "epoch": 11.54, - "learning_rate": 8.100388080237176e-06, - "loss": 0.2015, - "step": 1132400 - }, - { - "epoch": 11.54, - "learning_rate": 8.095854810032457e-06, - "loss": 0.2598, - "step": 1132500 - }, - { - "epoch": 11.54, - "learning_rate": 8.091322610809712e-06, - "loss": 0.1707, - "step": 1132600 - }, - { - "epoch": 11.54, - "learning_rate": 8.086791482790547e-06, - "loss": 0.2585, - "step": 1132700 - }, - { - "epoch": 11.54, - "learning_rate": 8.082261426196513e-06, - "loss": 0.2203, - "step": 1132800 - }, - { - "epoch": 11.54, - "learning_rate": 8.077732441249084e-06, - "loss": 0.3006, - "step": 1132900 - }, - { - "epoch": 11.54, - "learning_rate": 8.07320452816972e-06, - "loss": 0.2512, - "step": 1133000 - }, - { - "epoch": 11.54, - "learning_rate": 8.06867768717981e-06, - "loss": 0.1911, - "step": 1133100 - }, - { - "epoch": 11.55, - "learning_rate": 8.064151918500677e-06, - "loss": 0.2379, - "step": 1133200 - }, - { - "epoch": 11.55, - "learning_rate": 8.059627222353605e-06, - "loss": 0.263, - "step": 1133300 - }, - { - "epoch": 11.55, - "learning_rate": 8.05510359895984e-06, - "loss": 0.2785, - "step": 1133400 - }, - { - "epoch": 11.55, - "learning_rate": 8.050581048540538e-06, - "loss": 0.2055, - "step": 1133500 - }, - { - "epoch": 11.55, - "learning_rate": 8.046059571316837e-06, - "loss": 0.2352, - "step": 1133600 - }, - { - "epoch": 11.55, - "learning_rate": 8.041539167509816e-06, - "loss": 0.2152, - "step": 1133700 - }, - { - "epoch": 11.55, - "learning_rate": 8.037019837340481e-06, - "loss": 0.2434, - "step": 1133800 - }, - { - "epoch": 11.55, - "learning_rate": 8.032501581029805e-06, - "loss": 0.2359, - "step": 1133900 - }, - { - "epoch": 11.55, - "learning_rate": 8.027984398798719e-06, - "loss": 0.226, - "step": 1134000 - }, - { - "epoch": 11.55, - "learning_rate": 8.023513446628853e-06, - "loss": 0.2821, - "step": 1134100 - }, - { - "epoch": 11.56, - "learning_rate": 8.018998402473143e-06, - "loss": 0.2621, - "step": 1134200 - }, - { - "epoch": 11.56, - "learning_rate": 8.014484433057235e-06, - "loss": 0.2528, - "step": 1134300 - }, - { - "epoch": 11.56, - "learning_rate": 8.009971538601836e-06, - "loss": 0.2007, - "step": 1134400 - }, - { - "epoch": 11.56, - "learning_rate": 8.00545971932761e-06, - "loss": 0.3308, - "step": 1134500 - }, - { - "epoch": 11.56, - "learning_rate": 8.000948975455136e-06, - "loss": 0.2429, - "step": 1134600 - }, - { - "epoch": 11.56, - "learning_rate": 7.996439307204972e-06, - "loss": 0.2068, - "step": 1134700 - }, - { - "epoch": 11.56, - "learning_rate": 7.99193071479762e-06, - "loss": 0.2598, - "step": 1134800 - }, - { - "epoch": 11.56, - "learning_rate": 7.987423198453525e-06, - "loss": 0.2464, - "step": 1134900 - }, - { - "epoch": 11.56, - "learning_rate": 7.982916758393063e-06, - "loss": 0.2525, - "step": 1135000 - }, - { - "epoch": 11.56, - "learning_rate": 7.978411394836582e-06, - "loss": 0.1883, - "step": 1135100 - }, - { - "epoch": 11.57, - "learning_rate": 7.973907108004372e-06, - "loss": 0.2542, - "step": 1135200 - }, - { - "epoch": 11.57, - "learning_rate": 7.96940389811665e-06, - "loss": 0.23, - "step": 1135300 - }, - { - "epoch": 11.57, - "learning_rate": 7.96490176539361e-06, - "loss": 0.2751, - "step": 1135400 - }, - { - "epoch": 11.57, - "learning_rate": 7.960400710055369e-06, - "loss": 0.2053, - "step": 1135500 - }, - { - "epoch": 11.57, - "learning_rate": 7.955900732322012e-06, - "loss": 0.2265, - "step": 1135600 - }, - { - "epoch": 11.57, - "learning_rate": 7.951401832413558e-06, - "loss": 0.2861, - "step": 1135700 - }, - { - "epoch": 11.57, - "learning_rate": 7.946904010549983e-06, - "loss": 0.3055, - "step": 1135800 - }, - { - "epoch": 11.57, - "learning_rate": 7.942407266951184e-06, - "loss": 0.2429, - "step": 1135900 - }, - { - "epoch": 11.57, - "learning_rate": 7.937911601837039e-06, - "loss": 0.2211, - "step": 1136000 - }, - { - "epoch": 11.57, - "learning_rate": 7.933417015427366e-06, - "loss": 0.2417, - "step": 1136100 - }, - { - "epoch": 11.58, - "learning_rate": 7.928923507941905e-06, - "loss": 0.2095, - "step": 1136200 - }, - { - "epoch": 11.58, - "learning_rate": 7.924431079600373e-06, - "loss": 0.2881, - "step": 1136300 - }, - { - "epoch": 11.58, - "learning_rate": 7.919939730622428e-06, - "loss": 0.2396, - "step": 1136400 - }, - { - "epoch": 11.58, - "learning_rate": 7.915449461227657e-06, - "loss": 0.2454, - "step": 1136500 - }, - { - "epoch": 11.58, - "learning_rate": 7.91096027163561e-06, - "loss": 0.2346, - "step": 1136600 - }, - { - "epoch": 11.58, - "learning_rate": 7.906472162065788e-06, - "loss": 0.2291, - "step": 1136700 - }, - { - "epoch": 11.58, - "learning_rate": 7.901985132737627e-06, - "loss": 0.2471, - "step": 1136800 - }, - { - "epoch": 11.58, - "learning_rate": 7.897499183870523e-06, - "loss": 0.2177, - "step": 1136900 - }, - { - "epoch": 11.58, - "learning_rate": 7.893014315683805e-06, - "loss": 0.263, - "step": 1137000 - }, - { - "epoch": 11.58, - "learning_rate": 7.888530528396769e-06, - "loss": 0.239, - "step": 1137100 - }, - { - "epoch": 11.59, - "learning_rate": 7.884047822228629e-06, - "loss": 0.2319, - "step": 1137200 - }, - { - "epoch": 11.59, - "learning_rate": 7.879566197398564e-06, - "loss": 0.2267, - "step": 1137300 - }, - { - "epoch": 11.59, - "learning_rate": 7.875130454204015e-06, - "loss": 0.3413, - "step": 1137400 - }, - { - "epoch": 11.59, - "learning_rate": 7.870650981888587e-06, - "loss": 0.2029, - "step": 1137500 - }, - { - "epoch": 11.59, - "learning_rate": 7.866172591566278e-06, - "loss": 0.2635, - "step": 1137600 - }, - { - "epoch": 11.59, - "learning_rate": 7.86169528345603e-06, - "loss": 0.2751, - "step": 1137700 - }, - { - "epoch": 11.59, - "learning_rate": 7.857219057776768e-06, - "loss": 0.2769, - "step": 1137800 - }, - { - "epoch": 11.59, - "learning_rate": 7.852743914747361e-06, - "loss": 0.175, - "step": 1137900 - }, - { - "epoch": 11.59, - "learning_rate": 7.848269854586602e-06, - "loss": 0.2576, - "step": 1138000 - }, - { - "epoch": 11.6, - "learning_rate": 7.843796877513252e-06, - "loss": 0.2695, - "step": 1138100 - }, - { - "epoch": 11.6, - "learning_rate": 7.83932498374601e-06, - "loss": 0.2203, - "step": 1138200 - }, - { - "epoch": 11.6, - "learning_rate": 7.834854173503531e-06, - "loss": 0.2768, - "step": 1138300 - }, - { - "epoch": 11.6, - "learning_rate": 7.830384447004408e-06, - "loss": 0.2128, - "step": 1138400 - }, - { - "epoch": 11.6, - "learning_rate": 7.825915804467195e-06, - "loss": 0.2594, - "step": 1138500 - }, - { - "epoch": 11.6, - "learning_rate": 7.821448246110358e-06, - "loss": 0.2573, - "step": 1138600 - }, - { - "epoch": 11.6, - "learning_rate": 7.816981772152351e-06, - "loss": 0.2274, - "step": 1138700 - }, - { - "epoch": 11.6, - "learning_rate": 7.81251638281156e-06, - "loss": 0.234, - "step": 1138800 - }, - { - "epoch": 11.6, - "learning_rate": 7.808052078306303e-06, - "loss": 0.2165, - "step": 1138900 - }, - { - "epoch": 11.6, - "learning_rate": 7.803588858854864e-06, - "loss": 0.261, - "step": 1139000 - }, - { - "epoch": 11.61, - "learning_rate": 7.799126724675471e-06, - "loss": 0.2577, - "step": 1139100 - }, - { - "epoch": 11.61, - "learning_rate": 7.794665675986298e-06, - "loss": 0.2092, - "step": 1139200 - }, - { - "epoch": 11.61, - "learning_rate": 7.790205713005451e-06, - "loss": 0.2325, - "step": 1139300 - }, - { - "epoch": 11.61, - "learning_rate": 7.785746835951005e-06, - "loss": 0.2148, - "step": 1139400 - }, - { - "epoch": 11.61, - "learning_rate": 7.781289045040971e-06, - "loss": 0.248, - "step": 1139500 - }, - { - "epoch": 11.61, - "learning_rate": 7.776832340493308e-06, - "loss": 0.2333, - "step": 1139600 - }, - { - "epoch": 11.61, - "learning_rate": 7.77237672252592e-06, - "loss": 0.2843, - "step": 1139700 - }, - { - "epoch": 11.61, - "learning_rate": 7.767966731287999e-06, - "loss": 0.2384, - "step": 1139800 - }, - { - "epoch": 11.61, - "learning_rate": 7.763513276263436e-06, - "loss": 0.2442, - "step": 1139900 - }, - { - "epoch": 11.61, - "learning_rate": 7.759060908470385e-06, - "loss": 0.2434, - "step": 1140000 - }, - { - "epoch": 11.62, - "learning_rate": 7.75460962812652e-06, - "loss": 0.2053, - "step": 1140100 - }, - { - "epoch": 11.62, - "learning_rate": 7.75015943544949e-06, - "loss": 0.2155, - "step": 1140200 - }, - { - "epoch": 11.62, - "learning_rate": 7.74571033065689e-06, - "loss": 0.2313, - "step": 1140300 - }, - { - "epoch": 11.62, - "learning_rate": 7.741262313966243e-06, - "loss": 0.1823, - "step": 1140400 - }, - { - "epoch": 11.62, - "learning_rate": 7.73681538559503e-06, - "loss": 0.2079, - "step": 1140500 - }, - { - "epoch": 11.62, - "learning_rate": 7.73236954576069e-06, - "loss": 0.2357, - "step": 1140600 - }, - { - "epoch": 11.62, - "learning_rate": 7.72792479468059e-06, - "loss": 0.2409, - "step": 1140700 - }, - { - "epoch": 11.62, - "learning_rate": 7.723481132572059e-06, - "loss": 0.2401, - "step": 1140800 - }, - { - "epoch": 11.62, - "learning_rate": 7.719038559652367e-06, - "loss": 0.2646, - "step": 1140900 - }, - { - "epoch": 11.62, - "learning_rate": 7.714641485580574e-06, - "loss": 0.2994, - "step": 1141000 - }, - { - "epoch": 11.63, - "learning_rate": 7.710201080792834e-06, - "loss": 0.2026, - "step": 1141100 - }, - { - "epoch": 11.63, - "learning_rate": 7.705761765843235e-06, - "loss": 0.2892, - "step": 1141200 - }, - { - "epoch": 11.63, - "learning_rate": 7.70132354094884e-06, - "loss": 0.254, - "step": 1141300 - }, - { - "epoch": 11.63, - "learning_rate": 7.69688640632666e-06, - "loss": 0.1882, - "step": 1141400 - }, - { - "epoch": 11.63, - "learning_rate": 7.69245036219363e-06, - "loss": 0.2452, - "step": 1141500 - }, - { - "epoch": 11.63, - "learning_rate": 7.688015408766653e-06, - "loss": 0.2627, - "step": 1141600 - }, - { - "epoch": 11.63, - "learning_rate": 7.68358154626258e-06, - "loss": 0.2267, - "step": 1141700 - }, - { - "epoch": 11.63, - "learning_rate": 7.679148774898182e-06, - "loss": 0.2547, - "step": 1141800 - }, - { - "epoch": 11.63, - "learning_rate": 7.674717094890202e-06, - "loss": 0.239, - "step": 1141900 - }, - { - "epoch": 11.63, - "learning_rate": 7.670286506455331e-06, - "loss": 0.2257, - "step": 1142000 - }, - { - "epoch": 11.64, - "learning_rate": 7.66585700981019e-06, - "loss": 0.2617, - "step": 1142100 - }, - { - "epoch": 11.64, - "learning_rate": 7.66142860517136e-06, - "loss": 0.2619, - "step": 1142200 - }, - { - "epoch": 11.64, - "learning_rate": 7.657001292755362e-06, - "loss": 0.2801, - "step": 1142300 - }, - { - "epoch": 11.64, - "learning_rate": 7.65257507277867e-06, - "loss": 0.2248, - "step": 1142400 - }, - { - "epoch": 11.64, - "learning_rate": 7.64814994545769e-06, - "loss": 0.2726, - "step": 1142500 - }, - { - "epoch": 11.64, - "learning_rate": 7.643725911008788e-06, - "loss": 0.2284, - "step": 1142600 - }, - { - "epoch": 11.64, - "learning_rate": 7.63930296964828e-06, - "loss": 0.226, - "step": 1142700 - }, - { - "epoch": 11.64, - "learning_rate": 7.634881121592408e-06, - "loss": 0.2877, - "step": 1142800 - }, - { - "epoch": 11.64, - "learning_rate": 7.630460367057382e-06, - "loss": 0.2866, - "step": 1142900 - }, - { - "epoch": 11.65, - "learning_rate": 7.626040706259356e-06, - "loss": 0.3113, - "step": 1143000 - }, - { - "epoch": 11.65, - "learning_rate": 7.621622139414409e-06, - "loss": 0.2346, - "step": 1143100 - }, - { - "epoch": 11.65, - "learning_rate": 7.617204666738594e-06, - "loss": 0.2649, - "step": 1143200 - }, - { - "epoch": 11.65, - "learning_rate": 7.612788288447898e-06, - "loss": 0.2137, - "step": 1143300 - }, - { - "epoch": 11.65, - "learning_rate": 7.608373004758255e-06, - "loss": 0.2532, - "step": 1143400 - }, - { - "epoch": 11.65, - "learning_rate": 7.6039588158855464e-06, - "loss": 0.2321, - "step": 1143500 - }, - { - "epoch": 11.65, - "learning_rate": 7.599545722045605e-06, - "loss": 0.2109, - "step": 1143600 - }, - { - "epoch": 11.65, - "learning_rate": 7.595133723454193e-06, - "loss": 0.2093, - "step": 1143700 - }, - { - "epoch": 11.65, - "learning_rate": 7.590722820327033e-06, - "loss": 0.2685, - "step": 1143800 - }, - { - "epoch": 11.65, - "learning_rate": 7.5863130128797994e-06, - "loss": 0.2812, - "step": 1143900 - }, - { - "epoch": 11.66, - "learning_rate": 7.581904301328107e-06, - "loss": 0.2595, - "step": 1144000 - }, - { - "epoch": 11.66, - "learning_rate": 7.577496685887504e-06, - "loss": 0.2707, - "step": 1144100 - }, - { - "epoch": 11.66, - "learning_rate": 7.573090166773503e-06, - "loss": 0.2091, - "step": 1144200 - }, - { - "epoch": 11.66, - "learning_rate": 7.568684744201563e-06, - "loss": 0.219, - "step": 1144300 - }, - { - "epoch": 11.66, - "learning_rate": 7.5642804183870674e-06, - "loss": 0.2688, - "step": 1144400 - }, - { - "epoch": 11.66, - "learning_rate": 7.559877189545371e-06, - "loss": 0.2118, - "step": 1144500 - }, - { - "epoch": 11.66, - "learning_rate": 7.555475057891762e-06, - "loss": 0.2481, - "step": 1144600 - }, - { - "epoch": 11.66, - "learning_rate": 7.551074023641484e-06, - "loss": 0.1824, - "step": 1144700 - }, - { - "epoch": 11.66, - "learning_rate": 7.546674087009716e-06, - "loss": 0.2284, - "step": 1144800 - }, - { - "epoch": 11.66, - "learning_rate": 7.542275248211599e-06, - "loss": 0.1952, - "step": 1144900 - }, - { - "epoch": 11.67, - "learning_rate": 7.537877507462193e-06, - "loss": 0.2555, - "step": 1145000 - }, - { - "epoch": 11.67, - "learning_rate": 7.533480864976531e-06, - "loss": 0.2245, - "step": 1145100 - }, - { - "epoch": 11.67, - "learning_rate": 7.52908532096959e-06, - "loss": 0.2448, - "step": 1145200 - }, - { - "epoch": 11.67, - "learning_rate": 7.524690875656271e-06, - "loss": 0.2394, - "step": 1145300 - }, - { - "epoch": 11.67, - "learning_rate": 7.520297529251437e-06, - "loss": 0.2534, - "step": 1145400 - }, - { - "epoch": 11.67, - "learning_rate": 7.515949199001361e-06, - "loss": 0.2384, - "step": 1145500 - }, - { - "epoch": 11.67, - "learning_rate": 7.511558040063441e-06, - "loss": 0.1979, - "step": 1145600 - }, - { - "epoch": 11.67, - "learning_rate": 7.507167980676125e-06, - "loss": 0.2418, - "step": 1145700 - }, - { - "epoch": 11.67, - "learning_rate": 7.502779021054068e-06, - "loss": 0.2243, - "step": 1145800 - }, - { - "epoch": 11.67, - "learning_rate": 7.498391161411858e-06, - "loss": 0.2261, - "step": 1145900 - }, - { - "epoch": 11.68, - "learning_rate": 7.494004401964041e-06, - "loss": 0.3014, - "step": 1146000 - }, - { - "epoch": 11.68, - "learning_rate": 7.489618742925107e-06, - "loss": 0.2252, - "step": 1146100 - }, - { - "epoch": 11.68, - "learning_rate": 7.485234184509469e-06, - "loss": 0.2785, - "step": 1146200 - }, - { - "epoch": 11.68, - "learning_rate": 7.480850726931523e-06, - "loss": 0.2664, - "step": 1146300 - }, - { - "epoch": 11.68, - "learning_rate": 7.476468370405599e-06, - "loss": 0.2599, - "step": 1146400 - }, - { - "epoch": 11.68, - "learning_rate": 7.472087115145947e-06, - "loss": 0.273, - "step": 1146500 - }, - { - "epoch": 11.68, - "learning_rate": 7.467706961366795e-06, - "loss": 0.251, - "step": 1146600 - }, - { - "epoch": 11.68, - "learning_rate": 7.463327909282315e-06, - "loss": 0.2868, - "step": 1146700 - }, - { - "epoch": 11.68, - "learning_rate": 7.458949959106602e-06, - "loss": 0.267, - "step": 1146800 - }, - { - "epoch": 11.68, - "learning_rate": 7.4545731110537164e-06, - "loss": 0.259, - "step": 1146900 - }, - { - "epoch": 11.69, - "learning_rate": 7.450197365337662e-06, - "loss": 0.3567, - "step": 1147000 - }, - { - "epoch": 11.69, - "learning_rate": 7.445822722172388e-06, - "loss": 0.2657, - "step": 1147100 - }, - { - "epoch": 11.69, - "learning_rate": 7.441449181771784e-06, - "loss": 0.2561, - "step": 1147200 - }, - { - "epoch": 11.69, - "learning_rate": 7.4370767443497025e-06, - "loss": 0.2243, - "step": 1147300 - }, - { - "epoch": 11.69, - "learning_rate": 7.432705410119911e-06, - "loss": 0.2659, - "step": 1147400 - }, - { - "epoch": 11.69, - "learning_rate": 7.428335179296149e-06, - "loss": 0.2777, - "step": 1147500 - }, - { - "epoch": 11.69, - "learning_rate": 7.423966052092107e-06, - "loss": 0.2606, - "step": 1147600 - }, - { - "epoch": 11.69, - "learning_rate": 7.419598028721389e-06, - "loss": 0.2583, - "step": 1147700 - }, - { - "epoch": 11.69, - "learning_rate": 7.415231109397576e-06, - "loss": 0.2056, - "step": 1147800 - }, - { - "epoch": 11.7, - "learning_rate": 7.410908947018029e-06, - "loss": 0.2228, - "step": 1147900 - }, - { - "epoch": 11.7, - "learning_rate": 7.40654422538272e-06, - "loss": 0.2702, - "step": 1148000 - }, - { - "epoch": 11.7, - "learning_rate": 7.402180608432577e-06, - "loss": 0.2582, - "step": 1148100 - }, - { - "epoch": 11.7, - "learning_rate": 7.397818096380937e-06, - "loss": 0.2254, - "step": 1148200 - }, - { - "epoch": 11.7, - "learning_rate": 7.39345668944111e-06, - "loss": 0.2605, - "step": 1148300 - }, - { - "epoch": 11.7, - "learning_rate": 7.389096387826343e-06, - "loss": 0.2714, - "step": 1148400 - }, - { - "epoch": 11.7, - "learning_rate": 7.384737191749827e-06, - "loss": 0.2545, - "step": 1148500 - }, - { - "epoch": 11.7, - "learning_rate": 7.380379101424699e-06, - "loss": 0.2483, - "step": 1148600 - }, - { - "epoch": 11.7, - "learning_rate": 7.376022117064051e-06, - "loss": 0.271, - "step": 1148700 - }, - { - "epoch": 11.7, - "learning_rate": 7.371666238880912e-06, - "loss": 0.2313, - "step": 1148800 - }, - { - "epoch": 11.71, - "learning_rate": 7.367311467088249e-06, - "loss": 0.2377, - "step": 1148900 - }, - { - "epoch": 11.71, - "learning_rate": 7.362957801898992e-06, - "loss": 0.2541, - "step": 1149000 - }, - { - "epoch": 11.71, - "learning_rate": 7.358605243526013e-06, - "loss": 0.2491, - "step": 1149100 - }, - { - "epoch": 11.71, - "learning_rate": 7.354253792182113e-06, - "loss": 0.284, - "step": 1149200 - }, - { - "epoch": 11.71, - "learning_rate": 7.349903448080061e-06, - "loss": 0.2598, - "step": 1149300 - }, - { - "epoch": 11.71, - "learning_rate": 7.34555421143257e-06, - "loss": 0.2446, - "step": 1149400 - }, - { - "epoch": 11.71, - "learning_rate": 7.341206082452274e-06, - "loss": 0.2346, - "step": 1149500 - }, - { - "epoch": 11.71, - "learning_rate": 7.3368590613517826e-06, - "loss": 0.2387, - "step": 1149600 - }, - { - "epoch": 11.71, - "learning_rate": 7.332513148343632e-06, - "loss": 0.2272, - "step": 1149700 - }, - { - "epoch": 11.71, - "learning_rate": 7.32816834364032e-06, - "loss": 0.2424, - "step": 1149800 - }, - { - "epoch": 11.72, - "learning_rate": 7.323824647454276e-06, - "loss": 0.2689, - "step": 1149900 - }, - { - "epoch": 11.72, - "learning_rate": 7.319482059997893e-06, - "loss": 0.2854, - "step": 1150000 - }, - { - "epoch": 11.72, - "learning_rate": 7.315140581483475e-06, - "loss": 0.2393, - "step": 1150100 - }, - { - "epoch": 11.72, - "learning_rate": 7.310800212123308e-06, - "loss": 0.2883, - "step": 1150200 - }, - { - "epoch": 11.72, - "learning_rate": 7.306460952129608e-06, - "loss": 0.2099, - "step": 1150300 - }, - { - "epoch": 11.72, - "learning_rate": 7.302166177725585e-06, - "loss": 0.1915, - "step": 1150400 - }, - { - "epoch": 11.72, - "learning_rate": 7.297829126002311e-06, - "loss": 0.2302, - "step": 1150500 - }, - { - "epoch": 11.72, - "learning_rate": 7.293493184279702e-06, - "loss": 0.2065, - "step": 1150600 - }, - { - "epoch": 11.72, - "learning_rate": 7.2891583527697735e-06, - "loss": 0.2425, - "step": 1150700 - }, - { - "epoch": 11.72, - "learning_rate": 7.284824631684468e-06, - "loss": 0.225, - "step": 1150800 - }, - { - "epoch": 11.73, - "learning_rate": 7.280492021235682e-06, - "loss": 0.213, - "step": 1150900 - }, - { - "epoch": 11.73, - "learning_rate": 7.276160521635252e-06, - "loss": 0.2395, - "step": 1151000 - }, - { - "epoch": 11.73, - "learning_rate": 7.271830133094966e-06, - "loss": 0.2361, - "step": 1151100 - }, - { - "epoch": 11.73, - "learning_rate": 7.267500855826559e-06, - "loss": 0.2492, - "step": 1151200 - }, - { - "epoch": 11.73, - "learning_rate": 7.263172690041693e-06, - "loss": 0.2378, - "step": 1151300 - }, - { - "epoch": 11.73, - "learning_rate": 7.2588456359519925e-06, - "loss": 0.2649, - "step": 1151400 - }, - { - "epoch": 11.73, - "learning_rate": 7.25451969376904e-06, - "loss": 0.184, - "step": 1151500 - }, - { - "epoch": 11.73, - "learning_rate": 7.250194863704327e-06, - "loss": 0.2562, - "step": 1151600 - }, - { - "epoch": 11.73, - "learning_rate": 7.245871145969319e-06, - "loss": 0.2202, - "step": 1151700 - }, - { - "epoch": 11.73, - "learning_rate": 7.2415485407754345e-06, - "loss": 0.2377, - "step": 1151800 - }, - { - "epoch": 11.74, - "learning_rate": 7.237227048334003e-06, - "loss": 0.2551, - "step": 1151900 - }, - { - "epoch": 11.74, - "learning_rate": 7.2329066688563265e-06, - "loss": 0.2539, - "step": 1152000 - }, - { - "epoch": 11.74, - "learning_rate": 7.228587402553655e-06, - "loss": 0.2203, - "step": 1152100 - }, - { - "epoch": 11.74, - "learning_rate": 7.22426924963715e-06, - "loss": 0.2048, - "step": 1152200 - }, - { - "epoch": 11.74, - "learning_rate": 7.219952210317966e-06, - "loss": 0.2594, - "step": 1152300 - }, - { - "epoch": 11.74, - "learning_rate": 7.2156362848071876e-06, - "loss": 0.2247, - "step": 1152400 - }, - { - "epoch": 11.74, - "learning_rate": 7.211321473315813e-06, - "loss": 0.246, - "step": 1152500 - }, - { - "epoch": 11.74, - "learning_rate": 7.207007776054826e-06, - "loss": 0.186, - "step": 1152600 - }, - { - "epoch": 11.74, - "learning_rate": 7.202695193235135e-06, - "loss": 0.2458, - "step": 1152700 - }, - { - "epoch": 11.74, - "learning_rate": 7.198383725067611e-06, - "loss": 0.2704, - "step": 1152800 - }, - { - "epoch": 11.75, - "learning_rate": 7.194073371763039e-06, - "loss": 0.2351, - "step": 1152900 - }, - { - "epoch": 11.75, - "learning_rate": 7.189764133532184e-06, - "loss": 0.2608, - "step": 1153000 - }, - { - "epoch": 11.75, - "learning_rate": 7.185456010585747e-06, - "loss": 0.1851, - "step": 1153100 - }, - { - "epoch": 11.75, - "learning_rate": 7.181149003134349e-06, - "loss": 0.2579, - "step": 1153200 - }, - { - "epoch": 11.75, - "learning_rate": 7.176843111388593e-06, - "loss": 0.2511, - "step": 1153300 - }, - { - "epoch": 11.75, - "learning_rate": 7.172538335559013e-06, - "loss": 0.2934, - "step": 1153400 - }, - { - "epoch": 11.75, - "learning_rate": 7.168234675856065e-06, - "loss": 0.2121, - "step": 1153500 - }, - { - "epoch": 11.75, - "learning_rate": 7.1639321324901985e-06, - "loss": 0.2278, - "step": 1153600 - }, - { - "epoch": 11.75, - "learning_rate": 7.15963070567178e-06, - "loss": 0.2686, - "step": 1153700 - }, - { - "epoch": 11.76, - "learning_rate": 7.155330395611105e-06, - "loss": 0.2226, - "step": 1153800 - }, - { - "epoch": 11.76, - "learning_rate": 7.151031202518448e-06, - "loss": 0.269, - "step": 1153900 - }, - { - "epoch": 11.76, - "learning_rate": 7.146733126604014e-06, - "loss": 0.283, - "step": 1154000 - }, - { - "epoch": 11.76, - "learning_rate": 7.142436168077942e-06, - "loss": 0.2585, - "step": 1154100 - }, - { - "epoch": 11.76, - "learning_rate": 7.138140327150336e-06, - "loss": 0.2466, - "step": 1154200 - }, - { - "epoch": 11.76, - "learning_rate": 7.133888545728589e-06, - "loss": 0.2525, - "step": 1154300 - }, - { - "epoch": 11.76, - "learning_rate": 7.129594929446754e-06, - "loss": 0.2082, - "step": 1154400 - }, - { - "epoch": 11.76, - "learning_rate": 7.1253024313912515e-06, - "loss": 0.2671, - "step": 1154500 - }, - { - "epoch": 11.76, - "learning_rate": 7.1210110517719465e-06, - "loss": 0.231, - "step": 1154600 - }, - { - "epoch": 11.76, - "learning_rate": 7.116720790798663e-06, - "loss": 0.2435, - "step": 1154700 - }, - { - "epoch": 11.77, - "learning_rate": 7.1124316486811715e-06, - "loss": 0.2008, - "step": 1154800 - }, - { - "epoch": 11.77, - "learning_rate": 7.1081436256291896e-06, - "loss": 0.2499, - "step": 1154900 - }, - { - "epoch": 11.77, - "learning_rate": 7.103856721852371e-06, - "loss": 0.2598, - "step": 1155000 - }, - { - "epoch": 11.77, - "learning_rate": 7.099570937560324e-06, - "loss": 0.229, - "step": 1155100 - }, - { - "epoch": 11.77, - "learning_rate": 7.095286272962604e-06, - "loss": 0.2004, - "step": 1155200 - }, - { - "epoch": 11.77, - "learning_rate": 7.091002728268687e-06, - "loss": 0.2631, - "step": 1155300 - }, - { - "epoch": 11.77, - "learning_rate": 7.086720303688021e-06, - "loss": 0.3411, - "step": 1155400 - }, - { - "epoch": 11.77, - "learning_rate": 7.082438999430005e-06, - "loss": 0.3084, - "step": 1155500 - }, - { - "epoch": 11.77, - "learning_rate": 7.078158815703944e-06, - "loss": 0.3051, - "step": 1155600 - }, - { - "epoch": 11.77, - "learning_rate": 7.073879752719131e-06, - "loss": 0.2377, - "step": 1155700 - }, - { - "epoch": 11.78, - "learning_rate": 7.06960181068479e-06, - "loss": 0.2755, - "step": 1155800 - }, - { - "epoch": 11.78, - "learning_rate": 7.065324989810068e-06, - "loss": 0.2336, - "step": 1155900 - }, - { - "epoch": 11.78, - "learning_rate": 7.0610492903040904e-06, - "loss": 0.204, - "step": 1156000 - }, - { - "epoch": 11.78, - "learning_rate": 7.056774712375906e-06, - "loss": 0.3013, - "step": 1156100 - }, - { - "epoch": 11.78, - "learning_rate": 7.052501256234522e-06, - "loss": 0.2581, - "step": 1156200 - }, - { - "epoch": 11.78, - "learning_rate": 7.048228922088885e-06, - "loss": 0.1816, - "step": 1156300 - }, - { - "epoch": 11.78, - "learning_rate": 7.043957710147894e-06, - "loss": 0.2596, - "step": 1156400 - }, - { - "epoch": 11.78, - "learning_rate": 7.0396876206203645e-06, - "loss": 0.2262, - "step": 1156500 - }, - { - "epoch": 11.78, - "learning_rate": 7.03541865371509e-06, - "loss": 0.2667, - "step": 1156600 - }, - { - "epoch": 11.78, - "learning_rate": 7.0311508096408e-06, - "loss": 0.2018, - "step": 1156700 - }, - { - "epoch": 11.79, - "learning_rate": 7.026884088606173e-06, - "loss": 0.2791, - "step": 1156800 - }, - { - "epoch": 11.79, - "learning_rate": 7.022618490819809e-06, - "loss": 0.2138, - "step": 1156900 - }, - { - "epoch": 11.79, - "learning_rate": 7.018354016490276e-06, - "loss": 0.3167, - "step": 1157000 - }, - { - "epoch": 11.79, - "learning_rate": 7.0140906658260975e-06, - "loss": 0.2574, - "step": 1157100 - }, - { - "epoch": 11.79, - "learning_rate": 7.009828439035703e-06, - "loss": 0.248, - "step": 1157200 - }, - { - "epoch": 11.79, - "learning_rate": 7.0055673363274955e-06, - "loss": 0.261, - "step": 1157300 - }, - { - "epoch": 11.79, - "learning_rate": 7.001307357909825e-06, - "loss": 0.2317, - "step": 1157400 - }, - { - "epoch": 11.79, - "learning_rate": 6.9970485039909756e-06, - "loss": 0.2397, - "step": 1157500 - }, - { - "epoch": 11.79, - "learning_rate": 6.99283334650332e-06, - "loss": 0.238, - "step": 1157600 - }, - { - "epoch": 11.79, - "learning_rate": 6.988576730956572e-06, - "loss": 0.2373, - "step": 1157700 - }, - { - "epoch": 11.8, - "learning_rate": 6.9843212405311e-06, - "loss": 0.246, - "step": 1157800 - }, - { - "epoch": 11.8, - "learning_rate": 6.980066875434974e-06, - "loss": 0.2024, - "step": 1157900 - }, - { - "epoch": 11.8, - "learning_rate": 6.975813635876194e-06, - "loss": 0.3501, - "step": 1158000 - }, - { - "epoch": 11.8, - "learning_rate": 6.971561522062726e-06, - "loss": 0.2775, - "step": 1158100 - }, - { - "epoch": 11.8, - "learning_rate": 6.9673105342024846e-06, - "loss": 0.2071, - "step": 1158200 - }, - { - "epoch": 11.8, - "learning_rate": 6.963060672503294e-06, - "loss": 0.2182, - "step": 1158300 - }, - { - "epoch": 11.8, - "learning_rate": 6.958811937172962e-06, - "loss": 0.2439, - "step": 1158400 - }, - { - "epoch": 11.8, - "learning_rate": 6.9545643284192284e-06, - "loss": 0.2539, - "step": 1158500 - }, - { - "epoch": 11.8, - "learning_rate": 6.950317846449764e-06, - "loss": 0.2314, - "step": 1158600 - }, - { - "epoch": 11.81, - "learning_rate": 6.946072491472206e-06, - "loss": 0.2474, - "step": 1158700 - }, - { - "epoch": 11.81, - "learning_rate": 6.941828263694122e-06, - "loss": 0.2692, - "step": 1158800 - }, - { - "epoch": 11.81, - "learning_rate": 6.9375851633230354e-06, - "loss": 0.2201, - "step": 1158900 - }, - { - "epoch": 11.81, - "learning_rate": 6.933343190566401e-06, - "loss": 0.2004, - "step": 1159000 - }, - { - "epoch": 11.81, - "learning_rate": 6.929102345631633e-06, - "loss": 0.2893, - "step": 1159100 - }, - { - "epoch": 11.81, - "learning_rate": 6.924862628726088e-06, - "loss": 0.3181, - "step": 1159200 - }, - { - "epoch": 11.81, - "learning_rate": 6.920624040057046e-06, - "loss": 0.238, - "step": 1159300 - }, - { - "epoch": 11.81, - "learning_rate": 6.916386579831757e-06, - "loss": 0.2508, - "step": 1159400 - }, - { - "epoch": 11.81, - "learning_rate": 6.912150248257417e-06, - "loss": 0.212, - "step": 1159500 - }, - { - "epoch": 11.81, - "learning_rate": 6.907915045541145e-06, - "loss": 0.2471, - "step": 1159600 - }, - { - "epoch": 11.82, - "learning_rate": 6.903680971890016e-06, - "loss": 0.2751, - "step": 1159700 - }, - { - "epoch": 11.82, - "learning_rate": 6.8994480275110635e-06, - "loss": 0.2897, - "step": 1159800 - }, - { - "epoch": 11.82, - "learning_rate": 6.8952162126112414e-06, - "loss": 0.3096, - "step": 1159900 - }, - { - "epoch": 11.82, - "learning_rate": 6.890985527397461e-06, - "loss": 0.2527, - "step": 1160000 - }, - { - "epoch": 11.82, - "learning_rate": 6.886755972076583e-06, - "loss": 0.2886, - "step": 1160100 - }, - { - "epoch": 11.82, - "learning_rate": 6.882527546855405e-06, - "loss": 0.3058, - "step": 1160200 - }, - { - "epoch": 11.82, - "learning_rate": 6.8783002519406725e-06, - "loss": 0.2037, - "step": 1160300 - }, - { - "epoch": 11.82, - "learning_rate": 6.874074087539079e-06, - "loss": 0.2693, - "step": 1160400 - }, - { - "epoch": 11.82, - "learning_rate": 6.869849053857251e-06, - "loss": 0.2396, - "step": 1160500 - }, - { - "epoch": 11.82, - "learning_rate": 6.865625151101769e-06, - "loss": 0.1944, - "step": 1160600 - }, - { - "epoch": 11.83, - "learning_rate": 6.861402379479167e-06, - "loss": 0.2037, - "step": 1160700 - }, - { - "epoch": 11.83, - "learning_rate": 6.857180739195894e-06, - "loss": 0.2798, - "step": 1160800 - }, - { - "epoch": 11.83, - "learning_rate": 6.852960230458375e-06, - "loss": 0.2003, - "step": 1160900 - }, - { - "epoch": 11.83, - "learning_rate": 6.848740853472968e-06, - "loss": 0.2273, - "step": 1161000 - }, - { - "epoch": 11.83, - "learning_rate": 6.844522608445981e-06, - "loss": 0.2564, - "step": 1161100 - }, - { - "epoch": 11.83, - "learning_rate": 6.840305495583646e-06, - "loss": 0.2314, - "step": 1161200 - }, - { - "epoch": 11.83, - "learning_rate": 6.836089515092162e-06, - "loss": 0.2136, - "step": 1161300 - }, - { - "epoch": 11.83, - "learning_rate": 6.831874667177666e-06, - "loss": 0.2371, - "step": 1161400 - }, - { - "epoch": 11.83, - "learning_rate": 6.827660952046238e-06, - "loss": 0.2219, - "step": 1161500 - }, - { - "epoch": 11.83, - "learning_rate": 6.823448369903903e-06, - "loss": 0.2725, - "step": 1161600 - }, - { - "epoch": 11.84, - "learning_rate": 6.8192369209566415e-06, - "loss": 0.2493, - "step": 1161700 - }, - { - "epoch": 11.84, - "learning_rate": 6.8150687029548005e-06, - "loss": 0.1926, - "step": 1161800 - }, - { - "epoch": 11.84, - "learning_rate": 6.81085950967827e-06, - "loss": 0.2312, - "step": 1161900 - }, - { - "epoch": 11.84, - "learning_rate": 6.806651450212314e-06, - "loss": 0.2478, - "step": 1162000 - }, - { - "epoch": 11.84, - "learning_rate": 6.8024445247626935e-06, - "loss": 0.2449, - "step": 1162100 - }, - { - "epoch": 11.84, - "learning_rate": 6.798238733535108e-06, - "loss": 0.27, - "step": 1162200 - }, - { - "epoch": 11.84, - "learning_rate": 6.794034076735178e-06, - "loss": 0.2564, - "step": 1162300 - }, - { - "epoch": 11.84, - "learning_rate": 6.789830554568496e-06, - "loss": 0.2615, - "step": 1162400 - }, - { - "epoch": 11.84, - "learning_rate": 6.78562816724059e-06, - "loss": 0.2622, - "step": 1162500 - }, - { - "epoch": 11.84, - "learning_rate": 6.781426914956932e-06, - "loss": 0.2738, - "step": 1162600 - }, - { - "epoch": 11.85, - "learning_rate": 6.777226797922932e-06, - "loss": 0.2076, - "step": 1162700 - }, - { - "epoch": 11.85, - "learning_rate": 6.773027816343965e-06, - "loss": 0.2665, - "step": 1162800 - }, - { - "epoch": 11.85, - "learning_rate": 6.768829970425321e-06, - "loss": 0.2238, - "step": 1162900 - }, - { - "epoch": 11.85, - "learning_rate": 6.764633260372252e-06, - "loss": 0.2109, - "step": 1163000 - }, - { - "epoch": 11.85, - "learning_rate": 6.7604376863899666e-06, - "loss": 0.2511, - "step": 1163100 - }, - { - "epoch": 11.85, - "learning_rate": 6.756243248683581e-06, - "loss": 0.2269, - "step": 1163200 - }, - { - "epoch": 11.85, - "learning_rate": 6.752049947458189e-06, - "loss": 0.2819, - "step": 1163300 - }, - { - "epoch": 11.85, - "learning_rate": 6.747857782918818e-06, - "loss": 0.2323, - "step": 1163400 - }, - { - "epoch": 11.85, - "learning_rate": 6.743666755270453e-06, - "loss": 0.2073, - "step": 1163500 - }, - { - "epoch": 11.85, - "learning_rate": 6.739476864717984e-06, - "loss": 0.2277, - "step": 1163600 - }, - { - "epoch": 11.86, - "learning_rate": 6.735288111466286e-06, - "loss": 0.2034, - "step": 1163700 - }, - { - "epoch": 11.86, - "learning_rate": 6.7311004957201606e-06, - "loss": 0.2553, - "step": 1163800 - }, - { - "epoch": 11.86, - "learning_rate": 6.726914017684361e-06, - "loss": 0.2135, - "step": 1163900 - }, - { - "epoch": 11.86, - "learning_rate": 6.722728677563581e-06, - "loss": 0.2243, - "step": 1164000 - }, - { - "epoch": 11.86, - "learning_rate": 6.718544475562463e-06, - "loss": 0.2562, - "step": 1164100 - }, - { - "epoch": 11.86, - "learning_rate": 6.714361411885576e-06, - "loss": 0.2388, - "step": 1164200 - }, - { - "epoch": 11.86, - "learning_rate": 6.710221300352551e-06, - "loss": 0.2806, - "step": 1164300 - }, - { - "epoch": 11.86, - "learning_rate": 6.7060405025493225e-06, - "loss": 0.2225, - "step": 1164400 - }, - { - "epoch": 11.86, - "learning_rate": 6.701860843681703e-06, - "loss": 0.223, - "step": 1164500 - }, - { - "epoch": 11.87, - "learning_rate": 6.6976823239540596e-06, - "loss": 0.2382, - "step": 1164600 - }, - { - "epoch": 11.87, - "learning_rate": 6.693504943570682e-06, - "loss": 0.2327, - "step": 1164700 - }, - { - "epoch": 11.87, - "learning_rate": 6.689328702735827e-06, - "loss": 0.2279, - "step": 1164800 - }, - { - "epoch": 11.87, - "learning_rate": 6.685153601653693e-06, - "loss": 0.2106, - "step": 1164900 - }, - { - "epoch": 11.87, - "learning_rate": 6.680979640528409e-06, - "loss": 0.2345, - "step": 1165000 - }, - { - "epoch": 11.87, - "learning_rate": 6.676806819564057e-06, - "loss": 0.2598, - "step": 1165100 - }, - { - "epoch": 11.87, - "learning_rate": 6.6726351389646634e-06, - "loss": 0.1701, - "step": 1165200 - }, - { - "epoch": 11.87, - "learning_rate": 6.668464598934203e-06, - "loss": 0.2381, - "step": 1165300 - }, - { - "epoch": 11.87, - "learning_rate": 6.664295199676589e-06, - "loss": 0.2512, - "step": 1165400 - }, - { - "epoch": 11.87, - "learning_rate": 6.660126941395687e-06, - "loss": 0.2568, - "step": 1165500 - }, - { - "epoch": 11.88, - "learning_rate": 6.655959824295287e-06, - "loss": 0.2545, - "step": 1165600 - }, - { - "epoch": 11.88, - "learning_rate": 6.651793848579139e-06, - "loss": 0.2831, - "step": 1165700 - }, - { - "epoch": 11.88, - "learning_rate": 6.64762901445094e-06, - "loss": 0.2313, - "step": 1165800 - }, - { - "epoch": 11.88, - "learning_rate": 6.6434653221143285e-06, - "loss": 0.2157, - "step": 1165900 - }, - { - "epoch": 11.88, - "learning_rate": 6.6393027717728685e-06, - "loss": 0.2528, - "step": 1166000 - }, - { - "epoch": 11.88, - "learning_rate": 6.635141363630094e-06, - "loss": 0.1951, - "step": 1166100 - }, - { - "epoch": 11.88, - "learning_rate": 6.630981097889482e-06, - "loss": 0.2493, - "step": 1166200 - }, - { - "epoch": 11.88, - "learning_rate": 6.626821974754428e-06, - "loss": 0.3139, - "step": 1166300 - }, - { - "epoch": 11.88, - "learning_rate": 6.62270556857398e-06, - "loss": 0.2844, - "step": 1166400 - }, - { - "epoch": 11.88, - "learning_rate": 6.618548719828942e-06, - "loss": 0.2324, - "step": 1166500 - }, - { - "epoch": 11.89, - "learning_rate": 6.614393014297334e-06, - "loss": 0.2753, - "step": 1166600 - }, - { - "epoch": 11.89, - "learning_rate": 6.610238452182361e-06, - "loss": 0.2517, - "step": 1166700 - }, - { - "epoch": 11.89, - "learning_rate": 6.606085033687137e-06, - "loss": 0.2265, - "step": 1166800 - }, - { - "epoch": 11.89, - "learning_rate": 6.601932759014746e-06, - "loss": 0.2535, - "step": 1166900 - }, - { - "epoch": 11.89, - "learning_rate": 6.597781628368219e-06, - "loss": 0.1908, - "step": 1167000 - }, - { - "epoch": 11.89, - "learning_rate": 6.593631641950507e-06, - "loss": 0.2158, - "step": 1167100 - }, - { - "epoch": 11.89, - "learning_rate": 6.589482799964525e-06, - "loss": 0.2664, - "step": 1167200 - }, - { - "epoch": 11.89, - "learning_rate": 6.585335102613132e-06, - "loss": 0.2553, - "step": 1167300 - }, - { - "epoch": 11.89, - "learning_rate": 6.581230009956642e-06, - "loss": 0.2295, - "step": 1167400 - }, - { - "epoch": 11.89, - "learning_rate": 6.577084591031352e-06, - "loss": 0.2518, - "step": 1167500 - }, - { - "epoch": 11.9, - "learning_rate": 6.5729403173468395e-06, - "loss": 0.2317, - "step": 1167600 - }, - { - "epoch": 11.9, - "learning_rate": 6.568797189105737e-06, - "loss": 0.2719, - "step": 1167700 - }, - { - "epoch": 11.9, - "learning_rate": 6.564655206510623e-06, - "loss": 0.2418, - "step": 1167800 - }, - { - "epoch": 11.9, - "learning_rate": 6.560514369764016e-06, - "loss": 0.2612, - "step": 1167900 - }, - { - "epoch": 11.9, - "learning_rate": 6.556374679068376e-06, - "loss": 0.2801, - "step": 1168000 - }, - { - "epoch": 11.9, - "learning_rate": 6.552236134626117e-06, - "loss": 0.1918, - "step": 1168100 - }, - { - "epoch": 11.9, - "learning_rate": 6.548098736639573e-06, - "loss": 0.1976, - "step": 1168200 - }, - { - "epoch": 11.9, - "learning_rate": 6.5439624853110505e-06, - "loss": 0.2548, - "step": 1168300 - }, - { - "epoch": 11.9, - "learning_rate": 6.539827380842788e-06, - "loss": 0.2499, - "step": 1168400 - }, - { - "epoch": 11.9, - "learning_rate": 6.53569342343696e-06, - "loss": 0.3015, - "step": 1168500 - }, - { - "epoch": 11.91, - "learning_rate": 6.531560613295693e-06, - "loss": 0.2354, - "step": 1168600 - }, - { - "epoch": 11.91, - "learning_rate": 6.52742895062106e-06, - "loss": 0.2404, - "step": 1168700 - }, - { - "epoch": 11.91, - "learning_rate": 6.523298435615082e-06, - "loss": 0.2466, - "step": 1168800 - }, - { - "epoch": 11.91, - "learning_rate": 6.519169068479701e-06, - "loss": 0.2117, - "step": 1168900 - }, - { - "epoch": 11.91, - "learning_rate": 6.515040849416828e-06, - "loss": 0.2554, - "step": 1169000 - }, - { - "epoch": 11.91, - "learning_rate": 6.510913778628307e-06, - "loss": 0.2689, - "step": 1169100 - }, - { - "epoch": 11.91, - "learning_rate": 6.506787856315923e-06, - "loss": 0.2252, - "step": 1169200 - }, - { - "epoch": 11.91, - "learning_rate": 6.502663082681416e-06, - "loss": 0.2192, - "step": 1169300 - }, - { - "epoch": 11.91, - "learning_rate": 6.498539457926467e-06, - "loss": 0.2402, - "step": 1169400 - }, - { - "epoch": 11.92, - "learning_rate": 6.494416982252681e-06, - "loss": 0.2447, - "step": 1169500 - }, - { - "epoch": 11.92, - "learning_rate": 6.490295655861628e-06, - "loss": 0.2507, - "step": 1169600 - }, - { - "epoch": 11.92, - "learning_rate": 6.486175478954829e-06, - "loss": 0.2389, - "step": 1169700 - }, - { - "epoch": 11.92, - "learning_rate": 6.48205645173372e-06, - "loss": 0.3392, - "step": 1169800 - }, - { - "epoch": 11.92, - "learning_rate": 6.477979747480441e-06, - "loss": 0.2747, - "step": 1169900 - }, - { - "epoch": 11.92, - "learning_rate": 6.473863008732973e-06, - "loss": 0.2774, - "step": 1170000 - }, - { - "epoch": 11.92, - "learning_rate": 6.469747420273207e-06, - "loss": 0.2352, - "step": 1170100 - }, - { - "epoch": 11.92, - "learning_rate": 6.465632982302377e-06, - "loss": 0.2389, - "step": 1170200 - }, - { - "epoch": 11.92, - "learning_rate": 6.461519695021635e-06, - "loss": 0.2098, - "step": 1170300 - }, - { - "epoch": 11.92, - "learning_rate": 6.457407558632114e-06, - "loss": 0.2576, - "step": 1170400 - }, - { - "epoch": 11.93, - "learning_rate": 6.4533376774892825e-06, - "loss": 0.2439, - "step": 1170500 - }, - { - "epoch": 11.93, - "learning_rate": 6.449227831971392e-06, - "loss": 0.2691, - "step": 1170600 - }, - { - "epoch": 11.93, - "learning_rate": 6.445119137945722e-06, - "loss": 0.2116, - "step": 1170700 - }, - { - "epoch": 11.93, - "learning_rate": 6.44101159561315e-06, - "loss": 0.2175, - "step": 1170800 - }, - { - "epoch": 11.93, - "learning_rate": 6.436905205174518e-06, - "loss": 0.224, - "step": 1170900 - }, - { - "epoch": 11.93, - "learning_rate": 6.4327999668306155e-06, - "loss": 0.2425, - "step": 1171000 - }, - { - "epoch": 11.93, - "learning_rate": 6.428695880782143e-06, - "loss": 0.2448, - "step": 1171100 - }, - { - "epoch": 11.93, - "learning_rate": 6.424592947229775e-06, - "loss": 0.2237, - "step": 1171200 - }, - { - "epoch": 11.93, - "learning_rate": 6.420491166374129e-06, - "loss": 0.2428, - "step": 1171300 - }, - { - "epoch": 11.93, - "learning_rate": 6.416390538415743e-06, - "loss": 0.2052, - "step": 1171400 - }, - { - "epoch": 11.94, - "learning_rate": 6.4122910635551216e-06, - "loss": 0.2223, - "step": 1171500 - }, - { - "epoch": 11.94, - "learning_rate": 6.4081927419927e-06, - "loss": 0.241, - "step": 1171600 - }, - { - "epoch": 11.94, - "learning_rate": 6.404095573928865e-06, - "loss": 0.2739, - "step": 1171700 - }, - { - "epoch": 11.94, - "learning_rate": 6.399999559563944e-06, - "loss": 0.2236, - "step": 1171800 - }, - { - "epoch": 11.94, - "learning_rate": 6.395904699098209e-06, - "loss": 0.257, - "step": 1171900 - }, - { - "epoch": 11.94, - "learning_rate": 6.391810992731879e-06, - "loss": 0.2363, - "step": 1172000 - }, - { - "epoch": 11.94, - "learning_rate": 6.387718440665096e-06, - "loss": 0.2196, - "step": 1172100 - }, - { - "epoch": 11.94, - "learning_rate": 6.383627043097971e-06, - "loss": 0.1908, - "step": 1172200 - }, - { - "epoch": 11.94, - "learning_rate": 6.379536800230557e-06, - "loss": 0.1991, - "step": 1172300 - }, - { - "epoch": 11.94, - "learning_rate": 6.375447712262828e-06, - "loss": 0.286, - "step": 1172400 - }, - { - "epoch": 11.95, - "learning_rate": 6.371359779394718e-06, - "loss": 0.1966, - "step": 1172500 - }, - { - "epoch": 11.95, - "learning_rate": 6.367273001826117e-06, - "loss": 0.2798, - "step": 1172600 - }, - { - "epoch": 11.95, - "learning_rate": 6.363187379756823e-06, - "loss": 0.2563, - "step": 1172700 - }, - { - "epoch": 11.95, - "learning_rate": 6.359102913386612e-06, - "loss": 0.2248, - "step": 1172800 - }, - { - "epoch": 11.95, - "learning_rate": 6.355019602915184e-06, - "loss": 0.252, - "step": 1172900 - }, - { - "epoch": 11.95, - "learning_rate": 6.350937448542193e-06, - "loss": 0.2364, - "step": 1173000 - }, - { - "epoch": 11.95, - "learning_rate": 6.346856450467233e-06, - "loss": 0.1841, - "step": 1173100 - }, - { - "epoch": 11.95, - "learning_rate": 6.342776608889844e-06, - "loss": 0.1939, - "step": 1173200 - }, - { - "epoch": 11.95, - "learning_rate": 6.33869792400949e-06, - "loss": 0.3072, - "step": 1173300 - }, - { - "epoch": 11.95, - "learning_rate": 6.334620396025608e-06, - "loss": 0.2535, - "step": 1173400 - }, - { - "epoch": 11.96, - "learning_rate": 6.330544025137558e-06, - "loss": 0.243, - "step": 1173500 - }, - { - "epoch": 11.96, - "learning_rate": 6.326468811544661e-06, - "loss": 0.2739, - "step": 1173600 - }, - { - "epoch": 11.96, - "learning_rate": 6.322394755446158e-06, - "loss": 0.2666, - "step": 1173700 - }, - { - "epoch": 11.96, - "learning_rate": 6.318321857041249e-06, - "loss": 0.2212, - "step": 1173800 - }, - { - "epoch": 11.96, - "learning_rate": 6.314250116529085e-06, - "loss": 0.2147, - "step": 1173900 - }, - { - "epoch": 11.96, - "learning_rate": 6.310179534108731e-06, - "loss": 0.2153, - "step": 1174000 - }, - { - "epoch": 11.96, - "learning_rate": 6.306110109979227e-06, - "loss": 0.3071, - "step": 1174100 - }, - { - "epoch": 11.96, - "learning_rate": 6.302041844339539e-06, - "loss": 0.2193, - "step": 1174200 - }, - { - "epoch": 11.96, - "learning_rate": 6.297974737388583e-06, - "loss": 0.2403, - "step": 1174300 - }, - { - "epoch": 11.96, - "learning_rate": 6.293908789325217e-06, - "loss": 0.3001, - "step": 1174400 - }, - { - "epoch": 11.97, - "learning_rate": 6.289844000348247e-06, - "loss": 0.2297, - "step": 1174500 - }, - { - "epoch": 11.97, - "learning_rate": 6.285780370656405e-06, - "loss": 0.2326, - "step": 1174600 - }, - { - "epoch": 11.97, - "learning_rate": 6.2817179004483825e-06, - "loss": 0.2468, - "step": 1174700 - }, - { - "epoch": 11.97, - "learning_rate": 6.27765658992282e-06, - "loss": 0.2235, - "step": 1174800 - }, - { - "epoch": 11.97, - "learning_rate": 6.273596439278276e-06, - "loss": 0.286, - "step": 1174900 - }, - { - "epoch": 11.97, - "learning_rate": 6.26957803287588e-06, - "loss": 0.2005, - "step": 1175000 - }, - { - "epoch": 11.97, - "learning_rate": 6.265520190985116e-06, - "loss": 0.2845, - "step": 1175100 - }, - { - "epoch": 11.97, - "learning_rate": 6.2614635095687765e-06, - "loss": 0.2673, - "step": 1175200 - }, - { - "epoch": 11.97, - "learning_rate": 6.257407988825214e-06, - "loss": 0.2481, - "step": 1175300 - }, - { - "epoch": 11.98, - "learning_rate": 6.253353628952705e-06, - "loss": 0.2085, - "step": 1175400 - }, - { - "epoch": 11.98, - "learning_rate": 6.249300430149489e-06, - "loss": 0.2044, - "step": 1175500 - }, - { - "epoch": 11.98, - "learning_rate": 6.245248392613749e-06, - "loss": 0.2566, - "step": 1175600 - }, - { - "epoch": 11.98, - "learning_rate": 6.2411975165436e-06, - "loss": 0.3407, - "step": 1175700 - }, - { - "epoch": 11.98, - "learning_rate": 6.237147802137105e-06, - "loss": 0.2743, - "step": 1175800 - }, - { - "epoch": 11.98, - "learning_rate": 6.2330992495922774e-06, - "loss": 0.2759, - "step": 1175900 - }, - { - "epoch": 11.98, - "learning_rate": 6.2290518591070685e-06, - "loss": 0.2256, - "step": 1176000 - }, - { - "epoch": 11.98, - "learning_rate": 6.2250056308793566e-06, - "loss": 0.23, - "step": 1176100 - }, - { - "epoch": 11.98, - "learning_rate": 6.22096056510699e-06, - "loss": 0.2308, - "step": 1176200 - }, - { - "epoch": 11.98, - "learning_rate": 6.216916661987754e-06, - "loss": 0.269, - "step": 1176300 - }, - { - "epoch": 11.99, - "learning_rate": 6.212873921719353e-06, - "loss": 0.2227, - "step": 1176400 - }, - { - "epoch": 11.99, - "learning_rate": 6.208832344499467e-06, - "loss": 0.2639, - "step": 1176500 - }, - { - "epoch": 11.99, - "learning_rate": 6.204791930525706e-06, - "loss": 0.2603, - "step": 1176600 - }, - { - "epoch": 11.99, - "learning_rate": 6.20075267999561e-06, - "loss": 0.2627, - "step": 1176700 - }, - { - "epoch": 11.99, - "learning_rate": 6.1967145931066815e-06, - "loss": 0.2436, - "step": 1176800 - }, - { - "epoch": 11.99, - "learning_rate": 6.192677670056358e-06, - "loss": 0.2036, - "step": 1176900 - }, - { - "epoch": 11.99, - "learning_rate": 6.188641911042023e-06, - "loss": 0.219, - "step": 1177000 - }, - { - "epoch": 11.99, - "learning_rate": 6.1846073162609996e-06, - "loss": 0.2089, - "step": 1177100 - }, - { - "epoch": 11.99, - "learning_rate": 6.180573885910562e-06, - "loss": 0.313, - "step": 1177200 - }, - { - "epoch": 11.99, - "learning_rate": 6.176541620187909e-06, - "loss": 0.2801, - "step": 1177300 - }, - { - "epoch": 12.0, - "learning_rate": 6.172510519290199e-06, - "loss": 0.2762, - "step": 1177400 - }, - { - "epoch": 12.0, - "learning_rate": 6.168480583414528e-06, - "loss": 0.2524, - "step": 1177500 - }, - { - "epoch": 12.0, - "learning_rate": 6.164451812757947e-06, - "loss": 0.2386, - "step": 1177600 - }, - { - "epoch": 12.0, - "learning_rate": 6.160424207517421e-06, - "loss": 0.1891, - "step": 1177700 - }, - { - "epoch": 12.0, - "learning_rate": 6.1563977678898844e-06, - "loss": 0.263, - "step": 1177800 - }, - { - "epoch": 12.0, - "learning_rate": 6.152372494072212e-06, - "loss": 0.1968, - "step": 1177900 - }, - { - "epoch": 12.0, - "learning_rate": 6.148348386261202e-06, - "loss": 0.2215, - "step": 1178000 - }, - { - "epoch": 12.0, - "learning_rate": 6.144325444653619e-06, - "loss": 0.2364, - "step": 1178100 - }, - { - "epoch": 12.0, - "learning_rate": 6.1403438814239e-06, - "loss": 0.155, - "step": 1178200 - }, - { - "epoch": 12.0, - "learning_rate": 6.136323261146262e-06, - "loss": 0.2097, - "step": 1178300 - }, - { - "epoch": 12.01, - "learning_rate": 6.13230380766001e-06, - "loss": 0.2099, - "step": 1178400 - }, - { - "epoch": 12.01, - "learning_rate": 6.128285521161662e-06, - "loss": 0.2241, - "step": 1178500 - }, - { - "epoch": 12.01, - "learning_rate": 6.124268401847693e-06, - "loss": 0.2039, - "step": 1178600 - }, - { - "epoch": 12.01, - "learning_rate": 6.1202524499145214e-06, - "loss": 0.2385, - "step": 1178700 - }, - { - "epoch": 12.01, - "learning_rate": 6.116237665558493e-06, - "loss": 0.2293, - "step": 1178800 - }, - { - "epoch": 12.01, - "learning_rate": 6.112224048975909e-06, - "loss": 0.1754, - "step": 1178900 - }, - { - "epoch": 12.01, - "learning_rate": 6.108211600363025e-06, - "loss": 0.1833, - "step": 1179000 - }, - { - "epoch": 12.01, - "learning_rate": 6.104200319916006e-06, - "loss": 0.263, - "step": 1179100 - }, - { - "epoch": 12.01, - "learning_rate": 6.10019020783099e-06, - "loss": 0.284, - "step": 1179200 - }, - { - "epoch": 12.01, - "learning_rate": 6.096181264304045e-06, - "loss": 0.2011, - "step": 1179300 - }, - { - "epoch": 12.02, - "learning_rate": 6.092173489531183e-06, - "loss": 0.1946, - "step": 1179400 - }, - { - "epoch": 12.02, - "learning_rate": 6.088166883708368e-06, - "loss": 0.2353, - "step": 1179500 - }, - { - "epoch": 12.02, - "learning_rate": 6.084161447031497e-06, - "loss": 0.2333, - "step": 1179600 - }, - { - "epoch": 12.02, - "learning_rate": 6.080157179696405e-06, - "loss": 0.2367, - "step": 1179700 - }, - { - "epoch": 12.02, - "learning_rate": 6.07615408189888e-06, - "loss": 0.2724, - "step": 1179800 - }, - { - "epoch": 12.02, - "learning_rate": 6.072152153834649e-06, - "loss": 0.2384, - "step": 1179900 - }, - { - "epoch": 12.02, - "learning_rate": 6.0681513956993915e-06, - "loss": 0.1453, - "step": 1180000 - }, - { - "epoch": 12.02, - "learning_rate": 6.064151807688705e-06, - "loss": 0.2373, - "step": 1180100 - }, - { - "epoch": 12.02, - "learning_rate": 6.060153389998156e-06, - "loss": 0.2313, - "step": 1180200 - }, - { - "epoch": 12.03, - "learning_rate": 6.056156142823247e-06, - "loss": 0.1842, - "step": 1180300 - }, - { - "epoch": 12.03, - "learning_rate": 6.0522000213283864e-06, - "loss": 0.2297, - "step": 1180400 - }, - { - "epoch": 12.03, - "learning_rate": 6.048205104060972e-06, - "loss": 0.2362, - "step": 1180500 - }, - { - "epoch": 12.03, - "learning_rate": 6.044211357893391e-06, - "loss": 0.2167, - "step": 1180600 - }, - { - "epoch": 12.03, - "learning_rate": 6.040218783020914e-06, - "loss": 0.2713, - "step": 1180700 - }, - { - "epoch": 12.03, - "learning_rate": 6.036227379638755e-06, - "loss": 0.2305, - "step": 1180800 - }, - { - "epoch": 12.03, - "learning_rate": 6.032237147942069e-06, - "loss": 0.2272, - "step": 1180900 - }, - { - "epoch": 12.03, - "learning_rate": 6.028248088125951e-06, - "loss": 0.2375, - "step": 1181000 - }, - { - "epoch": 12.03, - "learning_rate": 6.024260200385457e-06, - "loss": 0.1942, - "step": 1181100 - }, - { - "epoch": 12.03, - "learning_rate": 6.020273484915551e-06, - "loss": 0.198, - "step": 1181200 - }, - { - "epoch": 12.04, - "learning_rate": 6.016287941911168e-06, - "loss": 0.2514, - "step": 1181300 - }, - { - "epoch": 12.04, - "learning_rate": 6.012303571567185e-06, - "loss": 0.2215, - "step": 1181400 - }, - { - "epoch": 12.04, - "learning_rate": 6.008320374078401e-06, - "loss": 0.2249, - "step": 1181500 - }, - { - "epoch": 12.04, - "learning_rate": 6.004338349639571e-06, - "loss": 0.2369, - "step": 1181600 - }, - { - "epoch": 12.04, - "learning_rate": 6.000357498445409e-06, - "loss": 0.2364, - "step": 1181700 - }, - { - "epoch": 12.04, - "learning_rate": 5.996377820690533e-06, - "loss": 0.2258, - "step": 1181800 - }, - { - "epoch": 12.04, - "learning_rate": 5.992399316569533e-06, - "loss": 0.2115, - "step": 1181900 - }, - { - "epoch": 12.04, - "learning_rate": 5.988421986276942e-06, - "loss": 0.2235, - "step": 1182000 - }, - { - "epoch": 12.04, - "learning_rate": 5.9844458300072195e-06, - "loss": 0.2195, - "step": 1182100 - }, - { - "epoch": 12.04, - "learning_rate": 5.980470847954777e-06, - "loss": 0.1962, - "step": 1182200 - }, - { - "epoch": 12.05, - "learning_rate": 5.976497040313972e-06, - "loss": 0.2203, - "step": 1182300 - }, - { - "epoch": 12.05, - "learning_rate": 5.972524407279103e-06, - "loss": 0.1895, - "step": 1182400 - }, - { - "epoch": 12.05, - "learning_rate": 5.968592657810847e-06, - "loss": 0.2247, - "step": 1182500 - }, - { - "epoch": 12.05, - "learning_rate": 5.964622362819583e-06, - "loss": 0.2261, - "step": 1182600 - }, - { - "epoch": 12.05, - "learning_rate": 5.9606532430148505e-06, - "loss": 0.1873, - "step": 1182700 - }, - { - "epoch": 12.05, - "learning_rate": 5.9566852985907215e-06, - "loss": 0.2488, - "step": 1182800 - }, - { - "epoch": 12.05, - "learning_rate": 5.95271852974119e-06, - "loss": 0.2466, - "step": 1182900 - }, - { - "epoch": 12.05, - "learning_rate": 5.948752936660218e-06, - "loss": 0.2512, - "step": 1183000 - }, - { - "epoch": 12.05, - "learning_rate": 5.944788519541696e-06, - "loss": 0.2119, - "step": 1183100 - }, - { - "epoch": 12.05, - "learning_rate": 5.940825278579461e-06, - "loss": 0.1975, - "step": 1183200 - }, - { - "epoch": 12.06, - "learning_rate": 5.9368632139672914e-06, - "loss": 0.2035, - "step": 1183300 - }, - { - "epoch": 12.06, - "learning_rate": 5.932902325898912e-06, - "loss": 0.239, - "step": 1183400 - }, - { - "epoch": 12.06, - "learning_rate": 5.9289426145679905e-06, - "loss": 0.2056, - "step": 1183500 - }, - { - "epoch": 12.06, - "learning_rate": 5.924984080168117e-06, - "loss": 0.2116, - "step": 1183600 - }, - { - "epoch": 12.06, - "learning_rate": 5.921026722892853e-06, - "loss": 0.2453, - "step": 1183700 - }, - { - "epoch": 12.06, - "learning_rate": 5.917070542935691e-06, - "loss": 0.2276, - "step": 1183800 - }, - { - "epoch": 12.06, - "learning_rate": 5.913115540490054e-06, - "loss": 0.209, - "step": 1183900 - }, - { - "epoch": 12.06, - "learning_rate": 5.909161715749324e-06, - "loss": 0.2191, - "step": 1184000 - }, - { - "epoch": 12.06, - "learning_rate": 5.905209068906826e-06, - "loss": 0.2538, - "step": 1184100 - }, - { - "epoch": 12.06, - "learning_rate": 5.901257600155809e-06, - "loss": 0.2137, - "step": 1184200 - }, - { - "epoch": 12.07, - "learning_rate": 5.897307309689479e-06, - "loss": 0.1999, - "step": 1184300 - }, - { - "epoch": 12.07, - "learning_rate": 5.893358197700984e-06, - "loss": 0.2364, - "step": 1184400 - }, - { - "epoch": 12.07, - "learning_rate": 5.8894102643834136e-06, - "loss": 0.2057, - "step": 1184500 - }, - { - "epoch": 12.07, - "learning_rate": 5.885463509929799e-06, - "loss": 0.238, - "step": 1184600 - }, - { - "epoch": 12.07, - "learning_rate": 5.881517934533107e-06, - "loss": 0.2521, - "step": 1184700 - }, - { - "epoch": 12.07, - "learning_rate": 5.877573538386266e-06, - "loss": 0.2517, - "step": 1184800 - }, - { - "epoch": 12.07, - "learning_rate": 5.873630321682115e-06, - "loss": 0.2311, - "step": 1184900 - }, - { - "epoch": 12.07, - "learning_rate": 5.869688284613464e-06, - "loss": 0.2099, - "step": 1185000 - }, - { - "epoch": 12.07, - "learning_rate": 5.86574742737306e-06, - "loss": 0.2479, - "step": 1185100 - }, - { - "epoch": 12.08, - "learning_rate": 5.861807750153575e-06, - "loss": 0.2148, - "step": 1185200 - }, - { - "epoch": 12.08, - "learning_rate": 5.857869253147643e-06, - "loss": 0.2039, - "step": 1185300 - }, - { - "epoch": 12.08, - "learning_rate": 5.853931936547837e-06, - "loss": 0.2312, - "step": 1185400 - }, - { - "epoch": 12.08, - "learning_rate": 5.849995800546657e-06, - "loss": 0.2346, - "step": 1185500 - }, - { - "epoch": 12.08, - "learning_rate": 5.846060845336562e-06, - "loss": 0.2456, - "step": 1185600 - }, - { - "epoch": 12.08, - "learning_rate": 5.842127071109947e-06, - "loss": 0.2131, - "step": 1185700 - }, - { - "epoch": 12.08, - "learning_rate": 5.838194478059153e-06, - "loss": 0.2345, - "step": 1185800 - }, - { - "epoch": 12.08, - "learning_rate": 5.834263066376459e-06, - "loss": 0.2535, - "step": 1185900 - }, - { - "epoch": 12.08, - "learning_rate": 5.8303328362540955e-06, - "loss": 0.2182, - "step": 1186000 - }, - { - "epoch": 12.08, - "learning_rate": 5.8264430725176e-06, - "loss": 0.1988, - "step": 1186100 - }, - { - "epoch": 12.09, - "learning_rate": 5.822515194271919e-06, - "loss": 0.1989, - "step": 1186200 - }, - { - "epoch": 12.09, - "learning_rate": 5.818588498160955e-06, - "loss": 0.2226, - "step": 1186300 - }, - { - "epoch": 12.09, - "learning_rate": 5.814662984376708e-06, - "loss": 0.2808, - "step": 1186400 - }, - { - "epoch": 12.09, - "learning_rate": 5.810738653111116e-06, - "loss": 0.2488, - "step": 1186500 - }, - { - "epoch": 12.09, - "learning_rate": 5.8068155045560445e-06, - "loss": 0.1876, - "step": 1186600 - }, - { - "epoch": 12.09, - "learning_rate": 5.802893538903316e-06, - "loss": 0.2658, - "step": 1186700 - }, - { - "epoch": 12.09, - "learning_rate": 5.798972756344699e-06, - "loss": 0.2007, - "step": 1186800 - }, - { - "epoch": 12.09, - "learning_rate": 5.795053157071886e-06, - "loss": 0.2657, - "step": 1186900 - }, - { - "epoch": 12.09, - "learning_rate": 5.791134741276522e-06, - "loss": 0.2765, - "step": 1187000 - }, - { - "epoch": 12.09, - "learning_rate": 5.7872175091502e-06, - "loss": 0.2175, - "step": 1187100 - }, - { - "epoch": 12.1, - "learning_rate": 5.783301460884448e-06, - "loss": 0.2679, - "step": 1187200 - }, - { - "epoch": 12.1, - "learning_rate": 5.779386596670736e-06, - "loss": 0.2014, - "step": 1187300 - }, - { - "epoch": 12.1, - "learning_rate": 5.775472916700477e-06, - "loss": 0.2764, - "step": 1187400 - }, - { - "epoch": 12.1, - "learning_rate": 5.771560421165039e-06, - "loss": 0.2564, - "step": 1187500 - }, - { - "epoch": 12.1, - "learning_rate": 5.767649110255701e-06, - "loss": 0.2239, - "step": 1187600 - }, - { - "epoch": 12.1, - "learning_rate": 5.763738984163706e-06, - "loss": 0.2307, - "step": 1187700 - }, - { - "epoch": 12.1, - "learning_rate": 5.759830043080251e-06, - "loss": 0.1952, - "step": 1187800 - }, - { - "epoch": 12.1, - "learning_rate": 5.755961358887909e-06, - "loss": 0.1936, - "step": 1187900 - }, - { - "epoch": 12.1, - "learning_rate": 5.752054776539973e-06, - "loss": 0.2363, - "step": 1188000 - }, - { - "epoch": 12.1, - "learning_rate": 5.748149379771848e-06, - "loss": 0.2463, - "step": 1188100 - }, - { - "epoch": 12.11, - "learning_rate": 5.744245168774489e-06, - "loss": 0.1701, - "step": 1188200 - }, - { - "epoch": 12.11, - "learning_rate": 5.7403421437387915e-06, - "loss": 0.2309, - "step": 1188300 - }, - { - "epoch": 12.11, - "learning_rate": 5.736440304855587e-06, - "loss": 0.2323, - "step": 1188400 - }, - { - "epoch": 12.11, - "learning_rate": 5.732539652315656e-06, - "loss": 0.1869, - "step": 1188500 - }, - { - "epoch": 12.11, - "learning_rate": 5.728640186309722e-06, - "loss": 0.2177, - "step": 1188600 - }, - { - "epoch": 12.11, - "learning_rate": 5.724741907028435e-06, - "loss": 0.2275, - "step": 1188700 - }, - { - "epoch": 12.11, - "learning_rate": 5.720844814662396e-06, - "loss": 0.2522, - "step": 1188800 - }, - { - "epoch": 12.11, - "learning_rate": 5.7169489094021676e-06, - "loss": 0.2505, - "step": 1188900 - }, - { - "epoch": 12.11, - "learning_rate": 5.713054191438214e-06, - "loss": 0.2645, - "step": 1189000 - }, - { - "epoch": 12.11, - "learning_rate": 5.709160660960973e-06, - "loss": 0.2761, - "step": 1189100 - }, - { - "epoch": 12.12, - "learning_rate": 5.705268318160824e-06, - "loss": 0.2524, - "step": 1189200 - }, - { - "epoch": 12.12, - "learning_rate": 5.701377163228061e-06, - "loss": 0.2037, - "step": 1189300 - }, - { - "epoch": 12.12, - "learning_rate": 5.6974871963529525e-06, - "loss": 0.2255, - "step": 1189400 - }, - { - "epoch": 12.12, - "learning_rate": 5.693598417725688e-06, - "loss": 0.2822, - "step": 1189500 - }, - { - "epoch": 12.12, - "learning_rate": 5.689710827536405e-06, - "loss": 0.2574, - "step": 1189600 - }, - { - "epoch": 12.12, - "learning_rate": 5.685824425975191e-06, - "loss": 0.2085, - "step": 1189700 - }, - { - "epoch": 12.12, - "learning_rate": 5.681939213232061e-06, - "loss": 0.2147, - "step": 1189800 - }, - { - "epoch": 12.12, - "learning_rate": 5.67805518949699e-06, - "loss": 0.2297, - "step": 1189900 - }, - { - "epoch": 12.12, - "learning_rate": 5.674211177418077e-06, - "loss": 0.205, - "step": 1190000 - }, - { - "epoch": 12.12, - "learning_rate": 5.670329520373946e-06, - "loss": 0.2191, - "step": 1190100 - }, - { - "epoch": 12.13, - "learning_rate": 5.666449052905502e-06, - "loss": 0.2219, - "step": 1190200 - }, - { - "epoch": 12.13, - "learning_rate": 5.662569775202485e-06, - "loss": 0.1995, - "step": 1190300 - }, - { - "epoch": 12.13, - "learning_rate": 5.658691687454573e-06, - "loss": 0.2285, - "step": 1190400 - }, - { - "epoch": 12.13, - "learning_rate": 5.654814789851369e-06, - "loss": 0.2425, - "step": 1190500 - }, - { - "epoch": 12.13, - "learning_rate": 5.650939082582441e-06, - "loss": 0.2197, - "step": 1190600 - }, - { - "epoch": 12.13, - "learning_rate": 5.647064565837288e-06, - "loss": 0.2013, - "step": 1190700 - }, - { - "epoch": 12.13, - "learning_rate": 5.64319123980534e-06, - "loss": 0.2198, - "step": 1190800 - }, - { - "epoch": 12.13, - "learning_rate": 5.639319104675985e-06, - "loss": 0.192, - "step": 1190900 - }, - { - "epoch": 12.13, - "learning_rate": 5.635448160638552e-06, - "loss": 0.1812, - "step": 1191000 - }, - { - "epoch": 12.14, - "learning_rate": 5.631578407882303e-06, - "loss": 0.2556, - "step": 1191100 - }, - { - "epoch": 12.14, - "learning_rate": 5.627709846596446e-06, - "loss": 0.247, - "step": 1191200 - }, - { - "epoch": 12.14, - "learning_rate": 5.623842476970137e-06, - "loss": 0.2239, - "step": 1191300 - }, - { - "epoch": 12.14, - "learning_rate": 5.619976299192458e-06, - "loss": 0.2324, - "step": 1191400 - }, - { - "epoch": 12.14, - "learning_rate": 5.616111313452444e-06, - "loss": 0.1882, - "step": 1191500 - }, - { - "epoch": 12.14, - "learning_rate": 5.612247519939077e-06, - "loss": 0.2084, - "step": 1191600 - }, - { - "epoch": 12.14, - "learning_rate": 5.608384918841263e-06, - "loss": 0.2229, - "step": 1191700 - }, - { - "epoch": 12.14, - "learning_rate": 5.604523510347864e-06, - "loss": 0.2961, - "step": 1191800 - }, - { - "epoch": 12.14, - "learning_rate": 5.60066329464768e-06, - "loss": 0.2462, - "step": 1191900 - }, - { - "epoch": 12.14, - "learning_rate": 5.596804271929462e-06, - "loss": 0.2348, - "step": 1192000 - }, - { - "epoch": 12.15, - "learning_rate": 5.592946442381876e-06, - "loss": 0.1792, - "step": 1192100 - }, - { - "epoch": 12.15, - "learning_rate": 5.5890898061935556e-06, - "loss": 0.2599, - "step": 1192200 - }, - { - "epoch": 12.15, - "learning_rate": 5.585234363553066e-06, - "loss": 0.2435, - "step": 1192300 - }, - { - "epoch": 12.15, - "learning_rate": 5.5813801146489154e-06, - "loss": 0.2803, - "step": 1192400 - }, - { - "epoch": 12.15, - "learning_rate": 5.577527059669557e-06, - "loss": 0.2307, - "step": 1192500 - }, - { - "epoch": 12.15, - "learning_rate": 5.573675198803388e-06, - "loss": 0.2317, - "step": 1192600 - }, - { - "epoch": 12.15, - "learning_rate": 5.569824532238722e-06, - "loss": 0.2588, - "step": 1192700 - }, - { - "epoch": 12.15, - "learning_rate": 5.565975060163847e-06, - "loss": 0.2538, - "step": 1192800 - }, - { - "epoch": 12.15, - "learning_rate": 5.562126782766985e-06, - "loss": 0.2845, - "step": 1192900 - }, - { - "epoch": 12.15, - "learning_rate": 5.558279700236276e-06, - "loss": 0.2356, - "step": 1193000 - }, - { - "epoch": 12.16, - "learning_rate": 5.5544338127598326e-06, - "loss": 0.2123, - "step": 1193100 - }, - { - "epoch": 12.16, - "learning_rate": 5.550589120525698e-06, - "loss": 0.2023, - "step": 1193200 - }, - { - "epoch": 12.16, - "learning_rate": 5.54674562372184e-06, - "loss": 0.2274, - "step": 1193300 - }, - { - "epoch": 12.16, - "learning_rate": 5.542903322536195e-06, - "loss": 0.2366, - "step": 1193400 - }, - { - "epoch": 12.16, - "learning_rate": 5.5390622171566195e-06, - "loss": 0.2467, - "step": 1193500 - }, - { - "epoch": 12.16, - "learning_rate": 5.535222307770932e-06, - "loss": 0.2067, - "step": 1193600 - }, - { - "epoch": 12.16, - "learning_rate": 5.531383594566871e-06, - "loss": 0.1379, - "step": 1193700 - }, - { - "epoch": 12.16, - "learning_rate": 5.527584446977841e-06, - "loss": 0.1965, - "step": 1193800 - }, - { - "epoch": 12.16, - "learning_rate": 5.52374811473356e-06, - "loss": 0.2256, - "step": 1193900 - }, - { - "epoch": 12.16, - "learning_rate": 5.519912979231934e-06, - "loss": 0.2426, - "step": 1194000 - }, - { - "epoch": 12.17, - "learning_rate": 5.516079040660469e-06, - "loss": 0.2182, - "step": 1194100 - }, - { - "epoch": 12.17, - "learning_rate": 5.5122462992066225e-06, - "loss": 0.2037, - "step": 1194200 - }, - { - "epoch": 12.17, - "learning_rate": 5.5084147550578014e-06, - "loss": 0.2147, - "step": 1194300 - }, - { - "epoch": 12.17, - "learning_rate": 5.504584408401345e-06, - "loss": 0.2347, - "step": 1194400 - }, - { - "epoch": 12.17, - "learning_rate": 5.500755259424525e-06, - "loss": 0.276, - "step": 1194500 - }, - { - "epoch": 12.17, - "learning_rate": 5.496927308314566e-06, - "loss": 0.2341, - "step": 1194600 - }, - { - "epoch": 12.17, - "learning_rate": 5.4931005552586445e-06, - "loss": 0.2088, - "step": 1194700 - }, - { - "epoch": 12.17, - "learning_rate": 5.489275000443849e-06, - "loss": 0.2089, - "step": 1194800 - }, - { - "epoch": 12.17, - "learning_rate": 5.485450644057229e-06, - "loss": 0.2107, - "step": 1194900 - }, - { - "epoch": 12.17, - "learning_rate": 5.481627486285784e-06, - "loss": 0.2007, - "step": 1195000 - }, - { - "epoch": 12.18, - "learning_rate": 5.477805527316433e-06, - "loss": 0.1798, - "step": 1195100 - }, - { - "epoch": 12.18, - "learning_rate": 5.4739847673360535e-06, - "loss": 0.2359, - "step": 1195200 - }, - { - "epoch": 12.18, - "learning_rate": 5.470165206531459e-06, - "loss": 0.2025, - "step": 1195300 - }, - { - "epoch": 12.18, - "learning_rate": 5.466346845089394e-06, - "loss": 0.2625, - "step": 1195400 - }, - { - "epoch": 12.18, - "learning_rate": 5.462529683196559e-06, - "loss": 0.2147, - "step": 1195500 - }, - { - "epoch": 12.18, - "learning_rate": 5.458713721039601e-06, - "loss": 0.2435, - "step": 1195600 - }, - { - "epoch": 12.18, - "learning_rate": 5.454898958805074e-06, - "loss": 0.2342, - "step": 1195700 - }, - { - "epoch": 12.18, - "learning_rate": 5.451085396679514e-06, - "loss": 0.2058, - "step": 1195800 - }, - { - "epoch": 12.18, - "learning_rate": 5.447273034849379e-06, - "loss": 0.2085, - "step": 1195900 - }, - { - "epoch": 12.19, - "learning_rate": 5.443499979171558e-06, - "loss": 0.2046, - "step": 1196000 - }, - { - "epoch": 12.19, - "learning_rate": 5.439690006483811e-06, - "loss": 0.1802, - "step": 1196100 - }, - { - "epoch": 12.19, - "learning_rate": 5.435881234648654e-06, - "loss": 0.224, - "step": 1196200 - }, - { - "epoch": 12.19, - "learning_rate": 5.432073663852316e-06, - "loss": 0.2383, - "step": 1196300 - }, - { - "epoch": 12.19, - "learning_rate": 5.428267294280967e-06, - "loss": 0.2441, - "step": 1196400 - }, - { - "epoch": 12.19, - "learning_rate": 5.424462126120706e-06, - "loss": 0.2703, - "step": 1196500 - }, - { - "epoch": 12.19, - "learning_rate": 5.4206581595575865e-06, - "loss": 0.2688, - "step": 1196600 - }, - { - "epoch": 12.19, - "learning_rate": 5.4168553947776e-06, - "loss": 0.2066, - "step": 1196700 - }, - { - "epoch": 12.19, - "learning_rate": 5.413053831966691e-06, - "loss": 0.2303, - "step": 1196800 - }, - { - "epoch": 12.19, - "learning_rate": 5.4092534713107115e-06, - "loss": 0.2423, - "step": 1196900 - }, - { - "epoch": 12.2, - "learning_rate": 5.405454312995488e-06, - "loss": 0.2009, - "step": 1197000 - }, - { - "epoch": 12.2, - "learning_rate": 5.4016563572067864e-06, - "loss": 0.2278, - "step": 1197100 - }, - { - "epoch": 12.2, - "learning_rate": 5.397859604130282e-06, - "loss": 0.2269, - "step": 1197200 - }, - { - "epoch": 12.2, - "learning_rate": 5.3940640539516285e-06, - "loss": 0.2075, - "step": 1197300 - }, - { - "epoch": 12.2, - "learning_rate": 5.390269706856401e-06, - "loss": 0.2453, - "step": 1197400 - }, - { - "epoch": 12.2, - "learning_rate": 5.386476563030122e-06, - "loss": 0.2014, - "step": 1197500 - }, - { - "epoch": 12.2, - "learning_rate": 5.382684622658252e-06, - "loss": 0.2342, - "step": 1197600 - }, - { - "epoch": 12.2, - "learning_rate": 5.378893885926203e-06, - "loss": 0.2383, - "step": 1197700 - }, - { - "epoch": 12.2, - "learning_rate": 5.375104353019307e-06, - "loss": 0.3112, - "step": 1197800 - }, - { - "epoch": 12.2, - "learning_rate": 5.371316024122852e-06, - "loss": 0.2621, - "step": 1197900 - }, - { - "epoch": 12.21, - "learning_rate": 5.367528899422074e-06, - "loss": 0.1848, - "step": 1198000 - }, - { - "epoch": 12.21, - "learning_rate": 5.363742979102128e-06, - "loss": 0.2435, - "step": 1198100 - }, - { - "epoch": 12.21, - "learning_rate": 5.359958263348128e-06, - "loss": 0.2564, - "step": 1198200 - }, - { - "epoch": 12.21, - "learning_rate": 5.356174752345124e-06, - "loss": 0.196, - "step": 1198300 - }, - { - "epoch": 12.21, - "learning_rate": 5.352392446278118e-06, - "loss": 0.249, - "step": 1198400 - }, - { - "epoch": 12.21, - "learning_rate": 5.3486113453320216e-06, - "loss": 0.205, - "step": 1198500 - }, - { - "epoch": 12.21, - "learning_rate": 5.34483144969172e-06, - "loss": 0.2446, - "step": 1198600 - }, - { - "epoch": 12.21, - "learning_rate": 5.341052759542026e-06, - "loss": 0.2557, - "step": 1198700 - }, - { - "epoch": 12.21, - "learning_rate": 5.337275275067699e-06, - "loss": 0.1655, - "step": 1198800 - }, - { - "epoch": 12.21, - "learning_rate": 5.333498996453426e-06, - "loss": 0.2122, - "step": 1198900 - }, - { - "epoch": 12.22, - "learning_rate": 5.3297239238838616e-06, - "loss": 0.2202, - "step": 1199000 - }, - { - "epoch": 12.22, - "learning_rate": 5.325950057543563e-06, - "loss": 0.2126, - "step": 1199100 - }, - { - "epoch": 12.22, - "learning_rate": 5.32217739761706e-06, - "loss": 0.2496, - "step": 1199200 - }, - { - "epoch": 12.22, - "learning_rate": 5.3184059442888214e-06, - "loss": 0.2643, - "step": 1199300 - }, - { - "epoch": 12.22, - "learning_rate": 5.31463569774323e-06, - "loss": 0.2007, - "step": 1199400 - }, - { - "epoch": 12.22, - "learning_rate": 5.310866658164641e-06, - "loss": 0.2174, - "step": 1199500 - }, - { - "epoch": 12.22, - "learning_rate": 5.307098825737342e-06, - "loss": 0.265, - "step": 1199600 - }, - { - "epoch": 12.22, - "learning_rate": 5.303332200645541e-06, - "loss": 0.1903, - "step": 1199700 - }, - { - "epoch": 12.22, - "learning_rate": 5.299566783073417e-06, - "loss": 0.2136, - "step": 1199800 - }, - { - "epoch": 12.22, - "learning_rate": 5.295802573205071e-06, - "loss": 0.1896, - "step": 1199900 - }, - { - "epoch": 12.23, - "learning_rate": 5.292039571224554e-06, - "loss": 0.221, - "step": 1200000 - }, - { - "epoch": 12.23, - "learning_rate": 5.28827777731585e-06, - "loss": 0.2142, - "step": 1200100 - }, - { - "epoch": 12.23, - "learning_rate": 5.284517191662891e-06, - "loss": 0.2944, - "step": 1200200 - }, - { - "epoch": 12.23, - "learning_rate": 5.280757814449557e-06, - "loss": 0.2238, - "step": 1200300 - }, - { - "epoch": 12.23, - "learning_rate": 5.276999645859638e-06, - "loss": 0.2468, - "step": 1200400 - }, - { - "epoch": 12.23, - "learning_rate": 5.2732426860769e-06, - "loss": 0.22, - "step": 1200500 - }, - { - "epoch": 12.23, - "learning_rate": 5.269486935285039e-06, - "loss": 0.2572, - "step": 1200600 - }, - { - "epoch": 12.23, - "learning_rate": 5.265732393667679e-06, - "loss": 0.2472, - "step": 1200700 - }, - { - "epoch": 12.23, - "learning_rate": 5.261979061408396e-06, - "loss": 0.227, - "step": 1200800 - }, - { - "epoch": 12.23, - "learning_rate": 5.258226938690713e-06, - "loss": 0.2292, - "step": 1200900 - }, - { - "epoch": 12.24, - "learning_rate": 5.254476025698077e-06, - "loss": 0.2499, - "step": 1201000 - }, - { - "epoch": 12.24, - "learning_rate": 5.250726322613891e-06, - "loss": 0.2848, - "step": 1201100 - }, - { - "epoch": 12.24, - "learning_rate": 5.246977829621489e-06, - "loss": 0.2267, - "step": 1201200 - }, - { - "epoch": 12.24, - "learning_rate": 5.243230546904153e-06, - "loss": 0.2004, - "step": 1201300 - }, - { - "epoch": 12.24, - "learning_rate": 5.239484474645104e-06, - "loss": 0.247, - "step": 1201400 - }, - { - "epoch": 12.24, - "learning_rate": 5.235739613027509e-06, - "loss": 0.2139, - "step": 1201500 - }, - { - "epoch": 12.24, - "learning_rate": 5.2319959622344515e-06, - "loss": 0.2193, - "step": 1201600 - }, - { - "epoch": 12.24, - "learning_rate": 5.228253522448988e-06, - "loss": 0.1867, - "step": 1201700 - }, - { - "epoch": 12.24, - "learning_rate": 5.224512293854092e-06, - "loss": 0.2241, - "step": 1201800 - }, - { - "epoch": 12.25, - "learning_rate": 5.220772276632705e-06, - "loss": 0.2311, - "step": 1201900 - }, - { - "epoch": 12.25, - "learning_rate": 5.217070853026518e-06, - "loss": 0.2056, - "step": 1202000 - }, - { - "epoch": 12.25, - "learning_rate": 5.213333246982352e-06, - "loss": 0.2375, - "step": 1202100 - }, - { - "epoch": 12.25, - "learning_rate": 5.209596852858272e-06, - "loss": 0.2576, - "step": 1202200 - }, - { - "epoch": 12.25, - "learning_rate": 5.205861670836972e-06, - "loss": 0.1892, - "step": 1202300 - }, - { - "epoch": 12.25, - "learning_rate": 5.202127701101064e-06, - "loss": 0.2904, - "step": 1202400 - }, - { - "epoch": 12.25, - "learning_rate": 5.198394943833125e-06, - "loss": 0.2432, - "step": 1202500 - }, - { - "epoch": 12.25, - "learning_rate": 5.194663399215666e-06, - "loss": 0.2085, - "step": 1202600 - }, - { - "epoch": 12.25, - "learning_rate": 5.190933067431134e-06, - "loss": 0.1573, - "step": 1202700 - }, - { - "epoch": 12.25, - "learning_rate": 5.187203948661927e-06, - "loss": 0.2333, - "step": 1202800 - }, - { - "epoch": 12.26, - "learning_rate": 5.183476043090366e-06, - "loss": 0.2463, - "step": 1202900 - }, - { - "epoch": 12.26, - "learning_rate": 5.179749350898741e-06, - "loss": 0.2392, - "step": 1203000 - }, - { - "epoch": 12.26, - "learning_rate": 5.176023872269244e-06, - "loss": 0.246, - "step": 1203100 - }, - { - "epoch": 12.26, - "learning_rate": 5.172299607384039e-06, - "loss": 0.2328, - "step": 1203200 - }, - { - "epoch": 12.26, - "learning_rate": 5.168576556425227e-06, - "loss": 0.2144, - "step": 1203300 - }, - { - "epoch": 12.26, - "learning_rate": 5.164854719574827e-06, - "loss": 0.1606, - "step": 1203400 - }, - { - "epoch": 12.26, - "learning_rate": 5.161134097014824e-06, - "loss": 0.2422, - "step": 1203500 - }, - { - "epoch": 12.26, - "learning_rate": 5.157414688927139e-06, - "loss": 0.2301, - "step": 1203600 - }, - { - "epoch": 12.26, - "learning_rate": 5.153696495493615e-06, - "loss": 0.2578, - "step": 1203700 - }, - { - "epoch": 12.26, - "learning_rate": 5.149979516896056e-06, - "loss": 0.2068, - "step": 1203800 - }, - { - "epoch": 12.27, - "learning_rate": 5.146263753316204e-06, - "loss": 0.2467, - "step": 1203900 - }, - { - "epoch": 12.27, - "learning_rate": 5.142549204935736e-06, - "loss": 0.2034, - "step": 1204000 - }, - { - "epoch": 12.27, - "learning_rate": 5.138835871936267e-06, - "loss": 0.1897, - "step": 1204100 - }, - { - "epoch": 12.27, - "learning_rate": 5.135123754499362e-06, - "loss": 0.1804, - "step": 1204200 - }, - { - "epoch": 12.27, - "learning_rate": 5.13141285280653e-06, - "loss": 0.2281, - "step": 1204300 - }, - { - "epoch": 12.27, - "learning_rate": 5.127703167039191e-06, - "loss": 0.2516, - "step": 1204400 - }, - { - "epoch": 12.27, - "learning_rate": 5.1239946973787365e-06, - "loss": 0.2353, - "step": 1204500 - }, - { - "epoch": 12.27, - "learning_rate": 5.1202874440064964e-06, - "loss": 0.2209, - "step": 1204600 - }, - { - "epoch": 12.27, - "learning_rate": 5.116581407103721e-06, - "loss": 0.214, - "step": 1204700 - }, - { - "epoch": 12.27, - "learning_rate": 5.112876586851616e-06, - "loss": 0.2345, - "step": 1204800 - }, - { - "epoch": 12.28, - "learning_rate": 5.109172983431336e-06, - "loss": 0.2109, - "step": 1204900 - }, - { - "epoch": 12.28, - "learning_rate": 5.105470597023947e-06, - "loss": 0.1926, - "step": 1205000 - }, - { - "epoch": 12.28, - "learning_rate": 5.101769427810486e-06, - "loss": 0.2467, - "step": 1205100 - }, - { - "epoch": 12.28, - "learning_rate": 5.098069475971913e-06, - "loss": 0.2104, - "step": 1205200 - }, - { - "epoch": 12.28, - "learning_rate": 5.094370741689137e-06, - "loss": 0.1784, - "step": 1205300 - }, - { - "epoch": 12.28, - "learning_rate": 5.090673225143002e-06, - "loss": 0.163, - "step": 1205400 - }, - { - "epoch": 12.28, - "learning_rate": 5.086976926514301e-06, - "loss": 0.2049, - "step": 1205500 - }, - { - "epoch": 12.28, - "learning_rate": 5.083281845983752e-06, - "loss": 0.2129, - "step": 1205600 - }, - { - "epoch": 12.28, - "learning_rate": 5.079624916323469e-06, - "loss": 0.1898, - "step": 1205700 - }, - { - "epoch": 12.28, - "learning_rate": 5.0759322603456814e-06, - "loss": 0.1772, - "step": 1205800 - }, - { - "epoch": 12.29, - "learning_rate": 5.072240823006068e-06, - "loss": 0.2404, - "step": 1205900 - }, - { - "epoch": 12.29, - "learning_rate": 5.068550604485124e-06, - "loss": 0.245, - "step": 1206000 - }, - { - "epoch": 12.29, - "learning_rate": 5.0648616049632675e-06, - "loss": 0.2545, - "step": 1206100 - }, - { - "epoch": 12.29, - "learning_rate": 5.061173824620869e-06, - "loss": 0.2213, - "step": 1206200 - }, - { - "epoch": 12.29, - "learning_rate": 5.057487263638247e-06, - "loss": 0.2699, - "step": 1206300 - }, - { - "epoch": 12.29, - "learning_rate": 5.0538019221956474e-06, - "loss": 0.2109, - "step": 1206400 - }, - { - "epoch": 12.29, - "learning_rate": 5.050117800473263e-06, - "loss": 0.2102, - "step": 1206500 - }, - { - "epoch": 12.29, - "learning_rate": 5.046434898651225e-06, - "loss": 0.281, - "step": 1206600 - }, - { - "epoch": 12.29, - "learning_rate": 5.042753216909612e-06, - "loss": 0.2118, - "step": 1206700 - }, - { - "epoch": 12.3, - "learning_rate": 5.0390727554284235e-06, - "loss": 0.2471, - "step": 1206800 - }, - { - "epoch": 12.3, - "learning_rate": 5.035393514387618e-06, - "loss": 0.2128, - "step": 1206900 - }, - { - "epoch": 12.3, - "learning_rate": 5.031715493967096e-06, - "loss": 0.2636, - "step": 1207000 - }, - { - "epoch": 12.3, - "learning_rate": 5.028038694346673e-06, - "loss": 0.2773, - "step": 1207100 - }, - { - "epoch": 12.3, - "learning_rate": 5.0243631157061365e-06, - "loss": 0.1882, - "step": 1207200 - }, - { - "epoch": 12.3, - "learning_rate": 5.020688758225203e-06, - "loss": 0.2464, - "step": 1207300 - }, - { - "epoch": 12.3, - "learning_rate": 5.017015622083514e-06, - "loss": 0.1952, - "step": 1207400 - }, - { - "epoch": 12.3, - "learning_rate": 5.01334370746067e-06, - "loss": 0.2506, - "step": 1207500 - }, - { - "epoch": 12.3, - "learning_rate": 5.009673014536206e-06, - "loss": 0.2445, - "step": 1207600 - }, - { - "epoch": 12.3, - "learning_rate": 5.006003543489597e-06, - "loss": 0.2303, - "step": 1207700 - }, - { - "epoch": 12.31, - "learning_rate": 5.002335294500257e-06, - "loss": 0.2483, - "step": 1207800 - }, - { - "epoch": 12.31, - "learning_rate": 4.998668267747552e-06, - "loss": 0.2403, - "step": 1207900 - }, - { - "epoch": 12.31, - "learning_rate": 4.995002463410762e-06, - "loss": 0.2162, - "step": 1208000 - }, - { - "epoch": 12.31, - "learning_rate": 4.991374521434112e-06, - "loss": 0.2039, - "step": 1208100 - }, - { - "epoch": 12.31, - "learning_rate": 4.9877111502381845e-06, - "loss": 0.2229, - "step": 1208200 - }, - { - "epoch": 12.31, - "learning_rate": 4.984049001993913e-06, - "loss": 0.1642, - "step": 1208300 - }, - { - "epoch": 12.31, - "learning_rate": 4.9803880768803655e-06, - "loss": 0.2086, - "step": 1208400 - }, - { - "epoch": 12.31, - "learning_rate": 4.976728375076525e-06, - "loss": 0.2273, - "step": 1208500 - }, - { - "epoch": 12.31, - "learning_rate": 4.973069896761334e-06, - "loss": 0.212, - "step": 1208600 - }, - { - "epoch": 12.31, - "learning_rate": 4.969412642113685e-06, - "loss": 0.1801, - "step": 1208700 - }, - { - "epoch": 12.32, - "learning_rate": 4.965756611312369e-06, - "loss": 0.2016, - "step": 1208800 - }, - { - "epoch": 12.32, - "learning_rate": 4.962101804536161e-06, - "loss": 0.2255, - "step": 1208900 - }, - { - "epoch": 12.32, - "learning_rate": 4.9584482219637585e-06, - "loss": 0.2341, - "step": 1209000 - }, - { - "epoch": 12.32, - "learning_rate": 4.9547958637737946e-06, - "loss": 0.1781, - "step": 1209100 - }, - { - "epoch": 12.32, - "learning_rate": 4.951144730144853e-06, - "loss": 0.2156, - "step": 1209200 - }, - { - "epoch": 12.32, - "learning_rate": 4.9474948212554484e-06, - "loss": 0.2324, - "step": 1209300 - }, - { - "epoch": 12.32, - "learning_rate": 4.943846137284051e-06, - "loss": 0.1833, - "step": 1209400 - }, - { - "epoch": 12.32, - "learning_rate": 4.9402351469329746e-06, - "loss": 0.1997, - "step": 1209500 - }, - { - "epoch": 12.32, - "learning_rate": 4.936588901079078e-06, - "loss": 0.2267, - "step": 1209600 - }, - { - "epoch": 12.32, - "learning_rate": 4.932943880676408e-06, - "loss": 0.236, - "step": 1209700 - }, - { - "epoch": 12.33, - "learning_rate": 4.929300085903187e-06, - "loss": 0.2224, - "step": 1209800 - }, - { - "epoch": 12.33, - "learning_rate": 4.925657516937587e-06, - "loss": 0.1528, - "step": 1209900 - }, - { - "epoch": 12.33, - "learning_rate": 4.922016173957691e-06, - "loss": 0.2231, - "step": 1210000 - }, - { - "epoch": 12.33, - "learning_rate": 4.918376057141545e-06, - "loss": 0.2014, - "step": 1210100 - }, - { - "epoch": 12.33, - "learning_rate": 4.914737166667133e-06, - "loss": 0.1906, - "step": 1210200 - }, - { - "epoch": 12.33, - "learning_rate": 4.911099502712372e-06, - "loss": 0.2156, - "step": 1210300 - }, - { - "epoch": 12.33, - "learning_rate": 4.90746306545512e-06, - "loss": 0.2098, - "step": 1210400 - }, - { - "epoch": 12.33, - "learning_rate": 4.90382785507319e-06, - "loss": 0.203, - "step": 1210500 - }, - { - "epoch": 12.33, - "learning_rate": 4.900193871744302e-06, - "loss": 0.2511, - "step": 1210600 - }, - { - "epoch": 12.33, - "learning_rate": 4.8965611156461476e-06, - "loss": 0.2519, - "step": 1210700 - }, - { - "epoch": 12.34, - "learning_rate": 4.892929586956352e-06, - "loss": 0.2074, - "step": 1210800 - }, - { - "epoch": 12.34, - "learning_rate": 4.889299285852462e-06, - "loss": 0.2638, - "step": 1210900 - }, - { - "epoch": 12.34, - "learning_rate": 4.885670212511984e-06, - "loss": 0.2433, - "step": 1211000 - }, - { - "epoch": 12.34, - "learning_rate": 4.882042367112364e-06, - "loss": 0.172, - "step": 1211100 - }, - { - "epoch": 12.34, - "learning_rate": 4.878415749830968e-06, - "loss": 0.2011, - "step": 1211200 - }, - { - "epoch": 12.34, - "learning_rate": 4.874790360845127e-06, - "loss": 0.2044, - "step": 1211300 - }, - { - "epoch": 12.34, - "learning_rate": 4.8711662003320965e-06, - "loss": 0.1757, - "step": 1211400 - }, - { - "epoch": 12.34, - "learning_rate": 4.867543268469074e-06, - "loss": 0.2454, - "step": 1211500 - }, - { - "epoch": 12.34, - "learning_rate": 4.863921565433209e-06, - "loss": 0.2564, - "step": 1211600 - }, - { - "epoch": 12.35, - "learning_rate": 4.860301091401572e-06, - "loss": 0.1924, - "step": 1211700 - }, - { - "epoch": 12.35, - "learning_rate": 4.856681846551196e-06, - "loss": 0.239, - "step": 1211800 - }, - { - "epoch": 12.35, - "learning_rate": 4.85306383105902e-06, - "loss": 0.208, - "step": 1211900 - }, - { - "epoch": 12.35, - "learning_rate": 4.849447045101952e-06, - "loss": 0.2724, - "step": 1212000 - }, - { - "epoch": 12.35, - "learning_rate": 4.845831488856843e-06, - "loss": 0.244, - "step": 1212100 - }, - { - "epoch": 12.35, - "learning_rate": 4.842217162500455e-06, - "loss": 0.253, - "step": 1212200 - }, - { - "epoch": 12.35, - "learning_rate": 4.8386040662095135e-06, - "loss": 0.2259, - "step": 1212300 - }, - { - "epoch": 12.35, - "learning_rate": 4.834992200160686e-06, - "loss": 0.1815, - "step": 1212400 - }, - { - "epoch": 12.35, - "learning_rate": 4.831381564530552e-06, - "loss": 0.2084, - "step": 1212500 - }, - { - "epoch": 12.35, - "learning_rate": 4.827772159495665e-06, - "loss": 0.2744, - "step": 1212600 - }, - { - "epoch": 12.36, - "learning_rate": 4.824200060882236e-06, - "loss": 0.2409, - "step": 1212700 - }, - { - "epoch": 12.36, - "learning_rate": 4.820593105256853e-06, - "loss": 0.1888, - "step": 1212800 - }, - { - "epoch": 12.36, - "learning_rate": 4.816987380754204e-06, - "loss": 0.2191, - "step": 1212900 - }, - { - "epoch": 12.36, - "learning_rate": 4.8133828875505915e-06, - "loss": 0.2571, - "step": 1213000 - }, - { - "epoch": 12.36, - "learning_rate": 4.809779625822247e-06, - "loss": 0.2353, - "step": 1213100 - }, - { - "epoch": 12.36, - "learning_rate": 4.806177595745357e-06, - "loss": 0.2062, - "step": 1213200 - }, - { - "epoch": 12.36, - "learning_rate": 4.80257679749603e-06, - "loss": 0.1968, - "step": 1213300 - }, - { - "epoch": 12.36, - "learning_rate": 4.7989772312503256e-06, - "loss": 0.2049, - "step": 1213400 - }, - { - "epoch": 12.36, - "learning_rate": 4.795378897184249e-06, - "loss": 0.2342, - "step": 1213500 - }, - { - "epoch": 12.36, - "learning_rate": 4.7917817954737275e-06, - "loss": 0.2101, - "step": 1213600 - }, - { - "epoch": 12.37, - "learning_rate": 4.788185926294639e-06, - "loss": 0.2385, - "step": 1213700 - }, - { - "epoch": 12.37, - "learning_rate": 4.784591289822801e-06, - "loss": 0.2374, - "step": 1213800 - }, - { - "epoch": 12.37, - "learning_rate": 4.780997886233981e-06, - "loss": 0.2039, - "step": 1213900 - }, - { - "epoch": 12.37, - "learning_rate": 4.777405715703856e-06, - "loss": 0.2387, - "step": 1214000 - }, - { - "epoch": 12.37, - "learning_rate": 4.773814778408073e-06, - "loss": 0.2071, - "step": 1214100 - }, - { - "epoch": 12.37, - "learning_rate": 4.770225074522204e-06, - "loss": 0.1997, - "step": 1214200 - }, - { - "epoch": 12.37, - "learning_rate": 4.7666366042217655e-06, - "loss": 0.2791, - "step": 1214300 - }, - { - "epoch": 12.37, - "learning_rate": 4.7630493676822105e-06, - "loss": 0.2756, - "step": 1214400 - }, - { - "epoch": 12.37, - "learning_rate": 4.759463365078945e-06, - "loss": 0.2188, - "step": 1214500 - }, - { - "epoch": 12.37, - "learning_rate": 4.755878596587286e-06, - "loss": 0.2, - "step": 1214600 - }, - { - "epoch": 12.38, - "learning_rate": 4.752295062382514e-06, - "loss": 0.2411, - "step": 1214700 - }, - { - "epoch": 12.38, - "learning_rate": 4.74871276263985e-06, - "loss": 0.204, - "step": 1214800 - }, - { - "epoch": 12.38, - "learning_rate": 4.745131697534432e-06, - "loss": 0.2263, - "step": 1214900 - }, - { - "epoch": 12.38, - "learning_rate": 4.7415518672413625e-06, - "loss": 0.2215, - "step": 1215000 - }, - { - "epoch": 12.38, - "learning_rate": 4.7379732719356795e-06, - "loss": 0.1962, - "step": 1215100 - }, - { - "epoch": 12.38, - "learning_rate": 4.734395911792338e-06, - "loss": 0.1752, - "step": 1215200 - }, - { - "epoch": 12.38, - "learning_rate": 4.73081978698626e-06, - "loss": 0.2872, - "step": 1215300 - }, - { - "epoch": 12.38, - "learning_rate": 4.727244897692298e-06, - "loss": 0.255, - "step": 1215400 - }, - { - "epoch": 12.38, - "learning_rate": 4.723671244085238e-06, - "loss": 0.1877, - "step": 1215500 - }, - { - "epoch": 12.38, - "learning_rate": 4.720098826339815e-06, - "loss": 0.1911, - "step": 1215600 - }, - { - "epoch": 12.39, - "learning_rate": 4.716527644630695e-06, - "loss": 0.1827, - "step": 1215700 - }, - { - "epoch": 12.39, - "learning_rate": 4.712957699132498e-06, - "loss": 0.1595, - "step": 1215800 - }, - { - "epoch": 12.39, - "learning_rate": 4.7093889900197565e-06, - "loss": 0.2572, - "step": 1215900 - }, - { - "epoch": 12.39, - "learning_rate": 4.705821517466968e-06, - "loss": 0.2585, - "step": 1216000 - }, - { - "epoch": 12.39, - "learning_rate": 4.702255281648565e-06, - "loss": 0.2224, - "step": 1216100 - }, - { - "epoch": 12.39, - "learning_rate": 4.698725926604726e-06, - "loss": 0.2318, - "step": 1216200 - }, - { - "epoch": 12.39, - "learning_rate": 4.695162152406431e-06, - "loss": 0.2037, - "step": 1216300 - }, - { - "epoch": 12.39, - "learning_rate": 4.6915996154636905e-06, - "loss": 0.2826, - "step": 1216400 - }, - { - "epoch": 12.39, - "learning_rate": 4.688038315950695e-06, - "loss": 0.1848, - "step": 1216500 - }, - { - "epoch": 12.39, - "learning_rate": 4.684478254041583e-06, - "loss": 0.2271, - "step": 1216600 - }, - { - "epoch": 12.4, - "learning_rate": 4.680919429910397e-06, - "loss": 0.1999, - "step": 1216700 - }, - { - "epoch": 12.4, - "learning_rate": 4.67736184373116e-06, - "loss": 0.2465, - "step": 1216800 - }, - { - "epoch": 12.4, - "learning_rate": 4.673805495677806e-06, - "loss": 0.2279, - "step": 1216900 - }, - { - "epoch": 12.4, - "learning_rate": 4.670250385924226e-06, - "loss": 0.2228, - "step": 1217000 - }, - { - "epoch": 12.4, - "learning_rate": 4.666696514644243e-06, - "loss": 0.2462, - "step": 1217100 - }, - { - "epoch": 12.4, - "learning_rate": 4.663143882011625e-06, - "loss": 0.1899, - "step": 1217200 - }, - { - "epoch": 12.4, - "learning_rate": 4.659592488200062e-06, - "loss": 0.2102, - "step": 1217300 - }, - { - "epoch": 12.4, - "learning_rate": 4.656042333383203e-06, - "loss": 0.2162, - "step": 1217400 - }, - { - "epoch": 12.4, - "learning_rate": 4.652493417734638e-06, - "loss": 0.1812, - "step": 1217500 - }, - { - "epoch": 12.41, - "learning_rate": 4.648945741427869e-06, - "loss": 0.2153, - "step": 1217600 - }, - { - "epoch": 12.41, - "learning_rate": 4.645399304636367e-06, - "loss": 0.2368, - "step": 1217700 - }, - { - "epoch": 12.41, - "learning_rate": 4.641854107533531e-06, - "loss": 0.191, - "step": 1217800 - }, - { - "epoch": 12.41, - "learning_rate": 4.638310150292707e-06, - "loss": 0.195, - "step": 1217900 - }, - { - "epoch": 12.41, - "learning_rate": 4.634767433087158e-06, - "loss": 0.2204, - "step": 1218000 - }, - { - "epoch": 12.41, - "learning_rate": 4.631225956090109e-06, - "loss": 0.194, - "step": 1218100 - }, - { - "epoch": 12.41, - "learning_rate": 4.627685719474719e-06, - "loss": 0.2484, - "step": 1218200 - }, - { - "epoch": 12.41, - "learning_rate": 4.62418210723338e-06, - "loss": 0.2406, - "step": 1218300 - }, - { - "epoch": 12.41, - "learning_rate": 4.620644339492399e-06, - "loss": 0.2459, - "step": 1218400 - }, - { - "epoch": 12.41, - "learning_rate": 4.6171078126504554e-06, - "loss": 0.2636, - "step": 1218500 - }, - { - "epoch": 12.42, - "learning_rate": 4.613572526880459e-06, - "loss": 0.2096, - "step": 1218600 - }, - { - "epoch": 12.42, - "learning_rate": 4.610038482355277e-06, - "loss": 0.2256, - "step": 1218700 - }, - { - "epoch": 12.42, - "learning_rate": 4.6065056792476875e-06, - "loss": 0.2083, - "step": 1218800 - }, - { - "epoch": 12.42, - "learning_rate": 4.602974117730432e-06, - "loss": 0.2418, - "step": 1218900 - }, - { - "epoch": 12.42, - "learning_rate": 4.599443797976185e-06, - "loss": 0.2429, - "step": 1219000 - }, - { - "epoch": 12.42, - "learning_rate": 4.595914720157546e-06, - "loss": 0.2151, - "step": 1219100 - }, - { - "epoch": 12.42, - "learning_rate": 4.592386884447076e-06, - "loss": 0.2322, - "step": 1219200 - }, - { - "epoch": 12.42, - "learning_rate": 4.588860291017262e-06, - "loss": 0.2162, - "step": 1219300 - }, - { - "epoch": 12.42, - "learning_rate": 4.585334940040531e-06, - "loss": 0.1853, - "step": 1219400 - }, - { - "epoch": 12.42, - "learning_rate": 4.581810831689259e-06, - "loss": 0.2322, - "step": 1219500 - }, - { - "epoch": 12.43, - "learning_rate": 4.578287966135753e-06, - "loss": 0.2736, - "step": 1219600 - }, - { - "epoch": 12.43, - "learning_rate": 4.574766343552252e-06, - "loss": 0.2395, - "step": 1219700 - }, - { - "epoch": 12.43, - "learning_rate": 4.571245964110945e-06, - "loss": 0.2142, - "step": 1219800 - }, - { - "epoch": 12.43, - "learning_rate": 4.567726827983969e-06, - "loss": 0.219, - "step": 1219900 - }, - { - "epoch": 12.43, - "learning_rate": 4.56420893534337e-06, - "loss": 0.2038, - "step": 1220000 - }, - { - "epoch": 12.43, - "learning_rate": 4.560692286361161e-06, - "loss": 0.2089, - "step": 1220100 - }, - { - "epoch": 12.43, - "learning_rate": 4.557176881209287e-06, - "loss": 0.2244, - "step": 1220200 - }, - { - "epoch": 12.43, - "learning_rate": 4.553662720059635e-06, - "loss": 0.2778, - "step": 1220300 - }, - { - "epoch": 12.43, - "learning_rate": 4.550149803084012e-06, - "loss": 0.196, - "step": 1220400 - }, - { - "epoch": 12.43, - "learning_rate": 4.546638130454187e-06, - "loss": 0.24, - "step": 1220500 - }, - { - "epoch": 12.44, - "learning_rate": 4.543127702341864e-06, - "loss": 0.2198, - "step": 1220600 - }, - { - "epoch": 12.44, - "learning_rate": 4.539618518918673e-06, - "loss": 0.226, - "step": 1220700 - }, - { - "epoch": 12.44, - "learning_rate": 4.536110580356199e-06, - "loss": 0.3253, - "step": 1220800 - }, - { - "epoch": 12.44, - "learning_rate": 4.5326038868259625e-06, - "loss": 0.2325, - "step": 1220900 - }, - { - "epoch": 12.44, - "learning_rate": 4.529098438499406e-06, - "loss": 0.213, - "step": 1221000 - }, - { - "epoch": 12.44, - "learning_rate": 4.525594235547936e-06, - "loss": 0.2438, - "step": 1221100 - }, - { - "epoch": 12.44, - "learning_rate": 4.522091278142892e-06, - "loss": 0.2613, - "step": 1221200 - }, - { - "epoch": 12.44, - "learning_rate": 4.518589566455534e-06, - "loss": 0.2278, - "step": 1221300 - }, - { - "epoch": 12.44, - "learning_rate": 4.515089100657078e-06, - "loss": 0.2247, - "step": 1221400 - }, - { - "epoch": 12.44, - "learning_rate": 4.5115898809186876e-06, - "loss": 0.2167, - "step": 1221500 - }, - { - "epoch": 12.45, - "learning_rate": 4.508091907411438e-06, - "loss": 0.2475, - "step": 1221600 - }, - { - "epoch": 12.45, - "learning_rate": 4.504595180306367e-06, - "loss": 0.1793, - "step": 1221700 - }, - { - "epoch": 12.45, - "learning_rate": 4.501099699774444e-06, - "loss": 0.2718, - "step": 1221800 - }, - { - "epoch": 12.45, - "learning_rate": 4.4976054659865754e-06, - "loss": 0.2141, - "step": 1221900 - }, - { - "epoch": 12.45, - "learning_rate": 4.494112479113608e-06, - "loss": 0.2188, - "step": 1222000 - }, - { - "epoch": 12.45, - "learning_rate": 4.4906207393263285e-06, - "loss": 0.1991, - "step": 1222100 - }, - { - "epoch": 12.45, - "learning_rate": 4.487130246795471e-06, - "loss": 0.2193, - "step": 1222200 - }, - { - "epoch": 12.45, - "learning_rate": 4.483641001691685e-06, - "loss": 0.2267, - "step": 1222300 - }, - { - "epoch": 12.45, - "learning_rate": 4.480153004185581e-06, - "loss": 0.2337, - "step": 1222400 - }, - { - "epoch": 12.46, - "learning_rate": 4.476666254447706e-06, - "loss": 0.2332, - "step": 1222500 - }, - { - "epoch": 12.46, - "learning_rate": 4.473180752648529e-06, - "loss": 0.2619, - "step": 1222600 - }, - { - "epoch": 12.46, - "learning_rate": 4.46973133531668e-06, - "loss": 0.2063, - "step": 1222700 - }, - { - "epoch": 12.46, - "learning_rate": 4.466248317422476e-06, - "loss": 0.2533, - "step": 1222800 - }, - { - "epoch": 12.46, - "learning_rate": 4.462766547976353e-06, - "loss": 0.2569, - "step": 1222900 - }, - { - "epoch": 12.46, - "learning_rate": 4.459286027148551e-06, - "loss": 0.2066, - "step": 1223000 - }, - { - "epoch": 12.46, - "learning_rate": 4.4558067551092366e-06, - "loss": 0.2132, - "step": 1223100 - }, - { - "epoch": 12.46, - "learning_rate": 4.452328732028534e-06, - "loss": 0.1798, - "step": 1223200 - }, - { - "epoch": 12.46, - "learning_rate": 4.448851958076497e-06, - "loss": 0.2261, - "step": 1223300 - }, - { - "epoch": 12.46, - "learning_rate": 4.445376433423113e-06, - "loss": 0.2648, - "step": 1223400 - }, - { - "epoch": 12.47, - "learning_rate": 4.441902158238325e-06, - "loss": 0.2231, - "step": 1223500 - }, - { - "epoch": 12.47, - "learning_rate": 4.438429132692002e-06, - "loss": 0.2391, - "step": 1223600 - }, - { - "epoch": 12.47, - "learning_rate": 4.4349573569539435e-06, - "loss": 0.1969, - "step": 1223700 - }, - { - "epoch": 12.47, - "learning_rate": 4.43148683119391e-06, - "loss": 0.2293, - "step": 1223800 - }, - { - "epoch": 12.47, - "learning_rate": 4.428017555581587e-06, - "loss": 0.198, - "step": 1223900 - }, - { - "epoch": 12.47, - "learning_rate": 4.424549530286598e-06, - "loss": 0.2304, - "step": 1224000 - }, - { - "epoch": 12.47, - "learning_rate": 4.421082755478509e-06, - "loss": 0.2181, - "step": 1224100 - }, - { - "epoch": 12.47, - "learning_rate": 4.417617231326833e-06, - "loss": 0.2346, - "step": 1224200 - }, - { - "epoch": 12.47, - "learning_rate": 4.4141529580009985e-06, - "loss": 0.2787, - "step": 1224300 - }, - { - "epoch": 12.47, - "learning_rate": 4.410689935670396e-06, - "loss": 0.2352, - "step": 1224400 - }, - { - "epoch": 12.48, - "learning_rate": 4.407228164504348e-06, - "loss": 0.2453, - "step": 1224500 - }, - { - "epoch": 12.48, - "learning_rate": 4.403767644672113e-06, - "loss": 0.1951, - "step": 1224600 - }, - { - "epoch": 12.48, - "learning_rate": 4.40030837634289e-06, - "loss": 0.2303, - "step": 1224700 - }, - { - "epoch": 12.48, - "learning_rate": 4.396850359685816e-06, - "loss": 0.1946, - "step": 1224800 - }, - { - "epoch": 12.48, - "learning_rate": 4.393393594869975e-06, - "loss": 0.2173, - "step": 1224900 - }, - { - "epoch": 12.48, - "learning_rate": 4.389938082064368e-06, - "loss": 0.1811, - "step": 1225000 - }, - { - "epoch": 12.48, - "learning_rate": 4.386483821437957e-06, - "loss": 0.2216, - "step": 1225100 - }, - { - "epoch": 12.48, - "learning_rate": 4.383030813159638e-06, - "loss": 0.2125, - "step": 1225200 - }, - { - "epoch": 12.48, - "learning_rate": 4.379579057398232e-06, - "loss": 0.2275, - "step": 1225300 - }, - { - "epoch": 12.48, - "learning_rate": 4.376128554322518e-06, - "loss": 0.2449, - "step": 1225400 - }, - { - "epoch": 12.49, - "learning_rate": 4.372679304101208e-06, - "loss": 0.2773, - "step": 1225500 - }, - { - "epoch": 12.49, - "learning_rate": 4.3692313069029366e-06, - "loss": 0.2104, - "step": 1225600 - }, - { - "epoch": 12.49, - "learning_rate": 4.365784562896296e-06, - "loss": 0.2496, - "step": 1225700 - }, - { - "epoch": 12.49, - "learning_rate": 4.362339072249817e-06, - "loss": 0.2082, - "step": 1225800 - }, - { - "epoch": 12.49, - "learning_rate": 4.358894835131956e-06, - "loss": 0.1835, - "step": 1225900 - }, - { - "epoch": 12.49, - "learning_rate": 4.3554518517111206e-06, - "loss": 0.256, - "step": 1226000 - }, - { - "epoch": 12.49, - "learning_rate": 4.352010122155654e-06, - "loss": 0.2591, - "step": 1226100 - }, - { - "epoch": 12.49, - "learning_rate": 4.348569646633838e-06, - "loss": 0.2852, - "step": 1226200 - }, - { - "epoch": 12.49, - "learning_rate": 4.34513042531388e-06, - "loss": 0.285, - "step": 1226300 - }, - { - "epoch": 12.49, - "learning_rate": 4.341692458363944e-06, - "loss": 0.2228, - "step": 1226400 - }, - { - "epoch": 12.5, - "learning_rate": 4.3382557459521314e-06, - "loss": 0.1911, - "step": 1226500 - }, - { - "epoch": 12.5, - "learning_rate": 4.334820288246467e-06, - "loss": 0.174, - "step": 1226600 - }, - { - "epoch": 12.5, - "learning_rate": 4.331386085414924e-06, - "loss": 0.2197, - "step": 1226700 - }, - { - "epoch": 12.5, - "learning_rate": 4.327987460890313e-06, - "loss": 0.2346, - "step": 1226800 - }, - { - "epoch": 12.5, - "learning_rate": 4.324555755757771e-06, - "loss": 0.2029, - "step": 1226900 - }, - { - "epoch": 12.5, - "learning_rate": 4.321125306001229e-06, - "loss": 0.1936, - "step": 1227000 - }, - { - "epoch": 12.5, - "learning_rate": 4.317696111788416e-06, - "loss": 0.2335, - "step": 1227100 - }, - { - "epoch": 12.5, - "learning_rate": 4.314268173287002e-06, - "loss": 0.2381, - "step": 1227200 - }, - { - "epoch": 12.5, - "learning_rate": 4.3108414906645946e-06, - "loss": 0.2339, - "step": 1227300 - }, - { - "epoch": 12.5, - "learning_rate": 4.307416064088724e-06, - "loss": 0.215, - "step": 1227400 - }, - { - "epoch": 12.51, - "learning_rate": 4.303991893726886e-06, - "loss": 0.2418, - "step": 1227500 - }, - { - "epoch": 12.51, - "learning_rate": 4.300568979746504e-06, - "loss": 0.2067, - "step": 1227600 - }, - { - "epoch": 12.51, - "learning_rate": 4.297147322314927e-06, - "loss": 0.1888, - "step": 1227700 - }, - { - "epoch": 12.51, - "learning_rate": 4.293761119385321e-06, - "loss": 0.2111, - "step": 1227800 - }, - { - "epoch": 12.51, - "learning_rate": 4.2903419629835406e-06, - "loss": 0.2542, - "step": 1227900 - }, - { - "epoch": 12.51, - "learning_rate": 4.286924063630605e-06, - "loss": 0.2368, - "step": 1228000 - }, - { - "epoch": 12.51, - "learning_rate": 4.283507421493648e-06, - "loss": 0.2724, - "step": 1228100 - }, - { - "epoch": 12.51, - "learning_rate": 4.280092036739698e-06, - "loss": 0.2585, - "step": 1228200 - }, - { - "epoch": 12.51, - "learning_rate": 4.2767120445823785e-06, - "loss": 0.2619, - "step": 1228300 - }, - { - "epoch": 12.52, - "learning_rate": 4.273299162517388e-06, - "loss": 0.2783, - "step": 1228400 - }, - { - "epoch": 12.52, - "learning_rate": 4.269887538334539e-06, - "loss": 0.1975, - "step": 1228500 - }, - { - "epoch": 12.52, - "learning_rate": 4.2664771722006455e-06, - "loss": 0.214, - "step": 1228600 - }, - { - "epoch": 12.52, - "learning_rate": 4.2630680642824475e-06, - "loss": 0.1948, - "step": 1228700 - }, - { - "epoch": 12.52, - "learning_rate": 4.2596602147466265e-06, - "loss": 0.2242, - "step": 1228800 - }, - { - "epoch": 12.52, - "learning_rate": 4.2562536237598184e-06, - "loss": 0.206, - "step": 1228900 - }, - { - "epoch": 12.52, - "learning_rate": 4.252848291488568e-06, - "loss": 0.2079, - "step": 1229000 - }, - { - "epoch": 12.52, - "learning_rate": 4.24944421809939e-06, - "loss": 0.2565, - "step": 1229100 - }, - { - "epoch": 12.52, - "learning_rate": 4.246041403758724e-06, - "loss": 0.2417, - "step": 1229200 - }, - { - "epoch": 12.52, - "learning_rate": 4.242639848632933e-06, - "loss": 0.2478, - "step": 1229300 - }, - { - "epoch": 12.53, - "learning_rate": 4.239239552888342e-06, - "loss": 0.22, - "step": 1229400 - }, - { - "epoch": 12.53, - "learning_rate": 4.23584051669121e-06, - "loss": 0.1878, - "step": 1229500 - }, - { - "epoch": 12.53, - "learning_rate": 4.232442740207716e-06, - "loss": 0.2163, - "step": 1229600 - }, - { - "epoch": 12.53, - "learning_rate": 4.2290462236040016e-06, - "loss": 0.2526, - "step": 1229700 - }, - { - "epoch": 12.53, - "learning_rate": 4.225650967046135e-06, - "loss": 0.2172, - "step": 1229800 - }, - { - "epoch": 12.53, - "learning_rate": 4.22225697070013e-06, - "loss": 0.252, - "step": 1229900 - }, - { - "epoch": 12.53, - "learning_rate": 4.218864234731917e-06, - "loss": 0.194, - "step": 1230000 - }, - { - "epoch": 12.53, - "learning_rate": 4.2154727593073905e-06, - "loss": 0.2067, - "step": 1230100 - }, - { - "epoch": 12.53, - "learning_rate": 4.21208254459238e-06, - "loss": 0.1969, - "step": 1230200 - }, - { - "epoch": 12.53, - "learning_rate": 4.208693590752631e-06, - "loss": 0.2371, - "step": 1230300 - }, - { - "epoch": 12.54, - "learning_rate": 4.2053058979538505e-06, - "loss": 0.2593, - "step": 1230400 - }, - { - "epoch": 12.54, - "learning_rate": 4.201919466361685e-06, - "loss": 0.2375, - "step": 1230500 - }, - { - "epoch": 12.54, - "learning_rate": 4.198534296141694e-06, - "loss": 0.2355, - "step": 1230600 - }, - { - "epoch": 12.54, - "learning_rate": 4.195150387459403e-06, - "loss": 0.1997, - "step": 1230700 - }, - { - "epoch": 12.54, - "learning_rate": 4.1917677404802614e-06, - "loss": 0.2622, - "step": 1230800 - }, - { - "epoch": 12.54, - "learning_rate": 4.188386355369662e-06, - "loss": 0.2256, - "step": 1230900 - }, - { - "epoch": 12.54, - "learning_rate": 4.185006232292934e-06, - "loss": 0.2531, - "step": 1231000 - }, - { - "epoch": 12.54, - "learning_rate": 4.1816273714153504e-06, - "loss": 0.2901, - "step": 1231100 - }, - { - "epoch": 12.54, - "learning_rate": 4.178249772902108e-06, - "loss": 0.2242, - "step": 1231200 - }, - { - "epoch": 12.54, - "learning_rate": 4.174873436918355e-06, - "loss": 0.2488, - "step": 1231300 - }, - { - "epoch": 12.55, - "learning_rate": 4.171498363629175e-06, - "loss": 0.255, - "step": 1231400 - }, - { - "epoch": 12.55, - "learning_rate": 4.168124553199595e-06, - "loss": 0.1909, - "step": 1231500 - }, - { - "epoch": 12.55, - "learning_rate": 4.164752005794563e-06, - "loss": 0.2557, - "step": 1231600 - }, - { - "epoch": 12.55, - "learning_rate": 4.161380721578978e-06, - "loss": 0.2289, - "step": 1231700 - }, - { - "epoch": 12.55, - "learning_rate": 4.158010700717687e-06, - "loss": 0.2493, - "step": 1231800 - }, - { - "epoch": 12.55, - "learning_rate": 4.154641943375449e-06, - "loss": 0.209, - "step": 1231900 - }, - { - "epoch": 12.55, - "learning_rate": 4.1512744497169815e-06, - "loss": 0.2151, - "step": 1232000 - }, - { - "epoch": 12.55, - "learning_rate": 4.147908219906941e-06, - "loss": 0.2183, - "step": 1232100 - }, - { - "epoch": 12.55, - "learning_rate": 4.144543254109907e-06, - "loss": 0.2208, - "step": 1232200 - }, - { - "epoch": 12.55, - "learning_rate": 4.14117955249041e-06, - "loss": 0.2033, - "step": 1232300 - }, - { - "epoch": 12.56, - "learning_rate": 4.137817115212926e-06, - "loss": 0.2374, - "step": 1232400 - }, - { - "epoch": 12.56, - "learning_rate": 4.134455942441839e-06, - "loss": 0.2233, - "step": 1232500 - }, - { - "epoch": 12.56, - "learning_rate": 4.131096034341497e-06, - "loss": 0.2232, - "step": 1232600 - }, - { - "epoch": 12.56, - "learning_rate": 4.127770971247369e-06, - "loss": 0.2309, - "step": 1232700 - }, - { - "epoch": 12.56, - "learning_rate": 4.1244135803304915e-06, - "loss": 0.2124, - "step": 1232800 - }, - { - "epoch": 12.56, - "learning_rate": 4.121057454575383e-06, - "loss": 0.2514, - "step": 1232900 - }, - { - "epoch": 12.56, - "learning_rate": 4.1177025941461185e-06, - "loss": 0.2397, - "step": 1233000 - }, - { - "epoch": 12.56, - "learning_rate": 4.1143489992067405e-06, - "loss": 0.2597, - "step": 1233100 - }, - { - "epoch": 12.56, - "learning_rate": 4.110996669921228e-06, - "loss": 0.2075, - "step": 1233200 - }, - { - "epoch": 12.57, - "learning_rate": 4.107645606453472e-06, - "loss": 0.2468, - "step": 1233300 - }, - { - "epoch": 12.57, - "learning_rate": 4.104295808967326e-06, - "loss": 0.2435, - "step": 1233400 - }, - { - "epoch": 12.57, - "learning_rate": 4.100947277626581e-06, - "loss": 0.2433, - "step": 1233500 - }, - { - "epoch": 12.57, - "learning_rate": 4.097600012594954e-06, - "loss": 0.3267, - "step": 1233600 - }, - { - "epoch": 12.57, - "learning_rate": 4.094254014036111e-06, - "loss": 0.2352, - "step": 1233700 - }, - { - "epoch": 12.57, - "learning_rate": 4.0909092821136495e-06, - "loss": 0.1993, - "step": 1233800 - }, - { - "epoch": 12.57, - "learning_rate": 4.087565816991113e-06, - "loss": 0.2072, - "step": 1233900 - }, - { - "epoch": 12.57, - "learning_rate": 4.084223618831961e-06, - "loss": 0.2152, - "step": 1234000 - }, - { - "epoch": 12.57, - "learning_rate": 4.08088268779962e-06, - "loss": 0.2207, - "step": 1234100 - }, - { - "epoch": 12.57, - "learning_rate": 4.077543024057444e-06, - "loss": 0.2289, - "step": 1234200 - }, - { - "epoch": 12.58, - "learning_rate": 4.07420462776871e-06, - "loss": 0.2025, - "step": 1234300 - }, - { - "epoch": 12.58, - "learning_rate": 4.070867499096655e-06, - "loss": 0.2608, - "step": 1234400 - }, - { - "epoch": 12.58, - "learning_rate": 4.067531638204448e-06, - "loss": 0.2107, - "step": 1234500 - }, - { - "epoch": 12.58, - "learning_rate": 4.064197045255178e-06, - "loss": 0.2346, - "step": 1234600 - }, - { - "epoch": 12.58, - "learning_rate": 4.060863720411898e-06, - "loss": 0.2021, - "step": 1234700 - }, - { - "epoch": 12.58, - "learning_rate": 4.057531663837585e-06, - "loss": 0.2101, - "step": 1234800 - }, - { - "epoch": 12.58, - "learning_rate": 4.054200875695157e-06, - "loss": 0.1836, - "step": 1234900 - }, - { - "epoch": 12.58, - "learning_rate": 4.05087135614747e-06, - "loss": 0.22, - "step": 1235000 - }, - { - "epoch": 12.58, - "learning_rate": 4.047543105357324e-06, - "loss": 0.2208, - "step": 1235100 - }, - { - "epoch": 12.58, - "learning_rate": 4.044216123487437e-06, - "loss": 0.2185, - "step": 1235200 - }, - { - "epoch": 12.59, - "learning_rate": 4.040890410700486e-06, - "loss": 0.22, - "step": 1235300 - }, - { - "epoch": 12.59, - "learning_rate": 4.037565967159084e-06, - "loss": 0.2587, - "step": 1235400 - }, - { - "epoch": 12.59, - "learning_rate": 4.034242793025763e-06, - "loss": 0.1732, - "step": 1235500 - }, - { - "epoch": 12.59, - "learning_rate": 4.030920888463015e-06, - "loss": 0.1684, - "step": 1235600 - }, - { - "epoch": 12.59, - "learning_rate": 4.0276002536332615e-06, - "loss": 0.1287, - "step": 1235700 - }, - { - "epoch": 12.59, - "learning_rate": 4.024280888698867e-06, - "loss": 0.2497, - "step": 1235800 - }, - { - "epoch": 12.59, - "learning_rate": 4.020962793822112e-06, - "loss": 0.2051, - "step": 1235900 - }, - { - "epoch": 12.59, - "learning_rate": 4.017679131123695e-06, - "loss": 0.2319, - "step": 1236000 - }, - { - "epoch": 12.59, - "learning_rate": 4.014363564144261e-06, - "loss": 0.2176, - "step": 1236100 - }, - { - "epoch": 12.59, - "learning_rate": 4.011049267707375e-06, - "loss": 0.2349, - "step": 1236200 - }, - { - "epoch": 12.6, - "learning_rate": 4.007736241975088e-06, - "loss": 0.2112, - "step": 1236300 - }, - { - "epoch": 12.6, - "learning_rate": 4.004424487109381e-06, - "loss": 0.2135, - "step": 1236400 - }, - { - "epoch": 12.6, - "learning_rate": 4.0011140032721845e-06, - "loss": 0.2174, - "step": 1236500 - }, - { - "epoch": 12.6, - "learning_rate": 3.997804790625368e-06, - "loss": 0.2524, - "step": 1236600 - }, - { - "epoch": 12.6, - "learning_rate": 3.994496849330714e-06, - "loss": 0.2514, - "step": 1236700 - }, - { - "epoch": 12.6, - "learning_rate": 3.991190179549975e-06, - "loss": 0.2244, - "step": 1236800 - }, - { - "epoch": 12.6, - "learning_rate": 3.9878847814448305e-06, - "loss": 0.1802, - "step": 1236900 - }, - { - "epoch": 12.6, - "learning_rate": 3.984580655176878e-06, - "loss": 0.1939, - "step": 1237000 - }, - { - "epoch": 12.6, - "learning_rate": 3.9812778009076825e-06, - "loss": 0.2233, - "step": 1237100 - }, - { - "epoch": 12.6, - "learning_rate": 3.977976218798737e-06, - "loss": 0.2191, - "step": 1237200 - }, - { - "epoch": 12.61, - "learning_rate": 3.974675909011456e-06, - "loss": 0.2554, - "step": 1237300 - }, - { - "epoch": 12.61, - "learning_rate": 3.971376871707211e-06, - "loss": 0.2518, - "step": 1237400 - }, - { - "epoch": 12.61, - "learning_rate": 3.968079107047309e-06, - "loss": 0.2438, - "step": 1237500 - }, - { - "epoch": 12.61, - "learning_rate": 3.9647826151929845e-06, - "loss": 0.2781, - "step": 1237600 - }, - { - "epoch": 12.61, - "learning_rate": 3.961487396305422e-06, - "loss": 0.2082, - "step": 1237700 - }, - { - "epoch": 12.61, - "learning_rate": 3.958193450545743e-06, - "loss": 0.2189, - "step": 1237800 - }, - { - "epoch": 12.61, - "learning_rate": 3.954900778074991e-06, - "loss": 0.2311, - "step": 1237900 - }, - { - "epoch": 12.61, - "learning_rate": 3.951609379054158e-06, - "loss": 0.2088, - "step": 1238000 - }, - { - "epoch": 12.61, - "learning_rate": 3.948319253644179e-06, - "loss": 0.2479, - "step": 1238100 - }, - { - "epoch": 12.61, - "learning_rate": 3.945030402005928e-06, - "loss": 0.2011, - "step": 1238200 - }, - { - "epoch": 12.62, - "learning_rate": 3.941742824300193e-06, - "loss": 0.2212, - "step": 1238300 - }, - { - "epoch": 12.62, - "learning_rate": 3.938456520687728e-06, - "loss": 0.2395, - "step": 1238400 - }, - { - "epoch": 12.62, - "learning_rate": 3.935204335314715e-06, - "loss": 0.2501, - "step": 1238500 - }, - { - "epoch": 12.62, - "learning_rate": 3.931920567625825e-06, - "loss": 0.2087, - "step": 1238600 - }, - { - "epoch": 12.62, - "learning_rate": 3.928638074510456e-06, - "loss": 0.2407, - "step": 1238700 - }, - { - "epoch": 12.62, - "learning_rate": 3.925356856129098e-06, - "loss": 0.2344, - "step": 1238800 - }, - { - "epoch": 12.62, - "learning_rate": 3.922076912642183e-06, - "loss": 0.2266, - "step": 1238900 - }, - { - "epoch": 12.62, - "learning_rate": 3.918798244210087e-06, - "loss": 0.1954, - "step": 1239000 - }, - { - "epoch": 12.62, - "learning_rate": 3.915520850993107e-06, - "loss": 0.2628, - "step": 1239100 - }, - { - "epoch": 12.63, - "learning_rate": 3.912244733151497e-06, - "loss": 0.2627, - "step": 1239200 - }, - { - "epoch": 12.63, - "learning_rate": 3.90896989084544e-06, - "loss": 0.2336, - "step": 1239300 - }, - { - "epoch": 12.63, - "learning_rate": 3.905696324235045e-06, - "loss": 0.1922, - "step": 1239400 - }, - { - "epoch": 12.63, - "learning_rate": 3.902424033480378e-06, - "loss": 0.2303, - "step": 1239500 - }, - { - "epoch": 12.63, - "learning_rate": 3.899153018741438e-06, - "loss": 0.2029, - "step": 1239600 - }, - { - "epoch": 12.63, - "learning_rate": 3.895883280178149e-06, - "loss": 0.267, - "step": 1239700 - }, - { - "epoch": 12.63, - "learning_rate": 3.892614817950385e-06, - "loss": 0.207, - "step": 1239800 - }, - { - "epoch": 12.63, - "learning_rate": 3.88934763221796e-06, - "loss": 0.2342, - "step": 1239900 - }, - { - "epoch": 12.63, - "learning_rate": 3.88608172314061e-06, - "loss": 0.2576, - "step": 1240000 - }, - { - "epoch": 12.63, - "learning_rate": 3.882817090878027e-06, - "loss": 0.1877, - "step": 1240100 - }, - { - "epoch": 12.64, - "learning_rate": 3.879553735589837e-06, - "loss": 0.2317, - "step": 1240200 - }, - { - "epoch": 12.64, - "learning_rate": 3.876291657435579e-06, - "loss": 0.2257, - "step": 1240300 - }, - { - "epoch": 12.64, - "learning_rate": 3.873030856574765e-06, - "loss": 0.2293, - "step": 1240400 - }, - { - "epoch": 12.64, - "learning_rate": 3.869771333166825e-06, - "loss": 0.2207, - "step": 1240500 - }, - { - "epoch": 12.64, - "learning_rate": 3.866513087371134e-06, - "loss": 0.209, - "step": 1240600 - }, - { - "epoch": 12.64, - "learning_rate": 3.863256119346992e-06, - "loss": 0.2303, - "step": 1240700 - }, - { - "epoch": 12.64, - "learning_rate": 3.8600004292536475e-06, - "loss": 0.2329, - "step": 1240800 - }, - { - "epoch": 12.64, - "learning_rate": 3.856746017250293e-06, - "loss": 0.2276, - "step": 1240900 - }, - { - "epoch": 12.64, - "learning_rate": 3.85349288349604e-06, - "loss": 0.1865, - "step": 1241000 - }, - { - "epoch": 12.64, - "learning_rate": 3.850241028149948e-06, - "loss": 0.2043, - "step": 1241100 - }, - { - "epoch": 12.65, - "learning_rate": 3.846990451371018e-06, - "loss": 0.1961, - "step": 1241200 - }, - { - "epoch": 12.65, - "learning_rate": 3.843741153318181e-06, - "loss": 0.1672, - "step": 1241300 - }, - { - "epoch": 12.65, - "learning_rate": 3.840493134150308e-06, - "loss": 0.2048, - "step": 1241400 - }, - { - "epoch": 12.65, - "learning_rate": 3.837246394026216e-06, - "loss": 0.2019, - "step": 1241500 - }, - { - "epoch": 12.65, - "learning_rate": 3.834000933104641e-06, - "loss": 0.2755, - "step": 1241600 - }, - { - "epoch": 12.65, - "learning_rate": 3.830756751544266e-06, - "loss": 0.2129, - "step": 1241700 - }, - { - "epoch": 12.65, - "learning_rate": 3.827513849503725e-06, - "loss": 0.203, - "step": 1241800 - }, - { - "epoch": 12.65, - "learning_rate": 3.824272227141558e-06, - "loss": 0.1997, - "step": 1241900 - }, - { - "epoch": 12.65, - "learning_rate": 3.821031884616273e-06, - "loss": 0.2363, - "step": 1242000 - }, - { - "epoch": 12.65, - "learning_rate": 3.8178252063751054e-06, - "loss": 0.2274, - "step": 1242100 - }, - { - "epoch": 12.66, - "learning_rate": 3.8145874111964973e-06, - "loss": 0.2227, - "step": 1242200 - }, - { - "epoch": 12.66, - "learning_rate": 3.811350896328304e-06, - "loss": 0.2277, - "step": 1242300 - }, - { - "epoch": 12.66, - "learning_rate": 3.8081156619287585e-06, - "loss": 0.2135, - "step": 1242400 - }, - { - "epoch": 12.66, - "learning_rate": 3.804881708156054e-06, - "loss": 0.222, - "step": 1242500 - }, - { - "epoch": 12.66, - "learning_rate": 3.8016490351683074e-06, - "loss": 0.2122, - "step": 1242600 - }, - { - "epoch": 12.66, - "learning_rate": 3.7984176431235796e-06, - "loss": 0.2455, - "step": 1242700 - }, - { - "epoch": 12.66, - "learning_rate": 3.795187532179867e-06, - "loss": 0.196, - "step": 1242800 - }, - { - "epoch": 12.66, - "learning_rate": 3.791958702495103e-06, - "loss": 0.2124, - "step": 1242900 - }, - { - "epoch": 12.66, - "learning_rate": 3.788731154227162e-06, - "loss": 0.1895, - "step": 1243000 - }, - { - "epoch": 12.66, - "learning_rate": 3.7855048875338405e-06, - "loss": 0.263, - "step": 1243100 - }, - { - "epoch": 12.67, - "learning_rate": 3.7822799025728894e-06, - "loss": 0.2398, - "step": 1243200 - }, - { - "epoch": 12.67, - "learning_rate": 3.779056199501999e-06, - "loss": 0.1734, - "step": 1243300 - }, - { - "epoch": 12.67, - "learning_rate": 3.7758337784787767e-06, - "loss": 0.2823, - "step": 1243400 - }, - { - "epoch": 12.67, - "learning_rate": 3.7726126396607864e-06, - "loss": 0.2396, - "step": 1243500 - }, - { - "epoch": 12.67, - "learning_rate": 3.7693927832055254e-06, - "loss": 0.1879, - "step": 1243600 - }, - { - "epoch": 12.67, - "learning_rate": 3.7661742092704144e-06, - "loss": 0.247, - "step": 1243700 - }, - { - "epoch": 12.67, - "learning_rate": 3.7629569180128275e-06, - "loss": 0.2212, - "step": 1243800 - }, - { - "epoch": 12.67, - "learning_rate": 3.759740909590076e-06, - "loss": 0.2027, - "step": 1243900 - }, - { - "epoch": 12.67, - "learning_rate": 3.7565261841594002e-06, - "loss": 0.2104, - "step": 1244000 - }, - { - "epoch": 12.68, - "learning_rate": 3.753312741877981e-06, - "loss": 0.2023, - "step": 1244100 - }, - { - "epoch": 12.68, - "learning_rate": 3.75010058290294e-06, - "loss": 0.2484, - "step": 1244200 - }, - { - "epoch": 12.68, - "learning_rate": 3.7469218097927816e-06, - "loss": 0.1999, - "step": 1244300 - }, - { - "epoch": 12.68, - "learning_rate": 3.7437122050646146e-06, - "loss": 0.1633, - "step": 1244400 - }, - { - "epoch": 12.68, - "learning_rate": 3.7405038841122265e-06, - "loss": 0.2387, - "step": 1244500 - }, - { - "epoch": 12.68, - "learning_rate": 3.7372968470924883e-06, - "loss": 0.273, - "step": 1244600 - }, - { - "epoch": 12.68, - "learning_rate": 3.734091094162212e-06, - "loss": 0.2182, - "step": 1244700 - }, - { - "epoch": 12.68, - "learning_rate": 3.7308866254781317e-06, - "loss": 0.1834, - "step": 1244800 - }, - { - "epoch": 12.68, - "learning_rate": 3.7276834411969253e-06, - "loss": 0.2366, - "step": 1244900 - }, - { - "epoch": 12.68, - "learning_rate": 3.7244815414752176e-06, - "loss": 0.2192, - "step": 1245000 - }, - { - "epoch": 12.69, - "learning_rate": 3.721280926469557e-06, - "loss": 0.195, - "step": 1245100 - }, - { - "epoch": 12.69, - "learning_rate": 3.7180815963364345e-06, - "loss": 0.2124, - "step": 1245200 - }, - { - "epoch": 12.69, - "learning_rate": 3.714883551232282e-06, - "loss": 0.2394, - "step": 1245300 - }, - { - "epoch": 12.69, - "learning_rate": 3.711686791313468e-06, - "loss": 0.2343, - "step": 1245400 - }, - { - "epoch": 12.69, - "learning_rate": 3.7084913167362834e-06, - "loss": 0.2312, - "step": 1245500 - }, - { - "epoch": 12.69, - "learning_rate": 3.7052971276569738e-06, - "loss": 0.202, - "step": 1245600 - }, - { - "epoch": 12.69, - "learning_rate": 3.7021042242317204e-06, - "loss": 0.218, - "step": 1245700 - }, - { - "epoch": 12.69, - "learning_rate": 3.698912606616629e-06, - "loss": 0.2002, - "step": 1245800 - }, - { - "epoch": 12.69, - "learning_rate": 3.6957541719181987e-06, - "loss": 0.238, - "step": 1245900 - }, - { - "epoch": 12.69, - "learning_rate": 3.6925651135295345e-06, - "loss": 0.2071, - "step": 1246000 - }, - { - "epoch": 12.7, - "learning_rate": 3.6893773414174425e-06, - "loss": 0.1863, - "step": 1246100 - }, - { - "epoch": 12.7, - "learning_rate": 3.686190855737791e-06, - "loss": 0.1949, - "step": 1246200 - }, - { - "epoch": 12.7, - "learning_rate": 3.6830056566463657e-06, - "loss": 0.2641, - "step": 1246300 - }, - { - "epoch": 12.7, - "learning_rate": 3.6798217442989145e-06, - "loss": 0.2636, - "step": 1246400 - }, - { - "epoch": 12.7, - "learning_rate": 3.676639118851107e-06, - "loss": 0.2425, - "step": 1246500 - }, - { - "epoch": 12.7, - "learning_rate": 3.673457780458559e-06, - "loss": 0.2632, - "step": 1246600 - }, - { - "epoch": 12.7, - "learning_rate": 3.6702777292768153e-06, - "loss": 0.2242, - "step": 1246700 - }, - { - "epoch": 12.7, - "learning_rate": 3.6670989654613686e-06, - "loss": 0.2938, - "step": 1246800 - }, - { - "epoch": 12.7, - "learning_rate": 3.6639214891676343e-06, - "loss": 0.2096, - "step": 1246900 - }, - { - "epoch": 12.7, - "learning_rate": 3.660745300550968e-06, - "loss": 0.2146, - "step": 1247000 - }, - { - "epoch": 12.71, - "learning_rate": 3.657570399766683e-06, - "loss": 0.2656, - "step": 1247100 - }, - { - "epoch": 12.71, - "learning_rate": 3.6543967869699947e-06, - "loss": 0.2005, - "step": 1247200 - }, - { - "epoch": 12.71, - "learning_rate": 3.6512244623160783e-06, - "loss": 0.2437, - "step": 1247300 - }, - { - "epoch": 12.71, - "learning_rate": 3.648053425960054e-06, - "loss": 0.1781, - "step": 1247400 - }, - { - "epoch": 12.71, - "learning_rate": 3.644883678056947e-06, - "loss": 0.2044, - "step": 1247500 - }, - { - "epoch": 12.71, - "learning_rate": 3.6417152187617542e-06, - "loss": 0.3021, - "step": 1247600 - }, - { - "epoch": 12.71, - "learning_rate": 3.6385480482293844e-06, - "loss": 0.1454, - "step": 1247700 - }, - { - "epoch": 12.71, - "learning_rate": 3.6353821666147e-06, - "loss": 0.2472, - "step": 1247800 - }, - { - "epoch": 12.71, - "learning_rate": 3.6322175740724918e-06, - "loss": 0.2748, - "step": 1247900 - }, - { - "epoch": 12.71, - "learning_rate": 3.6290542707574914e-06, - "loss": 0.2055, - "step": 1248000 - }, - { - "epoch": 12.72, - "learning_rate": 3.6258922568243657e-06, - "loss": 0.257, - "step": 1248100 - }, - { - "epoch": 12.72, - "learning_rate": 3.6227315324277145e-06, - "loss": 0.2552, - "step": 1248200 - }, - { - "epoch": 12.72, - "learning_rate": 3.619572097722077e-06, - "loss": 0.2261, - "step": 1248300 - }, - { - "epoch": 12.72, - "learning_rate": 3.61641395286194e-06, - "loss": 0.2173, - "step": 1248400 - }, - { - "epoch": 12.72, - "learning_rate": 3.6132570980017066e-06, - "loss": 0.1773, - "step": 1248500 - }, - { - "epoch": 12.72, - "learning_rate": 3.6101015332957334e-06, - "loss": 0.2502, - "step": 1248600 - }, - { - "epoch": 12.72, - "learning_rate": 3.6069472588983134e-06, - "loss": 0.2122, - "step": 1248700 - }, - { - "epoch": 12.72, - "learning_rate": 3.6037942749636633e-06, - "loss": 0.2397, - "step": 1248800 - }, - { - "epoch": 12.72, - "learning_rate": 3.6006425816459466e-06, - "loss": 0.225, - "step": 1248900 - }, - { - "epoch": 12.73, - "learning_rate": 3.5974921790992667e-06, - "loss": 0.2706, - "step": 1249000 - }, - { - "epoch": 12.73, - "learning_rate": 3.5943430674776534e-06, - "loss": 0.2203, - "step": 1249100 - }, - { - "epoch": 12.73, - "learning_rate": 3.5911952469350875e-06, - "loss": 0.2058, - "step": 1249200 - }, - { - "epoch": 12.73, - "learning_rate": 3.588048717625472e-06, - "loss": 0.2383, - "step": 1249300 - }, - { - "epoch": 12.73, - "learning_rate": 3.5849034797026636e-06, - "loss": 0.2433, - "step": 1249400 - }, - { - "epoch": 12.73, - "learning_rate": 3.5817595333204302e-06, - "loss": 0.1825, - "step": 1249500 - }, - { - "epoch": 12.73, - "learning_rate": 3.5786168786325015e-06, - "loss": 0.2216, - "step": 1249600 - }, - { - "epoch": 12.73, - "learning_rate": 3.575475515792538e-06, - "loss": 0.2414, - "step": 1249700 - }, - { - "epoch": 12.73, - "learning_rate": 3.5723354449541237e-06, - "loss": 0.2186, - "step": 1249800 - }, - { - "epoch": 12.73, - "learning_rate": 3.569196666270792e-06, - "loss": 0.2087, - "step": 1249900 - }, - { - "epoch": 12.74, - "learning_rate": 3.5660591798960206e-06, - "loss": 0.1899, - "step": 1250000 - }, - { - "epoch": 12.74, - "learning_rate": 3.562922985983197e-06, - "loss": 0.207, - "step": 1250100 - }, - { - "epoch": 12.74, - "learning_rate": 3.5597880846856746e-06, - "loss": 0.2218, - "step": 1250200 - }, - { - "epoch": 12.74, - "learning_rate": 3.556654476156724e-06, - "loss": 0.2185, - "step": 1250300 - }, - { - "epoch": 12.74, - "learning_rate": 3.553522160549567e-06, - "loss": 0.2171, - "step": 1250400 - }, - { - "epoch": 12.74, - "learning_rate": 3.5503911380173504e-06, - "loss": 0.2632, - "step": 1250500 - }, - { - "epoch": 12.74, - "learning_rate": 3.5472614087131718e-06, - "loss": 0.2367, - "step": 1250600 - }, - { - "epoch": 12.74, - "learning_rate": 3.544132972790042e-06, - "loss": 0.202, - "step": 1250700 - }, - { - "epoch": 12.74, - "learning_rate": 3.5410058304009285e-06, - "loss": 0.2392, - "step": 1250800 - }, - { - "epoch": 12.74, - "learning_rate": 3.5378799816987295e-06, - "loss": 0.2399, - "step": 1250900 - }, - { - "epoch": 12.75, - "learning_rate": 3.5347554268362925e-06, - "loss": 0.1953, - "step": 1251000 - }, - { - "epoch": 12.75, - "learning_rate": 3.5316321659663688e-06, - "loss": 0.2697, - "step": 1251100 - }, - { - "epoch": 12.75, - "learning_rate": 3.5285101992416793e-06, - "loss": 0.2254, - "step": 1251200 - }, - { - "epoch": 12.75, - "learning_rate": 3.5253895268148716e-06, - "loss": 0.222, - "step": 1251300 - }, - { - "epoch": 12.75, - "learning_rate": 3.5223013362102496e-06, - "loss": 0.2022, - "step": 1251400 - }, - { - "epoch": 12.75, - "learning_rate": 3.51918323989009e-06, - "loss": 0.1915, - "step": 1251500 - }, - { - "epoch": 12.75, - "learning_rate": 3.5160664383238416e-06, - "loss": 0.2044, - "step": 1251600 - }, - { - "epoch": 12.75, - "learning_rate": 3.5129509316638986e-06, - "loss": 0.2395, - "step": 1251700 - }, - { - "epoch": 12.75, - "learning_rate": 3.5098367200625924e-06, - "loss": 0.221, - "step": 1251800 - }, - { - "epoch": 12.75, - "learning_rate": 3.506754926424294e-06, - "loss": 0.1924, - "step": 1251900 - }, - { - "epoch": 12.76, - "learning_rate": 3.5036432924425986e-06, - "loss": 0.1737, - "step": 1252000 - }, - { - "epoch": 12.76, - "learning_rate": 3.5005329539746234e-06, - "loss": 0.2119, - "step": 1252100 - }, - { - "epoch": 12.76, - "learning_rate": 3.4974239111724493e-06, - "loss": 0.2421, - "step": 1252200 - }, - { - "epoch": 12.76, - "learning_rate": 3.4943161641880916e-06, - "loss": 0.2157, - "step": 1252300 - }, - { - "epoch": 12.76, - "learning_rate": 3.491209713173489e-06, - "loss": 0.2019, - "step": 1252400 - }, - { - "epoch": 12.76, - "learning_rate": 3.488104558280536e-06, - "loss": 0.2264, - "step": 1252500 - }, - { - "epoch": 12.76, - "learning_rate": 3.4850006996610615e-06, - "loss": 0.209, - "step": 1252600 - }, - { - "epoch": 12.76, - "learning_rate": 3.4818981374668147e-06, - "loss": 0.2053, - "step": 1252700 - }, - { - "epoch": 12.76, - "learning_rate": 3.4787968718494968e-06, - "loss": 0.2242, - "step": 1252800 - }, - { - "epoch": 12.76, - "learning_rate": 3.47569690296074e-06, - "loss": 0.2207, - "step": 1252900 - }, - { - "epoch": 12.77, - "learning_rate": 3.4725982309521164e-06, - "loss": 0.2286, - "step": 1253000 - }, - { - "epoch": 12.77, - "learning_rate": 3.4695008559751317e-06, - "loss": 0.2125, - "step": 1253100 - }, - { - "epoch": 12.77, - "learning_rate": 3.466404778181238e-06, - "loss": 0.2674, - "step": 1253200 - }, - { - "epoch": 12.77, - "learning_rate": 3.4633099977217975e-06, - "loss": 0.2891, - "step": 1253300 - }, - { - "epoch": 12.77, - "learning_rate": 3.460216514748136e-06, - "loss": 0.2072, - "step": 1253400 - }, - { - "epoch": 12.77, - "learning_rate": 3.457124329411512e-06, - "loss": 0.202, - "step": 1253500 - }, - { - "epoch": 12.77, - "learning_rate": 3.454033441863105e-06, - "loss": 0.248, - "step": 1253600 - }, - { - "epoch": 12.77, - "learning_rate": 3.4509438522540405e-06, - "loss": 0.2102, - "step": 1253700 - }, - { - "epoch": 12.77, - "learning_rate": 3.4478555607353946e-06, - "loss": 0.2514, - "step": 1253800 - }, - { - "epoch": 12.77, - "learning_rate": 3.4447685674581496e-06, - "loss": 0.2228, - "step": 1253900 - }, - { - "epoch": 12.78, - "learning_rate": 3.4416828725732487e-06, - "loss": 0.1785, - "step": 1254000 - }, - { - "epoch": 12.78, - "learning_rate": 3.4385984762315635e-06, - "loss": 0.2306, - "step": 1254100 - }, - { - "epoch": 12.78, - "learning_rate": 3.435515378583901e-06, - "loss": 0.2107, - "step": 1254200 - }, - { - "epoch": 12.78, - "learning_rate": 3.4324335797810104e-06, - "loss": 0.2182, - "step": 1254300 - }, - { - "epoch": 12.78, - "learning_rate": 3.4293530799735673e-06, - "loss": 0.1743, - "step": 1254400 - }, - { - "epoch": 12.78, - "learning_rate": 3.426273879312202e-06, - "loss": 0.2042, - "step": 1254500 - }, - { - "epoch": 12.78, - "learning_rate": 3.423195977947453e-06, - "loss": 0.2586, - "step": 1254600 - }, - { - "epoch": 12.78, - "learning_rate": 3.420119376029821e-06, - "loss": 0.1865, - "step": 1254700 - }, - { - "epoch": 12.78, - "learning_rate": 3.4170440737097354e-06, - "loss": 0.2212, - "step": 1254800 - }, - { - "epoch": 12.79, - "learning_rate": 3.4139700711375486e-06, - "loss": 0.22, - "step": 1254900 - }, - { - "epoch": 12.79, - "learning_rate": 3.4108973684635714e-06, - "loss": 0.1886, - "step": 1255000 - }, - { - "epoch": 12.79, - "learning_rate": 3.4078259658380394e-06, - "loss": 0.2217, - "step": 1255100 - }, - { - "epoch": 12.79, - "learning_rate": 3.404755863411123e-06, - "loss": 0.2499, - "step": 1255200 - }, - { - "epoch": 12.79, - "learning_rate": 3.4016870613329288e-06, - "loss": 0.3172, - "step": 1255300 - }, - { - "epoch": 12.79, - "learning_rate": 3.3986195597535064e-06, - "loss": 0.261, - "step": 1255400 - }, - { - "epoch": 12.79, - "learning_rate": 3.3955533588228393e-06, - "loss": 0.2736, - "step": 1255500 - }, - { - "epoch": 12.79, - "learning_rate": 3.3924884586908477e-06, - "loss": 0.199, - "step": 1255600 - }, - { - "epoch": 12.79, - "learning_rate": 3.389424859507385e-06, - "loss": 0.2082, - "step": 1255700 - }, - { - "epoch": 12.79, - "learning_rate": 3.3863625614222484e-06, - "loss": 0.2232, - "step": 1255800 - }, - { - "epoch": 12.8, - "learning_rate": 3.383301564585154e-06, - "loss": 0.2304, - "step": 1255900 - }, - { - "epoch": 12.8, - "learning_rate": 3.3802418691457727e-06, - "loss": 0.2666, - "step": 1256000 - }, - { - "epoch": 12.8, - "learning_rate": 3.3771834752537147e-06, - "loss": 0.1965, - "step": 1256100 - }, - { - "epoch": 12.8, - "learning_rate": 3.3741263830585e-06, - "loss": 0.2617, - "step": 1256200 - }, - { - "epoch": 12.8, - "learning_rate": 3.3710705927096097e-06, - "loss": 0.1648, - "step": 1256300 - }, - { - "epoch": 12.8, - "learning_rate": 3.3680161043564604e-06, - "loss": 0.2394, - "step": 1256400 - }, - { - "epoch": 12.8, - "learning_rate": 3.3649629181483865e-06, - "loss": 0.2223, - "step": 1256500 - }, - { - "epoch": 12.8, - "learning_rate": 3.3619110342346747e-06, - "loss": 0.263, - "step": 1256600 - }, - { - "epoch": 12.8, - "learning_rate": 3.3588604527645496e-06, - "loss": 0.2334, - "step": 1256700 - }, - { - "epoch": 12.8, - "learning_rate": 3.3558111738871576e-06, - "loss": 0.2264, - "step": 1256800 - }, - { - "epoch": 12.81, - "learning_rate": 3.352763197751597e-06, - "loss": 0.2336, - "step": 1256900 - }, - { - "epoch": 12.81, - "learning_rate": 3.349716524506902e-06, - "loss": 0.2277, - "step": 1257000 - }, - { - "epoch": 12.81, - "learning_rate": 3.346671154302019e-06, - "loss": 0.2633, - "step": 1257100 - }, - { - "epoch": 12.81, - "learning_rate": 3.34362708728586e-06, - "loss": 0.2208, - "step": 1257200 - }, - { - "epoch": 12.81, - "learning_rate": 3.3406147447920367e-06, - "loss": 0.2203, - "step": 1257300 - }, - { - "epoch": 12.81, - "learning_rate": 3.337573271564168e-06, - "loss": 0.2002, - "step": 1257400 - }, - { - "epoch": 12.81, - "learning_rate": 3.334533101969858e-06, - "loss": 0.2369, - "step": 1257500 - }, - { - "epoch": 12.81, - "learning_rate": 3.3314942361577426e-06, - "loss": 0.2314, - "step": 1257600 - }, - { - "epoch": 12.81, - "learning_rate": 3.328456674276411e-06, - "loss": 0.2336, - "step": 1257700 - }, - { - "epoch": 12.81, - "learning_rate": 3.325420416474384e-06, - "loss": 0.2092, - "step": 1257800 - }, - { - "epoch": 12.82, - "learning_rate": 3.3223854629001095e-06, - "loss": 0.1809, - "step": 1257900 - }, - { - "epoch": 12.82, - "learning_rate": 3.319351813701982e-06, - "loss": 0.2206, - "step": 1258000 - }, - { - "epoch": 12.82, - "learning_rate": 3.316319469028323e-06, - "loss": 0.1884, - "step": 1258100 - }, - { - "epoch": 12.82, - "learning_rate": 3.31328842902741e-06, - "loss": 0.2154, - "step": 1258200 - }, - { - "epoch": 12.82, - "learning_rate": 3.310258693847432e-06, - "loss": 0.2492, - "step": 1258300 - }, - { - "epoch": 12.82, - "learning_rate": 3.3072302636365305e-06, - "loss": 0.1985, - "step": 1258400 - }, - { - "epoch": 12.82, - "learning_rate": 3.3042031385427764e-06, - "loss": 0.2178, - "step": 1258500 - }, - { - "epoch": 12.82, - "learning_rate": 3.301177318714175e-06, - "loss": 0.1553, - "step": 1258600 - }, - { - "epoch": 12.82, - "learning_rate": 3.2981528042986676e-06, - "loss": 0.1983, - "step": 1258700 - }, - { - "epoch": 12.82, - "learning_rate": 3.2951295954441497e-06, - "loss": 0.224, - "step": 1258800 - }, - { - "epoch": 12.83, - "learning_rate": 3.2921076922984196e-06, - "loss": 0.2727, - "step": 1258900 - }, - { - "epoch": 12.83, - "learning_rate": 3.2890870950092423e-06, - "loss": 0.253, - "step": 1259000 - }, - { - "epoch": 12.83, - "learning_rate": 3.2860678037243063e-06, - "loss": 0.2512, - "step": 1259100 - }, - { - "epoch": 12.83, - "learning_rate": 3.2830498185912305e-06, - "loss": 0.2603, - "step": 1259200 - }, - { - "epoch": 12.83, - "learning_rate": 3.28003313975758e-06, - "loss": 0.2506, - "step": 1259300 - }, - { - "epoch": 12.83, - "learning_rate": 3.27701776737085e-06, - "loss": 0.2656, - "step": 1259400 - }, - { - "epoch": 12.83, - "learning_rate": 3.2740037015784796e-06, - "loss": 0.2312, - "step": 1259500 - }, - { - "epoch": 12.83, - "learning_rate": 3.2709909425278337e-06, - "loss": 0.2062, - "step": 1259600 - }, - { - "epoch": 12.83, - "learning_rate": 3.2679794903662285e-06, - "loss": 0.2342, - "step": 1259700 - }, - { - "epoch": 12.84, - "learning_rate": 3.264969345240889e-06, - "loss": 0.2073, - "step": 1259800 - }, - { - "epoch": 12.84, - "learning_rate": 3.2619605072990046e-06, - "loss": 0.183, - "step": 1259900 - }, - { - "epoch": 12.84, - "learning_rate": 3.2589529766876867e-06, - "loss": 0.2461, - "step": 1260000 - }, - { - "epoch": 12.84, - "learning_rate": 3.2559467535539923e-06, - "loss": 0.203, - "step": 1260100 - }, - { - "epoch": 12.84, - "learning_rate": 3.252941838044893e-06, - "loss": 0.2304, - "step": 1260200 - }, - { - "epoch": 12.84, - "learning_rate": 3.249938230307321e-06, - "loss": 0.1951, - "step": 1260300 - }, - { - "epoch": 12.84, - "learning_rate": 3.246935930488143e-06, - "loss": 0.2134, - "step": 1260400 - }, - { - "epoch": 12.84, - "learning_rate": 3.2439349387341343e-06, - "loss": 0.2199, - "step": 1260500 - }, - { - "epoch": 12.84, - "learning_rate": 3.2409352551920345e-06, - "loss": 0.251, - "step": 1260600 - }, - { - "epoch": 12.84, - "learning_rate": 3.237936880008513e-06, - "loss": 0.2212, - "step": 1260700 - }, - { - "epoch": 12.85, - "learning_rate": 3.2349697775193687e-06, - "loss": 0.253, - "step": 1260800 - }, - { - "epoch": 12.85, - "learning_rate": 3.231974006405507e-06, - "loss": 0.1864, - "step": 1260900 - }, - { - "epoch": 12.85, - "learning_rate": 3.228979544088366e-06, - "loss": 0.227, - "step": 1261000 - }, - { - "epoch": 12.85, - "learning_rate": 3.225986390714366e-06, - "loss": 0.2086, - "step": 1261100 - }, - { - "epoch": 12.85, - "learning_rate": 3.2229945464298593e-06, - "loss": 0.1741, - "step": 1261200 - }, - { - "epoch": 12.85, - "learning_rate": 3.2200040113811156e-06, - "loss": 0.2172, - "step": 1261300 - }, - { - "epoch": 12.85, - "learning_rate": 3.2170147857143584e-06, - "loss": 0.2087, - "step": 1261400 - }, - { - "epoch": 12.85, - "learning_rate": 3.2140268695757537e-06, - "loss": 0.2071, - "step": 1261500 - }, - { - "epoch": 12.85, - "learning_rate": 3.211040263111381e-06, - "loss": 0.2179, - "step": 1261600 - }, - { - "epoch": 12.85, - "learning_rate": 3.2080549664672676e-06, - "loss": 0.2415, - "step": 1261700 - }, - { - "epoch": 12.86, - "learning_rate": 3.205070979789383e-06, - "loss": 0.2932, - "step": 1261800 - }, - { - "epoch": 12.86, - "learning_rate": 3.2020883032236237e-06, - "loss": 0.1732, - "step": 1261900 - }, - { - "epoch": 12.86, - "learning_rate": 3.199106936915823e-06, - "loss": 0.1786, - "step": 1262000 - }, - { - "epoch": 12.86, - "learning_rate": 3.1961268810117583e-06, - "loss": 0.2145, - "step": 1262100 - }, - { - "epoch": 12.86, - "learning_rate": 3.193148135657129e-06, - "loss": 0.2401, - "step": 1262200 - }, - { - "epoch": 12.86, - "learning_rate": 3.190170700997582e-06, - "loss": 0.233, - "step": 1262300 - }, - { - "epoch": 12.86, - "learning_rate": 3.1871945771786916e-06, - "loss": 0.1758, - "step": 1262400 - }, - { - "epoch": 12.86, - "learning_rate": 3.184219764345984e-06, - "loss": 0.1704, - "step": 1262500 - }, - { - "epoch": 12.86, - "learning_rate": 3.1812462626448958e-06, - "loss": 0.1918, - "step": 1262600 - }, - { - "epoch": 12.86, - "learning_rate": 3.178274072220818e-06, - "loss": 0.1828, - "step": 1262700 - }, - { - "epoch": 12.87, - "learning_rate": 3.175303193219081e-06, - "loss": 0.1996, - "step": 1262800 - }, - { - "epoch": 12.87, - "learning_rate": 3.172333625784929e-06, - "loss": 0.1982, - "step": 1262900 - }, - { - "epoch": 12.87, - "learning_rate": 3.169365370063565e-06, - "loss": 0.2785, - "step": 1263000 - }, - { - "epoch": 12.87, - "learning_rate": 3.16639842620012e-06, - "loss": 0.1806, - "step": 1263100 - }, - { - "epoch": 12.87, - "learning_rate": 3.1634327943396545e-06, - "loss": 0.2447, - "step": 1263200 - }, - { - "epoch": 12.87, - "learning_rate": 3.1604684746271727e-06, - "loss": 0.2121, - "step": 1263300 - }, - { - "epoch": 12.87, - "learning_rate": 3.1575054672076187e-06, - "loss": 0.228, - "step": 1263400 - }, - { - "epoch": 12.87, - "learning_rate": 3.1545437722258563e-06, - "loss": 0.1909, - "step": 1263500 - }, - { - "epoch": 12.87, - "learning_rate": 3.1515833898266968e-06, - "loss": 0.2571, - "step": 1263600 - }, - { - "epoch": 12.87, - "learning_rate": 3.1486243201548905e-06, - "loss": 0.2578, - "step": 1263700 - }, - { - "epoch": 12.88, - "learning_rate": 3.145666563355112e-06, - "loss": 0.1722, - "step": 1263800 - }, - { - "epoch": 12.88, - "learning_rate": 3.1427101195719755e-06, - "loss": 0.2266, - "step": 1263900 - }, - { - "epoch": 12.88, - "learning_rate": 3.139754988950042e-06, - "loss": 0.2073, - "step": 1264000 - }, - { - "epoch": 12.88, - "learning_rate": 3.1368011716337995e-06, - "loss": 0.1845, - "step": 1264100 - }, - { - "epoch": 12.88, - "learning_rate": 3.133848667767666e-06, - "loss": 0.2214, - "step": 1264200 - }, - { - "epoch": 12.88, - "learning_rate": 3.1308974774960018e-06, - "loss": 0.2291, - "step": 1264300 - }, - { - "epoch": 12.88, - "learning_rate": 3.1279476009631023e-06, - "loss": 0.2559, - "step": 1264400 - }, - { - "epoch": 12.88, - "learning_rate": 3.1249990383132055e-06, - "loss": 0.2103, - "step": 1264500 - }, - { - "epoch": 12.88, - "learning_rate": 3.122051789690472e-06, - "loss": 0.2682, - "step": 1264600 - }, - { - "epoch": 12.88, - "learning_rate": 3.1191058552390108e-06, - "loss": 0.1933, - "step": 1264700 - }, - { - "epoch": 12.89, - "learning_rate": 3.1161612351028523e-06, - "loss": 0.2581, - "step": 1264800 - }, - { - "epoch": 12.89, - "learning_rate": 3.1132179294259757e-06, - "loss": 0.2216, - "step": 1264900 - }, - { - "epoch": 12.89, - "learning_rate": 3.110275938352295e-06, - "loss": 0.1763, - "step": 1265000 - }, - { - "epoch": 12.89, - "learning_rate": 3.1073352620256457e-06, - "loss": 0.2431, - "step": 1265100 - }, - { - "epoch": 12.89, - "learning_rate": 3.104395900589816e-06, - "loss": 0.1939, - "step": 1265200 - }, - { - "epoch": 12.89, - "learning_rate": 3.1014578541885276e-06, - "loss": 0.2112, - "step": 1265300 - }, - { - "epoch": 12.89, - "learning_rate": 3.098521122965422e-06, - "loss": 0.2292, - "step": 1265400 - }, - { - "epoch": 12.89, - "learning_rate": 3.0955857070640945e-06, - "loss": 0.2514, - "step": 1265500 - }, - { - "epoch": 12.89, - "learning_rate": 3.0926516066280707e-06, - "loss": 0.1826, - "step": 1265600 - }, - { - "epoch": 12.9, - "learning_rate": 3.089748143136345e-06, - "loss": 0.1719, - "step": 1265700 - }, - { - "epoch": 12.9, - "learning_rate": 3.0868166609030092e-06, - "loss": 0.2578, - "step": 1265800 - }, - { - "epoch": 12.9, - "learning_rate": 3.083886494563727e-06, - "loss": 0.252, - "step": 1265900 - }, - { - "epoch": 12.9, - "learning_rate": 3.080957644261767e-06, - "loss": 0.194, - "step": 1266000 - }, - { - "epoch": 12.9, - "learning_rate": 3.0780301101403417e-06, - "loss": 0.198, - "step": 1266100 - }, - { - "epoch": 12.9, - "learning_rate": 3.0751038923425733e-06, - "loss": 0.2492, - "step": 1266200 - }, - { - "epoch": 12.9, - "learning_rate": 3.0721789910115473e-06, - "loss": 0.2562, - "step": 1266300 - }, - { - "epoch": 12.9, - "learning_rate": 3.069255406290272e-06, - "loss": 0.2293, - "step": 1266400 - }, - { - "epoch": 12.9, - "learning_rate": 3.066333138321701e-06, - "loss": 0.2576, - "step": 1266500 - }, - { - "epoch": 12.9, - "learning_rate": 3.0634121872486986e-06, - "loss": 0.1817, - "step": 1266600 - }, - { - "epoch": 12.91, - "learning_rate": 3.0604925532140915e-06, - "loss": 0.2688, - "step": 1266700 - }, - { - "epoch": 12.91, - "learning_rate": 3.0575742363606386e-06, - "loss": 0.2648, - "step": 1266800 - }, - { - "epoch": 12.91, - "learning_rate": 3.0546572368310156e-06, - "loss": 0.2493, - "step": 1266900 - }, - { - "epoch": 12.91, - "learning_rate": 3.0517415547678516e-06, - "loss": 0.2026, - "step": 1267000 - }, - { - "epoch": 12.91, - "learning_rate": 3.048827190313707e-06, - "loss": 0.2391, - "step": 1267100 - }, - { - "epoch": 12.91, - "learning_rate": 3.045914143611076e-06, - "loss": 0.2663, - "step": 1267200 - }, - { - "epoch": 12.91, - "learning_rate": 3.0430024148023926e-06, - "loss": 0.2347, - "step": 1267300 - }, - { - "epoch": 12.91, - "learning_rate": 3.040092004030023e-06, - "loss": 0.1936, - "step": 1267400 - }, - { - "epoch": 12.91, - "learning_rate": 3.0371829114362625e-06, - "loss": 0.2287, - "step": 1267500 - }, - { - "epoch": 12.91, - "learning_rate": 3.0342751371633514e-06, - "loss": 0.1961, - "step": 1267600 - }, - { - "epoch": 12.92, - "learning_rate": 3.0313686813534657e-06, - "loss": 0.2205, - "step": 1267700 - }, - { - "epoch": 12.92, - "learning_rate": 3.0284635441487087e-06, - "loss": 0.2306, - "step": 1267800 - }, - { - "epoch": 12.92, - "learning_rate": 3.025559725691126e-06, - "loss": 0.1799, - "step": 1267900 - }, - { - "epoch": 12.92, - "learning_rate": 3.022657226122705e-06, - "loss": 0.2064, - "step": 1268000 - }, - { - "epoch": 12.92, - "learning_rate": 3.0197560455853446e-06, - "loss": 0.2758, - "step": 1268100 - }, - { - "epoch": 12.92, - "learning_rate": 3.016885176304176e-06, - "loss": 0.2432, - "step": 1268200 - }, - { - "epoch": 12.92, - "learning_rate": 3.013986621060595e-06, - "loss": 0.2339, - "step": 1268300 - }, - { - "epoch": 12.92, - "learning_rate": 3.011118351098143e-06, - "loss": 0.1654, - "step": 1268400 - }, - { - "epoch": 12.92, - "learning_rate": 3.008222421709569e-06, - "loss": 0.2344, - "step": 1268500 - }, - { - "epoch": 12.92, - "learning_rate": 3.005327812057841e-06, - "loss": 0.2058, - "step": 1268600 - }, - { - "epoch": 12.93, - "learning_rate": 3.0024345222844963e-06, - "loss": 0.2158, - "step": 1268700 - }, - { - "epoch": 12.93, - "learning_rate": 2.9995425525309817e-06, - "loss": 0.2356, - "step": 1268800 - }, - { - "epoch": 12.93, - "learning_rate": 2.996651902938704e-06, - "loss": 0.205, - "step": 1268900 - }, - { - "epoch": 12.93, - "learning_rate": 2.9937625736490103e-06, - "loss": 0.2547, - "step": 1269000 - }, - { - "epoch": 12.93, - "learning_rate": 2.990874564803154e-06, - "loss": 0.2163, - "step": 1269100 - }, - { - "epoch": 12.93, - "learning_rate": 2.9879878765423498e-06, - "loss": 0.2134, - "step": 1269200 - }, - { - "epoch": 12.93, - "learning_rate": 2.9851025090077467e-06, - "loss": 0.2117, - "step": 1269300 - }, - { - "epoch": 12.93, - "learning_rate": 2.982218462340406e-06, - "loss": 0.2156, - "step": 1269400 - }, - { - "epoch": 12.93, - "learning_rate": 2.9793357366813512e-06, - "loss": 0.2025, - "step": 1269500 - }, - { - "epoch": 12.93, - "learning_rate": 2.9764543321715264e-06, - "loss": 0.1922, - "step": 1269600 - }, - { - "epoch": 12.94, - "learning_rate": 2.973574248951816e-06, - "loss": 0.2316, - "step": 1269700 - }, - { - "epoch": 12.94, - "learning_rate": 2.97069548716304e-06, - "loss": 0.176, - "step": 1269800 - }, - { - "epoch": 12.94, - "learning_rate": 2.9678180469459526e-06, - "loss": 0.1863, - "step": 1269900 - }, - { - "epoch": 12.94, - "learning_rate": 2.9649419284412515e-06, - "loss": 0.1843, - "step": 1270000 - }, - { - "epoch": 12.94, - "learning_rate": 2.9620671317895443e-06, - "loss": 0.212, - "step": 1270100 - }, - { - "epoch": 12.94, - "learning_rate": 2.9591936571314016e-06, - "loss": 0.2063, - "step": 1270200 - }, - { - "epoch": 12.94, - "learning_rate": 2.9563215046073244e-06, - "loss": 0.1611, - "step": 1270300 - }, - { - "epoch": 12.94, - "learning_rate": 2.9534506743577306e-06, - "loss": 0.2179, - "step": 1270400 - }, - { - "epoch": 12.94, - "learning_rate": 2.950581166522991e-06, - "loss": 0.2037, - "step": 1270500 - }, - { - "epoch": 12.95, - "learning_rate": 2.94771298124342e-06, - "loss": 0.2102, - "step": 1270600 - }, - { - "epoch": 12.95, - "learning_rate": 2.9448461186592355e-06, - "loss": 0.1963, - "step": 1270700 - }, - { - "epoch": 12.95, - "learning_rate": 2.9419805789106193e-06, - "loss": 0.1843, - "step": 1270800 - }, - { - "epoch": 12.95, - "learning_rate": 2.9391163621376816e-06, - "loss": 0.201, - "step": 1270900 - }, - { - "epoch": 12.95, - "learning_rate": 2.936253468480461e-06, - "loss": 0.2405, - "step": 1271000 - }, - { - "epoch": 12.95, - "learning_rate": 2.9333918980789386e-06, - "loss": 0.23, - "step": 1271100 - }, - { - "epoch": 12.95, - "learning_rate": 2.9305316510730264e-06, - "loss": 0.1909, - "step": 1271200 - }, - { - "epoch": 12.95, - "learning_rate": 2.9276727276025816e-06, - "loss": 0.2252, - "step": 1271300 - }, - { - "epoch": 12.95, - "learning_rate": 2.924815127807373e-06, - "loss": 0.1875, - "step": 1271400 - }, - { - "epoch": 12.95, - "learning_rate": 2.9219588518271324e-06, - "loss": 0.2182, - "step": 1271500 - }, - { - "epoch": 12.96, - "learning_rate": 2.9191038998015142e-06, - "loss": 0.247, - "step": 1271600 - }, - { - "epoch": 12.96, - "learning_rate": 2.916250271870097e-06, - "loss": 0.2781, - "step": 1271700 - }, - { - "epoch": 12.96, - "learning_rate": 2.9133979681724154e-06, - "loss": 0.2445, - "step": 1271800 - }, - { - "epoch": 12.96, - "learning_rate": 2.910546988847935e-06, - "loss": 0.3048, - "step": 1271900 - }, - { - "epoch": 12.96, - "learning_rate": 2.9076973340360412e-06, - "loss": 0.2027, - "step": 1272000 - }, - { - "epoch": 12.96, - "learning_rate": 2.904849003876069e-06, - "loss": 0.1911, - "step": 1272100 - }, - { - "epoch": 12.96, - "learning_rate": 2.902001998507283e-06, - "loss": 0.2373, - "step": 1272200 - }, - { - "epoch": 12.96, - "learning_rate": 2.899156318068886e-06, - "loss": 0.2123, - "step": 1272300 - }, - { - "epoch": 12.96, - "learning_rate": 2.89631196270002e-06, - "loss": 0.1913, - "step": 1272400 - }, - { - "epoch": 12.96, - "learning_rate": 2.893497356281117e-06, - "loss": 0.1929, - "step": 1272500 - }, - { - "epoch": 12.97, - "learning_rate": 2.890655638214288e-06, - "loss": 0.2622, - "step": 1272600 - }, - { - "epoch": 12.97, - "learning_rate": 2.8878152456326267e-06, - "loss": 0.2252, - "step": 1272700 - }, - { - "epoch": 12.97, - "learning_rate": 2.884976178674993e-06, - "loss": 0.1809, - "step": 1272800 - }, - { - "epoch": 12.97, - "learning_rate": 2.882138437480215e-06, - "loss": 0.2319, - "step": 1272900 - }, - { - "epoch": 12.97, - "learning_rate": 2.8793020221870393e-06, - "loss": 0.2011, - "step": 1273000 - }, - { - "epoch": 12.97, - "learning_rate": 2.8764669329341408e-06, - "loss": 0.2165, - "step": 1273100 - }, - { - "epoch": 12.97, - "learning_rate": 2.8736331698601457e-06, - "loss": 0.2009, - "step": 1273200 - }, - { - "epoch": 12.97, - "learning_rate": 2.8708007331036067e-06, - "loss": 0.2054, - "step": 1273300 - }, - { - "epoch": 12.97, - "learning_rate": 2.8679696228030127e-06, - "loss": 0.1802, - "step": 1273400 - }, - { - "epoch": 12.97, - "learning_rate": 2.8651398390967897e-06, - "loss": 0.2193, - "step": 1273500 - }, - { - "epoch": 12.98, - "learning_rate": 2.862311382123297e-06, - "loss": 0.255, - "step": 1273600 - }, - { - "epoch": 12.98, - "learning_rate": 2.859484252020831e-06, - "loss": 0.1829, - "step": 1273700 - }, - { - "epoch": 12.98, - "learning_rate": 2.8566584489276173e-06, - "loss": 0.1808, - "step": 1273800 - }, - { - "epoch": 12.98, - "learning_rate": 2.8538339729818184e-06, - "loss": 0.1934, - "step": 1273900 - }, - { - "epoch": 12.98, - "learning_rate": 2.851010824321545e-06, - "loss": 0.2283, - "step": 1274000 - }, - { - "epoch": 12.98, - "learning_rate": 2.8481890030848224e-06, - "loss": 0.219, - "step": 1274100 - }, - { - "epoch": 12.98, - "learning_rate": 2.8453685094096238e-06, - "loss": 0.2226, - "step": 1274200 - }, - { - "epoch": 12.98, - "learning_rate": 2.842549343433859e-06, - "loss": 0.1908, - "step": 1274300 - }, - { - "epoch": 12.98, - "learning_rate": 2.8397315052953578e-06, - "loss": 0.2106, - "step": 1274400 - }, - { - "epoch": 12.98, - "learning_rate": 2.8369149951319027e-06, - "loss": 0.1757, - "step": 1274500 - }, - { - "epoch": 12.99, - "learning_rate": 2.8341279583271018e-06, - "loss": 0.2463, - "step": 1274600 - }, - { - "epoch": 12.99, - "learning_rate": 2.8313422233391717e-06, - "loss": 0.2379, - "step": 1274700 - }, - { - "epoch": 12.99, - "learning_rate": 2.8285296713577493e-06, - "loss": 0.261, - "step": 1274800 - }, - { - "epoch": 12.99, - "learning_rate": 2.8257184478990683e-06, - "loss": 0.2291, - "step": 1274900 - }, - { - "epoch": 12.99, - "learning_rate": 2.8229085531005993e-06, - "loss": 0.2078, - "step": 1275000 - }, - { - "epoch": 12.99, - "learning_rate": 2.8200999870997123e-06, - "loss": 0.2188, - "step": 1275100 - }, - { - "epoch": 12.99, - "learning_rate": 2.8172927500337342e-06, - "loss": 0.2304, - "step": 1275200 - }, - { - "epoch": 12.99, - "learning_rate": 2.8144868420399285e-06, - "loss": 0.2302, - "step": 1275300 - }, - { - "epoch": 12.99, - "learning_rate": 2.8116822632554785e-06, - "loss": 0.2145, - "step": 1275400 - }, - { - "epoch": 13.0, - "learning_rate": 2.808879013817511e-06, - "loss": 0.1981, - "step": 1275500 - }, - { - "epoch": 13.0, - "learning_rate": 2.8060770938630977e-06, - "loss": 0.2136, - "step": 1275600 - }, - { - "epoch": 13.0, - "learning_rate": 2.803276503529224e-06, - "loss": 0.2133, - "step": 1275700 - }, - { - "epoch": 13.0, - "learning_rate": 2.800477242952828e-06, - "loss": 0.2356, - "step": 1275800 - }, - { - "epoch": 13.0, - "learning_rate": 2.797679312270783e-06, - "loss": 0.2524, - "step": 1275900 - }, - { - "epoch": 13.0, - "learning_rate": 2.7948827116198805e-06, - "loss": 0.1826, - "step": 1276000 - }, - { - "epoch": 13.0, - "learning_rate": 2.79208744113686e-06, - "loss": 0.2255, - "step": 1276100 - }, - { - "epoch": 13.0, - "learning_rate": 2.789293500958393e-06, - "loss": 0.2007, - "step": 1276200 - }, - { - "epoch": 13.0, - "learning_rate": 2.7865008912210933e-06, - "loss": 0.2394, - "step": 1276300 - }, - { - "epoch": 13.0, - "learning_rate": 2.783709612061495e-06, - "loss": 0.1754, - "step": 1276400 - }, - { - "epoch": 13.01, - "learning_rate": 2.780919663616078e-06, - "loss": 0.1855, - "step": 1276500 - }, - { - "epoch": 13.01, - "learning_rate": 2.7781310460212617e-06, - "loss": 0.1805, - "step": 1276600 - }, - { - "epoch": 13.01, - "learning_rate": 2.775371625690628e-06, - "loss": 0.2188, - "step": 1276700 - }, - { - "epoch": 13.01, - "learning_rate": 2.7725856568940623e-06, - "loss": 0.1744, - "step": 1276800 - }, - { - "epoch": 13.01, - "learning_rate": 2.7698010193555736e-06, - "loss": 0.212, - "step": 1276900 - }, - { - "epoch": 13.01, - "learning_rate": 2.7670177132113162e-06, - "loss": 0.2166, - "step": 1277000 - }, - { - "epoch": 13.01, - "learning_rate": 2.7642357385973717e-06, - "loss": 0.1695, - "step": 1277100 - }, - { - "epoch": 13.01, - "learning_rate": 2.761455095649764e-06, - "loss": 0.2044, - "step": 1277200 - }, - { - "epoch": 13.01, - "learning_rate": 2.758675784504452e-06, - "loss": 0.2411, - "step": 1277300 - }, - { - "epoch": 13.01, - "learning_rate": 2.755897805297326e-06, - "loss": 0.2257, - "step": 1277400 - }, - { - "epoch": 13.02, - "learning_rate": 2.753121158164211e-06, - "loss": 0.1747, - "step": 1277500 - }, - { - "epoch": 13.02, - "learning_rate": 2.750345843240872e-06, - "loss": 0.1966, - "step": 1277600 - }, - { - "epoch": 13.02, - "learning_rate": 2.7475718606630105e-06, - "loss": 0.2363, - "step": 1277700 - }, - { - "epoch": 13.02, - "learning_rate": 2.7447992105662443e-06, - "loss": 0.1844, - "step": 1277800 - }, - { - "epoch": 13.02, - "learning_rate": 2.7420278930861454e-06, - "loss": 0.1812, - "step": 1277900 - }, - { - "epoch": 13.02, - "learning_rate": 2.7392579083582216e-06, - "loss": 0.1732, - "step": 1278000 - }, - { - "epoch": 13.02, - "learning_rate": 2.736489256517898e-06, - "loss": 0.233, - "step": 1278100 - }, - { - "epoch": 13.02, - "learning_rate": 2.7337219377005496e-06, - "loss": 0.2338, - "step": 1278200 - }, - { - "epoch": 13.02, - "learning_rate": 2.730955952041485e-06, - "loss": 0.1915, - "step": 1278300 - }, - { - "epoch": 13.02, - "learning_rate": 2.7281912996759394e-06, - "loss": 0.2024, - "step": 1278400 - }, - { - "epoch": 13.03, - "learning_rate": 2.725427980739088e-06, - "loss": 0.2727, - "step": 1278500 - }, - { - "epoch": 13.03, - "learning_rate": 2.722665995366039e-06, - "loss": 0.1812, - "step": 1278600 - }, - { - "epoch": 13.03, - "learning_rate": 2.7199053436918455e-06, - "loss": 0.1963, - "step": 1278700 - }, - { - "epoch": 13.03, - "learning_rate": 2.717146025851478e-06, - "loss": 0.1704, - "step": 1278800 - }, - { - "epoch": 13.03, - "learning_rate": 2.714388041979856e-06, - "loss": 0.1571, - "step": 1278900 - }, - { - "epoch": 13.03, - "learning_rate": 2.711631392211832e-06, - "loss": 0.2113, - "step": 1279000 - }, - { - "epoch": 13.03, - "learning_rate": 2.7088760766821775e-06, - "loss": 0.2675, - "step": 1279100 - }, - { - "epoch": 13.03, - "learning_rate": 2.7061220955256183e-06, - "loss": 0.185, - "step": 1279200 - }, - { - "epoch": 13.03, - "learning_rate": 2.7033694488768133e-06, - "loss": 0.1758, - "step": 1279300 - }, - { - "epoch": 13.03, - "learning_rate": 2.700645643383486e-06, - "loss": 0.2531, - "step": 1279400 - }, - { - "epoch": 13.04, - "learning_rate": 2.6978956528054376e-06, - "loss": 0.2494, - "step": 1279500 - }, - { - "epoch": 13.04, - "learning_rate": 2.6951469971373586e-06, - "loss": 0.2141, - "step": 1279600 - }, - { - "epoch": 13.04, - "learning_rate": 2.692399676513645e-06, - "loss": 0.2596, - "step": 1279700 - }, - { - "epoch": 13.04, - "learning_rate": 2.6896536910686322e-06, - "loss": 0.2056, - "step": 1279800 - }, - { - "epoch": 13.04, - "learning_rate": 2.686909040936566e-06, - "loss": 0.2169, - "step": 1279900 - }, - { - "epoch": 13.04, - "learning_rate": 2.6841657262516527e-06, - "loss": 0.2268, - "step": 1280000 - }, - { - "epoch": 13.04, - "learning_rate": 2.681423747148025e-06, - "loss": 0.2101, - "step": 1280100 - }, - { - "epoch": 13.04, - "learning_rate": 2.678683103759746e-06, - "loss": 0.1668, - "step": 1280200 - }, - { - "epoch": 13.04, - "learning_rate": 2.675943796220818e-06, - "loss": 0.2065, - "step": 1280300 - }, - { - "epoch": 13.04, - "learning_rate": 2.67320582466518e-06, - "loss": 0.1874, - "step": 1280400 - }, - { - "epoch": 13.05, - "learning_rate": 2.6704691892266965e-06, - "loss": 0.1854, - "step": 1280500 - }, - { - "epoch": 13.05, - "learning_rate": 2.6677338900391755e-06, - "loss": 0.2888, - "step": 1280600 - }, - { - "epoch": 13.05, - "learning_rate": 2.6649999272363613e-06, - "loss": 0.2353, - "step": 1280700 - }, - { - "epoch": 13.05, - "learning_rate": 2.6622673009519192e-06, - "loss": 0.1758, - "step": 1280800 - }, - { - "epoch": 13.05, - "learning_rate": 2.6595360113194635e-06, - "loss": 0.2329, - "step": 1280900 - }, - { - "epoch": 13.05, - "learning_rate": 2.656806058472543e-06, - "loss": 0.2356, - "step": 1281000 - }, - { - "epoch": 13.05, - "learning_rate": 2.6540774425446247e-06, - "loss": 0.1953, - "step": 1281100 - }, - { - "epoch": 13.05, - "learning_rate": 2.651350163669125e-06, - "loss": 0.247, - "step": 1281200 - }, - { - "epoch": 13.05, - "learning_rate": 2.6486242219793975e-06, - "loss": 0.2204, - "step": 1281300 - }, - { - "epoch": 13.06, - "learning_rate": 2.645899617608719e-06, - "loss": 0.1702, - "step": 1281400 - }, - { - "epoch": 13.06, - "learning_rate": 2.6431763506903093e-06, - "loss": 0.1848, - "step": 1281500 - }, - { - "epoch": 13.06, - "learning_rate": 2.640454421357319e-06, - "loss": 0.2342, - "step": 1281600 - }, - { - "epoch": 13.06, - "learning_rate": 2.637733829742841e-06, - "loss": 0.134, - "step": 1281700 - }, - { - "epoch": 13.06, - "learning_rate": 2.6350145759798827e-06, - "loss": 0.1981, - "step": 1281800 - }, - { - "epoch": 13.06, - "learning_rate": 2.6322966602014074e-06, - "loss": 0.2358, - "step": 1281900 - }, - { - "epoch": 13.06, - "learning_rate": 2.6296072416927964e-06, - "loss": 0.225, - "step": 1282000 - }, - { - "epoch": 13.06, - "learning_rate": 2.6268919888987273e-06, - "loss": 0.2288, - "step": 1282100 - }, - { - "epoch": 13.06, - "learning_rate": 2.6241780744862944e-06, - "loss": 0.1776, - "step": 1282200 - }, - { - "epoch": 13.06, - "learning_rate": 2.6214654985881735e-06, - "loss": 0.2298, - "step": 1282300 - }, - { - "epoch": 13.07, - "learning_rate": 2.618754261337003e-06, - "loss": 0.1843, - "step": 1282400 - }, - { - "epoch": 13.07, - "learning_rate": 2.61604436286535e-06, - "loss": 0.2236, - "step": 1282500 - }, - { - "epoch": 13.07, - "learning_rate": 2.6133358033057077e-06, - "loss": 0.2694, - "step": 1282600 - }, - { - "epoch": 13.07, - "learning_rate": 2.6106285827905072e-06, - "loss": 0.2387, - "step": 1282700 - }, - { - "epoch": 13.07, - "learning_rate": 2.6079227014521234e-06, - "loss": 0.164, - "step": 1282800 - }, - { - "epoch": 13.07, - "learning_rate": 2.60521815942285e-06, - "loss": 0.2231, - "step": 1282900 - }, - { - "epoch": 13.07, - "learning_rate": 2.602514956834924e-06, - "loss": 0.212, - "step": 1283000 - }, - { - "epoch": 13.07, - "learning_rate": 2.59981309382052e-06, - "loss": 0.2049, - "step": 1283100 - }, - { - "epoch": 13.07, - "learning_rate": 2.597112570511736e-06, - "loss": 0.1811, - "step": 1283200 - }, - { - "epoch": 13.07, - "learning_rate": 2.594413387040616e-06, - "loss": 0.2247, - "step": 1283300 - }, - { - "epoch": 13.08, - "learning_rate": 2.591715543539138e-06, - "loss": 0.2144, - "step": 1283400 - }, - { - "epoch": 13.08, - "learning_rate": 2.5890190401392e-06, - "loss": 0.1901, - "step": 1283500 - }, - { - "epoch": 13.08, - "learning_rate": 2.5863238769726525e-06, - "loss": 0.202, - "step": 1283600 - }, - { - "epoch": 13.08, - "learning_rate": 2.5836300541712676e-06, - "loss": 0.1931, - "step": 1283700 - }, - { - "epoch": 13.08, - "learning_rate": 2.580937571866766e-06, - "loss": 0.1846, - "step": 1283800 - }, - { - "epoch": 13.08, - "learning_rate": 2.578246430190786e-06, - "loss": 0.2005, - "step": 1283900 - }, - { - "epoch": 13.08, - "learning_rate": 2.575556629274909e-06, - "loss": 0.2239, - "step": 1284000 - }, - { - "epoch": 13.08, - "learning_rate": 2.5728681692506595e-06, - "loss": 0.2564, - "step": 1284100 - }, - { - "epoch": 13.08, - "learning_rate": 2.570181050249476e-06, - "loss": 0.2362, - "step": 1284200 - }, - { - "epoch": 13.08, - "learning_rate": 2.5674952724027436e-06, - "loss": 0.1947, - "step": 1284300 - }, - { - "epoch": 13.09, - "learning_rate": 2.5648108358417932e-06, - "loss": 0.2044, - "step": 1284400 - }, - { - "epoch": 13.09, - "learning_rate": 2.5621277406978605e-06, - "loss": 0.2936, - "step": 1284500 - }, - { - "epoch": 13.09, - "learning_rate": 2.5594459871021404e-06, - "loss": 0.2254, - "step": 1284600 - }, - { - "epoch": 13.09, - "learning_rate": 2.5567655751857577e-06, - "loss": 0.2056, - "step": 1284700 - }, - { - "epoch": 13.09, - "learning_rate": 2.5540865050797646e-06, - "loss": 0.2359, - "step": 1284800 - }, - { - "epoch": 13.09, - "learning_rate": 2.551408776915153e-06, - "loss": 0.274, - "step": 1284900 - }, - { - "epoch": 13.09, - "learning_rate": 2.5487323908228445e-06, - "loss": 0.2545, - "step": 1285000 - }, - { - "epoch": 13.09, - "learning_rate": 2.546057346933701e-06, - "loss": 0.2, - "step": 1285100 - }, - { - "epoch": 13.09, - "learning_rate": 2.5433836453785186e-06, - "loss": 0.1946, - "step": 1285200 - }, - { - "epoch": 13.09, - "learning_rate": 2.540711286288029e-06, - "loss": 0.2423, - "step": 1285300 - }, - { - "epoch": 13.1, - "learning_rate": 2.5380402697928838e-06, - "loss": 0.1941, - "step": 1285400 - }, - { - "epoch": 13.1, - "learning_rate": 2.535370596023686e-06, - "loss": 0.2118, - "step": 1285500 - }, - { - "epoch": 13.1, - "learning_rate": 2.5327022651109643e-06, - "loss": 0.2394, - "step": 1285600 - }, - { - "epoch": 13.1, - "learning_rate": 2.530035277185191e-06, - "loss": 0.2447, - "step": 1285700 - }, - { - "epoch": 13.1, - "learning_rate": 2.527369632376755e-06, - "loss": 0.2029, - "step": 1285800 - }, - { - "epoch": 13.1, - "learning_rate": 2.524705330815995e-06, - "loss": 0.2284, - "step": 1285900 - }, - { - "epoch": 13.1, - "learning_rate": 2.5220689955648637e-06, - "loss": 0.2328, - "step": 1286000 - }, - { - "epoch": 13.1, - "learning_rate": 2.5194073674544714e-06, - "loss": 0.2084, - "step": 1286100 - }, - { - "epoch": 13.1, - "learning_rate": 2.516747082981071e-06, - "loss": 0.1773, - "step": 1286200 - }, - { - "epoch": 13.11, - "learning_rate": 2.514088142274719e-06, - "loss": 0.1723, - "step": 1286300 - }, - { - "epoch": 13.11, - "learning_rate": 2.5114305454654297e-06, - "loss": 0.1693, - "step": 1286400 - }, - { - "epoch": 13.11, - "learning_rate": 2.5087742926831434e-06, - "loss": 0.2111, - "step": 1286500 - }, - { - "epoch": 13.11, - "learning_rate": 2.5061193840577368e-06, - "loss": 0.1756, - "step": 1286600 - }, - { - "epoch": 13.11, - "learning_rate": 2.5034658197190184e-06, - "loss": 0.1985, - "step": 1286700 - }, - { - "epoch": 13.11, - "learning_rate": 2.500813599796732e-06, - "loss": 0.198, - "step": 1286800 - }, - { - "epoch": 13.11, - "learning_rate": 2.498162724420553e-06, - "loss": 0.211, - "step": 1286900 - }, - { - "epoch": 13.11, - "learning_rate": 2.4955131937200915e-06, - "loss": 0.1791, - "step": 1287000 - }, - { - "epoch": 13.11, - "learning_rate": 2.4928650078249005e-06, - "loss": 0.195, - "step": 1287100 - }, - { - "epoch": 13.11, - "learning_rate": 2.4902181668644563e-06, - "loss": 0.1999, - "step": 1287200 - }, - { - "epoch": 13.12, - "learning_rate": 2.487572670968169e-06, - "loss": 0.2193, - "step": 1287300 - }, - { - "epoch": 13.12, - "learning_rate": 2.4849285202654015e-06, - "loss": 0.2929, - "step": 1287400 - }, - { - "epoch": 13.12, - "learning_rate": 2.4822857148854205e-06, - "loss": 0.2414, - "step": 1287500 - }, - { - "epoch": 13.12, - "learning_rate": 2.4796442549574528e-06, - "loss": 0.2337, - "step": 1287600 - }, - { - "epoch": 13.12, - "learning_rate": 2.4770041406106446e-06, - "loss": 0.236, - "step": 1287700 - }, - { - "epoch": 13.12, - "learning_rate": 2.47436537197409e-06, - "loss": 0.2022, - "step": 1287800 - }, - { - "epoch": 13.12, - "learning_rate": 2.4717279491767987e-06, - "loss": 0.193, - "step": 1287900 - }, - { - "epoch": 13.12, - "learning_rate": 2.469091872347734e-06, - "loss": 0.1737, - "step": 1288000 - }, - { - "epoch": 13.12, - "learning_rate": 2.4664571416157865e-06, - "loss": 0.2324, - "step": 1288100 - }, - { - "epoch": 13.12, - "learning_rate": 2.4638237571097667e-06, - "loss": 0.2059, - "step": 1288200 - }, - { - "epoch": 13.13, - "learning_rate": 2.461191718958441e-06, - "loss": 0.1675, - "step": 1288300 - }, - { - "epoch": 13.13, - "learning_rate": 2.4585610272905e-06, - "loss": 0.2354, - "step": 1288400 - }, - { - "epoch": 13.13, - "learning_rate": 2.455931682234558e-06, - "loss": 0.1567, - "step": 1288500 - }, - { - "epoch": 13.13, - "learning_rate": 2.4533036839191847e-06, - "loss": 0.1926, - "step": 1288600 - }, - { - "epoch": 13.13, - "learning_rate": 2.450677032472878e-06, - "loss": 0.2297, - "step": 1288700 - }, - { - "epoch": 13.13, - "learning_rate": 2.448051728024051e-06, - "loss": 0.1655, - "step": 1288800 - }, - { - "epoch": 13.13, - "learning_rate": 2.4454277707010753e-06, - "loss": 0.2058, - "step": 1288900 - }, - { - "epoch": 13.13, - "learning_rate": 2.4428051606322443e-06, - "loss": 0.2448, - "step": 1289000 - }, - { - "epoch": 13.13, - "learning_rate": 2.4402101039026837e-06, - "loss": 0.2178, - "step": 1289100 - }, - { - "epoch": 13.13, - "learning_rate": 2.4375901752510345e-06, - "loss": 0.2184, - "step": 1289200 - }, - { - "epoch": 13.14, - "learning_rate": 2.4349715942367335e-06, - "loss": 0.2122, - "step": 1289300 - }, - { - "epoch": 13.14, - "learning_rate": 2.432354360987822e-06, - "loss": 0.2201, - "step": 1289400 - }, - { - "epoch": 13.14, - "learning_rate": 2.4297384756322704e-06, - "loss": 0.2243, - "step": 1289500 - }, - { - "epoch": 13.14, - "learning_rate": 2.4271239382979737e-06, - "loss": 0.1877, - "step": 1289600 - }, - { - "epoch": 13.14, - "learning_rate": 2.4245107491127695e-06, - "loss": 0.2237, - "step": 1289700 - }, - { - "epoch": 13.14, - "learning_rate": 2.421898908204435e-06, - "loss": 0.2152, - "step": 1289800 - }, - { - "epoch": 13.14, - "learning_rate": 2.4192884157006587e-06, - "loss": 0.2291, - "step": 1289900 - }, - { - "epoch": 13.14, - "learning_rate": 2.4166792717290853e-06, - "loss": 0.2416, - "step": 1290000 - }, - { - "epoch": 13.14, - "learning_rate": 2.414071476417289e-06, - "loss": 0.2054, - "step": 1290100 - }, - { - "epoch": 13.14, - "learning_rate": 2.411465029892774e-06, - "loss": 0.1767, - "step": 1290200 - }, - { - "epoch": 13.15, - "learning_rate": 2.408859932282976e-06, - "loss": 0.1706, - "step": 1290300 - }, - { - "epoch": 13.15, - "learning_rate": 2.406256183715273e-06, - "loss": 0.1765, - "step": 1290400 - }, - { - "epoch": 13.15, - "learning_rate": 2.4036537843169792e-06, - "loss": 0.2435, - "step": 1290500 - }, - { - "epoch": 13.15, - "learning_rate": 2.4010527342153234e-06, - "loss": 0.1746, - "step": 1290600 - }, - { - "epoch": 13.15, - "learning_rate": 2.3984530335374833e-06, - "loss": 0.2098, - "step": 1290700 - }, - { - "epoch": 13.15, - "learning_rate": 2.395854682410581e-06, - "loss": 0.2191, - "step": 1290800 - }, - { - "epoch": 13.15, - "learning_rate": 2.393257680961645e-06, - "loss": 0.1913, - "step": 1290900 - }, - { - "epoch": 13.15, - "learning_rate": 2.390662029317663e-06, - "loss": 0.2279, - "step": 1291000 - }, - { - "epoch": 13.15, - "learning_rate": 2.3880677276055472e-06, - "loss": 0.2013, - "step": 1291100 - }, - { - "epoch": 13.15, - "learning_rate": 2.385474775952133e-06, - "loss": 0.2196, - "step": 1291200 - }, - { - "epoch": 13.16, - "learning_rate": 2.3829090838150567e-06, - "loss": 0.2361, - "step": 1291300 - }, - { - "epoch": 13.16, - "learning_rate": 2.380318819155587e-06, - "loss": 0.2051, - "step": 1291400 - }, - { - "epoch": 13.16, - "learning_rate": 2.3777299049336986e-06, - "loss": 0.218, - "step": 1291500 - }, - { - "epoch": 13.16, - "learning_rate": 2.375142341275978e-06, - "loss": 0.183, - "step": 1291600 - }, - { - "epoch": 13.16, - "learning_rate": 2.3725561283089426e-06, - "loss": 0.1633, - "step": 1291700 - }, - { - "epoch": 13.16, - "learning_rate": 2.369971266159039e-06, - "loss": 0.2002, - "step": 1291800 - }, - { - "epoch": 13.16, - "learning_rate": 2.367387754952659e-06, - "loss": 0.1992, - "step": 1291900 - }, - { - "epoch": 13.16, - "learning_rate": 2.3648055948161086e-06, - "loss": 0.2223, - "step": 1292000 - }, - { - "epoch": 13.16, - "learning_rate": 2.362224785875646e-06, - "loss": 0.164, - "step": 1292100 - }, - { - "epoch": 13.17, - "learning_rate": 2.359645328257465e-06, - "loss": 0.2919, - "step": 1292200 - }, - { - "epoch": 13.17, - "learning_rate": 2.3570672220876698e-06, - "loss": 0.1723, - "step": 1292300 - }, - { - "epoch": 13.17, - "learning_rate": 2.3544904674923272e-06, - "loss": 0.2045, - "step": 1292400 - }, - { - "epoch": 13.17, - "learning_rate": 2.3519150645974165e-06, - "loss": 0.1613, - "step": 1292500 - }, - { - "epoch": 13.17, - "learning_rate": 2.3493410135288727e-06, - "loss": 0.1761, - "step": 1292600 - }, - { - "epoch": 13.17, - "learning_rate": 2.346768314412536e-06, - "loss": 0.2053, - "step": 1292700 - }, - { - "epoch": 13.17, - "learning_rate": 2.3441969673742057e-06, - "loss": 0.2205, - "step": 1292800 - }, - { - "epoch": 13.17, - "learning_rate": 2.341626972539598e-06, - "loss": 0.1725, - "step": 1292900 - }, - { - "epoch": 13.17, - "learning_rate": 2.339058330034378e-06, - "loss": 0.1668, - "step": 1293000 - }, - { - "epoch": 13.17, - "learning_rate": 2.3364910399841335e-06, - "loss": 0.2127, - "step": 1293100 - }, - { - "epoch": 13.18, - "learning_rate": 2.3339251025143925e-06, - "loss": 0.2762, - "step": 1293200 - }, - { - "epoch": 13.18, - "learning_rate": 2.3313605177506093e-06, - "loss": 0.213, - "step": 1293300 - }, - { - "epoch": 13.18, - "learning_rate": 2.3287972858181795e-06, - "loss": 0.1964, - "step": 1293400 - }, - { - "epoch": 13.18, - "learning_rate": 2.326235406842433e-06, - "loss": 0.1815, - "step": 1293500 - }, - { - "epoch": 13.18, - "learning_rate": 2.3236748809486195e-06, - "loss": 0.1965, - "step": 1293600 - }, - { - "epoch": 13.18, - "learning_rate": 2.3211157082619427e-06, - "loss": 0.2462, - "step": 1293700 - }, - { - "epoch": 13.18, - "learning_rate": 2.3185578889075355e-06, - "loss": 0.1957, - "step": 1293800 - }, - { - "epoch": 13.18, - "learning_rate": 2.3160014230104445e-06, - "loss": 0.1501, - "step": 1293900 - }, - { - "epoch": 13.18, - "learning_rate": 2.3134463106956794e-06, - "loss": 0.2759, - "step": 1294000 - }, - { - "epoch": 13.18, - "learning_rate": 2.3108925520881607e-06, - "loss": 0.193, - "step": 1294100 - }, - { - "epoch": 13.19, - "learning_rate": 2.3083401473127585e-06, - "loss": 0.2108, - "step": 1294200 - }, - { - "epoch": 13.19, - "learning_rate": 2.3057890964942664e-06, - "loss": 0.2017, - "step": 1294300 - }, - { - "epoch": 13.19, - "learning_rate": 2.3032393997574174e-06, - "loss": 0.2082, - "step": 1294400 - }, - { - "epoch": 13.19, - "learning_rate": 2.3006910572268823e-06, - "loss": 0.2394, - "step": 1294500 - }, - { - "epoch": 13.19, - "learning_rate": 2.2981440690272484e-06, - "loss": 0.1957, - "step": 1294600 - }, - { - "epoch": 13.19, - "learning_rate": 2.2955984352830518e-06, - "loss": 0.2277, - "step": 1294700 - }, - { - "epoch": 13.19, - "learning_rate": 2.293054156118767e-06, - "loss": 0.1751, - "step": 1294800 - }, - { - "epoch": 13.19, - "learning_rate": 2.290511231658781e-06, - "loss": 0.2401, - "step": 1294900 - }, - { - "epoch": 13.19, - "learning_rate": 2.287969662027437e-06, - "loss": 0.2058, - "step": 1295000 - }, - { - "epoch": 13.19, - "learning_rate": 2.2854294473490034e-06, - "loss": 0.2663, - "step": 1295100 - }, - { - "epoch": 13.2, - "learning_rate": 2.2828905877476735e-06, - "loss": 0.154, - "step": 1295200 - }, - { - "epoch": 13.2, - "learning_rate": 2.280353083347585e-06, - "loss": 0.2094, - "step": 1295300 - }, - { - "epoch": 13.2, - "learning_rate": 2.2778169342728084e-06, - "loss": 0.1893, - "step": 1295400 - }, - { - "epoch": 13.2, - "learning_rate": 2.2753074818737187e-06, - "loss": 0.2367, - "step": 1295500 - }, - { - "epoch": 13.2, - "learning_rate": 2.272774030265169e-06, - "loss": 0.2364, - "step": 1295600 - }, - { - "epoch": 13.2, - "learning_rate": 2.270241934352495e-06, - "loss": 0.2004, - "step": 1295700 - }, - { - "epoch": 13.2, - "learning_rate": 2.2677111942595077e-06, - "loss": 0.1868, - "step": 1295800 - }, - { - "epoch": 13.2, - "learning_rate": 2.265181810109951e-06, - "loss": 0.2013, - "step": 1295900 - }, - { - "epoch": 13.2, - "learning_rate": 2.262653782027486e-06, - "loss": 0.21, - "step": 1296000 - }, - { - "epoch": 13.2, - "learning_rate": 2.260127110135725e-06, - "loss": 0.1909, - "step": 1296100 - }, - { - "epoch": 13.21, - "learning_rate": 2.2576017945582105e-06, - "loss": 0.2021, - "step": 1296200 - }, - { - "epoch": 13.21, - "learning_rate": 2.255077835418402e-06, - "loss": 0.1461, - "step": 1296300 - }, - { - "epoch": 13.21, - "learning_rate": 2.252555232839717e-06, - "loss": 0.2706, - "step": 1296400 - }, - { - "epoch": 13.21, - "learning_rate": 2.2500339869454966e-06, - "loss": 0.1618, - "step": 1296500 - }, - { - "epoch": 13.21, - "learning_rate": 2.247514097859006e-06, - "loss": 0.2098, - "step": 1296600 - }, - { - "epoch": 13.21, - "learning_rate": 2.244995565703459e-06, - "loss": 0.2122, - "step": 1296700 - }, - { - "epoch": 13.21, - "learning_rate": 2.2424783906019887e-06, - "loss": 0.2163, - "step": 1296800 - }, - { - "epoch": 13.21, - "learning_rate": 2.2399877241384993e-06, - "loss": 0.2205, - "step": 1296900 - }, - { - "epoch": 13.21, - "learning_rate": 2.237473249940756e-06, - "loss": 0.2227, - "step": 1297000 - }, - { - "epoch": 13.22, - "learning_rate": 2.234960133164887e-06, - "loss": 0.1742, - "step": 1297100 - }, - { - "epoch": 13.22, - "learning_rate": 2.232448373933772e-06, - "loss": 0.1988, - "step": 1297200 - }, - { - "epoch": 13.22, - "learning_rate": 2.2299379723702285e-06, - "loss": 0.1702, - "step": 1297300 - }, - { - "epoch": 13.22, - "learning_rate": 2.227428928596995e-06, - "loss": 0.187, - "step": 1297400 - }, - { - "epoch": 13.22, - "learning_rate": 2.224921242736747e-06, - "loss": 0.2003, - "step": 1297500 - }, - { - "epoch": 13.22, - "learning_rate": 2.2224149149120963e-06, - "loss": 0.2346, - "step": 1297600 - }, - { - "epoch": 13.22, - "learning_rate": 2.219909945245595e-06, - "loss": 0.2773, - "step": 1297700 - }, - { - "epoch": 13.22, - "learning_rate": 2.2174063338597085e-06, - "loss": 0.2137, - "step": 1297800 - }, - { - "epoch": 13.22, - "learning_rate": 2.2149040808768516e-06, - "loss": 0.1803, - "step": 1297900 - }, - { - "epoch": 13.22, - "learning_rate": 2.2124031864193737e-06, - "loss": 0.2208, - "step": 1298000 - }, - { - "epoch": 13.23, - "learning_rate": 2.209903650609554e-06, - "loss": 0.2137, - "step": 1298100 - }, - { - "epoch": 13.23, - "learning_rate": 2.2074054735696035e-06, - "loss": 0.2112, - "step": 1298200 - }, - { - "epoch": 13.23, - "learning_rate": 2.2049086554216725e-06, - "loss": 0.2146, - "step": 1298300 - }, - { - "epoch": 13.23, - "learning_rate": 2.2024131962878293e-06, - "loss": 0.1706, - "step": 1298400 - }, - { - "epoch": 13.23, - "learning_rate": 2.199919096290093e-06, - "loss": 0.2422, - "step": 1298500 - }, - { - "epoch": 13.23, - "learning_rate": 2.1974263555504193e-06, - "loss": 0.2061, - "step": 1298600 - }, - { - "epoch": 13.23, - "learning_rate": 2.194934974190671e-06, - "loss": 0.1997, - "step": 1298700 - }, - { - "epoch": 13.23, - "learning_rate": 2.1924449523326705e-06, - "loss": 0.1908, - "step": 1298800 - }, - { - "epoch": 13.23, - "learning_rate": 2.189956290098174e-06, - "loss": 0.1955, - "step": 1298900 - }, - { - "epoch": 13.23, - "learning_rate": 2.187468987608844e-06, - "loss": 0.204, - "step": 1299000 - }, - { - "epoch": 13.24, - "learning_rate": 2.184983044986306e-06, - "loss": 0.2233, - "step": 1299100 - }, - { - "epoch": 13.24, - "learning_rate": 2.182498462352104e-06, - "loss": 0.2342, - "step": 1299200 - }, - { - "epoch": 13.24, - "learning_rate": 2.180015239827723e-06, - "loss": 0.2661, - "step": 1299300 - }, - { - "epoch": 13.24, - "learning_rate": 2.1775333775345762e-06, - "loss": 0.2683, - "step": 1299400 - }, - { - "epoch": 13.24, - "learning_rate": 2.1750528755940102e-06, - "loss": 0.2263, - "step": 1299500 - }, - { - "epoch": 13.24, - "learning_rate": 2.1725737341273145e-06, - "loss": 0.252, - "step": 1299600 - }, - { - "epoch": 13.24, - "learning_rate": 2.1700959532556886e-06, - "loss": 0.1767, - "step": 1299700 - }, - { - "epoch": 13.24, - "learning_rate": 2.1676195331002957e-06, - "loss": 0.2067, - "step": 1299800 - }, - { - "epoch": 13.24, - "learning_rate": 2.1651444737822155e-06, - "loss": 0.2118, - "step": 1299900 - }, - { - "epoch": 13.24, - "learning_rate": 2.162670775422455e-06, - "loss": 0.2218, - "step": 1300000 - }, - { - "epoch": 13.25, - "learning_rate": 2.1601984381419705e-06, - "loss": 0.1682, - "step": 1300100 - }, - { - "epoch": 13.25, - "learning_rate": 2.157727462061645e-06, - "loss": 0.2442, - "step": 1300200 - }, - { - "epoch": 13.25, - "learning_rate": 2.155257847302292e-06, - "loss": 0.1846, - "step": 1300300 - }, - { - "epoch": 13.25, - "learning_rate": 2.1527895939846587e-06, - "loss": 0.2003, - "step": 1300400 - }, - { - "epoch": 13.25, - "learning_rate": 2.1503227022294315e-06, - "loss": 0.1489, - "step": 1300500 - }, - { - "epoch": 13.25, - "learning_rate": 2.1478571721572236e-06, - "loss": 0.1738, - "step": 1300600 - }, - { - "epoch": 13.25, - "learning_rate": 2.1453930038885926e-06, - "loss": 0.2008, - "step": 1300700 - }, - { - "epoch": 13.25, - "learning_rate": 2.142930197544012e-06, - "loss": 0.2302, - "step": 1300800 - }, - { - "epoch": 13.25, - "learning_rate": 2.1404687532439084e-06, - "loss": 0.2071, - "step": 1300900 - }, - { - "epoch": 13.25, - "learning_rate": 2.1380086711086188e-06, - "loss": 0.2227, - "step": 1301000 - }, - { - "epoch": 13.26, - "learning_rate": 2.1355499512584376e-06, - "loss": 0.1952, - "step": 1301100 - }, - { - "epoch": 13.26, - "learning_rate": 2.133092593813578e-06, - "loss": 0.2361, - "step": 1301200 - }, - { - "epoch": 13.26, - "learning_rate": 2.1306365988941877e-06, - "loss": 0.2327, - "step": 1301300 - }, - { - "epoch": 13.26, - "learning_rate": 2.128181966620354e-06, - "loss": 0.2003, - "step": 1301400 - }, - { - "epoch": 13.26, - "learning_rate": 2.125728697112094e-06, - "loss": 0.1946, - "step": 1301500 - }, - { - "epoch": 13.26, - "learning_rate": 2.123276790489348e-06, - "loss": 0.2183, - "step": 1301600 - }, - { - "epoch": 13.26, - "learning_rate": 2.120826246872011e-06, - "loss": 0.2045, - "step": 1301700 - }, - { - "epoch": 13.26, - "learning_rate": 2.118401551436955e-06, - "loss": 0.1867, - "step": 1301800 - }, - { - "epoch": 13.26, - "learning_rate": 2.1159537205567725e-06, - "loss": 0.2007, - "step": 1301900 - }, - { - "epoch": 13.27, - "learning_rate": 2.1135072530400457e-06, - "loss": 0.2549, - "step": 1302000 - }, - { - "epoch": 13.27, - "learning_rate": 2.1110621490063987e-06, - "loss": 0.2518, - "step": 1302100 - }, - { - "epoch": 13.27, - "learning_rate": 2.108618408575379e-06, - "loss": 0.2147, - "step": 1302200 - }, - { - "epoch": 13.27, - "learning_rate": 2.106176031866478e-06, - "loss": 0.18, - "step": 1302300 - }, - { - "epoch": 13.27, - "learning_rate": 2.1037350189991e-06, - "loss": 0.2135, - "step": 1302400 - }, - { - "epoch": 13.27, - "learning_rate": 2.101295370092606e-06, - "loss": 0.2221, - "step": 1302500 - }, - { - "epoch": 13.27, - "learning_rate": 2.0988570852662813e-06, - "loss": 0.2158, - "step": 1302600 - }, - { - "epoch": 13.27, - "learning_rate": 2.0964201646393365e-06, - "loss": 0.1923, - "step": 1302700 - }, - { - "epoch": 13.27, - "learning_rate": 2.0939846083309233e-06, - "loss": 0.1902, - "step": 1302800 - }, - { - "epoch": 13.27, - "learning_rate": 2.091550416460133e-06, - "loss": 0.1689, - "step": 1302900 - }, - { - "epoch": 13.28, - "learning_rate": 2.089117589145977e-06, - "loss": 0.2489, - "step": 1303000 - }, - { - "epoch": 13.28, - "learning_rate": 2.0866861265074034e-06, - "loss": 0.1964, - "step": 1303100 - }, - { - "epoch": 13.28, - "learning_rate": 2.0842560286633037e-06, - "loss": 0.2397, - "step": 1303200 - }, - { - "epoch": 13.28, - "learning_rate": 2.08182729573249e-06, - "loss": 0.1869, - "step": 1303300 - }, - { - "epoch": 13.28, - "learning_rate": 2.079399927833714e-06, - "loss": 0.192, - "step": 1303400 - }, - { - "epoch": 13.28, - "learning_rate": 2.0769739250856633e-06, - "loss": 0.2188, - "step": 1303500 - }, - { - "epoch": 13.28, - "learning_rate": 2.0745492876069537e-06, - "loss": 0.2199, - "step": 1303600 - }, - { - "epoch": 13.28, - "learning_rate": 2.0721260155161303e-06, - "loss": 0.2211, - "step": 1303700 - }, - { - "epoch": 13.28, - "learning_rate": 2.069704108931678e-06, - "loss": 0.2024, - "step": 1303800 - }, - { - "epoch": 13.28, - "learning_rate": 2.0672835679720193e-06, - "loss": 0.2016, - "step": 1303900 - }, - { - "epoch": 13.29, - "learning_rate": 2.0648643927554987e-06, - "loss": 0.1638, - "step": 1304000 - }, - { - "epoch": 13.29, - "learning_rate": 2.0624465834003993e-06, - "loss": 0.1964, - "step": 1304100 - }, - { - "epoch": 13.29, - "learning_rate": 2.060054297696712e-06, - "loss": 0.1859, - "step": 1304200 - }, - { - "epoch": 13.29, - "learning_rate": 2.057639206757478e-06, - "loss": 0.2181, - "step": 1304300 - }, - { - "epoch": 13.29, - "learning_rate": 2.0552254820329387e-06, - "loss": 0.2092, - "step": 1304400 - }, - { - "epoch": 13.29, - "learning_rate": 2.052813123641103e-06, - "loss": 0.2251, - "step": 1304500 - }, - { - "epoch": 13.29, - "learning_rate": 2.05040213169993e-06, - "loss": 0.1899, - "step": 1304600 - }, - { - "epoch": 13.29, - "learning_rate": 2.047992506327302e-06, - "loss": 0.2154, - "step": 1304700 - }, - { - "epoch": 13.29, - "learning_rate": 2.045584247641025e-06, - "loss": 0.1794, - "step": 1304800 - }, - { - "epoch": 13.29, - "learning_rate": 2.043177355758857e-06, - "loss": 0.2475, - "step": 1304900 - }, - { - "epoch": 13.3, - "learning_rate": 2.040795879281434e-06, - "loss": 0.1884, - "step": 1305000 - }, - { - "epoch": 13.3, - "learning_rate": 2.038391707689483e-06, - "loss": 0.2538, - "step": 1305100 - }, - { - "epoch": 13.3, - "learning_rate": 2.0359889032533206e-06, - "loss": 0.2537, - "step": 1305200 - }, - { - "epoch": 13.3, - "learning_rate": 2.0335874660904097e-06, - "loss": 0.2283, - "step": 1305300 - }, - { - "epoch": 13.3, - "learning_rate": 2.031187396318179e-06, - "loss": 0.2475, - "step": 1305400 - }, - { - "epoch": 13.3, - "learning_rate": 2.0287886940539825e-06, - "loss": 0.2245, - "step": 1305500 - }, - { - "epoch": 13.3, - "learning_rate": 2.0263913594150884e-06, - "loss": 0.2335, - "step": 1305600 - }, - { - "epoch": 13.3, - "learning_rate": 2.0239953925187203e-06, - "loss": 0.2225, - "step": 1305700 - }, - { - "epoch": 13.3, - "learning_rate": 2.0216007934820245e-06, - "loss": 0.1594, - "step": 1305800 - }, - { - "epoch": 13.3, - "learning_rate": 2.019207562422084e-06, - "loss": 0.2922, - "step": 1305900 - }, - { - "epoch": 13.31, - "learning_rate": 2.016815699455915e-06, - "loss": 0.2299, - "step": 1306000 - }, - { - "epoch": 13.31, - "learning_rate": 2.014425204700464e-06, - "loss": 0.1795, - "step": 1306100 - }, - { - "epoch": 13.31, - "learning_rate": 2.0120360782726076e-06, - "loss": 0.2301, - "step": 1306200 - }, - { - "epoch": 13.31, - "learning_rate": 2.0096483202891658e-06, - "loss": 0.2518, - "step": 1306300 - }, - { - "epoch": 13.31, - "learning_rate": 2.0072619308668815e-06, - "loss": 0.2624, - "step": 1306400 - }, - { - "epoch": 13.31, - "learning_rate": 2.004876910122442e-06, - "loss": 0.2199, - "step": 1306500 - }, - { - "epoch": 13.31, - "learning_rate": 2.002493258172453e-06, - "loss": 0.2245, - "step": 1306600 - }, - { - "epoch": 13.31, - "learning_rate": 2.000110975133459e-06, - "loss": 0.2473, - "step": 1306700 - }, - { - "epoch": 13.31, - "learning_rate": 1.997730061121953e-06, - "loss": 0.2481, - "step": 1306800 - }, - { - "epoch": 13.31, - "learning_rate": 1.995350516254332e-06, - "loss": 0.1975, - "step": 1306900 - }, - { - "epoch": 13.32, - "learning_rate": 1.992972340646946e-06, - "loss": 0.1776, - "step": 1307000 - }, - { - "epoch": 13.32, - "learning_rate": 1.9905955344160754e-06, - "loss": 0.2122, - "step": 1307100 - }, - { - "epoch": 13.32, - "learning_rate": 1.988220097677934e-06, - "loss": 0.2808, - "step": 1307200 - }, - { - "epoch": 13.32, - "learning_rate": 1.9858460305486626e-06, - "loss": 0.196, - "step": 1307300 - }, - { - "epoch": 13.32, - "learning_rate": 1.9834733331443445e-06, - "loss": 0.2234, - "step": 1307400 - }, - { - "epoch": 13.32, - "learning_rate": 1.9811020055809836e-06, - "loss": 0.2014, - "step": 1307500 - }, - { - "epoch": 13.32, - "learning_rate": 1.9787320479745276e-06, - "loss": 0.2046, - "step": 1307600 - }, - { - "epoch": 13.32, - "learning_rate": 1.976363460440853e-06, - "loss": 0.2341, - "step": 1307700 - }, - { - "epoch": 13.32, - "learning_rate": 1.973996243095768e-06, - "loss": 0.2147, - "step": 1307800 - }, - { - "epoch": 13.33, - "learning_rate": 1.971630396055013e-06, - "loss": 0.214, - "step": 1307900 - }, - { - "epoch": 13.33, - "learning_rate": 1.9692659194342688e-06, - "loss": 0.2123, - "step": 1308000 - }, - { - "epoch": 13.33, - "learning_rate": 1.9669028133491496e-06, - "loss": 0.1957, - "step": 1308100 - }, - { - "epoch": 13.33, - "learning_rate": 1.964564688484416e-06, - "loss": 0.1706, - "step": 1308200 - }, - { - "epoch": 13.33, - "learning_rate": 1.962204310108845e-06, - "loss": 0.2276, - "step": 1308300 - }, - { - "epoch": 13.33, - "learning_rate": 1.959845302614166e-06, - "loss": 0.2188, - "step": 1308400 - }, - { - "epoch": 13.33, - "learning_rate": 1.9574876661157172e-06, - "loss": 0.1887, - "step": 1308500 - }, - { - "epoch": 13.33, - "learning_rate": 1.955131400728769e-06, - "loss": 0.1907, - "step": 1308600 - }, - { - "epoch": 13.33, - "learning_rate": 1.952776506568533e-06, - "loss": 0.2465, - "step": 1308700 - }, - { - "epoch": 13.33, - "learning_rate": 1.950446512189814e-06, - "loss": 0.2615, - "step": 1308800 - }, - { - "epoch": 13.34, - "learning_rate": 1.9480943471132174e-06, - "loss": 0.235, - "step": 1308900 - }, - { - "epoch": 13.34, - "learning_rate": 1.945743553607404e-06, - "loss": 0.2284, - "step": 1309000 - }, - { - "epoch": 13.34, - "learning_rate": 1.9433941317873094e-06, - "loss": 0.1765, - "step": 1309100 - }, - { - "epoch": 13.34, - "learning_rate": 1.9410460817678077e-06, - "loss": 0.215, - "step": 1309200 - }, - { - "epoch": 13.34, - "learning_rate": 1.9386994036637073e-06, - "loss": 0.2087, - "step": 1309300 - }, - { - "epoch": 13.34, - "learning_rate": 1.9363540975897463e-06, - "loss": 0.1956, - "step": 1309400 - }, - { - "epoch": 13.34, - "learning_rate": 1.9340101636605935e-06, - "loss": 0.2231, - "step": 1309500 - }, - { - "epoch": 13.34, - "learning_rate": 1.931667601990853e-06, - "loss": 0.214, - "step": 1309600 - }, - { - "epoch": 13.34, - "learning_rate": 1.9293264126950638e-06, - "loss": 0.1919, - "step": 1309700 - }, - { - "epoch": 13.34, - "learning_rate": 1.926986595887694e-06, - "loss": 0.2319, - "step": 1309800 - }, - { - "epoch": 13.35, - "learning_rate": 1.924648151683153e-06, - "loss": 0.2461, - "step": 1309900 - }, - { - "epoch": 13.35, - "learning_rate": 1.9223110801957745e-06, - "loss": 0.2181, - "step": 1310000 - }, - { - "epoch": 13.35, - "learning_rate": 1.9199753815398215e-06, - "loss": 0.2366, - "step": 1310100 - }, - { - "epoch": 13.35, - "learning_rate": 1.9176410558294956e-06, - "loss": 0.1822, - "step": 1310200 - }, - { - "epoch": 13.35, - "learning_rate": 1.9153081031789454e-06, - "loss": 0.2044, - "step": 1310300 - }, - { - "epoch": 13.35, - "learning_rate": 1.91297652370222e-06, - "loss": 0.1818, - "step": 1310400 - }, - { - "epoch": 13.35, - "learning_rate": 1.9106463175133314e-06, - "loss": 0.1839, - "step": 1310500 - }, - { - "epoch": 13.35, - "learning_rate": 1.908317484726212e-06, - "loss": 0.2503, - "step": 1310600 - }, - { - "epoch": 13.35, - "learning_rate": 1.9059900254547203e-06, - "loss": 0.2297, - "step": 1310700 - }, - { - "epoch": 13.35, - "learning_rate": 1.9036639398126587e-06, - "loss": 0.2208, - "step": 1310800 - }, - { - "epoch": 13.36, - "learning_rate": 1.901339227913763e-06, - "loss": 0.1812, - "step": 1310900 - }, - { - "epoch": 13.36, - "learning_rate": 1.8990158898716925e-06, - "loss": 0.1603, - "step": 1311000 - }, - { - "epoch": 13.36, - "learning_rate": 1.8966939258000494e-06, - "loss": 0.1914, - "step": 1311100 - }, - { - "epoch": 13.36, - "learning_rate": 1.8943733358123628e-06, - "loss": 0.206, - "step": 1311200 - }, - { - "epoch": 13.36, - "learning_rate": 1.8920541200220954e-06, - "loss": 0.2413, - "step": 1311300 - }, - { - "epoch": 13.36, - "learning_rate": 1.8897362785426398e-06, - "loss": 0.2332, - "step": 1311400 - }, - { - "epoch": 13.36, - "learning_rate": 1.8874198114873253e-06, - "loss": 0.1768, - "step": 1311500 - }, - { - "epoch": 13.36, - "learning_rate": 1.8851047189694215e-06, - "loss": 0.2015, - "step": 1311600 - }, - { - "epoch": 13.36, - "learning_rate": 1.8827910011021076e-06, - "loss": 0.1734, - "step": 1311700 - }, - { - "epoch": 13.36, - "learning_rate": 1.8804786579985233e-06, - "loss": 0.2484, - "step": 1311800 - }, - { - "epoch": 13.37, - "learning_rate": 1.8781676897717282e-06, - "loss": 0.2143, - "step": 1311900 - }, - { - "epoch": 13.37, - "learning_rate": 1.875858096534705e-06, - "loss": 0.2038, - "step": 1312000 - }, - { - "epoch": 13.37, - "learning_rate": 1.8735498784003834e-06, - "loss": 0.1588, - "step": 1312100 - }, - { - "epoch": 13.37, - "learning_rate": 1.8712430354816235e-06, - "loss": 0.2342, - "step": 1312200 - }, - { - "epoch": 13.37, - "learning_rate": 1.868937567891218e-06, - "loss": 0.2273, - "step": 1312300 - }, - { - "epoch": 13.37, - "learning_rate": 1.8666334757418868e-06, - "loss": 0.2276, - "step": 1312400 - }, - { - "epoch": 13.37, - "learning_rate": 1.86433075914629e-06, - "loss": 0.1749, - "step": 1312500 - }, - { - "epoch": 13.37, - "learning_rate": 1.8620294182170106e-06, - "loss": 0.1842, - "step": 1312600 - }, - { - "epoch": 13.37, - "learning_rate": 1.8597294530665755e-06, - "loss": 0.2137, - "step": 1312700 - }, - { - "epoch": 13.38, - "learning_rate": 1.8574308638074378e-06, - "loss": 0.1913, - "step": 1312800 - }, - { - "epoch": 13.38, - "learning_rate": 1.8551566158729527e-06, - "loss": 0.2594, - "step": 1312900 - }, - { - "epoch": 13.38, - "learning_rate": 1.852860764971792e-06, - "loss": 0.2749, - "step": 1313000 - }, - { - "epoch": 13.38, - "learning_rate": 1.8505662902977639e-06, - "loss": 0.198, - "step": 1313100 - }, - { - "epoch": 13.38, - "learning_rate": 1.8482731919630547e-06, - "loss": 0.2161, - "step": 1313200 - }, - { - "epoch": 13.38, - "learning_rate": 1.8459814700797883e-06, - "loss": 0.2551, - "step": 1313300 - }, - { - "epoch": 13.38, - "learning_rate": 1.8436911247600151e-06, - "loss": 0.2322, - "step": 1313400 - }, - { - "epoch": 13.38, - "learning_rate": 1.8414021561157158e-06, - "loss": 0.1807, - "step": 1313500 - }, - { - "epoch": 13.38, - "learning_rate": 1.8391145642588037e-06, - "loss": 0.2258, - "step": 1313600 - }, - { - "epoch": 13.38, - "learning_rate": 1.8368283493011428e-06, - "loss": 0.1659, - "step": 1313700 - }, - { - "epoch": 13.39, - "learning_rate": 1.8345435113545006e-06, - "loss": 0.1867, - "step": 1313800 - }, - { - "epoch": 13.39, - "learning_rate": 1.8322600505306009e-06, - "loss": 0.1888, - "step": 1313900 - }, - { - "epoch": 13.39, - "learning_rate": 1.829977966941091e-06, - "loss": 0.211, - "step": 1314000 - }, - { - "epoch": 13.39, - "learning_rate": 1.827697260697545e-06, - "loss": 0.2125, - "step": 1314100 - }, - { - "epoch": 13.39, - "learning_rate": 1.8254179319114806e-06, - "loss": 0.1995, - "step": 1314200 - }, - { - "epoch": 13.39, - "learning_rate": 1.823139980694345e-06, - "loss": 0.1801, - "step": 1314300 - }, - { - "epoch": 13.39, - "learning_rate": 1.8208861660729914e-06, - "loss": 0.1892, - "step": 1314400 - }, - { - "epoch": 13.39, - "learning_rate": 1.818610956549308e-06, - "loss": 0.2073, - "step": 1314500 - }, - { - "epoch": 13.39, - "learning_rate": 1.8163371249273675e-06, - "loss": 0.2189, - "step": 1314600 - }, - { - "epoch": 13.39, - "learning_rate": 1.8140646713183505e-06, - "loss": 0.204, - "step": 1314700 - }, - { - "epoch": 13.4, - "learning_rate": 1.8117935958333653e-06, - "loss": 0.2228, - "step": 1314800 - }, - { - "epoch": 13.4, - "learning_rate": 1.809523898583456e-06, - "loss": 0.1621, - "step": 1314900 - }, - { - "epoch": 13.4, - "learning_rate": 1.8072555796795942e-06, - "loss": 0.2624, - "step": 1315000 - }, - { - "epoch": 13.4, - "learning_rate": 1.8049886392326942e-06, - "loss": 0.2534, - "step": 1315100 - }, - { - "epoch": 13.4, - "learning_rate": 1.8027230773535874e-06, - "loss": 0.2113, - "step": 1315200 - }, - { - "epoch": 13.4, - "learning_rate": 1.8004588941530486e-06, - "loss": 0.1925, - "step": 1315300 - }, - { - "epoch": 13.4, - "learning_rate": 1.7981960897417892e-06, - "loss": 0.2222, - "step": 1315400 - }, - { - "epoch": 13.4, - "learning_rate": 1.7959346642304374e-06, - "loss": 0.2975, - "step": 1315500 - }, - { - "epoch": 13.4, - "learning_rate": 1.793674617729565e-06, - "loss": 0.2362, - "step": 1315600 - }, - { - "epoch": 13.4, - "learning_rate": 1.7914159503496797e-06, - "loss": 0.2463, - "step": 1315700 - }, - { - "epoch": 13.41, - "learning_rate": 1.7891586622012168e-06, - "loss": 0.2568, - "step": 1315800 - }, - { - "epoch": 13.41, - "learning_rate": 1.7869027533945381e-06, - "loss": 0.2061, - "step": 1315900 - }, - { - "epoch": 13.41, - "learning_rate": 1.7846482240399487e-06, - "loss": 0.2389, - "step": 1316000 - }, - { - "epoch": 13.41, - "learning_rate": 1.7823950742476803e-06, - "loss": 0.2261, - "step": 1316100 - }, - { - "epoch": 13.41, - "learning_rate": 1.780143304127898e-06, - "loss": 0.2666, - "step": 1316200 - }, - { - "epoch": 13.41, - "learning_rate": 1.7778929137907008e-06, - "loss": 0.1873, - "step": 1316300 - }, - { - "epoch": 13.41, - "learning_rate": 1.775643903346127e-06, - "loss": 0.1975, - "step": 1316400 - }, - { - "epoch": 13.41, - "learning_rate": 1.7733962729041254e-06, - "loss": 0.2175, - "step": 1316500 - }, - { - "epoch": 13.41, - "learning_rate": 1.771150022574598e-06, - "loss": 0.2301, - "step": 1316600 - }, - { - "epoch": 13.41, - "learning_rate": 1.768905152467377e-06, - "loss": 0.2216, - "step": 1316700 - }, - { - "epoch": 13.42, - "learning_rate": 1.7666840907569638e-06, - "loss": 0.1657, - "step": 1316800 - }, - { - "epoch": 13.42, - "learning_rate": 1.7644419676186018e-06, - "loss": 0.1761, - "step": 1316900 - }, - { - "epoch": 13.42, - "learning_rate": 1.7622012250305274e-06, - "loss": 0.1924, - "step": 1317000 - }, - { - "epoch": 13.42, - "learning_rate": 1.7599618631022963e-06, - "loss": 0.185, - "step": 1317100 - }, - { - "epoch": 13.42, - "learning_rate": 1.7577238819434037e-06, - "loss": 0.226, - "step": 1317200 - }, - { - "epoch": 13.42, - "learning_rate": 1.7554872816632728e-06, - "loss": 0.2805, - "step": 1317300 - }, - { - "epoch": 13.42, - "learning_rate": 1.7532520623712556e-06, - "loss": 0.1638, - "step": 1317400 - }, - { - "epoch": 13.42, - "learning_rate": 1.7510182241766482e-06, - "loss": 0.2193, - "step": 1317500 - }, - { - "epoch": 13.42, - "learning_rate": 1.7487857671886665e-06, - "loss": 0.1912, - "step": 1317600 - }, - { - "epoch": 13.42, - "learning_rate": 1.7465546915164665e-06, - "loss": 0.2153, - "step": 1317700 - }, - { - "epoch": 13.43, - "learning_rate": 1.7443249972691378e-06, - "loss": 0.2127, - "step": 1317800 - }, - { - "epoch": 13.43, - "learning_rate": 1.7420966845556928e-06, - "loss": 0.2098, - "step": 1317900 - }, - { - "epoch": 13.43, - "learning_rate": 1.739869753485085e-06, - "loss": 0.185, - "step": 1318000 - }, - { - "epoch": 13.43, - "learning_rate": 1.7376442041661999e-06, - "loss": 0.229, - "step": 1318100 - }, - { - "epoch": 13.43, - "learning_rate": 1.7354200367078544e-06, - "loss": 0.2156, - "step": 1318200 - }, - { - "epoch": 13.43, - "learning_rate": 1.733197251218791e-06, - "loss": 0.2034, - "step": 1318300 - }, - { - "epoch": 13.43, - "learning_rate": 1.7309758478076965e-06, - "loss": 0.2029, - "step": 1318400 - }, - { - "epoch": 13.43, - "learning_rate": 1.7287558265831838e-06, - "loss": 0.2004, - "step": 1318500 - }, - { - "epoch": 13.43, - "learning_rate": 1.7265371876537962e-06, - "loss": 0.206, - "step": 1318600 - }, - { - "epoch": 13.44, - "learning_rate": 1.72431993112801e-06, - "loss": 0.1752, - "step": 1318700 - }, - { - "epoch": 13.44, - "learning_rate": 1.7221040571142387e-06, - "loss": 0.1994, - "step": 1318800 - }, - { - "epoch": 13.44, - "learning_rate": 1.719889565720829e-06, - "loss": 0.2435, - "step": 1318900 - }, - { - "epoch": 13.44, - "learning_rate": 1.7176764570560477e-06, - "loss": 0.1548, - "step": 1319000 - }, - { - "epoch": 13.44, - "learning_rate": 1.7154647312281146e-06, - "loss": 0.149, - "step": 1319100 - }, - { - "epoch": 13.44, - "learning_rate": 1.7132764849280536e-06, - "loss": 0.2329, - "step": 1319200 - }, - { - "epoch": 13.44, - "learning_rate": 1.7110675112670915e-06, - "loss": 0.2269, - "step": 1319300 - }, - { - "epoch": 13.44, - "learning_rate": 1.7088599207661027e-06, - "loss": 0.2215, - "step": 1319400 - }, - { - "epoch": 13.44, - "learning_rate": 1.7066537135330307e-06, - "loss": 0.1804, - "step": 1319500 - }, - { - "epoch": 13.44, - "learning_rate": 1.7044488896757527e-06, - "loss": 0.2262, - "step": 1319600 - }, - { - "epoch": 13.45, - "learning_rate": 1.702245449302059e-06, - "loss": 0.2137, - "step": 1319700 - }, - { - "epoch": 13.45, - "learning_rate": 1.7000433925196902e-06, - "loss": 0.2126, - "step": 1319800 - }, - { - "epoch": 13.45, - "learning_rate": 1.6978427194363165e-06, - "loss": 0.1987, - "step": 1319900 - }, - { - "epoch": 13.45, - "learning_rate": 1.6956434301595324e-06, - "loss": 0.1603, - "step": 1320000 - }, - { - "epoch": 13.45, - "learning_rate": 1.693445524796875e-06, - "loss": 0.1622, - "step": 1320100 - }, - { - "epoch": 13.45, - "learning_rate": 1.6912490034558115e-06, - "loss": 0.2165, - "step": 1320200 - }, - { - "epoch": 13.45, - "learning_rate": 1.689053866243726e-06, - "loss": 0.2252, - "step": 1320300 - }, - { - "epoch": 13.45, - "learning_rate": 1.6868601132679594e-06, - "loss": 0.2113, - "step": 1320400 - }, - { - "epoch": 13.45, - "learning_rate": 1.6846677446357694e-06, - "loss": 0.2175, - "step": 1320500 - }, - { - "epoch": 13.45, - "learning_rate": 1.6824767604543501e-06, - "loss": 0.2395, - "step": 1320600 - }, - { - "epoch": 13.46, - "learning_rate": 1.6802871608308256e-06, - "loss": 0.2008, - "step": 1320700 - }, - { - "epoch": 13.46, - "learning_rate": 1.6780989458722573e-06, - "loss": 0.2328, - "step": 1320800 - }, - { - "epoch": 13.46, - "learning_rate": 1.675912115685636e-06, - "loss": 0.1976, - "step": 1320900 - }, - { - "epoch": 13.46, - "learning_rate": 1.6737266703778798e-06, - "loss": 0.223, - "step": 1321000 - }, - { - "epoch": 13.46, - "learning_rate": 1.6715426100558461e-06, - "loss": 0.2047, - "step": 1321100 - }, - { - "epoch": 13.46, - "learning_rate": 1.6693599348263234e-06, - "loss": 0.1869, - "step": 1321200 - }, - { - "epoch": 13.46, - "learning_rate": 1.6671786447960324e-06, - "loss": 0.2188, - "step": 1321300 - }, - { - "epoch": 13.46, - "learning_rate": 1.6649987400716215e-06, - "loss": 0.2397, - "step": 1321400 - }, - { - "epoch": 13.46, - "learning_rate": 1.6628202207596855e-06, - "loss": 0.2096, - "step": 1321500 - }, - { - "epoch": 13.46, - "learning_rate": 1.660643086966729e-06, - "loss": 0.207, - "step": 1321600 - }, - { - "epoch": 13.47, - "learning_rate": 1.6584673387992033e-06, - "loss": 0.2228, - "step": 1321700 - }, - { - "epoch": 13.47, - "learning_rate": 1.6562929763634937e-06, - "loss": 0.2435, - "step": 1321800 - }, - { - "epoch": 13.47, - "learning_rate": 1.6541199997659117e-06, - "loss": 0.2379, - "step": 1321900 - }, - { - "epoch": 13.47, - "learning_rate": 1.651948409112699e-06, - "loss": 0.2032, - "step": 1322000 - }, - { - "epoch": 13.47, - "learning_rate": 1.6497782045100407e-06, - "loss": 0.2393, - "step": 1322100 - }, - { - "epoch": 13.47, - "learning_rate": 1.6476093860640417e-06, - "loss": 0.2008, - "step": 1322200 - }, - { - "epoch": 13.47, - "learning_rate": 1.6454419538807442e-06, - "loss": 0.2627, - "step": 1322300 - }, - { - "epoch": 13.47, - "learning_rate": 1.643275908066123e-06, - "loss": 0.2101, - "step": 1322400 - }, - { - "epoch": 13.47, - "learning_rate": 1.641111248726087e-06, - "loss": 0.218, - "step": 1322500 - }, - { - "epoch": 13.47, - "learning_rate": 1.6389479759664715e-06, - "loss": 0.1734, - "step": 1322600 - }, - { - "epoch": 13.48, - "learning_rate": 1.636786089893052e-06, - "loss": 0.1947, - "step": 1322700 - }, - { - "epoch": 13.48, - "learning_rate": 1.6346255906115337e-06, - "loss": 0.2089, - "step": 1322800 - }, - { - "epoch": 13.48, - "learning_rate": 1.6324664782275456e-06, - "loss": 0.2132, - "step": 1322900 - }, - { - "epoch": 13.48, - "learning_rate": 1.6303087528466565e-06, - "loss": 0.2319, - "step": 1323000 - }, - { - "epoch": 13.48, - "learning_rate": 1.628152414574372e-06, - "loss": 0.1677, - "step": 1323100 - }, - { - "epoch": 13.48, - "learning_rate": 1.625997463516118e-06, - "loss": 0.1873, - "step": 1323200 - }, - { - "epoch": 13.48, - "learning_rate": 1.6238438997772564e-06, - "loss": 0.2386, - "step": 1323300 - }, - { - "epoch": 13.48, - "learning_rate": 1.6216917234630969e-06, - "loss": 0.2216, - "step": 1323400 - }, - { - "epoch": 13.48, - "learning_rate": 1.6195409346788515e-06, - "loss": 0.2242, - "step": 1323500 - }, - { - "epoch": 13.49, - "learning_rate": 1.617391533529693e-06, - "loss": 0.1793, - "step": 1323600 - }, - { - "epoch": 13.49, - "learning_rate": 1.6152435201207072e-06, - "loss": 0.1825, - "step": 1323700 - }, - { - "epoch": 13.49, - "learning_rate": 1.6130968945569202e-06, - "loss": 0.2579, - "step": 1323800 - }, - { - "epoch": 13.49, - "learning_rate": 1.6109516569432913e-06, - "loss": 0.2158, - "step": 1323900 - }, - { - "epoch": 13.49, - "learning_rate": 1.6088078073847135e-06, - "loss": 0.2043, - "step": 1324000 - }, - { - "epoch": 13.49, - "learning_rate": 1.6066653459860058e-06, - "loss": 0.2266, - "step": 1324100 - }, - { - "epoch": 13.49, - "learning_rate": 1.604524272851915e-06, - "loss": 0.203, - "step": 1324200 - }, - { - "epoch": 13.49, - "learning_rate": 1.6023845880871335e-06, - "loss": 0.1909, - "step": 1324300 - }, - { - "epoch": 13.49, - "learning_rate": 1.6002462917962813e-06, - "loss": 0.2193, - "step": 1324400 - }, - { - "epoch": 13.49, - "learning_rate": 1.598109384083898e-06, - "loss": 0.1878, - "step": 1324500 - }, - { - "epoch": 13.5, - "learning_rate": 1.5959738650544731e-06, - "loss": 0.221, - "step": 1324600 - }, - { - "epoch": 13.5, - "learning_rate": 1.5938397348124268e-06, - "loss": 0.2068, - "step": 1324700 - }, - { - "epoch": 13.5, - "learning_rate": 1.591706993462092e-06, - "loss": 0.2137, - "step": 1324800 - }, - { - "epoch": 13.5, - "learning_rate": 1.5895756411077555e-06, - "loss": 0.2195, - "step": 1324900 - }, - { - "epoch": 13.5, - "learning_rate": 1.587445677853624e-06, - "loss": 0.2003, - "step": 1325000 - }, - { - "epoch": 13.5, - "learning_rate": 1.5853171038038406e-06, - "loss": 0.2007, - "step": 1325100 - }, - { - "epoch": 13.5, - "learning_rate": 1.5831899190624821e-06, - "loss": 0.171, - "step": 1325200 - }, - { - "epoch": 13.5, - "learning_rate": 1.5810641237335555e-06, - "loss": 0.2751, - "step": 1325300 - }, - { - "epoch": 13.5, - "learning_rate": 1.5789397179209974e-06, - "loss": 0.2256, - "step": 1325400 - }, - { - "epoch": 13.5, - "learning_rate": 1.5768167017286783e-06, - "loss": 0.1816, - "step": 1325500 - }, - { - "epoch": 13.51, - "learning_rate": 1.574695075260405e-06, - "loss": 0.2249, - "step": 1325600 - }, - { - "epoch": 13.51, - "learning_rate": 1.5725960341063206e-06, - "loss": 0.2043, - "step": 1325700 - }, - { - "epoch": 13.51, - "learning_rate": 1.570477173497442e-06, - "loss": 0.2276, - "step": 1325800 - }, - { - "epoch": 13.51, - "learning_rate": 1.5683597029225682e-06, - "loss": 0.2454, - "step": 1325900 - }, - { - "epoch": 13.51, - "learning_rate": 1.5662436224852328e-06, - "loss": 0.1742, - "step": 1326000 - }, - { - "epoch": 13.51, - "learning_rate": 1.564128932288903e-06, - "loss": 0.2316, - "step": 1326100 - }, - { - "epoch": 13.51, - "learning_rate": 1.5620156324369695e-06, - "loss": 0.1776, - "step": 1326200 - }, - { - "epoch": 13.51, - "learning_rate": 1.5599248352437457e-06, - "loss": 0.1664, - "step": 1326300 - }, - { - "epoch": 13.51, - "learning_rate": 1.5578143024845038e-06, - "loss": 0.199, - "step": 1326400 - }, - { - "epoch": 13.51, - "learning_rate": 1.5557051603784112e-06, - "loss": 0.234, - "step": 1326500 - }, - { - "epoch": 13.52, - "learning_rate": 1.553597409028592e-06, - "loss": 0.2226, - "step": 1326600 - }, - { - "epoch": 13.52, - "learning_rate": 1.551491048538094e-06, - "loss": 0.2209, - "step": 1326700 - }, - { - "epoch": 13.52, - "learning_rate": 1.5493860790099147e-06, - "loss": 0.2261, - "step": 1326800 - }, - { - "epoch": 13.52, - "learning_rate": 1.5472825005469716e-06, - "loss": 0.1683, - "step": 1326900 - }, - { - "epoch": 13.52, - "learning_rate": 1.5451803132521159e-06, - "loss": 0.1759, - "step": 1327000 - }, - { - "epoch": 13.52, - "learning_rate": 1.5430795172281354e-06, - "loss": 0.2385, - "step": 1327100 - }, - { - "epoch": 13.52, - "learning_rate": 1.5409801125777413e-06, - "loss": 0.2113, - "step": 1327200 - }, - { - "epoch": 13.52, - "learning_rate": 1.5388820994035913e-06, - "loss": 0.2189, - "step": 1327300 - }, - { - "epoch": 13.52, - "learning_rate": 1.5367854778082534e-06, - "loss": 0.2549, - "step": 1327400 - }, - { - "epoch": 13.52, - "learning_rate": 1.5346902478942493e-06, - "loss": 0.1863, - "step": 1327500 - }, - { - "epoch": 13.53, - "learning_rate": 1.5325964097640167e-06, - "loss": 0.2191, - "step": 1327600 - }, - { - "epoch": 13.53, - "learning_rate": 1.5305039635199369e-06, - "loss": 0.1811, - "step": 1327700 - }, - { - "epoch": 13.53, - "learning_rate": 1.5284129092643151e-06, - "loss": 0.1757, - "step": 1327800 - }, - { - "epoch": 13.53, - "learning_rate": 1.5263232470993994e-06, - "loss": 0.218, - "step": 1327900 - }, - { - "epoch": 13.53, - "learning_rate": 1.524234977127348e-06, - "loss": 0.2151, - "step": 1328000 - }, - { - "epoch": 13.53, - "learning_rate": 1.5221480994502724e-06, - "loss": 0.2304, - "step": 1328100 - }, - { - "epoch": 13.53, - "learning_rate": 1.5200626141702112e-06, - "loss": 0.2028, - "step": 1328200 - }, - { - "epoch": 13.53, - "learning_rate": 1.5179785213891261e-06, - "loss": 0.2024, - "step": 1328300 - }, - { - "epoch": 13.53, - "learning_rate": 1.5158958212089191e-06, - "loss": 0.1947, - "step": 1328400 - }, - { - "epoch": 13.53, - "learning_rate": 1.5138145137314253e-06, - "loss": 0.2214, - "step": 1328500 - }, - { - "epoch": 13.54, - "learning_rate": 1.511734599058403e-06, - "loss": 0.1764, - "step": 1328600 - }, - { - "epoch": 13.54, - "learning_rate": 1.509656077291548e-06, - "loss": 0.1671, - "step": 1328700 - }, - { - "epoch": 13.54, - "learning_rate": 1.5075789485324886e-06, - "loss": 0.2435, - "step": 1328800 - }, - { - "epoch": 13.54, - "learning_rate": 1.5055032128827873e-06, - "loss": 0.2395, - "step": 1328900 - }, - { - "epoch": 13.54, - "learning_rate": 1.5034288704439291e-06, - "loss": 0.216, - "step": 1329000 - }, - { - "epoch": 13.54, - "learning_rate": 1.5013559213173432e-06, - "loss": 0.2409, - "step": 1329100 - }, - { - "epoch": 13.54, - "learning_rate": 1.4992843656043852e-06, - "loss": 0.2223, - "step": 1329200 - }, - { - "epoch": 13.54, - "learning_rate": 1.497214203406334e-06, - "loss": 0.197, - "step": 1329300 - }, - { - "epoch": 13.54, - "learning_rate": 1.495145434824412e-06, - "loss": 0.1817, - "step": 1329400 - }, - { - "epoch": 13.55, - "learning_rate": 1.4930780599597748e-06, - "loss": 0.1826, - "step": 1329500 - }, - { - "epoch": 13.55, - "learning_rate": 1.4910120789134985e-06, - "loss": 0.2249, - "step": 1329600 - }, - { - "epoch": 13.55, - "learning_rate": 1.4889474917865953e-06, - "loss": 0.2005, - "step": 1329700 - }, - { - "epoch": 13.55, - "learning_rate": 1.4868842986800212e-06, - "loss": 0.16, - "step": 1329800 - }, - { - "epoch": 13.55, - "learning_rate": 1.4848224996946426e-06, - "loss": 0.1925, - "step": 1329900 - }, - { - "epoch": 13.55, - "learning_rate": 1.4827620949312714e-06, - "loss": 0.2348, - "step": 1330000 - }, - { - "epoch": 13.55, - "learning_rate": 1.4807030844906545e-06, - "loss": 0.1808, - "step": 1330100 - }, - { - "epoch": 13.55, - "learning_rate": 1.478645468473464e-06, - "loss": 0.2362, - "step": 1330200 - }, - { - "epoch": 13.55, - "learning_rate": 1.4765892469803e-06, - "loss": 0.2008, - "step": 1330300 - }, - { - "epoch": 13.55, - "learning_rate": 1.4745549614766708e-06, - "loss": 0.2589, - "step": 1330400 - }, - { - "epoch": 13.56, - "learning_rate": 1.472501515385365e-06, - "loss": 0.1998, - "step": 1330500 - }, - { - "epoch": 13.56, - "learning_rate": 1.470449464118495e-06, - "loss": 0.1866, - "step": 1330600 - }, - { - "epoch": 13.56, - "learning_rate": 1.4684193074345987e-06, - "loss": 0.2029, - "step": 1330700 - }, - { - "epoch": 13.56, - "learning_rate": 1.4663700321667761e-06, - "loss": 0.2353, - "step": 1330800 - }, - { - "epoch": 13.56, - "learning_rate": 1.4643221520231797e-06, - "loss": 0.2346, - "step": 1330900 - }, - { - "epoch": 13.56, - "learning_rate": 1.4622756671039327e-06, - "loss": 0.2436, - "step": 1331000 - }, - { - "epoch": 13.56, - "learning_rate": 1.4602305775091086e-06, - "loss": 0.2584, - "step": 1331100 - }, - { - "epoch": 13.56, - "learning_rate": 1.4581868833386902e-06, - "loss": 0.1848, - "step": 1331200 - }, - { - "epoch": 13.56, - "learning_rate": 1.4561445846926013e-06, - "loss": 0.1686, - "step": 1331300 - }, - { - "epoch": 13.56, - "learning_rate": 1.4541036816707087e-06, - "loss": 0.2031, - "step": 1331400 - }, - { - "epoch": 13.57, - "learning_rate": 1.4520641743727859e-06, - "loss": 0.2068, - "step": 1331500 - }, - { - "epoch": 13.57, - "learning_rate": 1.4500260628985628e-06, - "loss": 0.2234, - "step": 1331600 - }, - { - "epoch": 13.57, - "learning_rate": 1.4479893473476835e-06, - "loss": 0.2105, - "step": 1331700 - }, - { - "epoch": 13.57, - "learning_rate": 1.4459540278197413e-06, - "loss": 0.2155, - "step": 1331800 - }, - { - "epoch": 13.57, - "learning_rate": 1.4439201044142435e-06, - "loss": 0.1634, - "step": 1331900 - }, - { - "epoch": 13.57, - "learning_rate": 1.4418875772306372e-06, - "loss": 0.1598, - "step": 1332000 - }, - { - "epoch": 13.57, - "learning_rate": 1.4398564463683062e-06, - "loss": 0.1878, - "step": 1332100 - }, - { - "epoch": 13.57, - "learning_rate": 1.4378267119265541e-06, - "loss": 0.2004, - "step": 1332200 - }, - { - "epoch": 13.57, - "learning_rate": 1.4357983740046253e-06, - "loss": 0.2845, - "step": 1332300 - }, - { - "epoch": 13.57, - "learning_rate": 1.4337714327017004e-06, - "loss": 0.2445, - "step": 1332400 - }, - { - "epoch": 13.58, - "learning_rate": 1.4317458881168698e-06, - "loss": 0.216, - "step": 1332500 - }, - { - "epoch": 13.58, - "learning_rate": 1.4297217403491814e-06, - "loss": 0.2543, - "step": 1332600 - }, - { - "epoch": 13.58, - "learning_rate": 1.4276989894976055e-06, - "loss": 0.1918, - "step": 1332700 - }, - { - "epoch": 13.58, - "learning_rate": 1.4256978422838485e-06, - "loss": 0.2331, - "step": 1332800 - }, - { - "epoch": 13.58, - "learning_rate": 1.4236778715894906e-06, - "loss": 0.1701, - "step": 1332900 - }, - { - "epoch": 13.58, - "learning_rate": 1.4216592981067488e-06, - "loss": 0.237, - "step": 1333000 - }, - { - "epoch": 13.58, - "learning_rate": 1.4196421219343213e-06, - "loss": 0.2225, - "step": 1333100 - }, - { - "epoch": 13.58, - "learning_rate": 1.417626343170839e-06, - "loss": 0.1462, - "step": 1333200 - }, - { - "epoch": 13.58, - "learning_rate": 1.4156119619148567e-06, - "loss": 0.2294, - "step": 1333300 - }, - { - "epoch": 13.58, - "learning_rate": 1.413598978264865e-06, - "loss": 0.2165, - "step": 1333400 - }, - { - "epoch": 13.59, - "learning_rate": 1.4115873923192924e-06, - "loss": 0.1814, - "step": 1333500 - }, - { - "epoch": 13.59, - "learning_rate": 1.4095772041764833e-06, - "loss": 0.1833, - "step": 1333600 - }, - { - "epoch": 13.59, - "learning_rate": 1.4075684139347322e-06, - "loss": 0.1908, - "step": 1333700 - }, - { - "epoch": 13.59, - "learning_rate": 1.405561021692261e-06, - "loss": 0.2033, - "step": 1333800 - }, - { - "epoch": 13.59, - "learning_rate": 1.4035750805677516e-06, - "loss": 0.181, - "step": 1333900 - }, - { - "epoch": 13.59, - "learning_rate": 1.4015704706357657e-06, - "loss": 0.213, - "step": 1334000 - }, - { - "epoch": 13.59, - "learning_rate": 1.3995672589963148e-06, - "loss": 0.2457, - "step": 1334100 - }, - { - "epoch": 13.59, - "learning_rate": 1.397565445747344e-06, - "loss": 0.2266, - "step": 1334200 - }, - { - "epoch": 13.59, - "learning_rate": 1.3955650309867352e-06, - "loss": 0.178, - "step": 1334300 - }, - { - "epoch": 13.6, - "learning_rate": 1.3935660148122898e-06, - "loss": 0.2108, - "step": 1334400 - }, - { - "epoch": 13.6, - "learning_rate": 1.3915683973217562e-06, - "loss": 0.2591, - "step": 1334500 - }, - { - "epoch": 13.6, - "learning_rate": 1.3895721786128034e-06, - "loss": 0.2117, - "step": 1334600 - }, - { - "epoch": 13.6, - "learning_rate": 1.3875773587830297e-06, - "loss": 0.2497, - "step": 1334700 - }, - { - "epoch": 13.6, - "learning_rate": 1.385583937929974e-06, - "loss": 0.2299, - "step": 1334800 - }, - { - "epoch": 13.6, - "learning_rate": 1.3835919161511012e-06, - "loss": 0.2171, - "step": 1334900 - }, - { - "epoch": 13.6, - "learning_rate": 1.3816012935438137e-06, - "loss": 0.2499, - "step": 1335000 - }, - { - "epoch": 13.6, - "learning_rate": 1.3796120702054337e-06, - "loss": 0.1941, - "step": 1335100 - }, - { - "epoch": 13.6, - "learning_rate": 1.3776242462332267e-06, - "loss": 0.2182, - "step": 1335200 - }, - { - "epoch": 13.6, - "learning_rate": 1.3756378217243882e-06, - "loss": 0.2139, - "step": 1335300 - }, - { - "epoch": 13.61, - "learning_rate": 1.3736527967760337e-06, - "loss": 0.2329, - "step": 1335400 - }, - { - "epoch": 13.61, - "learning_rate": 1.3716691714852258e-06, - "loss": 0.1933, - "step": 1335500 - }, - { - "epoch": 13.61, - "learning_rate": 1.3696869459489502e-06, - "loss": 0.1982, - "step": 1335600 - }, - { - "epoch": 13.61, - "learning_rate": 1.367706120264126e-06, - "loss": 0.1832, - "step": 1335700 - }, - { - "epoch": 13.61, - "learning_rate": 1.3657266945276026e-06, - "loss": 0.1527, - "step": 1335800 - }, - { - "epoch": 13.61, - "learning_rate": 1.3637486688361688e-06, - "loss": 0.2471, - "step": 1335900 - }, - { - "epoch": 13.61, - "learning_rate": 1.3617720432865277e-06, - "loss": 0.22, - "step": 1336000 - }, - { - "epoch": 13.61, - "learning_rate": 1.3597968179753316e-06, - "loss": 0.2425, - "step": 1336100 - }, - { - "epoch": 13.61, - "learning_rate": 1.357822992999157e-06, - "loss": 0.2314, - "step": 1336200 - }, - { - "epoch": 13.61, - "learning_rate": 1.3558505684545097e-06, - "loss": 0.2347, - "step": 1336300 - }, - { - "epoch": 13.62, - "learning_rate": 1.353879544437826e-06, - "loss": 0.2114, - "step": 1336400 - }, - { - "epoch": 13.62, - "learning_rate": 1.3519099210454855e-06, - "loss": 0.2007, - "step": 1336500 - }, - { - "epoch": 13.62, - "learning_rate": 1.3499416983737879e-06, - "loss": 0.2172, - "step": 1336600 - }, - { - "epoch": 13.62, - "learning_rate": 1.3479748765189658e-06, - "loss": 0.2393, - "step": 1336700 - }, - { - "epoch": 13.62, - "learning_rate": 1.3460094555771829e-06, - "loss": 0.1808, - "step": 1336800 - }, - { - "epoch": 13.62, - "learning_rate": 1.344045435644542e-06, - "loss": 0.2147, - "step": 1336900 - }, - { - "epoch": 13.62, - "learning_rate": 1.3420828168170661e-06, - "loss": 0.2247, - "step": 1337000 - }, - { - "epoch": 13.62, - "learning_rate": 1.3401215991907224e-06, - "loss": 0.183, - "step": 1337100 - }, - { - "epoch": 13.62, - "learning_rate": 1.3381617828614001e-06, - "loss": 0.2341, - "step": 1337200 - }, - { - "epoch": 13.62, - "learning_rate": 1.3362033679249198e-06, - "loss": 0.2208, - "step": 1337300 - }, - { - "epoch": 13.63, - "learning_rate": 1.3342463544770378e-06, - "loss": 0.1977, - "step": 1337400 - }, - { - "epoch": 13.63, - "learning_rate": 1.3322907426134446e-06, - "loss": 0.1909, - "step": 1337500 - }, - { - "epoch": 13.63, - "learning_rate": 1.3303365324297502e-06, - "loss": 0.2228, - "step": 1337600 - }, - { - "epoch": 13.63, - "learning_rate": 1.328383724021508e-06, - "loss": 0.2173, - "step": 1337700 - }, - { - "epoch": 13.63, - "learning_rate": 1.3264323174842053e-06, - "loss": 0.1757, - "step": 1337800 - }, - { - "epoch": 13.63, - "learning_rate": 1.324482312913239e-06, - "loss": 0.2557, - "step": 1337900 - }, - { - "epoch": 13.63, - "learning_rate": 1.3225337104039624e-06, - "loss": 0.1813, - "step": 1338000 - }, - { - "epoch": 13.63, - "learning_rate": 1.32058651005165e-06, - "loss": 0.1829, - "step": 1338100 - }, - { - "epoch": 13.63, - "learning_rate": 1.3186407119515053e-06, - "loss": 0.1795, - "step": 1338200 - }, - { - "epoch": 13.63, - "learning_rate": 1.3166963161986722e-06, - "loss": 0.2065, - "step": 1338300 - }, - { - "epoch": 13.64, - "learning_rate": 1.3147533228882146e-06, - "loss": 0.1729, - "step": 1338400 - }, - { - "epoch": 13.64, - "learning_rate": 1.3128117321151367e-06, - "loss": 0.2142, - "step": 1338500 - }, - { - "epoch": 13.64, - "learning_rate": 1.3108715439743657e-06, - "loss": 0.2107, - "step": 1338600 - }, - { - "epoch": 13.64, - "learning_rate": 1.3089327585607692e-06, - "loss": 0.2381, - "step": 1338700 - }, - { - "epoch": 13.64, - "learning_rate": 1.3069953759691478e-06, - "loss": 0.203, - "step": 1338800 - }, - { - "epoch": 13.64, - "learning_rate": 1.305059396294216e-06, - "loss": 0.2503, - "step": 1338900 - }, - { - "epoch": 13.64, - "learning_rate": 1.3031248196306344e-06, - "loss": 0.1986, - "step": 1339000 - }, - { - "epoch": 13.64, - "learning_rate": 1.301191646073001e-06, - "loss": 0.2303, - "step": 1339100 - }, - { - "epoch": 13.64, - "learning_rate": 1.299259875715827e-06, - "loss": 0.1724, - "step": 1339200 - }, - { - "epoch": 13.65, - "learning_rate": 1.2973295086535697e-06, - "loss": 0.2223, - "step": 1339300 - }, - { - "epoch": 13.65, - "learning_rate": 1.2954005449806073e-06, - "loss": 0.2239, - "step": 1339400 - }, - { - "epoch": 13.65, - "learning_rate": 1.2934729847912607e-06, - "loss": 0.1754, - "step": 1339500 - }, - { - "epoch": 13.65, - "learning_rate": 1.2915468281797716e-06, - "loss": 0.1977, - "step": 1339600 - }, - { - "epoch": 13.65, - "learning_rate": 1.2896220752403243e-06, - "loss": 0.1376, - "step": 1339700 - }, - { - "epoch": 13.65, - "learning_rate": 1.2876987260670204e-06, - "loss": 0.2314, - "step": 1339800 - }, - { - "epoch": 13.65, - "learning_rate": 1.2857767807539011e-06, - "loss": 0.1955, - "step": 1339900 - }, - { - "epoch": 13.65, - "learning_rate": 1.2838562393949415e-06, - "loss": 0.1969, - "step": 1340000 - }, - { - "epoch": 13.65, - "learning_rate": 1.2819371020840465e-06, - "loss": 0.2183, - "step": 1340100 - }, - { - "epoch": 13.65, - "learning_rate": 1.2800193689150408e-06, - "loss": 0.2017, - "step": 1340200 - }, - { - "epoch": 13.66, - "learning_rate": 1.2781030399816962e-06, - "loss": 0.1834, - "step": 1340300 - }, - { - "epoch": 13.66, - "learning_rate": 1.2761881153777144e-06, - "loss": 0.2344, - "step": 1340400 - }, - { - "epoch": 13.66, - "learning_rate": 1.2742745951967171e-06, - "loss": 0.2453, - "step": 1340500 - }, - { - "epoch": 13.66, - "learning_rate": 1.272362479532263e-06, - "loss": 0.226, - "step": 1340600 - }, - { - "epoch": 13.66, - "learning_rate": 1.2704517684778472e-06, - "loss": 0.2025, - "step": 1340700 - }, - { - "epoch": 13.66, - "learning_rate": 1.2685424621268916e-06, - "loss": 0.202, - "step": 1340800 - }, - { - "epoch": 13.66, - "learning_rate": 1.2666536326342416e-06, - "loss": 0.1776, - "step": 1340900 - }, - { - "epoch": 13.66, - "learning_rate": 1.264747121920834e-06, - "loss": 0.2617, - "step": 1341000 - }, - { - "epoch": 13.66, - "learning_rate": 1.262842016189807e-06, - "loss": 0.1817, - "step": 1341100 - }, - { - "epoch": 13.66, - "learning_rate": 1.260938315534319e-06, - "loss": 0.2157, - "step": 1341200 - }, - { - "epoch": 13.67, - "learning_rate": 1.2590360200474361e-06, - "loss": 0.1904, - "step": 1341300 - }, - { - "epoch": 13.67, - "learning_rate": 1.2571351298221734e-06, - "loss": 0.1985, - "step": 1341400 - }, - { - "epoch": 13.67, - "learning_rate": 1.2552356449514801e-06, - "loss": 0.1973, - "step": 1341500 - }, - { - "epoch": 13.67, - "learning_rate": 1.2533375655282186e-06, - "loss": 0.2081, - "step": 1341600 - }, - { - "epoch": 13.67, - "learning_rate": 1.2514408916452013e-06, - "loss": 0.1953, - "step": 1341700 - }, - { - "epoch": 13.67, - "learning_rate": 1.2495456233951608e-06, - "loss": 0.2323, - "step": 1341800 - }, - { - "epoch": 13.67, - "learning_rate": 1.2476706925373638e-06, - "loss": 0.1999, - "step": 1341900 - }, - { - "epoch": 13.67, - "learning_rate": 1.24577822177257e-06, - "loss": 0.2433, - "step": 1342000 - }, - { - "epoch": 13.67, - "learning_rate": 1.243887156917628e-06, - "loss": 0.19, - "step": 1342100 - }, - { - "epoch": 13.67, - "learning_rate": 1.241997498064994e-06, - "loss": 0.2104, - "step": 1342200 - }, - { - "epoch": 13.68, - "learning_rate": 1.2401092453070706e-06, - "loss": 0.236, - "step": 1342300 - }, - { - "epoch": 13.68, - "learning_rate": 1.2382223987361673e-06, - "loss": 0.1942, - "step": 1342400 - }, - { - "epoch": 13.68, - "learning_rate": 1.2363369584445506e-06, - "loss": 0.1848, - "step": 1342500 - }, - { - "epoch": 13.68, - "learning_rate": 1.2344529245244063e-06, - "loss": 0.2101, - "step": 1342600 - }, - { - "epoch": 13.68, - "learning_rate": 1.2325702970678444e-06, - "loss": 0.2176, - "step": 1342700 - }, - { - "epoch": 13.68, - "learning_rate": 1.2306890761669243e-06, - "loss": 0.1575, - "step": 1342800 - }, - { - "epoch": 13.68, - "learning_rate": 1.2288092619136227e-06, - "loss": 0.206, - "step": 1342900 - }, - { - "epoch": 13.68, - "learning_rate": 1.2269308543998493e-06, - "loss": 0.2597, - "step": 1343000 - }, - { - "epoch": 13.68, - "learning_rate": 1.2250538537174438e-06, - "loss": 0.2108, - "step": 1343100 - }, - { - "epoch": 13.68, - "learning_rate": 1.2231782599581897e-06, - "loss": 0.1961, - "step": 1343200 - }, - { - "epoch": 13.69, - "learning_rate": 1.221304073213787e-06, - "loss": 0.2042, - "step": 1343300 - }, - { - "epoch": 13.69, - "learning_rate": 1.2194312935758723e-06, - "loss": 0.2412, - "step": 1343400 - }, - { - "epoch": 13.69, - "learning_rate": 1.2175599211360123e-06, - "loss": 0.2314, - "step": 1343500 - }, - { - "epoch": 13.69, - "learning_rate": 1.2156899559857104e-06, - "loss": 0.2206, - "step": 1343600 - }, - { - "epoch": 13.69, - "learning_rate": 1.2138213982163937e-06, - "loss": 0.2297, - "step": 1343700 - }, - { - "epoch": 13.69, - "learning_rate": 1.2119542479194222e-06, - "loss": 0.1793, - "step": 1343800 - }, - { - "epoch": 13.69, - "learning_rate": 1.2100885051860933e-06, - "loss": 0.2258, - "step": 1343900 - }, - { - "epoch": 13.69, - "learning_rate": 1.2082241701076235e-06, - "loss": 0.2394, - "step": 1344000 - }, - { - "epoch": 13.69, - "learning_rate": 1.2063612427751737e-06, - "loss": 0.1718, - "step": 1344100 - }, - { - "epoch": 13.69, - "learning_rate": 1.204499723279827e-06, - "loss": 0.1549, - "step": 1344200 - }, - { - "epoch": 13.7, - "learning_rate": 1.2026582058587276e-06, - "loss": 0.1957, - "step": 1344300 - }, - { - "epoch": 13.7, - "learning_rate": 1.2007994882299301e-06, - "loss": 0.2412, - "step": 1344400 - }, - { - "epoch": 13.7, - "learning_rate": 1.1989421787101717e-06, - "loss": 0.2236, - "step": 1344500 - }, - { - "epoch": 13.7, - "learning_rate": 1.1970862773902669e-06, - "loss": 0.2188, - "step": 1344600 - }, - { - "epoch": 13.7, - "learning_rate": 1.1952317843609562e-06, - "loss": 0.2408, - "step": 1344700 - }, - { - "epoch": 13.7, - "learning_rate": 1.1933786997129138e-06, - "loss": 0.2043, - "step": 1344800 - }, - { - "epoch": 13.7, - "learning_rate": 1.1915270235367436e-06, - "loss": 0.203, - "step": 1344900 - }, - { - "epoch": 13.7, - "learning_rate": 1.1896767559229837e-06, - "loss": 0.2042, - "step": 1345000 - }, - { - "epoch": 13.7, - "learning_rate": 1.1878278969620948e-06, - "loss": 0.2058, - "step": 1345100 - }, - { - "epoch": 13.71, - "learning_rate": 1.1859804467444811e-06, - "loss": 0.2423, - "step": 1345200 - }, - { - "epoch": 13.71, - "learning_rate": 1.1841344053604741e-06, - "loss": 0.2037, - "step": 1345300 - }, - { - "epoch": 13.71, - "learning_rate": 1.1822897729003279e-06, - "loss": 0.2325, - "step": 1345400 - }, - { - "epoch": 13.71, - "learning_rate": 1.1804465494542372e-06, - "loss": 0.2199, - "step": 1345500 - }, - { - "epoch": 13.71, - "learning_rate": 1.178604735112323e-06, - "loss": 0.2114, - "step": 1345600 - }, - { - "epoch": 13.71, - "learning_rate": 1.1767643299646403e-06, - "loss": 0.2071, - "step": 1345700 - }, - { - "epoch": 13.71, - "learning_rate": 1.1749253341011734e-06, - "loss": 0.2042, - "step": 1345800 - }, - { - "epoch": 13.71, - "learning_rate": 1.1730877476118373e-06, - "loss": 0.1716, - "step": 1345900 - }, - { - "epoch": 13.71, - "learning_rate": 1.1712515705864869e-06, - "loss": 0.1795, - "step": 1346000 - }, - { - "epoch": 13.71, - "learning_rate": 1.1694168031148867e-06, - "loss": 0.2098, - "step": 1346100 - }, - { - "epoch": 13.72, - "learning_rate": 1.167583445286755e-06, - "loss": 0.2417, - "step": 1346200 - }, - { - "epoch": 13.72, - "learning_rate": 1.1657514971917338e-06, - "loss": 0.2004, - "step": 1346300 - }, - { - "epoch": 13.72, - "learning_rate": 1.1639209589193878e-06, - "loss": 0.2126, - "step": 1346400 - }, - { - "epoch": 13.72, - "learning_rate": 1.162091830559222e-06, - "loss": 0.2353, - "step": 1346500 - }, - { - "epoch": 13.72, - "learning_rate": 1.1602641122006718e-06, - "loss": 0.229, - "step": 1346600 - }, - { - "epoch": 13.72, - "learning_rate": 1.1584378039330955e-06, - "loss": 0.1893, - "step": 1346700 - }, - { - "epoch": 13.72, - "learning_rate": 1.1566129058457952e-06, - "loss": 0.2338, - "step": 1346800 - }, - { - "epoch": 13.72, - "learning_rate": 1.154789418027996e-06, - "loss": 0.1804, - "step": 1346900 - }, - { - "epoch": 13.72, - "learning_rate": 1.1529673405688535e-06, - "loss": 0.1732, - "step": 1347000 - }, - { - "epoch": 13.72, - "learning_rate": 1.1511466735574594e-06, - "loss": 0.2089, - "step": 1347100 - }, - { - "epoch": 13.73, - "learning_rate": 1.1493274170828294e-06, - "loss": 0.1962, - "step": 1347200 - }, - { - "epoch": 13.73, - "learning_rate": 1.1475095712339222e-06, - "loss": 0.2321, - "step": 1347300 - }, - { - "epoch": 13.73, - "learning_rate": 1.1456931360996136e-06, - "loss": 0.1992, - "step": 1347400 - }, - { - "epoch": 13.73, - "learning_rate": 1.1438781117687125e-06, - "loss": 0.2429, - "step": 1347500 - }, - { - "epoch": 13.73, - "learning_rate": 1.1420644983299744e-06, - "loss": 0.1914, - "step": 1347600 - }, - { - "epoch": 13.73, - "learning_rate": 1.1402522958720619e-06, - "loss": 0.2695, - "step": 1347700 - }, - { - "epoch": 13.73, - "learning_rate": 1.1384415044835872e-06, - "loss": 0.2031, - "step": 1347800 - }, - { - "epoch": 13.73, - "learning_rate": 1.1366321242530864e-06, - "loss": 0.1855, - "step": 1347900 - }, - { - "epoch": 13.73, - "learning_rate": 1.1348241552690287e-06, - "loss": 0.2095, - "step": 1348000 - }, - { - "epoch": 13.73, - "learning_rate": 1.1330356562099032e-06, - "loss": 0.2069, - "step": 1348100 - }, - { - "epoch": 13.74, - "learning_rate": 1.1312304958691865e-06, - "loss": 0.1922, - "step": 1348200 - }, - { - "epoch": 13.74, - "learning_rate": 1.1294267470390153e-06, - "loss": 0.2423, - "step": 1348300 - }, - { - "epoch": 13.74, - "learning_rate": 1.1276244098075883e-06, - "loss": 0.2276, - "step": 1348400 - }, - { - "epoch": 13.74, - "learning_rate": 1.1258234842630255e-06, - "loss": 0.2598, - "step": 1348500 - }, - { - "epoch": 13.74, - "learning_rate": 1.124023970493383e-06, - "loss": 0.1933, - "step": 1348600 - }, - { - "epoch": 13.74, - "learning_rate": 1.122225868586647e-06, - "loss": 0.2312, - "step": 1348700 - }, - { - "epoch": 13.74, - "learning_rate": 1.1204291786307309e-06, - "loss": 0.1899, - "step": 1348800 - }, - { - "epoch": 13.74, - "learning_rate": 1.1186339007134837e-06, - "loss": 0.2181, - "step": 1348900 - }, - { - "epoch": 13.74, - "learning_rate": 1.116840034922686e-06, - "loss": 0.1984, - "step": 1349000 - }, - { - "epoch": 13.74, - "learning_rate": 1.1150475813460403e-06, - "loss": 0.1969, - "step": 1349100 - }, - { - "epoch": 13.75, - "learning_rate": 1.1132565400711936e-06, - "loss": 0.1994, - "step": 1349200 - }, - { - "epoch": 13.75, - "learning_rate": 1.111466911185719e-06, - "loss": 0.1643, - "step": 1349300 - }, - { - "epoch": 13.75, - "learning_rate": 1.10967869477711e-06, - "loss": 0.1974, - "step": 1349400 - }, - { - "epoch": 13.75, - "learning_rate": 1.1078918909328063e-06, - "loss": 0.2078, - "step": 1349500 - }, - { - "epoch": 13.75, - "learning_rate": 1.1061064997401681e-06, - "loss": 0.2042, - "step": 1349600 - }, - { - "epoch": 13.75, - "learning_rate": 1.1043225212864926e-06, - "loss": 0.2167, - "step": 1349700 - }, - { - "epoch": 13.75, - "learning_rate": 1.1025399556590033e-06, - "loss": 0.1938, - "step": 1349800 - }, - { - "epoch": 13.75, - "learning_rate": 1.1007588029448635e-06, - "loss": 0.1988, - "step": 1349900 - }, - { - "epoch": 13.75, - "learning_rate": 1.098979063231157e-06, - "loss": 0.1803, - "step": 1350000 - }, - { - "epoch": 13.76, - "learning_rate": 1.097200736604901e-06, - "loss": 0.2072, - "step": 1350100 - }, - { - "epoch": 13.76, - "learning_rate": 1.0954415852920651e-06, - "loss": 0.2439, - "step": 1350200 - }, - { - "epoch": 13.76, - "learning_rate": 1.0936660709684466e-06, - "loss": 0.2492, - "step": 1350300 - }, - { - "epoch": 13.76, - "learning_rate": 1.0918919699920537e-06, - "loss": 0.1959, - "step": 1350400 - }, - { - "epoch": 13.76, - "learning_rate": 1.0901192824496341e-06, - "loss": 0.257, - "step": 1350500 - }, - { - "epoch": 13.76, - "learning_rate": 1.088348008427852e-06, - "loss": 0.2172, - "step": 1350600 - }, - { - "epoch": 13.76, - "learning_rate": 1.0865781480133175e-06, - "loss": 0.196, - "step": 1350700 - }, - { - "epoch": 13.76, - "learning_rate": 1.084809701292565e-06, - "loss": 0.2361, - "step": 1350800 - }, - { - "epoch": 13.76, - "learning_rate": 1.0830426683520623e-06, - "loss": 0.1932, - "step": 1350900 - }, - { - "epoch": 13.76, - "learning_rate": 1.0812770492782e-06, - "loss": 0.2471, - "step": 1351000 - }, - { - "epoch": 13.77, - "learning_rate": 1.0795128441573188e-06, - "loss": 0.1974, - "step": 1351100 - }, - { - "epoch": 13.77, - "learning_rate": 1.0777500530756668e-06, - "loss": 0.2172, - "step": 1351200 - }, - { - "epoch": 13.77, - "learning_rate": 1.0759886761194415e-06, - "loss": 0.2158, - "step": 1351300 - }, - { - "epoch": 13.77, - "learning_rate": 1.0742287133747607e-06, - "loss": 0.2752, - "step": 1351400 - }, - { - "epoch": 13.77, - "learning_rate": 1.0724701649276759e-06, - "loss": 0.1872, - "step": 1351500 - }, - { - "epoch": 13.77, - "learning_rate": 1.0707130308641644e-06, - "loss": 0.1631, - "step": 1351600 - }, - { - "epoch": 13.77, - "learning_rate": 1.068957311270151e-06, - "loss": 0.1537, - "step": 1351700 - }, - { - "epoch": 13.77, - "learning_rate": 1.0672030062314708e-06, - "loss": 0.2232, - "step": 1351800 - }, - { - "epoch": 13.77, - "learning_rate": 1.0654501158339014e-06, - "loss": 0.2205, - "step": 1351900 - }, - { - "epoch": 13.77, - "learning_rate": 1.063698640163151e-06, - "loss": 0.1882, - "step": 1352000 - }, - { - "epoch": 13.78, - "learning_rate": 1.0619485793048544e-06, - "loss": 0.2354, - "step": 1352100 - }, - { - "epoch": 13.78, - "learning_rate": 1.0601999333445767e-06, - "loss": 0.2481, - "step": 1352200 - }, - { - "epoch": 13.78, - "learning_rate": 1.0584527023678159e-06, - "loss": 0.1783, - "step": 1352300 - }, - { - "epoch": 13.78, - "learning_rate": 1.0567068864600104e-06, - "loss": 0.2189, - "step": 1352400 - }, - { - "epoch": 13.78, - "learning_rate": 1.0549624857065087e-06, - "loss": 0.2665, - "step": 1352500 - }, - { - "epoch": 13.78, - "learning_rate": 1.0532369230420325e-06, - "loss": 0.1976, - "step": 1352600 - }, - { - "epoch": 13.78, - "learning_rate": 1.051495338699282e-06, - "loss": 0.1791, - "step": 1352700 - }, - { - "epoch": 13.78, - "learning_rate": 1.0497551697656549e-06, - "loss": 0.2013, - "step": 1352800 - }, - { - "epoch": 13.78, - "learning_rate": 1.0480164163262362e-06, - "loss": 0.1993, - "step": 1352900 - }, - { - "epoch": 13.78, - "learning_rate": 1.0462790784660347e-06, - "loss": 0.1729, - "step": 1353000 - }, - { - "epoch": 13.79, - "learning_rate": 1.0445431562700025e-06, - "loss": 0.1825, - "step": 1353100 - }, - { - "epoch": 13.79, - "learning_rate": 1.0428086498230117e-06, - "loss": 0.2323, - "step": 1353200 - }, - { - "epoch": 13.79, - "learning_rate": 1.0410755592098709e-06, - "loss": 0.1713, - "step": 1353300 - }, - { - "epoch": 13.79, - "learning_rate": 1.0393438845153159e-06, - "loss": 0.1776, - "step": 1353400 - }, - { - "epoch": 13.79, - "learning_rate": 1.0376136258240155e-06, - "loss": 0.2117, - "step": 1353500 - }, - { - "epoch": 13.79, - "learning_rate": 1.0358847832205721e-06, - "loss": 0.2348, - "step": 1353600 - }, - { - "epoch": 13.79, - "learning_rate": 1.0341573567895146e-06, - "loss": 0.1787, - "step": 1353700 - }, - { - "epoch": 13.79, - "learning_rate": 1.0324313466153057e-06, - "loss": 0.209, - "step": 1353800 - }, - { - "epoch": 13.79, - "learning_rate": 1.0307067527823311e-06, - "loss": 0.2292, - "step": 1353900 - }, - { - "epoch": 13.79, - "learning_rate": 1.0289835753749165e-06, - "loss": 0.2422, - "step": 1354000 - }, - { - "epoch": 13.8, - "learning_rate": 1.0272618144773182e-06, - "loss": 0.1618, - "step": 1354100 - }, - { - "epoch": 13.8, - "learning_rate": 1.0255414701737153e-06, - "loss": 0.191, - "step": 1354200 - }, - { - "epoch": 13.8, - "learning_rate": 1.0238225425482206e-06, - "loss": 0.1843, - "step": 1354300 - }, - { - "epoch": 13.8, - "learning_rate": 1.0221050316848834e-06, - "loss": 0.2028, - "step": 1354400 - }, - { - "epoch": 13.8, - "learning_rate": 1.0203889376676866e-06, - "loss": 0.1957, - "step": 1354500 - }, - { - "epoch": 13.8, - "learning_rate": 1.0186742605805232e-06, - "loss": 0.1704, - "step": 1354600 - }, - { - "epoch": 13.8, - "learning_rate": 1.016961000507236e-06, - "loss": 0.2717, - "step": 1354700 - }, - { - "epoch": 13.8, - "learning_rate": 1.0152491575315946e-06, - "loss": 0.1586, - "step": 1354800 - }, - { - "epoch": 13.8, - "learning_rate": 1.013538731737299e-06, - "loss": 0.1901, - "step": 1354900 - }, - { - "epoch": 13.8, - "learning_rate": 1.0118468062775343e-06, - "loss": 0.2071, - "step": 1355000 - }, - { - "epoch": 13.81, - "learning_rate": 1.0101392009228472e-06, - "loss": 0.2108, - "step": 1355100 - }, - { - "epoch": 13.81, - "learning_rate": 1.008433012999349e-06, - "loss": 0.1786, - "step": 1355200 - }, - { - "epoch": 13.81, - "learning_rate": 1.00672824259047e-06, - "loss": 0.213, - "step": 1355300 - }, - { - "epoch": 13.81, - "learning_rate": 1.0050248897795533e-06, - "loss": 0.2435, - "step": 1355400 - }, - { - "epoch": 13.81, - "learning_rate": 1.0033229546498856e-06, - "loss": 0.1725, - "step": 1355500 - }, - { - "epoch": 13.81, - "learning_rate": 1.0016224372846872e-06, - "loss": 0.2103, - "step": 1355600 - }, - { - "epoch": 13.81, - "learning_rate": 9.999233377670913e-07, - "loss": 0.213, - "step": 1355700 - }, - { - "epoch": 13.81, - "learning_rate": 9.98225656180185e-07, - "loss": 0.2262, - "step": 1355800 - }, - { - "epoch": 13.81, - "learning_rate": 9.965293926069717e-07, - "loss": 0.2027, - "step": 1355900 - }, - { - "epoch": 13.82, - "learning_rate": 9.948345471303821e-07, - "loss": 0.2201, - "step": 1356000 - }, - { - "epoch": 13.82, - "learning_rate": 9.931411198332928e-07, - "loss": 0.1872, - "step": 1356100 - }, - { - "epoch": 13.82, - "learning_rate": 9.91449110798498e-07, - "loss": 0.2111, - "step": 1356200 - }, - { - "epoch": 13.82, - "learning_rate": 9.897585201087278e-07, - "loss": 0.2304, - "step": 1356300 - }, - { - "epoch": 13.82, - "learning_rate": 9.880693478466397e-07, - "loss": 0.1874, - "step": 1356400 - }, - { - "epoch": 13.82, - "learning_rate": 9.86381594094834e-07, - "loss": 0.2632, - "step": 1356500 - }, - { - "epoch": 13.82, - "learning_rate": 9.846952589358183e-07, - "loss": 0.1754, - "step": 1356600 - }, - { - "epoch": 13.82, - "learning_rate": 9.830103424520498e-07, - "loss": 0.189, - "step": 1356700 - }, - { - "epoch": 13.82, - "learning_rate": 9.813268447259094e-07, - "loss": 0.1604, - "step": 1356800 - }, - { - "epoch": 13.82, - "learning_rate": 9.79644765839718e-07, - "loss": 0.2082, - "step": 1356900 - }, - { - "epoch": 13.83, - "learning_rate": 9.779641058757094e-07, - "loss": 0.1818, - "step": 1357000 - }, - { - "epoch": 13.83, - "learning_rate": 9.762848649160617e-07, - "loss": 0.1773, - "step": 1357100 - }, - { - "epoch": 13.83, - "learning_rate": 9.74607043042882e-07, - "loss": 0.2078, - "step": 1357200 - }, - { - "epoch": 13.83, - "learning_rate": 9.72930640338202e-07, - "loss": 0.1908, - "step": 1357300 - }, - { - "epoch": 13.83, - "learning_rate": 9.712556568839858e-07, - "loss": 0.219, - "step": 1357400 - }, - { - "epoch": 13.83, - "learning_rate": 9.695820927621378e-07, - "loss": 0.2072, - "step": 1357500 - }, - { - "epoch": 13.83, - "learning_rate": 9.679099480544762e-07, - "loss": 0.1981, - "step": 1357600 - }, - { - "epoch": 13.83, - "learning_rate": 9.662392228427685e-07, - "loss": 0.154, - "step": 1357700 - }, - { - "epoch": 13.83, - "learning_rate": 9.645699172086997e-07, - "loss": 0.1855, - "step": 1357800 - }, - { - "epoch": 13.83, - "learning_rate": 9.629020312338844e-07, - "loss": 0.2133, - "step": 1357900 - }, - { - "epoch": 13.84, - "learning_rate": 9.61235564999874e-07, - "loss": 0.1754, - "step": 1358000 - }, - { - "epoch": 13.84, - "learning_rate": 9.595871620238828e-07, - "loss": 0.2041, - "step": 1358100 - }, - { - "epoch": 13.84, - "learning_rate": 9.579235213164183e-07, - "loss": 0.1983, - "step": 1358200 - }, - { - "epoch": 13.84, - "learning_rate": 9.562613005931808e-07, - "loss": 0.2206, - "step": 1358300 - }, - { - "epoch": 13.84, - "learning_rate": 9.546004999354386e-07, - "loss": 0.1887, - "step": 1358400 - }, - { - "epoch": 13.84, - "learning_rate": 9.529411194243998e-07, - "loss": 0.1693, - "step": 1358500 - }, - { - "epoch": 13.84, - "learning_rate": 9.512831591411964e-07, - "loss": 0.206, - "step": 1358600 - }, - { - "epoch": 13.84, - "learning_rate": 9.496431775358427e-07, - "loss": 0.2439, - "step": 1358700 - }, - { - "epoch": 13.84, - "learning_rate": 9.479880437471344e-07, - "loss": 0.2406, - "step": 1358800 - }, - { - "epoch": 13.84, - "learning_rate": 9.463343304284378e-07, - "loss": 0.1708, - "step": 1358900 - }, - { - "epoch": 13.85, - "learning_rate": 9.44682037660608e-07, - "loss": 0.2016, - "step": 1359000 - }, - { - "epoch": 13.85, - "learning_rate": 9.430476672134058e-07, - "loss": 0.197, - "step": 1359100 - }, - { - "epoch": 13.85, - "learning_rate": 9.41398201582081e-07, - "loss": 0.2005, - "step": 1359200 - }, - { - "epoch": 13.85, - "learning_rate": 9.397501567429734e-07, - "loss": 0.2254, - "step": 1359300 - }, - { - "epoch": 13.85, - "learning_rate": 9.381035327766618e-07, - "loss": 0.2383, - "step": 1359400 - }, - { - "epoch": 13.85, - "learning_rate": 9.36458329763652e-07, - "loss": 0.2219, - "step": 1359500 - }, - { - "epoch": 13.85, - "learning_rate": 9.348145477843894e-07, - "loss": 0.2187, - "step": 1359600 - }, - { - "epoch": 13.85, - "learning_rate": 9.331721869192434e-07, - "loss": 0.2614, - "step": 1359700 - }, - { - "epoch": 13.85, - "learning_rate": 9.31531247248516e-07, - "loss": 0.2242, - "step": 1359800 - }, - { - "epoch": 13.85, - "learning_rate": 9.298917288524366e-07, - "loss": 0.1758, - "step": 1359900 - }, - { - "epoch": 13.86, - "learning_rate": 9.282536318111712e-07, - "loss": 0.1845, - "step": 1360000 - }, - { - "epoch": 13.86, - "learning_rate": 9.266169562048121e-07, - "loss": 0.2176, - "step": 1360100 - }, - { - "epoch": 13.86, - "learning_rate": 9.249817021133889e-07, - "loss": 0.2008, - "step": 1360200 - }, - { - "epoch": 13.86, - "learning_rate": 9.23347869616844e-07, - "loss": 0.2349, - "step": 1360300 - }, - { - "epoch": 13.86, - "learning_rate": 9.217154587950671e-07, - "loss": 0.2072, - "step": 1360400 - }, - { - "epoch": 13.86, - "learning_rate": 9.200844697278743e-07, - "loss": 0.2232, - "step": 1360500 - }, - { - "epoch": 13.86, - "learning_rate": 9.18454902495015e-07, - "loss": 0.2593, - "step": 1360600 - }, - { - "epoch": 13.86, - "learning_rate": 9.168267571761591e-07, - "loss": 0.1869, - "step": 1360700 - }, - { - "epoch": 13.86, - "learning_rate": 9.152000338509159e-07, - "loss": 0.1816, - "step": 1360800 - }, - { - "epoch": 13.87, - "learning_rate": 9.13574732598822e-07, - "loss": 0.1794, - "step": 1360900 - }, - { - "epoch": 13.87, - "learning_rate": 9.119508534993437e-07, - "loss": 0.2065, - "step": 1361000 - }, - { - "epoch": 13.87, - "learning_rate": 9.103283966318809e-07, - "loss": 0.186, - "step": 1361100 - }, - { - "epoch": 13.87, - "learning_rate": 9.087073620757602e-07, - "loss": 0.1517, - "step": 1361200 - }, - { - "epoch": 13.87, - "learning_rate": 9.070877499102448e-07, - "loss": 0.2379, - "step": 1361300 - }, - { - "epoch": 13.87, - "learning_rate": 9.054695602145179e-07, - "loss": 0.2052, - "step": 1361400 - }, - { - "epoch": 13.87, - "learning_rate": 9.038527930677065e-07, - "loss": 0.2339, - "step": 1361500 - }, - { - "epoch": 13.87, - "learning_rate": 9.022374485488539e-07, - "loss": 0.1936, - "step": 1361600 - }, - { - "epoch": 13.87, - "learning_rate": 9.006235267369433e-07, - "loss": 0.1933, - "step": 1361700 - }, - { - "epoch": 13.87, - "learning_rate": 8.990110277108887e-07, - "loss": 0.2542, - "step": 1361800 - }, - { - "epoch": 13.88, - "learning_rate": 8.973999515495268e-07, - "loss": 0.2119, - "step": 1361900 - }, - { - "epoch": 13.88, - "learning_rate": 8.957902983316313e-07, - "loss": 0.1607, - "step": 1362000 - }, - { - "epoch": 13.88, - "learning_rate": 8.941820681359058e-07, - "loss": 0.1672, - "step": 1362100 - }, - { - "epoch": 13.88, - "learning_rate": 8.925752610409876e-07, - "loss": 0.2122, - "step": 1362200 - }, - { - "epoch": 13.88, - "learning_rate": 8.909698771254305e-07, - "loss": 0.2447, - "step": 1362300 - }, - { - "epoch": 13.88, - "learning_rate": 8.893659164677314e-07, - "loss": 0.2568, - "step": 1362400 - }, - { - "epoch": 13.88, - "learning_rate": 8.87763379146318e-07, - "loss": 0.2024, - "step": 1362500 - }, - { - "epoch": 13.88, - "learning_rate": 8.861622652395407e-07, - "loss": 0.1963, - "step": 1362600 - }, - { - "epoch": 13.88, - "learning_rate": 8.845625748256869e-07, - "loss": 0.1944, - "step": 1362700 - }, - { - "epoch": 13.88, - "learning_rate": 8.829643079829741e-07, - "loss": 0.1962, - "step": 1362800 - }, - { - "epoch": 13.89, - "learning_rate": 8.813674647895431e-07, - "loss": 0.2223, - "step": 1362900 - }, - { - "epoch": 13.89, - "learning_rate": 8.797720453234747e-07, - "loss": 0.1616, - "step": 1363000 - }, - { - "epoch": 13.89, - "learning_rate": 8.781780496627733e-07, - "loss": 0.1636, - "step": 1363100 - }, - { - "epoch": 13.89, - "learning_rate": 8.765854778853699e-07, - "loss": 0.2431, - "step": 1363200 - }, - { - "epoch": 13.89, - "learning_rate": 8.749943300691421e-07, - "loss": 0.201, - "step": 1363300 - }, - { - "epoch": 13.89, - "learning_rate": 8.73404606291881e-07, - "loss": 0.2386, - "step": 1363400 - }, - { - "epoch": 13.89, - "learning_rate": 8.718163066313145e-07, - "loss": 0.2301, - "step": 1363500 - }, - { - "epoch": 13.89, - "learning_rate": 8.702294311651037e-07, - "loss": 0.2196, - "step": 1363600 - }, - { - "epoch": 13.89, - "learning_rate": 8.686439799708368e-07, - "loss": 0.2112, - "step": 1363700 - }, - { - "epoch": 13.89, - "learning_rate": 8.670599531260314e-07, - "loss": 0.2002, - "step": 1363800 - }, - { - "epoch": 13.9, - "learning_rate": 8.654773507081393e-07, - "loss": 0.1783, - "step": 1363900 - }, - { - "epoch": 13.9, - "learning_rate": 8.638961727945383e-07, - "loss": 0.1831, - "step": 1364000 - }, - { - "epoch": 13.9, - "learning_rate": 8.6231641946254e-07, - "loss": 0.1891, - "step": 1364100 - }, - { - "epoch": 13.9, - "learning_rate": 8.607380907893858e-07, - "loss": 0.2412, - "step": 1364200 - }, - { - "epoch": 13.9, - "learning_rate": 8.591611868522409e-07, - "loss": 0.1903, - "step": 1364300 - }, - { - "epoch": 13.9, - "learning_rate": 8.575857077282168e-07, - "loss": 0.1933, - "step": 1364400 - }, - { - "epoch": 13.9, - "learning_rate": 8.560116534943319e-07, - "loss": 0.2099, - "step": 1364500 - }, - { - "epoch": 13.9, - "learning_rate": 8.54439024227558e-07, - "loss": 0.2389, - "step": 1364600 - }, - { - "epoch": 13.9, - "learning_rate": 8.528678200047868e-07, - "loss": 0.2983, - "step": 1364700 - }, - { - "epoch": 13.9, - "learning_rate": 8.512980409028337e-07, - "loss": 0.1835, - "step": 1364800 - }, - { - "epoch": 13.91, - "learning_rate": 8.497296869984572e-07, - "loss": 0.2241, - "step": 1364900 - }, - { - "epoch": 13.91, - "learning_rate": 8.481627583683394e-07, - "loss": 0.2122, - "step": 1365000 - }, - { - "epoch": 13.91, - "learning_rate": 8.465972550890922e-07, - "loss": 0.1856, - "step": 1365100 - }, - { - "epoch": 13.91, - "learning_rate": 8.45033177237261e-07, - "loss": 0.188, - "step": 1365200 - }, - { - "epoch": 13.91, - "learning_rate": 8.434705248893248e-07, - "loss": 0.1913, - "step": 1365300 - }, - { - "epoch": 13.91, - "learning_rate": 8.419092981216791e-07, - "loss": 0.2208, - "step": 1365400 - }, - { - "epoch": 13.91, - "learning_rate": 8.403494970106596e-07, - "loss": 0.2056, - "step": 1365500 - }, - { - "epoch": 13.91, - "learning_rate": 8.387911216325384e-07, - "loss": 0.1879, - "step": 1365600 - }, - { - "epoch": 13.91, - "learning_rate": 8.372341720635046e-07, - "loss": 0.2137, - "step": 1365700 - }, - { - "epoch": 13.92, - "learning_rate": 8.356786483796874e-07, - "loss": 0.2292, - "step": 1365800 - }, - { - "epoch": 13.92, - "learning_rate": 8.34124550657136e-07, - "loss": 0.176, - "step": 1365900 - }, - { - "epoch": 13.92, - "learning_rate": 8.325718789718495e-07, - "loss": 0.192, - "step": 1366000 - }, - { - "epoch": 13.92, - "learning_rate": 8.310206333997272e-07, - "loss": 0.1871, - "step": 1366100 - }, - { - "epoch": 13.92, - "learning_rate": 8.294708140166285e-07, - "loss": 0.1726, - "step": 1366200 - }, - { - "epoch": 13.92, - "learning_rate": 8.27922420898326e-07, - "loss": 0.1873, - "step": 1366300 - }, - { - "epoch": 13.92, - "learning_rate": 8.26375454120526e-07, - "loss": 0.2035, - "step": 1366400 - }, - { - "epoch": 13.92, - "learning_rate": 8.248299137588677e-07, - "loss": 0.2215, - "step": 1366500 - }, - { - "epoch": 13.92, - "learning_rate": 8.232857998889209e-07, - "loss": 0.2026, - "step": 1366600 - }, - { - "epoch": 13.92, - "learning_rate": 8.217431125861785e-07, - "loss": 0.1922, - "step": 1366700 - }, - { - "epoch": 13.93, - "learning_rate": 8.202018519260734e-07, - "loss": 0.2097, - "step": 1366800 - }, - { - "epoch": 13.93, - "learning_rate": 8.186620179839621e-07, - "loss": 0.2845, - "step": 1366900 - }, - { - "epoch": 13.93, - "learning_rate": 8.171236108351309e-07, - "loss": 0.2141, - "step": 1367000 - }, - { - "epoch": 13.93, - "learning_rate": 8.155866305547999e-07, - "loss": 0.2102, - "step": 1367100 - }, - { - "epoch": 13.93, - "learning_rate": 8.140510772181219e-07, - "loss": 0.2349, - "step": 1367200 - }, - { - "epoch": 13.93, - "learning_rate": 8.125169509001706e-07, - "loss": 0.2214, - "step": 1367300 - }, - { - "epoch": 13.93, - "learning_rate": 8.109842516759592e-07, - "loss": 0.2171, - "step": 1367400 - }, - { - "epoch": 13.93, - "learning_rate": 8.094529796204275e-07, - "loss": 0.1871, - "step": 1367500 - }, - { - "epoch": 13.93, - "learning_rate": 8.079231348084459e-07, - "loss": 0.2149, - "step": 1367600 - }, - { - "epoch": 13.93, - "learning_rate": 8.063947173148111e-07, - "loss": 0.1429, - "step": 1367700 - }, - { - "epoch": 13.94, - "learning_rate": 8.048677272142602e-07, - "loss": 0.2116, - "step": 1367800 - }, - { - "epoch": 13.94, - "learning_rate": 8.033574131415644e-07, - "loss": 0.1983, - "step": 1367900 - }, - { - "epoch": 13.94, - "learning_rate": 8.018332637752912e-07, - "loss": 0.2021, - "step": 1368000 - }, - { - "epoch": 13.94, - "learning_rate": 8.0031054202513e-07, - "loss": 0.2897, - "step": 1368100 - }, - { - "epoch": 13.94, - "learning_rate": 7.987892479655245e-07, - "loss": 0.2025, - "step": 1368200 - }, - { - "epoch": 13.94, - "learning_rate": 7.97269381670862e-07, - "loss": 0.2067, - "step": 1368300 - }, - { - "epoch": 13.94, - "learning_rate": 7.957509432154531e-07, - "loss": 0.2285, - "step": 1368400 - }, - { - "epoch": 13.94, - "learning_rate": 7.942339326735414e-07, - "loss": 0.2076, - "step": 1368500 - }, - { - "epoch": 13.94, - "learning_rate": 7.927183501193014e-07, - "loss": 0.1837, - "step": 1368600 - }, - { - "epoch": 13.94, - "learning_rate": 7.912041956268334e-07, - "loss": 0.1901, - "step": 1368700 - }, - { - "epoch": 13.95, - "learning_rate": 7.896914692701685e-07, - "loss": 0.1994, - "step": 1368800 - }, - { - "epoch": 13.95, - "learning_rate": 7.881801711232739e-07, - "loss": 0.2559, - "step": 1368900 - }, - { - "epoch": 13.95, - "learning_rate": 7.866703012600441e-07, - "loss": 0.2109, - "step": 1369000 - }, - { - "epoch": 13.95, - "learning_rate": 7.851618597542964e-07, - "loss": 0.2404, - "step": 1369100 - }, - { - "epoch": 13.95, - "learning_rate": 7.836548466797921e-07, - "loss": 0.2977, - "step": 1369200 - }, - { - "epoch": 13.95, - "learning_rate": 7.821492621102122e-07, - "loss": 0.2205, - "step": 1369300 - }, - { - "epoch": 13.95, - "learning_rate": 7.806451061191677e-07, - "loss": 0.2097, - "step": 1369400 - }, - { - "epoch": 13.95, - "learning_rate": 7.791423787802066e-07, - "loss": 0.2397, - "step": 1369500 - }, - { - "epoch": 13.95, - "learning_rate": 7.776410801668065e-07, - "loss": 0.2232, - "step": 1369600 - }, - { - "epoch": 13.95, - "learning_rate": 7.761412103523624e-07, - "loss": 0.1876, - "step": 1369700 - }, - { - "epoch": 13.96, - "learning_rate": 7.746427694102154e-07, - "loss": 0.1737, - "step": 1369800 - }, - { - "epoch": 13.96, - "learning_rate": 7.731457574136302e-07, - "loss": 0.1837, - "step": 1369900 - }, - { - "epoch": 13.96, - "learning_rate": 7.716501744358018e-07, - "loss": 0.2082, - "step": 1370000 - }, - { - "epoch": 13.96, - "learning_rate": 7.701709550144664e-07, - "loss": 0.2437, - "step": 1370100 - }, - { - "epoch": 13.96, - "learning_rate": 7.686782160014417e-07, - "loss": 0.234, - "step": 1370200 - }, - { - "epoch": 13.96, - "learning_rate": 7.671869062256087e-07, - "loss": 0.2361, - "step": 1370300 - }, - { - "epoch": 13.96, - "learning_rate": 7.656970257598794e-07, - "loss": 0.2298, - "step": 1370400 - }, - { - "epoch": 13.96, - "learning_rate": 7.642085746771121e-07, - "loss": 0.1473, - "step": 1370500 - }, - { - "epoch": 13.96, - "learning_rate": 7.62721553050072e-07, - "loss": 0.217, - "step": 1370600 - }, - { - "epoch": 13.96, - "learning_rate": 7.612359609514707e-07, - "loss": 0.2318, - "step": 1370700 - }, - { - "epoch": 13.97, - "learning_rate": 7.597517984539437e-07, - "loss": 0.1939, - "step": 1370800 - }, - { - "epoch": 13.97, - "learning_rate": 7.582690656300528e-07, - "loss": 0.2448, - "step": 1370900 - }, - { - "epoch": 13.97, - "learning_rate": 7.567877625523035e-07, - "loss": 0.2308, - "step": 1371000 - }, - { - "epoch": 13.97, - "learning_rate": 7.553078892931176e-07, - "loss": 0.1579, - "step": 1371100 - }, - { - "epoch": 13.97, - "learning_rate": 7.538294459248507e-07, - "loss": 0.1676, - "step": 1371200 - }, - { - "epoch": 13.97, - "learning_rate": 7.523524325197917e-07, - "loss": 0.1926, - "step": 1371300 - }, - { - "epoch": 13.97, - "learning_rate": 7.508768491501627e-07, - "loss": 0.2262, - "step": 1371400 - }, - { - "epoch": 13.97, - "learning_rate": 7.494026958880995e-07, - "loss": 0.2728, - "step": 1371500 - }, - { - "epoch": 13.97, - "learning_rate": 7.479299728056844e-07, - "loss": 0.1699, - "step": 1371600 - }, - { - "epoch": 13.98, - "learning_rate": 7.464733858232586e-07, - "loss": 0.2025, - "step": 1371700 - }, - { - "epoch": 13.98, - "learning_rate": 7.450035090125084e-07, - "loss": 0.2367, - "step": 1371800 - }, - { - "epoch": 13.98, - "learning_rate": 7.435350625965009e-07, - "loss": 0.2209, - "step": 1371900 - }, - { - "epoch": 13.98, - "learning_rate": 7.420680466470353e-07, - "loss": 0.1339, - "step": 1372000 - }, - { - "epoch": 13.98, - "learning_rate": 7.406024612358408e-07, - "loss": 0.1932, - "step": 1372100 - }, - { - "epoch": 13.98, - "learning_rate": 7.391383064345769e-07, - "loss": 0.2037, - "step": 1372200 - }, - { - "epoch": 13.98, - "learning_rate": 7.376755823148263e-07, - "loss": 0.2338, - "step": 1372300 - }, - { - "epoch": 13.98, - "learning_rate": 7.362142889481116e-07, - "loss": 0.2228, - "step": 1372400 - }, - { - "epoch": 13.98, - "learning_rate": 7.347544264058859e-07, - "loss": 0.2155, - "step": 1372500 - }, - { - "epoch": 13.98, - "learning_rate": 7.332959947595186e-07, - "loss": 0.2004, - "step": 1372600 - }, - { - "epoch": 13.99, - "learning_rate": 7.318389940803228e-07, - "loss": 0.2203, - "step": 1372700 - }, - { - "epoch": 13.99, - "learning_rate": 7.30383424439538e-07, - "loss": 0.2103, - "step": 1372800 - }, - { - "epoch": 13.99, - "learning_rate": 7.289292859083308e-07, - "loss": 0.2, - "step": 1372900 - }, - { - "epoch": 13.99, - "learning_rate": 7.274765785578008e-07, - "loss": 0.198, - "step": 1373000 - }, - { - "epoch": 13.99, - "learning_rate": 7.260253024589747e-07, - "loss": 0.2095, - "step": 1373100 - }, - { - "epoch": 13.99, - "learning_rate": 7.245754576828189e-07, - "loss": 0.2292, - "step": 1373200 - }, - { - "epoch": 13.99, - "learning_rate": 7.231270443002136e-07, - "loss": 0.2074, - "step": 1373300 - }, - { - "epoch": 13.99, - "learning_rate": 7.216800623819786e-07, - "loss": 0.2488, - "step": 1373400 - }, - { - "epoch": 13.99, - "learning_rate": 7.202345119988707e-07, - "loss": 0.2169, - "step": 1373500 - }, - { - "epoch": 13.99, - "learning_rate": 7.187903932215567e-07, - "loss": 0.2409, - "step": 1373600 - }, - { - "epoch": 14.0, - "learning_rate": 7.173477061206534e-07, - "loss": 0.1907, - "step": 1373700 - }, - { - "epoch": 14.0, - "learning_rate": 7.159064507667013e-07, - "loss": 0.2664, - "step": 1373800 - }, - { - "epoch": 14.0, - "learning_rate": 7.144666272301603e-07, - "loss": 0.2516, - "step": 1373900 - }, - { - "epoch": 14.0, - "learning_rate": 7.130282355814344e-07, - "loss": 0.1508, - "step": 1374000 - }, - { - "epoch": 14.0, - "learning_rate": 7.115912758908538e-07, - "loss": 0.2178, - "step": 1374100 - }, - { - "epoch": 14.0, - "learning_rate": 7.101557482286725e-07, - "loss": 0.2174, - "step": 1374200 - }, - { - "epoch": 14.0, - "learning_rate": 7.087216526650875e-07, - "loss": 0.1904, - "step": 1374300 - }, - { - "epoch": 14.0, - "learning_rate": 7.072889892702095e-07, - "loss": 0.2144, - "step": 1374400 - }, - { - "epoch": 14.0, - "learning_rate": 7.058720633358395e-07, - "loss": 0.2261, - "step": 1374500 - }, - { - "epoch": 14.0, - "learning_rate": 7.044422501650283e-07, - "loss": 0.1586, - "step": 1374600 - }, - { - "epoch": 14.01, - "learning_rate": 7.030138693721599e-07, - "loss": 0.1855, - "step": 1374700 - }, - { - "epoch": 14.01, - "learning_rate": 7.015869210270787e-07, - "loss": 0.2096, - "step": 1374800 - }, - { - "epoch": 14.01, - "learning_rate": 7.001614051995553e-07, - "loss": 0.2765, - "step": 1374900 - }, - { - "epoch": 14.01, - "learning_rate": 6.987373219592841e-07, - "loss": 0.2126, - "step": 1375000 - }, - { - "epoch": 14.01, - "learning_rate": 6.973146713758993e-07, - "loss": 0.2178, - "step": 1375100 - }, - { - "epoch": 14.01, - "learning_rate": 6.958934535189554e-07, - "loss": 0.2486, - "step": 1375200 - }, - { - "epoch": 14.01, - "learning_rate": 6.944736684579467e-07, - "loss": 0.2186, - "step": 1375300 - }, - { - "epoch": 14.01, - "learning_rate": 6.930553162622877e-07, - "loss": 0.1926, - "step": 1375400 - }, - { - "epoch": 14.01, - "learning_rate": 6.916383970013263e-07, - "loss": 0.1912, - "step": 1375500 - }, - { - "epoch": 14.01, - "learning_rate": 6.90222910744347e-07, - "loss": 0.197, - "step": 1375600 - }, - { - "epoch": 14.02, - "learning_rate": 6.888088575605545e-07, - "loss": 0.2234, - "step": 1375700 - }, - { - "epoch": 14.02, - "learning_rate": 6.8739623751909e-07, - "loss": 0.2101, - "step": 1375800 - }, - { - "epoch": 14.02, - "learning_rate": 6.85985050689022e-07, - "loss": 0.2333, - "step": 1375900 - }, - { - "epoch": 14.02, - "learning_rate": 6.845752971393482e-07, - "loss": 0.2454, - "step": 1376000 - }, - { - "epoch": 14.02, - "learning_rate": 6.831669769389937e-07, - "loss": 0.1732, - "step": 1376100 - }, - { - "epoch": 14.02, - "learning_rate": 6.817600901568266e-07, - "loss": 0.2336, - "step": 1376200 - }, - { - "epoch": 14.02, - "learning_rate": 6.803546368616254e-07, - "loss": 0.1923, - "step": 1376300 - }, - { - "epoch": 14.02, - "learning_rate": 6.789506171221083e-07, - "loss": 0.211, - "step": 1376400 - }, - { - "epoch": 14.02, - "learning_rate": 6.775480310069337e-07, - "loss": 0.2021, - "step": 1376500 - }, - { - "epoch": 14.03, - "learning_rate": 6.761468785846703e-07, - "loss": 0.2076, - "step": 1376600 - }, - { - "epoch": 14.03, - "learning_rate": 6.747611500130901e-07, - "loss": 0.2468, - "step": 1376700 - }, - { - "epoch": 14.03, - "learning_rate": 6.733628508434709e-07, - "loss": 0.1853, - "step": 1376800 - }, - { - "epoch": 14.03, - "learning_rate": 6.719659855713956e-07, - "loss": 0.1948, - "step": 1376900 - }, - { - "epoch": 14.03, - "learning_rate": 6.705705542651663e-07, - "loss": 0.1822, - "step": 1377000 - }, - { - "epoch": 14.03, - "learning_rate": 6.69176556993002e-07, - "loss": 0.2009, - "step": 1377100 - }, - { - "epoch": 14.03, - "learning_rate": 6.677839938230712e-07, - "loss": 0.2212, - "step": 1377200 - }, - { - "epoch": 14.03, - "learning_rate": 6.663928648234596e-07, - "loss": 0.2406, - "step": 1377300 - }, - { - "epoch": 14.03, - "learning_rate": 6.650031700621795e-07, - "loss": 0.1726, - "step": 1377400 - }, - { - "epoch": 14.03, - "learning_rate": 6.63614909607183e-07, - "loss": 0.1914, - "step": 1377500 - }, - { - "epoch": 14.04, - "learning_rate": 6.622280835263462e-07, - "loss": 0.1964, - "step": 1377600 - }, - { - "epoch": 14.04, - "learning_rate": 6.608426918874777e-07, - "loss": 0.2114, - "step": 1377700 - }, - { - "epoch": 14.04, - "learning_rate": 6.59458734758317e-07, - "loss": 0.212, - "step": 1377800 - }, - { - "epoch": 14.04, - "learning_rate": 6.580762122065264e-07, - "loss": 0.1895, - "step": 1377900 - }, - { - "epoch": 14.04, - "learning_rate": 6.566951242997054e-07, - "loss": 0.1641, - "step": 1378000 - }, - { - "epoch": 14.04, - "learning_rate": 6.553154711053799e-07, - "loss": 0.2579, - "step": 1378100 - }, - { - "epoch": 14.04, - "learning_rate": 6.539372526910092e-07, - "loss": 0.21, - "step": 1378200 - }, - { - "epoch": 14.04, - "learning_rate": 6.525604691239828e-07, - "loss": 0.1846, - "step": 1378300 - }, - { - "epoch": 14.04, - "learning_rate": 6.511851204716102e-07, - "loss": 0.1965, - "step": 1378400 - }, - { - "epoch": 14.04, - "learning_rate": 6.498112068011375e-07, - "loss": 0.2128, - "step": 1378500 - }, - { - "epoch": 14.05, - "learning_rate": 6.484387281797511e-07, - "loss": 0.1706, - "step": 1378600 - }, - { - "epoch": 14.05, - "learning_rate": 6.470676846745438e-07, - "loss": 0.2053, - "step": 1378700 - }, - { - "epoch": 14.05, - "learning_rate": 6.456980763525589e-07, - "loss": 0.1921, - "step": 1378800 - }, - { - "epoch": 14.05, - "learning_rate": 6.443299032807626e-07, - "loss": 0.2056, - "step": 1378900 - }, - { - "epoch": 14.05, - "learning_rate": 6.429631655260481e-07, - "loss": 0.2283, - "step": 1379000 - }, - { - "epoch": 14.05, - "learning_rate": 6.415978631552388e-07, - "loss": 0.2088, - "step": 1379100 - }, - { - "epoch": 14.05, - "learning_rate": 6.402339962350911e-07, - "loss": 0.2147, - "step": 1379200 - }, - { - "epoch": 14.05, - "learning_rate": 6.388715648322951e-07, - "loss": 0.2151, - "step": 1379300 - }, - { - "epoch": 14.05, - "learning_rate": 6.375105690134575e-07, - "loss": 0.1781, - "step": 1379400 - }, - { - "epoch": 14.05, - "learning_rate": 6.361510088451317e-07, - "loss": 0.2067, - "step": 1379500 - }, - { - "epoch": 14.06, - "learning_rate": 6.347928843937845e-07, - "loss": 0.1803, - "step": 1379600 - }, - { - "epoch": 14.06, - "learning_rate": 6.334361957258228e-07, - "loss": 0.1865, - "step": 1379700 - }, - { - "epoch": 14.06, - "learning_rate": 6.320809429075836e-07, - "loss": 0.2507, - "step": 1379800 - }, - { - "epoch": 14.06, - "learning_rate": 6.307271260053271e-07, - "loss": 0.1925, - "step": 1379900 - }, - { - "epoch": 14.06, - "learning_rate": 6.293747450852438e-07, - "loss": 0.2081, - "step": 1380000 - }, - { - "epoch": 14.06, - "learning_rate": 6.280238002134609e-07, - "loss": 0.2173, - "step": 1380100 - }, - { - "epoch": 14.06, - "learning_rate": 6.266742914560353e-07, - "loss": 0.2373, - "step": 1380200 - }, - { - "epoch": 14.06, - "learning_rate": 6.253262188789444e-07, - "loss": 0.2464, - "step": 1380300 - }, - { - "epoch": 14.06, - "learning_rate": 6.23979582548102e-07, - "loss": 0.201, - "step": 1380400 - }, - { - "epoch": 14.06, - "learning_rate": 6.22634382529349e-07, - "loss": 0.2276, - "step": 1380500 - }, - { - "epoch": 14.07, - "learning_rate": 6.212906188884626e-07, - "loss": 0.1941, - "step": 1380600 - }, - { - "epoch": 14.07, - "learning_rate": 6.199482916911403e-07, - "loss": 0.2484, - "step": 1380700 - }, - { - "epoch": 14.07, - "learning_rate": 6.186074010030196e-07, - "loss": 0.2423, - "step": 1380800 - }, - { - "epoch": 14.07, - "learning_rate": 6.172679468896514e-07, - "loss": 0.1975, - "step": 1380900 - }, - { - "epoch": 14.07, - "learning_rate": 6.159299294165365e-07, - "loss": 0.2437, - "step": 1381000 - }, - { - "epoch": 14.07, - "learning_rate": 6.145933486490929e-07, - "loss": 0.202, - "step": 1381100 - }, - { - "epoch": 14.07, - "learning_rate": 6.132582046526747e-07, - "loss": 0.1816, - "step": 1381200 - }, - { - "epoch": 14.07, - "learning_rate": 6.119244974925564e-07, - "loss": 0.1909, - "step": 1381300 - }, - { - "epoch": 14.07, - "learning_rate": 6.105922272339526e-07, - "loss": 0.1612, - "step": 1381400 - }, - { - "epoch": 14.07, - "learning_rate": 6.092613939420045e-07, - "loss": 0.2376, - "step": 1381500 - }, - { - "epoch": 14.08, - "learning_rate": 6.079319976817766e-07, - "loss": 0.1665, - "step": 1381600 - }, - { - "epoch": 14.08, - "learning_rate": 6.066040385182703e-07, - "loss": 0.1643, - "step": 1381700 - }, - { - "epoch": 14.08, - "learning_rate": 6.052907746222703e-07, - "loss": 0.1964, - "step": 1381800 - }, - { - "epoch": 14.08, - "learning_rate": 6.039656754743417e-07, - "loss": 0.1793, - "step": 1381900 - }, - { - "epoch": 14.08, - "learning_rate": 6.026420136170663e-07, - "loss": 0.2153, - "step": 1382000 - }, - { - "epoch": 14.08, - "learning_rate": 6.01319789115159e-07, - "loss": 0.208, - "step": 1382100 - }, - { - "epoch": 14.08, - "learning_rate": 5.999990020332746e-07, - "loss": 0.1943, - "step": 1382200 - }, - { - "epoch": 14.08, - "learning_rate": 5.986796524359884e-07, - "loss": 0.2236, - "step": 1382300 - }, - { - "epoch": 14.08, - "learning_rate": 5.973617403878084e-07, - "loss": 0.1855, - "step": 1382400 - }, - { - "epoch": 14.09, - "learning_rate": 5.960452659531735e-07, - "loss": 0.2359, - "step": 1382500 - }, - { - "epoch": 14.09, - "learning_rate": 5.947302291964552e-07, - "loss": 0.2403, - "step": 1382600 - }, - { - "epoch": 14.09, - "learning_rate": 5.934166301819455e-07, - "loss": 0.2001, - "step": 1382700 - }, - { - "epoch": 14.09, - "learning_rate": 5.921044689738697e-07, - "loss": 0.1828, - "step": 1382800 - }, - { - "epoch": 14.09, - "learning_rate": 5.907937456363932e-07, - "loss": 0.2559, - "step": 1382900 - }, - { - "epoch": 14.09, - "learning_rate": 5.894844602335947e-07, - "loss": 0.2229, - "step": 1383000 - }, - { - "epoch": 14.09, - "learning_rate": 5.88176612829493e-07, - "loss": 0.2073, - "step": 1383100 - }, - { - "epoch": 14.09, - "learning_rate": 5.868702034880336e-07, - "loss": 0.1828, - "step": 1383200 - }, - { - "epoch": 14.09, - "learning_rate": 5.855652322730953e-07, - "loss": 0.2384, - "step": 1383300 - }, - { - "epoch": 14.09, - "learning_rate": 5.842616992484773e-07, - "loss": 0.1898, - "step": 1383400 - }, - { - "epoch": 14.1, - "learning_rate": 5.829596044779218e-07, - "loss": 0.2207, - "step": 1383500 - }, - { - "epoch": 14.1, - "learning_rate": 5.816589480250912e-07, - "loss": 0.1993, - "step": 1383600 - }, - { - "epoch": 14.1, - "learning_rate": 5.803597299535746e-07, - "loss": 0.1938, - "step": 1383700 - }, - { - "epoch": 14.1, - "learning_rate": 5.790619503269013e-07, - "loss": 0.1448, - "step": 1383800 - }, - { - "epoch": 14.1, - "learning_rate": 5.777785654988843e-07, - "loss": 0.201, - "step": 1383900 - }, - { - "epoch": 14.1, - "learning_rate": 5.764836485661562e-07, - "loss": 0.186, - "step": 1384000 - }, - { - "epoch": 14.1, - "learning_rate": 5.751901702677864e-07, - "loss": 0.1587, - "step": 1384100 - }, - { - "epoch": 14.1, - "learning_rate": 5.73898130667021e-07, - "loss": 0.1755, - "step": 1384200 - }, - { - "epoch": 14.1, - "learning_rate": 5.726075298270294e-07, - "loss": 0.196, - "step": 1384300 - }, - { - "epoch": 14.1, - "learning_rate": 5.713183678109146e-07, - "loss": 0.2037, - "step": 1384400 - }, - { - "epoch": 14.11, - "learning_rate": 5.700306446817127e-07, - "loss": 0.1684, - "step": 1384500 - }, - { - "epoch": 14.11, - "learning_rate": 5.687443605023834e-07, - "loss": 0.1266, - "step": 1384600 - }, - { - "epoch": 14.11, - "learning_rate": 5.674595153358197e-07, - "loss": 0.1821, - "step": 1384700 - }, - { - "epoch": 14.11, - "learning_rate": 5.661761092448381e-07, - "loss": 0.2654, - "step": 1384800 - }, - { - "epoch": 14.11, - "learning_rate": 5.648941422921949e-07, - "loss": 0.1804, - "step": 1384900 - }, - { - "epoch": 14.11, - "learning_rate": 5.636136145405701e-07, - "loss": 0.2323, - "step": 1385000 - }, - { - "epoch": 14.11, - "learning_rate": 5.623345260525703e-07, - "loss": 0.1967, - "step": 1385100 - }, - { - "epoch": 14.11, - "learning_rate": 5.610568768907354e-07, - "loss": 0.1618, - "step": 1385200 - }, - { - "epoch": 14.11, - "learning_rate": 5.59780667117542e-07, - "loss": 0.2188, - "step": 1385300 - }, - { - "epoch": 14.11, - "learning_rate": 5.585058967953805e-07, - "loss": 0.2244, - "step": 1385400 - }, - { - "epoch": 14.12, - "learning_rate": 5.57232565986584e-07, - "loss": 0.2145, - "step": 1385500 - }, - { - "epoch": 14.12, - "learning_rate": 5.559606747534063e-07, - "loss": 0.1877, - "step": 1385600 - }, - { - "epoch": 14.12, - "learning_rate": 5.546902231580409e-07, - "loss": 0.1695, - "step": 1385700 - }, - { - "epoch": 14.12, - "learning_rate": 5.534212112626047e-07, - "loss": 0.2095, - "step": 1385800 - }, - { - "epoch": 14.12, - "learning_rate": 5.521536391291416e-07, - "loss": 0.1991, - "step": 1385900 - }, - { - "epoch": 14.12, - "learning_rate": 5.50887506819635e-07, - "loss": 0.2103, - "step": 1386000 - }, - { - "epoch": 14.12, - "learning_rate": 5.496228143959792e-07, - "loss": 0.2081, - "step": 1386100 - }, - { - "epoch": 14.12, - "learning_rate": 5.48359561920021e-07, - "loss": 0.1534, - "step": 1386200 - }, - { - "epoch": 14.12, - "learning_rate": 5.470977494535212e-07, - "loss": 0.2173, - "step": 1386300 - }, - { - "epoch": 14.12, - "learning_rate": 5.458373770581771e-07, - "loss": 0.1606, - "step": 1386400 - }, - { - "epoch": 14.13, - "learning_rate": 5.445784447956092e-07, - "loss": 0.2253, - "step": 1386500 - }, - { - "epoch": 14.13, - "learning_rate": 5.433209527273786e-07, - "loss": 0.2175, - "step": 1386600 - }, - { - "epoch": 14.13, - "learning_rate": 5.420649009149626e-07, - "loss": 0.1447, - "step": 1386700 - }, - { - "epoch": 14.13, - "learning_rate": 5.408102894197753e-07, - "loss": 0.179, - "step": 1386800 - }, - { - "epoch": 14.13, - "learning_rate": 5.395571183031611e-07, - "loss": 0.2296, - "step": 1386900 - }, - { - "epoch": 14.13, - "learning_rate": 5.383178978027859e-07, - "loss": 0.2671, - "step": 1387000 - }, - { - "epoch": 14.13, - "learning_rate": 5.37067593221754e-07, - "loss": 0.2065, - "step": 1387100 - }, - { - "epoch": 14.13, - "learning_rate": 5.35818729202292e-07, - "loss": 0.188, - "step": 1387200 - }, - { - "epoch": 14.13, - "learning_rate": 5.345713058054613e-07, - "loss": 0.2235, - "step": 1387300 - }, - { - "epoch": 14.14, - "learning_rate": 5.333253230922564e-07, - "loss": 0.1954, - "step": 1387400 - }, - { - "epoch": 14.14, - "learning_rate": 5.320807811235918e-07, - "loss": 0.2149, - "step": 1387500 - }, - { - "epoch": 14.14, - "learning_rate": 5.308501038397706e-07, - "loss": 0.2391, - "step": 1387600 - }, - { - "epoch": 14.14, - "learning_rate": 5.2960842913371e-07, - "loss": 0.1949, - "step": 1387700 - }, - { - "epoch": 14.14, - "learning_rate": 5.283681953539309e-07, - "loss": 0.2725, - "step": 1387800 - }, - { - "epoch": 14.14, - "learning_rate": 5.271294025610718e-07, - "loss": 0.1964, - "step": 1387900 - }, - { - "epoch": 14.14, - "learning_rate": 5.258920508156973e-07, - "loss": 0.1679, - "step": 1388000 - }, - { - "epoch": 14.14, - "learning_rate": 5.24656140178309e-07, - "loss": 0.1959, - "step": 1388100 - }, - { - "epoch": 14.14, - "learning_rate": 5.234216707093353e-07, - "loss": 0.2081, - "step": 1388200 - }, - { - "epoch": 14.14, - "learning_rate": 5.22188642469138e-07, - "loss": 0.1577, - "step": 1388300 - }, - { - "epoch": 14.15, - "learning_rate": 5.209570555179954e-07, - "loss": 0.1971, - "step": 1388400 - }, - { - "epoch": 14.15, - "learning_rate": 5.197269099161362e-07, - "loss": 0.2238, - "step": 1388500 - }, - { - "epoch": 14.15, - "learning_rate": 5.184982057236986e-07, - "loss": 0.2126, - "step": 1388600 - }, - { - "epoch": 14.15, - "learning_rate": 5.172709430007649e-07, - "loss": 0.1948, - "step": 1388700 - }, - { - "epoch": 14.15, - "learning_rate": 5.16045121807337e-07, - "loss": 0.1677, - "step": 1388800 - }, - { - "epoch": 14.15, - "learning_rate": 5.148207422033568e-07, - "loss": 0.2057, - "step": 1388900 - }, - { - "epoch": 14.15, - "learning_rate": 5.135978042486766e-07, - "loss": 0.1996, - "step": 1389000 - }, - { - "epoch": 14.15, - "learning_rate": 5.123763080031018e-07, - "loss": 0.2161, - "step": 1389100 - }, - { - "epoch": 14.15, - "learning_rate": 5.111562535263547e-07, - "loss": 0.2006, - "step": 1389200 - }, - { - "epoch": 14.15, - "learning_rate": 5.099376408780843e-07, - "loss": 0.2501, - "step": 1389300 - }, - { - "epoch": 14.16, - "learning_rate": 5.087204701178727e-07, - "loss": 0.2154, - "step": 1389400 - }, - { - "epoch": 14.16, - "learning_rate": 5.075047413052425e-07, - "loss": 0.1674, - "step": 1389500 - }, - { - "epoch": 14.16, - "learning_rate": 5.062904544996227e-07, - "loss": 0.183, - "step": 1389600 - }, - { - "epoch": 14.16, - "learning_rate": 5.050776097603893e-07, - "loss": 0.2123, - "step": 1389700 - }, - { - "epoch": 14.16, - "learning_rate": 5.038662071468481e-07, - "loss": 0.1631, - "step": 1389800 - }, - { - "epoch": 14.16, - "learning_rate": 5.026562467182216e-07, - "loss": 0.1917, - "step": 1389900 - }, - { - "epoch": 14.16, - "learning_rate": 5.014477285336761e-07, - "loss": 0.2145, - "step": 1390000 - }, - { - "epoch": 14.16, - "learning_rate": 5.002406526523007e-07, - "loss": 0.183, - "step": 1390100 - }, - { - "epoch": 14.16, - "learning_rate": 4.990350191331116e-07, - "loss": 0.1868, - "step": 1390200 - }, - { - "epoch": 14.16, - "learning_rate": 4.978308280350552e-07, - "loss": 0.2024, - "step": 1390300 - }, - { - "epoch": 14.17, - "learning_rate": 4.966280794170142e-07, - "loss": 0.1818, - "step": 1390400 - }, - { - "epoch": 14.17, - "learning_rate": 4.954267733377915e-07, - "loss": 0.2202, - "step": 1390500 - }, - { - "epoch": 14.17, - "learning_rate": 4.942269098561236e-07, - "loss": 0.1728, - "step": 1390600 - }, - { - "epoch": 14.17, - "learning_rate": 4.930284890306802e-07, - "loss": 0.1706, - "step": 1390700 - }, - { - "epoch": 14.17, - "learning_rate": 4.918315109200577e-07, - "loss": 0.1587, - "step": 1390800 - }, - { - "epoch": 14.17, - "learning_rate": 4.906359755827761e-07, - "loss": 0.1746, - "step": 1390900 - }, - { - "epoch": 14.17, - "learning_rate": 4.894418830772884e-07, - "loss": 0.1609, - "step": 1391000 - }, - { - "epoch": 14.17, - "learning_rate": 4.882492334619915e-07, - "loss": 0.2119, - "step": 1391100 - }, - { - "epoch": 14.17, - "learning_rate": 4.870580267951852e-07, - "loss": 0.1968, - "step": 1391200 - }, - { - "epoch": 14.17, - "learning_rate": 4.858682631351163e-07, - "loss": 0.1825, - "step": 1391300 - }, - { - "epoch": 14.18, - "learning_rate": 4.846799425399618e-07, - "loss": 0.2432, - "step": 1391400 - }, - { - "epoch": 14.18, - "learning_rate": 4.834930650678182e-07, - "loss": 0.2062, - "step": 1391500 - }, - { - "epoch": 14.18, - "learning_rate": 4.823076307767193e-07, - "loss": 0.1888, - "step": 1391600 - }, - { - "epoch": 14.18, - "learning_rate": 4.811236397246255e-07, - "loss": 0.1886, - "step": 1391700 - }, - { - "epoch": 14.18, - "learning_rate": 4.799410919694236e-07, - "loss": 0.1679, - "step": 1391800 - }, - { - "epoch": 14.18, - "learning_rate": 4.787599875689375e-07, - "loss": 0.2189, - "step": 1391900 - }, - { - "epoch": 14.18, - "learning_rate": 4.775803265809142e-07, - "loss": 0.1603, - "step": 1392000 - }, - { - "epoch": 14.18, - "learning_rate": 4.7640210906303104e-07, - "loss": 0.1994, - "step": 1392100 - }, - { - "epoch": 14.18, - "learning_rate": 4.7523709566714703e-07, - "loss": 0.1808, - "step": 1392200 - }, - { - "epoch": 14.18, - "learning_rate": 4.7406175082616353e-07, - "loss": 0.2467, - "step": 1392300 - }, - { - "epoch": 14.19, - "learning_rate": 4.728878496273592e-07, - "loss": 0.1827, - "step": 1392400 - }, - { - "epoch": 14.19, - "learning_rate": 4.717153921281314e-07, - "loss": 0.1897, - "step": 1392500 - }, - { - "epoch": 14.19, - "learning_rate": 4.70544378385801e-07, - "loss": 0.1695, - "step": 1392600 - }, - { - "epoch": 14.19, - "learning_rate": 4.6937480845763215e-07, - "loss": 0.1605, - "step": 1392700 - }, - { - "epoch": 14.19, - "learning_rate": 4.682066824008058e-07, - "loss": 0.2606, - "step": 1392800 - }, - { - "epoch": 14.19, - "learning_rate": 4.670400002724329e-07, - "loss": 0.2101, - "step": 1392900 - }, - { - "epoch": 14.19, - "learning_rate": 4.6587476212956115e-07, - "loss": 0.2361, - "step": 1393000 - }, - { - "epoch": 14.19, - "learning_rate": 4.6471096802916504e-07, - "loss": 0.1851, - "step": 1393100 - }, - { - "epoch": 14.19, - "learning_rate": 4.6354861802814564e-07, - "loss": 0.243, - "step": 1393200 - }, - { - "epoch": 14.2, - "learning_rate": 4.6238771218333087e-07, - "loss": 0.1852, - "step": 1393300 - }, - { - "epoch": 14.2, - "learning_rate": 4.6122825055148866e-07, - "loss": 0.1998, - "step": 1393400 - }, - { - "epoch": 14.2, - "learning_rate": 4.6007023318930695e-07, - "loss": 0.2208, - "step": 1393500 - }, - { - "epoch": 14.2, - "learning_rate": 4.589136601534039e-07, - "loss": 0.1844, - "step": 1393600 - }, - { - "epoch": 14.2, - "learning_rate": 4.577585315003308e-07, - "loss": 0.1984, - "step": 1393700 - }, - { - "epoch": 14.2, - "learning_rate": 4.5660484728656915e-07, - "loss": 0.1903, - "step": 1393800 - }, - { - "epoch": 14.2, - "learning_rate": 4.5545260756852057e-07, - "loss": 0.2057, - "step": 1393900 - }, - { - "epoch": 14.2, - "learning_rate": 4.5430181240253e-07, - "loss": 0.1688, - "step": 1394000 - }, - { - "epoch": 14.2, - "learning_rate": 4.5315246184486236e-07, - "loss": 0.2126, - "step": 1394100 - }, - { - "epoch": 14.2, - "learning_rate": 4.5200455595171275e-07, - "loss": 0.1581, - "step": 1394200 - }, - { - "epoch": 14.21, - "learning_rate": 4.5085809477920294e-07, - "loss": 0.1728, - "step": 1394300 - }, - { - "epoch": 14.21, - "learning_rate": 4.4971307838339805e-07, - "loss": 0.1743, - "step": 1394400 - }, - { - "epoch": 14.21, - "learning_rate": 4.4856950682027333e-07, - "loss": 0.2086, - "step": 1394500 - }, - { - "epoch": 14.21, - "learning_rate": 4.4742738014574404e-07, - "loss": 0.1823, - "step": 1394600 - }, - { - "epoch": 14.21, - "learning_rate": 4.462866984156588e-07, - "loss": 0.2249, - "step": 1394700 - }, - { - "epoch": 14.21, - "learning_rate": 4.4514746168578645e-07, - "loss": 0.1328, - "step": 1394800 - }, - { - "epoch": 14.21, - "learning_rate": 4.440096700118257e-07, - "loss": 0.217, - "step": 1394900 - }, - { - "epoch": 14.21, - "learning_rate": 4.4287332344941533e-07, - "loss": 0.2647, - "step": 1395000 - }, - { - "epoch": 14.21, - "learning_rate": 4.41738422054111e-07, - "loss": 0.1665, - "step": 1395100 - }, - { - "epoch": 14.21, - "learning_rate": 4.4060496588140155e-07, - "loss": 0.2088, - "step": 1395200 - }, - { - "epoch": 14.22, - "learning_rate": 4.3947295498670934e-07, - "loss": 0.1973, - "step": 1395300 - }, - { - "epoch": 14.22, - "learning_rate": 4.383423894253802e-07, - "loss": 0.1949, - "step": 1395400 - }, - { - "epoch": 14.22, - "learning_rate": 4.3721326925269643e-07, - "loss": 0.1474, - "step": 1395500 - }, - { - "epoch": 14.22, - "learning_rate": 4.360855945238573e-07, - "loss": 0.1958, - "step": 1395600 - }, - { - "epoch": 14.22, - "learning_rate": 4.34959365294012e-07, - "loss": 0.2716, - "step": 1395700 - }, - { - "epoch": 14.22, - "learning_rate": 4.338345816182132e-07, - "loss": 0.2024, - "step": 1395800 - }, - { - "epoch": 14.22, - "learning_rate": 4.327112435514602e-07, - "loss": 0.2038, - "step": 1395900 - }, - { - "epoch": 14.22, - "learning_rate": 4.315893511486824e-07, - "loss": 0.2189, - "step": 1396000 - }, - { - "epoch": 14.22, - "learning_rate": 4.304689044647292e-07, - "loss": 0.1734, - "step": 1396100 - }, - { - "epoch": 14.22, - "learning_rate": 4.293499035543835e-07, - "loss": 0.2226, - "step": 1396200 - }, - { - "epoch": 14.23, - "learning_rate": 4.282323484723616e-07, - "loss": 0.1769, - "step": 1396300 - }, - { - "epoch": 14.23, - "learning_rate": 4.2712739320799223e-07, - "loss": 0.2016, - "step": 1396400 - }, - { - "epoch": 14.23, - "learning_rate": 4.260238551064666e-07, - "loss": 0.2012, - "step": 1396500 - }, - { - "epoch": 14.23, - "learning_rate": 4.2491060891660103e-07, - "loss": 0.15, - "step": 1396600 - }, - { - "epoch": 14.23, - "learning_rate": 4.237988087721123e-07, - "loss": 0.1915, - "step": 1396700 - }, - { - "epoch": 14.23, - "learning_rate": 4.226884547273602e-07, - "loss": 0.1941, - "step": 1396800 - }, - { - "epoch": 14.23, - "learning_rate": 4.215795468366346e-07, - "loss": 0.1984, - "step": 1396900 - }, - { - "epoch": 14.23, - "learning_rate": 4.204720851541555e-07, - "loss": 0.1849, - "step": 1397000 - }, - { - "epoch": 14.23, - "learning_rate": 4.1936606973406624e-07, - "loss": 0.2001, - "step": 1397100 - }, - { - "epoch": 14.23, - "learning_rate": 4.182615006304502e-07, - "loss": 0.1795, - "step": 1397200 - }, - { - "epoch": 14.24, - "learning_rate": 4.1715837789731425e-07, - "loss": 0.2347, - "step": 1397300 - }, - { - "epoch": 14.24, - "learning_rate": 4.160677111917011e-07, - "loss": 0.2056, - "step": 1397400 - }, - { - "epoch": 14.24, - "learning_rate": 4.149674668962067e-07, - "loss": 0.2284, - "step": 1397500 - }, - { - "epoch": 14.24, - "learning_rate": 4.138686691322502e-07, - "loss": 0.2434, - "step": 1397600 - }, - { - "epoch": 14.24, - "learning_rate": 4.127713179535519e-07, - "loss": 0.2129, - "step": 1397700 - }, - { - "epoch": 14.24, - "learning_rate": 4.116754134137757e-07, - "loss": 0.1784, - "step": 1397800 - }, - { - "epoch": 14.24, - "learning_rate": 4.105809555664919e-07, - "loss": 0.2229, - "step": 1397900 - }, - { - "epoch": 14.24, - "learning_rate": 4.094879444652211e-07, - "loss": 0.2213, - "step": 1398000 - }, - { - "epoch": 14.24, - "learning_rate": 4.0839638016340385e-07, - "loss": 0.1849, - "step": 1398100 - }, - { - "epoch": 14.25, - "learning_rate": 4.073062627144075e-07, - "loss": 0.1949, - "step": 1398200 - }, - { - "epoch": 14.25, - "learning_rate": 4.062175921715361e-07, - "loss": 0.2545, - "step": 1398300 - }, - { - "epoch": 14.25, - "learning_rate": 4.051303685880203e-07, - "loss": 0.1914, - "step": 1398400 - }, - { - "epoch": 14.25, - "learning_rate": 4.040445920170144e-07, - "loss": 0.1827, - "step": 1398500 - }, - { - "epoch": 14.25, - "learning_rate": 4.029602625116091e-07, - "loss": 0.2008, - "step": 1398600 - }, - { - "epoch": 14.25, - "learning_rate": 4.0187738012482213e-07, - "loss": 0.1792, - "step": 1398700 - }, - { - "epoch": 14.25, - "learning_rate": 4.0079594490959437e-07, - "loss": 0.1552, - "step": 1398800 - }, - { - "epoch": 14.25, - "learning_rate": 3.9971595691881023e-07, - "loss": 0.2384, - "step": 1398900 - }, - { - "epoch": 14.25, - "learning_rate": 3.986374162052675e-07, - "loss": 0.2232, - "step": 1399000 - }, - { - "epoch": 14.25, - "learning_rate": 3.9756032282170727e-07, - "loss": 0.1909, - "step": 1399100 - }, - { - "epoch": 14.26, - "learning_rate": 3.964846768207875e-07, - "loss": 0.1698, - "step": 1399200 - }, - { - "epoch": 14.26, - "learning_rate": 3.9541047825510266e-07, - "loss": 0.255, - "step": 1399300 - }, - { - "epoch": 14.26, - "learning_rate": 3.943377271771742e-07, - "loss": 0.2387, - "step": 1399400 - }, - { - "epoch": 14.26, - "learning_rate": 3.932664236394534e-07, - "loss": 0.254, - "step": 1399500 - }, - { - "epoch": 14.26, - "learning_rate": 3.9219656769432177e-07, - "loss": 0.2257, - "step": 1399600 - }, - { - "epoch": 14.26, - "learning_rate": 3.911281593940874e-07, - "loss": 0.2108, - "step": 1399700 - }, - { - "epoch": 14.26, - "learning_rate": 3.900611987909886e-07, - "loss": 0.2215, - "step": 1399800 - }, - { - "epoch": 14.26, - "learning_rate": 3.889956859371935e-07, - "loss": 0.2236, - "step": 1399900 - }, - { - "epoch": 14.26, - "learning_rate": 3.879316208848005e-07, - "loss": 0.2375, - "step": 1400000 - }, - { - "epoch": 14.26, - "eval_cer": 0.054162580470109245, - "eval_loss": 0.315573126077652, - "eval_runtime": 9326.5244, - "eval_samples_per_second": 5.866, - "eval_steps_per_second": 0.367, - "eval_wer": 0.11891348088531187, - "step": 1400000 - }, - { - "epoch": 14.26, - "learning_rate": 3.86869003685838e-07, - "loss": 0.1934, - "step": 1400100 - }, - { - "epoch": 14.27, - "learning_rate": 3.8580783439225774e-07, - "loss": 0.1959, - "step": 1400200 - }, - { - "epoch": 14.27, - "learning_rate": 3.8475870310175186e-07, - "loss": 0.2004, - "step": 1400300 - }, - { - "epoch": 14.27, - "learning_rate": 3.8370041529417475e-07, - "loss": 0.1839, - "step": 1400400 - }, - { - "epoch": 14.27, - "learning_rate": 3.826435755469093e-07, - "loss": 0.2352, - "step": 1400500 - }, - { - "epoch": 14.27, - "learning_rate": 3.815881839116242e-07, - "loss": 0.1679, - "step": 1400600 - }, - { - "epoch": 14.27, - "learning_rate": 3.8053424043992145e-07, - "loss": 0.1999, - "step": 1400700 - }, - { - "epoch": 14.27, - "learning_rate": 3.7948174518333656e-07, - "loss": 0.1887, - "step": 1400800 - }, - { - "epoch": 14.27, - "learning_rate": 3.784306981933283e-07, - "loss": 0.2218, - "step": 1400900 - }, - { - "epoch": 14.27, - "learning_rate": 3.77381099521289e-07, - "loss": 0.1684, - "step": 1401000 - }, - { - "epoch": 14.27, - "learning_rate": 3.763329492185308e-07, - "loss": 0.2067, - "step": 1401100 - }, - { - "epoch": 14.28, - "learning_rate": 3.7528624733630947e-07, - "loss": 0.1933, - "step": 1401200 - }, - { - "epoch": 14.28, - "learning_rate": 3.7424099392579737e-07, - "loss": 0.1865, - "step": 1401300 - }, - { - "epoch": 14.28, - "learning_rate": 3.731971890381036e-07, - "loss": 0.2038, - "step": 1401400 - }, - { - "epoch": 14.28, - "learning_rate": 3.7215483272426745e-07, - "loss": 0.1879, - "step": 1401500 - }, - { - "epoch": 14.28, - "learning_rate": 3.71113925035248e-07, - "loss": 0.205, - "step": 1401600 - }, - { - "epoch": 14.28, - "learning_rate": 3.700744660219413e-07, - "loss": 0.1821, - "step": 1401700 - }, - { - "epoch": 14.28, - "learning_rate": 3.6903645573517333e-07, - "loss": 0.212, - "step": 1401800 - }, - { - "epoch": 14.28, - "learning_rate": 3.6799989422569346e-07, - "loss": 0.223, - "step": 1401900 - }, - { - "epoch": 14.28, - "learning_rate": 3.669647815441812e-07, - "loss": 0.2033, - "step": 1402000 - }, - { - "epoch": 14.28, - "learning_rate": 3.6593111774125274e-07, - "loss": 0.2208, - "step": 1402100 - }, - { - "epoch": 14.29, - "learning_rate": 3.6489890286744764e-07, - "loss": 0.2121, - "step": 1402200 - }, - { - "epoch": 14.29, - "learning_rate": 3.6386813697322884e-07, - "loss": 0.2131, - "step": 1402300 - }, - { - "epoch": 14.29, - "learning_rate": 3.6283882010899937e-07, - "loss": 0.2358, - "step": 1402400 - }, - { - "epoch": 14.29, - "learning_rate": 3.6181095232508896e-07, - "loss": 0.2021, - "step": 1402500 - }, - { - "epoch": 14.29, - "learning_rate": 3.607845336717508e-07, - "loss": 0.2058, - "step": 1402600 - }, - { - "epoch": 14.29, - "learning_rate": 3.597595641991713e-07, - "loss": 0.2058, - "step": 1402700 - }, - { - "epoch": 14.29, - "learning_rate": 3.5873604395746385e-07, - "loss": 0.2457, - "step": 1402800 - }, - { - "epoch": 14.29, - "learning_rate": 3.57713972996675e-07, - "loss": 0.1899, - "step": 1402900 - }, - { - "epoch": 14.29, - "learning_rate": 3.567035504087257e-07, - "loss": 0.1844, - "step": 1403000 - }, - { - "epoch": 14.3, - "learning_rate": 3.556843636655682e-07, - "loss": 0.2003, - "step": 1403100 - }, - { - "epoch": 14.3, - "learning_rate": 3.5466662635253334e-07, - "loss": 0.2235, - "step": 1403200 - }, - { - "epoch": 14.3, - "learning_rate": 3.5365033851939124e-07, - "loss": 0.2315, - "step": 1403300 - }, - { - "epoch": 14.3, - "learning_rate": 3.5263550021582214e-07, - "loss": 0.1889, - "step": 1403400 - }, - { - "epoch": 14.3, - "learning_rate": 3.51632238203119e-07, - "loss": 0.258, - "step": 1403500 - }, - { - "epoch": 14.3, - "learning_rate": 3.50620284610963e-07, - "loss": 0.1855, - "step": 1403600 - }, - { - "epoch": 14.3, - "learning_rate": 3.4960978069653813e-07, - "loss": 0.2425, - "step": 1403700 - }, - { - "epoch": 14.3, - "learning_rate": 3.486007265092517e-07, - "loss": 0.1947, - "step": 1403800 - }, - { - "epoch": 14.3, - "learning_rate": 3.475931220984374e-07, - "loss": 0.1692, - "step": 1403900 - }, - { - "epoch": 14.3, - "learning_rate": 3.465869675133626e-07, - "loss": 0.2303, - "step": 1404000 - }, - { - "epoch": 14.31, - "learning_rate": 3.4558226280322456e-07, - "loss": 0.2251, - "step": 1404100 - }, - { - "epoch": 14.31, - "learning_rate": 3.4457900801714737e-07, - "loss": 0.1505, - "step": 1404200 - }, - { - "epoch": 14.31, - "learning_rate": 3.435772032041784e-07, - "loss": 0.2383, - "step": 1404300 - }, - { - "epoch": 14.31, - "learning_rate": 3.425768484133085e-07, - "loss": 0.1873, - "step": 1404400 - }, - { - "epoch": 14.31, - "learning_rate": 3.4157794369344186e-07, - "loss": 0.189, - "step": 1404500 - }, - { - "epoch": 14.31, - "learning_rate": 3.4058048909341945e-07, - "loss": 0.227, - "step": 1404600 - }, - { - "epoch": 14.31, - "learning_rate": 3.3958448466201553e-07, - "loss": 0.2359, - "step": 1404700 - }, - { - "epoch": 14.31, - "learning_rate": 3.3858993044792783e-07, - "loss": 0.2039, - "step": 1404800 - }, - { - "epoch": 14.31, - "learning_rate": 3.3759682649977753e-07, - "loss": 0.1731, - "step": 1404900 - }, - { - "epoch": 14.31, - "learning_rate": 3.3660517286613233e-07, - "loss": 0.2045, - "step": 1405000 - }, - { - "epoch": 14.32, - "learning_rate": 3.356149695954669e-07, - "loss": 0.159, - "step": 1405100 - }, - { - "epoch": 14.32, - "learning_rate": 3.3462621673620574e-07, - "loss": 0.2304, - "step": 1405200 - }, - { - "epoch": 14.32, - "learning_rate": 3.336389143366869e-07, - "loss": 0.207, - "step": 1405300 - }, - { - "epoch": 14.32, - "learning_rate": 3.326530624451851e-07, - "loss": 0.2079, - "step": 1405400 - }, - { - "epoch": 14.32, - "learning_rate": 3.316686611099018e-07, - "loss": 0.1933, - "step": 1405500 - }, - { - "epoch": 14.32, - "learning_rate": 3.3068571037897176e-07, - "loss": 0.1882, - "step": 1405600 - }, - { - "epoch": 14.32, - "learning_rate": 3.297042103004533e-07, - "loss": 0.1945, - "step": 1405700 - }, - { - "epoch": 14.32, - "learning_rate": 3.2872416092233793e-07, - "loss": 0.2185, - "step": 1405800 - }, - { - "epoch": 14.32, - "learning_rate": 3.277455622925374e-07, - "loss": 0.1736, - "step": 1405900 - }, - { - "epoch": 14.32, - "learning_rate": 3.2676841445891003e-07, - "loss": 0.1721, - "step": 1406000 - }, - { - "epoch": 14.33, - "learning_rate": 3.25792717469221e-07, - "loss": 0.1797, - "step": 1406100 - }, - { - "epoch": 14.33, - "learning_rate": 3.248184713711855e-07, - "loss": 0.1878, - "step": 1406200 - }, - { - "epoch": 14.33, - "learning_rate": 3.23845676212432e-07, - "loss": 0.2051, - "step": 1406300 - }, - { - "epoch": 14.33, - "learning_rate": 3.2287433204052917e-07, - "loss": 0.2241, - "step": 1406400 - }, - { - "epoch": 14.33, - "learning_rate": 3.2190443890296237e-07, - "loss": 0.1911, - "step": 1406500 - }, - { - "epoch": 14.33, - "learning_rate": 3.209359968471637e-07, - "loss": 0.1842, - "step": 1406600 - }, - { - "epoch": 14.33, - "learning_rate": 3.199690059204785e-07, - "loss": 0.2067, - "step": 1406700 - }, - { - "epoch": 14.33, - "learning_rate": 3.1900346617018905e-07, - "loss": 0.2268, - "step": 1406800 - }, - { - "epoch": 14.33, - "learning_rate": 3.180393776435042e-07, - "loss": 0.2456, - "step": 1406900 - }, - { - "epoch": 14.33, - "learning_rate": 3.170767403875563e-07, - "loss": 0.1632, - "step": 1407000 - }, - { - "epoch": 14.34, - "learning_rate": 3.16115554449421e-07, - "loss": 0.2031, - "step": 1407100 - }, - { - "epoch": 14.34, - "learning_rate": 3.1515581987608734e-07, - "loss": 0.212, - "step": 1407200 - }, - { - "epoch": 14.34, - "learning_rate": 3.1419753671449114e-07, - "loss": 0.2412, - "step": 1407300 - }, - { - "epoch": 14.34, - "learning_rate": 3.132407050114716e-07, - "loss": 0.19, - "step": 1407400 - }, - { - "epoch": 14.34, - "learning_rate": 3.122853248138247e-07, - "loss": 0.1897, - "step": 1407500 - }, - { - "epoch": 14.34, - "learning_rate": 3.1133139616825955e-07, - "loss": 0.2226, - "step": 1407600 - }, - { - "epoch": 14.34, - "learning_rate": 3.103789191214157e-07, - "loss": 0.2243, - "step": 1407700 - }, - { - "epoch": 14.34, - "learning_rate": 3.094278937198591e-07, - "loss": 0.1816, - "step": 1407800 - }, - { - "epoch": 14.34, - "learning_rate": 3.0847832001009935e-07, - "loss": 0.1921, - "step": 1407900 - }, - { - "epoch": 14.34, - "learning_rate": 3.075301980385592e-07, - "loss": 0.2334, - "step": 1408000 - }, - { - "epoch": 14.35, - "learning_rate": 3.0658352785159826e-07, - "loss": 0.2164, - "step": 1408100 - }, - { - "epoch": 14.35, - "learning_rate": 3.056477544923475e-07, - "loss": 0.2118, - "step": 1408200 - }, - { - "epoch": 14.35, - "learning_rate": 3.04703973494328e-07, - "loss": 0.2029, - "step": 1408300 - }, - { - "epoch": 14.35, - "learning_rate": 3.0376164441907717e-07, - "loss": 0.1735, - "step": 1408400 - }, - { - "epoch": 14.35, - "learning_rate": 3.0282076731265816e-07, - "loss": 0.1516, - "step": 1408500 - }, - { - "epoch": 14.35, - "learning_rate": 3.0188134222108423e-07, - "loss": 0.2184, - "step": 1408600 - }, - { - "epoch": 14.35, - "learning_rate": 3.009433691902852e-07, - "loss": 0.2495, - "step": 1408700 - }, - { - "epoch": 14.35, - "learning_rate": 3.000068482661178e-07, - "loss": 0.1792, - "step": 1408800 - }, - { - "epoch": 14.35, - "learning_rate": 2.9907177949437204e-07, - "loss": 0.1768, - "step": 1408900 - }, - { - "epoch": 14.36, - "learning_rate": 2.981381629207747e-07, - "loss": 0.2052, - "step": 1409000 - }, - { - "epoch": 14.36, - "learning_rate": 2.972059985909692e-07, - "loss": 0.2071, - "step": 1409100 - }, - { - "epoch": 14.36, - "learning_rate": 2.962752865505325e-07, - "loss": 0.1965, - "step": 1409200 - }, - { - "epoch": 14.36, - "learning_rate": 2.9534602684497147e-07, - "loss": 0.1988, - "step": 1409300 - }, - { - "epoch": 14.36, - "learning_rate": 2.944182195197231e-07, - "loss": 0.1969, - "step": 1409400 - }, - { - "epoch": 14.36, - "learning_rate": 2.9349186462014787e-07, - "loss": 0.2292, - "step": 1409500 - }, - { - "epoch": 14.36, - "learning_rate": 2.9256696219154276e-07, - "loss": 0.2128, - "step": 1409600 - }, - { - "epoch": 14.36, - "learning_rate": 2.916435122791317e-07, - "loss": 0.2303, - "step": 1409700 - }, - { - "epoch": 14.36, - "learning_rate": 2.907215149280584e-07, - "loss": 0.2083, - "step": 1409800 - }, - { - "epoch": 14.36, - "learning_rate": 2.89800970183407e-07, - "loss": 0.2118, - "step": 1409900 - }, - { - "epoch": 14.37, - "learning_rate": 2.888818780901914e-07, - "loss": 0.2714, - "step": 1410000 - }, - { - "epoch": 14.37, - "learning_rate": 2.879642386933423e-07, - "loss": 0.2034, - "step": 1410100 - }, - { - "epoch": 14.37, - "learning_rate": 2.870480520377272e-07, - "loss": 0.286, - "step": 1410200 - }, - { - "epoch": 14.37, - "learning_rate": 2.861333181681469e-07, - "loss": 0.1986, - "step": 1410300 - }, - { - "epoch": 14.37, - "learning_rate": 2.8522003712932566e-07, - "loss": 0.2056, - "step": 1410400 - }, - { - "epoch": 14.37, - "learning_rate": 2.843082089659144e-07, - "loss": 0.2288, - "step": 1410500 - }, - { - "epoch": 14.37, - "learning_rate": 2.833978337225007e-07, - "loss": 0.178, - "step": 1410600 - }, - { - "epoch": 14.37, - "learning_rate": 2.8248891144358915e-07, - "loss": 0.2104, - "step": 1410700 - }, - { - "epoch": 14.37, - "learning_rate": 2.8158144217362736e-07, - "loss": 0.1839, - "step": 1410800 - }, - { - "epoch": 14.37, - "learning_rate": 2.8067542595698325e-07, - "loss": 0.2467, - "step": 1410900 - }, - { - "epoch": 14.38, - "learning_rate": 2.7977086283795474e-07, - "loss": 0.218, - "step": 1411000 - }, - { - "epoch": 14.38, - "learning_rate": 2.788677528607697e-07, - "loss": 0.2607, - "step": 1411100 - }, - { - "epoch": 14.38, - "learning_rate": 2.779660960695862e-07, - "loss": 0.1861, - "step": 1411200 - }, - { - "epoch": 14.38, - "learning_rate": 2.770658925084857e-07, - "loss": 0.1825, - "step": 1411300 - }, - { - "epoch": 14.38, - "learning_rate": 2.761671422214895e-07, - "loss": 0.2197, - "step": 1411400 - }, - { - "epoch": 14.38, - "learning_rate": 2.752698452525326e-07, - "loss": 0.1464, - "step": 1411500 - }, - { - "epoch": 14.38, - "learning_rate": 2.7437400164549654e-07, - "loss": 0.2427, - "step": 1411600 - }, - { - "epoch": 14.38, - "learning_rate": 2.734796114441762e-07, - "loss": 0.203, - "step": 1411700 - }, - { - "epoch": 14.38, - "learning_rate": 2.725866746923067e-07, - "loss": 0.2115, - "step": 1411800 - }, - { - "epoch": 14.38, - "learning_rate": 2.7169519143354307e-07, - "loss": 0.209, - "step": 1411900 - }, - { - "epoch": 14.39, - "learning_rate": 2.708051617114771e-07, - "loss": 0.2125, - "step": 1412000 - }, - { - "epoch": 14.39, - "learning_rate": 2.6991658556962396e-07, - "loss": 0.2238, - "step": 1412100 - }, - { - "epoch": 14.39, - "learning_rate": 2.6902946305142896e-07, - "loss": 0.195, - "step": 1412200 - }, - { - "epoch": 14.39, - "learning_rate": 2.6814379420026735e-07, - "loss": 0.2152, - "step": 1412300 - }, - { - "epoch": 14.39, - "learning_rate": 2.6726841401484515e-07, - "loss": 0.2234, - "step": 1412400 - }, - { - "epoch": 14.39, - "learning_rate": 2.663856380898455e-07, - "loss": 0.2014, - "step": 1412500 - }, - { - "epoch": 14.39, - "learning_rate": 2.6550431596114255e-07, - "loss": 0.223, - "step": 1412600 - }, - { - "epoch": 14.39, - "learning_rate": 2.6462444767183846e-07, - "loss": 0.1928, - "step": 1412700 - }, - { - "epoch": 14.39, - "learning_rate": 2.637460332649422e-07, - "loss": 0.1894, - "step": 1412800 - }, - { - "epoch": 14.39, - "learning_rate": 2.6286907278340933e-07, - "loss": 0.1982, - "step": 1412900 - }, - { - "epoch": 14.4, - "learning_rate": 2.6199356627011896e-07, - "loss": 0.2365, - "step": 1413000 - }, - { - "epoch": 14.4, - "learning_rate": 2.6111951376787346e-07, - "loss": 0.1752, - "step": 1413100 - }, - { - "epoch": 14.4, - "learning_rate": 2.6024691531941205e-07, - "loss": 0.2232, - "step": 1413200 - }, - { - "epoch": 14.4, - "learning_rate": 2.5937577096740384e-07, - "loss": 0.18, - "step": 1413300 - }, - { - "epoch": 14.4, - "learning_rate": 2.585060807544315e-07, - "loss": 0.1962, - "step": 1413400 - }, - { - "epoch": 14.4, - "learning_rate": 2.5763784472302434e-07, - "loss": 0.2041, - "step": 1413500 - }, - { - "epoch": 14.4, - "learning_rate": 2.5677106291563836e-07, - "loss": 0.2101, - "step": 1413600 - }, - { - "epoch": 14.4, - "learning_rate": 2.5590573537464633e-07, - "loss": 0.1921, - "step": 1413700 - }, - { - "epoch": 14.4, - "learning_rate": 2.5504186214235783e-07, - "loss": 0.1809, - "step": 1413800 - }, - { - "epoch": 14.41, - "learning_rate": 2.541794432610156e-07, - "loss": 0.2159, - "step": 1413900 - }, - { - "epoch": 14.41, - "learning_rate": 2.5331847877278604e-07, - "loss": 0.2292, - "step": 1414000 - }, - { - "epoch": 14.41, - "learning_rate": 2.5245896871976205e-07, - "loss": 0.214, - "step": 1414100 - }, - { - "epoch": 14.41, - "learning_rate": 2.5160091314397006e-07, - "loss": 0.2133, - "step": 1414200 - }, - { - "epoch": 14.41, - "learning_rate": 2.507443120873665e-07, - "loss": 0.2158, - "step": 1414300 - }, - { - "epoch": 14.41, - "learning_rate": 2.4989770985657026e-07, - "loss": 0.1902, - "step": 1414400 - }, - { - "epoch": 14.41, - "learning_rate": 2.4904400341767886e-07, - "loss": 0.1464, - "step": 1414500 - }, - { - "epoch": 14.41, - "learning_rate": 2.481917516229926e-07, - "loss": 0.1721, - "step": 1414600 - }, - { - "epoch": 14.41, - "learning_rate": 2.47340954514178e-07, - "loss": 0.176, - "step": 1414700 - }, - { - "epoch": 14.41, - "learning_rate": 2.4649161213283865e-07, - "loss": 0.1869, - "step": 1414800 - }, - { - "epoch": 14.42, - "learning_rate": 2.4564372452049786e-07, - "loss": 0.2168, - "step": 1414900 - }, - { - "epoch": 14.42, - "learning_rate": 2.4479729171861586e-07, - "loss": 0.198, - "step": 1415000 - }, - { - "epoch": 14.42, - "learning_rate": 2.439523137685762e-07, - "loss": 0.1802, - "step": 1415100 - }, - { - "epoch": 14.42, - "learning_rate": 2.4310879071169245e-07, - "loss": 0.2194, - "step": 1415200 - }, - { - "epoch": 14.42, - "learning_rate": 2.422667225892083e-07, - "loss": 0.1762, - "step": 1415300 - }, - { - "epoch": 14.42, - "learning_rate": 2.414261094422976e-07, - "loss": 0.2203, - "step": 1415400 - }, - { - "epoch": 14.42, - "learning_rate": 2.4058695131206067e-07, - "loss": 0.2207, - "step": 1415500 - }, - { - "epoch": 14.42, - "learning_rate": 2.397492482395247e-07, - "loss": 0.203, - "step": 1415600 - }, - { - "epoch": 14.42, - "learning_rate": 2.3891300026565366e-07, - "loss": 0.2151, - "step": 1415700 - }, - { - "epoch": 14.42, - "learning_rate": 2.3807820743133147e-07, - "loss": 0.2495, - "step": 1415800 - }, - { - "epoch": 14.43, - "learning_rate": 2.3724486977737548e-07, - "loss": 0.1774, - "step": 1415900 - }, - { - "epoch": 14.43, - "learning_rate": 2.3641298734452977e-07, - "loss": 0.2034, - "step": 1416000 - }, - { - "epoch": 14.43, - "learning_rate": 2.3558256017346847e-07, - "loss": 0.1667, - "step": 1416100 - }, - { - "epoch": 14.43, - "learning_rate": 2.3475358830479575e-07, - "loss": 0.2431, - "step": 1416200 - }, - { - "epoch": 14.43, - "learning_rate": 2.3392607177904258e-07, - "loss": 0.2481, - "step": 1416300 - }, - { - "epoch": 14.43, - "learning_rate": 2.3310001063666985e-07, - "loss": 0.2235, - "step": 1416400 - }, - { - "epoch": 14.43, - "learning_rate": 2.3227540491806864e-07, - "loss": 0.2294, - "step": 1416500 - }, - { - "epoch": 14.43, - "learning_rate": 2.3145225466355335e-07, - "loss": 0.215, - "step": 1416600 - }, - { - "epoch": 14.43, - "learning_rate": 2.3063055991337512e-07, - "loss": 0.2212, - "step": 1416700 - }, - { - "epoch": 14.43, - "learning_rate": 2.2981032070770515e-07, - "loss": 0.2246, - "step": 1416800 - }, - { - "epoch": 14.44, - "learning_rate": 2.2899153708665465e-07, - "loss": 0.1739, - "step": 1416900 - }, - { - "epoch": 14.44, - "learning_rate": 2.28174209090255e-07, - "loss": 0.1994, - "step": 1417000 - }, - { - "epoch": 14.44, - "learning_rate": 2.2735833675846418e-07, - "loss": 0.1758, - "step": 1417100 - }, - { - "epoch": 14.44, - "learning_rate": 2.2654392013117697e-07, - "loss": 0.2329, - "step": 1417200 - }, - { - "epoch": 14.44, - "learning_rate": 2.2573095924821486e-07, - "loss": 0.2287, - "step": 1417300 - }, - { - "epoch": 14.44, - "learning_rate": 2.2491945414932602e-07, - "loss": 0.1854, - "step": 1417400 - }, - { - "epoch": 14.44, - "learning_rate": 2.241094048741854e-07, - "loss": 0.1903, - "step": 1417500 - }, - { - "epoch": 14.44, - "learning_rate": 2.2330081146240465e-07, - "loss": 0.2217, - "step": 1417600 - }, - { - "epoch": 14.44, - "learning_rate": 2.2249367395351217e-07, - "loss": 0.257, - "step": 1417700 - }, - { - "epoch": 14.44, - "learning_rate": 2.2168799238697967e-07, - "loss": 0.1587, - "step": 1417800 - }, - { - "epoch": 14.45, - "learning_rate": 2.2088376680219235e-07, - "loss": 0.2158, - "step": 1417900 - }, - { - "epoch": 14.45, - "learning_rate": 2.2008099723847875e-07, - "loss": 0.2567, - "step": 1418000 - }, - { - "epoch": 14.45, - "learning_rate": 2.1927968373508744e-07, - "loss": 0.222, - "step": 1418100 - }, - { - "epoch": 14.45, - "learning_rate": 2.1847982633119712e-07, - "loss": 0.2449, - "step": 1418200 - }, - { - "epoch": 14.45, - "learning_rate": 2.1768142506591982e-07, - "loss": 0.1752, - "step": 1418300 - }, - { - "epoch": 14.45, - "learning_rate": 2.1688447997828432e-07, - "loss": 0.1575, - "step": 1418400 - }, - { - "epoch": 14.45, - "learning_rate": 2.160889911072661e-07, - "loss": 0.2165, - "step": 1418500 - }, - { - "epoch": 14.45, - "learning_rate": 2.1529495849175072e-07, - "loss": 0.22, - "step": 1418600 - }, - { - "epoch": 14.45, - "learning_rate": 2.145023821705705e-07, - "loss": 0.239, - "step": 1418700 - }, - { - "epoch": 14.45, - "learning_rate": 2.1371126218246773e-07, - "loss": 0.2044, - "step": 1418800 - }, - { - "epoch": 14.46, - "learning_rate": 2.129215985661348e-07, - "loss": 0.2092, - "step": 1418900 - }, - { - "epoch": 14.46, - "learning_rate": 2.1213339136017085e-07, - "loss": 0.1879, - "step": 1419000 - }, - { - "epoch": 14.46, - "learning_rate": 2.113466406031217e-07, - "loss": 0.1408, - "step": 1419100 - }, - { - "epoch": 14.46, - "learning_rate": 2.105691920664099e-07, - "loss": 0.2424, - "step": 1419200 - }, - { - "epoch": 14.46, - "learning_rate": 2.0978533975706927e-07, - "loss": 0.2032, - "step": 1419300 - }, - { - "epoch": 14.46, - "learning_rate": 2.0900294401144494e-07, - "loss": 0.1995, - "step": 1419400 - }, - { - "epoch": 14.46, - "learning_rate": 2.0822200486779297e-07, - "loss": 0.2042, - "step": 1419500 - }, - { - "epoch": 14.46, - "learning_rate": 2.0745030997884274e-07, - "loss": 0.2219, - "step": 1419600 - }, - { - "epoch": 14.46, - "learning_rate": 2.0667226958664298e-07, - "loss": 0.2007, - "step": 1419700 - }, - { - "epoch": 14.47, - "learning_rate": 2.0589568591037444e-07, - "loss": 0.2318, - "step": 1419800 - }, - { - "epoch": 14.47, - "learning_rate": 2.0512055898800674e-07, - "loss": 0.2155, - "step": 1419900 - }, - { - "epoch": 14.47, - "learning_rate": 2.043468888574329e-07, - "loss": 0.2334, - "step": 1420000 - }, - { - "epoch": 14.47, - "learning_rate": 2.0357467555648935e-07, - "loss": 0.1874, - "step": 1420100 - }, - { - "epoch": 14.47, - "learning_rate": 2.0280391912292583e-07, - "loss": 0.1717, - "step": 1420200 - }, - { - "epoch": 14.47, - "learning_rate": 2.0203461959442892e-07, - "loss": 0.1992, - "step": 1420300 - }, - { - "epoch": 14.47, - "learning_rate": 2.0126677700861852e-07, - "loss": 0.2148, - "step": 1420400 - }, - { - "epoch": 14.47, - "learning_rate": 2.005003914030279e-07, - "loss": 0.1274, - "step": 1420500 - }, - { - "epoch": 14.47, - "learning_rate": 1.9973546281513712e-07, - "loss": 0.202, - "step": 1420600 - }, - { - "epoch": 14.47, - "learning_rate": 1.9897199128234288e-07, - "loss": 0.2316, - "step": 1420700 - }, - { - "epoch": 14.48, - "learning_rate": 1.9820997684197538e-07, - "loss": 0.2668, - "step": 1420800 - }, - { - "epoch": 14.48, - "learning_rate": 1.9744941953129148e-07, - "loss": 0.1742, - "step": 1420900 - }, - { - "epoch": 14.48, - "learning_rate": 1.9669031938748138e-07, - "loss": 0.18, - "step": 1421000 - }, - { - "epoch": 14.48, - "learning_rate": 1.9593267644765546e-07, - "loss": 0.2254, - "step": 1421100 - }, - { - "epoch": 14.48, - "learning_rate": 1.9517649074886067e-07, - "loss": 0.198, - "step": 1421200 - }, - { - "epoch": 14.48, - "learning_rate": 1.944217623280675e-07, - "loss": 0.2121, - "step": 1421300 - }, - { - "epoch": 14.48, - "learning_rate": 1.936684912221831e-07, - "loss": 0.2157, - "step": 1421400 - }, - { - "epoch": 14.48, - "learning_rate": 1.9291667746803465e-07, - "loss": 0.2023, - "step": 1421500 - }, - { - "epoch": 14.48, - "learning_rate": 1.9216632110237942e-07, - "loss": 0.1913, - "step": 1421600 - }, - { - "epoch": 14.48, - "learning_rate": 1.9141742216190804e-07, - "loss": 0.213, - "step": 1421700 - }, - { - "epoch": 14.49, - "learning_rate": 1.906699806832346e-07, - "loss": 0.2428, - "step": 1421800 - }, - { - "epoch": 14.49, - "learning_rate": 1.899239967029065e-07, - "loss": 0.2529, - "step": 1421900 - }, - { - "epoch": 14.49, - "learning_rate": 1.891794702573979e-07, - "loss": 0.2234, - "step": 1422000 - }, - { - "epoch": 14.49, - "learning_rate": 1.8843640138311634e-07, - "loss": 0.2033, - "step": 1422100 - }, - { - "epoch": 14.49, - "learning_rate": 1.876947901163828e-07, - "loss": 0.1829, - "step": 1422200 - }, - { - "epoch": 14.49, - "learning_rate": 1.869546364934649e-07, - "loss": 0.2267, - "step": 1422300 - }, - { - "epoch": 14.49, - "learning_rate": 1.8621594055055368e-07, - "loss": 0.1933, - "step": 1422400 - }, - { - "epoch": 14.49, - "learning_rate": 1.8547870232376029e-07, - "loss": 0.1817, - "step": 1422500 - }, - { - "epoch": 14.49, - "learning_rate": 1.8474292184913587e-07, - "loss": 0.2166, - "step": 1422600 - }, - { - "epoch": 14.49, - "learning_rate": 1.8400859916265168e-07, - "loss": 0.1763, - "step": 1422700 - }, - { - "epoch": 14.5, - "learning_rate": 1.8327573430021894e-07, - "loss": 0.2089, - "step": 1422800 - }, - { - "epoch": 14.5, - "learning_rate": 1.8254432729766235e-07, - "loss": 0.2423, - "step": 1422900 - }, - { - "epoch": 14.5, - "learning_rate": 1.8181437819074664e-07, - "loss": 0.1788, - "step": 1423000 - }, - { - "epoch": 14.5, - "learning_rate": 1.8109316471004335e-07, - "loss": 0.2022, - "step": 1423100 - }, - { - "epoch": 14.5, - "learning_rate": 1.8037338018239923e-07, - "loss": 0.2311, - "step": 1423200 - }, - { - "epoch": 14.5, - "learning_rate": 1.7964777581586433e-07, - "loss": 0.1961, - "step": 1423300 - }, - { - "epoch": 14.5, - "learning_rate": 1.7892362948659703e-07, - "loss": 0.252, - "step": 1423400 - }, - { - "epoch": 14.5, - "learning_rate": 1.7820094123000231e-07, - "loss": 0.1657, - "step": 1423500 - }, - { - "epoch": 14.5, - "learning_rate": 1.7747971108141192e-07, - "loss": 0.1826, - "step": 1423600 - }, - { - "epoch": 14.5, - "learning_rate": 1.7675993907609434e-07, - "loss": 0.1501, - "step": 1423700 - }, - { - "epoch": 14.51, - "learning_rate": 1.7604162524923807e-07, - "loss": 0.218, - "step": 1423800 - }, - { - "epoch": 14.51, - "learning_rate": 1.7532476963596832e-07, - "loss": 0.2136, - "step": 1423900 - }, - { - "epoch": 14.51, - "learning_rate": 1.7460937227133046e-07, - "loss": 0.2229, - "step": 1424000 - }, - { - "epoch": 14.51, - "learning_rate": 1.7389543319030643e-07, - "loss": 0.1702, - "step": 1424100 - }, - { - "epoch": 14.51, - "learning_rate": 1.73182952427805e-07, - "loss": 0.1743, - "step": 1424200 - }, - { - "epoch": 14.51, - "learning_rate": 1.7247193001865836e-07, - "loss": 0.2272, - "step": 1424300 - }, - { - "epoch": 14.51, - "learning_rate": 1.7176236599763196e-07, - "loss": 0.1816, - "step": 1424400 - }, - { - "epoch": 14.51, - "learning_rate": 1.710542603994214e-07, - "loss": 0.1896, - "step": 1424500 - }, - { - "epoch": 14.51, - "learning_rate": 1.7034761325864568e-07, - "loss": 0.176, - "step": 1424600 - }, - { - "epoch": 14.52, - "learning_rate": 1.6964242460986045e-07, - "loss": 0.2085, - "step": 1424700 - }, - { - "epoch": 14.52, - "learning_rate": 1.689386944875415e-07, - "loss": 0.1882, - "step": 1424800 - }, - { - "epoch": 14.52, - "learning_rate": 1.68236422926098e-07, - "loss": 0.2101, - "step": 1424900 - }, - { - "epoch": 14.52, - "learning_rate": 1.675356099598657e-07, - "loss": 0.172, - "step": 1425000 - }, - { - "epoch": 14.52, - "learning_rate": 1.66836255623114e-07, - "loss": 0.2118, - "step": 1425100 - }, - { - "epoch": 14.52, - "learning_rate": 1.6613835995003213e-07, - "loss": 0.1902, - "step": 1425200 - }, - { - "epoch": 14.52, - "learning_rate": 1.6544192297474613e-07, - "loss": 0.1942, - "step": 1425300 - }, - { - "epoch": 14.52, - "learning_rate": 1.6474694473130548e-07, - "loss": 0.156, - "step": 1425400 - }, - { - "epoch": 14.52, - "learning_rate": 1.6405342525369293e-07, - "loss": 0.2409, - "step": 1425500 - }, - { - "epoch": 14.52, - "learning_rate": 1.6336136457581474e-07, - "loss": 0.2369, - "step": 1425600 - }, - { - "epoch": 14.53, - "learning_rate": 1.6267076273151382e-07, - "loss": 0.1972, - "step": 1425700 - }, - { - "epoch": 14.53, - "learning_rate": 1.6198161975454984e-07, - "loss": 0.1943, - "step": 1425800 - }, - { - "epoch": 14.53, - "learning_rate": 1.6129393567861918e-07, - "loss": 0.1644, - "step": 1425900 - }, - { - "epoch": 14.53, - "learning_rate": 1.6060771053734824e-07, - "loss": 0.2202, - "step": 1426000 - }, - { - "epoch": 14.53, - "learning_rate": 1.5992294436429021e-07, - "loss": 0.2041, - "step": 1426100 - }, - { - "epoch": 14.53, - "learning_rate": 1.5923963719292168e-07, - "loss": 0.2039, - "step": 1426200 - }, - { - "epoch": 14.53, - "learning_rate": 1.5855778905665585e-07, - "loss": 0.181, - "step": 1426300 - }, - { - "epoch": 14.53, - "learning_rate": 1.578773999888261e-07, - "loss": 0.1959, - "step": 1426400 - }, - { - "epoch": 14.53, - "learning_rate": 1.5719847002270582e-07, - "loss": 0.1755, - "step": 1426500 - }, - { - "epoch": 14.53, - "learning_rate": 1.5652099919149175e-07, - "loss": 0.1767, - "step": 1426600 - }, - { - "epoch": 14.54, - "learning_rate": 1.5584498752829745e-07, - "loss": 0.2131, - "step": 1426700 - }, - { - "epoch": 14.54, - "learning_rate": 1.5517043506618644e-07, - "loss": 0.2056, - "step": 1426800 - }, - { - "epoch": 14.54, - "learning_rate": 1.544973418381357e-07, - "loss": 0.2442, - "step": 1426900 - }, - { - "epoch": 14.54, - "learning_rate": 1.5382570787705885e-07, - "loss": 0.2094, - "step": 1427000 - }, - { - "epoch": 14.54, - "learning_rate": 1.5315553321578635e-07, - "loss": 0.2338, - "step": 1427100 - }, - { - "epoch": 14.54, - "learning_rate": 1.5248681788709528e-07, - "loss": 0.1638, - "step": 1427200 - }, - { - "epoch": 14.54, - "learning_rate": 1.5181956192367952e-07, - "loss": 0.1728, - "step": 1427300 - }, - { - "epoch": 14.54, - "learning_rate": 1.5115376535815962e-07, - "loss": 0.2184, - "step": 1427400 - }, - { - "epoch": 14.54, - "learning_rate": 1.5048942822309287e-07, - "loss": 0.2125, - "step": 1427500 - }, - { - "epoch": 14.54, - "learning_rate": 1.498265505509633e-07, - "loss": 0.2103, - "step": 1427600 - }, - { - "epoch": 14.55, - "learning_rate": 1.4916513237417495e-07, - "loss": 0.1977, - "step": 1427700 - }, - { - "epoch": 14.55, - "learning_rate": 1.48505173725072e-07, - "loss": 0.2221, - "step": 1427800 - }, - { - "epoch": 14.55, - "learning_rate": 1.4784667463592195e-07, - "loss": 0.1616, - "step": 1427900 - }, - { - "epoch": 14.55, - "learning_rate": 1.4718963513892236e-07, - "loss": 0.1818, - "step": 1428000 - }, - { - "epoch": 14.55, - "learning_rate": 1.4653405526619757e-07, - "loss": 0.1987, - "step": 1428100 - }, - { - "epoch": 14.55, - "learning_rate": 1.458799350498019e-07, - "loss": 0.2095, - "step": 1428200 - }, - { - "epoch": 14.55, - "learning_rate": 1.4522727452171313e-07, - "loss": 0.2101, - "step": 1428300 - }, - { - "epoch": 14.55, - "learning_rate": 1.4457607371384907e-07, - "loss": 0.2105, - "step": 1428400 - }, - { - "epoch": 14.55, - "learning_rate": 1.4392633265804756e-07, - "loss": 0.235, - "step": 1428500 - }, - { - "epoch": 14.55, - "learning_rate": 1.4327805138607653e-07, - "loss": 0.2212, - "step": 1428600 - }, - { - "epoch": 14.56, - "learning_rate": 1.4263122992963394e-07, - "loss": 0.1945, - "step": 1428700 - }, - { - "epoch": 14.56, - "learning_rate": 1.4198586832034453e-07, - "loss": 0.2177, - "step": 1428800 - }, - { - "epoch": 14.56, - "learning_rate": 1.4134196658975972e-07, - "loss": 0.2308, - "step": 1428900 - }, - { - "epoch": 14.56, - "learning_rate": 1.4069952476936765e-07, - "loss": 0.2084, - "step": 1429000 - }, - { - "epoch": 14.56, - "learning_rate": 1.4005854289057984e-07, - "loss": 0.1552, - "step": 1429100 - }, - { - "epoch": 14.56, - "learning_rate": 1.3941902098473457e-07, - "loss": 0.2081, - "step": 1429200 - }, - { - "epoch": 14.56, - "learning_rate": 1.3878095908309686e-07, - "loss": 0.2372, - "step": 1429300 - }, - { - "epoch": 14.56, - "learning_rate": 1.381443572168717e-07, - "loss": 0.2339, - "step": 1429400 - }, - { - "epoch": 14.56, - "learning_rate": 1.3750921541717755e-07, - "loss": 0.214, - "step": 1429500 - }, - { - "epoch": 14.57, - "learning_rate": 1.368755337150762e-07, - "loss": 0.2237, - "step": 1429600 - }, - { - "epoch": 14.57, - "learning_rate": 1.3624331214154294e-07, - "loss": 0.1608, - "step": 1429700 - }, - { - "epoch": 14.57, - "learning_rate": 1.3561255072749968e-07, - "loss": 0.2312, - "step": 1429800 - }, - { - "epoch": 14.57, - "learning_rate": 1.349832495037784e-07, - "loss": 0.2034, - "step": 1429900 - }, - { - "epoch": 14.57, - "learning_rate": 1.3435540850115113e-07, - "loss": 0.2329, - "step": 1430000 - }, - { - "epoch": 14.57, - "learning_rate": 1.337290277503167e-07, - "loss": 0.1773, - "step": 1430100 - }, - { - "epoch": 14.57, - "learning_rate": 1.3310410728189727e-07, - "loss": 0.2561, - "step": 1430200 - }, - { - "epoch": 14.57, - "learning_rate": 1.3248687449935904e-07, - "loss": 0.2569, - "step": 1430300 - }, - { - "epoch": 14.57, - "learning_rate": 1.3186486008378463e-07, - "loss": 0.1872, - "step": 1430400 - }, - { - "epoch": 14.57, - "learning_rate": 1.3124430604177562e-07, - "loss": 0.2419, - "step": 1430500 - }, - { - "epoch": 14.58, - "learning_rate": 1.3062521240367443e-07, - "loss": 0.2213, - "step": 1430600 - }, - { - "epoch": 14.58, - "learning_rate": 1.3000757919974682e-07, - "loss": 0.1973, - "step": 1430700 - }, - { - "epoch": 14.58, - "learning_rate": 1.2939140646019532e-07, - "loss": 0.2153, - "step": 1430800 - }, - { - "epoch": 14.58, - "learning_rate": 1.287828341080499e-07, - "loss": 0.185, - "step": 1430900 - }, - { - "epoch": 14.58, - "learning_rate": 1.2816956778216148e-07, - "loss": 0.1959, - "step": 1431000 - }, - { - "epoch": 14.58, - "learning_rate": 1.2755776201051728e-07, - "loss": 0.1721, - "step": 1431100 - }, - { - "epoch": 14.58, - "learning_rate": 1.269474168230267e-07, - "loss": 0.2047, - "step": 1431200 - }, - { - "epoch": 14.58, - "learning_rate": 1.2633853224953585e-07, - "loss": 0.2086, - "step": 1431300 - }, - { - "epoch": 14.58, - "learning_rate": 1.257311083198176e-07, - "loss": 0.217, - "step": 1431400 - }, - { - "epoch": 14.58, - "learning_rate": 1.2512514506356154e-07, - "loss": 0.2039, - "step": 1431500 - }, - { - "epoch": 14.59, - "learning_rate": 1.245206425104073e-07, - "loss": 0.1982, - "step": 1431600 - }, - { - "epoch": 14.59, - "learning_rate": 1.2391760068990454e-07, - "loss": 0.2017, - "step": 1431700 - }, - { - "epoch": 14.59, - "learning_rate": 1.233160196315397e-07, - "loss": 0.2036, - "step": 1431800 - }, - { - "epoch": 14.59, - "learning_rate": 1.2271589936472593e-07, - "loss": 0.1945, - "step": 1431900 - }, - { - "epoch": 14.59, - "learning_rate": 1.2211723991880642e-07, - "loss": 0.1792, - "step": 1432000 - }, - { - "epoch": 14.59, - "learning_rate": 1.2152004132305107e-07, - "loss": 0.2243, - "step": 1432100 - }, - { - "epoch": 14.59, - "learning_rate": 1.2092430360666317e-07, - "loss": 0.2319, - "step": 1432200 - }, - { - "epoch": 14.59, - "learning_rate": 1.2033002679876616e-07, - "loss": 0.2062, - "step": 1432300 - }, - { - "epoch": 14.59, - "learning_rate": 1.1973721092841672e-07, - "loss": 0.2132, - "step": 1432400 - }, - { - "epoch": 14.59, - "learning_rate": 1.1914585602460504e-07, - "loss": 0.1946, - "step": 1432500 - }, - { - "epoch": 14.6, - "learning_rate": 1.1855596211623799e-07, - "loss": 0.1502, - "step": 1432600 - }, - { - "epoch": 14.6, - "learning_rate": 1.179675292321658e-07, - "loss": 0.1987, - "step": 1432700 - }, - { - "epoch": 14.6, - "learning_rate": 1.1738055740114884e-07, - "loss": 0.1924, - "step": 1432800 - }, - { - "epoch": 14.6, - "learning_rate": 1.1679504665189744e-07, - "loss": 0.1746, - "step": 1432900 - }, - { - "epoch": 14.6, - "learning_rate": 1.1621099701303206e-07, - "loss": 0.2179, - "step": 1433000 - }, - { - "epoch": 14.6, - "learning_rate": 1.1562840851311318e-07, - "loss": 0.1678, - "step": 1433100 - }, - { - "epoch": 14.6, - "learning_rate": 1.1504728118062469e-07, - "loss": 0.2184, - "step": 1433200 - }, - { - "epoch": 14.6, - "learning_rate": 1.1446761504397718e-07, - "loss": 0.2252, - "step": 1433300 - }, - { - "epoch": 14.6, - "learning_rate": 1.1388941013151799e-07, - "loss": 0.2286, - "step": 1433400 - }, - { - "epoch": 14.6, - "learning_rate": 1.1331266647151784e-07, - "loss": 0.1877, - "step": 1433500 - }, - { - "epoch": 14.61, - "learning_rate": 1.1273738409217082e-07, - "loss": 0.2145, - "step": 1433600 - }, - { - "epoch": 14.61, - "learning_rate": 1.121635630216078e-07, - "loss": 0.2354, - "step": 1433700 - }, - { - "epoch": 14.61, - "learning_rate": 1.1159691965151208e-07, - "loss": 0.1992, - "step": 1433800 - }, - { - "epoch": 14.61, - "learning_rate": 1.1102600666882956e-07, - "loss": 0.2459, - "step": 1433900 - }, - { - "epoch": 14.61, - "learning_rate": 1.1045655507860742e-07, - "loss": 0.2001, - "step": 1434000 - }, - { - "epoch": 14.61, - "learning_rate": 1.0988856490869004e-07, - "loss": 0.2057, - "step": 1434100 - }, - { - "epoch": 14.61, - "learning_rate": 1.0932203618684189e-07, - "loss": 0.1938, - "step": 1434200 - }, - { - "epoch": 14.61, - "learning_rate": 1.087569689407708e-07, - "loss": 0.1999, - "step": 1434300 - }, - { - "epoch": 14.61, - "learning_rate": 1.0819336319810469e-07, - "loss": 0.2224, - "step": 1434400 - }, - { - "epoch": 14.61, - "learning_rate": 1.0763121898639483e-07, - "loss": 0.1968, - "step": 1434500 - }, - { - "epoch": 14.62, - "learning_rate": 1.0707053633312925e-07, - "loss": 0.2307, - "step": 1434600 - }, - { - "epoch": 14.62, - "learning_rate": 1.06511315265726e-07, - "loss": 0.1975, - "step": 1434700 - }, - { - "epoch": 14.62, - "learning_rate": 1.0595355581152322e-07, - "loss": 0.1965, - "step": 1434800 - }, - { - "epoch": 14.62, - "learning_rate": 1.0539725799779243e-07, - "loss": 0.2139, - "step": 1434900 - }, - { - "epoch": 14.62, - "learning_rate": 1.0484242185173187e-07, - "loss": 0.2093, - "step": 1435000 - }, - { - "epoch": 14.62, - "learning_rate": 1.0428904740047652e-07, - "loss": 0.2234, - "step": 1435100 - }, - { - "epoch": 14.62, - "learning_rate": 1.0373713467107471e-07, - "loss": 0.2333, - "step": 1435200 - }, - { - "epoch": 14.62, - "learning_rate": 1.031866836905182e-07, - "loss": 0.1831, - "step": 1435300 - }, - { - "epoch": 14.62, - "learning_rate": 1.026376944857188e-07, - "loss": 0.1656, - "step": 1435400 - }, - { - "epoch": 14.63, - "learning_rate": 1.0209016708351505e-07, - "loss": 0.2338, - "step": 1435500 - }, - { - "epoch": 14.63, - "learning_rate": 1.0154410151068217e-07, - "loss": 0.2262, - "step": 1435600 - }, - { - "epoch": 14.63, - "learning_rate": 1.0099949779391881e-07, - "loss": 0.1537, - "step": 1435700 - }, - { - "epoch": 14.63, - "learning_rate": 1.0045635595985036e-07, - "loss": 0.2081, - "step": 1435800 - }, - { - "epoch": 14.63, - "learning_rate": 9.991467603503557e-08, - "loss": 0.1948, - "step": 1435900 - }, - { - "epoch": 14.63, - "learning_rate": 9.937445804595657e-08, - "loss": 0.2263, - "step": 1436000 - }, - { - "epoch": 14.63, - "learning_rate": 9.883570201903225e-08, - "loss": 0.2592, - "step": 1436100 - }, - { - "epoch": 14.63, - "learning_rate": 9.829840798060152e-08, - "loss": 0.1886, - "step": 1436200 - }, - { - "epoch": 14.63, - "learning_rate": 9.776257595693005e-08, - "loss": 0.2336, - "step": 1436300 - }, - { - "epoch": 14.63, - "learning_rate": 9.722820597422689e-08, - "loss": 0.2219, - "step": 1436400 - }, - { - "epoch": 14.64, - "learning_rate": 9.669529805861111e-08, - "loss": 0.1937, - "step": 1436500 - }, - { - "epoch": 14.64, - "learning_rate": 9.61691594569214e-08, - "loss": 0.2555, - "step": 1436600 - }, - { - "epoch": 14.64, - "learning_rate": 9.563916113226379e-08, - "loss": 0.1777, - "step": 1436700 - }, - { - "epoch": 14.64, - "learning_rate": 9.511062495239143e-08, - "loss": 0.1838, - "step": 1436800 - }, - { - "epoch": 14.64, - "learning_rate": 9.458355094314697e-08, - "loss": 0.2266, - "step": 1436900 - }, - { - "epoch": 14.64, - "learning_rate": 9.40579391302998e-08, - "loss": 0.1942, - "step": 1437000 - }, - { - "epoch": 14.64, - "learning_rate": 9.353378953954938e-08, - "loss": 0.2122, - "step": 1437100 - }, - { - "epoch": 14.64, - "learning_rate": 9.30111021965252e-08, - "loss": 0.1645, - "step": 1437200 - }, - { - "epoch": 14.64, - "learning_rate": 9.248987712678347e-08, - "loss": 0.214, - "step": 1437300 - }, - { - "epoch": 14.64, - "learning_rate": 9.197011435580716e-08, - "loss": 0.202, - "step": 1437400 - }, - { - "epoch": 14.65, - "learning_rate": 9.145181390900925e-08, - "loss": 0.1924, - "step": 1437500 - }, - { - "epoch": 14.65, - "learning_rate": 9.093497581173616e-08, - "loss": 0.1819, - "step": 1437600 - }, - { - "epoch": 14.65, - "learning_rate": 9.041960008925099e-08, - "loss": 0.217, - "step": 1437700 - }, - { - "epoch": 14.65, - "learning_rate": 8.990568676676025e-08, - "loss": 0.1932, - "step": 1437800 - }, - { - "epoch": 14.65, - "learning_rate": 8.939323586938386e-08, - "loss": 0.201, - "step": 1437900 - }, - { - "epoch": 14.65, - "learning_rate": 8.888224742218176e-08, - "loss": 0.1855, - "step": 1438000 - }, - { - "epoch": 14.65, - "learning_rate": 8.83727214501373e-08, - "loss": 0.238, - "step": 1438100 - }, - { - "epoch": 14.65, - "learning_rate": 8.78646579781639e-08, - "loss": 0.1598, - "step": 1438200 - }, - { - "epoch": 14.65, - "learning_rate": 8.7358057031105e-08, - "loss": 0.1887, - "step": 1438300 - }, - { - "epoch": 14.65, - "learning_rate": 8.685291863372413e-08, - "loss": 0.2125, - "step": 1438400 - }, - { - "epoch": 14.66, - "learning_rate": 8.634924281072487e-08, - "loss": 0.2075, - "step": 1438500 - }, - { - "epoch": 14.66, - "learning_rate": 8.584702958673086e-08, - "loss": 0.2103, - "step": 1438600 - }, - { - "epoch": 14.66, - "learning_rate": 8.534627898629909e-08, - "loss": 0.2312, - "step": 1438700 - }, - { - "epoch": 14.66, - "learning_rate": 8.48469910339167e-08, - "loss": 0.2124, - "step": 1438800 - }, - { - "epoch": 14.66, - "learning_rate": 8.434916575398744e-08, - "loss": 0.2376, - "step": 1438900 - }, - { - "epoch": 14.66, - "learning_rate": 8.385280317085853e-08, - "loss": 0.1757, - "step": 1439000 - }, - { - "epoch": 14.66, - "learning_rate": 8.335790330879722e-08, - "loss": 0.1961, - "step": 1439100 - }, - { - "epoch": 14.66, - "learning_rate": 8.286446619200083e-08, - "loss": 0.1836, - "step": 1439200 - }, - { - "epoch": 14.66, - "learning_rate": 8.237249184459672e-08, - "loss": 0.2417, - "step": 1439300 - }, - { - "epoch": 14.66, - "learning_rate": 8.188198029063899e-08, - "loss": 0.2154, - "step": 1439400 - }, - { - "epoch": 14.67, - "learning_rate": 8.139293155410843e-08, - "loss": 0.1574, - "step": 1439500 - }, - { - "epoch": 14.67, - "learning_rate": 8.09053456589226e-08, - "loss": 0.211, - "step": 1439600 - }, - { - "epoch": 14.67, - "learning_rate": 8.041922262891577e-08, - "loss": 0.2026, - "step": 1439700 - }, - { - "epoch": 14.67, - "learning_rate": 7.993456248785558e-08, - "loss": 0.2018, - "step": 1439800 - }, - { - "epoch": 14.67, - "learning_rate": 7.945136525944307e-08, - "loss": 0.1629, - "step": 1439900 - }, - { - "epoch": 14.67, - "learning_rate": 7.89696309673027e-08, - "loss": 0.2521, - "step": 1440000 - }, - { - "epoch": 14.67, - "learning_rate": 7.848935963498893e-08, - "loss": 0.2006, - "step": 1440100 - }, - { - "epoch": 14.67, - "learning_rate": 7.801055128597967e-08, - "loss": 0.198, - "step": 1440200 - }, - { - "epoch": 14.67, - "learning_rate": 7.753320594369284e-08, - "loss": 0.2044, - "step": 1440300 - }, - { - "epoch": 14.68, - "learning_rate": 7.705732363146644e-08, - "loss": 0.2293, - "step": 1440400 - }, - { - "epoch": 14.68, - "learning_rate": 7.658290437256188e-08, - "loss": 0.2141, - "step": 1440500 - }, - { - "epoch": 14.68, - "learning_rate": 7.610994819018058e-08, - "loss": 0.1699, - "step": 1440600 - }, - { - "epoch": 14.68, - "learning_rate": 7.563845510744737e-08, - "loss": 0.1848, - "step": 1440700 - }, - { - "epoch": 14.68, - "learning_rate": 7.516842514741385e-08, - "loss": 0.1659, - "step": 1440800 - }, - { - "epoch": 14.68, - "learning_rate": 7.469985833306159e-08, - "loss": 0.2022, - "step": 1440900 - }, - { - "epoch": 14.68, - "learning_rate": 7.423741848100019e-08, - "loss": 0.2156, - "step": 1441000 - }, - { - "epoch": 14.68, - "learning_rate": 7.377176339464331e-08, - "loss": 0.217, - "step": 1441100 - }, - { - "epoch": 14.68, - "learning_rate": 7.330757152225975e-08, - "loss": 0.2187, - "step": 1441200 - }, - { - "epoch": 14.68, - "learning_rate": 7.284484288653803e-08, - "loss": 0.1582, - "step": 1441300 - }, - { - "epoch": 14.69, - "learning_rate": 7.238357751011005e-08, - "loss": 0.201, - "step": 1441400 - }, - { - "epoch": 14.69, - "learning_rate": 7.192377541552774e-08, - "loss": 0.182, - "step": 1441500 - }, - { - "epoch": 14.69, - "learning_rate": 7.146543662526983e-08, - "loss": 0.197, - "step": 1441600 - }, - { - "epoch": 14.69, - "learning_rate": 7.100856116174836e-08, - "loss": 0.1976, - "step": 1441700 - }, - { - "epoch": 14.69, - "learning_rate": 7.055314904730214e-08, - "loss": 0.2165, - "step": 1441800 - }, - { - "epoch": 14.69, - "learning_rate": 7.00992003041967e-08, - "loss": 0.1645, - "step": 1441900 - }, - { - "epoch": 14.69, - "learning_rate": 6.96467149546276e-08, - "loss": 0.1716, - "step": 1442000 - }, - { - "epoch": 14.69, - "learning_rate": 6.919569302072048e-08, - "loss": 0.2125, - "step": 1442100 - }, - { - "epoch": 14.69, - "learning_rate": 6.874613452452438e-08, - "loss": 0.1801, - "step": 1442200 - }, - { - "epoch": 14.69, - "learning_rate": 6.829803948802505e-08, - "loss": 0.247, - "step": 1442300 - }, - { - "epoch": 14.7, - "learning_rate": 6.785140793312494e-08, - "loss": 0.2013, - "step": 1442400 - }, - { - "epoch": 14.7, - "learning_rate": 6.740623988166661e-08, - "loss": 0.1739, - "step": 1442500 - }, - { - "epoch": 14.7, - "learning_rate": 6.696253535541264e-08, - "loss": 0.1993, - "step": 1442600 - }, - { - "epoch": 14.7, - "learning_rate": 6.652029437606232e-08, - "loss": 0.2091, - "step": 1442700 - }, - { - "epoch": 14.7, - "learning_rate": 6.607951696523839e-08, - "loss": 0.162, - "step": 1442800 - }, - { - "epoch": 14.7, - "learning_rate": 6.56402031444836e-08, - "loss": 0.1642, - "step": 1442900 - }, - { - "epoch": 14.7, - "learning_rate": 6.520235293528742e-08, - "loss": 0.2367, - "step": 1443000 - }, - { - "epoch": 14.7, - "learning_rate": 6.476596635905607e-08, - "loss": 0.202, - "step": 1443100 - }, - { - "epoch": 14.7, - "learning_rate": 6.433104343712249e-08, - "loss": 0.1882, - "step": 1443200 - }, - { - "epoch": 14.7, - "learning_rate": 6.390191153795688e-08, - "loss": 0.1843, - "step": 1443300 - }, - { - "epoch": 14.71, - "learning_rate": 6.34785272106464e-08, - "loss": 0.205, - "step": 1443400 - }, - { - "epoch": 14.71, - "learning_rate": 6.304795146707343e-08, - "loss": 0.2104, - "step": 1443500 - }, - { - "epoch": 14.71, - "learning_rate": 6.261883946180436e-08, - "loss": 0.168, - "step": 1443600 - }, - { - "epoch": 14.71, - "learning_rate": 6.219119121581573e-08, - "loss": 0.2349, - "step": 1443700 - }, - { - "epoch": 14.71, - "learning_rate": 6.17650067500175e-08, - "loss": 0.1979, - "step": 1443800 - }, - { - "epoch": 14.71, - "learning_rate": 6.134028608524967e-08, - "loss": 0.2192, - "step": 1443900 - }, - { - "epoch": 14.71, - "learning_rate": 6.09170292422756e-08, - "loss": 0.2286, - "step": 1444000 - }, - { - "epoch": 14.71, - "learning_rate": 6.049523624179542e-08, - "loss": 0.1581, - "step": 1444100 - }, - { - "epoch": 14.71, - "learning_rate": 6.007490710442597e-08, - "loss": 0.2618, - "step": 1444200 - }, - { - "epoch": 14.71, - "learning_rate": 5.965604185072083e-08, - "loss": 0.1825, - "step": 1444300 - }, - { - "epoch": 14.72, - "learning_rate": 5.9238640501160235e-08, - "loss": 0.2146, - "step": 1444400 - }, - { - "epoch": 14.72, - "learning_rate": 5.882270307615456e-08, - "loss": 0.1932, - "step": 1444500 - }, - { - "epoch": 14.72, - "learning_rate": 5.8408229596040865e-08, - "loss": 0.2597, - "step": 1444600 - }, - { - "epoch": 14.72, - "learning_rate": 5.79952200810796e-08, - "loss": 0.1506, - "step": 1444700 - }, - { - "epoch": 14.72, - "learning_rate": 5.758367455146796e-08, - "loss": 0.2036, - "step": 1444800 - }, - { - "epoch": 14.72, - "learning_rate": 5.717359302732983e-08, - "loss": 0.1882, - "step": 1444900 - }, - { - "epoch": 14.72, - "learning_rate": 5.6764975528712514e-08, - "loss": 0.1704, - "step": 1445000 - }, - { - "epoch": 14.72, - "learning_rate": 5.6357822075593366e-08, - "loss": 0.1686, - "step": 1445100 - }, - { - "epoch": 14.72, - "learning_rate": 5.5952132687883125e-08, - "loss": 0.1888, - "step": 1445200 - }, - { - "epoch": 14.72, - "learning_rate": 5.554790738541926e-08, - "loss": 0.149, - "step": 1445300 - }, - { - "epoch": 14.73, - "learning_rate": 5.514514618796263e-08, - "loss": 0.1875, - "step": 1445400 - }, - { - "epoch": 14.73, - "learning_rate": 5.4743849115207464e-08, - "loss": 0.2, - "step": 1445500 - }, - { - "epoch": 14.73, - "learning_rate": 5.4344016186771425e-08, - "loss": 0.2462, - "step": 1445600 - }, - { - "epoch": 14.73, - "learning_rate": 5.394564742220887e-08, - "loss": 0.176, - "step": 1445700 - }, - { - "epoch": 14.73, - "learning_rate": 5.3548742840997535e-08, - "loss": 0.1443, - "step": 1445800 - }, - { - "epoch": 14.73, - "learning_rate": 5.3153302462538577e-08, - "loss": 0.2075, - "step": 1445900 - }, - { - "epoch": 14.73, - "learning_rate": 5.2759326306169867e-08, - "loss": 0.1896, - "step": 1446000 - }, - { - "epoch": 14.73, - "learning_rate": 5.236681439115598e-08, - "loss": 0.2133, - "step": 1446100 - }, - { - "epoch": 14.73, - "learning_rate": 5.1975766736684916e-08, - "loss": 0.196, - "step": 1446200 - }, - { - "epoch": 14.74, - "learning_rate": 5.1586183361878037e-08, - "loss": 0.1778, - "step": 1446300 - }, - { - "epoch": 14.74, - "learning_rate": 5.1198064285783444e-08, - "loss": 0.2375, - "step": 1446400 - }, - { - "epoch": 14.74, - "learning_rate": 5.081140952738261e-08, - "loss": 0.185, - "step": 1446500 - }, - { - "epoch": 14.74, - "learning_rate": 5.042621910557377e-08, - "loss": 0.174, - "step": 1446600 - }, - { - "epoch": 14.74, - "learning_rate": 5.004249303919184e-08, - "loss": 0.1906, - "step": 1446700 - }, - { - "epoch": 14.74, - "learning_rate": 4.966023134700182e-08, - "loss": 0.2078, - "step": 1446800 - }, - { - "epoch": 14.74, - "learning_rate": 4.9279434047692084e-08, - "loss": 0.2539, - "step": 1446900 - }, - { - "epoch": 14.74, - "learning_rate": 4.8900101159881085e-08, - "loss": 0.2102, - "step": 1447000 - }, - { - "epoch": 14.74, - "learning_rate": 4.852223270211398e-08, - "loss": 0.1696, - "step": 1447100 - }, - { - "epoch": 14.74, - "learning_rate": 4.814582869287265e-08, - "loss": 0.1994, - "step": 1447200 - }, - { - "epoch": 14.75, - "learning_rate": 4.777088915055572e-08, - "loss": 0.2103, - "step": 1447300 - }, - { - "epoch": 14.75, - "learning_rate": 4.739741409349518e-08, - "loss": 0.2112, - "step": 1447400 - }, - { - "epoch": 14.75, - "learning_rate": 4.70254035399531e-08, - "loss": 0.2281, - "step": 1447500 - }, - { - "epoch": 14.75, - "learning_rate": 4.665485750812159e-08, - "loss": 0.177, - "step": 1447600 - }, - { - "epoch": 14.75, - "learning_rate": 4.628577601611283e-08, - "loss": 0.1839, - "step": 1447700 - }, - { - "epoch": 14.75, - "learning_rate": 4.591815908197572e-08, - "loss": 0.2113, - "step": 1447800 - }, - { - "epoch": 14.75, - "learning_rate": 4.555200672368587e-08, - "loss": 0.174, - "step": 1447900 - }, - { - "epoch": 14.75, - "learning_rate": 4.51873189591423e-08, - "loss": 0.2018, - "step": 1448000 - }, - { - "epoch": 14.75, - "learning_rate": 4.482409580617741e-08, - "loss": 0.2139, - "step": 1448100 - }, - { - "epoch": 14.75, - "learning_rate": 4.4462337282553664e-08, - "loss": 0.1494, - "step": 1448200 - }, - { - "epoch": 14.76, - "learning_rate": 4.410204340595359e-08, - "loss": 0.2573, - "step": 1448300 - }, - { - "epoch": 14.76, - "learning_rate": 4.3743214193996405e-08, - "loss": 0.2004, - "step": 1448400 - }, - { - "epoch": 14.76, - "learning_rate": 4.3385849664228094e-08, - "loss": 0.1965, - "step": 1448500 - }, - { - "epoch": 14.76, - "learning_rate": 4.302994983411801e-08, - "loss": 0.1839, - "step": 1448600 - }, - { - "epoch": 14.76, - "learning_rate": 4.267551472107223e-08, - "loss": 0.196, - "step": 1448700 - }, - { - "epoch": 14.76, - "learning_rate": 4.232254434241689e-08, - "loss": 0.1552, - "step": 1448800 - }, - { - "epoch": 14.76, - "learning_rate": 4.1971038715408194e-08, - "loss": 0.191, - "step": 1448900 - }, - { - "epoch": 14.76, - "learning_rate": 4.1620997857239054e-08, - "loss": 0.1813, - "step": 1449000 - }, - { - "epoch": 14.76, - "learning_rate": 4.127242178501911e-08, - "loss": 0.2098, - "step": 1449100 - }, - { - "epoch": 14.76, - "learning_rate": 4.0925310515794735e-08, - "loss": 0.1737, - "step": 1449200 - }, - { - "epoch": 14.77, - "learning_rate": 4.0579664066535684e-08, - "loss": 0.2085, - "step": 1449300 - }, - { - "epoch": 14.77, - "learning_rate": 4.023548245414177e-08, - "loss": 0.2144, - "step": 1449400 - }, - { - "epoch": 14.77, - "learning_rate": 3.989618561194952e-08, - "loss": 0.1543, - "step": 1449500 - }, - { - "epoch": 14.77, - "learning_rate": 3.955491907491338e-08, - "loss": 0.2019, - "step": 1449600 - }, - { - "epoch": 14.77, - "learning_rate": 3.9215117424848954e-08, - "loss": 0.2141, - "step": 1449700 - }, - { - "epoch": 14.77, - "learning_rate": 3.887678067836631e-08, - "loss": 0.2145, - "step": 1449800 - }, - { - "epoch": 14.77, - "learning_rate": 3.8539908852012194e-08, - "loss": 0.2203, - "step": 1449900 - }, - { - "epoch": 14.77, - "learning_rate": 3.820450196225678e-08, - "loss": 0.2092, - "step": 1450000 - }, - { - "epoch": 14.77, - "learning_rate": 3.7873892193291957e-08, - "loss": 0.2101, - "step": 1450100 - }, - { - "epoch": 14.77, - "learning_rate": 3.754140057608324e-08, - "loss": 0.1997, - "step": 1450200 - }, - { - "epoch": 14.78, - "learning_rate": 3.7210373944294164e-08, - "loss": 0.1872, - "step": 1450300 - }, - { - "epoch": 14.78, - "learning_rate": 3.6880812314111777e-08, - "loss": 0.2057, - "step": 1450400 - }, - { - "epoch": 14.78, - "learning_rate": 3.6552715701643204e-08, - "loss": 0.161, - "step": 1450500 - }, - { - "epoch": 14.78, - "learning_rate": 3.62260841229356e-08, - "loss": 0.1848, - "step": 1450600 - }, - { - "epoch": 14.78, - "learning_rate": 3.5900917593956193e-08, - "loss": 0.1614, - "step": 1450700 - }, - { - "epoch": 14.78, - "learning_rate": 3.557721613060894e-08, - "loss": 0.1744, - "step": 1450800 - }, - { - "epoch": 14.78, - "learning_rate": 3.52549797487145e-08, - "loss": 0.206, - "step": 1450900 - }, - { - "epoch": 14.78, - "learning_rate": 3.4934208464033614e-08, - "loss": 0.2206, - "step": 1451000 - }, - { - "epoch": 14.78, - "learning_rate": 3.4614902292247064e-08, - "loss": 0.1777, - "step": 1451100 - }, - { - "epoch": 14.79, - "learning_rate": 3.429706124896903e-08, - "loss": 0.1583, - "step": 1451200 - }, - { - "epoch": 14.79, - "learning_rate": 3.3980685349737085e-08, - "loss": 0.1605, - "step": 1451300 - }, - { - "epoch": 14.79, - "learning_rate": 3.366577461002218e-08, - "loss": 0.2356, - "step": 1451400 - }, - { - "epoch": 14.79, - "learning_rate": 3.335232904521868e-08, - "loss": 0.1783, - "step": 1451500 - }, - { - "epoch": 14.79, - "learning_rate": 3.304034867065764e-08, - "loss": 0.2089, - "step": 1451600 - }, - { - "epoch": 14.79, - "learning_rate": 3.27298335015902e-08, - "loss": 0.2676, - "step": 1451700 - }, - { - "epoch": 14.79, - "learning_rate": 3.242078355319755e-08, - "loss": 0.1623, - "step": 1451800 - }, - { - "epoch": 14.79, - "learning_rate": 3.211319884059094e-08, - "loss": 0.1803, - "step": 1451900 - }, - { - "epoch": 14.79, - "learning_rate": 3.180707937880834e-08, - "loss": 0.1781, - "step": 1452000 - }, - { - "epoch": 14.79, - "learning_rate": 3.1502425182821096e-08, - "loss": 0.1636, - "step": 1452100 - }, - { - "epoch": 14.8, - "learning_rate": 3.119923626752397e-08, - "loss": 0.2128, - "step": 1452200 - }, - { - "epoch": 14.8, - "learning_rate": 3.0897512647738436e-08, - "loss": 0.1898, - "step": 1452300 - }, - { - "epoch": 14.8, - "learning_rate": 3.059725433821603e-08, - "loss": 0.2207, - "step": 1452400 - }, - { - "epoch": 14.8, - "learning_rate": 3.0298461353645e-08, - "loss": 0.2046, - "step": 1452500 - }, - { - "epoch": 14.8, - "learning_rate": 3.000113370862367e-08, - "loss": 0.1874, - "step": 1452600 - }, - { - "epoch": 14.8, - "learning_rate": 2.970527141770041e-08, - "loss": 0.1721, - "step": 1452700 - }, - { - "epoch": 14.8, - "learning_rate": 2.941087449533364e-08, - "loss": 0.2124, - "step": 1452800 - }, - { - "epoch": 14.8, - "learning_rate": 2.911794295591852e-08, - "loss": 0.2263, - "step": 1452900 - }, - { - "epoch": 14.8, - "learning_rate": 2.8826476813780256e-08, - "loss": 0.1808, - "step": 1453000 - }, - { - "epoch": 14.8, - "learning_rate": 2.853647608316745e-08, - "loss": 0.1433, - "step": 1453100 - }, - { - "epoch": 14.81, - "learning_rate": 2.8247940778262094e-08, - "loss": 0.2499, - "step": 1453200 - }, - { - "epoch": 14.81, - "learning_rate": 2.7960870913169567e-08, - "loss": 0.1748, - "step": 1453300 - }, - { - "epoch": 14.81, - "learning_rate": 2.7675266501928643e-08, - "loss": 0.2086, - "step": 1453400 - }, - { - "epoch": 14.81, - "learning_rate": 2.739112755850148e-08, - "loss": 0.2081, - "step": 1453500 - }, - { - "epoch": 14.81, - "learning_rate": 2.710845409678031e-08, - "loss": 0.1882, - "step": 1453600 - }, - { - "epoch": 14.81, - "learning_rate": 2.682724613058407e-08, - "loss": 0.1835, - "step": 1453700 - }, - { - "epoch": 14.81, - "learning_rate": 2.6550293843923268e-08, - "loss": 0.1971, - "step": 1453800 - }, - { - "epoch": 14.81, - "learning_rate": 2.627200225466364e-08, - "loss": 0.2619, - "step": 1453900 - }, - { - "epoch": 14.81, - "learning_rate": 2.5995176201829563e-08, - "loss": 0.2528, - "step": 1454000 - }, - { - "epoch": 14.81, - "learning_rate": 2.5719815698956873e-08, - "loss": 0.1576, - "step": 1454100 - }, - { - "epoch": 14.82, - "learning_rate": 2.5445920759508135e-08, - "loss": 0.2124, - "step": 1454200 - }, - { - "epoch": 14.82, - "learning_rate": 2.517349139687264e-08, - "loss": 0.2121, - "step": 1454300 - }, - { - "epoch": 14.82, - "learning_rate": 2.4902527624376394e-08, - "loss": 0.21, - "step": 1454400 - }, - { - "epoch": 14.82, - "learning_rate": 2.4633029455262136e-08, - "loss": 0.2183, - "step": 1454500 - }, - { - "epoch": 14.82, - "learning_rate": 2.4364996902709325e-08, - "loss": 0.2468, - "step": 1454600 - }, - { - "epoch": 14.82, - "learning_rate": 2.4098429979824145e-08, - "loss": 0.2606, - "step": 1454700 - }, - { - "epoch": 14.82, - "learning_rate": 2.38333286996395e-08, - "loss": 0.2254, - "step": 1454800 - }, - { - "epoch": 14.82, - "learning_rate": 2.3569693075115027e-08, - "loss": 0.1622, - "step": 1454900 - }, - { - "epoch": 14.82, - "learning_rate": 2.3307523119143746e-08, - "loss": 0.2186, - "step": 1455000 - }, - { - "epoch": 14.82, - "learning_rate": 2.304681884454207e-08, - "loss": 0.2425, - "step": 1455100 - }, - { - "epoch": 14.83, - "learning_rate": 2.2787580264056473e-08, - "loss": 0.1784, - "step": 1455200 - }, - { - "epoch": 14.83, - "learning_rate": 2.252980739036681e-08, - "loss": 0.1803, - "step": 1455300 - }, - { - "epoch": 14.83, - "learning_rate": 2.2273500236069665e-08, - "loss": 0.1929, - "step": 1455400 - }, - { - "epoch": 14.83, - "learning_rate": 2.2018658813705017e-08, - "loss": 0.1776, - "step": 1455500 - }, - { - "epoch": 14.83, - "learning_rate": 2.1765283135726234e-08, - "loss": 0.1966, - "step": 1455600 - }, - { - "epoch": 14.83, - "learning_rate": 2.15133732145234e-08, - "loss": 0.1713, - "step": 1455700 - }, - { - "epoch": 14.83, - "learning_rate": 2.126292906241334e-08, - "loss": 0.2224, - "step": 1455800 - }, - { - "epoch": 14.83, - "learning_rate": 2.1013950691646245e-08, - "loss": 0.2192, - "step": 1455900 - }, - { - "epoch": 14.83, - "learning_rate": 2.0766438114389053e-08, - "loss": 0.1598, - "step": 1456000 - }, - { - "epoch": 14.84, - "learning_rate": 2.052284455468567e-08, - "loss": 0.2115, - "step": 1456100 - }, - { - "epoch": 14.84, - "learning_rate": 2.027824894245467e-08, - "loss": 0.2112, - "step": 1456200 - }, - { - "epoch": 14.84, - "learning_rate": 2.0035119159704795e-08, - "loss": 0.2143, - "step": 1456300 - }, - { - "epoch": 14.84, - "learning_rate": 1.979345521832654e-08, - "loss": 0.2128, - "step": 1456400 - }, - { - "epoch": 14.84, - "learning_rate": 1.9553257130137115e-08, - "loss": 0.2094, - "step": 1456500 - }, - { - "epoch": 14.84, - "learning_rate": 1.931452490688046e-08, - "loss": 0.1535, - "step": 1456600 - }, - { - "epoch": 14.84, - "learning_rate": 1.907725856022724e-08, - "loss": 0.2298, - "step": 1456700 - }, - { - "epoch": 14.84, - "learning_rate": 1.8841458101778175e-08, - "loss": 0.1824, - "step": 1456800 - }, - { - "epoch": 14.84, - "learning_rate": 1.8607123543064042e-08, - "loss": 0.1816, - "step": 1456900 - }, - { - "epoch": 14.84, - "learning_rate": 1.8374254895542343e-08, - "loss": 0.2051, - "step": 1457000 - }, - { - "epoch": 14.85, - "learning_rate": 1.81428521705973e-08, - "loss": 0.1971, - "step": 1457100 - }, - { - "epoch": 14.85, - "learning_rate": 1.7912915379546536e-08, - "loss": 0.2332, - "step": 1457200 - }, - { - "epoch": 14.85, - "learning_rate": 1.768444453363105e-08, - "loss": 0.1699, - "step": 1457300 - }, - { - "epoch": 14.85, - "learning_rate": 1.7457439644018582e-08, - "loss": 0.2063, - "step": 1457400 - }, - { - "epoch": 14.85, - "learning_rate": 1.7231900721810246e-08, - "loss": 0.2011, - "step": 1457500 - }, - { - "epoch": 14.85, - "learning_rate": 1.700782777803722e-08, - "loss": 0.2125, - "step": 1457600 - }, - { - "epoch": 14.85, - "learning_rate": 1.678522082365075e-08, - "loss": 0.2297, - "step": 1457700 - }, - { - "epoch": 14.85, - "learning_rate": 1.6564079869535453e-08, - "loss": 0.1793, - "step": 1457800 - }, - { - "epoch": 14.85, - "learning_rate": 1.634440492650602e-08, - "loss": 0.2206, - "step": 1457900 - }, - { - "epoch": 14.85, - "learning_rate": 1.612619600530385e-08, - "loss": 0.1695, - "step": 1458000 - }, - { - "epoch": 14.86, - "learning_rate": 1.5909453116593754e-08, - "loss": 0.1539, - "step": 1458100 - }, - { - "epoch": 14.86, - "learning_rate": 1.5694176270973914e-08, - "loss": 0.1763, - "step": 1458200 - }, - { - "epoch": 14.86, - "learning_rate": 1.5482496329894956e-08, - "loss": 0.186, - "step": 1458300 - }, - { - "epoch": 14.86, - "learning_rate": 1.5270136941277768e-08, - "loss": 0.1926, - "step": 1458400 - }, - { - "epoch": 14.86, - "learning_rate": 1.5059243627005837e-08, - "loss": 0.1641, - "step": 1458500 - }, - { - "epoch": 14.86, - "learning_rate": 1.4849816397400906e-08, - "loss": 0.1857, - "step": 1458600 - }, - { - "epoch": 14.86, - "learning_rate": 1.4641855262694793e-08, - "loss": 0.1581, - "step": 1458700 - }, - { - "epoch": 14.86, - "learning_rate": 1.4435360233059357e-08, - "loss": 0.1783, - "step": 1458800 - }, - { - "epoch": 14.86, - "learning_rate": 1.4230331318593193e-08, - "loss": 0.1566, - "step": 1458900 - }, - { - "epoch": 14.86, - "learning_rate": 1.4026768529314948e-08, - "loss": 0.1741, - "step": 1459000 - }, - { - "epoch": 14.87, - "learning_rate": 1.3824671875179995e-08, - "loss": 0.1815, - "step": 1459100 - }, - { - "epoch": 14.87, - "learning_rate": 1.3624041366073758e-08, - "loss": 0.2029, - "step": 1459200 - }, - { - "epoch": 14.87, - "learning_rate": 1.3424877011801728e-08, - "loss": 0.1572, - "step": 1459300 - }, - { - "epoch": 14.87, - "learning_rate": 1.3227178822099451e-08, - "loss": 0.1806, - "step": 1459400 - }, - { - "epoch": 14.87, - "learning_rate": 1.3030946806639187e-08, - "loss": 0.1842, - "step": 1459500 - }, - { - "epoch": 14.87, - "learning_rate": 1.2836180975013267e-08, - "loss": 0.1758, - "step": 1459600 - }, - { - "epoch": 14.87, - "learning_rate": 1.2642881336744072e-08, - "loss": 0.1915, - "step": 1459700 - }, - { - "epoch": 14.87, - "learning_rate": 1.245104790128071e-08, - "loss": 0.2067, - "step": 1459800 - }, - { - "epoch": 14.87, - "learning_rate": 1.2260680678005675e-08, - "loss": 0.2394, - "step": 1459900 - }, - { - "epoch": 14.87, - "learning_rate": 1.2071779676224859e-08, - "loss": 0.2072, - "step": 1460000 - }, - { - "epoch": 14.88, - "learning_rate": 1.1884344905177535e-08, - "loss": 0.2015, - "step": 1460100 - }, - { - "epoch": 14.88, - "learning_rate": 1.1698376374026376e-08, - "loss": 0.2372, - "step": 1460200 - }, - { - "epoch": 14.88, - "learning_rate": 1.1513874091860777e-08, - "loss": 0.2165, - "step": 1460300 - }, - { - "epoch": 14.88, - "learning_rate": 1.133083806770685e-08, - "loss": 0.2048, - "step": 1460400 - }, - { - "epoch": 14.88, - "learning_rate": 1.1149268310514105e-08, - "loss": 0.167, - "step": 1460500 - }, - { - "epoch": 14.88, - "learning_rate": 1.0969164829155443e-08, - "loss": 0.1775, - "step": 1460600 - }, - { - "epoch": 14.88, - "learning_rate": 1.0790527632437152e-08, - "loss": 0.2064, - "step": 1460700 - }, - { - "epoch": 14.88, - "learning_rate": 1.061335672909891e-08, - "loss": 0.2194, - "step": 1460800 - }, - { - "epoch": 14.88, - "learning_rate": 1.0437652127800457e-08, - "loss": 0.1784, - "step": 1460900 - }, - { - "epoch": 14.88, - "learning_rate": 1.0263413837131585e-08, - "loss": 0.2059, - "step": 1461000 - }, - { - "epoch": 14.89, - "learning_rate": 1.0090641865612149e-08, - "loss": 0.1452, - "step": 1461100 - }, - { - "epoch": 14.89, - "learning_rate": 9.919336221688724e-09, - "loss": 0.1968, - "step": 1461200 - }, - { - "epoch": 14.89, - "learning_rate": 9.749496913741273e-09, - "loss": 0.2167, - "step": 1461300 - }, - { - "epoch": 14.89, - "learning_rate": 9.581123950069826e-09, - "loss": 0.2541, - "step": 1461400 - }, - { - "epoch": 14.89, - "learning_rate": 9.414217338907794e-09, - "loss": 0.2173, - "step": 1461500 - }, - { - "epoch": 14.89, - "learning_rate": 9.248777088415316e-09, - "loss": 0.1757, - "step": 1461600 - }, - { - "epoch": 14.89, - "learning_rate": 9.08480320668259e-09, - "loss": 0.2664, - "step": 1461700 - }, - { - "epoch": 14.89, - "learning_rate": 8.922295701726535e-09, - "loss": 0.2454, - "step": 1461800 - }, - { - "epoch": 14.89, - "learning_rate": 8.761254581494127e-09, - "loss": 0.1952, - "step": 1461900 - }, - { - "epoch": 14.9, - "learning_rate": 8.601679853855737e-09, - "loss": 0.2253, - "step": 1462000 - }, - { - "epoch": 14.9, - "learning_rate": 8.443571526618454e-09, - "loss": 0.1812, - "step": 1462100 - }, - { - "epoch": 14.9, - "learning_rate": 8.286929607506099e-09, - "loss": 0.1555, - "step": 1462200 - }, - { - "epoch": 14.9, - "learning_rate": 8.131754104185874e-09, - "loss": 0.1728, - "step": 1462300 - }, - { - "epoch": 14.9, - "learning_rate": 7.978045024238379e-09, - "loss": 0.229, - "step": 1462400 - }, - { - "epoch": 14.9, - "learning_rate": 7.825802375184266e-09, - "loss": 0.1775, - "step": 1462500 - }, - { - "epoch": 14.9, - "learning_rate": 7.675026164460919e-09, - "loss": 0.1774, - "step": 1462600 - }, - { - "epoch": 14.9, - "learning_rate": 7.52571639944577e-09, - "loss": 0.2089, - "step": 1462700 - }, - { - "epoch": 14.9, - "learning_rate": 7.377873087436315e-09, - "loss": 0.2295, - "step": 1462800 - }, - { - "epoch": 14.9, - "learning_rate": 7.231496235663438e-09, - "loss": 0.2113, - "step": 1462900 - }, - { - "epoch": 14.91, - "learning_rate": 7.086585851284743e-09, - "loss": 0.1892, - "step": 1463000 - }, - { - "epoch": 14.91, - "learning_rate": 6.9431419413812366e-09, - "loss": 0.2007, - "step": 1463100 - }, - { - "epoch": 14.91, - "learning_rate": 6.801164512967306e-09, - "loss": 0.2063, - "step": 1463200 - }, - { - "epoch": 14.91, - "learning_rate": 6.660653572990727e-09, - "loss": 0.1756, - "step": 1463300 - }, - { - "epoch": 14.91, - "learning_rate": 6.5216091283126775e-09, - "loss": 0.1743, - "step": 1463400 - }, - { - "epoch": 14.91, - "learning_rate": 6.384031185741046e-09, - "loss": 0.1917, - "step": 1463500 - }, - { - "epoch": 14.91, - "learning_rate": 6.249273607091821e-09, - "loss": 0.1995, - "step": 1463600 - }, - { - "epoch": 14.91, - "learning_rate": 6.114614023641219e-09, - "loss": 0.2464, - "step": 1463700 - }, - { - "epoch": 14.91, - "learning_rate": 5.9814209621911195e-09, - "loss": 0.1897, - "step": 1463800 - }, - { - "epoch": 14.91, - "learning_rate": 5.849694429256313e-09, - "loss": 0.1688, - "step": 1463900 - }, - { - "epoch": 14.92, - "learning_rate": 5.7194344312749836e-09, - "loss": 0.232, - "step": 1464000 - }, - { - "epoch": 14.92, - "learning_rate": 5.590640974618699e-09, - "loss": 0.2194, - "step": 1464100 - }, - { - "epoch": 14.92, - "learning_rate": 5.463314065579095e-09, - "loss": 0.1677, - "step": 1464200 - }, - { - "epoch": 14.92, - "learning_rate": 5.3374537103878515e-09, - "loss": 0.2129, - "step": 1464300 - }, - { - "epoch": 14.92, - "learning_rate": 5.213059915196716e-09, - "loss": 0.1721, - "step": 1464400 - }, - { - "epoch": 14.92, - "learning_rate": 5.090132686087489e-09, - "loss": 0.2306, - "step": 1464500 - }, - { - "epoch": 14.92, - "learning_rate": 4.968672029068699e-09, - "loss": 0.2228, - "step": 1464600 - }, - { - "epoch": 14.92, - "learning_rate": 4.848677950082259e-09, - "loss": 0.2268, - "step": 1464700 - }, - { - "epoch": 14.92, - "learning_rate": 4.730150454993476e-09, - "loss": 0.1743, - "step": 1464800 - }, - { - "epoch": 14.92, - "learning_rate": 4.613089549597716e-09, - "loss": 0.1844, - "step": 1464900 - }, - { - "epoch": 14.93, - "learning_rate": 4.497495239617066e-09, - "loss": 0.2272, - "step": 1465000 - }, - { - "epoch": 14.93, - "learning_rate": 4.383367530707006e-09, - "loss": 0.2199, - "step": 1465100 - }, - { - "epoch": 14.93, - "learning_rate": 4.270706428446403e-09, - "loss": 0.1622, - "step": 1465200 - }, - { - "epoch": 14.93, - "learning_rate": 4.159511938340854e-09, - "loss": 0.2007, - "step": 1465300 - }, - { - "epoch": 14.93, - "learning_rate": 4.049784065832673e-09, - "loss": 0.1835, - "step": 1465400 - }, - { - "epoch": 14.93, - "learning_rate": 3.941522816280907e-09, - "loss": 0.1652, - "step": 1465500 - }, - { - "epoch": 14.93, - "learning_rate": 3.835788881370039e-09, - "loss": 0.1841, - "step": 1465600 - }, - { - "epoch": 14.93, - "learning_rate": 3.730446227183659e-09, - "loss": 0.2569, - "step": 1465700 - }, - { - "epoch": 14.93, - "learning_rate": 3.6265702115712008e-09, - "loss": 0.1563, - "step": 1465800 - }, - { - "epoch": 14.93, - "learning_rate": 3.524160839611934e-09, - "loss": 0.1511, - "step": 1465900 - }, - { - "epoch": 14.94, - "learning_rate": 3.423218116308524e-09, - "loss": 0.1571, - "step": 1466000 - }, - { - "epoch": 14.94, - "learning_rate": 3.323742046603684e-09, - "loss": 0.1864, - "step": 1466100 - }, - { - "epoch": 14.94, - "learning_rate": 3.2257326353535288e-09, - "loss": 0.237, - "step": 1466200 - }, - { - "epoch": 14.94, - "learning_rate": 3.1291898873542224e-09, - "loss": 0.2096, - "step": 1466300 - }, - { - "epoch": 14.94, - "learning_rate": 3.0341138073286535e-09, - "loss": 0.2151, - "step": 1466400 - }, - { - "epoch": 14.94, - "learning_rate": 2.9405043999197743e-09, - "loss": 0.1979, - "step": 1466500 - }, - { - "epoch": 14.94, - "learning_rate": 2.8483616697072555e-09, - "loss": 0.2167, - "step": 1466600 - }, - { - "epoch": 14.94, - "learning_rate": 2.757685621197492e-09, - "loss": 0.2118, - "step": 1466700 - }, - { - "epoch": 14.94, - "learning_rate": 2.668476258820274e-09, - "loss": 0.1512, - "step": 1466800 - }, - { - "epoch": 14.95, - "learning_rate": 2.5807335869421078e-09, - "loss": 0.1933, - "step": 1466900 - }, - { - "epoch": 14.95, - "learning_rate": 2.494457609849565e-09, - "loss": 0.196, - "step": 1467000 - }, - { - "epoch": 14.95, - "learning_rate": 2.409648331762604e-09, - "loss": 0.1769, - "step": 1467100 - }, - { - "epoch": 14.95, - "learning_rate": 2.326305756827907e-09, - "loss": 0.2022, - "step": 1467200 - }, - { - "epoch": 14.95, - "learning_rate": 2.244429889118882e-09, - "loss": 0.2537, - "step": 1467300 - }, - { - "epoch": 14.95, - "learning_rate": 2.1640207326423245e-09, - "loss": 0.2234, - "step": 1467400 - }, - { - "epoch": 14.95, - "learning_rate": 2.0850782913250933e-09, - "loss": 0.1695, - "step": 1467500 - }, - { - "epoch": 14.95, - "learning_rate": 2.0076025690307644e-09, - "loss": 0.1944, - "step": 1467600 - }, - { - "epoch": 14.95, - "learning_rate": 1.9315935695463083e-09, - "loss": 0.2002, - "step": 1467700 - }, - { - "epoch": 14.95, - "learning_rate": 1.8570512965854213e-09, - "loss": 0.1569, - "step": 1467800 - }, - { - "epoch": 14.96, - "learning_rate": 1.783975753798517e-09, - "loss": 0.2275, - "step": 1467900 - }, - { - "epoch": 14.96, - "learning_rate": 1.7123669447527413e-09, - "loss": 0.1596, - "step": 1468000 - }, - { - "epoch": 14.96, - "learning_rate": 1.6422248729519585e-09, - "loss": 0.2032, - "step": 1468100 - }, - { - "epoch": 14.96, - "learning_rate": 1.5735495418267577e-09, - "loss": 0.2173, - "step": 1468200 - }, - { - "epoch": 14.96, - "learning_rate": 1.5063409547344531e-09, - "loss": 0.2068, - "step": 1468300 - }, - { - "epoch": 14.96, - "learning_rate": 1.4405991149590846e-09, - "loss": 0.21, - "step": 1468400 - }, - { - "epoch": 14.96, - "learning_rate": 1.3769595161816639e-09, - "loss": 0.159, - "step": 1468500 - }, - { - "epoch": 14.96, - "learning_rate": 1.3141365130620032e-09, - "loss": 0.1855, - "step": 1468600 - }, - { - "epoch": 14.96, - "learning_rate": 1.2533865686670253e-09, - "loss": 0.1864, - "step": 1468700 - }, - { - "epoch": 14.96, - "learning_rate": 1.1934824143666756e-09, - "loss": 0.2205, - "step": 1468800 - }, - { - "epoch": 14.97, - "learning_rate": 1.1350450226843556e-09, - "loss": 0.2109, - "step": 1468900 - }, - { - "epoch": 14.97, - "learning_rate": 1.0780743964711182e-09, - "loss": 0.1973, - "step": 1469000 - }, - { - "epoch": 14.97, - "learning_rate": 1.022570538518064e-09, - "loss": 0.1953, - "step": 1469100 - }, - { - "epoch": 14.97, - "learning_rate": 9.685334515363575e-10, - "loss": 0.2021, - "step": 1469200 - }, - { - "epoch": 14.97, - "learning_rate": 9.159631381672196e-10, - "loss": 0.189, - "step": 1469300 - }, - { - "epoch": 14.97, - "learning_rate": 8.648596009852572e-10, - "loss": 0.1521, - "step": 1469400 - }, - { - "epoch": 14.97, - "learning_rate": 8.152228424818109e-10, - "loss": 0.2066, - "step": 1469500 - }, - { - "epoch": 14.97, - "learning_rate": 7.670528650915998e-10, - "loss": 0.1878, - "step": 1469600 - }, - { - "epoch": 14.97, - "learning_rate": 7.203496711660762e-10, - "loss": 0.254, - "step": 1469700 - }, - { - "epoch": 14.97, - "learning_rate": 6.751132629867485e-10, - "loss": 0.2346, - "step": 1469800 - }, - { - "epoch": 14.98, - "learning_rate": 6.313436427718422e-10, - "loss": 0.163, - "step": 1469900 - }, - { - "epoch": 14.98, - "learning_rate": 5.890408126529856e-10, - "loss": 0.2341, - "step": 1470000 - }, - { - "epoch": 14.98, - "learning_rate": 5.482047747051855e-10, - "loss": 0.2287, - "step": 1470100 - }, - { - "epoch": 14.98, - "learning_rate": 5.088355309235126e-10, - "loss": 0.1854, - "step": 1470200 - }, - { - "epoch": 14.98, - "learning_rate": 4.709330832297631e-10, - "loss": 0.2252, - "step": 1470300 - }, - { - "epoch": 14.98, - "learning_rate": 4.344974334824503e-10, - "loss": 0.1867, - "step": 1470400 - }, - { - "epoch": 14.98, - "learning_rate": 3.9952858345682076e-10, - "loss": 0.1612, - "step": 1470500 - }, - { - "epoch": 14.98, - "learning_rate": 3.6602653486816907e-10, - "loss": 0.2189, - "step": 1470600 - }, - { - "epoch": 14.98, - "learning_rate": 3.3399128935185375e-10, - "loss": 0.2209, - "step": 1470700 - }, - { - "epoch": 14.98, - "learning_rate": 3.0342284847661993e-10, - "loss": 0.2041, - "step": 1470800 - }, - { - "epoch": 14.99, - "learning_rate": 2.743212137346074e-10, - "loss": 0.2219, - "step": 1470900 - }, - { - "epoch": 14.99, - "learning_rate": 2.466863865480118e-10, - "loss": 0.2514, - "step": 1471000 - }, - { - "epoch": 14.99, - "learning_rate": 2.2051836826908477e-10, - "loss": 0.2179, - "step": 1471100 - }, - { - "epoch": 14.99, - "learning_rate": 1.9581716017680328e-10, - "loss": 0.2539, - "step": 1471200 - }, - { - "epoch": 14.99, - "learning_rate": 1.725827634802002e-10, - "loss": 0.2181, - "step": 1471300 - }, - { - "epoch": 14.99, - "learning_rate": 1.5081517931503364e-10, - "loss": 0.2339, - "step": 1471400 - }, - { - "epoch": 14.99, - "learning_rate": 1.3051440874711773e-10, - "loss": 0.1927, - "step": 1471500 - }, - { - "epoch": 14.99, - "learning_rate": 1.1168045276566119e-10, - "loss": 0.1793, - "step": 1471600 - }, - { - "epoch": 14.99, - "learning_rate": 9.431331228992867e-11, - "loss": 0.2391, - "step": 1471700 - }, - { - "epoch": 14.99, - "learning_rate": 7.841298817590215e-11, - "loss": 0.21, - "step": 1471800 - }, - { - "epoch": 15.0, - "learning_rate": 6.397948119629682e-11, - "loss": 0.2037, - "step": 1471900 - }, - { - "epoch": 15.0, - "learning_rate": 5.101279205721454e-11, - "loss": 0.2213, - "step": 1472000 - }, - { - "epoch": 15.0, - "learning_rate": 3.9620659350037006e-11, - "loss": 0.1799, - "step": 1472100 - }, - { - "epoch": 15.0, - "learning_rate": 2.9572939530542185e-11, - "loss": 0.2427, - "step": 1472200 } ], "logging_steps": 100, - "max_steps": 1472295, + "max_steps": 1088730, + "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, - "total_flos": 1.2061575769608998e+22, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.189236164022104e+22, + "train_batch_size": 8, "trial_name": null, "trial_params": null }