diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8952551477170994, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.61997274421151, + "learning_rate": 9.900990099009901e-08, + "loss": 0.3005, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 16.96129971490389, + "learning_rate": 1.9801980198019803e-07, + "loss": 1.903, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 16.059955618049987, + "learning_rate": 2.9702970297029703e-07, + "loss": 1.9233, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 19.103831500070694, + "learning_rate": 3.9603960396039606e-07, + "loss": 1.991, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 18.392721025912955, + "learning_rate": 4.950495049504951e-07, + "loss": 2.1935, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 16.140573034359793, + "learning_rate": 5.940594059405941e-07, + "loss": 1.9898, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 20.046640372445655, + "learning_rate": 6.930693069306931e-07, + "loss": 2.067, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 17.629696916819253, + "learning_rate": 7.920792079207921e-07, + "loss": 1.976, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 18.205985254680062, + "learning_rate": 8.910891089108911e-07, + "loss": 2.1618, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 18.46713316887076, + "learning_rate": 9.900990099009902e-07, + "loss": 1.9472, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 12.58775167810242, + "learning_rate": 1.0891089108910893e-06, + "loss": 1.8545, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 0.8016952473974713, + "learning_rate": 1.1881188118811881e-06, + "loss": 0.2673, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 17.19759391544971, + "learning_rate": 1.2871287128712872e-06, + "loss": 1.8575, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 9.692865737683729, + "learning_rate": 1.3861386138613863e-06, + "loss": 1.6309, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 8.933875275893488, + "learning_rate": 1.4851485148514852e-06, + "loss": 1.7404, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 8.478750370188157, + "learning_rate": 1.5841584158415842e-06, + "loss": 1.625, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 8.872117125031288, + "learning_rate": 1.6831683168316833e-06, + "loss": 1.8698, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 8.316127487534764, + "learning_rate": 1.7821782178217822e-06, + "loss": 1.721, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 10.345460825151939, + "learning_rate": 1.8811881188118813e-06, + "loss": 1.7876, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 11.154912232618392, + "learning_rate": 1.9801980198019803e-06, + "loss": 1.7299, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 5.423809728704567, + "learning_rate": 2.0792079207920794e-06, + "loss": 1.5514, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.057358450273484, + "learning_rate": 2.1782178217821785e-06, + "loss": 1.4899, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 4.609038984044864, + "learning_rate": 2.2772277227722776e-06, + "loss": 1.521, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 5.523777715195209, + "learning_rate": 2.3762376237623762e-06, + "loss": 1.4854, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 4.917318190494613, + "learning_rate": 2.4752475247524753e-06, + "loss": 1.4383, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 3.6832101082075663, + "learning_rate": 2.5742574257425744e-06, + "loss": 1.3966, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 4.11950097422884, + "learning_rate": 2.6732673267326735e-06, + "loss": 1.3713, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 3.870341350061341, + "learning_rate": 2.7722772277227726e-06, + "loss": 1.3955, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 4.538406561042297, + "learning_rate": 2.8712871287128712e-06, + "loss": 1.4885, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 3.897265745516444, + "learning_rate": 2.9702970297029703e-06, + "loss": 1.3748, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.110082061925306, + "learning_rate": 3.0693069306930694e-06, + "loss": 1.3526, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 3.1958344879587637, + "learning_rate": 3.1683168316831685e-06, + "loss": 1.3512, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 3.4997592400440376, + "learning_rate": 3.2673267326732676e-06, + "loss": 1.343, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 3.5120153248850117, + "learning_rate": 3.3663366336633666e-06, + "loss": 1.3795, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 0.966950103870316, + "learning_rate": 3.4653465346534653e-06, + "loss": 0.2754, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 2.984780628539569, + "learning_rate": 3.5643564356435644e-06, + "loss": 1.3494, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 2.9492507200213534, + "learning_rate": 3.6633663366336635e-06, + "loss": 1.3834, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 2.9985573150804936, + "learning_rate": 3.7623762376237625e-06, + "loss": 1.3432, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 2.85356604200625, + "learning_rate": 3.861386138613862e-06, + "loss": 1.3271, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 2.693328464433035, + "learning_rate": 3.960396039603961e-06, + "loss": 1.3504, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 2.6422555223277864, + "learning_rate": 4.05940594059406e-06, + "loss": 1.3333, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 3.0890843341925964, + "learning_rate": 4.158415841584159e-06, + "loss": 1.3359, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 2.6806443316179953, + "learning_rate": 4.2574257425742575e-06, + "loss": 1.3349, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 2.40103168013117, + "learning_rate": 4.356435643564357e-06, + "loss": 1.2479, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 2.6170663606307594, + "learning_rate": 4.455445544554456e-06, + "loss": 1.2362, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.9458924900429565, + "learning_rate": 4.554455445544555e-06, + "loss": 0.3158, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 3.018752566429295, + "learning_rate": 4.653465346534654e-06, + "loss": 1.2055, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 1.9931288795038689, + "learning_rate": 4.7524752475247525e-06, + "loss": 1.2299, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 3.1795775560871347, + "learning_rate": 4.851485148514852e-06, + "loss": 1.3277, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.0673924508492887, + "learning_rate": 4.950495049504951e-06, + "loss": 0.2725, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.2237872077237144, + "learning_rate": 5.04950495049505e-06, + "loss": 1.2136, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 2.0788688977429324, + "learning_rate": 5.148514851485149e-06, + "loss": 1.22, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 3.0966509718753015, + "learning_rate": 5.247524752475248e-06, + "loss": 1.2273, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 2.734413949144073, + "learning_rate": 5.346534653465347e-06, + "loss": 1.149, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 2.121929724048015, + "learning_rate": 5.4455445544554465e-06, + "loss": 1.3127, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 2.7037759311356657, + "learning_rate": 5.544554455445545e-06, + "loss": 1.2493, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 2.6789600438668426, + "learning_rate": 5.643564356435644e-06, + "loss": 1.2193, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 2.4444259630024483, + "learning_rate": 5.7425742574257425e-06, + "loss": 1.2431, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 2.556882297525969, + "learning_rate": 5.841584158415842e-06, + "loss": 1.2741, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 2.238520138493385, + "learning_rate": 5.940594059405941e-06, + "loss": 1.157, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 2.415764043872665, + "learning_rate": 6.03960396039604e-06, + "loss": 1.1484, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 2.3190309713539, + "learning_rate": 6.138613861386139e-06, + "loss": 1.2549, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.1736754940496277, + "learning_rate": 6.237623762376238e-06, + "loss": 1.1193, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 2.095907947880233, + "learning_rate": 6.336633663366337e-06, + "loss": 1.1703, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 2.730595662137738, + "learning_rate": 6.4356435643564364e-06, + "loss": 1.1338, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 2.3472798025027144, + "learning_rate": 6.534653465346535e-06, + "loss": 1.0909, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 2.437408914759312, + "learning_rate": 6.633663366336635e-06, + "loss": 1.2125, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 1.8735235958026046, + "learning_rate": 6.732673267326733e-06, + "loss": 1.2269, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 2.1360912316858367, + "learning_rate": 6.831683168316833e-06, + "loss": 1.161, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 2.134595139964946, + "learning_rate": 6.930693069306931e-06, + "loss": 1.1717, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.288370730967829, + "learning_rate": 7.02970297029703e-06, + "loss": 1.1607, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.9900389604402784, + "learning_rate": 7.128712871287129e-06, + "loss": 1.0875, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.8977512995416834, + "learning_rate": 7.227722772277228e-06, + "loss": 1.037, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 2.38588961482632, + "learning_rate": 7.326732673267327e-06, + "loss": 1.0846, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.9794592682896341, + "learning_rate": 7.425742574257426e-06, + "loss": 1.2375, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 1.9059594403889988, + "learning_rate": 7.524752475247525e-06, + "loss": 1.1174, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.0406430966361695, + "learning_rate": 7.6237623762376246e-06, + "loss": 0.3022, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 2.3048006434065953, + "learning_rate": 7.722772277227724e-06, + "loss": 1.065, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.286878240228043, + "learning_rate": 7.821782178217822e-06, + "loss": 1.2009, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 1.950766978671876, + "learning_rate": 7.920792079207921e-06, + "loss": 1.0968, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.2562402312894143, + "learning_rate": 8.019801980198021e-06, + "loss": 1.0641, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.4273820131869353, + "learning_rate": 8.11881188118812e-06, + "loss": 1.093, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.748782091404448, + "learning_rate": 8.217821782178218e-06, + "loss": 1.0757, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 2.1066063495283025, + "learning_rate": 8.316831683168318e-06, + "loss": 1.0849, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 2.226804248334672, + "learning_rate": 8.415841584158416e-06, + "loss": 1.2182, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 2.465113508694537, + "learning_rate": 8.514851485148515e-06, + "loss": 1.1054, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 2.0314927432927967, + "learning_rate": 8.613861386138615e-06, + "loss": 1.2359, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.9051492201816909, + "learning_rate": 8.712871287128714e-06, + "loss": 1.1209, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 2.5454329213950686, + "learning_rate": 8.811881188118812e-06, + "loss": 1.2238, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.277898763351694, + "learning_rate": 8.910891089108911e-06, + "loss": 1.1183, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 2.1921439263709614, + "learning_rate": 9.009900990099011e-06, + "loss": 1.2191, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 2.1118277735762523, + "learning_rate": 9.10891089108911e-06, + "loss": 1.1306, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.094034560815147, + "learning_rate": 9.20792079207921e-06, + "loss": 1.0911, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.1956751649437387, + "learning_rate": 9.306930693069308e-06, + "loss": 1.1, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.568304117862986, + "learning_rate": 9.405940594059405e-06, + "loss": 1.1316, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 2.0568845808895038, + "learning_rate": 9.504950495049505e-06, + "loss": 1.1461, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.9863236557601718, + "learning_rate": 9.603960396039604e-06, + "loss": 1.0696, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 1.9849082068957913, + "learning_rate": 9.702970297029704e-06, + "loss": 1.1677, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.161881947466254, + "learning_rate": 9.801980198019802e-06, + "loss": 1.2309, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 2.5290291810230108, + "learning_rate": 9.900990099009901e-06, + "loss": 1.0963, + "step": 100 + }, + { + "epoch": 0.02, + "grad_norm": 1.8701775563508618, + "learning_rate": 1e-05, + "loss": 1.1526, + "step": 101 + }, + { + "epoch": 0.02, + "grad_norm": 1.9515200801588286, + "learning_rate": 1.00990099009901e-05, + "loss": 1.0315, + "step": 102 + }, + { + "epoch": 0.02, + "grad_norm": 1.9314667803813104, + "learning_rate": 1.01980198019802e-05, + "loss": 1.1808, + "step": 103 + }, + { + "epoch": 0.02, + "grad_norm": 2.030072975262262, + "learning_rate": 1.0297029702970298e-05, + "loss": 1.1477, + "step": 104 + }, + { + "epoch": 0.02, + "grad_norm": 2.5820036091674017, + "learning_rate": 1.0396039603960397e-05, + "loss": 1.1251, + "step": 105 + }, + { + "epoch": 0.02, + "grad_norm": 2.281488203650231, + "learning_rate": 1.0495049504950497e-05, + "loss": 1.1243, + "step": 106 + }, + { + "epoch": 0.02, + "grad_norm": 2.1578590256842682, + "learning_rate": 1.0594059405940596e-05, + "loss": 1.1695, + "step": 107 + }, + { + "epoch": 0.02, + "grad_norm": 2.1702469274118794, + "learning_rate": 1.0693069306930694e-05, + "loss": 1.214, + "step": 108 + }, + { + "epoch": 0.02, + "grad_norm": 1.958633320084963, + "learning_rate": 1.0792079207920793e-05, + "loss": 1.0675, + "step": 109 + }, + { + "epoch": 0.02, + "grad_norm": 1.7990099787040412, + "learning_rate": 1.0891089108910893e-05, + "loss": 1.0354, + "step": 110 + }, + { + "epoch": 0.02, + "grad_norm": 2.3100199739513276, + "learning_rate": 1.0990099009900992e-05, + "loss": 1.1947, + "step": 111 + }, + { + "epoch": 0.02, + "grad_norm": 1.9146437443620536, + "learning_rate": 1.108910891089109e-05, + "loss": 1.1562, + "step": 112 + }, + { + "epoch": 0.02, + "grad_norm": 2.0933911830979057, + "learning_rate": 1.118811881188119e-05, + "loss": 1.1122, + "step": 113 + }, + { + "epoch": 0.02, + "grad_norm": 2.147201848587997, + "learning_rate": 1.1287128712871288e-05, + "loss": 1.1182, + "step": 114 + }, + { + "epoch": 0.02, + "grad_norm": 2.224570092314074, + "learning_rate": 1.1386138613861385e-05, + "loss": 1.1405, + "step": 115 + }, + { + "epoch": 0.02, + "grad_norm": 2.353149108522584, + "learning_rate": 1.1485148514851485e-05, + "loss": 1.1476, + "step": 116 + }, + { + "epoch": 0.02, + "grad_norm": 2.2498791946987926, + "learning_rate": 1.1584158415841584e-05, + "loss": 1.051, + "step": 117 + }, + { + "epoch": 0.02, + "grad_norm": 2.3488301438133083, + "learning_rate": 1.1683168316831684e-05, + "loss": 1.1357, + "step": 118 + }, + { + "epoch": 0.02, + "grad_norm": 2.536534949887731, + "learning_rate": 1.1782178217821782e-05, + "loss": 1.1184, + "step": 119 + }, + { + "epoch": 0.02, + "grad_norm": 2.4491744166716476, + "learning_rate": 1.1881188118811881e-05, + "loss": 1.1291, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 2.254394405663255, + "learning_rate": 1.198019801980198e-05, + "loss": 1.0794, + "step": 121 + }, + { + "epoch": 0.02, + "grad_norm": 2.302067290368141, + "learning_rate": 1.207920792079208e-05, + "loss": 1.0474, + "step": 122 + }, + { + "epoch": 0.02, + "grad_norm": 2.133676800950005, + "learning_rate": 1.217821782178218e-05, + "loss": 1.1153, + "step": 123 + }, + { + "epoch": 0.02, + "grad_norm": 2.0990622122422082, + "learning_rate": 1.2277227722772278e-05, + "loss": 1.1387, + "step": 124 + }, + { + "epoch": 0.02, + "grad_norm": 2.3939641900262445, + "learning_rate": 1.2376237623762377e-05, + "loss": 1.1263, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 2.469167927484086, + "learning_rate": 1.2475247524752477e-05, + "loss": 1.1487, + "step": 126 + }, + { + "epoch": 0.02, + "grad_norm": 2.11531848687401, + "learning_rate": 1.2574257425742576e-05, + "loss": 1.1051, + "step": 127 + }, + { + "epoch": 0.02, + "grad_norm": 1.9339220307775706, + "learning_rate": 1.2673267326732674e-05, + "loss": 1.0821, + "step": 128 + }, + { + "epoch": 0.02, + "grad_norm": 2.363460636424139, + "learning_rate": 1.2772277227722773e-05, + "loss": 1.1555, + "step": 129 + }, + { + "epoch": 0.02, + "grad_norm": 2.2165294099352177, + "learning_rate": 1.2871287128712873e-05, + "loss": 1.0388, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 2.136400737843224, + "learning_rate": 1.2970297029702972e-05, + "loss": 1.0771, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 1.9733133562414973, + "learning_rate": 1.306930693069307e-05, + "loss": 1.0488, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 1.831830200762256, + "learning_rate": 1.316831683168317e-05, + "loss": 1.0305, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 2.4462152054957795, + "learning_rate": 1.326732673267327e-05, + "loss": 1.0456, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 1.9833745142641062, + "learning_rate": 1.3366336633663369e-05, + "loss": 1.1242, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 2.245339693291035, + "learning_rate": 1.3465346534653467e-05, + "loss": 1.0501, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 2.141274120592458, + "learning_rate": 1.3564356435643566e-05, + "loss": 1.0582, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 2.374277998011458, + "learning_rate": 1.3663366336633666e-05, + "loss": 1.0625, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 1.9395677478874662, + "learning_rate": 1.3762376237623762e-05, + "loss": 1.0507, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 2.0312424081880573, + "learning_rate": 1.3861386138613861e-05, + "loss": 1.1991, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 2.3908910540964516, + "learning_rate": 1.396039603960396e-05, + "loss": 1.1068, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 1.8709075890872888, + "learning_rate": 1.405940594059406e-05, + "loss": 1.0952, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 2.000991169628112, + "learning_rate": 1.4158415841584158e-05, + "loss": 1.0709, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 2.0796126377623683, + "learning_rate": 1.4257425742574257e-05, + "loss": 1.0188, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 1.781014562824781, + "learning_rate": 1.4356435643564357e-05, + "loss": 1.1252, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 2.3517136145035034, + "learning_rate": 1.4455445544554456e-05, + "loss": 1.0104, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 1.8762707854357925, + "learning_rate": 1.4554455445544556e-05, + "loss": 1.0909, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 2.380889665506398, + "learning_rate": 1.4653465346534654e-05, + "loss": 1.0343, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 2.3265227494389067, + "learning_rate": 1.4752475247524753e-05, + "loss": 1.0973, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 1.8184422639009856, + "learning_rate": 1.4851485148514853e-05, + "loss": 1.1392, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 2.3622326550140116, + "learning_rate": 1.4950495049504952e-05, + "loss": 1.0573, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 2.3223206984371676, + "learning_rate": 1.504950495049505e-05, + "loss": 1.084, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 1.9331405224070533, + "learning_rate": 1.514851485148515e-05, + "loss": 1.0064, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 1.824550576019055, + "learning_rate": 1.5247524752475249e-05, + "loss": 1.1104, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 1.8952197937491162, + "learning_rate": 1.534653465346535e-05, + "loss": 1.0337, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 1.2666424543945978, + "learning_rate": 1.5445544554455448e-05, + "loss": 0.3005, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 1.8195748164659915, + "learning_rate": 1.5544554455445548e-05, + "loss": 1.0188, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 1.9899061164398222, + "learning_rate": 1.5643564356435644e-05, + "loss": 1.0611, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 1.8363408315251817, + "learning_rate": 1.5742574257425743e-05, + "loss": 1.0527, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 1.8801020982340528, + "learning_rate": 1.5841584158415843e-05, + "loss": 1.1418, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 2.1893456795945476, + "learning_rate": 1.5940594059405942e-05, + "loss": 1.1231, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 2.1579716933776347, + "learning_rate": 1.6039603960396042e-05, + "loss": 1.065, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 2.1190706654616207, + "learning_rate": 1.613861386138614e-05, + "loss": 1.0472, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 1.9383266093102474, + "learning_rate": 1.623762376237624e-05, + "loss": 1.0465, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 1.882368165924385, + "learning_rate": 1.6336633663366337e-05, + "loss": 1.0295, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 2.063286986987607, + "learning_rate": 1.6435643564356436e-05, + "loss": 1.0524, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 2.1721790601730753, + "learning_rate": 1.6534653465346536e-05, + "loss": 1.0208, + "step": 167 + }, + { + "epoch": 0.03, + "grad_norm": 1.7842244798772222, + "learning_rate": 1.6633663366336635e-05, + "loss": 1.0414, + "step": 168 + }, + { + "epoch": 0.03, + "grad_norm": 1.8412095101139438, + "learning_rate": 1.6732673267326735e-05, + "loss": 1.0799, + "step": 169 + }, + { + "epoch": 0.03, + "grad_norm": 2.0135741890005785, + "learning_rate": 1.683168316831683e-05, + "loss": 1.1115, + "step": 170 + }, + { + "epoch": 0.03, + "grad_norm": 1.9897759857740758, + "learning_rate": 1.693069306930693e-05, + "loss": 1.0938, + "step": 171 + }, + { + "epoch": 0.03, + "grad_norm": 1.9745107771408377, + "learning_rate": 1.702970297029703e-05, + "loss": 1.1316, + "step": 172 + }, + { + "epoch": 0.03, + "grad_norm": 1.8723737443904078, + "learning_rate": 1.712871287128713e-05, + "loss": 1.1484, + "step": 173 + }, + { + "epoch": 0.03, + "grad_norm": 1.9620152153533466, + "learning_rate": 1.722772277227723e-05, + "loss": 1.0567, + "step": 174 + }, + { + "epoch": 0.03, + "grad_norm": 1.6571345160418192, + "learning_rate": 1.732673267326733e-05, + "loss": 0.9909, + "step": 175 + }, + { + "epoch": 0.03, + "grad_norm": 2.3155871128220933, + "learning_rate": 1.7425742574257428e-05, + "loss": 1.0136, + "step": 176 + }, + { + "epoch": 0.03, + "grad_norm": 1.8975985315668868, + "learning_rate": 1.7524752475247528e-05, + "loss": 1.0782, + "step": 177 + }, + { + "epoch": 0.03, + "grad_norm": 2.0331616612731165, + "learning_rate": 1.7623762376237624e-05, + "loss": 1.1173, + "step": 178 + }, + { + "epoch": 0.03, + "grad_norm": 2.292007435673564, + "learning_rate": 1.7722772277227723e-05, + "loss": 1.0251, + "step": 179 + }, + { + "epoch": 0.03, + "grad_norm": 2.2625782524323026, + "learning_rate": 1.7821782178217823e-05, + "loss": 1.0902, + "step": 180 + }, + { + "epoch": 0.03, + "grad_norm": 2.44840691692822, + "learning_rate": 1.7920792079207922e-05, + "loss": 1.012, + "step": 181 + }, + { + "epoch": 0.03, + "grad_norm": 2.1918713608144023, + "learning_rate": 1.8019801980198022e-05, + "loss": 1.0646, + "step": 182 + }, + { + "epoch": 0.03, + "grad_norm": 1.6412360098068943, + "learning_rate": 1.811881188118812e-05, + "loss": 1.0053, + "step": 183 + }, + { + "epoch": 0.03, + "grad_norm": 2.1715811901191255, + "learning_rate": 1.821782178217822e-05, + "loss": 1.0353, + "step": 184 + }, + { + "epoch": 0.03, + "grad_norm": 1.721914291625859, + "learning_rate": 1.831683168316832e-05, + "loss": 1.0385, + "step": 185 + }, + { + "epoch": 0.03, + "grad_norm": 1.8854165035497128, + "learning_rate": 1.841584158415842e-05, + "loss": 1.0852, + "step": 186 + }, + { + "epoch": 0.03, + "grad_norm": 2.094190246998848, + "learning_rate": 1.8514851485148516e-05, + "loss": 0.9552, + "step": 187 + }, + { + "epoch": 0.03, + "grad_norm": 1.7715648330031164, + "learning_rate": 1.8613861386138615e-05, + "loss": 1.0648, + "step": 188 + }, + { + "epoch": 0.03, + "grad_norm": 1.9993014093448769, + "learning_rate": 1.8712871287128715e-05, + "loss": 1.0375, + "step": 189 + }, + { + "epoch": 0.03, + "grad_norm": 2.1229396887764107, + "learning_rate": 1.881188118811881e-05, + "loss": 1.1171, + "step": 190 + }, + { + "epoch": 0.03, + "grad_norm": 2.675628067421174, + "learning_rate": 1.891089108910891e-05, + "loss": 1.1088, + "step": 191 + }, + { + "epoch": 0.03, + "grad_norm": 1.967944255637049, + "learning_rate": 1.900990099009901e-05, + "loss": 1.0119, + "step": 192 + }, + { + "epoch": 0.03, + "grad_norm": 2.140395665588024, + "learning_rate": 1.910891089108911e-05, + "loss": 1.1057, + "step": 193 + }, + { + "epoch": 0.03, + "grad_norm": 2.041794683901522, + "learning_rate": 1.920792079207921e-05, + "loss": 1.047, + "step": 194 + }, + { + "epoch": 0.03, + "grad_norm": 2.1757371151486833, + "learning_rate": 1.930693069306931e-05, + "loss": 1.05, + "step": 195 + }, + { + "epoch": 0.03, + "grad_norm": 2.0122452148872787, + "learning_rate": 1.9405940594059408e-05, + "loss": 1.0408, + "step": 196 + }, + { + "epoch": 0.03, + "grad_norm": 2.453726450269774, + "learning_rate": 1.9504950495049508e-05, + "loss": 1.05, + "step": 197 + }, + { + "epoch": 0.03, + "grad_norm": 1.9361060861916906, + "learning_rate": 1.9603960396039604e-05, + "loss": 0.9558, + "step": 198 + }, + { + "epoch": 0.03, + "grad_norm": 2.0977788104405524, + "learning_rate": 1.9702970297029703e-05, + "loss": 1.0304, + "step": 199 + }, + { + "epoch": 0.03, + "grad_norm": 2.7380497408748563, + "learning_rate": 1.9801980198019803e-05, + "loss": 1.0248, + "step": 200 + }, + { + "epoch": 0.03, + "grad_norm": 2.061102223700244, + "learning_rate": 1.9900990099009902e-05, + "loss": 1.066, + "step": 201 + }, + { + "epoch": 0.03, + "grad_norm": 1.8769054208304206, + "learning_rate": 2e-05, + "loss": 1.0021, + "step": 202 + }, + { + "epoch": 0.03, + "grad_norm": 1.9700349198522595, + "learning_rate": 1.9999998831999504e-05, + "loss": 1.0194, + "step": 203 + }, + { + "epoch": 0.03, + "grad_norm": 2.1649539812066867, + "learning_rate": 1.9999995327998284e-05, + "loss": 1.0191, + "step": 204 + }, + { + "epoch": 0.03, + "grad_norm": 2.244717044191121, + "learning_rate": 1.9999989487997156e-05, + "loss": 1.0838, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 2.165347941470964, + "learning_rate": 1.9999981311997488e-05, + "loss": 0.9735, + "step": 206 + }, + { + "epoch": 0.03, + "grad_norm": 1.8396468310253444, + "learning_rate": 1.999997080000119e-05, + "loss": 1.0186, + "step": 207 + }, + { + "epoch": 0.03, + "grad_norm": 2.084120466227254, + "learning_rate": 1.999995795201072e-05, + "loss": 1.0204, + "step": 208 + }, + { + "epoch": 0.03, + "grad_norm": 2.012804798367043, + "learning_rate": 1.9999942768029073e-05, + "loss": 0.9858, + "step": 209 + }, + { + "epoch": 0.03, + "grad_norm": 1.8934152718067327, + "learning_rate": 1.99999252480598e-05, + "loss": 1.051, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 1.8814071954750615, + "learning_rate": 1.9999905392106993e-05, + "loss": 1.0256, + "step": 211 + }, + { + "epoch": 0.03, + "grad_norm": 2.015441666257054, + "learning_rate": 1.9999883200175286e-05, + "loss": 0.9485, + "step": 212 + }, + { + "epoch": 0.03, + "grad_norm": 1.884512155912271, + "learning_rate": 1.9999858672269874e-05, + "loss": 1.0199, + "step": 213 + }, + { + "epoch": 0.03, + "grad_norm": 0.9580818739678238, + "learning_rate": 1.9999831808396477e-05, + "loss": 0.3055, + "step": 214 + }, + { + "epoch": 0.03, + "grad_norm": 1.9852759613789064, + "learning_rate": 1.999980260856137e-05, + "loss": 1.1423, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 1.966574727366982, + "learning_rate": 1.9999771072771384e-05, + "loss": 1.1323, + "step": 216 + }, + { + "epoch": 0.03, + "grad_norm": 1.7852930104230016, + "learning_rate": 1.9999737201033877e-05, + "loss": 1.0521, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 1.7433767151616615, + "learning_rate": 1.999970099335676e-05, + "loss": 1.082, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 2.237094436350509, + "learning_rate": 1.99996624497485e-05, + "loss": 1.1611, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 1.724866004461281, + "learning_rate": 1.9999621570218092e-05, + "loss": 1.0491, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 1.9428030588375604, + "learning_rate": 1.999957835477509e-05, + "loss": 1.0146, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 2.0752598382230425, + "learning_rate": 1.999953280342959e-05, + "loss": 1.115, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 1.759065001688073, + "learning_rate": 1.9999484916192225e-05, + "loss": 0.9559, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 2.037851146449414, + "learning_rate": 1.9999434693074192e-05, + "loss": 1.0458, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 1.8568434491521704, + "learning_rate": 1.999938213408722e-05, + "loss": 1.1218, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 2.082233029119249, + "learning_rate": 1.9999327239243586e-05, + "loss": 1.0666, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 2.073497027934702, + "learning_rate": 1.9999270008556108e-05, + "loss": 1.0257, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 1.8903854115847758, + "learning_rate": 1.9999210442038164e-05, + "loss": 1.0818, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 2.5869163536214277, + "learning_rate": 1.9999148539703662e-05, + "loss": 1.0493, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 2.266002510945593, + "learning_rate": 1.9999084301567066e-05, + "loss": 1.1063, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 1.9129603073518342, + "learning_rate": 1.9999017727643378e-05, + "loss": 1.0675, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 1.8782768186563432, + "learning_rate": 1.9998948817948157e-05, + "loss": 1.0875, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 2.2753283414379606, + "learning_rate": 1.9998877572497493e-05, + "loss": 0.9763, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 2.0831375564536687, + "learning_rate": 1.9998803991308036e-05, + "loss": 1.1391, + "step": 234 + }, + { + "epoch": 0.04, + "grad_norm": 1.766648181745103, + "learning_rate": 1.999872807439697e-05, + "loss": 0.9554, + "step": 235 + }, + { + "epoch": 0.04, + "grad_norm": 1.726816959099485, + "learning_rate": 1.9998649821782027e-05, + "loss": 1.0681, + "step": 236 + }, + { + "epoch": 0.04, + "grad_norm": 1.955048102606591, + "learning_rate": 1.999856923348149e-05, + "loss": 1.0445, + "step": 237 + }, + { + "epoch": 0.04, + "grad_norm": 1.8429625110431678, + "learning_rate": 1.9998486309514184e-05, + "loss": 1.0983, + "step": 238 + }, + { + "epoch": 0.04, + "grad_norm": 1.985602888795277, + "learning_rate": 1.999840104989948e-05, + "loss": 1.1041, + "step": 239 + }, + { + "epoch": 0.04, + "grad_norm": 2.080965162810293, + "learning_rate": 1.9998313454657295e-05, + "loss": 1.1206, + "step": 240 + }, + { + "epoch": 0.04, + "grad_norm": 1.9744983192855934, + "learning_rate": 1.9998223523808092e-05, + "loss": 0.9096, + "step": 241 + }, + { + "epoch": 0.04, + "grad_norm": 1.7847360350104817, + "learning_rate": 1.9998131257372878e-05, + "loss": 1.0918, + "step": 242 + }, + { + "epoch": 0.04, + "grad_norm": 1.9233913080632643, + "learning_rate": 1.9998036655373206e-05, + "loss": 1.0834, + "step": 243 + }, + { + "epoch": 0.04, + "grad_norm": 1.9163129655517637, + "learning_rate": 1.9997939717831173e-05, + "loss": 0.9963, + "step": 244 + }, + { + "epoch": 0.04, + "grad_norm": 2.0982153571844986, + "learning_rate": 1.9997840444769428e-05, + "loss": 1.0588, + "step": 245 + }, + { + "epoch": 0.04, + "grad_norm": 1.9450012708748048, + "learning_rate": 1.9997738836211157e-05, + "loss": 1.023, + "step": 246 + }, + { + "epoch": 0.04, + "grad_norm": 1.908639429438158, + "learning_rate": 1.99976348921801e-05, + "loss": 0.9814, + "step": 247 + }, + { + "epoch": 0.04, + "grad_norm": 1.8994625310916364, + "learning_rate": 1.9997528612700536e-05, + "loss": 1.0355, + "step": 248 + }, + { + "epoch": 0.04, + "grad_norm": 1.9360007341939889, + "learning_rate": 1.999741999779729e-05, + "loss": 1.0732, + "step": 249 + }, + { + "epoch": 0.04, + "grad_norm": 1.8855786798393648, + "learning_rate": 1.999730904749574e-05, + "loss": 1.0306, + "step": 250 + }, + { + "epoch": 0.04, + "grad_norm": 1.8928087643137284, + "learning_rate": 1.9997195761821797e-05, + "loss": 1.0616, + "step": 251 + }, + { + "epoch": 0.04, + "grad_norm": 2.5177678894190167, + "learning_rate": 1.9997080140801932e-05, + "loss": 0.9914, + "step": 252 + }, + { + "epoch": 0.04, + "grad_norm": 1.9171654850753623, + "learning_rate": 1.9996962184463147e-05, + "loss": 1.0288, + "step": 253 + }, + { + "epoch": 0.04, + "grad_norm": 1.696907802479002, + "learning_rate": 1.9996841892833e-05, + "loss": 1.0123, + "step": 254 + }, + { + "epoch": 0.04, + "grad_norm": 2.159671670460747, + "learning_rate": 1.9996719265939594e-05, + "loss": 1.0899, + "step": 255 + }, + { + "epoch": 0.04, + "grad_norm": 2.026016334585346, + "learning_rate": 1.999659430381157e-05, + "loss": 0.9954, + "step": 256 + }, + { + "epoch": 0.04, + "grad_norm": 2.046475194876582, + "learning_rate": 1.999646700647812e-05, + "loss": 1.006, + "step": 257 + }, + { + "epoch": 0.04, + "grad_norm": 1.7231000352423613, + "learning_rate": 1.999633737396898e-05, + "loss": 1.0283, + "step": 258 + }, + { + "epoch": 0.04, + "grad_norm": 1.8544222339937602, + "learning_rate": 1.9996205406314434e-05, + "loss": 1.0276, + "step": 259 + }, + { + "epoch": 0.04, + "grad_norm": 1.6873838916765134, + "learning_rate": 1.9996071103545313e-05, + "loss": 1.0372, + "step": 260 + }, + { + "epoch": 0.04, + "grad_norm": 1.8972473662410652, + "learning_rate": 1.9995934465692984e-05, + "loss": 1.0258, + "step": 261 + }, + { + "epoch": 0.04, + "grad_norm": 1.82234148294883, + "learning_rate": 1.9995795492789368e-05, + "loss": 1.0849, + "step": 262 + }, + { + "epoch": 0.04, + "grad_norm": 1.862813177036648, + "learning_rate": 1.999565418486693e-05, + "loss": 1.0573, + "step": 263 + }, + { + "epoch": 0.04, + "grad_norm": 2.090188627218271, + "learning_rate": 1.999551054195868e-05, + "loss": 0.9734, + "step": 264 + }, + { + "epoch": 0.04, + "grad_norm": 1.7376482121628392, + "learning_rate": 1.9995364564098166e-05, + "loss": 0.9733, + "step": 265 + }, + { + "epoch": 0.04, + "grad_norm": 2.2116963201468174, + "learning_rate": 1.99952162513195e-05, + "loss": 1.0725, + "step": 266 + }, + { + "epoch": 0.04, + "grad_norm": 2.250349703077166, + "learning_rate": 1.9995065603657317e-05, + "loss": 0.9851, + "step": 267 + }, + { + "epoch": 0.04, + "grad_norm": 2.0398888877351142, + "learning_rate": 1.9994912621146814e-05, + "loss": 1.0597, + "step": 268 + }, + { + "epoch": 0.04, + "grad_norm": 1.9515067854595212, + "learning_rate": 1.999475730382373e-05, + "loss": 1.0157, + "step": 269 + }, + { + "epoch": 0.04, + "grad_norm": 2.018554443890871, + "learning_rate": 1.9994599651724345e-05, + "loss": 1.1006, + "step": 270 + }, + { + "epoch": 0.04, + "grad_norm": 1.7905254733302385, + "learning_rate": 1.9994439664885484e-05, + "loss": 1.0788, + "step": 271 + }, + { + "epoch": 0.04, + "grad_norm": 1.9693304466436599, + "learning_rate": 1.999427734334452e-05, + "loss": 0.9886, + "step": 272 + }, + { + "epoch": 0.04, + "grad_norm": 1.7373140794779989, + "learning_rate": 1.9994112687139373e-05, + "loss": 0.9662, + "step": 273 + }, + { + "epoch": 0.04, + "grad_norm": 1.9108285900577464, + "learning_rate": 1.9993945696308508e-05, + "loss": 1.0296, + "step": 274 + }, + { + "epoch": 0.04, + "grad_norm": 1.7065143719243163, + "learning_rate": 1.9993776370890932e-05, + "loss": 1.0147, + "step": 275 + }, + { + "epoch": 0.04, + "grad_norm": 2.2489597462270825, + "learning_rate": 1.9993604710926203e-05, + "loss": 0.9634, + "step": 276 + }, + { + "epoch": 0.04, + "grad_norm": 1.7126272748330709, + "learning_rate": 1.9993430716454415e-05, + "loss": 0.8735, + "step": 277 + }, + { + "epoch": 0.04, + "grad_norm": 2.121608926231665, + "learning_rate": 1.9993254387516216e-05, + "loss": 0.9658, + "step": 278 + }, + { + "epoch": 0.04, + "grad_norm": 1.95073604202899, + "learning_rate": 1.99930757241528e-05, + "loss": 1.033, + "step": 279 + }, + { + "epoch": 0.04, + "grad_norm": 1.7314585208120084, + "learning_rate": 1.9992894726405894e-05, + "loss": 0.9172, + "step": 280 + }, + { + "epoch": 0.04, + "grad_norm": 1.828127203842204, + "learning_rate": 1.9992711394317787e-05, + "loss": 0.9255, + "step": 281 + }, + { + "epoch": 0.04, + "grad_norm": 1.807205312990042, + "learning_rate": 1.9992525727931303e-05, + "loss": 1.0163, + "step": 282 + }, + { + "epoch": 0.04, + "grad_norm": 1.9628912323148937, + "learning_rate": 1.9992337727289813e-05, + "loss": 0.9669, + "step": 283 + }, + { + "epoch": 0.04, + "grad_norm": 2.0469560934703117, + "learning_rate": 1.9992147392437235e-05, + "loss": 1.0973, + "step": 284 + }, + { + "epoch": 0.04, + "grad_norm": 2.3830503454477934, + "learning_rate": 1.999195472341803e-05, + "loss": 0.9828, + "step": 285 + }, + { + "epoch": 0.04, + "grad_norm": 2.0586659085839196, + "learning_rate": 1.999175972027721e-05, + "loss": 1.057, + "step": 286 + }, + { + "epoch": 0.04, + "grad_norm": 1.9353614779926795, + "learning_rate": 1.9991562383060316e-05, + "loss": 1.0074, + "step": 287 + }, + { + "epoch": 0.04, + "grad_norm": 1.8419681886062762, + "learning_rate": 1.999136271181346e-05, + "loss": 1.0666, + "step": 288 + }, + { + "epoch": 0.04, + "grad_norm": 1.7879527348103896, + "learning_rate": 1.999116070658328e-05, + "loss": 1.0565, + "step": 289 + }, + { + "epoch": 0.04, + "grad_norm": 1.9199725384008703, + "learning_rate": 1.999095636741696e-05, + "loss": 1.0567, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 1.9017182933614696, + "learning_rate": 1.999074969436224e-05, + "loss": 1.0614, + "step": 291 + }, + { + "epoch": 0.04, + "grad_norm": 1.8817085773345408, + "learning_rate": 1.9990540687467394e-05, + "loss": 0.9546, + "step": 292 + }, + { + "epoch": 0.04, + "grad_norm": 1.7456010052817308, + "learning_rate": 1.999032934678125e-05, + "loss": 1.09, + "step": 293 + }, + { + "epoch": 0.04, + "grad_norm": 1.8772906893987726, + "learning_rate": 1.9990115672353176e-05, + "loss": 1.0998, + "step": 294 + }, + { + "epoch": 0.04, + "grad_norm": 1.7140211617623706, + "learning_rate": 1.998989966423308e-05, + "loss": 0.9885, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 2.020093630177855, + "learning_rate": 1.9989681322471434e-05, + "loss": 0.9418, + "step": 296 + }, + { + "epoch": 0.04, + "grad_norm": 1.7061585698765522, + "learning_rate": 1.9989460647119232e-05, + "loss": 1.0078, + "step": 297 + }, + { + "epoch": 0.04, + "grad_norm": 1.677045096579522, + "learning_rate": 1.998923763822803e-05, + "loss": 1.0491, + "step": 298 + }, + { + "epoch": 0.04, + "grad_norm": 1.736300799709409, + "learning_rate": 1.9989012295849917e-05, + "loss": 1.0532, + "step": 299 + }, + { + "epoch": 0.04, + "grad_norm": 1.672784489958878, + "learning_rate": 1.998878462003754e-05, + "loss": 1.0298, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 1.6542057723864698, + "learning_rate": 1.998855461084408e-05, + "loss": 0.9925, + "step": 301 + }, + { + "epoch": 0.05, + "grad_norm": 2.024508696247414, + "learning_rate": 1.998832226832327e-05, + "loss": 1.096, + "step": 302 + }, + { + "epoch": 0.05, + "grad_norm": 1.8182449765794237, + "learning_rate": 1.998808759252938e-05, + "loss": 1.0475, + "step": 303 + }, + { + "epoch": 0.05, + "grad_norm": 1.992777576197344, + "learning_rate": 1.9987850583517232e-05, + "loss": 1.0334, + "step": 304 + }, + { + "epoch": 0.05, + "grad_norm": 1.784599012720452, + "learning_rate": 1.9987611241342196e-05, + "loss": 1.0242, + "step": 305 + }, + { + "epoch": 0.05, + "grad_norm": 1.7102947181663875, + "learning_rate": 1.998736956606018e-05, + "loss": 0.9414, + "step": 306 + }, + { + "epoch": 0.05, + "grad_norm": 1.82319059334704, + "learning_rate": 1.9987125557727633e-05, + "loss": 1.0131, + "step": 307 + }, + { + "epoch": 0.05, + "grad_norm": 1.8249288275636182, + "learning_rate": 1.9986879216401562e-05, + "loss": 1.029, + "step": 308 + }, + { + "epoch": 0.05, + "grad_norm": 1.1554929681087978, + "learning_rate": 1.9986630542139513e-05, + "loss": 0.3005, + "step": 309 + }, + { + "epoch": 0.05, + "grad_norm": 2.283899568144886, + "learning_rate": 1.9986379534999577e-05, + "loss": 1.0279, + "step": 310 + }, + { + "epoch": 0.05, + "grad_norm": 0.8502366709886288, + "learning_rate": 1.9986126195040384e-05, + "loss": 0.3008, + "step": 311 + }, + { + "epoch": 0.05, + "grad_norm": 1.9804678198381456, + "learning_rate": 1.9985870522321118e-05, + "loss": 1.027, + "step": 312 + }, + { + "epoch": 0.05, + "grad_norm": 1.8523264206389816, + "learning_rate": 1.9985612516901504e-05, + "loss": 0.9465, + "step": 313 + }, + { + "epoch": 0.05, + "grad_norm": 1.7362400017550708, + "learning_rate": 1.9985352178841807e-05, + "loss": 0.9668, + "step": 314 + }, + { + "epoch": 0.05, + "grad_norm": 1.8851263402754863, + "learning_rate": 1.998508950820285e-05, + "loss": 1.0016, + "step": 315 + }, + { + "epoch": 0.05, + "grad_norm": 1.880179349622182, + "learning_rate": 1.9984824505045988e-05, + "loss": 1.0792, + "step": 316 + }, + { + "epoch": 0.05, + "grad_norm": 1.6622596229035727, + "learning_rate": 1.9984557169433126e-05, + "loss": 0.9558, + "step": 317 + }, + { + "epoch": 0.05, + "grad_norm": 1.7383853473997237, + "learning_rate": 1.998428750142672e-05, + "loss": 1.082, + "step": 318 + }, + { + "epoch": 0.05, + "grad_norm": 1.7835925618124706, + "learning_rate": 1.998401550108975e-05, + "loss": 0.8812, + "step": 319 + }, + { + "epoch": 0.05, + "grad_norm": 2.6013586634606467, + "learning_rate": 1.9983741168485772e-05, + "loss": 1.0246, + "step": 320 + }, + { + "epoch": 0.05, + "grad_norm": 1.8094094972336086, + "learning_rate": 1.998346450367886e-05, + "loss": 0.9858, + "step": 321 + }, + { + "epoch": 0.05, + "grad_norm": 1.8180493040233767, + "learning_rate": 1.9983185506733643e-05, + "loss": 1.0272, + "step": 322 + }, + { + "epoch": 0.05, + "grad_norm": 2.2021418975259297, + "learning_rate": 1.9982904177715297e-05, + "loss": 1.05, + "step": 323 + }, + { + "epoch": 0.05, + "grad_norm": 2.0115654312892914, + "learning_rate": 1.9982620516689544e-05, + "loss": 1.0401, + "step": 324 + }, + { + "epoch": 0.05, + "grad_norm": 2.0398671984922796, + "learning_rate": 1.9982334523722643e-05, + "loss": 0.9548, + "step": 325 + }, + { + "epoch": 0.05, + "grad_norm": 2.1646276400547193, + "learning_rate": 1.9982046198881403e-05, + "loss": 1.0689, + "step": 326 + }, + { + "epoch": 0.05, + "grad_norm": 1.1778496983117752, + "learning_rate": 1.9981755542233175e-05, + "loss": 0.3349, + "step": 327 + }, + { + "epoch": 0.05, + "grad_norm": 1.9221729245359622, + "learning_rate": 1.998146255384586e-05, + "loss": 1.0653, + "step": 328 + }, + { + "epoch": 0.05, + "grad_norm": 1.958836273193452, + "learning_rate": 1.9981167233787898e-05, + "loss": 1.0531, + "step": 329 + }, + { + "epoch": 0.05, + "grad_norm": 1.855521367983362, + "learning_rate": 1.9980869582128274e-05, + "loss": 0.9553, + "step": 330 + }, + { + "epoch": 0.05, + "grad_norm": 2.2028664105751212, + "learning_rate": 1.9980569598936524e-05, + "loss": 0.9979, + "step": 331 + }, + { + "epoch": 0.05, + "grad_norm": 1.7059139757042052, + "learning_rate": 1.9980267284282718e-05, + "loss": 1.0646, + "step": 332 + }, + { + "epoch": 0.05, + "grad_norm": 1.864030614349971, + "learning_rate": 1.997996263823748e-05, + "loss": 1.012, + "step": 333 + }, + { + "epoch": 0.05, + "grad_norm": 2.0264614662391174, + "learning_rate": 1.997965566087198e-05, + "loss": 0.9927, + "step": 334 + }, + { + "epoch": 0.05, + "grad_norm": 1.7346730357420757, + "learning_rate": 1.997934635225792e-05, + "loss": 1.0399, + "step": 335 + }, + { + "epoch": 0.05, + "grad_norm": 1.8750773135775893, + "learning_rate": 1.9979034712467556e-05, + "loss": 0.9405, + "step": 336 + }, + { + "epoch": 0.05, + "grad_norm": 2.0466712475331534, + "learning_rate": 1.9978720741573693e-05, + "loss": 1.0562, + "step": 337 + }, + { + "epoch": 0.05, + "grad_norm": 2.0050362975564058, + "learning_rate": 1.997840443964967e-05, + "loss": 1.0619, + "step": 338 + }, + { + "epoch": 0.05, + "grad_norm": 1.7409079251133657, + "learning_rate": 1.9978085806769375e-05, + "loss": 1.0322, + "step": 339 + }, + { + "epoch": 0.05, + "grad_norm": 1.9528789498793981, + "learning_rate": 1.9977764843007242e-05, + "loss": 0.9718, + "step": 340 + }, + { + "epoch": 0.05, + "grad_norm": 1.6951967762418776, + "learning_rate": 1.9977441548438246e-05, + "loss": 0.993, + "step": 341 + }, + { + "epoch": 0.05, + "grad_norm": 1.801089539690508, + "learning_rate": 1.9977115923137912e-05, + "loss": 1.0848, + "step": 342 + }, + { + "epoch": 0.05, + "grad_norm": 1.924276458472716, + "learning_rate": 1.9976787967182303e-05, + "loss": 1.1116, + "step": 343 + }, + { + "epoch": 0.05, + "grad_norm": 1.911339529842714, + "learning_rate": 1.9976457680648033e-05, + "loss": 1.0106, + "step": 344 + }, + { + "epoch": 0.05, + "grad_norm": 1.8795885286729412, + "learning_rate": 1.9976125063612254e-05, + "loss": 1.0892, + "step": 345 + }, + { + "epoch": 0.05, + "grad_norm": 1.8747873776798474, + "learning_rate": 1.9975790116152668e-05, + "loss": 1.0552, + "step": 346 + }, + { + "epoch": 0.05, + "grad_norm": 1.6759695411928928, + "learning_rate": 1.9975452838347513e-05, + "loss": 0.9651, + "step": 347 + }, + { + "epoch": 0.05, + "grad_norm": 1.892483736446108, + "learning_rate": 1.9975113230275583e-05, + "loss": 0.9574, + "step": 348 + }, + { + "epoch": 0.05, + "grad_norm": 1.8709728862932533, + "learning_rate": 1.997477129201621e-05, + "loss": 1.0756, + "step": 349 + }, + { + "epoch": 0.05, + "grad_norm": 1.9454781450273466, + "learning_rate": 1.997442702364927e-05, + "loss": 1.0243, + "step": 350 + }, + { + "epoch": 0.05, + "grad_norm": 1.2211417729946594, + "learning_rate": 1.997408042525518e-05, + "loss": 0.2988, + "step": 351 + }, + { + "epoch": 0.05, + "grad_norm": 2.0725325602143942, + "learning_rate": 1.9973731496914914e-05, + "loss": 1.0043, + "step": 352 + }, + { + "epoch": 0.05, + "grad_norm": 1.869161955493485, + "learning_rate": 1.9973380238709974e-05, + "loss": 0.9723, + "step": 353 + }, + { + "epoch": 0.05, + "grad_norm": 1.8280401841890608, + "learning_rate": 1.9973026650722417e-05, + "loss": 0.973, + "step": 354 + }, + { + "epoch": 0.05, + "grad_norm": 1.9158082367614362, + "learning_rate": 1.997267073303484e-05, + "loss": 1.0206, + "step": 355 + }, + { + "epoch": 0.05, + "grad_norm": 1.9184700180705447, + "learning_rate": 1.9972312485730384e-05, + "loss": 0.9818, + "step": 356 + }, + { + "epoch": 0.05, + "grad_norm": 1.8360498617161698, + "learning_rate": 1.9971951908892743e-05, + "loss": 0.9858, + "step": 357 + }, + { + "epoch": 0.05, + "grad_norm": 1.879228401720701, + "learning_rate": 1.997158900260614e-05, + "loss": 0.9905, + "step": 358 + }, + { + "epoch": 0.05, + "grad_norm": 1.9168440780518097, + "learning_rate": 1.9971223766955353e-05, + "loss": 1.0587, + "step": 359 + }, + { + "epoch": 0.05, + "grad_norm": 1.6694300100059905, + "learning_rate": 1.99708562020257e-05, + "loss": 0.9729, + "step": 360 + }, + { + "epoch": 0.05, + "grad_norm": 2.0261356800612624, + "learning_rate": 1.997048630790305e-05, + "loss": 1.022, + "step": 361 + }, + { + "epoch": 0.05, + "grad_norm": 1.6654939561470525, + "learning_rate": 1.9970114084673796e-05, + "loss": 1.0084, + "step": 362 + }, + { + "epoch": 0.05, + "grad_norm": 1.653267932249111, + "learning_rate": 1.9969739532424907e-05, + "loss": 1.081, + "step": 363 + }, + { + "epoch": 0.05, + "grad_norm": 1.6893220464759946, + "learning_rate": 1.9969362651243865e-05, + "loss": 0.9857, + "step": 364 + }, + { + "epoch": 0.05, + "grad_norm": 1.642384659467501, + "learning_rate": 1.9968983441218715e-05, + "loss": 1.0841, + "step": 365 + }, + { + "epoch": 0.05, + "grad_norm": 1.9159525667443391, + "learning_rate": 1.9968601902438043e-05, + "loss": 0.996, + "step": 366 + }, + { + "epoch": 0.05, + "grad_norm": 1.7973358848616572, + "learning_rate": 1.996821803499097e-05, + "loss": 1.0523, + "step": 367 + }, + { + "epoch": 0.05, + "grad_norm": 1.6687826721336216, + "learning_rate": 1.9967831838967175e-05, + "loss": 1.0016, + "step": 368 + }, + { + "epoch": 0.06, + "grad_norm": 1.7746941099314142, + "learning_rate": 1.9967443314456867e-05, + "loss": 1.0465, + "step": 369 + }, + { + "epoch": 0.06, + "grad_norm": 2.2004750933586537, + "learning_rate": 1.996705246155081e-05, + "loss": 1.0773, + "step": 370 + }, + { + "epoch": 0.06, + "grad_norm": 1.7655020719540753, + "learning_rate": 1.99666592803403e-05, + "loss": 1.0387, + "step": 371 + }, + { + "epoch": 0.06, + "grad_norm": 2.365683017709636, + "learning_rate": 1.9966263770917192e-05, + "loss": 1.0124, + "step": 372 + }, + { + "epoch": 0.06, + "grad_norm": 2.0366524707880345, + "learning_rate": 1.9965865933373874e-05, + "loss": 0.874, + "step": 373 + }, + { + "epoch": 0.06, + "grad_norm": 1.671223843411921, + "learning_rate": 1.9965465767803283e-05, + "loss": 0.9557, + "step": 374 + }, + { + "epoch": 0.06, + "grad_norm": 1.9471098475001651, + "learning_rate": 1.9965063274298893e-05, + "loss": 0.9588, + "step": 375 + }, + { + "epoch": 0.06, + "grad_norm": 1.8134046364665488, + "learning_rate": 1.996465845295473e-05, + "loss": 1.1017, + "step": 376 + }, + { + "epoch": 0.06, + "grad_norm": 1.9016930565882155, + "learning_rate": 1.9964251303865362e-05, + "loss": 0.9739, + "step": 377 + }, + { + "epoch": 0.06, + "grad_norm": 2.271598534245101, + "learning_rate": 1.9963841827125897e-05, + "loss": 0.9472, + "step": 378 + }, + { + "epoch": 0.06, + "grad_norm": 1.896049194221097, + "learning_rate": 1.9963430022831988e-05, + "loss": 0.9424, + "step": 379 + }, + { + "epoch": 0.06, + "grad_norm": 1.780642059590237, + "learning_rate": 1.996301589107983e-05, + "loss": 0.9207, + "step": 380 + }, + { + "epoch": 0.06, + "grad_norm": 2.270826279650387, + "learning_rate": 1.9962599431966168e-05, + "loss": 1.0037, + "step": 381 + }, + { + "epoch": 0.06, + "grad_norm": 1.8704117316418303, + "learning_rate": 1.996218064558829e-05, + "loss": 0.9496, + "step": 382 + }, + { + "epoch": 0.06, + "grad_norm": 1.8661718269613556, + "learning_rate": 1.9961759532044017e-05, + "loss": 1.0117, + "step": 383 + }, + { + "epoch": 0.06, + "grad_norm": 1.8417651698598574, + "learning_rate": 1.9961336091431728e-05, + "loss": 0.9955, + "step": 384 + }, + { + "epoch": 0.06, + "grad_norm": 1.6470843282071816, + "learning_rate": 1.9960910323850333e-05, + "loss": 0.9237, + "step": 385 + }, + { + "epoch": 0.06, + "grad_norm": 2.084013593695897, + "learning_rate": 1.9960482229399294e-05, + "loss": 0.9585, + "step": 386 + }, + { + "epoch": 0.06, + "grad_norm": 1.7429179544113134, + "learning_rate": 1.9960051808178616e-05, + "loss": 0.979, + "step": 387 + }, + { + "epoch": 0.06, + "grad_norm": 1.7214252112920558, + "learning_rate": 1.995961906028884e-05, + "loss": 0.9104, + "step": 388 + }, + { + "epoch": 0.06, + "grad_norm": 1.710460520148069, + "learning_rate": 1.9959183985831063e-05, + "loss": 1.025, + "step": 389 + }, + { + "epoch": 0.06, + "grad_norm": 1.7839407392827988, + "learning_rate": 1.9958746584906914e-05, + "loss": 0.9947, + "step": 390 + }, + { + "epoch": 0.06, + "grad_norm": 1.816552667971807, + "learning_rate": 1.995830685761857e-05, + "loss": 0.929, + "step": 391 + }, + { + "epoch": 0.06, + "grad_norm": 1.9080235563689414, + "learning_rate": 1.9957864804068752e-05, + "loss": 1.0407, + "step": 392 + }, + { + "epoch": 0.06, + "grad_norm": 1.609038295754576, + "learning_rate": 1.9957420424360726e-05, + "loss": 1.0248, + "step": 393 + }, + { + "epoch": 0.06, + "grad_norm": 1.6180642787300732, + "learning_rate": 1.9956973718598292e-05, + "loss": 1.0125, + "step": 394 + }, + { + "epoch": 0.06, + "grad_norm": 1.837362902823947, + "learning_rate": 1.9956524686885807e-05, + "loss": 0.9714, + "step": 395 + }, + { + "epoch": 0.06, + "grad_norm": 1.692729518219842, + "learning_rate": 1.9956073329328168e-05, + "loss": 0.9608, + "step": 396 + }, + { + "epoch": 0.06, + "grad_norm": 2.262973964163377, + "learning_rate": 1.99556196460308e-05, + "loss": 1.0805, + "step": 397 + }, + { + "epoch": 0.06, + "grad_norm": 2.486665981438739, + "learning_rate": 1.9955163637099698e-05, + "loss": 0.973, + "step": 398 + }, + { + "epoch": 0.06, + "grad_norm": 1.6279921082061088, + "learning_rate": 1.9954705302641373e-05, + "loss": 1.0719, + "step": 399 + }, + { + "epoch": 0.06, + "grad_norm": 1.8690799137248861, + "learning_rate": 1.99542446427629e-05, + "loss": 0.9738, + "step": 400 + }, + { + "epoch": 0.06, + "grad_norm": 1.8688923352953541, + "learning_rate": 1.9953781657571887e-05, + "loss": 0.8855, + "step": 401 + }, + { + "epoch": 0.06, + "grad_norm": 1.9138325981333588, + "learning_rate": 1.995331634717649e-05, + "loss": 1.008, + "step": 402 + }, + { + "epoch": 0.06, + "grad_norm": 0.9804330084088011, + "learning_rate": 1.9952848711685398e-05, + "loss": 0.3712, + "step": 403 + }, + { + "epoch": 0.06, + "grad_norm": 1.7239681728136025, + "learning_rate": 1.9952378751207858e-05, + "loss": 0.9922, + "step": 404 + }, + { + "epoch": 0.06, + "grad_norm": 2.272422800043243, + "learning_rate": 1.995190646585365e-05, + "loss": 1.0908, + "step": 405 + }, + { + "epoch": 0.06, + "grad_norm": 1.9734144396729554, + "learning_rate": 1.99514318557331e-05, + "loss": 1.0069, + "step": 406 + }, + { + "epoch": 0.06, + "grad_norm": 1.6705814464427644, + "learning_rate": 1.9950954920957074e-05, + "loss": 0.973, + "step": 407 + }, + { + "epoch": 0.06, + "grad_norm": 1.9164283455695768, + "learning_rate": 1.995047566163699e-05, + "loss": 0.967, + "step": 408 + }, + { + "epoch": 0.06, + "grad_norm": 2.0825054548353537, + "learning_rate": 1.99499940778848e-05, + "loss": 1.0111, + "step": 409 + }, + { + "epoch": 0.06, + "grad_norm": 1.6946507603916696, + "learning_rate": 1.9949510169813006e-05, + "loss": 1.0424, + "step": 410 + }, + { + "epoch": 0.06, + "grad_norm": 1.6963439332985022, + "learning_rate": 1.994902393753464e-05, + "loss": 0.925, + "step": 411 + }, + { + "epoch": 0.06, + "grad_norm": 1.7813881184752924, + "learning_rate": 1.994853538116329e-05, + "loss": 0.98, + "step": 412 + }, + { + "epoch": 0.06, + "grad_norm": 0.9476875471598322, + "learning_rate": 1.9948044500813085e-05, + "loss": 0.2952, + "step": 413 + }, + { + "epoch": 0.06, + "grad_norm": 1.631977815926575, + "learning_rate": 1.9947551296598698e-05, + "loss": 1.04, + "step": 414 + }, + { + "epoch": 0.06, + "grad_norm": 1.8654933240402845, + "learning_rate": 1.9947055768635333e-05, + "loss": 1.0291, + "step": 415 + }, + { + "epoch": 0.06, + "grad_norm": 1.724831197476821, + "learning_rate": 1.9946557917038752e-05, + "loss": 1.0285, + "step": 416 + }, + { + "epoch": 0.06, + "grad_norm": 1.7055810949008126, + "learning_rate": 1.994605774192525e-05, + "loss": 0.9754, + "step": 417 + }, + { + "epoch": 0.06, + "grad_norm": 1.902655632725055, + "learning_rate": 1.9945555243411666e-05, + "loss": 0.9884, + "step": 418 + }, + { + "epoch": 0.06, + "grad_norm": 1.7521010796747778, + "learning_rate": 1.994505042161539e-05, + "loss": 1.0392, + "step": 419 + }, + { + "epoch": 0.06, + "grad_norm": 1.9706370538272724, + "learning_rate": 1.9944543276654343e-05, + "loss": 1.0162, + "step": 420 + }, + { + "epoch": 0.06, + "grad_norm": 1.5913916826868926, + "learning_rate": 1.9944033808646997e-05, + "loss": 0.9555, + "step": 421 + }, + { + "epoch": 0.06, + "grad_norm": 1.947416664127582, + "learning_rate": 1.994352201771236e-05, + "loss": 1.0022, + "step": 422 + }, + { + "epoch": 0.06, + "grad_norm": 1.6636265488106525, + "learning_rate": 1.994300790396999e-05, + "loss": 0.9972, + "step": 423 + }, + { + "epoch": 0.06, + "grad_norm": 0.8301763145490835, + "learning_rate": 1.9942491467539984e-05, + "loss": 0.2936, + "step": 424 + }, + { + "epoch": 0.06, + "grad_norm": 1.7860710337754988, + "learning_rate": 1.994197270854298e-05, + "loss": 1.1105, + "step": 425 + }, + { + "epoch": 0.06, + "grad_norm": 1.7246742095943215, + "learning_rate": 1.9941451627100163e-05, + "loss": 1.0, + "step": 426 + }, + { + "epoch": 0.06, + "grad_norm": 1.7204170411400848, + "learning_rate": 1.9940928223333254e-05, + "loss": 1.0085, + "step": 427 + }, + { + "epoch": 0.06, + "grad_norm": 1.7230008336438516, + "learning_rate": 1.994040249736452e-05, + "loss": 1.0553, + "step": 428 + }, + { + "epoch": 0.06, + "grad_norm": 1.9500428810203743, + "learning_rate": 1.9939874449316776e-05, + "loss": 0.9951, + "step": 429 + }, + { + "epoch": 0.06, + "grad_norm": 1.9835723244770853, + "learning_rate": 1.9939344079313368e-05, + "loss": 0.9606, + "step": 430 + }, + { + "epoch": 0.06, + "grad_norm": 1.7612908926450928, + "learning_rate": 1.9938811387478193e-05, + "loss": 1.068, + "step": 431 + }, + { + "epoch": 0.06, + "grad_norm": 0.804672956335246, + "learning_rate": 1.9938276373935688e-05, + "loss": 0.2874, + "step": 432 + }, + { + "epoch": 0.06, + "grad_norm": 2.1189658835656644, + "learning_rate": 1.9937739038810833e-05, + "loss": 0.9592, + "step": 433 + }, + { + "epoch": 0.06, + "grad_norm": 1.907383084649791, + "learning_rate": 1.9937199382229147e-05, + "loss": 0.9922, + "step": 434 + }, + { + "epoch": 0.06, + "grad_norm": 1.8673974960543036, + "learning_rate": 1.9936657404316694e-05, + "loss": 1.0396, + "step": 435 + }, + { + "epoch": 0.07, + "grad_norm": 1.7609749617395538, + "learning_rate": 1.9936113105200085e-05, + "loss": 1.016, + "step": 436 + }, + { + "epoch": 0.07, + "grad_norm": 1.857177064408329, + "learning_rate": 1.9935566485006464e-05, + "loss": 1.0517, + "step": 437 + }, + { + "epoch": 0.07, + "grad_norm": 1.7881466257826963, + "learning_rate": 1.9935017543863522e-05, + "loss": 1.0058, + "step": 438 + }, + { + "epoch": 0.07, + "grad_norm": 0.7818711326981459, + "learning_rate": 1.993446628189949e-05, + "loss": 0.2837, + "step": 439 + }, + { + "epoch": 0.07, + "grad_norm": 1.732664123594324, + "learning_rate": 1.993391269924315e-05, + "loss": 1.0294, + "step": 440 + }, + { + "epoch": 0.07, + "grad_norm": 1.9324930738137451, + "learning_rate": 1.9933356796023808e-05, + "loss": 1.0167, + "step": 441 + }, + { + "epoch": 0.07, + "grad_norm": 1.8240254481494775, + "learning_rate": 1.993279857237133e-05, + "loss": 1.0563, + "step": 442 + }, + { + "epoch": 0.07, + "grad_norm": 1.6751234246176034, + "learning_rate": 1.9932238028416118e-05, + "loss": 0.969, + "step": 443 + }, + { + "epoch": 0.07, + "grad_norm": 0.8773278134733776, + "learning_rate": 1.9931675164289114e-05, + "loss": 0.3523, + "step": 444 + }, + { + "epoch": 0.07, + "grad_norm": 1.8829661421393336, + "learning_rate": 1.9931109980121797e-05, + "loss": 1.0271, + "step": 445 + }, + { + "epoch": 0.07, + "grad_norm": 1.670229941389474, + "learning_rate": 1.9930542476046204e-05, + "loss": 0.9869, + "step": 446 + }, + { + "epoch": 0.07, + "grad_norm": 1.9529735258492515, + "learning_rate": 1.99299726521949e-05, + "loss": 1.0093, + "step": 447 + }, + { + "epoch": 0.07, + "grad_norm": 1.8337061013793297, + "learning_rate": 1.9929400508700994e-05, + "loss": 1.0088, + "step": 448 + }, + { + "epoch": 0.07, + "grad_norm": 1.8868987054874113, + "learning_rate": 1.9928826045698138e-05, + "loss": 0.9401, + "step": 449 + }, + { + "epoch": 0.07, + "grad_norm": 1.7736822119106102, + "learning_rate": 1.992824926332053e-05, + "loss": 0.8959, + "step": 450 + }, + { + "epoch": 0.07, + "grad_norm": 1.8443091482171898, + "learning_rate": 1.9927670161702906e-05, + "loss": 0.9301, + "step": 451 + }, + { + "epoch": 0.07, + "grad_norm": 1.8673399290410109, + "learning_rate": 1.992708874098054e-05, + "loss": 1.0448, + "step": 452 + }, + { + "epoch": 0.07, + "grad_norm": 1.7480975521989213, + "learning_rate": 1.992650500128926e-05, + "loss": 1.0478, + "step": 453 + }, + { + "epoch": 0.07, + "grad_norm": 1.7284452650329931, + "learning_rate": 1.992591894276542e-05, + "loss": 0.9936, + "step": 454 + }, + { + "epoch": 0.07, + "grad_norm": 2.158507005520441, + "learning_rate": 1.9925330565545927e-05, + "loss": 1.0103, + "step": 455 + }, + { + "epoch": 0.07, + "grad_norm": 1.8754450935331193, + "learning_rate": 1.9924739869768222e-05, + "loss": 0.9605, + "step": 456 + }, + { + "epoch": 0.07, + "grad_norm": 1.597953439972833, + "learning_rate": 1.9924146855570298e-05, + "loss": 0.9656, + "step": 457 + }, + { + "epoch": 0.07, + "grad_norm": 1.7630046266649517, + "learning_rate": 1.992355152309068e-05, + "loss": 0.9888, + "step": 458 + }, + { + "epoch": 0.07, + "grad_norm": 1.673044052785185, + "learning_rate": 1.9922953872468436e-05, + "loss": 1.0132, + "step": 459 + }, + { + "epoch": 0.07, + "grad_norm": 1.93742425159078, + "learning_rate": 1.992235390384318e-05, + "loss": 0.9908, + "step": 460 + }, + { + "epoch": 0.07, + "grad_norm": 1.7712795240065684, + "learning_rate": 1.9921751617355063e-05, + "loss": 0.9565, + "step": 461 + }, + { + "epoch": 0.07, + "grad_norm": 1.691828531846784, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.9299, + "step": 462 + }, + { + "epoch": 0.07, + "grad_norm": 1.6994929004809474, + "learning_rate": 1.9920540091353567e-05, + "loss": 0.9843, + "step": 463 + }, + { + "epoch": 0.07, + "grad_norm": 1.6426793406324183, + "learning_rate": 1.99199308521232e-05, + "loss": 1.0106, + "step": 464 + }, + { + "epoch": 0.07, + "grad_norm": 1.9115990999206212, + "learning_rate": 1.9919319295596e-05, + "loss": 1.0126, + "step": 465 + }, + { + "epoch": 0.07, + "grad_norm": 1.8678655610247155, + "learning_rate": 1.9918705421914816e-05, + "loss": 0.9274, + "step": 466 + }, + { + "epoch": 0.07, + "grad_norm": 1.6057461570853822, + "learning_rate": 1.9918089231223066e-05, + "loss": 1.0117, + "step": 467 + }, + { + "epoch": 0.07, + "grad_norm": 1.6413082675476167, + "learning_rate": 1.991747072366468e-05, + "loss": 1.0145, + "step": 468 + }, + { + "epoch": 0.07, + "grad_norm": 1.7508791020380656, + "learning_rate": 1.9916849899384147e-05, + "loss": 0.9372, + "step": 469 + }, + { + "epoch": 0.07, + "grad_norm": 1.6100942765885784, + "learning_rate": 1.9916226758526487e-05, + "loss": 0.8925, + "step": 470 + }, + { + "epoch": 0.07, + "grad_norm": 1.7453075149314676, + "learning_rate": 1.9915601301237268e-05, + "loss": 1.0288, + "step": 471 + }, + { + "epoch": 0.07, + "grad_norm": 1.7705674758427683, + "learning_rate": 1.99149735276626e-05, + "loss": 0.9981, + "step": 472 + }, + { + "epoch": 0.07, + "grad_norm": 1.772355931431959, + "learning_rate": 1.9914343437949127e-05, + "loss": 1.0421, + "step": 473 + }, + { + "epoch": 0.07, + "grad_norm": 2.046502924909874, + "learning_rate": 1.991371103224404e-05, + "loss": 0.9943, + "step": 474 + }, + { + "epoch": 0.07, + "grad_norm": 1.4070735930320404, + "learning_rate": 1.9913076310695068e-05, + "loss": 1.0179, + "step": 475 + }, + { + "epoch": 0.07, + "grad_norm": 1.595258698737812, + "learning_rate": 1.991243927345048e-05, + "loss": 0.9025, + "step": 476 + }, + { + "epoch": 0.07, + "grad_norm": 1.7908365270325948, + "learning_rate": 1.9911799920659093e-05, + "loss": 0.9881, + "step": 477 + }, + { + "epoch": 0.07, + "grad_norm": 1.5227477921046528, + "learning_rate": 1.9911158252470257e-05, + "loss": 1.063, + "step": 478 + }, + { + "epoch": 0.07, + "grad_norm": 1.791504372040175, + "learning_rate": 1.9910514269033866e-05, + "loss": 0.985, + "step": 479 + }, + { + "epoch": 0.07, + "grad_norm": 1.850521202263624, + "learning_rate": 1.9909867970500353e-05, + "loss": 0.9251, + "step": 480 + }, + { + "epoch": 0.07, + "grad_norm": 1.8951528050679687, + "learning_rate": 1.9909219357020695e-05, + "loss": 1.0234, + "step": 481 + }, + { + "epoch": 0.07, + "grad_norm": 1.8356998614462292, + "learning_rate": 1.9908568428746408e-05, + "loss": 0.9865, + "step": 482 + }, + { + "epoch": 0.07, + "grad_norm": 1.6852686719838303, + "learning_rate": 1.990791518582955e-05, + "loss": 0.965, + "step": 483 + }, + { + "epoch": 0.07, + "grad_norm": 1.7180072631786487, + "learning_rate": 1.9907259628422718e-05, + "loss": 1.0102, + "step": 484 + }, + { + "epoch": 0.07, + "grad_norm": 1.7700524194227036, + "learning_rate": 1.9906601756679048e-05, + "loss": 0.9603, + "step": 485 + }, + { + "epoch": 0.07, + "grad_norm": 1.6828417658585442, + "learning_rate": 1.990594157075222e-05, + "loss": 0.9792, + "step": 486 + }, + { + "epoch": 0.07, + "grad_norm": 1.7557589626348973, + "learning_rate": 1.9905279070796454e-05, + "loss": 0.8588, + "step": 487 + }, + { + "epoch": 0.07, + "grad_norm": 1.7317388665444549, + "learning_rate": 1.9904614256966514e-05, + "loss": 0.9986, + "step": 488 + }, + { + "epoch": 0.07, + "grad_norm": 1.6931663180939196, + "learning_rate": 1.9903947129417696e-05, + "loss": 1.0184, + "step": 489 + }, + { + "epoch": 0.07, + "grad_norm": 1.6999141717029091, + "learning_rate": 1.990327768830584e-05, + "loss": 1.0062, + "step": 490 + }, + { + "epoch": 0.07, + "grad_norm": 1.990725713993706, + "learning_rate": 1.990260593378733e-05, + "loss": 1.0831, + "step": 491 + }, + { + "epoch": 0.07, + "grad_norm": 1.6705278936628876, + "learning_rate": 1.9901931866019087e-05, + "loss": 0.9517, + "step": 492 + }, + { + "epoch": 0.07, + "grad_norm": 1.9700353571405922, + "learning_rate": 1.9901255485158574e-05, + "loss": 1.024, + "step": 493 + }, + { + "epoch": 0.07, + "grad_norm": 2.0101515433726713, + "learning_rate": 1.9900576791363795e-05, + "loss": 0.9651, + "step": 494 + }, + { + "epoch": 0.07, + "grad_norm": 1.9079881135031231, + "learning_rate": 1.989989578479329e-05, + "loss": 1.0301, + "step": 495 + }, + { + "epoch": 0.07, + "grad_norm": 2.1563214017269066, + "learning_rate": 1.989921246560614e-05, + "loss": 0.9411, + "step": 496 + }, + { + "epoch": 0.07, + "grad_norm": 1.9433340483122155, + "learning_rate": 1.989852683396198e-05, + "loss": 1.0592, + "step": 497 + }, + { + "epoch": 0.07, + "grad_norm": 1.6650423245663724, + "learning_rate": 1.989783889002096e-05, + "loss": 0.9298, + "step": 498 + }, + { + "epoch": 0.07, + "grad_norm": 1.9234651813792842, + "learning_rate": 1.989714863394379e-05, + "loss": 0.9825, + "step": 499 + }, + { + "epoch": 0.07, + "grad_norm": 2.0082280975700524, + "learning_rate": 1.989645606589171e-05, + "loss": 1.0092, + "step": 500 + }, + { + "epoch": 0.07, + "grad_norm": 1.9815040662391932, + "learning_rate": 1.989576118602651e-05, + "loss": 1.0367, + "step": 501 + }, + { + "epoch": 0.07, + "grad_norm": 1.7069392081266068, + "learning_rate": 1.9895063994510512e-05, + "loss": 0.9731, + "step": 502 + }, + { + "epoch": 0.08, + "grad_norm": 2.085711779508058, + "learning_rate": 1.989436449150658e-05, + "loss": 1.0642, + "step": 503 + }, + { + "epoch": 0.08, + "grad_norm": 1.6401934567058296, + "learning_rate": 1.9893662677178116e-05, + "loss": 1.0061, + "step": 504 + }, + { + "epoch": 0.08, + "grad_norm": 1.8772927472174918, + "learning_rate": 1.9892958551689065e-05, + "loss": 1.0555, + "step": 505 + }, + { + "epoch": 0.08, + "grad_norm": 1.6825233296028157, + "learning_rate": 1.989225211520391e-05, + "loss": 0.9436, + "step": 506 + }, + { + "epoch": 0.08, + "grad_norm": 1.5274675019123192, + "learning_rate": 1.9891543367887675e-05, + "loss": 0.9617, + "step": 507 + }, + { + "epoch": 0.08, + "grad_norm": 1.5803756492693821, + "learning_rate": 1.9890832309905927e-05, + "loss": 0.963, + "step": 508 + }, + { + "epoch": 0.08, + "grad_norm": 1.6939482132596444, + "learning_rate": 1.989011894142476e-05, + "loss": 0.9643, + "step": 509 + }, + { + "epoch": 0.08, + "grad_norm": 1.0225263659357182, + "learning_rate": 1.9889403262610827e-05, + "loss": 0.3143, + "step": 510 + }, + { + "epoch": 0.08, + "grad_norm": 1.589048564009571, + "learning_rate": 1.9888685273631307e-05, + "loss": 0.9786, + "step": 511 + }, + { + "epoch": 0.08, + "grad_norm": 1.6228493525542471, + "learning_rate": 1.988796497465392e-05, + "loss": 0.9555, + "step": 512 + }, + { + "epoch": 0.08, + "grad_norm": 0.8677355778429002, + "learning_rate": 1.9887242365846933e-05, + "loss": 0.3073, + "step": 513 + }, + { + "epoch": 0.08, + "grad_norm": 1.947119784848312, + "learning_rate": 1.988651744737914e-05, + "loss": 0.907, + "step": 514 + }, + { + "epoch": 0.08, + "grad_norm": 1.786175594370177, + "learning_rate": 1.9885790219419888e-05, + "loss": 0.9491, + "step": 515 + }, + { + "epoch": 0.08, + "grad_norm": 2.2095671154084395, + "learning_rate": 1.988506068213906e-05, + "loss": 0.9752, + "step": 516 + }, + { + "epoch": 0.08, + "grad_norm": 1.1107332157479002, + "learning_rate": 1.988432883570707e-05, + "loss": 0.3077, + "step": 517 + }, + { + "epoch": 0.08, + "grad_norm": 2.0106255199643956, + "learning_rate": 1.9883594680294878e-05, + "loss": 0.9505, + "step": 518 + }, + { + "epoch": 0.08, + "grad_norm": 1.9913934845663013, + "learning_rate": 1.9882858216073982e-05, + "loss": 1.0423, + "step": 519 + }, + { + "epoch": 0.08, + "grad_norm": 1.5683550787091347, + "learning_rate": 1.9882119443216426e-05, + "loss": 1.0429, + "step": 520 + }, + { + "epoch": 0.08, + "grad_norm": 1.6720639321493218, + "learning_rate": 1.988137836189478e-05, + "loss": 0.9568, + "step": 521 + }, + { + "epoch": 0.08, + "grad_norm": 1.6874564869445212, + "learning_rate": 1.9880634972282168e-05, + "loss": 0.9808, + "step": 522 + }, + { + "epoch": 0.08, + "grad_norm": 1.9911132917647867, + "learning_rate": 1.987988927455224e-05, + "loss": 1.035, + "step": 523 + }, + { + "epoch": 0.08, + "grad_norm": 1.8655671766479962, + "learning_rate": 1.9879141268879194e-05, + "loss": 0.9435, + "step": 524 + }, + { + "epoch": 0.08, + "grad_norm": 1.7108612549789455, + "learning_rate": 1.9878390955437764e-05, + "loss": 1.0306, + "step": 525 + }, + { + "epoch": 0.08, + "grad_norm": 0.903099821347723, + "learning_rate": 1.987763833440322e-05, + "loss": 0.2912, + "step": 526 + }, + { + "epoch": 0.08, + "grad_norm": 1.7776897303273496, + "learning_rate": 1.9876883405951378e-05, + "loss": 1.0163, + "step": 527 + }, + { + "epoch": 0.08, + "grad_norm": 2.0838132509155445, + "learning_rate": 1.9876126170258588e-05, + "loss": 1.0462, + "step": 528 + }, + { + "epoch": 0.08, + "grad_norm": 2.1040942707606436, + "learning_rate": 1.9875366627501743e-05, + "loss": 0.9781, + "step": 529 + }, + { + "epoch": 0.08, + "grad_norm": 1.7766394928123865, + "learning_rate": 1.9874604777858272e-05, + "loss": 0.9783, + "step": 530 + }, + { + "epoch": 0.08, + "grad_norm": 1.7587165044446451, + "learning_rate": 1.9873840621506138e-05, + "loss": 1.0506, + "step": 531 + }, + { + "epoch": 0.08, + "grad_norm": 1.8520954719057352, + "learning_rate": 1.987307415862385e-05, + "loss": 1.0355, + "step": 532 + }, + { + "epoch": 0.08, + "grad_norm": 1.738546160539183, + "learning_rate": 1.9872305389390454e-05, + "loss": 1.0503, + "step": 533 + }, + { + "epoch": 0.08, + "grad_norm": 1.7963801967360402, + "learning_rate": 1.987153431398554e-05, + "loss": 1.0747, + "step": 534 + }, + { + "epoch": 0.08, + "grad_norm": 2.051071892987532, + "learning_rate": 1.9870760932589225e-05, + "loss": 0.9607, + "step": 535 + }, + { + "epoch": 0.08, + "grad_norm": 1.6462638295482712, + "learning_rate": 1.9869985245382172e-05, + "loss": 1.0054, + "step": 536 + }, + { + "epoch": 0.08, + "grad_norm": 1.9253676833173057, + "learning_rate": 1.9869207252545582e-05, + "loss": 0.9846, + "step": 537 + }, + { + "epoch": 0.08, + "grad_norm": 1.864572095881507, + "learning_rate": 1.9868426954261198e-05, + "loss": 1.0285, + "step": 538 + }, + { + "epoch": 0.08, + "grad_norm": 1.8687462565056039, + "learning_rate": 1.986764435071129e-05, + "loss": 0.9482, + "step": 539 + }, + { + "epoch": 0.08, + "grad_norm": 2.117948055904973, + "learning_rate": 1.986685944207868e-05, + "loss": 1.0006, + "step": 540 + }, + { + "epoch": 0.08, + "grad_norm": 1.8121410787300227, + "learning_rate": 1.9866072228546724e-05, + "loss": 0.9498, + "step": 541 + }, + { + "epoch": 0.08, + "grad_norm": 1.5886336473246425, + "learning_rate": 1.986528271029931e-05, + "loss": 0.889, + "step": 542 + }, + { + "epoch": 0.08, + "grad_norm": 1.9319387230966267, + "learning_rate": 1.9864490887520877e-05, + "loss": 0.8756, + "step": 543 + }, + { + "epoch": 0.08, + "grad_norm": 1.5417456523426636, + "learning_rate": 1.986369676039638e-05, + "loss": 0.8807, + "step": 544 + }, + { + "epoch": 0.08, + "grad_norm": 2.1763364590287626, + "learning_rate": 1.9862900329111344e-05, + "loss": 0.9774, + "step": 545 + }, + { + "epoch": 0.08, + "grad_norm": 1.9791657103778806, + "learning_rate": 1.986210159385181e-05, + "loss": 0.9612, + "step": 546 + }, + { + "epoch": 0.08, + "grad_norm": 1.6572905484841525, + "learning_rate": 1.9861300554804357e-05, + "loss": 0.9916, + "step": 547 + }, + { + "epoch": 0.08, + "grad_norm": 1.6777328584462108, + "learning_rate": 1.9860497212156114e-05, + "loss": 1.0715, + "step": 548 + }, + { + "epoch": 0.08, + "grad_norm": 1.7792046059260411, + "learning_rate": 1.985969156609474e-05, + "loss": 1.0082, + "step": 549 + }, + { + "epoch": 0.08, + "grad_norm": 1.5246067255277136, + "learning_rate": 1.9858883616808434e-05, + "loss": 1.0161, + "step": 550 + }, + { + "epoch": 0.08, + "grad_norm": 1.7274715039470692, + "learning_rate": 1.9858073364485933e-05, + "loss": 1.0161, + "step": 551 + }, + { + "epoch": 0.08, + "grad_norm": 1.6498690937955873, + "learning_rate": 1.985726080931651e-05, + "loss": 0.9486, + "step": 552 + }, + { + "epoch": 0.08, + "grad_norm": 1.8807515037160574, + "learning_rate": 1.9856445951489984e-05, + "loss": 0.9449, + "step": 553 + }, + { + "epoch": 0.08, + "grad_norm": 1.8208768935597974, + "learning_rate": 1.9855628791196698e-05, + "loss": 0.9693, + "step": 554 + }, + { + "epoch": 0.08, + "grad_norm": 1.6833041720935167, + "learning_rate": 1.9854809328627546e-05, + "loss": 1.0078, + "step": 555 + }, + { + "epoch": 0.08, + "grad_norm": 1.6543281933906169, + "learning_rate": 1.985398756397395e-05, + "loss": 0.9228, + "step": 556 + }, + { + "epoch": 0.08, + "grad_norm": 1.583195277532729, + "learning_rate": 1.9853163497427885e-05, + "loss": 0.9595, + "step": 557 + }, + { + "epoch": 0.08, + "grad_norm": 1.867090988857162, + "learning_rate": 1.985233712918184e-05, + "loss": 0.9916, + "step": 558 + }, + { + "epoch": 0.08, + "grad_norm": 1.5232151688440378, + "learning_rate": 1.9851508459428858e-05, + "loss": 1.0172, + "step": 559 + }, + { + "epoch": 0.08, + "grad_norm": 1.7777277343045925, + "learning_rate": 1.985067748836252e-05, + "loss": 0.9679, + "step": 560 + }, + { + "epoch": 0.08, + "grad_norm": 1.812436360647807, + "learning_rate": 1.9849844216176945e-05, + "loss": 0.9489, + "step": 561 + }, + { + "epoch": 0.08, + "grad_norm": 1.7085347777656354, + "learning_rate": 1.9849008643066774e-05, + "loss": 0.9593, + "step": 562 + }, + { + "epoch": 0.08, + "grad_norm": 1.73334444203699, + "learning_rate": 1.9848170769227203e-05, + "loss": 0.9853, + "step": 563 + }, + { + "epoch": 0.08, + "grad_norm": 1.728386131651639, + "learning_rate": 1.984733059485396e-05, + "loss": 0.9522, + "step": 564 + }, + { + "epoch": 0.08, + "grad_norm": 1.9050766614994752, + "learning_rate": 1.984648812014331e-05, + "loss": 1.0545, + "step": 565 + }, + { + "epoch": 0.08, + "grad_norm": 1.6863897587600656, + "learning_rate": 1.9845643345292055e-05, + "loss": 0.889, + "step": 566 + }, + { + "epoch": 0.08, + "grad_norm": 1.6819698208684524, + "learning_rate": 1.984479627049753e-05, + "loss": 0.9936, + "step": 567 + }, + { + "epoch": 0.08, + "grad_norm": 0.8885363255340635, + "learning_rate": 1.984394689595762e-05, + "loss": 0.3346, + "step": 568 + }, + { + "epoch": 0.08, + "grad_norm": 1.8871134864353314, + "learning_rate": 1.9843095221870736e-05, + "loss": 0.9714, + "step": 569 + }, + { + "epoch": 0.09, + "grad_norm": 1.4777184199148228, + "learning_rate": 1.984224124843582e-05, + "loss": 0.9833, + "step": 570 + }, + { + "epoch": 0.09, + "grad_norm": 1.829762953791349, + "learning_rate": 1.9841384975852373e-05, + "loss": 1.0405, + "step": 571 + }, + { + "epoch": 0.09, + "grad_norm": 1.7528872110219917, + "learning_rate": 1.9840526404320415e-05, + "loss": 1.0123, + "step": 572 + }, + { + "epoch": 0.09, + "grad_norm": 1.8760102619994743, + "learning_rate": 1.9839665534040507e-05, + "loss": 0.9951, + "step": 573 + }, + { + "epoch": 0.09, + "grad_norm": 1.5714402952846405, + "learning_rate": 1.9838802365213752e-05, + "loss": 0.945, + "step": 574 + }, + { + "epoch": 0.09, + "grad_norm": 2.2546242292651044, + "learning_rate": 1.9837936898041783e-05, + "loss": 0.8821, + "step": 575 + }, + { + "epoch": 0.09, + "grad_norm": 1.7115642594977378, + "learning_rate": 1.9837069132726775e-05, + "loss": 0.8517, + "step": 576 + }, + { + "epoch": 0.09, + "grad_norm": 1.6632874064960919, + "learning_rate": 1.983619906947144e-05, + "loss": 1.0078, + "step": 577 + }, + { + "epoch": 0.09, + "grad_norm": 1.6181599335511319, + "learning_rate": 1.9835326708479015e-05, + "loss": 1.009, + "step": 578 + }, + { + "epoch": 0.09, + "grad_norm": 1.7747866544314281, + "learning_rate": 1.98344520499533e-05, + "loss": 1.0198, + "step": 579 + }, + { + "epoch": 0.09, + "grad_norm": 0.8244597406979507, + "learning_rate": 1.9833575094098602e-05, + "loss": 0.3072, + "step": 580 + }, + { + "epoch": 0.09, + "grad_norm": 1.978657081005872, + "learning_rate": 1.9832695841119784e-05, + "loss": 1.0135, + "step": 581 + }, + { + "epoch": 0.09, + "grad_norm": 1.7488868677099108, + "learning_rate": 1.9831814291222233e-05, + "loss": 0.9888, + "step": 582 + }, + { + "epoch": 0.09, + "grad_norm": 1.6212122804901654, + "learning_rate": 1.983093044461189e-05, + "loss": 0.963, + "step": 583 + }, + { + "epoch": 0.09, + "grad_norm": 1.5699787639287242, + "learning_rate": 1.9830044301495213e-05, + "loss": 0.9359, + "step": 584 + }, + { + "epoch": 0.09, + "grad_norm": 2.0507971672167007, + "learning_rate": 1.9829155862079207e-05, + "loss": 0.9246, + "step": 585 + }, + { + "epoch": 0.09, + "grad_norm": 1.4874152469933142, + "learning_rate": 1.9828265126571414e-05, + "loss": 0.9327, + "step": 586 + }, + { + "epoch": 0.09, + "grad_norm": 1.718251280605203, + "learning_rate": 1.982737209517991e-05, + "loss": 0.9475, + "step": 587 + }, + { + "epoch": 0.09, + "grad_norm": 1.7479648720031697, + "learning_rate": 1.9826476768113304e-05, + "loss": 0.9925, + "step": 588 + }, + { + "epoch": 0.09, + "grad_norm": 1.7225239112520367, + "learning_rate": 1.9825579145580747e-05, + "loss": 0.9955, + "step": 589 + }, + { + "epoch": 0.09, + "grad_norm": 1.8573163420724126, + "learning_rate": 1.982467922779192e-05, + "loss": 0.96, + "step": 590 + }, + { + "epoch": 0.09, + "grad_norm": 1.9061968201895618, + "learning_rate": 1.982377701495705e-05, + "loss": 0.9714, + "step": 591 + }, + { + "epoch": 0.09, + "grad_norm": 1.5896774926760178, + "learning_rate": 1.982287250728689e-05, + "loss": 0.9445, + "step": 592 + }, + { + "epoch": 0.09, + "grad_norm": 0.8734917685883975, + "learning_rate": 1.982196570499273e-05, + "loss": 0.3112, + "step": 593 + }, + { + "epoch": 0.09, + "grad_norm": 1.674042343599907, + "learning_rate": 1.9821056608286406e-05, + "loss": 0.9634, + "step": 594 + }, + { + "epoch": 0.09, + "grad_norm": 1.6146271670195458, + "learning_rate": 1.982014521738028e-05, + "loss": 0.9387, + "step": 595 + }, + { + "epoch": 0.09, + "grad_norm": 1.6757936015929897, + "learning_rate": 1.9819231532487252e-05, + "loss": 0.979, + "step": 596 + }, + { + "epoch": 0.09, + "grad_norm": 1.9101585434975927, + "learning_rate": 1.981831555382076e-05, + "loss": 0.9978, + "step": 597 + }, + { + "epoch": 0.09, + "grad_norm": 1.638509774175569, + "learning_rate": 1.9817397281594778e-05, + "loss": 0.9985, + "step": 598 + }, + { + "epoch": 0.09, + "grad_norm": 1.6597048566534132, + "learning_rate": 1.981647671602381e-05, + "loss": 1.0344, + "step": 599 + }, + { + "epoch": 0.09, + "grad_norm": 1.5224908736480791, + "learning_rate": 1.9815553857322905e-05, + "loss": 0.9918, + "step": 600 + }, + { + "epoch": 0.09, + "grad_norm": 1.9985248529060036, + "learning_rate": 1.9814628705707643e-05, + "loss": 1.0289, + "step": 601 + }, + { + "epoch": 0.09, + "grad_norm": 1.5327615466480906, + "learning_rate": 1.9813701261394136e-05, + "loss": 0.9655, + "step": 602 + }, + { + "epoch": 0.09, + "grad_norm": 1.6229098866994003, + "learning_rate": 1.9812771524599036e-05, + "loss": 0.9565, + "step": 603 + }, + { + "epoch": 0.09, + "grad_norm": 1.5051253765063912, + "learning_rate": 1.981183949553953e-05, + "loss": 0.9854, + "step": 604 + }, + { + "epoch": 0.09, + "grad_norm": 1.706387313271035, + "learning_rate": 1.981090517443334e-05, + "loss": 0.8863, + "step": 605 + }, + { + "epoch": 0.09, + "grad_norm": 1.6174941774190525, + "learning_rate": 1.9809968561498728e-05, + "loss": 0.9844, + "step": 606 + }, + { + "epoch": 0.09, + "grad_norm": 1.7553072826263136, + "learning_rate": 1.980902965695448e-05, + "loss": 0.9222, + "step": 607 + }, + { + "epoch": 0.09, + "grad_norm": 1.8186270151144839, + "learning_rate": 1.9808088461019926e-05, + "loss": 0.96, + "step": 608 + }, + { + "epoch": 0.09, + "grad_norm": 1.891269831009788, + "learning_rate": 1.980714497391493e-05, + "loss": 0.9335, + "step": 609 + }, + { + "epoch": 0.09, + "grad_norm": 1.6874262635801682, + "learning_rate": 1.9806199195859893e-05, + "loss": 0.9475, + "step": 610 + }, + { + "epoch": 0.09, + "grad_norm": 1.7359133537168558, + "learning_rate": 1.9805251127075746e-05, + "loss": 1.0473, + "step": 611 + }, + { + "epoch": 0.09, + "grad_norm": 1.621453926565137, + "learning_rate": 1.9804300767783958e-05, + "loss": 0.9227, + "step": 612 + }, + { + "epoch": 0.09, + "grad_norm": 1.6230725930169774, + "learning_rate": 1.9803348118206536e-05, + "loss": 0.9708, + "step": 613 + }, + { + "epoch": 0.09, + "grad_norm": 1.983605922888492, + "learning_rate": 1.9802393178566017e-05, + "loss": 1.0104, + "step": 614 + }, + { + "epoch": 0.09, + "grad_norm": 1.9554202974102726, + "learning_rate": 1.9801435949085475e-05, + "loss": 0.9732, + "step": 615 + }, + { + "epoch": 0.09, + "grad_norm": 1.849294609509034, + "learning_rate": 1.9800476429988516e-05, + "loss": 0.9606, + "step": 616 + }, + { + "epoch": 0.09, + "grad_norm": 1.563357603037707, + "learning_rate": 1.979951462149929e-05, + "loss": 0.9642, + "step": 617 + }, + { + "epoch": 0.09, + "grad_norm": 2.15545925295218, + "learning_rate": 1.979855052384247e-05, + "loss": 0.9849, + "step": 618 + }, + { + "epoch": 0.09, + "grad_norm": 1.621135612314959, + "learning_rate": 1.9797584137243272e-05, + "loss": 1.0143, + "step": 619 + }, + { + "epoch": 0.09, + "grad_norm": 1.5509560147836148, + "learning_rate": 1.9796615461927443e-05, + "loss": 0.9174, + "step": 620 + }, + { + "epoch": 0.09, + "grad_norm": 1.7656446578189005, + "learning_rate": 1.9795644498121266e-05, + "loss": 0.9895, + "step": 621 + }, + { + "epoch": 0.09, + "grad_norm": 1.611425933181333, + "learning_rate": 1.979467124605156e-05, + "loss": 1.0008, + "step": 622 + }, + { + "epoch": 0.09, + "grad_norm": 1.8890890708045418, + "learning_rate": 1.9793695705945674e-05, + "loss": 1.017, + "step": 623 + }, + { + "epoch": 0.09, + "grad_norm": 2.0334166640022753, + "learning_rate": 1.9792717878031498e-05, + "loss": 0.8839, + "step": 624 + }, + { + "epoch": 0.09, + "grad_norm": 1.7456661635297466, + "learning_rate": 1.979173776253745e-05, + "loss": 0.9794, + "step": 625 + }, + { + "epoch": 0.09, + "grad_norm": 1.5294370940175432, + "learning_rate": 1.979075535969248e-05, + "loss": 0.9557, + "step": 626 + }, + { + "epoch": 0.09, + "grad_norm": 2.0296542777234383, + "learning_rate": 1.9789770669726088e-05, + "loss": 0.9594, + "step": 627 + }, + { + "epoch": 0.09, + "grad_norm": 1.5689454503809042, + "learning_rate": 1.9788783692868288e-05, + "loss": 1.0005, + "step": 628 + }, + { + "epoch": 0.09, + "grad_norm": 1.6518207403192, + "learning_rate": 1.9787794429349645e-05, + "loss": 0.994, + "step": 629 + }, + { + "epoch": 0.09, + "grad_norm": 1.6208285390544763, + "learning_rate": 1.9786802879401248e-05, + "loss": 0.9192, + "step": 630 + }, + { + "epoch": 0.09, + "grad_norm": 1.6425782996323863, + "learning_rate": 1.978580904325472e-05, + "loss": 0.97, + "step": 631 + }, + { + "epoch": 0.09, + "grad_norm": 1.6699630286070308, + "learning_rate": 1.9784812921142232e-05, + "loss": 0.9691, + "step": 632 + }, + { + "epoch": 0.09, + "grad_norm": 1.716663219773858, + "learning_rate": 1.9783814513296464e-05, + "loss": 0.8943, + "step": 633 + }, + { + "epoch": 0.09, + "grad_norm": 1.83085374874018, + "learning_rate": 1.9782813819950656e-05, + "loss": 0.849, + "step": 634 + }, + { + "epoch": 0.09, + "grad_norm": 1.576647997372439, + "learning_rate": 1.9781810841338563e-05, + "loss": 0.9435, + "step": 635 + }, + { + "epoch": 0.09, + "grad_norm": 1.5873338689852323, + "learning_rate": 1.978080557769448e-05, + "loss": 0.9405, + "step": 636 + }, + { + "epoch": 0.1, + "grad_norm": 1.8983319227664757, + "learning_rate": 1.977979802925324e-05, + "loss": 0.9337, + "step": 637 + }, + { + "epoch": 0.1, + "grad_norm": 2.001087829787801, + "learning_rate": 1.977878819625021e-05, + "loss": 0.9547, + "step": 638 + }, + { + "epoch": 0.1, + "grad_norm": 1.5462695415537155, + "learning_rate": 1.9777776078921282e-05, + "loss": 0.988, + "step": 639 + }, + { + "epoch": 0.1, + "grad_norm": 1.8648083574798346, + "learning_rate": 1.9776761677502888e-05, + "loss": 0.9448, + "step": 640 + }, + { + "epoch": 0.1, + "grad_norm": 1.7401880360865456, + "learning_rate": 1.977574499223199e-05, + "loss": 0.8825, + "step": 641 + }, + { + "epoch": 0.1, + "grad_norm": 1.588603384090599, + "learning_rate": 1.977472602334609e-05, + "loss": 0.9502, + "step": 642 + }, + { + "epoch": 0.1, + "grad_norm": 2.013363154080898, + "learning_rate": 1.9773704771083217e-05, + "loss": 0.8083, + "step": 643 + }, + { + "epoch": 0.1, + "grad_norm": 1.789818531110137, + "learning_rate": 1.9772681235681936e-05, + "loss": 0.9824, + "step": 644 + }, + { + "epoch": 0.1, + "grad_norm": 1.604910024704947, + "learning_rate": 1.9771655417381342e-05, + "loss": 0.9318, + "step": 645 + }, + { + "epoch": 0.1, + "grad_norm": 1.5895748848355196, + "learning_rate": 1.9770627316421074e-05, + "loss": 1.0216, + "step": 646 + }, + { + "epoch": 0.1, + "grad_norm": 1.7181986683114863, + "learning_rate": 1.976959693304129e-05, + "loss": 0.971, + "step": 647 + }, + { + "epoch": 0.1, + "grad_norm": 2.090795138197271, + "learning_rate": 1.9768564267482685e-05, + "loss": 1.0202, + "step": 648 + }, + { + "epoch": 0.1, + "grad_norm": 1.7739944108727936, + "learning_rate": 1.97675293199865e-05, + "loss": 0.9814, + "step": 649 + }, + { + "epoch": 0.1, + "grad_norm": 1.8472601771501216, + "learning_rate": 1.9766492090794488e-05, + "loss": 1.0092, + "step": 650 + }, + { + "epoch": 0.1, + "grad_norm": 1.7088780593500077, + "learning_rate": 1.9765452580148954e-05, + "loss": 0.9903, + "step": 651 + }, + { + "epoch": 0.1, + "grad_norm": 1.6144693677940285, + "learning_rate": 1.9764410788292724e-05, + "loss": 0.919, + "step": 652 + }, + { + "epoch": 0.1, + "grad_norm": 1.5807718951238905, + "learning_rate": 1.976336671546916e-05, + "loss": 0.9575, + "step": 653 + }, + { + "epoch": 0.1, + "grad_norm": 1.6875454987025957, + "learning_rate": 1.9762320361922156e-05, + "loss": 1.046, + "step": 654 + }, + { + "epoch": 0.1, + "grad_norm": 1.4835502843129302, + "learning_rate": 1.9761271727896148e-05, + "loss": 0.9545, + "step": 655 + }, + { + "epoch": 0.1, + "grad_norm": 1.552094217209983, + "learning_rate": 1.9760220813636087e-05, + "loss": 0.884, + "step": 656 + }, + { + "epoch": 0.1, + "grad_norm": 1.6201406423148235, + "learning_rate": 1.9759167619387474e-05, + "loss": 0.907, + "step": 657 + }, + { + "epoch": 0.1, + "grad_norm": 1.5753068772105716, + "learning_rate": 1.9758112145396335e-05, + "loss": 0.8854, + "step": 658 + }, + { + "epoch": 0.1, + "grad_norm": 1.8855973676547777, + "learning_rate": 1.9757054391909224e-05, + "loss": 1.0453, + "step": 659 + }, + { + "epoch": 0.1, + "grad_norm": 2.15880228589204, + "learning_rate": 1.9755994359173238e-05, + "loss": 0.9453, + "step": 660 + }, + { + "epoch": 0.1, + "grad_norm": 1.5953451254625532, + "learning_rate": 1.9754932047435994e-05, + "loss": 0.924, + "step": 661 + }, + { + "epoch": 0.1, + "grad_norm": 1.914323680924929, + "learning_rate": 1.9753867456945653e-05, + "loss": 0.9879, + "step": 662 + }, + { + "epoch": 0.1, + "grad_norm": 2.4891585281224864, + "learning_rate": 1.9752800587950903e-05, + "loss": 0.9308, + "step": 663 + }, + { + "epoch": 0.1, + "grad_norm": 1.9770560967582835, + "learning_rate": 1.9751731440700964e-05, + "loss": 0.9479, + "step": 664 + }, + { + "epoch": 0.1, + "grad_norm": 0.9414212346368325, + "learning_rate": 1.975066001544559e-05, + "loss": 0.3177, + "step": 665 + }, + { + "epoch": 0.1, + "grad_norm": 1.4875219115077953, + "learning_rate": 1.9749586312435065e-05, + "loss": 1.0379, + "step": 666 + }, + { + "epoch": 0.1, + "grad_norm": 1.696156265428028, + "learning_rate": 1.9748510331920204e-05, + "loss": 0.9922, + "step": 667 + }, + { + "epoch": 0.1, + "grad_norm": 1.833725443792948, + "learning_rate": 1.974743207415236e-05, + "loss": 0.9119, + "step": 668 + }, + { + "epoch": 0.1, + "grad_norm": 1.5354239446380398, + "learning_rate": 1.9746351539383412e-05, + "loss": 1.0048, + "step": 669 + }, + { + "epoch": 0.1, + "grad_norm": 1.5072036363757007, + "learning_rate": 1.9745268727865774e-05, + "loss": 0.9452, + "step": 670 + }, + { + "epoch": 0.1, + "grad_norm": 1.5520227210932784, + "learning_rate": 1.974418363985239e-05, + "loss": 1.019, + "step": 671 + }, + { + "epoch": 0.1, + "grad_norm": 1.7194076439498294, + "learning_rate": 1.9743096275596735e-05, + "loss": 0.9544, + "step": 672 + }, + { + "epoch": 0.1, + "grad_norm": 1.9746300444384288, + "learning_rate": 1.9742006635352822e-05, + "loss": 1.0593, + "step": 673 + }, + { + "epoch": 0.1, + "grad_norm": 1.4515460624215812, + "learning_rate": 1.9740914719375186e-05, + "loss": 0.9031, + "step": 674 + }, + { + "epoch": 0.1, + "grad_norm": 1.7203249098913107, + "learning_rate": 1.9739820527918904e-05, + "loss": 1.0139, + "step": 675 + }, + { + "epoch": 0.1, + "grad_norm": 1.6701071452379914, + "learning_rate": 1.9738724061239574e-05, + "loss": 0.9522, + "step": 676 + }, + { + "epoch": 0.1, + "grad_norm": 1.9822445033734273, + "learning_rate": 1.9737625319593338e-05, + "loss": 1.023, + "step": 677 + }, + { + "epoch": 0.1, + "grad_norm": 1.8078491756589703, + "learning_rate": 1.9736524303236852e-05, + "loss": 0.9974, + "step": 678 + }, + { + "epoch": 0.1, + "grad_norm": 1.620720465896709, + "learning_rate": 1.973542101242732e-05, + "loss": 0.9633, + "step": 679 + }, + { + "epoch": 0.1, + "grad_norm": 1.8149811065822787, + "learning_rate": 1.973431544742247e-05, + "loss": 0.9773, + "step": 680 + }, + { + "epoch": 0.1, + "grad_norm": 2.0294651699681596, + "learning_rate": 1.9733207608480563e-05, + "loss": 0.9759, + "step": 681 + }, + { + "epoch": 0.1, + "grad_norm": 1.8415138223323133, + "learning_rate": 1.9732097495860388e-05, + "loss": 0.892, + "step": 682 + }, + { + "epoch": 0.1, + "grad_norm": 2.184193274643253, + "learning_rate": 1.9730985109821268e-05, + "loss": 0.9952, + "step": 683 + }, + { + "epoch": 0.1, + "grad_norm": 1.829076071264722, + "learning_rate": 1.9729870450623056e-05, + "loss": 0.9834, + "step": 684 + }, + { + "epoch": 0.1, + "grad_norm": 1.9336315247670188, + "learning_rate": 1.972875351852614e-05, + "loss": 0.9684, + "step": 685 + }, + { + "epoch": 0.1, + "grad_norm": 1.843499861737653, + "learning_rate": 1.972763431379143e-05, + "loss": 1.0203, + "step": 686 + }, + { + "epoch": 0.1, + "grad_norm": 1.280939490502182, + "learning_rate": 1.972651283668038e-05, + "loss": 0.2995, + "step": 687 + }, + { + "epoch": 0.1, + "grad_norm": 1.7276681600565358, + "learning_rate": 1.9725389087454955e-05, + "loss": 1.0055, + "step": 688 + }, + { + "epoch": 0.1, + "grad_norm": 1.7860889549356194, + "learning_rate": 1.9724263066377678e-05, + "loss": 0.9907, + "step": 689 + }, + { + "epoch": 0.1, + "grad_norm": 1.580169419290622, + "learning_rate": 1.9723134773711577e-05, + "loss": 0.9858, + "step": 690 + }, + { + "epoch": 0.1, + "grad_norm": 1.7798074038021647, + "learning_rate": 1.972200420972022e-05, + "loss": 0.9992, + "step": 691 + }, + { + "epoch": 0.1, + "grad_norm": 1.6603925458254127, + "learning_rate": 1.9720871374667714e-05, + "loss": 0.9673, + "step": 692 + }, + { + "epoch": 0.1, + "grad_norm": 1.553690559546088, + "learning_rate": 1.971973626881869e-05, + "loss": 0.9464, + "step": 693 + }, + { + "epoch": 0.1, + "grad_norm": 1.5521020597463204, + "learning_rate": 1.97185988924383e-05, + "loss": 1.0286, + "step": 694 + }, + { + "epoch": 0.1, + "grad_norm": 2.1051505101313315, + "learning_rate": 1.9717459245792244e-05, + "loss": 0.9472, + "step": 695 + }, + { + "epoch": 0.1, + "grad_norm": 1.4890965884130531, + "learning_rate": 1.971631732914674e-05, + "loss": 1.0221, + "step": 696 + }, + { + "epoch": 0.1, + "grad_norm": 1.6719007066729015, + "learning_rate": 1.971517314276854e-05, + "loss": 0.9156, + "step": 697 + }, + { + "epoch": 0.1, + "grad_norm": 1.7548339892151512, + "learning_rate": 1.9714026686924925e-05, + "loss": 1.0017, + "step": 698 + }, + { + "epoch": 0.1, + "grad_norm": 1.6078271673560387, + "learning_rate": 1.971287796188371e-05, + "loss": 0.9495, + "step": 699 + }, + { + "epoch": 0.1, + "grad_norm": 1.9569815517061826, + "learning_rate": 1.971172696791323e-05, + "loss": 0.9591, + "step": 700 + }, + { + "epoch": 0.1, + "grad_norm": 1.5622121712700883, + "learning_rate": 1.971057370528237e-05, + "loss": 0.9649, + "step": 701 + }, + { + "epoch": 0.1, + "grad_norm": 1.5798150776059081, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.9079, + "step": 702 + }, + { + "epoch": 0.1, + "grad_norm": 1.964148706586664, + "learning_rate": 1.970826037511762e-05, + "loss": 0.9367, + "step": 703 + }, + { + "epoch": 0.11, + "grad_norm": 1.5867877522598344, + "learning_rate": 1.9707100308124128e-05, + "loss": 0.9014, + "step": 704 + }, + { + "epoch": 0.11, + "grad_norm": 1.5013019479606504, + "learning_rate": 1.9705937973551038e-05, + "loss": 0.8933, + "step": 705 + }, + { + "epoch": 0.11, + "grad_norm": 1.795385433687663, + "learning_rate": 1.9704773371669872e-05, + "loss": 0.8796, + "step": 706 + }, + { + "epoch": 0.11, + "grad_norm": 1.6536167372499206, + "learning_rate": 1.9703606502752674e-05, + "loss": 0.9731, + "step": 707 + }, + { + "epoch": 0.11, + "grad_norm": 1.6204598508801429, + "learning_rate": 1.9702437367072035e-05, + "loss": 1.0082, + "step": 708 + }, + { + "epoch": 0.11, + "grad_norm": 1.7234877711751868, + "learning_rate": 1.970126596490106e-05, + "loss": 0.9431, + "step": 709 + }, + { + "epoch": 0.11, + "grad_norm": 1.7680801758734348, + "learning_rate": 1.9700092296513386e-05, + "loss": 1.0432, + "step": 710 + }, + { + "epoch": 0.11, + "grad_norm": 1.6312546439488396, + "learning_rate": 1.969891636218319e-05, + "loss": 0.9005, + "step": 711 + }, + { + "epoch": 0.11, + "grad_norm": 1.6630434018495426, + "learning_rate": 1.9697738162185163e-05, + "loss": 0.9447, + "step": 712 + }, + { + "epoch": 0.11, + "grad_norm": 1.5979663983867929, + "learning_rate": 1.9696557696794537e-05, + "loss": 0.9786, + "step": 713 + }, + { + "epoch": 0.11, + "grad_norm": 2.010511880721655, + "learning_rate": 1.9695374966287065e-05, + "loss": 0.9573, + "step": 714 + }, + { + "epoch": 0.11, + "grad_norm": 1.451426944418813, + "learning_rate": 1.9694189970939033e-05, + "loss": 1.0436, + "step": 715 + }, + { + "epoch": 0.11, + "grad_norm": 1.5830455766111613, + "learning_rate": 1.9693002711027264e-05, + "loss": 0.957, + "step": 716 + }, + { + "epoch": 0.11, + "grad_norm": 1.850718215200661, + "learning_rate": 1.969181318682909e-05, + "loss": 0.9834, + "step": 717 + }, + { + "epoch": 0.11, + "grad_norm": 1.6516973570348639, + "learning_rate": 1.9690621398622394e-05, + "loss": 0.9334, + "step": 718 + }, + { + "epoch": 0.11, + "grad_norm": 1.7337003718934916, + "learning_rate": 1.968942734668557e-05, + "loss": 0.9681, + "step": 719 + }, + { + "epoch": 0.11, + "grad_norm": 1.620868496009823, + "learning_rate": 1.9688231031297556e-05, + "loss": 1.0357, + "step": 720 + }, + { + "epoch": 0.11, + "grad_norm": 1.5329059221659178, + "learning_rate": 1.9687032452737806e-05, + "loss": 0.9337, + "step": 721 + }, + { + "epoch": 0.11, + "grad_norm": 1.7384823136362557, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.9651, + "step": 722 + }, + { + "epoch": 0.11, + "grad_norm": 1.6136505602497995, + "learning_rate": 1.9684628507223588e-05, + "loss": 0.9546, + "step": 723 + }, + { + "epoch": 0.11, + "grad_norm": 1.7034927122136125, + "learning_rate": 1.968342314083068e-05, + "loss": 0.9271, + "step": 724 + }, + { + "epoch": 0.11, + "grad_norm": 1.607731683848297, + "learning_rate": 1.9682215512389163e-05, + "loss": 1.026, + "step": 725 + }, + { + "epoch": 0.11, + "grad_norm": 1.5943989963344436, + "learning_rate": 1.9681005622181137e-05, + "loss": 0.9623, + "step": 726 + }, + { + "epoch": 0.11, + "grad_norm": 1.7812770707599777, + "learning_rate": 1.967979347048923e-05, + "loss": 0.9548, + "step": 727 + }, + { + "epoch": 0.11, + "grad_norm": 1.8161840733654948, + "learning_rate": 1.9678579057596608e-05, + "loss": 1.063, + "step": 728 + }, + { + "epoch": 0.11, + "grad_norm": 1.508235574944762, + "learning_rate": 1.967736238378695e-05, + "loss": 0.9271, + "step": 729 + }, + { + "epoch": 0.11, + "grad_norm": 1.6764547210723733, + "learning_rate": 1.9676143449344477e-05, + "loss": 0.8779, + "step": 730 + }, + { + "epoch": 0.11, + "grad_norm": 1.620570709692484, + "learning_rate": 1.9674922254553933e-05, + "loss": 0.9964, + "step": 731 + }, + { + "epoch": 0.11, + "grad_norm": 1.580417804019471, + "learning_rate": 1.9673698799700582e-05, + "loss": 0.9744, + "step": 732 + }, + { + "epoch": 0.11, + "grad_norm": 1.5791244835449232, + "learning_rate": 1.967247308507023e-05, + "loss": 0.9742, + "step": 733 + }, + { + "epoch": 0.11, + "grad_norm": 1.792692921600134, + "learning_rate": 1.9671245110949202e-05, + "loss": 0.9572, + "step": 734 + }, + { + "epoch": 0.11, + "grad_norm": 1.5785025119118827, + "learning_rate": 1.9670014877624353e-05, + "loss": 0.9293, + "step": 735 + }, + { + "epoch": 0.11, + "grad_norm": 2.0725073120663478, + "learning_rate": 1.9668782385383065e-05, + "loss": 1.046, + "step": 736 + }, + { + "epoch": 0.11, + "grad_norm": 1.7035287151083034, + "learning_rate": 1.9667547634513248e-05, + "loss": 0.967, + "step": 737 + }, + { + "epoch": 0.11, + "grad_norm": 1.7512804545280596, + "learning_rate": 1.966631062530334e-05, + "loss": 0.9975, + "step": 738 + }, + { + "epoch": 0.11, + "grad_norm": 1.6133321229889348, + "learning_rate": 1.9665071358042307e-05, + "loss": 1.0493, + "step": 739 + }, + { + "epoch": 0.11, + "grad_norm": 1.4723658218407412, + "learning_rate": 1.9663829833019643e-05, + "loss": 0.8976, + "step": 740 + }, + { + "epoch": 0.11, + "grad_norm": 1.8201845036066666, + "learning_rate": 1.9662586050525365e-05, + "loss": 0.998, + "step": 741 + }, + { + "epoch": 0.11, + "grad_norm": 1.7125991303578467, + "learning_rate": 1.9661340010850025e-05, + "loss": 0.8724, + "step": 742 + }, + { + "epoch": 0.11, + "grad_norm": 1.5276290567322122, + "learning_rate": 1.9660091714284694e-05, + "loss": 1.0208, + "step": 743 + }, + { + "epoch": 0.11, + "grad_norm": 1.7215956625771214, + "learning_rate": 1.9658841161120982e-05, + "loss": 0.888, + "step": 744 + }, + { + "epoch": 0.11, + "grad_norm": 1.5381710598112661, + "learning_rate": 1.9657588351651007e-05, + "loss": 1.0088, + "step": 745 + }, + { + "epoch": 0.11, + "grad_norm": 1.8559954819162459, + "learning_rate": 1.9656333286167432e-05, + "loss": 0.996, + "step": 746 + }, + { + "epoch": 0.11, + "grad_norm": 1.6472589044431567, + "learning_rate": 1.9655075964963443e-05, + "loss": 0.9381, + "step": 747 + }, + { + "epoch": 0.11, + "grad_norm": 1.7694855146202892, + "learning_rate": 1.965381638833274e-05, + "loss": 0.9218, + "step": 748 + }, + { + "epoch": 0.11, + "grad_norm": 1.7817060959538993, + "learning_rate": 1.965255455656957e-05, + "loss": 0.8943, + "step": 749 + }, + { + "epoch": 0.11, + "grad_norm": 2.264459651465797, + "learning_rate": 1.9651290469968694e-05, + "loss": 1.0297, + "step": 750 + }, + { + "epoch": 0.11, + "grad_norm": 1.3211454847753379, + "learning_rate": 1.9650024128825406e-05, + "loss": 0.3338, + "step": 751 + }, + { + "epoch": 0.11, + "grad_norm": 1.787921990712669, + "learning_rate": 1.9648755533435517e-05, + "loss": 0.9523, + "step": 752 + }, + { + "epoch": 0.11, + "grad_norm": 1.4703297869429213, + "learning_rate": 1.9647484684095373e-05, + "loss": 1.0012, + "step": 753 + }, + { + "epoch": 0.11, + "grad_norm": 1.7385139352458288, + "learning_rate": 1.964621158110185e-05, + "loss": 1.0068, + "step": 754 + }, + { + "epoch": 0.11, + "grad_norm": 1.907685612810227, + "learning_rate": 1.9644936224752336e-05, + "loss": 0.9179, + "step": 755 + }, + { + "epoch": 0.11, + "grad_norm": 1.8933415826630855, + "learning_rate": 1.9643658615344762e-05, + "loss": 0.9486, + "step": 756 + }, + { + "epoch": 0.11, + "grad_norm": 1.992017226328773, + "learning_rate": 1.9642378753177573e-05, + "loss": 0.9536, + "step": 757 + }, + { + "epoch": 0.11, + "grad_norm": 2.023337729951108, + "learning_rate": 1.964109663854975e-05, + "loss": 1.0767, + "step": 758 + }, + { + "epoch": 0.11, + "grad_norm": 1.4346591890770255, + "learning_rate": 1.9639812271760784e-05, + "loss": 0.9116, + "step": 759 + }, + { + "epoch": 0.11, + "grad_norm": 1.518443313881122, + "learning_rate": 1.963852565311072e-05, + "loss": 0.9837, + "step": 760 + }, + { + "epoch": 0.11, + "grad_norm": 1.8199084318474694, + "learning_rate": 1.96372367829001e-05, + "loss": 0.9258, + "step": 761 + }, + { + "epoch": 0.11, + "grad_norm": 1.6245527643511086, + "learning_rate": 1.9635945661430006e-05, + "loss": 0.9377, + "step": 762 + }, + { + "epoch": 0.11, + "grad_norm": 1.5378969603202852, + "learning_rate": 1.9634652289002047e-05, + "loss": 0.9842, + "step": 763 + }, + { + "epoch": 0.11, + "grad_norm": 1.977930319428025, + "learning_rate": 1.9633356665918354e-05, + "loss": 0.9865, + "step": 764 + }, + { + "epoch": 0.11, + "grad_norm": 1.5555691411716785, + "learning_rate": 1.9632058792481582e-05, + "loss": 0.8887, + "step": 765 + }, + { + "epoch": 0.11, + "grad_norm": 1.9052615270432982, + "learning_rate": 1.963075866899492e-05, + "loss": 0.9831, + "step": 766 + }, + { + "epoch": 0.11, + "grad_norm": 1.515443301247266, + "learning_rate": 1.9629456295762067e-05, + "loss": 0.983, + "step": 767 + }, + { + "epoch": 0.11, + "grad_norm": 1.5899357968239571, + "learning_rate": 1.962815167308727e-05, + "loss": 0.9628, + "step": 768 + }, + { + "epoch": 0.11, + "grad_norm": 1.6180424261794053, + "learning_rate": 1.962684480127528e-05, + "loss": 0.9495, + "step": 769 + }, + { + "epoch": 0.11, + "grad_norm": 1.12702526476482, + "learning_rate": 1.9625535680631386e-05, + "loss": 0.3225, + "step": 770 + }, + { + "epoch": 0.12, + "grad_norm": 1.6043543503912514, + "learning_rate": 1.96242243114614e-05, + "loss": 0.9655, + "step": 771 + }, + { + "epoch": 0.12, + "grad_norm": 1.832430901685091, + "learning_rate": 1.9622910694071654e-05, + "loss": 0.9929, + "step": 772 + }, + { + "epoch": 0.12, + "grad_norm": 1.59752167598961, + "learning_rate": 1.962159482876901e-05, + "loss": 0.9495, + "step": 773 + }, + { + "epoch": 0.12, + "grad_norm": 1.7301326515733286, + "learning_rate": 1.962027671586086e-05, + "loss": 0.9566, + "step": 774 + }, + { + "epoch": 0.12, + "grad_norm": 1.4314960664362162, + "learning_rate": 1.9618956355655107e-05, + "loss": 0.9669, + "step": 775 + }, + { + "epoch": 0.12, + "grad_norm": 1.6231886139640732, + "learning_rate": 1.9617633748460193e-05, + "loss": 0.9891, + "step": 776 + }, + { + "epoch": 0.12, + "grad_norm": 1.9323902546254417, + "learning_rate": 1.9616308894585078e-05, + "loss": 0.9722, + "step": 777 + }, + { + "epoch": 0.12, + "grad_norm": 0.878864606294589, + "learning_rate": 1.9614981794339244e-05, + "loss": 0.3285, + "step": 778 + }, + { + "epoch": 0.12, + "grad_norm": 1.7802191332111326, + "learning_rate": 1.961365244803271e-05, + "loss": 1.0153, + "step": 779 + }, + { + "epoch": 0.12, + "grad_norm": 1.7960769954528806, + "learning_rate": 1.9612320855976002e-05, + "loss": 0.9685, + "step": 780 + }, + { + "epoch": 0.12, + "grad_norm": 1.6217763647045618, + "learning_rate": 1.9610987018480186e-05, + "loss": 1.0031, + "step": 781 + }, + { + "epoch": 0.12, + "grad_norm": 1.533360438567821, + "learning_rate": 1.9609650935856847e-05, + "loss": 0.8413, + "step": 782 + }, + { + "epoch": 0.12, + "grad_norm": 1.8564985910763983, + "learning_rate": 1.9608312608418087e-05, + "loss": 0.9401, + "step": 783 + }, + { + "epoch": 0.12, + "grad_norm": 1.7005211804337508, + "learning_rate": 1.960697203647655e-05, + "loss": 1.0817, + "step": 784 + }, + { + "epoch": 0.12, + "grad_norm": 1.7265343383287515, + "learning_rate": 1.9605629220345382e-05, + "loss": 1.0034, + "step": 785 + }, + { + "epoch": 0.12, + "grad_norm": 1.6573616078460243, + "learning_rate": 1.9604284160338276e-05, + "loss": 0.9444, + "step": 786 + }, + { + "epoch": 0.12, + "grad_norm": 1.5693599542419174, + "learning_rate": 1.9602936856769432e-05, + "loss": 0.9385, + "step": 787 + }, + { + "epoch": 0.12, + "grad_norm": 1.5807249079892538, + "learning_rate": 1.9601587309953584e-05, + "loss": 0.9423, + "step": 788 + }, + { + "epoch": 0.12, + "grad_norm": 1.595139212401982, + "learning_rate": 1.960023552020598e-05, + "loss": 0.9456, + "step": 789 + }, + { + "epoch": 0.12, + "grad_norm": 1.6573991073134278, + "learning_rate": 1.9598881487842406e-05, + "loss": 0.9606, + "step": 790 + }, + { + "epoch": 0.12, + "grad_norm": 1.5280299828053654, + "learning_rate": 1.9597525213179157e-05, + "loss": 1.0081, + "step": 791 + }, + { + "epoch": 0.12, + "grad_norm": 1.4739930349034827, + "learning_rate": 1.9596166696533062e-05, + "loss": 0.965, + "step": 792 + }, + { + "epoch": 0.12, + "grad_norm": 1.605344258189759, + "learning_rate": 1.9594805938221473e-05, + "loss": 0.9415, + "step": 793 + }, + { + "epoch": 0.12, + "grad_norm": 1.890047419613228, + "learning_rate": 1.959344293856226e-05, + "loss": 0.984, + "step": 794 + }, + { + "epoch": 0.12, + "grad_norm": 1.9133741187045141, + "learning_rate": 1.959207769787382e-05, + "loss": 0.9391, + "step": 795 + }, + { + "epoch": 0.12, + "grad_norm": 1.6111552983857254, + "learning_rate": 1.959071021647507e-05, + "loss": 0.9868, + "step": 796 + }, + { + "epoch": 0.12, + "grad_norm": 1.5936306249565788, + "learning_rate": 1.9589340494685464e-05, + "loss": 0.9812, + "step": 797 + }, + { + "epoch": 0.12, + "grad_norm": 1.5988902267485232, + "learning_rate": 1.9587968532824963e-05, + "loss": 1.0153, + "step": 798 + }, + { + "epoch": 0.12, + "grad_norm": 1.468728497784049, + "learning_rate": 1.958659433121406e-05, + "loss": 0.915, + "step": 799 + }, + { + "epoch": 0.12, + "grad_norm": 1.659695382942566, + "learning_rate": 1.958521789017376e-05, + "loss": 0.9228, + "step": 800 + }, + { + "epoch": 0.12, + "grad_norm": 1.5599141479387162, + "learning_rate": 1.958383921002561e-05, + "loss": 0.9287, + "step": 801 + }, + { + "epoch": 0.12, + "grad_norm": 1.7991215852729405, + "learning_rate": 1.9582458291091664e-05, + "loss": 1.0624, + "step": 802 + }, + { + "epoch": 0.12, + "grad_norm": 1.6836577013884875, + "learning_rate": 1.9581075133694508e-05, + "loss": 0.9117, + "step": 803 + }, + { + "epoch": 0.12, + "grad_norm": 2.1281370925839833, + "learning_rate": 1.9579689738157245e-05, + "loss": 0.9825, + "step": 804 + }, + { + "epoch": 0.12, + "grad_norm": 1.4650976673035854, + "learning_rate": 1.9578302104803506e-05, + "loss": 0.9993, + "step": 805 + }, + { + "epoch": 0.12, + "grad_norm": 0.8172287385584375, + "learning_rate": 1.957691223395744e-05, + "loss": 0.3352, + "step": 806 + }, + { + "epoch": 0.12, + "grad_norm": 1.6686968375695956, + "learning_rate": 1.957552012594372e-05, + "loss": 0.9538, + "step": 807 + }, + { + "epoch": 0.12, + "grad_norm": 1.597347173663805, + "learning_rate": 1.957412578108755e-05, + "loss": 0.9667, + "step": 808 + }, + { + "epoch": 0.12, + "grad_norm": 1.4868611847346722, + "learning_rate": 1.957272919971464e-05, + "loss": 1.0481, + "step": 809 + }, + { + "epoch": 0.12, + "grad_norm": 0.8735716920337183, + "learning_rate": 1.9571330382151236e-05, + "loss": 0.3028, + "step": 810 + }, + { + "epoch": 0.12, + "grad_norm": 1.4596733830732393, + "learning_rate": 1.95699293287241e-05, + "loss": 0.9571, + "step": 811 + }, + { + "epoch": 0.12, + "grad_norm": 1.6699330837029742, + "learning_rate": 1.956852603976052e-05, + "loss": 0.958, + "step": 812 + }, + { + "epoch": 0.12, + "grad_norm": 1.7239402323941375, + "learning_rate": 1.9567120515588307e-05, + "loss": 0.9794, + "step": 813 + }, + { + "epoch": 0.12, + "grad_norm": 1.6432445752609868, + "learning_rate": 1.9565712756535785e-05, + "loss": 0.987, + "step": 814 + }, + { + "epoch": 0.12, + "grad_norm": 1.6719034696097996, + "learning_rate": 1.9564302762931812e-05, + "loss": 0.8879, + "step": 815 + }, + { + "epoch": 0.12, + "grad_norm": 0.9281853724039748, + "learning_rate": 1.956289053510576e-05, + "loss": 0.35, + "step": 816 + }, + { + "epoch": 0.12, + "grad_norm": 1.7122314178552518, + "learning_rate": 1.9561476073387527e-05, + "loss": 0.9109, + "step": 817 + }, + { + "epoch": 0.12, + "grad_norm": 1.605043130921607, + "learning_rate": 1.956005937810753e-05, + "loss": 0.8973, + "step": 818 + }, + { + "epoch": 0.12, + "grad_norm": 1.767181525865368, + "learning_rate": 1.955864044959671e-05, + "loss": 0.9653, + "step": 819 + }, + { + "epoch": 0.12, + "grad_norm": 2.071947557845971, + "learning_rate": 1.955721928818653e-05, + "loss": 0.9143, + "step": 820 + }, + { + "epoch": 0.12, + "grad_norm": 2.0343881098299943, + "learning_rate": 1.955579589420897e-05, + "loss": 0.9355, + "step": 821 + }, + { + "epoch": 0.12, + "grad_norm": 1.7729730228894678, + "learning_rate": 1.9554370267996537e-05, + "loss": 0.9691, + "step": 822 + }, + { + "epoch": 0.12, + "grad_norm": 0.8539727953985209, + "learning_rate": 1.9552942409882257e-05, + "loss": 0.3385, + "step": 823 + }, + { + "epoch": 0.12, + "grad_norm": 1.6703362614428694, + "learning_rate": 1.9551512320199684e-05, + "loss": 0.9498, + "step": 824 + }, + { + "epoch": 0.12, + "grad_norm": 1.874488032500137, + "learning_rate": 1.9550079999282874e-05, + "loss": 0.9428, + "step": 825 + }, + { + "epoch": 0.12, + "grad_norm": 1.5722811011032523, + "learning_rate": 1.9548645447466433e-05, + "loss": 0.9365, + "step": 826 + }, + { + "epoch": 0.12, + "grad_norm": 1.611663630842481, + "learning_rate": 1.954720866508546e-05, + "loss": 0.9423, + "step": 827 + }, + { + "epoch": 0.12, + "grad_norm": 1.7398318468448093, + "learning_rate": 1.954576965247559e-05, + "loss": 0.9502, + "step": 828 + }, + { + "epoch": 0.12, + "grad_norm": 1.476052281238449, + "learning_rate": 1.9544328409972978e-05, + "loss": 0.9546, + "step": 829 + }, + { + "epoch": 0.12, + "grad_norm": 0.8316432385069856, + "learning_rate": 1.9542884937914302e-05, + "loss": 0.3392, + "step": 830 + }, + { + "epoch": 0.12, + "grad_norm": 1.6086582432602319, + "learning_rate": 1.9541439236636752e-05, + "loss": 0.9685, + "step": 831 + }, + { + "epoch": 0.12, + "grad_norm": 1.627499851444343, + "learning_rate": 1.9539991306478046e-05, + "loss": 0.9678, + "step": 832 + }, + { + "epoch": 0.12, + "grad_norm": 1.4601392681712042, + "learning_rate": 1.953854114777642e-05, + "loss": 0.8833, + "step": 833 + }, + { + "epoch": 0.12, + "grad_norm": 1.6841304801946464, + "learning_rate": 1.9537088760870632e-05, + "loss": 0.9526, + "step": 834 + }, + { + "epoch": 0.12, + "grad_norm": 1.6870119704868776, + "learning_rate": 1.953563414609996e-05, + "loss": 0.9043, + "step": 835 + }, + { + "epoch": 0.12, + "grad_norm": 1.6331167376822397, + "learning_rate": 1.95341773038042e-05, + "loss": 1.1093, + "step": 836 + }, + { + "epoch": 0.12, + "grad_norm": 1.3930486162495799, + "learning_rate": 1.953271823432367e-05, + "loss": 0.9266, + "step": 837 + }, + { + "epoch": 0.13, + "grad_norm": 1.923470220822032, + "learning_rate": 1.9531256937999217e-05, + "loss": 0.94, + "step": 838 + }, + { + "epoch": 0.13, + "grad_norm": 1.6023026640119524, + "learning_rate": 1.952979341517219e-05, + "loss": 1.0197, + "step": 839 + }, + { + "epoch": 0.13, + "grad_norm": 1.8427734011117904, + "learning_rate": 1.9528327666184472e-05, + "loss": 0.9667, + "step": 840 + }, + { + "epoch": 0.13, + "grad_norm": 1.7607792127358646, + "learning_rate": 1.9526859691378465e-05, + "loss": 0.87, + "step": 841 + }, + { + "epoch": 0.13, + "grad_norm": 1.738292970510681, + "learning_rate": 1.952538949109708e-05, + "loss": 0.9786, + "step": 842 + }, + { + "epoch": 0.13, + "grad_norm": 1.7149022077185407, + "learning_rate": 1.9523917065683764e-05, + "loss": 0.9481, + "step": 843 + }, + { + "epoch": 0.13, + "grad_norm": 1.6248001906539882, + "learning_rate": 1.952244241548247e-05, + "loss": 1.0003, + "step": 844 + }, + { + "epoch": 0.13, + "grad_norm": 1.5887165132952656, + "learning_rate": 1.952096554083768e-05, + "loss": 0.9199, + "step": 845 + }, + { + "epoch": 0.13, + "grad_norm": 1.4189858020589843, + "learning_rate": 1.9519486442094397e-05, + "loss": 0.9986, + "step": 846 + }, + { + "epoch": 0.13, + "grad_norm": 1.6247350070913582, + "learning_rate": 1.9518005119598124e-05, + "loss": 0.989, + "step": 847 + }, + { + "epoch": 0.13, + "grad_norm": 1.5457802659279052, + "learning_rate": 1.951652157369491e-05, + "loss": 0.9394, + "step": 848 + }, + { + "epoch": 0.13, + "grad_norm": 1.9914760328946208, + "learning_rate": 1.9515035804731313e-05, + "loss": 0.8794, + "step": 849 + }, + { + "epoch": 0.13, + "grad_norm": 1.5605248412676282, + "learning_rate": 1.9513547813054397e-05, + "loss": 0.9008, + "step": 850 + }, + { + "epoch": 0.13, + "grad_norm": 1.6121720699064361, + "learning_rate": 1.951205759901177e-05, + "loss": 1.0623, + "step": 851 + }, + { + "epoch": 0.13, + "grad_norm": 1.6745778433427454, + "learning_rate": 1.9510565162951538e-05, + "loss": 0.8868, + "step": 852 + }, + { + "epoch": 0.13, + "grad_norm": 1.7653379671174583, + "learning_rate": 1.9509070505222336e-05, + "loss": 0.9075, + "step": 853 + }, + { + "epoch": 0.13, + "grad_norm": 0.7888150269683845, + "learning_rate": 1.9507573626173317e-05, + "loss": 0.3222, + "step": 854 + }, + { + "epoch": 0.13, + "grad_norm": 1.7344204965414827, + "learning_rate": 1.9506074526154155e-05, + "loss": 0.9156, + "step": 855 + }, + { + "epoch": 0.13, + "grad_norm": 1.8533577253818907, + "learning_rate": 1.950457320551503e-05, + "loss": 0.9556, + "step": 856 + }, + { + "epoch": 0.13, + "grad_norm": 1.6143951594129295, + "learning_rate": 1.9503069664606663e-05, + "loss": 0.9598, + "step": 857 + }, + { + "epoch": 0.13, + "grad_norm": 0.818599168474104, + "learning_rate": 1.950156390378027e-05, + "loss": 0.3028, + "step": 858 + }, + { + "epoch": 0.13, + "grad_norm": 1.7817867682255593, + "learning_rate": 1.9500055923387608e-05, + "loss": 0.9002, + "step": 859 + }, + { + "epoch": 0.13, + "grad_norm": 1.5324585947406468, + "learning_rate": 1.9498545723780932e-05, + "loss": 0.912, + "step": 860 + }, + { + "epoch": 0.13, + "grad_norm": 1.6974837115858863, + "learning_rate": 1.9497033305313033e-05, + "loss": 0.9645, + "step": 861 + }, + { + "epoch": 0.13, + "grad_norm": 1.5706612451183646, + "learning_rate": 1.9495518668337204e-05, + "loss": 0.7744, + "step": 862 + }, + { + "epoch": 0.13, + "grad_norm": 1.638377908930441, + "learning_rate": 1.949400181320727e-05, + "loss": 0.9316, + "step": 863 + }, + { + "epoch": 0.13, + "grad_norm": 1.627236425754178, + "learning_rate": 1.9492482740277564e-05, + "loss": 0.9285, + "step": 864 + }, + { + "epoch": 0.13, + "grad_norm": 1.5474317221945242, + "learning_rate": 1.9490961449902946e-05, + "loss": 1.0384, + "step": 865 + }, + { + "epoch": 0.13, + "grad_norm": 1.8115569633404542, + "learning_rate": 1.948943794243879e-05, + "loss": 0.9834, + "step": 866 + }, + { + "epoch": 0.13, + "grad_norm": 1.5825512490629434, + "learning_rate": 1.9487912218240983e-05, + "loss": 0.9289, + "step": 867 + }, + { + "epoch": 0.13, + "grad_norm": 1.7189175199020104, + "learning_rate": 1.9486384277665938e-05, + "loss": 0.9621, + "step": 868 + }, + { + "epoch": 0.13, + "grad_norm": 1.8574553595973367, + "learning_rate": 1.9484854121070578e-05, + "loss": 1.0111, + "step": 869 + }, + { + "epoch": 0.13, + "grad_norm": 1.6584234855089184, + "learning_rate": 1.9483321748812353e-05, + "loss": 0.9371, + "step": 870 + }, + { + "epoch": 0.13, + "grad_norm": 1.4512580612383406, + "learning_rate": 1.948178716124922e-05, + "loss": 0.9797, + "step": 871 + }, + { + "epoch": 0.13, + "grad_norm": 1.6227528687952613, + "learning_rate": 1.9480250358739667e-05, + "loss": 1.0155, + "step": 872 + }, + { + "epoch": 0.13, + "grad_norm": 1.3941354403582422, + "learning_rate": 1.947871134164268e-05, + "loss": 0.9815, + "step": 873 + }, + { + "epoch": 0.13, + "grad_norm": 1.505416291604214, + "learning_rate": 1.9477170110317783e-05, + "loss": 0.9639, + "step": 874 + }, + { + "epoch": 0.13, + "grad_norm": 1.8965056361215689, + "learning_rate": 1.9475626665125e-05, + "loss": 0.9266, + "step": 875 + }, + { + "epoch": 0.13, + "grad_norm": 1.718278169891963, + "learning_rate": 1.947408100642489e-05, + "loss": 1.0076, + "step": 876 + }, + { + "epoch": 0.13, + "grad_norm": 1.5414496706199177, + "learning_rate": 1.947253313457851e-05, + "loss": 0.9159, + "step": 877 + }, + { + "epoch": 0.13, + "grad_norm": 1.523081742831939, + "learning_rate": 1.9470983049947446e-05, + "loss": 0.8902, + "step": 878 + }, + { + "epoch": 0.13, + "grad_norm": 1.7716021093844976, + "learning_rate": 1.9469430752893796e-05, + "loss": 0.976, + "step": 879 + }, + { + "epoch": 0.13, + "grad_norm": 1.5944098305813688, + "learning_rate": 1.946787624378018e-05, + "loss": 0.8876, + "step": 880 + }, + { + "epoch": 0.13, + "grad_norm": 1.582416853005702, + "learning_rate": 1.946631952296973e-05, + "loss": 0.9828, + "step": 881 + }, + { + "epoch": 0.13, + "grad_norm": 1.7543406872405687, + "learning_rate": 1.94647605908261e-05, + "loss": 0.9286, + "step": 882 + }, + { + "epoch": 0.13, + "grad_norm": 1.7509437192016315, + "learning_rate": 1.946319944771345e-05, + "loss": 0.9437, + "step": 883 + }, + { + "epoch": 0.13, + "grad_norm": 1.5976541584439565, + "learning_rate": 1.9461636093996468e-05, + "loss": 1.0232, + "step": 884 + }, + { + "epoch": 0.13, + "grad_norm": 1.3474434836678684, + "learning_rate": 1.9460070530040348e-05, + "loss": 0.8281, + "step": 885 + }, + { + "epoch": 0.13, + "grad_norm": 1.4156152856463997, + "learning_rate": 1.9458502756210814e-05, + "loss": 0.9457, + "step": 886 + }, + { + "epoch": 0.13, + "grad_norm": 1.8484432165552453, + "learning_rate": 1.9456932772874092e-05, + "loss": 0.8671, + "step": 887 + }, + { + "epoch": 0.13, + "grad_norm": 2.2946776434619025, + "learning_rate": 1.9455360580396934e-05, + "loss": 0.947, + "step": 888 + }, + { + "epoch": 0.13, + "grad_norm": 1.7082894116566696, + "learning_rate": 1.94537861791466e-05, + "loss": 0.9695, + "step": 889 + }, + { + "epoch": 0.13, + "grad_norm": 1.4976028657233036, + "learning_rate": 1.9452209569490874e-05, + "loss": 0.981, + "step": 890 + }, + { + "epoch": 0.13, + "grad_norm": 1.8254217524765315, + "learning_rate": 1.945063075179805e-05, + "loss": 0.9312, + "step": 891 + }, + { + "epoch": 0.13, + "grad_norm": 1.476431507148291, + "learning_rate": 1.944904972643694e-05, + "loss": 0.979, + "step": 892 + }, + { + "epoch": 0.13, + "grad_norm": 1.8768589768232298, + "learning_rate": 1.9447466493776877e-05, + "loss": 0.9469, + "step": 893 + }, + { + "epoch": 0.13, + "grad_norm": 1.5379945213156887, + "learning_rate": 1.9445881054187694e-05, + "loss": 0.9659, + "step": 894 + }, + { + "epoch": 0.13, + "grad_norm": 1.5909786619757784, + "learning_rate": 1.944429340803976e-05, + "loss": 0.9913, + "step": 895 + }, + { + "epoch": 0.13, + "grad_norm": 1.5529905341837291, + "learning_rate": 1.9442703555703945e-05, + "loss": 0.9874, + "step": 896 + }, + { + "epoch": 0.13, + "grad_norm": 1.4110654742267894, + "learning_rate": 1.944111149755164e-05, + "loss": 0.9531, + "step": 897 + }, + { + "epoch": 0.13, + "grad_norm": 0.9139645541321378, + "learning_rate": 1.9439517233954744e-05, + "loss": 0.3246, + "step": 898 + }, + { + "epoch": 0.13, + "grad_norm": 1.7918989713367353, + "learning_rate": 1.9437920765285683e-05, + "loss": 1.0115, + "step": 899 + }, + { + "epoch": 0.13, + "grad_norm": 1.6180711626310682, + "learning_rate": 1.9436322091917392e-05, + "loss": 1.0617, + "step": 900 + }, + { + "epoch": 0.13, + "grad_norm": 1.7613615976563735, + "learning_rate": 1.943472121422332e-05, + "loss": 0.8652, + "step": 901 + }, + { + "epoch": 0.13, + "grad_norm": 1.838596921762269, + "learning_rate": 1.9433118132577432e-05, + "loss": 0.9465, + "step": 902 + }, + { + "epoch": 0.13, + "grad_norm": 1.7593066826882449, + "learning_rate": 1.943151284735421e-05, + "loss": 0.9267, + "step": 903 + }, + { + "epoch": 0.13, + "grad_norm": 1.7286081547944228, + "learning_rate": 1.9429905358928648e-05, + "loss": 0.9482, + "step": 904 + }, + { + "epoch": 0.14, + "grad_norm": 1.6013748989484846, + "learning_rate": 1.9428295667676253e-05, + "loss": 0.9392, + "step": 905 + }, + { + "epoch": 0.14, + "grad_norm": 1.477979721648062, + "learning_rate": 1.942668377397305e-05, + "loss": 0.9403, + "step": 906 + }, + { + "epoch": 0.14, + "grad_norm": 1.8116506629543034, + "learning_rate": 1.9425069678195577e-05, + "loss": 0.9181, + "step": 907 + }, + { + "epoch": 0.14, + "grad_norm": 1.463518801071589, + "learning_rate": 1.9423453380720892e-05, + "loss": 0.9259, + "step": 908 + }, + { + "epoch": 0.14, + "grad_norm": 1.7889208815673736, + "learning_rate": 1.9421834881926558e-05, + "loss": 0.9983, + "step": 909 + }, + { + "epoch": 0.14, + "grad_norm": 1.6250521486424425, + "learning_rate": 1.9420214182190657e-05, + "loss": 0.9634, + "step": 910 + }, + { + "epoch": 0.14, + "grad_norm": 1.5477731149331013, + "learning_rate": 1.941859128189178e-05, + "loss": 0.961, + "step": 911 + }, + { + "epoch": 0.14, + "grad_norm": 1.9274218565859795, + "learning_rate": 1.9416966181409047e-05, + "loss": 0.973, + "step": 912 + }, + { + "epoch": 0.14, + "grad_norm": 1.7070551266080638, + "learning_rate": 1.9415338881122074e-05, + "loss": 0.9625, + "step": 913 + }, + { + "epoch": 0.14, + "grad_norm": 1.570203737031849, + "learning_rate": 1.9413709381411003e-05, + "loss": 0.8757, + "step": 914 + }, + { + "epoch": 0.14, + "grad_norm": 1.5458424859877031, + "learning_rate": 1.9412077682656477e-05, + "loss": 0.9537, + "step": 915 + }, + { + "epoch": 0.14, + "grad_norm": 1.4804993993661473, + "learning_rate": 1.941044378523967e-05, + "loss": 0.8397, + "step": 916 + }, + { + "epoch": 0.14, + "grad_norm": 1.7101371762548216, + "learning_rate": 1.9408807689542257e-05, + "loss": 0.924, + "step": 917 + }, + { + "epoch": 0.14, + "grad_norm": 1.7586849018709336, + "learning_rate": 1.9407169395946427e-05, + "loss": 0.9729, + "step": 918 + }, + { + "epoch": 0.14, + "grad_norm": 1.6309936159814857, + "learning_rate": 1.9405528904834895e-05, + "loss": 0.9666, + "step": 919 + }, + { + "epoch": 0.14, + "grad_norm": 1.6603488103603472, + "learning_rate": 1.940388621659087e-05, + "loss": 0.9298, + "step": 920 + }, + { + "epoch": 0.14, + "grad_norm": 1.5612286879365345, + "learning_rate": 1.9402241331598092e-05, + "loss": 0.9558, + "step": 921 + }, + { + "epoch": 0.14, + "grad_norm": 1.7696040056441986, + "learning_rate": 1.94005942502408e-05, + "loss": 0.9099, + "step": 922 + }, + { + "epoch": 0.14, + "grad_norm": 0.8667631018292109, + "learning_rate": 1.939894497290375e-05, + "loss": 0.3391, + "step": 923 + }, + { + "epoch": 0.14, + "grad_norm": 1.8132504592887957, + "learning_rate": 1.9397293499972224e-05, + "loss": 0.9872, + "step": 924 + }, + { + "epoch": 0.14, + "grad_norm": 0.8360100975331577, + "learning_rate": 1.9395639831831997e-05, + "loss": 0.3111, + "step": 925 + }, + { + "epoch": 0.14, + "grad_norm": 1.6230927918752514, + "learning_rate": 1.939398396886937e-05, + "loss": 0.9175, + "step": 926 + }, + { + "epoch": 0.14, + "grad_norm": 1.8194081757435474, + "learning_rate": 1.9392325911471154e-05, + "loss": 0.9112, + "step": 927 + }, + { + "epoch": 0.14, + "grad_norm": 1.6459187817926038, + "learning_rate": 1.939066566002467e-05, + "loss": 0.9796, + "step": 928 + }, + { + "epoch": 0.14, + "grad_norm": 1.4737549725477763, + "learning_rate": 1.938900321491775e-05, + "loss": 0.9073, + "step": 929 + }, + { + "epoch": 0.14, + "grad_norm": 1.9749829234217935, + "learning_rate": 1.9387338576538743e-05, + "loss": 0.9278, + "step": 930 + }, + { + "epoch": 0.14, + "grad_norm": 1.7038633436916437, + "learning_rate": 1.938567174527651e-05, + "loss": 0.9326, + "step": 931 + }, + { + "epoch": 0.14, + "grad_norm": 1.6000795965773242, + "learning_rate": 1.9384002721520423e-05, + "loss": 0.9664, + "step": 932 + }, + { + "epoch": 0.14, + "grad_norm": 1.528057646802909, + "learning_rate": 1.9382331505660364e-05, + "loss": 0.9458, + "step": 933 + }, + { + "epoch": 0.14, + "grad_norm": 1.3866788253946987, + "learning_rate": 1.938065809808673e-05, + "loss": 0.903, + "step": 934 + }, + { + "epoch": 0.14, + "grad_norm": 1.0752714504340406, + "learning_rate": 1.9378982499190434e-05, + "loss": 0.3537, + "step": 935 + }, + { + "epoch": 0.14, + "grad_norm": 1.6387625806708923, + "learning_rate": 1.937730470936289e-05, + "loss": 0.9443, + "step": 936 + }, + { + "epoch": 0.14, + "grad_norm": 1.6095798903658123, + "learning_rate": 1.937562472899603e-05, + "loss": 0.9536, + "step": 937 + }, + { + "epoch": 0.14, + "grad_norm": 1.8058085489516018, + "learning_rate": 1.9373942558482303e-05, + "loss": 1.0234, + "step": 938 + }, + { + "epoch": 0.14, + "grad_norm": 1.4679773865385315, + "learning_rate": 1.9372258198214654e-05, + "loss": 0.8131, + "step": 939 + }, + { + "epoch": 0.14, + "grad_norm": 1.504810337076543, + "learning_rate": 1.937057164858656e-05, + "loss": 0.9705, + "step": 940 + }, + { + "epoch": 0.14, + "grad_norm": 1.5347373957621673, + "learning_rate": 1.9368882909991996e-05, + "loss": 0.9555, + "step": 941 + }, + { + "epoch": 0.14, + "grad_norm": 0.8589917753550486, + "learning_rate": 1.936719198282545e-05, + "loss": 0.2939, + "step": 942 + }, + { + "epoch": 0.14, + "grad_norm": 0.8675059590818638, + "learning_rate": 1.9365498867481926e-05, + "loss": 0.3401, + "step": 943 + }, + { + "epoch": 0.14, + "grad_norm": 1.6451040502696443, + "learning_rate": 1.9363803564356932e-05, + "loss": 0.9992, + "step": 944 + }, + { + "epoch": 0.14, + "grad_norm": 1.741057516475174, + "learning_rate": 1.936210607384649e-05, + "loss": 0.9762, + "step": 945 + }, + { + "epoch": 0.14, + "grad_norm": 1.5599441319175684, + "learning_rate": 1.936040639634714e-05, + "loss": 0.8611, + "step": 946 + }, + { + "epoch": 0.14, + "grad_norm": 0.9412636173986958, + "learning_rate": 1.935870453225592e-05, + "loss": 0.353, + "step": 947 + }, + { + "epoch": 0.14, + "grad_norm": 2.142613107261814, + "learning_rate": 1.935700048197039e-05, + "loss": 0.9236, + "step": 948 + }, + { + "epoch": 0.14, + "grad_norm": 1.5365310075242953, + "learning_rate": 1.9355294245888617e-05, + "loss": 1.0025, + "step": 949 + }, + { + "epoch": 0.14, + "grad_norm": 1.589850715087705, + "learning_rate": 1.9353585824409178e-05, + "loss": 1.0179, + "step": 950 + }, + { + "epoch": 0.14, + "grad_norm": 1.6281875161324748, + "learning_rate": 1.9351875217931154e-05, + "loss": 0.9697, + "step": 951 + }, + { + "epoch": 0.14, + "grad_norm": 1.58210676742971, + "learning_rate": 1.9350162426854152e-05, + "loss": 0.9674, + "step": 952 + }, + { + "epoch": 0.14, + "grad_norm": 1.684299553382832, + "learning_rate": 1.9348447451578273e-05, + "loss": 1.0601, + "step": 953 + }, + { + "epoch": 0.14, + "grad_norm": 1.8102414769658999, + "learning_rate": 1.9346730292504134e-05, + "loss": 1.0156, + "step": 954 + }, + { + "epoch": 0.14, + "grad_norm": 1.7662935932176484, + "learning_rate": 1.934501095003287e-05, + "loss": 0.8664, + "step": 955 + }, + { + "epoch": 0.14, + "grad_norm": 1.456918291463753, + "learning_rate": 1.9343289424566122e-05, + "loss": 0.9181, + "step": 956 + }, + { + "epoch": 0.14, + "grad_norm": 1.7153072415109856, + "learning_rate": 1.934156571650603e-05, + "loss": 0.8278, + "step": 957 + }, + { + "epoch": 0.14, + "grad_norm": 1.693980805834375, + "learning_rate": 1.9339839826255257e-05, + "loss": 0.9765, + "step": 958 + }, + { + "epoch": 0.14, + "grad_norm": 1.6779858641600216, + "learning_rate": 1.9338111754216968e-05, + "loss": 0.9532, + "step": 959 + }, + { + "epoch": 0.14, + "grad_norm": 1.584152107553659, + "learning_rate": 1.9336381500794845e-05, + "loss": 0.8465, + "step": 960 + }, + { + "epoch": 0.14, + "grad_norm": 1.5607130262991993, + "learning_rate": 1.9334649066393072e-05, + "loss": 0.9158, + "step": 961 + }, + { + "epoch": 0.14, + "grad_norm": 1.4843447600597182, + "learning_rate": 1.933291445141635e-05, + "loss": 0.8929, + "step": 962 + }, + { + "epoch": 0.14, + "grad_norm": 1.6243375668224997, + "learning_rate": 1.9331177656269878e-05, + "loss": 0.9765, + "step": 963 + }, + { + "epoch": 0.14, + "grad_norm": 1.5112402630613377, + "learning_rate": 1.932943868135938e-05, + "loss": 0.976, + "step": 964 + }, + { + "epoch": 0.14, + "grad_norm": 1.5868170324152127, + "learning_rate": 1.9327697527091076e-05, + "loss": 0.8916, + "step": 965 + }, + { + "epoch": 0.14, + "grad_norm": 1.600139313552218, + "learning_rate": 1.9325954193871698e-05, + "loss": 0.9441, + "step": 966 + }, + { + "epoch": 0.14, + "grad_norm": 1.423233759805251, + "learning_rate": 1.9324208682108493e-05, + "loss": 0.8994, + "step": 967 + }, + { + "epoch": 0.14, + "grad_norm": 0.8389678335153199, + "learning_rate": 1.932246099220921e-05, + "loss": 0.3334, + "step": 968 + }, + { + "epoch": 0.14, + "grad_norm": 1.6511189112258235, + "learning_rate": 1.932071112458211e-05, + "loss": 0.8791, + "step": 969 + }, + { + "epoch": 0.14, + "grad_norm": 1.5945456870031507, + "learning_rate": 1.9318959079635965e-05, + "loss": 0.9591, + "step": 970 + }, + { + "epoch": 0.14, + "grad_norm": 1.573552942160686, + "learning_rate": 1.931720485778005e-05, + "loss": 1.0872, + "step": 971 + }, + { + "epoch": 0.15, + "grad_norm": 1.4453527792911074, + "learning_rate": 1.931544845942415e-05, + "loss": 0.9623, + "step": 972 + }, + { + "epoch": 0.15, + "grad_norm": 0.8379703185945078, + "learning_rate": 1.9313689884978567e-05, + "loss": 0.3613, + "step": 973 + }, + { + "epoch": 0.15, + "grad_norm": 1.6831574449960423, + "learning_rate": 1.9311929134854093e-05, + "loss": 0.9332, + "step": 974 + }, + { + "epoch": 0.15, + "grad_norm": 1.6223101659621977, + "learning_rate": 1.931016620946205e-05, + "loss": 0.9178, + "step": 975 + }, + { + "epoch": 0.15, + "grad_norm": 0.855097133121107, + "learning_rate": 1.930840110921425e-05, + "loss": 0.2847, + "step": 976 + }, + { + "epoch": 0.15, + "grad_norm": 1.5152435523234884, + "learning_rate": 1.9306633834523022e-05, + "loss": 0.8288, + "step": 977 + }, + { + "epoch": 0.15, + "grad_norm": 0.8569361260096856, + "learning_rate": 1.9304864385801204e-05, + "loss": 0.3056, + "step": 978 + }, + { + "epoch": 0.15, + "grad_norm": 1.8565481660094034, + "learning_rate": 1.9303092763462142e-05, + "loss": 0.8711, + "step": 979 + }, + { + "epoch": 0.15, + "grad_norm": 1.9333937936525332, + "learning_rate": 1.9301318967919684e-05, + "loss": 0.9718, + "step": 980 + }, + { + "epoch": 0.15, + "grad_norm": 1.9502049743382441, + "learning_rate": 1.9299542999588184e-05, + "loss": 1.0259, + "step": 981 + }, + { + "epoch": 0.15, + "grad_norm": 1.7067858532691014, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.9195, + "step": 982 + }, + { + "epoch": 0.15, + "grad_norm": 1.705227669431673, + "learning_rate": 1.929598454621805e-05, + "loss": 0.858, + "step": 983 + }, + { + "epoch": 0.15, + "grad_norm": 1.5549062019144357, + "learning_rate": 1.9294202062010667e-05, + "loss": 0.9408, + "step": 984 + }, + { + "epoch": 0.15, + "grad_norm": 1.8191871479844153, + "learning_rate": 1.929241740667676e-05, + "loss": 0.9941, + "step": 985 + }, + { + "epoch": 0.15, + "grad_norm": 1.564111482985053, + "learning_rate": 1.9290630580633215e-05, + "loss": 0.9335, + "step": 986 + }, + { + "epoch": 0.15, + "grad_norm": 1.9801970291709687, + "learning_rate": 1.9288841584297445e-05, + "loss": 0.8279, + "step": 987 + }, + { + "epoch": 0.15, + "grad_norm": 1.567372245990633, + "learning_rate": 1.9287050418087355e-05, + "loss": 0.9691, + "step": 988 + }, + { + "epoch": 0.15, + "grad_norm": 1.5953372252161375, + "learning_rate": 1.9285257082421363e-05, + "loss": 0.9838, + "step": 989 + }, + { + "epoch": 0.15, + "grad_norm": 1.7250035247041102, + "learning_rate": 1.9283461577718387e-05, + "loss": 1.0088, + "step": 990 + }, + { + "epoch": 0.15, + "grad_norm": 1.6945413668318634, + "learning_rate": 1.9281663904397868e-05, + "loss": 1.0042, + "step": 991 + }, + { + "epoch": 0.15, + "grad_norm": 1.7227946602108561, + "learning_rate": 1.927986406287973e-05, + "loss": 0.96, + "step": 992 + }, + { + "epoch": 0.15, + "grad_norm": 1.7345490411262316, + "learning_rate": 1.9278062053584426e-05, + "loss": 0.879, + "step": 993 + }, + { + "epoch": 0.15, + "grad_norm": 1.6858241979163227, + "learning_rate": 1.92762578769329e-05, + "loss": 0.9862, + "step": 994 + }, + { + "epoch": 0.15, + "grad_norm": 1.4507724525460193, + "learning_rate": 1.9274451533346617e-05, + "loss": 0.8984, + "step": 995 + }, + { + "epoch": 0.15, + "grad_norm": 1.6060884328583993, + "learning_rate": 1.9272643023247527e-05, + "loss": 0.8906, + "step": 996 + }, + { + "epoch": 0.15, + "grad_norm": 1.5068881235338356, + "learning_rate": 1.92708323470581e-05, + "loss": 0.8976, + "step": 997 + }, + { + "epoch": 0.15, + "grad_norm": 1.458801622575446, + "learning_rate": 1.9269019505201316e-05, + "loss": 0.9613, + "step": 998 + }, + { + "epoch": 0.15, + "grad_norm": 1.7842201725250648, + "learning_rate": 1.926720449810065e-05, + "loss": 0.9744, + "step": 999 + }, + { + "epoch": 0.15, + "grad_norm": 1.7434290655106102, + "learning_rate": 1.9265387326180094e-05, + "loss": 0.9213, + "step": 1000 + }, + { + "epoch": 0.15, + "grad_norm": 1.5809507488983856, + "learning_rate": 1.9263567989864135e-05, + "loss": 0.94, + "step": 1001 + }, + { + "epoch": 0.15, + "grad_norm": 2.0103325391784943, + "learning_rate": 1.9261746489577767e-05, + "loss": 0.9846, + "step": 1002 + }, + { + "epoch": 0.15, + "grad_norm": 1.4733187161883579, + "learning_rate": 1.92599228257465e-05, + "loss": 0.9268, + "step": 1003 + }, + { + "epoch": 0.15, + "grad_norm": 1.73848092346793, + "learning_rate": 1.9258096998796335e-05, + "loss": 0.9287, + "step": 1004 + }, + { + "epoch": 0.15, + "grad_norm": 1.5786852190432847, + "learning_rate": 1.9256269009153793e-05, + "loss": 1.0006, + "step": 1005 + }, + { + "epoch": 0.15, + "grad_norm": 2.047706222288261, + "learning_rate": 1.9254438857245885e-05, + "loss": 1.0241, + "step": 1006 + }, + { + "epoch": 0.15, + "grad_norm": 1.564794242473782, + "learning_rate": 1.925260654350014e-05, + "loss": 0.8822, + "step": 1007 + }, + { + "epoch": 0.15, + "grad_norm": 1.7244582623023423, + "learning_rate": 1.925077206834458e-05, + "loss": 0.9899, + "step": 1008 + }, + { + "epoch": 0.15, + "grad_norm": 1.5106414287874854, + "learning_rate": 1.924893543220775e-05, + "loss": 0.9203, + "step": 1009 + }, + { + "epoch": 0.15, + "grad_norm": 1.6940817584575845, + "learning_rate": 1.924709663551868e-05, + "loss": 0.9331, + "step": 1010 + }, + { + "epoch": 0.15, + "grad_norm": 1.4736767061444838, + "learning_rate": 1.924525567870691e-05, + "loss": 0.9139, + "step": 1011 + }, + { + "epoch": 0.15, + "grad_norm": 1.5892533394013502, + "learning_rate": 1.92434125622025e-05, + "loss": 0.9577, + "step": 1012 + }, + { + "epoch": 0.15, + "grad_norm": 1.5069840037690432, + "learning_rate": 1.924156728643599e-05, + "loss": 0.9924, + "step": 1013 + }, + { + "epoch": 0.15, + "grad_norm": 1.827463580569672, + "learning_rate": 1.923971985183844e-05, + "loss": 1.0114, + "step": 1014 + }, + { + "epoch": 0.15, + "grad_norm": 1.8072351096196546, + "learning_rate": 1.9237870258841412e-05, + "loss": 0.9514, + "step": 1015 + }, + { + "epoch": 0.15, + "grad_norm": 1.6544125718032203, + "learning_rate": 1.9236018507876973e-05, + "loss": 0.8919, + "step": 1016 + }, + { + "epoch": 0.15, + "grad_norm": 1.6331111598313437, + "learning_rate": 1.9234164599377692e-05, + "loss": 0.874, + "step": 1017 + }, + { + "epoch": 0.15, + "grad_norm": 1.6305193472351633, + "learning_rate": 1.923230853377664e-05, + "loss": 0.8717, + "step": 1018 + }, + { + "epoch": 0.15, + "grad_norm": 1.8152351603497061, + "learning_rate": 1.9230450311507393e-05, + "loss": 1.0009, + "step": 1019 + }, + { + "epoch": 0.15, + "grad_norm": 1.6114396940966587, + "learning_rate": 1.9228589933004038e-05, + "loss": 0.975, + "step": 1020 + }, + { + "epoch": 0.15, + "grad_norm": 1.6012989408836813, + "learning_rate": 1.922672739870115e-05, + "loss": 0.9333, + "step": 1021 + }, + { + "epoch": 0.15, + "grad_norm": 0.7855007219011148, + "learning_rate": 1.9224862709033823e-05, + "loss": 0.2865, + "step": 1022 + }, + { + "epoch": 0.15, + "grad_norm": 1.483599431306749, + "learning_rate": 1.922299586443765e-05, + "loss": 0.8616, + "step": 1023 + }, + { + "epoch": 0.15, + "grad_norm": 1.6727535577627035, + "learning_rate": 1.9221126865348726e-05, + "loss": 0.9666, + "step": 1024 + }, + { + "epoch": 0.15, + "grad_norm": 1.6338024218663487, + "learning_rate": 1.9219255712203643e-05, + "loss": 0.9909, + "step": 1025 + }, + { + "epoch": 0.15, + "grad_norm": 1.5637048760714245, + "learning_rate": 1.921738240543951e-05, + "loss": 0.8844, + "step": 1026 + }, + { + "epoch": 0.15, + "grad_norm": 1.5686945741102496, + "learning_rate": 1.9215506945493933e-05, + "loss": 0.9415, + "step": 1027 + }, + { + "epoch": 0.15, + "grad_norm": 0.8983981362302563, + "learning_rate": 1.921362933280501e-05, + "loss": 0.3279, + "step": 1028 + }, + { + "epoch": 0.15, + "grad_norm": 1.6451691310090992, + "learning_rate": 1.9211749567811357e-05, + "loss": 1.0003, + "step": 1029 + }, + { + "epoch": 0.15, + "grad_norm": 1.4457971868499322, + "learning_rate": 1.9209867650952088e-05, + "loss": 0.8923, + "step": 1030 + }, + { + "epoch": 0.15, + "grad_norm": 1.5679541975371474, + "learning_rate": 1.920798358266682e-05, + "loss": 0.8575, + "step": 1031 + }, + { + "epoch": 0.15, + "grad_norm": 1.3624673288260447, + "learning_rate": 1.9206097363395668e-05, + "loss": 0.9169, + "step": 1032 + }, + { + "epoch": 0.15, + "grad_norm": 1.4873129881378602, + "learning_rate": 1.9204208993579256e-05, + "loss": 0.9479, + "step": 1033 + }, + { + "epoch": 0.15, + "grad_norm": 1.5513706432478374, + "learning_rate": 1.9202318473658707e-05, + "loss": 0.9757, + "step": 1034 + }, + { + "epoch": 0.15, + "grad_norm": 1.557317703777887, + "learning_rate": 1.9200425804075643e-05, + "loss": 0.8763, + "step": 1035 + }, + { + "epoch": 0.15, + "grad_norm": 1.5147214892815681, + "learning_rate": 1.91985309852722e-05, + "loss": 0.907, + "step": 1036 + }, + { + "epoch": 0.15, + "grad_norm": 1.8336115611527413, + "learning_rate": 1.9196634017690993e-05, + "loss": 0.8366, + "step": 1037 + }, + { + "epoch": 0.15, + "grad_norm": 1.5462916898783394, + "learning_rate": 1.919473490177517e-05, + "loss": 0.9545, + "step": 1038 + }, + { + "epoch": 0.16, + "grad_norm": 1.553231495625522, + "learning_rate": 1.9192833637968357e-05, + "loss": 0.9268, + "step": 1039 + }, + { + "epoch": 0.16, + "grad_norm": 1.7533040956484711, + "learning_rate": 1.919093022671469e-05, + "loss": 0.8703, + "step": 1040 + }, + { + "epoch": 0.16, + "grad_norm": 1.603529183148066, + "learning_rate": 1.9189024668458803e-05, + "loss": 0.9518, + "step": 1041 + }, + { + "epoch": 0.16, + "grad_norm": 1.6093992907348331, + "learning_rate": 1.9187116963645845e-05, + "loss": 0.8954, + "step": 1042 + }, + { + "epoch": 0.16, + "grad_norm": 1.5781791454648255, + "learning_rate": 1.9185207112721443e-05, + "loss": 0.9686, + "step": 1043 + }, + { + "epoch": 0.16, + "grad_norm": 1.4886508899381776, + "learning_rate": 1.9183295116131747e-05, + "loss": 0.9872, + "step": 1044 + }, + { + "epoch": 0.16, + "grad_norm": 1.7310737376825676, + "learning_rate": 1.91813809743234e-05, + "loss": 0.981, + "step": 1045 + }, + { + "epoch": 0.16, + "grad_norm": 1.6699307837443829, + "learning_rate": 1.917946468774354e-05, + "loss": 0.9904, + "step": 1046 + }, + { + "epoch": 0.16, + "grad_norm": 1.5569391143917253, + "learning_rate": 1.9177546256839814e-05, + "loss": 0.9039, + "step": 1047 + }, + { + "epoch": 0.16, + "grad_norm": 1.5562505197332657, + "learning_rate": 1.9175625682060367e-05, + "loss": 1.0236, + "step": 1048 + }, + { + "epoch": 0.16, + "grad_norm": 1.6843176124223667, + "learning_rate": 1.917370296385385e-05, + "loss": 0.9393, + "step": 1049 + }, + { + "epoch": 0.16, + "grad_norm": 1.8842551839750727, + "learning_rate": 1.9171778102669404e-05, + "loss": 0.9301, + "step": 1050 + }, + { + "epoch": 0.16, + "grad_norm": 1.5283218590969267, + "learning_rate": 1.916985109895668e-05, + "loss": 0.9344, + "step": 1051 + }, + { + "epoch": 0.16, + "grad_norm": 1.7811991972956647, + "learning_rate": 1.9167921953165827e-05, + "loss": 0.9073, + "step": 1052 + }, + { + "epoch": 0.16, + "grad_norm": 2.009274141812004, + "learning_rate": 1.916599066574749e-05, + "loss": 1.0302, + "step": 1053 + }, + { + "epoch": 0.16, + "grad_norm": 1.5560398545916263, + "learning_rate": 1.916405723715282e-05, + "loss": 0.8723, + "step": 1054 + }, + { + "epoch": 0.16, + "grad_norm": 1.6982062556988202, + "learning_rate": 1.9162121667833473e-05, + "loss": 0.827, + "step": 1055 + }, + { + "epoch": 0.16, + "grad_norm": 1.4390011295161496, + "learning_rate": 1.9160183958241584e-05, + "loss": 1.012, + "step": 1056 + }, + { + "epoch": 0.16, + "grad_norm": 1.5675652703775997, + "learning_rate": 1.9158244108829815e-05, + "loss": 0.9237, + "step": 1057 + }, + { + "epoch": 0.16, + "grad_norm": 1.7182232678190625, + "learning_rate": 1.9156302120051308e-05, + "loss": 0.8548, + "step": 1058 + }, + { + "epoch": 0.16, + "grad_norm": 1.540934584726451, + "learning_rate": 1.915435799235971e-05, + "loss": 0.9531, + "step": 1059 + }, + { + "epoch": 0.16, + "grad_norm": 1.5662737954471622, + "learning_rate": 1.9152411726209176e-05, + "loss": 0.9333, + "step": 1060 + }, + { + "epoch": 0.16, + "grad_norm": 1.5147867502384917, + "learning_rate": 1.9150463322054352e-05, + "loss": 0.8942, + "step": 1061 + }, + { + "epoch": 0.16, + "grad_norm": 1.455687265455594, + "learning_rate": 1.9148512780350384e-05, + "loss": 0.9758, + "step": 1062 + }, + { + "epoch": 0.16, + "grad_norm": 1.542549480743184, + "learning_rate": 1.914656010155292e-05, + "loss": 1.0144, + "step": 1063 + }, + { + "epoch": 0.16, + "grad_norm": 1.6262299732539058, + "learning_rate": 1.9144605286118104e-05, + "loss": 1.0009, + "step": 1064 + }, + { + "epoch": 0.16, + "grad_norm": 1.6034253997602659, + "learning_rate": 1.914264833450258e-05, + "loss": 1.0239, + "step": 1065 + }, + { + "epoch": 0.16, + "grad_norm": 1.6515694758828192, + "learning_rate": 1.9140689247163497e-05, + "loss": 0.996, + "step": 1066 + }, + { + "epoch": 0.16, + "grad_norm": 1.5371943126802028, + "learning_rate": 1.9138728024558494e-05, + "loss": 0.9552, + "step": 1067 + }, + { + "epoch": 0.16, + "grad_norm": 1.4234567087135699, + "learning_rate": 1.9136764667145715e-05, + "loss": 0.9426, + "step": 1068 + }, + { + "epoch": 0.16, + "grad_norm": 1.584905264976068, + "learning_rate": 1.91347991753838e-05, + "loss": 0.9271, + "step": 1069 + }, + { + "epoch": 0.16, + "grad_norm": 1.5283414309081558, + "learning_rate": 1.9132831549731886e-05, + "loss": 0.9339, + "step": 1070 + }, + { + "epoch": 0.16, + "grad_norm": 1.6230574262071782, + "learning_rate": 1.9130861790649613e-05, + "loss": 0.9209, + "step": 1071 + }, + { + "epoch": 0.16, + "grad_norm": 1.782268243589682, + "learning_rate": 1.9128889898597117e-05, + "loss": 0.8811, + "step": 1072 + }, + { + "epoch": 0.16, + "grad_norm": 1.642954884070177, + "learning_rate": 1.912691587403503e-05, + "loss": 0.9269, + "step": 1073 + }, + { + "epoch": 0.16, + "grad_norm": 1.611167321706687, + "learning_rate": 1.9124939717424486e-05, + "loss": 0.9532, + "step": 1074 + }, + { + "epoch": 0.16, + "grad_norm": 2.039632935772377, + "learning_rate": 1.9122961429227115e-05, + "loss": 0.9914, + "step": 1075 + }, + { + "epoch": 0.16, + "grad_norm": 1.507891765917394, + "learning_rate": 1.9120981009905044e-05, + "loss": 0.9597, + "step": 1076 + }, + { + "epoch": 0.16, + "grad_norm": 1.7353146859857143, + "learning_rate": 1.91189984599209e-05, + "loss": 0.9525, + "step": 1077 + }, + { + "epoch": 0.16, + "grad_norm": 1.743926213446444, + "learning_rate": 1.911701377973781e-05, + "loss": 0.9592, + "step": 1078 + }, + { + "epoch": 0.16, + "grad_norm": 1.4958115191969878, + "learning_rate": 1.9115026969819396e-05, + "loss": 0.9158, + "step": 1079 + }, + { + "epoch": 0.16, + "grad_norm": 1.5499496067703662, + "learning_rate": 1.9113038030629767e-05, + "loss": 0.9213, + "step": 1080 + }, + { + "epoch": 0.16, + "grad_norm": 1.632258271537784, + "learning_rate": 1.9111046962633547e-05, + "loss": 0.9798, + "step": 1081 + }, + { + "epoch": 0.16, + "grad_norm": 1.6584206293562322, + "learning_rate": 1.910905376629585e-05, + "loss": 0.912, + "step": 1082 + }, + { + "epoch": 0.16, + "grad_norm": 1.8075650041200193, + "learning_rate": 1.9107058442082288e-05, + "loss": 0.9683, + "step": 1083 + }, + { + "epoch": 0.16, + "grad_norm": 1.5794052075073737, + "learning_rate": 1.9105060990458964e-05, + "loss": 0.9788, + "step": 1084 + }, + { + "epoch": 0.16, + "grad_norm": 1.4310453616359426, + "learning_rate": 1.9103061411892488e-05, + "loss": 0.932, + "step": 1085 + }, + { + "epoch": 0.16, + "grad_norm": 1.4454411410741406, + "learning_rate": 1.9101059706849957e-05, + "loss": 0.9622, + "step": 1086 + }, + { + "epoch": 0.16, + "grad_norm": 1.6760446379265568, + "learning_rate": 1.9099055875798974e-05, + "loss": 0.9857, + "step": 1087 + }, + { + "epoch": 0.16, + "grad_norm": 1.6486076508974432, + "learning_rate": 1.909704991920763e-05, + "loss": 0.9774, + "step": 1088 + }, + { + "epoch": 0.16, + "grad_norm": 1.612687760287242, + "learning_rate": 1.909504183754452e-05, + "loss": 0.9426, + "step": 1089 + }, + { + "epoch": 0.16, + "grad_norm": 1.751900434499064, + "learning_rate": 1.909303163127873e-05, + "loss": 0.9245, + "step": 1090 + }, + { + "epoch": 0.16, + "grad_norm": 1.4777626648740374, + "learning_rate": 1.9091019300879848e-05, + "loss": 0.9625, + "step": 1091 + }, + { + "epoch": 0.16, + "grad_norm": 1.553647698794413, + "learning_rate": 1.9089004846817947e-05, + "loss": 0.9355, + "step": 1092 + }, + { + "epoch": 0.16, + "grad_norm": 1.3314385018182198, + "learning_rate": 1.908698826956361e-05, + "loss": 0.9244, + "step": 1093 + }, + { + "epoch": 0.16, + "grad_norm": 1.546789986990871, + "learning_rate": 1.9084969569587908e-05, + "loss": 0.9455, + "step": 1094 + }, + { + "epoch": 0.16, + "grad_norm": 1.6272980769666485, + "learning_rate": 1.9082948747362412e-05, + "loss": 0.8533, + "step": 1095 + }, + { + "epoch": 0.16, + "grad_norm": 1.646582490796409, + "learning_rate": 1.908092580335918e-05, + "loss": 1.0128, + "step": 1096 + }, + { + "epoch": 0.16, + "grad_norm": 1.6106610836389794, + "learning_rate": 1.9078900738050776e-05, + "loss": 0.9394, + "step": 1097 + }, + { + "epoch": 0.16, + "grad_norm": 1.850712706921773, + "learning_rate": 1.9076873551910256e-05, + "loss": 0.9306, + "step": 1098 + }, + { + "epoch": 0.16, + "grad_norm": 1.629630802435573, + "learning_rate": 1.907484424541117e-05, + "loss": 0.8473, + "step": 1099 + }, + { + "epoch": 0.16, + "grad_norm": 1.5917835047809785, + "learning_rate": 1.9072812819027566e-05, + "loss": 0.8673, + "step": 1100 + }, + { + "epoch": 0.16, + "grad_norm": 1.6063218553405059, + "learning_rate": 1.907077927323398e-05, + "loss": 0.9599, + "step": 1101 + }, + { + "epoch": 0.16, + "grad_norm": 2.0558726669264535, + "learning_rate": 1.9068743608505454e-05, + "loss": 1.0059, + "step": 1102 + }, + { + "epoch": 0.16, + "grad_norm": 1.8377801434250658, + "learning_rate": 1.9066705825317518e-05, + "loss": 0.9009, + "step": 1103 + }, + { + "epoch": 0.16, + "grad_norm": 1.6201705153542958, + "learning_rate": 1.90646659241462e-05, + "loss": 0.8804, + "step": 1104 + }, + { + "epoch": 0.16, + "grad_norm": 1.5761230040469598, + "learning_rate": 1.9062623905468015e-05, + "loss": 0.8529, + "step": 1105 + }, + { + "epoch": 0.17, + "grad_norm": 1.5993850233949765, + "learning_rate": 1.9060579769759986e-05, + "loss": 0.9292, + "step": 1106 + }, + { + "epoch": 0.17, + "grad_norm": 1.5057654530863425, + "learning_rate": 1.905853351749962e-05, + "loss": 0.862, + "step": 1107 + }, + { + "epoch": 0.17, + "grad_norm": 1.7542682308151063, + "learning_rate": 1.905648514916492e-05, + "loss": 0.8259, + "step": 1108 + }, + { + "epoch": 0.17, + "grad_norm": 0.8702799615906952, + "learning_rate": 1.905443466523439e-05, + "loss": 0.3283, + "step": 1109 + }, + { + "epoch": 0.17, + "grad_norm": 1.4645305788331042, + "learning_rate": 1.9052382066187017e-05, + "loss": 0.9252, + "step": 1110 + }, + { + "epoch": 0.17, + "grad_norm": 1.5128347426173903, + "learning_rate": 1.9050327352502292e-05, + "loss": 0.9566, + "step": 1111 + }, + { + "epoch": 0.17, + "grad_norm": 1.5390629762319612, + "learning_rate": 1.9048270524660197e-05, + "loss": 0.9492, + "step": 1112 + }, + { + "epoch": 0.17, + "grad_norm": 1.4759078605606883, + "learning_rate": 1.9046211583141206e-05, + "loss": 0.9014, + "step": 1113 + }, + { + "epoch": 0.17, + "grad_norm": 1.647019446415463, + "learning_rate": 1.9044150528426288e-05, + "loss": 0.8662, + "step": 1114 + }, + { + "epoch": 0.17, + "grad_norm": 0.8630314364105205, + "learning_rate": 1.9042087360996904e-05, + "loss": 0.3188, + "step": 1115 + }, + { + "epoch": 0.17, + "grad_norm": 1.5735208914477208, + "learning_rate": 1.904002208133501e-05, + "loss": 0.9512, + "step": 1116 + }, + { + "epoch": 0.17, + "grad_norm": 1.5933487202315462, + "learning_rate": 1.903795468992306e-05, + "loss": 1.0019, + "step": 1117 + }, + { + "epoch": 0.17, + "grad_norm": 1.6957806433073173, + "learning_rate": 1.9035885187243996e-05, + "loss": 0.9519, + "step": 1118 + }, + { + "epoch": 0.17, + "grad_norm": 1.751716539320122, + "learning_rate": 1.903381357378125e-05, + "loss": 0.9308, + "step": 1119 + }, + { + "epoch": 0.17, + "grad_norm": 1.5846124102495645, + "learning_rate": 1.9031739850018755e-05, + "loss": 0.8757, + "step": 1120 + }, + { + "epoch": 0.17, + "grad_norm": 1.6299300138382484, + "learning_rate": 1.902966401644093e-05, + "loss": 0.9869, + "step": 1121 + }, + { + "epoch": 0.17, + "grad_norm": 1.75999031791679, + "learning_rate": 1.902758607353269e-05, + "loss": 0.8777, + "step": 1122 + }, + { + "epoch": 0.17, + "grad_norm": 1.5477360968137088, + "learning_rate": 1.9025506021779446e-05, + "loss": 0.9461, + "step": 1123 + }, + { + "epoch": 0.17, + "grad_norm": 1.5465128298842241, + "learning_rate": 1.9023423861667093e-05, + "loss": 0.9638, + "step": 1124 + }, + { + "epoch": 0.17, + "grad_norm": 1.458325709737424, + "learning_rate": 1.902133959368203e-05, + "loss": 0.8923, + "step": 1125 + }, + { + "epoch": 0.17, + "grad_norm": 1.6242994588879356, + "learning_rate": 1.901925321831114e-05, + "loss": 0.9359, + "step": 1126 + }, + { + "epoch": 0.17, + "grad_norm": 1.5322664977753728, + "learning_rate": 1.9017164736041795e-05, + "loss": 0.9671, + "step": 1127 + }, + { + "epoch": 0.17, + "grad_norm": 1.4499265652951752, + "learning_rate": 1.9015074147361875e-05, + "loss": 0.9617, + "step": 1128 + }, + { + "epoch": 0.17, + "grad_norm": 1.629563184268512, + "learning_rate": 1.901298145275973e-05, + "loss": 0.869, + "step": 1129 + }, + { + "epoch": 0.17, + "grad_norm": 1.6874952305178883, + "learning_rate": 1.9010886652724226e-05, + "loss": 0.9653, + "step": 1130 + }, + { + "epoch": 0.17, + "grad_norm": 1.6908336566611315, + "learning_rate": 1.9008789747744697e-05, + "loss": 0.9849, + "step": 1131 + }, + { + "epoch": 0.17, + "grad_norm": 1.4852033800777031, + "learning_rate": 1.9006690738310988e-05, + "loss": 0.9168, + "step": 1132 + }, + { + "epoch": 0.17, + "grad_norm": 1.319541720274812, + "learning_rate": 1.9004589624913428e-05, + "loss": 0.9015, + "step": 1133 + }, + { + "epoch": 0.17, + "grad_norm": 1.6475923414574336, + "learning_rate": 1.900248640804283e-05, + "loss": 0.8672, + "step": 1134 + }, + { + "epoch": 0.17, + "grad_norm": 1.5780109993436227, + "learning_rate": 1.9000381088190512e-05, + "loss": 0.9191, + "step": 1135 + }, + { + "epoch": 0.17, + "grad_norm": 0.9846952079335375, + "learning_rate": 1.8998273665848273e-05, + "loss": 0.3237, + "step": 1136 + }, + { + "epoch": 0.17, + "grad_norm": 1.3882339815916798, + "learning_rate": 1.8996164141508412e-05, + "loss": 0.9764, + "step": 1137 + }, + { + "epoch": 0.17, + "grad_norm": 1.534498214435999, + "learning_rate": 1.899405251566371e-05, + "loss": 0.9173, + "step": 1138 + }, + { + "epoch": 0.17, + "grad_norm": 1.4067702006750886, + "learning_rate": 1.8991938788807446e-05, + "loss": 0.8883, + "step": 1139 + }, + { + "epoch": 0.17, + "grad_norm": 1.4152944938468568, + "learning_rate": 1.8989822961433386e-05, + "loss": 0.9221, + "step": 1140 + }, + { + "epoch": 0.17, + "grad_norm": 1.604223557388208, + "learning_rate": 1.8987705034035784e-05, + "loss": 0.9298, + "step": 1141 + }, + { + "epoch": 0.17, + "grad_norm": 0.8280467427137816, + "learning_rate": 1.898558500710939e-05, + "loss": 0.308, + "step": 1142 + }, + { + "epoch": 0.17, + "grad_norm": 1.424759975225943, + "learning_rate": 1.8983462881149447e-05, + "loss": 0.9473, + "step": 1143 + }, + { + "epoch": 0.17, + "grad_norm": 1.8005634391281962, + "learning_rate": 1.898133865665168e-05, + "loss": 0.978, + "step": 1144 + }, + { + "epoch": 0.17, + "grad_norm": 1.8356695202795086, + "learning_rate": 1.89792123341123e-05, + "loss": 0.9625, + "step": 1145 + }, + { + "epoch": 0.17, + "grad_norm": 1.4477431954667923, + "learning_rate": 1.8977083914028034e-05, + "loss": 0.9643, + "step": 1146 + }, + { + "epoch": 0.17, + "grad_norm": 1.4744031225049494, + "learning_rate": 1.8974953396896066e-05, + "loss": 0.8575, + "step": 1147 + }, + { + "epoch": 0.17, + "grad_norm": 1.9251500327455193, + "learning_rate": 1.897282078321409e-05, + "loss": 0.9276, + "step": 1148 + }, + { + "epoch": 0.17, + "grad_norm": 2.0091543750979954, + "learning_rate": 1.8970686073480287e-05, + "loss": 0.9273, + "step": 1149 + }, + { + "epoch": 0.17, + "grad_norm": 1.711553824038662, + "learning_rate": 1.8968549268193325e-05, + "loss": 0.9475, + "step": 1150 + }, + { + "epoch": 0.17, + "grad_norm": 1.593556430926505, + "learning_rate": 1.896641036785236e-05, + "loss": 0.9742, + "step": 1151 + }, + { + "epoch": 0.17, + "grad_norm": 1.6311827847181575, + "learning_rate": 1.896426937295704e-05, + "loss": 0.8693, + "step": 1152 + }, + { + "epoch": 0.17, + "grad_norm": 1.5218338485437497, + "learning_rate": 1.89621262840075e-05, + "loss": 0.979, + "step": 1153 + }, + { + "epoch": 0.17, + "grad_norm": 0.7871557978104736, + "learning_rate": 1.895998110150437e-05, + "loss": 0.3205, + "step": 1154 + }, + { + "epoch": 0.17, + "grad_norm": 1.5994081568588547, + "learning_rate": 1.8957833825948756e-05, + "loss": 0.9642, + "step": 1155 + }, + { + "epoch": 0.17, + "grad_norm": 1.5861992995747178, + "learning_rate": 1.8955684457842275e-05, + "loss": 0.9777, + "step": 1156 + }, + { + "epoch": 0.17, + "grad_norm": 1.3731965923484906, + "learning_rate": 1.8953532997687008e-05, + "loss": 0.9178, + "step": 1157 + }, + { + "epoch": 0.17, + "grad_norm": 1.573120856031462, + "learning_rate": 1.895137944598554e-05, + "loss": 0.8593, + "step": 1158 + }, + { + "epoch": 0.17, + "grad_norm": 1.7175529012027326, + "learning_rate": 1.8949223803240945e-05, + "loss": 0.9878, + "step": 1159 + }, + { + "epoch": 0.17, + "grad_norm": 1.4005092480162458, + "learning_rate": 1.8947066069956777e-05, + "loss": 0.9378, + "step": 1160 + }, + { + "epoch": 0.17, + "grad_norm": 1.5569368233785106, + "learning_rate": 1.8944906246637084e-05, + "loss": 0.9185, + "step": 1161 + }, + { + "epoch": 0.17, + "grad_norm": 1.6432224375309423, + "learning_rate": 1.89427443337864e-05, + "loss": 0.9815, + "step": 1162 + }, + { + "epoch": 0.17, + "grad_norm": 1.7008021551835373, + "learning_rate": 1.8940580331909747e-05, + "loss": 0.9366, + "step": 1163 + }, + { + "epoch": 0.17, + "grad_norm": 1.5144996188733308, + "learning_rate": 1.893841424151264e-05, + "loss": 0.8872, + "step": 1164 + }, + { + "epoch": 0.17, + "grad_norm": 1.4102849274200462, + "learning_rate": 1.8936246063101077e-05, + "loss": 0.9337, + "step": 1165 + }, + { + "epoch": 0.17, + "grad_norm": 1.4799118869337513, + "learning_rate": 1.893407579718154e-05, + "loss": 0.9644, + "step": 1166 + }, + { + "epoch": 0.17, + "grad_norm": 1.4522204519140673, + "learning_rate": 1.8931903444261007e-05, + "loss": 0.9241, + "step": 1167 + }, + { + "epoch": 0.17, + "grad_norm": 0.853809030195236, + "learning_rate": 1.892972900484694e-05, + "loss": 0.3487, + "step": 1168 + }, + { + "epoch": 0.17, + "grad_norm": 1.747228098226903, + "learning_rate": 1.892755247944729e-05, + "loss": 0.9473, + "step": 1169 + }, + { + "epoch": 0.17, + "grad_norm": 0.8855540718253962, + "learning_rate": 1.892537386857049e-05, + "loss": 0.2987, + "step": 1170 + }, + { + "epoch": 0.17, + "grad_norm": 1.5213810875862679, + "learning_rate": 1.8923193172725467e-05, + "loss": 0.9122, + "step": 1171 + }, + { + "epoch": 0.17, + "grad_norm": 1.5748516388964353, + "learning_rate": 1.8921010392421628e-05, + "loss": 0.9491, + "step": 1172 + }, + { + "epoch": 0.18, + "grad_norm": 1.6928939630742308, + "learning_rate": 1.8918825528168872e-05, + "loss": 0.9216, + "step": 1173 + }, + { + "epoch": 0.18, + "grad_norm": 1.5499795107198675, + "learning_rate": 1.8916638580477586e-05, + "loss": 0.937, + "step": 1174 + }, + { + "epoch": 0.18, + "grad_norm": 1.6389989492389447, + "learning_rate": 1.891444954985864e-05, + "loss": 1.0057, + "step": 1175 + }, + { + "epoch": 0.18, + "grad_norm": 1.5387430525153407, + "learning_rate": 1.891225843682339e-05, + "loss": 0.9499, + "step": 1176 + }, + { + "epoch": 0.18, + "grad_norm": 1.6905031491277491, + "learning_rate": 1.891006524188368e-05, + "loss": 0.9389, + "step": 1177 + }, + { + "epoch": 0.18, + "grad_norm": 1.3827014638750614, + "learning_rate": 1.890786996555184e-05, + "loss": 0.8842, + "step": 1178 + }, + { + "epoch": 0.18, + "grad_norm": 0.9882403410484893, + "learning_rate": 1.8905672608340693e-05, + "loss": 0.3373, + "step": 1179 + }, + { + "epoch": 0.18, + "grad_norm": 1.936884862689874, + "learning_rate": 1.890347317076354e-05, + "loss": 0.8439, + "step": 1180 + }, + { + "epoch": 0.18, + "grad_norm": 1.5826834988298497, + "learning_rate": 1.890127165333416e-05, + "loss": 0.9413, + "step": 1181 + }, + { + "epoch": 0.18, + "grad_norm": 1.716550978669316, + "learning_rate": 1.889906805656684e-05, + "loss": 0.9028, + "step": 1182 + }, + { + "epoch": 0.18, + "grad_norm": 1.65347920176481, + "learning_rate": 1.8896862380976332e-05, + "loss": 0.9535, + "step": 1183 + }, + { + "epoch": 0.18, + "grad_norm": 2.3065422708988446, + "learning_rate": 1.889465462707789e-05, + "loss": 0.9364, + "step": 1184 + }, + { + "epoch": 0.18, + "grad_norm": 1.5315127743317618, + "learning_rate": 1.8892444795387237e-05, + "loss": 1.0257, + "step": 1185 + }, + { + "epoch": 0.18, + "grad_norm": 1.6037148550513256, + "learning_rate": 1.8890232886420598e-05, + "loss": 0.9283, + "step": 1186 + }, + { + "epoch": 0.18, + "grad_norm": 0.8328336039368817, + "learning_rate": 1.888801890069467e-05, + "loss": 0.3108, + "step": 1187 + }, + { + "epoch": 0.18, + "grad_norm": 1.551529222916193, + "learning_rate": 1.8885802838726634e-05, + "loss": 0.957, + "step": 1188 + }, + { + "epoch": 0.18, + "grad_norm": 1.6219782082568708, + "learning_rate": 1.8883584701034177e-05, + "loss": 0.9161, + "step": 1189 + }, + { + "epoch": 0.18, + "grad_norm": 1.4742246808113189, + "learning_rate": 1.8881364488135448e-05, + "loss": 0.9947, + "step": 1190 + }, + { + "epoch": 0.18, + "grad_norm": 1.5864959917297368, + "learning_rate": 1.887914220054909e-05, + "loss": 0.8545, + "step": 1191 + }, + { + "epoch": 0.18, + "grad_norm": 1.6162266422211726, + "learning_rate": 1.8876917838794226e-05, + "loss": 0.8937, + "step": 1192 + }, + { + "epoch": 0.18, + "grad_norm": 1.5502100079784191, + "learning_rate": 1.8874691403390476e-05, + "loss": 0.9681, + "step": 1193 + }, + { + "epoch": 0.18, + "grad_norm": 1.6388995327313163, + "learning_rate": 1.8872462894857927e-05, + "loss": 0.8787, + "step": 1194 + }, + { + "epoch": 0.18, + "grad_norm": 1.6493475252940657, + "learning_rate": 1.887023231371716e-05, + "loss": 0.9474, + "step": 1195 + }, + { + "epoch": 0.18, + "grad_norm": 1.5760642833929563, + "learning_rate": 1.886799966048924e-05, + "loss": 0.9447, + "step": 1196 + }, + { + "epoch": 0.18, + "grad_norm": 1.807192015145044, + "learning_rate": 1.886576493569572e-05, + "loss": 0.9359, + "step": 1197 + }, + { + "epoch": 0.18, + "grad_norm": 1.4519760309115668, + "learning_rate": 1.8863528139858628e-05, + "loss": 0.9556, + "step": 1198 + }, + { + "epoch": 0.18, + "grad_norm": 1.4077583230853012, + "learning_rate": 1.886128927350048e-05, + "loss": 0.953, + "step": 1199 + }, + { + "epoch": 0.18, + "grad_norm": 1.52294725819737, + "learning_rate": 1.885904833714427e-05, + "loss": 0.8942, + "step": 1200 + }, + { + "epoch": 0.18, + "grad_norm": 1.4446690602737893, + "learning_rate": 1.8856805331313487e-05, + "loss": 0.8931, + "step": 1201 + }, + { + "epoch": 0.18, + "grad_norm": 1.4177219935426382, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.8838, + "step": 1202 + }, + { + "epoch": 0.18, + "grad_norm": 1.544077419393447, + "learning_rate": 1.8852313113324553e-05, + "loss": 0.9048, + "step": 1203 + }, + { + "epoch": 0.18, + "grad_norm": 1.5659039029571216, + "learning_rate": 1.885006390221578e-05, + "loss": 0.909, + "step": 1204 + }, + { + "epoch": 0.18, + "grad_norm": 1.4326474232723647, + "learning_rate": 1.8847812623731202e-05, + "loss": 0.9173, + "step": 1205 + }, + { + "epoch": 0.18, + "grad_norm": 1.505348227046058, + "learning_rate": 1.8845559278396707e-05, + "loss": 0.8552, + "step": 1206 + }, + { + "epoch": 0.18, + "grad_norm": 1.8805963484830883, + "learning_rate": 1.884330386673869e-05, + "loss": 0.8872, + "step": 1207 + }, + { + "epoch": 0.18, + "grad_norm": 1.6435418743284602, + "learning_rate": 1.8841046389284004e-05, + "loss": 0.9942, + "step": 1208 + }, + { + "epoch": 0.18, + "grad_norm": 1.5910597996206866, + "learning_rate": 1.8838786846560003e-05, + "loss": 0.9474, + "step": 1209 + }, + { + "epoch": 0.18, + "grad_norm": 1.6840176723500377, + "learning_rate": 1.883652523909451e-05, + "loss": 0.9131, + "step": 1210 + }, + { + "epoch": 0.18, + "grad_norm": 1.8808549938368748, + "learning_rate": 1.883426156741585e-05, + "loss": 0.96, + "step": 1211 + }, + { + "epoch": 0.18, + "grad_norm": 1.5601920463302617, + "learning_rate": 1.8831995832052802e-05, + "loss": 0.942, + "step": 1212 + }, + { + "epoch": 0.18, + "grad_norm": 1.673192219474004, + "learning_rate": 1.8829728033534644e-05, + "loss": 0.941, + "step": 1213 + }, + { + "epoch": 0.18, + "grad_norm": 1.6446156400833298, + "learning_rate": 1.882745817239114e-05, + "loss": 0.9321, + "step": 1214 + }, + { + "epoch": 0.18, + "grad_norm": 1.6079547527209246, + "learning_rate": 1.882518624915253e-05, + "loss": 0.9555, + "step": 1215 + }, + { + "epoch": 0.18, + "grad_norm": 1.5859759724230382, + "learning_rate": 1.8822912264349535e-05, + "loss": 0.9227, + "step": 1216 + }, + { + "epoch": 0.18, + "grad_norm": 1.6065715125698727, + "learning_rate": 1.8820636218513354e-05, + "loss": 0.9624, + "step": 1217 + }, + { + "epoch": 0.18, + "grad_norm": 1.5036676341415365, + "learning_rate": 1.881835811217567e-05, + "loss": 0.8987, + "step": 1218 + }, + { + "epoch": 0.18, + "grad_norm": 1.723636847104978, + "learning_rate": 1.8816077945868656e-05, + "loss": 0.9751, + "step": 1219 + }, + { + "epoch": 0.18, + "grad_norm": 1.4359919454953791, + "learning_rate": 1.8813795720124958e-05, + "loss": 1.0122, + "step": 1220 + }, + { + "epoch": 0.18, + "grad_norm": 1.4045787191543881, + "learning_rate": 1.8811511435477695e-05, + "loss": 0.8864, + "step": 1221 + }, + { + "epoch": 0.18, + "grad_norm": 1.5753337937586187, + "learning_rate": 1.8809225092460488e-05, + "loss": 0.9649, + "step": 1222 + }, + { + "epoch": 0.18, + "grad_norm": 1.4494026059745404, + "learning_rate": 1.880693669160742e-05, + "loss": 0.9843, + "step": 1223 + }, + { + "epoch": 0.18, + "grad_norm": 1.7448198121680767, + "learning_rate": 1.8804646233453067e-05, + "loss": 0.917, + "step": 1224 + }, + { + "epoch": 0.18, + "grad_norm": 1.5492769850059651, + "learning_rate": 1.8802353718532472e-05, + "loss": 0.902, + "step": 1225 + }, + { + "epoch": 0.18, + "grad_norm": 1.7016026852506534, + "learning_rate": 1.8800059147381172e-05, + "loss": 0.9395, + "step": 1226 + }, + { + "epoch": 0.18, + "grad_norm": 1.6215535350617172, + "learning_rate": 1.8797762520535178e-05, + "loss": 0.9146, + "step": 1227 + }, + { + "epoch": 0.18, + "grad_norm": 1.6084653427022122, + "learning_rate": 1.8795463838530988e-05, + "loss": 0.8814, + "step": 1228 + }, + { + "epoch": 0.18, + "grad_norm": 1.6905071988948548, + "learning_rate": 1.8793163101905562e-05, + "loss": 0.8176, + "step": 1229 + }, + { + "epoch": 0.18, + "grad_norm": 1.6941768912386646, + "learning_rate": 1.8790860311196365e-05, + "loss": 0.9573, + "step": 1230 + }, + { + "epoch": 0.18, + "grad_norm": 1.8326464679439716, + "learning_rate": 1.878855546694132e-05, + "loss": 0.933, + "step": 1231 + }, + { + "epoch": 0.18, + "grad_norm": 1.6187182912151123, + "learning_rate": 1.8786248569678847e-05, + "loss": 0.9439, + "step": 1232 + }, + { + "epoch": 0.18, + "grad_norm": 1.8292645668436696, + "learning_rate": 1.8783939619947827e-05, + "loss": 0.8819, + "step": 1233 + }, + { + "epoch": 0.18, + "grad_norm": 1.4424933382792418, + "learning_rate": 1.878162861828764e-05, + "loss": 0.9702, + "step": 1234 + }, + { + "epoch": 0.18, + "grad_norm": 1.629760378746899, + "learning_rate": 1.8779315565238133e-05, + "loss": 0.9306, + "step": 1235 + }, + { + "epoch": 0.18, + "grad_norm": 1.7883182293496125, + "learning_rate": 1.8777000461339635e-05, + "loss": 0.9406, + "step": 1236 + }, + { + "epoch": 0.18, + "grad_norm": 0.8801254801532837, + "learning_rate": 1.8774683307132956e-05, + "loss": 0.3198, + "step": 1237 + }, + { + "epoch": 0.18, + "grad_norm": 1.5185368527656098, + "learning_rate": 1.877236410315938e-05, + "loss": 0.948, + "step": 1238 + }, + { + "epoch": 0.18, + "grad_norm": 1.4700319581192989, + "learning_rate": 1.8770042849960676e-05, + "loss": 0.9333, + "step": 1239 + }, + { + "epoch": 0.19, + "grad_norm": 1.6535375410700721, + "learning_rate": 1.8767719548079088e-05, + "loss": 0.9977, + "step": 1240 + }, + { + "epoch": 0.19, + "grad_norm": 1.4919605419559654, + "learning_rate": 1.8765394198057342e-05, + "loss": 0.8905, + "step": 1241 + }, + { + "epoch": 0.19, + "grad_norm": 1.5638896984825594, + "learning_rate": 1.8763066800438638e-05, + "loss": 1.023, + "step": 1242 + }, + { + "epoch": 0.19, + "grad_norm": 1.643108876942023, + "learning_rate": 1.8760737355766654e-05, + "loss": 0.9504, + "step": 1243 + }, + { + "epoch": 0.19, + "grad_norm": 1.5382238193047761, + "learning_rate": 1.8758405864585554e-05, + "loss": 0.9877, + "step": 1244 + }, + { + "epoch": 0.19, + "grad_norm": 1.6923843660033813, + "learning_rate": 1.875607232743997e-05, + "loss": 0.9433, + "step": 1245 + }, + { + "epoch": 0.19, + "grad_norm": 1.4801961313132213, + "learning_rate": 1.875373674487502e-05, + "loss": 0.8968, + "step": 1246 + }, + { + "epoch": 0.19, + "grad_norm": 1.3989568237535377, + "learning_rate": 1.8751399117436292e-05, + "loss": 0.9798, + "step": 1247 + }, + { + "epoch": 0.19, + "grad_norm": 1.6318968276986034, + "learning_rate": 1.8749059445669856e-05, + "loss": 0.8227, + "step": 1248 + }, + { + "epoch": 0.19, + "grad_norm": 1.5987330878890313, + "learning_rate": 1.8746717730122266e-05, + "loss": 0.8551, + "step": 1249 + }, + { + "epoch": 0.19, + "grad_norm": 1.4963770665364888, + "learning_rate": 1.874437397134054e-05, + "loss": 0.9298, + "step": 1250 + }, + { + "epoch": 0.19, + "grad_norm": 1.6008725261776349, + "learning_rate": 1.8742028169872188e-05, + "loss": 0.8638, + "step": 1251 + }, + { + "epoch": 0.19, + "grad_norm": 1.9130527674558184, + "learning_rate": 1.873968032626518e-05, + "loss": 0.9344, + "step": 1252 + }, + { + "epoch": 0.19, + "grad_norm": 1.7082103393248815, + "learning_rate": 1.873733044106798e-05, + "loss": 0.8369, + "step": 1253 + }, + { + "epoch": 0.19, + "grad_norm": 1.505975577772877, + "learning_rate": 1.8734978514829518e-05, + "loss": 0.9899, + "step": 1254 + }, + { + "epoch": 0.19, + "grad_norm": 1.3955283226871127, + "learning_rate": 1.8732624548099204e-05, + "loss": 0.9152, + "step": 1255 + }, + { + "epoch": 0.19, + "grad_norm": 1.4906288739719873, + "learning_rate": 1.8730268541426924e-05, + "loss": 0.8999, + "step": 1256 + }, + { + "epoch": 0.19, + "grad_norm": 0.8773049411359727, + "learning_rate": 1.8727910495363043e-05, + "loss": 0.3199, + "step": 1257 + }, + { + "epoch": 0.19, + "grad_norm": 1.6588613464064355, + "learning_rate": 1.8725550410458403e-05, + "loss": 0.9285, + "step": 1258 + }, + { + "epoch": 0.19, + "grad_norm": 1.3533282989792614, + "learning_rate": 1.872318828726432e-05, + "loss": 0.9319, + "step": 1259 + }, + { + "epoch": 0.19, + "grad_norm": 1.84850224972434, + "learning_rate": 1.8720824126332583e-05, + "loss": 0.9614, + "step": 1260 + }, + { + "epoch": 0.19, + "grad_norm": 1.6700730043477208, + "learning_rate": 1.8718457928215458e-05, + "loss": 0.9567, + "step": 1261 + }, + { + "epoch": 0.19, + "grad_norm": 1.642011708088758, + "learning_rate": 1.8716089693465696e-05, + "loss": 0.9584, + "step": 1262 + }, + { + "epoch": 0.19, + "grad_norm": 1.708630877310769, + "learning_rate": 1.871371942263651e-05, + "loss": 0.8787, + "step": 1263 + }, + { + "epoch": 0.19, + "grad_norm": 1.491084515022238, + "learning_rate": 1.8711347116281598e-05, + "loss": 0.9189, + "step": 1264 + }, + { + "epoch": 0.19, + "grad_norm": 1.678290646274311, + "learning_rate": 1.8708972774955136e-05, + "loss": 0.9222, + "step": 1265 + }, + { + "epoch": 0.19, + "grad_norm": 1.4672324007677358, + "learning_rate": 1.8706596399211765e-05, + "loss": 0.863, + "step": 1266 + }, + { + "epoch": 0.19, + "grad_norm": 1.5234800650690186, + "learning_rate": 1.8704217989606606e-05, + "loss": 0.9324, + "step": 1267 + }, + { + "epoch": 0.19, + "grad_norm": 1.5853127502177646, + "learning_rate": 1.870183754669526e-05, + "loss": 0.9476, + "step": 1268 + }, + { + "epoch": 0.19, + "grad_norm": 1.504131171511284, + "learning_rate": 1.8699455071033795e-05, + "loss": 0.9881, + "step": 1269 + }, + { + "epoch": 0.19, + "grad_norm": 2.203410522410484, + "learning_rate": 1.8697070563178758e-05, + "loss": 0.9066, + "step": 1270 + }, + { + "epoch": 0.19, + "grad_norm": 1.5035306343812098, + "learning_rate": 1.869468402368717e-05, + "loss": 0.95, + "step": 1271 + }, + { + "epoch": 0.19, + "grad_norm": 1.5599186836032735, + "learning_rate": 1.869229545311653e-05, + "loss": 1.0106, + "step": 1272 + }, + { + "epoch": 0.19, + "grad_norm": 1.835861925842248, + "learning_rate": 1.8689904852024803e-05, + "loss": 1.0088, + "step": 1273 + }, + { + "epoch": 0.19, + "grad_norm": 1.4560103873428691, + "learning_rate": 1.868751222097044e-05, + "loss": 0.9692, + "step": 1274 + }, + { + "epoch": 0.19, + "grad_norm": 1.3895527095621782, + "learning_rate": 1.868511756051236e-05, + "loss": 0.8148, + "step": 1275 + }, + { + "epoch": 0.19, + "grad_norm": 1.8882937550870484, + "learning_rate": 1.868272087120995e-05, + "loss": 0.8738, + "step": 1276 + }, + { + "epoch": 0.19, + "grad_norm": 1.3671512469526015, + "learning_rate": 1.8680322153623077e-05, + "loss": 0.8906, + "step": 1277 + }, + { + "epoch": 0.19, + "grad_norm": 1.5260697060885258, + "learning_rate": 1.8677921408312083e-05, + "loss": 0.8557, + "step": 1278 + }, + { + "epoch": 0.19, + "grad_norm": 1.7329632279907663, + "learning_rate": 1.8675518635837788e-05, + "loss": 0.9841, + "step": 1279 + }, + { + "epoch": 0.19, + "grad_norm": 1.5317587579021834, + "learning_rate": 1.8673113836761475e-05, + "loss": 0.9284, + "step": 1280 + }, + { + "epoch": 0.19, + "grad_norm": 1.5342315444098116, + "learning_rate": 1.86707070116449e-05, + "loss": 0.9132, + "step": 1281 + }, + { + "epoch": 0.19, + "grad_norm": 1.6717295817387856, + "learning_rate": 1.8668298161050308e-05, + "loss": 0.8795, + "step": 1282 + }, + { + "epoch": 0.19, + "grad_norm": 1.5656831730049623, + "learning_rate": 1.8665887285540405e-05, + "loss": 1.0018, + "step": 1283 + }, + { + "epoch": 0.19, + "grad_norm": 1.6604674743194432, + "learning_rate": 1.8663474385678364e-05, + "loss": 0.8799, + "step": 1284 + }, + { + "epoch": 0.19, + "grad_norm": 1.646276807088851, + "learning_rate": 1.8661059462027844e-05, + "loss": 0.8531, + "step": 1285 + }, + { + "epoch": 0.19, + "grad_norm": 1.4665391825162069, + "learning_rate": 1.8658642515152973e-05, + "loss": 0.9083, + "step": 1286 + }, + { + "epoch": 0.19, + "grad_norm": 1.4433325441109526, + "learning_rate": 1.8656223545618345e-05, + "loss": 0.9448, + "step": 1287 + }, + { + "epoch": 0.19, + "grad_norm": 1.5484993310241129, + "learning_rate": 1.8653802553989037e-05, + "loss": 0.9323, + "step": 1288 + }, + { + "epoch": 0.19, + "grad_norm": 1.3402972566022855, + "learning_rate": 1.865137954083059e-05, + "loss": 0.8927, + "step": 1289 + }, + { + "epoch": 0.19, + "grad_norm": 1.7226551730613198, + "learning_rate": 1.864895450670902e-05, + "loss": 0.9696, + "step": 1290 + }, + { + "epoch": 0.19, + "grad_norm": 1.428140063383765, + "learning_rate": 1.8646527452190815e-05, + "loss": 0.8815, + "step": 1291 + }, + { + "epoch": 0.19, + "grad_norm": 1.6296343514031562, + "learning_rate": 1.8644098377842934e-05, + "loss": 0.9921, + "step": 1292 + }, + { + "epoch": 0.19, + "grad_norm": 1.5240842326767499, + "learning_rate": 1.8641667284232813e-05, + "loss": 0.9861, + "step": 1293 + }, + { + "epoch": 0.19, + "grad_norm": 1.5892538011792583, + "learning_rate": 1.8639234171928355e-05, + "loss": 0.9365, + "step": 1294 + }, + { + "epoch": 0.19, + "grad_norm": 1.6202418582308802, + "learning_rate": 1.863679904149793e-05, + "loss": 0.9119, + "step": 1295 + }, + { + "epoch": 0.19, + "grad_norm": 1.6208106241825433, + "learning_rate": 1.8634361893510393e-05, + "loss": 0.8552, + "step": 1296 + }, + { + "epoch": 0.19, + "grad_norm": 1.6827653785313705, + "learning_rate": 1.8631922728535054e-05, + "loss": 0.9716, + "step": 1297 + }, + { + "epoch": 0.19, + "grad_norm": 1.5840536530440106, + "learning_rate": 1.8629481547141708e-05, + "loss": 0.8995, + "step": 1298 + }, + { + "epoch": 0.19, + "grad_norm": 1.6552926791037665, + "learning_rate": 1.862703834990061e-05, + "loss": 0.9875, + "step": 1299 + }, + { + "epoch": 0.19, + "grad_norm": 1.7296687832433923, + "learning_rate": 1.8624593137382495e-05, + "loss": 0.9829, + "step": 1300 + }, + { + "epoch": 0.19, + "grad_norm": 0.9887302056486602, + "learning_rate": 1.8622145910158568e-05, + "loss": 0.3243, + "step": 1301 + }, + { + "epoch": 0.19, + "grad_norm": 1.5895416783536622, + "learning_rate": 1.8619696668800494e-05, + "loss": 0.9121, + "step": 1302 + }, + { + "epoch": 0.19, + "grad_norm": 1.727160176659264, + "learning_rate": 1.861724541388042e-05, + "loss": 0.9216, + "step": 1303 + }, + { + "epoch": 0.19, + "grad_norm": 1.3950993785765546, + "learning_rate": 1.8614792145970958e-05, + "loss": 0.8511, + "step": 1304 + }, + { + "epoch": 0.19, + "grad_norm": 1.833936936452527, + "learning_rate": 1.861233686564519e-05, + "loss": 0.9771, + "step": 1305 + }, + { + "epoch": 0.19, + "grad_norm": 1.464209391884905, + "learning_rate": 1.860987957347668e-05, + "loss": 0.9382, + "step": 1306 + }, + { + "epoch": 0.2, + "grad_norm": 1.5161525371094917, + "learning_rate": 1.860742027003944e-05, + "loss": 0.9019, + "step": 1307 + }, + { + "epoch": 0.2, + "grad_norm": 1.603007359166619, + "learning_rate": 1.8604958955907964e-05, + "loss": 0.9364, + "step": 1308 + }, + { + "epoch": 0.2, + "grad_norm": 2.272688820219678, + "learning_rate": 1.8602495631657222e-05, + "loss": 0.8541, + "step": 1309 + }, + { + "epoch": 0.2, + "grad_norm": 1.5642350446073157, + "learning_rate": 1.8600030297862643e-05, + "loss": 0.987, + "step": 1310 + }, + { + "epoch": 0.2, + "grad_norm": 1.5720007495605721, + "learning_rate": 1.859756295510013e-05, + "loss": 1.0131, + "step": 1311 + }, + { + "epoch": 0.2, + "grad_norm": 1.3758413622015309, + "learning_rate": 1.8595093603946053e-05, + "loss": 0.9656, + "step": 1312 + }, + { + "epoch": 0.2, + "grad_norm": 1.5162712819548991, + "learning_rate": 1.8592622244977255e-05, + "loss": 0.912, + "step": 1313 + }, + { + "epoch": 0.2, + "grad_norm": 1.5771425028591872, + "learning_rate": 1.8590148878771043e-05, + "loss": 0.9851, + "step": 1314 + }, + { + "epoch": 0.2, + "grad_norm": 1.5543978543493784, + "learning_rate": 1.8587673505905198e-05, + "loss": 0.976, + "step": 1315 + }, + { + "epoch": 0.2, + "grad_norm": 1.477787802433991, + "learning_rate": 1.858519612695797e-05, + "loss": 0.8555, + "step": 1316 + }, + { + "epoch": 0.2, + "grad_norm": 1.7477959223642614, + "learning_rate": 1.8582716742508066e-05, + "loss": 0.9811, + "step": 1317 + }, + { + "epoch": 0.2, + "grad_norm": 1.6315112141858157, + "learning_rate": 1.8580235353134678e-05, + "loss": 0.9515, + "step": 1318 + }, + { + "epoch": 0.2, + "grad_norm": 1.499480271700626, + "learning_rate": 1.8577751959417458e-05, + "loss": 0.9578, + "step": 1319 + }, + { + "epoch": 0.2, + "grad_norm": 1.5070908875280902, + "learning_rate": 1.8575266561936526e-05, + "loss": 0.9274, + "step": 1320 + }, + { + "epoch": 0.2, + "grad_norm": 1.652018021206146, + "learning_rate": 1.857277916127247e-05, + "loss": 0.9642, + "step": 1321 + }, + { + "epoch": 0.2, + "grad_norm": 1.6129189943556552, + "learning_rate": 1.8570289758006346e-05, + "loss": 0.9857, + "step": 1322 + }, + { + "epoch": 0.2, + "grad_norm": 1.531419336555744, + "learning_rate": 1.856779835271968e-05, + "loss": 0.9487, + "step": 1323 + }, + { + "epoch": 0.2, + "grad_norm": 1.543964428092662, + "learning_rate": 1.856530494599447e-05, + "loss": 0.9843, + "step": 1324 + }, + { + "epoch": 0.2, + "grad_norm": 1.5111762121811114, + "learning_rate": 1.856280953841317e-05, + "loss": 0.9859, + "step": 1325 + }, + { + "epoch": 0.2, + "grad_norm": 1.4074756380707643, + "learning_rate": 1.8560312130558706e-05, + "loss": 0.9449, + "step": 1326 + }, + { + "epoch": 0.2, + "grad_norm": 1.520628428223058, + "learning_rate": 1.8557812723014476e-05, + "loss": 0.889, + "step": 1327 + }, + { + "epoch": 0.2, + "grad_norm": 1.4257094365154306, + "learning_rate": 1.8555311316364344e-05, + "loss": 0.892, + "step": 1328 + }, + { + "epoch": 0.2, + "grad_norm": 1.6365941433827504, + "learning_rate": 1.8552807911192636e-05, + "loss": 0.8999, + "step": 1329 + }, + { + "epoch": 0.2, + "grad_norm": 1.5991657175142358, + "learning_rate": 1.8550302508084145e-05, + "loss": 0.8945, + "step": 1330 + }, + { + "epoch": 0.2, + "grad_norm": 1.3884756966401848, + "learning_rate": 1.8547795107624137e-05, + "loss": 0.9064, + "step": 1331 + }, + { + "epoch": 0.2, + "grad_norm": 1.3479605524683398, + "learning_rate": 1.8545285710398343e-05, + "loss": 0.8934, + "step": 1332 + }, + { + "epoch": 0.2, + "grad_norm": 1.4691756727343244, + "learning_rate": 1.8542774316992953e-05, + "loss": 0.937, + "step": 1333 + }, + { + "epoch": 0.2, + "grad_norm": 1.4131267847512172, + "learning_rate": 1.8540260927994633e-05, + "loss": 0.8723, + "step": 1334 + }, + { + "epoch": 0.2, + "grad_norm": 1.4669830614267512, + "learning_rate": 1.853774554399051e-05, + "loss": 0.9234, + "step": 1335 + }, + { + "epoch": 0.2, + "grad_norm": 1.4830741377153693, + "learning_rate": 1.8535228165568177e-05, + "loss": 0.9726, + "step": 1336 + }, + { + "epoch": 0.2, + "grad_norm": 1.6042513861104033, + "learning_rate": 1.853270879331569e-05, + "loss": 0.9202, + "step": 1337 + }, + { + "epoch": 0.2, + "grad_norm": 1.5331887627190128, + "learning_rate": 1.8530187427821585e-05, + "loss": 0.9193, + "step": 1338 + }, + { + "epoch": 0.2, + "grad_norm": 1.0493620964174841, + "learning_rate": 1.8527664069674844e-05, + "loss": 0.3261, + "step": 1339 + }, + { + "epoch": 0.2, + "grad_norm": 1.5734458237459092, + "learning_rate": 1.8525138719464927e-05, + "loss": 0.8838, + "step": 1340 + }, + { + "epoch": 0.2, + "grad_norm": 1.5167431946381404, + "learning_rate": 1.8522611377781758e-05, + "loss": 0.8644, + "step": 1341 + }, + { + "epoch": 0.2, + "grad_norm": 1.6394667431036494, + "learning_rate": 1.852008204521572e-05, + "loss": 0.9634, + "step": 1342 + }, + { + "epoch": 0.2, + "grad_norm": 1.6506746623286161, + "learning_rate": 1.8517550722357667e-05, + "loss": 0.9378, + "step": 1343 + }, + { + "epoch": 0.2, + "grad_norm": 1.5567760887218947, + "learning_rate": 1.8515017409798915e-05, + "loss": 0.9221, + "step": 1344 + }, + { + "epoch": 0.2, + "grad_norm": 1.4543054935979685, + "learning_rate": 1.8512482108131254e-05, + "loss": 0.9022, + "step": 1345 + }, + { + "epoch": 0.2, + "grad_norm": 2.1048299823681966, + "learning_rate": 1.850994481794692e-05, + "loss": 0.9204, + "step": 1346 + }, + { + "epoch": 0.2, + "grad_norm": 1.5253089487284166, + "learning_rate": 1.850740553983863e-05, + "loss": 0.8562, + "step": 1347 + }, + { + "epoch": 0.2, + "grad_norm": 1.6580663092147898, + "learning_rate": 1.8504864274399557e-05, + "loss": 0.8681, + "step": 1348 + }, + { + "epoch": 0.2, + "grad_norm": 1.6978845861442777, + "learning_rate": 1.8502321022223344e-05, + "loss": 0.9746, + "step": 1349 + }, + { + "epoch": 0.2, + "grad_norm": 1.3833270476083062, + "learning_rate": 1.8499775783904094e-05, + "loss": 0.9414, + "step": 1350 + }, + { + "epoch": 0.2, + "grad_norm": 1.5016279513457007, + "learning_rate": 1.849722856003637e-05, + "loss": 0.9382, + "step": 1351 + }, + { + "epoch": 0.2, + "grad_norm": 1.6086146088387632, + "learning_rate": 1.8494679351215212e-05, + "loss": 0.9381, + "step": 1352 + }, + { + "epoch": 0.2, + "grad_norm": 1.3333134192572778, + "learning_rate": 1.8492128158036113e-05, + "loss": 0.9227, + "step": 1353 + }, + { + "epoch": 0.2, + "grad_norm": 1.6278614849135744, + "learning_rate": 1.8489574981095026e-05, + "loss": 1.0032, + "step": 1354 + }, + { + "epoch": 0.2, + "grad_norm": 1.4688545935501023, + "learning_rate": 1.8487019820988378e-05, + "loss": 0.8512, + "step": 1355 + }, + { + "epoch": 0.2, + "grad_norm": 1.5330555705119522, + "learning_rate": 1.8484462678313053e-05, + "loss": 0.9969, + "step": 1356 + }, + { + "epoch": 0.2, + "grad_norm": 1.348580242682029, + "learning_rate": 1.8481903553666405e-05, + "loss": 0.9258, + "step": 1357 + }, + { + "epoch": 0.2, + "grad_norm": 1.4913611220880396, + "learning_rate": 1.847934244764624e-05, + "loss": 0.9432, + "step": 1358 + }, + { + "epoch": 0.2, + "grad_norm": 1.5106434669616648, + "learning_rate": 1.8476779360850833e-05, + "loss": 0.9214, + "step": 1359 + }, + { + "epoch": 0.2, + "grad_norm": 1.676985565785228, + "learning_rate": 1.8474214293878926e-05, + "loss": 0.9706, + "step": 1360 + }, + { + "epoch": 0.2, + "grad_norm": 1.5161578382250267, + "learning_rate": 1.8471647247329714e-05, + "loss": 0.9006, + "step": 1361 + }, + { + "epoch": 0.2, + "grad_norm": 1.8451989408657201, + "learning_rate": 1.846907822180286e-05, + "loss": 0.8809, + "step": 1362 + }, + { + "epoch": 0.2, + "grad_norm": 1.2645698139980488, + "learning_rate": 1.8466507217898493e-05, + "loss": 0.9171, + "step": 1363 + }, + { + "epoch": 0.2, + "grad_norm": 1.5400524767749244, + "learning_rate": 1.8463934236217195e-05, + "loss": 0.9166, + "step": 1364 + }, + { + "epoch": 0.2, + "grad_norm": 1.6959936733583316, + "learning_rate": 1.8461359277360014e-05, + "loss": 0.955, + "step": 1365 + }, + { + "epoch": 0.2, + "grad_norm": 1.5706814669330444, + "learning_rate": 1.8458782341928465e-05, + "loss": 0.9506, + "step": 1366 + }, + { + "epoch": 0.2, + "grad_norm": 1.3933633781270796, + "learning_rate": 1.845620343052452e-05, + "loss": 0.8393, + "step": 1367 + }, + { + "epoch": 0.2, + "grad_norm": 1.5434039962717498, + "learning_rate": 1.8453622543750608e-05, + "loss": 1.0014, + "step": 1368 + }, + { + "epoch": 0.2, + "grad_norm": 1.4432493853182402, + "learning_rate": 1.8451039682209626e-05, + "loss": 0.9955, + "step": 1369 + }, + { + "epoch": 0.2, + "grad_norm": 1.7283538149943536, + "learning_rate": 1.8448454846504936e-05, + "loss": 0.9312, + "step": 1370 + }, + { + "epoch": 0.2, + "grad_norm": 1.6093431598096921, + "learning_rate": 1.844586803724035e-05, + "loss": 0.823, + "step": 1371 + }, + { + "epoch": 0.2, + "grad_norm": 1.420925494064239, + "learning_rate": 1.8443279255020153e-05, + "loss": 0.9405, + "step": 1372 + }, + { + "epoch": 0.2, + "grad_norm": 1.4719271100644025, + "learning_rate": 1.8440688500449076e-05, + "loss": 1.0121, + "step": 1373 + }, + { + "epoch": 0.21, + "grad_norm": 1.5046113355419708, + "learning_rate": 1.8438095774132327e-05, + "loss": 0.8513, + "step": 1374 + }, + { + "epoch": 0.21, + "grad_norm": 1.58328749970449, + "learning_rate": 1.8435501076675566e-05, + "loss": 0.9453, + "step": 1375 + }, + { + "epoch": 0.21, + "grad_norm": 1.5273273984224014, + "learning_rate": 1.8432904408684912e-05, + "loss": 0.9004, + "step": 1376 + }, + { + "epoch": 0.21, + "grad_norm": 1.6405097693893897, + "learning_rate": 1.8430305770766947e-05, + "loss": 0.8508, + "step": 1377 + }, + { + "epoch": 0.21, + "grad_norm": 1.431987793216744, + "learning_rate": 1.8427705163528716e-05, + "loss": 0.9321, + "step": 1378 + }, + { + "epoch": 0.21, + "grad_norm": 1.4307210572382882, + "learning_rate": 1.8425102587577716e-05, + "loss": 0.9552, + "step": 1379 + }, + { + "epoch": 0.21, + "grad_norm": 1.4030733256180892, + "learning_rate": 1.8422498043521915e-05, + "loss": 0.8972, + "step": 1380 + }, + { + "epoch": 0.21, + "grad_norm": 1.5824955859280896, + "learning_rate": 1.841989153196973e-05, + "loss": 0.8686, + "step": 1381 + }, + { + "epoch": 0.21, + "grad_norm": 1.5346678546610544, + "learning_rate": 1.8417283053530047e-05, + "loss": 0.9393, + "step": 1382 + }, + { + "epoch": 0.21, + "grad_norm": 1.6932440897803103, + "learning_rate": 1.84146726088122e-05, + "loss": 1.0061, + "step": 1383 + }, + { + "epoch": 0.21, + "grad_norm": 1.730670733105885, + "learning_rate": 1.8412060198426e-05, + "loss": 0.9637, + "step": 1384 + }, + { + "epoch": 0.21, + "grad_norm": 1.5837054389199716, + "learning_rate": 1.8409445822981694e-05, + "loss": 0.9567, + "step": 1385 + }, + { + "epoch": 0.21, + "grad_norm": 1.4366452640659975, + "learning_rate": 1.8406829483090006e-05, + "loss": 0.9026, + "step": 1386 + }, + { + "epoch": 0.21, + "grad_norm": 1.445942618934189, + "learning_rate": 1.8404211179362116e-05, + "loss": 0.8645, + "step": 1387 + }, + { + "epoch": 0.21, + "grad_norm": 1.470152848227919, + "learning_rate": 1.8401590912409653e-05, + "loss": 0.8691, + "step": 1388 + }, + { + "epoch": 0.21, + "grad_norm": 1.8476267080384925, + "learning_rate": 1.839896868284472e-05, + "loss": 0.7976, + "step": 1389 + }, + { + "epoch": 0.21, + "grad_norm": 1.3481521481882888, + "learning_rate": 1.8396344491279864e-05, + "loss": 0.9588, + "step": 1390 + }, + { + "epoch": 0.21, + "grad_norm": 1.6982467282779086, + "learning_rate": 1.8393718338328102e-05, + "loss": 0.9483, + "step": 1391 + }, + { + "epoch": 0.21, + "grad_norm": 1.375222365594476, + "learning_rate": 1.8391090224602895e-05, + "loss": 0.8921, + "step": 1392 + }, + { + "epoch": 0.21, + "grad_norm": 1.4177302969215262, + "learning_rate": 1.838846015071818e-05, + "loss": 0.9851, + "step": 1393 + }, + { + "epoch": 0.21, + "grad_norm": 1.1595223674721813, + "learning_rate": 1.8385828117288333e-05, + "loss": 0.3101, + "step": 1394 + }, + { + "epoch": 0.21, + "grad_norm": 1.4966780701730207, + "learning_rate": 1.838319412492821e-05, + "loss": 0.8755, + "step": 1395 + }, + { + "epoch": 0.21, + "grad_norm": 1.4258036193660553, + "learning_rate": 1.8380558174253095e-05, + "loss": 0.9083, + "step": 1396 + }, + { + "epoch": 0.21, + "grad_norm": 1.5377659722200108, + "learning_rate": 1.837792026587876e-05, + "loss": 0.8627, + "step": 1397 + }, + { + "epoch": 0.21, + "grad_norm": 1.697825250670764, + "learning_rate": 1.837528040042142e-05, + "loss": 0.9405, + "step": 1398 + }, + { + "epoch": 0.21, + "grad_norm": 1.6569503302197386, + "learning_rate": 1.8372638578497738e-05, + "loss": 0.8927, + "step": 1399 + }, + { + "epoch": 0.21, + "grad_norm": 1.4769447528205317, + "learning_rate": 1.8369994800724855e-05, + "loss": 0.8755, + "step": 1400 + }, + { + "epoch": 0.21, + "grad_norm": 1.5028198194593256, + "learning_rate": 1.836734906772035e-05, + "loss": 0.9232, + "step": 1401 + }, + { + "epoch": 0.21, + "grad_norm": 1.7897250123186603, + "learning_rate": 1.8364701380102267e-05, + "loss": 0.8772, + "step": 1402 + }, + { + "epoch": 0.21, + "grad_norm": 1.7545370866299816, + "learning_rate": 1.836205173848911e-05, + "loss": 0.797, + "step": 1403 + }, + { + "epoch": 0.21, + "grad_norm": 1.3680504500535342, + "learning_rate": 1.835940014349984e-05, + "loss": 0.9338, + "step": 1404 + }, + { + "epoch": 0.21, + "grad_norm": 1.627172584661305, + "learning_rate": 1.8356746595753856e-05, + "loss": 0.9072, + "step": 1405 + }, + { + "epoch": 0.21, + "grad_norm": 1.6118572545415228, + "learning_rate": 1.8354091095871038e-05, + "loss": 0.9417, + "step": 1406 + }, + { + "epoch": 0.21, + "grad_norm": 1.5496130790598868, + "learning_rate": 1.8351433644471708e-05, + "loss": 0.8422, + "step": 1407 + }, + { + "epoch": 0.21, + "grad_norm": 1.6138938151875928, + "learning_rate": 1.8348774242176642e-05, + "loss": 0.8675, + "step": 1408 + }, + { + "epoch": 0.21, + "grad_norm": 1.0340059348763608, + "learning_rate": 1.8346112889607086e-05, + "loss": 0.3633, + "step": 1409 + }, + { + "epoch": 0.21, + "grad_norm": 1.3929871898635133, + "learning_rate": 1.8343449587384727e-05, + "loss": 0.8767, + "step": 1410 + }, + { + "epoch": 0.21, + "grad_norm": 1.8613657702374764, + "learning_rate": 1.8340784336131715e-05, + "loss": 0.9883, + "step": 1411 + }, + { + "epoch": 0.21, + "grad_norm": 1.4416846972977517, + "learning_rate": 1.8338117136470648e-05, + "loss": 0.9644, + "step": 1412 + }, + { + "epoch": 0.21, + "grad_norm": 1.4801650062828753, + "learning_rate": 1.833544798902459e-05, + "loss": 0.9389, + "step": 1413 + }, + { + "epoch": 0.21, + "grad_norm": 1.4552923172875576, + "learning_rate": 1.8332776894417047e-05, + "loss": 0.8727, + "step": 1414 + }, + { + "epoch": 0.21, + "grad_norm": 1.5582812457274473, + "learning_rate": 1.8330103853271993e-05, + "loss": 0.9203, + "step": 1415 + }, + { + "epoch": 0.21, + "grad_norm": 1.602036635066962, + "learning_rate": 1.832742886621385e-05, + "loss": 0.9371, + "step": 1416 + }, + { + "epoch": 0.21, + "grad_norm": 1.5894637268745953, + "learning_rate": 1.8324751933867496e-05, + "loss": 0.9851, + "step": 1417 + }, + { + "epoch": 0.21, + "grad_norm": 1.4416083359620568, + "learning_rate": 1.8322073056858257e-05, + "loss": 0.9316, + "step": 1418 + }, + { + "epoch": 0.21, + "grad_norm": 1.6005260021073704, + "learning_rate": 1.8319392235811927e-05, + "loss": 0.8721, + "step": 1419 + }, + { + "epoch": 0.21, + "grad_norm": 1.5302405606397658, + "learning_rate": 1.831670947135474e-05, + "loss": 0.8851, + "step": 1420 + }, + { + "epoch": 0.21, + "grad_norm": 1.7482184246000811, + "learning_rate": 1.8314024764113392e-05, + "loss": 0.9732, + "step": 1421 + }, + { + "epoch": 0.21, + "grad_norm": 1.4686236650583988, + "learning_rate": 1.831133811471503e-05, + "loss": 0.9718, + "step": 1422 + }, + { + "epoch": 0.21, + "grad_norm": 1.866156273345178, + "learning_rate": 1.830864952378726e-05, + "loss": 0.9626, + "step": 1423 + }, + { + "epoch": 0.21, + "grad_norm": 1.3176770062046157, + "learning_rate": 1.830595899195813e-05, + "loss": 0.8406, + "step": 1424 + }, + { + "epoch": 0.21, + "grad_norm": 1.320715112076083, + "learning_rate": 1.830326651985615e-05, + "loss": 0.8682, + "step": 1425 + }, + { + "epoch": 0.21, + "grad_norm": 0.9674957023138261, + "learning_rate": 1.8300572108110287e-05, + "loss": 0.3209, + "step": 1426 + }, + { + "epoch": 0.21, + "grad_norm": 1.7141530219710719, + "learning_rate": 1.829787575734995e-05, + "loss": 0.9555, + "step": 1427 + }, + { + "epoch": 0.21, + "grad_norm": 1.46450866299843, + "learning_rate": 1.8295177468205015e-05, + "loss": 0.8932, + "step": 1428 + }, + { + "epoch": 0.21, + "grad_norm": 1.8058981880342446, + "learning_rate": 1.8292477241305794e-05, + "loss": 0.9201, + "step": 1429 + }, + { + "epoch": 0.21, + "grad_norm": 1.384683821808224, + "learning_rate": 1.8289775077283063e-05, + "loss": 1.0247, + "step": 1430 + }, + { + "epoch": 0.21, + "grad_norm": 1.2925014335623914, + "learning_rate": 1.8287070976768045e-05, + "loss": 0.9926, + "step": 1431 + }, + { + "epoch": 0.21, + "grad_norm": 1.3516160590524255, + "learning_rate": 1.8284364940392426e-05, + "loss": 0.8744, + "step": 1432 + }, + { + "epoch": 0.21, + "grad_norm": 1.4822768315059653, + "learning_rate": 1.828165696878833e-05, + "loss": 0.8601, + "step": 1433 + }, + { + "epoch": 0.21, + "grad_norm": 1.6360883130209174, + "learning_rate": 1.8278947062588343e-05, + "loss": 0.8558, + "step": 1434 + }, + { + "epoch": 0.21, + "grad_norm": 1.5637041161020004, + "learning_rate": 1.8276235222425494e-05, + "loss": 0.8817, + "step": 1435 + }, + { + "epoch": 0.21, + "grad_norm": 1.7258729237288886, + "learning_rate": 1.8273521448933277e-05, + "loss": 0.9295, + "step": 1436 + }, + { + "epoch": 0.21, + "grad_norm": 1.6557778931673972, + "learning_rate": 1.827080574274562e-05, + "loss": 0.903, + "step": 1437 + }, + { + "epoch": 0.21, + "grad_norm": 1.4290332417607585, + "learning_rate": 1.826808810449692e-05, + "loss": 0.9376, + "step": 1438 + }, + { + "epoch": 0.21, + "grad_norm": 1.3708815417218, + "learning_rate": 1.8265368534822016e-05, + "loss": 0.904, + "step": 1439 + }, + { + "epoch": 0.21, + "grad_norm": 1.5470844016284202, + "learning_rate": 1.8262647034356197e-05, + "loss": 0.9463, + "step": 1440 + }, + { + "epoch": 0.22, + "grad_norm": 1.5947478381137556, + "learning_rate": 1.825992360373521e-05, + "loss": 0.8913, + "step": 1441 + }, + { + "epoch": 0.22, + "grad_norm": 1.446940972521378, + "learning_rate": 1.825719824359524e-05, + "loss": 0.8848, + "step": 1442 + }, + { + "epoch": 0.22, + "grad_norm": 1.455984981257585, + "learning_rate": 1.8254470954572946e-05, + "loss": 0.9032, + "step": 1443 + }, + { + "epoch": 0.22, + "grad_norm": 1.5308337765028341, + "learning_rate": 1.825174173730541e-05, + "loss": 0.9168, + "step": 1444 + }, + { + "epoch": 0.22, + "grad_norm": 1.4858221441011372, + "learning_rate": 1.8249010592430186e-05, + "loss": 0.9031, + "step": 1445 + }, + { + "epoch": 0.22, + "grad_norm": 1.2818499561887657, + "learning_rate": 1.8246277520585262e-05, + "loss": 0.9196, + "step": 1446 + }, + { + "epoch": 0.22, + "grad_norm": 1.503553425905122, + "learning_rate": 1.824354252240909e-05, + "loss": 0.8541, + "step": 1447 + }, + { + "epoch": 0.22, + "grad_norm": 1.7404696372944537, + "learning_rate": 1.8240805598540562e-05, + "loss": 0.9065, + "step": 1448 + }, + { + "epoch": 0.22, + "grad_norm": 1.4865176497344108, + "learning_rate": 1.8238066749619026e-05, + "loss": 0.9083, + "step": 1449 + }, + { + "epoch": 0.22, + "grad_norm": 1.5013071116892325, + "learning_rate": 1.8235325976284276e-05, + "loss": 0.9602, + "step": 1450 + }, + { + "epoch": 0.22, + "grad_norm": 0.9342158491412124, + "learning_rate": 1.823258327917656e-05, + "loss": 0.3407, + "step": 1451 + }, + { + "epoch": 0.22, + "grad_norm": 1.7922600558941737, + "learning_rate": 1.8229838658936566e-05, + "loss": 0.8983, + "step": 1452 + }, + { + "epoch": 0.22, + "grad_norm": 1.471219040705767, + "learning_rate": 1.822709211620544e-05, + "loss": 0.9212, + "step": 1453 + }, + { + "epoch": 0.22, + "grad_norm": 1.79779651791726, + "learning_rate": 1.822434365162478e-05, + "loss": 0.8914, + "step": 1454 + }, + { + "epoch": 0.22, + "grad_norm": 1.7298716910087544, + "learning_rate": 1.8221593265836624e-05, + "loss": 0.9112, + "step": 1455 + }, + { + "epoch": 0.22, + "grad_norm": 1.3886159390640143, + "learning_rate": 1.821884095948346e-05, + "loss": 0.9563, + "step": 1456 + }, + { + "epoch": 0.22, + "grad_norm": 1.529580158241215, + "learning_rate": 1.821608673320823e-05, + "loss": 0.9526, + "step": 1457 + }, + { + "epoch": 0.22, + "grad_norm": 1.6158602711557444, + "learning_rate": 1.8213330587654324e-05, + "loss": 0.9298, + "step": 1458 + }, + { + "epoch": 0.22, + "grad_norm": 1.458628492345118, + "learning_rate": 1.8210572523465567e-05, + "loss": 0.9128, + "step": 1459 + }, + { + "epoch": 0.22, + "grad_norm": 1.6006041008487177, + "learning_rate": 1.8207812541286256e-05, + "loss": 0.8498, + "step": 1460 + }, + { + "epoch": 0.22, + "grad_norm": 1.4658261355564617, + "learning_rate": 1.820505064176112e-05, + "loss": 0.8809, + "step": 1461 + }, + { + "epoch": 0.22, + "grad_norm": 1.505954233538878, + "learning_rate": 1.820228682553533e-05, + "loss": 0.8566, + "step": 1462 + }, + { + "epoch": 0.22, + "grad_norm": 1.6084062301641522, + "learning_rate": 1.8199521093254524e-05, + "loss": 0.8057, + "step": 1463 + }, + { + "epoch": 0.22, + "grad_norm": 1.6311049225116951, + "learning_rate": 1.8196753445564775e-05, + "loss": 0.9693, + "step": 1464 + }, + { + "epoch": 0.22, + "grad_norm": 1.3278004768731315, + "learning_rate": 1.8193983883112603e-05, + "loss": 0.8407, + "step": 1465 + }, + { + "epoch": 0.22, + "grad_norm": 1.426047660811788, + "learning_rate": 1.819121240654498e-05, + "loss": 0.945, + "step": 1466 + }, + { + "epoch": 0.22, + "grad_norm": 1.5451472349527837, + "learning_rate": 1.818843901650932e-05, + "loss": 0.839, + "step": 1467 + }, + { + "epoch": 0.22, + "grad_norm": 1.6821455734600956, + "learning_rate": 1.8185663713653495e-05, + "loss": 0.9481, + "step": 1468 + }, + { + "epoch": 0.22, + "grad_norm": 1.4708004598843258, + "learning_rate": 1.8182886498625806e-05, + "loss": 0.851, + "step": 1469 + }, + { + "epoch": 0.22, + "grad_norm": 1.3762300466315378, + "learning_rate": 1.818010737207502e-05, + "loss": 0.9502, + "step": 1470 + }, + { + "epoch": 0.22, + "grad_norm": 1.4969819510026572, + "learning_rate": 1.8177326334650335e-05, + "loss": 0.9317, + "step": 1471 + }, + { + "epoch": 0.22, + "grad_norm": 1.7290544898393814, + "learning_rate": 1.8174543387001403e-05, + "loss": 0.8434, + "step": 1472 + }, + { + "epoch": 0.22, + "grad_norm": 1.415050729001895, + "learning_rate": 1.817175852977832e-05, + "loss": 0.9118, + "step": 1473 + }, + { + "epoch": 0.22, + "grad_norm": 1.6752822495386923, + "learning_rate": 1.8168971763631636e-05, + "loss": 0.9153, + "step": 1474 + }, + { + "epoch": 0.22, + "grad_norm": 1.370087380605017, + "learning_rate": 1.816618308921233e-05, + "loss": 0.9445, + "step": 1475 + }, + { + "epoch": 0.22, + "grad_norm": 1.5811721293966912, + "learning_rate": 1.816339250717184e-05, + "loss": 1.0104, + "step": 1476 + }, + { + "epoch": 0.22, + "grad_norm": 1.6250835223007674, + "learning_rate": 1.816060001816205e-05, + "loss": 0.9256, + "step": 1477 + }, + { + "epoch": 0.22, + "grad_norm": 1.502584890655715, + "learning_rate": 1.815780562283528e-05, + "loss": 0.8978, + "step": 1478 + }, + { + "epoch": 0.22, + "grad_norm": 1.5892699633215046, + "learning_rate": 1.8155009321844306e-05, + "loss": 0.8817, + "step": 1479 + }, + { + "epoch": 0.22, + "grad_norm": 1.5803467565813663, + "learning_rate": 1.8152211115842342e-05, + "loss": 0.9955, + "step": 1480 + }, + { + "epoch": 0.22, + "grad_norm": 1.651876969836039, + "learning_rate": 1.8149411005483047e-05, + "loss": 0.9283, + "step": 1481 + }, + { + "epoch": 0.22, + "grad_norm": 1.743679780298233, + "learning_rate": 1.8146608991420533e-05, + "loss": 0.9177, + "step": 1482 + }, + { + "epoch": 0.22, + "grad_norm": 1.5462032940470876, + "learning_rate": 1.8143805074309344e-05, + "loss": 0.9605, + "step": 1483 + }, + { + "epoch": 0.22, + "grad_norm": 1.6143460454441354, + "learning_rate": 1.814099925480448e-05, + "loss": 0.8757, + "step": 1484 + }, + { + "epoch": 0.22, + "grad_norm": 1.4543950045459417, + "learning_rate": 1.813819153356138e-05, + "loss": 0.9228, + "step": 1485 + }, + { + "epoch": 0.22, + "grad_norm": 1.3778814108628574, + "learning_rate": 1.8135381911235924e-05, + "loss": 0.937, + "step": 1486 + }, + { + "epoch": 0.22, + "grad_norm": 1.426541612969419, + "learning_rate": 1.8132570388484442e-05, + "loss": 0.7878, + "step": 1487 + }, + { + "epoch": 0.22, + "grad_norm": 1.556011293504534, + "learning_rate": 1.8129756965963712e-05, + "loss": 0.899, + "step": 1488 + }, + { + "epoch": 0.22, + "grad_norm": 1.6689503961511356, + "learning_rate": 1.812694164433094e-05, + "loss": 0.9025, + "step": 1489 + }, + { + "epoch": 0.22, + "grad_norm": 1.599667296566135, + "learning_rate": 1.8124124424243794e-05, + "loss": 0.9317, + "step": 1490 + }, + { + "epoch": 0.22, + "grad_norm": 1.6482370715757464, + "learning_rate": 1.8121305306360374e-05, + "loss": 0.9498, + "step": 1491 + }, + { + "epoch": 0.22, + "grad_norm": 1.6130410170244356, + "learning_rate": 1.811848429133922e-05, + "loss": 0.8094, + "step": 1492 + }, + { + "epoch": 0.22, + "grad_norm": 1.5177496551723935, + "learning_rate": 1.8115661379839328e-05, + "loss": 0.818, + "step": 1493 + }, + { + "epoch": 0.22, + "grad_norm": 1.3777150791970385, + "learning_rate": 1.8112836572520132e-05, + "loss": 0.942, + "step": 1494 + }, + { + "epoch": 0.22, + "grad_norm": 1.06139906827602, + "learning_rate": 1.8110009870041505e-05, + "loss": 0.331, + "step": 1495 + }, + { + "epoch": 0.22, + "grad_norm": 1.5670786161336097, + "learning_rate": 1.8107181273063764e-05, + "loss": 0.8789, + "step": 1496 + }, + { + "epoch": 0.22, + "grad_norm": 1.397744676453817, + "learning_rate": 1.810435078224767e-05, + "loss": 0.9252, + "step": 1497 + }, + { + "epoch": 0.22, + "grad_norm": 1.5655471855230654, + "learning_rate": 1.8101518398254423e-05, + "loss": 0.9022, + "step": 1498 + }, + { + "epoch": 0.22, + "grad_norm": 1.4664594056337388, + "learning_rate": 1.8098684121745675e-05, + "loss": 0.9256, + "step": 1499 + }, + { + "epoch": 0.22, + "grad_norm": 1.492371839850668, + "learning_rate": 1.8095847953383506e-05, + "loss": 0.9655, + "step": 1500 + }, + { + "epoch": 0.22, + "grad_norm": 1.3361464728772487, + "learning_rate": 1.809300989383045e-05, + "loss": 0.8374, + "step": 1501 + }, + { + "epoch": 0.22, + "grad_norm": 1.7483009846294821, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.9746, + "step": 1502 + }, + { + "epoch": 0.22, + "grad_norm": 1.554544434482407, + "learning_rate": 1.8087328103803998e-05, + "loss": 0.9189, + "step": 1503 + }, + { + "epoch": 0.22, + "grad_norm": 1.5815414912754846, + "learning_rate": 1.808448437465787e-05, + "loss": 0.8423, + "step": 1504 + }, + { + "epoch": 0.22, + "grad_norm": 1.5686762241478212, + "learning_rate": 1.8081638756975385e-05, + "loss": 0.9415, + "step": 1505 + }, + { + "epoch": 0.22, + "grad_norm": 1.7599626212831148, + "learning_rate": 1.8078791251421282e-05, + "loss": 0.9791, + "step": 1506 + }, + { + "epoch": 0.22, + "grad_norm": 1.4598071792420808, + "learning_rate": 1.8075941858660737e-05, + "loss": 0.9578, + "step": 1507 + }, + { + "epoch": 0.23, + "grad_norm": 1.6101342396386267, + "learning_rate": 1.8073090579359373e-05, + "loss": 0.9315, + "step": 1508 + }, + { + "epoch": 0.23, + "grad_norm": 1.4658079826654757, + "learning_rate": 1.807023741418324e-05, + "loss": 0.9002, + "step": 1509 + }, + { + "epoch": 0.23, + "grad_norm": 1.428926679758196, + "learning_rate": 1.8067382363798845e-05, + "loss": 0.9682, + "step": 1510 + }, + { + "epoch": 0.23, + "grad_norm": 1.7658554455291877, + "learning_rate": 1.8064525428873128e-05, + "loss": 0.9315, + "step": 1511 + }, + { + "epoch": 0.23, + "grad_norm": 1.5955447068129873, + "learning_rate": 1.8061666610073465e-05, + "loss": 0.9593, + "step": 1512 + }, + { + "epoch": 0.23, + "grad_norm": 1.473041064226398, + "learning_rate": 1.805880590806768e-05, + "loss": 0.9342, + "step": 1513 + }, + { + "epoch": 0.23, + "grad_norm": 0.7178878290796751, + "learning_rate": 1.8055943323524033e-05, + "loss": 0.3207, + "step": 1514 + }, + { + "epoch": 0.23, + "grad_norm": 2.0193566545685027, + "learning_rate": 1.8053078857111218e-05, + "loss": 0.9605, + "step": 1515 + }, + { + "epoch": 0.23, + "grad_norm": 1.629310915558816, + "learning_rate": 1.805021250949839e-05, + "loss": 0.9409, + "step": 1516 + }, + { + "epoch": 0.23, + "grad_norm": 1.5853255658087846, + "learning_rate": 1.8047344281355112e-05, + "loss": 0.8924, + "step": 1517 + }, + { + "epoch": 0.23, + "grad_norm": 1.5117666257872096, + "learning_rate": 1.8044474173351403e-05, + "loss": 0.8893, + "step": 1518 + }, + { + "epoch": 0.23, + "grad_norm": 1.4301520769232796, + "learning_rate": 1.8041602186157732e-05, + "loss": 0.9124, + "step": 1519 + }, + { + "epoch": 0.23, + "grad_norm": 1.4755765686141669, + "learning_rate": 1.803872832044499e-05, + "loss": 0.8578, + "step": 1520 + }, + { + "epoch": 0.23, + "grad_norm": 1.573526723741205, + "learning_rate": 1.8035852576884508e-05, + "loss": 0.9123, + "step": 1521 + }, + { + "epoch": 0.23, + "grad_norm": 1.582495436445243, + "learning_rate": 1.8032974956148064e-05, + "loss": 0.9056, + "step": 1522 + }, + { + "epoch": 0.23, + "grad_norm": 1.433860248616746, + "learning_rate": 1.803009545890787e-05, + "loss": 0.935, + "step": 1523 + }, + { + "epoch": 0.23, + "grad_norm": 1.4019894756760587, + "learning_rate": 1.8027214085836578e-05, + "loss": 0.942, + "step": 1524 + }, + { + "epoch": 0.23, + "grad_norm": 1.7583527407692845, + "learning_rate": 1.8024330837607275e-05, + "loss": 0.8886, + "step": 1525 + }, + { + "epoch": 0.23, + "grad_norm": 1.4990834528561294, + "learning_rate": 1.802144571489349e-05, + "loss": 0.9415, + "step": 1526 + }, + { + "epoch": 0.23, + "grad_norm": 1.4385328780349365, + "learning_rate": 1.8018558718369187e-05, + "loss": 0.9743, + "step": 1527 + }, + { + "epoch": 0.23, + "grad_norm": 1.4935455084531502, + "learning_rate": 1.8015669848708768e-05, + "loss": 0.9042, + "step": 1528 + }, + { + "epoch": 0.23, + "grad_norm": 1.2882483382153245, + "learning_rate": 1.8012779106587073e-05, + "loss": 0.8525, + "step": 1529 + }, + { + "epoch": 0.23, + "grad_norm": 1.529216023691471, + "learning_rate": 1.800988649267938e-05, + "loss": 0.9304, + "step": 1530 + }, + { + "epoch": 0.23, + "grad_norm": 1.4827389836552516, + "learning_rate": 1.8006992007661407e-05, + "loss": 0.9116, + "step": 1531 + }, + { + "epoch": 0.23, + "grad_norm": 1.703636851520783, + "learning_rate": 1.8004095652209304e-05, + "loss": 1.0178, + "step": 1532 + }, + { + "epoch": 0.23, + "grad_norm": 1.6171629582491727, + "learning_rate": 1.8001197426999658e-05, + "loss": 0.9361, + "step": 1533 + }, + { + "epoch": 0.23, + "grad_norm": 1.350755787731342, + "learning_rate": 1.7998297332709492e-05, + "loss": 0.8575, + "step": 1534 + }, + { + "epoch": 0.23, + "grad_norm": 1.4670535763301058, + "learning_rate": 1.7995395370016274e-05, + "loss": 1.0027, + "step": 1535 + }, + { + "epoch": 0.23, + "grad_norm": 1.5501017036498101, + "learning_rate": 1.7992491539597904e-05, + "loss": 0.9615, + "step": 1536 + }, + { + "epoch": 0.23, + "grad_norm": 1.4328751731537759, + "learning_rate": 1.7989585842132713e-05, + "loss": 0.9616, + "step": 1537 + }, + { + "epoch": 0.23, + "grad_norm": 1.70064371803186, + "learning_rate": 1.7986678278299473e-05, + "loss": 0.8806, + "step": 1538 + }, + { + "epoch": 0.23, + "grad_norm": 1.4205590953877154, + "learning_rate": 1.798376884877739e-05, + "loss": 0.9058, + "step": 1539 + }, + { + "epoch": 0.23, + "grad_norm": 1.6090932293860116, + "learning_rate": 1.798085755424611e-05, + "loss": 0.9362, + "step": 1540 + }, + { + "epoch": 0.23, + "grad_norm": 1.6654992270354068, + "learning_rate": 1.7977944395385713e-05, + "loss": 0.9807, + "step": 1541 + }, + { + "epoch": 0.23, + "grad_norm": 1.419143038794105, + "learning_rate": 1.7975029372876706e-05, + "loss": 0.8858, + "step": 1542 + }, + { + "epoch": 0.23, + "grad_norm": 1.6965653057851617, + "learning_rate": 1.7972112487400047e-05, + "loss": 0.8889, + "step": 1543 + }, + { + "epoch": 0.23, + "grad_norm": 1.4143447455987608, + "learning_rate": 1.7969193739637113e-05, + "loss": 0.8719, + "step": 1544 + }, + { + "epoch": 0.23, + "grad_norm": 1.4752442060782491, + "learning_rate": 1.7966273130269727e-05, + "loss": 0.8251, + "step": 1545 + }, + { + "epoch": 0.23, + "grad_norm": 1.4645123412033758, + "learning_rate": 1.796335065998015e-05, + "loss": 0.9551, + "step": 1546 + }, + { + "epoch": 0.23, + "grad_norm": 1.4545903687669843, + "learning_rate": 1.7960426329451062e-05, + "loss": 0.8886, + "step": 1547 + }, + { + "epoch": 0.23, + "grad_norm": 1.6639257574088695, + "learning_rate": 1.795750013936559e-05, + "loss": 0.8962, + "step": 1548 + }, + { + "epoch": 0.23, + "grad_norm": 1.378138793621645, + "learning_rate": 1.7954572090407292e-05, + "loss": 0.9416, + "step": 1549 + }, + { + "epoch": 0.23, + "grad_norm": 1.5078069755966894, + "learning_rate": 1.7951642183260163e-05, + "loss": 0.9405, + "step": 1550 + }, + { + "epoch": 0.23, + "grad_norm": 1.4714863757146972, + "learning_rate": 1.7948710418608626e-05, + "loss": 0.8232, + "step": 1551 + }, + { + "epoch": 0.23, + "grad_norm": 1.5203309743382125, + "learning_rate": 1.7945776797137544e-05, + "loss": 0.9125, + "step": 1552 + }, + { + "epoch": 0.23, + "grad_norm": 1.4394416976847793, + "learning_rate": 1.794284131953221e-05, + "loss": 0.8864, + "step": 1553 + }, + { + "epoch": 0.23, + "grad_norm": 1.6192554529792336, + "learning_rate": 1.7939903986478354e-05, + "loss": 0.9913, + "step": 1554 + }, + { + "epoch": 0.23, + "grad_norm": 1.6718502906283552, + "learning_rate": 1.7936964798662134e-05, + "loss": 0.8929, + "step": 1555 + }, + { + "epoch": 0.23, + "grad_norm": 1.40202113587381, + "learning_rate": 1.793402375677015e-05, + "loss": 0.7959, + "step": 1556 + }, + { + "epoch": 0.23, + "grad_norm": 1.4785852844175729, + "learning_rate": 1.7931080861489425e-05, + "loss": 0.8941, + "step": 1557 + }, + { + "epoch": 0.23, + "grad_norm": 1.6389009817960503, + "learning_rate": 1.7928136113507416e-05, + "loss": 0.8848, + "step": 1558 + }, + { + "epoch": 0.23, + "grad_norm": 1.6442349522622586, + "learning_rate": 1.7925189513512026e-05, + "loss": 0.943, + "step": 1559 + }, + { + "epoch": 0.23, + "grad_norm": 1.6843259031486602, + "learning_rate": 1.7922241062191576e-05, + "loss": 0.8991, + "step": 1560 + }, + { + "epoch": 0.23, + "grad_norm": 1.4130152069717505, + "learning_rate": 1.791929076023482e-05, + "loss": 0.8297, + "step": 1561 + }, + { + "epoch": 0.23, + "grad_norm": 1.521629480217419, + "learning_rate": 1.791633860833096e-05, + "loss": 0.9784, + "step": 1562 + }, + { + "epoch": 0.23, + "grad_norm": 1.5847138266858074, + "learning_rate": 1.7913384607169608e-05, + "loss": 0.912, + "step": 1563 + }, + { + "epoch": 0.23, + "grad_norm": 1.589646845065804, + "learning_rate": 1.7910428757440826e-05, + "loss": 0.9449, + "step": 1564 + }, + { + "epoch": 0.23, + "grad_norm": 1.5248021958024043, + "learning_rate": 1.7907471059835097e-05, + "loss": 0.9189, + "step": 1565 + }, + { + "epoch": 0.23, + "grad_norm": 1.5770284034236917, + "learning_rate": 1.7904511515043338e-05, + "loss": 0.8986, + "step": 1566 + }, + { + "epoch": 0.23, + "grad_norm": 1.6235388826026316, + "learning_rate": 1.7901550123756906e-05, + "loss": 0.9351, + "step": 1567 + }, + { + "epoch": 0.23, + "grad_norm": 1.6085182688439044, + "learning_rate": 1.7898586886667574e-05, + "loss": 0.9453, + "step": 1568 + }, + { + "epoch": 0.23, + "grad_norm": 1.4297041475129335, + "learning_rate": 1.7895621804467562e-05, + "loss": 0.9748, + "step": 1569 + }, + { + "epoch": 0.23, + "grad_norm": 1.5316494607998903, + "learning_rate": 1.789265487784951e-05, + "loss": 0.9373, + "step": 1570 + }, + { + "epoch": 0.23, + "grad_norm": 1.471618927636049, + "learning_rate": 1.788968610750649e-05, + "loss": 0.9069, + "step": 1571 + }, + { + "epoch": 0.23, + "grad_norm": 1.5583034862952156, + "learning_rate": 1.7886715494132008e-05, + "loss": 0.9885, + "step": 1572 + }, + { + "epoch": 0.23, + "grad_norm": 1.556130305248774, + "learning_rate": 1.7883743038420002e-05, + "loss": 0.9727, + "step": 1573 + }, + { + "epoch": 0.23, + "grad_norm": 1.4781080752587312, + "learning_rate": 1.788076874106484e-05, + "loss": 0.8378, + "step": 1574 + }, + { + "epoch": 0.24, + "grad_norm": 1.5675583559886217, + "learning_rate": 1.7877792602761312e-05, + "loss": 0.9611, + "step": 1575 + }, + { + "epoch": 0.24, + "grad_norm": 1.5171448654937751, + "learning_rate": 1.787481462420465e-05, + "loss": 0.845, + "step": 1576 + }, + { + "epoch": 0.24, + "grad_norm": 1.3741617783810502, + "learning_rate": 1.7871834806090502e-05, + "loss": 0.871, + "step": 1577 + }, + { + "epoch": 0.24, + "grad_norm": 1.411148419013395, + "learning_rate": 1.7868853149114966e-05, + "loss": 0.839, + "step": 1578 + }, + { + "epoch": 0.24, + "grad_norm": 1.5433128238880478, + "learning_rate": 1.7865869653974545e-05, + "loss": 0.9828, + "step": 1579 + }, + { + "epoch": 0.24, + "grad_norm": 1.3669869480177081, + "learning_rate": 1.786288432136619e-05, + "loss": 0.9316, + "step": 1580 + }, + { + "epoch": 0.24, + "grad_norm": 1.5452442556225698, + "learning_rate": 1.7859897151987276e-05, + "loss": 0.9209, + "step": 1581 + }, + { + "epoch": 0.24, + "grad_norm": 1.446698103249709, + "learning_rate": 1.7856908146535602e-05, + "loss": 0.905, + "step": 1582 + }, + { + "epoch": 0.24, + "grad_norm": 1.3660707654069846, + "learning_rate": 1.7853917305709405e-05, + "loss": 0.9321, + "step": 1583 + }, + { + "epoch": 0.24, + "grad_norm": 1.5629656675943877, + "learning_rate": 1.785092463020734e-05, + "loss": 0.9017, + "step": 1584 + }, + { + "epoch": 0.24, + "grad_norm": 1.6060196692364832, + "learning_rate": 1.78479301207285e-05, + "loss": 0.8026, + "step": 1585 + }, + { + "epoch": 0.24, + "grad_norm": 1.463758311889416, + "learning_rate": 1.7844933777972406e-05, + "loss": 0.9187, + "step": 1586 + }, + { + "epoch": 0.24, + "grad_norm": 1.9032905394259563, + "learning_rate": 1.7841935602638997e-05, + "loss": 0.9195, + "step": 1587 + }, + { + "epoch": 0.24, + "grad_norm": 1.6859178002349369, + "learning_rate": 1.783893559542865e-05, + "loss": 0.885, + "step": 1588 + }, + { + "epoch": 0.24, + "grad_norm": 1.5535702386924535, + "learning_rate": 1.7835933757042165e-05, + "loss": 0.8999, + "step": 1589 + }, + { + "epoch": 0.24, + "grad_norm": 1.4710355258912164, + "learning_rate": 1.7832930088180777e-05, + "loss": 0.8353, + "step": 1590 + }, + { + "epoch": 0.24, + "grad_norm": 1.515526926908418, + "learning_rate": 1.782992458954614e-05, + "loss": 0.8078, + "step": 1591 + }, + { + "epoch": 0.24, + "grad_norm": 1.4536267293731475, + "learning_rate": 1.7826917261840337e-05, + "loss": 0.8524, + "step": 1592 + }, + { + "epoch": 0.24, + "grad_norm": 1.4877227876822239, + "learning_rate": 1.7823908105765883e-05, + "loss": 0.9063, + "step": 1593 + }, + { + "epoch": 0.24, + "grad_norm": 1.3361140839951366, + "learning_rate": 1.7820897122025717e-05, + "loss": 0.8861, + "step": 1594 + }, + { + "epoch": 0.24, + "grad_norm": 1.6146067722731678, + "learning_rate": 1.7817884311323203e-05, + "loss": 0.8947, + "step": 1595 + }, + { + "epoch": 0.24, + "grad_norm": 1.415559239996958, + "learning_rate": 1.7814869674362133e-05, + "loss": 0.898, + "step": 1596 + }, + { + "epoch": 0.24, + "grad_norm": 1.8070587128079143, + "learning_rate": 1.781185321184673e-05, + "loss": 0.9469, + "step": 1597 + }, + { + "epoch": 0.24, + "grad_norm": 1.6918271919710215, + "learning_rate": 1.780883492448164e-05, + "loss": 0.8558, + "step": 1598 + }, + { + "epoch": 0.24, + "grad_norm": 0.9394417938532487, + "learning_rate": 1.780581481297193e-05, + "loss": 0.3136, + "step": 1599 + }, + { + "epoch": 0.24, + "grad_norm": 1.3710380329592233, + "learning_rate": 1.7802792878023108e-05, + "loss": 0.8846, + "step": 1600 + }, + { + "epoch": 0.24, + "grad_norm": 1.621653634787497, + "learning_rate": 1.779976912034109e-05, + "loss": 0.9499, + "step": 1601 + }, + { + "epoch": 0.24, + "grad_norm": 1.49535069053136, + "learning_rate": 1.7796743540632226e-05, + "loss": 0.914, + "step": 1602 + }, + { + "epoch": 0.24, + "grad_norm": 1.5287861793456374, + "learning_rate": 1.7793716139603297e-05, + "loss": 0.8583, + "step": 1603 + }, + { + "epoch": 0.24, + "grad_norm": 1.6938236693429272, + "learning_rate": 1.7790686917961498e-05, + "loss": 0.8335, + "step": 1604 + }, + { + "epoch": 0.24, + "grad_norm": 1.5387393199407082, + "learning_rate": 1.7787655876414463e-05, + "loss": 0.844, + "step": 1605 + }, + { + "epoch": 0.24, + "grad_norm": 1.5769405086202581, + "learning_rate": 1.7784623015670237e-05, + "loss": 0.9482, + "step": 1606 + }, + { + "epoch": 0.24, + "grad_norm": 1.422886086138693, + "learning_rate": 1.77815883364373e-05, + "loss": 0.9369, + "step": 1607 + }, + { + "epoch": 0.24, + "grad_norm": 0.9304298204162212, + "learning_rate": 1.777855183942455e-05, + "loss": 0.3799, + "step": 1608 + }, + { + "epoch": 0.24, + "grad_norm": 1.6124495672824373, + "learning_rate": 1.7775513525341318e-05, + "loss": 0.827, + "step": 1609 + }, + { + "epoch": 0.24, + "grad_norm": 1.7060739694083267, + "learning_rate": 1.777247339489735e-05, + "loss": 0.8808, + "step": 1610 + }, + { + "epoch": 0.24, + "grad_norm": 1.5229384717595726, + "learning_rate": 1.7769431448802824e-05, + "loss": 0.9077, + "step": 1611 + }, + { + "epoch": 0.24, + "grad_norm": 1.5395189579010597, + "learning_rate": 1.7766387687768338e-05, + "loss": 0.8656, + "step": 1612 + }, + { + "epoch": 0.24, + "grad_norm": 1.4780489060209367, + "learning_rate": 1.7763342112504913e-05, + "loss": 0.942, + "step": 1613 + }, + { + "epoch": 0.24, + "grad_norm": 1.4799745384004357, + "learning_rate": 1.7760294723724e-05, + "loss": 0.8457, + "step": 1614 + }, + { + "epoch": 0.24, + "grad_norm": 1.4684627472723826, + "learning_rate": 1.775724552213746e-05, + "loss": 0.8746, + "step": 1615 + }, + { + "epoch": 0.24, + "grad_norm": 1.6040790935063485, + "learning_rate": 1.7754194508457602e-05, + "loss": 0.9138, + "step": 1616 + }, + { + "epoch": 0.24, + "grad_norm": 1.35682520205269, + "learning_rate": 1.7751141683397128e-05, + "loss": 0.8522, + "step": 1617 + }, + { + "epoch": 0.24, + "grad_norm": 1.3353106341381324, + "learning_rate": 1.7748087047669186e-05, + "loss": 0.9063, + "step": 1618 + }, + { + "epoch": 0.24, + "grad_norm": 1.8100525305630744, + "learning_rate": 1.7745030601987338e-05, + "loss": 0.9223, + "step": 1619 + }, + { + "epoch": 0.24, + "grad_norm": 1.6092339721248268, + "learning_rate": 1.774197234706557e-05, + "loss": 0.9258, + "step": 1620 + }, + { + "epoch": 0.24, + "grad_norm": 1.7458819446495246, + "learning_rate": 1.7738912283618293e-05, + "loss": 0.9343, + "step": 1621 + }, + { + "epoch": 0.24, + "grad_norm": 1.7811148199017655, + "learning_rate": 1.7735850412360332e-05, + "loss": 0.8773, + "step": 1622 + }, + { + "epoch": 0.24, + "grad_norm": 1.3201410256999138, + "learning_rate": 1.7732786734006947e-05, + "loss": 0.8987, + "step": 1623 + }, + { + "epoch": 0.24, + "grad_norm": 1.506969757466883, + "learning_rate": 1.772972124927381e-05, + "loss": 0.951, + "step": 1624 + }, + { + "epoch": 0.24, + "grad_norm": 1.5858392367904162, + "learning_rate": 1.7726653958877016e-05, + "loss": 0.874, + "step": 1625 + }, + { + "epoch": 0.24, + "grad_norm": 1.6850626361525405, + "learning_rate": 1.772358486353309e-05, + "loss": 0.867, + "step": 1626 + }, + { + "epoch": 0.24, + "grad_norm": 1.5132178985557543, + "learning_rate": 1.772051396395897e-05, + "loss": 0.9403, + "step": 1627 + }, + { + "epoch": 0.24, + "grad_norm": 1.5875971923008991, + "learning_rate": 1.771744126087202e-05, + "loss": 0.9398, + "step": 1628 + }, + { + "epoch": 0.24, + "grad_norm": 1.3874449433591847, + "learning_rate": 1.771436675499002e-05, + "loss": 0.9517, + "step": 1629 + }, + { + "epoch": 0.24, + "grad_norm": 1.6778689098484927, + "learning_rate": 1.771129044703118e-05, + "loss": 0.9133, + "step": 1630 + }, + { + "epoch": 0.24, + "grad_norm": 1.5965843953852386, + "learning_rate": 1.770821233771412e-05, + "loss": 0.9315, + "step": 1631 + }, + { + "epoch": 0.24, + "grad_norm": 1.480797115990424, + "learning_rate": 1.7705132427757895e-05, + "loss": 0.9489, + "step": 1632 + }, + { + "epoch": 0.24, + "grad_norm": 0.8642756347111773, + "learning_rate": 1.7702050717881966e-05, + "loss": 0.3473, + "step": 1633 + }, + { + "epoch": 0.24, + "grad_norm": 1.4990152302868358, + "learning_rate": 1.7698967208806216e-05, + "loss": 0.9048, + "step": 1634 + }, + { + "epoch": 0.24, + "grad_norm": 1.485270652222611, + "learning_rate": 1.7695881901250966e-05, + "loss": 0.9014, + "step": 1635 + }, + { + "epoch": 0.24, + "grad_norm": 1.5214858759038965, + "learning_rate": 1.7692794795936933e-05, + "loss": 0.896, + "step": 1636 + }, + { + "epoch": 0.24, + "grad_norm": 1.4474856682989883, + "learning_rate": 1.7689705893585273e-05, + "loss": 0.9221, + "step": 1637 + }, + { + "epoch": 0.24, + "grad_norm": 1.5294260479584558, + "learning_rate": 1.768661519491755e-05, + "loss": 0.9252, + "step": 1638 + }, + { + "epoch": 0.24, + "grad_norm": 1.5849465529940199, + "learning_rate": 1.7683522700655748e-05, + "loss": 0.9858, + "step": 1639 + }, + { + "epoch": 0.24, + "grad_norm": 0.7816169669656882, + "learning_rate": 1.768042841152228e-05, + "loss": 0.3248, + "step": 1640 + }, + { + "epoch": 0.24, + "grad_norm": 1.378251448439808, + "learning_rate": 1.767733232823997e-05, + "loss": 0.9562, + "step": 1641 + }, + { + "epoch": 0.25, + "grad_norm": 0.76066055704607, + "learning_rate": 1.7674234451532065e-05, + "loss": 0.308, + "step": 1642 + }, + { + "epoch": 0.25, + "grad_norm": 0.8072615083001393, + "learning_rate": 1.7671134782122225e-05, + "loss": 0.3364, + "step": 1643 + }, + { + "epoch": 0.25, + "grad_norm": 1.4158020503018405, + "learning_rate": 1.7668033320734536e-05, + "loss": 0.8818, + "step": 1644 + }, + { + "epoch": 0.25, + "grad_norm": 1.5579198244318886, + "learning_rate": 1.76649300680935e-05, + "loss": 0.9414, + "step": 1645 + }, + { + "epoch": 0.25, + "grad_norm": 1.4821532688374293, + "learning_rate": 1.7661825024924035e-05, + "loss": 0.9064, + "step": 1646 + }, + { + "epoch": 0.25, + "grad_norm": 1.532267404015606, + "learning_rate": 1.7658718191951483e-05, + "loss": 0.9663, + "step": 1647 + }, + { + "epoch": 0.25, + "grad_norm": 1.4236472895400747, + "learning_rate": 1.76556095699016e-05, + "loss": 0.9092, + "step": 1648 + }, + { + "epoch": 0.25, + "grad_norm": 1.4394290112011618, + "learning_rate": 1.7652499159500554e-05, + "loss": 0.9262, + "step": 1649 + }, + { + "epoch": 0.25, + "grad_norm": 0.9082894509829694, + "learning_rate": 1.7649386961474944e-05, + "loss": 0.3479, + "step": 1650 + }, + { + "epoch": 0.25, + "grad_norm": 1.5694811646372455, + "learning_rate": 1.764627297655178e-05, + "loss": 0.8595, + "step": 1651 + }, + { + "epoch": 0.25, + "grad_norm": 1.778224908991867, + "learning_rate": 1.7643157205458483e-05, + "loss": 0.9557, + "step": 1652 + }, + { + "epoch": 0.25, + "grad_norm": 1.7570159431849186, + "learning_rate": 1.7640039648922905e-05, + "loss": 0.9355, + "step": 1653 + }, + { + "epoch": 0.25, + "grad_norm": 1.535934897475885, + "learning_rate": 1.7636920307673305e-05, + "loss": 0.9163, + "step": 1654 + }, + { + "epoch": 0.25, + "grad_norm": 1.500786112142508, + "learning_rate": 1.7633799182438355e-05, + "loss": 0.8763, + "step": 1655 + }, + { + "epoch": 0.25, + "grad_norm": 0.7958521536960721, + "learning_rate": 1.7630676273947157e-05, + "loss": 0.3603, + "step": 1656 + }, + { + "epoch": 0.25, + "grad_norm": 1.5405759425893217, + "learning_rate": 1.7627551582929223e-05, + "loss": 0.8867, + "step": 1657 + }, + { + "epoch": 0.25, + "grad_norm": 1.4105335679188502, + "learning_rate": 1.762442511011448e-05, + "loss": 0.8636, + "step": 1658 + }, + { + "epoch": 0.25, + "grad_norm": 1.6396145007703673, + "learning_rate": 1.762129685623327e-05, + "loss": 0.8969, + "step": 1659 + }, + { + "epoch": 0.25, + "grad_norm": 1.6454153765062922, + "learning_rate": 1.7618166822016358e-05, + "loss": 0.9028, + "step": 1660 + }, + { + "epoch": 0.25, + "grad_norm": 1.6597883658356147, + "learning_rate": 1.7615035008194914e-05, + "loss": 0.852, + "step": 1661 + }, + { + "epoch": 0.25, + "grad_norm": 1.4238215944744783, + "learning_rate": 1.7611901415500536e-05, + "loss": 0.8634, + "step": 1662 + }, + { + "epoch": 0.25, + "grad_norm": 1.3317438945524687, + "learning_rate": 1.7608766044665227e-05, + "loss": 0.8193, + "step": 1663 + }, + { + "epoch": 0.25, + "grad_norm": 1.5834953879779157, + "learning_rate": 1.7605628896421412e-05, + "loss": 0.8995, + "step": 1664 + }, + { + "epoch": 0.25, + "grad_norm": 1.6297456879070857, + "learning_rate": 1.760248997150193e-05, + "loss": 0.9008, + "step": 1665 + }, + { + "epoch": 0.25, + "grad_norm": 1.3606229375868286, + "learning_rate": 1.7599349270640034e-05, + "loss": 0.9694, + "step": 1666 + }, + { + "epoch": 0.25, + "grad_norm": 1.428091273337352, + "learning_rate": 1.759620679456939e-05, + "loss": 0.8583, + "step": 1667 + }, + { + "epoch": 0.25, + "grad_norm": 1.5387743968272987, + "learning_rate": 1.7593062544024084e-05, + "loss": 0.8277, + "step": 1668 + }, + { + "epoch": 0.25, + "grad_norm": 1.4367561605643178, + "learning_rate": 1.758991651973861e-05, + "loss": 0.8267, + "step": 1669 + }, + { + "epoch": 0.25, + "grad_norm": 1.5360877563358466, + "learning_rate": 1.758676872244788e-05, + "loss": 0.946, + "step": 1670 + }, + { + "epoch": 0.25, + "grad_norm": 1.6575018452365091, + "learning_rate": 1.7583619152887222e-05, + "loss": 0.8916, + "step": 1671 + }, + { + "epoch": 0.25, + "grad_norm": 1.5189025333321005, + "learning_rate": 1.7580467811792374e-05, + "loss": 0.8831, + "step": 1672 + }, + { + "epoch": 0.25, + "grad_norm": 1.4889245276310539, + "learning_rate": 1.7577314699899486e-05, + "loss": 0.962, + "step": 1673 + }, + { + "epoch": 0.25, + "grad_norm": 1.5920227440503765, + "learning_rate": 1.7574159817945134e-05, + "loss": 0.8194, + "step": 1674 + }, + { + "epoch": 0.25, + "grad_norm": 1.6663380352263728, + "learning_rate": 1.7571003166666295e-05, + "loss": 0.9153, + "step": 1675 + }, + { + "epoch": 0.25, + "grad_norm": 1.5568655430691456, + "learning_rate": 1.756784474680036e-05, + "loss": 0.9206, + "step": 1676 + }, + { + "epoch": 0.25, + "grad_norm": 1.5313420778927918, + "learning_rate": 1.7564684559085138e-05, + "loss": 0.9114, + "step": 1677 + }, + { + "epoch": 0.25, + "grad_norm": 0.7955562493684992, + "learning_rate": 1.756152260425885e-05, + "loss": 0.3385, + "step": 1678 + }, + { + "epoch": 0.25, + "grad_norm": 1.791077961143688, + "learning_rate": 1.7558358883060128e-05, + "loss": 0.9402, + "step": 1679 + }, + { + "epoch": 0.25, + "grad_norm": 1.6186353518599614, + "learning_rate": 1.7555193396228015e-05, + "loss": 0.8572, + "step": 1680 + }, + { + "epoch": 0.25, + "grad_norm": 1.5337082328468437, + "learning_rate": 1.7552026144501976e-05, + "loss": 1.0438, + "step": 1681 + }, + { + "epoch": 0.25, + "grad_norm": 1.946323407870686, + "learning_rate": 1.7548857128621878e-05, + "loss": 0.8694, + "step": 1682 + }, + { + "epoch": 0.25, + "grad_norm": 1.6462468216871893, + "learning_rate": 1.7545686349328e-05, + "loss": 0.9585, + "step": 1683 + }, + { + "epoch": 0.25, + "grad_norm": 1.5771995772346616, + "learning_rate": 1.754251380736104e-05, + "loss": 0.9554, + "step": 1684 + }, + { + "epoch": 0.25, + "grad_norm": 1.5107802119336817, + "learning_rate": 1.7539339503462103e-05, + "loss": 0.8278, + "step": 1685 + }, + { + "epoch": 0.25, + "grad_norm": 1.5948314643925587, + "learning_rate": 1.753616343837271e-05, + "loss": 0.8782, + "step": 1686 + }, + { + "epoch": 0.25, + "grad_norm": 1.5517116597383998, + "learning_rate": 1.753298561283478e-05, + "loss": 0.9333, + "step": 1687 + }, + { + "epoch": 0.25, + "grad_norm": 1.7926694624360804, + "learning_rate": 1.7529806027590668e-05, + "loss": 0.9011, + "step": 1688 + }, + { + "epoch": 0.25, + "grad_norm": 1.4242999599319386, + "learning_rate": 1.7526624683383114e-05, + "loss": 1.0023, + "step": 1689 + }, + { + "epoch": 0.25, + "grad_norm": 1.4756871055595069, + "learning_rate": 1.752344158095528e-05, + "loss": 0.866, + "step": 1690 + }, + { + "epoch": 0.25, + "grad_norm": 1.5269401155973021, + "learning_rate": 1.752025672105075e-05, + "loss": 0.9106, + "step": 1691 + }, + { + "epoch": 0.25, + "grad_norm": 1.6514765659791704, + "learning_rate": 1.7517070104413497e-05, + "loss": 0.8834, + "step": 1692 + }, + { + "epoch": 0.25, + "grad_norm": 1.466126278503492, + "learning_rate": 1.7513881731787924e-05, + "loss": 0.95, + "step": 1693 + }, + { + "epoch": 0.25, + "grad_norm": 1.4524000407796849, + "learning_rate": 1.7510691603918825e-05, + "loss": 0.8784, + "step": 1694 + }, + { + "epoch": 0.25, + "grad_norm": 1.7410398325393686, + "learning_rate": 1.750749972155142e-05, + "loss": 0.845, + "step": 1695 + }, + { + "epoch": 0.25, + "grad_norm": 1.5777950166125367, + "learning_rate": 1.7504306085431334e-05, + "loss": 0.965, + "step": 1696 + }, + { + "epoch": 0.25, + "grad_norm": 1.5838872906840074, + "learning_rate": 1.7501110696304598e-05, + "loss": 0.9294, + "step": 1697 + }, + { + "epoch": 0.25, + "grad_norm": 1.5369772478631567, + "learning_rate": 1.7497913554917656e-05, + "loss": 0.9132, + "step": 1698 + }, + { + "epoch": 0.25, + "grad_norm": 1.3601257098393902, + "learning_rate": 1.749471466201736e-05, + "loss": 0.8909, + "step": 1699 + }, + { + "epoch": 0.25, + "grad_norm": 1.5347499479608235, + "learning_rate": 1.7491514018350974e-05, + "loss": 0.8639, + "step": 1700 + }, + { + "epoch": 0.25, + "grad_norm": 1.5490947412133416, + "learning_rate": 1.7488311624666165e-05, + "loss": 0.8829, + "step": 1701 + }, + { + "epoch": 0.25, + "grad_norm": 1.5543988693147053, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.941, + "step": 1702 + }, + { + "epoch": 0.25, + "grad_norm": 1.6548435316823147, + "learning_rate": 1.748190159023401e-05, + "loss": 1.0323, + "step": 1703 + }, + { + "epoch": 0.25, + "grad_norm": 1.339462331638418, + "learning_rate": 1.747869395098405e-05, + "loss": 0.9051, + "step": 1704 + }, + { + "epoch": 0.25, + "grad_norm": 1.8201796335838167, + "learning_rate": 1.7475484564710437e-05, + "loss": 0.8872, + "step": 1705 + }, + { + "epoch": 0.25, + "grad_norm": 1.6880989562572777, + "learning_rate": 1.7472273432162886e-05, + "loss": 0.8475, + "step": 1706 + }, + { + "epoch": 0.25, + "grad_norm": 1.611526301363743, + "learning_rate": 1.7469060554091518e-05, + "loss": 0.8852, + "step": 1707 + }, + { + "epoch": 0.25, + "grad_norm": 1.6037891104770856, + "learning_rate": 1.7465845931246858e-05, + "loss": 0.8353, + "step": 1708 + }, + { + "epoch": 0.25, + "grad_norm": 1.5586701340947895, + "learning_rate": 1.7462629564379846e-05, + "loss": 0.8964, + "step": 1709 + }, + { + "epoch": 0.26, + "grad_norm": 1.4803400744931496, + "learning_rate": 1.7459411454241822e-05, + "loss": 0.8311, + "step": 1710 + }, + { + "epoch": 0.26, + "grad_norm": 1.4170972934968988, + "learning_rate": 1.7456191601584544e-05, + "loss": 0.8556, + "step": 1711 + }, + { + "epoch": 0.26, + "grad_norm": 1.7313153185296979, + "learning_rate": 1.745297000716016e-05, + "loss": 0.866, + "step": 1712 + }, + { + "epoch": 0.26, + "grad_norm": 1.5203219014060387, + "learning_rate": 1.7449746671721243e-05, + "loss": 0.8972, + "step": 1713 + }, + { + "epoch": 0.26, + "grad_norm": 1.4457192583170824, + "learning_rate": 1.744652159602076e-05, + "loss": 0.9248, + "step": 1714 + }, + { + "epoch": 0.26, + "grad_norm": 1.3942818973038391, + "learning_rate": 1.744329478081209e-05, + "loss": 0.9204, + "step": 1715 + }, + { + "epoch": 0.26, + "grad_norm": 1.4531075562435056, + "learning_rate": 1.744006622684902e-05, + "loss": 0.8871, + "step": 1716 + }, + { + "epoch": 0.26, + "grad_norm": 1.5789970700730995, + "learning_rate": 1.7436835934885735e-05, + "loss": 0.9262, + "step": 1717 + }, + { + "epoch": 0.26, + "grad_norm": 1.5611116320765956, + "learning_rate": 1.743360390567684e-05, + "loss": 0.8636, + "step": 1718 + }, + { + "epoch": 0.26, + "grad_norm": 1.467310440937347, + "learning_rate": 1.7430370139977327e-05, + "loss": 0.8532, + "step": 1719 + }, + { + "epoch": 0.26, + "grad_norm": 1.4069213853948344, + "learning_rate": 1.7427134638542612e-05, + "loss": 0.9562, + "step": 1720 + }, + { + "epoch": 0.26, + "grad_norm": 0.8629732208129236, + "learning_rate": 1.7423897402128505e-05, + "loss": 0.2969, + "step": 1721 + }, + { + "epoch": 0.26, + "grad_norm": 1.5794908589828864, + "learning_rate": 1.7420658431491224e-05, + "loss": 0.9728, + "step": 1722 + }, + { + "epoch": 0.26, + "grad_norm": 1.4555251810863923, + "learning_rate": 1.7417417727387392e-05, + "loss": 0.9158, + "step": 1723 + }, + { + "epoch": 0.26, + "grad_norm": 1.4490216871807111, + "learning_rate": 1.7414175290574044e-05, + "loss": 0.885, + "step": 1724 + }, + { + "epoch": 0.26, + "grad_norm": 2.0688220651181375, + "learning_rate": 1.741093112180861e-05, + "loss": 0.9506, + "step": 1725 + }, + { + "epoch": 0.26, + "grad_norm": 1.4449212317917777, + "learning_rate": 1.7407685221848925e-05, + "loss": 0.8994, + "step": 1726 + }, + { + "epoch": 0.26, + "grad_norm": 1.4599573524456597, + "learning_rate": 1.7404437591453237e-05, + "loss": 0.9289, + "step": 1727 + }, + { + "epoch": 0.26, + "grad_norm": 1.5625676068938543, + "learning_rate": 1.7401188231380185e-05, + "loss": 0.973, + "step": 1728 + }, + { + "epoch": 0.26, + "grad_norm": 1.4693554672713998, + "learning_rate": 1.739793714238883e-05, + "loss": 0.9168, + "step": 1729 + }, + { + "epoch": 0.26, + "grad_norm": 1.6503806164680692, + "learning_rate": 1.7394684325238616e-05, + "loss": 0.8105, + "step": 1730 + }, + { + "epoch": 0.26, + "grad_norm": 1.5745296703674048, + "learning_rate": 1.7391429780689414e-05, + "loss": 0.9836, + "step": 1731 + }, + { + "epoch": 0.26, + "grad_norm": 1.4485110292525867, + "learning_rate": 1.7388173509501475e-05, + "loss": 0.9437, + "step": 1732 + }, + { + "epoch": 0.26, + "grad_norm": 1.5343799885243359, + "learning_rate": 1.7384915512435466e-05, + "loss": 0.9329, + "step": 1733 + }, + { + "epoch": 0.26, + "grad_norm": 1.6859775492495856, + "learning_rate": 1.738165579025246e-05, + "loss": 0.8766, + "step": 1734 + }, + { + "epoch": 0.26, + "grad_norm": 1.7636578528690656, + "learning_rate": 1.737839434371393e-05, + "loss": 0.884, + "step": 1735 + }, + { + "epoch": 0.26, + "grad_norm": 1.5276155041013695, + "learning_rate": 1.737513117358174e-05, + "loss": 0.915, + "step": 1736 + }, + { + "epoch": 0.26, + "grad_norm": 0.792478974482549, + "learning_rate": 1.7371866280618176e-05, + "loss": 0.326, + "step": 1737 + }, + { + "epoch": 0.26, + "grad_norm": 1.4961026604787953, + "learning_rate": 1.7368599665585916e-05, + "loss": 0.9006, + "step": 1738 + }, + { + "epoch": 0.26, + "grad_norm": 1.5274209999344202, + "learning_rate": 1.7365331329248035e-05, + "loss": 0.8725, + "step": 1739 + }, + { + "epoch": 0.26, + "grad_norm": 1.351068948198467, + "learning_rate": 1.7362061272368026e-05, + "loss": 0.9436, + "step": 1740 + }, + { + "epoch": 0.26, + "grad_norm": 1.5360903912923594, + "learning_rate": 1.735878949570977e-05, + "loss": 0.8497, + "step": 1741 + }, + { + "epoch": 0.26, + "grad_norm": 1.3497630823993179, + "learning_rate": 1.7355516000037555e-05, + "loss": 0.9142, + "step": 1742 + }, + { + "epoch": 0.26, + "grad_norm": 1.5809404969285956, + "learning_rate": 1.7352240786116068e-05, + "loss": 0.8947, + "step": 1743 + }, + { + "epoch": 0.26, + "grad_norm": 1.4846738357643812, + "learning_rate": 1.73489638547104e-05, + "loss": 0.9025, + "step": 1744 + }, + { + "epoch": 0.26, + "grad_norm": 1.624635000914181, + "learning_rate": 1.7345685206586045e-05, + "loss": 0.8762, + "step": 1745 + }, + { + "epoch": 0.26, + "grad_norm": 1.830149285837252, + "learning_rate": 1.7342404842508896e-05, + "loss": 0.8836, + "step": 1746 + }, + { + "epoch": 0.26, + "grad_norm": 1.461486520725363, + "learning_rate": 1.733912276324524e-05, + "loss": 0.8534, + "step": 1747 + }, + { + "epoch": 0.26, + "grad_norm": 1.434998799493417, + "learning_rate": 1.7335838969561777e-05, + "loss": 0.9205, + "step": 1748 + }, + { + "epoch": 0.26, + "grad_norm": 1.6244167337949473, + "learning_rate": 1.7332553462225604e-05, + "loss": 0.8886, + "step": 1749 + }, + { + "epoch": 0.26, + "grad_norm": 1.6222076398074747, + "learning_rate": 1.7329266242004205e-05, + "loss": 0.8626, + "step": 1750 + }, + { + "epoch": 0.26, + "grad_norm": 1.5200145285107727, + "learning_rate": 1.7325977309665485e-05, + "loss": 1.0063, + "step": 1751 + }, + { + "epoch": 0.26, + "grad_norm": 1.451699862945971, + "learning_rate": 1.7322686665977738e-05, + "loss": 0.8288, + "step": 1752 + }, + { + "epoch": 0.26, + "grad_norm": 1.9622903219057697, + "learning_rate": 1.7319394311709655e-05, + "loss": 0.9016, + "step": 1753 + }, + { + "epoch": 0.26, + "grad_norm": 1.3631012382364607, + "learning_rate": 1.731610024763033e-05, + "loss": 0.943, + "step": 1754 + }, + { + "epoch": 0.26, + "grad_norm": 1.69183243256369, + "learning_rate": 1.731280447450926e-05, + "loss": 0.8885, + "step": 1755 + }, + { + "epoch": 0.26, + "grad_norm": 1.5841491510032268, + "learning_rate": 1.7309506993116333e-05, + "loss": 0.8766, + "step": 1756 + }, + { + "epoch": 0.26, + "grad_norm": 1.5842091051694704, + "learning_rate": 1.7306207804221845e-05, + "loss": 1.013, + "step": 1757 + }, + { + "epoch": 0.26, + "grad_norm": 1.5566617085980659, + "learning_rate": 1.7302906908596487e-05, + "loss": 0.8763, + "step": 1758 + }, + { + "epoch": 0.26, + "grad_norm": 1.4113051668924068, + "learning_rate": 1.729960430701135e-05, + "loss": 0.8902, + "step": 1759 + }, + { + "epoch": 0.26, + "grad_norm": 1.5081173429273016, + "learning_rate": 1.7296300000237917e-05, + "loss": 0.997, + "step": 1760 + }, + { + "epoch": 0.26, + "grad_norm": 1.4691870494415773, + "learning_rate": 1.7292993989048076e-05, + "loss": 0.8829, + "step": 1761 + }, + { + "epoch": 0.26, + "grad_norm": 1.5797824647838894, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.928, + "step": 1762 + }, + { + "epoch": 0.26, + "grad_norm": 1.43389339394806, + "learning_rate": 1.7286376856508714e-05, + "loss": 0.9845, + "step": 1763 + }, + { + "epoch": 0.26, + "grad_norm": 1.4309202791520272, + "learning_rate": 1.7283065736704956e-05, + "loss": 0.9778, + "step": 1764 + }, + { + "epoch": 0.26, + "grad_norm": 1.5539428890256648, + "learning_rate": 1.7279752915576312e-05, + "loss": 0.852, + "step": 1765 + }, + { + "epoch": 0.26, + "grad_norm": 1.4520281464163747, + "learning_rate": 1.7276438393896663e-05, + "loss": 0.8648, + "step": 1766 + }, + { + "epoch": 0.26, + "grad_norm": 1.6531081314163723, + "learning_rate": 1.727312217244028e-05, + "loss": 0.8553, + "step": 1767 + }, + { + "epoch": 0.26, + "grad_norm": 1.4559817625290705, + "learning_rate": 1.7269804251981835e-05, + "loss": 0.9928, + "step": 1768 + }, + { + "epoch": 0.26, + "grad_norm": 1.521372074174493, + "learning_rate": 1.726648463329639e-05, + "loss": 0.9311, + "step": 1769 + }, + { + "epoch": 0.26, + "grad_norm": 1.39292238696397, + "learning_rate": 1.726316331715941e-05, + "loss": 0.9482, + "step": 1770 + }, + { + "epoch": 0.26, + "grad_norm": 1.6238781226882557, + "learning_rate": 1.7259840304346757e-05, + "loss": 0.866, + "step": 1771 + }, + { + "epoch": 0.26, + "grad_norm": 1.427991969722996, + "learning_rate": 1.7256515595634688e-05, + "loss": 0.9149, + "step": 1772 + }, + { + "epoch": 0.26, + "grad_norm": 1.3335333993191922, + "learning_rate": 1.7253189191799853e-05, + "loss": 0.9076, + "step": 1773 + }, + { + "epoch": 0.26, + "grad_norm": 1.472499218013164, + "learning_rate": 1.7249861093619298e-05, + "loss": 0.8997, + "step": 1774 + }, + { + "epoch": 0.26, + "grad_norm": 1.6890628308746691, + "learning_rate": 1.7246531301870467e-05, + "loss": 0.9985, + "step": 1775 + }, + { + "epoch": 0.26, + "grad_norm": 1.7011530663800565, + "learning_rate": 1.7243199817331207e-05, + "loss": 1.0012, + "step": 1776 + }, + { + "epoch": 0.27, + "grad_norm": 1.9561611437955488, + "learning_rate": 1.7239866640779745e-05, + "loss": 0.9127, + "step": 1777 + }, + { + "epoch": 0.27, + "grad_norm": 1.4494502468755988, + "learning_rate": 1.7236531772994714e-05, + "loss": 0.9929, + "step": 1778 + }, + { + "epoch": 0.27, + "grad_norm": 1.5584496393241531, + "learning_rate": 1.723319521475514e-05, + "loss": 0.9213, + "step": 1779 + }, + { + "epoch": 0.27, + "grad_norm": 1.4555365582732622, + "learning_rate": 1.7229856966840444e-05, + "loss": 0.8719, + "step": 1780 + }, + { + "epoch": 0.27, + "grad_norm": 1.5727131597575448, + "learning_rate": 1.7226517030030444e-05, + "loss": 0.9302, + "step": 1781 + }, + { + "epoch": 0.27, + "grad_norm": 1.495720422096291, + "learning_rate": 1.722317540510534e-05, + "loss": 0.8064, + "step": 1782 + }, + { + "epoch": 0.27, + "grad_norm": 1.415398908027525, + "learning_rate": 1.7219832092845746e-05, + "loss": 0.9454, + "step": 1783 + }, + { + "epoch": 0.27, + "grad_norm": 1.4238637709112882, + "learning_rate": 1.7216487094032653e-05, + "loss": 0.9063, + "step": 1784 + }, + { + "epoch": 0.27, + "grad_norm": 1.4713464307722328, + "learning_rate": 1.7213140409447455e-05, + "loss": 0.8112, + "step": 1785 + }, + { + "epoch": 0.27, + "grad_norm": 1.3735514428630609, + "learning_rate": 1.7209792039871942e-05, + "loss": 0.8631, + "step": 1786 + }, + { + "epoch": 0.27, + "grad_norm": 1.4875295284990784, + "learning_rate": 1.720644198608829e-05, + "loss": 0.8771, + "step": 1787 + }, + { + "epoch": 0.27, + "grad_norm": 1.508326494733138, + "learning_rate": 1.720309024887907e-05, + "loss": 0.9088, + "step": 1788 + }, + { + "epoch": 0.27, + "grad_norm": 1.6146606431930937, + "learning_rate": 1.7199736829027252e-05, + "loss": 0.9049, + "step": 1789 + }, + { + "epoch": 0.27, + "grad_norm": 1.516122841920055, + "learning_rate": 1.7196381727316192e-05, + "loss": 0.8811, + "step": 1790 + }, + { + "epoch": 0.27, + "grad_norm": 1.4806277282829798, + "learning_rate": 1.7193024944529647e-05, + "loss": 0.8732, + "step": 1791 + }, + { + "epoch": 0.27, + "grad_norm": 1.4559408235450584, + "learning_rate": 1.7189666481451755e-05, + "loss": 0.9023, + "step": 1792 + }, + { + "epoch": 0.27, + "grad_norm": 1.4453281936566724, + "learning_rate": 1.7186306338867055e-05, + "loss": 0.9596, + "step": 1793 + }, + { + "epoch": 0.27, + "grad_norm": 1.498584277906921, + "learning_rate": 1.7182944517560483e-05, + "loss": 0.8495, + "step": 1794 + }, + { + "epoch": 0.27, + "grad_norm": 1.816898789845504, + "learning_rate": 1.7179581018317354e-05, + "loss": 0.8659, + "step": 1795 + }, + { + "epoch": 0.27, + "grad_norm": 3.7775574133563197, + "learning_rate": 1.7176215841923385e-05, + "loss": 0.8375, + "step": 1796 + }, + { + "epoch": 0.27, + "grad_norm": 1.4353467710993535, + "learning_rate": 1.717284898916468e-05, + "loss": 0.8893, + "step": 1797 + }, + { + "epoch": 0.27, + "grad_norm": 1.4930765015281215, + "learning_rate": 1.7169480460827734e-05, + "loss": 0.9407, + "step": 1798 + }, + { + "epoch": 0.27, + "grad_norm": 1.5119016940782954, + "learning_rate": 1.716611025769944e-05, + "loss": 0.8741, + "step": 1799 + }, + { + "epoch": 0.27, + "grad_norm": 1.5374560607421481, + "learning_rate": 1.7162738380567077e-05, + "loss": 0.9104, + "step": 1800 + }, + { + "epoch": 0.27, + "grad_norm": 1.3886831010828815, + "learning_rate": 1.7159364830218312e-05, + "loss": 0.8395, + "step": 1801 + }, + { + "epoch": 0.27, + "grad_norm": 1.448245797696183, + "learning_rate": 1.715598960744121e-05, + "loss": 0.8416, + "step": 1802 + }, + { + "epoch": 0.27, + "grad_norm": 1.5111294590928952, + "learning_rate": 1.7152612713024226e-05, + "loss": 0.8573, + "step": 1803 + }, + { + "epoch": 0.27, + "grad_norm": 1.7186909367426648, + "learning_rate": 1.71492341477562e-05, + "loss": 0.9487, + "step": 1804 + }, + { + "epoch": 0.27, + "grad_norm": 1.6378023900766485, + "learning_rate": 1.714585391242636e-05, + "loss": 0.9359, + "step": 1805 + }, + { + "epoch": 0.27, + "grad_norm": 1.5038280119918932, + "learning_rate": 1.714247200782434e-05, + "loss": 0.906, + "step": 1806 + }, + { + "epoch": 0.27, + "grad_norm": 1.7788525810740219, + "learning_rate": 1.7139088434740142e-05, + "loss": 0.8566, + "step": 1807 + }, + { + "epoch": 0.27, + "grad_norm": 1.5776725594957457, + "learning_rate": 1.7135703193964176e-05, + "loss": 0.9065, + "step": 1808 + }, + { + "epoch": 0.27, + "grad_norm": 1.453841505756265, + "learning_rate": 1.7132316286287235e-05, + "loss": 0.9159, + "step": 1809 + }, + { + "epoch": 0.27, + "grad_norm": 1.46112002415715, + "learning_rate": 1.71289277125005e-05, + "loss": 0.9018, + "step": 1810 + }, + { + "epoch": 0.27, + "grad_norm": 1.632216149580276, + "learning_rate": 1.712553747339554e-05, + "loss": 0.9288, + "step": 1811 + }, + { + "epoch": 0.27, + "grad_norm": 1.2984613259168827, + "learning_rate": 1.712214556976431e-05, + "loss": 0.9663, + "step": 1812 + }, + { + "epoch": 0.27, + "grad_norm": 1.6099174011408217, + "learning_rate": 1.711875200239917e-05, + "loss": 0.9112, + "step": 1813 + }, + { + "epoch": 0.27, + "grad_norm": 1.605283506059939, + "learning_rate": 1.7115356772092858e-05, + "loss": 0.9828, + "step": 1814 + }, + { + "epoch": 0.27, + "grad_norm": 1.6315331928711274, + "learning_rate": 1.711195987963849e-05, + "loss": 0.9046, + "step": 1815 + }, + { + "epoch": 0.27, + "grad_norm": 1.6107117450770427, + "learning_rate": 1.7108561325829584e-05, + "loss": 0.9446, + "step": 1816 + }, + { + "epoch": 0.27, + "grad_norm": 1.6408891204130671, + "learning_rate": 1.7105161111460046e-05, + "loss": 0.9636, + "step": 1817 + }, + { + "epoch": 0.27, + "grad_norm": 1.5407244258507045, + "learning_rate": 1.7101759237324165e-05, + "loss": 0.9403, + "step": 1818 + }, + { + "epoch": 0.27, + "grad_norm": 1.4569115346658608, + "learning_rate": 1.7098355704216622e-05, + "loss": 0.9117, + "step": 1819 + }, + { + "epoch": 0.27, + "grad_norm": 1.3826627279790076, + "learning_rate": 1.7094950512932475e-05, + "loss": 0.8837, + "step": 1820 + }, + { + "epoch": 0.27, + "grad_norm": 1.5932423897031658, + "learning_rate": 1.7091543664267183e-05, + "loss": 0.9623, + "step": 1821 + }, + { + "epoch": 0.27, + "grad_norm": 1.5126250911344894, + "learning_rate": 1.7088135159016584e-05, + "loss": 0.9499, + "step": 1822 + }, + { + "epoch": 0.27, + "grad_norm": 1.4927210889833857, + "learning_rate": 1.7084724997976903e-05, + "loss": 0.8769, + "step": 1823 + }, + { + "epoch": 0.27, + "grad_norm": 1.7444005091219006, + "learning_rate": 1.708131318194476e-05, + "loss": 0.8957, + "step": 1824 + }, + { + "epoch": 0.27, + "grad_norm": 1.4384235727115369, + "learning_rate": 1.7077899711717152e-05, + "loss": 0.9226, + "step": 1825 + }, + { + "epoch": 0.27, + "grad_norm": 1.5573586969929953, + "learning_rate": 1.7074484588091465e-05, + "loss": 0.8396, + "step": 1826 + }, + { + "epoch": 0.27, + "grad_norm": 1.6943024623763063, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.9327, + "step": 1827 + }, + { + "epoch": 0.27, + "grad_norm": 1.3264942269004616, + "learning_rate": 1.706764938383734e-05, + "loss": 0.9455, + "step": 1828 + }, + { + "epoch": 0.27, + "grad_norm": 1.5792481985995255, + "learning_rate": 1.7064229304805607e-05, + "loss": 0.8752, + "step": 1829 + }, + { + "epoch": 0.27, + "grad_norm": 1.905335752327466, + "learning_rate": 1.70608075755692e-05, + "loss": 0.8723, + "step": 1830 + }, + { + "epoch": 0.27, + "grad_norm": 0.8859667078128329, + "learning_rate": 1.705738419692744e-05, + "loss": 0.3358, + "step": 1831 + }, + { + "epoch": 0.27, + "grad_norm": 1.5627065563762872, + "learning_rate": 1.7053959169680033e-05, + "loss": 0.8605, + "step": 1832 + }, + { + "epoch": 0.27, + "grad_norm": 1.6044406348575977, + "learning_rate": 1.7050532494627058e-05, + "loss": 0.9248, + "step": 1833 + }, + { + "epoch": 0.27, + "grad_norm": 1.5970990414792006, + "learning_rate": 1.704710417256899e-05, + "loss": 0.9733, + "step": 1834 + }, + { + "epoch": 0.27, + "grad_norm": 1.5942881087129774, + "learning_rate": 1.7043674204306688e-05, + "loss": 0.93, + "step": 1835 + }, + { + "epoch": 0.27, + "grad_norm": 1.466825606220935, + "learning_rate": 1.7040242590641385e-05, + "loss": 0.8589, + "step": 1836 + }, + { + "epoch": 0.27, + "grad_norm": 1.3292173388448874, + "learning_rate": 1.7036809332374713e-05, + "loss": 0.9582, + "step": 1837 + }, + { + "epoch": 0.27, + "grad_norm": 1.4421528178939231, + "learning_rate": 1.7033374430308683e-05, + "loss": 0.8817, + "step": 1838 + }, + { + "epoch": 0.27, + "grad_norm": 1.5129807210984627, + "learning_rate": 1.7029937885245682e-05, + "loss": 0.8774, + "step": 1839 + }, + { + "epoch": 0.27, + "grad_norm": 1.5214130895415245, + "learning_rate": 1.7026499697988496e-05, + "loss": 0.882, + "step": 1840 + }, + { + "epoch": 0.27, + "grad_norm": 1.6976364727070776, + "learning_rate": 1.7023059869340276e-05, + "loss": 0.956, + "step": 1841 + }, + { + "epoch": 0.27, + "grad_norm": 1.3286347161428194, + "learning_rate": 1.7019618400104572e-05, + "loss": 0.8316, + "step": 1842 + }, + { + "epoch": 0.27, + "grad_norm": 1.5186825909980486, + "learning_rate": 1.7016175291085308e-05, + "loss": 0.9076, + "step": 1843 + }, + { + "epoch": 0.28, + "grad_norm": 1.6401109304442076, + "learning_rate": 1.7012730543086798e-05, + "loss": 0.9993, + "step": 1844 + }, + { + "epoch": 0.28, + "grad_norm": 1.5035105642985296, + "learning_rate": 1.7009284156913737e-05, + "loss": 0.7926, + "step": 1845 + }, + { + "epoch": 0.28, + "grad_norm": 1.4650515021185209, + "learning_rate": 1.70058361333712e-05, + "loss": 0.9737, + "step": 1846 + }, + { + "epoch": 0.28, + "grad_norm": 1.7570150367833464, + "learning_rate": 1.700238647326464e-05, + "loss": 0.8754, + "step": 1847 + }, + { + "epoch": 0.28, + "grad_norm": 1.6313988044756296, + "learning_rate": 1.6998935177399904e-05, + "loss": 0.8843, + "step": 1848 + }, + { + "epoch": 0.28, + "grad_norm": 1.4428389337508905, + "learning_rate": 1.6995482246583215e-05, + "loss": 0.8757, + "step": 1849 + }, + { + "epoch": 0.28, + "grad_norm": 1.428385144633288, + "learning_rate": 1.699202768162117e-05, + "loss": 0.9636, + "step": 1850 + }, + { + "epoch": 0.28, + "grad_norm": 1.7392542687475825, + "learning_rate": 1.6988571483320767e-05, + "loss": 0.9013, + "step": 1851 + }, + { + "epoch": 0.28, + "grad_norm": 1.472283720333911, + "learning_rate": 1.6985113652489374e-05, + "loss": 0.9108, + "step": 1852 + }, + { + "epoch": 0.28, + "grad_norm": 1.6345929022663972, + "learning_rate": 1.698165418993473e-05, + "loss": 0.8426, + "step": 1853 + }, + { + "epoch": 0.28, + "grad_norm": 1.545556794572305, + "learning_rate": 1.697819309646497e-05, + "loss": 0.9063, + "step": 1854 + }, + { + "epoch": 0.28, + "grad_norm": 0.9339432221890696, + "learning_rate": 1.697473037288861e-05, + "loss": 0.3542, + "step": 1855 + }, + { + "epoch": 0.28, + "grad_norm": 1.393912084417808, + "learning_rate": 1.697126602001454e-05, + "loss": 0.9832, + "step": 1856 + }, + { + "epoch": 0.28, + "grad_norm": 1.6237529786345433, + "learning_rate": 1.6967800038652035e-05, + "loss": 0.9168, + "step": 1857 + }, + { + "epoch": 0.28, + "grad_norm": 1.4129856900534066, + "learning_rate": 1.6964332429610747e-05, + "loss": 0.9548, + "step": 1858 + }, + { + "epoch": 0.28, + "grad_norm": 1.5444319571559653, + "learning_rate": 1.696086319370071e-05, + "loss": 0.9455, + "step": 1859 + }, + { + "epoch": 0.28, + "grad_norm": 1.397181378998314, + "learning_rate": 1.695739233173233e-05, + "loss": 0.9599, + "step": 1860 + }, + { + "epoch": 0.28, + "grad_norm": 1.3144805233666212, + "learning_rate": 1.6953919844516415e-05, + "loss": 0.8988, + "step": 1861 + }, + { + "epoch": 0.28, + "grad_norm": 1.4844170313455844, + "learning_rate": 1.695044573286413e-05, + "loss": 0.9151, + "step": 1862 + }, + { + "epoch": 0.28, + "grad_norm": 1.6429506575694284, + "learning_rate": 1.694696999758703e-05, + "loss": 0.9013, + "step": 1863 + }, + { + "epoch": 0.28, + "grad_norm": 2.462834925554896, + "learning_rate": 1.6943492639497044e-05, + "loss": 0.8297, + "step": 1864 + }, + { + "epoch": 0.28, + "grad_norm": 1.3801884498303576, + "learning_rate": 1.6940013659406492e-05, + "loss": 0.9058, + "step": 1865 + }, + { + "epoch": 0.28, + "grad_norm": 1.486277465984611, + "learning_rate": 1.693653305812805e-05, + "loss": 0.8542, + "step": 1866 + }, + { + "epoch": 0.28, + "grad_norm": 1.7762073614813474, + "learning_rate": 1.69330508364748e-05, + "loss": 0.9188, + "step": 1867 + }, + { + "epoch": 0.28, + "grad_norm": 1.3483277679563481, + "learning_rate": 1.6929566995260184e-05, + "loss": 0.8309, + "step": 1868 + }, + { + "epoch": 0.28, + "grad_norm": 1.5490639111449134, + "learning_rate": 1.692608153529802e-05, + "loss": 0.9155, + "step": 1869 + }, + { + "epoch": 0.28, + "grad_norm": 1.4626550077248273, + "learning_rate": 1.6922594457402528e-05, + "loss": 0.9275, + "step": 1870 + }, + { + "epoch": 0.28, + "grad_norm": 1.6728514344526555, + "learning_rate": 1.691910576238828e-05, + "loss": 0.9352, + "step": 1871 + }, + { + "epoch": 0.28, + "grad_norm": 1.6905296423949694, + "learning_rate": 1.6915615451070234e-05, + "loss": 0.8731, + "step": 1872 + }, + { + "epoch": 0.28, + "grad_norm": 1.4445955044540189, + "learning_rate": 1.691212352426373e-05, + "loss": 0.9092, + "step": 1873 + }, + { + "epoch": 0.28, + "grad_norm": 1.5379695811991725, + "learning_rate": 1.690862998278448e-05, + "loss": 0.8341, + "step": 1874 + }, + { + "epoch": 0.28, + "grad_norm": 1.483547024947764, + "learning_rate": 1.690513482744858e-05, + "loss": 0.8795, + "step": 1875 + }, + { + "epoch": 0.28, + "grad_norm": 1.6764891403089985, + "learning_rate": 1.69016380590725e-05, + "loss": 0.8803, + "step": 1876 + }, + { + "epoch": 0.28, + "grad_norm": 1.4931952259312042, + "learning_rate": 1.689813967847308e-05, + "loss": 0.904, + "step": 1877 + }, + { + "epoch": 0.28, + "grad_norm": 1.3735944278809964, + "learning_rate": 1.689463968646754e-05, + "loss": 0.9055, + "step": 1878 + }, + { + "epoch": 0.28, + "grad_norm": 1.6314148550799485, + "learning_rate": 1.6891138083873486e-05, + "loss": 0.8594, + "step": 1879 + }, + { + "epoch": 0.28, + "grad_norm": 1.4852944237066374, + "learning_rate": 1.688763487150889e-05, + "loss": 0.8542, + "step": 1880 + }, + { + "epoch": 0.28, + "grad_norm": 1.543406465469015, + "learning_rate": 1.6884130050192098e-05, + "loss": 0.9442, + "step": 1881 + }, + { + "epoch": 0.28, + "grad_norm": 1.5159666927316846, + "learning_rate": 1.6880623620741843e-05, + "loss": 0.9213, + "step": 1882 + }, + { + "epoch": 0.28, + "grad_norm": 1.4719725711640013, + "learning_rate": 1.6877115583977225e-05, + "loss": 0.9367, + "step": 1883 + }, + { + "epoch": 0.28, + "grad_norm": 1.7933370133062927, + "learning_rate": 1.687360594071772e-05, + "loss": 0.8699, + "step": 1884 + }, + { + "epoch": 0.28, + "grad_norm": 1.040836183674288, + "learning_rate": 1.6870094691783182e-05, + "loss": 0.3144, + "step": 1885 + }, + { + "epoch": 0.28, + "grad_norm": 1.5923834117952378, + "learning_rate": 1.6866581837993842e-05, + "loss": 0.9466, + "step": 1886 + }, + { + "epoch": 0.28, + "grad_norm": 1.4085024814425162, + "learning_rate": 1.68630673801703e-05, + "loss": 0.8911, + "step": 1887 + }, + { + "epoch": 0.28, + "grad_norm": 1.635650363451425, + "learning_rate": 1.6859551319133534e-05, + "loss": 0.884, + "step": 1888 + }, + { + "epoch": 0.28, + "grad_norm": 1.56261987464738, + "learning_rate": 1.6856033655704894e-05, + "loss": 0.8434, + "step": 1889 + }, + { + "epoch": 0.28, + "grad_norm": 1.428811357245329, + "learning_rate": 1.685251439070611e-05, + "loss": 0.8943, + "step": 1890 + }, + { + "epoch": 0.28, + "grad_norm": 1.464265757652771, + "learning_rate": 1.6848993524959286e-05, + "loss": 0.9075, + "step": 1891 + }, + { + "epoch": 0.28, + "grad_norm": 1.6909022505379656, + "learning_rate": 1.684547105928689e-05, + "loss": 0.9051, + "step": 1892 + }, + { + "epoch": 0.28, + "grad_norm": 1.6575890573411753, + "learning_rate": 1.684194699451177e-05, + "loss": 0.9504, + "step": 1893 + }, + { + "epoch": 0.28, + "grad_norm": 1.4531623210011089, + "learning_rate": 1.6838421331457154e-05, + "loss": 0.9892, + "step": 1894 + }, + { + "epoch": 0.28, + "grad_norm": 1.6826652280886958, + "learning_rate": 1.683489407094663e-05, + "loss": 0.9738, + "step": 1895 + }, + { + "epoch": 0.28, + "grad_norm": 1.5130114363566207, + "learning_rate": 1.683136521380417e-05, + "loss": 0.9192, + "step": 1896 + }, + { + "epoch": 0.28, + "grad_norm": 1.4557444032686035, + "learning_rate": 1.682783476085412e-05, + "loss": 0.9928, + "step": 1897 + }, + { + "epoch": 0.28, + "grad_norm": 1.6474761436452572, + "learning_rate": 1.6824302712921187e-05, + "loss": 0.8659, + "step": 1898 + }, + { + "epoch": 0.28, + "grad_norm": 1.4207912548841013, + "learning_rate": 1.682076907083046e-05, + "loss": 0.8705, + "step": 1899 + }, + { + "epoch": 0.28, + "grad_norm": 1.3852045086029119, + "learning_rate": 1.68172338354074e-05, + "loss": 0.9266, + "step": 1900 + }, + { + "epoch": 0.28, + "grad_norm": 1.437722974696888, + "learning_rate": 1.6813697007477837e-05, + "loss": 0.9057, + "step": 1901 + }, + { + "epoch": 0.28, + "grad_norm": 1.380899724815938, + "learning_rate": 1.6810158587867973e-05, + "loss": 0.9064, + "step": 1902 + }, + { + "epoch": 0.28, + "grad_norm": 1.468761736995389, + "learning_rate": 1.6806618577404385e-05, + "loss": 0.9327, + "step": 1903 + }, + { + "epoch": 0.28, + "grad_norm": 1.5475257428326372, + "learning_rate": 1.6803076976914018e-05, + "loss": 0.9255, + "step": 1904 + }, + { + "epoch": 0.28, + "grad_norm": 1.4974052044671982, + "learning_rate": 1.6799533787224192e-05, + "loss": 0.9057, + "step": 1905 + }, + { + "epoch": 0.28, + "grad_norm": 1.5713979886021696, + "learning_rate": 1.67959890091626e-05, + "loss": 0.9424, + "step": 1906 + }, + { + "epoch": 0.28, + "grad_norm": 1.2893939032962567, + "learning_rate": 1.679244264355729e-05, + "loss": 0.9293, + "step": 1907 + }, + { + "epoch": 0.28, + "grad_norm": 1.4765156418061116, + "learning_rate": 1.678889469123671e-05, + "loss": 0.8976, + "step": 1908 + }, + { + "epoch": 0.28, + "grad_norm": 1.509181420996771, + "learning_rate": 1.6785345153029648e-05, + "loss": 0.365, + "step": 1909 + }, + { + "epoch": 0.28, + "grad_norm": 1.656713904896557, + "learning_rate": 1.678179402976529e-05, + "loss": 0.8678, + "step": 1910 + }, + { + "epoch": 0.29, + "grad_norm": 1.3995329506226941, + "learning_rate": 1.6778241322273163e-05, + "loss": 0.8212, + "step": 1911 + }, + { + "epoch": 0.29, + "grad_norm": 1.5147949580754596, + "learning_rate": 1.677468703138319e-05, + "loss": 0.8517, + "step": 1912 + }, + { + "epoch": 0.29, + "grad_norm": 1.4262386346100058, + "learning_rate": 1.677113115792565e-05, + "loss": 0.8621, + "step": 1913 + }, + { + "epoch": 0.29, + "grad_norm": 1.5044534319447298, + "learning_rate": 1.6767573702731203e-05, + "loss": 0.8743, + "step": 1914 + }, + { + "epoch": 0.29, + "grad_norm": 1.3800901404342816, + "learning_rate": 1.676401466663086e-05, + "loss": 0.891, + "step": 1915 + }, + { + "epoch": 0.29, + "grad_norm": 1.5210445169789544, + "learning_rate": 1.676045405045602e-05, + "loss": 0.8876, + "step": 1916 + }, + { + "epoch": 0.29, + "grad_norm": 1.4724340581103998, + "learning_rate": 1.6756891855038436e-05, + "loss": 0.9291, + "step": 1917 + }, + { + "epoch": 0.29, + "grad_norm": 1.4764706883320164, + "learning_rate": 1.6753328081210244e-05, + "loss": 0.8795, + "step": 1918 + }, + { + "epoch": 0.29, + "grad_norm": 1.4618439501485787, + "learning_rate": 1.6749762729803943e-05, + "loss": 0.8933, + "step": 1919 + }, + { + "epoch": 0.29, + "grad_norm": 1.3946471980855777, + "learning_rate": 1.6746195801652393e-05, + "loss": 0.882, + "step": 1920 + }, + { + "epoch": 0.29, + "grad_norm": 1.5934232950774414, + "learning_rate": 1.674262729758883e-05, + "loss": 0.9255, + "step": 1921 + }, + { + "epoch": 0.29, + "grad_norm": 1.591363392062247, + "learning_rate": 1.673905721844686e-05, + "loss": 0.898, + "step": 1922 + }, + { + "epoch": 0.29, + "grad_norm": 1.7081082566979897, + "learning_rate": 1.673548556506045e-05, + "loss": 0.904, + "step": 1923 + }, + { + "epoch": 0.29, + "grad_norm": 1.776634681656157, + "learning_rate": 1.6731912338263943e-05, + "loss": 0.9289, + "step": 1924 + }, + { + "epoch": 0.29, + "grad_norm": 1.489714214242666, + "learning_rate": 1.6728337538892043e-05, + "loss": 0.8945, + "step": 1925 + }, + { + "epoch": 0.29, + "grad_norm": 1.4288390220179836, + "learning_rate": 1.6724761167779825e-05, + "loss": 0.9229, + "step": 1926 + }, + { + "epoch": 0.29, + "grad_norm": 2.6201175877374947, + "learning_rate": 1.6721183225762726e-05, + "loss": 0.8427, + "step": 1927 + }, + { + "epoch": 0.29, + "grad_norm": 1.4639948877642863, + "learning_rate": 1.6717603713676557e-05, + "loss": 0.9087, + "step": 1928 + }, + { + "epoch": 0.29, + "grad_norm": 1.3851772323434917, + "learning_rate": 1.6714022632357495e-05, + "loss": 0.9229, + "step": 1929 + }, + { + "epoch": 0.29, + "grad_norm": 1.4888210963311344, + "learning_rate": 1.671043998264207e-05, + "loss": 0.9124, + "step": 1930 + }, + { + "epoch": 0.29, + "grad_norm": 1.573404158169826, + "learning_rate": 1.6706855765367202e-05, + "loss": 0.9532, + "step": 1931 + }, + { + "epoch": 0.29, + "grad_norm": 1.3049727359363632, + "learning_rate": 1.670326998137016e-05, + "loss": 0.9143, + "step": 1932 + }, + { + "epoch": 0.29, + "grad_norm": 1.4639621599473185, + "learning_rate": 1.6699682631488578e-05, + "loss": 0.8668, + "step": 1933 + }, + { + "epoch": 0.29, + "grad_norm": 1.5249654095422258, + "learning_rate": 1.6696093716560466e-05, + "loss": 0.924, + "step": 1934 + }, + { + "epoch": 0.29, + "grad_norm": 1.4494831937704733, + "learning_rate": 1.6692503237424197e-05, + "loss": 0.9259, + "step": 1935 + }, + { + "epoch": 0.29, + "grad_norm": 1.5834105883487717, + "learning_rate": 1.6688911194918506e-05, + "loss": 0.9022, + "step": 1936 + }, + { + "epoch": 0.29, + "grad_norm": 1.5730013044324458, + "learning_rate": 1.668531758988249e-05, + "loss": 0.9607, + "step": 1937 + }, + { + "epoch": 0.29, + "grad_norm": 1.4828482593911496, + "learning_rate": 1.668172242315562e-05, + "loss": 0.9112, + "step": 1938 + }, + { + "epoch": 0.29, + "grad_norm": 1.5490312659547139, + "learning_rate": 1.667812569557773e-05, + "loss": 0.9085, + "step": 1939 + }, + { + "epoch": 0.29, + "grad_norm": 1.4382818676865894, + "learning_rate": 1.667452740798901e-05, + "loss": 0.8739, + "step": 1940 + }, + { + "epoch": 0.29, + "grad_norm": 1.3282378534446506, + "learning_rate": 1.6670927561230018e-05, + "loss": 0.8595, + "step": 1941 + }, + { + "epoch": 0.29, + "grad_norm": 1.4965124751942585, + "learning_rate": 1.666732615614169e-05, + "loss": 0.85, + "step": 1942 + }, + { + "epoch": 0.29, + "grad_norm": 1.4468147885809837, + "learning_rate": 1.6663723193565308e-05, + "loss": 0.8229, + "step": 1943 + }, + { + "epoch": 0.29, + "grad_norm": 1.7302487989187791, + "learning_rate": 1.666011867434252e-05, + "loss": 1.0199, + "step": 1944 + }, + { + "epoch": 0.29, + "grad_norm": 1.4320928092384664, + "learning_rate": 1.6656512599315348e-05, + "loss": 0.875, + "step": 1945 + }, + { + "epoch": 0.29, + "grad_norm": 1.6760163575753484, + "learning_rate": 1.6652904969326167e-05, + "loss": 0.92, + "step": 1946 + }, + { + "epoch": 0.29, + "grad_norm": 1.5269023069511656, + "learning_rate": 1.6649295785217722e-05, + "loss": 0.904, + "step": 1947 + }, + { + "epoch": 0.29, + "grad_norm": 1.3718225900332046, + "learning_rate": 1.6645685047833124e-05, + "loss": 0.7822, + "step": 1948 + }, + { + "epoch": 0.29, + "grad_norm": 1.5824897596177077, + "learning_rate": 1.6642072758015834e-05, + "loss": 0.9779, + "step": 1949 + }, + { + "epoch": 0.29, + "grad_norm": 1.5332858506742733, + "learning_rate": 1.6638458916609685e-05, + "loss": 0.9419, + "step": 1950 + }, + { + "epoch": 0.29, + "grad_norm": 1.5125142402215015, + "learning_rate": 1.6634843524458874e-05, + "loss": 0.9616, + "step": 1951 + }, + { + "epoch": 0.29, + "grad_norm": 1.5669749706776952, + "learning_rate": 1.6631226582407954e-05, + "loss": 0.9496, + "step": 1952 + }, + { + "epoch": 0.29, + "grad_norm": 1.189677675314092, + "learning_rate": 1.6627608091301842e-05, + "loss": 0.3662, + "step": 1953 + }, + { + "epoch": 0.29, + "grad_norm": 1.3373281438103897, + "learning_rate": 1.6623988051985823e-05, + "loss": 0.8457, + "step": 1954 + }, + { + "epoch": 0.29, + "grad_norm": 1.5230997420295853, + "learning_rate": 1.6620366465305533e-05, + "loss": 0.9122, + "step": 1955 + }, + { + "epoch": 0.29, + "grad_norm": 1.377707305958102, + "learning_rate": 1.6616743332106976e-05, + "loss": 0.8796, + "step": 1956 + }, + { + "epoch": 0.29, + "grad_norm": 1.551055903099349, + "learning_rate": 1.661311865323652e-05, + "loss": 0.8674, + "step": 1957 + }, + { + "epoch": 0.29, + "grad_norm": 1.6513766351087809, + "learning_rate": 1.660949242954089e-05, + "loss": 0.9343, + "step": 1958 + }, + { + "epoch": 0.29, + "grad_norm": 1.4462516742486833, + "learning_rate": 1.6605864661867165e-05, + "loss": 0.9126, + "step": 1959 + }, + { + "epoch": 0.29, + "grad_norm": 1.6598395864626054, + "learning_rate": 1.6602235351062797e-05, + "loss": 0.9657, + "step": 1960 + }, + { + "epoch": 0.29, + "grad_norm": 1.43272870194693, + "learning_rate": 1.6598604497975598e-05, + "loss": 0.8766, + "step": 1961 + }, + { + "epoch": 0.29, + "grad_norm": 1.685255448154483, + "learning_rate": 1.6594972103453727e-05, + "loss": 0.8856, + "step": 1962 + }, + { + "epoch": 0.29, + "grad_norm": 1.5203105048786965, + "learning_rate": 1.6591338168345713e-05, + "loss": 0.9465, + "step": 1963 + }, + { + "epoch": 0.29, + "grad_norm": 1.430726407681374, + "learning_rate": 1.6587702693500452e-05, + "loss": 0.9138, + "step": 1964 + }, + { + "epoch": 0.29, + "grad_norm": 1.413927142240461, + "learning_rate": 1.6584065679767186e-05, + "loss": 0.9046, + "step": 1965 + }, + { + "epoch": 0.29, + "grad_norm": 1.582377967988086, + "learning_rate": 1.6580427127995516e-05, + "loss": 0.904, + "step": 1966 + }, + { + "epoch": 0.29, + "grad_norm": 0.9117864281076555, + "learning_rate": 1.6576787039035417e-05, + "loss": 0.3333, + "step": 1967 + }, + { + "epoch": 0.29, + "grad_norm": 1.9696745480283004, + "learning_rate": 1.657314541373721e-05, + "loss": 0.8769, + "step": 1968 + }, + { + "epoch": 0.29, + "grad_norm": 1.7704686283082576, + "learning_rate": 1.656950225295158e-05, + "loss": 0.9051, + "step": 1969 + }, + { + "epoch": 0.29, + "grad_norm": 1.4378779539002324, + "learning_rate": 1.6565857557529567e-05, + "loss": 0.8737, + "step": 1970 + }, + { + "epoch": 0.29, + "grad_norm": 1.4285185743723405, + "learning_rate": 1.6562211328322576e-05, + "loss": 0.8525, + "step": 1971 + }, + { + "epoch": 0.29, + "grad_norm": 1.204406636983857, + "learning_rate": 1.6558563566182365e-05, + "loss": 0.9461, + "step": 1972 + }, + { + "epoch": 0.29, + "grad_norm": 1.3545335228408402, + "learning_rate": 1.6554914271961047e-05, + "loss": 0.8799, + "step": 1973 + }, + { + "epoch": 0.29, + "grad_norm": 1.4680363425345486, + "learning_rate": 1.655126344651111e-05, + "loss": 0.8849, + "step": 1974 + }, + { + "epoch": 0.29, + "grad_norm": 1.2834420286597472, + "learning_rate": 1.6547611090685378e-05, + "loss": 0.9061, + "step": 1975 + }, + { + "epoch": 0.29, + "grad_norm": 1.4435875432256455, + "learning_rate": 1.6543957205337034e-05, + "loss": 0.9571, + "step": 1976 + }, + { + "epoch": 0.29, + "grad_norm": 1.6255764958791408, + "learning_rate": 1.6540301791319647e-05, + "loss": 0.7883, + "step": 1977 + }, + { + "epoch": 0.3, + "grad_norm": 1.7920660029703597, + "learning_rate": 1.6536644849487104e-05, + "loss": 0.9691, + "step": 1978 + }, + { + "epoch": 0.3, + "grad_norm": 1.4952383434558507, + "learning_rate": 1.6532986380693673e-05, + "loss": 0.8549, + "step": 1979 + }, + { + "epoch": 0.3, + "grad_norm": 1.2664267817179569, + "learning_rate": 1.6529326385793972e-05, + "loss": 0.9409, + "step": 1980 + }, + { + "epoch": 0.3, + "grad_norm": 1.3638323603114864, + "learning_rate": 1.6525664865642978e-05, + "loss": 0.9198, + "step": 1981 + }, + { + "epoch": 0.3, + "grad_norm": 1.6703378460444627, + "learning_rate": 1.652200182109602e-05, + "loss": 0.9389, + "step": 1982 + }, + { + "epoch": 0.3, + "grad_norm": 1.6574802103414707, + "learning_rate": 1.651833725300879e-05, + "loss": 0.8769, + "step": 1983 + }, + { + "epoch": 0.3, + "grad_norm": 1.3356220049895915, + "learning_rate": 1.6514671162237327e-05, + "loss": 0.9444, + "step": 1984 + }, + { + "epoch": 0.3, + "grad_norm": 1.493508440800496, + "learning_rate": 1.651100354963803e-05, + "loss": 0.9429, + "step": 1985 + }, + { + "epoch": 0.3, + "grad_norm": 1.6090742543143652, + "learning_rate": 1.6507334416067656e-05, + "loss": 0.8426, + "step": 1986 + }, + { + "epoch": 0.3, + "grad_norm": 1.603834803476885, + "learning_rate": 1.6503663762383312e-05, + "loss": 0.8728, + "step": 1987 + }, + { + "epoch": 0.3, + "grad_norm": 1.6670319955248305, + "learning_rate": 1.649999158944247e-05, + "loss": 0.8964, + "step": 1988 + }, + { + "epoch": 0.3, + "grad_norm": 1.4012671152463971, + "learning_rate": 1.6496317898102942e-05, + "loss": 0.9463, + "step": 1989 + }, + { + "epoch": 0.3, + "grad_norm": 1.4078083483616775, + "learning_rate": 1.649264268922291e-05, + "loss": 0.8795, + "step": 1990 + }, + { + "epoch": 0.3, + "grad_norm": 1.3180993007284014, + "learning_rate": 1.6488965963660892e-05, + "loss": 0.8524, + "step": 1991 + }, + { + "epoch": 0.3, + "grad_norm": 1.5562919779079722, + "learning_rate": 1.6485287722275783e-05, + "loss": 0.8819, + "step": 1992 + }, + { + "epoch": 0.3, + "grad_norm": 1.3797329904329805, + "learning_rate": 1.6481607965926812e-05, + "loss": 0.921, + "step": 1993 + }, + { + "epoch": 0.3, + "grad_norm": 1.4271421671983215, + "learning_rate": 1.647792669547358e-05, + "loss": 0.8914, + "step": 1994 + }, + { + "epoch": 0.3, + "grad_norm": 1.636940227013142, + "learning_rate": 1.6474243911776026e-05, + "loss": 0.8372, + "step": 1995 + }, + { + "epoch": 0.3, + "grad_norm": 1.387700690917425, + "learning_rate": 1.6470559615694445e-05, + "loss": 0.8901, + "step": 1996 + }, + { + "epoch": 0.3, + "grad_norm": 1.6221434503794132, + "learning_rate": 1.6466873808089496e-05, + "loss": 0.9761, + "step": 1997 + }, + { + "epoch": 0.3, + "grad_norm": 1.4959770809528985, + "learning_rate": 1.646318648982218e-05, + "loss": 0.8258, + "step": 1998 + }, + { + "epoch": 0.3, + "grad_norm": 1.4325709715801538, + "learning_rate": 1.6459497661753857e-05, + "loss": 0.9125, + "step": 1999 + }, + { + "epoch": 0.3, + "grad_norm": 1.0230319533656353, + "learning_rate": 1.6455807324746237e-05, + "loss": 0.3341, + "step": 2000 + }, + { + "epoch": 0.3, + "grad_norm": 1.3686508616923576, + "learning_rate": 1.645211547966138e-05, + "loss": 0.9118, + "step": 2001 + }, + { + "epoch": 0.3, + "grad_norm": 1.4022161138780582, + "learning_rate": 1.6448422127361707e-05, + "loss": 0.9522, + "step": 2002 + }, + { + "epoch": 0.3, + "grad_norm": 1.3899670541906008, + "learning_rate": 1.6444727268709984e-05, + "loss": 0.9214, + "step": 2003 + }, + { + "epoch": 0.3, + "grad_norm": 1.45606700206336, + "learning_rate": 1.6441030904569327e-05, + "loss": 0.9148, + "step": 2004 + }, + { + "epoch": 0.3, + "grad_norm": 1.3448561195289104, + "learning_rate": 1.6437333035803208e-05, + "loss": 0.8743, + "step": 2005 + }, + { + "epoch": 0.3, + "grad_norm": 1.5715648819197445, + "learning_rate": 1.6433633663275453e-05, + "loss": 0.9288, + "step": 2006 + }, + { + "epoch": 0.3, + "grad_norm": 1.5561307050375306, + "learning_rate": 1.642993278785023e-05, + "loss": 0.8729, + "step": 2007 + }, + { + "epoch": 0.3, + "grad_norm": 1.540332567866295, + "learning_rate": 1.642623041039207e-05, + "loss": 0.866, + "step": 2008 + }, + { + "epoch": 0.3, + "grad_norm": 1.6223774894138585, + "learning_rate": 1.6422526531765846e-05, + "loss": 0.8024, + "step": 2009 + }, + { + "epoch": 0.3, + "grad_norm": 1.3839531769548294, + "learning_rate": 1.6418821152836782e-05, + "loss": 0.9578, + "step": 2010 + }, + { + "epoch": 0.3, + "grad_norm": 1.3932875682646482, + "learning_rate": 1.641511427447046e-05, + "loss": 0.8685, + "step": 2011 + }, + { + "epoch": 0.3, + "grad_norm": 1.472219488403293, + "learning_rate": 1.64114058975328e-05, + "loss": 0.9228, + "step": 2012 + }, + { + "epoch": 0.3, + "grad_norm": 1.6950888211491755, + "learning_rate": 1.640769602289009e-05, + "loss": 0.9212, + "step": 2013 + }, + { + "epoch": 0.3, + "grad_norm": 1.4453771525176202, + "learning_rate": 1.6403984651408947e-05, + "loss": 0.9402, + "step": 2014 + }, + { + "epoch": 0.3, + "grad_norm": 1.4901681519089356, + "learning_rate": 1.6400271783956352e-05, + "loss": 0.9915, + "step": 2015 + }, + { + "epoch": 0.3, + "grad_norm": 1.2876539539908727, + "learning_rate": 1.6396557421399634e-05, + "loss": 0.8771, + "step": 2016 + }, + { + "epoch": 0.3, + "grad_norm": 0.9135175062444185, + "learning_rate": 1.639284156460646e-05, + "loss": 0.3526, + "step": 2017 + }, + { + "epoch": 0.3, + "grad_norm": 1.4623818518678788, + "learning_rate": 1.638912421444486e-05, + "loss": 0.9509, + "step": 2018 + }, + { + "epoch": 0.3, + "grad_norm": 1.6777871454441866, + "learning_rate": 1.638540537178321e-05, + "loss": 0.963, + "step": 2019 + }, + { + "epoch": 0.3, + "grad_norm": 1.2438738026296454, + "learning_rate": 1.6381685037490225e-05, + "loss": 0.9235, + "step": 2020 + }, + { + "epoch": 0.3, + "grad_norm": 1.5426780390603687, + "learning_rate": 1.6377963212434982e-05, + "loss": 0.8005, + "step": 2021 + }, + { + "epoch": 0.3, + "grad_norm": 1.50490739494114, + "learning_rate": 1.63742398974869e-05, + "loss": 0.8412, + "step": 2022 + }, + { + "epoch": 0.3, + "grad_norm": 1.44441824195559, + "learning_rate": 1.637051509351574e-05, + "loss": 0.9227, + "step": 2023 + }, + { + "epoch": 0.3, + "grad_norm": 1.6153868732398677, + "learning_rate": 1.6366788801391618e-05, + "loss": 0.9126, + "step": 2024 + }, + { + "epoch": 0.3, + "grad_norm": 1.4472502766823665, + "learning_rate": 1.6363061021984997e-05, + "loss": 0.8415, + "step": 2025 + }, + { + "epoch": 0.3, + "grad_norm": 1.5193502531641168, + "learning_rate": 1.6359331756166694e-05, + "loss": 0.8252, + "step": 2026 + }, + { + "epoch": 0.3, + "grad_norm": 1.4973435945659064, + "learning_rate": 1.6355601004807856e-05, + "loss": 0.88, + "step": 2027 + }, + { + "epoch": 0.3, + "grad_norm": 1.4215879189794591, + "learning_rate": 1.635186876877999e-05, + "loss": 0.8839, + "step": 2028 + }, + { + "epoch": 0.3, + "grad_norm": 1.4100515721030893, + "learning_rate": 1.6348135048954943e-05, + "loss": 0.9207, + "step": 2029 + }, + { + "epoch": 0.3, + "grad_norm": 1.3975214042084307, + "learning_rate": 1.6344399846204918e-05, + "loss": 0.7465, + "step": 2030 + }, + { + "epoch": 0.3, + "grad_norm": 1.7076184737981555, + "learning_rate": 1.634066316140246e-05, + "loss": 0.855, + "step": 2031 + }, + { + "epoch": 0.3, + "grad_norm": 1.6985969392291949, + "learning_rate": 1.6336924995420453e-05, + "loss": 0.9418, + "step": 2032 + }, + { + "epoch": 0.3, + "grad_norm": 1.3544998120600074, + "learning_rate": 1.6333185349132138e-05, + "loss": 0.8925, + "step": 2033 + }, + { + "epoch": 0.3, + "grad_norm": 1.7049199999956408, + "learning_rate": 1.632944422341109e-05, + "loss": 0.8878, + "step": 2034 + }, + { + "epoch": 0.3, + "grad_norm": 1.4448598436383564, + "learning_rate": 1.6325701619131246e-05, + "loss": 0.9263, + "step": 2035 + }, + { + "epoch": 0.3, + "grad_norm": 1.5326367079112837, + "learning_rate": 1.632195753716687e-05, + "loss": 0.8386, + "step": 2036 + }, + { + "epoch": 0.3, + "grad_norm": 1.66287064852823, + "learning_rate": 1.6318211978392588e-05, + "loss": 0.9434, + "step": 2037 + }, + { + "epoch": 0.3, + "grad_norm": 1.3421603172866972, + "learning_rate": 1.6314464943683353e-05, + "loss": 0.8913, + "step": 2038 + }, + { + "epoch": 0.3, + "grad_norm": 1.5574219046124602, + "learning_rate": 1.631071643391448e-05, + "loss": 0.8685, + "step": 2039 + }, + { + "epoch": 0.3, + "grad_norm": 1.486758911574603, + "learning_rate": 1.6306966449961623e-05, + "loss": 0.932, + "step": 2040 + }, + { + "epoch": 0.3, + "grad_norm": 1.3878371369587263, + "learning_rate": 1.6303214992700773e-05, + "loss": 0.9122, + "step": 2041 + }, + { + "epoch": 0.3, + "grad_norm": 1.4605312394526726, + "learning_rate": 1.6299462063008272e-05, + "loss": 0.8802, + "step": 2042 + }, + { + "epoch": 0.3, + "grad_norm": 1.2705456378416817, + "learning_rate": 1.6295707661760804e-05, + "loss": 0.8642, + "step": 2043 + }, + { + "epoch": 0.3, + "grad_norm": 1.5219381216498384, + "learning_rate": 1.62919517898354e-05, + "loss": 0.8679, + "step": 2044 + }, + { + "epoch": 0.31, + "grad_norm": 0.874751290350141, + "learning_rate": 1.6288194448109433e-05, + "loss": 0.3563, + "step": 2045 + }, + { + "epoch": 0.31, + "grad_norm": 1.628572545871969, + "learning_rate": 1.6284435637460613e-05, + "loss": 0.8588, + "step": 2046 + }, + { + "epoch": 0.31, + "grad_norm": 1.4918264999654482, + "learning_rate": 1.6280675358767005e-05, + "loss": 0.9225, + "step": 2047 + }, + { + "epoch": 0.31, + "grad_norm": 1.2983771595321323, + "learning_rate": 1.6276913612907005e-05, + "loss": 0.8829, + "step": 2048 + }, + { + "epoch": 0.31, + "grad_norm": 1.4042975521218652, + "learning_rate": 1.6273150400759363e-05, + "loss": 0.9553, + "step": 2049 + }, + { + "epoch": 0.31, + "grad_norm": 1.3670530191766468, + "learning_rate": 1.626938572320316e-05, + "loss": 0.9586, + "step": 2050 + }, + { + "epoch": 0.31, + "grad_norm": 1.7118950046842918, + "learning_rate": 1.6265619581117827e-05, + "loss": 0.8918, + "step": 2051 + }, + { + "epoch": 0.31, + "grad_norm": 1.6059810860474364, + "learning_rate": 1.626185197538314e-05, + "loss": 0.8534, + "step": 2052 + }, + { + "epoch": 0.31, + "grad_norm": 1.4542421346009562, + "learning_rate": 1.6258082906879203e-05, + "loss": 0.8438, + "step": 2053 + }, + { + "epoch": 0.31, + "grad_norm": 1.3872075777411936, + "learning_rate": 1.6254312376486478e-05, + "loss": 0.8785, + "step": 2054 + }, + { + "epoch": 0.31, + "grad_norm": 1.5172539098330657, + "learning_rate": 1.6250540385085754e-05, + "loss": 0.8844, + "step": 2055 + }, + { + "epoch": 0.31, + "grad_norm": 1.3844475037746036, + "learning_rate": 1.624676693355818e-05, + "loss": 0.9405, + "step": 2056 + }, + { + "epoch": 0.31, + "grad_norm": 1.7606453702335263, + "learning_rate": 1.6242992022785225e-05, + "loss": 0.8863, + "step": 2057 + }, + { + "epoch": 0.31, + "grad_norm": 1.7003422432064619, + "learning_rate": 1.623921565364871e-05, + "loss": 0.8944, + "step": 2058 + }, + { + "epoch": 0.31, + "grad_norm": 1.7130310203318548, + "learning_rate": 1.62354378270308e-05, + "loss": 0.8927, + "step": 2059 + }, + { + "epoch": 0.31, + "grad_norm": 1.5905434793199673, + "learning_rate": 1.6231658543813994e-05, + "loss": 0.8592, + "step": 2060 + }, + { + "epoch": 0.31, + "grad_norm": 1.2917759289757775, + "learning_rate": 1.6227877804881126e-05, + "loss": 0.8089, + "step": 2061 + }, + { + "epoch": 0.31, + "grad_norm": 1.439521879782407, + "learning_rate": 1.6224095611115385e-05, + "loss": 0.8094, + "step": 2062 + }, + { + "epoch": 0.31, + "grad_norm": 1.3578887301294011, + "learning_rate": 1.622031196340029e-05, + "loss": 0.9578, + "step": 2063 + }, + { + "epoch": 0.31, + "grad_norm": 1.4755677595098133, + "learning_rate": 1.62165268626197e-05, + "loss": 0.9084, + "step": 2064 + }, + { + "epoch": 0.31, + "grad_norm": 1.377149178365839, + "learning_rate": 1.6212740309657814e-05, + "loss": 0.9298, + "step": 2065 + }, + { + "epoch": 0.31, + "grad_norm": 1.4730742693498102, + "learning_rate": 1.6208952305399175e-05, + "loss": 0.9347, + "step": 2066 + }, + { + "epoch": 0.31, + "grad_norm": 1.5585505293109423, + "learning_rate": 1.620516285072866e-05, + "loss": 1.028, + "step": 2067 + }, + { + "epoch": 0.31, + "grad_norm": 1.4727086391793578, + "learning_rate": 1.6201371946531483e-05, + "loss": 0.8647, + "step": 2068 + }, + { + "epoch": 0.31, + "grad_norm": 1.5238672200605055, + "learning_rate": 1.6197579593693197e-05, + "loss": 0.8957, + "step": 2069 + }, + { + "epoch": 0.31, + "grad_norm": 1.433432135970078, + "learning_rate": 1.6193785793099706e-05, + "loss": 0.9113, + "step": 2070 + }, + { + "epoch": 0.31, + "grad_norm": 1.3850307768961654, + "learning_rate": 1.6189990545637234e-05, + "loss": 0.87, + "step": 2071 + }, + { + "epoch": 0.31, + "grad_norm": 1.6087341363590808, + "learning_rate": 1.6186193852192356e-05, + "loss": 0.8862, + "step": 2072 + }, + { + "epoch": 0.31, + "grad_norm": 1.5855787381158655, + "learning_rate": 1.618239571365198e-05, + "loss": 0.8915, + "step": 2073 + }, + { + "epoch": 0.31, + "grad_norm": 1.5917248873434566, + "learning_rate": 1.6178596130903345e-05, + "loss": 0.84, + "step": 2074 + }, + { + "epoch": 0.31, + "grad_norm": 1.5559401210334554, + "learning_rate": 1.6174795104834042e-05, + "loss": 0.9011, + "step": 2075 + }, + { + "epoch": 0.31, + "grad_norm": 1.4147010363941737, + "learning_rate": 1.6170992636331983e-05, + "loss": 0.9283, + "step": 2076 + }, + { + "epoch": 0.31, + "grad_norm": 1.468813100940576, + "learning_rate": 1.6167188726285433e-05, + "loss": 0.8916, + "step": 2077 + }, + { + "epoch": 0.31, + "grad_norm": 1.346034853427754, + "learning_rate": 1.6163383375582983e-05, + "loss": 0.8515, + "step": 2078 + }, + { + "epoch": 0.31, + "grad_norm": 1.6959678601645014, + "learning_rate": 1.6159576585113556e-05, + "loss": 0.8427, + "step": 2079 + }, + { + "epoch": 0.31, + "grad_norm": 1.494265976179155, + "learning_rate": 1.615576835576643e-05, + "loss": 0.9348, + "step": 2080 + }, + { + "epoch": 0.31, + "grad_norm": 1.5296318118583636, + "learning_rate": 1.6151958688431204e-05, + "loss": 0.8514, + "step": 2081 + }, + { + "epoch": 0.31, + "grad_norm": 1.5943274760743165, + "learning_rate": 1.6148147583997813e-05, + "loss": 0.9528, + "step": 2082 + }, + { + "epoch": 0.31, + "grad_norm": 1.4207516912340847, + "learning_rate": 1.6144335043356533e-05, + "loss": 0.8594, + "step": 2083 + }, + { + "epoch": 0.31, + "grad_norm": 1.6194334490109523, + "learning_rate": 1.6140521067397978e-05, + "loss": 0.8966, + "step": 2084 + }, + { + "epoch": 0.31, + "grad_norm": 1.7309903549193382, + "learning_rate": 1.613670565701309e-05, + "loss": 0.9132, + "step": 2085 + }, + { + "epoch": 0.31, + "grad_norm": 1.4828829054704131, + "learning_rate": 1.6132888813093147e-05, + "loss": 0.8547, + "step": 2086 + }, + { + "epoch": 0.31, + "grad_norm": 1.4658897899348529, + "learning_rate": 1.6129070536529767e-05, + "loss": 0.8575, + "step": 2087 + }, + { + "epoch": 0.31, + "grad_norm": 1.5753911565320524, + "learning_rate": 1.6125250828214897e-05, + "loss": 0.9427, + "step": 2088 + }, + { + "epoch": 0.31, + "grad_norm": 1.8613786550429863, + "learning_rate": 1.6121429689040825e-05, + "loss": 0.9581, + "step": 2089 + }, + { + "epoch": 0.31, + "grad_norm": 1.536593823331853, + "learning_rate": 1.6117607119900172e-05, + "loss": 0.841, + "step": 2090 + }, + { + "epoch": 0.31, + "grad_norm": 1.548912047268822, + "learning_rate": 1.6113783121685883e-05, + "loss": 0.8924, + "step": 2091 + }, + { + "epoch": 0.31, + "grad_norm": 1.39243288540068, + "learning_rate": 1.6109957695291246e-05, + "loss": 0.9371, + "step": 2092 + }, + { + "epoch": 0.31, + "grad_norm": 1.3766790753082239, + "learning_rate": 1.6106130841609883e-05, + "loss": 0.9236, + "step": 2093 + }, + { + "epoch": 0.31, + "grad_norm": 1.5224816490364808, + "learning_rate": 1.6102302561535748e-05, + "loss": 0.9086, + "step": 2094 + }, + { + "epoch": 0.31, + "grad_norm": 1.3929197673454525, + "learning_rate": 1.6098472855963126e-05, + "loss": 0.9315, + "step": 2095 + }, + { + "epoch": 0.31, + "grad_norm": 1.5478797837822456, + "learning_rate": 1.609464172578664e-05, + "loss": 0.9094, + "step": 2096 + }, + { + "epoch": 0.31, + "grad_norm": 1.8001229216489654, + "learning_rate": 1.6090809171901237e-05, + "loss": 0.9044, + "step": 2097 + }, + { + "epoch": 0.31, + "grad_norm": 1.4322805067552384, + "learning_rate": 1.6086975195202207e-05, + "loss": 0.8694, + "step": 2098 + }, + { + "epoch": 0.31, + "grad_norm": 1.5616684870720325, + "learning_rate": 1.608313979658516e-05, + "loss": 0.8225, + "step": 2099 + }, + { + "epoch": 0.31, + "grad_norm": 1.6133765464974652, + "learning_rate": 1.6079302976946055e-05, + "loss": 0.8523, + "step": 2100 + }, + { + "epoch": 0.31, + "grad_norm": 1.4668609215307034, + "learning_rate": 1.607546473718117e-05, + "loss": 0.8671, + "step": 2101 + }, + { + "epoch": 0.31, + "grad_norm": 1.6183739055890012, + "learning_rate": 1.6071625078187113e-05, + "loss": 0.8829, + "step": 2102 + }, + { + "epoch": 0.31, + "grad_norm": 1.5109289204325285, + "learning_rate": 1.6067784000860838e-05, + "loss": 0.8995, + "step": 2103 + }, + { + "epoch": 0.31, + "grad_norm": 1.4755414530665127, + "learning_rate": 1.606394150609961e-05, + "loss": 0.8562, + "step": 2104 + }, + { + "epoch": 0.31, + "grad_norm": 1.4392717478803918, + "learning_rate": 1.6060097594801044e-05, + "loss": 0.8945, + "step": 2105 + }, + { + "epoch": 0.31, + "grad_norm": 1.4674829321178218, + "learning_rate": 1.605625226786308e-05, + "loss": 0.8958, + "step": 2106 + }, + { + "epoch": 0.31, + "grad_norm": 1.471457028520947, + "learning_rate": 1.605240552618398e-05, + "loss": 0.9222, + "step": 2107 + }, + { + "epoch": 0.31, + "grad_norm": 1.5716153743086634, + "learning_rate": 1.6048557370662346e-05, + "loss": 0.9905, + "step": 2108 + }, + { + "epoch": 0.31, + "grad_norm": 1.6040361578266755, + "learning_rate": 1.6044707802197106e-05, + "loss": 0.9642, + "step": 2109 + }, + { + "epoch": 0.31, + "grad_norm": 1.592093355850832, + "learning_rate": 1.6040856821687523e-05, + "loss": 0.8493, + "step": 2110 + }, + { + "epoch": 0.31, + "grad_norm": 1.4991844010058457, + "learning_rate": 1.6037004430033186e-05, + "loss": 0.8945, + "step": 2111 + }, + { + "epoch": 0.32, + "grad_norm": 1.4009151194782519, + "learning_rate": 1.603315062813401e-05, + "loss": 0.8817, + "step": 2112 + }, + { + "epoch": 0.32, + "grad_norm": 1.5971079634027765, + "learning_rate": 1.602929541689025e-05, + "loss": 0.8554, + "step": 2113 + }, + { + "epoch": 0.32, + "grad_norm": 1.392574563764216, + "learning_rate": 1.6025438797202478e-05, + "loss": 0.8991, + "step": 2114 + }, + { + "epoch": 0.32, + "grad_norm": 1.4263051285524602, + "learning_rate": 1.6021580769971602e-05, + "loss": 0.855, + "step": 2115 + }, + { + "epoch": 0.32, + "grad_norm": 0.8538281078218202, + "learning_rate": 1.601772133609886e-05, + "loss": 0.346, + "step": 2116 + }, + { + "epoch": 0.32, + "grad_norm": 1.509314583177348, + "learning_rate": 1.601386049648581e-05, + "loss": 0.8159, + "step": 2117 + }, + { + "epoch": 0.32, + "grad_norm": 1.3356096816378258, + "learning_rate": 1.6009998252034354e-05, + "loss": 0.9456, + "step": 2118 + }, + { + "epoch": 0.32, + "grad_norm": 1.4564987844734394, + "learning_rate": 1.6006134603646706e-05, + "loss": 0.8351, + "step": 2119 + }, + { + "epoch": 0.32, + "grad_norm": 1.610159923284599, + "learning_rate": 1.6002269552225413e-05, + "loss": 0.9145, + "step": 2120 + }, + { + "epoch": 0.32, + "grad_norm": 1.3321514865440012, + "learning_rate": 1.599840309867336e-05, + "loss": 0.915, + "step": 2121 + }, + { + "epoch": 0.32, + "grad_norm": 1.4104962995637664, + "learning_rate": 1.5994535243893742e-05, + "loss": 0.9151, + "step": 2122 + }, + { + "epoch": 0.32, + "grad_norm": 0.7997541167092745, + "learning_rate": 1.5990665988790094e-05, + "loss": 0.3586, + "step": 2123 + }, + { + "epoch": 0.32, + "grad_norm": 1.446492106823676, + "learning_rate": 1.5986795334266276e-05, + "loss": 0.8006, + "step": 2124 + }, + { + "epoch": 0.32, + "grad_norm": 1.3127292684003837, + "learning_rate": 1.598292328122647e-05, + "loss": 0.8736, + "step": 2125 + }, + { + "epoch": 0.32, + "grad_norm": 1.4516478241910866, + "learning_rate": 1.597904983057519e-05, + "loss": 0.8844, + "step": 2126 + }, + { + "epoch": 0.32, + "grad_norm": 1.459318812083033, + "learning_rate": 1.5975174983217273e-05, + "loss": 0.9549, + "step": 2127 + }, + { + "epoch": 0.32, + "grad_norm": 1.3856462911876546, + "learning_rate": 1.5971298740057885e-05, + "loss": 0.9146, + "step": 2128 + }, + { + "epoch": 0.32, + "grad_norm": 1.5500762751966088, + "learning_rate": 1.596742110200252e-05, + "loss": 0.9535, + "step": 2129 + }, + { + "epoch": 0.32, + "grad_norm": 1.4467626410621395, + "learning_rate": 1.596354206995699e-05, + "loss": 0.9051, + "step": 2130 + }, + { + "epoch": 0.32, + "grad_norm": 1.5381992292308826, + "learning_rate": 1.5959661644827432e-05, + "loss": 0.8696, + "step": 2131 + }, + { + "epoch": 0.32, + "grad_norm": 1.406221321926157, + "learning_rate": 1.5955779827520327e-05, + "loss": 0.8703, + "step": 2132 + }, + { + "epoch": 0.32, + "grad_norm": 1.3309496763287714, + "learning_rate": 1.595189661894246e-05, + "loss": 0.9135, + "step": 2133 + }, + { + "epoch": 0.32, + "grad_norm": 1.3602130857750188, + "learning_rate": 1.5948012020000948e-05, + "loss": 0.9501, + "step": 2134 + }, + { + "epoch": 0.32, + "grad_norm": 1.384241937255165, + "learning_rate": 1.5944126031603236e-05, + "loss": 0.8658, + "step": 2135 + }, + { + "epoch": 0.32, + "grad_norm": 1.383323932484943, + "learning_rate": 1.594023865465709e-05, + "loss": 0.9245, + "step": 2136 + }, + { + "epoch": 0.32, + "grad_norm": 1.4038502416652323, + "learning_rate": 1.5936349890070602e-05, + "loss": 0.9337, + "step": 2137 + }, + { + "epoch": 0.32, + "grad_norm": 1.5538719137012444, + "learning_rate": 1.5932459738752194e-05, + "loss": 0.828, + "step": 2138 + }, + { + "epoch": 0.32, + "grad_norm": 1.5704005392266531, + "learning_rate": 1.5928568201610593e-05, + "loss": 0.8224, + "step": 2139 + }, + { + "epoch": 0.32, + "grad_norm": 1.326320690406751, + "learning_rate": 1.5924675279554874e-05, + "loss": 0.8088, + "step": 2140 + }, + { + "epoch": 0.32, + "grad_norm": 1.4552450717930803, + "learning_rate": 1.5920780973494418e-05, + "loss": 0.8675, + "step": 2141 + }, + { + "epoch": 0.32, + "grad_norm": 1.5900269726142606, + "learning_rate": 1.5916885284338937e-05, + "loss": 0.8273, + "step": 2142 + }, + { + "epoch": 0.32, + "grad_norm": 1.5908164661845712, + "learning_rate": 1.591298821299846e-05, + "loss": 0.9057, + "step": 2143 + }, + { + "epoch": 0.32, + "grad_norm": 1.4337178515683364, + "learning_rate": 1.5909089760383354e-05, + "loss": 0.8492, + "step": 2144 + }, + { + "epoch": 0.32, + "grad_norm": 1.523661560307685, + "learning_rate": 1.590518992740429e-05, + "loss": 0.8446, + "step": 2145 + }, + { + "epoch": 0.32, + "grad_norm": 1.5152220608239626, + "learning_rate": 1.5901288714972268e-05, + "loss": 0.9108, + "step": 2146 + }, + { + "epoch": 0.32, + "grad_norm": 1.6261242223827643, + "learning_rate": 1.5897386123998613e-05, + "loss": 0.9179, + "step": 2147 + }, + { + "epoch": 0.32, + "grad_norm": 1.3758853999925584, + "learning_rate": 1.5893482155394978e-05, + "loss": 0.9105, + "step": 2148 + }, + { + "epoch": 0.32, + "grad_norm": 0.8682760707058219, + "learning_rate": 1.588957681007332e-05, + "loss": 0.3252, + "step": 2149 + }, + { + "epoch": 0.32, + "grad_norm": 1.5432027084094335, + "learning_rate": 1.588567008894593e-05, + "loss": 0.8688, + "step": 2150 + }, + { + "epoch": 0.32, + "grad_norm": 1.4288536946959483, + "learning_rate": 1.5881761992925425e-05, + "loss": 0.931, + "step": 2151 + }, + { + "epoch": 0.32, + "grad_norm": 1.2984264276046191, + "learning_rate": 1.5877852522924733e-05, + "loss": 0.8444, + "step": 2152 + }, + { + "epoch": 0.32, + "grad_norm": 1.7091369549894575, + "learning_rate": 1.5873941679857107e-05, + "loss": 0.9633, + "step": 2153 + }, + { + "epoch": 0.32, + "grad_norm": 1.3813193929510699, + "learning_rate": 1.5870029464636113e-05, + "loss": 0.8091, + "step": 2154 + }, + { + "epoch": 0.32, + "grad_norm": 1.3125478609475392, + "learning_rate": 1.586611587817566e-05, + "loss": 0.891, + "step": 2155 + }, + { + "epoch": 0.32, + "grad_norm": 1.4716689249264063, + "learning_rate": 1.5862200921389947e-05, + "loss": 0.9326, + "step": 2156 + }, + { + "epoch": 0.32, + "grad_norm": 1.4523612945773285, + "learning_rate": 1.5858284595193514e-05, + "loss": 0.8674, + "step": 2157 + }, + { + "epoch": 0.32, + "grad_norm": 1.3363962333961232, + "learning_rate": 1.585436690050122e-05, + "loss": 0.9189, + "step": 2158 + }, + { + "epoch": 0.32, + "grad_norm": 0.8834030562403844, + "learning_rate": 1.5850447838228235e-05, + "loss": 0.3528, + "step": 2159 + }, + { + "epoch": 0.32, + "grad_norm": 1.421970990367768, + "learning_rate": 1.584652740929005e-05, + "loss": 0.9381, + "step": 2160 + }, + { + "epoch": 0.32, + "grad_norm": 1.5336996161463692, + "learning_rate": 1.5842605614602482e-05, + "loss": 0.9464, + "step": 2161 + }, + { + "epoch": 0.32, + "grad_norm": 1.2243795027048836, + "learning_rate": 1.5838682455081657e-05, + "loss": 0.8881, + "step": 2162 + }, + { + "epoch": 0.32, + "grad_norm": 1.4865764981850325, + "learning_rate": 1.583475793164403e-05, + "loss": 0.9292, + "step": 2163 + }, + { + "epoch": 0.32, + "grad_norm": 1.4753815050477015, + "learning_rate": 1.583083204520637e-05, + "loss": 0.8603, + "step": 2164 + }, + { + "epoch": 0.32, + "grad_norm": 1.2964147388713199, + "learning_rate": 1.5826904796685763e-05, + "loss": 0.8703, + "step": 2165 + }, + { + "epoch": 0.32, + "grad_norm": 1.5292446857136408, + "learning_rate": 1.582297618699961e-05, + "loss": 0.7957, + "step": 2166 + }, + { + "epoch": 0.32, + "grad_norm": 1.5148288562784187, + "learning_rate": 1.581904621706565e-05, + "loss": 0.8397, + "step": 2167 + }, + { + "epoch": 0.32, + "grad_norm": 1.6078954557551346, + "learning_rate": 1.581511488780191e-05, + "loss": 0.8936, + "step": 2168 + }, + { + "epoch": 0.32, + "grad_norm": 1.5709729762820952, + "learning_rate": 1.581118220012675e-05, + "loss": 0.8943, + "step": 2169 + }, + { + "epoch": 0.32, + "grad_norm": 1.6692647232106506, + "learning_rate": 1.5807248154958848e-05, + "loss": 0.8169, + "step": 2170 + }, + { + "epoch": 0.32, + "grad_norm": 1.4049933196100746, + "learning_rate": 1.5803312753217202e-05, + "loss": 0.9228, + "step": 2171 + }, + { + "epoch": 0.32, + "grad_norm": 1.635474221215026, + "learning_rate": 1.5799375995821116e-05, + "loss": 0.898, + "step": 2172 + }, + { + "epoch": 0.32, + "grad_norm": 1.553659932170287, + "learning_rate": 1.5795437883690225e-05, + "loss": 0.9361, + "step": 2173 + }, + { + "epoch": 0.32, + "grad_norm": 1.4474668240688067, + "learning_rate": 1.5791498417744463e-05, + "loss": 0.9461, + "step": 2174 + }, + { + "epoch": 0.32, + "grad_norm": 1.455261302785284, + "learning_rate": 1.578755759890409e-05, + "loss": 0.8827, + "step": 2175 + }, + { + "epoch": 0.32, + "grad_norm": 1.5158065252140318, + "learning_rate": 1.5783615428089693e-05, + "loss": 0.8082, + "step": 2176 + }, + { + "epoch": 0.32, + "grad_norm": 1.2723095006700331, + "learning_rate": 1.577967190622215e-05, + "loss": 0.8922, + "step": 2177 + }, + { + "epoch": 0.32, + "grad_norm": 1.4095759479018382, + "learning_rate": 1.5775727034222675e-05, + "loss": 0.8597, + "step": 2178 + }, + { + "epoch": 0.33, + "grad_norm": 1.6099465915272615, + "learning_rate": 1.5771780813012793e-05, + "loss": 0.908, + "step": 2179 + }, + { + "epoch": 0.33, + "grad_norm": 1.5208257123568947, + "learning_rate": 1.5767833243514337e-05, + "loss": 0.8655, + "step": 2180 + }, + { + "epoch": 0.33, + "grad_norm": 1.4611612259562878, + "learning_rate": 1.576388432664946e-05, + "loss": 1.0259, + "step": 2181 + }, + { + "epoch": 0.33, + "grad_norm": 1.30880284133487, + "learning_rate": 1.5759934063340627e-05, + "loss": 0.8593, + "step": 2182 + }, + { + "epoch": 0.33, + "grad_norm": 1.5608508287888272, + "learning_rate": 1.5755982454510626e-05, + "loss": 0.8777, + "step": 2183 + }, + { + "epoch": 0.33, + "grad_norm": 1.5452633961299929, + "learning_rate": 1.5752029501082547e-05, + "loss": 0.9205, + "step": 2184 + }, + { + "epoch": 0.33, + "grad_norm": 1.3680647052245383, + "learning_rate": 1.574807520397981e-05, + "loss": 0.9257, + "step": 2185 + }, + { + "epoch": 0.33, + "grad_norm": 1.4420257570652117, + "learning_rate": 1.5744119564126127e-05, + "loss": 0.8994, + "step": 2186 + }, + { + "epoch": 0.33, + "grad_norm": 1.4966876977454537, + "learning_rate": 1.5740162582445545e-05, + "loss": 0.9569, + "step": 2187 + }, + { + "epoch": 0.33, + "grad_norm": 1.4967860519963248, + "learning_rate": 1.573620425986241e-05, + "loss": 0.8831, + "step": 2188 + }, + { + "epoch": 0.33, + "grad_norm": 0.9470941766944733, + "learning_rate": 1.573224459730139e-05, + "loss": 0.3304, + "step": 2189 + }, + { + "epoch": 0.33, + "grad_norm": 1.4879996316147293, + "learning_rate": 1.572828359568746e-05, + "loss": 0.8798, + "step": 2190 + }, + { + "epoch": 0.33, + "grad_norm": 1.3005612225926124, + "learning_rate": 1.572432125594591e-05, + "loss": 0.8512, + "step": 2191 + }, + { + "epoch": 0.33, + "grad_norm": 1.3872635563657048, + "learning_rate": 1.5720357579002346e-05, + "loss": 0.8609, + "step": 2192 + }, + { + "epoch": 0.33, + "grad_norm": 1.6451510509568592, + "learning_rate": 1.5716392565782683e-05, + "loss": 0.8718, + "step": 2193 + }, + { + "epoch": 0.33, + "grad_norm": 1.5026512594517971, + "learning_rate": 1.5712426217213143e-05, + "loss": 0.8823, + "step": 2194 + }, + { + "epoch": 0.33, + "grad_norm": 1.3529289599428111, + "learning_rate": 1.5708458534220274e-05, + "loss": 0.7962, + "step": 2195 + }, + { + "epoch": 0.33, + "grad_norm": 1.5069515847694586, + "learning_rate": 1.570448951773092e-05, + "loss": 0.9035, + "step": 2196 + }, + { + "epoch": 0.33, + "grad_norm": 1.4580898785602376, + "learning_rate": 1.5700519168672248e-05, + "loss": 0.9372, + "step": 2197 + }, + { + "epoch": 0.33, + "grad_norm": 1.51821727009186, + "learning_rate": 1.5696547487971727e-05, + "loss": 0.8667, + "step": 2198 + }, + { + "epoch": 0.33, + "grad_norm": 1.4271390550471268, + "learning_rate": 1.5692574476557147e-05, + "loss": 0.837, + "step": 2199 + }, + { + "epoch": 0.33, + "grad_norm": 1.6197580648864112, + "learning_rate": 1.5688600135356608e-05, + "loss": 0.8572, + "step": 2200 + }, + { + "epoch": 0.33, + "grad_norm": 1.4561272634408733, + "learning_rate": 1.5684624465298503e-05, + "loss": 0.7864, + "step": 2201 + }, + { + "epoch": 0.33, + "grad_norm": 1.3937402333037126, + "learning_rate": 1.568064746731156e-05, + "loss": 0.9116, + "step": 2202 + }, + { + "epoch": 0.33, + "grad_norm": 1.3484215139256983, + "learning_rate": 1.5676669142324802e-05, + "loss": 0.8893, + "step": 2203 + }, + { + "epoch": 0.33, + "grad_norm": 1.3091942548048083, + "learning_rate": 1.567268949126757e-05, + "loss": 0.804, + "step": 2204 + }, + { + "epoch": 0.33, + "grad_norm": 1.3148493615935646, + "learning_rate": 1.5668708515069503e-05, + "loss": 0.843, + "step": 2205 + }, + { + "epoch": 0.33, + "grad_norm": 1.3483526856004353, + "learning_rate": 1.5664726214660562e-05, + "loss": 0.8502, + "step": 2206 + }, + { + "epoch": 0.33, + "grad_norm": 1.63158011129037, + "learning_rate": 1.5660742590971014e-05, + "loss": 0.8952, + "step": 2207 + }, + { + "epoch": 0.33, + "grad_norm": 1.3337432739792972, + "learning_rate": 1.5656757644931433e-05, + "loss": 0.9488, + "step": 2208 + }, + { + "epoch": 0.33, + "grad_norm": 1.5399512766054197, + "learning_rate": 1.5652771377472702e-05, + "loss": 0.8717, + "step": 2209 + }, + { + "epoch": 0.33, + "grad_norm": 1.4468026604756365, + "learning_rate": 1.5648783789526015e-05, + "loss": 0.9012, + "step": 2210 + }, + { + "epoch": 0.33, + "grad_norm": 1.7067606327369804, + "learning_rate": 1.5644794882022875e-05, + "loss": 0.902, + "step": 2211 + }, + { + "epoch": 0.33, + "grad_norm": 1.4971688468939985, + "learning_rate": 1.5640804655895086e-05, + "loss": 0.9287, + "step": 2212 + }, + { + "epoch": 0.33, + "grad_norm": 1.354877618546282, + "learning_rate": 1.5636813112074766e-05, + "loss": 0.9474, + "step": 2213 + }, + { + "epoch": 0.33, + "grad_norm": 1.8363170606810937, + "learning_rate": 1.5632820251494343e-05, + "loss": 0.9499, + "step": 2214 + }, + { + "epoch": 0.33, + "grad_norm": 1.600171282753231, + "learning_rate": 1.5628826075086546e-05, + "loss": 0.9178, + "step": 2215 + }, + { + "epoch": 0.33, + "grad_norm": 1.5056276452344919, + "learning_rate": 1.562483058378442e-05, + "loss": 0.8927, + "step": 2216 + }, + { + "epoch": 0.33, + "grad_norm": 1.945562162809114, + "learning_rate": 1.5620833778521306e-05, + "loss": 0.8398, + "step": 2217 + }, + { + "epoch": 0.33, + "grad_norm": 1.4186141618112442, + "learning_rate": 1.5616835660230864e-05, + "loss": 0.8215, + "step": 2218 + }, + { + "epoch": 0.33, + "grad_norm": 1.4689542192154341, + "learning_rate": 1.561283622984705e-05, + "loss": 0.9097, + "step": 2219 + }, + { + "epoch": 0.33, + "grad_norm": 1.534684008479006, + "learning_rate": 1.5608835488304138e-05, + "loss": 0.8979, + "step": 2220 + }, + { + "epoch": 0.33, + "grad_norm": 1.3876388036294593, + "learning_rate": 1.5604833436536692e-05, + "loss": 0.9596, + "step": 2221 + }, + { + "epoch": 0.33, + "grad_norm": 1.5487391312805103, + "learning_rate": 1.5600830075479604e-05, + "loss": 0.8965, + "step": 2222 + }, + { + "epoch": 0.33, + "grad_norm": 1.391755006595629, + "learning_rate": 1.5596825406068043e-05, + "loss": 0.9275, + "step": 2223 + }, + { + "epoch": 0.33, + "grad_norm": 1.5384997558976328, + "learning_rate": 1.5592819429237517e-05, + "loss": 0.9673, + "step": 2224 + }, + { + "epoch": 0.33, + "grad_norm": 1.6906691237842095, + "learning_rate": 1.5588812145923812e-05, + "loss": 0.8544, + "step": 2225 + }, + { + "epoch": 0.33, + "grad_norm": 1.480162351048646, + "learning_rate": 1.5584803557063034e-05, + "loss": 0.9415, + "step": 2226 + }, + { + "epoch": 0.33, + "grad_norm": 1.5164365175077197, + "learning_rate": 1.5580793663591583e-05, + "loss": 0.9039, + "step": 2227 + }, + { + "epoch": 0.33, + "grad_norm": 1.4675588666684414, + "learning_rate": 1.5576782466446184e-05, + "loss": 0.8955, + "step": 2228 + }, + { + "epoch": 0.33, + "grad_norm": 1.6289353665372766, + "learning_rate": 1.5572769966563844e-05, + "loss": 0.7649, + "step": 2229 + }, + { + "epoch": 0.33, + "grad_norm": 1.4166898562628878, + "learning_rate": 1.556875616488188e-05, + "loss": 0.9154, + "step": 2230 + }, + { + "epoch": 0.33, + "grad_norm": 1.5940118713608051, + "learning_rate": 1.5564741062337928e-05, + "loss": 0.8957, + "step": 2231 + }, + { + "epoch": 0.33, + "grad_norm": 1.6883631922453783, + "learning_rate": 1.5560724659869905e-05, + "loss": 0.9627, + "step": 2232 + }, + { + "epoch": 0.33, + "grad_norm": 1.5894771816451598, + "learning_rate": 1.5556706958416044e-05, + "loss": 0.8595, + "step": 2233 + }, + { + "epoch": 0.33, + "grad_norm": 1.5662607698591635, + "learning_rate": 1.5552687958914892e-05, + "loss": 0.9358, + "step": 2234 + }, + { + "epoch": 0.33, + "grad_norm": 1.5258953948083736, + "learning_rate": 1.5548667662305275e-05, + "loss": 0.8574, + "step": 2235 + }, + { + "epoch": 0.33, + "grad_norm": 1.4040338219456894, + "learning_rate": 1.554464606952634e-05, + "loss": 0.8979, + "step": 2236 + }, + { + "epoch": 0.33, + "grad_norm": 1.5369400594007905, + "learning_rate": 1.5540623181517532e-05, + "loss": 0.9366, + "step": 2237 + }, + { + "epoch": 0.33, + "grad_norm": 0.8806919665176648, + "learning_rate": 1.5536598999218592e-05, + "loss": 0.3439, + "step": 2238 + }, + { + "epoch": 0.33, + "grad_norm": 1.6199453123598875, + "learning_rate": 1.553257352356958e-05, + "loss": 0.8684, + "step": 2239 + }, + { + "epoch": 0.33, + "grad_norm": 1.2520656215275296, + "learning_rate": 1.5528546755510842e-05, + "loss": 0.9159, + "step": 2240 + }, + { + "epoch": 0.33, + "grad_norm": 1.4701516446023108, + "learning_rate": 1.5524518695983025e-05, + "loss": 0.9249, + "step": 2241 + }, + { + "epoch": 0.33, + "grad_norm": 1.4088294135179658, + "learning_rate": 1.5520489345927095e-05, + "loss": 0.888, + "step": 2242 + }, + { + "epoch": 0.33, + "grad_norm": 1.3486884780323214, + "learning_rate": 1.5516458706284306e-05, + "loss": 0.9258, + "step": 2243 + }, + { + "epoch": 0.33, + "grad_norm": 1.4183918713098838, + "learning_rate": 1.5512426777996206e-05, + "loss": 0.92, + "step": 2244 + }, + { + "epoch": 0.33, + "grad_norm": 1.4185166981634143, + "learning_rate": 1.550839356200467e-05, + "loss": 0.8933, + "step": 2245 + }, + { + "epoch": 0.34, + "grad_norm": 1.6146733037174301, + "learning_rate": 1.5504359059251843e-05, + "loss": 0.7927, + "step": 2246 + }, + { + "epoch": 0.34, + "grad_norm": 1.5621420378726925, + "learning_rate": 1.5500323270680194e-05, + "loss": 0.7321, + "step": 2247 + }, + { + "epoch": 0.34, + "grad_norm": 0.9487127370154067, + "learning_rate": 1.5496286197232483e-05, + "loss": 0.3571, + "step": 2248 + }, + { + "epoch": 0.34, + "grad_norm": 1.35658916070422, + "learning_rate": 1.5492247839851767e-05, + "loss": 0.9208, + "step": 2249 + }, + { + "epoch": 0.34, + "grad_norm": 1.4108634909517979, + "learning_rate": 1.5488208199481406e-05, + "loss": 0.8842, + "step": 2250 + }, + { + "epoch": 0.34, + "grad_norm": 1.4262516081982304, + "learning_rate": 1.5484167277065066e-05, + "loss": 0.8548, + "step": 2251 + }, + { + "epoch": 0.34, + "grad_norm": 1.5429323759499571, + "learning_rate": 1.5480125073546705e-05, + "loss": 0.9319, + "step": 2252 + }, + { + "epoch": 0.34, + "grad_norm": 1.3841626087791337, + "learning_rate": 1.5476081589870574e-05, + "loss": 0.9014, + "step": 2253 + }, + { + "epoch": 0.34, + "grad_norm": 1.649983356944591, + "learning_rate": 1.5472036826981243e-05, + "loss": 0.9695, + "step": 2254 + }, + { + "epoch": 0.34, + "grad_norm": 1.4482985567220856, + "learning_rate": 1.5467990785823562e-05, + "loss": 0.9038, + "step": 2255 + }, + { + "epoch": 0.34, + "grad_norm": 1.3362472367525746, + "learning_rate": 1.5463943467342694e-05, + "loss": 0.9004, + "step": 2256 + }, + { + "epoch": 0.34, + "grad_norm": 1.4007567854073044, + "learning_rate": 1.5459894872484083e-05, + "loss": 0.9684, + "step": 2257 + }, + { + "epoch": 0.34, + "grad_norm": 1.61796984660997, + "learning_rate": 1.5455845002193485e-05, + "loss": 0.8807, + "step": 2258 + }, + { + "epoch": 0.34, + "grad_norm": 1.458057620532137, + "learning_rate": 1.5451793857416955e-05, + "loss": 0.874, + "step": 2259 + }, + { + "epoch": 0.34, + "grad_norm": 1.5021369806670417, + "learning_rate": 1.5447741439100833e-05, + "loss": 0.787, + "step": 2260 + }, + { + "epoch": 0.34, + "grad_norm": 1.7381014168330053, + "learning_rate": 1.5443687748191772e-05, + "loss": 0.9132, + "step": 2261 + }, + { + "epoch": 0.34, + "grad_norm": 1.3540898415126852, + "learning_rate": 1.5439632785636707e-05, + "loss": 0.9611, + "step": 2262 + }, + { + "epoch": 0.34, + "grad_norm": 1.4220905723485562, + "learning_rate": 1.5435576552382883e-05, + "loss": 0.8881, + "step": 2263 + }, + { + "epoch": 0.34, + "grad_norm": 1.3431553377314935, + "learning_rate": 1.5431519049377835e-05, + "loss": 0.9087, + "step": 2264 + }, + { + "epoch": 0.34, + "grad_norm": 1.2961370734239621, + "learning_rate": 1.54274602775694e-05, + "loss": 0.863, + "step": 2265 + }, + { + "epoch": 0.34, + "grad_norm": 1.3762272186152134, + "learning_rate": 1.5423400237905695e-05, + "loss": 0.903, + "step": 2266 + }, + { + "epoch": 0.34, + "grad_norm": 1.5387414546573555, + "learning_rate": 1.5419338931335155e-05, + "loss": 0.9676, + "step": 2267 + }, + { + "epoch": 0.34, + "grad_norm": 1.6189088284489723, + "learning_rate": 1.541527635880651e-05, + "loss": 0.832, + "step": 2268 + }, + { + "epoch": 0.34, + "grad_norm": 1.6331762913586843, + "learning_rate": 1.541121252126876e-05, + "loss": 0.871, + "step": 2269 + }, + { + "epoch": 0.34, + "grad_norm": 1.5171575393036398, + "learning_rate": 1.5407147419671227e-05, + "loss": 0.8905, + "step": 2270 + }, + { + "epoch": 0.34, + "grad_norm": 1.4316603683942186, + "learning_rate": 1.5403081054963524e-05, + "loss": 0.8599, + "step": 2271 + }, + { + "epoch": 0.34, + "grad_norm": 1.3474805696668777, + "learning_rate": 1.539901342809554e-05, + "loss": 0.9241, + "step": 2272 + }, + { + "epoch": 0.34, + "grad_norm": 1.4477942990205288, + "learning_rate": 1.5394944540017484e-05, + "loss": 0.785, + "step": 2273 + }, + { + "epoch": 0.34, + "grad_norm": 1.375109046437524, + "learning_rate": 1.539087439167985e-05, + "loss": 0.8774, + "step": 2274 + }, + { + "epoch": 0.34, + "grad_norm": 1.3627256481363823, + "learning_rate": 1.5386802984033417e-05, + "loss": 0.8875, + "step": 2275 + }, + { + "epoch": 0.34, + "grad_norm": 1.2827976869478268, + "learning_rate": 1.538273031802927e-05, + "loss": 0.9094, + "step": 2276 + }, + { + "epoch": 0.34, + "grad_norm": 1.5484833652332188, + "learning_rate": 1.5378656394618788e-05, + "loss": 0.8757, + "step": 2277 + }, + { + "epoch": 0.34, + "grad_norm": 1.8288346129902062, + "learning_rate": 1.537458121475363e-05, + "loss": 0.9001, + "step": 2278 + }, + { + "epoch": 0.34, + "grad_norm": 1.37712553108305, + "learning_rate": 1.537050477938577e-05, + "loss": 0.9418, + "step": 2279 + }, + { + "epoch": 0.34, + "grad_norm": 1.548431791344098, + "learning_rate": 1.5366427089467458e-05, + "loss": 0.8796, + "step": 2280 + }, + { + "epoch": 0.34, + "grad_norm": 1.605941113595781, + "learning_rate": 1.5362348145951242e-05, + "loss": 0.8877, + "step": 2281 + }, + { + "epoch": 0.34, + "grad_norm": 1.3944602729194844, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.9196, + "step": 2282 + }, + { + "epoch": 0.34, + "grad_norm": 1.629319528483046, + "learning_rate": 1.5354186501936764e-05, + "loss": 0.9201, + "step": 2283 + }, + { + "epoch": 0.34, + "grad_norm": 1.420243089173775, + "learning_rate": 1.535010380334506e-05, + "loss": 0.7979, + "step": 2284 + }, + { + "epoch": 0.34, + "grad_norm": 1.7455501606322166, + "learning_rate": 1.5346019854968576e-05, + "loss": 0.9369, + "step": 2285 + }, + { + "epoch": 0.34, + "grad_norm": 1.4095648267675434, + "learning_rate": 1.534193465776132e-05, + "loss": 0.9682, + "step": 2286 + }, + { + "epoch": 0.34, + "grad_norm": 1.3625991844792567, + "learning_rate": 1.53378482126776e-05, + "loss": 0.9045, + "step": 2287 + }, + { + "epoch": 0.34, + "grad_norm": 1.2975952126331425, + "learning_rate": 1.5333760520672e-05, + "loss": 0.9706, + "step": 2288 + }, + { + "epoch": 0.34, + "grad_norm": 1.705059630390903, + "learning_rate": 1.532967158269941e-05, + "loss": 0.9164, + "step": 2289 + }, + { + "epoch": 0.34, + "grad_norm": 1.4009019764024053, + "learning_rate": 1.5325581399715013e-05, + "loss": 0.9911, + "step": 2290 + }, + { + "epoch": 0.34, + "grad_norm": 1.4617561861562665, + "learning_rate": 1.5321489972674268e-05, + "loss": 0.8413, + "step": 2291 + }, + { + "epoch": 0.34, + "grad_norm": 1.7649872965902604, + "learning_rate": 1.5317397302532933e-05, + "loss": 0.9461, + "step": 2292 + }, + { + "epoch": 0.34, + "grad_norm": 0.945462129313132, + "learning_rate": 1.5313303390247062e-05, + "loss": 0.325, + "step": 2293 + }, + { + "epoch": 0.34, + "grad_norm": 1.6442386849458337, + "learning_rate": 1.5309208236772988e-05, + "loss": 0.8823, + "step": 2294 + }, + { + "epoch": 0.34, + "grad_norm": 1.351108691206905, + "learning_rate": 1.5305111843067343e-05, + "loss": 0.8636, + "step": 2295 + }, + { + "epoch": 0.34, + "grad_norm": 1.629401278689014, + "learning_rate": 1.530101421008704e-05, + "loss": 0.8586, + "step": 2296 + }, + { + "epoch": 0.34, + "grad_norm": 1.5085386710431767, + "learning_rate": 1.529691533878929e-05, + "loss": 0.9112, + "step": 2297 + }, + { + "epoch": 0.34, + "grad_norm": 1.4307107546741473, + "learning_rate": 1.5292815230131592e-05, + "loss": 0.8613, + "step": 2298 + }, + { + "epoch": 0.34, + "grad_norm": 1.4314818482870362, + "learning_rate": 1.5288713885071723e-05, + "loss": 0.8195, + "step": 2299 + }, + { + "epoch": 0.34, + "grad_norm": 1.7662229928387017, + "learning_rate": 1.5284611304567772e-05, + "loss": 0.877, + "step": 2300 + }, + { + "epoch": 0.34, + "grad_norm": 1.5834178446475238, + "learning_rate": 1.5280507489578087e-05, + "loss": 0.852, + "step": 2301 + }, + { + "epoch": 0.34, + "grad_norm": 1.3132249371325437, + "learning_rate": 1.527640244106133e-05, + "loss": 0.8881, + "step": 2302 + }, + { + "epoch": 0.34, + "grad_norm": 1.3965578693444147, + "learning_rate": 1.5272296159976438e-05, + "loss": 0.9597, + "step": 2303 + }, + { + "epoch": 0.34, + "grad_norm": 1.4857132475717196, + "learning_rate": 1.5268188647282633e-05, + "loss": 0.8924, + "step": 2304 + }, + { + "epoch": 0.34, + "grad_norm": 1.8222442474419664, + "learning_rate": 1.5264079903939437e-05, + "loss": 0.8943, + "step": 2305 + }, + { + "epoch": 0.34, + "grad_norm": 1.552734078701159, + "learning_rate": 1.5259969930906653e-05, + "loss": 0.8836, + "step": 2306 + }, + { + "epoch": 0.34, + "grad_norm": 1.8007006927146774, + "learning_rate": 1.5255858729144368e-05, + "loss": 0.8999, + "step": 2307 + }, + { + "epoch": 0.34, + "grad_norm": 1.2328321360461432, + "learning_rate": 1.5251746299612959e-05, + "loss": 0.8642, + "step": 2308 + }, + { + "epoch": 0.34, + "grad_norm": 1.3393351052321958, + "learning_rate": 1.5247632643273092e-05, + "loss": 0.9278, + "step": 2309 + }, + { + "epoch": 0.34, + "grad_norm": 1.532450030571645, + "learning_rate": 1.5243517761085717e-05, + "loss": 0.8624, + "step": 2310 + }, + { + "epoch": 0.34, + "grad_norm": 1.696772369816577, + "learning_rate": 1.5239401654012072e-05, + "loss": 0.8041, + "step": 2311 + }, + { + "epoch": 0.34, + "grad_norm": 1.3992107034629708, + "learning_rate": 1.5235284323013674e-05, + "loss": 0.8638, + "step": 2312 + }, + { + "epoch": 0.35, + "grad_norm": 1.4534537125595492, + "learning_rate": 1.5231165769052343e-05, + "loss": 0.8273, + "step": 2313 + }, + { + "epoch": 0.35, + "grad_norm": 1.5868384381394676, + "learning_rate": 1.5227045993090164e-05, + "loss": 0.9711, + "step": 2314 + }, + { + "epoch": 0.35, + "grad_norm": 1.578135235441649, + "learning_rate": 1.5222924996089518e-05, + "loss": 0.9509, + "step": 2315 + }, + { + "epoch": 0.35, + "grad_norm": 1.5827739214502767, + "learning_rate": 1.5218802779013077e-05, + "loss": 0.962, + "step": 2316 + }, + { + "epoch": 0.35, + "grad_norm": 1.3964253733248453, + "learning_rate": 1.5214679342823786e-05, + "loss": 0.9139, + "step": 2317 + }, + { + "epoch": 0.35, + "grad_norm": 1.420897818103144, + "learning_rate": 1.521055468848488e-05, + "loss": 0.9577, + "step": 2318 + }, + { + "epoch": 0.35, + "grad_norm": 1.366209770458451, + "learning_rate": 1.520642881695988e-05, + "loss": 0.8727, + "step": 2319 + }, + { + "epoch": 0.35, + "grad_norm": 1.2955331050244765, + "learning_rate": 1.520230172921259e-05, + "loss": 0.8803, + "step": 2320 + }, + { + "epoch": 0.35, + "grad_norm": 1.4311093005368292, + "learning_rate": 1.5198173426207095e-05, + "loss": 0.9317, + "step": 2321 + }, + { + "epoch": 0.35, + "grad_norm": 1.4325823685380945, + "learning_rate": 1.5194043908907774e-05, + "loss": 0.8965, + "step": 2322 + }, + { + "epoch": 0.35, + "grad_norm": 1.3878217239573705, + "learning_rate": 1.5189913178279277e-05, + "loss": 0.8678, + "step": 2323 + }, + { + "epoch": 0.35, + "grad_norm": 1.6083570635997588, + "learning_rate": 1.5185781235286543e-05, + "loss": 0.9364, + "step": 2324 + }, + { + "epoch": 0.35, + "grad_norm": 1.7317265595725782, + "learning_rate": 1.5181648080894796e-05, + "loss": 0.8544, + "step": 2325 + }, + { + "epoch": 0.35, + "grad_norm": 1.3870363254643336, + "learning_rate": 1.5177513716069543e-05, + "loss": 0.8949, + "step": 2326 + }, + { + "epoch": 0.35, + "grad_norm": 1.4668381535325745, + "learning_rate": 1.5173378141776569e-05, + "loss": 0.8749, + "step": 2327 + }, + { + "epoch": 0.35, + "grad_norm": 0.9537819485161397, + "learning_rate": 1.5169241358981946e-05, + "loss": 0.3455, + "step": 2328 + }, + { + "epoch": 0.35, + "grad_norm": 1.4274521247339735, + "learning_rate": 1.5165103368652028e-05, + "loss": 0.8641, + "step": 2329 + }, + { + "epoch": 0.35, + "grad_norm": 1.6050826685124675, + "learning_rate": 1.5160964171753444e-05, + "loss": 0.863, + "step": 2330 + }, + { + "epoch": 0.35, + "grad_norm": 1.429337965600591, + "learning_rate": 1.515682376925312e-05, + "loss": 0.9224, + "step": 2331 + }, + { + "epoch": 0.35, + "grad_norm": 1.4831736873410963, + "learning_rate": 1.515268216211825e-05, + "loss": 0.8651, + "step": 2332 + }, + { + "epoch": 0.35, + "grad_norm": 1.3691947412067333, + "learning_rate": 1.514853935131631e-05, + "loss": 0.8515, + "step": 2333 + }, + { + "epoch": 0.35, + "grad_norm": 1.578314145480583, + "learning_rate": 1.5144395337815066e-05, + "loss": 0.8925, + "step": 2334 + }, + { + "epoch": 0.35, + "grad_norm": 1.3003209770357662, + "learning_rate": 1.514025012258256e-05, + "loss": 0.8414, + "step": 2335 + }, + { + "epoch": 0.35, + "grad_norm": 1.4716077443987046, + "learning_rate": 1.5136103706587111e-05, + "loss": 0.9372, + "step": 2336 + }, + { + "epoch": 0.35, + "grad_norm": 1.5880096315877341, + "learning_rate": 1.5131956090797326e-05, + "loss": 0.8109, + "step": 2337 + }, + { + "epoch": 0.35, + "grad_norm": 1.4644640773832716, + "learning_rate": 1.5127807276182084e-05, + "loss": 0.8915, + "step": 2338 + }, + { + "epoch": 0.35, + "grad_norm": 1.4236492161009207, + "learning_rate": 1.5123657263710558e-05, + "loss": 0.9772, + "step": 2339 + }, + { + "epoch": 0.35, + "grad_norm": 1.433436242163942, + "learning_rate": 1.5119506054352178e-05, + "loss": 0.8215, + "step": 2340 + }, + { + "epoch": 0.35, + "grad_norm": 1.6063318369015331, + "learning_rate": 1.5115353649076676e-05, + "loss": 0.8725, + "step": 2341 + }, + { + "epoch": 0.35, + "grad_norm": 1.839816332165306, + "learning_rate": 1.5111200048854055e-05, + "loss": 0.8826, + "step": 2342 + }, + { + "epoch": 0.35, + "grad_norm": 1.5216059868754872, + "learning_rate": 1.5107045254654588e-05, + "loss": 0.8669, + "step": 2343 + }, + { + "epoch": 0.35, + "grad_norm": 1.5300404609259486, + "learning_rate": 1.5102889267448846e-05, + "loss": 0.8356, + "step": 2344 + }, + { + "epoch": 0.35, + "grad_norm": 1.4313848808295826, + "learning_rate": 1.5098732088207662e-05, + "loss": 0.886, + "step": 2345 + }, + { + "epoch": 0.35, + "grad_norm": 1.5299634473431738, + "learning_rate": 1.509457371790215e-05, + "loss": 0.8882, + "step": 2346 + }, + { + "epoch": 0.35, + "grad_norm": 1.4963645801974077, + "learning_rate": 1.5090414157503715e-05, + "loss": 0.8834, + "step": 2347 + }, + { + "epoch": 0.35, + "grad_norm": 1.4424008056458453, + "learning_rate": 1.5086253407984024e-05, + "loss": 0.9216, + "step": 2348 + }, + { + "epoch": 0.35, + "grad_norm": 1.5844090382834033, + "learning_rate": 1.5082091470315028e-05, + "loss": 0.9112, + "step": 2349 + }, + { + "epoch": 0.35, + "grad_norm": 1.4526264421115531, + "learning_rate": 1.5077928345468959e-05, + "loss": 0.9302, + "step": 2350 + }, + { + "epoch": 0.35, + "grad_norm": 1.3418442052853163, + "learning_rate": 1.5073764034418326e-05, + "loss": 0.8916, + "step": 2351 + }, + { + "epoch": 0.35, + "grad_norm": 1.6891761376321324, + "learning_rate": 1.5069598538135905e-05, + "loss": 0.8041, + "step": 2352 + }, + { + "epoch": 0.35, + "grad_norm": 1.6325851389366492, + "learning_rate": 1.5065431857594762e-05, + "loss": 0.9762, + "step": 2353 + }, + { + "epoch": 0.35, + "grad_norm": 1.4238778885306893, + "learning_rate": 1.5061263993768234e-05, + "loss": 0.9239, + "step": 2354 + }, + { + "epoch": 0.35, + "grad_norm": 1.4156220777115123, + "learning_rate": 1.5057094947629928e-05, + "loss": 0.8043, + "step": 2355 + }, + { + "epoch": 0.35, + "grad_norm": 1.5654947364937417, + "learning_rate": 1.5052924720153743e-05, + "loss": 0.8883, + "step": 2356 + }, + { + "epoch": 0.35, + "grad_norm": 1.500895562527217, + "learning_rate": 1.504875331231384e-05, + "loss": 0.8911, + "step": 2357 + }, + { + "epoch": 0.35, + "grad_norm": 1.9238417436146753, + "learning_rate": 1.5044580725084659e-05, + "loss": 0.8234, + "step": 2358 + }, + { + "epoch": 0.35, + "grad_norm": 1.4575158021709844, + "learning_rate": 1.504040695944092e-05, + "loss": 0.8808, + "step": 2359 + }, + { + "epoch": 0.35, + "grad_norm": 1.5683316267511338, + "learning_rate": 1.503623201635761e-05, + "loss": 0.8476, + "step": 2360 + }, + { + "epoch": 0.35, + "grad_norm": 0.9450791116160474, + "learning_rate": 1.503205589681e-05, + "loss": 0.3525, + "step": 2361 + }, + { + "epoch": 0.35, + "grad_norm": 1.5042833864780634, + "learning_rate": 1.5027878601773633e-05, + "loss": 0.9452, + "step": 2362 + }, + { + "epoch": 0.35, + "grad_norm": 1.4131609923776498, + "learning_rate": 1.5023700132224321e-05, + "loss": 0.9178, + "step": 2363 + }, + { + "epoch": 0.35, + "grad_norm": 1.4386296226455535, + "learning_rate": 1.501952048913816e-05, + "loss": 0.9758, + "step": 2364 + }, + { + "epoch": 0.35, + "grad_norm": 1.6133745053380633, + "learning_rate": 1.5015339673491514e-05, + "loss": 0.9059, + "step": 2365 + }, + { + "epoch": 0.35, + "grad_norm": 1.4305133297514012, + "learning_rate": 1.5011157686261015e-05, + "loss": 0.9652, + "step": 2366 + }, + { + "epoch": 0.35, + "grad_norm": 1.3992892034589874, + "learning_rate": 1.5006974528423585e-05, + "loss": 0.8351, + "step": 2367 + }, + { + "epoch": 0.35, + "grad_norm": 1.4661697775481777, + "learning_rate": 1.5002790200956408e-05, + "loss": 0.957, + "step": 2368 + }, + { + "epoch": 0.35, + "grad_norm": 1.83615936348965, + "learning_rate": 1.4998604704836939e-05, + "loss": 0.8929, + "step": 2369 + }, + { + "epoch": 0.35, + "grad_norm": 1.4292429261452897, + "learning_rate": 1.4994418041042913e-05, + "loss": 0.8038, + "step": 2370 + }, + { + "epoch": 0.35, + "grad_norm": 1.5957900003809427, + "learning_rate": 1.499023021055234e-05, + "loss": 0.8624, + "step": 2371 + }, + { + "epoch": 0.35, + "grad_norm": 1.566093941659696, + "learning_rate": 1.4986041214343487e-05, + "loss": 0.8765, + "step": 2372 + }, + { + "epoch": 0.35, + "grad_norm": 1.4259109803843022, + "learning_rate": 1.498185105339491e-05, + "loss": 0.8846, + "step": 2373 + }, + { + "epoch": 0.35, + "grad_norm": 1.331796638042093, + "learning_rate": 1.4977659728685435e-05, + "loss": 0.8723, + "step": 2374 + }, + { + "epoch": 0.35, + "grad_norm": 1.5317344057461537, + "learning_rate": 1.4973467241194146e-05, + "loss": 0.8441, + "step": 2375 + }, + { + "epoch": 0.35, + "grad_norm": 1.4247032159462272, + "learning_rate": 1.4969273591900415e-05, + "loss": 0.8656, + "step": 2376 + }, + { + "epoch": 0.35, + "grad_norm": 1.4195801482441541, + "learning_rate": 1.4965078781783882e-05, + "loss": 0.8739, + "step": 2377 + }, + { + "epoch": 0.35, + "grad_norm": 1.5502697740156313, + "learning_rate": 1.4960882811824446e-05, + "loss": 0.8784, + "step": 2378 + }, + { + "epoch": 0.35, + "grad_norm": 1.4750552652752638, + "learning_rate": 1.4956685683002292e-05, + "loss": 0.9735, + "step": 2379 + }, + { + "epoch": 0.36, + "grad_norm": 1.522049382723157, + "learning_rate": 1.4952487396297872e-05, + "loss": 0.8709, + "step": 2380 + }, + { + "epoch": 0.36, + "grad_norm": 1.4467281355070856, + "learning_rate": 1.49482879526919e-05, + "loss": 0.9195, + "step": 2381 + }, + { + "epoch": 0.36, + "grad_norm": 0.9023408022745709, + "learning_rate": 1.494408735316537e-05, + "loss": 0.3312, + "step": 2382 + }, + { + "epoch": 0.36, + "grad_norm": 1.4877313363881013, + "learning_rate": 1.493988559869954e-05, + "loss": 0.8392, + "step": 2383 + }, + { + "epoch": 0.36, + "grad_norm": 1.3222375779438422, + "learning_rate": 1.4935682690275945e-05, + "loss": 0.8781, + "step": 2384 + }, + { + "epoch": 0.36, + "grad_norm": 0.9539362745691424, + "learning_rate": 1.4931478628876382e-05, + "loss": 0.357, + "step": 2385 + }, + { + "epoch": 0.36, + "grad_norm": 1.5534801347691423, + "learning_rate": 1.4927273415482916e-05, + "loss": 0.8494, + "step": 2386 + }, + { + "epoch": 0.36, + "grad_norm": 1.5393226604550732, + "learning_rate": 1.4923067051077893e-05, + "loss": 0.8438, + "step": 2387 + }, + { + "epoch": 0.36, + "grad_norm": 1.2562688905863733, + "learning_rate": 1.4918859536643915e-05, + "loss": 0.8121, + "step": 2388 + }, + { + "epoch": 0.36, + "grad_norm": 1.4484418447155691, + "learning_rate": 1.491465087316386e-05, + "loss": 0.8762, + "step": 2389 + }, + { + "epoch": 0.36, + "grad_norm": 1.6561482033845658, + "learning_rate": 1.4910441061620872e-05, + "loss": 0.8421, + "step": 2390 + }, + { + "epoch": 0.36, + "grad_norm": 1.1526764033938166, + "learning_rate": 1.490623010299836e-05, + "loss": 0.9026, + "step": 2391 + }, + { + "epoch": 0.36, + "grad_norm": 2.001240085062583, + "learning_rate": 1.490201799828001e-05, + "loss": 0.8276, + "step": 2392 + }, + { + "epoch": 0.36, + "grad_norm": 1.6877704895867491, + "learning_rate": 1.4897804748449767e-05, + "loss": 0.8119, + "step": 2393 + }, + { + "epoch": 0.36, + "grad_norm": 1.349063557279118, + "learning_rate": 1.4893590354491845e-05, + "loss": 0.8973, + "step": 2394 + }, + { + "epoch": 0.36, + "grad_norm": 1.3497819721141477, + "learning_rate": 1.488937481739073e-05, + "loss": 0.8139, + "step": 2395 + }, + { + "epoch": 0.36, + "grad_norm": 1.427016609449477, + "learning_rate": 1.4885158138131171e-05, + "loss": 0.9004, + "step": 2396 + }, + { + "epoch": 0.36, + "grad_norm": 1.6930834728031425, + "learning_rate": 1.4880940317698182e-05, + "loss": 0.8265, + "step": 2397 + }, + { + "epoch": 0.36, + "grad_norm": 1.5074730513249242, + "learning_rate": 1.4876721357077051e-05, + "loss": 0.9503, + "step": 2398 + }, + { + "epoch": 0.36, + "grad_norm": 1.4632392717400273, + "learning_rate": 1.4872501257253325e-05, + "loss": 0.8291, + "step": 2399 + }, + { + "epoch": 0.36, + "grad_norm": 1.5221970631070543, + "learning_rate": 1.486828001921282e-05, + "loss": 0.96, + "step": 2400 + }, + { + "epoch": 0.36, + "grad_norm": 1.4653147169219638, + "learning_rate": 1.4864057643941616e-05, + "loss": 0.8475, + "step": 2401 + }, + { + "epoch": 0.36, + "grad_norm": 1.2854470399396005, + "learning_rate": 1.485983413242606e-05, + "loss": 0.8813, + "step": 2402 + }, + { + "epoch": 0.36, + "grad_norm": 1.6584598356029185, + "learning_rate": 1.4855609485652771e-05, + "loss": 0.846, + "step": 2403 + }, + { + "epoch": 0.36, + "grad_norm": 1.3443065865674046, + "learning_rate": 1.4851383704608621e-05, + "loss": 0.9016, + "step": 2404 + }, + { + "epoch": 0.36, + "grad_norm": 1.4248373800478973, + "learning_rate": 1.4847156790280753e-05, + "loss": 0.8781, + "step": 2405 + }, + { + "epoch": 0.36, + "grad_norm": 1.6376030252757539, + "learning_rate": 1.4842928743656577e-05, + "loss": 0.8687, + "step": 2406 + }, + { + "epoch": 0.36, + "grad_norm": 1.440488834136608, + "learning_rate": 1.4838699565723764e-05, + "loss": 0.8349, + "step": 2407 + }, + { + "epoch": 0.36, + "grad_norm": 1.389871529118748, + "learning_rate": 1.483446925747025e-05, + "loss": 0.8012, + "step": 2408 + }, + { + "epoch": 0.36, + "grad_norm": 1.4600342212260655, + "learning_rate": 1.4830237819884237e-05, + "loss": 0.9294, + "step": 2409 + }, + { + "epoch": 0.36, + "grad_norm": 1.5131935483661065, + "learning_rate": 1.4826005253954185e-05, + "loss": 0.8992, + "step": 2410 + }, + { + "epoch": 0.36, + "grad_norm": 1.4004860423845367, + "learning_rate": 1.4821771560668828e-05, + "loss": 0.8442, + "step": 2411 + }, + { + "epoch": 0.36, + "grad_norm": 1.4613737835668572, + "learning_rate": 1.4817536741017153e-05, + "loss": 0.8467, + "step": 2412 + }, + { + "epoch": 0.36, + "grad_norm": 1.4061947401103583, + "learning_rate": 1.4813300795988417e-05, + "loss": 0.8241, + "step": 2413 + }, + { + "epoch": 0.36, + "grad_norm": 1.2683224601296244, + "learning_rate": 1.4809063726572132e-05, + "loss": 0.8952, + "step": 2414 + }, + { + "epoch": 0.36, + "grad_norm": 1.5535422652571538, + "learning_rate": 1.4804825533758082e-05, + "loss": 0.9388, + "step": 2415 + }, + { + "epoch": 0.36, + "grad_norm": 1.6076200230999473, + "learning_rate": 1.4800586218536315e-05, + "loss": 0.9591, + "step": 2416 + }, + { + "epoch": 0.36, + "grad_norm": 1.400041287235936, + "learning_rate": 1.479634578189712e-05, + "loss": 0.8987, + "step": 2417 + }, + { + "epoch": 0.36, + "grad_norm": 1.377021043874476, + "learning_rate": 1.4792104224831079e-05, + "loss": 0.9753, + "step": 2418 + }, + { + "epoch": 0.36, + "grad_norm": 1.335754231380377, + "learning_rate": 1.4787861548329012e-05, + "loss": 0.89, + "step": 2419 + }, + { + "epoch": 0.36, + "grad_norm": 1.5095214720356913, + "learning_rate": 1.478361775338201e-05, + "loss": 0.932, + "step": 2420 + }, + { + "epoch": 0.36, + "grad_norm": 1.3076321598873812, + "learning_rate": 1.4779372840981425e-05, + "loss": 0.956, + "step": 2421 + }, + { + "epoch": 0.36, + "grad_norm": 1.369567585488684, + "learning_rate": 1.4775126812118865e-05, + "loss": 0.8672, + "step": 2422 + }, + { + "epoch": 0.36, + "grad_norm": 1.804175563342299, + "learning_rate": 1.477087966778621e-05, + "loss": 0.8879, + "step": 2423 + }, + { + "epoch": 0.36, + "grad_norm": 1.689058301798696, + "learning_rate": 1.4766631408975586e-05, + "loss": 0.8523, + "step": 2424 + }, + { + "epoch": 0.36, + "grad_norm": 1.312661117076535, + "learning_rate": 1.4762382036679393e-05, + "loss": 0.8567, + "step": 2425 + }, + { + "epoch": 0.36, + "grad_norm": 1.3588466412209272, + "learning_rate": 1.475813155189028e-05, + "loss": 0.8619, + "step": 2426 + }, + { + "epoch": 0.36, + "grad_norm": 1.3263685508280627, + "learning_rate": 1.4753879955601162e-05, + "loss": 0.9336, + "step": 2427 + }, + { + "epoch": 0.36, + "grad_norm": 1.456260335945933, + "learning_rate": 1.4749627248805214e-05, + "loss": 0.8655, + "step": 2428 + }, + { + "epoch": 0.36, + "grad_norm": 1.6345935181748061, + "learning_rate": 1.4745373432495869e-05, + "loss": 0.8579, + "step": 2429 + }, + { + "epoch": 0.36, + "grad_norm": 1.63078982838802, + "learning_rate": 1.4741118507666815e-05, + "loss": 0.9137, + "step": 2430 + }, + { + "epoch": 0.36, + "grad_norm": 1.5178031392493103, + "learning_rate": 1.4736862475312005e-05, + "loss": 0.9154, + "step": 2431 + }, + { + "epoch": 0.36, + "grad_norm": 1.6975881382058196, + "learning_rate": 1.4732605336425651e-05, + "loss": 0.9785, + "step": 2432 + }, + { + "epoch": 0.36, + "grad_norm": 1.4917404999460437, + "learning_rate": 1.4728347092002218e-05, + "loss": 0.8574, + "step": 2433 + }, + { + "epoch": 0.36, + "grad_norm": 1.430996562609836, + "learning_rate": 1.4724087743036432e-05, + "loss": 0.8788, + "step": 2434 + }, + { + "epoch": 0.36, + "grad_norm": 1.4830372818362922, + "learning_rate": 1.4719827290523281e-05, + "loss": 0.8599, + "step": 2435 + }, + { + "epoch": 0.36, + "grad_norm": 1.4356241257311562, + "learning_rate": 1.4715565735458004e-05, + "loss": 0.8991, + "step": 2436 + }, + { + "epoch": 0.36, + "grad_norm": 1.4217908102042733, + "learning_rate": 1.4711303078836098e-05, + "loss": 0.8859, + "step": 2437 + }, + { + "epoch": 0.36, + "grad_norm": 1.4335111469512989, + "learning_rate": 1.470703932165333e-05, + "loss": 0.8343, + "step": 2438 + }, + { + "epoch": 0.36, + "grad_norm": 1.4389369322442183, + "learning_rate": 1.4702774464905703e-05, + "loss": 0.8744, + "step": 2439 + }, + { + "epoch": 0.36, + "grad_norm": 1.4257248383950092, + "learning_rate": 1.469850850958949e-05, + "loss": 0.867, + "step": 2440 + }, + { + "epoch": 0.36, + "grad_norm": 1.489892645435096, + "learning_rate": 1.4694241456701227e-05, + "loss": 0.8774, + "step": 2441 + }, + { + "epoch": 0.36, + "grad_norm": 1.2522499599960846, + "learning_rate": 1.4689973307237687e-05, + "loss": 0.836, + "step": 2442 + }, + { + "epoch": 0.36, + "grad_norm": 1.552581436494605, + "learning_rate": 1.468570406219592e-05, + "loss": 0.8857, + "step": 2443 + }, + { + "epoch": 0.36, + "grad_norm": 1.9034813464894758, + "learning_rate": 1.4681433722573212e-05, + "loss": 0.8745, + "step": 2444 + }, + { + "epoch": 0.36, + "grad_norm": 1.7042494838096385, + "learning_rate": 1.4677162289367126e-05, + "loss": 0.8993, + "step": 2445 + }, + { + "epoch": 0.36, + "grad_norm": 1.0272032107655216, + "learning_rate": 1.467288976357546e-05, + "loss": 0.3507, + "step": 2446 + }, + { + "epoch": 0.37, + "grad_norm": 1.3359220640506526, + "learning_rate": 1.466861614619628e-05, + "loss": 0.8886, + "step": 2447 + }, + { + "epoch": 0.37, + "grad_norm": 1.427324392080972, + "learning_rate": 1.4664341438227903e-05, + "loss": 0.8058, + "step": 2448 + }, + { + "epoch": 0.37, + "grad_norm": 1.654523611012364, + "learning_rate": 1.46600656406689e-05, + "loss": 0.8601, + "step": 2449 + }, + { + "epoch": 0.37, + "grad_norm": 1.4648545995946267, + "learning_rate": 1.4655788754518101e-05, + "loss": 0.7682, + "step": 2450 + }, + { + "epoch": 0.37, + "grad_norm": 0.8865049667434419, + "learning_rate": 1.4651510780774585e-05, + "loss": 0.3383, + "step": 2451 + }, + { + "epoch": 0.37, + "grad_norm": 1.3405881105529465, + "learning_rate": 1.4647231720437687e-05, + "loss": 0.9416, + "step": 2452 + }, + { + "epoch": 0.37, + "grad_norm": 1.4222131755338903, + "learning_rate": 1.4642951574506996e-05, + "loss": 0.8263, + "step": 2453 + }, + { + "epoch": 0.37, + "grad_norm": 0.8321441396951367, + "learning_rate": 1.4638670343982356e-05, + "loss": 0.3209, + "step": 2454 + }, + { + "epoch": 0.37, + "grad_norm": 1.392619106979098, + "learning_rate": 1.463438802986386e-05, + "loss": 0.833, + "step": 2455 + }, + { + "epoch": 0.37, + "grad_norm": 1.4416212994695166, + "learning_rate": 1.4630104633151858e-05, + "loss": 0.8464, + "step": 2456 + }, + { + "epoch": 0.37, + "grad_norm": 0.8940930834255698, + "learning_rate": 1.4625820154846953e-05, + "loss": 0.3257, + "step": 2457 + }, + { + "epoch": 0.37, + "grad_norm": 1.4518221858725542, + "learning_rate": 1.462153459595e-05, + "loss": 0.7631, + "step": 2458 + }, + { + "epoch": 0.37, + "grad_norm": 1.297559268233778, + "learning_rate": 1.4617247957462105e-05, + "loss": 0.8622, + "step": 2459 + }, + { + "epoch": 0.37, + "grad_norm": 1.5874471655167313, + "learning_rate": 1.4612960240384624e-05, + "loss": 0.8689, + "step": 2460 + }, + { + "epoch": 0.37, + "grad_norm": 1.765896841660207, + "learning_rate": 1.4608671445719176e-05, + "loss": 0.9403, + "step": 2461 + }, + { + "epoch": 0.37, + "grad_norm": 1.3266408678098056, + "learning_rate": 1.4604381574467616e-05, + "loss": 0.9079, + "step": 2462 + }, + { + "epoch": 0.37, + "grad_norm": 0.9644536893054212, + "learning_rate": 1.4600090627632061e-05, + "loss": 0.3515, + "step": 2463 + }, + { + "epoch": 0.37, + "grad_norm": 1.5591183272000826, + "learning_rate": 1.4595798606214882e-05, + "loss": 0.8543, + "step": 2464 + }, + { + "epoch": 0.37, + "grad_norm": 0.9083395414548567, + "learning_rate": 1.4591505511218685e-05, + "loss": 0.3563, + "step": 2465 + }, + { + "epoch": 0.37, + "grad_norm": 1.5305677409675236, + "learning_rate": 1.4587211343646345e-05, + "loss": 0.8811, + "step": 2466 + }, + { + "epoch": 0.37, + "grad_norm": 1.3240987503909145, + "learning_rate": 1.4582916104500977e-05, + "loss": 0.9405, + "step": 2467 + }, + { + "epoch": 0.37, + "grad_norm": 1.214697916471032, + "learning_rate": 1.4578619794785956e-05, + "loss": 0.9009, + "step": 2468 + }, + { + "epoch": 0.37, + "grad_norm": 1.4146701537207016, + "learning_rate": 1.457432241550489e-05, + "loss": 0.8477, + "step": 2469 + }, + { + "epoch": 0.37, + "grad_norm": 1.4341981273880275, + "learning_rate": 1.4570023967661651e-05, + "loss": 0.8904, + "step": 2470 + }, + { + "epoch": 0.37, + "grad_norm": 1.5617096666431618, + "learning_rate": 1.4565724452260361e-05, + "loss": 0.9562, + "step": 2471 + }, + { + "epoch": 0.37, + "grad_norm": 1.2771080496656244, + "learning_rate": 1.4561423870305383e-05, + "loss": 0.8642, + "step": 2472 + }, + { + "epoch": 0.37, + "grad_norm": 1.4269934224052399, + "learning_rate": 1.4557122222801332e-05, + "loss": 0.8667, + "step": 2473 + }, + { + "epoch": 0.37, + "grad_norm": 1.6239354579443843, + "learning_rate": 1.455281951075308e-05, + "loss": 0.8982, + "step": 2474 + }, + { + "epoch": 0.37, + "grad_norm": 1.2853315938161574, + "learning_rate": 1.4548515735165735e-05, + "loss": 0.8794, + "step": 2475 + }, + { + "epoch": 0.37, + "grad_norm": 1.4950762431034248, + "learning_rate": 1.4544210897044662e-05, + "loss": 0.8418, + "step": 2476 + }, + { + "epoch": 0.37, + "grad_norm": 1.5800141497228144, + "learning_rate": 1.4539904997395468e-05, + "loss": 0.922, + "step": 2477 + }, + { + "epoch": 0.37, + "grad_norm": 1.6768337114826615, + "learning_rate": 1.4535598037224016e-05, + "loss": 0.9138, + "step": 2478 + }, + { + "epoch": 0.37, + "grad_norm": 1.4183248717676997, + "learning_rate": 1.453129001753641e-05, + "loss": 0.8768, + "step": 2479 + }, + { + "epoch": 0.37, + "grad_norm": 1.2319062328588768, + "learning_rate": 1.4526980939339008e-05, + "loss": 0.8361, + "step": 2480 + }, + { + "epoch": 0.37, + "grad_norm": 1.4220273760221085, + "learning_rate": 1.4522670803638403e-05, + "loss": 0.8686, + "step": 2481 + }, + { + "epoch": 0.37, + "grad_norm": 1.5702338495533112, + "learning_rate": 1.4518359611441452e-05, + "loss": 0.8722, + "step": 2482 + }, + { + "epoch": 0.37, + "grad_norm": 1.5212147942761973, + "learning_rate": 1.4514047363755244e-05, + "loss": 0.9242, + "step": 2483 + }, + { + "epoch": 0.37, + "grad_norm": 1.619409485916836, + "learning_rate": 1.450973406158712e-05, + "loss": 0.937, + "step": 2484 + }, + { + "epoch": 0.37, + "grad_norm": 1.4174108984231562, + "learning_rate": 1.4505419705944672e-05, + "loss": 0.9332, + "step": 2485 + }, + { + "epoch": 0.37, + "grad_norm": 1.5244552049577198, + "learning_rate": 1.450110429783573e-05, + "loss": 0.8876, + "step": 2486 + }, + { + "epoch": 0.37, + "grad_norm": 1.001233135928483, + "learning_rate": 1.4496787838268378e-05, + "loss": 0.3707, + "step": 2487 + }, + { + "epoch": 0.37, + "grad_norm": 1.3222148961601148, + "learning_rate": 1.4492470328250937e-05, + "loss": 0.9267, + "step": 2488 + }, + { + "epoch": 0.37, + "grad_norm": 1.321131517811123, + "learning_rate": 1.4488151768791981e-05, + "loss": 0.8573, + "step": 2489 + }, + { + "epoch": 0.37, + "grad_norm": 1.3838353273818826, + "learning_rate": 1.4483832160900326e-05, + "loss": 0.7848, + "step": 2490 + }, + { + "epoch": 0.37, + "grad_norm": 1.5302241371602243, + "learning_rate": 1.447951150558503e-05, + "loss": 0.8495, + "step": 2491 + }, + { + "epoch": 0.37, + "grad_norm": 1.383114049112275, + "learning_rate": 1.4475189803855399e-05, + "loss": 0.8219, + "step": 2492 + }, + { + "epoch": 0.37, + "grad_norm": 1.379191325250921, + "learning_rate": 1.4470867056720986e-05, + "loss": 0.8633, + "step": 2493 + }, + { + "epoch": 0.37, + "grad_norm": 1.374261044533996, + "learning_rate": 1.4466543265191581e-05, + "loss": 0.8753, + "step": 2494 + }, + { + "epoch": 0.37, + "grad_norm": 1.5698929814633877, + "learning_rate": 1.4462218430277224e-05, + "loss": 0.9101, + "step": 2495 + }, + { + "epoch": 0.37, + "grad_norm": 1.754659180994369, + "learning_rate": 1.44578925529882e-05, + "loss": 0.8941, + "step": 2496 + }, + { + "epoch": 0.37, + "grad_norm": 1.3555804118684756, + "learning_rate": 1.445356563433503e-05, + "loss": 0.8446, + "step": 2497 + }, + { + "epoch": 0.37, + "grad_norm": 1.5655428107230935, + "learning_rate": 1.4449237675328483e-05, + "loss": 0.9254, + "step": 2498 + }, + { + "epoch": 0.37, + "grad_norm": 1.4179944024248319, + "learning_rate": 1.4444908676979572e-05, + "loss": 0.7922, + "step": 2499 + }, + { + "epoch": 0.37, + "grad_norm": 1.4306349878318732, + "learning_rate": 1.4440578640299554e-05, + "loss": 0.9124, + "step": 2500 + }, + { + "epoch": 0.37, + "grad_norm": 1.4057487375892963, + "learning_rate": 1.443624756629992e-05, + "loss": 0.7937, + "step": 2501 + }, + { + "epoch": 0.37, + "grad_norm": 0.959703629237683, + "learning_rate": 1.4431915455992416e-05, + "loss": 0.3069, + "step": 2502 + }, + { + "epoch": 0.37, + "grad_norm": 1.3794660485556631, + "learning_rate": 1.442758231038902e-05, + "loss": 0.878, + "step": 2503 + }, + { + "epoch": 0.37, + "grad_norm": 1.615842428578242, + "learning_rate": 1.442324813050195e-05, + "loss": 0.9043, + "step": 2504 + }, + { + "epoch": 0.37, + "grad_norm": 1.4424541974853513, + "learning_rate": 1.441891291734368e-05, + "loss": 0.8685, + "step": 2505 + }, + { + "epoch": 0.37, + "grad_norm": 0.8013155439881876, + "learning_rate": 1.4414576671926912e-05, + "loss": 0.3216, + "step": 2506 + }, + { + "epoch": 0.37, + "grad_norm": 1.7248805028446959, + "learning_rate": 1.4410239395264594e-05, + "loss": 0.8993, + "step": 2507 + }, + { + "epoch": 0.37, + "grad_norm": 1.6101262083541032, + "learning_rate": 1.4405901088369912e-05, + "loss": 0.852, + "step": 2508 + }, + { + "epoch": 0.37, + "grad_norm": 1.40907005430812, + "learning_rate": 1.4401561752256299e-05, + "loss": 0.9285, + "step": 2509 + }, + { + "epoch": 0.37, + "grad_norm": 1.4433310083949351, + "learning_rate": 1.4397221387937421e-05, + "loss": 0.9448, + "step": 2510 + }, + { + "epoch": 0.37, + "grad_norm": 1.464338085634876, + "learning_rate": 1.4392879996427187e-05, + "loss": 0.8269, + "step": 2511 + }, + { + "epoch": 0.37, + "grad_norm": 1.4392826537091399, + "learning_rate": 1.438853757873975e-05, + "loss": 0.8649, + "step": 2512 + }, + { + "epoch": 0.37, + "grad_norm": 1.3955350783763023, + "learning_rate": 1.43841941358895e-05, + "loss": 0.9145, + "step": 2513 + }, + { + "epoch": 0.38, + "grad_norm": 1.47521857063144, + "learning_rate": 1.437984966889106e-05, + "loss": 0.9298, + "step": 2514 + }, + { + "epoch": 0.38, + "grad_norm": 1.637925496356776, + "learning_rate": 1.4375504178759301e-05, + "loss": 0.8291, + "step": 2515 + }, + { + "epoch": 0.38, + "grad_norm": 1.3408259942475451, + "learning_rate": 1.437115766650933e-05, + "loss": 0.9389, + "step": 2516 + }, + { + "epoch": 0.38, + "grad_norm": 1.556762069550069, + "learning_rate": 1.4366810133156495e-05, + "loss": 0.8537, + "step": 2517 + }, + { + "epoch": 0.38, + "grad_norm": 1.7535168017856988, + "learning_rate": 1.4362461579716373e-05, + "loss": 0.8531, + "step": 2518 + }, + { + "epoch": 0.38, + "grad_norm": 1.2898085147763103, + "learning_rate": 1.4358112007204796e-05, + "loss": 0.8193, + "step": 2519 + }, + { + "epoch": 0.38, + "grad_norm": 1.4160184005183938, + "learning_rate": 1.4353761416637819e-05, + "loss": 0.9242, + "step": 2520 + }, + { + "epoch": 0.38, + "grad_norm": 1.4505937992117637, + "learning_rate": 1.434940980903174e-05, + "loss": 0.8706, + "step": 2521 + }, + { + "epoch": 0.38, + "grad_norm": 1.3897966418449819, + "learning_rate": 1.43450571854031e-05, + "loss": 0.9258, + "step": 2522 + }, + { + "epoch": 0.38, + "grad_norm": 1.449961250374203, + "learning_rate": 1.4340703546768664e-05, + "loss": 0.9013, + "step": 2523 + }, + { + "epoch": 0.38, + "grad_norm": 1.3299021569474143, + "learning_rate": 1.4336348894145448e-05, + "loss": 0.8777, + "step": 2524 + }, + { + "epoch": 0.38, + "grad_norm": 1.3679749891181332, + "learning_rate": 1.43319932285507e-05, + "loss": 0.8531, + "step": 2525 + }, + { + "epoch": 0.38, + "grad_norm": 1.225261738820702, + "learning_rate": 1.4327636551001902e-05, + "loss": 0.9164, + "step": 2526 + }, + { + "epoch": 0.38, + "grad_norm": 1.4582549720527263, + "learning_rate": 1.4323278862516774e-05, + "loss": 0.9147, + "step": 2527 + }, + { + "epoch": 0.38, + "grad_norm": 1.3273892573626784, + "learning_rate": 1.431892016411327e-05, + "loss": 0.812, + "step": 2528 + }, + { + "epoch": 0.38, + "grad_norm": 1.5271798384926805, + "learning_rate": 1.4314560456809592e-05, + "loss": 0.8963, + "step": 2529 + }, + { + "epoch": 0.38, + "grad_norm": 1.4835462082220499, + "learning_rate": 1.4310199741624157e-05, + "loss": 0.9723, + "step": 2530 + }, + { + "epoch": 0.38, + "grad_norm": 1.5159125964632234, + "learning_rate": 1.4305838019575633e-05, + "loss": 0.9049, + "step": 2531 + }, + { + "epoch": 0.38, + "grad_norm": 1.4289726661630802, + "learning_rate": 1.430147529168292e-05, + "loss": 0.8788, + "step": 2532 + }, + { + "epoch": 0.38, + "grad_norm": 1.4576685281376076, + "learning_rate": 1.429711155896515e-05, + "loss": 0.961, + "step": 2533 + }, + { + "epoch": 0.38, + "grad_norm": 1.4067202397745056, + "learning_rate": 1.4292746822441692e-05, + "loss": 0.9013, + "step": 2534 + }, + { + "epoch": 0.38, + "grad_norm": 1.4856733654476078, + "learning_rate": 1.4288381083132149e-05, + "loss": 0.8861, + "step": 2535 + }, + { + "epoch": 0.38, + "grad_norm": 1.3843936070726652, + "learning_rate": 1.4284014342056355e-05, + "loss": 0.8702, + "step": 2536 + }, + { + "epoch": 0.38, + "grad_norm": 1.4576533423873292, + "learning_rate": 1.4279646600234388e-05, + "loss": 0.7968, + "step": 2537 + }, + { + "epoch": 0.38, + "grad_norm": 1.2553881477494369, + "learning_rate": 1.4275277858686548e-05, + "loss": 0.8991, + "step": 2538 + }, + { + "epoch": 0.38, + "grad_norm": 1.5183692303713805, + "learning_rate": 1.4270908118433371e-05, + "loss": 0.9356, + "step": 2539 + }, + { + "epoch": 0.38, + "grad_norm": 1.5235294558820247, + "learning_rate": 1.4266537380495634e-05, + "loss": 0.8765, + "step": 2540 + }, + { + "epoch": 0.38, + "grad_norm": 1.4493190018662951, + "learning_rate": 1.4262165645894342e-05, + "loss": 0.9116, + "step": 2541 + }, + { + "epoch": 0.38, + "grad_norm": 1.3218951946039377, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.9227, + "step": 2542 + }, + { + "epoch": 0.38, + "grad_norm": 1.440647004108404, + "learning_rate": 1.4253419190786264e-05, + "loss": 0.8283, + "step": 2543 + }, + { + "epoch": 0.38, + "grad_norm": 1.6219193826420353, + "learning_rate": 1.4249044472322657e-05, + "loss": 0.9475, + "step": 2544 + }, + { + "epoch": 0.38, + "grad_norm": 1.6038560622084053, + "learning_rate": 1.4244668761281834e-05, + "loss": 0.9073, + "step": 2545 + }, + { + "epoch": 0.38, + "grad_norm": 1.416289752865661, + "learning_rate": 1.424029205868597e-05, + "loss": 0.9425, + "step": 2546 + }, + { + "epoch": 0.38, + "grad_norm": 1.5738836681992094, + "learning_rate": 1.4235914365557455e-05, + "loss": 0.8851, + "step": 2547 + }, + { + "epoch": 0.38, + "grad_norm": 1.5041053702079985, + "learning_rate": 1.4231535682918923e-05, + "loss": 0.8746, + "step": 2548 + }, + { + "epoch": 0.38, + "grad_norm": 1.4937466239043116, + "learning_rate": 1.4227156011793234e-05, + "loss": 1.0052, + "step": 2549 + }, + { + "epoch": 0.38, + "grad_norm": 1.41035156006992, + "learning_rate": 1.422277535320348e-05, + "loss": 0.856, + "step": 2550 + }, + { + "epoch": 0.38, + "grad_norm": 1.4068458526592393, + "learning_rate": 1.4218393708172982e-05, + "loss": 0.8929, + "step": 2551 + }, + { + "epoch": 0.38, + "grad_norm": 1.59927223481385, + "learning_rate": 1.4214011077725293e-05, + "loss": 0.8949, + "step": 2552 + }, + { + "epoch": 0.38, + "grad_norm": 1.6144970233726026, + "learning_rate": 1.4209627462884198e-05, + "loss": 0.8937, + "step": 2553 + }, + { + "epoch": 0.38, + "grad_norm": 1.5561086901696273, + "learning_rate": 1.4205242864673707e-05, + "loss": 0.8422, + "step": 2554 + }, + { + "epoch": 0.38, + "grad_norm": 1.5266038359752994, + "learning_rate": 1.4200857284118067e-05, + "loss": 0.8968, + "step": 2555 + }, + { + "epoch": 0.38, + "grad_norm": 1.3990174674567508, + "learning_rate": 1.4196470722241741e-05, + "loss": 0.8887, + "step": 2556 + }, + { + "epoch": 0.38, + "grad_norm": 1.515273129527405, + "learning_rate": 1.4192083180069441e-05, + "loss": 0.8738, + "step": 2557 + }, + { + "epoch": 0.38, + "grad_norm": 1.4476279430689292, + "learning_rate": 1.4187694658626092e-05, + "loss": 0.9118, + "step": 2558 + }, + { + "epoch": 0.38, + "grad_norm": 1.4417288956678438, + "learning_rate": 1.4183305158936851e-05, + "loss": 0.9134, + "step": 2559 + }, + { + "epoch": 0.38, + "grad_norm": 1.6423951703669446, + "learning_rate": 1.417891468202711e-05, + "loss": 0.9373, + "step": 2560 + }, + { + "epoch": 0.38, + "grad_norm": 1.3216423385662226, + "learning_rate": 1.4174523228922486e-05, + "loss": 0.8462, + "step": 2561 + }, + { + "epoch": 0.38, + "grad_norm": 1.5985498727245373, + "learning_rate": 1.4170130800648814e-05, + "loss": 0.9322, + "step": 2562 + }, + { + "epoch": 0.38, + "grad_norm": 1.3634116341344829, + "learning_rate": 1.4165737398232173e-05, + "loss": 0.8917, + "step": 2563 + }, + { + "epoch": 0.38, + "grad_norm": 1.332052343235081, + "learning_rate": 1.4161343022698864e-05, + "loss": 0.8639, + "step": 2564 + }, + { + "epoch": 0.38, + "grad_norm": 1.3308551013173646, + "learning_rate": 1.4156947675075408e-05, + "loss": 0.9535, + "step": 2565 + }, + { + "epoch": 0.38, + "grad_norm": 1.403820455749816, + "learning_rate": 1.415255135638856e-05, + "loss": 0.8465, + "step": 2566 + }, + { + "epoch": 0.38, + "grad_norm": 1.466444409991185, + "learning_rate": 1.4148154067665305e-05, + "loss": 0.9115, + "step": 2567 + }, + { + "epoch": 0.38, + "grad_norm": 1.3660410522513904, + "learning_rate": 1.4143755809932843e-05, + "loss": 0.8438, + "step": 2568 + }, + { + "epoch": 0.38, + "grad_norm": 1.5002762631980608, + "learning_rate": 1.4139356584218614e-05, + "loss": 0.9316, + "step": 2569 + }, + { + "epoch": 0.38, + "grad_norm": 1.4724129386017795, + "learning_rate": 1.4134956391550269e-05, + "loss": 0.898, + "step": 2570 + }, + { + "epoch": 0.38, + "grad_norm": 1.5837350430380717, + "learning_rate": 1.4130555232955706e-05, + "loss": 0.8268, + "step": 2571 + }, + { + "epoch": 0.38, + "grad_norm": 1.628259001926628, + "learning_rate": 1.4126153109463025e-05, + "loss": 0.8868, + "step": 2572 + }, + { + "epoch": 0.38, + "grad_norm": 1.4678294792314015, + "learning_rate": 1.4121750022100566e-05, + "loss": 0.848, + "step": 2573 + }, + { + "epoch": 0.38, + "grad_norm": 1.517896945215126, + "learning_rate": 1.4117345971896894e-05, + "loss": 0.8622, + "step": 2574 + }, + { + "epoch": 0.38, + "grad_norm": 1.5606168366028863, + "learning_rate": 1.4112940959880791e-05, + "loss": 0.9071, + "step": 2575 + }, + { + "epoch": 0.38, + "grad_norm": 1.4410772581355227, + "learning_rate": 1.4108534987081273e-05, + "loss": 0.8863, + "step": 2576 + }, + { + "epoch": 0.38, + "grad_norm": 1.569678484617207, + "learning_rate": 1.410412805452757e-05, + "loss": 0.7814, + "step": 2577 + }, + { + "epoch": 0.38, + "grad_norm": 1.4276863571473113, + "learning_rate": 1.4099720163249144e-05, + "loss": 0.9419, + "step": 2578 + }, + { + "epoch": 0.38, + "grad_norm": 1.3564115997969606, + "learning_rate": 1.4095311314275678e-05, + "loss": 0.9409, + "step": 2579 + }, + { + "epoch": 0.38, + "grad_norm": 1.6187014634172197, + "learning_rate": 1.4090901508637087e-05, + "loss": 0.8859, + "step": 2580 + }, + { + "epoch": 0.39, + "grad_norm": 1.7330198997558404, + "learning_rate": 1.4086490747363492e-05, + "loss": 0.8812, + "step": 2581 + }, + { + "epoch": 0.39, + "grad_norm": 1.3933799356067045, + "learning_rate": 1.4082079031485253e-05, + "loss": 0.7615, + "step": 2582 + }, + { + "epoch": 0.39, + "grad_norm": 1.4326675200770536, + "learning_rate": 1.4077666362032942e-05, + "loss": 0.8636, + "step": 2583 + }, + { + "epoch": 0.39, + "grad_norm": 1.3985332690643688, + "learning_rate": 1.4073252740037367e-05, + "loss": 0.8511, + "step": 2584 + }, + { + "epoch": 0.39, + "grad_norm": 1.5035865493422815, + "learning_rate": 1.4068838166529544e-05, + "loss": 0.9004, + "step": 2585 + }, + { + "epoch": 0.39, + "grad_norm": 1.6097473077303504, + "learning_rate": 1.4064422642540719e-05, + "loss": 0.9268, + "step": 2586 + }, + { + "epoch": 0.39, + "grad_norm": 1.3760840283967968, + "learning_rate": 1.4060006169102363e-05, + "loss": 0.822, + "step": 2587 + }, + { + "epoch": 0.39, + "grad_norm": 1.5573301319859965, + "learning_rate": 1.4055588747246158e-05, + "loss": 0.824, + "step": 2588 + }, + { + "epoch": 0.39, + "grad_norm": 1.4205612641202485, + "learning_rate": 1.4051170378004018e-05, + "loss": 0.8696, + "step": 2589 + }, + { + "epoch": 0.39, + "grad_norm": 1.4251927402301132, + "learning_rate": 1.4046751062408076e-05, + "loss": 0.857, + "step": 2590 + }, + { + "epoch": 0.39, + "grad_norm": 1.4526906742365633, + "learning_rate": 1.4042330801490682e-05, + "loss": 0.9346, + "step": 2591 + }, + { + "epoch": 0.39, + "grad_norm": 1.6838225100847595, + "learning_rate": 1.403790959628441e-05, + "loss": 0.9334, + "step": 2592 + }, + { + "epoch": 0.39, + "grad_norm": 1.5125729601874176, + "learning_rate": 1.4033487447822053e-05, + "loss": 0.8971, + "step": 2593 + }, + { + "epoch": 0.39, + "grad_norm": 1.4759423376509302, + "learning_rate": 1.4029064357136628e-05, + "loss": 0.9267, + "step": 2594 + }, + { + "epoch": 0.39, + "grad_norm": 1.5474866226262085, + "learning_rate": 1.4024640325261367e-05, + "loss": 0.8046, + "step": 2595 + }, + { + "epoch": 0.39, + "grad_norm": 1.3968421444399874, + "learning_rate": 1.4020215353229726e-05, + "loss": 0.8745, + "step": 2596 + }, + { + "epoch": 0.39, + "grad_norm": 1.4674226566319317, + "learning_rate": 1.4015789442075376e-05, + "loss": 0.8347, + "step": 2597 + }, + { + "epoch": 0.39, + "grad_norm": 1.4305886668626155, + "learning_rate": 1.4011362592832214e-05, + "loss": 0.8437, + "step": 2598 + }, + { + "epoch": 0.39, + "grad_norm": 1.6321575251986986, + "learning_rate": 1.400693480653435e-05, + "loss": 0.9749, + "step": 2599 + }, + { + "epoch": 0.39, + "grad_norm": 1.4181709736470574, + "learning_rate": 1.4002506084216117e-05, + "loss": 0.9266, + "step": 2600 + }, + { + "epoch": 0.39, + "grad_norm": 1.7947765453393132, + "learning_rate": 1.3998076426912061e-05, + "loss": 0.9106, + "step": 2601 + }, + { + "epoch": 0.39, + "grad_norm": 1.5158074044306045, + "learning_rate": 1.3993645835656955e-05, + "loss": 0.7601, + "step": 2602 + }, + { + "epoch": 0.39, + "grad_norm": 1.5324432439867566, + "learning_rate": 1.3989214311485787e-05, + "loss": 0.9008, + "step": 2603 + }, + { + "epoch": 0.39, + "grad_norm": 1.6704085549907486, + "learning_rate": 1.3984781855433756e-05, + "loss": 0.8627, + "step": 2604 + }, + { + "epoch": 0.39, + "grad_norm": 1.6257902754643552, + "learning_rate": 1.3980348468536287e-05, + "loss": 0.8764, + "step": 2605 + }, + { + "epoch": 0.39, + "grad_norm": 1.239993048640492, + "learning_rate": 1.3975914151829017e-05, + "loss": 0.9321, + "step": 2606 + }, + { + "epoch": 0.39, + "grad_norm": 1.8529626254514404, + "learning_rate": 1.3971478906347806e-05, + "loss": 0.8682, + "step": 2607 + }, + { + "epoch": 0.39, + "grad_norm": 0.8448329417548159, + "learning_rate": 1.396704273312873e-05, + "loss": 0.3175, + "step": 2608 + }, + { + "epoch": 0.39, + "grad_norm": 1.451862067106698, + "learning_rate": 1.3962605633208073e-05, + "loss": 0.8604, + "step": 2609 + }, + { + "epoch": 0.39, + "grad_norm": 1.4702687185520813, + "learning_rate": 1.3958167607622347e-05, + "loss": 0.9232, + "step": 2610 + }, + { + "epoch": 0.39, + "grad_norm": 1.6512774612847272, + "learning_rate": 1.3953728657408272e-05, + "loss": 0.8766, + "step": 2611 + }, + { + "epoch": 0.39, + "grad_norm": 1.325563291145746, + "learning_rate": 1.394928878360279e-05, + "loss": 0.7663, + "step": 2612 + }, + { + "epoch": 0.39, + "grad_norm": 1.6850175391042796, + "learning_rate": 1.3944847987243056e-05, + "loss": 0.8568, + "step": 2613 + }, + { + "epoch": 0.39, + "grad_norm": 1.5044241626499406, + "learning_rate": 1.3940406269366436e-05, + "loss": 0.8196, + "step": 2614 + }, + { + "epoch": 0.39, + "grad_norm": 1.5835866252322985, + "learning_rate": 1.393596363101052e-05, + "loss": 0.8259, + "step": 2615 + }, + { + "epoch": 0.39, + "grad_norm": 1.4622932992060935, + "learning_rate": 1.393152007321311e-05, + "loss": 0.88, + "step": 2616 + }, + { + "epoch": 0.39, + "grad_norm": 1.3047974677197762, + "learning_rate": 1.3927075597012215e-05, + "loss": 0.8773, + "step": 2617 + }, + { + "epoch": 0.39, + "grad_norm": 1.401477291021865, + "learning_rate": 1.3922630203446072e-05, + "loss": 0.8362, + "step": 2618 + }, + { + "epoch": 0.39, + "grad_norm": 1.5605621014879874, + "learning_rate": 1.3918183893553123e-05, + "loss": 0.7719, + "step": 2619 + }, + { + "epoch": 0.39, + "grad_norm": 1.3756526135208338, + "learning_rate": 1.3913736668372027e-05, + "loss": 0.9267, + "step": 2620 + }, + { + "epoch": 0.39, + "grad_norm": 1.361235066055886, + "learning_rate": 1.3909288528941653e-05, + "loss": 0.8024, + "step": 2621 + }, + { + "epoch": 0.39, + "grad_norm": 1.5027505211375485, + "learning_rate": 1.3904839476301091e-05, + "loss": 0.8662, + "step": 2622 + }, + { + "epoch": 0.39, + "grad_norm": 1.52035786998657, + "learning_rate": 1.3900389511489639e-05, + "loss": 0.8855, + "step": 2623 + }, + { + "epoch": 0.39, + "grad_norm": 1.276641356566796, + "learning_rate": 1.3895938635546804e-05, + "loss": 0.8484, + "step": 2624 + }, + { + "epoch": 0.39, + "grad_norm": 1.404555107323176, + "learning_rate": 1.3891486849512322e-05, + "loss": 0.8914, + "step": 2625 + }, + { + "epoch": 0.39, + "grad_norm": 1.4265557576215502, + "learning_rate": 1.388703415442612e-05, + "loss": 0.9581, + "step": 2626 + }, + { + "epoch": 0.39, + "grad_norm": 1.3749721826372947, + "learning_rate": 1.388258055132835e-05, + "loss": 0.8745, + "step": 2627 + }, + { + "epoch": 0.39, + "grad_norm": 1.4407921583939474, + "learning_rate": 1.3878126041259381e-05, + "loss": 0.9058, + "step": 2628 + }, + { + "epoch": 0.39, + "grad_norm": 1.412824725886247, + "learning_rate": 1.3873670625259781e-05, + "loss": 0.8879, + "step": 2629 + }, + { + "epoch": 0.39, + "grad_norm": 1.395833299824254, + "learning_rate": 1.3869214304370338e-05, + "loss": 0.9118, + "step": 2630 + }, + { + "epoch": 0.39, + "grad_norm": 1.4325023038185696, + "learning_rate": 1.3864757079632045e-05, + "loss": 0.9031, + "step": 2631 + }, + { + "epoch": 0.39, + "grad_norm": 1.4400560275844982, + "learning_rate": 1.3860298952086118e-05, + "loss": 0.937, + "step": 2632 + }, + { + "epoch": 0.39, + "grad_norm": 1.6120185644381455, + "learning_rate": 1.3855839922773968e-05, + "loss": 0.9211, + "step": 2633 + }, + { + "epoch": 0.39, + "grad_norm": 0.8594615979345239, + "learning_rate": 1.3851379992737229e-05, + "loss": 0.3214, + "step": 2634 + }, + { + "epoch": 0.39, + "grad_norm": 1.6059541067234224, + "learning_rate": 1.3846919163017738e-05, + "loss": 0.915, + "step": 2635 + }, + { + "epoch": 0.39, + "grad_norm": 1.5222039571677914, + "learning_rate": 1.3842457434657548e-05, + "loss": 0.8153, + "step": 2636 + }, + { + "epoch": 0.39, + "grad_norm": 1.4289934813714156, + "learning_rate": 1.383799480869892e-05, + "loss": 0.9169, + "step": 2637 + }, + { + "epoch": 0.39, + "grad_norm": 1.4580096278655192, + "learning_rate": 1.3833531286184324e-05, + "loss": 0.8961, + "step": 2638 + }, + { + "epoch": 0.39, + "grad_norm": 1.431852256100158, + "learning_rate": 1.3829066868156433e-05, + "loss": 0.9564, + "step": 2639 + }, + { + "epoch": 0.39, + "grad_norm": 1.533394514882259, + "learning_rate": 1.382460155565814e-05, + "loss": 0.9108, + "step": 2640 + }, + { + "epoch": 0.39, + "grad_norm": 1.5078492296024473, + "learning_rate": 1.3820135349732546e-05, + "loss": 0.8217, + "step": 2641 + }, + { + "epoch": 0.39, + "grad_norm": 1.547073269670143, + "learning_rate": 1.3815668251422953e-05, + "loss": 0.9108, + "step": 2642 + }, + { + "epoch": 0.39, + "grad_norm": 1.844719619697523, + "learning_rate": 1.3811200261772877e-05, + "loss": 0.8304, + "step": 2643 + }, + { + "epoch": 0.39, + "grad_norm": 1.7898835927873684, + "learning_rate": 1.380673138182604e-05, + "loss": 0.8516, + "step": 2644 + }, + { + "epoch": 0.39, + "grad_norm": 1.5937448126343847, + "learning_rate": 1.3802261612626372e-05, + "loss": 0.83, + "step": 2645 + }, + { + "epoch": 0.39, + "grad_norm": 1.4429464407674535, + "learning_rate": 1.3797790955218014e-05, + "loss": 0.9524, + "step": 2646 + }, + { + "epoch": 0.39, + "grad_norm": 1.3986477047889547, + "learning_rate": 1.3793319410645307e-05, + "loss": 0.9305, + "step": 2647 + }, + { + "epoch": 0.4, + "grad_norm": 1.5353157031019835, + "learning_rate": 1.378884697995281e-05, + "loss": 0.8977, + "step": 2648 + }, + { + "epoch": 0.4, + "grad_norm": 1.3383483201282307, + "learning_rate": 1.3784373664185282e-05, + "loss": 0.9332, + "step": 2649 + }, + { + "epoch": 0.4, + "grad_norm": 1.4277754893821355, + "learning_rate": 1.3779899464387688e-05, + "loss": 0.8646, + "step": 2650 + }, + { + "epoch": 0.4, + "grad_norm": 1.5481888184023798, + "learning_rate": 1.3775424381605205e-05, + "loss": 0.9324, + "step": 2651 + }, + { + "epoch": 0.4, + "grad_norm": 1.4714686356946542, + "learning_rate": 1.3770948416883205e-05, + "loss": 0.8804, + "step": 2652 + }, + { + "epoch": 0.4, + "grad_norm": 1.4669755383672847, + "learning_rate": 1.3766471571267284e-05, + "loss": 0.885, + "step": 2653 + }, + { + "epoch": 0.4, + "grad_norm": 1.3552008324215652, + "learning_rate": 1.3761993845803225e-05, + "loss": 0.9643, + "step": 2654 + }, + { + "epoch": 0.4, + "grad_norm": 0.8468283420463011, + "learning_rate": 1.3757515241537031e-05, + "loss": 0.3217, + "step": 2655 + }, + { + "epoch": 0.4, + "grad_norm": 1.2920960941758886, + "learning_rate": 1.3753035759514904e-05, + "loss": 0.9343, + "step": 2656 + }, + { + "epoch": 0.4, + "grad_norm": 1.5091857740453016, + "learning_rate": 1.3748555400783245e-05, + "loss": 0.9207, + "step": 2657 + }, + { + "epoch": 0.4, + "grad_norm": 1.4299632735398338, + "learning_rate": 1.3744074166388677e-05, + "loss": 0.8937, + "step": 2658 + }, + { + "epoch": 0.4, + "grad_norm": 1.3905957960962714, + "learning_rate": 1.3739592057378005e-05, + "loss": 0.8504, + "step": 2659 + }, + { + "epoch": 0.4, + "grad_norm": 1.5850265228465883, + "learning_rate": 1.3735109074798259e-05, + "loss": 0.8316, + "step": 2660 + }, + { + "epoch": 0.4, + "grad_norm": 1.3664090733419905, + "learning_rate": 1.373062521969666e-05, + "loss": 0.8936, + "step": 2661 + }, + { + "epoch": 0.4, + "grad_norm": 1.595661939045099, + "learning_rate": 1.3726140493120639e-05, + "loss": 0.8979, + "step": 2662 + }, + { + "epoch": 0.4, + "grad_norm": 1.3444722197550636, + "learning_rate": 1.3721654896117826e-05, + "loss": 0.9154, + "step": 2663 + }, + { + "epoch": 0.4, + "grad_norm": 1.5877124479957379, + "learning_rate": 1.3717168429736061e-05, + "loss": 0.9277, + "step": 2664 + }, + { + "epoch": 0.4, + "grad_norm": 1.4310795171006792, + "learning_rate": 1.3712681095023376e-05, + "loss": 0.8793, + "step": 2665 + }, + { + "epoch": 0.4, + "grad_norm": 1.35458440404843, + "learning_rate": 1.370819289302802e-05, + "loss": 0.9304, + "step": 2666 + }, + { + "epoch": 0.4, + "grad_norm": 1.4810619876746554, + "learning_rate": 1.3703703824798438e-05, + "loss": 0.856, + "step": 2667 + }, + { + "epoch": 0.4, + "grad_norm": 1.4018382832408174, + "learning_rate": 1.369921389138327e-05, + "loss": 0.8895, + "step": 2668 + }, + { + "epoch": 0.4, + "grad_norm": 1.5792172271039533, + "learning_rate": 1.369472309383137e-05, + "loss": 0.9842, + "step": 2669 + }, + { + "epoch": 0.4, + "grad_norm": 1.4482192489752068, + "learning_rate": 1.3690231433191787e-05, + "loss": 0.8758, + "step": 2670 + }, + { + "epoch": 0.4, + "grad_norm": 1.4456435069376843, + "learning_rate": 1.3685738910513771e-05, + "loss": 0.8542, + "step": 2671 + }, + { + "epoch": 0.4, + "grad_norm": 1.336789119056566, + "learning_rate": 1.3681245526846782e-05, + "loss": 0.8027, + "step": 2672 + }, + { + "epoch": 0.4, + "grad_norm": 1.4665755950736112, + "learning_rate": 1.3676751283240469e-05, + "loss": 0.8532, + "step": 2673 + }, + { + "epoch": 0.4, + "grad_norm": 1.5622988571399659, + "learning_rate": 1.367225618074469e-05, + "loss": 0.8718, + "step": 2674 + }, + { + "epoch": 0.4, + "grad_norm": 1.4680124032471622, + "learning_rate": 1.3667760220409503e-05, + "loss": 0.8207, + "step": 2675 + }, + { + "epoch": 0.4, + "grad_norm": 1.429503164498111, + "learning_rate": 1.366326340328516e-05, + "loss": 0.898, + "step": 2676 + }, + { + "epoch": 0.4, + "grad_norm": 1.5106450008308003, + "learning_rate": 1.3658765730422126e-05, + "loss": 0.9109, + "step": 2677 + }, + { + "epoch": 0.4, + "grad_norm": 1.4102071597031536, + "learning_rate": 1.3654267202871047e-05, + "loss": 0.9364, + "step": 2678 + }, + { + "epoch": 0.4, + "grad_norm": 1.5346638991677883, + "learning_rate": 1.3649767821682788e-05, + "loss": 0.8921, + "step": 2679 + }, + { + "epoch": 0.4, + "grad_norm": 1.4788022563460717, + "learning_rate": 1.3645267587908404e-05, + "loss": 0.9039, + "step": 2680 + }, + { + "epoch": 0.4, + "grad_norm": 1.3789313512630113, + "learning_rate": 1.3640766502599148e-05, + "loss": 0.8535, + "step": 2681 + }, + { + "epoch": 0.4, + "grad_norm": 1.3797766056750267, + "learning_rate": 1.3636264566806473e-05, + "loss": 0.8912, + "step": 2682 + }, + { + "epoch": 0.4, + "grad_norm": 1.5794891260456347, + "learning_rate": 1.3631761781582031e-05, + "loss": 0.7905, + "step": 2683 + }, + { + "epoch": 0.4, + "grad_norm": 1.3700951909578696, + "learning_rate": 1.3627258147977678e-05, + "loss": 0.8504, + "step": 2684 + }, + { + "epoch": 0.4, + "grad_norm": 1.4665715543588154, + "learning_rate": 1.3622753667045459e-05, + "loss": 0.8409, + "step": 2685 + }, + { + "epoch": 0.4, + "grad_norm": 1.4286278324375734, + "learning_rate": 1.361824833983762e-05, + "loss": 0.8186, + "step": 2686 + }, + { + "epoch": 0.4, + "grad_norm": 1.2618747997035793, + "learning_rate": 1.3613742167406614e-05, + "loss": 0.8608, + "step": 2687 + }, + { + "epoch": 0.4, + "grad_norm": 1.4133239208251391, + "learning_rate": 1.3609235150805074e-05, + "loss": 0.8741, + "step": 2688 + }, + { + "epoch": 0.4, + "grad_norm": 1.5103995101601069, + "learning_rate": 1.3604727291085845e-05, + "loss": 0.8436, + "step": 2689 + }, + { + "epoch": 0.4, + "grad_norm": 1.490327124752854, + "learning_rate": 1.3600218589301962e-05, + "loss": 0.8769, + "step": 2690 + }, + { + "epoch": 0.4, + "grad_norm": 1.2291126900239977, + "learning_rate": 1.3595709046506656e-05, + "loss": 0.9348, + "step": 2691 + }, + { + "epoch": 0.4, + "grad_norm": 1.3683233073134076, + "learning_rate": 1.3591198663753358e-05, + "loss": 0.8823, + "step": 2692 + }, + { + "epoch": 0.4, + "grad_norm": 0.9280492408896337, + "learning_rate": 1.3586687442095697e-05, + "loss": 0.3374, + "step": 2693 + }, + { + "epoch": 0.4, + "grad_norm": 1.5655953664078606, + "learning_rate": 1.3582175382587491e-05, + "loss": 0.7969, + "step": 2694 + }, + { + "epoch": 0.4, + "grad_norm": 1.7751709568192449, + "learning_rate": 1.3577662486282757e-05, + "loss": 0.9202, + "step": 2695 + }, + { + "epoch": 0.4, + "grad_norm": 1.3914000997776887, + "learning_rate": 1.3573148754235712e-05, + "loss": 0.8776, + "step": 2696 + }, + { + "epoch": 0.4, + "grad_norm": 1.292619574496971, + "learning_rate": 1.3568634187500762e-05, + "loss": 0.8441, + "step": 2697 + }, + { + "epoch": 0.4, + "grad_norm": 1.6721705934102595, + "learning_rate": 1.3564118787132507e-05, + "loss": 0.8796, + "step": 2698 + }, + { + "epoch": 0.4, + "grad_norm": 0.8622105164694549, + "learning_rate": 1.3559602554185751e-05, + "loss": 0.3033, + "step": 2699 + }, + { + "epoch": 0.4, + "grad_norm": 1.3139195180990364, + "learning_rate": 1.3555085489715487e-05, + "loss": 0.7475, + "step": 2700 + }, + { + "epoch": 0.4, + "grad_norm": 1.4395956890155168, + "learning_rate": 1.3550567594776893e-05, + "loss": 0.8933, + "step": 2701 + }, + { + "epoch": 0.4, + "grad_norm": 1.393412083504995, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.8661, + "step": 2702 + }, + { + "epoch": 0.4, + "grad_norm": 1.3390615281866014, + "learning_rate": 1.3541529317716453e-05, + "loss": 0.8396, + "step": 2703 + }, + { + "epoch": 0.4, + "grad_norm": 1.4944494302311004, + "learning_rate": 1.3537008937705947e-05, + "loss": 0.8495, + "step": 2704 + }, + { + "epoch": 0.4, + "grad_norm": 1.4588533914641837, + "learning_rate": 1.35324877314498e-05, + "loss": 0.8481, + "step": 2705 + }, + { + "epoch": 0.4, + "grad_norm": 0.7951588388786142, + "learning_rate": 1.3527965700004168e-05, + "loss": 0.3305, + "step": 2706 + }, + { + "epoch": 0.4, + "grad_norm": 1.4278610475969367, + "learning_rate": 1.3523442844425393e-05, + "loss": 0.8847, + "step": 2707 + }, + { + "epoch": 0.4, + "grad_norm": 1.2566448917144541, + "learning_rate": 1.3518919165770023e-05, + "loss": 0.8673, + "step": 2708 + }, + { + "epoch": 0.4, + "grad_norm": 1.3868269139403437, + "learning_rate": 1.3514394665094786e-05, + "loss": 0.886, + "step": 2709 + }, + { + "epoch": 0.4, + "grad_norm": 1.23748714423486, + "learning_rate": 1.3509869343456603e-05, + "loss": 0.8737, + "step": 2710 + }, + { + "epoch": 0.4, + "grad_norm": 1.4958673902288788, + "learning_rate": 1.350534320191259e-05, + "loss": 0.9207, + "step": 2711 + }, + { + "epoch": 0.4, + "grad_norm": 1.3222347816577644, + "learning_rate": 1.3500816241520059e-05, + "loss": 0.8255, + "step": 2712 + }, + { + "epoch": 0.4, + "grad_norm": 1.442461306089769, + "learning_rate": 1.3496288463336504e-05, + "loss": 0.9017, + "step": 2713 + }, + { + "epoch": 0.4, + "grad_norm": 1.4070497794893102, + "learning_rate": 1.3491759868419616e-05, + "loss": 0.8347, + "step": 2714 + }, + { + "epoch": 0.41, + "grad_norm": 1.4676779559228024, + "learning_rate": 1.3487230457827273e-05, + "loss": 0.8356, + "step": 2715 + }, + { + "epoch": 0.41, + "grad_norm": 1.5534142834991447, + "learning_rate": 1.3482700232617552e-05, + "loss": 0.9119, + "step": 2716 + }, + { + "epoch": 0.41, + "grad_norm": 1.6512695364839567, + "learning_rate": 1.3478169193848705e-05, + "loss": 0.8854, + "step": 2717 + }, + { + "epoch": 0.41, + "grad_norm": 1.9404515292546731, + "learning_rate": 1.3473637342579191e-05, + "loss": 0.8552, + "step": 2718 + }, + { + "epoch": 0.41, + "grad_norm": 1.638153216880537, + "learning_rate": 1.3469104679867646e-05, + "loss": 0.9331, + "step": 2719 + }, + { + "epoch": 0.41, + "grad_norm": 1.4718691590622501, + "learning_rate": 1.3464571206772903e-05, + "loss": 0.8541, + "step": 2720 + }, + { + "epoch": 0.41, + "grad_norm": 1.5376357567174224, + "learning_rate": 1.346003692435398e-05, + "loss": 0.8681, + "step": 2721 + }, + { + "epoch": 0.41, + "grad_norm": 1.6025152297633412, + "learning_rate": 1.3455501833670089e-05, + "loss": 0.8213, + "step": 2722 + }, + { + "epoch": 0.41, + "grad_norm": 1.4857630332175427, + "learning_rate": 1.3450965935780622e-05, + "loss": 0.8519, + "step": 2723 + }, + { + "epoch": 0.41, + "grad_norm": 1.35435945937307, + "learning_rate": 1.344642923174517e-05, + "loss": 0.82, + "step": 2724 + }, + { + "epoch": 0.41, + "grad_norm": 1.5257038937719627, + "learning_rate": 1.3441891722623507e-05, + "loss": 0.8516, + "step": 2725 + }, + { + "epoch": 0.41, + "grad_norm": 1.3854099625470668, + "learning_rate": 1.3437353409475596e-05, + "loss": 0.9315, + "step": 2726 + }, + { + "epoch": 0.41, + "grad_norm": 2.02330935101869, + "learning_rate": 1.3432814293361585e-05, + "loss": 0.8323, + "step": 2727 + }, + { + "epoch": 0.41, + "grad_norm": 1.3516827189445648, + "learning_rate": 1.3428274375341812e-05, + "loss": 0.8364, + "step": 2728 + }, + { + "epoch": 0.41, + "grad_norm": 1.298260055448875, + "learning_rate": 1.3423733656476806e-05, + "loss": 0.875, + "step": 2729 + }, + { + "epoch": 0.41, + "grad_norm": 0.9402579985671157, + "learning_rate": 1.341919213782727e-05, + "loss": 0.3317, + "step": 2730 + }, + { + "epoch": 0.41, + "grad_norm": 1.404877551225526, + "learning_rate": 1.3414649820454118e-05, + "loss": 0.8369, + "step": 2731 + }, + { + "epoch": 0.41, + "grad_norm": 1.4569397263506838, + "learning_rate": 1.3410106705418424e-05, + "loss": 0.9473, + "step": 2732 + }, + { + "epoch": 0.41, + "grad_norm": 1.4835074386579736, + "learning_rate": 1.3405562793781463e-05, + "loss": 0.8485, + "step": 2733 + }, + { + "epoch": 0.41, + "grad_norm": 0.8521980191219083, + "learning_rate": 1.3401018086604698e-05, + "loss": 0.3556, + "step": 2734 + }, + { + "epoch": 0.41, + "grad_norm": 1.3997092684338868, + "learning_rate": 1.3396472584949765e-05, + "loss": 0.7977, + "step": 2735 + }, + { + "epoch": 0.41, + "grad_norm": 0.8093156834033487, + "learning_rate": 1.33919262898785e-05, + "loss": 0.3449, + "step": 2736 + }, + { + "epoch": 0.41, + "grad_norm": 1.2633286338676704, + "learning_rate": 1.3387379202452917e-05, + "loss": 0.8237, + "step": 2737 + }, + { + "epoch": 0.41, + "grad_norm": 1.7981384334420416, + "learning_rate": 1.3382831323735213e-05, + "loss": 0.9182, + "step": 2738 + }, + { + "epoch": 0.41, + "grad_norm": 1.463791416464488, + "learning_rate": 1.3378282654787774e-05, + "loss": 0.8949, + "step": 2739 + }, + { + "epoch": 0.41, + "grad_norm": 1.5331657957890301, + "learning_rate": 1.337373319667317e-05, + "loss": 0.8081, + "step": 2740 + }, + { + "epoch": 0.41, + "grad_norm": 1.513597267527198, + "learning_rate": 1.3369182950454155e-05, + "loss": 0.9781, + "step": 2741 + }, + { + "epoch": 0.41, + "grad_norm": 1.569337832978486, + "learning_rate": 1.3364631917193671e-05, + "loss": 0.8654, + "step": 2742 + }, + { + "epoch": 0.41, + "grad_norm": 1.5340515000606945, + "learning_rate": 1.3360080097954833e-05, + "loss": 0.8375, + "step": 2743 + }, + { + "epoch": 0.41, + "grad_norm": 0.9859001605063671, + "learning_rate": 1.3355527493800948e-05, + "loss": 0.3091, + "step": 2744 + }, + { + "epoch": 0.41, + "grad_norm": 1.3190392980272423, + "learning_rate": 1.3350974105795511e-05, + "loss": 0.8391, + "step": 2745 + }, + { + "epoch": 0.41, + "grad_norm": 1.3109891098432258, + "learning_rate": 1.3346419935002186e-05, + "loss": 0.8052, + "step": 2746 + }, + { + "epoch": 0.41, + "grad_norm": 1.4070143125945398, + "learning_rate": 1.3341864982484828e-05, + "loss": 0.8624, + "step": 2747 + }, + { + "epoch": 0.41, + "grad_norm": 1.4746437384536173, + "learning_rate": 1.3337309249307482e-05, + "loss": 0.8636, + "step": 2748 + }, + { + "epoch": 0.41, + "grad_norm": 1.4646471349372367, + "learning_rate": 1.333275273653436e-05, + "loss": 0.9017, + "step": 2749 + }, + { + "epoch": 0.41, + "grad_norm": 1.3825523231380634, + "learning_rate": 1.3328195445229869e-05, + "loss": 0.9316, + "step": 2750 + }, + { + "epoch": 0.41, + "grad_norm": 1.4205471168177572, + "learning_rate": 1.332363737645859e-05, + "loss": 0.8697, + "step": 2751 + }, + { + "epoch": 0.41, + "grad_norm": 1.4143053367920866, + "learning_rate": 1.3319078531285286e-05, + "loss": 0.8981, + "step": 2752 + }, + { + "epoch": 0.41, + "grad_norm": 1.5796261861355516, + "learning_rate": 1.331451891077491e-05, + "loss": 0.958, + "step": 2753 + }, + { + "epoch": 0.41, + "grad_norm": 1.3784716545349904, + "learning_rate": 1.3309958515992585e-05, + "loss": 0.7835, + "step": 2754 + }, + { + "epoch": 0.41, + "grad_norm": 1.3714778354024928, + "learning_rate": 1.3305397348003618e-05, + "loss": 0.8958, + "step": 2755 + }, + { + "epoch": 0.41, + "grad_norm": 1.5493638686283826, + "learning_rate": 1.3300835407873505e-05, + "loss": 0.8977, + "step": 2756 + }, + { + "epoch": 0.41, + "grad_norm": 1.4634259757006258, + "learning_rate": 1.329627269666791e-05, + "loss": 0.8462, + "step": 2757 + }, + { + "epoch": 0.41, + "grad_norm": 1.3447874882345294, + "learning_rate": 1.3291709215452686e-05, + "loss": 0.8637, + "step": 2758 + }, + { + "epoch": 0.41, + "grad_norm": 1.46631777605198, + "learning_rate": 1.3287144965293858e-05, + "loss": 0.8598, + "step": 2759 + }, + { + "epoch": 0.41, + "grad_norm": 1.5271707153842025, + "learning_rate": 1.3282579947257643e-05, + "loss": 0.949, + "step": 2760 + }, + { + "epoch": 0.41, + "grad_norm": 1.6294027619215443, + "learning_rate": 1.3278014162410425e-05, + "loss": 0.7969, + "step": 2761 + }, + { + "epoch": 0.41, + "grad_norm": 1.4982752458060427, + "learning_rate": 1.3273447611818768e-05, + "loss": 0.9121, + "step": 2762 + }, + { + "epoch": 0.41, + "grad_norm": 1.2917002890970246, + "learning_rate": 1.3268880296549424e-05, + "loss": 0.8878, + "step": 2763 + }, + { + "epoch": 0.41, + "grad_norm": 1.4771702313003898, + "learning_rate": 1.3264312217669321e-05, + "loss": 0.9529, + "step": 2764 + }, + { + "epoch": 0.41, + "grad_norm": 1.374160582171779, + "learning_rate": 1.3259743376245556e-05, + "loss": 0.8503, + "step": 2765 + }, + { + "epoch": 0.41, + "grad_norm": 1.4679637843885989, + "learning_rate": 1.3255173773345413e-05, + "loss": 0.9244, + "step": 2766 + }, + { + "epoch": 0.41, + "grad_norm": 1.2875655846187317, + "learning_rate": 1.3250603410036356e-05, + "loss": 0.8389, + "step": 2767 + }, + { + "epoch": 0.41, + "grad_norm": 1.499564603731924, + "learning_rate": 1.3246032287386015e-05, + "loss": 0.9563, + "step": 2768 + }, + { + "epoch": 0.41, + "grad_norm": 1.6576115628009187, + "learning_rate": 1.3241460406462208e-05, + "loss": 0.7855, + "step": 2769 + }, + { + "epoch": 0.41, + "grad_norm": 1.517047036766871, + "learning_rate": 1.3236887768332927e-05, + "loss": 0.8999, + "step": 2770 + }, + { + "epoch": 0.41, + "grad_norm": 1.3336785770236914, + "learning_rate": 1.3232314374066343e-05, + "loss": 0.9127, + "step": 2771 + }, + { + "epoch": 0.41, + "grad_norm": 1.3143820552119605, + "learning_rate": 1.3227740224730799e-05, + "loss": 0.8981, + "step": 2772 + }, + { + "epoch": 0.41, + "grad_norm": 1.4757263298579144, + "learning_rate": 1.3223165321394814e-05, + "loss": 0.8437, + "step": 2773 + }, + { + "epoch": 0.41, + "grad_norm": 1.4442835108983245, + "learning_rate": 1.3218589665127093e-05, + "loss": 0.8314, + "step": 2774 + }, + { + "epoch": 0.41, + "grad_norm": 1.3315542218495533, + "learning_rate": 1.3214013256996503e-05, + "loss": 0.9225, + "step": 2775 + }, + { + "epoch": 0.41, + "grad_norm": 1.4025539317722029, + "learning_rate": 1.3209436098072095e-05, + "loss": 0.9129, + "step": 2776 + }, + { + "epoch": 0.41, + "grad_norm": 1.5196167301562642, + "learning_rate": 1.3204858189423097e-05, + "loss": 0.9796, + "step": 2777 + }, + { + "epoch": 0.41, + "grad_norm": 1.346062685541074, + "learning_rate": 1.3200279532118907e-05, + "loss": 0.8542, + "step": 2778 + }, + { + "epoch": 0.41, + "grad_norm": 1.4084339521429838, + "learning_rate": 1.3195700127229097e-05, + "loss": 0.8982, + "step": 2779 + }, + { + "epoch": 0.41, + "grad_norm": 1.346306080269962, + "learning_rate": 1.3191119975823421e-05, + "loss": 0.8565, + "step": 2780 + }, + { + "epoch": 0.41, + "grad_norm": 0.9174900282828216, + "learning_rate": 1.31865390789718e-05, + "loss": 0.3242, + "step": 2781 + }, + { + "epoch": 0.42, + "grad_norm": 1.4261095707568456, + "learning_rate": 1.3181957437744333e-05, + "loss": 0.9327, + "step": 2782 + }, + { + "epoch": 0.42, + "grad_norm": 1.435811314263392, + "learning_rate": 1.3177375053211293e-05, + "loss": 0.8541, + "step": 2783 + }, + { + "epoch": 0.42, + "grad_norm": 0.9191109663040323, + "learning_rate": 1.3172791926443122e-05, + "loss": 0.3497, + "step": 2784 + }, + { + "epoch": 0.42, + "grad_norm": 1.3376890253361224, + "learning_rate": 1.3168208058510441e-05, + "loss": 0.9509, + "step": 2785 + }, + { + "epoch": 0.42, + "grad_norm": 0.8808116339541431, + "learning_rate": 1.316362345048404e-05, + "loss": 0.3041, + "step": 2786 + }, + { + "epoch": 0.42, + "grad_norm": 1.5675887242949134, + "learning_rate": 1.3159038103434889e-05, + "loss": 0.8123, + "step": 2787 + }, + { + "epoch": 0.42, + "grad_norm": 1.479852674020534, + "learning_rate": 1.3154452018434123e-05, + "loss": 0.8988, + "step": 2788 + }, + { + "epoch": 0.42, + "grad_norm": 1.2273852472738545, + "learning_rate": 1.3149865196553049e-05, + "loss": 0.8744, + "step": 2789 + }, + { + "epoch": 0.42, + "grad_norm": 1.3194277644584498, + "learning_rate": 1.3145277638863152e-05, + "loss": 0.8952, + "step": 2790 + }, + { + "epoch": 0.42, + "grad_norm": 1.492389169766071, + "learning_rate": 1.3140689346436083e-05, + "loss": 0.8553, + "step": 2791 + }, + { + "epoch": 0.42, + "grad_norm": 1.3349625985805669, + "learning_rate": 1.3136100320343674e-05, + "loss": 0.9191, + "step": 2792 + }, + { + "epoch": 0.42, + "grad_norm": 1.5256027924008635, + "learning_rate": 1.3131510561657917e-05, + "loss": 1.0017, + "step": 2793 + }, + { + "epoch": 0.42, + "grad_norm": 1.584737001536214, + "learning_rate": 1.3126920071450977e-05, + "loss": 0.8844, + "step": 2794 + }, + { + "epoch": 0.42, + "grad_norm": 1.31418489338073, + "learning_rate": 1.3122328850795203e-05, + "loss": 0.891, + "step": 2795 + }, + { + "epoch": 0.42, + "grad_norm": 1.5273655605710132, + "learning_rate": 1.3117736900763091e-05, + "loss": 0.8901, + "step": 2796 + }, + { + "epoch": 0.42, + "grad_norm": 1.4642726611478112, + "learning_rate": 1.3113144222427334e-05, + "loss": 0.8697, + "step": 2797 + }, + { + "epoch": 0.42, + "grad_norm": 1.65752365964945, + "learning_rate": 1.3108550816860777e-05, + "loss": 0.9416, + "step": 2798 + }, + { + "epoch": 0.42, + "grad_norm": 1.4458756735026272, + "learning_rate": 1.3103956685136435e-05, + "loss": 0.9093, + "step": 2799 + }, + { + "epoch": 0.42, + "grad_norm": 1.620923480318348, + "learning_rate": 1.3099361828327506e-05, + "loss": 0.9545, + "step": 2800 + }, + { + "epoch": 0.42, + "grad_norm": 0.9369605205954596, + "learning_rate": 1.3094766247507343e-05, + "loss": 0.315, + "step": 2801 + }, + { + "epoch": 0.42, + "grad_norm": 1.6217311640079706, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.9124, + "step": 2802 + }, + { + "epoch": 0.42, + "grad_norm": 1.292977066531907, + "learning_rate": 1.3085572918127605e-05, + "loss": 0.8573, + "step": 2803 + }, + { + "epoch": 0.42, + "grad_norm": 1.224009892632372, + "learning_rate": 1.3080975171715592e-05, + "loss": 0.8639, + "step": 2804 + }, + { + "epoch": 0.42, + "grad_norm": 1.5531301749697768, + "learning_rate": 1.3076376705587468e-05, + "loss": 0.8572, + "step": 2805 + }, + { + "epoch": 0.42, + "grad_norm": 1.4922651450442632, + "learning_rate": 1.3071777520817443e-05, + "loss": 0.9102, + "step": 2806 + }, + { + "epoch": 0.42, + "grad_norm": 1.3198714535383125, + "learning_rate": 1.3067177618479883e-05, + "loss": 0.8952, + "step": 2807 + }, + { + "epoch": 0.42, + "grad_norm": 1.6565717820909742, + "learning_rate": 1.3062576999649323e-05, + "loss": 0.8719, + "step": 2808 + }, + { + "epoch": 0.42, + "grad_norm": 1.7905667661889144, + "learning_rate": 1.3057975665400475e-05, + "loss": 0.8703, + "step": 2809 + }, + { + "epoch": 0.42, + "grad_norm": 1.5267722964469028, + "learning_rate": 1.3053373616808202e-05, + "loss": 0.8753, + "step": 2810 + }, + { + "epoch": 0.42, + "grad_norm": 1.3703134209967969, + "learning_rate": 1.3048770854947553e-05, + "loss": 0.9015, + "step": 2811 + }, + { + "epoch": 0.42, + "grad_norm": 1.330131053571653, + "learning_rate": 1.3044167380893726e-05, + "loss": 0.7464, + "step": 2812 + }, + { + "epoch": 0.42, + "grad_norm": 1.4087981548885895, + "learning_rate": 1.3039563195722096e-05, + "loss": 0.8774, + "step": 2813 + }, + { + "epoch": 0.42, + "grad_norm": 1.43548309807309, + "learning_rate": 1.3034958300508197e-05, + "loss": 0.8509, + "step": 2814 + }, + { + "epoch": 0.42, + "grad_norm": 1.4876310374941417, + "learning_rate": 1.3030352696327741e-05, + "loss": 0.8381, + "step": 2815 + }, + { + "epoch": 0.42, + "grad_norm": 1.5924307386096148, + "learning_rate": 1.3025746384256595e-05, + "loss": 0.9437, + "step": 2816 + }, + { + "epoch": 0.42, + "grad_norm": 1.3512327999331264, + "learning_rate": 1.3021139365370787e-05, + "loss": 0.8617, + "step": 2817 + }, + { + "epoch": 0.42, + "grad_norm": 1.5995053768701828, + "learning_rate": 1.3016531640746524e-05, + "loss": 0.8566, + "step": 2818 + }, + { + "epoch": 0.42, + "grad_norm": 1.4391657530784563, + "learning_rate": 1.301192321146017e-05, + "loss": 0.9008, + "step": 2819 + }, + { + "epoch": 0.42, + "grad_norm": 1.4061270183193015, + "learning_rate": 1.3007314078588255e-05, + "loss": 0.8893, + "step": 2820 + }, + { + "epoch": 0.42, + "grad_norm": 1.5721682773094316, + "learning_rate": 1.3002704243207468e-05, + "loss": 0.8499, + "step": 2821 + }, + { + "epoch": 0.42, + "grad_norm": 1.4406997272365956, + "learning_rate": 1.2998093706394676e-05, + "loss": 0.8869, + "step": 2822 + }, + { + "epoch": 0.42, + "grad_norm": 1.5687787218618148, + "learning_rate": 1.2993482469226892e-05, + "loss": 0.9351, + "step": 2823 + }, + { + "epoch": 0.42, + "grad_norm": 1.0412201349513743, + "learning_rate": 1.2988870532781306e-05, + "loss": 0.3252, + "step": 2824 + }, + { + "epoch": 0.42, + "grad_norm": 1.3557584573901178, + "learning_rate": 1.2984257898135266e-05, + "loss": 0.848, + "step": 2825 + }, + { + "epoch": 0.42, + "grad_norm": 1.5764654127289022, + "learning_rate": 1.2979644566366284e-05, + "loss": 0.891, + "step": 2826 + }, + { + "epoch": 0.42, + "grad_norm": 1.2571764931420957, + "learning_rate": 1.297503053855203e-05, + "loss": 0.7778, + "step": 2827 + }, + { + "epoch": 0.42, + "grad_norm": 1.6396797851964806, + "learning_rate": 1.297041581577035e-05, + "loss": 0.892, + "step": 2828 + }, + { + "epoch": 0.42, + "grad_norm": 1.600777381261169, + "learning_rate": 1.296580039909924e-05, + "loss": 0.8459, + "step": 2829 + }, + { + "epoch": 0.42, + "grad_norm": 1.2570862131863656, + "learning_rate": 1.296118428961686e-05, + "loss": 0.8713, + "step": 2830 + }, + { + "epoch": 0.42, + "grad_norm": 1.4757539967756321, + "learning_rate": 1.2956567488401534e-05, + "loss": 0.7618, + "step": 2831 + }, + { + "epoch": 0.42, + "grad_norm": 1.5752293889899953, + "learning_rate": 1.295194999653175e-05, + "loss": 0.8065, + "step": 2832 + }, + { + "epoch": 0.42, + "grad_norm": 1.3536762696806495, + "learning_rate": 1.2947331815086153e-05, + "loss": 0.8186, + "step": 2833 + }, + { + "epoch": 0.42, + "grad_norm": 1.420220437465423, + "learning_rate": 1.2942712945143547e-05, + "loss": 0.8313, + "step": 2834 + }, + { + "epoch": 0.42, + "grad_norm": 1.535317788591522, + "learning_rate": 1.2938093387782908e-05, + "loss": 0.7851, + "step": 2835 + }, + { + "epoch": 0.42, + "grad_norm": 1.6624921969718232, + "learning_rate": 1.2933473144083359e-05, + "loss": 0.8577, + "step": 2836 + }, + { + "epoch": 0.42, + "grad_norm": 1.597251109143332, + "learning_rate": 1.292885221512419e-05, + "loss": 0.8359, + "step": 2837 + }, + { + "epoch": 0.42, + "grad_norm": 1.422663088394019, + "learning_rate": 1.2924230601984855e-05, + "loss": 0.7572, + "step": 2838 + }, + { + "epoch": 0.42, + "grad_norm": 0.8755452222120067, + "learning_rate": 1.291960830574496e-05, + "loss": 0.3252, + "step": 2839 + }, + { + "epoch": 0.42, + "grad_norm": 1.5175529002887995, + "learning_rate": 1.291498532748427e-05, + "loss": 0.8386, + "step": 2840 + }, + { + "epoch": 0.42, + "grad_norm": 1.6590066845415288, + "learning_rate": 1.2910361668282718e-05, + "loss": 0.8527, + "step": 2841 + }, + { + "epoch": 0.42, + "grad_norm": 1.3547232739467012, + "learning_rate": 1.2905737329220394e-05, + "loss": 0.8828, + "step": 2842 + }, + { + "epoch": 0.42, + "grad_norm": 1.9352992541283203, + "learning_rate": 1.2901112311377536e-05, + "loss": 0.8556, + "step": 2843 + }, + { + "epoch": 0.42, + "grad_norm": 1.6668354758563957, + "learning_rate": 1.2896486615834557e-05, + "loss": 0.3409, + "step": 2844 + }, + { + "epoch": 0.42, + "grad_norm": 1.5135897318793683, + "learning_rate": 1.2891860243672014e-05, + "loss": 0.8589, + "step": 2845 + }, + { + "epoch": 0.42, + "grad_norm": 1.6238132944134112, + "learning_rate": 1.288723319597063e-05, + "loss": 0.8711, + "step": 2846 + }, + { + "epoch": 0.42, + "grad_norm": 1.5117542499247825, + "learning_rate": 1.2882605473811282e-05, + "loss": 0.8952, + "step": 2847 + }, + { + "epoch": 0.42, + "grad_norm": 1.5140260835722537, + "learning_rate": 1.2877977078275012e-05, + "loss": 0.9237, + "step": 2848 + }, + { + "epoch": 0.43, + "grad_norm": 1.547050139839024, + "learning_rate": 1.2873348010443007e-05, + "loss": 0.8401, + "step": 2849 + }, + { + "epoch": 0.43, + "grad_norm": 1.4240158896533306, + "learning_rate": 1.2868718271396622e-05, + "loss": 0.8486, + "step": 2850 + }, + { + "epoch": 0.43, + "grad_norm": 1.5421106270063638, + "learning_rate": 1.2864087862217365e-05, + "loss": 0.8987, + "step": 2851 + }, + { + "epoch": 0.43, + "grad_norm": 1.5260434383950567, + "learning_rate": 1.2859456783986892e-05, + "loss": 0.8559, + "step": 2852 + }, + { + "epoch": 0.43, + "grad_norm": 1.555891695627861, + "learning_rate": 1.2854825037787034e-05, + "loss": 0.9277, + "step": 2853 + }, + { + "epoch": 0.43, + "grad_norm": 1.3669859451498059, + "learning_rate": 1.2850192624699762e-05, + "loss": 0.9069, + "step": 2854 + }, + { + "epoch": 0.43, + "grad_norm": 1.4260370286369646, + "learning_rate": 1.2845559545807208e-05, + "loss": 0.8505, + "step": 2855 + }, + { + "epoch": 0.43, + "grad_norm": 0.8744328951602257, + "learning_rate": 1.2840925802191661e-05, + "loss": 0.33, + "step": 2856 + }, + { + "epoch": 0.43, + "grad_norm": 1.4492934225745673, + "learning_rate": 1.2836291394935568e-05, + "loss": 0.8974, + "step": 2857 + }, + { + "epoch": 0.43, + "grad_norm": 1.5499553871560139, + "learning_rate": 1.2831656325121516e-05, + "loss": 0.8693, + "step": 2858 + }, + { + "epoch": 0.43, + "grad_norm": 1.4159297838171299, + "learning_rate": 1.2827020593832269e-05, + "loss": 0.8539, + "step": 2859 + }, + { + "epoch": 0.43, + "grad_norm": 1.4912202200260303, + "learning_rate": 1.2822384202150726e-05, + "loss": 0.854, + "step": 2860 + }, + { + "epoch": 0.43, + "grad_norm": 1.3863013814731202, + "learning_rate": 1.2817747151159954e-05, + "loss": 0.8878, + "step": 2861 + }, + { + "epoch": 0.43, + "grad_norm": 1.3929211995111637, + "learning_rate": 1.2813109441943166e-05, + "loss": 0.8858, + "step": 2862 + }, + { + "epoch": 0.43, + "grad_norm": 1.3976120270651626, + "learning_rate": 1.2808471075583733e-05, + "loss": 0.8636, + "step": 2863 + }, + { + "epoch": 0.43, + "grad_norm": 1.389595806146513, + "learning_rate": 1.2803832053165177e-05, + "loss": 0.9057, + "step": 2864 + }, + { + "epoch": 0.43, + "grad_norm": 1.6247254125893544, + "learning_rate": 1.2799192375771172e-05, + "loss": 0.8099, + "step": 2865 + }, + { + "epoch": 0.43, + "grad_norm": 1.5931098782630584, + "learning_rate": 1.279455204448555e-05, + "loss": 0.8777, + "step": 2866 + }, + { + "epoch": 0.43, + "grad_norm": 1.332669443634771, + "learning_rate": 1.2789911060392295e-05, + "loss": 0.7954, + "step": 2867 + }, + { + "epoch": 0.43, + "grad_norm": 1.3761425750681961, + "learning_rate": 1.2785269424575537e-05, + "loss": 0.7842, + "step": 2868 + }, + { + "epoch": 0.43, + "grad_norm": 1.4800211587143304, + "learning_rate": 1.278062713811956e-05, + "loss": 0.7922, + "step": 2869 + }, + { + "epoch": 0.43, + "grad_norm": 1.4300191942674076, + "learning_rate": 1.2775984202108811e-05, + "loss": 0.866, + "step": 2870 + }, + { + "epoch": 0.43, + "grad_norm": 1.2961036876046856, + "learning_rate": 1.2771340617627877e-05, + "loss": 0.8125, + "step": 2871 + }, + { + "epoch": 0.43, + "grad_norm": 1.3955828991517687, + "learning_rate": 1.2766696385761494e-05, + "loss": 0.8926, + "step": 2872 + }, + { + "epoch": 0.43, + "grad_norm": 1.6516070346818763, + "learning_rate": 1.2762051507594562e-05, + "loss": 0.8499, + "step": 2873 + }, + { + "epoch": 0.43, + "grad_norm": 1.6192950271731728, + "learning_rate": 1.2757405984212123e-05, + "loss": 0.9199, + "step": 2874 + }, + { + "epoch": 0.43, + "grad_norm": 1.503052796297095, + "learning_rate": 1.275275981669937e-05, + "loss": 0.7629, + "step": 2875 + }, + { + "epoch": 0.43, + "grad_norm": 1.4524095944350233, + "learning_rate": 1.274811300614165e-05, + "loss": 0.7698, + "step": 2876 + }, + { + "epoch": 0.43, + "grad_norm": 1.4262779507230168, + "learning_rate": 1.274346555362446e-05, + "loss": 0.8276, + "step": 2877 + }, + { + "epoch": 0.43, + "grad_norm": 1.4357493731345239, + "learning_rate": 1.2738817460233442e-05, + "loss": 0.7778, + "step": 2878 + }, + { + "epoch": 0.43, + "grad_norm": 1.4112609580702526, + "learning_rate": 1.273416872705439e-05, + "loss": 0.8642, + "step": 2879 + }, + { + "epoch": 0.43, + "grad_norm": 1.4835508744129458, + "learning_rate": 1.2729519355173254e-05, + "loss": 0.8629, + "step": 2880 + }, + { + "epoch": 0.43, + "grad_norm": 1.367291082136817, + "learning_rate": 1.2724869345676125e-05, + "loss": 0.9205, + "step": 2881 + }, + { + "epoch": 0.43, + "grad_norm": 1.4035915246735866, + "learning_rate": 1.2720218699649243e-05, + "loss": 0.8473, + "step": 2882 + }, + { + "epoch": 0.43, + "grad_norm": 1.4245481122805437, + "learning_rate": 1.2715567418179004e-05, + "loss": 0.9134, + "step": 2883 + }, + { + "epoch": 0.43, + "grad_norm": 1.3359999056511975, + "learning_rate": 1.2710915502351944e-05, + "loss": 0.8786, + "step": 2884 + }, + { + "epoch": 0.43, + "grad_norm": 1.543552965310203, + "learning_rate": 1.2706262953254753e-05, + "loss": 0.8447, + "step": 2885 + }, + { + "epoch": 0.43, + "grad_norm": 1.5624320277688122, + "learning_rate": 1.2701609771974266e-05, + "loss": 0.8168, + "step": 2886 + }, + { + "epoch": 0.43, + "grad_norm": 1.4960428919003828, + "learning_rate": 1.269695595959747e-05, + "loss": 0.8791, + "step": 2887 + }, + { + "epoch": 0.43, + "grad_norm": 1.4859725637139685, + "learning_rate": 1.2692301517211489e-05, + "loss": 0.9044, + "step": 2888 + }, + { + "epoch": 0.43, + "grad_norm": 1.6363436763058141, + "learning_rate": 1.2687646445903608e-05, + "loss": 0.8768, + "step": 2889 + }, + { + "epoch": 0.43, + "grad_norm": 1.487628766204558, + "learning_rate": 1.268299074676125e-05, + "loss": 0.8297, + "step": 2890 + }, + { + "epoch": 0.43, + "grad_norm": 1.4636866411737361, + "learning_rate": 1.2678334420871984e-05, + "loss": 0.8336, + "step": 2891 + }, + { + "epoch": 0.43, + "grad_norm": 1.5409720678180998, + "learning_rate": 1.2673677469323532e-05, + "loss": 0.8385, + "step": 2892 + }, + { + "epoch": 0.43, + "grad_norm": 1.503300617936997, + "learning_rate": 1.2669019893203758e-05, + "loss": 0.854, + "step": 2893 + }, + { + "epoch": 0.43, + "grad_norm": 1.558709754546662, + "learning_rate": 1.266436169360067e-05, + "loss": 0.8503, + "step": 2894 + }, + { + "epoch": 0.43, + "grad_norm": 2.748108289041869, + "learning_rate": 1.2659702871602423e-05, + "loss": 0.8621, + "step": 2895 + }, + { + "epoch": 0.43, + "grad_norm": 1.4674572658043554, + "learning_rate": 1.2655043428297322e-05, + "loss": 0.8414, + "step": 2896 + }, + { + "epoch": 0.43, + "grad_norm": 1.3357567100164935, + "learning_rate": 1.2650383364773812e-05, + "loss": 0.9621, + "step": 2897 + }, + { + "epoch": 0.43, + "grad_norm": 1.3223168498418596, + "learning_rate": 1.2645722682120483e-05, + "loss": 0.8976, + "step": 2898 + }, + { + "epoch": 0.43, + "grad_norm": 1.5668371266720529, + "learning_rate": 1.2641061381426072e-05, + "loss": 0.9014, + "step": 2899 + }, + { + "epoch": 0.43, + "grad_norm": 1.440731960843974, + "learning_rate": 1.2636399463779458e-05, + "loss": 0.8922, + "step": 2900 + }, + { + "epoch": 0.43, + "grad_norm": 1.6289682090648314, + "learning_rate": 1.2631736930269669e-05, + "loss": 0.8364, + "step": 2901 + }, + { + "epoch": 0.43, + "grad_norm": 1.6478553700038558, + "learning_rate": 1.262707378198587e-05, + "loss": 0.9544, + "step": 2902 + }, + { + "epoch": 0.43, + "grad_norm": 1.4663725733256079, + "learning_rate": 1.2622410020017374e-05, + "loss": 0.8472, + "step": 2903 + }, + { + "epoch": 0.43, + "grad_norm": 1.6485851665770868, + "learning_rate": 1.2617745645453637e-05, + "loss": 0.8724, + "step": 2904 + }, + { + "epoch": 0.43, + "grad_norm": 1.622954768793349, + "learning_rate": 1.2613080659384253e-05, + "loss": 0.8446, + "step": 2905 + }, + { + "epoch": 0.43, + "grad_norm": 1.4334313225264799, + "learning_rate": 1.2608415062898971e-05, + "loss": 0.8179, + "step": 2906 + }, + { + "epoch": 0.43, + "grad_norm": 1.2360768657963812, + "learning_rate": 1.2603748857087668e-05, + "loss": 0.8575, + "step": 2907 + }, + { + "epoch": 0.43, + "grad_norm": 1.4295615788283238, + "learning_rate": 1.2599082043040372e-05, + "loss": 0.8954, + "step": 2908 + }, + { + "epoch": 0.43, + "grad_norm": 1.4531978690170937, + "learning_rate": 1.2594414621847255e-05, + "loss": 0.888, + "step": 2909 + }, + { + "epoch": 0.43, + "grad_norm": 1.3773406930728338, + "learning_rate": 1.258974659459862e-05, + "loss": 0.9444, + "step": 2910 + }, + { + "epoch": 0.43, + "grad_norm": 1.46623077211472, + "learning_rate": 1.2585077962384924e-05, + "loss": 0.8639, + "step": 2911 + }, + { + "epoch": 0.43, + "grad_norm": 1.3953303530587875, + "learning_rate": 1.258040872629676e-05, + "loss": 0.8458, + "step": 2912 + }, + { + "epoch": 0.43, + "grad_norm": 1.3597271527516432, + "learning_rate": 1.2575738887424858e-05, + "loss": 0.8017, + "step": 2913 + }, + { + "epoch": 0.43, + "grad_norm": 1.4115092082432688, + "learning_rate": 1.2571068446860095e-05, + "loss": 0.9054, + "step": 2914 + }, + { + "epoch": 0.43, + "grad_norm": 1.6165288906301793, + "learning_rate": 1.2566397405693492e-05, + "loss": 0.8772, + "step": 2915 + }, + { + "epoch": 0.44, + "grad_norm": 1.5197170948046925, + "learning_rate": 1.2561725765016192e-05, + "loss": 1.003, + "step": 2916 + }, + { + "epoch": 0.44, + "grad_norm": 1.471604164934742, + "learning_rate": 1.2557053525919503e-05, + "loss": 0.9166, + "step": 2917 + }, + { + "epoch": 0.44, + "grad_norm": 1.478769754215771, + "learning_rate": 1.2552380689494857e-05, + "loss": 0.8545, + "step": 2918 + }, + { + "epoch": 0.44, + "grad_norm": 1.3551497915510557, + "learning_rate": 1.2547707256833823e-05, + "loss": 0.8997, + "step": 2919 + }, + { + "epoch": 0.44, + "grad_norm": 1.5662929685398246, + "learning_rate": 1.254303322902812e-05, + "loss": 0.8499, + "step": 2920 + }, + { + "epoch": 0.44, + "grad_norm": 1.660737843459019, + "learning_rate": 1.2538358607169605e-05, + "loss": 0.9351, + "step": 2921 + }, + { + "epoch": 0.44, + "grad_norm": 1.4782547714334449, + "learning_rate": 1.2533683392350264e-05, + "loss": 0.9234, + "step": 2922 + }, + { + "epoch": 0.44, + "grad_norm": 1.380743854848808, + "learning_rate": 1.2529007585662235e-05, + "loss": 0.8727, + "step": 2923 + }, + { + "epoch": 0.44, + "grad_norm": 1.4669722866160526, + "learning_rate": 1.2524331188197776e-05, + "loss": 0.8531, + "step": 2924 + }, + { + "epoch": 0.44, + "grad_norm": 0.8843042136817216, + "learning_rate": 1.25196542010493e-05, + "loss": 0.3118, + "step": 2925 + }, + { + "epoch": 0.44, + "grad_norm": 1.4103489952766648, + "learning_rate": 1.2514976625309357e-05, + "loss": 0.8847, + "step": 2926 + }, + { + "epoch": 0.44, + "grad_norm": 1.5607810912942268, + "learning_rate": 1.2510298462070619e-05, + "loss": 0.8251, + "step": 2927 + }, + { + "epoch": 0.44, + "grad_norm": 1.5495810073888892, + "learning_rate": 1.2505619712425912e-05, + "loss": 0.7579, + "step": 2928 + }, + { + "epoch": 0.44, + "grad_norm": 1.284160925724493, + "learning_rate": 1.2500940377468188e-05, + "loss": 0.8677, + "step": 2929 + }, + { + "epoch": 0.44, + "grad_norm": 1.2522987946338024, + "learning_rate": 1.2496260458290545e-05, + "loss": 0.852, + "step": 2930 + }, + { + "epoch": 0.44, + "grad_norm": 1.4964180263746187, + "learning_rate": 1.249157995598621e-05, + "loss": 0.9062, + "step": 2931 + }, + { + "epoch": 0.44, + "grad_norm": 1.4671293293346372, + "learning_rate": 1.2486898871648552e-05, + "loss": 0.8764, + "step": 2932 + }, + { + "epoch": 0.44, + "grad_norm": 1.436913718000181, + "learning_rate": 1.2482217206371065e-05, + "loss": 0.8216, + "step": 2933 + }, + { + "epoch": 0.44, + "grad_norm": 1.7265448450678933, + "learning_rate": 1.2477534961247393e-05, + "loss": 0.9017, + "step": 2934 + }, + { + "epoch": 0.44, + "grad_norm": 1.4570819172218483, + "learning_rate": 1.247285213737131e-05, + "loss": 0.9173, + "step": 2935 + }, + { + "epoch": 0.44, + "grad_norm": 1.4395717619838253, + "learning_rate": 1.2468168735836716e-05, + "loss": 0.8328, + "step": 2936 + }, + { + "epoch": 0.44, + "grad_norm": 1.6316807020716233, + "learning_rate": 1.2463484757737663e-05, + "loss": 0.9196, + "step": 2937 + }, + { + "epoch": 0.44, + "grad_norm": 1.4842598632028106, + "learning_rate": 1.2458800204168324e-05, + "loss": 0.859, + "step": 2938 + }, + { + "epoch": 0.44, + "grad_norm": 1.339004209088753, + "learning_rate": 1.2454115076223012e-05, + "loss": 0.8488, + "step": 2939 + }, + { + "epoch": 0.44, + "grad_norm": 1.4390710921553949, + "learning_rate": 1.2449429374996176e-05, + "loss": 0.8926, + "step": 2940 + }, + { + "epoch": 0.44, + "grad_norm": 1.5354211775212254, + "learning_rate": 1.2444743101582392e-05, + "loss": 0.9376, + "step": 2941 + }, + { + "epoch": 0.44, + "grad_norm": 1.3997777983897326, + "learning_rate": 1.2440056257076376e-05, + "loss": 0.91, + "step": 2942 + }, + { + "epoch": 0.44, + "grad_norm": 1.4190529850055054, + "learning_rate": 1.2435368842572975e-05, + "loss": 0.8968, + "step": 2943 + }, + { + "epoch": 0.44, + "grad_norm": 1.4147911394280994, + "learning_rate": 1.243068085916717e-05, + "loss": 0.789, + "step": 2944 + }, + { + "epoch": 0.44, + "grad_norm": 1.4818082828739265, + "learning_rate": 1.2425992307954075e-05, + "loss": 0.8539, + "step": 2945 + }, + { + "epoch": 0.44, + "grad_norm": 1.5425223924156828, + "learning_rate": 1.2421303190028935e-05, + "loss": 0.9645, + "step": 2946 + }, + { + "epoch": 0.44, + "grad_norm": 1.3171606731963654, + "learning_rate": 1.241661350648713e-05, + "loss": 0.8771, + "step": 2947 + }, + { + "epoch": 0.44, + "grad_norm": 1.4668479674338866, + "learning_rate": 1.2411923258424167e-05, + "loss": 0.9488, + "step": 2948 + }, + { + "epoch": 0.44, + "grad_norm": 1.48443304437788, + "learning_rate": 1.240723244693569e-05, + "loss": 0.856, + "step": 2949 + }, + { + "epoch": 0.44, + "grad_norm": 1.4663900656968, + "learning_rate": 1.2402541073117475e-05, + "loss": 0.8797, + "step": 2950 + }, + { + "epoch": 0.44, + "grad_norm": 1.4446205274041983, + "learning_rate": 1.2397849138065428e-05, + "loss": 0.8935, + "step": 2951 + }, + { + "epoch": 0.44, + "grad_norm": 1.4001794613294685, + "learning_rate": 1.2393156642875579e-05, + "loss": 0.8156, + "step": 2952 + }, + { + "epoch": 0.44, + "grad_norm": 1.373598546837832, + "learning_rate": 1.2388463588644102e-05, + "loss": 0.8819, + "step": 2953 + }, + { + "epoch": 0.44, + "grad_norm": 1.3595147718722762, + "learning_rate": 1.2383769976467295e-05, + "loss": 0.8764, + "step": 2954 + }, + { + "epoch": 0.44, + "grad_norm": 1.493008891813113, + "learning_rate": 1.237907580744158e-05, + "loss": 0.8964, + "step": 2955 + }, + { + "epoch": 0.44, + "grad_norm": 1.6334055492059742, + "learning_rate": 1.237438108266352e-05, + "loss": 0.8859, + "step": 2956 + }, + { + "epoch": 0.44, + "grad_norm": 1.5206122659564747, + "learning_rate": 1.2369685803229802e-05, + "loss": 0.913, + "step": 2957 + }, + { + "epoch": 0.44, + "grad_norm": 1.384897602393876, + "learning_rate": 1.236498997023725e-05, + "loss": 0.9501, + "step": 2958 + }, + { + "epoch": 0.44, + "grad_norm": 1.3548174703348206, + "learning_rate": 1.2360293584782799e-05, + "loss": 0.9317, + "step": 2959 + }, + { + "epoch": 0.44, + "grad_norm": 1.511302509347131, + "learning_rate": 1.2355596647963533e-05, + "loss": 0.8618, + "step": 2960 + }, + { + "epoch": 0.44, + "grad_norm": 1.574174360922581, + "learning_rate": 1.2350899160876657e-05, + "loss": 0.8419, + "step": 2961 + }, + { + "epoch": 0.44, + "grad_norm": 1.3873586722837887, + "learning_rate": 1.2346201124619502e-05, + "loss": 0.823, + "step": 2962 + }, + { + "epoch": 0.44, + "grad_norm": 1.5674222067757635, + "learning_rate": 1.234150254028953e-05, + "loss": 0.8077, + "step": 2963 + }, + { + "epoch": 0.44, + "grad_norm": 1.392208042164477, + "learning_rate": 1.2336803408984333e-05, + "loss": 0.7711, + "step": 2964 + }, + { + "epoch": 0.44, + "grad_norm": 1.3437310497478019, + "learning_rate": 1.2332103731801626e-05, + "loss": 0.8969, + "step": 2965 + }, + { + "epoch": 0.44, + "grad_norm": 1.4877327236075626, + "learning_rate": 1.2327403509839253e-05, + "loss": 0.8666, + "step": 2966 + }, + { + "epoch": 0.44, + "grad_norm": 1.6759832300494302, + "learning_rate": 1.2322702744195192e-05, + "loss": 0.8277, + "step": 2967 + }, + { + "epoch": 0.44, + "grad_norm": 1.3762100155699448, + "learning_rate": 1.2318001435967535e-05, + "loss": 0.8168, + "step": 2968 + }, + { + "epoch": 0.44, + "grad_norm": 1.4008679387862262, + "learning_rate": 1.2313299586254512e-05, + "loss": 0.9323, + "step": 2969 + }, + { + "epoch": 0.44, + "grad_norm": 1.337826624961473, + "learning_rate": 1.230859719615448e-05, + "loss": 0.8607, + "step": 2970 + }, + { + "epoch": 0.44, + "grad_norm": 1.2592926335081416, + "learning_rate": 1.2303894266765908e-05, + "loss": 0.821, + "step": 2971 + }, + { + "epoch": 0.44, + "grad_norm": 1.3355145189126434, + "learning_rate": 1.2299190799187405e-05, + "loss": 0.8837, + "step": 2972 + }, + { + "epoch": 0.44, + "grad_norm": 1.4291473069063707, + "learning_rate": 1.2294486794517703e-05, + "loss": 0.8396, + "step": 2973 + }, + { + "epoch": 0.44, + "grad_norm": 1.5630904679409707, + "learning_rate": 1.228978225385566e-05, + "loss": 0.8794, + "step": 2974 + }, + { + "epoch": 0.44, + "grad_norm": 1.547518354345178, + "learning_rate": 1.2285077178300253e-05, + "loss": 0.8506, + "step": 2975 + }, + { + "epoch": 0.44, + "grad_norm": 1.546085980064757, + "learning_rate": 1.2280371568950588e-05, + "loss": 0.9193, + "step": 2976 + }, + { + "epoch": 0.44, + "grad_norm": 1.2997699477828981, + "learning_rate": 1.22756654269059e-05, + "loss": 0.8235, + "step": 2977 + }, + { + "epoch": 0.44, + "grad_norm": 1.5694756686013684, + "learning_rate": 1.2270958753265541e-05, + "loss": 0.8518, + "step": 2978 + }, + { + "epoch": 0.44, + "grad_norm": 1.5683788521627309, + "learning_rate": 1.2266251549128989e-05, + "loss": 0.939, + "step": 2979 + }, + { + "epoch": 0.44, + "grad_norm": 1.507400188255447, + "learning_rate": 1.2261543815595853e-05, + "loss": 0.9251, + "step": 2980 + }, + { + "epoch": 0.44, + "grad_norm": 1.711953762745263, + "learning_rate": 1.2256835553765853e-05, + "loss": 0.9525, + "step": 2981 + }, + { + "epoch": 0.44, + "grad_norm": 0.8813211183435995, + "learning_rate": 1.2252126764738845e-05, + "loss": 0.3422, + "step": 2982 + }, + { + "epoch": 0.45, + "grad_norm": 1.4273060196502532, + "learning_rate": 1.2247417449614801e-05, + "loss": 0.8891, + "step": 2983 + }, + { + "epoch": 0.45, + "grad_norm": 1.4557649393213015, + "learning_rate": 1.2242707609493814e-05, + "loss": 0.8817, + "step": 2984 + }, + { + "epoch": 0.45, + "grad_norm": 1.4776730423092455, + "learning_rate": 1.2237997245476108e-05, + "loss": 0.9373, + "step": 2985 + }, + { + "epoch": 0.45, + "grad_norm": 1.5023754800906353, + "learning_rate": 1.2233286358662019e-05, + "loss": 0.8223, + "step": 2986 + }, + { + "epoch": 0.45, + "grad_norm": 1.5557183090397682, + "learning_rate": 1.222857495015202e-05, + "loss": 0.8227, + "step": 2987 + }, + { + "epoch": 0.45, + "grad_norm": 1.3720835125108328, + "learning_rate": 1.2223863021046687e-05, + "loss": 0.925, + "step": 2988 + }, + { + "epoch": 0.45, + "grad_norm": 1.4295231797681311, + "learning_rate": 1.2219150572446729e-05, + "loss": 0.893, + "step": 2989 + }, + { + "epoch": 0.45, + "grad_norm": 1.348506772437856, + "learning_rate": 1.221443760545298e-05, + "loss": 0.9044, + "step": 2990 + }, + { + "epoch": 0.45, + "grad_norm": 1.7005311786479078, + "learning_rate": 1.2209724121166384e-05, + "loss": 0.8561, + "step": 2991 + }, + { + "epoch": 0.45, + "grad_norm": 1.357581686999907, + "learning_rate": 1.2205010120688012e-05, + "loss": 0.9322, + "step": 2992 + }, + { + "epoch": 0.45, + "grad_norm": 1.4693097908211747, + "learning_rate": 1.2200295605119059e-05, + "loss": 0.9076, + "step": 2993 + }, + { + "epoch": 0.45, + "grad_norm": 1.312130254367557, + "learning_rate": 1.2195580575560833e-05, + "loss": 0.8045, + "step": 2994 + }, + { + "epoch": 0.45, + "grad_norm": 1.5142933535676377, + "learning_rate": 1.2190865033114763e-05, + "loss": 0.8597, + "step": 2995 + }, + { + "epoch": 0.45, + "grad_norm": 1.4014620260386157, + "learning_rate": 1.2186148978882406e-05, + "loss": 0.9623, + "step": 2996 + }, + { + "epoch": 0.45, + "grad_norm": 1.5077486148207053, + "learning_rate": 1.2181432413965428e-05, + "loss": 0.8405, + "step": 2997 + }, + { + "epoch": 0.45, + "grad_norm": 1.4031928879949722, + "learning_rate": 1.217671533946562e-05, + "loss": 0.8714, + "step": 2998 + }, + { + "epoch": 0.45, + "grad_norm": 1.5586041202773804, + "learning_rate": 1.2171997756484895e-05, + "loss": 0.7691, + "step": 2999 + }, + { + "epoch": 0.45, + "grad_norm": 1.5050302311989334, + "learning_rate": 1.2167279666125275e-05, + "loss": 0.7938, + "step": 3000 + }, + { + "epoch": 0.45, + "grad_norm": 1.4605864317221, + "learning_rate": 1.216256106948891e-05, + "loss": 0.8903, + "step": 3001 + }, + { + "epoch": 0.45, + "grad_norm": 1.7399280537520456, + "learning_rate": 1.2157841967678064e-05, + "loss": 0.8701, + "step": 3002 + }, + { + "epoch": 0.45, + "grad_norm": 1.201863932314353, + "learning_rate": 1.215312236179512e-05, + "loss": 0.9012, + "step": 3003 + }, + { + "epoch": 0.45, + "grad_norm": 1.3230824785873494, + "learning_rate": 1.2148402252942575e-05, + "loss": 0.8315, + "step": 3004 + }, + { + "epoch": 0.45, + "grad_norm": 1.292362376353151, + "learning_rate": 1.214368164222305e-05, + "loss": 0.8847, + "step": 3005 + }, + { + "epoch": 0.45, + "grad_norm": 1.315792264578551, + "learning_rate": 1.2138960530739283e-05, + "loss": 0.8831, + "step": 3006 + }, + { + "epoch": 0.45, + "grad_norm": 1.7180431006832764, + "learning_rate": 1.2134238919594122e-05, + "loss": 0.9555, + "step": 3007 + }, + { + "epoch": 0.45, + "grad_norm": 1.499204376806803, + "learning_rate": 1.2129516809890536e-05, + "loss": 0.9263, + "step": 3008 + }, + { + "epoch": 0.45, + "grad_norm": 1.497233422980793, + "learning_rate": 1.2124794202731611e-05, + "loss": 0.835, + "step": 3009 + }, + { + "epoch": 0.45, + "grad_norm": 1.4675135023821613, + "learning_rate": 1.212007109922055e-05, + "loss": 0.8658, + "step": 3010 + }, + { + "epoch": 0.45, + "grad_norm": 1.2599068858175837, + "learning_rate": 1.2115347500460666e-05, + "loss": 0.8713, + "step": 3011 + }, + { + "epoch": 0.45, + "grad_norm": 1.2566553199473303, + "learning_rate": 1.2110623407555398e-05, + "loss": 0.8825, + "step": 3012 + }, + { + "epoch": 0.45, + "grad_norm": 1.4885573950716942, + "learning_rate": 1.2105898821608291e-05, + "loss": 0.8454, + "step": 3013 + }, + { + "epoch": 0.45, + "grad_norm": 1.358555798359703, + "learning_rate": 1.2101173743723007e-05, + "loss": 0.8169, + "step": 3014 + }, + { + "epoch": 0.45, + "grad_norm": 1.2938516791672219, + "learning_rate": 1.2096448175003329e-05, + "loss": 0.7889, + "step": 3015 + }, + { + "epoch": 0.45, + "grad_norm": 1.473109130130557, + "learning_rate": 1.2091722116553148e-05, + "loss": 0.8483, + "step": 3016 + }, + { + "epoch": 0.45, + "grad_norm": 1.4099539865326929, + "learning_rate": 1.2086995569476474e-05, + "loss": 0.8039, + "step": 3017 + }, + { + "epoch": 0.45, + "grad_norm": 1.4280624006301712, + "learning_rate": 1.2082268534877425e-05, + "loss": 0.9476, + "step": 3018 + }, + { + "epoch": 0.45, + "grad_norm": 1.331332074968661, + "learning_rate": 1.207754101386024e-05, + "loss": 0.8361, + "step": 3019 + }, + { + "epoch": 0.45, + "grad_norm": 1.340391751819343, + "learning_rate": 1.2072813007529267e-05, + "loss": 0.856, + "step": 3020 + }, + { + "epoch": 0.45, + "grad_norm": 1.452627894270564, + "learning_rate": 1.206808451698897e-05, + "loss": 0.8399, + "step": 3021 + }, + { + "epoch": 0.45, + "grad_norm": 1.2531290631573224, + "learning_rate": 1.2063355543343925e-05, + "loss": 0.8818, + "step": 3022 + }, + { + "epoch": 0.45, + "grad_norm": 1.6892794326843585, + "learning_rate": 1.2058626087698814e-05, + "loss": 0.8702, + "step": 3023 + }, + { + "epoch": 0.45, + "grad_norm": 1.333843669200861, + "learning_rate": 1.2053896151158446e-05, + "loss": 0.874, + "step": 3024 + }, + { + "epoch": 0.45, + "grad_norm": 1.46007175981807, + "learning_rate": 1.2049165734827737e-05, + "loss": 0.8874, + "step": 3025 + }, + { + "epoch": 0.45, + "grad_norm": 1.3511418621030704, + "learning_rate": 1.2044434839811702e-05, + "loss": 0.7554, + "step": 3026 + }, + { + "epoch": 0.45, + "grad_norm": 1.4641088358794427, + "learning_rate": 1.2039703467215489e-05, + "loss": 0.8504, + "step": 3027 + }, + { + "epoch": 0.45, + "grad_norm": 1.3961767014444657, + "learning_rate": 1.203497161814434e-05, + "loss": 0.8116, + "step": 3028 + }, + { + "epoch": 0.45, + "grad_norm": 1.5106527626286321, + "learning_rate": 1.203023929370362e-05, + "loss": 0.7546, + "step": 3029 + }, + { + "epoch": 0.45, + "grad_norm": 1.3812305748714984, + "learning_rate": 1.2025506494998797e-05, + "loss": 0.9022, + "step": 3030 + }, + { + "epoch": 0.45, + "grad_norm": 1.4536285156128268, + "learning_rate": 1.2020773223135458e-05, + "loss": 0.8524, + "step": 3031 + }, + { + "epoch": 0.45, + "grad_norm": 1.4920000194439378, + "learning_rate": 1.2016039479219293e-05, + "loss": 0.8502, + "step": 3032 + }, + { + "epoch": 0.45, + "grad_norm": 1.4418751740827567, + "learning_rate": 1.2011305264356101e-05, + "loss": 0.8617, + "step": 3033 + }, + { + "epoch": 0.45, + "grad_norm": 1.4090200454050512, + "learning_rate": 1.2006570579651799e-05, + "loss": 0.919, + "step": 3034 + }, + { + "epoch": 0.45, + "grad_norm": 1.5198068883611635, + "learning_rate": 1.2001835426212413e-05, + "loss": 0.8118, + "step": 3035 + }, + { + "epoch": 0.45, + "grad_norm": 1.516676265353004, + "learning_rate": 1.1997099805144071e-05, + "loss": 0.8816, + "step": 3036 + }, + { + "epoch": 0.45, + "grad_norm": 1.2685939964065611, + "learning_rate": 1.1992363717553015e-05, + "loss": 0.8116, + "step": 3037 + }, + { + "epoch": 0.45, + "grad_norm": 1.3990909685267978, + "learning_rate": 1.1987627164545597e-05, + "loss": 0.8341, + "step": 3038 + }, + { + "epoch": 0.45, + "grad_norm": 1.6251760803476545, + "learning_rate": 1.1982890147228275e-05, + "loss": 0.8656, + "step": 3039 + }, + { + "epoch": 0.45, + "grad_norm": 1.424315107003013, + "learning_rate": 1.1978152666707614e-05, + "loss": 0.8372, + "step": 3040 + }, + { + "epoch": 0.45, + "grad_norm": 0.9660462928937239, + "learning_rate": 1.1973414724090297e-05, + "loss": 0.3461, + "step": 3041 + }, + { + "epoch": 0.45, + "grad_norm": 1.654321301042364, + "learning_rate": 1.1968676320483103e-05, + "loss": 0.9203, + "step": 3042 + }, + { + "epoch": 0.45, + "grad_norm": 1.4530487848106561, + "learning_rate": 1.1963937456992922e-05, + "loss": 0.9107, + "step": 3043 + }, + { + "epoch": 0.45, + "grad_norm": 1.4630946308693005, + "learning_rate": 1.195919813472676e-05, + "loss": 0.9254, + "step": 3044 + }, + { + "epoch": 0.45, + "grad_norm": 1.4754084266364316, + "learning_rate": 1.1954458354791716e-05, + "loss": 0.8686, + "step": 3045 + }, + { + "epoch": 0.45, + "grad_norm": 1.611348266475679, + "learning_rate": 1.1949718118295006e-05, + "loss": 0.8129, + "step": 3046 + }, + { + "epoch": 0.45, + "grad_norm": 1.0019443745816585, + "learning_rate": 1.194497742634395e-05, + "loss": 0.3489, + "step": 3047 + }, + { + "epoch": 0.45, + "grad_norm": 1.4659915400914505, + "learning_rate": 1.1940236280045968e-05, + "loss": 0.8748, + "step": 3048 + }, + { + "epoch": 0.45, + "grad_norm": 1.71761967706553, + "learning_rate": 1.1935494680508606e-05, + "loss": 0.8715, + "step": 3049 + }, + { + "epoch": 0.46, + "grad_norm": 1.5164726322881468, + "learning_rate": 1.1930752628839487e-05, + "loss": 0.7458, + "step": 3050 + }, + { + "epoch": 0.46, + "grad_norm": 1.7200021138514376, + "learning_rate": 1.1926010126146368e-05, + "loss": 0.8808, + "step": 3051 + }, + { + "epoch": 0.46, + "grad_norm": 1.393900459474437, + "learning_rate": 1.1921267173537085e-05, + "loss": 0.8703, + "step": 3052 + }, + { + "epoch": 0.46, + "grad_norm": 1.4933222998083704, + "learning_rate": 1.1916523772119603e-05, + "loss": 0.8706, + "step": 3053 + }, + { + "epoch": 0.46, + "grad_norm": 1.5730561123281428, + "learning_rate": 1.1911779923001976e-05, + "loss": 0.8166, + "step": 3054 + }, + { + "epoch": 0.46, + "grad_norm": 1.2444854292962508, + "learning_rate": 1.1907035627292367e-05, + "loss": 0.8736, + "step": 3055 + }, + { + "epoch": 0.46, + "grad_norm": 1.431234221392283, + "learning_rate": 1.1902290886099048e-05, + "loss": 0.9329, + "step": 3056 + }, + { + "epoch": 0.46, + "grad_norm": 1.5751700224859408, + "learning_rate": 1.1897545700530387e-05, + "loss": 0.8516, + "step": 3057 + }, + { + "epoch": 0.46, + "grad_norm": 1.5363727280373627, + "learning_rate": 1.1892800071694862e-05, + "loss": 0.9249, + "step": 3058 + }, + { + "epoch": 0.46, + "grad_norm": 1.5859313598288451, + "learning_rate": 1.188805400070105e-05, + "loss": 0.8546, + "step": 3059 + }, + { + "epoch": 0.46, + "grad_norm": 2.012910401994648, + "learning_rate": 1.1883307488657637e-05, + "loss": 0.8183, + "step": 3060 + }, + { + "epoch": 0.46, + "grad_norm": 1.5128460889604634, + "learning_rate": 1.1878560536673407e-05, + "loss": 0.829, + "step": 3061 + }, + { + "epoch": 0.46, + "grad_norm": 1.4679675104005057, + "learning_rate": 1.187381314585725e-05, + "loss": 0.889, + "step": 3062 + }, + { + "epoch": 0.46, + "grad_norm": 1.4133970324165064, + "learning_rate": 1.1869065317318151e-05, + "loss": 0.7788, + "step": 3063 + }, + { + "epoch": 0.46, + "grad_norm": 1.3780765989080113, + "learning_rate": 1.1864317052165213e-05, + "loss": 0.8574, + "step": 3064 + }, + { + "epoch": 0.46, + "grad_norm": 1.437439415784921, + "learning_rate": 1.1859568351507623e-05, + "loss": 0.8575, + "step": 3065 + }, + { + "epoch": 0.46, + "grad_norm": 1.6865035355746034, + "learning_rate": 1.1854819216454678e-05, + "loss": 0.8581, + "step": 3066 + }, + { + "epoch": 0.46, + "grad_norm": 1.319392966838635, + "learning_rate": 1.1850069648115785e-05, + "loss": 0.8765, + "step": 3067 + }, + { + "epoch": 0.46, + "grad_norm": 1.455221215195415, + "learning_rate": 1.1845319647600433e-05, + "loss": 0.8913, + "step": 3068 + }, + { + "epoch": 0.46, + "grad_norm": 1.602324623112788, + "learning_rate": 1.184056921601823e-05, + "loss": 0.8388, + "step": 3069 + }, + { + "epoch": 0.46, + "grad_norm": 1.4966083679868898, + "learning_rate": 1.1835818354478876e-05, + "loss": 0.9157, + "step": 3070 + }, + { + "epoch": 0.46, + "grad_norm": 1.8192445688038177, + "learning_rate": 1.183106706409217e-05, + "loss": 0.9695, + "step": 3071 + }, + { + "epoch": 0.46, + "grad_norm": 1.3695512627201891, + "learning_rate": 1.1826315345968014e-05, + "loss": 0.8006, + "step": 3072 + }, + { + "epoch": 0.46, + "grad_norm": 1.2715114936339134, + "learning_rate": 1.1821563201216413e-05, + "loss": 0.9115, + "step": 3073 + }, + { + "epoch": 0.46, + "grad_norm": 1.4951401845158783, + "learning_rate": 1.1816810630947466e-05, + "loss": 0.9116, + "step": 3074 + }, + { + "epoch": 0.46, + "grad_norm": 1.566681414382169, + "learning_rate": 1.1812057636271374e-05, + "loss": 0.7301, + "step": 3075 + }, + { + "epoch": 0.46, + "grad_norm": 1.4581646359031806, + "learning_rate": 1.1807304218298438e-05, + "loss": 0.8599, + "step": 3076 + }, + { + "epoch": 0.46, + "grad_norm": 1.4795194138643568, + "learning_rate": 1.180255037813906e-05, + "loss": 0.9039, + "step": 3077 + }, + { + "epoch": 0.46, + "grad_norm": 1.4435725086997526, + "learning_rate": 1.179779611690373e-05, + "loss": 0.7618, + "step": 3078 + }, + { + "epoch": 0.46, + "grad_norm": 1.583157657588781, + "learning_rate": 1.1793041435703048e-05, + "loss": 0.8929, + "step": 3079 + }, + { + "epoch": 0.46, + "grad_norm": 1.8743703877817455, + "learning_rate": 1.1788286335647712e-05, + "loss": 0.8314, + "step": 3080 + }, + { + "epoch": 0.46, + "grad_norm": 1.1796359925678022, + "learning_rate": 1.1783530817848505e-05, + "loss": 0.8939, + "step": 3081 + }, + { + "epoch": 0.46, + "grad_norm": 1.3686494082127807, + "learning_rate": 1.1778774883416325e-05, + "loss": 0.7968, + "step": 3082 + }, + { + "epoch": 0.46, + "grad_norm": 1.363617184109668, + "learning_rate": 1.1774018533462152e-05, + "loss": 0.9151, + "step": 3083 + }, + { + "epoch": 0.46, + "grad_norm": 1.3846582820377824, + "learning_rate": 1.1769261769097076e-05, + "loss": 0.8732, + "step": 3084 + }, + { + "epoch": 0.46, + "grad_norm": 1.3779437884252015, + "learning_rate": 1.1764504591432271e-05, + "loss": 0.9035, + "step": 3085 + }, + { + "epoch": 0.46, + "grad_norm": 1.7283154646505554, + "learning_rate": 1.1759747001579019e-05, + "loss": 0.9436, + "step": 3086 + }, + { + "epoch": 0.46, + "grad_norm": 1.3891214093803212, + "learning_rate": 1.1754989000648693e-05, + "loss": 0.7946, + "step": 3087 + }, + { + "epoch": 0.46, + "grad_norm": 1.506600063575142, + "learning_rate": 1.1750230589752763e-05, + "loss": 0.8634, + "step": 3088 + }, + { + "epoch": 0.46, + "grad_norm": 1.4777622886298265, + "learning_rate": 1.174547177000279e-05, + "loss": 0.9042, + "step": 3089 + }, + { + "epoch": 0.46, + "grad_norm": 1.5611376712322091, + "learning_rate": 1.1740712542510439e-05, + "loss": 0.8122, + "step": 3090 + }, + { + "epoch": 0.46, + "grad_norm": 1.4889340514664842, + "learning_rate": 1.1735952908387463e-05, + "loss": 0.8116, + "step": 3091 + }, + { + "epoch": 0.46, + "grad_norm": 1.347914404261615, + "learning_rate": 1.1731192868745716e-05, + "loss": 0.8829, + "step": 3092 + }, + { + "epoch": 0.46, + "grad_norm": 1.4937668557243533, + "learning_rate": 1.1726432424697144e-05, + "loss": 0.824, + "step": 3093 + }, + { + "epoch": 0.46, + "grad_norm": 1.4041903724659708, + "learning_rate": 1.1721671577353783e-05, + "loss": 0.9096, + "step": 3094 + }, + { + "epoch": 0.46, + "grad_norm": 1.5318421184838733, + "learning_rate": 1.171691032782777e-05, + "loss": 0.8226, + "step": 3095 + }, + { + "epoch": 0.46, + "grad_norm": 1.7790736698045904, + "learning_rate": 1.1712148677231336e-05, + "loss": 0.8268, + "step": 3096 + }, + { + "epoch": 0.46, + "grad_norm": 1.5518442757622766, + "learning_rate": 1.1707386626676798e-05, + "loss": 0.7812, + "step": 3097 + }, + { + "epoch": 0.46, + "grad_norm": 1.7627156813863276, + "learning_rate": 1.1702624177276574e-05, + "loss": 0.8417, + "step": 3098 + }, + { + "epoch": 0.46, + "grad_norm": 1.3227947069693133, + "learning_rate": 1.1697861330143174e-05, + "loss": 0.8252, + "step": 3099 + }, + { + "epoch": 0.46, + "grad_norm": 1.7663971765311302, + "learning_rate": 1.1693098086389198e-05, + "loss": 0.8531, + "step": 3100 + }, + { + "epoch": 0.46, + "grad_norm": 1.7351294060287639, + "learning_rate": 1.1688334447127338e-05, + "loss": 0.759, + "step": 3101 + }, + { + "epoch": 0.46, + "grad_norm": 1.722082010454196, + "learning_rate": 1.1683570413470384e-05, + "loss": 0.9057, + "step": 3102 + }, + { + "epoch": 0.46, + "grad_norm": 1.7910847676532884, + "learning_rate": 1.1678805986531216e-05, + "loss": 0.8273, + "step": 3103 + }, + { + "epoch": 0.46, + "grad_norm": 1.5471257526725348, + "learning_rate": 1.16740411674228e-05, + "loss": 0.8945, + "step": 3104 + }, + { + "epoch": 0.46, + "grad_norm": 1.4183108085337643, + "learning_rate": 1.1669275957258204e-05, + "loss": 0.9055, + "step": 3105 + }, + { + "epoch": 0.46, + "grad_norm": 1.421261728518497, + "learning_rate": 1.1664510357150575e-05, + "loss": 0.815, + "step": 3106 + }, + { + "epoch": 0.46, + "grad_norm": 1.4434060823473944, + "learning_rate": 1.1659744368213159e-05, + "loss": 0.9223, + "step": 3107 + }, + { + "epoch": 0.46, + "grad_norm": 1.4725094666423635, + "learning_rate": 1.1654977991559297e-05, + "loss": 0.8076, + "step": 3108 + }, + { + "epoch": 0.46, + "grad_norm": 1.4774040398734796, + "learning_rate": 1.165021122830241e-05, + "loss": 0.8453, + "step": 3109 + }, + { + "epoch": 0.46, + "grad_norm": 1.6651641815473082, + "learning_rate": 1.1645444079556017e-05, + "loss": 0.8856, + "step": 3110 + }, + { + "epoch": 0.46, + "grad_norm": 1.5036688084614938, + "learning_rate": 1.164067654643372e-05, + "loss": 0.8824, + "step": 3111 + }, + { + "epoch": 0.46, + "grad_norm": 1.5822542527877423, + "learning_rate": 1.163590863004922e-05, + "loss": 0.9043, + "step": 3112 + }, + { + "epoch": 0.46, + "grad_norm": 1.2481746630549335, + "learning_rate": 1.1631140331516302e-05, + "loss": 0.8621, + "step": 3113 + }, + { + "epoch": 0.46, + "grad_norm": 1.5057241172506834, + "learning_rate": 1.1626371651948839e-05, + "loss": 0.8129, + "step": 3114 + }, + { + "epoch": 0.46, + "grad_norm": 1.3543105735944232, + "learning_rate": 1.1621602592460794e-05, + "loss": 0.9079, + "step": 3115 + }, + { + "epoch": 0.46, + "grad_norm": 1.604022830985037, + "learning_rate": 1.1616833154166224e-05, + "loss": 0.8384, + "step": 3116 + }, + { + "epoch": 0.47, + "grad_norm": 1.4886478794749518, + "learning_rate": 1.1612063338179269e-05, + "loss": 0.8097, + "step": 3117 + }, + { + "epoch": 0.47, + "grad_norm": 1.6295960415754123, + "learning_rate": 1.1607293145614156e-05, + "loss": 0.7728, + "step": 3118 + }, + { + "epoch": 0.47, + "grad_norm": 1.4522191539033344, + "learning_rate": 1.1602522577585207e-05, + "loss": 0.9451, + "step": 3119 + }, + { + "epoch": 0.47, + "grad_norm": 1.6034491521738696, + "learning_rate": 1.159775163520682e-05, + "loss": 0.9183, + "step": 3120 + }, + { + "epoch": 0.47, + "grad_norm": 1.449268700723357, + "learning_rate": 1.1592980319593492e-05, + "loss": 0.8214, + "step": 3121 + }, + { + "epoch": 0.47, + "grad_norm": 1.6526954977646768, + "learning_rate": 1.1588208631859808e-05, + "loss": 0.8086, + "step": 3122 + }, + { + "epoch": 0.47, + "grad_norm": 1.3262290317258014, + "learning_rate": 1.1583436573120424e-05, + "loss": 0.9211, + "step": 3123 + }, + { + "epoch": 0.47, + "grad_norm": 1.3867548870697202, + "learning_rate": 1.1578664144490099e-05, + "loss": 0.8379, + "step": 3124 + }, + { + "epoch": 0.47, + "grad_norm": 1.5175965871387511, + "learning_rate": 1.1573891347083676e-05, + "loss": 0.8695, + "step": 3125 + }, + { + "epoch": 0.47, + "grad_norm": 1.4524827974980379, + "learning_rate": 1.1569118182016074e-05, + "loss": 0.8104, + "step": 3126 + }, + { + "epoch": 0.47, + "grad_norm": 1.3809905900351476, + "learning_rate": 1.156434465040231e-05, + "loss": 0.8211, + "step": 3127 + }, + { + "epoch": 0.47, + "grad_norm": 1.3353853939223754, + "learning_rate": 1.1559570753357481e-05, + "loss": 0.8801, + "step": 3128 + }, + { + "epoch": 0.47, + "grad_norm": 1.3752891848728184, + "learning_rate": 1.1554796491996767e-05, + "loss": 0.8375, + "step": 3129 + }, + { + "epoch": 0.47, + "grad_norm": 1.6699279205034372, + "learning_rate": 1.1550021867435437e-05, + "loss": 0.846, + "step": 3130 + }, + { + "epoch": 0.47, + "grad_norm": 1.297098362456666, + "learning_rate": 1.1545246880788845e-05, + "loss": 0.9009, + "step": 3131 + }, + { + "epoch": 0.47, + "grad_norm": 1.5235535895303025, + "learning_rate": 1.154047153317243e-05, + "loss": 0.8915, + "step": 3132 + }, + { + "epoch": 0.47, + "grad_norm": 1.3636984364423204, + "learning_rate": 1.153569582570171e-05, + "loss": 0.8818, + "step": 3133 + }, + { + "epoch": 0.47, + "grad_norm": 1.3220122875059277, + "learning_rate": 1.1530919759492289e-05, + "loss": 0.969, + "step": 3134 + }, + { + "epoch": 0.47, + "grad_norm": 1.8002835706233236, + "learning_rate": 1.1526143335659866e-05, + "loss": 0.781, + "step": 3135 + }, + { + "epoch": 0.47, + "grad_norm": 1.5655544663195764, + "learning_rate": 1.15213665553202e-05, + "loss": 0.9365, + "step": 3136 + }, + { + "epoch": 0.47, + "grad_norm": 1.400438407657353, + "learning_rate": 1.1516589419589159e-05, + "loss": 0.969, + "step": 3137 + }, + { + "epoch": 0.47, + "grad_norm": 1.5885045517561907, + "learning_rate": 1.1511811929582682e-05, + "loss": 0.909, + "step": 3138 + }, + { + "epoch": 0.47, + "grad_norm": 1.4630718208113698, + "learning_rate": 1.1507034086416781e-05, + "loss": 0.9308, + "step": 3139 + }, + { + "epoch": 0.47, + "grad_norm": 1.4686223980331212, + "learning_rate": 1.1502255891207572e-05, + "loss": 0.8355, + "step": 3140 + }, + { + "epoch": 0.47, + "grad_norm": 0.9354897055549466, + "learning_rate": 1.1497477345071238e-05, + "loss": 0.3223, + "step": 3141 + }, + { + "epoch": 0.47, + "grad_norm": 1.3394201911811199, + "learning_rate": 1.1492698449124042e-05, + "loss": 0.8666, + "step": 3142 + }, + { + "epoch": 0.47, + "grad_norm": 1.4512550720352924, + "learning_rate": 1.1487919204482343e-05, + "loss": 0.9129, + "step": 3143 + }, + { + "epoch": 0.47, + "grad_norm": 1.4555437021618638, + "learning_rate": 1.1483139612262569e-05, + "loss": 0.9393, + "step": 3144 + }, + { + "epoch": 0.47, + "grad_norm": 1.3724218359431628, + "learning_rate": 1.1478359673581235e-05, + "loss": 0.8974, + "step": 3145 + }, + { + "epoch": 0.47, + "grad_norm": 1.4362551434068322, + "learning_rate": 1.147357938955493e-05, + "loss": 0.8402, + "step": 3146 + }, + { + "epoch": 0.47, + "grad_norm": 1.4967591247823635, + "learning_rate": 1.1468798761300335e-05, + "loss": 0.8436, + "step": 3147 + }, + { + "epoch": 0.47, + "grad_norm": 1.4510910568733975, + "learning_rate": 1.1464017789934206e-05, + "loss": 0.8404, + "step": 3148 + }, + { + "epoch": 0.47, + "grad_norm": 1.531675922995948, + "learning_rate": 1.1459236476573373e-05, + "loss": 0.8667, + "step": 3149 + }, + { + "epoch": 0.47, + "grad_norm": 1.6720610569654422, + "learning_rate": 1.1454454822334753e-05, + "loss": 0.8412, + "step": 3150 + }, + { + "epoch": 0.47, + "grad_norm": 1.5006690271807168, + "learning_rate": 1.1449672828335344e-05, + "loss": 0.871, + "step": 3151 + }, + { + "epoch": 0.47, + "grad_norm": 1.3715690049951308, + "learning_rate": 1.1444890495692214e-05, + "loss": 0.9099, + "step": 3152 + }, + { + "epoch": 0.47, + "grad_norm": 1.5133068709058257, + "learning_rate": 1.1440107825522522e-05, + "loss": 0.9019, + "step": 3153 + }, + { + "epoch": 0.47, + "grad_norm": 1.3814938563260044, + "learning_rate": 1.1435324818943501e-05, + "loss": 0.8867, + "step": 3154 + }, + { + "epoch": 0.47, + "grad_norm": 1.3785832373916405, + "learning_rate": 1.1430541477072457e-05, + "loss": 0.8274, + "step": 3155 + }, + { + "epoch": 0.47, + "grad_norm": 1.4264784401677657, + "learning_rate": 1.142575780102678e-05, + "loss": 0.8389, + "step": 3156 + }, + { + "epoch": 0.47, + "grad_norm": 1.3109979329427974, + "learning_rate": 1.1420973791923941e-05, + "loss": 0.8932, + "step": 3157 + }, + { + "epoch": 0.47, + "grad_norm": 1.3402078122803545, + "learning_rate": 1.1416189450881483e-05, + "loss": 0.9515, + "step": 3158 + }, + { + "epoch": 0.47, + "grad_norm": 1.4710769890530828, + "learning_rate": 1.1411404779017026e-05, + "loss": 0.7858, + "step": 3159 + }, + { + "epoch": 0.47, + "grad_norm": 1.4478997998118535, + "learning_rate": 1.1406619777448271e-05, + "loss": 0.7494, + "step": 3160 + }, + { + "epoch": 0.47, + "grad_norm": 1.782353692942064, + "learning_rate": 1.1401834447293001e-05, + "loss": 0.8911, + "step": 3161 + }, + { + "epoch": 0.47, + "grad_norm": 1.418182572355784, + "learning_rate": 1.1397048789669061e-05, + "loss": 0.8355, + "step": 3162 + }, + { + "epoch": 0.47, + "grad_norm": 1.6036213457799025, + "learning_rate": 1.1392262805694382e-05, + "loss": 0.8555, + "step": 3163 + }, + { + "epoch": 0.47, + "grad_norm": 1.3161952446736378, + "learning_rate": 1.138747649648698e-05, + "loss": 0.7779, + "step": 3164 + }, + { + "epoch": 0.47, + "grad_norm": 1.5656688421834137, + "learning_rate": 1.1382689863164924e-05, + "loss": 0.9487, + "step": 3165 + }, + { + "epoch": 0.47, + "grad_norm": 1.4391459780451075, + "learning_rate": 1.137790290684638e-05, + "loss": 0.9187, + "step": 3166 + }, + { + "epoch": 0.47, + "grad_norm": 1.3957428030261596, + "learning_rate": 1.1373115628649582e-05, + "loss": 0.7765, + "step": 3167 + }, + { + "epoch": 0.47, + "grad_norm": 1.4744429134366377, + "learning_rate": 1.1368328029692834e-05, + "loss": 0.796, + "step": 3168 + }, + { + "epoch": 0.47, + "grad_norm": 1.4289570007884824, + "learning_rate": 1.1363540111094524e-05, + "loss": 0.8491, + "step": 3169 + }, + { + "epoch": 0.47, + "grad_norm": 1.5375573003663248, + "learning_rate": 1.1358751873973106e-05, + "loss": 0.9315, + "step": 3170 + }, + { + "epoch": 0.47, + "grad_norm": 1.3442175905496911, + "learning_rate": 1.1353963319447114e-05, + "loss": 0.8699, + "step": 3171 + }, + { + "epoch": 0.47, + "grad_norm": 1.3556248055585878, + "learning_rate": 1.1349174448635158e-05, + "loss": 0.7842, + "step": 3172 + }, + { + "epoch": 0.47, + "grad_norm": 1.4499807545858407, + "learning_rate": 1.1344385262655915e-05, + "loss": 0.9286, + "step": 3173 + }, + { + "epoch": 0.47, + "grad_norm": 1.4326422350461436, + "learning_rate": 1.1339595762628144e-05, + "loss": 0.8862, + "step": 3174 + }, + { + "epoch": 0.47, + "grad_norm": 1.4043441964772698, + "learning_rate": 1.1334805949670666e-05, + "loss": 0.8887, + "step": 3175 + }, + { + "epoch": 0.47, + "grad_norm": 1.3564087586115487, + "learning_rate": 1.1330015824902385e-05, + "loss": 0.8986, + "step": 3176 + }, + { + "epoch": 0.47, + "grad_norm": 1.4187450713962702, + "learning_rate": 1.1325225389442278e-05, + "loss": 0.8738, + "step": 3177 + }, + { + "epoch": 0.47, + "grad_norm": 1.500901250743024, + "learning_rate": 1.1320434644409384e-05, + "loss": 0.9082, + "step": 3178 + }, + { + "epoch": 0.47, + "grad_norm": 1.3701879941190935, + "learning_rate": 1.1315643590922827e-05, + "loss": 0.9028, + "step": 3179 + }, + { + "epoch": 0.47, + "grad_norm": 1.2702735022157703, + "learning_rate": 1.1310852230101797e-05, + "loss": 0.846, + "step": 3180 + }, + { + "epoch": 0.47, + "grad_norm": 1.5143694107192218, + "learning_rate": 1.1306060563065556e-05, + "loss": 0.9128, + "step": 3181 + }, + { + "epoch": 0.47, + "grad_norm": 1.5025758729603544, + "learning_rate": 1.1301268590933434e-05, + "loss": 0.8676, + "step": 3182 + }, + { + "epoch": 0.47, + "grad_norm": 1.5516499433241973, + "learning_rate": 1.1296476314824842e-05, + "loss": 0.8963, + "step": 3183 + }, + { + "epoch": 0.48, + "grad_norm": 1.2619508228695464, + "learning_rate": 1.1291683735859254e-05, + "loss": 0.8583, + "step": 3184 + }, + { + "epoch": 0.48, + "grad_norm": 1.4600969959255856, + "learning_rate": 1.1286890855156215e-05, + "loss": 0.8135, + "step": 3185 + }, + { + "epoch": 0.48, + "grad_norm": 1.3648612463451428, + "learning_rate": 1.1282097673835343e-05, + "loss": 0.964, + "step": 3186 + }, + { + "epoch": 0.48, + "grad_norm": 1.408230953232585, + "learning_rate": 1.1277304193016332e-05, + "loss": 0.9216, + "step": 3187 + }, + { + "epoch": 0.48, + "grad_norm": 1.3614985203381647, + "learning_rate": 1.1272510413818929e-05, + "loss": 0.8879, + "step": 3188 + }, + { + "epoch": 0.48, + "grad_norm": 1.3341023461384207, + "learning_rate": 1.1267716337362968e-05, + "loss": 0.8861, + "step": 3189 + }, + { + "epoch": 0.48, + "grad_norm": 1.5598086328497889, + "learning_rate": 1.1262921964768348e-05, + "loss": 0.891, + "step": 3190 + }, + { + "epoch": 0.48, + "grad_norm": 1.3691015843363636, + "learning_rate": 1.1258127297155027e-05, + "loss": 0.8791, + "step": 3191 + }, + { + "epoch": 0.48, + "grad_norm": 1.3519206199662215, + "learning_rate": 1.1253332335643043e-05, + "loss": 0.8901, + "step": 3192 + }, + { + "epoch": 0.48, + "grad_norm": 1.4265451925602446, + "learning_rate": 1.1248537081352504e-05, + "loss": 0.8382, + "step": 3193 + }, + { + "epoch": 0.48, + "grad_norm": 1.3397813537793721, + "learning_rate": 1.1243741535403576e-05, + "loss": 0.8338, + "step": 3194 + }, + { + "epoch": 0.48, + "grad_norm": 1.4797015082966112, + "learning_rate": 1.1238945698916504e-05, + "loss": 0.8629, + "step": 3195 + }, + { + "epoch": 0.48, + "grad_norm": 1.50219390288469, + "learning_rate": 1.1234149573011592e-05, + "loss": 0.7937, + "step": 3196 + }, + { + "epoch": 0.48, + "grad_norm": 1.5411575793503773, + "learning_rate": 1.1229353158809216e-05, + "loss": 0.8635, + "step": 3197 + }, + { + "epoch": 0.48, + "grad_norm": 1.5902635438371475, + "learning_rate": 1.1224556457429818e-05, + "loss": 0.9358, + "step": 3198 + }, + { + "epoch": 0.48, + "grad_norm": 1.3068818566745855, + "learning_rate": 1.1219759469993914e-05, + "loss": 0.8411, + "step": 3199 + }, + { + "epoch": 0.48, + "grad_norm": 1.547690589548854, + "learning_rate": 1.1214962197622075e-05, + "loss": 0.8158, + "step": 3200 + }, + { + "epoch": 0.48, + "grad_norm": 1.5045698021077403, + "learning_rate": 1.1210164641434942e-05, + "loss": 0.8607, + "step": 3201 + }, + { + "epoch": 0.48, + "grad_norm": 1.5073812601846175, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.8351, + "step": 3202 + }, + { + "epoch": 0.48, + "grad_norm": 1.3994553853367406, + "learning_rate": 1.1200568682097716e-05, + "loss": 0.9068, + "step": 3203 + }, + { + "epoch": 0.48, + "grad_norm": 1.5101653762853424, + "learning_rate": 1.1195770281189236e-05, + "loss": 0.8502, + "step": 3204 + }, + { + "epoch": 0.48, + "grad_norm": 1.4947101342885902, + "learning_rate": 1.11909716009487e-05, + "loss": 0.9071, + "step": 3205 + }, + { + "epoch": 0.48, + "grad_norm": 1.3762661268457008, + "learning_rate": 1.1186172642497077e-05, + "loss": 0.8973, + "step": 3206 + }, + { + "epoch": 0.48, + "grad_norm": 1.4062281812193775, + "learning_rate": 1.118137340695541e-05, + "loss": 0.9108, + "step": 3207 + }, + { + "epoch": 0.48, + "grad_norm": 1.480754004502827, + "learning_rate": 1.1176573895444794e-05, + "loss": 0.866, + "step": 3208 + }, + { + "epoch": 0.48, + "grad_norm": 1.6581946171698438, + "learning_rate": 1.1171774109086401e-05, + "loss": 0.8574, + "step": 3209 + }, + { + "epoch": 0.48, + "grad_norm": 1.439093803734207, + "learning_rate": 1.1166974049001458e-05, + "loss": 0.8622, + "step": 3210 + }, + { + "epoch": 0.48, + "grad_norm": 1.5948127961539507, + "learning_rate": 1.116217371631126e-05, + "loss": 0.8294, + "step": 3211 + }, + { + "epoch": 0.48, + "grad_norm": 1.4182513870895743, + "learning_rate": 1.1157373112137171e-05, + "loss": 0.9321, + "step": 3212 + }, + { + "epoch": 0.48, + "grad_norm": 1.4575381394417286, + "learning_rate": 1.1152572237600603e-05, + "loss": 0.8344, + "step": 3213 + }, + { + "epoch": 0.48, + "grad_norm": 1.4660880418034354, + "learning_rate": 1.1147771093823045e-05, + "loss": 0.8666, + "step": 3214 + }, + { + "epoch": 0.48, + "grad_norm": 1.5638955797953278, + "learning_rate": 1.1142969681926048e-05, + "loss": 0.8724, + "step": 3215 + }, + { + "epoch": 0.48, + "grad_norm": 1.3846935527861206, + "learning_rate": 1.113816800303122e-05, + "loss": 0.8614, + "step": 3216 + }, + { + "epoch": 0.48, + "grad_norm": 1.70949218780247, + "learning_rate": 1.1133366058260232e-05, + "loss": 0.8525, + "step": 3217 + }, + { + "epoch": 0.48, + "grad_norm": 1.4560236323093925, + "learning_rate": 1.1128563848734817e-05, + "loss": 0.8842, + "step": 3218 + }, + { + "epoch": 0.48, + "grad_norm": 1.5451365163840847, + "learning_rate": 1.1123761375576779e-05, + "loss": 0.9024, + "step": 3219 + }, + { + "epoch": 0.48, + "grad_norm": 1.3867848716338, + "learning_rate": 1.1118958639907969e-05, + "loss": 0.8899, + "step": 3220 + }, + { + "epoch": 0.48, + "grad_norm": 1.4996118614571643, + "learning_rate": 1.1114155642850308e-05, + "loss": 0.8742, + "step": 3221 + }, + { + "epoch": 0.48, + "grad_norm": 0.9349425689585481, + "learning_rate": 1.1109352385525782e-05, + "loss": 0.3365, + "step": 3222 + }, + { + "epoch": 0.48, + "grad_norm": 1.4489256544447984, + "learning_rate": 1.1104548869056424e-05, + "loss": 0.9305, + "step": 3223 + }, + { + "epoch": 0.48, + "grad_norm": 1.413013319958576, + "learning_rate": 1.1099745094564342e-05, + "loss": 0.878, + "step": 3224 + }, + { + "epoch": 0.48, + "grad_norm": 1.5069643993095365, + "learning_rate": 1.1094941063171699e-05, + "loss": 0.8433, + "step": 3225 + }, + { + "epoch": 0.48, + "grad_norm": 1.336620279026069, + "learning_rate": 1.1090136776000711e-05, + "loss": 0.8056, + "step": 3226 + }, + { + "epoch": 0.48, + "grad_norm": 1.395681893587308, + "learning_rate": 1.1085332234173664e-05, + "loss": 0.8558, + "step": 3227 + }, + { + "epoch": 0.48, + "grad_norm": 1.6747714648274186, + "learning_rate": 1.10805274388129e-05, + "loss": 0.9456, + "step": 3228 + }, + { + "epoch": 0.48, + "grad_norm": 0.9726801164602124, + "learning_rate": 1.1075722391040817e-05, + "loss": 0.353, + "step": 3229 + }, + { + "epoch": 0.48, + "grad_norm": 1.5198087610464737, + "learning_rate": 1.1070917091979878e-05, + "loss": 0.8582, + "step": 3230 + }, + { + "epoch": 0.48, + "grad_norm": 1.5968587500814786, + "learning_rate": 1.10661115427526e-05, + "loss": 0.8366, + "step": 3231 + }, + { + "epoch": 0.48, + "grad_norm": 1.5396198773811056, + "learning_rate": 1.106130574448156e-05, + "loss": 0.8698, + "step": 3232 + }, + { + "epoch": 0.48, + "grad_norm": 1.4506663395572341, + "learning_rate": 1.1056499698289392e-05, + "loss": 0.8829, + "step": 3233 + }, + { + "epoch": 0.48, + "grad_norm": 1.2969783718780825, + "learning_rate": 1.1051693405298788e-05, + "loss": 0.8799, + "step": 3234 + }, + { + "epoch": 0.48, + "grad_norm": 1.3892835532090662, + "learning_rate": 1.1046886866632498e-05, + "loss": 0.8323, + "step": 3235 + }, + { + "epoch": 0.48, + "grad_norm": 0.8891087370013573, + "learning_rate": 1.1042080083413336e-05, + "loss": 0.3364, + "step": 3236 + }, + { + "epoch": 0.48, + "grad_norm": 1.3153465403309512, + "learning_rate": 1.1037273056764157e-05, + "loss": 0.8142, + "step": 3237 + }, + { + "epoch": 0.48, + "grad_norm": 1.3507943271888676, + "learning_rate": 1.1032465787807893e-05, + "loss": 0.8866, + "step": 3238 + }, + { + "epoch": 0.48, + "grad_norm": 1.4397305437355834, + "learning_rate": 1.1027658277667518e-05, + "loss": 0.796, + "step": 3239 + }, + { + "epoch": 0.48, + "grad_norm": 1.5110699434330976, + "learning_rate": 1.1022850527466065e-05, + "loss": 0.9167, + "step": 3240 + }, + { + "epoch": 0.48, + "grad_norm": 1.7748015321334307, + "learning_rate": 1.101804253832663e-05, + "loss": 0.8163, + "step": 3241 + }, + { + "epoch": 0.48, + "grad_norm": 1.5136345243460296, + "learning_rate": 1.1013234311372353e-05, + "loss": 0.8523, + "step": 3242 + }, + { + "epoch": 0.48, + "grad_norm": 1.5896348734022807, + "learning_rate": 1.100842584772644e-05, + "loss": 0.887, + "step": 3243 + }, + { + "epoch": 0.48, + "grad_norm": 1.4423979175373318, + "learning_rate": 1.1003617148512149e-05, + "loss": 0.8395, + "step": 3244 + }, + { + "epoch": 0.48, + "grad_norm": 1.5055996352937633, + "learning_rate": 1.0998808214852796e-05, + "loss": 0.863, + "step": 3245 + }, + { + "epoch": 0.48, + "grad_norm": 1.2719839333530392, + "learning_rate": 1.099399904787174e-05, + "loss": 0.846, + "step": 3246 + }, + { + "epoch": 0.48, + "grad_norm": 1.522533795495694, + "learning_rate": 1.0989189648692408e-05, + "loss": 0.8546, + "step": 3247 + }, + { + "epoch": 0.48, + "grad_norm": 1.3238194392468925, + "learning_rate": 1.0984380018438279e-05, + "loss": 0.8768, + "step": 3248 + }, + { + "epoch": 0.48, + "grad_norm": 1.5209300741743716, + "learning_rate": 1.0979570158232875e-05, + "loss": 0.7585, + "step": 3249 + }, + { + "epoch": 0.48, + "grad_norm": 1.7556471192466117, + "learning_rate": 1.0974760069199786e-05, + "loss": 0.8433, + "step": 3250 + }, + { + "epoch": 0.49, + "grad_norm": 1.3946055598188356, + "learning_rate": 1.096994975246265e-05, + "loss": 0.8441, + "step": 3251 + }, + { + "epoch": 0.49, + "grad_norm": 1.918498783720375, + "learning_rate": 1.0965139209145153e-05, + "loss": 0.8632, + "step": 3252 + }, + { + "epoch": 0.49, + "grad_norm": 1.4030675786573157, + "learning_rate": 1.0960328440371039e-05, + "loss": 0.8807, + "step": 3253 + }, + { + "epoch": 0.49, + "grad_norm": 1.400948138170017, + "learning_rate": 1.095551744726411e-05, + "loss": 0.8308, + "step": 3254 + }, + { + "epoch": 0.49, + "grad_norm": 1.517048585648085, + "learning_rate": 1.0950706230948207e-05, + "loss": 0.861, + "step": 3255 + }, + { + "epoch": 0.49, + "grad_norm": 1.3050948602070962, + "learning_rate": 1.0945894792547234e-05, + "loss": 0.836, + "step": 3256 + }, + { + "epoch": 0.49, + "grad_norm": 1.488506457793547, + "learning_rate": 1.0941083133185146e-05, + "loss": 0.8529, + "step": 3257 + }, + { + "epoch": 0.49, + "grad_norm": 1.433369094022262, + "learning_rate": 1.0936271253985941e-05, + "loss": 0.8661, + "step": 3258 + }, + { + "epoch": 0.49, + "grad_norm": 1.3875842053297514, + "learning_rate": 1.0931459156073679e-05, + "loss": 0.9041, + "step": 3259 + }, + { + "epoch": 0.49, + "grad_norm": 1.412282257128803, + "learning_rate": 1.0926646840572463e-05, + "loss": 0.8359, + "step": 3260 + }, + { + "epoch": 0.49, + "grad_norm": 1.4542792476207704, + "learning_rate": 1.0921834308606458e-05, + "loss": 0.9112, + "step": 3261 + }, + { + "epoch": 0.49, + "grad_norm": 1.500897298716677, + "learning_rate": 1.0917021561299864e-05, + "loss": 0.9332, + "step": 3262 + }, + { + "epoch": 0.49, + "grad_norm": 1.4275058486473355, + "learning_rate": 1.0912208599776939e-05, + "loss": 0.8995, + "step": 3263 + }, + { + "epoch": 0.49, + "grad_norm": 1.4211037254822234, + "learning_rate": 1.0907395425161999e-05, + "loss": 0.9139, + "step": 3264 + }, + { + "epoch": 0.49, + "grad_norm": 1.4050995213436084, + "learning_rate": 1.0902582038579395e-05, + "loss": 0.8354, + "step": 3265 + }, + { + "epoch": 0.49, + "grad_norm": 1.4974475479870746, + "learning_rate": 1.0897768441153536e-05, + "loss": 0.8153, + "step": 3266 + }, + { + "epoch": 0.49, + "grad_norm": 1.412041135513596, + "learning_rate": 1.089295463400888e-05, + "loss": 0.8894, + "step": 3267 + }, + { + "epoch": 0.49, + "grad_norm": 1.491443608387105, + "learning_rate": 1.0888140618269934e-05, + "loss": 0.8433, + "step": 3268 + }, + { + "epoch": 0.49, + "grad_norm": 1.4527504897243158, + "learning_rate": 1.088332639506125e-05, + "loss": 0.8622, + "step": 3269 + }, + { + "epoch": 0.49, + "grad_norm": 1.4525848667924097, + "learning_rate": 1.0878511965507435e-05, + "loss": 0.8803, + "step": 3270 + }, + { + "epoch": 0.49, + "grad_norm": 1.369125952485571, + "learning_rate": 1.0873697330733132e-05, + "loss": 0.9162, + "step": 3271 + }, + { + "epoch": 0.49, + "grad_norm": 1.2961907707447895, + "learning_rate": 1.0868882491863048e-05, + "loss": 0.8074, + "step": 3272 + }, + { + "epoch": 0.49, + "grad_norm": 0.9195850696265195, + "learning_rate": 1.0864067450021926e-05, + "loss": 0.3642, + "step": 3273 + }, + { + "epoch": 0.49, + "grad_norm": 1.3388902528641635, + "learning_rate": 1.0859252206334568e-05, + "loss": 0.7924, + "step": 3274 + }, + { + "epoch": 0.49, + "grad_norm": 1.5429729540079526, + "learning_rate": 1.0854436761925802e-05, + "loss": 0.8295, + "step": 3275 + }, + { + "epoch": 0.49, + "grad_norm": 1.3513487622046556, + "learning_rate": 1.0849621117920526e-05, + "loss": 0.8604, + "step": 3276 + }, + { + "epoch": 0.49, + "grad_norm": 1.3383886231852877, + "learning_rate": 1.0844805275443673e-05, + "loss": 0.8704, + "step": 3277 + }, + { + "epoch": 0.49, + "grad_norm": 1.5683372396234836, + "learning_rate": 1.083998923562022e-05, + "loss": 0.7999, + "step": 3278 + }, + { + "epoch": 0.49, + "grad_norm": 1.4120720743658346, + "learning_rate": 1.0835172999575201e-05, + "loss": 0.8071, + "step": 3279 + }, + { + "epoch": 0.49, + "grad_norm": 1.4219719407026294, + "learning_rate": 1.0830356568433686e-05, + "loss": 0.905, + "step": 3280 + }, + { + "epoch": 0.49, + "grad_norm": 1.550804339796694, + "learning_rate": 1.0825539943320793e-05, + "loss": 0.9136, + "step": 3281 + }, + { + "epoch": 0.49, + "grad_norm": 1.6178041494130784, + "learning_rate": 1.0820723125361685e-05, + "loss": 0.9173, + "step": 3282 + }, + { + "epoch": 0.49, + "grad_norm": 1.3657765792113936, + "learning_rate": 1.0815906115681579e-05, + "loss": 0.8622, + "step": 3283 + }, + { + "epoch": 0.49, + "grad_norm": 1.4167990490444722, + "learning_rate": 1.0811088915405717e-05, + "loss": 0.8967, + "step": 3284 + }, + { + "epoch": 0.49, + "grad_norm": 1.3804517532817808, + "learning_rate": 1.0806271525659403e-05, + "loss": 0.7558, + "step": 3285 + }, + { + "epoch": 0.49, + "grad_norm": 1.4673309972777784, + "learning_rate": 1.0801453947567985e-05, + "loss": 0.7906, + "step": 3286 + }, + { + "epoch": 0.49, + "grad_norm": 1.4064551064799597, + "learning_rate": 1.0796636182256846e-05, + "loss": 0.8757, + "step": 3287 + }, + { + "epoch": 0.49, + "grad_norm": 1.4860978205002118, + "learning_rate": 1.079181823085141e-05, + "loss": 0.833, + "step": 3288 + }, + { + "epoch": 0.49, + "grad_norm": 1.6913933863602182, + "learning_rate": 1.0787000094477157e-05, + "loss": 0.9043, + "step": 3289 + }, + { + "epoch": 0.49, + "grad_norm": 1.475357657168501, + "learning_rate": 1.0782181774259608e-05, + "loss": 0.8384, + "step": 3290 + }, + { + "epoch": 0.49, + "grad_norm": 1.5505328183602032, + "learning_rate": 1.0777363271324318e-05, + "loss": 0.8601, + "step": 3291 + }, + { + "epoch": 0.49, + "grad_norm": 1.5008862852349154, + "learning_rate": 1.077254458679689e-05, + "loss": 0.8765, + "step": 3292 + }, + { + "epoch": 0.49, + "grad_norm": 1.3576032682014516, + "learning_rate": 1.0767725721802967e-05, + "loss": 0.8405, + "step": 3293 + }, + { + "epoch": 0.49, + "grad_norm": 1.4799977373596405, + "learning_rate": 1.0762906677468238e-05, + "loss": 0.8246, + "step": 3294 + }, + { + "epoch": 0.49, + "grad_norm": 1.261867349099804, + "learning_rate": 1.0758087454918437e-05, + "loss": 0.8698, + "step": 3295 + }, + { + "epoch": 0.49, + "grad_norm": 1.4721664472986458, + "learning_rate": 1.0753268055279328e-05, + "loss": 0.9105, + "step": 3296 + }, + { + "epoch": 0.49, + "grad_norm": 1.405084957210123, + "learning_rate": 1.074844847967673e-05, + "loss": 0.8782, + "step": 3297 + }, + { + "epoch": 0.49, + "grad_norm": 1.5766235505618889, + "learning_rate": 1.0743628729236488e-05, + "loss": 0.8583, + "step": 3298 + }, + { + "epoch": 0.49, + "grad_norm": 1.4574825780583431, + "learning_rate": 1.0738808805084503e-05, + "loss": 0.8743, + "step": 3299 + }, + { + "epoch": 0.49, + "grad_norm": 1.4879152606397799, + "learning_rate": 1.0733988708346708e-05, + "loss": 0.8943, + "step": 3300 + }, + { + "epoch": 0.49, + "grad_norm": 1.6945035499755132, + "learning_rate": 1.0729168440149077e-05, + "loss": 0.8587, + "step": 3301 + }, + { + "epoch": 0.49, + "grad_norm": 1.7116493873523388, + "learning_rate": 1.0724348001617626e-05, + "loss": 0.8792, + "step": 3302 + }, + { + "epoch": 0.49, + "grad_norm": 1.4195991664935301, + "learning_rate": 1.071952739387841e-05, + "loss": 0.8005, + "step": 3303 + }, + { + "epoch": 0.49, + "grad_norm": 1.6186570502625592, + "learning_rate": 1.0714706618057521e-05, + "loss": 0.8612, + "step": 3304 + }, + { + "epoch": 0.49, + "grad_norm": 1.5725985512190277, + "learning_rate": 1.0709885675281096e-05, + "loss": 0.8169, + "step": 3305 + }, + { + "epoch": 0.49, + "grad_norm": 1.4697268875730034, + "learning_rate": 1.070506456667531e-05, + "loss": 0.7828, + "step": 3306 + }, + { + "epoch": 0.49, + "grad_norm": 1.3733405623928798, + "learning_rate": 1.0700243293366365e-05, + "loss": 0.7682, + "step": 3307 + }, + { + "epoch": 0.49, + "grad_norm": 1.385707077861223, + "learning_rate": 1.0695421856480519e-05, + "loss": 0.8699, + "step": 3308 + }, + { + "epoch": 0.49, + "grad_norm": 1.3671698432872128, + "learning_rate": 1.0690600257144062e-05, + "loss": 0.8566, + "step": 3309 + }, + { + "epoch": 0.49, + "grad_norm": 1.2562807853328557, + "learning_rate": 1.0685778496483312e-05, + "loss": 0.7756, + "step": 3310 + }, + { + "epoch": 0.49, + "grad_norm": 1.432256248787592, + "learning_rate": 1.0680956575624637e-05, + "loss": 0.8291, + "step": 3311 + }, + { + "epoch": 0.49, + "grad_norm": 1.6006305204260534, + "learning_rate": 1.0676134495694439e-05, + "loss": 0.8341, + "step": 3312 + }, + { + "epoch": 0.49, + "grad_norm": 1.4602116538548533, + "learning_rate": 1.0671312257819155e-05, + "loss": 0.7872, + "step": 3313 + }, + { + "epoch": 0.49, + "grad_norm": 1.4536833933617224, + "learning_rate": 1.066648986312526e-05, + "loss": 0.8948, + "step": 3314 + }, + { + "epoch": 0.49, + "grad_norm": 1.4740122084886806, + "learning_rate": 1.066166731273927e-05, + "loss": 0.8412, + "step": 3315 + }, + { + "epoch": 0.49, + "grad_norm": 1.7936136752172585, + "learning_rate": 1.0656844607787727e-05, + "loss": 0.892, + "step": 3316 + }, + { + "epoch": 0.49, + "grad_norm": 1.2913633749641416, + "learning_rate": 1.0652021749397216e-05, + "loss": 0.8016, + "step": 3317 + }, + { + "epoch": 0.5, + "grad_norm": 1.524952638171077, + "learning_rate": 1.0647198738694362e-05, + "loss": 0.859, + "step": 3318 + }, + { + "epoch": 0.5, + "grad_norm": 1.4305612156065701, + "learning_rate": 1.0642375576805822e-05, + "loss": 0.8886, + "step": 3319 + }, + { + "epoch": 0.5, + "grad_norm": 0.8176563368147927, + "learning_rate": 1.0637552264858278e-05, + "loss": 0.3428, + "step": 3320 + }, + { + "epoch": 0.5, + "grad_norm": 1.5608220591829975, + "learning_rate": 1.063272880397846e-05, + "loss": 0.8365, + "step": 3321 + }, + { + "epoch": 0.5, + "grad_norm": 1.4073317665150227, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.9181, + "step": 3322 + }, + { + "epoch": 0.5, + "grad_norm": 1.3750375808922841, + "learning_rate": 1.0623081439929092e-05, + "loss": 0.8391, + "step": 3323 + }, + { + "epoch": 0.5, + "grad_norm": 1.3502911948056306, + "learning_rate": 1.0618257539013162e-05, + "loss": 0.8246, + "step": 3324 + }, + { + "epoch": 0.5, + "grad_norm": 1.4364394598401726, + "learning_rate": 1.0613433493672212e-05, + "loss": 0.8636, + "step": 3325 + }, + { + "epoch": 0.5, + "grad_norm": 1.4119382077863705, + "learning_rate": 1.0608609305033132e-05, + "loss": 0.8503, + "step": 3326 + }, + { + "epoch": 0.5, + "grad_norm": 1.4285975402761721, + "learning_rate": 1.0603784974222862e-05, + "loss": 0.7625, + "step": 3327 + }, + { + "epoch": 0.5, + "grad_norm": 1.4806941925151347, + "learning_rate": 1.059896050236836e-05, + "loss": 0.8629, + "step": 3328 + }, + { + "epoch": 0.5, + "grad_norm": 1.5974230351250054, + "learning_rate": 1.0594135890596626e-05, + "loss": 0.8308, + "step": 3329 + }, + { + "epoch": 0.5, + "grad_norm": 1.4199316360264063, + "learning_rate": 1.0589311140034687e-05, + "loss": 0.8772, + "step": 3330 + }, + { + "epoch": 0.5, + "grad_norm": 0.9374677279997454, + "learning_rate": 1.0584486251809607e-05, + "loss": 0.3231, + "step": 3331 + }, + { + "epoch": 0.5, + "grad_norm": 1.4092125416267967, + "learning_rate": 1.0579661227048484e-05, + "loss": 0.8906, + "step": 3332 + }, + { + "epoch": 0.5, + "grad_norm": 1.3912957052775976, + "learning_rate": 1.0574836066878436e-05, + "loss": 0.7822, + "step": 3333 + }, + { + "epoch": 0.5, + "grad_norm": 1.5214334566315058, + "learning_rate": 1.0570010772426627e-05, + "loss": 0.7969, + "step": 3334 + }, + { + "epoch": 0.5, + "grad_norm": 1.3584620174993083, + "learning_rate": 1.0565185344820248e-05, + "loss": 0.7973, + "step": 3335 + }, + { + "epoch": 0.5, + "grad_norm": 1.4008528666105484, + "learning_rate": 1.056035978518651e-05, + "loss": 0.7914, + "step": 3336 + }, + { + "epoch": 0.5, + "grad_norm": 1.5890886958906585, + "learning_rate": 1.0555534094652675e-05, + "loss": 0.8714, + "step": 3337 + }, + { + "epoch": 0.5, + "grad_norm": 1.4497686240042948, + "learning_rate": 1.055070827434602e-05, + "loss": 0.901, + "step": 3338 + }, + { + "epoch": 0.5, + "grad_norm": 1.4720263835329306, + "learning_rate": 1.0545882325393855e-05, + "loss": 0.7266, + "step": 3339 + }, + { + "epoch": 0.5, + "grad_norm": 1.5557475453766123, + "learning_rate": 1.0541056248923525e-05, + "loss": 0.8511, + "step": 3340 + }, + { + "epoch": 0.5, + "grad_norm": 0.8157541800058901, + "learning_rate": 1.0536230046062403e-05, + "loss": 0.3379, + "step": 3341 + }, + { + "epoch": 0.5, + "grad_norm": 1.4507454852829846, + "learning_rate": 1.0531403717937888e-05, + "loss": 0.9001, + "step": 3342 + }, + { + "epoch": 0.5, + "grad_norm": 1.5582387969360811, + "learning_rate": 1.052657726567741e-05, + "loss": 0.9101, + "step": 3343 + }, + { + "epoch": 0.5, + "grad_norm": 1.4455404887934284, + "learning_rate": 1.0521750690408434e-05, + "loss": 0.8998, + "step": 3344 + }, + { + "epoch": 0.5, + "grad_norm": 1.4149830177218141, + "learning_rate": 1.0516923993258441e-05, + "loss": 0.8047, + "step": 3345 + }, + { + "epoch": 0.5, + "grad_norm": 1.5139444481096482, + "learning_rate": 1.0512097175354952e-05, + "loss": 0.8879, + "step": 3346 + }, + { + "epoch": 0.5, + "grad_norm": 1.564061075251842, + "learning_rate": 1.0507270237825513e-05, + "loss": 0.8589, + "step": 3347 + }, + { + "epoch": 0.5, + "grad_norm": 1.600578731104028, + "learning_rate": 1.0502443181797696e-05, + "loss": 0.8265, + "step": 3348 + }, + { + "epoch": 0.5, + "grad_norm": 1.529991932229045, + "learning_rate": 1.04976160083991e-05, + "loss": 0.8825, + "step": 3349 + }, + { + "epoch": 0.5, + "grad_norm": 1.4257194717949433, + "learning_rate": 1.0492788718757356e-05, + "loss": 0.859, + "step": 3350 + }, + { + "epoch": 0.5, + "grad_norm": 1.5481766813570168, + "learning_rate": 1.0487961314000121e-05, + "loss": 0.9265, + "step": 3351 + }, + { + "epoch": 0.5, + "grad_norm": 0.7954439866732235, + "learning_rate": 1.0483133795255072e-05, + "loss": 0.3158, + "step": 3352 + }, + { + "epoch": 0.5, + "grad_norm": 1.4419756671776065, + "learning_rate": 1.0478306163649919e-05, + "loss": 0.8269, + "step": 3353 + }, + { + "epoch": 0.5, + "grad_norm": 1.5925319577454673, + "learning_rate": 1.0473478420312403e-05, + "loss": 0.8426, + "step": 3354 + }, + { + "epoch": 0.5, + "grad_norm": 1.454752769834404, + "learning_rate": 1.0468650566370276e-05, + "loss": 0.8296, + "step": 3355 + }, + { + "epoch": 0.5, + "grad_norm": 1.3799821564103434, + "learning_rate": 1.0463822602951332e-05, + "loss": 0.8385, + "step": 3356 + }, + { + "epoch": 0.5, + "grad_norm": 1.2911316044777288, + "learning_rate": 1.045899453118338e-05, + "loss": 0.9213, + "step": 3357 + }, + { + "epoch": 0.5, + "grad_norm": 1.5856373401875128, + "learning_rate": 1.045416635219426e-05, + "loss": 0.8897, + "step": 3358 + }, + { + "epoch": 0.5, + "grad_norm": 1.4944949851843312, + "learning_rate": 1.0449338067111839e-05, + "loss": 0.8466, + "step": 3359 + }, + { + "epoch": 0.5, + "grad_norm": 1.4438510052329614, + "learning_rate": 1.0444509677063997e-05, + "loss": 0.8537, + "step": 3360 + }, + { + "epoch": 0.5, + "grad_norm": 1.4253224575315486, + "learning_rate": 1.043968118317865e-05, + "loss": 0.8797, + "step": 3361 + }, + { + "epoch": 0.5, + "grad_norm": 1.4237900762876985, + "learning_rate": 1.0434852586583737e-05, + "loss": 0.9016, + "step": 3362 + }, + { + "epoch": 0.5, + "grad_norm": 1.385505287050664, + "learning_rate": 1.0430023888407215e-05, + "loss": 0.8269, + "step": 3363 + }, + { + "epoch": 0.5, + "grad_norm": 1.3824235044374373, + "learning_rate": 1.0425195089777072e-05, + "loss": 0.8486, + "step": 3364 + }, + { + "epoch": 0.5, + "grad_norm": 1.3363181946515676, + "learning_rate": 1.042036619182131e-05, + "loss": 0.8656, + "step": 3365 + }, + { + "epoch": 0.5, + "grad_norm": 1.3769306059274253, + "learning_rate": 1.0415537195667963e-05, + "loss": 0.7979, + "step": 3366 + }, + { + "epoch": 0.5, + "grad_norm": 1.3437704057724318, + "learning_rate": 1.0410708102445091e-05, + "loss": 0.8795, + "step": 3367 + }, + { + "epoch": 0.5, + "grad_norm": 1.5801029138542073, + "learning_rate": 1.0405878913280762e-05, + "loss": 0.8885, + "step": 3368 + }, + { + "epoch": 0.5, + "grad_norm": 1.4037382211046627, + "learning_rate": 1.0401049629303077e-05, + "loss": 0.8228, + "step": 3369 + }, + { + "epoch": 0.5, + "grad_norm": 1.3928843358190088, + "learning_rate": 1.0396220251640163e-05, + "loss": 0.8134, + "step": 3370 + }, + { + "epoch": 0.5, + "grad_norm": 1.4154174106697666, + "learning_rate": 1.0391390781420157e-05, + "loss": 0.8066, + "step": 3371 + }, + { + "epoch": 0.5, + "grad_norm": 0.9039117454388134, + "learning_rate": 1.0386561219771222e-05, + "loss": 0.3323, + "step": 3372 + }, + { + "epoch": 0.5, + "grad_norm": 1.6176655472818309, + "learning_rate": 1.038173156782155e-05, + "loss": 0.8305, + "step": 3373 + }, + { + "epoch": 0.5, + "grad_norm": 1.572117586452326, + "learning_rate": 1.0376901826699349e-05, + "loss": 0.9186, + "step": 3374 + }, + { + "epoch": 0.5, + "grad_norm": 1.4538043418438247, + "learning_rate": 1.037207199753284e-05, + "loss": 0.8045, + "step": 3375 + }, + { + "epoch": 0.5, + "grad_norm": 1.2011663359227436, + "learning_rate": 1.0367242081450274e-05, + "loss": 0.7749, + "step": 3376 + }, + { + "epoch": 0.5, + "grad_norm": 1.67280307416685, + "learning_rate": 1.0362412079579925e-05, + "loss": 0.933, + "step": 3377 + }, + { + "epoch": 0.5, + "grad_norm": 1.5232488876128898, + "learning_rate": 1.0357581993050076e-05, + "loss": 0.8128, + "step": 3378 + }, + { + "epoch": 0.5, + "grad_norm": 1.4373897206983042, + "learning_rate": 1.0352751822989037e-05, + "loss": 0.8645, + "step": 3379 + }, + { + "epoch": 0.5, + "grad_norm": 1.7123411655510947, + "learning_rate": 1.0347921570525139e-05, + "loss": 0.8188, + "step": 3380 + }, + { + "epoch": 0.5, + "grad_norm": 1.5148923647970316, + "learning_rate": 1.0343091236786727e-05, + "loss": 0.8678, + "step": 3381 + }, + { + "epoch": 0.5, + "grad_norm": 1.5304738542180532, + "learning_rate": 1.0338260822902166e-05, + "loss": 0.8519, + "step": 3382 + }, + { + "epoch": 0.5, + "grad_norm": 1.2966406686620444, + "learning_rate": 1.0333430329999847e-05, + "loss": 0.8247, + "step": 3383 + }, + { + "epoch": 0.5, + "grad_norm": 1.4055012166427823, + "learning_rate": 1.0328599759208167e-05, + "loss": 0.9245, + "step": 3384 + }, + { + "epoch": 0.51, + "grad_norm": 1.3812381044259643, + "learning_rate": 1.0323769111655549e-05, + "loss": 0.906, + "step": 3385 + }, + { + "epoch": 0.51, + "grad_norm": 1.5038095405992304, + "learning_rate": 1.0318938388470439e-05, + "loss": 0.8307, + "step": 3386 + }, + { + "epoch": 0.51, + "grad_norm": 0.7944063065900513, + "learning_rate": 1.0314107590781284e-05, + "loss": 0.3517, + "step": 3387 + }, + { + "epoch": 0.51, + "grad_norm": 1.6270276999354705, + "learning_rate": 1.0309276719716567e-05, + "loss": 0.8457, + "step": 3388 + }, + { + "epoch": 0.51, + "grad_norm": 1.5765300754959457, + "learning_rate": 1.0304445776404778e-05, + "loss": 0.8124, + "step": 3389 + }, + { + "epoch": 0.51, + "grad_norm": 1.3348605223522607, + "learning_rate": 1.0299614761974426e-05, + "loss": 0.9029, + "step": 3390 + }, + { + "epoch": 0.51, + "grad_norm": 1.6366106990026816, + "learning_rate": 1.0294783677554035e-05, + "loss": 0.8696, + "step": 3391 + }, + { + "epoch": 0.51, + "grad_norm": 1.4577689502156046, + "learning_rate": 1.0289952524272147e-05, + "loss": 0.8429, + "step": 3392 + }, + { + "epoch": 0.51, + "grad_norm": 1.7120799633270154, + "learning_rate": 1.0285121303257321e-05, + "loss": 0.8248, + "step": 3393 + }, + { + "epoch": 0.51, + "grad_norm": 0.9089973986061121, + "learning_rate": 1.0280290015638129e-05, + "loss": 0.3627, + "step": 3394 + }, + { + "epoch": 0.51, + "grad_norm": 1.3561757233821814, + "learning_rate": 1.0275458662543161e-05, + "loss": 0.8559, + "step": 3395 + }, + { + "epoch": 0.51, + "grad_norm": 1.5449940372835376, + "learning_rate": 1.0270627245101026e-05, + "loss": 0.8215, + "step": 3396 + }, + { + "epoch": 0.51, + "grad_norm": 1.2878587383584195, + "learning_rate": 1.0265795764440335e-05, + "loss": 0.8348, + "step": 3397 + }, + { + "epoch": 0.51, + "grad_norm": 1.5177199102321068, + "learning_rate": 1.0260964221689729e-05, + "loss": 0.9529, + "step": 3398 + }, + { + "epoch": 0.51, + "grad_norm": 1.563450037500708, + "learning_rate": 1.0256132617977856e-05, + "loss": 0.8202, + "step": 3399 + }, + { + "epoch": 0.51, + "grad_norm": 1.422944037103568, + "learning_rate": 1.0251300954433377e-05, + "loss": 0.929, + "step": 3400 + }, + { + "epoch": 0.51, + "grad_norm": 1.3383386822306005, + "learning_rate": 1.0246469232184968e-05, + "loss": 0.8325, + "step": 3401 + }, + { + "epoch": 0.51, + "grad_norm": 1.5252639056039872, + "learning_rate": 1.0241637452361323e-05, + "loss": 0.8408, + "step": 3402 + }, + { + "epoch": 0.51, + "grad_norm": 1.2936133135124792, + "learning_rate": 1.0236805616091148e-05, + "loss": 0.7926, + "step": 3403 + }, + { + "epoch": 0.51, + "grad_norm": 0.823261602405843, + "learning_rate": 1.0231973724503152e-05, + "loss": 0.3524, + "step": 3404 + }, + { + "epoch": 0.51, + "grad_norm": 1.6340610325664995, + "learning_rate": 1.0227141778726075e-05, + "loss": 0.8217, + "step": 3405 + }, + { + "epoch": 0.51, + "grad_norm": 0.8259746336356296, + "learning_rate": 1.0222309779888656e-05, + "loss": 0.3607, + "step": 3406 + }, + { + "epoch": 0.51, + "grad_norm": 1.4087914601565654, + "learning_rate": 1.0217477729119648e-05, + "loss": 0.8592, + "step": 3407 + }, + { + "epoch": 0.51, + "grad_norm": 1.4812567583968104, + "learning_rate": 1.0212645627547821e-05, + "loss": 0.8718, + "step": 3408 + }, + { + "epoch": 0.51, + "grad_norm": 1.0356564590795982, + "learning_rate": 1.0207813476301955e-05, + "loss": 0.3653, + "step": 3409 + }, + { + "epoch": 0.51, + "grad_norm": 1.2516975238618395, + "learning_rate": 1.0202981276510841e-05, + "loss": 0.8742, + "step": 3410 + }, + { + "epoch": 0.51, + "grad_norm": 1.3432517326923943, + "learning_rate": 1.019814902930328e-05, + "loss": 0.8582, + "step": 3411 + }, + { + "epoch": 0.51, + "grad_norm": 1.4166719592716657, + "learning_rate": 1.0193316735808085e-05, + "loss": 0.8335, + "step": 3412 + }, + { + "epoch": 0.51, + "grad_norm": 1.266556815487124, + "learning_rate": 1.0188484397154083e-05, + "loss": 0.8493, + "step": 3413 + }, + { + "epoch": 0.51, + "grad_norm": 1.4260501790843365, + "learning_rate": 1.0183652014470105e-05, + "loss": 0.8731, + "step": 3414 + }, + { + "epoch": 0.51, + "grad_norm": 1.469974719290576, + "learning_rate": 1.0178819588885001e-05, + "loss": 0.9264, + "step": 3415 + }, + { + "epoch": 0.51, + "grad_norm": 0.925301636828885, + "learning_rate": 1.0173987121527619e-05, + "loss": 0.327, + "step": 3416 + }, + { + "epoch": 0.51, + "grad_norm": 1.4926511046840938, + "learning_rate": 1.0169154613526831e-05, + "loss": 0.8761, + "step": 3417 + }, + { + "epoch": 0.51, + "grad_norm": 1.6290364264882893, + "learning_rate": 1.0164322066011509e-05, + "loss": 0.9029, + "step": 3418 + }, + { + "epoch": 0.51, + "grad_norm": 1.2345519225771537, + "learning_rate": 1.0159489480110536e-05, + "loss": 0.8194, + "step": 3419 + }, + { + "epoch": 0.51, + "grad_norm": 1.5341449784614702, + "learning_rate": 1.0154656856952805e-05, + "loss": 0.852, + "step": 3420 + }, + { + "epoch": 0.51, + "grad_norm": 1.5686703370107327, + "learning_rate": 1.0149824197667213e-05, + "loss": 0.7736, + "step": 3421 + }, + { + "epoch": 0.51, + "grad_norm": 1.3919099131495478, + "learning_rate": 1.0144991503382676e-05, + "loss": 0.8704, + "step": 3422 + }, + { + "epoch": 0.51, + "grad_norm": 1.342730158715274, + "learning_rate": 1.0140158775228111e-05, + "loss": 0.7477, + "step": 3423 + }, + { + "epoch": 0.51, + "grad_norm": 1.433967463664584, + "learning_rate": 1.013532601433244e-05, + "loss": 0.7749, + "step": 3424 + }, + { + "epoch": 0.51, + "grad_norm": 1.3450195129146365, + "learning_rate": 1.0130493221824598e-05, + "loss": 0.8148, + "step": 3425 + }, + { + "epoch": 0.51, + "grad_norm": 1.3576115365940458, + "learning_rate": 1.0125660398833528e-05, + "loss": 0.8098, + "step": 3426 + }, + { + "epoch": 0.51, + "grad_norm": 1.682810772803721, + "learning_rate": 1.0120827546488175e-05, + "loss": 0.8974, + "step": 3427 + }, + { + "epoch": 0.51, + "grad_norm": 1.5815080295529864, + "learning_rate": 1.0115994665917497e-05, + "loss": 0.8791, + "step": 3428 + }, + { + "epoch": 0.51, + "grad_norm": 1.3273680268796113, + "learning_rate": 1.0111161758250451e-05, + "loss": 0.833, + "step": 3429 + }, + { + "epoch": 0.51, + "grad_norm": 1.5861881485544678, + "learning_rate": 1.010632882461601e-05, + "loss": 0.9005, + "step": 3430 + }, + { + "epoch": 0.51, + "grad_norm": 1.486615505332043, + "learning_rate": 1.0101495866143143e-05, + "loss": 0.7811, + "step": 3431 + }, + { + "epoch": 0.51, + "grad_norm": 1.4187267303133957, + "learning_rate": 1.0096662883960833e-05, + "loss": 0.9513, + "step": 3432 + }, + { + "epoch": 0.51, + "grad_norm": 1.3946573350043077, + "learning_rate": 1.0091829879198061e-05, + "loss": 0.8659, + "step": 3433 + }, + { + "epoch": 0.51, + "grad_norm": 1.397473477161376, + "learning_rate": 1.0086996852983822e-05, + "loss": 0.9039, + "step": 3434 + }, + { + "epoch": 0.51, + "grad_norm": 0.9320843507599951, + "learning_rate": 1.008216380644711e-05, + "loss": 0.2924, + "step": 3435 + }, + { + "epoch": 0.51, + "grad_norm": 1.4572545460855695, + "learning_rate": 1.0077330740716922e-05, + "loss": 0.8394, + "step": 3436 + }, + { + "epoch": 0.51, + "grad_norm": 1.304388279638198, + "learning_rate": 1.0072497656922266e-05, + "loss": 0.8702, + "step": 3437 + }, + { + "epoch": 0.51, + "grad_norm": 1.3150299090643778, + "learning_rate": 1.0067664556192154e-05, + "loss": 0.8178, + "step": 3438 + }, + { + "epoch": 0.51, + "grad_norm": 1.5357642158967026, + "learning_rate": 1.0062831439655591e-05, + "loss": 0.7881, + "step": 3439 + }, + { + "epoch": 0.51, + "grad_norm": 1.588994093213062, + "learning_rate": 1.0057998308441598e-05, + "loss": 0.9227, + "step": 3440 + }, + { + "epoch": 0.51, + "grad_norm": 1.3774811739963508, + "learning_rate": 1.0053165163679196e-05, + "loss": 0.8628, + "step": 3441 + }, + { + "epoch": 0.51, + "grad_norm": 1.5394552971302613, + "learning_rate": 1.0048332006497406e-05, + "loss": 0.9178, + "step": 3442 + }, + { + "epoch": 0.51, + "grad_norm": 1.4357760000003168, + "learning_rate": 1.0043498838025252e-05, + "loss": 0.873, + "step": 3443 + }, + { + "epoch": 0.51, + "grad_norm": 1.3667153769434364, + "learning_rate": 1.0038665659391768e-05, + "loss": 0.8536, + "step": 3444 + }, + { + "epoch": 0.51, + "grad_norm": 1.4129285981820916, + "learning_rate": 1.0033832471725982e-05, + "loss": 0.8652, + "step": 3445 + }, + { + "epoch": 0.51, + "grad_norm": 1.3788770005255109, + "learning_rate": 1.0028999276156926e-05, + "loss": 0.9494, + "step": 3446 + }, + { + "epoch": 0.51, + "grad_norm": 1.437959527600928, + "learning_rate": 1.0024166073813634e-05, + "loss": 0.851, + "step": 3447 + }, + { + "epoch": 0.51, + "grad_norm": 1.3953992388955814, + "learning_rate": 1.0019332865825149e-05, + "loss": 0.8021, + "step": 3448 + }, + { + "epoch": 0.51, + "grad_norm": 1.2450964443498471, + "learning_rate": 1.0014499653320504e-05, + "loss": 0.8689, + "step": 3449 + }, + { + "epoch": 0.51, + "grad_norm": 0.8268230647239653, + "learning_rate": 1.0009666437428736e-05, + "loss": 0.3281, + "step": 3450 + }, + { + "epoch": 0.51, + "grad_norm": 1.6439971532235766, + "learning_rate": 1.000483321927889e-05, + "loss": 0.7811, + "step": 3451 + }, + { + "epoch": 0.52, + "grad_norm": 1.5491931444265257, + "learning_rate": 1e-05, + "loss": 0.8971, + "step": 3452 + }, + { + "epoch": 0.52, + "grad_norm": 1.3962384987833836, + "learning_rate": 9.995166780721112e-06, + "loss": 0.8105, + "step": 3453 + }, + { + "epoch": 0.52, + "grad_norm": 1.4999578781720608, + "learning_rate": 9.990333562571266e-06, + "loss": 0.7888, + "step": 3454 + }, + { + "epoch": 0.52, + "grad_norm": 1.5713598082496674, + "learning_rate": 9.985500346679497e-06, + "loss": 0.8393, + "step": 3455 + }, + { + "epoch": 0.52, + "grad_norm": 1.5588921724933924, + "learning_rate": 9.980667134174854e-06, + "loss": 0.8367, + "step": 3456 + }, + { + "epoch": 0.52, + "grad_norm": 1.409938850974922, + "learning_rate": 9.975833926186367e-06, + "loss": 0.9221, + "step": 3457 + }, + { + "epoch": 0.52, + "grad_norm": 1.6915889480538027, + "learning_rate": 9.971000723843077e-06, + "loss": 0.881, + "step": 3458 + }, + { + "epoch": 0.52, + "grad_norm": 1.2717013195965097, + "learning_rate": 9.966167528274021e-06, + "loss": 0.8117, + "step": 3459 + }, + { + "epoch": 0.52, + "grad_norm": 1.6657230773270417, + "learning_rate": 9.961334340608233e-06, + "loss": 0.9457, + "step": 3460 + }, + { + "epoch": 0.52, + "grad_norm": 1.476567160508203, + "learning_rate": 9.956501161974747e-06, + "loss": 0.8671, + "step": 3461 + }, + { + "epoch": 0.52, + "grad_norm": 1.398525152043447, + "learning_rate": 9.951667993502599e-06, + "loss": 0.8317, + "step": 3462 + }, + { + "epoch": 0.52, + "grad_norm": 1.409430063915483, + "learning_rate": 9.946834836320809e-06, + "loss": 0.7966, + "step": 3463 + }, + { + "epoch": 0.52, + "grad_norm": 1.3155647348693218, + "learning_rate": 9.942001691558405e-06, + "loss": 0.8294, + "step": 3464 + }, + { + "epoch": 0.52, + "grad_norm": 1.4181669138795794, + "learning_rate": 9.937168560344412e-06, + "loss": 0.8945, + "step": 3465 + }, + { + "epoch": 0.52, + "grad_norm": 1.4805933068300412, + "learning_rate": 9.93233544380785e-06, + "loss": 0.8231, + "step": 3466 + }, + { + "epoch": 0.52, + "grad_norm": 1.4679960030080992, + "learning_rate": 9.927502343077732e-06, + "loss": 0.8718, + "step": 3467 + }, + { + "epoch": 0.52, + "grad_norm": 1.4759980906730832, + "learning_rate": 9.922669259283078e-06, + "loss": 0.8738, + "step": 3468 + }, + { + "epoch": 0.52, + "grad_norm": 1.4435891341511244, + "learning_rate": 9.917836193552895e-06, + "loss": 0.7917, + "step": 3469 + }, + { + "epoch": 0.52, + "grad_norm": 1.6149539257186423, + "learning_rate": 9.913003147016181e-06, + "loss": 0.8177, + "step": 3470 + }, + { + "epoch": 0.52, + "grad_norm": 1.2646296481353163, + "learning_rate": 9.908170120801942e-06, + "loss": 0.903, + "step": 3471 + }, + { + "epoch": 0.52, + "grad_norm": 1.6149703568830507, + "learning_rate": 9.903337116039172e-06, + "loss": 0.8874, + "step": 3472 + }, + { + "epoch": 0.52, + "grad_norm": 1.5138555218440664, + "learning_rate": 9.898504133856858e-06, + "loss": 0.8392, + "step": 3473 + }, + { + "epoch": 0.52, + "grad_norm": 1.3751495839434065, + "learning_rate": 9.893671175383995e-06, + "loss": 0.8933, + "step": 3474 + }, + { + "epoch": 0.52, + "grad_norm": 1.3562143958654773, + "learning_rate": 9.888838241749552e-06, + "loss": 0.8951, + "step": 3475 + }, + { + "epoch": 0.52, + "grad_norm": 1.3420581396688758, + "learning_rate": 9.884005334082508e-06, + "loss": 0.8905, + "step": 3476 + }, + { + "epoch": 0.52, + "grad_norm": 1.6688196872686993, + "learning_rate": 9.879172453511827e-06, + "loss": 0.9132, + "step": 3477 + }, + { + "epoch": 0.52, + "grad_norm": 1.4965535508101526, + "learning_rate": 9.874339601166474e-06, + "loss": 0.7874, + "step": 3478 + }, + { + "epoch": 0.52, + "grad_norm": 1.5107073172630312, + "learning_rate": 9.869506778175405e-06, + "loss": 0.8915, + "step": 3479 + }, + { + "epoch": 0.52, + "grad_norm": 1.3490376296511224, + "learning_rate": 9.864673985667563e-06, + "loss": 0.865, + "step": 3480 + }, + { + "epoch": 0.52, + "grad_norm": 0.8606336133466326, + "learning_rate": 9.859841224771892e-06, + "loss": 0.3232, + "step": 3481 + }, + { + "epoch": 0.52, + "grad_norm": 1.6294404704402772, + "learning_rate": 9.855008496617326e-06, + "loss": 0.8966, + "step": 3482 + }, + { + "epoch": 0.52, + "grad_norm": 1.4277643586753277, + "learning_rate": 9.850175802332788e-06, + "loss": 0.8575, + "step": 3483 + }, + { + "epoch": 0.52, + "grad_norm": 1.4295334133640925, + "learning_rate": 9.845343143047198e-06, + "loss": 0.8474, + "step": 3484 + }, + { + "epoch": 0.52, + "grad_norm": 1.3416235595279287, + "learning_rate": 9.840510519889467e-06, + "loss": 0.797, + "step": 3485 + }, + { + "epoch": 0.52, + "grad_norm": 1.3743923663939088, + "learning_rate": 9.835677933988493e-06, + "loss": 0.8545, + "step": 3486 + }, + { + "epoch": 0.52, + "grad_norm": 1.4360387653563598, + "learning_rate": 9.830845386473169e-06, + "loss": 0.8848, + "step": 3487 + }, + { + "epoch": 0.52, + "grad_norm": 1.4465719670119639, + "learning_rate": 9.826012878472383e-06, + "loss": 0.7971, + "step": 3488 + }, + { + "epoch": 0.52, + "grad_norm": 1.4395948997565176, + "learning_rate": 9.821180411115002e-06, + "loss": 0.7928, + "step": 3489 + }, + { + "epoch": 0.52, + "grad_norm": 1.3503594356826982, + "learning_rate": 9.816347985529898e-06, + "loss": 0.8364, + "step": 3490 + }, + { + "epoch": 0.52, + "grad_norm": 1.3977523449230942, + "learning_rate": 9.81151560284592e-06, + "loss": 0.8894, + "step": 3491 + }, + { + "epoch": 0.52, + "grad_norm": 1.427630707751418, + "learning_rate": 9.806683264191916e-06, + "loss": 0.8475, + "step": 3492 + }, + { + "epoch": 0.52, + "grad_norm": 1.390789281458476, + "learning_rate": 9.801850970696722e-06, + "loss": 0.8817, + "step": 3493 + }, + { + "epoch": 0.52, + "grad_norm": 1.4045923045630875, + "learning_rate": 9.797018723489162e-06, + "loss": 0.8881, + "step": 3494 + }, + { + "epoch": 0.52, + "grad_norm": 1.5868689862231609, + "learning_rate": 9.792186523698048e-06, + "loss": 0.897, + "step": 3495 + }, + { + "epoch": 0.52, + "grad_norm": 1.304606199586758, + "learning_rate": 9.787354372452182e-06, + "loss": 0.8632, + "step": 3496 + }, + { + "epoch": 0.52, + "grad_norm": 1.3613617515241991, + "learning_rate": 9.782522270880354e-06, + "loss": 0.8005, + "step": 3497 + }, + { + "epoch": 0.52, + "grad_norm": 1.474100117188263, + "learning_rate": 9.777690220111348e-06, + "loss": 0.8629, + "step": 3498 + }, + { + "epoch": 0.52, + "grad_norm": 1.4886728595038716, + "learning_rate": 9.772858221273926e-06, + "loss": 0.8898, + "step": 3499 + }, + { + "epoch": 0.52, + "grad_norm": 1.3751076024932953, + "learning_rate": 9.768026275496848e-06, + "loss": 0.8846, + "step": 3500 + }, + { + "epoch": 0.52, + "grad_norm": 1.5461795175423334, + "learning_rate": 9.763194383908857e-06, + "loss": 0.9162, + "step": 3501 + }, + { + "epoch": 0.52, + "grad_norm": 1.5141999923105947, + "learning_rate": 9.75836254763868e-06, + "loss": 0.9415, + "step": 3502 + }, + { + "epoch": 0.52, + "grad_norm": 1.4606037115570538, + "learning_rate": 9.753530767815036e-06, + "loss": 0.8304, + "step": 3503 + }, + { + "epoch": 0.52, + "grad_norm": 1.6429926386650577, + "learning_rate": 9.748699045566626e-06, + "loss": 0.84, + "step": 3504 + }, + { + "epoch": 0.52, + "grad_norm": 1.5817934299018297, + "learning_rate": 9.743867382022147e-06, + "loss": 0.8337, + "step": 3505 + }, + { + "epoch": 0.52, + "grad_norm": 1.6776480807980998, + "learning_rate": 9.739035778310273e-06, + "loss": 0.8274, + "step": 3506 + }, + { + "epoch": 0.52, + "grad_norm": 1.4683466063134807, + "learning_rate": 9.73420423555967e-06, + "loss": 0.8072, + "step": 3507 + }, + { + "epoch": 0.52, + "grad_norm": 1.4015235262069392, + "learning_rate": 9.729372754898979e-06, + "loss": 0.8712, + "step": 3508 + }, + { + "epoch": 0.52, + "grad_norm": 1.2740110088831653, + "learning_rate": 9.72454133745684e-06, + "loss": 0.8242, + "step": 3509 + }, + { + "epoch": 0.52, + "grad_norm": 1.455902727462656, + "learning_rate": 9.719709984361873e-06, + "loss": 0.8048, + "step": 3510 + }, + { + "epoch": 0.52, + "grad_norm": 1.3958298369679383, + "learning_rate": 9.714878696742682e-06, + "loss": 0.8955, + "step": 3511 + }, + { + "epoch": 0.52, + "grad_norm": 1.3503633296382895, + "learning_rate": 9.710047475727854e-06, + "loss": 0.8193, + "step": 3512 + }, + { + "epoch": 0.52, + "grad_norm": 1.4416193274786029, + "learning_rate": 9.705216322445967e-06, + "loss": 0.8766, + "step": 3513 + }, + { + "epoch": 0.52, + "grad_norm": 1.429921863190966, + "learning_rate": 9.700385238025579e-06, + "loss": 0.7381, + "step": 3514 + }, + { + "epoch": 0.52, + "grad_norm": 1.4837315532550757, + "learning_rate": 9.695554223595224e-06, + "loss": 0.8643, + "step": 3515 + }, + { + "epoch": 0.52, + "grad_norm": 1.4185058933318586, + "learning_rate": 9.690723280283436e-06, + "loss": 0.7869, + "step": 3516 + }, + { + "epoch": 0.52, + "grad_norm": 1.4232037133997082, + "learning_rate": 9.685892409218718e-06, + "loss": 0.9165, + "step": 3517 + }, + { + "epoch": 0.52, + "grad_norm": 1.6296266888238622, + "learning_rate": 9.681061611529566e-06, + "loss": 0.8628, + "step": 3518 + }, + { + "epoch": 0.53, + "grad_norm": 1.3830645878320222, + "learning_rate": 9.676230888344451e-06, + "loss": 0.8816, + "step": 3519 + }, + { + "epoch": 0.53, + "grad_norm": 1.3966046074814396, + "learning_rate": 9.67140024079184e-06, + "loss": 0.8832, + "step": 3520 + }, + { + "epoch": 0.53, + "grad_norm": 1.3218236889727253, + "learning_rate": 9.66656967000016e-06, + "loss": 0.7842, + "step": 3521 + }, + { + "epoch": 0.53, + "grad_norm": 1.4563715946151394, + "learning_rate": 9.661739177097836e-06, + "loss": 0.8157, + "step": 3522 + }, + { + "epoch": 0.53, + "grad_norm": 1.4276943501238542, + "learning_rate": 9.656908763213276e-06, + "loss": 0.8315, + "step": 3523 + }, + { + "epoch": 0.53, + "grad_norm": 1.5598785833817714, + "learning_rate": 9.652078429474863e-06, + "loss": 0.8444, + "step": 3524 + }, + { + "epoch": 0.53, + "grad_norm": 1.4390880666578192, + "learning_rate": 9.647248177010964e-06, + "loss": 0.9087, + "step": 3525 + }, + { + "epoch": 0.53, + "grad_norm": 1.5631165635038766, + "learning_rate": 9.642418006949926e-06, + "loss": 0.9012, + "step": 3526 + }, + { + "epoch": 0.53, + "grad_norm": 1.401429788340974, + "learning_rate": 9.63758792042008e-06, + "loss": 0.8189, + "step": 3527 + }, + { + "epoch": 0.53, + "grad_norm": 1.4051232884020528, + "learning_rate": 9.632757918549729e-06, + "loss": 0.8885, + "step": 3528 + }, + { + "epoch": 0.53, + "grad_norm": 1.4049373593315253, + "learning_rate": 9.627928002467164e-06, + "loss": 0.8789, + "step": 3529 + }, + { + "epoch": 0.53, + "grad_norm": 1.7769730338513625, + "learning_rate": 9.623098173300655e-06, + "loss": 0.7836, + "step": 3530 + }, + { + "epoch": 0.53, + "grad_norm": 1.7560963613957583, + "learning_rate": 9.618268432178451e-06, + "loss": 0.9626, + "step": 3531 + }, + { + "epoch": 0.53, + "grad_norm": 1.4968967710416943, + "learning_rate": 9.613438780228777e-06, + "loss": 0.863, + "step": 3532 + }, + { + "epoch": 0.53, + "grad_norm": 1.2567442006122844, + "learning_rate": 9.60860921857985e-06, + "loss": 0.8815, + "step": 3533 + }, + { + "epoch": 0.53, + "grad_norm": 1.4565100383201046, + "learning_rate": 9.603779748359842e-06, + "loss": 0.8918, + "step": 3534 + }, + { + "epoch": 0.53, + "grad_norm": 1.3208910327756287, + "learning_rate": 9.598950370696924e-06, + "loss": 0.8856, + "step": 3535 + }, + { + "epoch": 0.53, + "grad_norm": 1.5268204279075828, + "learning_rate": 9.594121086719241e-06, + "loss": 0.9415, + "step": 3536 + }, + { + "epoch": 0.53, + "grad_norm": 1.3787442009454582, + "learning_rate": 9.589291897554912e-06, + "loss": 0.8071, + "step": 3537 + }, + { + "epoch": 0.53, + "grad_norm": 1.3772955610702697, + "learning_rate": 9.584462804332038e-06, + "loss": 0.8394, + "step": 3538 + }, + { + "epoch": 0.53, + "grad_norm": 1.4024396819717755, + "learning_rate": 9.579633808178693e-06, + "loss": 0.8812, + "step": 3539 + }, + { + "epoch": 0.53, + "grad_norm": 1.4484462324206373, + "learning_rate": 9.574804910222934e-06, + "loss": 0.8493, + "step": 3540 + }, + { + "epoch": 0.53, + "grad_norm": 1.588517954263737, + "learning_rate": 9.569976111592789e-06, + "loss": 0.8839, + "step": 3541 + }, + { + "epoch": 0.53, + "grad_norm": 1.47160428960858, + "learning_rate": 9.565147413416266e-06, + "loss": 0.838, + "step": 3542 + }, + { + "epoch": 0.53, + "grad_norm": 1.3582415294730648, + "learning_rate": 9.560318816821354e-06, + "loss": 0.8813, + "step": 3543 + }, + { + "epoch": 0.53, + "grad_norm": 1.4717764889042155, + "learning_rate": 9.555490322936007e-06, + "loss": 0.8361, + "step": 3544 + }, + { + "epoch": 0.53, + "grad_norm": 1.420587255872903, + "learning_rate": 9.550661932888164e-06, + "loss": 0.9022, + "step": 3545 + }, + { + "epoch": 0.53, + "grad_norm": 1.5727905664727972, + "learning_rate": 9.545833647805743e-06, + "loss": 0.8461, + "step": 3546 + }, + { + "epoch": 0.53, + "grad_norm": 1.4340290427362725, + "learning_rate": 9.541005468816622e-06, + "loss": 0.8367, + "step": 3547 + }, + { + "epoch": 0.53, + "grad_norm": 1.4525572631647652, + "learning_rate": 9.53617739704867e-06, + "loss": 0.8262, + "step": 3548 + }, + { + "epoch": 0.53, + "grad_norm": 1.3596503866711438, + "learning_rate": 9.531349433629729e-06, + "loss": 0.8865, + "step": 3549 + }, + { + "epoch": 0.53, + "grad_norm": 1.328003759319903, + "learning_rate": 9.526521579687603e-06, + "loss": 0.8414, + "step": 3550 + }, + { + "epoch": 0.53, + "grad_norm": 1.4274244800583733, + "learning_rate": 9.521693836350083e-06, + "loss": 0.8624, + "step": 3551 + }, + { + "epoch": 0.53, + "grad_norm": 1.2721743081537307, + "learning_rate": 9.516866204744932e-06, + "loss": 0.798, + "step": 3552 + }, + { + "epoch": 0.53, + "grad_norm": 1.423403909592571, + "learning_rate": 9.512038685999882e-06, + "loss": 0.8182, + "step": 3553 + }, + { + "epoch": 0.53, + "grad_norm": 1.3349668325476816, + "learning_rate": 9.507211281242646e-06, + "loss": 0.7936, + "step": 3554 + }, + { + "epoch": 0.53, + "grad_norm": 1.2729769829172104, + "learning_rate": 9.502383991600901e-06, + "loss": 0.8028, + "step": 3555 + }, + { + "epoch": 0.53, + "grad_norm": 1.3212568408846261, + "learning_rate": 9.497556818202306e-06, + "loss": 0.8854, + "step": 3556 + }, + { + "epoch": 0.53, + "grad_norm": 1.3679916047148348, + "learning_rate": 9.492729762174489e-06, + "loss": 0.8882, + "step": 3557 + }, + { + "epoch": 0.53, + "grad_norm": 1.6020324192081918, + "learning_rate": 9.487902824645048e-06, + "loss": 0.8382, + "step": 3558 + }, + { + "epoch": 0.53, + "grad_norm": 1.4011749069206587, + "learning_rate": 9.483076006741564e-06, + "loss": 0.914, + "step": 3559 + }, + { + "epoch": 0.53, + "grad_norm": 1.5141374591847243, + "learning_rate": 9.478249309591571e-06, + "loss": 0.7728, + "step": 3560 + }, + { + "epoch": 0.53, + "grad_norm": 1.440791733711582, + "learning_rate": 9.473422734322593e-06, + "loss": 0.936, + "step": 3561 + }, + { + "epoch": 0.53, + "grad_norm": 0.8757881665786142, + "learning_rate": 9.468596282062114e-06, + "loss": 0.3428, + "step": 3562 + }, + { + "epoch": 0.53, + "grad_norm": 1.4844334352435307, + "learning_rate": 9.4637699539376e-06, + "loss": 0.8313, + "step": 3563 + }, + { + "epoch": 0.53, + "grad_norm": 1.4568034114088537, + "learning_rate": 9.458943751076475e-06, + "loss": 0.7888, + "step": 3564 + }, + { + "epoch": 0.53, + "grad_norm": 1.483814626877465, + "learning_rate": 9.45411767460615e-06, + "loss": 0.7983, + "step": 3565 + }, + { + "epoch": 0.53, + "grad_norm": 1.3152010626237944, + "learning_rate": 9.449291725653985e-06, + "loss": 0.856, + "step": 3566 + }, + { + "epoch": 0.53, + "grad_norm": 1.5257691708588121, + "learning_rate": 9.444465905347327e-06, + "loss": 0.8419, + "step": 3567 + }, + { + "epoch": 0.53, + "grad_norm": 1.7190177871427816, + "learning_rate": 9.439640214813491e-06, + "loss": 0.8877, + "step": 3568 + }, + { + "epoch": 0.53, + "grad_norm": 1.4985641707877073, + "learning_rate": 9.434814655179756e-06, + "loss": 0.8346, + "step": 3569 + }, + { + "epoch": 0.53, + "grad_norm": 1.4425306392945307, + "learning_rate": 9.429989227573373e-06, + "loss": 0.9176, + "step": 3570 + }, + { + "epoch": 0.53, + "grad_norm": 1.3746720838755477, + "learning_rate": 9.425163933121564e-06, + "loss": 0.836, + "step": 3571 + }, + { + "epoch": 0.53, + "grad_norm": 1.5429985562596362, + "learning_rate": 9.420338772951521e-06, + "loss": 0.7823, + "step": 3572 + }, + { + "epoch": 0.53, + "grad_norm": 1.4744078909042018, + "learning_rate": 9.415513748190396e-06, + "loss": 0.7614, + "step": 3573 + }, + { + "epoch": 0.53, + "grad_norm": 1.7209365591126629, + "learning_rate": 9.410688859965316e-06, + "loss": 0.8301, + "step": 3574 + }, + { + "epoch": 0.53, + "grad_norm": 1.3494378842155879, + "learning_rate": 9.405864109403378e-06, + "loss": 0.8302, + "step": 3575 + }, + { + "epoch": 0.53, + "grad_norm": 1.3813497247339959, + "learning_rate": 9.401039497631642e-06, + "loss": 0.866, + "step": 3576 + }, + { + "epoch": 0.53, + "grad_norm": 1.2198492695495537, + "learning_rate": 9.39621502577714e-06, + "loss": 0.8478, + "step": 3577 + }, + { + "epoch": 0.53, + "grad_norm": 1.61010211426017, + "learning_rate": 9.391390694966871e-06, + "loss": 0.8767, + "step": 3578 + }, + { + "epoch": 0.53, + "grad_norm": 1.704134001873991, + "learning_rate": 9.386566506327793e-06, + "loss": 0.8722, + "step": 3579 + }, + { + "epoch": 0.53, + "grad_norm": 1.6219507680910457, + "learning_rate": 9.38174246098684e-06, + "loss": 0.8499, + "step": 3580 + }, + { + "epoch": 0.53, + "grad_norm": 1.4237076096099812, + "learning_rate": 9.37691856007091e-06, + "loss": 0.8444, + "step": 3581 + }, + { + "epoch": 0.53, + "grad_norm": 1.4439272757009594, + "learning_rate": 9.372094804706867e-06, + "loss": 0.8567, + "step": 3582 + }, + { + "epoch": 0.53, + "grad_norm": 1.3840393884520266, + "learning_rate": 9.367271196021539e-06, + "loss": 0.7988, + "step": 3583 + }, + { + "epoch": 0.53, + "grad_norm": 1.3749031519106785, + "learning_rate": 9.362447735141723e-06, + "loss": 0.7628, + "step": 3584 + }, + { + "epoch": 0.53, + "grad_norm": 1.3315978589720228, + "learning_rate": 9.357624423194185e-06, + "loss": 0.7265, + "step": 3585 + }, + { + "epoch": 0.54, + "grad_norm": 1.408940903040862, + "learning_rate": 9.35280126130564e-06, + "loss": 0.8107, + "step": 3586 + }, + { + "epoch": 0.54, + "grad_norm": 1.3050787861493243, + "learning_rate": 9.347978250602786e-06, + "loss": 0.8877, + "step": 3587 + }, + { + "epoch": 0.54, + "grad_norm": 0.808001497731327, + "learning_rate": 9.343155392212276e-06, + "loss": 0.339, + "step": 3588 + }, + { + "epoch": 0.54, + "grad_norm": 1.271314135667705, + "learning_rate": 9.338332687260733e-06, + "loss": 0.8631, + "step": 3589 + }, + { + "epoch": 0.54, + "grad_norm": 1.4720510180123563, + "learning_rate": 9.333510136874741e-06, + "loss": 0.8051, + "step": 3590 + }, + { + "epoch": 0.54, + "grad_norm": 1.5133052468058892, + "learning_rate": 9.32868774218085e-06, + "loss": 0.7821, + "step": 3591 + }, + { + "epoch": 0.54, + "grad_norm": 1.46234917010658, + "learning_rate": 9.323865504305566e-06, + "loss": 0.9526, + "step": 3592 + }, + { + "epoch": 0.54, + "grad_norm": 1.3227504731588213, + "learning_rate": 9.319043424375366e-06, + "loss": 0.85, + "step": 3593 + }, + { + "epoch": 0.54, + "grad_norm": 1.5174377442756495, + "learning_rate": 9.314221503516691e-06, + "loss": 0.8886, + "step": 3594 + }, + { + "epoch": 0.54, + "grad_norm": 1.4949720014446959, + "learning_rate": 9.309399742855943e-06, + "loss": 0.9539, + "step": 3595 + }, + { + "epoch": 0.54, + "grad_norm": 1.4120682945185337, + "learning_rate": 9.304578143519481e-06, + "loss": 0.8741, + "step": 3596 + }, + { + "epoch": 0.54, + "grad_norm": 1.3994489706707465, + "learning_rate": 9.299756706633636e-06, + "loss": 0.7803, + "step": 3597 + }, + { + "epoch": 0.54, + "grad_norm": 1.3259161483491646, + "learning_rate": 9.294935433324697e-06, + "loss": 0.8158, + "step": 3598 + }, + { + "epoch": 0.54, + "grad_norm": 1.5547741968815594, + "learning_rate": 9.290114324718906e-06, + "loss": 0.8629, + "step": 3599 + }, + { + "epoch": 0.54, + "grad_norm": 1.352249375454564, + "learning_rate": 9.285293381942482e-06, + "loss": 0.8699, + "step": 3600 + }, + { + "epoch": 0.54, + "grad_norm": 1.4072800711492426, + "learning_rate": 9.280472606121594e-06, + "loss": 0.8532, + "step": 3601 + }, + { + "epoch": 0.54, + "grad_norm": 1.654395452937887, + "learning_rate": 9.275651998382377e-06, + "loss": 0.8206, + "step": 3602 + }, + { + "epoch": 0.54, + "grad_norm": 1.5882043159144885, + "learning_rate": 9.270831559850925e-06, + "loss": 0.8294, + "step": 3603 + }, + { + "epoch": 0.54, + "grad_norm": 1.4364936962977737, + "learning_rate": 9.266011291653296e-06, + "loss": 0.8994, + "step": 3604 + }, + { + "epoch": 0.54, + "grad_norm": 1.3703816811813574, + "learning_rate": 9.261191194915499e-06, + "loss": 0.8437, + "step": 3605 + }, + { + "epoch": 0.54, + "grad_norm": 1.4929826774726052, + "learning_rate": 9.256371270763515e-06, + "loss": 0.9054, + "step": 3606 + }, + { + "epoch": 0.54, + "grad_norm": 1.3279618485600522, + "learning_rate": 9.251551520323273e-06, + "loss": 0.8317, + "step": 3607 + }, + { + "epoch": 0.54, + "grad_norm": 1.3984147521518384, + "learning_rate": 9.246731944720675e-06, + "loss": 0.9185, + "step": 3608 + }, + { + "epoch": 0.54, + "grad_norm": 1.58201288518725, + "learning_rate": 9.241912545081566e-06, + "loss": 0.9057, + "step": 3609 + }, + { + "epoch": 0.54, + "grad_norm": 1.567417912075355, + "learning_rate": 9.237093322531765e-06, + "loss": 0.9229, + "step": 3610 + }, + { + "epoch": 0.54, + "grad_norm": 1.4436698180955618, + "learning_rate": 9.232274278197038e-06, + "loss": 0.8431, + "step": 3611 + }, + { + "epoch": 0.54, + "grad_norm": 1.4854795700482486, + "learning_rate": 9.227455413203115e-06, + "loss": 0.9073, + "step": 3612 + }, + { + "epoch": 0.54, + "grad_norm": 1.4293128197513787, + "learning_rate": 9.222636728675687e-06, + "loss": 0.8548, + "step": 3613 + }, + { + "epoch": 0.54, + "grad_norm": 1.4813444641378652, + "learning_rate": 9.217818225740394e-06, + "loss": 0.8235, + "step": 3614 + }, + { + "epoch": 0.54, + "grad_norm": 1.4726405471753816, + "learning_rate": 9.212999905522841e-06, + "loss": 0.9532, + "step": 3615 + }, + { + "epoch": 0.54, + "grad_norm": 1.35946535969364, + "learning_rate": 9.20818176914859e-06, + "loss": 0.754, + "step": 3616 + }, + { + "epoch": 0.54, + "grad_norm": 1.3769326309951566, + "learning_rate": 9.203363817743159e-06, + "loss": 0.8283, + "step": 3617 + }, + { + "epoch": 0.54, + "grad_norm": 1.490114739691744, + "learning_rate": 9.198546052432018e-06, + "loss": 0.8051, + "step": 3618 + }, + { + "epoch": 0.54, + "grad_norm": 1.5055819452526438, + "learning_rate": 9.193728474340598e-06, + "loss": 0.8298, + "step": 3619 + }, + { + "epoch": 0.54, + "grad_norm": 1.4324767421589004, + "learning_rate": 9.188911084594286e-06, + "loss": 0.9232, + "step": 3620 + }, + { + "epoch": 0.54, + "grad_norm": 1.4386311336013364, + "learning_rate": 9.184093884318426e-06, + "loss": 0.893, + "step": 3621 + }, + { + "epoch": 0.54, + "grad_norm": 1.4429187011610316, + "learning_rate": 9.179276874638315e-06, + "loss": 0.8807, + "step": 3622 + }, + { + "epoch": 0.54, + "grad_norm": 1.467235808090102, + "learning_rate": 9.174460056679214e-06, + "loss": 0.8067, + "step": 3623 + }, + { + "epoch": 0.54, + "grad_norm": 1.3880377809290287, + "learning_rate": 9.16964343156632e-06, + "loss": 0.8749, + "step": 3624 + }, + { + "epoch": 0.54, + "grad_norm": 1.3900396944950153, + "learning_rate": 9.164827000424804e-06, + "loss": 0.8239, + "step": 3625 + }, + { + "epoch": 0.54, + "grad_norm": 1.3999335049692658, + "learning_rate": 9.160010764379782e-06, + "loss": 0.8855, + "step": 3626 + }, + { + "epoch": 0.54, + "grad_norm": 1.7598920865615002, + "learning_rate": 9.15519472455633e-06, + "loss": 0.9147, + "step": 3627 + }, + { + "epoch": 0.54, + "grad_norm": 1.3450064482122197, + "learning_rate": 9.150378882079475e-06, + "loss": 0.8669, + "step": 3628 + }, + { + "epoch": 0.54, + "grad_norm": 1.322550110521103, + "learning_rate": 9.145563238074198e-06, + "loss": 0.8886, + "step": 3629 + }, + { + "epoch": 0.54, + "grad_norm": 1.5315689416487994, + "learning_rate": 9.140747793665439e-06, + "loss": 0.8772, + "step": 3630 + }, + { + "epoch": 0.54, + "grad_norm": 1.337534278554035, + "learning_rate": 9.135932549978075e-06, + "loss": 0.8542, + "step": 3631 + }, + { + "epoch": 0.54, + "grad_norm": 1.3680346160196941, + "learning_rate": 9.131117508136953e-06, + "loss": 0.7951, + "step": 3632 + }, + { + "epoch": 0.54, + "grad_norm": 1.3785027441750946, + "learning_rate": 9.12630266926687e-06, + "loss": 0.8613, + "step": 3633 + }, + { + "epoch": 0.54, + "grad_norm": 0.92137407376053, + "learning_rate": 9.121488034492569e-06, + "loss": 0.3458, + "step": 3634 + }, + { + "epoch": 0.54, + "grad_norm": 1.3616645129198242, + "learning_rate": 9.11667360493875e-06, + "loss": 0.8985, + "step": 3635 + }, + { + "epoch": 0.54, + "grad_norm": 1.5686851438664762, + "learning_rate": 9.111859381730071e-06, + "loss": 0.8012, + "step": 3636 + }, + { + "epoch": 0.54, + "grad_norm": 1.502591740200375, + "learning_rate": 9.107045365991123e-06, + "loss": 0.9713, + "step": 3637 + }, + { + "epoch": 0.54, + "grad_norm": 1.3178483834558912, + "learning_rate": 9.102231558846467e-06, + "loss": 0.8959, + "step": 3638 + }, + { + "epoch": 0.54, + "grad_norm": 1.2851938456685843, + "learning_rate": 9.097417961420608e-06, + "loss": 0.785, + "step": 3639 + }, + { + "epoch": 0.54, + "grad_norm": 2.0918073484726687, + "learning_rate": 9.092604574838004e-06, + "loss": 0.9197, + "step": 3640 + }, + { + "epoch": 0.54, + "grad_norm": 1.476397571990377, + "learning_rate": 9.08779140022306e-06, + "loss": 0.8163, + "step": 3641 + }, + { + "epoch": 0.54, + "grad_norm": 1.32157160394685, + "learning_rate": 9.082978438700138e-06, + "loss": 0.8903, + "step": 3642 + }, + { + "epoch": 0.54, + "grad_norm": 1.5251531141297938, + "learning_rate": 9.078165691393548e-06, + "loss": 0.7586, + "step": 3643 + }, + { + "epoch": 0.54, + "grad_norm": 1.63149351194706, + "learning_rate": 9.073353159427538e-06, + "loss": 0.8153, + "step": 3644 + }, + { + "epoch": 0.54, + "grad_norm": 1.5791357906031458, + "learning_rate": 9.068540843926324e-06, + "loss": 0.9268, + "step": 3645 + }, + { + "epoch": 0.54, + "grad_norm": 1.5209111822193677, + "learning_rate": 9.06372874601406e-06, + "loss": 0.8482, + "step": 3646 + }, + { + "epoch": 0.54, + "grad_norm": 1.1645795734612956, + "learning_rate": 9.058916866814857e-06, + "loss": 0.6842, + "step": 3647 + }, + { + "epoch": 0.54, + "grad_norm": 1.5763810319166138, + "learning_rate": 9.054105207452766e-06, + "loss": 0.8708, + "step": 3648 + }, + { + "epoch": 0.54, + "grad_norm": 1.409372429544046, + "learning_rate": 9.049293769051797e-06, + "loss": 0.7862, + "step": 3649 + }, + { + "epoch": 0.54, + "grad_norm": 1.3887514696501861, + "learning_rate": 9.044482552735895e-06, + "loss": 0.8234, + "step": 3650 + }, + { + "epoch": 0.54, + "grad_norm": 1.4237640161039977, + "learning_rate": 9.039671559628963e-06, + "loss": 0.8167, + "step": 3651 + }, + { + "epoch": 0.54, + "grad_norm": 1.4797547626715986, + "learning_rate": 9.034860790854848e-06, + "loss": 0.8635, + "step": 3652 + }, + { + "epoch": 0.55, + "grad_norm": 1.6376077381756147, + "learning_rate": 9.030050247537352e-06, + "loss": 0.821, + "step": 3653 + }, + { + "epoch": 0.55, + "grad_norm": 1.5123309009875996, + "learning_rate": 9.025239930800214e-06, + "loss": 0.8598, + "step": 3654 + }, + { + "epoch": 0.55, + "grad_norm": 1.4201874478057306, + "learning_rate": 9.020429841767125e-06, + "loss": 0.7664, + "step": 3655 + }, + { + "epoch": 0.55, + "grad_norm": 1.4104704506833088, + "learning_rate": 9.015619981561726e-06, + "loss": 0.8372, + "step": 3656 + }, + { + "epoch": 0.55, + "grad_norm": 1.4274461457888306, + "learning_rate": 9.010810351307593e-06, + "loss": 0.8264, + "step": 3657 + }, + { + "epoch": 0.55, + "grad_norm": 1.50307120945164, + "learning_rate": 9.006000952128262e-06, + "loss": 0.8302, + "step": 3658 + }, + { + "epoch": 0.55, + "grad_norm": 1.3303321163143917, + "learning_rate": 9.001191785147207e-06, + "loss": 0.7848, + "step": 3659 + }, + { + "epoch": 0.55, + "grad_norm": 1.5566102315694863, + "learning_rate": 8.996382851487851e-06, + "loss": 0.8337, + "step": 3660 + }, + { + "epoch": 0.55, + "grad_norm": 1.350540387232985, + "learning_rate": 8.991574152273562e-06, + "loss": 0.7944, + "step": 3661 + }, + { + "epoch": 0.55, + "grad_norm": 1.4692775566401828, + "learning_rate": 8.986765688627652e-06, + "loss": 0.9261, + "step": 3662 + }, + { + "epoch": 0.55, + "grad_norm": 1.4754467752753881, + "learning_rate": 8.981957461673375e-06, + "loss": 0.9122, + "step": 3663 + }, + { + "epoch": 0.55, + "grad_norm": 1.4751818556180016, + "learning_rate": 8.977149472533938e-06, + "loss": 0.853, + "step": 3664 + }, + { + "epoch": 0.55, + "grad_norm": 1.4982121418702838, + "learning_rate": 8.972341722332485e-06, + "loss": 0.8164, + "step": 3665 + }, + { + "epoch": 0.55, + "grad_norm": 1.3554341572214257, + "learning_rate": 8.96753421219211e-06, + "loss": 0.8351, + "step": 3666 + }, + { + "epoch": 0.55, + "grad_norm": 1.4273524040853462, + "learning_rate": 8.962726943235845e-06, + "loss": 0.7495, + "step": 3667 + }, + { + "epoch": 0.55, + "grad_norm": 1.3278289661342635, + "learning_rate": 8.957919916586668e-06, + "loss": 0.8635, + "step": 3668 + }, + { + "epoch": 0.55, + "grad_norm": 1.415638814315592, + "learning_rate": 8.953113133367506e-06, + "loss": 0.8099, + "step": 3669 + }, + { + "epoch": 0.55, + "grad_norm": 1.6108040992121972, + "learning_rate": 8.948306594701216e-06, + "loss": 0.8644, + "step": 3670 + }, + { + "epoch": 0.55, + "grad_norm": 1.3928125205605932, + "learning_rate": 8.943500301710612e-06, + "loss": 0.849, + "step": 3671 + }, + { + "epoch": 0.55, + "grad_norm": 1.4263875090410174, + "learning_rate": 8.938694255518444e-06, + "loss": 0.9428, + "step": 3672 + }, + { + "epoch": 0.55, + "grad_norm": 1.3068800465943815, + "learning_rate": 8.933888457247402e-06, + "loss": 0.9137, + "step": 3673 + }, + { + "epoch": 0.55, + "grad_norm": 1.484848507848754, + "learning_rate": 8.929082908020122e-06, + "loss": 0.8755, + "step": 3674 + }, + { + "epoch": 0.55, + "grad_norm": 1.739395187581526, + "learning_rate": 8.924277608959185e-06, + "loss": 0.8508, + "step": 3675 + }, + { + "epoch": 0.55, + "grad_norm": 1.4402809816314825, + "learning_rate": 8.919472561187102e-06, + "loss": 0.8728, + "step": 3676 + }, + { + "epoch": 0.55, + "grad_norm": 1.4125680983002775, + "learning_rate": 8.91466776582634e-06, + "loss": 0.7384, + "step": 3677 + }, + { + "epoch": 0.55, + "grad_norm": 1.2714720247843756, + "learning_rate": 8.909863223999292e-06, + "loss": 0.864, + "step": 3678 + }, + { + "epoch": 0.55, + "grad_norm": 1.6191381942215402, + "learning_rate": 8.905058936828305e-06, + "loss": 0.8752, + "step": 3679 + }, + { + "epoch": 0.55, + "grad_norm": 1.5091538232162136, + "learning_rate": 8.900254905435658e-06, + "loss": 0.9165, + "step": 3680 + }, + { + "epoch": 0.55, + "grad_norm": 1.5629433569131972, + "learning_rate": 8.895451130943578e-06, + "loss": 0.8256, + "step": 3681 + }, + { + "epoch": 0.55, + "grad_norm": 1.4238194156636768, + "learning_rate": 8.890647614474223e-06, + "loss": 0.8212, + "step": 3682 + }, + { + "epoch": 0.55, + "grad_norm": 0.8088611912616787, + "learning_rate": 8.885844357149695e-06, + "loss": 0.3902, + "step": 3683 + }, + { + "epoch": 0.55, + "grad_norm": 0.9599801679607902, + "learning_rate": 8.881041360092035e-06, + "loss": 0.3242, + "step": 3684 + }, + { + "epoch": 0.55, + "grad_norm": 1.60980110175648, + "learning_rate": 8.876238624423224e-06, + "loss": 0.8785, + "step": 3685 + }, + { + "epoch": 0.55, + "grad_norm": 1.4839687368679044, + "learning_rate": 8.871436151265183e-06, + "loss": 0.8752, + "step": 3686 + }, + { + "epoch": 0.55, + "grad_norm": 1.4882005803580445, + "learning_rate": 8.86663394173977e-06, + "loss": 0.8671, + "step": 3687 + }, + { + "epoch": 0.55, + "grad_norm": 1.4178082136173882, + "learning_rate": 8.861831996968785e-06, + "loss": 0.8665, + "step": 3688 + }, + { + "epoch": 0.55, + "grad_norm": 1.574815976418105, + "learning_rate": 8.857030318073955e-06, + "loss": 0.8933, + "step": 3689 + }, + { + "epoch": 0.55, + "grad_norm": 1.4431843315959276, + "learning_rate": 8.852228906176958e-06, + "loss": 0.8461, + "step": 3690 + }, + { + "epoch": 0.55, + "grad_norm": 1.4804551506424388, + "learning_rate": 8.8474277623994e-06, + "loss": 0.8183, + "step": 3691 + }, + { + "epoch": 0.55, + "grad_norm": 1.621543558176056, + "learning_rate": 8.842626887862832e-06, + "loss": 0.9262, + "step": 3692 + }, + { + "epoch": 0.55, + "grad_norm": 1.2599794494671617, + "learning_rate": 8.83782628368874e-06, + "loss": 0.8692, + "step": 3693 + }, + { + "epoch": 0.55, + "grad_norm": 1.3462655631952707, + "learning_rate": 8.833025950998547e-06, + "loss": 0.8694, + "step": 3694 + }, + { + "epoch": 0.55, + "grad_norm": 1.4281421022439762, + "learning_rate": 8.828225890913604e-06, + "loss": 0.9251, + "step": 3695 + }, + { + "epoch": 0.55, + "grad_norm": 1.2492095750408565, + "learning_rate": 8.82342610455521e-06, + "loss": 0.8945, + "step": 3696 + }, + { + "epoch": 0.55, + "grad_norm": 1.5149663865692948, + "learning_rate": 8.818626593044595e-06, + "loss": 0.8192, + "step": 3697 + }, + { + "epoch": 0.55, + "grad_norm": 1.4909150180348938, + "learning_rate": 8.813827357502926e-06, + "loss": 0.869, + "step": 3698 + }, + { + "epoch": 0.55, + "grad_norm": 0.9020382175022832, + "learning_rate": 8.809028399051302e-06, + "loss": 0.3518, + "step": 3699 + }, + { + "epoch": 0.55, + "grad_norm": 1.4520313431220402, + "learning_rate": 8.804229718810765e-06, + "loss": 0.9426, + "step": 3700 + }, + { + "epoch": 0.55, + "grad_norm": 1.3110802967675608, + "learning_rate": 8.799431317902289e-06, + "loss": 0.8607, + "step": 3701 + }, + { + "epoch": 0.55, + "grad_norm": 1.7065063319823648, + "learning_rate": 8.79463319744677e-06, + "loss": 0.7935, + "step": 3702 + }, + { + "epoch": 0.55, + "grad_norm": 1.2540867443840078, + "learning_rate": 8.78983535856506e-06, + "loss": 0.8654, + "step": 3703 + }, + { + "epoch": 0.55, + "grad_norm": 1.3607794628838974, + "learning_rate": 8.785037802377929e-06, + "loss": 0.9087, + "step": 3704 + }, + { + "epoch": 0.55, + "grad_norm": 1.4927415355990383, + "learning_rate": 8.780240530006088e-06, + "loss": 0.8236, + "step": 3705 + }, + { + "epoch": 0.55, + "grad_norm": 1.043188025608745, + "learning_rate": 8.775443542570182e-06, + "loss": 0.3716, + "step": 3706 + }, + { + "epoch": 0.55, + "grad_norm": 1.414927917232927, + "learning_rate": 8.77064684119079e-06, + "loss": 0.8667, + "step": 3707 + }, + { + "epoch": 0.55, + "grad_norm": 1.3673059854936396, + "learning_rate": 8.765850426988414e-06, + "loss": 0.8297, + "step": 3708 + }, + { + "epoch": 0.55, + "grad_norm": 1.5143694303989128, + "learning_rate": 8.7610543010835e-06, + "loss": 0.9296, + "step": 3709 + }, + { + "epoch": 0.55, + "grad_norm": 0.9946903070291274, + "learning_rate": 8.756258464596426e-06, + "loss": 0.3683, + "step": 3710 + }, + { + "epoch": 0.55, + "grad_norm": 1.5999828561687655, + "learning_rate": 8.7514629186475e-06, + "loss": 0.8469, + "step": 3711 + }, + { + "epoch": 0.55, + "grad_norm": 1.2716777884402222, + "learning_rate": 8.746667664356957e-06, + "loss": 0.8119, + "step": 3712 + }, + { + "epoch": 0.55, + "grad_norm": 1.344547015956051, + "learning_rate": 8.741872702844975e-06, + "loss": 0.87, + "step": 3713 + }, + { + "epoch": 0.55, + "grad_norm": 1.4579613046452948, + "learning_rate": 8.737078035231659e-06, + "loss": 0.868, + "step": 3714 + }, + { + "epoch": 0.55, + "grad_norm": 1.4720169685824185, + "learning_rate": 8.732283662637034e-06, + "loss": 0.8837, + "step": 3715 + }, + { + "epoch": 0.55, + "grad_norm": 1.2252768844980197, + "learning_rate": 8.727489586181073e-06, + "loss": 0.8548, + "step": 3716 + }, + { + "epoch": 0.55, + "grad_norm": 1.3151230578592215, + "learning_rate": 8.722695806983673e-06, + "loss": 0.8467, + "step": 3717 + }, + { + "epoch": 0.55, + "grad_norm": 1.5932891000445994, + "learning_rate": 8.717902326164657e-06, + "loss": 0.9325, + "step": 3718 + }, + { + "epoch": 0.55, + "grad_norm": 1.47362186258443, + "learning_rate": 8.713109144843785e-06, + "loss": 0.8729, + "step": 3719 + }, + { + "epoch": 0.56, + "grad_norm": 1.5056953254232075, + "learning_rate": 8.708316264140751e-06, + "loss": 0.8864, + "step": 3720 + }, + { + "epoch": 0.56, + "grad_norm": 1.4934534543403102, + "learning_rate": 8.703523685175161e-06, + "loss": 0.9236, + "step": 3721 + }, + { + "epoch": 0.56, + "grad_norm": 1.3906336526923, + "learning_rate": 8.69873140906657e-06, + "loss": 0.8048, + "step": 3722 + }, + { + "epoch": 0.56, + "grad_norm": 1.4372350899576056, + "learning_rate": 8.69393943693445e-06, + "loss": 0.8234, + "step": 3723 + }, + { + "epoch": 0.56, + "grad_norm": 1.5466391896638483, + "learning_rate": 8.689147769898205e-06, + "loss": 0.856, + "step": 3724 + }, + { + "epoch": 0.56, + "grad_norm": 1.5769944602986026, + "learning_rate": 8.684356409077177e-06, + "loss": 0.854, + "step": 3725 + }, + { + "epoch": 0.56, + "grad_norm": 1.5608698292307723, + "learning_rate": 8.67956535559062e-06, + "loss": 0.8282, + "step": 3726 + }, + { + "epoch": 0.56, + "grad_norm": 1.6257226419219133, + "learning_rate": 8.674774610557728e-06, + "loss": 0.7686, + "step": 3727 + }, + { + "epoch": 0.56, + "grad_norm": 1.4473572784365238, + "learning_rate": 8.669984175097617e-06, + "loss": 0.828, + "step": 3728 + }, + { + "epoch": 0.56, + "grad_norm": 1.5766672847205068, + "learning_rate": 8.665194050329337e-06, + "loss": 0.8642, + "step": 3729 + }, + { + "epoch": 0.56, + "grad_norm": 1.4354587534372352, + "learning_rate": 8.66040423737186e-06, + "loss": 0.8939, + "step": 3730 + }, + { + "epoch": 0.56, + "grad_norm": 1.4958470579399212, + "learning_rate": 8.655614737344087e-06, + "loss": 0.9224, + "step": 3731 + }, + { + "epoch": 0.56, + "grad_norm": 1.5866593262083413, + "learning_rate": 8.650825551364844e-06, + "loss": 0.8107, + "step": 3732 + }, + { + "epoch": 0.56, + "grad_norm": 1.4286557817274976, + "learning_rate": 8.646036680552889e-06, + "loss": 0.7898, + "step": 3733 + }, + { + "epoch": 0.56, + "grad_norm": 1.49711372800259, + "learning_rate": 8.641248126026897e-06, + "loss": 0.8839, + "step": 3734 + }, + { + "epoch": 0.56, + "grad_norm": 1.475445424476935, + "learning_rate": 8.63645988890548e-06, + "loss": 0.8853, + "step": 3735 + }, + { + "epoch": 0.56, + "grad_norm": 1.6377280532848835, + "learning_rate": 8.63167197030717e-06, + "loss": 0.8435, + "step": 3736 + }, + { + "epoch": 0.56, + "grad_norm": 1.5656859330352237, + "learning_rate": 8.626884371350421e-06, + "loss": 0.8265, + "step": 3737 + }, + { + "epoch": 0.56, + "grad_norm": 1.3987400018119063, + "learning_rate": 8.62209709315362e-06, + "loss": 0.8263, + "step": 3738 + }, + { + "epoch": 0.56, + "grad_norm": 1.394969069383033, + "learning_rate": 8.617310136835078e-06, + "loss": 0.7883, + "step": 3739 + }, + { + "epoch": 0.56, + "grad_norm": 1.7140430414561054, + "learning_rate": 8.612523503513024e-06, + "loss": 0.7338, + "step": 3740 + }, + { + "epoch": 0.56, + "grad_norm": 1.4398814971817229, + "learning_rate": 8.60773719430562e-06, + "loss": 0.9348, + "step": 3741 + }, + { + "epoch": 0.56, + "grad_norm": 1.4729099352410537, + "learning_rate": 8.602951210330942e-06, + "loss": 0.9038, + "step": 3742 + }, + { + "epoch": 0.56, + "grad_norm": 1.3374320239659074, + "learning_rate": 8.598165552707002e-06, + "loss": 0.7976, + "step": 3743 + }, + { + "epoch": 0.56, + "grad_norm": 1.4929592199136175, + "learning_rate": 8.593380222551729e-06, + "loss": 0.8883, + "step": 3744 + }, + { + "epoch": 0.56, + "grad_norm": 1.4218434321127036, + "learning_rate": 8.588595220982975e-06, + "loss": 0.8428, + "step": 3745 + }, + { + "epoch": 0.56, + "grad_norm": 1.32917261179005, + "learning_rate": 8.583810549118524e-06, + "loss": 0.8247, + "step": 3746 + }, + { + "epoch": 0.56, + "grad_norm": 1.4693009225936944, + "learning_rate": 8.579026208076064e-06, + "loss": 0.9103, + "step": 3747 + }, + { + "epoch": 0.56, + "grad_norm": 1.3354877975442685, + "learning_rate": 8.574242198973223e-06, + "loss": 0.7976, + "step": 3748 + }, + { + "epoch": 0.56, + "grad_norm": 1.484015506579505, + "learning_rate": 8.569458522927547e-06, + "loss": 0.977, + "step": 3749 + }, + { + "epoch": 0.56, + "grad_norm": 0.8765068607300504, + "learning_rate": 8.564675181056502e-06, + "loss": 0.3536, + "step": 3750 + }, + { + "epoch": 0.56, + "grad_norm": 1.4653358840764616, + "learning_rate": 8.559892174477478e-06, + "loss": 0.8365, + "step": 3751 + }, + { + "epoch": 0.56, + "grad_norm": 1.2953205048862486, + "learning_rate": 8.55510950430779e-06, + "loss": 0.8583, + "step": 3752 + }, + { + "epoch": 0.56, + "grad_norm": 1.422696732872559, + "learning_rate": 8.550327171664661e-06, + "loss": 0.8618, + "step": 3753 + }, + { + "epoch": 0.56, + "grad_norm": 1.4063541989664001, + "learning_rate": 8.545545177665252e-06, + "loss": 0.768, + "step": 3754 + }, + { + "epoch": 0.56, + "grad_norm": 1.4509323019911897, + "learning_rate": 8.54076352342663e-06, + "loss": 0.8402, + "step": 3755 + }, + { + "epoch": 0.56, + "grad_norm": 1.4747712482252835, + "learning_rate": 8.535982210065797e-06, + "loss": 0.9576, + "step": 3756 + }, + { + "epoch": 0.56, + "grad_norm": 1.4200240889415054, + "learning_rate": 8.531201238699665e-06, + "loss": 0.8261, + "step": 3757 + }, + { + "epoch": 0.56, + "grad_norm": 1.3360012008582538, + "learning_rate": 8.52642061044507e-06, + "loss": 0.8501, + "step": 3758 + }, + { + "epoch": 0.56, + "grad_norm": 1.5472446190309326, + "learning_rate": 8.52164032641877e-06, + "loss": 0.8644, + "step": 3759 + }, + { + "epoch": 0.56, + "grad_norm": 1.469815897840417, + "learning_rate": 8.516860387737436e-06, + "loss": 0.7903, + "step": 3760 + }, + { + "epoch": 0.56, + "grad_norm": 1.6280436930700162, + "learning_rate": 8.512080795517662e-06, + "loss": 0.9012, + "step": 3761 + }, + { + "epoch": 0.56, + "grad_norm": 1.2568879364982588, + "learning_rate": 8.50730155087596e-06, + "loss": 0.8578, + "step": 3762 + }, + { + "epoch": 0.56, + "grad_norm": 1.720820360703925, + "learning_rate": 8.502522654928767e-06, + "loss": 0.8634, + "step": 3763 + }, + { + "epoch": 0.56, + "grad_norm": 1.544101591965494, + "learning_rate": 8.49774410879243e-06, + "loss": 0.842, + "step": 3764 + }, + { + "epoch": 0.56, + "grad_norm": 1.575814262104861, + "learning_rate": 8.492965913583222e-06, + "loss": 0.798, + "step": 3765 + }, + { + "epoch": 0.56, + "grad_norm": 1.4435210621339085, + "learning_rate": 8.488188070417323e-06, + "loss": 0.8032, + "step": 3766 + }, + { + "epoch": 0.56, + "grad_norm": 1.916004344681442, + "learning_rate": 8.483410580410843e-06, + "loss": 0.8702, + "step": 3767 + }, + { + "epoch": 0.56, + "grad_norm": 1.5064238258529847, + "learning_rate": 8.478633444679801e-06, + "loss": 0.7975, + "step": 3768 + }, + { + "epoch": 0.56, + "grad_norm": 1.4736029177617054, + "learning_rate": 8.47385666434014e-06, + "loss": 0.8767, + "step": 3769 + }, + { + "epoch": 0.56, + "grad_norm": 1.5350354992839987, + "learning_rate": 8.469080240507711e-06, + "loss": 0.883, + "step": 3770 + }, + { + "epoch": 0.56, + "grad_norm": 1.4472707936224234, + "learning_rate": 8.464304174298292e-06, + "loss": 0.8234, + "step": 3771 + }, + { + "epoch": 0.56, + "grad_norm": 1.386760113327573, + "learning_rate": 8.459528466827576e-06, + "loss": 0.884, + "step": 3772 + }, + { + "epoch": 0.56, + "grad_norm": 1.301808046486341, + "learning_rate": 8.454753119211158e-06, + "loss": 0.8764, + "step": 3773 + }, + { + "epoch": 0.56, + "grad_norm": 1.6143737389027555, + "learning_rate": 8.449978132564565e-06, + "loss": 0.8372, + "step": 3774 + }, + { + "epoch": 0.56, + "grad_norm": 1.461787736285232, + "learning_rate": 8.445203508003236e-06, + "loss": 0.8934, + "step": 3775 + }, + { + "epoch": 0.56, + "grad_norm": 1.4281086424614915, + "learning_rate": 8.44042924664252e-06, + "loss": 0.8477, + "step": 3776 + }, + { + "epoch": 0.56, + "grad_norm": 1.4311179030356793, + "learning_rate": 8.43565534959769e-06, + "loss": 0.8676, + "step": 3777 + }, + { + "epoch": 0.56, + "grad_norm": 1.3602770711709342, + "learning_rate": 8.430881817983931e-06, + "loss": 0.8781, + "step": 3778 + }, + { + "epoch": 0.56, + "grad_norm": 1.4737895802559111, + "learning_rate": 8.426108652916329e-06, + "loss": 0.8753, + "step": 3779 + }, + { + "epoch": 0.56, + "grad_norm": 1.4894731459733923, + "learning_rate": 8.421335855509903e-06, + "loss": 0.824, + "step": 3780 + }, + { + "epoch": 0.56, + "grad_norm": 1.2854324749803256, + "learning_rate": 8.41656342687958e-06, + "loss": 0.8785, + "step": 3781 + }, + { + "epoch": 0.56, + "grad_norm": 1.339716958144086, + "learning_rate": 8.411791368140197e-06, + "loss": 0.895, + "step": 3782 + }, + { + "epoch": 0.56, + "grad_norm": 1.5556465095252934, + "learning_rate": 8.407019680406508e-06, + "loss": 0.8271, + "step": 3783 + }, + { + "epoch": 0.56, + "grad_norm": 1.4461301745933728, + "learning_rate": 8.402248364793182e-06, + "loss": 0.7326, + "step": 3784 + }, + { + "epoch": 0.56, + "grad_norm": 1.4795888966719792, + "learning_rate": 8.3974774224148e-06, + "loss": 0.823, + "step": 3785 + }, + { + "epoch": 0.56, + "grad_norm": 1.4046169627155651, + "learning_rate": 8.392706854385847e-06, + "loss": 0.9134, + "step": 3786 + }, + { + "epoch": 0.57, + "grad_norm": 1.481080069935016, + "learning_rate": 8.387936661820733e-06, + "loss": 0.8109, + "step": 3787 + }, + { + "epoch": 0.57, + "grad_norm": 1.3783459734112478, + "learning_rate": 8.383166845833777e-06, + "loss": 0.9482, + "step": 3788 + }, + { + "epoch": 0.57, + "grad_norm": 1.5053364690658528, + "learning_rate": 8.378397407539209e-06, + "loss": 0.8239, + "step": 3789 + }, + { + "epoch": 0.57, + "grad_norm": 2.1832163660341024, + "learning_rate": 8.373628348051165e-06, + "loss": 0.8668, + "step": 3790 + }, + { + "epoch": 0.57, + "grad_norm": 1.381206535582202, + "learning_rate": 8.368859668483703e-06, + "loss": 0.8552, + "step": 3791 + }, + { + "epoch": 0.57, + "grad_norm": 1.3847250528942607, + "learning_rate": 8.364091369950783e-06, + "loss": 0.9023, + "step": 3792 + }, + { + "epoch": 0.57, + "grad_norm": 1.6363542060886245, + "learning_rate": 8.359323453566283e-06, + "loss": 0.7639, + "step": 3793 + }, + { + "epoch": 0.57, + "grad_norm": 1.3242330944218341, + "learning_rate": 8.354555920443987e-06, + "loss": 0.7948, + "step": 3794 + }, + { + "epoch": 0.57, + "grad_norm": 1.256241261646971, + "learning_rate": 8.349788771697593e-06, + "loss": 0.8451, + "step": 3795 + }, + { + "epoch": 0.57, + "grad_norm": 1.5169178257742937, + "learning_rate": 8.345022008440704e-06, + "loss": 0.8129, + "step": 3796 + }, + { + "epoch": 0.57, + "grad_norm": 1.5172240919489481, + "learning_rate": 8.340255631786843e-06, + "loss": 0.7435, + "step": 3797 + }, + { + "epoch": 0.57, + "grad_norm": 1.3758617553573378, + "learning_rate": 8.335489642849428e-06, + "loss": 0.8201, + "step": 3798 + }, + { + "epoch": 0.57, + "grad_norm": 1.4432807999052983, + "learning_rate": 8.3307240427418e-06, + "loss": 0.8063, + "step": 3799 + }, + { + "epoch": 0.57, + "grad_norm": 1.6474266970139628, + "learning_rate": 8.325958832577201e-06, + "loss": 0.8054, + "step": 3800 + }, + { + "epoch": 0.57, + "grad_norm": 2.6749639223464494, + "learning_rate": 8.321194013468786e-06, + "loss": 0.8441, + "step": 3801 + }, + { + "epoch": 0.57, + "grad_norm": 1.4026745257334892, + "learning_rate": 8.316429586529616e-06, + "loss": 0.8244, + "step": 3802 + }, + { + "epoch": 0.57, + "grad_norm": 1.2923184009681465, + "learning_rate": 8.311665552872662e-06, + "loss": 0.8592, + "step": 3803 + }, + { + "epoch": 0.57, + "grad_norm": 1.5419515788001992, + "learning_rate": 8.306901913610805e-06, + "loss": 0.8691, + "step": 3804 + }, + { + "epoch": 0.57, + "grad_norm": 1.4586798230035831, + "learning_rate": 8.30213866985683e-06, + "loss": 0.7941, + "step": 3805 + }, + { + "epoch": 0.57, + "grad_norm": 1.5242093380706372, + "learning_rate": 8.297375822723428e-06, + "loss": 0.8163, + "step": 3806 + }, + { + "epoch": 0.57, + "grad_norm": 1.3665116969556927, + "learning_rate": 8.292613373323203e-06, + "loss": 0.858, + "step": 3807 + }, + { + "epoch": 0.57, + "grad_norm": 1.6026898126638671, + "learning_rate": 8.287851322768666e-06, + "loss": 0.8313, + "step": 3808 + }, + { + "epoch": 0.57, + "grad_norm": 1.3705275254872518, + "learning_rate": 8.283089672172232e-06, + "loss": 0.8161, + "step": 3809 + }, + { + "epoch": 0.57, + "grad_norm": 1.4737242303126872, + "learning_rate": 8.278328422646222e-06, + "loss": 0.8358, + "step": 3810 + }, + { + "epoch": 0.57, + "grad_norm": 1.397857335888745, + "learning_rate": 8.273567575302861e-06, + "loss": 0.8586, + "step": 3811 + }, + { + "epoch": 0.57, + "grad_norm": 1.500025643437456, + "learning_rate": 8.268807131254288e-06, + "loss": 0.9056, + "step": 3812 + }, + { + "epoch": 0.57, + "grad_norm": 1.4417751545744042, + "learning_rate": 8.264047091612538e-06, + "loss": 0.8565, + "step": 3813 + }, + { + "epoch": 0.57, + "grad_norm": 1.4740418857953064, + "learning_rate": 8.259287457489564e-06, + "loss": 0.8447, + "step": 3814 + }, + { + "epoch": 0.57, + "grad_norm": 1.3443466784374998, + "learning_rate": 8.254528229997211e-06, + "loss": 0.8153, + "step": 3815 + }, + { + "epoch": 0.57, + "grad_norm": 1.493733502689942, + "learning_rate": 8.249769410247239e-06, + "loss": 0.8315, + "step": 3816 + }, + { + "epoch": 0.57, + "grad_norm": 1.4889474464071153, + "learning_rate": 8.24501099935131e-06, + "loss": 0.8845, + "step": 3817 + }, + { + "epoch": 0.57, + "grad_norm": 1.6813774002563011, + "learning_rate": 8.240252998420983e-06, + "loss": 0.8625, + "step": 3818 + }, + { + "epoch": 0.57, + "grad_norm": 1.468863988825738, + "learning_rate": 8.235495408567732e-06, + "loss": 0.7962, + "step": 3819 + }, + { + "epoch": 0.57, + "grad_norm": 1.6231877464369744, + "learning_rate": 8.230738230902928e-06, + "loss": 0.9005, + "step": 3820 + }, + { + "epoch": 0.57, + "grad_norm": 1.4096450955762019, + "learning_rate": 8.22598146653785e-06, + "loss": 0.7747, + "step": 3821 + }, + { + "epoch": 0.57, + "grad_norm": 1.6081042979083204, + "learning_rate": 8.221225116583677e-06, + "loss": 0.8344, + "step": 3822 + }, + { + "epoch": 0.57, + "grad_norm": 1.5459868375091343, + "learning_rate": 8.216469182151498e-06, + "loss": 0.8066, + "step": 3823 + }, + { + "epoch": 0.57, + "grad_norm": 1.3622485268888176, + "learning_rate": 8.211713664352293e-06, + "loss": 0.7873, + "step": 3824 + }, + { + "epoch": 0.57, + "grad_norm": 1.49143883013621, + "learning_rate": 8.206958564296954e-06, + "loss": 0.9116, + "step": 3825 + }, + { + "epoch": 0.57, + "grad_norm": 1.5818685530373284, + "learning_rate": 8.202203883096272e-06, + "loss": 0.7721, + "step": 3826 + }, + { + "epoch": 0.57, + "grad_norm": 0.923033014250726, + "learning_rate": 8.197449621860944e-06, + "loss": 0.3334, + "step": 3827 + }, + { + "epoch": 0.57, + "grad_norm": 1.5076111111776078, + "learning_rate": 8.192695781701562e-06, + "loss": 0.8287, + "step": 3828 + }, + { + "epoch": 0.57, + "grad_norm": 1.6755224196968321, + "learning_rate": 8.187942363728626e-06, + "loss": 0.8249, + "step": 3829 + }, + { + "epoch": 0.57, + "grad_norm": 1.6398240592332278, + "learning_rate": 8.183189369052538e-06, + "loss": 0.8649, + "step": 3830 + }, + { + "epoch": 0.57, + "grad_norm": 1.2389481994167582, + "learning_rate": 8.17843679878359e-06, + "loss": 0.8193, + "step": 3831 + }, + { + "epoch": 0.57, + "grad_norm": 1.4016039837669882, + "learning_rate": 8.17368465403199e-06, + "loss": 0.9018, + "step": 3832 + }, + { + "epoch": 0.57, + "grad_norm": 1.3596032529242756, + "learning_rate": 8.168932935907833e-06, + "loss": 0.8173, + "step": 3833 + }, + { + "epoch": 0.57, + "grad_norm": 1.3099226278553266, + "learning_rate": 8.164181645521126e-06, + "loss": 0.8756, + "step": 3834 + }, + { + "epoch": 0.57, + "grad_norm": 1.5078606881955319, + "learning_rate": 8.15943078398177e-06, + "loss": 0.8613, + "step": 3835 + }, + { + "epoch": 0.57, + "grad_norm": 1.6099830781875908, + "learning_rate": 8.15468035239957e-06, + "loss": 0.869, + "step": 3836 + }, + { + "epoch": 0.57, + "grad_norm": 1.5064763402148404, + "learning_rate": 8.14993035188422e-06, + "loss": 0.9439, + "step": 3837 + }, + { + "epoch": 0.57, + "grad_norm": 1.347387602193213, + "learning_rate": 8.145180783545324e-06, + "loss": 0.8585, + "step": 3838 + }, + { + "epoch": 0.57, + "grad_norm": 1.4095366720398286, + "learning_rate": 8.140431648492382e-06, + "loss": 0.8703, + "step": 3839 + }, + { + "epoch": 0.57, + "grad_norm": 1.227683166218185, + "learning_rate": 8.13568294783479e-06, + "loss": 0.8817, + "step": 3840 + }, + { + "epoch": 0.57, + "grad_norm": 1.4318849893685222, + "learning_rate": 8.130934682681849e-06, + "loss": 0.8773, + "step": 3841 + }, + { + "epoch": 0.57, + "grad_norm": 1.4975102867392485, + "learning_rate": 8.126186854142752e-06, + "loss": 0.8536, + "step": 3842 + }, + { + "epoch": 0.57, + "grad_norm": 1.4178935633973675, + "learning_rate": 8.121439463326597e-06, + "loss": 0.8583, + "step": 3843 + }, + { + "epoch": 0.57, + "grad_norm": 0.853345345505174, + "learning_rate": 8.116692511342365e-06, + "loss": 0.3621, + "step": 3844 + }, + { + "epoch": 0.57, + "grad_norm": 1.4643583078283737, + "learning_rate": 8.111945999298952e-06, + "loss": 0.8396, + "step": 3845 + }, + { + "epoch": 0.57, + "grad_norm": 1.5141879192662218, + "learning_rate": 8.10719992830514e-06, + "loss": 0.8872, + "step": 3846 + }, + { + "epoch": 0.57, + "grad_norm": 1.679919863308799, + "learning_rate": 8.102454299469615e-06, + "loss": 0.8201, + "step": 3847 + }, + { + "epoch": 0.57, + "grad_norm": 1.442904645268295, + "learning_rate": 8.097709113900956e-06, + "loss": 0.8832, + "step": 3848 + }, + { + "epoch": 0.57, + "grad_norm": 1.3972234830774664, + "learning_rate": 8.092964372707636e-06, + "loss": 0.9463, + "step": 3849 + }, + { + "epoch": 0.57, + "grad_norm": 1.4631665234466493, + "learning_rate": 8.088220076998029e-06, + "loss": 0.8648, + "step": 3850 + }, + { + "epoch": 0.57, + "grad_norm": 1.2591285943123254, + "learning_rate": 8.0834762278804e-06, + "loss": 0.8343, + "step": 3851 + }, + { + "epoch": 0.57, + "grad_norm": 1.4070522549833657, + "learning_rate": 8.078732826462917e-06, + "loss": 0.9144, + "step": 3852 + }, + { + "epoch": 0.57, + "grad_norm": 1.4661778160816774, + "learning_rate": 8.073989873853637e-06, + "loss": 0.8708, + "step": 3853 + }, + { + "epoch": 0.58, + "grad_norm": 1.5160750342810743, + "learning_rate": 8.069247371160514e-06, + "loss": 0.7942, + "step": 3854 + }, + { + "epoch": 0.58, + "grad_norm": 1.6119367166875718, + "learning_rate": 8.064505319491398e-06, + "loss": 0.8553, + "step": 3855 + }, + { + "epoch": 0.58, + "grad_norm": 1.367028429450525, + "learning_rate": 8.059763719954033e-06, + "loss": 0.8285, + "step": 3856 + }, + { + "epoch": 0.58, + "grad_norm": 1.4701219521135274, + "learning_rate": 8.055022573656055e-06, + "loss": 0.8257, + "step": 3857 + }, + { + "epoch": 0.58, + "grad_norm": 1.3831092656115833, + "learning_rate": 8.050281881704997e-06, + "loss": 0.8723, + "step": 3858 + }, + { + "epoch": 0.58, + "grad_norm": 1.5745489540895334, + "learning_rate": 8.045541645208288e-06, + "loss": 0.8047, + "step": 3859 + }, + { + "epoch": 0.58, + "grad_norm": 1.58253378951427, + "learning_rate": 8.040801865273243e-06, + "loss": 0.8515, + "step": 3860 + }, + { + "epoch": 0.58, + "grad_norm": 1.3477314444309891, + "learning_rate": 8.036062543007076e-06, + "loss": 0.8398, + "step": 3861 + }, + { + "epoch": 0.58, + "grad_norm": 1.366991053500511, + "learning_rate": 8.0313236795169e-06, + "loss": 0.9, + "step": 3862 + }, + { + "epoch": 0.58, + "grad_norm": 1.529866341519627, + "learning_rate": 8.026585275909704e-06, + "loss": 0.8634, + "step": 3863 + }, + { + "epoch": 0.58, + "grad_norm": 1.357561736825964, + "learning_rate": 8.021847333292388e-06, + "loss": 0.7932, + "step": 3864 + }, + { + "epoch": 0.58, + "grad_norm": 1.3669840204861803, + "learning_rate": 8.017109852771729e-06, + "loss": 0.8846, + "step": 3865 + }, + { + "epoch": 0.58, + "grad_norm": 2.0147996328907256, + "learning_rate": 8.012372835454406e-06, + "loss": 0.9027, + "step": 3866 + }, + { + "epoch": 0.58, + "grad_norm": 1.2348257898542403, + "learning_rate": 8.007636282446986e-06, + "loss": 0.8429, + "step": 3867 + }, + { + "epoch": 0.58, + "grad_norm": 1.3546989375261893, + "learning_rate": 8.00290019485593e-06, + "loss": 0.8412, + "step": 3868 + }, + { + "epoch": 0.58, + "grad_norm": 1.525857690282193, + "learning_rate": 7.998164573787592e-06, + "loss": 0.8739, + "step": 3869 + }, + { + "epoch": 0.58, + "grad_norm": 1.6262260629347693, + "learning_rate": 7.993429420348203e-06, + "loss": 0.8596, + "step": 3870 + }, + { + "epoch": 0.58, + "grad_norm": 1.51772982282246, + "learning_rate": 7.988694735643902e-06, + "loss": 0.8345, + "step": 3871 + }, + { + "epoch": 0.58, + "grad_norm": 1.4294823658326008, + "learning_rate": 7.983960520780712e-06, + "loss": 0.8575, + "step": 3872 + }, + { + "epoch": 0.58, + "grad_norm": 1.4107911669823776, + "learning_rate": 7.979226776864544e-06, + "loss": 0.7989, + "step": 3873 + }, + { + "epoch": 0.58, + "grad_norm": 1.4753793891234486, + "learning_rate": 7.974493505001201e-06, + "loss": 0.8148, + "step": 3874 + }, + { + "epoch": 0.58, + "grad_norm": 1.555913526812343, + "learning_rate": 7.969760706296385e-06, + "loss": 0.8568, + "step": 3875 + }, + { + "epoch": 0.58, + "grad_norm": 1.4406752774635811, + "learning_rate": 7.965028381855664e-06, + "loss": 0.8511, + "step": 3876 + }, + { + "epoch": 0.58, + "grad_norm": 1.438675105992132, + "learning_rate": 7.960296532784515e-06, + "loss": 0.8406, + "step": 3877 + }, + { + "epoch": 0.58, + "grad_norm": 1.5014848760493564, + "learning_rate": 7.9555651601883e-06, + "loss": 0.7237, + "step": 3878 + }, + { + "epoch": 0.58, + "grad_norm": 0.8863640571563507, + "learning_rate": 7.950834265172266e-06, + "loss": 0.3502, + "step": 3879 + }, + { + "epoch": 0.58, + "grad_norm": 1.2787713951768571, + "learning_rate": 7.946103848841554e-06, + "loss": 0.8035, + "step": 3880 + }, + { + "epoch": 0.58, + "grad_norm": 1.4279112764970951, + "learning_rate": 7.94137391230119e-06, + "loss": 0.8433, + "step": 3881 + }, + { + "epoch": 0.58, + "grad_norm": 1.5247138667577969, + "learning_rate": 7.936644456656082e-06, + "loss": 0.8462, + "step": 3882 + }, + { + "epoch": 0.58, + "grad_norm": 0.8057065507493604, + "learning_rate": 7.931915483011035e-06, + "loss": 0.3251, + "step": 3883 + }, + { + "epoch": 0.58, + "grad_norm": 0.8444376501041313, + "learning_rate": 7.927186992470736e-06, + "loss": 0.3292, + "step": 3884 + }, + { + "epoch": 0.58, + "grad_norm": 1.5524881935496142, + "learning_rate": 7.922458986139761e-06, + "loss": 0.8193, + "step": 3885 + }, + { + "epoch": 0.58, + "grad_norm": 1.5702087453470714, + "learning_rate": 7.917731465122576e-06, + "loss": 0.7837, + "step": 3886 + }, + { + "epoch": 0.58, + "grad_norm": 1.5440342483626506, + "learning_rate": 7.913004430523526e-06, + "loss": 0.7835, + "step": 3887 + }, + { + "epoch": 0.58, + "grad_norm": 1.4085996726858885, + "learning_rate": 7.908277883446855e-06, + "loss": 0.8379, + "step": 3888 + }, + { + "epoch": 0.58, + "grad_norm": 1.2960327900950117, + "learning_rate": 7.903551824996675e-06, + "loss": 0.8207, + "step": 3889 + }, + { + "epoch": 0.58, + "grad_norm": 1.3646501135471174, + "learning_rate": 7.898826256276995e-06, + "loss": 0.7851, + "step": 3890 + }, + { + "epoch": 0.58, + "grad_norm": 1.5404777440279533, + "learning_rate": 7.894101178391714e-06, + "loss": 0.8001, + "step": 3891 + }, + { + "epoch": 0.58, + "grad_norm": 1.4879105442914047, + "learning_rate": 7.889376592444605e-06, + "loss": 0.8203, + "step": 3892 + }, + { + "epoch": 0.58, + "grad_norm": 1.4130601742181295, + "learning_rate": 7.884652499539335e-06, + "loss": 0.8284, + "step": 3893 + }, + { + "epoch": 0.58, + "grad_norm": 1.5208452924547775, + "learning_rate": 7.879928900779457e-06, + "loss": 0.8893, + "step": 3894 + }, + { + "epoch": 0.58, + "grad_norm": 1.4347533744779224, + "learning_rate": 7.875205797268394e-06, + "loss": 0.8337, + "step": 3895 + }, + { + "epoch": 0.58, + "grad_norm": 1.5382185355053928, + "learning_rate": 7.870483190109468e-06, + "loss": 0.8698, + "step": 3896 + }, + { + "epoch": 0.58, + "grad_norm": 1.3558823877819808, + "learning_rate": 7.865761080405882e-06, + "loss": 0.8562, + "step": 3897 + }, + { + "epoch": 0.58, + "grad_norm": 1.4726484662768524, + "learning_rate": 7.86103946926072e-06, + "loss": 0.8643, + "step": 3898 + }, + { + "epoch": 0.58, + "grad_norm": 1.4108247968582, + "learning_rate": 7.85631835777695e-06, + "loss": 0.8056, + "step": 3899 + }, + { + "epoch": 0.58, + "grad_norm": 1.5382425113486704, + "learning_rate": 7.851597747057426e-06, + "loss": 0.8896, + "step": 3900 + }, + { + "epoch": 0.58, + "grad_norm": 1.63086058108348, + "learning_rate": 7.846877638204887e-06, + "loss": 0.8275, + "step": 3901 + }, + { + "epoch": 0.58, + "grad_norm": 1.553380579206785, + "learning_rate": 7.84215803232194e-06, + "loss": 0.8167, + "step": 3902 + }, + { + "epoch": 0.58, + "grad_norm": 1.5278258587184337, + "learning_rate": 7.837438930511093e-06, + "loss": 0.8325, + "step": 3903 + }, + { + "epoch": 0.58, + "grad_norm": 1.5868841567522767, + "learning_rate": 7.832720333874728e-06, + "loss": 0.8061, + "step": 3904 + }, + { + "epoch": 0.58, + "grad_norm": 1.4167538244565792, + "learning_rate": 7.828002243515107e-06, + "loss": 0.8431, + "step": 3905 + }, + { + "epoch": 0.58, + "grad_norm": 1.5835765438246021, + "learning_rate": 7.823284660534381e-06, + "loss": 0.8556, + "step": 3906 + }, + { + "epoch": 0.58, + "grad_norm": 1.5105778408107289, + "learning_rate": 7.818567586034578e-06, + "loss": 0.853, + "step": 3907 + }, + { + "epoch": 0.58, + "grad_norm": 1.3622683790283752, + "learning_rate": 7.8138510211176e-06, + "loss": 0.8336, + "step": 3908 + }, + { + "epoch": 0.58, + "grad_norm": 1.477763111073078, + "learning_rate": 7.80913496688524e-06, + "loss": 0.828, + "step": 3909 + }, + { + "epoch": 0.58, + "grad_norm": 1.560333593066661, + "learning_rate": 7.804419424439172e-06, + "loss": 0.8737, + "step": 3910 + }, + { + "epoch": 0.58, + "grad_norm": 1.46646249403625, + "learning_rate": 7.799704394880943e-06, + "loss": 0.8469, + "step": 3911 + }, + { + "epoch": 0.58, + "grad_norm": 1.3335479389617615, + "learning_rate": 7.794989879311991e-06, + "loss": 0.8321, + "step": 3912 + }, + { + "epoch": 0.58, + "grad_norm": 1.4148086244382008, + "learning_rate": 7.790275878833619e-06, + "loss": 0.85, + "step": 3913 + }, + { + "epoch": 0.58, + "grad_norm": 1.5599994546728069, + "learning_rate": 7.785562394547023e-06, + "loss": 0.8901, + "step": 3914 + }, + { + "epoch": 0.58, + "grad_norm": 1.416786162438713, + "learning_rate": 7.780849427553273e-06, + "loss": 0.8368, + "step": 3915 + }, + { + "epoch": 0.58, + "grad_norm": 1.2555798750528289, + "learning_rate": 7.776136978953316e-06, + "loss": 0.8113, + "step": 3916 + }, + { + "epoch": 0.58, + "grad_norm": 1.5308106775074777, + "learning_rate": 7.771425049847984e-06, + "loss": 0.8299, + "step": 3917 + }, + { + "epoch": 0.58, + "grad_norm": 1.357819158748458, + "learning_rate": 7.766713641337983e-06, + "loss": 0.8391, + "step": 3918 + }, + { + "epoch": 0.58, + "grad_norm": 1.593746994053119, + "learning_rate": 7.762002754523897e-06, + "loss": 0.853, + "step": 3919 + }, + { + "epoch": 0.58, + "grad_norm": 1.4946031353238858, + "learning_rate": 7.757292390506191e-06, + "loss": 0.8329, + "step": 3920 + }, + { + "epoch": 0.59, + "grad_norm": 1.4636789713683203, + "learning_rate": 7.752582550385204e-06, + "loss": 0.8565, + "step": 3921 + }, + { + "epoch": 0.59, + "grad_norm": 1.424063490420883, + "learning_rate": 7.747873235261157e-06, + "loss": 0.8976, + "step": 3922 + }, + { + "epoch": 0.59, + "grad_norm": 1.5471326909578724, + "learning_rate": 7.743164446234149e-06, + "loss": 0.7977, + "step": 3923 + }, + { + "epoch": 0.59, + "grad_norm": 1.3450696226406185, + "learning_rate": 7.73845618440415e-06, + "loss": 0.8387, + "step": 3924 + }, + { + "epoch": 0.59, + "grad_norm": 1.9266863073507474, + "learning_rate": 7.733748450871011e-06, + "loss": 0.8583, + "step": 3925 + }, + { + "epoch": 0.59, + "grad_norm": 1.5139731654905673, + "learning_rate": 7.729041246734462e-06, + "loss": 0.8228, + "step": 3926 + }, + { + "epoch": 0.59, + "grad_norm": 1.6790758017143192, + "learning_rate": 7.724334573094101e-06, + "loss": 0.8303, + "step": 3927 + }, + { + "epoch": 0.59, + "grad_norm": 1.3130255687124601, + "learning_rate": 7.719628431049413e-06, + "loss": 0.9192, + "step": 3928 + }, + { + "epoch": 0.59, + "grad_norm": 1.6282751755683016, + "learning_rate": 7.71492282169975e-06, + "loss": 0.9017, + "step": 3929 + }, + { + "epoch": 0.59, + "grad_norm": 1.4981723034110144, + "learning_rate": 7.710217746144341e-06, + "loss": 0.801, + "step": 3930 + }, + { + "epoch": 0.59, + "grad_norm": 1.341580442189316, + "learning_rate": 7.705513205482297e-06, + "loss": 0.8048, + "step": 3931 + }, + { + "epoch": 0.59, + "grad_norm": 1.487211009060395, + "learning_rate": 7.700809200812596e-06, + "loss": 0.7876, + "step": 3932 + }, + { + "epoch": 0.59, + "grad_norm": 1.5425565024024739, + "learning_rate": 7.696105733234099e-06, + "loss": 0.7763, + "step": 3933 + }, + { + "epoch": 0.59, + "grad_norm": 1.3098183559846637, + "learning_rate": 7.691402803845527e-06, + "loss": 0.7894, + "step": 3934 + }, + { + "epoch": 0.59, + "grad_norm": 1.3225369724392708, + "learning_rate": 7.68670041374549e-06, + "loss": 0.7881, + "step": 3935 + }, + { + "epoch": 0.59, + "grad_norm": 1.4478912031098143, + "learning_rate": 7.681998564032467e-06, + "loss": 0.7877, + "step": 3936 + }, + { + "epoch": 0.59, + "grad_norm": 1.3632588166082595, + "learning_rate": 7.677297255804811e-06, + "loss": 0.8345, + "step": 3937 + }, + { + "epoch": 0.59, + "grad_norm": 0.9737144663632736, + "learning_rate": 7.672596490160747e-06, + "loss": 0.3358, + "step": 3938 + }, + { + "epoch": 0.59, + "grad_norm": 1.5469877245318275, + "learning_rate": 7.667896268198379e-06, + "loss": 0.8507, + "step": 3939 + }, + { + "epoch": 0.59, + "grad_norm": 1.281300420176089, + "learning_rate": 7.66319659101567e-06, + "loss": 0.8628, + "step": 3940 + }, + { + "epoch": 0.59, + "grad_norm": 0.8431429296120123, + "learning_rate": 7.658497459710472e-06, + "loss": 0.3126, + "step": 3941 + }, + { + "epoch": 0.59, + "grad_norm": 1.4301530895471217, + "learning_rate": 7.6537988753805e-06, + "loss": 0.9024, + "step": 3942 + }, + { + "epoch": 0.59, + "grad_norm": 1.4004654006466817, + "learning_rate": 7.649100839123344e-06, + "loss": 0.8568, + "step": 3943 + }, + { + "epoch": 0.59, + "grad_norm": 1.4403571999624494, + "learning_rate": 7.644403352036467e-06, + "loss": 0.8231, + "step": 3944 + }, + { + "epoch": 0.59, + "grad_norm": 1.3580275721095185, + "learning_rate": 7.639706415217201e-06, + "loss": 0.8773, + "step": 3945 + }, + { + "epoch": 0.59, + "grad_norm": 1.5379641106208168, + "learning_rate": 7.635010029762755e-06, + "loss": 0.8644, + "step": 3946 + }, + { + "epoch": 0.59, + "grad_norm": 1.5661028612188268, + "learning_rate": 7.6303141967702e-06, + "loss": 0.8415, + "step": 3947 + }, + { + "epoch": 0.59, + "grad_norm": 1.5205939310782801, + "learning_rate": 7.625618917336482e-06, + "loss": 0.8753, + "step": 3948 + }, + { + "epoch": 0.59, + "grad_norm": 1.7425247483928072, + "learning_rate": 7.620924192558423e-06, + "loss": 0.8838, + "step": 3949 + }, + { + "epoch": 0.59, + "grad_norm": 1.44009310653146, + "learning_rate": 7.616230023532709e-06, + "loss": 0.8138, + "step": 3950 + }, + { + "epoch": 0.59, + "grad_norm": 1.3992381201574247, + "learning_rate": 7.611536411355899e-06, + "loss": 0.9108, + "step": 3951 + }, + { + "epoch": 0.59, + "grad_norm": 1.4791180656960128, + "learning_rate": 7.606843357124426e-06, + "loss": 0.814, + "step": 3952 + }, + { + "epoch": 0.59, + "grad_norm": 1.5128382842418315, + "learning_rate": 7.602150861934578e-06, + "loss": 0.8056, + "step": 3953 + }, + { + "epoch": 0.59, + "grad_norm": 0.8813251466069273, + "learning_rate": 7.597458926882529e-06, + "loss": 0.3489, + "step": 3954 + }, + { + "epoch": 0.59, + "grad_norm": 1.493765949818518, + "learning_rate": 7.592767553064312e-06, + "loss": 0.7881, + "step": 3955 + }, + { + "epoch": 0.59, + "grad_norm": 1.2748501101093348, + "learning_rate": 7.588076741575836e-06, + "loss": 0.8122, + "step": 3956 + }, + { + "epoch": 0.59, + "grad_norm": 1.3753928739469214, + "learning_rate": 7.583386493512872e-06, + "loss": 0.7931, + "step": 3957 + }, + { + "epoch": 0.59, + "grad_norm": 1.4579932300888518, + "learning_rate": 7.5786968099710645e-06, + "loss": 0.8817, + "step": 3958 + }, + { + "epoch": 0.59, + "grad_norm": 1.2391749294932688, + "learning_rate": 7.574007692045928e-06, + "loss": 0.7476, + "step": 3959 + }, + { + "epoch": 0.59, + "grad_norm": 1.5266460303116258, + "learning_rate": 7.5693191408328325e-06, + "loss": 0.8863, + "step": 3960 + }, + { + "epoch": 0.59, + "grad_norm": 1.4001846348188698, + "learning_rate": 7.564631157427027e-06, + "loss": 0.832, + "step": 3961 + }, + { + "epoch": 0.59, + "grad_norm": 1.3619921739411784, + "learning_rate": 7.559943742923626e-06, + "loss": 0.9073, + "step": 3962 + }, + { + "epoch": 0.59, + "grad_norm": 0.8096329160949032, + "learning_rate": 7.55525689841761e-06, + "loss": 0.3241, + "step": 3963 + }, + { + "epoch": 0.59, + "grad_norm": 1.4217503317475708, + "learning_rate": 7.550570625003826e-06, + "loss": 0.8777, + "step": 3964 + }, + { + "epoch": 0.59, + "grad_norm": 1.4624702641630836, + "learning_rate": 7.5458849237769915e-06, + "loss": 0.8274, + "step": 3965 + }, + { + "epoch": 0.59, + "grad_norm": 1.4223706120499529, + "learning_rate": 7.54119979583168e-06, + "loss": 0.8615, + "step": 3966 + }, + { + "epoch": 0.59, + "grad_norm": 1.3349460372149062, + "learning_rate": 7.536515242262341e-06, + "loss": 0.8587, + "step": 3967 + }, + { + "epoch": 0.59, + "grad_norm": 1.5094578701967627, + "learning_rate": 7.531831264163286e-06, + "loss": 0.8722, + "step": 3968 + }, + { + "epoch": 0.59, + "grad_norm": 0.8521364852081045, + "learning_rate": 7.527147862628695e-06, + "loss": 0.3333, + "step": 3969 + }, + { + "epoch": 0.59, + "grad_norm": 1.3390398588903791, + "learning_rate": 7.522465038752609e-06, + "loss": 0.9447, + "step": 3970 + }, + { + "epoch": 0.59, + "grad_norm": 1.5691904725772001, + "learning_rate": 7.517782793628938e-06, + "loss": 0.7773, + "step": 3971 + }, + { + "epoch": 0.59, + "grad_norm": 1.437712327161783, + "learning_rate": 7.513101128351454e-06, + "loss": 0.9071, + "step": 3972 + }, + { + "epoch": 0.59, + "grad_norm": 1.4371124910153377, + "learning_rate": 7.508420044013793e-06, + "loss": 0.8583, + "step": 3973 + }, + { + "epoch": 0.59, + "grad_norm": 1.434640774252635, + "learning_rate": 7.503739541709457e-06, + "loss": 0.8027, + "step": 3974 + }, + { + "epoch": 0.59, + "grad_norm": 1.5215308643823906, + "learning_rate": 7.499059622531812e-06, + "loss": 0.7998, + "step": 3975 + }, + { + "epoch": 0.59, + "grad_norm": 1.4233905619619556, + "learning_rate": 7.49438028757409e-06, + "loss": 0.9129, + "step": 3976 + }, + { + "epoch": 0.59, + "grad_norm": 1.6152575305538228, + "learning_rate": 7.489701537929384e-06, + "loss": 0.8575, + "step": 3977 + }, + { + "epoch": 0.59, + "grad_norm": 1.4684522699866303, + "learning_rate": 7.48502337469065e-06, + "loss": 0.8257, + "step": 3978 + }, + { + "epoch": 0.59, + "grad_norm": 1.2857423173607951, + "learning_rate": 7.480345798950702e-06, + "loss": 0.8252, + "step": 3979 + }, + { + "epoch": 0.59, + "grad_norm": 1.6244608369775477, + "learning_rate": 7.475668811802228e-06, + "loss": 0.7919, + "step": 3980 + }, + { + "epoch": 0.59, + "grad_norm": 1.660382677232523, + "learning_rate": 7.4709924143377696e-06, + "loss": 0.7712, + "step": 3981 + }, + { + "epoch": 0.59, + "grad_norm": 1.4466859342528529, + "learning_rate": 7.4663166076497376e-06, + "loss": 0.8693, + "step": 3982 + }, + { + "epoch": 0.59, + "grad_norm": 1.253153635260305, + "learning_rate": 7.4616413928303964e-06, + "loss": 0.8288, + "step": 3983 + }, + { + "epoch": 0.59, + "grad_norm": 1.432949832347982, + "learning_rate": 7.456966770971882e-06, + "loss": 0.8611, + "step": 3984 + }, + { + "epoch": 0.59, + "grad_norm": 1.654687288755272, + "learning_rate": 7.4522927431661805e-06, + "loss": 0.7739, + "step": 3985 + }, + { + "epoch": 0.59, + "grad_norm": 1.4540036618777414, + "learning_rate": 7.447619310505147e-06, + "loss": 0.8273, + "step": 3986 + }, + { + "epoch": 0.59, + "grad_norm": 1.4622291844531243, + "learning_rate": 7.442946474080499e-06, + "loss": 0.7933, + "step": 3987 + }, + { + "epoch": 0.6, + "grad_norm": 1.3514932789744367, + "learning_rate": 7.438274234983809e-06, + "loss": 0.8039, + "step": 3988 + }, + { + "epoch": 0.6, + "grad_norm": 1.5514088136788373, + "learning_rate": 7.433602594306512e-06, + "loss": 0.887, + "step": 3989 + }, + { + "epoch": 0.6, + "grad_norm": 1.5322458868953754, + "learning_rate": 7.428931553139904e-06, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 0.6, + "grad_norm": 1.384575511972655, + "learning_rate": 7.4242611125751445e-06, + "loss": 0.8497, + "step": 3991 + }, + { + "epoch": 0.6, + "grad_norm": 1.3213976895042756, + "learning_rate": 7.419591273703245e-06, + "loss": 0.812, + "step": 3992 + }, + { + "epoch": 0.6, + "grad_norm": 1.5716519668692461, + "learning_rate": 7.414922037615079e-06, + "loss": 0.8533, + "step": 3993 + }, + { + "epoch": 0.6, + "grad_norm": 1.5355347459178688, + "learning_rate": 7.410253405401382e-06, + "loss": 0.8135, + "step": 3994 + }, + { + "epoch": 0.6, + "grad_norm": 0.939914269984516, + "learning_rate": 7.405585378152749e-06, + "loss": 0.3249, + "step": 3995 + }, + { + "epoch": 0.6, + "grad_norm": 1.5646833019493598, + "learning_rate": 7.400917956959628e-06, + "loss": 0.8355, + "step": 3996 + }, + { + "epoch": 0.6, + "grad_norm": 1.4322248850050128, + "learning_rate": 7.396251142912337e-06, + "loss": 0.8362, + "step": 3997 + }, + { + "epoch": 0.6, + "grad_norm": 1.235414863743788, + "learning_rate": 7.391584937101034e-06, + "loss": 0.8693, + "step": 3998 + }, + { + "epoch": 0.6, + "grad_norm": 1.294460085551339, + "learning_rate": 7.386919340615749e-06, + "loss": 0.7786, + "step": 3999 + }, + { + "epoch": 0.6, + "grad_norm": 1.5092882034419526, + "learning_rate": 7.382254354546367e-06, + "loss": 0.845, + "step": 4000 + }, + { + "epoch": 0.6, + "grad_norm": 1.2960092659583327, + "learning_rate": 7.377589979982628e-06, + "loss": 0.905, + "step": 4001 + }, + { + "epoch": 0.6, + "grad_norm": 1.3597418855897803, + "learning_rate": 7.372926218014131e-06, + "loss": 0.8258, + "step": 4002 + }, + { + "epoch": 0.6, + "grad_norm": 1.4807505565862369, + "learning_rate": 7.368263069730331e-06, + "loss": 0.8712, + "step": 4003 + }, + { + "epoch": 0.6, + "grad_norm": 1.3377627081873373, + "learning_rate": 7.363600536220546e-06, + "loss": 0.8534, + "step": 4004 + }, + { + "epoch": 0.6, + "grad_norm": 1.4402833172413079, + "learning_rate": 7.358938618573932e-06, + "loss": 0.864, + "step": 4005 + }, + { + "epoch": 0.6, + "grad_norm": 1.2655980407527947, + "learning_rate": 7.3542773178795216e-06, + "loss": 0.8662, + "step": 4006 + }, + { + "epoch": 0.6, + "grad_norm": 1.6481355248148812, + "learning_rate": 7.349616635226191e-06, + "loss": 0.7618, + "step": 4007 + }, + { + "epoch": 0.6, + "grad_norm": 1.2883529776713392, + "learning_rate": 7.344956571702679e-06, + "loss": 0.8389, + "step": 4008 + }, + { + "epoch": 0.6, + "grad_norm": 1.4733728624458065, + "learning_rate": 7.340297128397577e-06, + "loss": 0.8307, + "step": 4009 + }, + { + "epoch": 0.6, + "grad_norm": 1.3365062163973156, + "learning_rate": 7.335638306399337e-06, + "loss": 0.8739, + "step": 4010 + }, + { + "epoch": 0.6, + "grad_norm": 1.3278711160455579, + "learning_rate": 7.330980106796247e-06, + "loss": 0.8348, + "step": 4011 + }, + { + "epoch": 0.6, + "grad_norm": 1.4355968194175286, + "learning_rate": 7.326322530676471e-06, + "loss": 0.9093, + "step": 4012 + }, + { + "epoch": 0.6, + "grad_norm": 0.8143306063703842, + "learning_rate": 7.3216655791280175e-06, + "loss": 0.3316, + "step": 4013 + }, + { + "epoch": 0.6, + "grad_norm": 1.360521303542236, + "learning_rate": 7.317009253238752e-06, + "loss": 0.8919, + "step": 4014 + }, + { + "epoch": 0.6, + "grad_norm": 0.7935957872402526, + "learning_rate": 7.312353554096393e-06, + "loss": 0.3349, + "step": 4015 + }, + { + "epoch": 0.6, + "grad_norm": 1.4203158524655182, + "learning_rate": 7.30769848278851e-06, + "loss": 0.8608, + "step": 4016 + }, + { + "epoch": 0.6, + "grad_norm": 1.2636815900591305, + "learning_rate": 7.303044040402536e-06, + "loss": 0.8244, + "step": 4017 + }, + { + "epoch": 0.6, + "grad_norm": 1.380026898889672, + "learning_rate": 7.298390228025737e-06, + "loss": 0.9059, + "step": 4018 + }, + { + "epoch": 0.6, + "grad_norm": 1.3105225852159377, + "learning_rate": 7.293737046745249e-06, + "loss": 0.8377, + "step": 4019 + }, + { + "epoch": 0.6, + "grad_norm": 1.5454422415586047, + "learning_rate": 7.289084497648058e-06, + "loss": 0.8803, + "step": 4020 + }, + { + "epoch": 0.6, + "grad_norm": 1.4179957166582344, + "learning_rate": 7.284432581820998e-06, + "loss": 0.8246, + "step": 4021 + }, + { + "epoch": 0.6, + "grad_norm": 1.2551732959068278, + "learning_rate": 7.279781300350758e-06, + "loss": 0.8108, + "step": 4022 + }, + { + "epoch": 0.6, + "grad_norm": 1.48777467636917, + "learning_rate": 7.2751306543238805e-06, + "loss": 0.8819, + "step": 4023 + }, + { + "epoch": 0.6, + "grad_norm": 1.5658873470893908, + "learning_rate": 7.27048064482675e-06, + "loss": 0.8779, + "step": 4024 + }, + { + "epoch": 0.6, + "grad_norm": 1.4329109423795188, + "learning_rate": 7.265831272945612e-06, + "loss": 0.8751, + "step": 4025 + }, + { + "epoch": 0.6, + "grad_norm": 1.3207738870079062, + "learning_rate": 7.261182539766563e-06, + "loss": 0.8604, + "step": 4026 + }, + { + "epoch": 0.6, + "grad_norm": 1.4049262717461986, + "learning_rate": 7.256534446375543e-06, + "loss": 0.8636, + "step": 4027 + }, + { + "epoch": 0.6, + "grad_norm": 1.4255930645742714, + "learning_rate": 7.25188699385835e-06, + "loss": 0.8658, + "step": 4028 + }, + { + "epoch": 0.6, + "grad_norm": 1.485431652681221, + "learning_rate": 7.24724018330063e-06, + "loss": 0.8162, + "step": 4029 + }, + { + "epoch": 0.6, + "grad_norm": 1.2862814453139493, + "learning_rate": 7.242594015787883e-06, + "loss": 0.8189, + "step": 4030 + }, + { + "epoch": 0.6, + "grad_norm": 1.4160652178268391, + "learning_rate": 7.237948492405442e-06, + "loss": 0.7981, + "step": 4031 + }, + { + "epoch": 0.6, + "grad_norm": 1.436552380672859, + "learning_rate": 7.23330361423851e-06, + "loss": 0.7666, + "step": 4032 + }, + { + "epoch": 0.6, + "grad_norm": 1.498942457490684, + "learning_rate": 7.228659382372127e-06, + "loss": 0.9183, + "step": 4033 + }, + { + "epoch": 0.6, + "grad_norm": 1.4511328675275896, + "learning_rate": 7.224015797891191e-06, + "loss": 0.8301, + "step": 4034 + }, + { + "epoch": 0.6, + "grad_norm": 1.3815665760963811, + "learning_rate": 7.21937286188044e-06, + "loss": 0.7797, + "step": 4035 + }, + { + "epoch": 0.6, + "grad_norm": 1.6364873708904342, + "learning_rate": 7.214730575424469e-06, + "loss": 0.8685, + "step": 4036 + }, + { + "epoch": 0.6, + "grad_norm": 1.4193385302004342, + "learning_rate": 7.210088939607709e-06, + "loss": 0.7878, + "step": 4037 + }, + { + "epoch": 0.6, + "grad_norm": 1.5728229950466202, + "learning_rate": 7.205447955514451e-06, + "loss": 0.8154, + "step": 4038 + }, + { + "epoch": 0.6, + "grad_norm": 1.418169133553306, + "learning_rate": 7.20080762422883e-06, + "loss": 0.8431, + "step": 4039 + }, + { + "epoch": 0.6, + "grad_norm": 1.4721707491113019, + "learning_rate": 7.196167946834826e-06, + "loss": 0.8158, + "step": 4040 + }, + { + "epoch": 0.6, + "grad_norm": 1.370917171671462, + "learning_rate": 7.191528924416271e-06, + "loss": 0.87, + "step": 4041 + }, + { + "epoch": 0.6, + "grad_norm": 1.4760456896367928, + "learning_rate": 7.186890558056836e-06, + "loss": 0.8774, + "step": 4042 + }, + { + "epoch": 0.6, + "grad_norm": 1.2016534088205737, + "learning_rate": 7.18225284884005e-06, + "loss": 0.8819, + "step": 4043 + }, + { + "epoch": 0.6, + "grad_norm": 1.6612606122766522, + "learning_rate": 7.177615797849278e-06, + "loss": 0.8183, + "step": 4044 + }, + { + "epoch": 0.6, + "grad_norm": 1.4284650854433951, + "learning_rate": 7.172979406167735e-06, + "loss": 0.7784, + "step": 4045 + }, + { + "epoch": 0.6, + "grad_norm": 1.3178468598112467, + "learning_rate": 7.168343674878487e-06, + "loss": 0.778, + "step": 4046 + }, + { + "epoch": 0.6, + "grad_norm": 1.3576375844798225, + "learning_rate": 7.163708605064437e-06, + "loss": 0.8331, + "step": 4047 + }, + { + "epoch": 0.6, + "grad_norm": 1.3543054566112847, + "learning_rate": 7.15907419780834e-06, + "loss": 0.8668, + "step": 4048 + }, + { + "epoch": 0.6, + "grad_norm": 1.3403824885308662, + "learning_rate": 7.154440454192793e-06, + "loss": 0.9069, + "step": 4049 + }, + { + "epoch": 0.6, + "grad_norm": 0.8448849237433186, + "learning_rate": 7.149807375300239e-06, + "loss": 0.3267, + "step": 4050 + }, + { + "epoch": 0.6, + "grad_norm": 1.3465822224689834, + "learning_rate": 7.145174962212969e-06, + "loss": 0.8171, + "step": 4051 + }, + { + "epoch": 0.6, + "grad_norm": 1.6501537412576142, + "learning_rate": 7.140543216013109e-06, + "loss": 0.949, + "step": 4052 + }, + { + "epoch": 0.6, + "grad_norm": 0.9552282505973152, + "learning_rate": 7.135912137782639e-06, + "loss": 0.3141, + "step": 4053 + }, + { + "epoch": 0.6, + "grad_norm": 0.8572351006782111, + "learning_rate": 7.1312817286033784e-06, + "loss": 0.3467, + "step": 4054 + }, + { + "epoch": 0.61, + "grad_norm": 1.4218916250137275, + "learning_rate": 7.126651989556994e-06, + "loss": 0.8139, + "step": 4055 + }, + { + "epoch": 0.61, + "grad_norm": 1.4618065691959776, + "learning_rate": 7.122022921724993e-06, + "loss": 0.8941, + "step": 4056 + }, + { + "epoch": 0.61, + "grad_norm": 1.528894395835897, + "learning_rate": 7.117394526188719e-06, + "loss": 0.925, + "step": 4057 + }, + { + "epoch": 0.61, + "grad_norm": 1.4229688720414047, + "learning_rate": 7.112766804029373e-06, + "loss": 0.7987, + "step": 4058 + }, + { + "epoch": 0.61, + "grad_norm": 1.603676423154499, + "learning_rate": 7.1081397563279885e-06, + "loss": 0.835, + "step": 4059 + }, + { + "epoch": 0.61, + "grad_norm": 1.4642568009968235, + "learning_rate": 7.103513384165446e-06, + "loss": 0.8365, + "step": 4060 + }, + { + "epoch": 0.61, + "grad_norm": 1.5779818337718972, + "learning_rate": 7.0988876886224635e-06, + "loss": 0.7563, + "step": 4061 + }, + { + "epoch": 0.61, + "grad_norm": 1.6644244292945192, + "learning_rate": 7.094262670779611e-06, + "loss": 0.74, + "step": 4062 + }, + { + "epoch": 0.61, + "grad_norm": 0.9530389123482829, + "learning_rate": 7.0896383317172845e-06, + "loss": 0.3291, + "step": 4063 + }, + { + "epoch": 0.61, + "grad_norm": 1.4439081115502193, + "learning_rate": 7.085014672515733e-06, + "loss": 0.7958, + "step": 4064 + }, + { + "epoch": 0.61, + "grad_norm": 1.48300673892835, + "learning_rate": 7.080391694255045e-06, + "loss": 0.8805, + "step": 4065 + }, + { + "epoch": 0.61, + "grad_norm": 1.4814951208714935, + "learning_rate": 7.075769398015147e-06, + "loss": 0.9266, + "step": 4066 + }, + { + "epoch": 0.61, + "grad_norm": 1.5077635659544213, + "learning_rate": 7.071147784875809e-06, + "loss": 0.906, + "step": 4067 + }, + { + "epoch": 0.61, + "grad_norm": 1.525242014299514, + "learning_rate": 7.066526855916647e-06, + "loss": 0.8863, + "step": 4068 + }, + { + "epoch": 0.61, + "grad_norm": 1.595501849380148, + "learning_rate": 7.061906612217096e-06, + "loss": 0.8448, + "step": 4069 + }, + { + "epoch": 0.61, + "grad_norm": 1.5060984369799253, + "learning_rate": 7.057287054856455e-06, + "loss": 0.8336, + "step": 4070 + }, + { + "epoch": 0.61, + "grad_norm": 1.3920515704772587, + "learning_rate": 7.052668184913851e-06, + "loss": 0.817, + "step": 4071 + }, + { + "epoch": 0.61, + "grad_norm": 1.413615194197871, + "learning_rate": 7.048050003468252e-06, + "loss": 0.823, + "step": 4072 + }, + { + "epoch": 0.61, + "grad_norm": 1.540746389736846, + "learning_rate": 7.043432511598467e-06, + "loss": 0.9207, + "step": 4073 + }, + { + "epoch": 0.61, + "grad_norm": 1.2531081865614424, + "learning_rate": 7.038815710383141e-06, + "loss": 0.8804, + "step": 4074 + }, + { + "epoch": 0.61, + "grad_norm": 1.5550505044204708, + "learning_rate": 7.034199600900765e-06, + "loss": 0.939, + "step": 4075 + }, + { + "epoch": 0.61, + "grad_norm": 1.4761354330873884, + "learning_rate": 7.029584184229653e-06, + "loss": 0.8422, + "step": 4076 + }, + { + "epoch": 0.61, + "grad_norm": 0.7945827105032043, + "learning_rate": 7.024969461447973e-06, + "loss": 0.3307, + "step": 4077 + }, + { + "epoch": 0.61, + "grad_norm": 1.4962021549382294, + "learning_rate": 7.0203554336337206e-06, + "loss": 0.8578, + "step": 4078 + }, + { + "epoch": 0.61, + "grad_norm": 1.387360633797198, + "learning_rate": 7.015742101864737e-06, + "loss": 0.8725, + "step": 4079 + }, + { + "epoch": 0.61, + "grad_norm": 1.3947455938686388, + "learning_rate": 7.011129467218696e-06, + "loss": 0.8815, + "step": 4080 + }, + { + "epoch": 0.61, + "grad_norm": 1.300912888341301, + "learning_rate": 7.006517530773113e-06, + "loss": 0.774, + "step": 4081 + }, + { + "epoch": 0.61, + "grad_norm": 1.2872764792959677, + "learning_rate": 7.001906293605329e-06, + "loss": 0.8299, + "step": 4082 + }, + { + "epoch": 0.61, + "grad_norm": 1.4972984957501676, + "learning_rate": 6.9972957567925325e-06, + "loss": 0.8413, + "step": 4083 + }, + { + "epoch": 0.61, + "grad_norm": 1.434924057387539, + "learning_rate": 6.992685921411748e-06, + "loss": 0.8089, + "step": 4084 + }, + { + "epoch": 0.61, + "grad_norm": 1.3539369272860728, + "learning_rate": 6.98807678853983e-06, + "loss": 0.8128, + "step": 4085 + }, + { + "epoch": 0.61, + "grad_norm": 1.3362311924354842, + "learning_rate": 6.983468359253476e-06, + "loss": 0.8295, + "step": 4086 + }, + { + "epoch": 0.61, + "grad_norm": 1.4891369608012521, + "learning_rate": 6.978860634629213e-06, + "loss": 0.867, + "step": 4087 + }, + { + "epoch": 0.61, + "grad_norm": 1.3185065858384972, + "learning_rate": 6.9742536157434114e-06, + "loss": 0.8652, + "step": 4088 + }, + { + "epoch": 0.61, + "grad_norm": 1.5313936477545826, + "learning_rate": 6.969647303672262e-06, + "loss": 0.8704, + "step": 4089 + }, + { + "epoch": 0.61, + "grad_norm": 0.7551649233931472, + "learning_rate": 6.965041699491804e-06, + "loss": 0.3241, + "step": 4090 + }, + { + "epoch": 0.61, + "grad_norm": 0.8106437550522526, + "learning_rate": 6.960436804277908e-06, + "loss": 0.3507, + "step": 4091 + }, + { + "epoch": 0.61, + "grad_norm": 1.4268161427547736, + "learning_rate": 6.9558326191062775e-06, + "loss": 0.8902, + "step": 4092 + }, + { + "epoch": 0.61, + "grad_norm": 1.5353929359035485, + "learning_rate": 6.95122914505245e-06, + "loss": 0.7933, + "step": 4093 + }, + { + "epoch": 0.61, + "grad_norm": 1.730421795434039, + "learning_rate": 6.9466263831918015e-06, + "loss": 0.8664, + "step": 4094 + }, + { + "epoch": 0.61, + "grad_norm": 1.7210413116591243, + "learning_rate": 6.942024334599531e-06, + "loss": 0.8472, + "step": 4095 + }, + { + "epoch": 0.61, + "grad_norm": 1.466800227459811, + "learning_rate": 6.93742300035068e-06, + "loss": 0.7794, + "step": 4096 + }, + { + "epoch": 0.61, + "grad_norm": 1.3937999293018455, + "learning_rate": 6.932822381520121e-06, + "loss": 0.7822, + "step": 4097 + }, + { + "epoch": 0.61, + "grad_norm": 1.558196612961541, + "learning_rate": 6.92822247918256e-06, + "loss": 0.8065, + "step": 4098 + }, + { + "epoch": 0.61, + "grad_norm": 1.4331822129452696, + "learning_rate": 6.923623294412534e-06, + "loss": 0.8168, + "step": 4099 + }, + { + "epoch": 0.61, + "grad_norm": 1.4815568397903058, + "learning_rate": 6.919024828284413e-06, + "loss": 0.8234, + "step": 4100 + }, + { + "epoch": 0.61, + "grad_norm": 1.5729062013456891, + "learning_rate": 6.914427081872401e-06, + "loss": 0.8991, + "step": 4101 + }, + { + "epoch": 0.61, + "grad_norm": 0.8936306186370127, + "learning_rate": 6.909830056250527e-06, + "loss": 0.3264, + "step": 4102 + }, + { + "epoch": 0.61, + "grad_norm": 1.5144505805052484, + "learning_rate": 6.90523375249266e-06, + "loss": 0.8432, + "step": 4103 + }, + { + "epoch": 0.61, + "grad_norm": 1.4348186371479237, + "learning_rate": 6.900638171672497e-06, + "loss": 0.8396, + "step": 4104 + }, + { + "epoch": 0.61, + "grad_norm": 1.595521510328429, + "learning_rate": 6.896043314863568e-06, + "loss": 0.9411, + "step": 4105 + }, + { + "epoch": 0.61, + "grad_norm": 1.3618433523264837, + "learning_rate": 6.891449183139227e-06, + "loss": 0.7777, + "step": 4106 + }, + { + "epoch": 0.61, + "grad_norm": 1.324848945010024, + "learning_rate": 6.88685577757267e-06, + "loss": 0.8853, + "step": 4107 + }, + { + "epoch": 0.61, + "grad_norm": 1.7554898133009502, + "learning_rate": 6.88226309923691e-06, + "loss": 0.8283, + "step": 4108 + }, + { + "epoch": 0.61, + "grad_norm": 1.3576531233824523, + "learning_rate": 6.877671149204801e-06, + "loss": 0.891, + "step": 4109 + }, + { + "epoch": 0.61, + "grad_norm": 1.47677119990268, + "learning_rate": 6.873079928549024e-06, + "loss": 0.8275, + "step": 4110 + }, + { + "epoch": 0.61, + "grad_norm": 0.8488148487446681, + "learning_rate": 6.8684894383420865e-06, + "loss": 0.3182, + "step": 4111 + }, + { + "epoch": 0.61, + "grad_norm": 1.4415400651002197, + "learning_rate": 6.8638996796563275e-06, + "loss": 0.8724, + "step": 4112 + }, + { + "epoch": 0.61, + "grad_norm": 1.4167742486003656, + "learning_rate": 6.859310653563917e-06, + "loss": 0.8249, + "step": 4113 + }, + { + "epoch": 0.61, + "grad_norm": 1.4793070611232852, + "learning_rate": 6.854722361136851e-06, + "loss": 0.8581, + "step": 4114 + }, + { + "epoch": 0.61, + "grad_norm": 1.2894146698282294, + "learning_rate": 6.850134803446955e-06, + "loss": 0.8169, + "step": 4115 + }, + { + "epoch": 0.61, + "grad_norm": 1.4877944347848857, + "learning_rate": 6.845547981565881e-06, + "loss": 0.8318, + "step": 4116 + }, + { + "epoch": 0.61, + "grad_norm": 0.8799505493476157, + "learning_rate": 6.8409618965651125e-06, + "loss": 0.3599, + "step": 4117 + }, + { + "epoch": 0.61, + "grad_norm": 1.467202015709495, + "learning_rate": 6.83637654951596e-06, + "loss": 0.7922, + "step": 4118 + }, + { + "epoch": 0.61, + "grad_norm": 1.4664981184763466, + "learning_rate": 6.83179194148956e-06, + "loss": 0.9283, + "step": 4119 + }, + { + "epoch": 0.61, + "grad_norm": 1.399346106397274, + "learning_rate": 6.827208073556884e-06, + "loss": 0.8478, + "step": 4120 + }, + { + "epoch": 0.61, + "grad_norm": 1.4130029719896253, + "learning_rate": 6.8226249467887115e-06, + "loss": 0.7969, + "step": 4121 + }, + { + "epoch": 0.62, + "grad_norm": 1.5657724943944973, + "learning_rate": 6.81804256225567e-06, + "loss": 0.8389, + "step": 4122 + }, + { + "epoch": 0.62, + "grad_norm": 1.4375012653023915, + "learning_rate": 6.813460921028202e-06, + "loss": 0.9325, + "step": 4123 + }, + { + "epoch": 0.62, + "grad_norm": 1.4011686935473764, + "learning_rate": 6.808880024176581e-06, + "loss": 0.8124, + "step": 4124 + }, + { + "epoch": 0.62, + "grad_norm": 1.3735590246834153, + "learning_rate": 6.804299872770903e-06, + "loss": 0.891, + "step": 4125 + }, + { + "epoch": 0.62, + "grad_norm": 1.4998532014778108, + "learning_rate": 6.799720467881099e-06, + "loss": 0.7542, + "step": 4126 + }, + { + "epoch": 0.62, + "grad_norm": 1.5551702095063675, + "learning_rate": 6.795141810576906e-06, + "loss": 0.8109, + "step": 4127 + }, + { + "epoch": 0.62, + "grad_norm": 1.5565704224573844, + "learning_rate": 6.790563901927907e-06, + "loss": 0.7405, + "step": 4128 + }, + { + "epoch": 0.62, + "grad_norm": 1.3945263173312614, + "learning_rate": 6.7859867430035006e-06, + "loss": 0.8928, + "step": 4129 + }, + { + "epoch": 0.62, + "grad_norm": 1.593621447464884, + "learning_rate": 6.781410334872911e-06, + "loss": 0.7722, + "step": 4130 + }, + { + "epoch": 0.62, + "grad_norm": 1.238250226615948, + "learning_rate": 6.776834678605186e-06, + "loss": 0.7682, + "step": 4131 + }, + { + "epoch": 0.62, + "grad_norm": 1.3975788469241808, + "learning_rate": 6.772259775269203e-06, + "loss": 0.7892, + "step": 4132 + }, + { + "epoch": 0.62, + "grad_norm": 1.376254726393351, + "learning_rate": 6.767685625933662e-06, + "loss": 0.8457, + "step": 4133 + }, + { + "epoch": 0.62, + "grad_norm": 1.383936034280788, + "learning_rate": 6.763112231667076e-06, + "loss": 0.9207, + "step": 4134 + }, + { + "epoch": 0.62, + "grad_norm": 1.4866025911878868, + "learning_rate": 6.758539593537796e-06, + "loss": 0.7821, + "step": 4135 + }, + { + "epoch": 0.62, + "grad_norm": 1.4194263370146019, + "learning_rate": 6.7539677126139894e-06, + "loss": 0.7913, + "step": 4136 + }, + { + "epoch": 0.62, + "grad_norm": 1.433294867976207, + "learning_rate": 6.7493965899636486e-06, + "loss": 0.7948, + "step": 4137 + }, + { + "epoch": 0.62, + "grad_norm": 1.3901480971400966, + "learning_rate": 6.744826226654587e-06, + "loss": 0.8548, + "step": 4138 + }, + { + "epoch": 0.62, + "grad_norm": 1.5320051056295763, + "learning_rate": 6.740256623754448e-06, + "loss": 0.8665, + "step": 4139 + }, + { + "epoch": 0.62, + "grad_norm": 1.4649678273104199, + "learning_rate": 6.735687782330683e-06, + "loss": 0.8402, + "step": 4140 + }, + { + "epoch": 0.62, + "grad_norm": 1.3889835104566026, + "learning_rate": 6.731119703450577e-06, + "loss": 0.7785, + "step": 4141 + }, + { + "epoch": 0.62, + "grad_norm": 1.4424155650620423, + "learning_rate": 6.7265523881812335e-06, + "loss": 0.8174, + "step": 4142 + }, + { + "epoch": 0.62, + "grad_norm": 1.5899231150451114, + "learning_rate": 6.7219858375895785e-06, + "loss": 0.7566, + "step": 4143 + }, + { + "epoch": 0.62, + "grad_norm": 1.3927449959694322, + "learning_rate": 6.717420052742358e-06, + "loss": 0.7821, + "step": 4144 + }, + { + "epoch": 0.62, + "grad_norm": 1.480147895668604, + "learning_rate": 6.712855034706141e-06, + "loss": 0.8191, + "step": 4145 + }, + { + "epoch": 0.62, + "grad_norm": 1.5231233181130175, + "learning_rate": 6.70829078454732e-06, + "loss": 0.8902, + "step": 4146 + }, + { + "epoch": 0.62, + "grad_norm": 1.6688998413141465, + "learning_rate": 6.703727303332094e-06, + "loss": 0.8547, + "step": 4147 + }, + { + "epoch": 0.62, + "grad_norm": 1.487571873901898, + "learning_rate": 6.699164592126499e-06, + "loss": 0.8383, + "step": 4148 + }, + { + "epoch": 0.62, + "grad_norm": 1.4079169612227123, + "learning_rate": 6.694602651996385e-06, + "loss": 0.7667, + "step": 4149 + }, + { + "epoch": 0.62, + "grad_norm": 1.6025870959320427, + "learning_rate": 6.690041484007419e-06, + "loss": 0.8794, + "step": 4150 + }, + { + "epoch": 0.62, + "grad_norm": 1.4147850042856225, + "learning_rate": 6.685481089225092e-06, + "loss": 0.7816, + "step": 4151 + }, + { + "epoch": 0.62, + "grad_norm": 1.2862360875889831, + "learning_rate": 6.680921468714718e-06, + "loss": 0.799, + "step": 4152 + }, + { + "epoch": 0.62, + "grad_norm": 1.4366328381256777, + "learning_rate": 6.676362623541415e-06, + "loss": 0.9017, + "step": 4153 + }, + { + "epoch": 0.62, + "grad_norm": 1.500383038838065, + "learning_rate": 6.671804554770135e-06, + "loss": 0.8304, + "step": 4154 + }, + { + "epoch": 0.62, + "grad_norm": 1.2651402941954724, + "learning_rate": 6.6672472634656414e-06, + "loss": 0.762, + "step": 4155 + }, + { + "epoch": 0.62, + "grad_norm": 1.5537863988192027, + "learning_rate": 6.66269075069252e-06, + "loss": 0.8654, + "step": 4156 + }, + { + "epoch": 0.62, + "grad_norm": 1.367136033315608, + "learning_rate": 6.6581350175151715e-06, + "loss": 0.8236, + "step": 4157 + }, + { + "epoch": 0.62, + "grad_norm": 1.5510267242373068, + "learning_rate": 6.653580064997817e-06, + "loss": 0.8204, + "step": 4158 + }, + { + "epoch": 0.62, + "grad_norm": 1.2609935778753811, + "learning_rate": 6.649025894204495e-06, + "loss": 0.8253, + "step": 4159 + }, + { + "epoch": 0.62, + "grad_norm": 1.3191193304695545, + "learning_rate": 6.644472506199053e-06, + "loss": 0.8938, + "step": 4160 + }, + { + "epoch": 0.62, + "grad_norm": 1.4343246693704728, + "learning_rate": 6.639919902045169e-06, + "loss": 0.837, + "step": 4161 + }, + { + "epoch": 0.62, + "grad_norm": 1.4235153705074768, + "learning_rate": 6.6353680828063306e-06, + "loss": 0.9105, + "step": 4162 + }, + { + "epoch": 0.62, + "grad_norm": 1.538675941051293, + "learning_rate": 6.630817049545844e-06, + "loss": 0.8009, + "step": 4163 + }, + { + "epoch": 0.62, + "grad_norm": 1.573990402982657, + "learning_rate": 6.626266803326831e-06, + "loss": 0.8009, + "step": 4164 + }, + { + "epoch": 0.62, + "grad_norm": 1.349235348966594, + "learning_rate": 6.621717345212231e-06, + "loss": 0.8098, + "step": 4165 + }, + { + "epoch": 0.62, + "grad_norm": 1.375875300762048, + "learning_rate": 6.617168676264791e-06, + "loss": 0.8209, + "step": 4166 + }, + { + "epoch": 0.62, + "grad_norm": 1.4786015684629978, + "learning_rate": 6.612620797547087e-06, + "loss": 0.8196, + "step": 4167 + }, + { + "epoch": 0.62, + "grad_norm": 1.4590395089728088, + "learning_rate": 6.608073710121501e-06, + "loss": 0.9172, + "step": 4168 + }, + { + "epoch": 0.62, + "grad_norm": 1.439167559855049, + "learning_rate": 6.603527415050237e-06, + "loss": 0.7508, + "step": 4169 + }, + { + "epoch": 0.62, + "grad_norm": 1.317155404819832, + "learning_rate": 6.598981913395306e-06, + "loss": 0.8445, + "step": 4170 + }, + { + "epoch": 0.62, + "grad_norm": 1.2301618420364648, + "learning_rate": 6.594437206218539e-06, + "loss": 0.7968, + "step": 4171 + }, + { + "epoch": 0.62, + "grad_norm": 1.4949245721240965, + "learning_rate": 6.589893294581579e-06, + "loss": 0.8821, + "step": 4172 + }, + { + "epoch": 0.62, + "grad_norm": 1.3523744723985995, + "learning_rate": 6.585350179545884e-06, + "loss": 0.7763, + "step": 4173 + }, + { + "epoch": 0.62, + "grad_norm": 1.417083630151581, + "learning_rate": 6.580807862172731e-06, + "loss": 0.8731, + "step": 4174 + }, + { + "epoch": 0.62, + "grad_norm": 1.4346886047869207, + "learning_rate": 6.576266343523199e-06, + "loss": 0.9036, + "step": 4175 + }, + { + "epoch": 0.62, + "grad_norm": 1.5340577610696056, + "learning_rate": 6.571725624658189e-06, + "loss": 0.8244, + "step": 4176 + }, + { + "epoch": 0.62, + "grad_norm": 1.5280645747612418, + "learning_rate": 6.567185706638417e-06, + "loss": 0.7882, + "step": 4177 + }, + { + "epoch": 0.62, + "grad_norm": 1.2985215900920564, + "learning_rate": 6.562646590524406e-06, + "loss": 0.7443, + "step": 4178 + }, + { + "epoch": 0.62, + "grad_norm": 1.475359074328717, + "learning_rate": 6.558108277376496e-06, + "loss": 0.9245, + "step": 4179 + }, + { + "epoch": 0.62, + "grad_norm": 0.9721485135513757, + "learning_rate": 6.553570768254831e-06, + "loss": 0.3412, + "step": 4180 + }, + { + "epoch": 0.62, + "grad_norm": 1.4147354501858538, + "learning_rate": 6.549034064219379e-06, + "loss": 0.8838, + "step": 4181 + }, + { + "epoch": 0.62, + "grad_norm": 1.6845249225195797, + "learning_rate": 6.5444981663299135e-06, + "loss": 0.8188, + "step": 4182 + }, + { + "epoch": 0.62, + "grad_norm": 1.7170103675974178, + "learning_rate": 6.539963075646021e-06, + "loss": 0.8265, + "step": 4183 + }, + { + "epoch": 0.62, + "grad_norm": 1.3756278572860934, + "learning_rate": 6.535428793227102e-06, + "loss": 0.8473, + "step": 4184 + }, + { + "epoch": 0.62, + "grad_norm": 1.5865118743651678, + "learning_rate": 6.530895320132358e-06, + "loss": 0.8326, + "step": 4185 + }, + { + "epoch": 0.62, + "grad_norm": 1.2986020579822197, + "learning_rate": 6.526362657420813e-06, + "loss": 0.8223, + "step": 4186 + }, + { + "epoch": 0.62, + "grad_norm": 1.752031447872313, + "learning_rate": 6.521830806151297e-06, + "loss": 0.79, + "step": 4187 + }, + { + "epoch": 0.62, + "grad_norm": 1.3716378762489827, + "learning_rate": 6.517299767382451e-06, + "loss": 0.8159, + "step": 4188 + }, + { + "epoch": 0.63, + "grad_norm": 1.6077610112352816, + "learning_rate": 6.512769542172727e-06, + "loss": 0.8324, + "step": 4189 + }, + { + "epoch": 0.63, + "grad_norm": 1.3794370209217903, + "learning_rate": 6.508240131580386e-06, + "loss": 0.8094, + "step": 4190 + }, + { + "epoch": 0.63, + "grad_norm": 1.27304634569455, + "learning_rate": 6.503711536663499e-06, + "loss": 0.9299, + "step": 4191 + }, + { + "epoch": 0.63, + "grad_norm": 0.7915954960445426, + "learning_rate": 6.499183758479944e-06, + "loss": 0.3219, + "step": 4192 + }, + { + "epoch": 0.63, + "grad_norm": 1.5799377264683774, + "learning_rate": 6.494656798087412e-06, + "loss": 0.8785, + "step": 4193 + }, + { + "epoch": 0.63, + "grad_norm": 1.3712298999472066, + "learning_rate": 6.490130656543401e-06, + "loss": 0.8239, + "step": 4194 + }, + { + "epoch": 0.63, + "grad_norm": 1.852395606632373, + "learning_rate": 6.485605334905216e-06, + "loss": 0.7694, + "step": 4195 + }, + { + "epoch": 0.63, + "grad_norm": 1.75395148841272, + "learning_rate": 6.481080834229978e-06, + "loss": 0.8826, + "step": 4196 + }, + { + "epoch": 0.63, + "grad_norm": 1.4051221536796374, + "learning_rate": 6.47655715557461e-06, + "loss": 0.8934, + "step": 4197 + }, + { + "epoch": 0.63, + "grad_norm": 1.5128813235811764, + "learning_rate": 6.472034299995837e-06, + "loss": 0.8364, + "step": 4198 + }, + { + "epoch": 0.63, + "grad_norm": 1.509019990797452, + "learning_rate": 6.467512268550204e-06, + "loss": 0.8056, + "step": 4199 + }, + { + "epoch": 0.63, + "grad_norm": 1.6635778531772962, + "learning_rate": 6.462991062294057e-06, + "loss": 0.9425, + "step": 4200 + }, + { + "epoch": 0.63, + "grad_norm": 1.4492217157698564, + "learning_rate": 6.45847068228355e-06, + "loss": 0.8003, + "step": 4201 + }, + { + "epoch": 0.63, + "grad_norm": 1.4306574303795523, + "learning_rate": 6.453951129574644e-06, + "loss": 0.8952, + "step": 4202 + }, + { + "epoch": 0.63, + "grad_norm": 1.4208096689503036, + "learning_rate": 6.449432405223107e-06, + "loss": 0.8559, + "step": 4203 + }, + { + "epoch": 0.63, + "grad_norm": 1.4006992522726627, + "learning_rate": 6.444914510284519e-06, + "loss": 0.8232, + "step": 4204 + }, + { + "epoch": 0.63, + "grad_norm": 1.4007788722480299, + "learning_rate": 6.44039744581425e-06, + "loss": 0.8447, + "step": 4205 + }, + { + "epoch": 0.63, + "grad_norm": 1.492992637029976, + "learning_rate": 6.435881212867494e-06, + "loss": 0.8254, + "step": 4206 + }, + { + "epoch": 0.63, + "grad_norm": 1.5937910518782015, + "learning_rate": 6.431365812499242e-06, + "loss": 0.7467, + "step": 4207 + }, + { + "epoch": 0.63, + "grad_norm": 1.2812051394505937, + "learning_rate": 6.426851245764289e-06, + "loss": 0.8438, + "step": 4208 + }, + { + "epoch": 0.63, + "grad_norm": 1.477217115384169, + "learning_rate": 6.422337513717244e-06, + "loss": 0.7668, + "step": 4209 + }, + { + "epoch": 0.63, + "grad_norm": 1.470877767480189, + "learning_rate": 6.417824617412515e-06, + "loss": 0.8416, + "step": 4210 + }, + { + "epoch": 0.63, + "grad_norm": 1.396810879585123, + "learning_rate": 6.413312557904307e-06, + "loss": 0.7626, + "step": 4211 + }, + { + "epoch": 0.63, + "grad_norm": 1.4567238314620887, + "learning_rate": 6.408801336246645e-06, + "loss": 0.7778, + "step": 4212 + }, + { + "epoch": 0.63, + "grad_norm": 1.3125779328880536, + "learning_rate": 6.404290953493347e-06, + "loss": 0.8648, + "step": 4213 + }, + { + "epoch": 0.63, + "grad_norm": 1.325498950246133, + "learning_rate": 6.399781410698042e-06, + "loss": 0.8384, + "step": 4214 + }, + { + "epoch": 0.63, + "grad_norm": 1.3568458439531355, + "learning_rate": 6.395272708914156e-06, + "loss": 0.8031, + "step": 4215 + }, + { + "epoch": 0.63, + "grad_norm": 1.410992858968764, + "learning_rate": 6.390764849194926e-06, + "loss": 0.853, + "step": 4216 + }, + { + "epoch": 0.63, + "grad_norm": 1.3981473318003852, + "learning_rate": 6.38625783259339e-06, + "loss": 0.8115, + "step": 4217 + }, + { + "epoch": 0.63, + "grad_norm": 1.3431667445451263, + "learning_rate": 6.3817516601623805e-06, + "loss": 0.8796, + "step": 4218 + }, + { + "epoch": 0.63, + "grad_norm": 1.3691504814130562, + "learning_rate": 6.377246332954544e-06, + "loss": 0.7353, + "step": 4219 + }, + { + "epoch": 0.63, + "grad_norm": 1.4072387702724756, + "learning_rate": 6.372741852022324e-06, + "loss": 0.8817, + "step": 4220 + }, + { + "epoch": 0.63, + "grad_norm": 1.5120617701596062, + "learning_rate": 6.36823821841797e-06, + "loss": 0.8299, + "step": 4221 + }, + { + "epoch": 0.63, + "grad_norm": 1.6236411004168696, + "learning_rate": 6.36373543319353e-06, + "loss": 0.8628, + "step": 4222 + }, + { + "epoch": 0.63, + "grad_norm": 1.245746242713602, + "learning_rate": 6.359233497400858e-06, + "loss": 0.8393, + "step": 4223 + }, + { + "epoch": 0.63, + "grad_norm": 1.7154947155416183, + "learning_rate": 6.3547324120916e-06, + "loss": 0.8352, + "step": 4224 + }, + { + "epoch": 0.63, + "grad_norm": 1.5713588184641445, + "learning_rate": 6.350232178317214e-06, + "loss": 0.7593, + "step": 4225 + }, + { + "epoch": 0.63, + "grad_norm": 1.3004270836098588, + "learning_rate": 6.345732797128954e-06, + "loss": 0.838, + "step": 4226 + }, + { + "epoch": 0.63, + "grad_norm": 1.4283736555043585, + "learning_rate": 6.341234269577878e-06, + "loss": 0.8661, + "step": 4227 + }, + { + "epoch": 0.63, + "grad_norm": 1.506266843090225, + "learning_rate": 6.336736596714842e-06, + "loss": 0.8497, + "step": 4228 + }, + { + "epoch": 0.63, + "grad_norm": 1.513288925704829, + "learning_rate": 6.3322397795905e-06, + "loss": 0.7767, + "step": 4229 + }, + { + "epoch": 0.63, + "grad_norm": 1.4930537085112032, + "learning_rate": 6.327743819255313e-06, + "loss": 0.7917, + "step": 4230 + }, + { + "epoch": 0.63, + "grad_norm": 1.297647998423009, + "learning_rate": 6.323248716759534e-06, + "loss": 0.8647, + "step": 4231 + }, + { + "epoch": 0.63, + "grad_norm": 1.332295261575304, + "learning_rate": 6.318754473153221e-06, + "loss": 0.8023, + "step": 4232 + }, + { + "epoch": 0.63, + "grad_norm": 1.4274856772169817, + "learning_rate": 6.314261089486231e-06, + "loss": 0.8488, + "step": 4233 + }, + { + "epoch": 0.63, + "grad_norm": 1.4828094376621843, + "learning_rate": 6.309768566808217e-06, + "loss": 0.847, + "step": 4234 + }, + { + "epoch": 0.63, + "grad_norm": 1.5252582978647513, + "learning_rate": 6.305276906168633e-06, + "loss": 0.8819, + "step": 4235 + }, + { + "epoch": 0.63, + "grad_norm": 1.4434784516188566, + "learning_rate": 6.300786108616732e-06, + "loss": 0.7488, + "step": 4236 + }, + { + "epoch": 0.63, + "grad_norm": 1.3117261603522674, + "learning_rate": 6.296296175201565e-06, + "loss": 0.8374, + "step": 4237 + }, + { + "epoch": 0.63, + "grad_norm": 1.3891473317015244, + "learning_rate": 6.291807106971981e-06, + "loss": 0.7711, + "step": 4238 + }, + { + "epoch": 0.63, + "grad_norm": 1.588283988633166, + "learning_rate": 6.287318904976625e-06, + "loss": 0.8558, + "step": 4239 + }, + { + "epoch": 0.63, + "grad_norm": 1.554095954832167, + "learning_rate": 6.282831570263943e-06, + "loss": 0.8746, + "step": 4240 + }, + { + "epoch": 0.63, + "grad_norm": 1.629557306780163, + "learning_rate": 6.278345103882175e-06, + "loss": 0.7383, + "step": 4241 + }, + { + "epoch": 0.63, + "grad_norm": 1.3915828937817551, + "learning_rate": 6.273859506879365e-06, + "loss": 0.8946, + "step": 4242 + }, + { + "epoch": 0.63, + "grad_norm": 1.4434973621095728, + "learning_rate": 6.269374780303345e-06, + "loss": 0.8357, + "step": 4243 + }, + { + "epoch": 0.63, + "grad_norm": 1.632965291782185, + "learning_rate": 6.264890925201745e-06, + "loss": 0.8347, + "step": 4244 + }, + { + "epoch": 0.63, + "grad_norm": 1.3674917536474693, + "learning_rate": 6.260407942621998e-06, + "loss": 0.7915, + "step": 4245 + }, + { + "epoch": 0.63, + "grad_norm": 1.3848768036280812, + "learning_rate": 6.255925833611327e-06, + "loss": 0.943, + "step": 4246 + }, + { + "epoch": 0.63, + "grad_norm": 1.4433174927329897, + "learning_rate": 6.251444599216756e-06, + "loss": 0.7456, + "step": 4247 + }, + { + "epoch": 0.63, + "grad_norm": 0.9415590312652048, + "learning_rate": 6.246964240485099e-06, + "loss": 0.3529, + "step": 4248 + }, + { + "epoch": 0.63, + "grad_norm": 1.6894160492638008, + "learning_rate": 6.242484758462972e-06, + "loss": 0.8447, + "step": 4249 + }, + { + "epoch": 0.63, + "grad_norm": 1.3035035193386735, + "learning_rate": 6.238006154196779e-06, + "loss": 0.8487, + "step": 4250 + }, + { + "epoch": 0.63, + "grad_norm": 1.591451505627097, + "learning_rate": 6.23352842873272e-06, + "loss": 0.9135, + "step": 4251 + }, + { + "epoch": 0.63, + "grad_norm": 1.3382616280488455, + "learning_rate": 6.229051583116796e-06, + "loss": 0.7808, + "step": 4252 + }, + { + "epoch": 0.63, + "grad_norm": 1.581072434342835, + "learning_rate": 6.2245756183947995e-06, + "loss": 0.7799, + "step": 4253 + }, + { + "epoch": 0.63, + "grad_norm": 1.4652129089585146, + "learning_rate": 6.220100535612313e-06, + "loss": 0.8485, + "step": 4254 + }, + { + "epoch": 0.63, + "grad_norm": 1.5604234062928044, + "learning_rate": 6.215626335814723e-06, + "loss": 0.8722, + "step": 4255 + }, + { + "epoch": 0.64, + "grad_norm": 1.4007858286664163, + "learning_rate": 6.2111530200471935e-06, + "loss": 0.8063, + "step": 4256 + }, + { + "epoch": 0.64, + "grad_norm": 1.4459710991902535, + "learning_rate": 6.206680589354696e-06, + "loss": 0.8536, + "step": 4257 + }, + { + "epoch": 0.64, + "grad_norm": 1.4525943230605125, + "learning_rate": 6.202209044781991e-06, + "loss": 0.8338, + "step": 4258 + }, + { + "epoch": 0.64, + "grad_norm": 1.50384393538145, + "learning_rate": 6.197738387373631e-06, + "loss": 0.8181, + "step": 4259 + }, + { + "epoch": 0.64, + "grad_norm": 1.5001895003770507, + "learning_rate": 6.193268618173962e-06, + "loss": 0.8414, + "step": 4260 + }, + { + "epoch": 0.64, + "grad_norm": 1.5014986989616157, + "learning_rate": 6.188799738227124e-06, + "loss": 0.9541, + "step": 4261 + }, + { + "epoch": 0.64, + "grad_norm": 1.468910105835831, + "learning_rate": 6.18433174857705e-06, + "loss": 0.8768, + "step": 4262 + }, + { + "epoch": 0.64, + "grad_norm": 1.5090954540467645, + "learning_rate": 6.179864650267457e-06, + "loss": 0.8795, + "step": 4263 + }, + { + "epoch": 0.64, + "grad_norm": 1.3939941070331088, + "learning_rate": 6.17539844434186e-06, + "loss": 0.9215, + "step": 4264 + }, + { + "epoch": 0.64, + "grad_norm": 1.5681736029675502, + "learning_rate": 6.170933131843569e-06, + "loss": 0.8103, + "step": 4265 + }, + { + "epoch": 0.64, + "grad_norm": 1.4430785170433695, + "learning_rate": 6.16646871381568e-06, + "loss": 0.8547, + "step": 4266 + }, + { + "epoch": 0.64, + "grad_norm": 1.5176629671169857, + "learning_rate": 6.162005191301082e-06, + "loss": 0.8358, + "step": 4267 + }, + { + "epoch": 0.64, + "grad_norm": 1.7127915117738377, + "learning_rate": 6.1575425653424555e-06, + "loss": 0.8852, + "step": 4268 + }, + { + "epoch": 0.64, + "grad_norm": 0.9307209549893665, + "learning_rate": 6.153080836982266e-06, + "loss": 0.3375, + "step": 4269 + }, + { + "epoch": 0.64, + "grad_norm": 1.4111039148823523, + "learning_rate": 6.148620007262775e-06, + "loss": 0.9023, + "step": 4270 + }, + { + "epoch": 0.64, + "grad_norm": 1.3554616970516378, + "learning_rate": 6.144160077226035e-06, + "loss": 0.8947, + "step": 4271 + }, + { + "epoch": 0.64, + "grad_norm": 1.6455928876714077, + "learning_rate": 6.139701047913885e-06, + "loss": 0.8517, + "step": 4272 + }, + { + "epoch": 0.64, + "grad_norm": 1.6237397855139317, + "learning_rate": 6.135242920367955e-06, + "loss": 0.8149, + "step": 4273 + }, + { + "epoch": 0.64, + "grad_norm": 1.449193928549702, + "learning_rate": 6.130785695629664e-06, + "loss": 0.7908, + "step": 4274 + }, + { + "epoch": 0.64, + "grad_norm": 1.4797848056442116, + "learning_rate": 6.126329374740223e-06, + "loss": 0.7772, + "step": 4275 + }, + { + "epoch": 0.64, + "grad_norm": 1.6584399293501135, + "learning_rate": 6.121873958740623e-06, + "loss": 0.8944, + "step": 4276 + }, + { + "epoch": 0.64, + "grad_norm": 1.4196461589578617, + "learning_rate": 6.117419448671651e-06, + "loss": 0.8511, + "step": 4277 + }, + { + "epoch": 0.64, + "grad_norm": 1.5236881282248365, + "learning_rate": 6.112965845573884e-06, + "loss": 0.9161, + "step": 4278 + }, + { + "epoch": 0.64, + "grad_norm": 1.4004184315792443, + "learning_rate": 6.108513150487682e-06, + "loss": 0.8552, + "step": 4279 + }, + { + "epoch": 0.64, + "grad_norm": 1.397850362231667, + "learning_rate": 6.104061364453196e-06, + "loss": 0.8726, + "step": 4280 + }, + { + "epoch": 0.64, + "grad_norm": 1.505198715986615, + "learning_rate": 6.099610488510368e-06, + "loss": 0.8826, + "step": 4281 + }, + { + "epoch": 0.64, + "grad_norm": 1.4501053117509792, + "learning_rate": 6.095160523698913e-06, + "loss": 0.8147, + "step": 4282 + }, + { + "epoch": 0.64, + "grad_norm": 1.4644763880072755, + "learning_rate": 6.09071147105835e-06, + "loss": 0.8353, + "step": 4283 + }, + { + "epoch": 0.64, + "grad_norm": 1.5582500577166882, + "learning_rate": 6.086263331627976e-06, + "loss": 0.8452, + "step": 4284 + }, + { + "epoch": 0.64, + "grad_norm": 1.4587571970699578, + "learning_rate": 6.081816106446878e-06, + "loss": 0.839, + "step": 4285 + }, + { + "epoch": 0.64, + "grad_norm": 1.2969889842061815, + "learning_rate": 6.077369796553928e-06, + "loss": 0.839, + "step": 4286 + }, + { + "epoch": 0.64, + "grad_norm": 1.3326597996093192, + "learning_rate": 6.072924402987785e-06, + "loss": 0.8325, + "step": 4287 + }, + { + "epoch": 0.64, + "grad_norm": 1.386576818536693, + "learning_rate": 6.068479926786894e-06, + "loss": 0.8935, + "step": 4288 + }, + { + "epoch": 0.64, + "grad_norm": 1.3943188329209395, + "learning_rate": 6.064036368989482e-06, + "loss": 0.8013, + "step": 4289 + }, + { + "epoch": 0.64, + "grad_norm": 1.2575423049671666, + "learning_rate": 6.0595937306335666e-06, + "loss": 0.8299, + "step": 4290 + }, + { + "epoch": 0.64, + "grad_norm": 1.5127120666610412, + "learning_rate": 6.055152012756946e-06, + "loss": 0.883, + "step": 4291 + }, + { + "epoch": 0.64, + "grad_norm": 1.4461158324152272, + "learning_rate": 6.050711216397212e-06, + "loss": 0.8011, + "step": 4292 + }, + { + "epoch": 0.64, + "grad_norm": 1.255689961796309, + "learning_rate": 6.04627134259173e-06, + "loss": 0.7625, + "step": 4293 + }, + { + "epoch": 0.64, + "grad_norm": 1.6353414790572611, + "learning_rate": 6.0418323923776565e-06, + "loss": 0.8339, + "step": 4294 + }, + { + "epoch": 0.64, + "grad_norm": 1.4185120885409621, + "learning_rate": 6.0373943667919285e-06, + "loss": 0.807, + "step": 4295 + }, + { + "epoch": 0.64, + "grad_norm": 1.3513936918603378, + "learning_rate": 6.032957266871274e-06, + "loss": 0.8903, + "step": 4296 + }, + { + "epoch": 0.64, + "grad_norm": 1.4366055004193379, + "learning_rate": 6.028521093652195e-06, + "loss": 0.8149, + "step": 4297 + }, + { + "epoch": 0.64, + "grad_norm": 1.3778814960274006, + "learning_rate": 6.0240858481709854e-06, + "loss": 0.8322, + "step": 4298 + }, + { + "epoch": 0.64, + "grad_norm": 1.2785585617095774, + "learning_rate": 6.0196515314637164e-06, + "loss": 0.867, + "step": 4299 + }, + { + "epoch": 0.64, + "grad_norm": 1.4186514342042469, + "learning_rate": 6.0152181445662485e-06, + "loss": 0.8652, + "step": 4300 + }, + { + "epoch": 0.64, + "grad_norm": 1.4491098560951776, + "learning_rate": 6.010785688514216e-06, + "loss": 0.8721, + "step": 4301 + }, + { + "epoch": 0.64, + "grad_norm": 1.4138503969357206, + "learning_rate": 6.006354164343047e-06, + "loss": 0.8754, + "step": 4302 + }, + { + "epoch": 0.64, + "grad_norm": 1.4673056310662789, + "learning_rate": 6.0019235730879414e-06, + "loss": 0.8147, + "step": 4303 + }, + { + "epoch": 0.64, + "grad_norm": 1.3413836663355185, + "learning_rate": 5.997493915783887e-06, + "loss": 0.7428, + "step": 4304 + }, + { + "epoch": 0.64, + "grad_norm": 1.6571155751456612, + "learning_rate": 5.993065193465653e-06, + "loss": 0.8255, + "step": 4305 + }, + { + "epoch": 0.64, + "grad_norm": 1.386258314819785, + "learning_rate": 5.9886374071677875e-06, + "loss": 0.8126, + "step": 4306 + }, + { + "epoch": 0.64, + "grad_norm": 1.6832059565184125, + "learning_rate": 5.984210557924628e-06, + "loss": 0.8096, + "step": 4307 + }, + { + "epoch": 0.64, + "grad_norm": 1.2509020605244172, + "learning_rate": 5.979784646770279e-06, + "loss": 0.7397, + "step": 4308 + }, + { + "epoch": 0.64, + "grad_norm": 1.5688376594080107, + "learning_rate": 5.975359674738637e-06, + "loss": 0.8038, + "step": 4309 + }, + { + "epoch": 0.64, + "grad_norm": 1.6431356622347084, + "learning_rate": 5.970935642863375e-06, + "loss": 0.8522, + "step": 4310 + }, + { + "epoch": 0.64, + "grad_norm": 1.5348489237759009, + "learning_rate": 5.966512552177949e-06, + "loss": 0.7724, + "step": 4311 + }, + { + "epoch": 0.64, + "grad_norm": 1.3747303776927502, + "learning_rate": 5.962090403715592e-06, + "loss": 0.8389, + "step": 4312 + }, + { + "epoch": 0.64, + "grad_norm": 1.5817183974070743, + "learning_rate": 5.9576691985093235e-06, + "loss": 0.8636, + "step": 4313 + }, + { + "epoch": 0.64, + "grad_norm": 1.4498172344233735, + "learning_rate": 5.953248937591929e-06, + "loss": 0.8532, + "step": 4314 + }, + { + "epoch": 0.64, + "grad_norm": 1.2929147142891533, + "learning_rate": 5.948829621995984e-06, + "loss": 0.8615, + "step": 4315 + }, + { + "epoch": 0.64, + "grad_norm": 1.3669170548118452, + "learning_rate": 5.944411252753846e-06, + "loss": 0.8234, + "step": 4316 + }, + { + "epoch": 0.64, + "grad_norm": 1.3873774434908999, + "learning_rate": 5.939993830897641e-06, + "loss": 0.7439, + "step": 4317 + }, + { + "epoch": 0.64, + "grad_norm": 1.566217294554743, + "learning_rate": 5.935577357459282e-06, + "loss": 0.7592, + "step": 4318 + }, + { + "epoch": 0.64, + "grad_norm": 1.505575332001506, + "learning_rate": 5.931161833470458e-06, + "loss": 0.7697, + "step": 4319 + }, + { + "epoch": 0.64, + "grad_norm": 1.3834612834388402, + "learning_rate": 5.926747259962638e-06, + "loss": 0.8533, + "step": 4320 + }, + { + "epoch": 0.64, + "grad_norm": 1.4389951176560545, + "learning_rate": 5.92233363796706e-06, + "loss": 0.8403, + "step": 4321 + }, + { + "epoch": 0.64, + "grad_norm": 1.4469450635506795, + "learning_rate": 5.9179209685147525e-06, + "loss": 0.7903, + "step": 4322 + }, + { + "epoch": 0.65, + "grad_norm": 1.4416974001075, + "learning_rate": 5.913509252636511e-06, + "loss": 0.8761, + "step": 4323 + }, + { + "epoch": 0.65, + "grad_norm": 1.4398427252266477, + "learning_rate": 5.909098491362916e-06, + "loss": 0.8267, + "step": 4324 + }, + { + "epoch": 0.65, + "grad_norm": 1.4409529760251871, + "learning_rate": 5.90468868572432e-06, + "loss": 0.8025, + "step": 4325 + }, + { + "epoch": 0.65, + "grad_norm": 1.6198685784698694, + "learning_rate": 5.900279836750859e-06, + "loss": 0.8161, + "step": 4326 + }, + { + "epoch": 0.65, + "grad_norm": 1.3505587930220215, + "learning_rate": 5.895871945472434e-06, + "loss": 0.8447, + "step": 4327 + }, + { + "epoch": 0.65, + "grad_norm": 1.4245812408853182, + "learning_rate": 5.891465012918731e-06, + "loss": 0.806, + "step": 4328 + }, + { + "epoch": 0.65, + "grad_norm": 1.4211720966680543, + "learning_rate": 5.887059040119209e-06, + "loss": 0.8067, + "step": 4329 + }, + { + "epoch": 0.65, + "grad_norm": 1.4558989699342442, + "learning_rate": 5.882654028103108e-06, + "loss": 0.8538, + "step": 4330 + }, + { + "epoch": 0.65, + "grad_norm": 1.4152251654979937, + "learning_rate": 5.878249977899433e-06, + "loss": 0.8639, + "step": 4331 + }, + { + "epoch": 0.65, + "grad_norm": 1.5119627119939212, + "learning_rate": 5.873846890536976e-06, + "loss": 0.8323, + "step": 4332 + }, + { + "epoch": 0.65, + "grad_norm": 1.382540626254464, + "learning_rate": 5.8694447670442985e-06, + "loss": 0.8252, + "step": 4333 + }, + { + "epoch": 0.65, + "grad_norm": 1.236847545414012, + "learning_rate": 5.865043608449732e-06, + "loss": 0.8008, + "step": 4334 + }, + { + "epoch": 0.65, + "grad_norm": 1.3006345706296107, + "learning_rate": 5.86064341578139e-06, + "loss": 0.8261, + "step": 4335 + }, + { + "epoch": 0.65, + "grad_norm": 1.5705140628555114, + "learning_rate": 5.85624419006716e-06, + "loss": 0.8304, + "step": 4336 + }, + { + "epoch": 0.65, + "grad_norm": 0.9560450903125399, + "learning_rate": 5.851845932334698e-06, + "loss": 0.3257, + "step": 4337 + }, + { + "epoch": 0.65, + "grad_norm": 1.4345550257139037, + "learning_rate": 5.84744864361144e-06, + "loss": 0.8811, + "step": 4338 + }, + { + "epoch": 0.65, + "grad_norm": 1.4646735219591631, + "learning_rate": 5.843052324924596e-06, + "loss": 0.8505, + "step": 4339 + }, + { + "epoch": 0.65, + "grad_norm": 1.5632767784954522, + "learning_rate": 5.83865697730114e-06, + "loss": 0.826, + "step": 4340 + }, + { + "epoch": 0.65, + "grad_norm": 1.3724701926561396, + "learning_rate": 5.834262601767828e-06, + "loss": 0.8357, + "step": 4341 + }, + { + "epoch": 0.65, + "grad_norm": 1.3894978788180528, + "learning_rate": 5.829869199351188e-06, + "loss": 0.7285, + "step": 4342 + }, + { + "epoch": 0.65, + "grad_norm": 1.5056022042232862, + "learning_rate": 5.825476771077518e-06, + "loss": 0.8732, + "step": 4343 + }, + { + "epoch": 0.65, + "grad_norm": 1.3594049581975511, + "learning_rate": 5.821085317972891e-06, + "loss": 0.8071, + "step": 4344 + }, + { + "epoch": 0.65, + "grad_norm": 1.426420079127766, + "learning_rate": 5.816694841063149e-06, + "loss": 0.837, + "step": 4345 + }, + { + "epoch": 0.65, + "grad_norm": 1.6557987246122676, + "learning_rate": 5.812305341373914e-06, + "loss": 0.7949, + "step": 4346 + }, + { + "epoch": 0.65, + "grad_norm": 1.5684910477866751, + "learning_rate": 5.807916819930563e-06, + "loss": 0.8455, + "step": 4347 + }, + { + "epoch": 0.65, + "grad_norm": 1.5031814990112005, + "learning_rate": 5.803529277758261e-06, + "loss": 0.8887, + "step": 4348 + }, + { + "epoch": 0.65, + "grad_norm": 1.253253187367845, + "learning_rate": 5.799142715881938e-06, + "loss": 0.7544, + "step": 4349 + }, + { + "epoch": 0.65, + "grad_norm": 1.5725081132401277, + "learning_rate": 5.794757135326294e-06, + "loss": 0.8732, + "step": 4350 + }, + { + "epoch": 0.65, + "grad_norm": 1.5591659015589374, + "learning_rate": 5.790372537115802e-06, + "loss": 0.8555, + "step": 4351 + }, + { + "epoch": 0.65, + "grad_norm": 1.6061185452276394, + "learning_rate": 5.785988922274711e-06, + "loss": 0.8433, + "step": 4352 + }, + { + "epoch": 0.65, + "grad_norm": 1.5060769398929812, + "learning_rate": 5.7816062918270225e-06, + "loss": 0.866, + "step": 4353 + }, + { + "epoch": 0.65, + "grad_norm": 1.361922133818583, + "learning_rate": 5.7772246467965246e-06, + "loss": 0.8871, + "step": 4354 + }, + { + "epoch": 0.65, + "grad_norm": 1.5091309243459323, + "learning_rate": 5.772843988206769e-06, + "loss": 0.9164, + "step": 4355 + }, + { + "epoch": 0.65, + "grad_norm": 1.4106831015506072, + "learning_rate": 5.76846431708108e-06, + "loss": 0.7768, + "step": 4356 + }, + { + "epoch": 0.65, + "grad_norm": 1.8663618325193143, + "learning_rate": 5.7640856344425465e-06, + "loss": 0.8162, + "step": 4357 + }, + { + "epoch": 0.65, + "grad_norm": 1.2245658626461553, + "learning_rate": 5.759707941314032e-06, + "loss": 0.8264, + "step": 4358 + }, + { + "epoch": 0.65, + "grad_norm": 2.061817501495855, + "learning_rate": 5.7553312387181685e-06, + "loss": 0.8357, + "step": 4359 + }, + { + "epoch": 0.65, + "grad_norm": 1.5553629165236145, + "learning_rate": 5.750955527677347e-06, + "loss": 0.8562, + "step": 4360 + }, + { + "epoch": 0.65, + "grad_norm": 1.53810718340051, + "learning_rate": 5.746580809213736e-06, + "loss": 0.8346, + "step": 4361 + }, + { + "epoch": 0.65, + "grad_norm": 1.6304397408610432, + "learning_rate": 5.742207084349274e-06, + "loss": 0.9015, + "step": 4362 + }, + { + "epoch": 0.65, + "grad_norm": 1.6222530206970949, + "learning_rate": 5.737834354105659e-06, + "loss": 0.8514, + "step": 4363 + }, + { + "epoch": 0.65, + "grad_norm": 1.5184247350745008, + "learning_rate": 5.733462619504364e-06, + "loss": 0.844, + "step": 4364 + }, + { + "epoch": 0.65, + "grad_norm": 1.5542523803151984, + "learning_rate": 5.729091881566631e-06, + "loss": 0.9175, + "step": 4365 + }, + { + "epoch": 0.65, + "grad_norm": 1.3978160544236615, + "learning_rate": 5.724722141313456e-06, + "loss": 0.8409, + "step": 4366 + }, + { + "epoch": 0.65, + "grad_norm": 1.4025366771302141, + "learning_rate": 5.720353399765615e-06, + "loss": 0.8381, + "step": 4367 + }, + { + "epoch": 0.65, + "grad_norm": 1.5183792000593377, + "learning_rate": 5.715985657943644e-06, + "loss": 0.8227, + "step": 4368 + }, + { + "epoch": 0.65, + "grad_norm": 0.748280145836894, + "learning_rate": 5.711618916867851e-06, + "loss": 0.3147, + "step": 4369 + }, + { + "epoch": 0.65, + "grad_norm": 1.372096064858729, + "learning_rate": 5.707253177558308e-06, + "loss": 0.8043, + "step": 4370 + }, + { + "epoch": 0.65, + "grad_norm": 1.5708921172932517, + "learning_rate": 5.702888441034853e-06, + "loss": 0.8723, + "step": 4371 + }, + { + "epoch": 0.65, + "grad_norm": 1.4808292616788827, + "learning_rate": 5.698524708317082e-06, + "loss": 0.8946, + "step": 4372 + }, + { + "epoch": 0.65, + "grad_norm": 1.3427792018419864, + "learning_rate": 5.694161980424368e-06, + "loss": 0.7711, + "step": 4373 + }, + { + "epoch": 0.65, + "grad_norm": 1.3974828964949968, + "learning_rate": 5.689800258375844e-06, + "loss": 0.8335, + "step": 4374 + }, + { + "epoch": 0.65, + "grad_norm": 1.2134117843502503, + "learning_rate": 5.685439543190409e-06, + "loss": 0.7655, + "step": 4375 + }, + { + "epoch": 0.65, + "grad_norm": 1.4692356867718188, + "learning_rate": 5.681079835886727e-06, + "loss": 0.7673, + "step": 4376 + }, + { + "epoch": 0.65, + "grad_norm": 1.5862665316608566, + "learning_rate": 5.676721137483226e-06, + "loss": 0.8585, + "step": 4377 + }, + { + "epoch": 0.65, + "grad_norm": 1.4394047313614347, + "learning_rate": 5.672363448998102e-06, + "loss": 0.7739, + "step": 4378 + }, + { + "epoch": 0.65, + "grad_norm": 1.3529931855338269, + "learning_rate": 5.668006771449302e-06, + "loss": 0.8324, + "step": 4379 + }, + { + "epoch": 0.65, + "grad_norm": 1.4497687775364378, + "learning_rate": 5.6636511058545525e-06, + "loss": 0.9044, + "step": 4380 + }, + { + "epoch": 0.65, + "grad_norm": 1.5213890853835024, + "learning_rate": 5.6592964532313365e-06, + "loss": 0.8179, + "step": 4381 + }, + { + "epoch": 0.65, + "grad_norm": 1.478262068248342, + "learning_rate": 5.654942814596902e-06, + "loss": 0.8414, + "step": 4382 + }, + { + "epoch": 0.65, + "grad_norm": 1.3837117460910946, + "learning_rate": 5.650590190968263e-06, + "loss": 0.7734, + "step": 4383 + }, + { + "epoch": 0.65, + "grad_norm": 1.6458157661537622, + "learning_rate": 5.646238583362184e-06, + "loss": 0.8523, + "step": 4384 + }, + { + "epoch": 0.65, + "grad_norm": 1.6966816860102454, + "learning_rate": 5.641887992795205e-06, + "loss": 0.8899, + "step": 4385 + }, + { + "epoch": 0.65, + "grad_norm": 1.4374113858939788, + "learning_rate": 5.6375384202836254e-06, + "loss": 0.8221, + "step": 4386 + }, + { + "epoch": 0.65, + "grad_norm": 1.428425929936069, + "learning_rate": 5.633189866843507e-06, + "loss": 0.7949, + "step": 4387 + }, + { + "epoch": 0.65, + "grad_norm": 1.484082934858807, + "learning_rate": 5.628842333490674e-06, + "loss": 0.7901, + "step": 4388 + }, + { + "epoch": 0.65, + "grad_norm": 1.4991614511184272, + "learning_rate": 5.624495821240703e-06, + "loss": 0.7751, + "step": 4389 + }, + { + "epoch": 0.66, + "grad_norm": 1.3917369830178221, + "learning_rate": 5.620150331108943e-06, + "loss": 0.8411, + "step": 4390 + }, + { + "epoch": 0.66, + "grad_norm": 1.513431232766497, + "learning_rate": 5.615805864110504e-06, + "loss": 0.827, + "step": 4391 + }, + { + "epoch": 0.66, + "grad_norm": 1.5020482488305424, + "learning_rate": 5.611462421260251e-06, + "loss": 0.8172, + "step": 4392 + }, + { + "epoch": 0.66, + "grad_norm": 1.4921293946382255, + "learning_rate": 5.607120003572817e-06, + "loss": 0.8029, + "step": 4393 + }, + { + "epoch": 0.66, + "grad_norm": 1.618901813781922, + "learning_rate": 5.602778612062585e-06, + "loss": 0.8448, + "step": 4394 + }, + { + "epoch": 0.66, + "grad_norm": 1.5388424529668256, + "learning_rate": 5.598438247743706e-06, + "loss": 0.833, + "step": 4395 + }, + { + "epoch": 0.66, + "grad_norm": 1.3046444318995354, + "learning_rate": 5.594098911630091e-06, + "loss": 0.8201, + "step": 4396 + }, + { + "epoch": 0.66, + "grad_norm": 1.5203985709097454, + "learning_rate": 5.58976060473541e-06, + "loss": 0.8727, + "step": 4397 + }, + { + "epoch": 0.66, + "grad_norm": 1.4174538707438733, + "learning_rate": 5.585423328073094e-06, + "loss": 0.8481, + "step": 4398 + }, + { + "epoch": 0.66, + "grad_norm": 1.424667041266223, + "learning_rate": 5.581087082656325e-06, + "loss": 0.8137, + "step": 4399 + }, + { + "epoch": 0.66, + "grad_norm": 1.455661799348346, + "learning_rate": 5.576751869498054e-06, + "loss": 0.8111, + "step": 4400 + }, + { + "epoch": 0.66, + "grad_norm": 1.6100683512657807, + "learning_rate": 5.572417689610987e-06, + "loss": 0.8298, + "step": 4401 + }, + { + "epoch": 0.66, + "grad_norm": 1.3152238958137867, + "learning_rate": 5.5680845440075885e-06, + "loss": 0.7922, + "step": 4402 + }, + { + "epoch": 0.66, + "grad_norm": 1.386333283054433, + "learning_rate": 5.563752433700082e-06, + "loss": 0.8462, + "step": 4403 + }, + { + "epoch": 0.66, + "grad_norm": 1.5842246815233396, + "learning_rate": 5.559421359700452e-06, + "loss": 0.887, + "step": 4404 + }, + { + "epoch": 0.66, + "grad_norm": 0.9231606374610966, + "learning_rate": 5.555091323020432e-06, + "loss": 0.3161, + "step": 4405 + }, + { + "epoch": 0.66, + "grad_norm": 1.4900159932034664, + "learning_rate": 5.550762324671521e-06, + "loss": 0.8244, + "step": 4406 + }, + { + "epoch": 0.66, + "grad_norm": 1.4883715040269252, + "learning_rate": 5.546434365664974e-06, + "loss": 0.8429, + "step": 4407 + }, + { + "epoch": 0.66, + "grad_norm": 1.582801746603013, + "learning_rate": 5.5421074470118045e-06, + "loss": 0.855, + "step": 4408 + }, + { + "epoch": 0.66, + "grad_norm": 0.8678966127423097, + "learning_rate": 5.5377815697227776e-06, + "loss": 0.3312, + "step": 4409 + }, + { + "epoch": 0.66, + "grad_norm": 1.5638858657317964, + "learning_rate": 5.5334567348084265e-06, + "loss": 0.8955, + "step": 4410 + }, + { + "epoch": 0.66, + "grad_norm": 1.6200262519006703, + "learning_rate": 5.529132943279021e-06, + "loss": 0.7759, + "step": 4411 + }, + { + "epoch": 0.66, + "grad_norm": 1.3310397784712833, + "learning_rate": 5.5248101961446065e-06, + "loss": 0.7596, + "step": 4412 + }, + { + "epoch": 0.66, + "grad_norm": 1.4530825909968506, + "learning_rate": 5.520488494414975e-06, + "loss": 0.7975, + "step": 4413 + }, + { + "epoch": 0.66, + "grad_norm": 1.503052018672958, + "learning_rate": 5.516167839099679e-06, + "loss": 0.856, + "step": 4414 + }, + { + "epoch": 0.66, + "grad_norm": 1.3347766047130232, + "learning_rate": 5.511848231208022e-06, + "loss": 0.9411, + "step": 4415 + }, + { + "epoch": 0.66, + "grad_norm": 1.359225803442016, + "learning_rate": 5.507529671749064e-06, + "loss": 0.8778, + "step": 4416 + }, + { + "epoch": 0.66, + "grad_norm": 1.7242945035870008, + "learning_rate": 5.503212161731628e-06, + "loss": 0.8246, + "step": 4417 + }, + { + "epoch": 0.66, + "grad_norm": 1.5992071114038786, + "learning_rate": 5.498895702164274e-06, + "loss": 0.8331, + "step": 4418 + }, + { + "epoch": 0.66, + "grad_norm": 1.5451353940733725, + "learning_rate": 5.494580294055333e-06, + "loss": 0.8012, + "step": 4419 + }, + { + "epoch": 0.66, + "grad_norm": 1.488193809764088, + "learning_rate": 5.490265938412883e-06, + "loss": 0.856, + "step": 4420 + }, + { + "epoch": 0.66, + "grad_norm": 1.531953056023603, + "learning_rate": 5.4859526362447605e-06, + "loss": 0.898, + "step": 4421 + }, + { + "epoch": 0.66, + "grad_norm": 1.466591457454563, + "learning_rate": 5.481640388558551e-06, + "loss": 0.843, + "step": 4422 + }, + { + "epoch": 0.66, + "grad_norm": 1.382143136105328, + "learning_rate": 5.4773291963616006e-06, + "loss": 0.8348, + "step": 4423 + }, + { + "epoch": 0.66, + "grad_norm": 1.4595352349595985, + "learning_rate": 5.473019060660997e-06, + "loss": 0.8394, + "step": 4424 + }, + { + "epoch": 0.66, + "grad_norm": 1.4918264362885365, + "learning_rate": 5.468709982463591e-06, + "loss": 0.7896, + "step": 4425 + }, + { + "epoch": 0.66, + "grad_norm": 1.4049767372515047, + "learning_rate": 5.464401962775986e-06, + "loss": 0.7831, + "step": 4426 + }, + { + "epoch": 0.66, + "grad_norm": 1.429010411997647, + "learning_rate": 5.460095002604533e-06, + "loss": 0.8122, + "step": 4427 + }, + { + "epoch": 0.66, + "grad_norm": 1.4145809462691723, + "learning_rate": 5.45578910295534e-06, + "loss": 0.8534, + "step": 4428 + }, + { + "epoch": 0.66, + "grad_norm": 1.7319571292074991, + "learning_rate": 5.451484264834271e-06, + "loss": 0.823, + "step": 4429 + }, + { + "epoch": 0.66, + "grad_norm": 1.625213531072949, + "learning_rate": 5.447180489246924e-06, + "loss": 0.8202, + "step": 4430 + }, + { + "epoch": 0.66, + "grad_norm": 1.4906172504263588, + "learning_rate": 5.442877777198669e-06, + "loss": 0.7706, + "step": 4431 + }, + { + "epoch": 0.66, + "grad_norm": 1.4920874088980793, + "learning_rate": 5.43857612969462e-06, + "loss": 0.8523, + "step": 4432 + }, + { + "epoch": 0.66, + "grad_norm": 1.3836969581862542, + "learning_rate": 5.434275547739641e-06, + "loss": 0.8519, + "step": 4433 + }, + { + "epoch": 0.66, + "grad_norm": 1.5148861635181443, + "learning_rate": 5.42997603233835e-06, + "loss": 0.8813, + "step": 4434 + }, + { + "epoch": 0.66, + "grad_norm": 1.3965685959074967, + "learning_rate": 5.425677584495112e-06, + "loss": 0.7946, + "step": 4435 + }, + { + "epoch": 0.66, + "grad_norm": 1.9166251981221127, + "learning_rate": 5.421380205214051e-06, + "loss": 0.8042, + "step": 4436 + }, + { + "epoch": 0.66, + "grad_norm": 1.4464172117189402, + "learning_rate": 5.417083895499024e-06, + "loss": 0.7559, + "step": 4437 + }, + { + "epoch": 0.66, + "grad_norm": 1.3574169001042158, + "learning_rate": 5.412788656353658e-06, + "loss": 0.9286, + "step": 4438 + }, + { + "epoch": 0.66, + "grad_norm": 1.4076995107799493, + "learning_rate": 5.408494488781317e-06, + "loss": 0.8437, + "step": 4439 + }, + { + "epoch": 0.66, + "grad_norm": 1.4228475632149882, + "learning_rate": 5.404201393785123e-06, + "loss": 0.916, + "step": 4440 + }, + { + "epoch": 0.66, + "grad_norm": 1.549351414989158, + "learning_rate": 5.3999093723679395e-06, + "loss": 0.826, + "step": 4441 + }, + { + "epoch": 0.66, + "grad_norm": 1.5388365085844244, + "learning_rate": 5.39561842553239e-06, + "loss": 0.8221, + "step": 4442 + }, + { + "epoch": 0.66, + "grad_norm": 1.341995071227043, + "learning_rate": 5.391328554280829e-06, + "loss": 0.7786, + "step": 4443 + }, + { + "epoch": 0.66, + "grad_norm": 1.5561369663863633, + "learning_rate": 5.387039759615378e-06, + "loss": 0.8233, + "step": 4444 + }, + { + "epoch": 0.66, + "grad_norm": 1.5667210426705647, + "learning_rate": 5.382752042537899e-06, + "loss": 0.7424, + "step": 4445 + }, + { + "epoch": 0.66, + "grad_norm": 1.5075810988656286, + "learning_rate": 5.378465404050003e-06, + "loss": 0.827, + "step": 4446 + }, + { + "epoch": 0.66, + "grad_norm": 1.4562725247024544, + "learning_rate": 5.374179845153048e-06, + "loss": 0.8379, + "step": 4447 + }, + { + "epoch": 0.66, + "grad_norm": 0.8038457546512405, + "learning_rate": 5.369895366848144e-06, + "loss": 0.3358, + "step": 4448 + }, + { + "epoch": 0.66, + "grad_norm": 1.4842366248182006, + "learning_rate": 5.365611970136145e-06, + "loss": 0.8117, + "step": 4449 + }, + { + "epoch": 0.66, + "grad_norm": 1.6709230935057313, + "learning_rate": 5.361329656017649e-06, + "loss": 0.794, + "step": 4450 + }, + { + "epoch": 0.66, + "grad_norm": 1.5387975923449324, + "learning_rate": 5.357048425493007e-06, + "loss": 0.7274, + "step": 4451 + }, + { + "epoch": 0.66, + "grad_norm": 1.3531270790579684, + "learning_rate": 5.352768279562315e-06, + "loss": 0.8071, + "step": 4452 + }, + { + "epoch": 0.66, + "grad_norm": 1.4566089258404527, + "learning_rate": 5.348489219225417e-06, + "loss": 0.7737, + "step": 4453 + }, + { + "epoch": 0.66, + "grad_norm": 1.4342289038609468, + "learning_rate": 5.3442112454819e-06, + "loss": 0.8189, + "step": 4454 + }, + { + "epoch": 0.66, + "grad_norm": 1.5053840844518434, + "learning_rate": 5.339934359331104e-06, + "loss": 0.8856, + "step": 4455 + }, + { + "epoch": 0.66, + "grad_norm": 1.5080987746244463, + "learning_rate": 5.335658561772101e-06, + "loss": 0.8089, + "step": 4456 + }, + { + "epoch": 0.67, + "grad_norm": 1.839324060773937, + "learning_rate": 5.331383853803724e-06, + "loss": 0.8145, + "step": 4457 + }, + { + "epoch": 0.67, + "grad_norm": 1.498780826730005, + "learning_rate": 5.327110236424544e-06, + "loss": 0.8238, + "step": 4458 + }, + { + "epoch": 0.67, + "grad_norm": 1.4611618244608902, + "learning_rate": 5.3228377106328775e-06, + "loss": 0.8497, + "step": 4459 + }, + { + "epoch": 0.67, + "grad_norm": 1.4759978710926638, + "learning_rate": 5.3185662774267875e-06, + "loss": 0.8957, + "step": 4460 + }, + { + "epoch": 0.67, + "grad_norm": 1.576081780336865, + "learning_rate": 5.314295937804082e-06, + "loss": 0.8981, + "step": 4461 + }, + { + "epoch": 0.67, + "grad_norm": 1.6212599301475261, + "learning_rate": 5.310026692762316e-06, + "loss": 0.8939, + "step": 4462 + }, + { + "epoch": 0.67, + "grad_norm": 1.4763640019291224, + "learning_rate": 5.305758543298778e-06, + "loss": 0.8265, + "step": 4463 + }, + { + "epoch": 0.67, + "grad_norm": 1.5588311561910986, + "learning_rate": 5.3014914904105105e-06, + "loss": 0.8727, + "step": 4464 + }, + { + "epoch": 0.67, + "grad_norm": 1.3335454785684293, + "learning_rate": 5.297225535094302e-06, + "loss": 0.8926, + "step": 4465 + }, + { + "epoch": 0.67, + "grad_norm": 1.6354198103973723, + "learning_rate": 5.292960678346674e-06, + "loss": 0.8286, + "step": 4466 + }, + { + "epoch": 0.67, + "grad_norm": 1.453256815141083, + "learning_rate": 5.288696921163902e-06, + "loss": 0.8306, + "step": 4467 + }, + { + "epoch": 0.67, + "grad_norm": 1.307165861408919, + "learning_rate": 5.284434264542002e-06, + "loss": 0.766, + "step": 4468 + }, + { + "epoch": 0.67, + "grad_norm": 1.5938569359105743, + "learning_rate": 5.280172709476723e-06, + "loss": 0.7814, + "step": 4469 + }, + { + "epoch": 0.67, + "grad_norm": 0.9559373413405672, + "learning_rate": 5.275912256963571e-06, + "loss": 0.3479, + "step": 4470 + }, + { + "epoch": 0.67, + "grad_norm": 0.9000019690000334, + "learning_rate": 5.2716529079977856e-06, + "loss": 0.3258, + "step": 4471 + }, + { + "epoch": 0.67, + "grad_norm": 1.3764502750206267, + "learning_rate": 5.267394663574351e-06, + "loss": 0.8096, + "step": 4472 + }, + { + "epoch": 0.67, + "grad_norm": 1.3787878690807869, + "learning_rate": 5.2631375246879955e-06, + "loss": 0.8471, + "step": 4473 + }, + { + "epoch": 0.67, + "grad_norm": 1.2535421676238763, + "learning_rate": 5.2588814923331854e-06, + "loss": 0.7559, + "step": 4474 + }, + { + "epoch": 0.67, + "grad_norm": 1.5403722857322744, + "learning_rate": 5.254626567504135e-06, + "loss": 0.8154, + "step": 4475 + }, + { + "epoch": 0.67, + "grad_norm": 0.7775978512926978, + "learning_rate": 5.250372751194788e-06, + "loss": 0.342, + "step": 4476 + }, + { + "epoch": 0.67, + "grad_norm": 1.4310148401493037, + "learning_rate": 5.246120044398839e-06, + "loss": 0.8904, + "step": 4477 + }, + { + "epoch": 0.67, + "grad_norm": 1.5003548031150493, + "learning_rate": 5.241868448109722e-06, + "loss": 0.8289, + "step": 4478 + }, + { + "epoch": 0.67, + "grad_norm": 1.5076239003440834, + "learning_rate": 5.237617963320608e-06, + "loss": 0.8126, + "step": 4479 + }, + { + "epoch": 0.67, + "grad_norm": 1.6390164130179972, + "learning_rate": 5.233368591024412e-06, + "loss": 0.8271, + "step": 4480 + }, + { + "epoch": 0.67, + "grad_norm": 1.533919308673761, + "learning_rate": 5.229120332213794e-06, + "loss": 0.7336, + "step": 4481 + }, + { + "epoch": 0.67, + "grad_norm": 1.3311413158121979, + "learning_rate": 5.224873187881136e-06, + "loss": 0.7806, + "step": 4482 + }, + { + "epoch": 0.67, + "grad_norm": 1.2932170164255992, + "learning_rate": 5.220627159018578e-06, + "loss": 0.8019, + "step": 4483 + }, + { + "epoch": 0.67, + "grad_norm": 1.531277760307563, + "learning_rate": 5.216382246617993e-06, + "loss": 0.8908, + "step": 4484 + }, + { + "epoch": 0.67, + "grad_norm": 1.4348066692261237, + "learning_rate": 5.212138451670989e-06, + "loss": 0.7672, + "step": 4485 + }, + { + "epoch": 0.67, + "grad_norm": 1.366045679400478, + "learning_rate": 5.2078957751689206e-06, + "loss": 0.8047, + "step": 4486 + }, + { + "epoch": 0.67, + "grad_norm": 1.34046114732214, + "learning_rate": 5.20365421810288e-06, + "loss": 0.8945, + "step": 4487 + }, + { + "epoch": 0.67, + "grad_norm": 1.4321217411563052, + "learning_rate": 5.199413781463689e-06, + "loss": 0.831, + "step": 4488 + }, + { + "epoch": 0.67, + "grad_norm": 1.4477171495991878, + "learning_rate": 5.195174466241917e-06, + "loss": 0.8124, + "step": 4489 + }, + { + "epoch": 0.67, + "grad_norm": 1.3967339806132228, + "learning_rate": 5.190936273427868e-06, + "loss": 0.7988, + "step": 4490 + }, + { + "epoch": 0.67, + "grad_norm": 0.805574481492177, + "learning_rate": 5.186699204011585e-06, + "loss": 0.3301, + "step": 4491 + }, + { + "epoch": 0.67, + "grad_norm": 1.4266117525111504, + "learning_rate": 5.1824632589828465e-06, + "loss": 0.7418, + "step": 4492 + }, + { + "epoch": 0.67, + "grad_norm": 1.4342952734683052, + "learning_rate": 5.17822843933117e-06, + "loss": 0.7672, + "step": 4493 + }, + { + "epoch": 0.67, + "grad_norm": 1.3902627052428154, + "learning_rate": 5.173994746045816e-06, + "loss": 0.8372, + "step": 4494 + }, + { + "epoch": 0.67, + "grad_norm": 1.2373050781526378, + "learning_rate": 5.169762180115765e-06, + "loss": 0.8381, + "step": 4495 + }, + { + "epoch": 0.67, + "grad_norm": 1.3199151290423123, + "learning_rate": 5.16553074252975e-06, + "loss": 0.8824, + "step": 4496 + }, + { + "epoch": 0.67, + "grad_norm": 1.5321191928800468, + "learning_rate": 5.161300434276237e-06, + "loss": 0.8242, + "step": 4497 + }, + { + "epoch": 0.67, + "grad_norm": 1.3667582512440828, + "learning_rate": 5.157071256343422e-06, + "loss": 0.8292, + "step": 4498 + }, + { + "epoch": 0.67, + "grad_norm": 1.581681238709597, + "learning_rate": 5.152843209719246e-06, + "loss": 0.9121, + "step": 4499 + }, + { + "epoch": 0.67, + "grad_norm": 1.4639550005205448, + "learning_rate": 5.148616295391382e-06, + "loss": 0.8378, + "step": 4500 + }, + { + "epoch": 0.67, + "grad_norm": 1.3227331554859414, + "learning_rate": 5.1443905143472305e-06, + "loss": 0.7974, + "step": 4501 + }, + { + "epoch": 0.67, + "grad_norm": 1.505669022686916, + "learning_rate": 5.14016586757394e-06, + "loss": 0.8293, + "step": 4502 + }, + { + "epoch": 0.67, + "grad_norm": 1.4469881152042756, + "learning_rate": 5.135942356058385e-06, + "loss": 0.7211, + "step": 4503 + }, + { + "epoch": 0.67, + "grad_norm": 1.477252731389027, + "learning_rate": 5.131719980787182e-06, + "loss": 0.7417, + "step": 4504 + }, + { + "epoch": 0.67, + "grad_norm": 1.384440988626623, + "learning_rate": 5.127498742746675e-06, + "loss": 0.9194, + "step": 4505 + }, + { + "epoch": 0.67, + "grad_norm": 1.6896485232215501, + "learning_rate": 5.123278642922952e-06, + "loss": 0.9312, + "step": 4506 + }, + { + "epoch": 0.67, + "grad_norm": 1.2915419307944953, + "learning_rate": 5.119059682301819e-06, + "loss": 0.8508, + "step": 4507 + }, + { + "epoch": 0.67, + "grad_norm": 1.4212061270310063, + "learning_rate": 5.114841861868831e-06, + "loss": 0.8126, + "step": 4508 + }, + { + "epoch": 0.67, + "grad_norm": 1.468073100491747, + "learning_rate": 5.1106251826092716e-06, + "loss": 0.8181, + "step": 4509 + }, + { + "epoch": 0.67, + "grad_norm": 1.560872592998939, + "learning_rate": 5.106409645508155e-06, + "loss": 0.8251, + "step": 4510 + }, + { + "epoch": 0.67, + "grad_norm": 1.56810609641717, + "learning_rate": 5.102195251550237e-06, + "loss": 0.8889, + "step": 4511 + }, + { + "epoch": 0.67, + "grad_norm": 1.618062832301333, + "learning_rate": 5.097982001719994e-06, + "loss": 0.8631, + "step": 4512 + }, + { + "epoch": 0.67, + "grad_norm": 1.362190469445197, + "learning_rate": 5.093769897001641e-06, + "loss": 0.7091, + "step": 4513 + }, + { + "epoch": 0.67, + "grad_norm": 1.3833745464797038, + "learning_rate": 5.089558938379131e-06, + "loss": 0.8261, + "step": 4514 + }, + { + "epoch": 0.67, + "grad_norm": 1.4768480864864184, + "learning_rate": 5.085349126836141e-06, + "loss": 0.9492, + "step": 4515 + }, + { + "epoch": 0.67, + "grad_norm": 1.5444612370928807, + "learning_rate": 5.081140463356089e-06, + "loss": 0.9123, + "step": 4516 + }, + { + "epoch": 0.67, + "grad_norm": 1.5312007548237583, + "learning_rate": 5.076932948922111e-06, + "loss": 0.8498, + "step": 4517 + }, + { + "epoch": 0.67, + "grad_norm": 1.6265024417321052, + "learning_rate": 5.072726584517086e-06, + "loss": 0.8181, + "step": 4518 + }, + { + "epoch": 0.67, + "grad_norm": 1.3985544720537066, + "learning_rate": 5.068521371123622e-06, + "loss": 0.7978, + "step": 4519 + }, + { + "epoch": 0.67, + "grad_norm": 1.503437702462011, + "learning_rate": 5.064317309724057e-06, + "loss": 0.8156, + "step": 4520 + }, + { + "epoch": 0.67, + "grad_norm": 0.8016548351966756, + "learning_rate": 5.060114401300465e-06, + "loss": 0.3438, + "step": 4521 + }, + { + "epoch": 0.67, + "grad_norm": 1.506198834281312, + "learning_rate": 5.0559126468346354e-06, + "loss": 0.81, + "step": 4522 + }, + { + "epoch": 0.67, + "grad_norm": 1.5161621509729026, + "learning_rate": 5.051712047308104e-06, + "loss": 0.8009, + "step": 4523 + }, + { + "epoch": 0.68, + "grad_norm": 1.7146744535901257, + "learning_rate": 5.047512603702132e-06, + "loss": 0.8517, + "step": 4524 + }, + { + "epoch": 0.68, + "grad_norm": 1.4227024463650564, + "learning_rate": 5.043314316997709e-06, + "loss": 0.8578, + "step": 4525 + }, + { + "epoch": 0.68, + "grad_norm": 1.283323270001088, + "learning_rate": 5.039117188175556e-06, + "loss": 0.7787, + "step": 4526 + }, + { + "epoch": 0.68, + "grad_norm": 1.3692576199226896, + "learning_rate": 5.034921218216126e-06, + "loss": 0.8635, + "step": 4527 + }, + { + "epoch": 0.68, + "grad_norm": 1.4081134866816505, + "learning_rate": 5.030726408099589e-06, + "loss": 0.9368, + "step": 4528 + }, + { + "epoch": 0.68, + "grad_norm": 1.3647333884969972, + "learning_rate": 5.026532758805859e-06, + "loss": 0.8316, + "step": 4529 + }, + { + "epoch": 0.68, + "grad_norm": 1.4782823771698343, + "learning_rate": 5.022340271314572e-06, + "loss": 0.897, + "step": 4530 + }, + { + "epoch": 0.68, + "grad_norm": 1.2459189380550428, + "learning_rate": 5.018148946605092e-06, + "loss": 0.8375, + "step": 4531 + }, + { + "epoch": 0.68, + "grad_norm": 1.4245408018389651, + "learning_rate": 5.013958785656516e-06, + "loss": 0.9104, + "step": 4532 + }, + { + "epoch": 0.68, + "grad_norm": 1.3663946250477728, + "learning_rate": 5.009769789447668e-06, + "loss": 0.878, + "step": 4533 + }, + { + "epoch": 0.68, + "grad_norm": 1.4889949616649736, + "learning_rate": 5.0055819589570904e-06, + "loss": 0.8248, + "step": 4534 + }, + { + "epoch": 0.68, + "grad_norm": 1.3973172131891143, + "learning_rate": 5.001395295163065e-06, + "loss": 0.7906, + "step": 4535 + }, + { + "epoch": 0.68, + "grad_norm": 1.5180849992577252, + "learning_rate": 4.997209799043597e-06, + "loss": 0.7637, + "step": 4536 + }, + { + "epoch": 0.68, + "grad_norm": 1.3678956105381423, + "learning_rate": 4.993025471576417e-06, + "loss": 0.7792, + "step": 4537 + }, + { + "epoch": 0.68, + "grad_norm": 1.386641441124462, + "learning_rate": 4.988842313738986e-06, + "loss": 0.8597, + "step": 4538 + }, + { + "epoch": 0.68, + "grad_norm": 1.2796436909860898, + "learning_rate": 4.9846603265084935e-06, + "loss": 0.8482, + "step": 4539 + }, + { + "epoch": 0.68, + "grad_norm": 0.9104113093504237, + "learning_rate": 4.980479510861845e-06, + "loss": 0.3076, + "step": 4540 + }, + { + "epoch": 0.68, + "grad_norm": 1.5604483140035275, + "learning_rate": 4.976299867775682e-06, + "loss": 0.8138, + "step": 4541 + }, + { + "epoch": 0.68, + "grad_norm": 1.461619237196551, + "learning_rate": 4.972121398226371e-06, + "loss": 0.7858, + "step": 4542 + }, + { + "epoch": 0.68, + "grad_norm": 1.4754957724440456, + "learning_rate": 4.967944103190002e-06, + "loss": 0.8772, + "step": 4543 + }, + { + "epoch": 0.68, + "grad_norm": 1.456856612306713, + "learning_rate": 4.9637679836423926e-06, + "loss": 0.8643, + "step": 4544 + }, + { + "epoch": 0.68, + "grad_norm": 1.3935185583466247, + "learning_rate": 4.959593040559083e-06, + "loss": 0.8556, + "step": 4545 + }, + { + "epoch": 0.68, + "grad_norm": 1.5275407198267879, + "learning_rate": 4.955419274915345e-06, + "loss": 0.8872, + "step": 4546 + }, + { + "epoch": 0.68, + "grad_norm": 1.532512031594679, + "learning_rate": 4.951246687686164e-06, + "loss": 0.8451, + "step": 4547 + }, + { + "epoch": 0.68, + "grad_norm": 1.7196571795337954, + "learning_rate": 4.94707527984626e-06, + "loss": 0.8261, + "step": 4548 + }, + { + "epoch": 0.68, + "grad_norm": 0.7672563143972103, + "learning_rate": 4.942905052370073e-06, + "loss": 0.313, + "step": 4549 + }, + { + "epoch": 0.68, + "grad_norm": 1.3882736374094085, + "learning_rate": 4.938736006231769e-06, + "loss": 0.8346, + "step": 4550 + }, + { + "epoch": 0.68, + "grad_norm": 1.4891459013367765, + "learning_rate": 4.934568142405239e-06, + "loss": 0.8619, + "step": 4551 + }, + { + "epoch": 0.68, + "grad_norm": 1.3190510757828653, + "learning_rate": 4.930401461864099e-06, + "loss": 0.8991, + "step": 4552 + }, + { + "epoch": 0.68, + "grad_norm": 1.5827776490335297, + "learning_rate": 4.926235965581679e-06, + "loss": 0.8309, + "step": 4553 + }, + { + "epoch": 0.68, + "grad_norm": 1.4168386435313955, + "learning_rate": 4.922071654531043e-06, + "loss": 0.849, + "step": 4554 + }, + { + "epoch": 0.68, + "grad_norm": 1.4720349417788874, + "learning_rate": 4.917908529684975e-06, + "loss": 0.9505, + "step": 4555 + }, + { + "epoch": 0.68, + "grad_norm": 1.5480680079705795, + "learning_rate": 4.91374659201598e-06, + "loss": 0.8567, + "step": 4556 + }, + { + "epoch": 0.68, + "grad_norm": 1.2266388750898982, + "learning_rate": 4.909585842496287e-06, + "loss": 0.8331, + "step": 4557 + }, + { + "epoch": 0.68, + "grad_norm": 1.4410847302704344, + "learning_rate": 4.905426282097853e-06, + "loss": 0.8324, + "step": 4558 + }, + { + "epoch": 0.68, + "grad_norm": 1.3908886478737463, + "learning_rate": 4.9012679117923436e-06, + "loss": 0.8585, + "step": 4559 + }, + { + "epoch": 0.68, + "grad_norm": 1.4501652030516372, + "learning_rate": 4.897110732551157e-06, + "loss": 0.8974, + "step": 4560 + }, + { + "epoch": 0.68, + "grad_norm": 1.4534998879302232, + "learning_rate": 4.892954745345413e-06, + "loss": 0.8178, + "step": 4561 + }, + { + "epoch": 0.68, + "grad_norm": 1.2703453637431656, + "learning_rate": 4.888799951145948e-06, + "loss": 0.7791, + "step": 4562 + }, + { + "epoch": 0.68, + "grad_norm": 1.505063842539341, + "learning_rate": 4.884646350923323e-06, + "loss": 0.8761, + "step": 4563 + }, + { + "epoch": 0.68, + "grad_norm": 1.5996110634334908, + "learning_rate": 4.8804939456478215e-06, + "loss": 0.7712, + "step": 4564 + }, + { + "epoch": 0.68, + "grad_norm": 1.5736127727123124, + "learning_rate": 4.8763427362894475e-06, + "loss": 0.8745, + "step": 4565 + }, + { + "epoch": 0.68, + "grad_norm": 1.5032957825871684, + "learning_rate": 4.872192723817917e-06, + "loss": 0.867, + "step": 4566 + }, + { + "epoch": 0.68, + "grad_norm": 0.9536430981218512, + "learning_rate": 4.868043909202678e-06, + "loss": 0.3731, + "step": 4567 + }, + { + "epoch": 0.68, + "grad_norm": 1.628592184147086, + "learning_rate": 4.863896293412892e-06, + "loss": 0.8634, + "step": 4568 + }, + { + "epoch": 0.68, + "grad_norm": 2.913613399845025, + "learning_rate": 4.859749877417443e-06, + "loss": 0.8612, + "step": 4569 + }, + { + "epoch": 0.68, + "grad_norm": 1.3476565471593558, + "learning_rate": 4.855604662184935e-06, + "loss": 0.7867, + "step": 4570 + }, + { + "epoch": 0.68, + "grad_norm": 1.6611259870708521, + "learning_rate": 4.851460648683695e-06, + "loss": 0.8543, + "step": 4571 + }, + { + "epoch": 0.68, + "grad_norm": 1.593920873266002, + "learning_rate": 4.847317837881757e-06, + "loss": 0.7428, + "step": 4572 + }, + { + "epoch": 0.68, + "grad_norm": 1.5082126683079213, + "learning_rate": 4.843176230746883e-06, + "loss": 0.8174, + "step": 4573 + }, + { + "epoch": 0.68, + "grad_norm": 1.4511265523001697, + "learning_rate": 4.8390358282465574e-06, + "loss": 0.8506, + "step": 4574 + }, + { + "epoch": 0.68, + "grad_norm": 1.7499705614440846, + "learning_rate": 4.834896631347975e-06, + "loss": 0.864, + "step": 4575 + }, + { + "epoch": 0.68, + "grad_norm": 1.650954780990009, + "learning_rate": 4.830758641018055e-06, + "loss": 0.8253, + "step": 4576 + }, + { + "epoch": 0.68, + "grad_norm": 1.5726215735600095, + "learning_rate": 4.826621858223431e-06, + "loss": 0.8757, + "step": 4577 + }, + { + "epoch": 0.68, + "grad_norm": 1.3210203005981902, + "learning_rate": 4.822486283930461e-06, + "loss": 0.9078, + "step": 4578 + }, + { + "epoch": 0.68, + "grad_norm": 1.599523019148098, + "learning_rate": 4.818351919105207e-06, + "loss": 0.8421, + "step": 4579 + }, + { + "epoch": 0.68, + "grad_norm": 1.3370304725741007, + "learning_rate": 4.8142187647134595e-06, + "loss": 0.7919, + "step": 4580 + }, + { + "epoch": 0.68, + "grad_norm": 1.5167297350877778, + "learning_rate": 4.810086821720726e-06, + "loss": 0.9118, + "step": 4581 + }, + { + "epoch": 0.68, + "grad_norm": 1.3664144435318473, + "learning_rate": 4.805956091092228e-06, + "loss": 0.8118, + "step": 4582 + }, + { + "epoch": 0.68, + "grad_norm": 1.5749322833630273, + "learning_rate": 4.801826573792905e-06, + "loss": 0.7619, + "step": 4583 + }, + { + "epoch": 0.68, + "grad_norm": 1.4436898795166926, + "learning_rate": 4.797698270787415e-06, + "loss": 0.8065, + "step": 4584 + }, + { + "epoch": 0.68, + "grad_norm": 1.2930932085923927, + "learning_rate": 4.793571183040124e-06, + "loss": 0.8655, + "step": 4585 + }, + { + "epoch": 0.68, + "grad_norm": 1.511749942467589, + "learning_rate": 4.789445311515123e-06, + "loss": 0.7977, + "step": 4586 + }, + { + "epoch": 0.68, + "grad_norm": 1.416551855433553, + "learning_rate": 4.785320657176216e-06, + "loss": 0.8471, + "step": 4587 + }, + { + "epoch": 0.68, + "grad_norm": 1.4991309708287353, + "learning_rate": 4.7811972209869235e-06, + "loss": 0.8712, + "step": 4588 + }, + { + "epoch": 0.68, + "grad_norm": 1.4053003170737826, + "learning_rate": 4.77707500391048e-06, + "loss": 0.8534, + "step": 4589 + }, + { + "epoch": 0.68, + "grad_norm": 1.442337568757676, + "learning_rate": 4.772954006909837e-06, + "loss": 0.9233, + "step": 4590 + }, + { + "epoch": 0.69, + "grad_norm": 1.3742264582042067, + "learning_rate": 4.768834230947661e-06, + "loss": 0.8291, + "step": 4591 + }, + { + "epoch": 0.69, + "grad_norm": 1.4333504917387017, + "learning_rate": 4.764715676986327e-06, + "loss": 0.772, + "step": 4592 + }, + { + "epoch": 0.69, + "grad_norm": 1.4574227689128374, + "learning_rate": 4.760598345987931e-06, + "loss": 0.7811, + "step": 4593 + }, + { + "epoch": 0.69, + "grad_norm": 1.3944850454503117, + "learning_rate": 4.756482238914285e-06, + "loss": 0.856, + "step": 4594 + }, + { + "epoch": 0.69, + "grad_norm": 1.4009858805571114, + "learning_rate": 4.7523673567269095e-06, + "loss": 0.8674, + "step": 4595 + }, + { + "epoch": 0.69, + "grad_norm": 1.386660900386837, + "learning_rate": 4.7482537003870425e-06, + "loss": 0.8883, + "step": 4596 + }, + { + "epoch": 0.69, + "grad_norm": 1.5430515381778944, + "learning_rate": 4.744141270855638e-06, + "loss": 0.8107, + "step": 4597 + }, + { + "epoch": 0.69, + "grad_norm": 1.3471632988832563, + "learning_rate": 4.740030069093351e-06, + "loss": 0.8284, + "step": 4598 + }, + { + "epoch": 0.69, + "grad_norm": 1.3261892829533413, + "learning_rate": 4.735920096060565e-06, + "loss": 0.8505, + "step": 4599 + }, + { + "epoch": 0.69, + "grad_norm": 1.3722022600635457, + "learning_rate": 4.731811352717369e-06, + "loss": 0.8297, + "step": 4600 + }, + { + "epoch": 0.69, + "grad_norm": 1.4497870477453527, + "learning_rate": 4.727703840023566e-06, + "loss": 0.7923, + "step": 4601 + }, + { + "epoch": 0.69, + "grad_norm": 1.3970553224672155, + "learning_rate": 4.7235975589386715e-06, + "loss": 0.8887, + "step": 4602 + }, + { + "epoch": 0.69, + "grad_norm": 0.8949589253544046, + "learning_rate": 4.719492510421913e-06, + "loss": 0.3419, + "step": 4603 + }, + { + "epoch": 0.69, + "grad_norm": 1.2786837316951456, + "learning_rate": 4.715388695432232e-06, + "loss": 0.7902, + "step": 4604 + }, + { + "epoch": 0.69, + "grad_norm": 1.5366408350148864, + "learning_rate": 4.711286114928277e-06, + "loss": 0.7646, + "step": 4605 + }, + { + "epoch": 0.69, + "grad_norm": 1.5567456309321708, + "learning_rate": 4.707184769868412e-06, + "loss": 0.8615, + "step": 4606 + }, + { + "epoch": 0.69, + "grad_norm": 1.384110707474668, + "learning_rate": 4.7030846612107105e-06, + "loss": 0.7813, + "step": 4607 + }, + { + "epoch": 0.69, + "grad_norm": 1.4534471241967188, + "learning_rate": 4.6989857899129595e-06, + "loss": 0.8056, + "step": 4608 + }, + { + "epoch": 0.69, + "grad_norm": 1.2515046645075467, + "learning_rate": 4.694888156932657e-06, + "loss": 0.8657, + "step": 4609 + }, + { + "epoch": 0.69, + "grad_norm": 1.185863418049476, + "learning_rate": 4.690791763227014e-06, + "loss": 0.826, + "step": 4610 + }, + { + "epoch": 0.69, + "grad_norm": 1.441683371713312, + "learning_rate": 4.68669660975294e-06, + "loss": 0.8537, + "step": 4611 + }, + { + "epoch": 0.69, + "grad_norm": 1.412766075049699, + "learning_rate": 4.6826026974670665e-06, + "loss": 0.8745, + "step": 4612 + }, + { + "epoch": 0.69, + "grad_norm": 1.4241818680646237, + "learning_rate": 4.6785100273257335e-06, + "loss": 0.7919, + "step": 4613 + }, + { + "epoch": 0.69, + "grad_norm": 1.3799514787169287, + "learning_rate": 4.674418600284988e-06, + "loss": 0.7948, + "step": 4614 + }, + { + "epoch": 0.69, + "grad_norm": 1.245829465995908, + "learning_rate": 4.670328417300588e-06, + "loss": 0.7695, + "step": 4615 + }, + { + "epoch": 0.69, + "grad_norm": 1.5035562101700672, + "learning_rate": 4.666239479328004e-06, + "loss": 0.8887, + "step": 4616 + }, + { + "epoch": 0.69, + "grad_norm": 1.4349659938211743, + "learning_rate": 4.662151787322405e-06, + "loss": 0.7622, + "step": 4617 + }, + { + "epoch": 0.69, + "grad_norm": 1.4942491548054193, + "learning_rate": 4.658065342238681e-06, + "loss": 0.8055, + "step": 4618 + }, + { + "epoch": 0.69, + "grad_norm": 1.4261721754426413, + "learning_rate": 4.653980145031425e-06, + "loss": 0.8548, + "step": 4619 + }, + { + "epoch": 0.69, + "grad_norm": 1.36462012444893, + "learning_rate": 4.64989619665494e-06, + "loss": 0.7615, + "step": 4620 + }, + { + "epoch": 0.69, + "grad_norm": 1.2236799625284607, + "learning_rate": 4.645813498063235e-06, + "loss": 0.8346, + "step": 4621 + }, + { + "epoch": 0.69, + "grad_norm": 1.6040389836625868, + "learning_rate": 4.641732050210032e-06, + "loss": 0.8527, + "step": 4622 + }, + { + "epoch": 0.69, + "grad_norm": 1.667602740115268, + "learning_rate": 4.637651854048759e-06, + "loss": 0.7332, + "step": 4623 + }, + { + "epoch": 0.69, + "grad_norm": 1.3652476404225031, + "learning_rate": 4.633572910532543e-06, + "loss": 0.7827, + "step": 4624 + }, + { + "epoch": 0.69, + "grad_norm": 1.2873324821688368, + "learning_rate": 4.62949522061423e-06, + "loss": 0.8192, + "step": 4625 + }, + { + "epoch": 0.69, + "grad_norm": 1.587948321834436, + "learning_rate": 4.6254187852463685e-06, + "loss": 0.8425, + "step": 4626 + }, + { + "epoch": 0.69, + "grad_norm": 1.0106800101685143, + "learning_rate": 4.621343605381215e-06, + "loss": 0.3402, + "step": 4627 + }, + { + "epoch": 0.69, + "grad_norm": 1.3883844177347062, + "learning_rate": 4.61726968197073e-06, + "loss": 0.8996, + "step": 4628 + }, + { + "epoch": 0.69, + "grad_norm": 1.4410600362744985, + "learning_rate": 4.613197015966587e-06, + "loss": 0.8247, + "step": 4629 + }, + { + "epoch": 0.69, + "grad_norm": 1.4033930885063715, + "learning_rate": 4.609125608320154e-06, + "loss": 0.8112, + "step": 4630 + }, + { + "epoch": 0.69, + "grad_norm": 1.3584792142610111, + "learning_rate": 4.605055459982517e-06, + "loss": 0.9075, + "step": 4631 + }, + { + "epoch": 0.69, + "grad_norm": 1.5072101207536828, + "learning_rate": 4.600986571904461e-06, + "loss": 0.7514, + "step": 4632 + }, + { + "epoch": 0.69, + "grad_norm": 1.4723419099347086, + "learning_rate": 4.5969189450364804e-06, + "loss": 0.8128, + "step": 4633 + }, + { + "epoch": 0.69, + "grad_norm": 1.4686786946275787, + "learning_rate": 4.592852580328775e-06, + "loss": 0.8374, + "step": 4634 + }, + { + "epoch": 0.69, + "grad_norm": 1.4389864722866783, + "learning_rate": 4.588787478731242e-06, + "loss": 0.7918, + "step": 4635 + }, + { + "epoch": 0.69, + "grad_norm": 1.3863671528666628, + "learning_rate": 4.5847236411934945e-06, + "loss": 0.855, + "step": 4636 + }, + { + "epoch": 0.69, + "grad_norm": 1.4385407123388714, + "learning_rate": 4.580661068664844e-06, + "loss": 0.8695, + "step": 4637 + }, + { + "epoch": 0.69, + "grad_norm": 1.33760559150418, + "learning_rate": 4.576599762094306e-06, + "loss": 0.9329, + "step": 4638 + }, + { + "epoch": 0.69, + "grad_norm": 1.575854801348923, + "learning_rate": 4.5725397224306076e-06, + "loss": 0.8426, + "step": 4639 + }, + { + "epoch": 0.69, + "grad_norm": 1.5345019136284757, + "learning_rate": 4.568480950622169e-06, + "loss": 0.7612, + "step": 4640 + }, + { + "epoch": 0.69, + "grad_norm": 1.413658351733374, + "learning_rate": 4.56442344761712e-06, + "loss": 0.8526, + "step": 4641 + }, + { + "epoch": 0.69, + "grad_norm": 1.6060934881575104, + "learning_rate": 4.560367214363295e-06, + "loss": 0.8299, + "step": 4642 + }, + { + "epoch": 0.69, + "grad_norm": 1.430545619058654, + "learning_rate": 4.556312251808232e-06, + "loss": 0.8646, + "step": 4643 + }, + { + "epoch": 0.69, + "grad_norm": 1.4653842653589146, + "learning_rate": 4.552258560899171e-06, + "loss": 0.808, + "step": 4644 + }, + { + "epoch": 0.69, + "grad_norm": 1.428059233094383, + "learning_rate": 4.5482061425830504e-06, + "loss": 0.7432, + "step": 4645 + }, + { + "epoch": 0.69, + "grad_norm": 1.65868272854902, + "learning_rate": 4.5441549978065166e-06, + "loss": 0.8384, + "step": 4646 + }, + { + "epoch": 0.69, + "grad_norm": 1.5749764364283159, + "learning_rate": 4.540105127515921e-06, + "loss": 0.8385, + "step": 4647 + }, + { + "epoch": 0.69, + "grad_norm": 1.4863594831349358, + "learning_rate": 4.53605653265731e-06, + "loss": 0.7843, + "step": 4648 + }, + { + "epoch": 0.69, + "grad_norm": 1.5667757379774738, + "learning_rate": 4.532009214176438e-06, + "loss": 0.712, + "step": 4649 + }, + { + "epoch": 0.69, + "grad_norm": 1.4829015263672782, + "learning_rate": 4.527963173018762e-06, + "loss": 0.7888, + "step": 4650 + }, + { + "epoch": 0.69, + "grad_norm": 1.4465025339215238, + "learning_rate": 4.52391841012943e-06, + "loss": 0.7083, + "step": 4651 + }, + { + "epoch": 0.69, + "grad_norm": 1.405534162716725, + "learning_rate": 4.519874926453303e-06, + "loss": 0.8239, + "step": 4652 + }, + { + "epoch": 0.69, + "grad_norm": 1.5563763462334577, + "learning_rate": 4.515832722934939e-06, + "loss": 0.8349, + "step": 4653 + }, + { + "epoch": 0.69, + "grad_norm": 1.5581598143871236, + "learning_rate": 4.511791800518596e-06, + "loss": 0.8495, + "step": 4654 + }, + { + "epoch": 0.69, + "grad_norm": 1.4292397488450292, + "learning_rate": 4.507752160148241e-06, + "loss": 0.7418, + "step": 4655 + }, + { + "epoch": 0.69, + "grad_norm": 0.8932544506107188, + "learning_rate": 4.503713802767523e-06, + "loss": 0.3297, + "step": 4656 + }, + { + "epoch": 0.69, + "grad_norm": 1.4965378261590467, + "learning_rate": 4.499676729319809e-06, + "loss": 0.8386, + "step": 4657 + }, + { + "epoch": 0.7, + "grad_norm": 1.515165323177369, + "learning_rate": 4.495640940748159e-06, + "loss": 0.8873, + "step": 4658 + }, + { + "epoch": 0.7, + "grad_norm": 1.439888403102316, + "learning_rate": 4.491606437995335e-06, + "loss": 0.8209, + "step": 4659 + }, + { + "epoch": 0.7, + "grad_norm": 1.3864125397241902, + "learning_rate": 4.4875732220037935e-06, + "loss": 0.8238, + "step": 4660 + }, + { + "epoch": 0.7, + "grad_norm": 1.4600354363816694, + "learning_rate": 4.483541293715699e-06, + "loss": 0.8752, + "step": 4661 + }, + { + "epoch": 0.7, + "grad_norm": 1.5157005313359344, + "learning_rate": 4.479510654072909e-06, + "loss": 0.8415, + "step": 4662 + }, + { + "epoch": 0.7, + "grad_norm": 1.6642532675826205, + "learning_rate": 4.475481304016978e-06, + "loss": 0.8695, + "step": 4663 + }, + { + "epoch": 0.7, + "grad_norm": 1.2668533571239595, + "learning_rate": 4.471453244489164e-06, + "loss": 0.8312, + "step": 4664 + }, + { + "epoch": 0.7, + "grad_norm": 1.5066316284393733, + "learning_rate": 4.467426476430423e-06, + "loss": 0.9041, + "step": 4665 + }, + { + "epoch": 0.7, + "grad_norm": 1.5060708000202778, + "learning_rate": 4.463401000781409e-06, + "loss": 0.7757, + "step": 4666 + }, + { + "epoch": 0.7, + "grad_norm": 1.3390542785816126, + "learning_rate": 4.459376818482471e-06, + "loss": 0.7843, + "step": 4667 + }, + { + "epoch": 0.7, + "grad_norm": 1.436900641049538, + "learning_rate": 4.455353930473666e-06, + "loss": 0.8594, + "step": 4668 + }, + { + "epoch": 0.7, + "grad_norm": 1.6462740588457703, + "learning_rate": 4.4513323376947304e-06, + "loss": 0.8773, + "step": 4669 + }, + { + "epoch": 0.7, + "grad_norm": 1.4759910823819076, + "learning_rate": 4.447312041085113e-06, + "loss": 0.8712, + "step": 4670 + }, + { + "epoch": 0.7, + "grad_norm": 1.4827403981573728, + "learning_rate": 4.443293041583957e-06, + "loss": 0.8468, + "step": 4671 + }, + { + "epoch": 0.7, + "grad_norm": 1.396443301710576, + "learning_rate": 4.439275340130099e-06, + "loss": 0.8089, + "step": 4672 + }, + { + "epoch": 0.7, + "grad_norm": 1.2549821331929478, + "learning_rate": 4.435258937662076e-06, + "loss": 0.7941, + "step": 4673 + }, + { + "epoch": 0.7, + "grad_norm": 1.4418495124689839, + "learning_rate": 4.4312438351181246e-06, + "loss": 0.784, + "step": 4674 + }, + { + "epoch": 0.7, + "grad_norm": 1.7972635512494446, + "learning_rate": 4.4272300334361616e-06, + "loss": 0.8095, + "step": 4675 + }, + { + "epoch": 0.7, + "grad_norm": 1.4403862723580476, + "learning_rate": 4.42321753355382e-06, + "loss": 0.9377, + "step": 4676 + }, + { + "epoch": 0.7, + "grad_norm": 1.6161400482673711, + "learning_rate": 4.419206336408418e-06, + "loss": 0.8809, + "step": 4677 + }, + { + "epoch": 0.7, + "grad_norm": 1.2982877338535375, + "learning_rate": 4.415196442936971e-06, + "loss": 0.759, + "step": 4678 + }, + { + "epoch": 0.7, + "grad_norm": 1.3484505620701703, + "learning_rate": 4.411187854076192e-06, + "loss": 0.8095, + "step": 4679 + }, + { + "epoch": 0.7, + "grad_norm": 1.3972446506196152, + "learning_rate": 4.407180570762486e-06, + "loss": 0.8509, + "step": 4680 + }, + { + "epoch": 0.7, + "grad_norm": 0.9182674315182678, + "learning_rate": 4.40317459393196e-06, + "loss": 0.2946, + "step": 4681 + }, + { + "epoch": 0.7, + "grad_norm": 1.45912375033366, + "learning_rate": 4.399169924520403e-06, + "loss": 0.8808, + "step": 4682 + }, + { + "epoch": 0.7, + "grad_norm": 1.3372329181206644, + "learning_rate": 4.39516656346331e-06, + "loss": 0.8189, + "step": 4683 + }, + { + "epoch": 0.7, + "grad_norm": 1.6867213837115818, + "learning_rate": 4.391164511695866e-06, + "loss": 0.8303, + "step": 4684 + }, + { + "epoch": 0.7, + "grad_norm": 1.3696883979157333, + "learning_rate": 4.38716377015295e-06, + "loss": 0.8647, + "step": 4685 + }, + { + "epoch": 0.7, + "grad_norm": 1.3683280308809131, + "learning_rate": 4.383164339769137e-06, + "loss": 0.7426, + "step": 4686 + }, + { + "epoch": 0.7, + "grad_norm": 1.6572007765643664, + "learning_rate": 4.379166221478697e-06, + "loss": 0.9312, + "step": 4687 + }, + { + "epoch": 0.7, + "grad_norm": 1.3925392846339484, + "learning_rate": 4.375169416215584e-06, + "loss": 0.7916, + "step": 4688 + }, + { + "epoch": 0.7, + "grad_norm": 0.8757613192986392, + "learning_rate": 4.371173924913457e-06, + "loss": 0.3176, + "step": 4689 + }, + { + "epoch": 0.7, + "grad_norm": 1.4637444923799754, + "learning_rate": 4.36717974850566e-06, + "loss": 0.8318, + "step": 4690 + }, + { + "epoch": 0.7, + "grad_norm": 1.7819713138015125, + "learning_rate": 4.363186887925236e-06, + "loss": 0.8466, + "step": 4691 + }, + { + "epoch": 0.7, + "grad_norm": 1.4104984652806851, + "learning_rate": 4.359195344104916e-06, + "loss": 0.7462, + "step": 4692 + }, + { + "epoch": 0.7, + "grad_norm": 1.5619620124179776, + "learning_rate": 4.355205117977126e-06, + "loss": 0.8214, + "step": 4693 + }, + { + "epoch": 0.7, + "grad_norm": 1.4586233537527196, + "learning_rate": 4.351216210473986e-06, + "loss": 0.7839, + "step": 4694 + }, + { + "epoch": 0.7, + "grad_norm": 1.6979397152850684, + "learning_rate": 4.3472286225272995e-06, + "loss": 0.829, + "step": 4695 + }, + { + "epoch": 0.7, + "grad_norm": 1.3617905750744985, + "learning_rate": 4.343242355068569e-06, + "loss": 0.8012, + "step": 4696 + }, + { + "epoch": 0.7, + "grad_norm": 1.4182092000368562, + "learning_rate": 4.339257409028987e-06, + "loss": 0.8156, + "step": 4697 + }, + { + "epoch": 0.7, + "grad_norm": 1.424496608891188, + "learning_rate": 4.33527378533944e-06, + "loss": 0.8184, + "step": 4698 + }, + { + "epoch": 0.7, + "grad_norm": 1.455989477998742, + "learning_rate": 4.3312914849305e-06, + "loss": 0.819, + "step": 4699 + }, + { + "epoch": 0.7, + "grad_norm": 1.5782519822918681, + "learning_rate": 4.3273105087324375e-06, + "loss": 0.8165, + "step": 4700 + }, + { + "epoch": 0.7, + "grad_norm": 1.3999424919648857, + "learning_rate": 4.323330857675202e-06, + "loss": 0.8229, + "step": 4701 + }, + { + "epoch": 0.7, + "grad_norm": 1.534057270534777, + "learning_rate": 4.319352532688444e-06, + "loss": 0.8212, + "step": 4702 + }, + { + "epoch": 0.7, + "grad_norm": 1.4408446519193885, + "learning_rate": 4.315375534701499e-06, + "loss": 0.7798, + "step": 4703 + }, + { + "epoch": 0.7, + "grad_norm": 1.2941510522769486, + "learning_rate": 4.311399864643396e-06, + "loss": 0.8171, + "step": 4704 + }, + { + "epoch": 0.7, + "grad_norm": 1.4795992991888256, + "learning_rate": 4.307425523442852e-06, + "loss": 0.7836, + "step": 4705 + }, + { + "epoch": 0.7, + "grad_norm": 1.6663074617930316, + "learning_rate": 4.303452512028272e-06, + "loss": 0.6779, + "step": 4706 + }, + { + "epoch": 0.7, + "grad_norm": 1.3665190506750673, + "learning_rate": 4.2994808313277565e-06, + "loss": 0.8058, + "step": 4707 + }, + { + "epoch": 0.7, + "grad_norm": 1.4618797234507428, + "learning_rate": 4.295510482269083e-06, + "loss": 0.8783, + "step": 4708 + }, + { + "epoch": 0.7, + "grad_norm": 1.5481642831369997, + "learning_rate": 4.29154146577973e-06, + "loss": 0.8546, + "step": 4709 + }, + { + "epoch": 0.7, + "grad_norm": 1.43559915648515, + "learning_rate": 4.287573782786858e-06, + "loss": 0.8495, + "step": 4710 + }, + { + "epoch": 0.7, + "grad_norm": 1.6092210621960257, + "learning_rate": 4.2836074342173195e-06, + "loss": 0.8064, + "step": 4711 + }, + { + "epoch": 0.7, + "grad_norm": 1.5309822909220474, + "learning_rate": 4.279642420997655e-06, + "loss": 0.8266, + "step": 4712 + }, + { + "epoch": 0.7, + "grad_norm": 1.362953973360087, + "learning_rate": 4.275678744054094e-06, + "loss": 0.8767, + "step": 4713 + }, + { + "epoch": 0.7, + "grad_norm": 1.4498389862743661, + "learning_rate": 4.271716404312545e-06, + "loss": 0.8809, + "step": 4714 + }, + { + "epoch": 0.7, + "grad_norm": 1.532173708945647, + "learning_rate": 4.267755402698613e-06, + "loss": 0.7707, + "step": 4715 + }, + { + "epoch": 0.7, + "grad_norm": 1.3243116948190425, + "learning_rate": 4.263795740137592e-06, + "loss": 0.8274, + "step": 4716 + }, + { + "epoch": 0.7, + "grad_norm": 1.476025755020728, + "learning_rate": 4.259837417554457e-06, + "loss": 0.816, + "step": 4717 + }, + { + "epoch": 0.7, + "grad_norm": 1.4079421882295637, + "learning_rate": 4.2558804358738725e-06, + "loss": 0.7988, + "step": 4718 + }, + { + "epoch": 0.7, + "grad_norm": 1.4452387256093888, + "learning_rate": 4.251924796020191e-06, + "loss": 0.8519, + "step": 4719 + }, + { + "epoch": 0.7, + "grad_norm": 1.7772103385319755, + "learning_rate": 4.247970498917453e-06, + "loss": 0.9175, + "step": 4720 + }, + { + "epoch": 0.7, + "grad_norm": 1.4318473583385691, + "learning_rate": 4.244017545489376e-06, + "loss": 0.7419, + "step": 4721 + }, + { + "epoch": 0.7, + "grad_norm": 1.5810543046165035, + "learning_rate": 4.240065936659374e-06, + "loss": 0.7995, + "step": 4722 + }, + { + "epoch": 0.7, + "grad_norm": 1.5578898439366653, + "learning_rate": 4.236115673350544e-06, + "loss": 0.886, + "step": 4723 + }, + { + "epoch": 0.7, + "grad_norm": 1.4437783475875456, + "learning_rate": 4.232166756485665e-06, + "loss": 0.869, + "step": 4724 + }, + { + "epoch": 0.71, + "grad_norm": 1.4534665290887891, + "learning_rate": 4.228219186987207e-06, + "loss": 0.8314, + "step": 4725 + }, + { + "epoch": 0.71, + "grad_norm": 1.5232528226128734, + "learning_rate": 4.224272965777326e-06, + "loss": 0.833, + "step": 4726 + }, + { + "epoch": 0.71, + "grad_norm": 1.3574944236633812, + "learning_rate": 4.220328093777851e-06, + "loss": 0.8731, + "step": 4727 + }, + { + "epoch": 0.71, + "grad_norm": 1.363372775068747, + "learning_rate": 4.2163845719103106e-06, + "loss": 0.7636, + "step": 4728 + }, + { + "epoch": 0.71, + "grad_norm": 1.419879949441352, + "learning_rate": 4.212442401095908e-06, + "loss": 0.8024, + "step": 4729 + }, + { + "epoch": 0.71, + "grad_norm": 1.5478196558988124, + "learning_rate": 4.20850158225554e-06, + "loss": 0.8649, + "step": 4730 + }, + { + "epoch": 0.71, + "grad_norm": 1.6587733556465767, + "learning_rate": 4.2045621163097775e-06, + "loss": 0.859, + "step": 4731 + }, + { + "epoch": 0.71, + "grad_norm": 1.413761817593304, + "learning_rate": 4.200624004178883e-06, + "loss": 0.8101, + "step": 4732 + }, + { + "epoch": 0.71, + "grad_norm": 1.332535731383828, + "learning_rate": 4.196687246782801e-06, + "loss": 0.8221, + "step": 4733 + }, + { + "epoch": 0.71, + "grad_norm": 0.9546101248457464, + "learning_rate": 4.192751845041153e-06, + "loss": 0.3299, + "step": 4734 + }, + { + "epoch": 0.71, + "grad_norm": 1.5541536544415093, + "learning_rate": 4.188817799873254e-06, + "loss": 0.7875, + "step": 4735 + }, + { + "epoch": 0.71, + "grad_norm": 1.377765673467641, + "learning_rate": 4.184885112198094e-06, + "loss": 0.8193, + "step": 4736 + }, + { + "epoch": 0.71, + "grad_norm": 1.361601929925436, + "learning_rate": 4.180953782934352e-06, + "loss": 0.7177, + "step": 4737 + }, + { + "epoch": 0.71, + "grad_norm": 1.5278395496840536, + "learning_rate": 4.177023813000386e-06, + "loss": 0.8795, + "step": 4738 + }, + { + "epoch": 0.71, + "grad_norm": 1.4926864272996119, + "learning_rate": 4.173095203314241e-06, + "loss": 0.8539, + "step": 4739 + }, + { + "epoch": 0.71, + "grad_norm": 1.731921820523921, + "learning_rate": 4.169167954793633e-06, + "loss": 0.8566, + "step": 4740 + }, + { + "epoch": 0.71, + "grad_norm": 0.8371866827097895, + "learning_rate": 4.1652420683559725e-06, + "loss": 0.334, + "step": 4741 + }, + { + "epoch": 0.71, + "grad_norm": 0.8822856624435376, + "learning_rate": 4.161317544918345e-06, + "loss": 0.3251, + "step": 4742 + }, + { + "epoch": 0.71, + "grad_norm": 1.3531579402367617, + "learning_rate": 4.157394385397521e-06, + "loss": 0.8454, + "step": 4743 + }, + { + "epoch": 0.71, + "grad_norm": 1.585947068425145, + "learning_rate": 4.153472590709951e-06, + "loss": 0.8922, + "step": 4744 + }, + { + "epoch": 0.71, + "grad_norm": 1.4739895838085741, + "learning_rate": 4.1495521617717695e-06, + "loss": 0.934, + "step": 4745 + }, + { + "epoch": 0.71, + "grad_norm": 1.2765781203485074, + "learning_rate": 4.145633099498783e-06, + "loss": 0.8586, + "step": 4746 + }, + { + "epoch": 0.71, + "grad_norm": 1.4175262312379457, + "learning_rate": 4.141715404806486e-06, + "loss": 0.7706, + "step": 4747 + }, + { + "epoch": 0.71, + "grad_norm": 1.3290769573347403, + "learning_rate": 4.137799078610055e-06, + "loss": 0.8767, + "step": 4748 + }, + { + "epoch": 0.71, + "grad_norm": 1.446959698733549, + "learning_rate": 4.133884121824344e-06, + "loss": 0.8162, + "step": 4749 + }, + { + "epoch": 0.71, + "grad_norm": 1.7136313246230725, + "learning_rate": 4.129970535363885e-06, + "loss": 0.8187, + "step": 4750 + }, + { + "epoch": 0.71, + "grad_norm": 0.9308276498009022, + "learning_rate": 4.126058320142895e-06, + "loss": 0.32, + "step": 4751 + }, + { + "epoch": 0.71, + "grad_norm": 1.5418527357692045, + "learning_rate": 4.12214747707527e-06, + "loss": 0.8637, + "step": 4752 + }, + { + "epoch": 0.71, + "grad_norm": 1.5596672032785333, + "learning_rate": 4.1182380070745755e-06, + "loss": 0.8271, + "step": 4753 + }, + { + "epoch": 0.71, + "grad_norm": 1.529606549684988, + "learning_rate": 4.114329911054069e-06, + "loss": 0.8066, + "step": 4754 + }, + { + "epoch": 0.71, + "grad_norm": 1.5967319561630282, + "learning_rate": 4.110423189926682e-06, + "loss": 0.7884, + "step": 4755 + }, + { + "epoch": 0.71, + "grad_norm": 1.5521734226059614, + "learning_rate": 4.106517844605023e-06, + "loss": 0.8523, + "step": 4756 + }, + { + "epoch": 0.71, + "grad_norm": 1.4287801655380412, + "learning_rate": 4.1026138760013886e-06, + "loss": 0.8229, + "step": 4757 + }, + { + "epoch": 0.71, + "grad_norm": 1.4106015522009068, + "learning_rate": 4.098711285027736e-06, + "loss": 0.7808, + "step": 4758 + }, + { + "epoch": 0.71, + "grad_norm": 1.4666396817475729, + "learning_rate": 4.094810072595714e-06, + "loss": 0.8313, + "step": 4759 + }, + { + "epoch": 0.71, + "grad_norm": 1.4927494148209086, + "learning_rate": 4.090910239616648e-06, + "loss": 0.8182, + "step": 4760 + }, + { + "epoch": 0.71, + "grad_norm": 1.3403735295109507, + "learning_rate": 4.087011787001538e-06, + "loss": 0.7562, + "step": 4761 + }, + { + "epoch": 0.71, + "grad_norm": 1.5008674785460179, + "learning_rate": 4.083114715661069e-06, + "loss": 0.7528, + "step": 4762 + }, + { + "epoch": 0.71, + "grad_norm": 1.5130862936844511, + "learning_rate": 4.079219026505586e-06, + "loss": 0.832, + "step": 4763 + }, + { + "epoch": 0.71, + "grad_norm": 1.5675065725552264, + "learning_rate": 4.07532472044513e-06, + "loss": 0.7786, + "step": 4764 + }, + { + "epoch": 0.71, + "grad_norm": 1.454215364889777, + "learning_rate": 4.071431798389408e-06, + "loss": 0.8455, + "step": 4765 + }, + { + "epoch": 0.71, + "grad_norm": 1.5021169353639472, + "learning_rate": 4.0675402612478095e-06, + "loss": 0.8277, + "step": 4766 + }, + { + "epoch": 0.71, + "grad_norm": 1.4966154832197276, + "learning_rate": 4.0636501099294e-06, + "loss": 0.7536, + "step": 4767 + }, + { + "epoch": 0.71, + "grad_norm": 1.6086826252898843, + "learning_rate": 4.059761345342913e-06, + "loss": 0.8441, + "step": 4768 + }, + { + "epoch": 0.71, + "grad_norm": 1.4273763467326928, + "learning_rate": 4.055873968396767e-06, + "loss": 0.9098, + "step": 4769 + }, + { + "epoch": 0.71, + "grad_norm": 1.4487811363521983, + "learning_rate": 4.051987979999056e-06, + "loss": 0.804, + "step": 4770 + }, + { + "epoch": 0.71, + "grad_norm": 1.674378791908379, + "learning_rate": 4.048103381057543e-06, + "loss": 0.8816, + "step": 4771 + }, + { + "epoch": 0.71, + "grad_norm": 1.3597268972714522, + "learning_rate": 4.044220172479675e-06, + "loss": 0.7997, + "step": 4772 + }, + { + "epoch": 0.71, + "grad_norm": 1.3826802885569884, + "learning_rate": 4.040338355172571e-06, + "loss": 0.8142, + "step": 4773 + }, + { + "epoch": 0.71, + "grad_norm": 1.9760999015720804, + "learning_rate": 4.0364579300430164e-06, + "loss": 0.8612, + "step": 4774 + }, + { + "epoch": 0.71, + "grad_norm": 1.5191566335247086, + "learning_rate": 4.032578897997485e-06, + "loss": 0.7454, + "step": 4775 + }, + { + "epoch": 0.71, + "grad_norm": 1.552452624447766, + "learning_rate": 4.028701259942116e-06, + "loss": 0.8588, + "step": 4776 + }, + { + "epoch": 0.71, + "grad_norm": 1.2612519758977432, + "learning_rate": 4.024825016782727e-06, + "loss": 0.7941, + "step": 4777 + }, + { + "epoch": 0.71, + "grad_norm": 1.3981196958767816, + "learning_rate": 4.020950169424815e-06, + "loss": 0.7622, + "step": 4778 + }, + { + "epoch": 0.71, + "grad_norm": 1.3804476999751516, + "learning_rate": 4.017076718773535e-06, + "loss": 0.832, + "step": 4779 + }, + { + "epoch": 0.71, + "grad_norm": 1.3309790715032808, + "learning_rate": 4.013204665733729e-06, + "loss": 0.7547, + "step": 4780 + }, + { + "epoch": 0.71, + "grad_norm": 1.3757551771852794, + "learning_rate": 4.009334011209909e-06, + "loss": 0.8043, + "step": 4781 + }, + { + "epoch": 0.71, + "grad_norm": 1.3654193357223396, + "learning_rate": 4.0054647561062625e-06, + "loss": 0.7933, + "step": 4782 + }, + { + "epoch": 0.71, + "grad_norm": 1.4864574594156503, + "learning_rate": 4.001596901326644e-06, + "loss": 0.799, + "step": 4783 + }, + { + "epoch": 0.71, + "grad_norm": 1.482259758464327, + "learning_rate": 3.997730447774591e-06, + "loss": 0.9302, + "step": 4784 + }, + { + "epoch": 0.71, + "grad_norm": 1.4225989636571883, + "learning_rate": 3.9938653963533e-06, + "loss": 0.8264, + "step": 4785 + }, + { + "epoch": 0.71, + "grad_norm": 1.4878184307287057, + "learning_rate": 3.990001747965652e-06, + "loss": 0.8491, + "step": 4786 + }, + { + "epoch": 0.71, + "grad_norm": 1.3759775364793156, + "learning_rate": 3.9861395035141936e-06, + "loss": 0.8779, + "step": 4787 + }, + { + "epoch": 0.71, + "grad_norm": 1.5151635812092463, + "learning_rate": 3.982278663901146e-06, + "loss": 0.7878, + "step": 4788 + }, + { + "epoch": 0.71, + "grad_norm": 1.194047763646673, + "learning_rate": 3.978419230028402e-06, + "loss": 0.7804, + "step": 4789 + }, + { + "epoch": 0.71, + "grad_norm": 1.4074109914243975, + "learning_rate": 3.974561202797525e-06, + "loss": 0.7491, + "step": 4790 + }, + { + "epoch": 0.71, + "grad_norm": 1.321865521333858, + "learning_rate": 3.970704583109755e-06, + "loss": 0.8734, + "step": 4791 + }, + { + "epoch": 0.72, + "grad_norm": 1.466675656207691, + "learning_rate": 3.9668493718659924e-06, + "loss": 0.7656, + "step": 4792 + }, + { + "epoch": 0.72, + "grad_norm": 1.3905945966108024, + "learning_rate": 3.962995569966817e-06, + "loss": 0.8681, + "step": 4793 + }, + { + "epoch": 0.72, + "grad_norm": 1.8481108003508808, + "learning_rate": 3.9591431783124786e-06, + "loss": 0.8089, + "step": 4794 + }, + { + "epoch": 0.72, + "grad_norm": 1.5345186500111654, + "learning_rate": 3.955292197802895e-06, + "loss": 0.786, + "step": 4795 + }, + { + "epoch": 0.72, + "grad_norm": 1.3616600576042024, + "learning_rate": 3.951442629337657e-06, + "loss": 0.8063, + "step": 4796 + }, + { + "epoch": 0.72, + "grad_norm": 1.4456912188901643, + "learning_rate": 3.947594473816026e-06, + "loss": 0.8064, + "step": 4797 + }, + { + "epoch": 0.72, + "grad_norm": 1.471979114571703, + "learning_rate": 3.943747732136925e-06, + "loss": 0.7711, + "step": 4798 + }, + { + "epoch": 0.72, + "grad_norm": 1.4492812965756292, + "learning_rate": 3.939902405198959e-06, + "loss": 0.8662, + "step": 4799 + }, + { + "epoch": 0.72, + "grad_norm": 1.3353068567023287, + "learning_rate": 3.936058493900393e-06, + "loss": 0.8853, + "step": 4800 + }, + { + "epoch": 0.72, + "grad_norm": 1.5290175325305821, + "learning_rate": 3.932215999139167e-06, + "loss": 0.7905, + "step": 4801 + }, + { + "epoch": 0.72, + "grad_norm": 1.4040433445236589, + "learning_rate": 3.9283749218128885e-06, + "loss": 0.8155, + "step": 4802 + }, + { + "epoch": 0.72, + "grad_norm": 1.46126270881956, + "learning_rate": 3.924535262818836e-06, + "loss": 0.8694, + "step": 4803 + }, + { + "epoch": 0.72, + "grad_norm": 1.632287188494207, + "learning_rate": 3.920697023053949e-06, + "loss": 0.8303, + "step": 4804 + }, + { + "epoch": 0.72, + "grad_norm": 1.3928589169948147, + "learning_rate": 3.916860203414843e-06, + "loss": 0.8813, + "step": 4805 + }, + { + "epoch": 0.72, + "grad_norm": 1.397834508641805, + "learning_rate": 3.913024804797798e-06, + "loss": 0.8743, + "step": 4806 + }, + { + "epoch": 0.72, + "grad_norm": 1.512113429699455, + "learning_rate": 3.909190828098766e-06, + "loss": 0.784, + "step": 4807 + }, + { + "epoch": 0.72, + "grad_norm": 1.4448060836762557, + "learning_rate": 3.905358274213363e-06, + "loss": 0.7627, + "step": 4808 + }, + { + "epoch": 0.72, + "grad_norm": 1.3835596663047998, + "learning_rate": 3.901527144036875e-06, + "loss": 0.9225, + "step": 4809 + }, + { + "epoch": 0.72, + "grad_norm": 1.3675386495594217, + "learning_rate": 3.897697438464256e-06, + "loss": 0.8488, + "step": 4810 + }, + { + "epoch": 0.72, + "grad_norm": 1.4752544122799878, + "learning_rate": 3.893869158390121e-06, + "loss": 0.8653, + "step": 4811 + }, + { + "epoch": 0.72, + "grad_norm": 1.5770190289200632, + "learning_rate": 3.890042304708758e-06, + "loss": 0.8394, + "step": 4812 + }, + { + "epoch": 0.72, + "grad_norm": 1.874338005345734, + "learning_rate": 3.886216878314122e-06, + "loss": 0.9145, + "step": 4813 + }, + { + "epoch": 0.72, + "grad_norm": 1.3081891252765854, + "learning_rate": 3.882392880099832e-06, + "loss": 0.8998, + "step": 4814 + }, + { + "epoch": 0.72, + "grad_norm": 1.5232707030122634, + "learning_rate": 3.878570310959175e-06, + "loss": 0.8172, + "step": 4815 + }, + { + "epoch": 0.72, + "grad_norm": 1.4112825499027488, + "learning_rate": 3.874749171785106e-06, + "loss": 0.8087, + "step": 4816 + }, + { + "epoch": 0.72, + "grad_norm": 1.3860203911991953, + "learning_rate": 3.8709294634702374e-06, + "loss": 0.8032, + "step": 4817 + }, + { + "epoch": 0.72, + "grad_norm": 1.4114405702309578, + "learning_rate": 3.867111186906857e-06, + "loss": 0.8686, + "step": 4818 + }, + { + "epoch": 0.72, + "grad_norm": 1.5032844688609845, + "learning_rate": 3.8632943429869145e-06, + "loss": 0.7949, + "step": 4819 + }, + { + "epoch": 0.72, + "grad_norm": 1.4174561611802248, + "learning_rate": 3.859478932602025e-06, + "loss": 0.8234, + "step": 4820 + }, + { + "epoch": 0.72, + "grad_norm": 1.3569986795174422, + "learning_rate": 3.855664956643467e-06, + "loss": 0.8315, + "step": 4821 + }, + { + "epoch": 0.72, + "grad_norm": 1.4857589500337158, + "learning_rate": 3.8518524160021876e-06, + "loss": 0.8737, + "step": 4822 + }, + { + "epoch": 0.72, + "grad_norm": 1.367728689573794, + "learning_rate": 3.848041311568801e-06, + "loss": 0.812, + "step": 4823 + }, + { + "epoch": 0.72, + "grad_norm": 0.8014481210062113, + "learning_rate": 3.844231644233572e-06, + "loss": 0.324, + "step": 4824 + }, + { + "epoch": 0.72, + "grad_norm": 1.2932778274280683, + "learning_rate": 3.840423414886445e-06, + "loss": 0.8386, + "step": 4825 + }, + { + "epoch": 0.72, + "grad_norm": 1.4478635582400583, + "learning_rate": 3.836616624417022e-06, + "loss": 1.0061, + "step": 4826 + }, + { + "epoch": 0.72, + "grad_norm": 1.417811438869292, + "learning_rate": 3.832811273714569e-06, + "loss": 0.9058, + "step": 4827 + }, + { + "epoch": 0.72, + "grad_norm": 0.8921443753417463, + "learning_rate": 3.829007363668018e-06, + "loss": 0.3239, + "step": 4828 + }, + { + "epoch": 0.72, + "grad_norm": 1.5137789436127937, + "learning_rate": 3.825204895165964e-06, + "loss": 0.8515, + "step": 4829 + }, + { + "epoch": 0.72, + "grad_norm": 1.379460039861955, + "learning_rate": 3.821403869096658e-06, + "loss": 0.754, + "step": 4830 + }, + { + "epoch": 0.72, + "grad_norm": 1.3290326957603285, + "learning_rate": 3.817604286348025e-06, + "loss": 0.8301, + "step": 4831 + }, + { + "epoch": 0.72, + "grad_norm": 1.4835734951551014, + "learning_rate": 3.813806147807645e-06, + "loss": 0.8526, + "step": 4832 + }, + { + "epoch": 0.72, + "grad_norm": 1.4383267507861999, + "learning_rate": 3.8100094543627666e-06, + "loss": 0.8591, + "step": 4833 + }, + { + "epoch": 0.72, + "grad_norm": 1.3725708866281356, + "learning_rate": 3.806214206900295e-06, + "loss": 0.8745, + "step": 4834 + }, + { + "epoch": 0.72, + "grad_norm": 1.4540064109992301, + "learning_rate": 3.8024204063068024e-06, + "loss": 0.8106, + "step": 4835 + }, + { + "epoch": 0.72, + "grad_norm": 1.5596694932667077, + "learning_rate": 3.798628053468524e-06, + "loss": 0.8085, + "step": 4836 + }, + { + "epoch": 0.72, + "grad_norm": 1.3951547675296299, + "learning_rate": 3.7948371492713454e-06, + "loss": 0.8687, + "step": 4837 + }, + { + "epoch": 0.72, + "grad_norm": 0.9132438069304774, + "learning_rate": 3.791047694600828e-06, + "loss": 0.3558, + "step": 4838 + }, + { + "epoch": 0.72, + "grad_norm": 1.420216989479524, + "learning_rate": 3.7872596903421876e-06, + "loss": 0.8197, + "step": 4839 + }, + { + "epoch": 0.72, + "grad_norm": 0.8687397487767968, + "learning_rate": 3.7834731373803023e-06, + "loss": 0.3534, + "step": 4840 + }, + { + "epoch": 0.72, + "grad_norm": 1.393845029363547, + "learning_rate": 3.7796880365997114e-06, + "loss": 0.7153, + "step": 4841 + }, + { + "epoch": 0.72, + "grad_norm": 1.4690922867111074, + "learning_rate": 3.775904388884618e-06, + "loss": 0.8483, + "step": 4842 + }, + { + "epoch": 0.72, + "grad_norm": 1.4726739719194737, + "learning_rate": 3.772122195118877e-06, + "loss": 0.7215, + "step": 4843 + }, + { + "epoch": 0.72, + "grad_norm": 1.5858698405316958, + "learning_rate": 3.7683414561860097e-06, + "loss": 0.9649, + "step": 4844 + }, + { + "epoch": 0.72, + "grad_norm": 1.3722663494991016, + "learning_rate": 3.7645621729692004e-06, + "loss": 0.8364, + "step": 4845 + }, + { + "epoch": 0.72, + "grad_norm": 1.55642083721182, + "learning_rate": 3.7607843463512894e-06, + "loss": 0.8412, + "step": 4846 + }, + { + "epoch": 0.72, + "grad_norm": 1.8704953899371346, + "learning_rate": 3.7570079772147748e-06, + "loss": 0.8075, + "step": 4847 + }, + { + "epoch": 0.72, + "grad_norm": 1.2505106330014943, + "learning_rate": 3.7532330664418202e-06, + "loss": 0.8618, + "step": 4848 + }, + { + "epoch": 0.72, + "grad_norm": 1.458515907819654, + "learning_rate": 3.749459614914246e-06, + "loss": 0.8533, + "step": 4849 + }, + { + "epoch": 0.72, + "grad_norm": 1.4192929770699139, + "learning_rate": 3.7456876235135252e-06, + "loss": 0.7358, + "step": 4850 + }, + { + "epoch": 0.72, + "grad_norm": 1.5055481196921887, + "learning_rate": 3.7419170931207994e-06, + "loss": 0.8199, + "step": 4851 + }, + { + "epoch": 0.72, + "grad_norm": 1.5142826478550857, + "learning_rate": 3.738148024616863e-06, + "loss": 0.8703, + "step": 4852 + }, + { + "epoch": 0.72, + "grad_norm": 1.5294559050168062, + "learning_rate": 3.7343804188821718e-06, + "loss": 0.7979, + "step": 4853 + }, + { + "epoch": 0.72, + "grad_norm": 1.5519530789845535, + "learning_rate": 3.73061427679684e-06, + "loss": 0.8235, + "step": 4854 + }, + { + "epoch": 0.72, + "grad_norm": 1.5608174014356124, + "learning_rate": 3.7268495992406404e-06, + "loss": 0.8163, + "step": 4855 + }, + { + "epoch": 0.72, + "grad_norm": 1.5086427864597807, + "learning_rate": 3.723086387092997e-06, + "loss": 0.9018, + "step": 4856 + }, + { + "epoch": 0.72, + "grad_norm": 1.3719692274440831, + "learning_rate": 3.7193246412329976e-06, + "loss": 0.7802, + "step": 4857 + }, + { + "epoch": 0.72, + "grad_norm": 1.456942407287454, + "learning_rate": 3.7155643625393878e-06, + "loss": 0.8336, + "step": 4858 + }, + { + "epoch": 0.73, + "grad_norm": 1.2060986746361193, + "learning_rate": 3.7118055518905693e-06, + "loss": 0.7113, + "step": 4859 + }, + { + "epoch": 0.73, + "grad_norm": 1.3452162934726444, + "learning_rate": 3.7080482101645998e-06, + "loss": 0.8275, + "step": 4860 + }, + { + "epoch": 0.73, + "grad_norm": 1.4342690963752147, + "learning_rate": 3.7042923382391993e-06, + "loss": 0.7637, + "step": 4861 + }, + { + "epoch": 0.73, + "grad_norm": 1.3953516988278183, + "learning_rate": 3.700537936991733e-06, + "loss": 0.7598, + "step": 4862 + }, + { + "epoch": 0.73, + "grad_norm": 1.585860891793868, + "learning_rate": 3.6967850072992307e-06, + "loss": 0.8729, + "step": 4863 + }, + { + "epoch": 0.73, + "grad_norm": 1.4687300449902814, + "learning_rate": 3.6930335500383795e-06, + "loss": 0.8169, + "step": 4864 + }, + { + "epoch": 0.73, + "grad_norm": 1.4885200011118405, + "learning_rate": 3.6892835660855184e-06, + "loss": 0.889, + "step": 4865 + }, + { + "epoch": 0.73, + "grad_norm": 1.2646212306097024, + "learning_rate": 3.6855350563166457e-06, + "loss": 0.8271, + "step": 4866 + }, + { + "epoch": 0.73, + "grad_norm": 1.3582479975015693, + "learning_rate": 3.681788021607413e-06, + "loss": 0.7663, + "step": 4867 + }, + { + "epoch": 0.73, + "grad_norm": 1.5216727207530145, + "learning_rate": 3.6780424628331313e-06, + "loss": 0.8025, + "step": 4868 + }, + { + "epoch": 0.73, + "grad_norm": 1.5854804973142658, + "learning_rate": 3.674298380868756e-06, + "loss": 0.814, + "step": 4869 + }, + { + "epoch": 0.73, + "grad_norm": 1.5724932262994187, + "learning_rate": 3.67055577658891e-06, + "loss": 0.8147, + "step": 4870 + }, + { + "epoch": 0.73, + "grad_norm": 1.5240502329583558, + "learning_rate": 3.6668146508678646e-06, + "loss": 0.8439, + "step": 4871 + }, + { + "epoch": 0.73, + "grad_norm": 1.6149185983653107, + "learning_rate": 3.6630750045795472e-06, + "loss": 0.8102, + "step": 4872 + }, + { + "epoch": 0.73, + "grad_norm": 1.6175554480042074, + "learning_rate": 3.65933683859754e-06, + "loss": 0.7363, + "step": 4873 + }, + { + "epoch": 0.73, + "grad_norm": 1.3917677897171943, + "learning_rate": 3.655600153795084e-06, + "loss": 0.8935, + "step": 4874 + }, + { + "epoch": 0.73, + "grad_norm": 1.4527951219889264, + "learning_rate": 3.6518649510450598e-06, + "loss": 0.8515, + "step": 4875 + }, + { + "epoch": 0.73, + "grad_norm": 1.4030707992947349, + "learning_rate": 3.6481312312200144e-06, + "loss": 0.8567, + "step": 4876 + }, + { + "epoch": 0.73, + "grad_norm": 1.5009433237588676, + "learning_rate": 3.6443989951921478e-06, + "loss": 0.835, + "step": 4877 + }, + { + "epoch": 0.73, + "grad_norm": 1.422809373076941, + "learning_rate": 3.6406682438333074e-06, + "loss": 0.8757, + "step": 4878 + }, + { + "epoch": 0.73, + "grad_norm": 1.5316751167326643, + "learning_rate": 3.636938978015e-06, + "loss": 0.7215, + "step": 4879 + }, + { + "epoch": 0.73, + "grad_norm": 1.471397196863491, + "learning_rate": 3.633211198608384e-06, + "loss": 0.8552, + "step": 4880 + }, + { + "epoch": 0.73, + "grad_norm": 1.5386594248149572, + "learning_rate": 3.6294849064842642e-06, + "loss": 0.8179, + "step": 4881 + }, + { + "epoch": 0.73, + "grad_norm": 1.605579674037567, + "learning_rate": 3.625760102513103e-06, + "loss": 0.7647, + "step": 4882 + }, + { + "epoch": 0.73, + "grad_norm": 1.4130220583309183, + "learning_rate": 3.6220367875650175e-06, + "loss": 0.8248, + "step": 4883 + }, + { + "epoch": 0.73, + "grad_norm": 1.3502545719118326, + "learning_rate": 3.6183149625097745e-06, + "loss": 0.7769, + "step": 4884 + }, + { + "epoch": 0.73, + "grad_norm": 1.563683135797119, + "learning_rate": 3.6145946282167944e-06, + "loss": 0.9104, + "step": 4885 + }, + { + "epoch": 0.73, + "grad_norm": 1.4312655964724141, + "learning_rate": 3.6108757855551426e-06, + "loss": 0.7876, + "step": 4886 + }, + { + "epoch": 0.73, + "grad_norm": 0.8904965521392547, + "learning_rate": 3.607158435393544e-06, + "loss": 0.3235, + "step": 4887 + }, + { + "epoch": 0.73, + "grad_norm": 0.8261118634840814, + "learning_rate": 3.603442578600371e-06, + "loss": 0.3093, + "step": 4888 + }, + { + "epoch": 0.73, + "grad_norm": 1.4719987889087596, + "learning_rate": 3.5997282160436488e-06, + "loss": 0.889, + "step": 4889 + }, + { + "epoch": 0.73, + "grad_norm": 1.4901343120813264, + "learning_rate": 3.5960153485910564e-06, + "loss": 0.8714, + "step": 4890 + }, + { + "epoch": 0.73, + "grad_norm": 1.5333902742885017, + "learning_rate": 3.592303977109914e-06, + "loss": 0.8535, + "step": 4891 + }, + { + "epoch": 0.73, + "grad_norm": 1.4793218502024539, + "learning_rate": 3.5885941024672e-06, + "loss": 0.7675, + "step": 4892 + }, + { + "epoch": 0.73, + "grad_norm": 1.348049959777778, + "learning_rate": 3.5848857255295434e-06, + "loss": 0.8275, + "step": 4893 + }, + { + "epoch": 0.73, + "grad_norm": 1.54461516893757, + "learning_rate": 3.581178847163219e-06, + "loss": 0.8014, + "step": 4894 + }, + { + "epoch": 0.73, + "grad_norm": 1.5353823118014385, + "learning_rate": 3.5774734682341563e-06, + "loss": 0.873, + "step": 4895 + }, + { + "epoch": 0.73, + "grad_norm": 1.439438940156898, + "learning_rate": 3.573769589607934e-06, + "loss": 0.8959, + "step": 4896 + }, + { + "epoch": 0.73, + "grad_norm": 1.3809090171048923, + "learning_rate": 3.5700672121497728e-06, + "loss": 0.8152, + "step": 4897 + }, + { + "epoch": 0.73, + "grad_norm": 1.3830247550446988, + "learning_rate": 3.5663663367245517e-06, + "loss": 0.8019, + "step": 4898 + }, + { + "epoch": 0.73, + "grad_norm": 1.5182380493344791, + "learning_rate": 3.5626669641967948e-06, + "loss": 0.7764, + "step": 4899 + }, + { + "epoch": 0.73, + "grad_norm": 1.3226309811070474, + "learning_rate": 3.5589690954306764e-06, + "loss": 0.7231, + "step": 4900 + }, + { + "epoch": 0.73, + "grad_norm": 1.3586395222508696, + "learning_rate": 3.5552727312900228e-06, + "loss": 0.8259, + "step": 4901 + }, + { + "epoch": 0.73, + "grad_norm": 1.3165007756817604, + "learning_rate": 3.5515778726382967e-06, + "loss": 0.8583, + "step": 4902 + }, + { + "epoch": 0.73, + "grad_norm": 1.2576651442439926, + "learning_rate": 3.5478845203386227e-06, + "loss": 0.8827, + "step": 4903 + }, + { + "epoch": 0.73, + "grad_norm": 1.4480741034191904, + "learning_rate": 3.544192675253767e-06, + "loss": 0.806, + "step": 4904 + }, + { + "epoch": 0.73, + "grad_norm": 1.4602856622581222, + "learning_rate": 3.5405023382461457e-06, + "loss": 0.835, + "step": 4905 + }, + { + "epoch": 0.73, + "grad_norm": 0.9027757157731147, + "learning_rate": 3.536813510177822e-06, + "loss": 0.2934, + "step": 4906 + }, + { + "epoch": 0.73, + "grad_norm": 1.2897084712802276, + "learning_rate": 3.53312619191051e-06, + "loss": 0.7262, + "step": 4907 + }, + { + "epoch": 0.73, + "grad_norm": 1.2227321683451722, + "learning_rate": 3.5294403843055604e-06, + "loss": 0.8158, + "step": 4908 + }, + { + "epoch": 0.73, + "grad_norm": 1.582154301579126, + "learning_rate": 3.5257560882239806e-06, + "loss": 0.8582, + "step": 4909 + }, + { + "epoch": 0.73, + "grad_norm": 1.453247587483461, + "learning_rate": 3.5220733045264245e-06, + "loss": 0.9205, + "step": 4910 + }, + { + "epoch": 0.73, + "grad_norm": 1.5417394708801138, + "learning_rate": 3.5183920340731893e-06, + "loss": 0.8361, + "step": 4911 + }, + { + "epoch": 0.73, + "grad_norm": 1.415344627828099, + "learning_rate": 3.5147122777242203e-06, + "loss": 0.7334, + "step": 4912 + }, + { + "epoch": 0.73, + "grad_norm": 1.3561795511934418, + "learning_rate": 3.5110340363391128e-06, + "loss": 0.9329, + "step": 4913 + }, + { + "epoch": 0.73, + "grad_norm": 1.2808862931708658, + "learning_rate": 3.5073573107770977e-06, + "loss": 0.8246, + "step": 4914 + }, + { + "epoch": 0.73, + "grad_norm": 1.4454705583647465, + "learning_rate": 3.5036821018970623e-06, + "loss": 0.7803, + "step": 4915 + }, + { + "epoch": 0.73, + "grad_norm": 1.4250048247786875, + "learning_rate": 3.5000084105575338e-06, + "loss": 0.8314, + "step": 4916 + }, + { + "epoch": 0.73, + "grad_norm": 1.3528221316158837, + "learning_rate": 3.4963362376166886e-06, + "loss": 0.8673, + "step": 4917 + }, + { + "epoch": 0.73, + "grad_norm": 1.500395836854606, + "learning_rate": 3.4926655839323463e-06, + "loss": 0.8191, + "step": 4918 + }, + { + "epoch": 0.73, + "grad_norm": 1.527077861481613, + "learning_rate": 3.4889964503619723e-06, + "loss": 0.8824, + "step": 4919 + }, + { + "epoch": 0.73, + "grad_norm": 1.469735482020893, + "learning_rate": 3.4853288377626793e-06, + "loss": 0.8314, + "step": 4920 + }, + { + "epoch": 0.73, + "grad_norm": 1.3202190387252009, + "learning_rate": 3.4816627469912147e-06, + "loss": 0.793, + "step": 4921 + }, + { + "epoch": 0.73, + "grad_norm": 1.4228220905820244, + "learning_rate": 3.477998178903982e-06, + "loss": 0.8116, + "step": 4922 + }, + { + "epoch": 0.73, + "grad_norm": 1.4091884588971832, + "learning_rate": 3.4743351343570253e-06, + "loss": 0.8918, + "step": 4923 + }, + { + "epoch": 0.73, + "grad_norm": 1.4314405410755784, + "learning_rate": 3.4706736142060305e-06, + "loss": 0.8379, + "step": 4924 + }, + { + "epoch": 0.73, + "grad_norm": 1.509050342935842, + "learning_rate": 3.46701361930633e-06, + "loss": 0.8057, + "step": 4925 + }, + { + "epoch": 0.74, + "grad_norm": 1.5487347150153037, + "learning_rate": 3.4633551505129025e-06, + "loss": 0.8935, + "step": 4926 + }, + { + "epoch": 0.74, + "grad_norm": 1.325865828913274, + "learning_rate": 3.459698208680359e-06, + "loss": 0.8681, + "step": 4927 + }, + { + "epoch": 0.74, + "grad_norm": 1.4563853056717129, + "learning_rate": 3.456042794662966e-06, + "loss": 0.8923, + "step": 4928 + }, + { + "epoch": 0.74, + "grad_norm": 1.3784967252019977, + "learning_rate": 3.452388909314628e-06, + "loss": 0.7922, + "step": 4929 + }, + { + "epoch": 0.74, + "grad_norm": 1.4515064541782943, + "learning_rate": 3.4487365534888926e-06, + "loss": 0.8274, + "step": 4930 + }, + { + "epoch": 0.74, + "grad_norm": 1.4210344164301765, + "learning_rate": 3.4450857280389527e-06, + "loss": 0.7892, + "step": 4931 + }, + { + "epoch": 0.74, + "grad_norm": 1.6105679192603404, + "learning_rate": 3.441436433817641e-06, + "loss": 0.8734, + "step": 4932 + }, + { + "epoch": 0.74, + "grad_norm": 2.6438779495888407, + "learning_rate": 3.437788671677429e-06, + "loss": 0.746, + "step": 4933 + }, + { + "epoch": 0.74, + "grad_norm": 1.388980935038465, + "learning_rate": 3.4341424424704373e-06, + "loss": 0.8235, + "step": 4934 + }, + { + "epoch": 0.74, + "grad_norm": 1.540215415025226, + "learning_rate": 3.4304977470484257e-06, + "loss": 0.7908, + "step": 4935 + }, + { + "epoch": 0.74, + "grad_norm": 1.6147232861518725, + "learning_rate": 3.4268545862627932e-06, + "loss": 0.8428, + "step": 4936 + }, + { + "epoch": 0.74, + "grad_norm": 1.5604165300879358, + "learning_rate": 3.423212960964586e-06, + "loss": 0.8757, + "step": 4937 + }, + { + "epoch": 0.74, + "grad_norm": 1.5906676546431284, + "learning_rate": 3.419572872004485e-06, + "loss": 0.8674, + "step": 4938 + }, + { + "epoch": 0.74, + "grad_norm": 1.3604494458081229, + "learning_rate": 3.415934320232821e-06, + "loss": 0.8945, + "step": 4939 + }, + { + "epoch": 0.74, + "grad_norm": 1.2812985347068435, + "learning_rate": 3.4122973064995513e-06, + "loss": 0.8319, + "step": 4940 + }, + { + "epoch": 0.74, + "grad_norm": 1.5164318824952208, + "learning_rate": 3.4086618316542874e-06, + "loss": 0.8293, + "step": 4941 + }, + { + "epoch": 0.74, + "grad_norm": 1.4807960329669345, + "learning_rate": 3.405027896546277e-06, + "loss": 0.8723, + "step": 4942 + }, + { + "epoch": 0.74, + "grad_norm": 1.3964843623288026, + "learning_rate": 3.401395502024406e-06, + "loss": 0.8985, + "step": 4943 + }, + { + "epoch": 0.74, + "grad_norm": 1.3281162934859385, + "learning_rate": 3.3977646489372042e-06, + "loss": 0.8389, + "step": 4944 + }, + { + "epoch": 0.74, + "grad_norm": 1.3915344998446528, + "learning_rate": 3.39413533813284e-06, + "loss": 0.768, + "step": 4945 + }, + { + "epoch": 0.74, + "grad_norm": 1.3920876554152817, + "learning_rate": 3.390507570459116e-06, + "loss": 0.875, + "step": 4946 + }, + { + "epoch": 0.74, + "grad_norm": 1.3177138846422447, + "learning_rate": 3.3868813467634833e-06, + "loss": 0.777, + "step": 4947 + }, + { + "epoch": 0.74, + "grad_norm": 1.5053367573710172, + "learning_rate": 3.3832566678930255e-06, + "loss": 0.8646, + "step": 4948 + }, + { + "epoch": 0.74, + "grad_norm": 1.3967249923098846, + "learning_rate": 3.3796335346944697e-06, + "loss": 0.8012, + "step": 4949 + }, + { + "epoch": 0.74, + "grad_norm": 1.5067589592446557, + "learning_rate": 3.3760119480141794e-06, + "loss": 0.8661, + "step": 4950 + }, + { + "epoch": 0.74, + "grad_norm": 1.5924306672587207, + "learning_rate": 3.3723919086981584e-06, + "loss": 0.8267, + "step": 4951 + }, + { + "epoch": 0.74, + "grad_norm": 1.3137392575510314, + "learning_rate": 3.3687734175920505e-06, + "loss": 0.8414, + "step": 4952 + }, + { + "epoch": 0.74, + "grad_norm": 1.3798969731073352, + "learning_rate": 3.3651564755411293e-06, + "loss": 0.7691, + "step": 4953 + }, + { + "epoch": 0.74, + "grad_norm": 1.2526497970532375, + "learning_rate": 3.3615410833903174e-06, + "loss": 0.7858, + "step": 4954 + }, + { + "epoch": 0.74, + "grad_norm": 1.6640178804280839, + "learning_rate": 3.357927241984169e-06, + "loss": 0.815, + "step": 4955 + }, + { + "epoch": 0.74, + "grad_norm": 1.4664663750115534, + "learning_rate": 3.3543149521668784e-06, + "loss": 0.947, + "step": 4956 + }, + { + "epoch": 0.74, + "grad_norm": 1.5390067054104086, + "learning_rate": 3.350704214782278e-06, + "loss": 0.8785, + "step": 4957 + }, + { + "epoch": 0.74, + "grad_norm": 1.2736941902371575, + "learning_rate": 3.347095030673838e-06, + "loss": 0.881, + "step": 4958 + }, + { + "epoch": 0.74, + "grad_norm": 0.8277592649853668, + "learning_rate": 3.3434874006846583e-06, + "loss": 0.3672, + "step": 4959 + }, + { + "epoch": 0.74, + "grad_norm": 1.2968850178504, + "learning_rate": 3.3398813256574847e-06, + "loss": 0.8256, + "step": 4960 + }, + { + "epoch": 0.74, + "grad_norm": 1.2894751394057296, + "learning_rate": 3.3362768064346974e-06, + "loss": 0.8186, + "step": 4961 + }, + { + "epoch": 0.74, + "grad_norm": 1.525085724019825, + "learning_rate": 3.3326738438583116e-06, + "loss": 0.9152, + "step": 4962 + }, + { + "epoch": 0.74, + "grad_norm": 1.7221369955523933, + "learning_rate": 3.3290724387699803e-06, + "loss": 0.812, + "step": 4963 + }, + { + "epoch": 0.74, + "grad_norm": 1.4134138223835346, + "learning_rate": 3.3254725920109922e-06, + "loss": 0.8234, + "step": 4964 + }, + { + "epoch": 0.74, + "grad_norm": 0.8848070614945303, + "learning_rate": 3.3218743044222746e-06, + "loss": 0.3164, + "step": 4965 + }, + { + "epoch": 0.74, + "grad_norm": 1.5071622942571334, + "learning_rate": 3.318277576844381e-06, + "loss": 0.7947, + "step": 4966 + }, + { + "epoch": 0.74, + "grad_norm": 1.2562755819047962, + "learning_rate": 3.314682410117511e-06, + "loss": 0.7868, + "step": 4967 + }, + { + "epoch": 0.74, + "grad_norm": 1.6172549756573282, + "learning_rate": 3.311088805081497e-06, + "loss": 0.7891, + "step": 4968 + }, + { + "epoch": 0.74, + "grad_norm": 1.5749992253286484, + "learning_rate": 3.3074967625758037e-06, + "loss": 0.803, + "step": 4969 + }, + { + "epoch": 0.74, + "grad_norm": 1.445036344951784, + "learning_rate": 3.303906283439534e-06, + "loss": 0.7489, + "step": 4970 + }, + { + "epoch": 0.74, + "grad_norm": 1.6058600011632724, + "learning_rate": 3.3003173685114265e-06, + "loss": 0.7793, + "step": 4971 + }, + { + "epoch": 0.74, + "grad_norm": 1.5406243645877573, + "learning_rate": 3.2967300186298456e-06, + "loss": 0.8547, + "step": 4972 + }, + { + "epoch": 0.74, + "grad_norm": 1.3541374085701254, + "learning_rate": 3.2931442346328e-06, + "loss": 0.7588, + "step": 4973 + }, + { + "epoch": 0.74, + "grad_norm": 1.4130209886115168, + "learning_rate": 3.2895600173579302e-06, + "loss": 0.8479, + "step": 4974 + }, + { + "epoch": 0.74, + "grad_norm": 1.5548230799785667, + "learning_rate": 3.2859773676425077e-06, + "loss": 0.8801, + "step": 4975 + }, + { + "epoch": 0.74, + "grad_norm": 1.6156861896843813, + "learning_rate": 3.2823962863234416e-06, + "loss": 0.9096, + "step": 4976 + }, + { + "epoch": 0.74, + "grad_norm": 0.784636664226037, + "learning_rate": 3.2788167742372725e-06, + "loss": 0.3089, + "step": 4977 + }, + { + "epoch": 0.74, + "grad_norm": 1.3833322295907178, + "learning_rate": 3.275238832220178e-06, + "loss": 0.841, + "step": 4978 + }, + { + "epoch": 0.74, + "grad_norm": 1.2908992635542076, + "learning_rate": 3.271662461107958e-06, + "loss": 0.8154, + "step": 4979 + }, + { + "epoch": 0.74, + "grad_norm": 1.4818017759567386, + "learning_rate": 3.2680876617360578e-06, + "loss": 0.8424, + "step": 4980 + }, + { + "epoch": 0.74, + "grad_norm": 1.440983055632886, + "learning_rate": 3.264514434939551e-06, + "loss": 0.8194, + "step": 4981 + }, + { + "epoch": 0.74, + "grad_norm": 1.4442508400330965, + "learning_rate": 3.2609427815531426e-06, + "loss": 0.7527, + "step": 4982 + }, + { + "epoch": 0.74, + "grad_norm": 1.3619144099421232, + "learning_rate": 3.257372702411171e-06, + "loss": 0.8014, + "step": 4983 + }, + { + "epoch": 0.74, + "grad_norm": 1.4155440591046933, + "learning_rate": 3.253804198347612e-06, + "loss": 0.7911, + "step": 4984 + }, + { + "epoch": 0.74, + "grad_norm": 1.7960117277388261, + "learning_rate": 3.2502372701960603e-06, + "loss": 0.8031, + "step": 4985 + }, + { + "epoch": 0.74, + "grad_norm": 1.4163487581840326, + "learning_rate": 3.2466719187897555e-06, + "loss": 0.7964, + "step": 4986 + }, + { + "epoch": 0.74, + "grad_norm": 1.609020266395373, + "learning_rate": 3.243108144961563e-06, + "loss": 0.8117, + "step": 4987 + }, + { + "epoch": 0.74, + "grad_norm": 1.5806663577035889, + "learning_rate": 3.2395459495439817e-06, + "loss": 0.8302, + "step": 4988 + }, + { + "epoch": 0.74, + "grad_norm": 1.6151307336844092, + "learning_rate": 3.235985333369139e-06, + "loss": 0.8235, + "step": 4989 + }, + { + "epoch": 0.74, + "grad_norm": 1.4287413498046333, + "learning_rate": 3.2324262972688e-06, + "loss": 0.8531, + "step": 4990 + }, + { + "epoch": 0.74, + "grad_norm": 1.4059978378234, + "learning_rate": 3.2288688420743487e-06, + "loss": 0.8024, + "step": 4991 + }, + { + "epoch": 0.74, + "grad_norm": 1.3787769062446489, + "learning_rate": 3.2253129686168105e-06, + "loss": 0.8391, + "step": 4992 + }, + { + "epoch": 0.75, + "grad_norm": 1.3963622259858146, + "learning_rate": 3.2217586777268383e-06, + "loss": 0.8302, + "step": 4993 + }, + { + "epoch": 0.75, + "grad_norm": 1.3595949123893245, + "learning_rate": 3.2182059702347135e-06, + "loss": 0.886, + "step": 4994 + }, + { + "epoch": 0.75, + "grad_norm": 1.4233739780108456, + "learning_rate": 3.2146548469703497e-06, + "loss": 0.7828, + "step": 4995 + }, + { + "epoch": 0.75, + "grad_norm": 0.8106513386936745, + "learning_rate": 3.2111053087632904e-06, + "loss": 0.321, + "step": 4996 + }, + { + "epoch": 0.75, + "grad_norm": 1.4962990572622585, + "learning_rate": 3.2075573564427097e-06, + "loss": 0.8459, + "step": 4997 + }, + { + "epoch": 0.75, + "grad_norm": 1.3640774397370556, + "learning_rate": 3.204010990837404e-06, + "loss": 0.8962, + "step": 4998 + }, + { + "epoch": 0.75, + "grad_norm": 1.3858456300834159, + "learning_rate": 3.200466212775808e-06, + "loss": 0.8486, + "step": 4999 + }, + { + "epoch": 0.75, + "grad_norm": 1.4379594867977468, + "learning_rate": 3.1969230230859815e-06, + "loss": 0.7793, + "step": 5000 + }, + { + "epoch": 0.75, + "grad_norm": 1.4347378066243224, + "learning_rate": 3.193381422595616e-06, + "loss": 0.8284, + "step": 5001 + }, + { + "epoch": 0.75, + "grad_norm": 1.5176999424925202, + "learning_rate": 3.1898414121320277e-06, + "loss": 0.7464, + "step": 5002 + }, + { + "epoch": 0.75, + "grad_norm": 1.4320486678996, + "learning_rate": 3.1863029925221667e-06, + "loss": 0.8051, + "step": 5003 + }, + { + "epoch": 0.75, + "grad_norm": 1.5131113349621699, + "learning_rate": 3.1827661645926024e-06, + "loss": 0.8955, + "step": 5004 + }, + { + "epoch": 0.75, + "grad_norm": 1.4197091477150343, + "learning_rate": 3.179230929169541e-06, + "loss": 0.7323, + "step": 5005 + }, + { + "epoch": 0.75, + "grad_norm": 1.4592043909915928, + "learning_rate": 3.175697287078814e-06, + "loss": 0.8198, + "step": 5006 + }, + { + "epoch": 0.75, + "grad_norm": 1.4670316378769366, + "learning_rate": 3.1721652391458804e-06, + "loss": 0.8006, + "step": 5007 + }, + { + "epoch": 0.75, + "grad_norm": 1.3955671586110505, + "learning_rate": 3.1686347861958313e-06, + "loss": 0.7781, + "step": 5008 + }, + { + "epoch": 0.75, + "grad_norm": 1.499166452003566, + "learning_rate": 3.1651059290533726e-06, + "loss": 0.8033, + "step": 5009 + }, + { + "epoch": 0.75, + "grad_norm": 1.7163630963892307, + "learning_rate": 3.1615786685428495e-06, + "loss": 0.7976, + "step": 5010 + }, + { + "epoch": 0.75, + "grad_norm": 1.3711049794011652, + "learning_rate": 3.158053005488232e-06, + "loss": 0.7648, + "step": 5011 + }, + { + "epoch": 0.75, + "grad_norm": 1.4268633524707137, + "learning_rate": 3.1545289407131128e-06, + "loss": 0.8595, + "step": 5012 + }, + { + "epoch": 0.75, + "grad_norm": 1.4917406541529343, + "learning_rate": 3.151006475040719e-06, + "loss": 0.7695, + "step": 5013 + }, + { + "epoch": 0.75, + "grad_norm": 1.4830555811553476, + "learning_rate": 3.147485609293891e-06, + "loss": 0.8996, + "step": 5014 + }, + { + "epoch": 0.75, + "grad_norm": 1.480986922176576, + "learning_rate": 3.1439663442951073e-06, + "loss": 0.7786, + "step": 5015 + }, + { + "epoch": 0.75, + "grad_norm": 1.479415921804418, + "learning_rate": 3.140448680866469e-06, + "loss": 0.8503, + "step": 5016 + }, + { + "epoch": 0.75, + "grad_norm": 1.592533068876874, + "learning_rate": 3.1369326198297025e-06, + "loss": 0.8365, + "step": 5017 + }, + { + "epoch": 0.75, + "grad_norm": 1.486062744249541, + "learning_rate": 3.13341816200616e-06, + "loss": 0.8036, + "step": 5018 + }, + { + "epoch": 0.75, + "grad_norm": 1.5557173332485443, + "learning_rate": 3.1299053082168206e-06, + "loss": 0.8483, + "step": 5019 + }, + { + "epoch": 0.75, + "grad_norm": 1.493092759764533, + "learning_rate": 3.126394059282284e-06, + "loss": 0.7798, + "step": 5020 + }, + { + "epoch": 0.75, + "grad_norm": 1.5227747649393084, + "learning_rate": 3.122884416022779e-06, + "loss": 0.8205, + "step": 5021 + }, + { + "epoch": 0.75, + "grad_norm": 1.4368630713763728, + "learning_rate": 3.11937637925816e-06, + "loss": 0.7469, + "step": 5022 + }, + { + "epoch": 0.75, + "grad_norm": 1.5487856744863249, + "learning_rate": 3.1158699498079037e-06, + "loss": 0.8654, + "step": 5023 + }, + { + "epoch": 0.75, + "grad_norm": 1.5658335899407716, + "learning_rate": 3.1123651284911162e-06, + "loss": 0.8348, + "step": 5024 + }, + { + "epoch": 0.75, + "grad_norm": 1.4453956373892873, + "learning_rate": 3.108861916126518e-06, + "loss": 0.8853, + "step": 5025 + }, + { + "epoch": 0.75, + "grad_norm": 1.4394074957485403, + "learning_rate": 3.105360313532463e-06, + "loss": 0.8181, + "step": 5026 + }, + { + "epoch": 0.75, + "grad_norm": 1.5528121628859766, + "learning_rate": 3.101860321526924e-06, + "loss": 0.8283, + "step": 5027 + }, + { + "epoch": 0.75, + "grad_norm": 1.3932914351574135, + "learning_rate": 3.098361940927502e-06, + "loss": 0.8278, + "step": 5028 + }, + { + "epoch": 0.75, + "grad_norm": 1.667759959310162, + "learning_rate": 3.0948651725514223e-06, + "loss": 0.8683, + "step": 5029 + }, + { + "epoch": 0.75, + "grad_norm": 1.3775178698087676, + "learning_rate": 3.0913700172155226e-06, + "loss": 0.7878, + "step": 5030 + }, + { + "epoch": 0.75, + "grad_norm": 1.5710079754899224, + "learning_rate": 3.0878764757362745e-06, + "loss": 0.8395, + "step": 5031 + }, + { + "epoch": 0.75, + "grad_norm": 1.3338889322444747, + "learning_rate": 3.0843845489297698e-06, + "loss": 0.8098, + "step": 5032 + }, + { + "epoch": 0.75, + "grad_norm": 1.4173942261102668, + "learning_rate": 3.0808942376117244e-06, + "loss": 0.8592, + "step": 5033 + }, + { + "epoch": 0.75, + "grad_norm": 1.3592008638109572, + "learning_rate": 3.0774055425974737e-06, + "loss": 0.8769, + "step": 5034 + }, + { + "epoch": 0.75, + "grad_norm": 0.8606934636640169, + "learning_rate": 3.073918464701978e-06, + "loss": 0.3375, + "step": 5035 + }, + { + "epoch": 0.75, + "grad_norm": 1.4968595300384728, + "learning_rate": 3.0704330047398233e-06, + "loss": 0.8324, + "step": 5036 + }, + { + "epoch": 0.75, + "grad_norm": 1.5703071753093727, + "learning_rate": 3.066949163525205e-06, + "loss": 0.7868, + "step": 5037 + }, + { + "epoch": 0.75, + "grad_norm": 1.6708134574393665, + "learning_rate": 3.063466941871952e-06, + "loss": 0.8006, + "step": 5038 + }, + { + "epoch": 0.75, + "grad_norm": 1.4595164971708143, + "learning_rate": 3.059986340593514e-06, + "loss": 0.8248, + "step": 5039 + }, + { + "epoch": 0.75, + "grad_norm": 0.8799428612374821, + "learning_rate": 3.056507360502957e-06, + "loss": 0.3184, + "step": 5040 + }, + { + "epoch": 0.75, + "grad_norm": 0.852615578048221, + "learning_rate": 3.053030002412972e-06, + "loss": 0.3233, + "step": 5041 + }, + { + "epoch": 0.75, + "grad_norm": 1.6089103963264473, + "learning_rate": 3.0495542671358745e-06, + "loss": 0.8355, + "step": 5042 + }, + { + "epoch": 0.75, + "grad_norm": 1.5634084444855452, + "learning_rate": 3.0460801554835894e-06, + "loss": 0.798, + "step": 5043 + }, + { + "epoch": 0.75, + "grad_norm": 1.6761728153835087, + "learning_rate": 3.042607668267672e-06, + "loss": 0.8607, + "step": 5044 + }, + { + "epoch": 0.75, + "grad_norm": 1.4439379044967995, + "learning_rate": 3.0391368062992976e-06, + "loss": 0.9213, + "step": 5045 + }, + { + "epoch": 0.75, + "grad_norm": 1.4837084064000567, + "learning_rate": 3.035667570389258e-06, + "loss": 0.8277, + "step": 5046 + }, + { + "epoch": 0.75, + "grad_norm": 1.4727755847309505, + "learning_rate": 3.0321999613479668e-06, + "loss": 0.7724, + "step": 5047 + }, + { + "epoch": 0.75, + "grad_norm": 1.5219994898447782, + "learning_rate": 3.028733979985463e-06, + "loss": 0.8173, + "step": 5048 + }, + { + "epoch": 0.75, + "grad_norm": 1.4186030695155631, + "learning_rate": 3.0252696271113934e-06, + "loss": 0.8002, + "step": 5049 + }, + { + "epoch": 0.75, + "grad_norm": 1.3888898174130462, + "learning_rate": 3.0218069035350328e-06, + "loss": 0.8786, + "step": 5050 + }, + { + "epoch": 0.75, + "grad_norm": 1.4197482543381839, + "learning_rate": 3.0183458100652752e-06, + "loss": 0.8537, + "step": 5051 + }, + { + "epoch": 0.75, + "grad_norm": 1.4045806897728466, + "learning_rate": 3.0148863475106315e-06, + "loss": 0.7819, + "step": 5052 + }, + { + "epoch": 0.75, + "grad_norm": 1.5951332104091749, + "learning_rate": 3.011428516679232e-06, + "loss": 0.8807, + "step": 5053 + }, + { + "epoch": 0.75, + "grad_norm": 1.4687694702988459, + "learning_rate": 3.007972318378829e-06, + "loss": 0.7757, + "step": 5054 + }, + { + "epoch": 0.75, + "grad_norm": 1.5007927235368792, + "learning_rate": 3.004517753416791e-06, + "loss": 0.8562, + "step": 5055 + }, + { + "epoch": 0.75, + "grad_norm": 1.3634559360939043, + "learning_rate": 3.0010648226001004e-06, + "loss": 0.8298, + "step": 5056 + }, + { + "epoch": 0.75, + "grad_norm": 1.5942653075376565, + "learning_rate": 2.9976135267353636e-06, + "loss": 0.8789, + "step": 5057 + }, + { + "epoch": 0.75, + "grad_norm": 1.5196176653906448, + "learning_rate": 2.994163866628805e-06, + "loss": 0.7989, + "step": 5058 + }, + { + "epoch": 0.75, + "grad_norm": 1.461773155903358, + "learning_rate": 2.990715843086265e-06, + "loss": 0.8552, + "step": 5059 + }, + { + "epoch": 0.75, + "grad_norm": 1.6170273312204086, + "learning_rate": 2.9872694569132022e-06, + "loss": 0.8673, + "step": 5060 + }, + { + "epoch": 0.76, + "grad_norm": 1.5402932885493266, + "learning_rate": 2.9838247089146956e-06, + "loss": 0.9398, + "step": 5061 + }, + { + "epoch": 0.76, + "grad_norm": 1.3473375558661, + "learning_rate": 2.9803815998954334e-06, + "loss": 0.8014, + "step": 5062 + }, + { + "epoch": 0.76, + "grad_norm": 1.450757505960035, + "learning_rate": 2.976940130659729e-06, + "loss": 0.7577, + "step": 5063 + }, + { + "epoch": 0.76, + "grad_norm": 1.364956507899819, + "learning_rate": 2.9735003020115095e-06, + "loss": 0.7797, + "step": 5064 + }, + { + "epoch": 0.76, + "grad_norm": 1.5904351482979222, + "learning_rate": 2.9700621147543194e-06, + "loss": 0.8334, + "step": 5065 + }, + { + "epoch": 0.76, + "grad_norm": 1.347287987412576, + "learning_rate": 2.9666255696913195e-06, + "loss": 0.7295, + "step": 5066 + }, + { + "epoch": 0.76, + "grad_norm": 1.5721416993200201, + "learning_rate": 2.9631906676252865e-06, + "loss": 0.7833, + "step": 5067 + }, + { + "epoch": 0.76, + "grad_norm": 1.438067685640525, + "learning_rate": 2.959757409358619e-06, + "loss": 0.8828, + "step": 5068 + }, + { + "epoch": 0.76, + "grad_norm": 1.4346065594564978, + "learning_rate": 2.9563257956933177e-06, + "loss": 0.8692, + "step": 5069 + }, + { + "epoch": 0.76, + "grad_norm": 1.3165758742443843, + "learning_rate": 2.9528958274310126e-06, + "loss": 0.8155, + "step": 5070 + }, + { + "epoch": 0.76, + "grad_norm": 1.4585942457879668, + "learning_rate": 2.949467505372945e-06, + "loss": 0.9053, + "step": 5071 + }, + { + "epoch": 0.76, + "grad_norm": 1.5773018422825356, + "learning_rate": 2.9460408303199696e-06, + "loss": 0.7672, + "step": 5072 + }, + { + "epoch": 0.76, + "grad_norm": 1.8638157887523645, + "learning_rate": 2.9426158030725592e-06, + "loss": 0.833, + "step": 5073 + }, + { + "epoch": 0.76, + "grad_norm": 1.4726185945609227, + "learning_rate": 2.9391924244308046e-06, + "loss": 0.7475, + "step": 5074 + }, + { + "epoch": 0.76, + "grad_norm": 1.4897454354613884, + "learning_rate": 2.9357706951943987e-06, + "loss": 0.8107, + "step": 5075 + }, + { + "epoch": 0.76, + "grad_norm": 1.5411085993975648, + "learning_rate": 2.9323506161626613e-06, + "loss": 0.7344, + "step": 5076 + }, + { + "epoch": 0.76, + "grad_norm": 1.4441299536622616, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.925, + "step": 5077 + }, + { + "epoch": 0.76, + "grad_norm": 1.6700704831929407, + "learning_rate": 2.925515411908535e-06, + "loss": 0.808, + "step": 5078 + }, + { + "epoch": 0.76, + "grad_norm": 1.6239075805756247, + "learning_rate": 2.9221002882828486e-06, + "loss": 0.7671, + "step": 5079 + }, + { + "epoch": 0.76, + "grad_norm": 1.631084893675169, + "learning_rate": 2.9186868180552396e-06, + "loss": 0.8497, + "step": 5080 + }, + { + "epoch": 0.76, + "grad_norm": 1.4432021560009018, + "learning_rate": 2.915275002023099e-06, + "loss": 0.8866, + "step": 5081 + }, + { + "epoch": 0.76, + "grad_norm": 1.5917419792725953, + "learning_rate": 2.9118648409834205e-06, + "loss": 0.8394, + "step": 5082 + }, + { + "epoch": 0.76, + "grad_norm": 1.4742628738843244, + "learning_rate": 2.908456335732821e-06, + "loss": 0.8454, + "step": 5083 + }, + { + "epoch": 0.76, + "grad_norm": 1.398670749818334, + "learning_rate": 2.905049487067528e-06, + "loss": 0.7637, + "step": 5084 + }, + { + "epoch": 0.76, + "grad_norm": 1.539167197292273, + "learning_rate": 2.901644295783381e-06, + "loss": 0.7946, + "step": 5085 + }, + { + "epoch": 0.76, + "grad_norm": 1.4893290093084923, + "learning_rate": 2.8982407626758348e-06, + "loss": 0.8733, + "step": 5086 + }, + { + "epoch": 0.76, + "grad_norm": 0.7945640541251401, + "learning_rate": 2.894838888539957e-06, + "loss": 0.3267, + "step": 5087 + }, + { + "epoch": 0.76, + "grad_norm": 1.4855221770122016, + "learning_rate": 2.891438674170419e-06, + "loss": 0.815, + "step": 5088 + }, + { + "epoch": 0.76, + "grad_norm": 1.3375040644456926, + "learning_rate": 2.888040120361515e-06, + "loss": 0.8673, + "step": 5089 + }, + { + "epoch": 0.76, + "grad_norm": 1.4070560953000169, + "learning_rate": 2.884643227907147e-06, + "loss": 0.8149, + "step": 5090 + }, + { + "epoch": 0.76, + "grad_norm": 1.5364650165821996, + "learning_rate": 2.88124799760083e-06, + "loss": 0.8326, + "step": 5091 + }, + { + "epoch": 0.76, + "grad_norm": 1.5013267844391118, + "learning_rate": 2.8778544302356904e-06, + "loss": 0.8663, + "step": 5092 + }, + { + "epoch": 0.76, + "grad_norm": 1.445383067599179, + "learning_rate": 2.8744625266044647e-06, + "loss": 0.8959, + "step": 5093 + }, + { + "epoch": 0.76, + "grad_norm": 1.435227235254386, + "learning_rate": 2.8710722874995066e-06, + "loss": 0.7043, + "step": 5094 + }, + { + "epoch": 0.76, + "grad_norm": 1.4574933584856937, + "learning_rate": 2.8676837137127687e-06, + "loss": 0.828, + "step": 5095 + }, + { + "epoch": 0.76, + "grad_norm": 1.49241878055303, + "learning_rate": 2.8642968060358256e-06, + "loss": 0.7665, + "step": 5096 + }, + { + "epoch": 0.76, + "grad_norm": 1.4802101722200727, + "learning_rate": 2.8609115652598595e-06, + "loss": 0.7698, + "step": 5097 + }, + { + "epoch": 0.76, + "grad_norm": 1.4690246223836267, + "learning_rate": 2.8575279921756637e-06, + "loss": 0.8466, + "step": 5098 + }, + { + "epoch": 0.76, + "grad_norm": 1.5907910908075324, + "learning_rate": 2.8541460875736403e-06, + "loss": 0.8175, + "step": 5099 + }, + { + "epoch": 0.76, + "grad_norm": 1.3758410095306948, + "learning_rate": 2.8507658522438054e-06, + "loss": 0.8449, + "step": 5100 + }, + { + "epoch": 0.76, + "grad_norm": 1.4552158911343096, + "learning_rate": 2.8473872869757768e-06, + "loss": 0.7662, + "step": 5101 + }, + { + "epoch": 0.76, + "grad_norm": 1.396666607604129, + "learning_rate": 2.8440103925587904e-06, + "loss": 0.7347, + "step": 5102 + }, + { + "epoch": 0.76, + "grad_norm": 1.5318893799511526, + "learning_rate": 2.840635169781688e-06, + "loss": 0.7885, + "step": 5103 + }, + { + "epoch": 0.76, + "grad_norm": 1.4432453980308066, + "learning_rate": 2.837261619432925e-06, + "loss": 0.7513, + "step": 5104 + }, + { + "epoch": 0.76, + "grad_norm": 1.7019402709888787, + "learning_rate": 2.8338897423005607e-06, + "loss": 0.7712, + "step": 5105 + }, + { + "epoch": 0.76, + "grad_norm": 1.3952607709665923, + "learning_rate": 2.8305195391722663e-06, + "loss": 0.835, + "step": 5106 + }, + { + "epoch": 0.76, + "grad_norm": 1.5190263120056213, + "learning_rate": 2.8271510108353237e-06, + "loss": 0.8781, + "step": 5107 + }, + { + "epoch": 0.76, + "grad_norm": 1.6806913303820863, + "learning_rate": 2.823784158076618e-06, + "loss": 0.7667, + "step": 5108 + }, + { + "epoch": 0.76, + "grad_norm": 1.4422855408966526, + "learning_rate": 2.820418981682648e-06, + "loss": 0.7727, + "step": 5109 + }, + { + "epoch": 0.76, + "grad_norm": 1.4882226191508583, + "learning_rate": 2.817055482439518e-06, + "loss": 0.816, + "step": 5110 + }, + { + "epoch": 0.76, + "grad_norm": 1.4436284848058087, + "learning_rate": 2.8136936611329436e-06, + "loss": 0.7696, + "step": 5111 + }, + { + "epoch": 0.76, + "grad_norm": 1.3164923834991495, + "learning_rate": 2.810333518548246e-06, + "loss": 0.8996, + "step": 5112 + }, + { + "epoch": 0.76, + "grad_norm": 1.6070138765748827, + "learning_rate": 2.806975055470358e-06, + "loss": 0.8524, + "step": 5113 + }, + { + "epoch": 0.76, + "grad_norm": 1.4785965301469797, + "learning_rate": 2.803618272683809e-06, + "loss": 0.8403, + "step": 5114 + }, + { + "epoch": 0.76, + "grad_norm": 1.499030225036364, + "learning_rate": 2.80026317097275e-06, + "loss": 0.8636, + "step": 5115 + }, + { + "epoch": 0.76, + "grad_norm": 1.4707822651739537, + "learning_rate": 2.796909751120931e-06, + "loss": 0.7879, + "step": 5116 + }, + { + "epoch": 0.76, + "grad_norm": 1.4997675002731632, + "learning_rate": 2.7935580139117114e-06, + "loss": 0.8177, + "step": 5117 + }, + { + "epoch": 0.76, + "grad_norm": 1.458039115421537, + "learning_rate": 2.7902079601280583e-06, + "loss": 0.9224, + "step": 5118 + }, + { + "epoch": 0.76, + "grad_norm": 1.2851376148078655, + "learning_rate": 2.7868595905525464e-06, + "loss": 0.837, + "step": 5119 + }, + { + "epoch": 0.76, + "grad_norm": 1.5736676747159453, + "learning_rate": 2.7835129059673496e-06, + "loss": 0.8696, + "step": 5120 + }, + { + "epoch": 0.76, + "grad_norm": 1.5999291732597782, + "learning_rate": 2.7801679071542576e-06, + "loss": 0.8646, + "step": 5121 + }, + { + "epoch": 0.76, + "grad_norm": 1.4416532169220666, + "learning_rate": 2.7768245948946615e-06, + "loss": 0.8669, + "step": 5122 + }, + { + "epoch": 0.76, + "grad_norm": 0.9014195585070164, + "learning_rate": 2.7734829699695585e-06, + "loss": 0.3194, + "step": 5123 + }, + { + "epoch": 0.76, + "grad_norm": 1.5242080097123147, + "learning_rate": 2.770143033159555e-06, + "loss": 0.741, + "step": 5124 + }, + { + "epoch": 0.76, + "grad_norm": 1.4625193006207853, + "learning_rate": 2.766804785244859e-06, + "loss": 0.8942, + "step": 5125 + }, + { + "epoch": 0.76, + "grad_norm": 1.5477927639018938, + "learning_rate": 2.763468227005288e-06, + "loss": 0.7551, + "step": 5126 + }, + { + "epoch": 0.76, + "grad_norm": 1.4738246408619002, + "learning_rate": 2.7601333592202583e-06, + "loss": 0.7635, + "step": 5127 + }, + { + "epoch": 0.77, + "grad_norm": 0.866234631735783, + "learning_rate": 2.756800182668796e-06, + "loss": 0.374, + "step": 5128 + }, + { + "epoch": 0.77, + "grad_norm": 1.559274426831016, + "learning_rate": 2.7534686981295335e-06, + "loss": 0.9176, + "step": 5129 + }, + { + "epoch": 0.77, + "grad_norm": 1.408270645110137, + "learning_rate": 2.7501389063807037e-06, + "loss": 0.768, + "step": 5130 + }, + { + "epoch": 0.77, + "grad_norm": 1.6788602843544644, + "learning_rate": 2.746810808200152e-06, + "loss": 0.7975, + "step": 5131 + }, + { + "epoch": 0.77, + "grad_norm": 1.494269935148476, + "learning_rate": 2.743484404365314e-06, + "loss": 0.8175, + "step": 5132 + }, + { + "epoch": 0.77, + "grad_norm": 1.497123785737367, + "learning_rate": 2.740159695653243e-06, + "loss": 0.8339, + "step": 5133 + }, + { + "epoch": 0.77, + "grad_norm": 1.573388340919819, + "learning_rate": 2.7368366828405892e-06, + "loss": 0.8561, + "step": 5134 + }, + { + "epoch": 0.77, + "grad_norm": 1.3587759331226101, + "learning_rate": 2.7335153667036106e-06, + "loss": 0.866, + "step": 5135 + }, + { + "epoch": 0.77, + "grad_norm": 0.8963988622800161, + "learning_rate": 2.7301957480181697e-06, + "loss": 0.3358, + "step": 5136 + }, + { + "epoch": 0.77, + "grad_norm": 1.4595957755480402, + "learning_rate": 2.7268778275597217e-06, + "loss": 0.8431, + "step": 5137 + }, + { + "epoch": 0.77, + "grad_norm": 1.424294375124013, + "learning_rate": 2.7235616061033388e-06, + "loss": 0.7391, + "step": 5138 + }, + { + "epoch": 0.77, + "grad_norm": 1.3765513523843336, + "learning_rate": 2.7202470844236896e-06, + "loss": 0.7617, + "step": 5139 + }, + { + "epoch": 0.77, + "grad_norm": 1.402381045740778, + "learning_rate": 2.7169342632950466e-06, + "loss": 0.8732, + "step": 5140 + }, + { + "epoch": 0.77, + "grad_norm": 1.651675469054545, + "learning_rate": 2.7136231434912854e-06, + "loss": 0.8727, + "step": 5141 + }, + { + "epoch": 0.77, + "grad_norm": 1.4181967333955, + "learning_rate": 2.7103137257858867e-06, + "loss": 0.8236, + "step": 5142 + }, + { + "epoch": 0.77, + "grad_norm": 1.4509007314845606, + "learning_rate": 2.707006010951925e-06, + "loss": 0.7604, + "step": 5143 + }, + { + "epoch": 0.77, + "grad_norm": 1.4976815508106767, + "learning_rate": 2.703699999762085e-06, + "loss": 0.8438, + "step": 5144 + }, + { + "epoch": 0.77, + "grad_norm": 1.3890570376023808, + "learning_rate": 2.7003956929886523e-06, + "loss": 0.8297, + "step": 5145 + }, + { + "epoch": 0.77, + "grad_norm": 1.5252838903345383, + "learning_rate": 2.6970930914035134e-06, + "loss": 0.859, + "step": 5146 + }, + { + "epoch": 0.77, + "grad_norm": 1.3687283116847584, + "learning_rate": 2.6937921957781587e-06, + "loss": 0.8266, + "step": 5147 + }, + { + "epoch": 0.77, + "grad_norm": 1.4690525156816665, + "learning_rate": 2.690493006883671e-06, + "loss": 0.8691, + "step": 5148 + }, + { + "epoch": 0.77, + "grad_norm": 1.428916280178367, + "learning_rate": 2.6871955254907455e-06, + "loss": 0.7763, + "step": 5149 + }, + { + "epoch": 0.77, + "grad_norm": 0.8549993759103848, + "learning_rate": 2.683899752369674e-06, + "loss": 0.3284, + "step": 5150 + }, + { + "epoch": 0.77, + "grad_norm": 1.0003487786199141, + "learning_rate": 2.68060568829035e-06, + "loss": 0.3414, + "step": 5151 + }, + { + "epoch": 0.77, + "grad_norm": 1.370832645926139, + "learning_rate": 2.6773133340222677e-06, + "loss": 0.8533, + "step": 5152 + }, + { + "epoch": 0.77, + "grad_norm": 1.495586436241171, + "learning_rate": 2.674022690334518e-06, + "loss": 0.7907, + "step": 5153 + }, + { + "epoch": 0.77, + "grad_norm": 1.6276951697022042, + "learning_rate": 2.6707337579957983e-06, + "loss": 0.8283, + "step": 5154 + }, + { + "epoch": 0.77, + "grad_norm": 1.4199171059645121, + "learning_rate": 2.667446537774402e-06, + "loss": 0.8223, + "step": 5155 + }, + { + "epoch": 0.77, + "grad_norm": 1.5410516943777657, + "learning_rate": 2.664161030438225e-06, + "loss": 0.8104, + "step": 5156 + }, + { + "epoch": 0.77, + "grad_norm": 1.4454646777894695, + "learning_rate": 2.660877236754762e-06, + "loss": 0.7811, + "step": 5157 + }, + { + "epoch": 0.77, + "grad_norm": 1.3515720656502974, + "learning_rate": 2.657595157491111e-06, + "loss": 0.8302, + "step": 5158 + }, + { + "epoch": 0.77, + "grad_norm": 1.4708979326894904, + "learning_rate": 2.654314793413959e-06, + "loss": 0.8698, + "step": 5159 + }, + { + "epoch": 0.77, + "grad_norm": 1.4599677975598686, + "learning_rate": 2.6510361452896038e-06, + "loss": 0.8004, + "step": 5160 + }, + { + "epoch": 0.77, + "grad_norm": 1.506253704189162, + "learning_rate": 2.6477592138839357e-06, + "loss": 0.8241, + "step": 5161 + }, + { + "epoch": 0.77, + "grad_norm": 1.415136095699315, + "learning_rate": 2.6444839999624496e-06, + "loss": 0.7332, + "step": 5162 + }, + { + "epoch": 0.77, + "grad_norm": 1.4388816830176892, + "learning_rate": 2.6412105042902327e-06, + "loss": 0.8909, + "step": 5163 + }, + { + "epoch": 0.77, + "grad_norm": 1.6125252675046966, + "learning_rate": 2.6379387276319757e-06, + "loss": 0.7778, + "step": 5164 + }, + { + "epoch": 0.77, + "grad_norm": 1.428628587989547, + "learning_rate": 2.6346686707519675e-06, + "loss": 0.7432, + "step": 5165 + }, + { + "epoch": 0.77, + "grad_norm": 1.6225580323936093, + "learning_rate": 2.631400334414089e-06, + "loss": 0.8191, + "step": 5166 + }, + { + "epoch": 0.77, + "grad_norm": 1.436347071481445, + "learning_rate": 2.6281337193818267e-06, + "loss": 0.7627, + "step": 5167 + }, + { + "epoch": 0.77, + "grad_norm": 1.3116512534546898, + "learning_rate": 2.624868826418262e-06, + "loss": 0.7553, + "step": 5168 + }, + { + "epoch": 0.77, + "grad_norm": 1.5899296275758623, + "learning_rate": 2.6216056562860746e-06, + "loss": 0.7602, + "step": 5169 + }, + { + "epoch": 0.77, + "grad_norm": 1.4261176809767078, + "learning_rate": 2.6183442097475398e-06, + "loss": 0.801, + "step": 5170 + }, + { + "epoch": 0.77, + "grad_norm": 1.3800929706572926, + "learning_rate": 2.6150844875645364e-06, + "loss": 0.7783, + "step": 5171 + }, + { + "epoch": 0.77, + "grad_norm": 1.6495394684461355, + "learning_rate": 2.61182649049853e-06, + "loss": 0.8392, + "step": 5172 + }, + { + "epoch": 0.77, + "grad_norm": 1.5925889519385894, + "learning_rate": 2.6085702193105913e-06, + "loss": 0.9093, + "step": 5173 + }, + { + "epoch": 0.77, + "grad_norm": 1.495658535440884, + "learning_rate": 2.605315674761385e-06, + "loss": 0.9176, + "step": 5174 + }, + { + "epoch": 0.77, + "grad_norm": 1.4187953236982287, + "learning_rate": 2.602062857611174e-06, + "loss": 0.8022, + "step": 5175 + }, + { + "epoch": 0.77, + "grad_norm": 1.408316648671769, + "learning_rate": 2.5988117686198167e-06, + "loss": 0.7984, + "step": 5176 + }, + { + "epoch": 0.77, + "grad_norm": 1.5680440434879845, + "learning_rate": 2.59556240854677e-06, + "loss": 0.8152, + "step": 5177 + }, + { + "epoch": 0.77, + "grad_norm": 1.5053090457953842, + "learning_rate": 2.5923147781510795e-06, + "loss": 0.797, + "step": 5178 + }, + { + "epoch": 0.77, + "grad_norm": 1.3465570887804141, + "learning_rate": 2.5890688781913954e-06, + "loss": 0.8021, + "step": 5179 + }, + { + "epoch": 0.77, + "grad_norm": 1.5141151868089557, + "learning_rate": 2.585824709425958e-06, + "loss": 0.9689, + "step": 5180 + }, + { + "epoch": 0.77, + "grad_norm": 1.6343181970856544, + "learning_rate": 2.5825822726126095e-06, + "loss": 0.741, + "step": 5181 + }, + { + "epoch": 0.77, + "grad_norm": 1.379115861408794, + "learning_rate": 2.5793415685087797e-06, + "loss": 0.8026, + "step": 5182 + }, + { + "epoch": 0.77, + "grad_norm": 1.446645431349882, + "learning_rate": 2.576102597871498e-06, + "loss": 0.762, + "step": 5183 + }, + { + "epoch": 0.77, + "grad_norm": 1.4875574360555688, + "learning_rate": 2.5728653614573927e-06, + "loss": 0.7581, + "step": 5184 + }, + { + "epoch": 0.77, + "grad_norm": 1.5517246123952102, + "learning_rate": 2.5696298600226766e-06, + "loss": 0.8235, + "step": 5185 + }, + { + "epoch": 0.77, + "grad_norm": 1.4999064078537652, + "learning_rate": 2.566396094323165e-06, + "loss": 0.8537, + "step": 5186 + }, + { + "epoch": 0.77, + "grad_norm": 1.4835738655312591, + "learning_rate": 2.5631640651142654e-06, + "loss": 0.7247, + "step": 5187 + }, + { + "epoch": 0.77, + "grad_norm": 1.4463761398096484, + "learning_rate": 2.559933773150982e-06, + "loss": 0.8275, + "step": 5188 + }, + { + "epoch": 0.77, + "grad_norm": 1.4602632828289412, + "learning_rate": 2.5567052191879104e-06, + "loss": 0.8509, + "step": 5189 + }, + { + "epoch": 0.77, + "grad_norm": 1.4387524315018494, + "learning_rate": 2.5534784039792437e-06, + "loss": 0.7542, + "step": 5190 + }, + { + "epoch": 0.77, + "grad_norm": 1.3635582970291584, + "learning_rate": 2.550253328278761e-06, + "loss": 0.804, + "step": 5191 + }, + { + "epoch": 0.77, + "grad_norm": 1.2054497070192842, + "learning_rate": 2.5470299928398424e-06, + "loss": 0.8303, + "step": 5192 + }, + { + "epoch": 0.77, + "grad_norm": 1.285101037158337, + "learning_rate": 2.5438083984154606e-06, + "loss": 0.809, + "step": 5193 + }, + { + "epoch": 0.77, + "grad_norm": 1.3858796677194265, + "learning_rate": 2.5405885457581793e-06, + "loss": 0.7714, + "step": 5194 + }, + { + "epoch": 0.78, + "grad_norm": 1.5169228055727215, + "learning_rate": 2.537370435620157e-06, + "loss": 0.7131, + "step": 5195 + }, + { + "epoch": 0.78, + "grad_norm": 1.538157340920337, + "learning_rate": 2.534154068753144e-06, + "loss": 0.8513, + "step": 5196 + }, + { + "epoch": 0.78, + "grad_norm": 1.5409136558321017, + "learning_rate": 2.5309394459084878e-06, + "loss": 0.905, + "step": 5197 + }, + { + "epoch": 0.78, + "grad_norm": 1.3537277369240417, + "learning_rate": 2.527726567837118e-06, + "loss": 0.8278, + "step": 5198 + }, + { + "epoch": 0.78, + "grad_norm": 1.3521318008433085, + "learning_rate": 2.524515435289566e-06, + "loss": 0.8742, + "step": 5199 + }, + { + "epoch": 0.78, + "grad_norm": 1.6118387465155175, + "learning_rate": 2.5213060490159536e-06, + "loss": 0.764, + "step": 5200 + }, + { + "epoch": 0.78, + "grad_norm": 1.350296708759324, + "learning_rate": 2.5180984097659924e-06, + "loss": 0.8, + "step": 5201 + }, + { + "epoch": 0.78, + "grad_norm": 0.9074766211718266, + "learning_rate": 2.514892518288988e-06, + "loss": 0.318, + "step": 5202 + }, + { + "epoch": 0.78, + "grad_norm": 1.2675993429738672, + "learning_rate": 2.511688375333842e-06, + "loss": 0.8022, + "step": 5203 + }, + { + "epoch": 0.78, + "grad_norm": 1.6910893229222446, + "learning_rate": 2.5084859816490327e-06, + "loss": 0.8335, + "step": 5204 + }, + { + "epoch": 0.78, + "grad_norm": 1.5454597820732972, + "learning_rate": 2.505285337982644e-06, + "loss": 0.7369, + "step": 5205 + }, + { + "epoch": 0.78, + "grad_norm": 1.4088497899234877, + "learning_rate": 2.5020864450823477e-06, + "loss": 0.82, + "step": 5206 + }, + { + "epoch": 0.78, + "grad_norm": 0.9735689390957943, + "learning_rate": 2.4988893036954045e-06, + "loss": 0.3349, + "step": 5207 + }, + { + "epoch": 0.78, + "grad_norm": 1.257075008397669, + "learning_rate": 2.4956939145686677e-06, + "loss": 0.8491, + "step": 5208 + }, + { + "epoch": 0.78, + "grad_norm": 1.3366961353814741, + "learning_rate": 2.4925002784485796e-06, + "loss": 0.8911, + "step": 5209 + }, + { + "epoch": 0.78, + "grad_norm": 1.483889845568321, + "learning_rate": 2.489308396081178e-06, + "loss": 0.7483, + "step": 5210 + }, + { + "epoch": 0.78, + "grad_norm": 1.667345769867752, + "learning_rate": 2.48611826821208e-06, + "loss": 0.8605, + "step": 5211 + }, + { + "epoch": 0.78, + "grad_norm": 1.6191098379376316, + "learning_rate": 2.4829298955865022e-06, + "loss": 0.8575, + "step": 5212 + }, + { + "epoch": 0.78, + "grad_norm": 1.3941262429078851, + "learning_rate": 2.4797432789492506e-06, + "loss": 0.8482, + "step": 5213 + }, + { + "epoch": 0.78, + "grad_norm": 1.5854755924570065, + "learning_rate": 2.476558419044718e-06, + "loss": 0.8346, + "step": 5214 + }, + { + "epoch": 0.78, + "grad_norm": 1.3690406610453252, + "learning_rate": 2.4733753166168883e-06, + "loss": 0.7884, + "step": 5215 + }, + { + "epoch": 0.78, + "grad_norm": 1.5323899439106292, + "learning_rate": 2.470193972409337e-06, + "loss": 0.8227, + "step": 5216 + }, + { + "epoch": 0.78, + "grad_norm": 1.3838070664348467, + "learning_rate": 2.467014387165222e-06, + "loss": 0.7963, + "step": 5217 + }, + { + "epoch": 0.78, + "grad_norm": 1.3533578391259928, + "learning_rate": 2.4638365616272952e-06, + "loss": 0.8548, + "step": 5218 + }, + { + "epoch": 0.78, + "grad_norm": 1.4816083900048744, + "learning_rate": 2.460660496537899e-06, + "loss": 0.8006, + "step": 5219 + }, + { + "epoch": 0.78, + "grad_norm": 0.9892379235304632, + "learning_rate": 2.4574861926389615e-06, + "loss": 0.3097, + "step": 5220 + }, + { + "epoch": 0.78, + "grad_norm": 0.8787789305551243, + "learning_rate": 2.454313650672001e-06, + "loss": 0.3708, + "step": 5221 + }, + { + "epoch": 0.78, + "grad_norm": 1.380559118396946, + "learning_rate": 2.451142871378124e-06, + "loss": 0.7665, + "step": 5222 + }, + { + "epoch": 0.78, + "grad_norm": 1.631606092651518, + "learning_rate": 2.447973855498027e-06, + "loss": 0.8337, + "step": 5223 + }, + { + "epoch": 0.78, + "grad_norm": 1.5255623267782115, + "learning_rate": 2.4448066037719865e-06, + "loss": 0.8143, + "step": 5224 + }, + { + "epoch": 0.78, + "grad_norm": 1.2901481456682227, + "learning_rate": 2.4416411169398755e-06, + "loss": 0.8573, + "step": 5225 + }, + { + "epoch": 0.78, + "grad_norm": 1.371585847299984, + "learning_rate": 2.4384773957411533e-06, + "loss": 0.7782, + "step": 5226 + }, + { + "epoch": 0.78, + "grad_norm": 1.4576708307762685, + "learning_rate": 2.4353154409148637e-06, + "loss": 0.8154, + "step": 5227 + }, + { + "epoch": 0.78, + "grad_norm": 0.8625181709864146, + "learning_rate": 2.432155253199642e-06, + "loss": 0.3304, + "step": 5228 + }, + { + "epoch": 0.78, + "grad_norm": 1.4268532491954804, + "learning_rate": 2.42899683333371e-06, + "loss": 0.7975, + "step": 5229 + }, + { + "epoch": 0.78, + "grad_norm": 1.7498593891227354, + "learning_rate": 2.4258401820548682e-06, + "loss": 0.81, + "step": 5230 + }, + { + "epoch": 0.78, + "grad_norm": 0.8353925158433698, + "learning_rate": 2.4226853001005146e-06, + "loss": 0.3, + "step": 5231 + }, + { + "epoch": 0.78, + "grad_norm": 1.472565166859187, + "learning_rate": 2.4195321882076295e-06, + "loss": 0.8038, + "step": 5232 + }, + { + "epoch": 0.78, + "grad_norm": 1.6878072163181843, + "learning_rate": 2.4163808471127815e-06, + "loss": 0.7141, + "step": 5233 + }, + { + "epoch": 0.78, + "grad_norm": 1.6696531085069557, + "learning_rate": 2.413231277552122e-06, + "loss": 0.7578, + "step": 5234 + }, + { + "epoch": 0.78, + "grad_norm": 1.5066712475494022, + "learning_rate": 2.410083480261395e-06, + "loss": 0.7178, + "step": 5235 + }, + { + "epoch": 0.78, + "grad_norm": 1.476737938530119, + "learning_rate": 2.4069374559759207e-06, + "loss": 0.7588, + "step": 5236 + }, + { + "epoch": 0.78, + "grad_norm": 1.5684129870884418, + "learning_rate": 2.4037932054306125e-06, + "loss": 0.839, + "step": 5237 + }, + { + "epoch": 0.78, + "grad_norm": 1.3662739030944941, + "learning_rate": 2.400650729359969e-06, + "loss": 0.775, + "step": 5238 + }, + { + "epoch": 0.78, + "grad_norm": 1.5272639783042223, + "learning_rate": 2.3975100284980713e-06, + "loss": 0.8904, + "step": 5239 + }, + { + "epoch": 0.78, + "grad_norm": 1.4701769731117003, + "learning_rate": 2.394371103578589e-06, + "loss": 0.7936, + "step": 5240 + }, + { + "epoch": 0.78, + "grad_norm": 0.8338514340417672, + "learning_rate": 2.3912339553347742e-06, + "loss": 0.311, + "step": 5241 + }, + { + "epoch": 0.78, + "grad_norm": 1.3105285724265792, + "learning_rate": 2.3880985844994674e-06, + "loss": 0.8007, + "step": 5242 + }, + { + "epoch": 0.78, + "grad_norm": 1.6320425498602253, + "learning_rate": 2.384964991805089e-06, + "loss": 0.8204, + "step": 5243 + }, + { + "epoch": 0.78, + "grad_norm": 1.5809349853554442, + "learning_rate": 2.3818331779836447e-06, + "loss": 0.7954, + "step": 5244 + }, + { + "epoch": 0.78, + "grad_norm": 1.4669171282749138, + "learning_rate": 2.3787031437667307e-06, + "loss": 0.9086, + "step": 5245 + }, + { + "epoch": 0.78, + "grad_norm": 1.5352490520819126, + "learning_rate": 2.37557488988552e-06, + "loss": 0.8549, + "step": 5246 + }, + { + "epoch": 0.78, + "grad_norm": 1.2499745634590351, + "learning_rate": 2.372448417070776e-06, + "loss": 0.7744, + "step": 5247 + }, + { + "epoch": 0.78, + "grad_norm": 1.5954549075762638, + "learning_rate": 2.3693237260528436e-06, + "loss": 0.8664, + "step": 5248 + }, + { + "epoch": 0.78, + "grad_norm": 1.3293878231609324, + "learning_rate": 2.366200817561647e-06, + "loss": 0.8828, + "step": 5249 + }, + { + "epoch": 0.78, + "grad_norm": 1.4326423728616928, + "learning_rate": 2.3630796923266997e-06, + "loss": 0.8084, + "step": 5250 + }, + { + "epoch": 0.78, + "grad_norm": 1.4560243550959084, + "learning_rate": 2.3599603510770962e-06, + "loss": 0.9566, + "step": 5251 + }, + { + "epoch": 0.78, + "grad_norm": 1.510083149119049, + "learning_rate": 2.3568427945415163e-06, + "loss": 0.7854, + "step": 5252 + }, + { + "epoch": 0.78, + "grad_norm": 1.5400789252076748, + "learning_rate": 2.3537270234482213e-06, + "loss": 0.8988, + "step": 5253 + }, + { + "epoch": 0.78, + "grad_norm": 1.5527720716686881, + "learning_rate": 2.350613038525058e-06, + "loss": 0.9005, + "step": 5254 + }, + { + "epoch": 0.78, + "grad_norm": 1.2622735063177308, + "learning_rate": 2.347500840499447e-06, + "loss": 0.8947, + "step": 5255 + }, + { + "epoch": 0.78, + "grad_norm": 1.5993420350708767, + "learning_rate": 2.3443904300984034e-06, + "loss": 0.879, + "step": 5256 + }, + { + "epoch": 0.78, + "grad_norm": 1.477986581250986, + "learning_rate": 2.3412818080485176e-06, + "loss": 0.7646, + "step": 5257 + }, + { + "epoch": 0.78, + "grad_norm": 1.4935451605038887, + "learning_rate": 2.3381749750759643e-06, + "loss": 0.7298, + "step": 5258 + }, + { + "epoch": 0.78, + "grad_norm": 1.5313736345048246, + "learning_rate": 2.335069931906503e-06, + "loss": 0.7812, + "step": 5259 + }, + { + "epoch": 0.78, + "grad_norm": 1.5945991236264816, + "learning_rate": 2.331966679265467e-06, + "loss": 0.8085, + "step": 5260 + }, + { + "epoch": 0.78, + "grad_norm": 1.3749900459882924, + "learning_rate": 2.3288652178777783e-06, + "loss": 0.8377, + "step": 5261 + }, + { + "epoch": 0.79, + "grad_norm": 1.387328957902974, + "learning_rate": 2.3257655484679376e-06, + "loss": 0.8335, + "step": 5262 + }, + { + "epoch": 0.79, + "grad_norm": 1.2039454576295368, + "learning_rate": 2.3226676717600303e-06, + "loss": 0.7619, + "step": 5263 + }, + { + "epoch": 0.79, + "grad_norm": 1.3853506734165646, + "learning_rate": 2.3195715884777203e-06, + "loss": 0.8254, + "step": 5264 + }, + { + "epoch": 0.79, + "grad_norm": 1.2965391899145648, + "learning_rate": 2.316477299344254e-06, + "loss": 0.8019, + "step": 5265 + }, + { + "epoch": 0.79, + "grad_norm": 0.9064926046763753, + "learning_rate": 2.3133848050824536e-06, + "loss": 0.3247, + "step": 5266 + }, + { + "epoch": 0.79, + "grad_norm": 1.4082301126663817, + "learning_rate": 2.3102941064147287e-06, + "loss": 0.7995, + "step": 5267 + }, + { + "epoch": 0.79, + "grad_norm": 1.6248746087681787, + "learning_rate": 2.3072052040630666e-06, + "loss": 0.8671, + "step": 5268 + }, + { + "epoch": 0.79, + "grad_norm": 1.3629218163259866, + "learning_rate": 2.3041180987490354e-06, + "loss": 0.7719, + "step": 5269 + }, + { + "epoch": 0.79, + "grad_norm": 1.4157957129881158, + "learning_rate": 2.3010327911937856e-06, + "loss": 0.8142, + "step": 5270 + }, + { + "epoch": 0.79, + "grad_norm": 1.5878835480873652, + "learning_rate": 2.2979492821180394e-06, + "loss": 0.7817, + "step": 5271 + }, + { + "epoch": 0.79, + "grad_norm": 1.5204970072152255, + "learning_rate": 2.2948675722421086e-06, + "loss": 0.893, + "step": 5272 + }, + { + "epoch": 0.79, + "grad_norm": 1.505893533717531, + "learning_rate": 2.2917876622858814e-06, + "loss": 0.7169, + "step": 5273 + }, + { + "epoch": 0.79, + "grad_norm": 1.4025162488470824, + "learning_rate": 2.288709552968823e-06, + "loss": 0.8427, + "step": 5274 + }, + { + "epoch": 0.79, + "grad_norm": 1.37039198332275, + "learning_rate": 2.285633245009984e-06, + "loss": 0.8525, + "step": 5275 + }, + { + "epoch": 0.79, + "grad_norm": 1.6165986290472205, + "learning_rate": 2.2825587391279857e-06, + "loss": 0.8273, + "step": 5276 + }, + { + "epoch": 0.79, + "grad_norm": 1.4875401293626862, + "learning_rate": 2.279486036041034e-06, + "loss": 0.8569, + "step": 5277 + }, + { + "epoch": 0.79, + "grad_norm": 1.4496691362536043, + "learning_rate": 2.276415136466913e-06, + "loss": 0.7318, + "step": 5278 + }, + { + "epoch": 0.79, + "grad_norm": 1.2522938185915398, + "learning_rate": 2.273346041122987e-06, + "loss": 0.7418, + "step": 5279 + }, + { + "epoch": 0.79, + "grad_norm": 1.3592608817749643, + "learning_rate": 2.270278750726194e-06, + "loss": 0.8044, + "step": 5280 + }, + { + "epoch": 0.79, + "grad_norm": 1.3956000475771186, + "learning_rate": 2.267213265993058e-06, + "loss": 0.9067, + "step": 5281 + }, + { + "epoch": 0.79, + "grad_norm": 1.6103565656677958, + "learning_rate": 2.2641495876396713e-06, + "loss": 0.8752, + "step": 5282 + }, + { + "epoch": 0.79, + "grad_norm": 1.4045429133576957, + "learning_rate": 2.2610877163817113e-06, + "loss": 0.822, + "step": 5283 + }, + { + "epoch": 0.79, + "grad_norm": 1.2697186999216175, + "learning_rate": 2.2580276529344312e-06, + "loss": 0.8148, + "step": 5284 + }, + { + "epoch": 0.79, + "grad_norm": 1.4721477791839408, + "learning_rate": 2.254969398012663e-06, + "loss": 0.8856, + "step": 5285 + }, + { + "epoch": 0.79, + "grad_norm": 1.358093066277862, + "learning_rate": 2.2519129523308146e-06, + "loss": 0.8148, + "step": 5286 + }, + { + "epoch": 0.79, + "grad_norm": 1.3097182267848255, + "learning_rate": 2.2488583166028754e-06, + "loss": 0.85, + "step": 5287 + }, + { + "epoch": 0.79, + "grad_norm": 1.6008923320524924, + "learning_rate": 2.2458054915424033e-06, + "loss": 0.8212, + "step": 5288 + }, + { + "epoch": 0.79, + "grad_norm": 1.5377344741108048, + "learning_rate": 2.242754477862541e-06, + "loss": 0.9197, + "step": 5289 + }, + { + "epoch": 0.79, + "grad_norm": 1.3935879968673492, + "learning_rate": 2.239705276276004e-06, + "loss": 0.6686, + "step": 5290 + }, + { + "epoch": 0.79, + "grad_norm": 1.4539789976955533, + "learning_rate": 2.2366578874950894e-06, + "loss": 0.8263, + "step": 5291 + }, + { + "epoch": 0.79, + "grad_norm": 1.4132237100301375, + "learning_rate": 2.2336123122316642e-06, + "loss": 0.8171, + "step": 5292 + }, + { + "epoch": 0.79, + "grad_norm": 1.4586722526510985, + "learning_rate": 2.2305685511971777e-06, + "loss": 0.7955, + "step": 5293 + }, + { + "epoch": 0.79, + "grad_norm": 1.3740479406789656, + "learning_rate": 2.2275266051026533e-06, + "loss": 0.8507, + "step": 5294 + }, + { + "epoch": 0.79, + "grad_norm": 1.3185802465432919, + "learning_rate": 2.2244864746586858e-06, + "loss": 0.7254, + "step": 5295 + }, + { + "epoch": 0.79, + "grad_norm": 1.5739235116017158, + "learning_rate": 2.2214481605754532e-06, + "loss": 0.8483, + "step": 5296 + }, + { + "epoch": 0.79, + "grad_norm": 1.4799101405989241, + "learning_rate": 2.218411663562704e-06, + "loss": 0.8058, + "step": 5297 + }, + { + "epoch": 0.79, + "grad_norm": 1.4417786188295285, + "learning_rate": 2.215376984329767e-06, + "loss": 0.7945, + "step": 5298 + }, + { + "epoch": 0.79, + "grad_norm": 1.4823076407706, + "learning_rate": 2.2123441235855404e-06, + "loss": 0.7885, + "step": 5299 + }, + { + "epoch": 0.79, + "grad_norm": 1.409585830784883, + "learning_rate": 2.2093130820385057e-06, + "loss": 0.9087, + "step": 5300 + }, + { + "epoch": 0.79, + "grad_norm": 1.3536009300620115, + "learning_rate": 2.2062838603967086e-06, + "loss": 0.7439, + "step": 5301 + }, + { + "epoch": 0.79, + "grad_norm": 1.6376922723404126, + "learning_rate": 2.2032564593677773e-06, + "loss": 0.8306, + "step": 5302 + }, + { + "epoch": 0.79, + "grad_norm": 1.6286567421425713, + "learning_rate": 2.2002308796589146e-06, + "loss": 0.8224, + "step": 5303 + }, + { + "epoch": 0.79, + "grad_norm": 1.3952200274512352, + "learning_rate": 2.197207121976895e-06, + "loss": 0.8228, + "step": 5304 + }, + { + "epoch": 0.79, + "grad_norm": 1.5100682857093606, + "learning_rate": 2.1941851870280697e-06, + "loss": 0.8633, + "step": 5305 + }, + { + "epoch": 0.79, + "grad_norm": 1.4221243770650307, + "learning_rate": 2.1911650755183646e-06, + "loss": 0.8321, + "step": 5306 + }, + { + "epoch": 0.79, + "grad_norm": 1.462208790142082, + "learning_rate": 2.1881467881532737e-06, + "loss": 0.7677, + "step": 5307 + }, + { + "epoch": 0.79, + "grad_norm": 1.4424789373436055, + "learning_rate": 2.18513032563787e-06, + "loss": 0.7192, + "step": 5308 + }, + { + "epoch": 0.79, + "grad_norm": 1.4481557118018122, + "learning_rate": 2.1821156886768013e-06, + "loss": 0.8455, + "step": 5309 + }, + { + "epoch": 0.79, + "grad_norm": 1.3664316424469038, + "learning_rate": 2.179102877974287e-06, + "loss": 0.7968, + "step": 5310 + }, + { + "epoch": 0.79, + "grad_norm": 1.6382194725462393, + "learning_rate": 2.1760918942341193e-06, + "loss": 0.8453, + "step": 5311 + }, + { + "epoch": 0.79, + "grad_norm": 1.3278957112921814, + "learning_rate": 2.1730827381596643e-06, + "loss": 0.791, + "step": 5312 + }, + { + "epoch": 0.79, + "grad_norm": 1.549786987636284, + "learning_rate": 2.1700754104538645e-06, + "loss": 0.8028, + "step": 5313 + }, + { + "epoch": 0.79, + "grad_norm": 2.1473408579159785, + "learning_rate": 2.167069911819225e-06, + "loss": 0.8213, + "step": 5314 + }, + { + "epoch": 0.79, + "grad_norm": 1.382246412771008, + "learning_rate": 2.164066242957836e-06, + "loss": 0.8631, + "step": 5315 + }, + { + "epoch": 0.79, + "grad_norm": 0.8344783487008602, + "learning_rate": 2.1610644045713525e-06, + "loss": 0.3076, + "step": 5316 + }, + { + "epoch": 0.79, + "grad_norm": 1.5441011492544239, + "learning_rate": 2.158064397361005e-06, + "loss": 0.8532, + "step": 5317 + }, + { + "epoch": 0.79, + "grad_norm": 0.950255354839666, + "learning_rate": 2.1550662220275955e-06, + "loss": 0.3057, + "step": 5318 + }, + { + "epoch": 0.79, + "grad_norm": 1.4853759784936431, + "learning_rate": 2.152069879271501e-06, + "loss": 0.8486, + "step": 5319 + }, + { + "epoch": 0.79, + "grad_norm": 1.4454438136993746, + "learning_rate": 2.1490753697926613e-06, + "loss": 0.8341, + "step": 5320 + }, + { + "epoch": 0.79, + "grad_norm": 1.4622404209473119, + "learning_rate": 2.1460826942905977e-06, + "loss": 0.8743, + "step": 5321 + }, + { + "epoch": 0.79, + "grad_norm": 1.3797156454716968, + "learning_rate": 2.1430918534643996e-06, + "loss": 0.7598, + "step": 5322 + }, + { + "epoch": 0.79, + "grad_norm": 1.5968018872923193, + "learning_rate": 2.140102848012726e-06, + "loss": 0.8278, + "step": 5323 + }, + { + "epoch": 0.79, + "grad_norm": 0.7060098238102301, + "learning_rate": 2.1371156786338108e-06, + "loss": 0.3128, + "step": 5324 + }, + { + "epoch": 0.79, + "grad_norm": 1.3758143227136712, + "learning_rate": 2.134130346025457e-06, + "loss": 0.8858, + "step": 5325 + }, + { + "epoch": 0.79, + "grad_norm": 0.8866841618266074, + "learning_rate": 2.13114685088504e-06, + "loss": 0.3158, + "step": 5326 + }, + { + "epoch": 0.79, + "grad_norm": 1.618890873061229, + "learning_rate": 2.1281651939094996e-06, + "loss": 0.7638, + "step": 5327 + }, + { + "epoch": 0.79, + "grad_norm": 1.8191641502751412, + "learning_rate": 2.1251853757953546e-06, + "loss": 0.8254, + "step": 5328 + }, + { + "epoch": 0.8, + "grad_norm": 1.406772042438374, + "learning_rate": 2.1222073972386903e-06, + "loss": 0.7824, + "step": 5329 + }, + { + "epoch": 0.8, + "grad_norm": 0.9188725734587385, + "learning_rate": 2.1192312589351626e-06, + "loss": 0.3072, + "step": 5330 + }, + { + "epoch": 0.8, + "grad_norm": 1.550843961826565, + "learning_rate": 2.1162569615799978e-06, + "loss": 0.8175, + "step": 5331 + }, + { + "epoch": 0.8, + "grad_norm": 1.5359053052232876, + "learning_rate": 2.1132845058679942e-06, + "loss": 0.8831, + "step": 5332 + }, + { + "epoch": 0.8, + "grad_norm": 1.3691604942044595, + "learning_rate": 2.110313892493514e-06, + "loss": 0.7478, + "step": 5333 + }, + { + "epoch": 0.8, + "grad_norm": 0.879156354710742, + "learning_rate": 2.107345122150495e-06, + "loss": 0.3026, + "step": 5334 + }, + { + "epoch": 0.8, + "grad_norm": 1.5930274302444687, + "learning_rate": 2.1043781955324406e-06, + "loss": 0.8315, + "step": 5335 + }, + { + "epoch": 0.8, + "grad_norm": 1.5767135817808149, + "learning_rate": 2.101413113332427e-06, + "loss": 0.8328, + "step": 5336 + }, + { + "epoch": 0.8, + "grad_norm": 1.4387097891525165, + "learning_rate": 2.098449876243096e-06, + "loss": 0.8215, + "step": 5337 + }, + { + "epoch": 0.8, + "grad_norm": 1.567902221655809, + "learning_rate": 2.0954884849566613e-06, + "loss": 0.8905, + "step": 5338 + }, + { + "epoch": 0.8, + "grad_norm": 1.312496117177397, + "learning_rate": 2.0925289401649074e-06, + "loss": 0.8174, + "step": 5339 + }, + { + "epoch": 0.8, + "grad_norm": 1.3971126283604443, + "learning_rate": 2.0895712425591776e-06, + "loss": 0.7667, + "step": 5340 + }, + { + "epoch": 0.8, + "grad_norm": 1.5511720378763254, + "learning_rate": 2.0866153928303947e-06, + "loss": 0.8443, + "step": 5341 + }, + { + "epoch": 0.8, + "grad_norm": 1.326341440515166, + "learning_rate": 2.083661391669043e-06, + "loss": 0.8235, + "step": 5342 + }, + { + "epoch": 0.8, + "grad_norm": 1.3307545513156853, + "learning_rate": 2.0807092397651795e-06, + "loss": 0.8682, + "step": 5343 + }, + { + "epoch": 0.8, + "grad_norm": 1.4253670642799852, + "learning_rate": 2.0777589378084263e-06, + "loss": 0.8324, + "step": 5344 + }, + { + "epoch": 0.8, + "grad_norm": 1.4893130020171332, + "learning_rate": 2.074810486487977e-06, + "loss": 0.8671, + "step": 5345 + }, + { + "epoch": 0.8, + "grad_norm": 1.4683105719418144, + "learning_rate": 2.0718638864925845e-06, + "loss": 0.8913, + "step": 5346 + }, + { + "epoch": 0.8, + "grad_norm": 1.485124944420155, + "learning_rate": 2.0689191385105787e-06, + "loss": 0.7797, + "step": 5347 + }, + { + "epoch": 0.8, + "grad_norm": 1.4233495198308606, + "learning_rate": 2.0659762432298527e-06, + "loss": 0.7999, + "step": 5348 + }, + { + "epoch": 0.8, + "grad_norm": 1.5214002510094353, + "learning_rate": 2.063035201337865e-06, + "loss": 0.8696, + "step": 5349 + }, + { + "epoch": 0.8, + "grad_norm": 1.432322488077478, + "learning_rate": 2.0600960135216463e-06, + "loss": 0.8727, + "step": 5350 + }, + { + "epoch": 0.8, + "grad_norm": 1.3551453168787113, + "learning_rate": 2.057158680467789e-06, + "loss": 0.8017, + "step": 5351 + }, + { + "epoch": 0.8, + "grad_norm": 1.4328855823486144, + "learning_rate": 2.0542232028624585e-06, + "loss": 0.8454, + "step": 5352 + }, + { + "epoch": 0.8, + "grad_norm": 1.3750459164842304, + "learning_rate": 2.051289581391377e-06, + "loss": 0.7989, + "step": 5353 + }, + { + "epoch": 0.8, + "grad_norm": 1.4856494596060186, + "learning_rate": 2.04835781673984e-06, + "loss": 0.791, + "step": 5354 + }, + { + "epoch": 0.8, + "grad_norm": 1.3117024561544246, + "learning_rate": 2.04542790959271e-06, + "loss": 0.851, + "step": 5355 + }, + { + "epoch": 0.8, + "grad_norm": 1.4377549971515386, + "learning_rate": 2.0424998606344127e-06, + "loss": 0.8665, + "step": 5356 + }, + { + "epoch": 0.8, + "grad_norm": 0.9241354861484622, + "learning_rate": 2.03957367054894e-06, + "loss": 0.2912, + "step": 5357 + }, + { + "epoch": 0.8, + "grad_norm": 1.4316737260934016, + "learning_rate": 2.036649340019854e-06, + "loss": 0.8389, + "step": 5358 + }, + { + "epoch": 0.8, + "grad_norm": 1.443895065695472, + "learning_rate": 2.0337268697302738e-06, + "loss": 0.7649, + "step": 5359 + }, + { + "epoch": 0.8, + "grad_norm": 1.4539548730739582, + "learning_rate": 2.0308062603628888e-06, + "loss": 0.8618, + "step": 5360 + }, + { + "epoch": 0.8, + "grad_norm": 1.5010686975171228, + "learning_rate": 2.0278875125999565e-06, + "loss": 0.7263, + "step": 5361 + }, + { + "epoch": 0.8, + "grad_norm": 0.8260585406193655, + "learning_rate": 2.024970627123295e-06, + "loss": 0.3464, + "step": 5362 + }, + { + "epoch": 0.8, + "grad_norm": 1.4445796919489027, + "learning_rate": 2.022055604614289e-06, + "loss": 0.7995, + "step": 5363 + }, + { + "epoch": 0.8, + "grad_norm": 1.368011901802301, + "learning_rate": 2.0191424457538923e-06, + "loss": 0.8362, + "step": 5364 + }, + { + "epoch": 0.8, + "grad_norm": 1.4680967679116654, + "learning_rate": 2.016231151222612e-06, + "loss": 0.8054, + "step": 5365 + }, + { + "epoch": 0.8, + "grad_norm": 1.5570133174700023, + "learning_rate": 2.0133217217005295e-06, + "loss": 0.7711, + "step": 5366 + }, + { + "epoch": 0.8, + "grad_norm": 1.5520615277476053, + "learning_rate": 2.0104141578672887e-06, + "loss": 0.8724, + "step": 5367 + }, + { + "epoch": 0.8, + "grad_norm": 1.5891897668906345, + "learning_rate": 2.0075084604020967e-06, + "loss": 0.8117, + "step": 5368 + }, + { + "epoch": 0.8, + "grad_norm": 1.3704469913285864, + "learning_rate": 2.004604629983725e-06, + "loss": 0.8463, + "step": 5369 + }, + { + "epoch": 0.8, + "grad_norm": 1.8332186817945408, + "learning_rate": 2.001702667290508e-06, + "loss": 0.8072, + "step": 5370 + }, + { + "epoch": 0.8, + "grad_norm": 1.719631561783302, + "learning_rate": 1.998802573000348e-06, + "loss": 0.8008, + "step": 5371 + }, + { + "epoch": 0.8, + "grad_norm": 1.4088136034876957, + "learning_rate": 1.9959043477907e-06, + "loss": 0.8846, + "step": 5372 + }, + { + "epoch": 0.8, + "grad_norm": 1.4511524817812553, + "learning_rate": 1.9930079923385946e-06, + "loss": 0.8108, + "step": 5373 + }, + { + "epoch": 0.8, + "grad_norm": 1.4416291799769558, + "learning_rate": 1.99011350732062e-06, + "loss": 0.8025, + "step": 5374 + }, + { + "epoch": 0.8, + "grad_norm": 1.2962009627312474, + "learning_rate": 1.987220893412928e-06, + "loss": 0.7932, + "step": 5375 + }, + { + "epoch": 0.8, + "grad_norm": 1.4065173464534155, + "learning_rate": 1.984330151291233e-06, + "loss": 0.8619, + "step": 5376 + }, + { + "epoch": 0.8, + "grad_norm": 1.4356291974396183, + "learning_rate": 1.981441281630816e-06, + "loss": 0.716, + "step": 5377 + }, + { + "epoch": 0.8, + "grad_norm": 1.420530063030565, + "learning_rate": 1.978554285106512e-06, + "loss": 0.8435, + "step": 5378 + }, + { + "epoch": 0.8, + "grad_norm": 1.386981267661283, + "learning_rate": 1.975669162392726e-06, + "loss": 0.8503, + "step": 5379 + }, + { + "epoch": 0.8, + "grad_norm": 1.3077479661038947, + "learning_rate": 1.972785914163423e-06, + "loss": 0.7825, + "step": 5380 + }, + { + "epoch": 0.8, + "grad_norm": 1.4674984921255925, + "learning_rate": 1.9699045410921303e-06, + "loss": 0.803, + "step": 5381 + }, + { + "epoch": 0.8, + "grad_norm": 1.4783606189721945, + "learning_rate": 1.967025043851939e-06, + "loss": 0.8165, + "step": 5382 + }, + { + "epoch": 0.8, + "grad_norm": 1.471136624506106, + "learning_rate": 1.9641474231154956e-06, + "loss": 0.7785, + "step": 5383 + }, + { + "epoch": 0.8, + "grad_norm": 1.4972478962810782, + "learning_rate": 1.9612716795550146e-06, + "loss": 0.7779, + "step": 5384 + }, + { + "epoch": 0.8, + "grad_norm": 1.33398224633579, + "learning_rate": 1.95839781384227e-06, + "loss": 0.8951, + "step": 5385 + }, + { + "epoch": 0.8, + "grad_norm": 1.4877879045961764, + "learning_rate": 1.9555258266485965e-06, + "loss": 0.9326, + "step": 5386 + }, + { + "epoch": 0.8, + "grad_norm": 1.2889626811133876, + "learning_rate": 1.9526557186448924e-06, + "loss": 0.7704, + "step": 5387 + }, + { + "epoch": 0.8, + "grad_norm": 1.4819842542004975, + "learning_rate": 1.9497874905016156e-06, + "loss": 0.7892, + "step": 5388 + }, + { + "epoch": 0.8, + "grad_norm": 1.566893118420538, + "learning_rate": 1.9469211428887813e-06, + "loss": 0.8179, + "step": 5389 + }, + { + "epoch": 0.8, + "grad_norm": 1.4152449845672597, + "learning_rate": 1.9440566764759704e-06, + "loss": 0.9246, + "step": 5390 + }, + { + "epoch": 0.8, + "grad_norm": 1.4861657396053622, + "learning_rate": 1.9411940919323215e-06, + "loss": 0.8003, + "step": 5391 + }, + { + "epoch": 0.8, + "grad_norm": 1.514813741250979, + "learning_rate": 1.9383333899265368e-06, + "loss": 0.8184, + "step": 5392 + }, + { + "epoch": 0.8, + "grad_norm": 1.3701586672865653, + "learning_rate": 1.9354745711268763e-06, + "loss": 0.8368, + "step": 5393 + }, + { + "epoch": 0.8, + "grad_norm": 1.3485465888843264, + "learning_rate": 1.9326176362011584e-06, + "loss": 0.8578, + "step": 5394 + }, + { + "epoch": 0.8, + "grad_norm": 1.4747462436491496, + "learning_rate": 1.9297625858167636e-06, + "loss": 0.7842, + "step": 5395 + }, + { + "epoch": 0.81, + "grad_norm": 1.5091024054912812, + "learning_rate": 1.9269094206406326e-06, + "loss": 0.8427, + "step": 5396 + }, + { + "epoch": 0.81, + "grad_norm": 1.5720568014752216, + "learning_rate": 1.9240581413392647e-06, + "loss": 0.8886, + "step": 5397 + }, + { + "epoch": 0.81, + "grad_norm": 2.297846175226786, + "learning_rate": 1.9212087485787233e-06, + "loss": 0.7749, + "step": 5398 + }, + { + "epoch": 0.81, + "grad_norm": 1.4141657288423555, + "learning_rate": 1.918361243024619e-06, + "loss": 0.8324, + "step": 5399 + }, + { + "epoch": 0.81, + "grad_norm": 1.4273951645429928, + "learning_rate": 1.9155156253421348e-06, + "loss": 0.8196, + "step": 5400 + }, + { + "epoch": 0.81, + "grad_norm": 1.4874439812979077, + "learning_rate": 1.9126718961960056e-06, + "loss": 0.776, + "step": 5401 + }, + { + "epoch": 0.81, + "grad_norm": 1.4351381245446067, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.7787, + "step": 5402 + }, + { + "epoch": 0.81, + "grad_norm": 1.5731438725047595, + "learning_rate": 1.906990106169555e-06, + "loss": 0.8282, + "step": 5403 + }, + { + "epoch": 0.81, + "grad_norm": 1.3985393622930269, + "learning_rate": 1.9041520466164988e-06, + "loss": 0.8458, + "step": 5404 + }, + { + "epoch": 0.81, + "grad_norm": 1.368068726462965, + "learning_rate": 1.9013158782543307e-06, + "loss": 0.8215, + "step": 5405 + }, + { + "epoch": 0.81, + "grad_norm": 0.9142689359705787, + "learning_rate": 1.8984816017455798e-06, + "loss": 0.3243, + "step": 5406 + }, + { + "epoch": 0.81, + "grad_norm": 1.5347651366817776, + "learning_rate": 1.8956492177523345e-06, + "loss": 0.803, + "step": 5407 + }, + { + "epoch": 0.81, + "grad_norm": 1.4512161072296008, + "learning_rate": 1.8928187269362398e-06, + "loss": 0.7971, + "step": 5408 + }, + { + "epoch": 0.81, + "grad_norm": 1.635393545659041, + "learning_rate": 1.8899901299584965e-06, + "loss": 0.8271, + "step": 5409 + }, + { + "epoch": 0.81, + "grad_norm": 1.461825113896843, + "learning_rate": 1.8871634274798701e-06, + "loss": 0.7527, + "step": 5410 + }, + { + "epoch": 0.81, + "grad_norm": 1.5204580589195327, + "learning_rate": 1.8843386201606729e-06, + "loss": 0.8204, + "step": 5411 + }, + { + "epoch": 0.81, + "grad_norm": 1.3681925437789058, + "learning_rate": 1.8815157086607826e-06, + "loss": 0.7905, + "step": 5412 + }, + { + "epoch": 0.81, + "grad_norm": 1.3815746363605288, + "learning_rate": 1.878694693639631e-06, + "loss": 0.8228, + "step": 5413 + }, + { + "epoch": 0.81, + "grad_norm": 1.4192743005331077, + "learning_rate": 1.8758755757562087e-06, + "loss": 0.8051, + "step": 5414 + }, + { + "epoch": 0.81, + "grad_norm": 1.637184468861064, + "learning_rate": 1.8730583556690607e-06, + "loss": 0.8609, + "step": 5415 + }, + { + "epoch": 0.81, + "grad_norm": 1.5111275078575743, + "learning_rate": 1.8702430340362932e-06, + "loss": 0.8047, + "step": 5416 + }, + { + "epoch": 0.81, + "grad_norm": 1.60423389568482, + "learning_rate": 1.86742961151556e-06, + "loss": 0.7251, + "step": 5417 + }, + { + "epoch": 0.81, + "grad_norm": 1.3892790376448796, + "learning_rate": 1.8646180887640808e-06, + "loss": 0.8207, + "step": 5418 + }, + { + "epoch": 0.81, + "grad_norm": 1.4942206841256545, + "learning_rate": 1.861808466438625e-06, + "loss": 0.8745, + "step": 5419 + }, + { + "epoch": 0.81, + "grad_norm": 1.5195657191743155, + "learning_rate": 1.8590007451955227e-06, + "loss": 0.855, + "step": 5420 + }, + { + "epoch": 0.81, + "grad_norm": 1.3182416460665347, + "learning_rate": 1.8561949256906576e-06, + "loss": 0.7961, + "step": 5421 + }, + { + "epoch": 0.81, + "grad_norm": 1.4030934540663471, + "learning_rate": 1.8533910085794714e-06, + "loss": 0.6997, + "step": 5422 + }, + { + "epoch": 0.81, + "grad_norm": 1.3457299328086845, + "learning_rate": 1.8505889945169552e-06, + "loss": 0.7676, + "step": 5423 + }, + { + "epoch": 0.81, + "grad_norm": 1.7200951162355735, + "learning_rate": 1.8477888841576619e-06, + "loss": 0.833, + "step": 5424 + }, + { + "epoch": 0.81, + "grad_norm": 1.5666898663294038, + "learning_rate": 1.8449906781556959e-06, + "loss": 0.7709, + "step": 5425 + }, + { + "epoch": 0.81, + "grad_norm": 1.4403661668640364, + "learning_rate": 1.842194377164721e-06, + "loss": 0.8095, + "step": 5426 + }, + { + "epoch": 0.81, + "grad_norm": 1.3491385214056537, + "learning_rate": 1.8393999818379527e-06, + "loss": 0.7634, + "step": 5427 + }, + { + "epoch": 0.81, + "grad_norm": 1.3301694743398509, + "learning_rate": 1.8366074928281608e-06, + "loss": 0.8202, + "step": 5428 + }, + { + "epoch": 0.81, + "grad_norm": 0.8879331586424297, + "learning_rate": 1.8338169107876746e-06, + "loss": 0.3175, + "step": 5429 + }, + { + "epoch": 0.81, + "grad_norm": 1.3519710070880397, + "learning_rate": 1.8310282363683686e-06, + "loss": 0.8608, + "step": 5430 + }, + { + "epoch": 0.81, + "grad_norm": 1.5633425384109751, + "learning_rate": 1.828241470221681e-06, + "loss": 0.8305, + "step": 5431 + }, + { + "epoch": 0.81, + "grad_norm": 1.4968910744526684, + "learning_rate": 1.8254566129985996e-06, + "loss": 0.8011, + "step": 5432 + }, + { + "epoch": 0.81, + "grad_norm": 1.4919415715662103, + "learning_rate": 1.822673665349668e-06, + "loss": 0.8718, + "step": 5433 + }, + { + "epoch": 0.81, + "grad_norm": 0.9065972620641687, + "learning_rate": 1.8198926279249828e-06, + "loss": 0.3274, + "step": 5434 + }, + { + "epoch": 0.81, + "grad_norm": 1.3408161261896738, + "learning_rate": 1.817113501374197e-06, + "loss": 0.8617, + "step": 5435 + }, + { + "epoch": 0.81, + "grad_norm": 1.4891788486746218, + "learning_rate": 1.8143362863465098e-06, + "loss": 0.7444, + "step": 5436 + }, + { + "epoch": 0.81, + "grad_norm": 1.608909984761927, + "learning_rate": 1.8115609834906821e-06, + "loss": 0.8493, + "step": 5437 + }, + { + "epoch": 0.81, + "grad_norm": 0.9057507054155469, + "learning_rate": 1.8087875934550237e-06, + "loss": 0.3092, + "step": 5438 + }, + { + "epoch": 0.81, + "grad_norm": 1.480485169966335, + "learning_rate": 1.8060161168874002e-06, + "loss": 0.8688, + "step": 5439 + }, + { + "epoch": 0.81, + "grad_norm": 1.3153707806850248, + "learning_rate": 1.8032465544352274e-06, + "loss": 0.8455, + "step": 5440 + }, + { + "epoch": 0.81, + "grad_norm": 1.6466245964860347, + "learning_rate": 1.8004789067454763e-06, + "loss": 0.7466, + "step": 5441 + }, + { + "epoch": 0.81, + "grad_norm": 1.5009255769765546, + "learning_rate": 1.7977131744646724e-06, + "loss": 0.7905, + "step": 5442 + }, + { + "epoch": 0.81, + "grad_norm": 1.5511504641793747, + "learning_rate": 1.7949493582388856e-06, + "loss": 0.7825, + "step": 5443 + }, + { + "epoch": 0.81, + "grad_norm": 1.551146739245963, + "learning_rate": 1.7921874587137455e-06, + "loss": 0.7932, + "step": 5444 + }, + { + "epoch": 0.81, + "grad_norm": 1.4813303748840447, + "learning_rate": 1.7894274765344322e-06, + "loss": 0.8227, + "step": 5445 + }, + { + "epoch": 0.81, + "grad_norm": 1.495632531310112, + "learning_rate": 1.7866694123456796e-06, + "loss": 0.9659, + "step": 5446 + }, + { + "epoch": 0.81, + "grad_norm": 1.42540853410187, + "learning_rate": 1.7839132667917692e-06, + "loss": 0.7582, + "step": 5447 + }, + { + "epoch": 0.81, + "grad_norm": 1.407862157718545, + "learning_rate": 1.7811590405165413e-06, + "loss": 0.7592, + "step": 5448 + }, + { + "epoch": 0.81, + "grad_norm": 1.3616127417179036, + "learning_rate": 1.7784067341633781e-06, + "loss": 0.8078, + "step": 5449 + }, + { + "epoch": 0.81, + "grad_norm": 0.837409960876165, + "learning_rate": 1.775656348375221e-06, + "loss": 0.3608, + "step": 5450 + }, + { + "epoch": 0.81, + "grad_norm": 1.7862977420501167, + "learning_rate": 1.7729078837945602e-06, + "loss": 0.8365, + "step": 5451 + }, + { + "epoch": 0.81, + "grad_norm": 0.7679983407166945, + "learning_rate": 1.7701613410634367e-06, + "loss": 0.3726, + "step": 5452 + }, + { + "epoch": 0.81, + "grad_norm": 0.824373552736497, + "learning_rate": 1.7674167208234438e-06, + "loss": 0.3395, + "step": 5453 + }, + { + "epoch": 0.81, + "grad_norm": 1.4208820342070756, + "learning_rate": 1.7646740237157256e-06, + "loss": 0.8493, + "step": 5454 + }, + { + "epoch": 0.81, + "grad_norm": 1.7536404929779834, + "learning_rate": 1.7619332503809771e-06, + "loss": 0.8722, + "step": 5455 + }, + { + "epoch": 0.81, + "grad_norm": 0.8505266871876735, + "learning_rate": 1.7591944014594408e-06, + "loss": 0.3603, + "step": 5456 + }, + { + "epoch": 0.81, + "grad_norm": 1.5725560251819182, + "learning_rate": 1.7564574775909127e-06, + "loss": 0.8108, + "step": 5457 + }, + { + "epoch": 0.81, + "grad_norm": 1.4459537436782959, + "learning_rate": 1.75372247941474e-06, + "loss": 0.7431, + "step": 5458 + }, + { + "epoch": 0.81, + "grad_norm": 1.4518695940688044, + "learning_rate": 1.7509894075698165e-06, + "loss": 0.7779, + "step": 5459 + }, + { + "epoch": 0.81, + "grad_norm": 1.2579137060960675, + "learning_rate": 1.74825826269459e-06, + "loss": 0.7821, + "step": 5460 + }, + { + "epoch": 0.81, + "grad_norm": 0.7987288411006441, + "learning_rate": 1.7455290454270567e-06, + "loss": 0.3288, + "step": 5461 + }, + { + "epoch": 0.81, + "grad_norm": 1.4483407244918518, + "learning_rate": 1.7428017564047594e-06, + "loss": 0.8205, + "step": 5462 + }, + { + "epoch": 0.82, + "grad_norm": 1.3118202103777108, + "learning_rate": 1.7400763962647937e-06, + "loss": 0.8173, + "step": 5463 + }, + { + "epoch": 0.82, + "grad_norm": 1.5396675993508369, + "learning_rate": 1.7373529656438048e-06, + "loss": 0.8424, + "step": 5464 + }, + { + "epoch": 0.82, + "grad_norm": 1.389698906931674, + "learning_rate": 1.7346314651779872e-06, + "loss": 0.8204, + "step": 5465 + }, + { + "epoch": 0.82, + "grad_norm": 1.494668317622261, + "learning_rate": 1.7319118955030812e-06, + "loss": 0.8112, + "step": 5466 + }, + { + "epoch": 0.82, + "grad_norm": 1.297368352188807, + "learning_rate": 1.7291942572543806e-06, + "loss": 0.7652, + "step": 5467 + }, + { + "epoch": 0.82, + "grad_norm": 1.633014915094076, + "learning_rate": 1.7264785510667281e-06, + "loss": 0.7763, + "step": 5468 + }, + { + "epoch": 0.82, + "grad_norm": 1.3795988886870105, + "learning_rate": 1.723764777574508e-06, + "loss": 0.829, + "step": 5469 + }, + { + "epoch": 0.82, + "grad_norm": 1.5189054188364544, + "learning_rate": 1.7210529374116603e-06, + "loss": 0.7594, + "step": 5470 + }, + { + "epoch": 0.82, + "grad_norm": 1.480291764895458, + "learning_rate": 1.718343031211671e-06, + "loss": 0.814, + "step": 5471 + }, + { + "epoch": 0.82, + "grad_norm": 1.5080355296449017, + "learning_rate": 1.7156350596075743e-06, + "loss": 0.7721, + "step": 5472 + }, + { + "epoch": 0.82, + "grad_norm": 1.5572793959024667, + "learning_rate": 1.712929023231954e-06, + "loss": 0.9408, + "step": 5473 + }, + { + "epoch": 0.82, + "grad_norm": 1.5542340264241221, + "learning_rate": 1.710224922716941e-06, + "loss": 0.793, + "step": 5474 + }, + { + "epoch": 0.82, + "grad_norm": 1.5082668289507613, + "learning_rate": 1.7075227586942101e-06, + "loss": 0.8061, + "step": 5475 + }, + { + "epoch": 0.82, + "grad_norm": 1.4416454906202698, + "learning_rate": 1.7048225317949873e-06, + "loss": 0.7459, + "step": 5476 + }, + { + "epoch": 0.82, + "grad_norm": 1.490440949287603, + "learning_rate": 1.7021242426500495e-06, + "loss": 0.8798, + "step": 5477 + }, + { + "epoch": 0.82, + "grad_norm": 1.5788130841710728, + "learning_rate": 1.6994278918897146e-06, + "loss": 0.7776, + "step": 5478 + }, + { + "epoch": 0.82, + "grad_norm": 1.4502246295007821, + "learning_rate": 1.6967334801438507e-06, + "loss": 0.8834, + "step": 5479 + }, + { + "epoch": 0.82, + "grad_norm": 1.4164442249021554, + "learning_rate": 1.6940410080418723e-06, + "loss": 0.7219, + "step": 5480 + }, + { + "epoch": 0.82, + "grad_norm": 1.4303587284987394, + "learning_rate": 1.6913504762127452e-06, + "loss": 0.8324, + "step": 5481 + }, + { + "epoch": 0.82, + "grad_norm": 1.4338681350810047, + "learning_rate": 1.6886618852849723e-06, + "loss": 0.8403, + "step": 5482 + }, + { + "epoch": 0.82, + "grad_norm": 1.566794449013292, + "learning_rate": 1.685975235886611e-06, + "loss": 0.8762, + "step": 5483 + }, + { + "epoch": 0.82, + "grad_norm": 1.5650936153858075, + "learning_rate": 1.6832905286452616e-06, + "loss": 0.8565, + "step": 5484 + }, + { + "epoch": 0.82, + "grad_norm": 1.5441902866285886, + "learning_rate": 1.6806077641880746e-06, + "loss": 0.77, + "step": 5485 + }, + { + "epoch": 0.82, + "grad_norm": 1.4211188115955646, + "learning_rate": 1.6779269431417421e-06, + "loss": 0.8214, + "step": 5486 + }, + { + "epoch": 0.82, + "grad_norm": 1.469878819332048, + "learning_rate": 1.6752480661325077e-06, + "loss": 0.8167, + "step": 5487 + }, + { + "epoch": 0.82, + "grad_norm": 1.5274607084327922, + "learning_rate": 1.6725711337861517e-06, + "loss": 0.7898, + "step": 5488 + }, + { + "epoch": 0.82, + "grad_norm": 1.724208090636435, + "learning_rate": 1.6698961467280073e-06, + "loss": 0.8674, + "step": 5489 + }, + { + "epoch": 0.82, + "grad_norm": 1.323917594496611, + "learning_rate": 1.6672231055829547e-06, + "loss": 0.8017, + "step": 5490 + }, + { + "epoch": 0.82, + "grad_norm": 1.450098527703443, + "learning_rate": 1.6645520109754132e-06, + "loss": 0.8718, + "step": 5491 + }, + { + "epoch": 0.82, + "grad_norm": 1.4373245611629832, + "learning_rate": 1.6618828635293538e-06, + "loss": 0.8192, + "step": 5492 + }, + { + "epoch": 0.82, + "grad_norm": 1.4608870774252785, + "learning_rate": 1.6592156638682887e-06, + "loss": 0.782, + "step": 5493 + }, + { + "epoch": 0.82, + "grad_norm": 0.8681398448419817, + "learning_rate": 1.6565504126152742e-06, + "loss": 0.3591, + "step": 5494 + }, + { + "epoch": 0.82, + "grad_norm": 1.468811983084041, + "learning_rate": 1.6538871103929144e-06, + "loss": 0.7336, + "step": 5495 + }, + { + "epoch": 0.82, + "grad_norm": 1.4895131208110255, + "learning_rate": 1.651225757823357e-06, + "loss": 0.7878, + "step": 5496 + }, + { + "epoch": 0.82, + "grad_norm": 1.481807201846488, + "learning_rate": 1.6485663555282949e-06, + "loss": 0.8492, + "step": 5497 + }, + { + "epoch": 0.82, + "grad_norm": 1.5016800181957053, + "learning_rate": 1.6459089041289634e-06, + "loss": 0.8835, + "step": 5498 + }, + { + "epoch": 0.82, + "grad_norm": 1.447151729797157, + "learning_rate": 1.6432534042461446e-06, + "loss": 0.8699, + "step": 5499 + }, + { + "epoch": 0.82, + "grad_norm": 1.6934858085748248, + "learning_rate": 1.6405998565001646e-06, + "loss": 0.904, + "step": 5500 + }, + { + "epoch": 0.82, + "grad_norm": 1.3007438313483457, + "learning_rate": 1.6379482615108888e-06, + "loss": 0.8035, + "step": 5501 + }, + { + "epoch": 0.82, + "grad_norm": 1.4078265296991646, + "learning_rate": 1.6352986198977327e-06, + "loss": 0.813, + "step": 5502 + }, + { + "epoch": 0.82, + "grad_norm": 1.4457536552501893, + "learning_rate": 1.6326509322796513e-06, + "loss": 0.7866, + "step": 5503 + }, + { + "epoch": 0.82, + "grad_norm": 1.5888623704702443, + "learning_rate": 1.6300051992751463e-06, + "loss": 0.8373, + "step": 5504 + }, + { + "epoch": 0.82, + "grad_norm": 1.5273461323546347, + "learning_rate": 1.6273614215022636e-06, + "loss": 0.7989, + "step": 5505 + }, + { + "epoch": 0.82, + "grad_norm": 1.5966160367825208, + "learning_rate": 1.6247195995785836e-06, + "loss": 0.877, + "step": 5506 + }, + { + "epoch": 0.82, + "grad_norm": 1.4706451707290387, + "learning_rate": 1.6220797341212401e-06, + "loss": 0.7165, + "step": 5507 + }, + { + "epoch": 0.82, + "grad_norm": 1.409521968747777, + "learning_rate": 1.6194418257469047e-06, + "loss": 0.8675, + "step": 5508 + }, + { + "epoch": 0.82, + "grad_norm": 1.5564335717663023, + "learning_rate": 1.616805875071794e-06, + "loss": 0.8843, + "step": 5509 + }, + { + "epoch": 0.82, + "grad_norm": 1.2637774601711118, + "learning_rate": 1.6141718827116659e-06, + "loss": 0.7899, + "step": 5510 + }, + { + "epoch": 0.82, + "grad_norm": 1.5686378730543915, + "learning_rate": 1.6115398492818235e-06, + "loss": 0.7816, + "step": 5511 + }, + { + "epoch": 0.82, + "grad_norm": 1.566122470511675, + "learning_rate": 1.6089097753971061e-06, + "loss": 0.8672, + "step": 5512 + }, + { + "epoch": 0.82, + "grad_norm": 1.697413481427657, + "learning_rate": 1.6062816616719013e-06, + "loss": 0.7988, + "step": 5513 + }, + { + "epoch": 0.82, + "grad_norm": 1.5111844365468987, + "learning_rate": 1.6036555087201366e-06, + "loss": 0.8115, + "step": 5514 + }, + { + "epoch": 0.82, + "grad_norm": 1.4064719084651394, + "learning_rate": 1.6010313171552804e-06, + "loss": 0.8659, + "step": 5515 + }, + { + "epoch": 0.82, + "grad_norm": 1.4776909845260506, + "learning_rate": 1.598409087590349e-06, + "loss": 0.8006, + "step": 5516 + }, + { + "epoch": 0.82, + "grad_norm": 1.2557623367432647, + "learning_rate": 1.595788820637888e-06, + "loss": 0.8216, + "step": 5517 + }, + { + "epoch": 0.82, + "grad_norm": 0.9164182432906981, + "learning_rate": 1.5931705169099965e-06, + "loss": 0.3066, + "step": 5518 + }, + { + "epoch": 0.82, + "grad_norm": 1.5187019706027092, + "learning_rate": 1.5905541770183096e-06, + "loss": 0.8389, + "step": 5519 + }, + { + "epoch": 0.82, + "grad_norm": 1.4841139363050238, + "learning_rate": 1.5879398015740043e-06, + "loss": 0.7987, + "step": 5520 + }, + { + "epoch": 0.82, + "grad_norm": 0.8975412075519973, + "learning_rate": 1.5853273911878031e-06, + "loss": 0.3308, + "step": 5521 + }, + { + "epoch": 0.82, + "grad_norm": 1.4671420708327252, + "learning_rate": 1.5827169464699576e-06, + "loss": 0.7869, + "step": 5522 + }, + { + "epoch": 0.82, + "grad_norm": 1.6414938703587976, + "learning_rate": 1.5801084680302725e-06, + "loss": 0.7815, + "step": 5523 + }, + { + "epoch": 0.82, + "grad_norm": 1.3630373252883512, + "learning_rate": 1.577501956478088e-06, + "loss": 0.9018, + "step": 5524 + }, + { + "epoch": 0.82, + "grad_norm": 1.391208658132168, + "learning_rate": 1.574897412422286e-06, + "loss": 0.8026, + "step": 5525 + }, + { + "epoch": 0.82, + "grad_norm": 1.4961974752938094, + "learning_rate": 1.5722948364712898e-06, + "loss": 0.8504, + "step": 5526 + }, + { + "epoch": 0.82, + "grad_norm": 1.3970538658742992, + "learning_rate": 1.5696942292330574e-06, + "loss": 0.8241, + "step": 5527 + }, + { + "epoch": 0.82, + "grad_norm": 1.3240620518979163, + "learning_rate": 1.567095591315092e-06, + "loss": 0.85, + "step": 5528 + }, + { + "epoch": 0.82, + "grad_norm": 1.496021946934474, + "learning_rate": 1.5644989233244367e-06, + "loss": 0.8403, + "step": 5529 + }, + { + "epoch": 0.83, + "grad_norm": 1.7780968941032695, + "learning_rate": 1.5619042258676743e-06, + "loss": 0.8073, + "step": 5530 + }, + { + "epoch": 0.83, + "grad_norm": 1.2583673301924199, + "learning_rate": 1.559311499550924e-06, + "loss": 0.7656, + "step": 5531 + }, + { + "epoch": 0.83, + "grad_norm": 1.5084843361784621, + "learning_rate": 1.5567207449798517e-06, + "loss": 0.8093, + "step": 5532 + }, + { + "epoch": 0.83, + "grad_norm": 1.6460516067871112, + "learning_rate": 1.5541319627596517e-06, + "loss": 0.776, + "step": 5533 + }, + { + "epoch": 0.83, + "grad_norm": 1.5494695221710313, + "learning_rate": 1.5515451534950665e-06, + "loss": 0.8247, + "step": 5534 + }, + { + "epoch": 0.83, + "grad_norm": 1.4142065481981463, + "learning_rate": 1.5489603177903756e-06, + "loss": 0.8355, + "step": 5535 + }, + { + "epoch": 0.83, + "grad_norm": 1.444022779838901, + "learning_rate": 1.546377456249396e-06, + "loss": 0.8107, + "step": 5536 + }, + { + "epoch": 0.83, + "grad_norm": 1.3935138660325066, + "learning_rate": 1.5437965694754842e-06, + "loss": 0.8107, + "step": 5537 + }, + { + "epoch": 0.83, + "grad_norm": 1.4773841885156629, + "learning_rate": 1.5412176580715367e-06, + "loss": 0.813, + "step": 5538 + }, + { + "epoch": 0.83, + "grad_norm": 1.350258353742779, + "learning_rate": 1.5386407226399892e-06, + "loss": 0.9231, + "step": 5539 + }, + { + "epoch": 0.83, + "grad_norm": 1.4658333125290868, + "learning_rate": 1.5360657637828103e-06, + "loss": 0.8656, + "step": 5540 + }, + { + "epoch": 0.83, + "grad_norm": 1.5984906393784741, + "learning_rate": 1.533492782101511e-06, + "loss": 0.8379, + "step": 5541 + }, + { + "epoch": 0.83, + "grad_norm": 0.9607959445338982, + "learning_rate": 1.5309217781971419e-06, + "loss": 0.3203, + "step": 5542 + }, + { + "epoch": 0.83, + "grad_norm": 1.5163276766486915, + "learning_rate": 1.5283527526702891e-06, + "loss": 0.8196, + "step": 5543 + }, + { + "epoch": 0.83, + "grad_norm": 1.4747659600090726, + "learning_rate": 1.525785706121077e-06, + "loss": 0.7726, + "step": 5544 + }, + { + "epoch": 0.83, + "grad_norm": 1.433930879055901, + "learning_rate": 1.52322063914917e-06, + "loss": 0.9098, + "step": 5545 + }, + { + "epoch": 0.83, + "grad_norm": 1.4701327234782449, + "learning_rate": 1.5206575523537649e-06, + "loss": 0.8306, + "step": 5546 + }, + { + "epoch": 0.83, + "grad_norm": 1.6304313189272786, + "learning_rate": 1.518096446333599e-06, + "loss": 0.7637, + "step": 5547 + }, + { + "epoch": 0.83, + "grad_norm": 1.65617738308151, + "learning_rate": 1.5155373216869485e-06, + "loss": 0.7441, + "step": 5548 + }, + { + "epoch": 0.83, + "grad_norm": 1.5414975316928146, + "learning_rate": 1.5129801790116261e-06, + "loss": 0.8168, + "step": 5549 + }, + { + "epoch": 0.83, + "grad_norm": 1.3932105857881663, + "learning_rate": 1.5104250189049786e-06, + "loss": 0.8248, + "step": 5550 + }, + { + "epoch": 0.83, + "grad_norm": 1.6404144594793981, + "learning_rate": 1.5078718419638939e-06, + "loss": 0.8879, + "step": 5551 + }, + { + "epoch": 0.83, + "grad_norm": 1.5051287190843852, + "learning_rate": 1.5053206487847916e-06, + "loss": 0.8251, + "step": 5552 + }, + { + "epoch": 0.83, + "grad_norm": 1.5778456490253587, + "learning_rate": 1.5027714399636318e-06, + "loss": 0.7344, + "step": 5553 + }, + { + "epoch": 0.83, + "grad_norm": 1.4540634957295167, + "learning_rate": 1.5002242160959102e-06, + "loss": 0.876, + "step": 5554 + }, + { + "epoch": 0.83, + "grad_norm": 1.3944013702726914, + "learning_rate": 1.4976789777766576e-06, + "loss": 0.8261, + "step": 5555 + }, + { + "epoch": 0.83, + "grad_norm": 1.459976009941761, + "learning_rate": 1.4951357256004439e-06, + "loss": 0.8651, + "step": 5556 + }, + { + "epoch": 0.83, + "grad_norm": 1.4746105623556245, + "learning_rate": 1.4925944601613718e-06, + "loss": 0.858, + "step": 5557 + }, + { + "epoch": 0.83, + "grad_norm": 1.644449789245113, + "learning_rate": 1.490055182053083e-06, + "loss": 0.7707, + "step": 5558 + }, + { + "epoch": 0.83, + "grad_norm": 1.7752689156187316, + "learning_rate": 1.4875178918687493e-06, + "loss": 0.8024, + "step": 5559 + }, + { + "epoch": 0.83, + "grad_norm": 1.6857290766204718, + "learning_rate": 1.4849825902010851e-06, + "loss": 0.8048, + "step": 5560 + }, + { + "epoch": 0.83, + "grad_norm": 0.8098440466767464, + "learning_rate": 1.4824492776423349e-06, + "loss": 0.3148, + "step": 5561 + }, + { + "epoch": 0.83, + "grad_norm": 1.7196549666587329, + "learning_rate": 1.4799179547842823e-06, + "loss": 0.81, + "step": 5562 + }, + { + "epoch": 0.83, + "grad_norm": 1.4645435413452035, + "learning_rate": 1.4773886222182442e-06, + "loss": 0.8788, + "step": 5563 + }, + { + "epoch": 0.83, + "grad_norm": 1.2583547143180198, + "learning_rate": 1.474861280535076e-06, + "loss": 0.8497, + "step": 5564 + }, + { + "epoch": 0.83, + "grad_norm": 1.3721015946972879, + "learning_rate": 1.4723359303251594e-06, + "loss": 0.7628, + "step": 5565 + }, + { + "epoch": 0.83, + "grad_norm": 1.58826013842811, + "learning_rate": 1.4698125721784183e-06, + "loss": 0.7902, + "step": 5566 + }, + { + "epoch": 0.83, + "grad_norm": 0.829573092816346, + "learning_rate": 1.4672912066843103e-06, + "loss": 0.3181, + "step": 5567 + }, + { + "epoch": 0.83, + "grad_norm": 1.5418678877428342, + "learning_rate": 1.4647718344318263e-06, + "loss": 0.8186, + "step": 5568 + }, + { + "epoch": 0.83, + "grad_norm": 1.4416269105037618, + "learning_rate": 1.462254456009493e-06, + "loss": 0.7823, + "step": 5569 + }, + { + "epoch": 0.83, + "grad_norm": 1.35936507887344, + "learning_rate": 1.4597390720053683e-06, + "loss": 0.7987, + "step": 5570 + }, + { + "epoch": 0.83, + "grad_norm": 1.5937556355507636, + "learning_rate": 1.4572256830070497e-06, + "loss": 0.8052, + "step": 5571 + }, + { + "epoch": 0.83, + "grad_norm": 1.602847615303778, + "learning_rate": 1.454714289601661e-06, + "loss": 0.8419, + "step": 5572 + }, + { + "epoch": 0.83, + "grad_norm": 1.425390778596702, + "learning_rate": 1.4522048923758647e-06, + "loss": 0.7829, + "step": 5573 + }, + { + "epoch": 0.83, + "grad_norm": 1.352883551895712, + "learning_rate": 1.4496974919158569e-06, + "loss": 0.7635, + "step": 5574 + }, + { + "epoch": 0.83, + "grad_norm": 1.6195130523840662, + "learning_rate": 1.4471920888073676e-06, + "loss": 0.8647, + "step": 5575 + }, + { + "epoch": 0.83, + "grad_norm": 1.3764801105736542, + "learning_rate": 1.4446886836356578e-06, + "loss": 0.8518, + "step": 5576 + }, + { + "epoch": 0.83, + "grad_norm": 1.530840901786308, + "learning_rate": 1.4421872769855262e-06, + "loss": 0.823, + "step": 5577 + }, + { + "epoch": 0.83, + "grad_norm": 1.4800179519865388, + "learning_rate": 1.4396878694412975e-06, + "loss": 0.9144, + "step": 5578 + }, + { + "epoch": 0.83, + "grad_norm": 1.3808580072118306, + "learning_rate": 1.4371904615868348e-06, + "loss": 0.8762, + "step": 5579 + }, + { + "epoch": 0.83, + "grad_norm": 1.535561323716555, + "learning_rate": 1.4346950540055327e-06, + "loss": 0.8773, + "step": 5580 + }, + { + "epoch": 0.83, + "grad_norm": 1.3278806482656107, + "learning_rate": 1.4322016472803202e-06, + "loss": 0.864, + "step": 5581 + }, + { + "epoch": 0.83, + "grad_norm": 1.495017309359749, + "learning_rate": 1.4297102419936559e-06, + "loss": 0.8616, + "step": 5582 + }, + { + "epoch": 0.83, + "grad_norm": 1.5488413392858595, + "learning_rate": 1.4272208387275332e-06, + "loss": 0.805, + "step": 5583 + }, + { + "epoch": 0.83, + "grad_norm": 1.589270863427278, + "learning_rate": 1.4247334380634792e-06, + "loss": 0.8344, + "step": 5584 + }, + { + "epoch": 0.83, + "grad_norm": 1.5553199198233327, + "learning_rate": 1.4222480405825455e-06, + "loss": 0.8337, + "step": 5585 + }, + { + "epoch": 0.83, + "grad_norm": 1.5185311128558847, + "learning_rate": 1.4197646468653236e-06, + "loss": 0.9153, + "step": 5586 + }, + { + "epoch": 0.83, + "grad_norm": 1.3820352329727235, + "learning_rate": 1.4172832574919359e-06, + "loss": 0.7893, + "step": 5587 + }, + { + "epoch": 0.83, + "grad_norm": 1.4654419643879775, + "learning_rate": 1.4148038730420333e-06, + "loss": 0.8081, + "step": 5588 + }, + { + "epoch": 0.83, + "grad_norm": 1.4031886859981557, + "learning_rate": 1.4123264940948022e-06, + "loss": 0.8936, + "step": 5589 + }, + { + "epoch": 0.83, + "grad_norm": 1.5201834490088904, + "learning_rate": 1.409851121228959e-06, + "loss": 0.8423, + "step": 5590 + }, + { + "epoch": 0.83, + "grad_norm": 1.3663162568502987, + "learning_rate": 1.4073777550227485e-06, + "loss": 0.7801, + "step": 5591 + }, + { + "epoch": 0.83, + "grad_norm": 0.8827130379812916, + "learning_rate": 1.4049063960539488e-06, + "loss": 0.3037, + "step": 5592 + }, + { + "epoch": 0.83, + "grad_norm": 1.394609849782385, + "learning_rate": 1.4024370448998726e-06, + "loss": 0.7811, + "step": 5593 + }, + { + "epoch": 0.83, + "grad_norm": 1.4127957323526816, + "learning_rate": 1.3999697021373582e-06, + "loss": 0.7325, + "step": 5594 + }, + { + "epoch": 0.83, + "grad_norm": 1.7082815452586944, + "learning_rate": 1.3975043683427791e-06, + "loss": 0.8212, + "step": 5595 + }, + { + "epoch": 0.83, + "grad_norm": 1.5110768777820272, + "learning_rate": 1.3950410440920359e-06, + "loss": 0.8165, + "step": 5596 + }, + { + "epoch": 0.84, + "grad_norm": 1.4289574355055963, + "learning_rate": 1.3925797299605649e-06, + "loss": 0.8385, + "step": 5597 + }, + { + "epoch": 0.84, + "grad_norm": 1.3500965380534669, + "learning_rate": 1.3901204265233237e-06, + "loss": 0.7188, + "step": 5598 + }, + { + "epoch": 0.84, + "grad_norm": 1.5445498321140911, + "learning_rate": 1.3876631343548085e-06, + "loss": 0.8636, + "step": 5599 + }, + { + "epoch": 0.84, + "grad_norm": 1.4966427428235587, + "learning_rate": 1.3852078540290437e-06, + "loss": 0.7899, + "step": 5600 + }, + { + "epoch": 0.84, + "grad_norm": 1.5137203048833996, + "learning_rate": 1.3827545861195813e-06, + "loss": 0.9025, + "step": 5601 + }, + { + "epoch": 0.84, + "grad_norm": 1.6069025415128586, + "learning_rate": 1.3803033311995072e-06, + "loss": 0.7619, + "step": 5602 + }, + { + "epoch": 0.84, + "grad_norm": 1.481638423049099, + "learning_rate": 1.3778540898414349e-06, + "loss": 0.7315, + "step": 5603 + }, + { + "epoch": 0.84, + "grad_norm": 1.3281603114258382, + "learning_rate": 1.3754068626175043e-06, + "loss": 0.844, + "step": 5604 + }, + { + "epoch": 0.84, + "grad_norm": 0.803280401038341, + "learning_rate": 1.3729616500993902e-06, + "loss": 0.3147, + "step": 5605 + }, + { + "epoch": 0.84, + "grad_norm": 1.4297202906470103, + "learning_rate": 1.3705184528582937e-06, + "loss": 0.8571, + "step": 5606 + }, + { + "epoch": 0.84, + "grad_norm": 1.4129285382000398, + "learning_rate": 1.368077271464946e-06, + "loss": 0.8243, + "step": 5607 + }, + { + "epoch": 0.84, + "grad_norm": 1.5345366552564756, + "learning_rate": 1.3656381064896084e-06, + "loss": 0.8709, + "step": 5608 + }, + { + "epoch": 0.84, + "grad_norm": 1.4250483314365596, + "learning_rate": 1.3632009585020712e-06, + "loss": 0.7837, + "step": 5609 + }, + { + "epoch": 0.84, + "grad_norm": 1.36223177424182, + "learning_rate": 1.3607658280716474e-06, + "loss": 0.8424, + "step": 5610 + }, + { + "epoch": 0.84, + "grad_norm": 1.4879863076304276, + "learning_rate": 1.3583327157671878e-06, + "loss": 0.84, + "step": 5611 + }, + { + "epoch": 0.84, + "grad_norm": 1.352572915104578, + "learning_rate": 1.3559016221570663e-06, + "loss": 0.7236, + "step": 5612 + }, + { + "epoch": 0.84, + "grad_norm": 1.2885910385786041, + "learning_rate": 1.353472547809187e-06, + "loss": 0.8153, + "step": 5613 + }, + { + "epoch": 0.84, + "grad_norm": 1.3929198883641805, + "learning_rate": 1.3510454932909823e-06, + "loss": 0.7998, + "step": 5614 + }, + { + "epoch": 0.84, + "grad_norm": 1.4842694094395823, + "learning_rate": 1.3486204591694118e-06, + "loss": 0.861, + "step": 5615 + }, + { + "epoch": 0.84, + "grad_norm": 1.6298579173532424, + "learning_rate": 1.346197446010965e-06, + "loss": 0.7964, + "step": 5616 + }, + { + "epoch": 0.84, + "grad_norm": 1.5871884598816404, + "learning_rate": 1.3437764543816556e-06, + "loss": 0.8198, + "step": 5617 + }, + { + "epoch": 0.84, + "grad_norm": 1.5068796933954247, + "learning_rate": 1.341357484847029e-06, + "loss": 0.7854, + "step": 5618 + }, + { + "epoch": 0.84, + "grad_norm": 1.4252774628818057, + "learning_rate": 1.3389405379721564e-06, + "loss": 0.8259, + "step": 5619 + }, + { + "epoch": 0.84, + "grad_norm": 1.4353888677715951, + "learning_rate": 1.3365256143216377e-06, + "loss": 0.827, + "step": 5620 + }, + { + "epoch": 0.84, + "grad_norm": 1.3669370939639536, + "learning_rate": 1.3341127144595978e-06, + "loss": 0.8322, + "step": 5621 + }, + { + "epoch": 0.84, + "grad_norm": 1.3917847201630018, + "learning_rate": 1.3317018389496927e-06, + "loss": 0.7853, + "step": 5622 + }, + { + "epoch": 0.84, + "grad_norm": 1.317118778025492, + "learning_rate": 1.3292929883550998e-06, + "loss": 0.8805, + "step": 5623 + }, + { + "epoch": 0.84, + "grad_norm": 1.360222807635599, + "learning_rate": 1.3268861632385288e-06, + "loss": 0.8551, + "step": 5624 + }, + { + "epoch": 0.84, + "grad_norm": 1.3977123370252795, + "learning_rate": 1.3244813641622146e-06, + "loss": 0.7454, + "step": 5625 + }, + { + "epoch": 0.84, + "grad_norm": 1.595640924845315, + "learning_rate": 1.3220785916879165e-06, + "loss": 0.8409, + "step": 5626 + }, + { + "epoch": 0.84, + "grad_norm": 1.411216738623978, + "learning_rate": 1.3196778463769256e-06, + "loss": 0.816, + "step": 5627 + }, + { + "epoch": 0.84, + "grad_norm": 1.5395991120887047, + "learning_rate": 1.3172791287900555e-06, + "loss": 0.8501, + "step": 5628 + }, + { + "epoch": 0.84, + "grad_norm": 1.543553886041695, + "learning_rate": 1.3148824394876437e-06, + "loss": 0.8512, + "step": 5629 + }, + { + "epoch": 0.84, + "grad_norm": 1.5065576280259123, + "learning_rate": 1.3124877790295597e-06, + "loss": 0.739, + "step": 5630 + }, + { + "epoch": 0.84, + "grad_norm": 1.31284031117535, + "learning_rate": 1.3100951479751967e-06, + "loss": 0.8386, + "step": 5631 + }, + { + "epoch": 0.84, + "grad_norm": 1.3337472241278043, + "learning_rate": 1.3077045468834714e-06, + "loss": 0.826, + "step": 5632 + }, + { + "epoch": 0.84, + "grad_norm": 1.4845560434009752, + "learning_rate": 1.3053159763128308e-06, + "loss": 0.8674, + "step": 5633 + }, + { + "epoch": 0.84, + "grad_norm": 1.5253173197521832, + "learning_rate": 1.3029294368212464e-06, + "loss": 0.7816, + "step": 5634 + }, + { + "epoch": 0.84, + "grad_norm": 1.4925019509595872, + "learning_rate": 1.3005449289662099e-06, + "loss": 0.7995, + "step": 5635 + }, + { + "epoch": 0.84, + "grad_norm": 1.4176195538307048, + "learning_rate": 1.2981624533047432e-06, + "loss": 0.7658, + "step": 5636 + }, + { + "epoch": 0.84, + "grad_norm": 1.500556523428137, + "learning_rate": 1.295782010393396e-06, + "loss": 0.8436, + "step": 5637 + }, + { + "epoch": 0.84, + "grad_norm": 1.5447101538367003, + "learning_rate": 1.2934036007882378e-06, + "loss": 0.8523, + "step": 5638 + }, + { + "epoch": 0.84, + "grad_norm": 1.444016724390116, + "learning_rate": 1.2910272250448675e-06, + "loss": 0.7755, + "step": 5639 + }, + { + "epoch": 0.84, + "grad_norm": 1.6557898883228868, + "learning_rate": 1.288652883718403e-06, + "loss": 0.8196, + "step": 5640 + }, + { + "epoch": 0.84, + "grad_norm": 1.2910261932840776, + "learning_rate": 1.2862805773634934e-06, + "loss": 0.8632, + "step": 5641 + }, + { + "epoch": 0.84, + "grad_norm": 1.4065579474552827, + "learning_rate": 1.2839103065343084e-06, + "loss": 0.8273, + "step": 5642 + }, + { + "epoch": 0.84, + "grad_norm": 1.383671166641272, + "learning_rate": 1.281542071784544e-06, + "loss": 0.7879, + "step": 5643 + }, + { + "epoch": 0.84, + "grad_norm": 1.4156518225465577, + "learning_rate": 1.2791758736674232e-06, + "loss": 0.8987, + "step": 5644 + }, + { + "epoch": 0.84, + "grad_norm": 1.479803108051562, + "learning_rate": 1.2768117127356838e-06, + "loss": 0.7887, + "step": 5645 + }, + { + "epoch": 0.84, + "grad_norm": 1.6259523166532743, + "learning_rate": 1.2744495895415975e-06, + "loss": 0.8528, + "step": 5646 + }, + { + "epoch": 0.84, + "grad_norm": 1.4272012497979003, + "learning_rate": 1.2720895046369564e-06, + "loss": 0.7828, + "step": 5647 + }, + { + "epoch": 0.84, + "grad_norm": 1.5802915709514433, + "learning_rate": 1.269731458573077e-06, + "loss": 0.8399, + "step": 5648 + }, + { + "epoch": 0.84, + "grad_norm": 1.576318908295224, + "learning_rate": 1.2673754519008008e-06, + "loss": 0.8486, + "step": 5649 + }, + { + "epoch": 0.84, + "grad_norm": 1.724227901424948, + "learning_rate": 1.2650214851704866e-06, + "loss": 0.822, + "step": 5650 + }, + { + "epoch": 0.84, + "grad_norm": 2.1321056578034416, + "learning_rate": 1.2626695589320226e-06, + "loss": 0.8174, + "step": 5651 + }, + { + "epoch": 0.84, + "grad_norm": 1.454999180386745, + "learning_rate": 1.2603196737348211e-06, + "loss": 0.7969, + "step": 5652 + }, + { + "epoch": 0.84, + "grad_norm": 1.6057886190190596, + "learning_rate": 1.2579718301278143e-06, + "loss": 0.7726, + "step": 5653 + }, + { + "epoch": 0.84, + "grad_norm": 1.4388468907285137, + "learning_rate": 1.255626028659459e-06, + "loss": 0.7955, + "step": 5654 + }, + { + "epoch": 0.84, + "grad_norm": 1.3757111656895906, + "learning_rate": 1.253282269877737e-06, + "loss": 0.8094, + "step": 5655 + }, + { + "epoch": 0.84, + "grad_norm": 1.5699015145975979, + "learning_rate": 1.2509405543301456e-06, + "loss": 0.8156, + "step": 5656 + }, + { + "epoch": 0.84, + "grad_norm": 1.4009171025629052, + "learning_rate": 1.2486008825637119e-06, + "loss": 0.7991, + "step": 5657 + }, + { + "epoch": 0.84, + "grad_norm": 1.628825473269102, + "learning_rate": 1.2462632551249842e-06, + "loss": 0.7587, + "step": 5658 + }, + { + "epoch": 0.84, + "grad_norm": 1.529396740197073, + "learning_rate": 1.2439276725600324e-06, + "loss": 0.7718, + "step": 5659 + }, + { + "epoch": 0.84, + "grad_norm": 1.535025012252462, + "learning_rate": 1.2415941354144478e-06, + "loss": 0.8602, + "step": 5660 + }, + { + "epoch": 0.84, + "grad_norm": 1.6595181776048544, + "learning_rate": 1.2392626442333488e-06, + "loss": 0.73, + "step": 5661 + }, + { + "epoch": 0.84, + "grad_norm": 1.3737998874084563, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.8746, + "step": 5662 + }, + { + "epoch": 0.84, + "grad_norm": 1.3606653146067516, + "learning_rate": 1.234605801942661e-06, + "loss": 0.7773, + "step": 5663 + }, + { + "epoch": 0.85, + "grad_norm": 1.4311778828617514, + "learning_rate": 1.232280451920914e-06, + "loss": 0.8644, + "step": 5664 + }, + { + "epoch": 0.85, + "grad_norm": 1.3928005654786797, + "learning_rate": 1.229957150039327e-06, + "loss": 0.7957, + "step": 5665 + }, + { + "epoch": 0.85, + "grad_norm": 1.4631977140958095, + "learning_rate": 1.2276358968406233e-06, + "loss": 0.8095, + "step": 5666 + }, + { + "epoch": 0.85, + "grad_norm": 1.5743956596606044, + "learning_rate": 1.2253166928670478e-06, + "loss": 0.8352, + "step": 5667 + }, + { + "epoch": 0.85, + "grad_norm": 1.3886411116206447, + "learning_rate": 1.222999538660369e-06, + "loss": 0.7899, + "step": 5668 + }, + { + "epoch": 0.85, + "grad_norm": 1.2069141491625306, + "learning_rate": 1.2206844347618707e-06, + "loss": 0.8168, + "step": 5669 + }, + { + "epoch": 0.85, + "grad_norm": 1.4132497016375687, + "learning_rate": 1.2183713817123622e-06, + "loss": 0.7725, + "step": 5670 + }, + { + "epoch": 0.85, + "grad_norm": 1.5733510174513445, + "learning_rate": 1.2160603800521742e-06, + "loss": 0.8459, + "step": 5671 + }, + { + "epoch": 0.85, + "grad_norm": 1.4265737408302719, + "learning_rate": 1.213751430321156e-06, + "loss": 0.8258, + "step": 5672 + }, + { + "epoch": 0.85, + "grad_norm": 1.437067274470808, + "learning_rate": 1.21144453305868e-06, + "loss": 0.8088, + "step": 5673 + }, + { + "epoch": 0.85, + "grad_norm": 0.8530367117609204, + "learning_rate": 1.2091396888036388e-06, + "loss": 0.3131, + "step": 5674 + }, + { + "epoch": 0.85, + "grad_norm": 1.3021184310156435, + "learning_rate": 1.206836898094439e-06, + "loss": 0.8166, + "step": 5675 + }, + { + "epoch": 0.85, + "grad_norm": 1.5845480138318666, + "learning_rate": 1.2045361614690166e-06, + "loss": 0.8347, + "step": 5676 + }, + { + "epoch": 0.85, + "grad_norm": 1.5298449452560832, + "learning_rate": 1.2022374794648229e-06, + "loss": 0.8639, + "step": 5677 + }, + { + "epoch": 0.85, + "grad_norm": 1.5372755598222279, + "learning_rate": 1.1999408526188295e-06, + "loss": 0.7902, + "step": 5678 + }, + { + "epoch": 0.85, + "grad_norm": 1.5181375212651633, + "learning_rate": 1.1976462814675305e-06, + "loss": 0.7438, + "step": 5679 + }, + { + "epoch": 0.85, + "grad_norm": 1.426182147061947, + "learning_rate": 1.1953537665469383e-06, + "loss": 0.795, + "step": 5680 + }, + { + "epoch": 0.85, + "grad_norm": 1.301857246241473, + "learning_rate": 1.1930633083925824e-06, + "loss": 0.7949, + "step": 5681 + }, + { + "epoch": 0.85, + "grad_norm": 1.5857344312979722, + "learning_rate": 1.1907749075395147e-06, + "loss": 0.8395, + "step": 5682 + }, + { + "epoch": 0.85, + "grad_norm": 1.595842514113353, + "learning_rate": 1.1884885645223055e-06, + "loss": 0.8312, + "step": 5683 + }, + { + "epoch": 0.85, + "grad_norm": 1.4108327685210615, + "learning_rate": 1.1862042798750462e-06, + "loss": 0.8865, + "step": 5684 + }, + { + "epoch": 0.85, + "grad_norm": 1.2966530700459167, + "learning_rate": 1.1839220541313445e-06, + "loss": 0.7852, + "step": 5685 + }, + { + "epoch": 0.85, + "grad_norm": 1.384203636723357, + "learning_rate": 1.1816418878243296e-06, + "loss": 0.8297, + "step": 5686 + }, + { + "epoch": 0.85, + "grad_norm": 1.6024638913253402, + "learning_rate": 1.179363781486651e-06, + "loss": 0.8701, + "step": 5687 + }, + { + "epoch": 0.85, + "grad_norm": 1.3896996353969324, + "learning_rate": 1.1770877356504684e-06, + "loss": 0.7804, + "step": 5688 + }, + { + "epoch": 0.85, + "grad_norm": 0.8686173337085714, + "learning_rate": 1.1748137508474699e-06, + "loss": 0.314, + "step": 5689 + }, + { + "epoch": 0.85, + "grad_norm": 1.3516349118465154, + "learning_rate": 1.1725418276088596e-06, + "loss": 0.7975, + "step": 5690 + }, + { + "epoch": 0.85, + "grad_norm": 1.5003963644647405, + "learning_rate": 1.170271966465356e-06, + "loss": 0.8027, + "step": 5691 + }, + { + "epoch": 0.85, + "grad_norm": 1.5614163777746215, + "learning_rate": 1.168004167947202e-06, + "loss": 0.8543, + "step": 5692 + }, + { + "epoch": 0.85, + "grad_norm": 1.4618083199562741, + "learning_rate": 1.1657384325841558e-06, + "loss": 0.8432, + "step": 5693 + }, + { + "epoch": 0.85, + "grad_norm": 1.5359316892455774, + "learning_rate": 1.1634747609054897e-06, + "loss": 0.8702, + "step": 5694 + }, + { + "epoch": 0.85, + "grad_norm": 1.4256592737565792, + "learning_rate": 1.1612131534399995e-06, + "loss": 0.86, + "step": 5695 + }, + { + "epoch": 0.85, + "grad_norm": 1.7752052562870355, + "learning_rate": 1.1589536107159981e-06, + "loss": 0.7744, + "step": 5696 + }, + { + "epoch": 0.85, + "grad_norm": 1.351909281502772, + "learning_rate": 1.1566961332613136e-06, + "loss": 0.7192, + "step": 5697 + }, + { + "epoch": 0.85, + "grad_norm": 1.4024335665085583, + "learning_rate": 1.1544407216032928e-06, + "loss": 0.7249, + "step": 5698 + }, + { + "epoch": 0.85, + "grad_norm": 1.4207985079256367, + "learning_rate": 1.1521873762688007e-06, + "loss": 0.8791, + "step": 5699 + }, + { + "epoch": 0.85, + "grad_norm": 1.4849432616691427, + "learning_rate": 1.1499360977842212e-06, + "loss": 0.7577, + "step": 5700 + }, + { + "epoch": 0.85, + "grad_norm": 1.4694271207164782, + "learning_rate": 1.1476868866754488e-06, + "loss": 0.8661, + "step": 5701 + }, + { + "epoch": 0.85, + "grad_norm": 1.2823296074815393, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.8279, + "step": 5702 + }, + { + "epoch": 0.85, + "grad_norm": 1.5509524587197696, + "learning_rate": 1.1431946686865124e-06, + "loss": 0.7952, + "step": 5703 + }, + { + "epoch": 0.85, + "grad_norm": 1.5756226567134055, + "learning_rate": 1.1409516628557315e-06, + "loss": 0.8093, + "step": 5704 + }, + { + "epoch": 0.85, + "grad_norm": 1.5036382708895017, + "learning_rate": 1.1387107264995234e-06, + "loss": 0.8505, + "step": 5705 + }, + { + "epoch": 0.85, + "grad_norm": 1.5551125242582047, + "learning_rate": 1.136471860141376e-06, + "loss": 0.8046, + "step": 5706 + }, + { + "epoch": 0.85, + "grad_norm": 1.5147034194234463, + "learning_rate": 1.1342350643042822e-06, + "loss": 0.8299, + "step": 5707 + }, + { + "epoch": 0.85, + "grad_norm": 1.2882249632765759, + "learning_rate": 1.1320003395107604e-06, + "loss": 0.6946, + "step": 5708 + }, + { + "epoch": 0.85, + "grad_norm": 1.527704240862158, + "learning_rate": 1.1297676862828421e-06, + "loss": 0.8621, + "step": 5709 + }, + { + "epoch": 0.85, + "grad_norm": 0.7554174081058926, + "learning_rate": 1.1275371051420769e-06, + "loss": 0.3287, + "step": 5710 + }, + { + "epoch": 0.85, + "grad_norm": 1.4371161712937863, + "learning_rate": 1.1253085966095278e-06, + "loss": 0.8085, + "step": 5711 + }, + { + "epoch": 0.85, + "grad_norm": 1.4776900788489071, + "learning_rate": 1.123082161205775e-06, + "loss": 0.8682, + "step": 5712 + }, + { + "epoch": 0.85, + "grad_norm": 1.4585296095734914, + "learning_rate": 1.120857799450915e-06, + "loss": 0.8439, + "step": 5713 + }, + { + "epoch": 0.85, + "grad_norm": 1.3931721948995697, + "learning_rate": 1.1186355118645552e-06, + "loss": 0.8381, + "step": 5714 + }, + { + "epoch": 0.85, + "grad_norm": 1.4748332302123681, + "learning_rate": 1.1164152989658251e-06, + "loss": 0.8336, + "step": 5715 + }, + { + "epoch": 0.85, + "grad_norm": 1.5041398498631433, + "learning_rate": 1.114197161273367e-06, + "loss": 0.7713, + "step": 5716 + }, + { + "epoch": 0.85, + "grad_norm": 1.560401465715829, + "learning_rate": 1.111981099305336e-06, + "loss": 0.837, + "step": 5717 + }, + { + "epoch": 0.85, + "grad_norm": 1.3731257523173033, + "learning_rate": 1.1097671135794063e-06, + "loss": 0.8013, + "step": 5718 + }, + { + "epoch": 0.85, + "grad_norm": 1.6030186220943285, + "learning_rate": 1.1075552046127658e-06, + "loss": 0.7719, + "step": 5719 + }, + { + "epoch": 0.85, + "grad_norm": 1.5143900533446337, + "learning_rate": 1.1053453729221142e-06, + "loss": 0.7533, + "step": 5720 + }, + { + "epoch": 0.85, + "grad_norm": 1.491812052103907, + "learning_rate": 1.1031376190236687e-06, + "loss": 0.8135, + "step": 5721 + }, + { + "epoch": 0.85, + "grad_norm": 1.5813760118728517, + "learning_rate": 1.1009319434331623e-06, + "loss": 0.7529, + "step": 5722 + }, + { + "epoch": 0.85, + "grad_norm": 1.4710801643072446, + "learning_rate": 1.0987283466658404e-06, + "loss": 0.823, + "step": 5723 + }, + { + "epoch": 0.85, + "grad_norm": 1.6725489837678664, + "learning_rate": 1.0965268292364639e-06, + "loss": 0.7201, + "step": 5724 + }, + { + "epoch": 0.85, + "grad_norm": 1.36205055368535, + "learning_rate": 1.0943273916593067e-06, + "loss": 0.8893, + "step": 5725 + }, + { + "epoch": 0.85, + "grad_norm": 1.544847948401028, + "learning_rate": 1.092130034448159e-06, + "loss": 0.8332, + "step": 5726 + }, + { + "epoch": 0.85, + "grad_norm": 1.345572158658101, + "learning_rate": 1.0899347581163222e-06, + "loss": 0.7924, + "step": 5727 + }, + { + "epoch": 0.85, + "grad_norm": 1.3997858742149203, + "learning_rate": 1.087741563176613e-06, + "loss": 0.919, + "step": 5728 + }, + { + "epoch": 0.85, + "grad_norm": 1.5506090866150255, + "learning_rate": 1.0855504501413616e-06, + "loss": 0.8058, + "step": 5729 + }, + { + "epoch": 0.85, + "grad_norm": 1.4596590706039845, + "learning_rate": 1.0833614195224141e-06, + "loss": 0.8247, + "step": 5730 + }, + { + "epoch": 0.86, + "grad_norm": 1.574355821699966, + "learning_rate": 1.0811744718311267e-06, + "loss": 0.8068, + "step": 5731 + }, + { + "epoch": 0.86, + "grad_norm": 1.5252985114444997, + "learning_rate": 1.0789896075783734e-06, + "loss": 0.8754, + "step": 5732 + }, + { + "epoch": 0.86, + "grad_norm": 1.498463974222597, + "learning_rate": 1.0768068272745347e-06, + "loss": 0.8187, + "step": 5733 + }, + { + "epoch": 0.86, + "grad_norm": 1.644244947792092, + "learning_rate": 1.0746261314295104e-06, + "loss": 0.893, + "step": 5734 + }, + { + "epoch": 0.86, + "grad_norm": 1.6551908140463691, + "learning_rate": 1.0724475205527107e-06, + "loss": 0.8681, + "step": 5735 + }, + { + "epoch": 0.86, + "grad_norm": 1.4688769555769485, + "learning_rate": 1.0702709951530587e-06, + "loss": 0.7887, + "step": 5736 + }, + { + "epoch": 0.86, + "grad_norm": 1.4819465661777282, + "learning_rate": 1.0680965557389934e-06, + "loss": 0.7924, + "step": 5737 + }, + { + "epoch": 0.86, + "grad_norm": 1.3203300977132322, + "learning_rate": 1.0659242028184635e-06, + "loss": 0.7207, + "step": 5738 + }, + { + "epoch": 0.86, + "grad_norm": 1.4262053701510407, + "learning_rate": 1.063753936898928e-06, + "loss": 0.8571, + "step": 5739 + }, + { + "epoch": 0.86, + "grad_norm": 1.3777035501274875, + "learning_rate": 1.0615857584873624e-06, + "loss": 0.7935, + "step": 5740 + }, + { + "epoch": 0.86, + "grad_norm": 1.2632930599066385, + "learning_rate": 1.0594196680902547e-06, + "loss": 0.7612, + "step": 5741 + }, + { + "epoch": 0.86, + "grad_norm": 1.4726409468627246, + "learning_rate": 1.0572556662136036e-06, + "loss": 0.7954, + "step": 5742 + }, + { + "epoch": 0.86, + "grad_norm": 1.622198734792019, + "learning_rate": 1.055093753362919e-06, + "loss": 0.7772, + "step": 5743 + }, + { + "epoch": 0.86, + "grad_norm": 1.3993958460706282, + "learning_rate": 1.052933930043225e-06, + "loss": 0.7605, + "step": 5744 + }, + { + "epoch": 0.86, + "grad_norm": 1.6258069519841674, + "learning_rate": 1.050776196759058e-06, + "loss": 0.79, + "step": 5745 + }, + { + "epoch": 0.86, + "grad_norm": 1.4824049600633822, + "learning_rate": 1.0486205540144612e-06, + "loss": 0.8563, + "step": 5746 + }, + { + "epoch": 0.86, + "grad_norm": 1.501868208380895, + "learning_rate": 1.0464670023129952e-06, + "loss": 0.8536, + "step": 5747 + }, + { + "epoch": 0.86, + "grad_norm": 0.9202954531970664, + "learning_rate": 1.044315542157729e-06, + "loss": 0.3553, + "step": 5748 + }, + { + "epoch": 0.86, + "grad_norm": 1.6241696521866726, + "learning_rate": 1.0421661740512445e-06, + "loss": 0.7516, + "step": 5749 + }, + { + "epoch": 0.86, + "grad_norm": 1.6776822673801417, + "learning_rate": 1.0400188984956339e-06, + "loss": 0.8252, + "step": 5750 + }, + { + "epoch": 0.86, + "grad_norm": 1.439800028491611, + "learning_rate": 1.0378737159925023e-06, + "loss": 0.8404, + "step": 5751 + }, + { + "epoch": 0.86, + "grad_norm": 1.377604308260554, + "learning_rate": 1.0357306270429623e-06, + "loss": 0.7838, + "step": 5752 + }, + { + "epoch": 0.86, + "grad_norm": 1.4277866023904988, + "learning_rate": 1.0335896321476413e-06, + "loss": 0.8116, + "step": 5753 + }, + { + "epoch": 0.86, + "grad_norm": 1.3260175821665998, + "learning_rate": 1.0314507318066757e-06, + "loss": 0.8293, + "step": 5754 + }, + { + "epoch": 0.86, + "grad_norm": 1.1277469173166015, + "learning_rate": 1.0293139265197116e-06, + "loss": 0.7013, + "step": 5755 + }, + { + "epoch": 0.86, + "grad_norm": 1.3449864326868755, + "learning_rate": 1.0271792167859084e-06, + "loss": 0.8138, + "step": 5756 + }, + { + "epoch": 0.86, + "grad_norm": 1.4059085424770785, + "learning_rate": 1.0250466031039353e-06, + "loss": 0.8375, + "step": 5757 + }, + { + "epoch": 0.86, + "grad_norm": 1.6031715927733514, + "learning_rate": 1.0229160859719688e-06, + "loss": 0.81, + "step": 5758 + }, + { + "epoch": 0.86, + "grad_norm": 1.4429648535097033, + "learning_rate": 1.020787665887699e-06, + "loss": 0.778, + "step": 5759 + }, + { + "epoch": 0.86, + "grad_norm": 1.6734038285911446, + "learning_rate": 1.0186613433483238e-06, + "loss": 0.7931, + "step": 5760 + }, + { + "epoch": 0.86, + "grad_norm": 1.4500883473979898, + "learning_rate": 1.0165371188505546e-06, + "loss": 0.9094, + "step": 5761 + }, + { + "epoch": 0.86, + "grad_norm": 1.5036194763234356, + "learning_rate": 1.014414992890611e-06, + "loss": 0.8898, + "step": 5762 + }, + { + "epoch": 0.86, + "grad_norm": 1.464445203605549, + "learning_rate": 1.012294965964218e-06, + "loss": 0.7474, + "step": 5763 + }, + { + "epoch": 0.86, + "grad_norm": 1.7599123365443723, + "learning_rate": 1.0101770385666166e-06, + "loss": 0.7693, + "step": 5764 + }, + { + "epoch": 0.86, + "grad_norm": 1.5430975135964056, + "learning_rate": 1.0080612111925547e-06, + "loss": 0.8473, + "step": 5765 + }, + { + "epoch": 0.86, + "grad_norm": 1.3302683323151863, + "learning_rate": 1.0059474843362893e-06, + "loss": 0.9066, + "step": 5766 + }, + { + "epoch": 0.86, + "grad_norm": 1.5519377350254149, + "learning_rate": 1.0038358584915896e-06, + "loss": 0.8193, + "step": 5767 + }, + { + "epoch": 0.86, + "grad_norm": 1.4810117214026877, + "learning_rate": 1.001726334151728e-06, + "loss": 0.8091, + "step": 5768 + }, + { + "epoch": 0.86, + "grad_norm": 1.4257675464180606, + "learning_rate": 9.996189118094901e-07, + "loss": 0.7591, + "step": 5769 + }, + { + "epoch": 0.86, + "grad_norm": 1.5802017076351729, + "learning_rate": 9.97513591957172e-07, + "loss": 0.8658, + "step": 5770 + }, + { + "epoch": 0.86, + "grad_norm": 1.360368185633235, + "learning_rate": 9.954103750865762e-07, + "loss": 0.8092, + "step": 5771 + }, + { + "epoch": 0.86, + "grad_norm": 1.4468781870298115, + "learning_rate": 9.93309261689015e-07, + "loss": 0.8227, + "step": 5772 + }, + { + "epoch": 0.86, + "grad_norm": 1.454892611358579, + "learning_rate": 9.912102522553047e-07, + "loss": 0.8374, + "step": 5773 + }, + { + "epoch": 0.86, + "grad_norm": 1.4704066227167842, + "learning_rate": 9.891133472757774e-07, + "loss": 0.8417, + "step": 5774 + }, + { + "epoch": 0.86, + "grad_norm": 1.2723028149877522, + "learning_rate": 9.870185472402705e-07, + "loss": 0.7401, + "step": 5775 + }, + { + "epoch": 0.86, + "grad_norm": 1.4894490628871158, + "learning_rate": 9.849258526381288e-07, + "loss": 0.8302, + "step": 5776 + }, + { + "epoch": 0.86, + "grad_norm": 1.4345225306084823, + "learning_rate": 9.828352639582073e-07, + "loss": 0.8345, + "step": 5777 + }, + { + "epoch": 0.86, + "grad_norm": 1.4535279131992453, + "learning_rate": 9.80746781688865e-07, + "loss": 0.8294, + "step": 5778 + }, + { + "epoch": 0.86, + "grad_norm": 0.9183404251213945, + "learning_rate": 9.786604063179728e-07, + "loss": 0.3042, + "step": 5779 + }, + { + "epoch": 0.86, + "grad_norm": 1.3514829003041369, + "learning_rate": 9.765761383329087e-07, + "loss": 0.8808, + "step": 5780 + }, + { + "epoch": 0.86, + "grad_norm": 1.5428358829608562, + "learning_rate": 9.744939782205575e-07, + "loss": 0.8282, + "step": 5781 + }, + { + "epoch": 0.86, + "grad_norm": 0.8707008379114233, + "learning_rate": 9.724139264673116e-07, + "loss": 0.3138, + "step": 5782 + }, + { + "epoch": 0.86, + "grad_norm": 0.9674334588089352, + "learning_rate": 9.703359835590731e-07, + "loss": 0.3321, + "step": 5783 + }, + { + "epoch": 0.86, + "grad_norm": 1.3459840365070956, + "learning_rate": 9.682601499812494e-07, + "loss": 0.8321, + "step": 5784 + }, + { + "epoch": 0.86, + "grad_norm": 1.4689395863345713, + "learning_rate": 9.661864262187527e-07, + "loss": 0.8771, + "step": 5785 + }, + { + "epoch": 0.86, + "grad_norm": 1.4383817416169693, + "learning_rate": 9.641148127560063e-07, + "loss": 0.8344, + "step": 5786 + }, + { + "epoch": 0.86, + "grad_norm": 1.7361414243647821, + "learning_rate": 9.6204531007694e-07, + "loss": 0.9285, + "step": 5787 + }, + { + "epoch": 0.86, + "grad_norm": 1.283474659749838, + "learning_rate": 9.599779186649893e-07, + "loss": 0.8295, + "step": 5788 + }, + { + "epoch": 0.86, + "grad_norm": 1.394214252832118, + "learning_rate": 9.57912639003098e-07, + "loss": 0.8236, + "step": 5789 + }, + { + "epoch": 0.86, + "grad_norm": 1.3760682712367116, + "learning_rate": 9.558494715737166e-07, + "loss": 0.8192, + "step": 5790 + }, + { + "epoch": 0.86, + "grad_norm": 1.5853118143788905, + "learning_rate": 9.537884168587974e-07, + "loss": 0.7848, + "step": 5791 + }, + { + "epoch": 0.86, + "grad_norm": 1.5994496003727312, + "learning_rate": 9.517294753398066e-07, + "loss": 0.8238, + "step": 5792 + }, + { + "epoch": 0.86, + "grad_norm": 1.5367172933551225, + "learning_rate": 9.496726474977103e-07, + "loss": 0.8078, + "step": 5793 + }, + { + "epoch": 0.86, + "grad_norm": 1.5255289576737288, + "learning_rate": 9.476179338129854e-07, + "loss": 0.8875, + "step": 5794 + }, + { + "epoch": 0.86, + "grad_norm": 1.6827260799673724, + "learning_rate": 9.455653347656135e-07, + "loss": 0.8962, + "step": 5795 + }, + { + "epoch": 0.86, + "grad_norm": 1.8724072774454232, + "learning_rate": 9.435148508350822e-07, + "loss": 0.8302, + "step": 5796 + }, + { + "epoch": 0.86, + "grad_norm": 1.3734770002098882, + "learning_rate": 9.414664825003838e-07, + "loss": 0.7681, + "step": 5797 + }, + { + "epoch": 0.87, + "grad_norm": 1.492997322976864, + "learning_rate": 9.394202302400158e-07, + "loss": 0.7453, + "step": 5798 + }, + { + "epoch": 0.87, + "grad_norm": 1.4009183158131107, + "learning_rate": 9.373760945319854e-07, + "loss": 0.8687, + "step": 5799 + }, + { + "epoch": 0.87, + "grad_norm": 0.8318579391405551, + "learning_rate": 9.353340758538021e-07, + "loss": 0.3236, + "step": 5800 + }, + { + "epoch": 0.87, + "grad_norm": 1.6650743508724601, + "learning_rate": 9.332941746824819e-07, + "loss": 0.8434, + "step": 5801 + }, + { + "epoch": 0.87, + "grad_norm": 1.5911997301075265, + "learning_rate": 9.312563914945461e-07, + "loss": 0.831, + "step": 5802 + }, + { + "epoch": 0.87, + "grad_norm": 1.3763131703916958, + "learning_rate": 9.292207267660214e-07, + "loss": 0.8581, + "step": 5803 + }, + { + "epoch": 0.87, + "grad_norm": 0.863228281691854, + "learning_rate": 9.271871809724375e-07, + "loss": 0.306, + "step": 5804 + }, + { + "epoch": 0.87, + "grad_norm": 1.3637326430519523, + "learning_rate": 9.251557545888312e-07, + "loss": 0.8285, + "step": 5805 + }, + { + "epoch": 0.87, + "grad_norm": 1.5170808686874004, + "learning_rate": 9.231264480897461e-07, + "loss": 0.9109, + "step": 5806 + }, + { + "epoch": 0.87, + "grad_norm": 1.4544174089568205, + "learning_rate": 9.210992619492254e-07, + "loss": 0.8129, + "step": 5807 + }, + { + "epoch": 0.87, + "grad_norm": 1.395209327233474, + "learning_rate": 9.190741966408224e-07, + "loss": 0.8445, + "step": 5808 + }, + { + "epoch": 0.87, + "grad_norm": 0.8813065463805019, + "learning_rate": 9.17051252637593e-07, + "loss": 0.2995, + "step": 5809 + }, + { + "epoch": 0.87, + "grad_norm": 1.3091140773028045, + "learning_rate": 9.15030430412095e-07, + "loss": 0.8705, + "step": 5810 + }, + { + "epoch": 0.87, + "grad_norm": 1.4537442000700627, + "learning_rate": 9.130117304363928e-07, + "loss": 0.7422, + "step": 5811 + }, + { + "epoch": 0.87, + "grad_norm": 1.3806688434908208, + "learning_rate": 9.10995153182056e-07, + "loss": 0.764, + "step": 5812 + }, + { + "epoch": 0.87, + "grad_norm": 1.5180552185569036, + "learning_rate": 9.089806991201567e-07, + "loss": 0.8315, + "step": 5813 + }, + { + "epoch": 0.87, + "grad_norm": 1.3785202064124553, + "learning_rate": 9.069683687212716e-07, + "loss": 0.8407, + "step": 5814 + }, + { + "epoch": 0.87, + "grad_norm": 1.4673116583184187, + "learning_rate": 9.049581624554816e-07, + "loss": 0.8306, + "step": 5815 + }, + { + "epoch": 0.87, + "grad_norm": 1.6167578207010136, + "learning_rate": 9.029500807923719e-07, + "loss": 0.8216, + "step": 5816 + }, + { + "epoch": 0.87, + "grad_norm": 1.3910951536208571, + "learning_rate": 9.009441242010287e-07, + "loss": 0.8002, + "step": 5817 + }, + { + "epoch": 0.87, + "grad_norm": 1.4898255176058681, + "learning_rate": 8.989402931500434e-07, + "loss": 0.8257, + "step": 5818 + }, + { + "epoch": 0.87, + "grad_norm": 1.4987953863726593, + "learning_rate": 8.969385881075132e-07, + "loss": 0.8308, + "step": 5819 + }, + { + "epoch": 0.87, + "grad_norm": 1.4010799990932457, + "learning_rate": 8.94939009541036e-07, + "loss": 0.8521, + "step": 5820 + }, + { + "epoch": 0.87, + "grad_norm": 1.5226489006714568, + "learning_rate": 8.929415579177125e-07, + "loss": 0.8066, + "step": 5821 + }, + { + "epoch": 0.87, + "grad_norm": 1.4828013504853244, + "learning_rate": 8.909462337041508e-07, + "loss": 0.798, + "step": 5822 + }, + { + "epoch": 0.87, + "grad_norm": 1.5389632105871618, + "learning_rate": 8.889530373664546e-07, + "loss": 0.7639, + "step": 5823 + }, + { + "epoch": 0.87, + "grad_norm": 1.4121760204886484, + "learning_rate": 8.869619693702358e-07, + "loss": 0.7407, + "step": 5824 + }, + { + "epoch": 0.87, + "grad_norm": 1.4112065074909304, + "learning_rate": 8.849730301806092e-07, + "loss": 0.8603, + "step": 5825 + }, + { + "epoch": 0.87, + "grad_norm": 1.3596342212335895, + "learning_rate": 8.829862202621908e-07, + "loss": 0.8309, + "step": 5826 + }, + { + "epoch": 0.87, + "grad_norm": 1.3652803559261828, + "learning_rate": 8.810015400790994e-07, + "loss": 0.7371, + "step": 5827 + }, + { + "epoch": 0.87, + "grad_norm": 1.5847949900690912, + "learning_rate": 8.790189900949563e-07, + "loss": 0.8003, + "step": 5828 + }, + { + "epoch": 0.87, + "grad_norm": 1.531503888656653, + "learning_rate": 8.770385707728879e-07, + "loss": 0.7864, + "step": 5829 + }, + { + "epoch": 0.87, + "grad_norm": 1.2646239525033796, + "learning_rate": 8.750602825755172e-07, + "loss": 0.8302, + "step": 5830 + }, + { + "epoch": 0.87, + "grad_norm": 1.5924891575432214, + "learning_rate": 8.730841259649725e-07, + "loss": 0.7938, + "step": 5831 + }, + { + "epoch": 0.87, + "grad_norm": 1.509769617349298, + "learning_rate": 8.711101014028855e-07, + "loss": 0.8567, + "step": 5832 + }, + { + "epoch": 0.87, + "grad_norm": 1.6722493966858294, + "learning_rate": 8.691382093503886e-07, + "loss": 0.7371, + "step": 5833 + }, + { + "epoch": 0.87, + "grad_norm": 1.4924078586405232, + "learning_rate": 8.671684502681155e-07, + "loss": 0.8129, + "step": 5834 + }, + { + "epoch": 0.87, + "grad_norm": 1.4985604344656946, + "learning_rate": 8.652008246162036e-07, + "loss": 0.8161, + "step": 5835 + }, + { + "epoch": 0.87, + "grad_norm": 1.4942446217465666, + "learning_rate": 8.632353328542875e-07, + "loss": 0.8597, + "step": 5836 + }, + { + "epoch": 0.87, + "grad_norm": 1.3692937423860445, + "learning_rate": 8.612719754415078e-07, + "loss": 0.8034, + "step": 5837 + }, + { + "epoch": 0.87, + "grad_norm": 1.486290822206405, + "learning_rate": 8.593107528365052e-07, + "loss": 0.8376, + "step": 5838 + }, + { + "epoch": 0.87, + "grad_norm": 1.2882952480295942, + "learning_rate": 8.573516654974212e-07, + "loss": 0.8052, + "step": 5839 + }, + { + "epoch": 0.87, + "grad_norm": 1.6290616846366113, + "learning_rate": 8.553947138818985e-07, + "loss": 0.8781, + "step": 5840 + }, + { + "epoch": 0.87, + "grad_norm": 1.4831647990290548, + "learning_rate": 8.534398984470827e-07, + "loss": 0.7592, + "step": 5841 + }, + { + "epoch": 0.87, + "grad_norm": 0.8115019858562805, + "learning_rate": 8.514872196496182e-07, + "loss": 0.3453, + "step": 5842 + }, + { + "epoch": 0.87, + "grad_norm": 1.491721846413568, + "learning_rate": 8.495366779456493e-07, + "loss": 0.7949, + "step": 5843 + }, + { + "epoch": 0.87, + "grad_norm": 1.4194444512265951, + "learning_rate": 8.475882737908248e-07, + "loss": 0.8349, + "step": 5844 + }, + { + "epoch": 0.87, + "grad_norm": 1.4785707355140427, + "learning_rate": 8.456420076402904e-07, + "loss": 0.7385, + "step": 5845 + }, + { + "epoch": 0.87, + "grad_norm": 1.6845582104676677, + "learning_rate": 8.436978799486962e-07, + "loss": 0.795, + "step": 5846 + }, + { + "epoch": 0.87, + "grad_norm": 1.6073742096973014, + "learning_rate": 8.417558911701884e-07, + "loss": 0.8416, + "step": 5847 + }, + { + "epoch": 0.87, + "grad_norm": 1.65362204205427, + "learning_rate": 8.398160417584178e-07, + "loss": 0.8626, + "step": 5848 + }, + { + "epoch": 0.87, + "grad_norm": 1.5279714545727174, + "learning_rate": 8.378783321665318e-07, + "loss": 0.8275, + "step": 5849 + }, + { + "epoch": 0.87, + "grad_norm": 1.4573769696162062, + "learning_rate": 8.359427628471806e-07, + "loss": 0.7494, + "step": 5850 + }, + { + "epoch": 0.87, + "grad_norm": 1.347179425928135, + "learning_rate": 8.340093342525113e-07, + "loss": 0.8045, + "step": 5851 + }, + { + "epoch": 0.87, + "grad_norm": 0.8442951887023357, + "learning_rate": 8.320780468341761e-07, + "loss": 0.3437, + "step": 5852 + }, + { + "epoch": 0.87, + "grad_norm": 1.5100270423733437, + "learning_rate": 8.301489010433216e-07, + "loss": 0.8728, + "step": 5853 + }, + { + "epoch": 0.87, + "grad_norm": 1.4718822670157852, + "learning_rate": 8.282218973305978e-07, + "loss": 0.8375, + "step": 5854 + }, + { + "epoch": 0.87, + "grad_norm": 1.4244416488806975, + "learning_rate": 8.262970361461542e-07, + "loss": 0.8348, + "step": 5855 + }, + { + "epoch": 0.87, + "grad_norm": 1.578791154349969, + "learning_rate": 8.243743179396346e-07, + "loss": 0.8614, + "step": 5856 + }, + { + "epoch": 0.87, + "grad_norm": 1.446664990566375, + "learning_rate": 8.224537431601886e-07, + "loss": 0.821, + "step": 5857 + }, + { + "epoch": 0.87, + "grad_norm": 0.8475259068692577, + "learning_rate": 8.205353122564629e-07, + "loss": 0.3364, + "step": 5858 + }, + { + "epoch": 0.87, + "grad_norm": 1.408534325183491, + "learning_rate": 8.186190256766025e-07, + "loss": 0.7731, + "step": 5859 + }, + { + "epoch": 0.87, + "grad_norm": 1.649701800863386, + "learning_rate": 8.167048838682523e-07, + "loss": 0.8906, + "step": 5860 + }, + { + "epoch": 0.87, + "grad_norm": 1.7061926562678056, + "learning_rate": 8.147928872785571e-07, + "loss": 0.7741, + "step": 5861 + }, + { + "epoch": 0.87, + "grad_norm": 1.6046694926335212, + "learning_rate": 8.128830363541574e-07, + "loss": 0.8543, + "step": 5862 + }, + { + "epoch": 0.87, + "grad_norm": 1.5276635711978692, + "learning_rate": 8.109753315411962e-07, + "loss": 0.7865, + "step": 5863 + }, + { + "epoch": 0.87, + "grad_norm": 1.5439641940502322, + "learning_rate": 8.090697732853125e-07, + "loss": 0.8035, + "step": 5864 + }, + { + "epoch": 0.88, + "grad_norm": 1.4847064037299842, + "learning_rate": 8.07166362031645e-07, + "loss": 0.7861, + "step": 5865 + }, + { + "epoch": 0.88, + "grad_norm": 1.4189935053801672, + "learning_rate": 8.052650982248311e-07, + "loss": 0.8629, + "step": 5866 + }, + { + "epoch": 0.88, + "grad_norm": 1.4735022740471926, + "learning_rate": 8.03365982309009e-07, + "loss": 0.7917, + "step": 5867 + }, + { + "epoch": 0.88, + "grad_norm": 1.4740504544527968, + "learning_rate": 8.01469014727807e-07, + "loss": 0.7894, + "step": 5868 + }, + { + "epoch": 0.88, + "grad_norm": 1.3676577263498024, + "learning_rate": 7.995741959243597e-07, + "loss": 0.8685, + "step": 5869 + }, + { + "epoch": 0.88, + "grad_norm": 1.4609799060174877, + "learning_rate": 7.976815263412963e-07, + "loss": 0.8569, + "step": 5870 + }, + { + "epoch": 0.88, + "grad_norm": 1.545162037581966, + "learning_rate": 7.957910064207453e-07, + "loss": 0.8005, + "step": 5871 + }, + { + "epoch": 0.88, + "grad_norm": 1.4012629184411343, + "learning_rate": 7.939026366043323e-07, + "loss": 0.7933, + "step": 5872 + }, + { + "epoch": 0.88, + "grad_norm": 1.3771260484372194, + "learning_rate": 7.920164173331812e-07, + "loss": 0.8361, + "step": 5873 + }, + { + "epoch": 0.88, + "grad_norm": 1.4917728849161653, + "learning_rate": 7.901323490479129e-07, + "loss": 0.8597, + "step": 5874 + }, + { + "epoch": 0.88, + "grad_norm": 1.3083003163505948, + "learning_rate": 7.882504321886442e-07, + "loss": 0.7648, + "step": 5875 + }, + { + "epoch": 0.88, + "grad_norm": 1.2868280819729914, + "learning_rate": 7.863706671949922e-07, + "loss": 0.7772, + "step": 5876 + }, + { + "epoch": 0.88, + "grad_norm": 1.5135939799539269, + "learning_rate": 7.844930545060703e-07, + "loss": 0.807, + "step": 5877 + }, + { + "epoch": 0.88, + "grad_norm": 1.7674986110105817, + "learning_rate": 7.826175945604886e-07, + "loss": 0.8494, + "step": 5878 + }, + { + "epoch": 0.88, + "grad_norm": 1.4639300510378377, + "learning_rate": 7.807442877963556e-07, + "loss": 0.8043, + "step": 5879 + }, + { + "epoch": 0.88, + "grad_norm": 1.406425716097815, + "learning_rate": 7.788731346512768e-07, + "loss": 0.703, + "step": 5880 + }, + { + "epoch": 0.88, + "grad_norm": 1.4434702686094867, + "learning_rate": 7.770041355623504e-07, + "loss": 0.8474, + "step": 5881 + }, + { + "epoch": 0.88, + "grad_norm": 1.1867874381627033, + "learning_rate": 7.75137290966177e-07, + "loss": 0.7855, + "step": 5882 + }, + { + "epoch": 0.88, + "grad_norm": 1.4689276592876295, + "learning_rate": 7.732726012988512e-07, + "loss": 0.8909, + "step": 5883 + }, + { + "epoch": 0.88, + "grad_norm": 1.47282270608883, + "learning_rate": 7.714100669959656e-07, + "loss": 0.831, + "step": 5884 + }, + { + "epoch": 0.88, + "grad_norm": 1.428883323777507, + "learning_rate": 7.695496884926079e-07, + "loss": 0.8295, + "step": 5885 + }, + { + "epoch": 0.88, + "grad_norm": 1.3342588241499156, + "learning_rate": 7.676914662233625e-07, + "loss": 0.787, + "step": 5886 + }, + { + "epoch": 0.88, + "grad_norm": 1.3721511898436154, + "learning_rate": 7.65835400622309e-07, + "loss": 0.7669, + "step": 5887 + }, + { + "epoch": 0.88, + "grad_norm": 1.433935255307242, + "learning_rate": 7.639814921230271e-07, + "loss": 0.8445, + "step": 5888 + }, + { + "epoch": 0.88, + "grad_norm": 1.491144372419929, + "learning_rate": 7.621297411585881e-07, + "loss": 0.7324, + "step": 5889 + }, + { + "epoch": 0.88, + "grad_norm": 1.6064407313783018, + "learning_rate": 7.602801481615629e-07, + "loss": 0.8837, + "step": 5890 + }, + { + "epoch": 0.88, + "grad_norm": 1.7871585651569413, + "learning_rate": 7.584327135640146e-07, + "loss": 0.7751, + "step": 5891 + }, + { + "epoch": 0.88, + "grad_norm": 1.4574514849192333, + "learning_rate": 7.565874377975046e-07, + "loss": 0.8443, + "step": 5892 + }, + { + "epoch": 0.88, + "grad_norm": 1.459437021769356, + "learning_rate": 7.547443212930906e-07, + "loss": 0.7697, + "step": 5893 + }, + { + "epoch": 0.88, + "grad_norm": 1.440790127415659, + "learning_rate": 7.529033644813232e-07, + "loss": 0.8447, + "step": 5894 + }, + { + "epoch": 0.88, + "grad_norm": 1.256512669246677, + "learning_rate": 7.510645677922534e-07, + "loss": 0.7913, + "step": 5895 + }, + { + "epoch": 0.88, + "grad_norm": 1.3945551991743672, + "learning_rate": 7.492279316554207e-07, + "loss": 0.8086, + "step": 5896 + }, + { + "epoch": 0.88, + "grad_norm": 1.5442572826822383, + "learning_rate": 7.473934564998641e-07, + "loss": 0.8898, + "step": 5897 + }, + { + "epoch": 0.88, + "grad_norm": 1.6020052053752276, + "learning_rate": 7.455611427541176e-07, + "loss": 0.7849, + "step": 5898 + }, + { + "epoch": 0.88, + "grad_norm": 0.853193944264774, + "learning_rate": 7.43730990846211e-07, + "loss": 0.2979, + "step": 5899 + }, + { + "epoch": 0.88, + "grad_norm": 1.6672546736511864, + "learning_rate": 7.419030012036676e-07, + "loss": 0.7982, + "step": 5900 + }, + { + "epoch": 0.88, + "grad_norm": 1.4526160346089265, + "learning_rate": 7.400771742535051e-07, + "loss": 0.8094, + "step": 5901 + }, + { + "epoch": 0.88, + "grad_norm": 1.5524239079764113, + "learning_rate": 7.382535104222366e-07, + "loss": 0.8098, + "step": 5902 + }, + { + "epoch": 0.88, + "grad_norm": 1.4528487927438605, + "learning_rate": 7.364320101358701e-07, + "loss": 0.7176, + "step": 5903 + }, + { + "epoch": 0.88, + "grad_norm": 1.4708249667282491, + "learning_rate": 7.346126738199089e-07, + "loss": 0.7953, + "step": 5904 + }, + { + "epoch": 0.88, + "grad_norm": 1.350785066291444, + "learning_rate": 7.327955018993504e-07, + "loss": 0.8541, + "step": 5905 + }, + { + "epoch": 0.88, + "grad_norm": 1.4043682820359351, + "learning_rate": 7.309804947986876e-07, + "loss": 0.8801, + "step": 5906 + }, + { + "epoch": 0.88, + "grad_norm": 1.3474453816398573, + "learning_rate": 7.291676529419034e-07, + "loss": 0.7746, + "step": 5907 + }, + { + "epoch": 0.88, + "grad_norm": 0.8010749208623257, + "learning_rate": 7.273569767524791e-07, + "loss": 0.3604, + "step": 5908 + }, + { + "epoch": 0.88, + "grad_norm": 1.399684426677342, + "learning_rate": 7.255484666533874e-07, + "loss": 0.8385, + "step": 5909 + }, + { + "epoch": 0.88, + "grad_norm": 1.5450925660392787, + "learning_rate": 7.237421230670994e-07, + "loss": 0.8281, + "step": 5910 + }, + { + "epoch": 0.88, + "grad_norm": 1.6584249517702134, + "learning_rate": 7.21937946415574e-07, + "loss": 0.8738, + "step": 5911 + }, + { + "epoch": 0.88, + "grad_norm": 1.5627101947546354, + "learning_rate": 7.201359371202698e-07, + "loss": 0.8822, + "step": 5912 + }, + { + "epoch": 0.88, + "grad_norm": 1.3450334997884406, + "learning_rate": 7.183360956021368e-07, + "loss": 0.802, + "step": 5913 + }, + { + "epoch": 0.88, + "grad_norm": 1.390558199671098, + "learning_rate": 7.165384222816141e-07, + "loss": 0.9109, + "step": 5914 + }, + { + "epoch": 0.88, + "grad_norm": 1.295718793987034, + "learning_rate": 7.147429175786413e-07, + "loss": 0.8183, + "step": 5915 + }, + { + "epoch": 0.88, + "grad_norm": 1.4065951062967756, + "learning_rate": 7.129495819126476e-07, + "loss": 0.8235, + "step": 5916 + }, + { + "epoch": 0.88, + "grad_norm": 1.4245071015388078, + "learning_rate": 7.111584157025575e-07, + "loss": 0.8289, + "step": 5917 + }, + { + "epoch": 0.88, + "grad_norm": 1.3064255238005393, + "learning_rate": 7.093694193667866e-07, + "loss": 0.7531, + "step": 5918 + }, + { + "epoch": 0.88, + "grad_norm": 1.3899770121723252, + "learning_rate": 7.075825933232461e-07, + "loss": 0.8687, + "step": 5919 + }, + { + "epoch": 0.88, + "grad_norm": 1.5346226407869021, + "learning_rate": 7.057979379893353e-07, + "loss": 0.8846, + "step": 5920 + }, + { + "epoch": 0.88, + "grad_norm": 1.4240059362195934, + "learning_rate": 7.040154537819533e-07, + "loss": 0.8722, + "step": 5921 + }, + { + "epoch": 0.88, + "grad_norm": 1.4310095282065138, + "learning_rate": 7.022351411174866e-07, + "loss": 0.7597, + "step": 5922 + }, + { + "epoch": 0.88, + "grad_norm": 1.603588444105267, + "learning_rate": 7.004570004118172e-07, + "loss": 0.8156, + "step": 5923 + }, + { + "epoch": 0.88, + "grad_norm": 1.4914003700292304, + "learning_rate": 6.986810320803195e-07, + "loss": 0.7692, + "step": 5924 + }, + { + "epoch": 0.88, + "grad_norm": 1.6335567804036393, + "learning_rate": 6.969072365378605e-07, + "loss": 0.8702, + "step": 5925 + }, + { + "epoch": 0.88, + "grad_norm": 1.5112571294894828, + "learning_rate": 6.951356141987963e-07, + "loss": 0.7707, + "step": 5926 + }, + { + "epoch": 0.88, + "grad_norm": 1.2762290949935435, + "learning_rate": 6.933661654769797e-07, + "loss": 0.8104, + "step": 5927 + }, + { + "epoch": 0.88, + "grad_norm": 1.53685108084369, + "learning_rate": 6.91598890785754e-07, + "loss": 0.8344, + "step": 5928 + }, + { + "epoch": 0.88, + "grad_norm": 1.429970967874016, + "learning_rate": 6.898337905379549e-07, + "loss": 0.8886, + "step": 5929 + }, + { + "epoch": 0.88, + "grad_norm": 1.4579141475590403, + "learning_rate": 6.88070865145909e-07, + "loss": 0.7821, + "step": 5930 + }, + { + "epoch": 0.88, + "grad_norm": 0.9416034346551806, + "learning_rate": 6.863101150214369e-07, + "loss": 0.3091, + "step": 5931 + }, + { + "epoch": 0.89, + "grad_norm": 1.3314424385195347, + "learning_rate": 6.845515405758518e-07, + "loss": 0.8465, + "step": 5932 + }, + { + "epoch": 0.89, + "grad_norm": 1.5400321902240883, + "learning_rate": 6.827951422199531e-07, + "loss": 0.787, + "step": 5933 + }, + { + "epoch": 0.89, + "grad_norm": 1.527361783118756, + "learning_rate": 6.810409203640378e-07, + "loss": 0.7712, + "step": 5934 + }, + { + "epoch": 0.89, + "grad_norm": 1.414291817686033, + "learning_rate": 6.792888754178906e-07, + "loss": 0.7771, + "step": 5935 + }, + { + "epoch": 0.89, + "grad_norm": 1.3719478328516528, + "learning_rate": 6.775390077907918e-07, + "loss": 0.7731, + "step": 5936 + }, + { + "epoch": 0.89, + "grad_norm": 1.4162112814461316, + "learning_rate": 6.757913178915087e-07, + "loss": 0.7218, + "step": 5937 + }, + { + "epoch": 0.89, + "grad_norm": 0.8770896406052634, + "learning_rate": 6.74045806128305e-07, + "loss": 0.3018, + "step": 5938 + }, + { + "epoch": 0.89, + "grad_norm": 1.543155433733017, + "learning_rate": 6.723024729089278e-07, + "loss": 0.795, + "step": 5939 + }, + { + "epoch": 0.89, + "grad_norm": 1.3829758091038407, + "learning_rate": 6.705613186406223e-07, + "loss": 0.7688, + "step": 5940 + }, + { + "epoch": 0.89, + "grad_norm": 1.5224843203869813, + "learning_rate": 6.688223437301222e-07, + "loss": 0.7841, + "step": 5941 + }, + { + "epoch": 0.89, + "grad_norm": 1.3913430204648534, + "learning_rate": 6.670855485836525e-07, + "loss": 0.8429, + "step": 5942 + }, + { + "epoch": 0.89, + "grad_norm": 1.4823748495662123, + "learning_rate": 6.653509336069285e-07, + "loss": 0.8302, + "step": 5943 + }, + { + "epoch": 0.89, + "grad_norm": 1.364959787415724, + "learning_rate": 6.636184992051553e-07, + "loss": 0.8156, + "step": 5944 + }, + { + "epoch": 0.89, + "grad_norm": 1.5895254674139034, + "learning_rate": 6.618882457830334e-07, + "loss": 0.7907, + "step": 5945 + }, + { + "epoch": 0.89, + "grad_norm": 1.3317540836872253, + "learning_rate": 6.601601737447461e-07, + "loss": 0.7529, + "step": 5946 + }, + { + "epoch": 0.89, + "grad_norm": 1.380769605969531, + "learning_rate": 6.584342834939717e-07, + "loss": 0.7617, + "step": 5947 + }, + { + "epoch": 0.89, + "grad_norm": 1.486678832756588, + "learning_rate": 6.567105754338798e-07, + "loss": 0.7975, + "step": 5948 + }, + { + "epoch": 0.89, + "grad_norm": 1.5142665894518585, + "learning_rate": 6.549890499671285e-07, + "loss": 0.798, + "step": 5949 + }, + { + "epoch": 0.89, + "grad_norm": 1.5457823288642993, + "learning_rate": 6.532697074958661e-07, + "loss": 0.8038, + "step": 5950 + }, + { + "epoch": 0.89, + "grad_norm": 1.4400660699205168, + "learning_rate": 6.515525484217323e-07, + "loss": 0.8534, + "step": 5951 + }, + { + "epoch": 0.89, + "grad_norm": 1.5195908792166468, + "learning_rate": 6.498375731458529e-07, + "loss": 0.7921, + "step": 5952 + }, + { + "epoch": 0.89, + "grad_norm": 1.4485575627746758, + "learning_rate": 6.481247820688475e-07, + "loss": 0.7321, + "step": 5953 + }, + { + "epoch": 0.89, + "grad_norm": 1.3029254282814693, + "learning_rate": 6.46414175590826e-07, + "loss": 0.8233, + "step": 5954 + }, + { + "epoch": 0.89, + "grad_norm": 1.5290624918802223, + "learning_rate": 6.447057541113832e-07, + "loss": 0.8634, + "step": 5955 + }, + { + "epoch": 0.89, + "grad_norm": 1.5153888138614064, + "learning_rate": 6.42999518029609e-07, + "loss": 0.7795, + "step": 5956 + }, + { + "epoch": 0.89, + "grad_norm": 1.5781135086547653, + "learning_rate": 6.412954677440797e-07, + "loss": 0.78, + "step": 5957 + }, + { + "epoch": 0.89, + "grad_norm": 1.431357275022252, + "learning_rate": 6.395936036528627e-07, + "loss": 0.7856, + "step": 5958 + }, + { + "epoch": 0.89, + "grad_norm": 1.4299018300540858, + "learning_rate": 6.378939261535111e-07, + "loss": 0.8316, + "step": 5959 + }, + { + "epoch": 0.89, + "grad_norm": 1.3564976379320886, + "learning_rate": 6.361964356430717e-07, + "loss": 0.9327, + "step": 5960 + }, + { + "epoch": 0.89, + "grad_norm": 1.4516939954152928, + "learning_rate": 6.345011325180772e-07, + "loss": 0.768, + "step": 5961 + }, + { + "epoch": 0.89, + "grad_norm": 1.4536822824499733, + "learning_rate": 6.32808017174551e-07, + "loss": 0.8605, + "step": 5962 + }, + { + "epoch": 0.89, + "grad_norm": 1.54892466016969, + "learning_rate": 6.311170900080044e-07, + "loss": 0.8204, + "step": 5963 + }, + { + "epoch": 0.89, + "grad_norm": 1.378243499157131, + "learning_rate": 6.294283514134414e-07, + "loss": 0.8382, + "step": 5964 + }, + { + "epoch": 0.89, + "grad_norm": 1.5842245204461887, + "learning_rate": 6.277418017853476e-07, + "loss": 0.7621, + "step": 5965 + }, + { + "epoch": 0.89, + "grad_norm": 1.2991322863957695, + "learning_rate": 6.260574415177012e-07, + "loss": 0.8059, + "step": 5966 + }, + { + "epoch": 0.89, + "grad_norm": 1.7342637752404075, + "learning_rate": 6.243752710039719e-07, + "loss": 0.8289, + "step": 5967 + }, + { + "epoch": 0.89, + "grad_norm": 1.5609923563588473, + "learning_rate": 6.226952906371131e-07, + "loss": 0.9273, + "step": 5968 + }, + { + "epoch": 0.89, + "grad_norm": 1.5331418134494013, + "learning_rate": 6.210175008095675e-07, + "loss": 0.7992, + "step": 5969 + }, + { + "epoch": 0.89, + "grad_norm": 1.261713053776074, + "learning_rate": 6.193419019132685e-07, + "loss": 0.8286, + "step": 5970 + }, + { + "epoch": 0.89, + "grad_norm": 1.5298776163358483, + "learning_rate": 6.176684943396383e-07, + "loss": 0.8028, + "step": 5971 + }, + { + "epoch": 0.89, + "grad_norm": 1.4465507995565334, + "learning_rate": 6.159972784795798e-07, + "loss": 0.8406, + "step": 5972 + }, + { + "epoch": 0.89, + "grad_norm": 1.465692630216025, + "learning_rate": 6.143282547234919e-07, + "loss": 0.7757, + "step": 5973 + }, + { + "epoch": 0.89, + "grad_norm": 1.542709106618553, + "learning_rate": 6.126614234612593e-07, + "loss": 0.7998, + "step": 5974 + }, + { + "epoch": 0.89, + "grad_norm": 1.4419882214597715, + "learning_rate": 6.109967850822529e-07, + "loss": 0.9108, + "step": 5975 + }, + { + "epoch": 0.89, + "grad_norm": 1.4451114978722102, + "learning_rate": 6.093343399753327e-07, + "loss": 0.7805, + "step": 5976 + }, + { + "epoch": 0.89, + "grad_norm": 1.3334239554683949, + "learning_rate": 6.076740885288479e-07, + "loss": 0.8371, + "step": 5977 + }, + { + "epoch": 0.89, + "grad_norm": 0.8213178071696831, + "learning_rate": 6.060160311306307e-07, + "loss": 0.3207, + "step": 5978 + }, + { + "epoch": 0.89, + "grad_norm": 1.414291328414453, + "learning_rate": 6.043601681680045e-07, + "loss": 0.7822, + "step": 5979 + }, + { + "epoch": 0.89, + "grad_norm": 1.5378556300109016, + "learning_rate": 6.027065000277776e-07, + "loss": 0.8647, + "step": 5980 + }, + { + "epoch": 0.89, + "grad_norm": 1.5324473042727753, + "learning_rate": 6.010550270962501e-07, + "loss": 0.8052, + "step": 5981 + }, + { + "epoch": 0.89, + "grad_norm": 1.5700500752958246, + "learning_rate": 5.994057497592032e-07, + "loss": 0.7401, + "step": 5982 + }, + { + "epoch": 0.89, + "grad_norm": 1.4364867805404387, + "learning_rate": 5.97758668401911e-07, + "loss": 0.8555, + "step": 5983 + }, + { + "epoch": 0.89, + "grad_norm": 1.6340851820368967, + "learning_rate": 5.961137834091313e-07, + "loss": 0.7655, + "step": 5984 + }, + { + "epoch": 0.89, + "grad_norm": 0.805879616656503, + "learning_rate": 5.944710951651067e-07, + "loss": 0.287, + "step": 5985 + }, + { + "epoch": 0.89, + "grad_norm": 1.4135264110222188, + "learning_rate": 5.928306040535725e-07, + "loss": 0.8703, + "step": 5986 + }, + { + "epoch": 0.89, + "grad_norm": 1.3645196402647846, + "learning_rate": 5.911923104577455e-07, + "loss": 0.8089, + "step": 5987 + }, + { + "epoch": 0.89, + "grad_norm": 1.3706676426923676, + "learning_rate": 5.895562147603317e-07, + "loss": 0.8147, + "step": 5988 + }, + { + "epoch": 0.89, + "grad_norm": 1.5334474551148063, + "learning_rate": 5.879223173435245e-07, + "loss": 0.8056, + "step": 5989 + }, + { + "epoch": 0.89, + "grad_norm": 1.5710583796662052, + "learning_rate": 5.862906185890027e-07, + "loss": 0.823, + "step": 5990 + }, + { + "epoch": 0.89, + "grad_norm": 1.3071111258208394, + "learning_rate": 5.846611188779283e-07, + "loss": 0.8242, + "step": 5991 + }, + { + "epoch": 0.89, + "grad_norm": 1.5030716301688591, + "learning_rate": 5.830338185909545e-07, + "loss": 0.7478, + "step": 5992 + }, + { + "epoch": 0.89, + "grad_norm": 1.4878308855063345, + "learning_rate": 5.814087181082195e-07, + "loss": 0.7888, + "step": 5993 + }, + { + "epoch": 0.89, + "grad_norm": 1.430201095560194, + "learning_rate": 5.797858178093463e-07, + "loss": 0.7913, + "step": 5994 + }, + { + "epoch": 0.89, + "grad_norm": 1.395487065090944, + "learning_rate": 5.781651180734438e-07, + "loss": 0.8612, + "step": 5995 + }, + { + "epoch": 0.89, + "grad_norm": 1.536681800421341, + "learning_rate": 5.765466192791103e-07, + "loss": 0.845, + "step": 5996 + }, + { + "epoch": 0.89, + "grad_norm": 1.668963353532185, + "learning_rate": 5.749303218044234e-07, + "loss": 0.8468, + "step": 5997 + }, + { + "epoch": 0.89, + "grad_norm": 1.5744757785586105, + "learning_rate": 5.73316226026952e-07, + "loss": 0.8692, + "step": 5998 + }, + { + "epoch": 0.9, + "grad_norm": 1.775092832268406, + "learning_rate": 5.717043323237503e-07, + "loss": 0.8236, + "step": 5999 + }, + { + "epoch": 0.9, + "grad_norm": 1.4651047245247877, + "learning_rate": 5.700946410713548e-07, + "loss": 0.9029, + "step": 6000 + } + ], + "logging_steps": 1.0, + "max_steps": 6702, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 2.000184256211242e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}