diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,86158 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12304, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.273040591428128, + "learning_rate": 5.405405405405406e-08, + "loss": 0.8615, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.14660204198603, + "learning_rate": 1.0810810810810812e-07, + "loss": 0.8351, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 3.5136363685645717, + "learning_rate": 1.6216216216216218e-07, + "loss": 0.7338, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.84314659649236, + "learning_rate": 2.1621621621621625e-07, + "loss": 0.3097, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 4.3116383284825615, + "learning_rate": 2.702702702702703e-07, + "loss": 0.9433, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 4.151887895391798, + "learning_rate": 3.2432432432432436e-07, + "loss": 0.8352, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 4.359720301871594, + "learning_rate": 3.7837837837837843e-07, + "loss": 0.8869, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 4.069900304039918, + "learning_rate": 4.324324324324325e-07, + "loss": 0.8074, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 4.221531701629939, + "learning_rate": 4.864864864864865e-07, + "loss": 0.8024, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 4.223144226951507, + "learning_rate": 5.405405405405406e-07, + "loss": 0.793, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 4.3223820324853826, + "learning_rate": 5.945945945945947e-07, + "loss": 0.9239, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 3.8854492475804197, + "learning_rate": 6.486486486486487e-07, + "loss": 0.7622, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 3.5468659118607886, + "learning_rate": 7.027027027027028e-07, + "loss": 0.8072, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 3.239336624444215, + "learning_rate": 7.567567567567569e-07, + "loss": 0.8146, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 3.476905400596688, + "learning_rate": 8.108108108108109e-07, + "loss": 0.7157, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 2.8515112912848704, + "learning_rate": 8.64864864864865e-07, + "loss": 0.6896, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.363378808068876, + "learning_rate": 9.189189189189191e-07, + "loss": 0.7681, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.646198456699034, + "learning_rate": 9.72972972972973e-07, + "loss": 0.8277, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 2.958244139103593, + "learning_rate": 1.027027027027027e-06, + "loss": 0.707, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.4628987030897243, + "learning_rate": 1.0810810810810812e-06, + "loss": 0.8058, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.0814254976956748, + "learning_rate": 1.1351351351351352e-06, + "loss": 0.796, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 2.134834005763383, + "learning_rate": 1.1891891891891893e-06, + "loss": 0.7277, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 1.9432058635310248, + "learning_rate": 1.2432432432432434e-06, + "loss": 0.6734, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 2.2219674117937425, + "learning_rate": 1.2972972972972974e-06, + "loss": 0.7524, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 1.6618761393680126, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.6931, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 1.7996160383043525, + "learning_rate": 1.4054054054054056e-06, + "loss": 0.7208, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 1.7728768994451718, + "learning_rate": 1.4594594594594596e-06, + "loss": 0.6902, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 1.8967993526053564, + "learning_rate": 1.5135135135135137e-06, + "loss": 0.7391, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 1.7443476331916143, + "learning_rate": 1.5675675675675678e-06, + "loss": 0.7357, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 1.789052299834985, + "learning_rate": 1.6216216216216219e-06, + "loss": 0.6854, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 1.584750261961582, + "learning_rate": 1.675675675675676e-06, + "loss": 0.6375, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.789236566080917, + "learning_rate": 1.72972972972973e-06, + "loss": 0.7515, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 1.5893620129109587, + "learning_rate": 1.783783783783784e-06, + "loss": 0.6829, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 1.7943146599443252, + "learning_rate": 1.8378378378378381e-06, + "loss": 0.7003, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.57691567184717, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.6645, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.5392384888398358, + "learning_rate": 1.945945945945946e-06, + "loss": 0.7122, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 0.8831190046317889, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3036, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.3157264544284732, + "learning_rate": 2.054054054054054e-06, + "loss": 0.6175, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.432915059081968, + "learning_rate": 2.1081081081081085e-06, + "loss": 0.7022, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 1.4613657485470233, + "learning_rate": 2.1621621621621623e-06, + "loss": 0.6661, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 1.5547873171988644, + "learning_rate": 2.2162162162162166e-06, + "loss": 0.6233, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 1.6877086684917586, + "learning_rate": 2.2702702702702705e-06, + "loss": 0.6675, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 1.7184407982106682, + "learning_rate": 2.3243243243243247e-06, + "loss": 0.7379, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 1.7731563332171092, + "learning_rate": 2.3783783783783786e-06, + "loss": 0.6577, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 1.801920902456086, + "learning_rate": 2.432432432432433e-06, + "loss": 0.7479, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 1.6573379957461731, + "learning_rate": 2.4864864864864867e-06, + "loss": 0.6584, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 1.4084165272724114, + "learning_rate": 2.540540540540541e-06, + "loss": 0.6129, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 1.6525233813389781, + "learning_rate": 2.594594594594595e-06, + "loss": 0.704, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 1.3782860124442546, + "learning_rate": 2.648648648648649e-06, + "loss": 0.6154, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 1.4337365267953683, + "learning_rate": 2.702702702702703e-06, + "loss": 0.6052, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1.2702372133594575, + "learning_rate": 2.7567567567567573e-06, + "loss": 0.5269, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 1.2788406696387828, + "learning_rate": 2.810810810810811e-06, + "loss": 0.559, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 1.5336812095086203, + "learning_rate": 2.8648648648648654e-06, + "loss": 0.688, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 1.3528711974952585, + "learning_rate": 2.9189189189189193e-06, + "loss": 0.6331, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 1.333332096964607, + "learning_rate": 2.9729729729729736e-06, + "loss": 0.6096, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 1.4784282183701558, + "learning_rate": 3.0270270270270274e-06, + "loss": 0.6252, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 1.4658057231898225, + "learning_rate": 3.0810810810810817e-06, + "loss": 0.6609, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 1.4561817591061057, + "learning_rate": 3.1351351351351356e-06, + "loss": 0.6048, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 1.355095612044604, + "learning_rate": 3.1891891891891894e-06, + "loss": 0.5902, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 1.3067364183441053, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.5688, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 1.3751486378946132, + "learning_rate": 3.2972972972972976e-06, + "loss": 0.6159, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.2862193884453008, + "learning_rate": 3.351351351351352e-06, + "loss": 0.593, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 1.4789049778412198, + "learning_rate": 3.4054054054054057e-06, + "loss": 0.6427, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.4546804071157595, + "learning_rate": 3.45945945945946e-06, + "loss": 0.632, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.4395735150129738, + "learning_rate": 3.513513513513514e-06, + "loss": 0.6129, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.4346828370676308, + "learning_rate": 3.567567567567568e-06, + "loss": 0.6517, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 1.4377476761384835, + "learning_rate": 3.621621621621622e-06, + "loss": 0.6353, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 0.8803533563697833, + "learning_rate": 3.6756756756756763e-06, + "loss": 0.3199, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.3957121329522453, + "learning_rate": 3.72972972972973e-06, + "loss": 0.6205, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.4125486289001528, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.6586, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.431189021990164, + "learning_rate": 3.837837837837838e-06, + "loss": 0.6508, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.2515213521059998, + "learning_rate": 3.891891891891892e-06, + "loss": 0.5511, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.4889319759544897, + "learning_rate": 3.945945945945947e-06, + "loss": 0.6802, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.3850309722543301, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5624, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 1.5352306040196768, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.6158, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 1.306638038154721, + "learning_rate": 4.108108108108108e-06, + "loss": 0.5853, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.4703554455302463, + "learning_rate": 4.162162162162163e-06, + "loss": 0.6308, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.3157754223842182, + "learning_rate": 4.216216216216217e-06, + "loss": 0.6677, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.2784647086503598, + "learning_rate": 4.270270270270271e-06, + "loss": 0.5917, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 1.4109447882750359, + "learning_rate": 4.324324324324325e-06, + "loss": 0.635, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.793412137191073, + "learning_rate": 4.378378378378379e-06, + "loss": 0.6547, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 1.4467943130716234, + "learning_rate": 4.432432432432433e-06, + "loss": 0.6441, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.5009537855152142, + "learning_rate": 4.486486486486487e-06, + "loss": 0.6814, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.298628336993122, + "learning_rate": 4.540540540540541e-06, + "loss": 0.6049, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.599083383122989, + "learning_rate": 4.594594594594596e-06, + "loss": 0.7032, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.3744837239712366, + "learning_rate": 4.6486486486486495e-06, + "loss": 0.5911, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.4225318479566327, + "learning_rate": 4.702702702702703e-06, + "loss": 0.5956, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.3958885743685878, + "learning_rate": 4.756756756756757e-06, + "loss": 0.5795, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.3249125605774177, + "learning_rate": 4.810810810810811e-06, + "loss": 0.5479, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 1.3298464814569155, + "learning_rate": 4.864864864864866e-06, + "loss": 0.5754, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.4683698015671058, + "learning_rate": 4.91891891891892e-06, + "loss": 0.5928, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.3388444498030598, + "learning_rate": 4.9729729729729735e-06, + "loss": 0.6335, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.3594399027949904, + "learning_rate": 5.027027027027027e-06, + "loss": 0.602, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 0.8636527147025339, + "learning_rate": 5.081081081081082e-06, + "loss": 0.3149, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 1.2168309642609405, + "learning_rate": 5.135135135135135e-06, + "loss": 0.5471, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.2351839782392957, + "learning_rate": 5.18918918918919e-06, + "loss": 0.5424, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.292604070171166, + "learning_rate": 5.243243243243244e-06, + "loss": 0.5823, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 1.311816649798154, + "learning_rate": 5.297297297297298e-06, + "loss": 0.6077, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 0.8143613433965735, + "learning_rate": 5.351351351351351e-06, + "loss": 0.3432, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.3116387212090366, + "learning_rate": 5.405405405405406e-06, + "loss": 0.6164, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.3535109082047654, + "learning_rate": 5.45945945945946e-06, + "loss": 0.6392, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 1.3443165530744305, + "learning_rate": 5.513513513513515e-06, + "loss": 0.568, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.414863407793237, + "learning_rate": 5.567567567567568e-06, + "loss": 0.6686, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.3009721885803063, + "learning_rate": 5.621621621621622e-06, + "loss": 0.5804, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.2774240010244946, + "learning_rate": 5.675675675675676e-06, + "loss": 0.6211, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.8344894256527007, + "learning_rate": 5.729729729729731e-06, + "loss": 0.3314, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.3879213164427744, + "learning_rate": 5.783783783783784e-06, + "loss": 0.656, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.375044697622862, + "learning_rate": 5.837837837837839e-06, + "loss": 0.5651, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.4877329295170625, + "learning_rate": 5.8918918918918924e-06, + "loss": 0.6075, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.2983195899347304, + "learning_rate": 5.945945945945947e-06, + "loss": 0.5463, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 1.2218725662034848, + "learning_rate": 6e-06, + "loss": 0.6036, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.3436025991798024, + "learning_rate": 6.054054054054055e-06, + "loss": 0.6227, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.2977100495834533, + "learning_rate": 6.108108108108109e-06, + "loss": 0.5586, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.3507646576725383, + "learning_rate": 6.162162162162163e-06, + "loss": 0.5956, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 1.399931221889171, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.6347, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.4071118149538326, + "learning_rate": 6.270270270270271e-06, + "loss": 0.6294, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.5751588266470815, + "learning_rate": 6.324324324324325e-06, + "loss": 0.5842, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 1.5903449377258876, + "learning_rate": 6.378378378378379e-06, + "loss": 0.6512, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 1.6710624509177674, + "learning_rate": 6.432432432432433e-06, + "loss": 0.5931, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 1.1494354077899656, + "learning_rate": 6.486486486486487e-06, + "loss": 0.5491, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.3044888172319544, + "learning_rate": 6.540540540540541e-06, + "loss": 0.615, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 5.268346144656469, + "learning_rate": 6.594594594594595e-06, + "loss": 0.5698, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 1.3160275141042568, + "learning_rate": 6.648648648648649e-06, + "loss": 0.5061, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 1.246078708770176, + "learning_rate": 6.702702702702704e-06, + "loss": 0.5999, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.427653128474058, + "learning_rate": 6.7567567567567575e-06, + "loss": 0.6213, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.303477151542448, + "learning_rate": 6.810810810810811e-06, + "loss": 0.6192, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.3710171885376965, + "learning_rate": 6.864864864864865e-06, + "loss": 0.5732, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 1.3631429826418746, + "learning_rate": 6.91891891891892e-06, + "loss": 0.5732, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.4099354465501006, + "learning_rate": 6.972972972972973e-06, + "loss": 0.6389, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.4281747107854714, + "learning_rate": 7.027027027027028e-06, + "loss": 0.6006, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 1.4595708181113594, + "learning_rate": 7.0810810810810815e-06, + "loss": 0.6472, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.489743891332324, + "learning_rate": 7.135135135135136e-06, + "loss": 0.6437, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.2738442793830842, + "learning_rate": 7.189189189189189e-06, + "loss": 0.5813, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.6295009674146617, + "learning_rate": 7.243243243243244e-06, + "loss": 0.7095, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 1.5620836204569906, + "learning_rate": 7.297297297297298e-06, + "loss": 0.5914, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 1.5060790781694255, + "learning_rate": 7.3513513513513525e-06, + "loss": 0.6339, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.388913535992189, + "learning_rate": 7.4054054054054055e-06, + "loss": 0.6277, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.4863516194848927, + "learning_rate": 7.45945945945946e-06, + "loss": 0.5646, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 1.3445803127161098, + "learning_rate": 7.513513513513514e-06, + "loss": 0.6292, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 1.3384584779109399, + "learning_rate": 7.567567567567569e-06, + "loss": 0.5435, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.4926838153450348, + "learning_rate": 7.621621621621622e-06, + "loss": 0.6428, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 1.3869777102360936, + "learning_rate": 7.675675675675676e-06, + "loss": 0.5966, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.3409434328050722, + "learning_rate": 7.72972972972973e-06, + "loss": 0.6088, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 1.2762128491212468, + "learning_rate": 7.783783783783784e-06, + "loss": 0.6029, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.314550003433995, + "learning_rate": 7.837837837837838e-06, + "loss": 0.5568, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 1.4580077217392413, + "learning_rate": 7.891891891891894e-06, + "loss": 0.6383, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 1.460486772621834, + "learning_rate": 7.945945945945946e-06, + "loss": 0.6107, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.3790848419120394, + "learning_rate": 8.000000000000001e-06, + "loss": 0.6367, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.3986888028757734, + "learning_rate": 8.054054054054055e-06, + "loss": 0.5709, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.5066361422827574, + "learning_rate": 8.108108108108109e-06, + "loss": 0.5565, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 1.231260969143522, + "learning_rate": 8.162162162162163e-06, + "loss": 0.5974, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 1.3662141363352507, + "learning_rate": 8.216216216216217e-06, + "loss": 0.5617, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 1.379757692211869, + "learning_rate": 8.27027027027027e-06, + "loss": 0.6216, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.4032029868039735, + "learning_rate": 8.324324324324326e-06, + "loss": 0.5659, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.246102895886861, + "learning_rate": 8.378378378378378e-06, + "loss": 0.6069, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 1.3129486277734963, + "learning_rate": 8.432432432432434e-06, + "loss": 0.5649, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.5949807239779763, + "learning_rate": 8.486486486486488e-06, + "loss": 0.6402, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 1.2367723511070507, + "learning_rate": 8.540540540540542e-06, + "loss": 0.5507, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 1.3732449159415614, + "learning_rate": 8.594594594594595e-06, + "loss": 0.6039, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 1.3522671789714542, + "learning_rate": 8.64864864864865e-06, + "loss": 0.583, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.3857986224694943, + "learning_rate": 8.702702702702703e-06, + "loss": 0.6238, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 1.6041881805697953, + "learning_rate": 8.756756756756759e-06, + "loss": 0.5787, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 1.3844553479323647, + "learning_rate": 8.810810810810811e-06, + "loss": 0.6455, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 1.372775499931938, + "learning_rate": 8.864864864864866e-06, + "loss": 0.5469, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 1.3096092196718552, + "learning_rate": 8.91891891891892e-06, + "loss": 0.6066, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 1.3873598772944997, + "learning_rate": 8.972972972972974e-06, + "loss": 0.6354, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 1.4261461525288062, + "learning_rate": 9.027027027027028e-06, + "loss": 0.5119, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 1.3805251800039318, + "learning_rate": 9.081081081081082e-06, + "loss": 0.5846, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.8240477026882622, + "learning_rate": 9.135135135135136e-06, + "loss": 0.3422, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 1.3817497767181797, + "learning_rate": 9.189189189189191e-06, + "loss": 0.5916, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.4599858495694384, + "learning_rate": 9.243243243243243e-06, + "loss": 0.5474, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.3796151016083495, + "learning_rate": 9.297297297297299e-06, + "loss": 0.5706, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.409777572587561, + "learning_rate": 9.351351351351353e-06, + "loss": 0.6052, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 1.4002947829604957, + "learning_rate": 9.405405405405407e-06, + "loss": 0.5698, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 1.405204434110899, + "learning_rate": 9.45945945945946e-06, + "loss": 0.596, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 1.462514585076768, + "learning_rate": 9.513513513513514e-06, + "loss": 0.62, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 1.7499683079184325, + "learning_rate": 9.567567567567568e-06, + "loss": 0.5936, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 1.4505838663306836, + "learning_rate": 9.621621621621622e-06, + "loss": 0.6323, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 1.2193208841128076, + "learning_rate": 9.675675675675676e-06, + "loss": 0.5516, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 1.5461709008460212, + "learning_rate": 9.729729729729732e-06, + "loss": 0.6565, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.3783256914341504, + "learning_rate": 9.783783783783785e-06, + "loss": 0.6273, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 1.3009129274113025, + "learning_rate": 9.83783783783784e-06, + "loss": 0.5709, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 1.4331042802705694, + "learning_rate": 9.891891891891893e-06, + "loss": 0.6054, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 1.6637836990959596, + "learning_rate": 9.945945945945947e-06, + "loss": 0.5736, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.4268744416818444, + "learning_rate": 1e-05, + "loss": 0.5935, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 1.325262107647321, + "learning_rate": 1.0054054054054055e-05, + "loss": 0.5971, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 1.477582803403984, + "learning_rate": 1.0108108108108109e-05, + "loss": 0.6255, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.3006442007786023, + "learning_rate": 1.0162162162162164e-05, + "loss": 0.5756, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 1.3884979600616545, + "learning_rate": 1.0216216216216216e-05, + "loss": 0.6581, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 1.2998097970689109, + "learning_rate": 1.027027027027027e-05, + "loss": 0.5781, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 1.3950495411925943, + "learning_rate": 1.0324324324324324e-05, + "loss": 0.5968, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.36884172166217, + "learning_rate": 1.037837837837838e-05, + "loss": 0.5746, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.3242805937768347, + "learning_rate": 1.0432432432432433e-05, + "loss": 0.6482, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.387103904758459, + "learning_rate": 1.0486486486486487e-05, + "loss": 0.5908, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.3226413426613004, + "learning_rate": 1.0540540540540541e-05, + "loss": 0.5772, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 1.4041121660539917, + "learning_rate": 1.0594594594594597e-05, + "loss": 0.5884, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.4683717432308583, + "learning_rate": 1.0648648648648649e-05, + "loss": 0.6196, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.3287374075081768, + "learning_rate": 1.0702702702702703e-05, + "loss": 0.5308, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 1.31673514609701, + "learning_rate": 1.0756756756756757e-05, + "loss": 0.5923, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 1.2248739732041203, + "learning_rate": 1.0810810810810812e-05, + "loss": 0.5463, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.4582091435807332, + "learning_rate": 1.0864864864864866e-05, + "loss": 0.5624, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 1.3974633462736783, + "learning_rate": 1.091891891891892e-05, + "loss": 0.5518, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 1.3098255750598122, + "learning_rate": 1.0972972972972974e-05, + "loss": 0.5978, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 1.4020937292739093, + "learning_rate": 1.102702702702703e-05, + "loss": 0.6133, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 1.3006764141808833, + "learning_rate": 1.1081081081081081e-05, + "loss": 0.5437, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 1.3121807130049612, + "learning_rate": 1.1135135135135135e-05, + "loss": 0.5422, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 1.1807616133678631, + "learning_rate": 1.1189189189189189e-05, + "loss": 0.551, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 1.411361256781282, + "learning_rate": 1.1243243243243245e-05, + "loss": 0.594, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.4437358882210654, + "learning_rate": 1.1297297297297298e-05, + "loss": 0.6544, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 1.359156252589416, + "learning_rate": 1.1351351351351352e-05, + "loss": 0.6272, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 1.29652118501433, + "learning_rate": 1.1405405405405404e-05, + "loss": 0.6323, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 1.3097621304296205, + "learning_rate": 1.1459459459459462e-05, + "loss": 0.6287, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 1.2169668382664947, + "learning_rate": 1.1513513513513514e-05, + "loss": 0.5423, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 1.4669873874010895, + "learning_rate": 1.1567567567567568e-05, + "loss": 0.6193, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 1.2954066617571018, + "learning_rate": 1.1621621621621622e-05, + "loss": 0.5883, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 1.160631713695103, + "learning_rate": 1.1675675675675677e-05, + "loss": 0.5929, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 1.34874266847612, + "learning_rate": 1.1729729729729731e-05, + "loss": 0.6387, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 1.439317708700643, + "learning_rate": 1.1783783783783785e-05, + "loss": 0.5649, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 1.502437313426451, + "learning_rate": 1.1837837837837837e-05, + "loss": 0.6514, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 1.3337609956026828, + "learning_rate": 1.1891891891891894e-05, + "loss": 0.5162, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 1.2481859007998901, + "learning_rate": 1.1945945945945946e-05, + "loss": 0.6038, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 1.3067393315538185, + "learning_rate": 1.2e-05, + "loss": 0.5858, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 1.5757684424914125, + "learning_rate": 1.2054054054054054e-05, + "loss": 0.6355, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 1.5277737917360357, + "learning_rate": 1.210810810810811e-05, + "loss": 0.6498, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.3038105939520328, + "learning_rate": 1.2162162162162164e-05, + "loss": 0.5666, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 1.3439900623746683, + "learning_rate": 1.2216216216216217e-05, + "loss": 0.6461, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 1.4499492648511714, + "learning_rate": 1.227027027027027e-05, + "loss": 0.5793, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 1.1895262665587139, + "learning_rate": 1.2324324324324327e-05, + "loss": 0.615, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 1.4385484255224155, + "learning_rate": 1.2378378378378379e-05, + "loss": 0.5863, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 1.2568226128615418, + "learning_rate": 1.2432432432432433e-05, + "loss": 0.5336, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 1.4055308076396922, + "learning_rate": 1.2486486486486487e-05, + "loss": 0.5927, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 1.3604519277780607, + "learning_rate": 1.2540540540540542e-05, + "loss": 0.6365, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 1.3976912250958944, + "learning_rate": 1.2594594594594596e-05, + "loss": 0.6016, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 1.3870555370772695, + "learning_rate": 1.264864864864865e-05, + "loss": 0.6367, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 1.1506464171651247, + "learning_rate": 1.2702702702702702e-05, + "loss": 0.3983, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 1.105610354688429, + "learning_rate": 1.2756756756756758e-05, + "loss": 0.4861, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 1.2922528747586577, + "learning_rate": 1.2810810810810812e-05, + "loss": 0.5429, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 1.4117421331128992, + "learning_rate": 1.2864864864864865e-05, + "loss": 0.7149, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 1.1561171907937822, + "learning_rate": 1.291891891891892e-05, + "loss": 0.5846, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 1.2823597420925605, + "learning_rate": 1.2972972972972975e-05, + "loss": 0.6354, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.331199921173987, + "learning_rate": 1.3027027027027029e-05, + "loss": 0.6929, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 1.3632982831874763, + "learning_rate": 1.3081081081081083e-05, + "loss": 0.6142, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 1.30032739139169, + "learning_rate": 1.3135135135135135e-05, + "loss": 0.6053, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 1.2669107246418034, + "learning_rate": 1.318918918918919e-05, + "loss": 0.6165, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 1.4024543498429707, + "learning_rate": 1.3243243243243244e-05, + "loss": 0.5811, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 1.2282938927207665, + "learning_rate": 1.3297297297297298e-05, + "loss": 0.5824, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 1.2230011147853819, + "learning_rate": 1.3351351351351352e-05, + "loss": 0.5773, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 1.4405143504114788, + "learning_rate": 1.3405405405405407e-05, + "loss": 0.5856, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 1.407496997500501, + "learning_rate": 1.3459459459459461e-05, + "loss": 0.6758, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 1.3481381140295823, + "learning_rate": 1.3513513513513515e-05, + "loss": 0.6091, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 1.3772848449020698, + "learning_rate": 1.3567567567567567e-05, + "loss": 0.6378, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 1.3399199799458354, + "learning_rate": 1.3621621621621623e-05, + "loss": 0.6966, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.418137767899028, + "learning_rate": 1.3675675675675677e-05, + "loss": 0.6543, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 1.3872559456216722, + "learning_rate": 1.372972972972973e-05, + "loss": 0.6297, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 1.4057293135350568, + "learning_rate": 1.3783783783783784e-05, + "loss": 0.6444, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 1.2832126600113147, + "learning_rate": 1.383783783783784e-05, + "loss": 0.5735, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 1.1665636697135973, + "learning_rate": 1.3891891891891894e-05, + "loss": 0.606, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 1.2594265086154244, + "learning_rate": 1.3945945945945946e-05, + "loss": 0.5554, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 1.2758336481579695, + "learning_rate": 1.4e-05, + "loss": 0.5898, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 1.300264913132061, + "learning_rate": 1.4054054054054055e-05, + "loss": 0.6402, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 1.3315419378572004, + "learning_rate": 1.410810810810811e-05, + "loss": 0.548, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 1.219427993604917, + "learning_rate": 1.4162162162162163e-05, + "loss": 0.5622, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 1.1897819364789977, + "learning_rate": 1.4216216216216217e-05, + "loss": 0.6132, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 1.333457147167597, + "learning_rate": 1.4270270270270272e-05, + "loss": 0.6269, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 1.1467620301405113, + "learning_rate": 1.4324324324324326e-05, + "loss": 0.5014, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 1.2858858581704378, + "learning_rate": 1.4378378378378378e-05, + "loss": 0.6116, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 1.425258049531029, + "learning_rate": 1.4432432432432432e-05, + "loss": 0.5601, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 1.1845124721882538, + "learning_rate": 1.4486486486486488e-05, + "loss": 0.5994, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 1.2324371046729103, + "learning_rate": 1.4540540540540542e-05, + "loss": 0.5734, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 1.2291795518358182, + "learning_rate": 1.4594594594594596e-05, + "loss": 0.5702, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 1.2848681461436653, + "learning_rate": 1.464864864864865e-05, + "loss": 0.6426, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 1.3483627165006886, + "learning_rate": 1.4702702702702705e-05, + "loss": 0.6642, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 1.4277089296969299, + "learning_rate": 1.4756756756756759e-05, + "loss": 0.5671, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 1.3093511771695634, + "learning_rate": 1.4810810810810811e-05, + "loss": 0.5995, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 1.2312759429949, + "learning_rate": 1.4864864864864865e-05, + "loss": 0.6446, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 1.398767031842918, + "learning_rate": 1.491891891891892e-05, + "loss": 0.6307, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 1.2914932022710714, + "learning_rate": 1.4972972972972974e-05, + "loss": 0.5712, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 1.2827552834296592, + "learning_rate": 1.5027027027027028e-05, + "loss": 0.6311, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 1.17125498981903, + "learning_rate": 1.5081081081081082e-05, + "loss": 0.5277, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 1.2532330404418561, + "learning_rate": 1.5135135135135138e-05, + "loss": 0.6122, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 1.1771467390533292, + "learning_rate": 1.5189189189189191e-05, + "loss": 0.5551, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 1.3178622763909202, + "learning_rate": 1.5243243243243244e-05, + "loss": 0.6077, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 1.2923685907272977, + "learning_rate": 1.5297297297297297e-05, + "loss": 0.5549, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 1.2778488850748242, + "learning_rate": 1.5351351351351353e-05, + "loss": 0.6552, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 1.8710331713669297, + "learning_rate": 1.540540540540541e-05, + "loss": 0.6568, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 1.288199139989406, + "learning_rate": 1.545945945945946e-05, + "loss": 0.5902, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 1.452547686281634, + "learning_rate": 1.5513513513513513e-05, + "loss": 0.591, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 1.2416649407225506, + "learning_rate": 1.556756756756757e-05, + "loss": 0.6087, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.251564791733786, + "learning_rate": 1.5621621621621624e-05, + "loss": 0.6281, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 1.2801045493742762, + "learning_rate": 1.5675675675675676e-05, + "loss": 0.5922, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 1.2655122609805658, + "learning_rate": 1.572972972972973e-05, + "loss": 0.604, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 1.1773901975659047, + "learning_rate": 1.5783783783783787e-05, + "loss": 0.5544, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 1.1546196570757032, + "learning_rate": 1.583783783783784e-05, + "loss": 0.5362, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 1.3076851584325124, + "learning_rate": 1.589189189189189e-05, + "loss": 0.5621, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 1.2826279531334102, + "learning_rate": 1.5945945945945947e-05, + "loss": 0.5728, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.282623769972093, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.5627, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 1.3408958446169106, + "learning_rate": 1.6054054054054055e-05, + "loss": 0.611, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 1.2348334073578877, + "learning_rate": 1.610810810810811e-05, + "loss": 0.6068, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 1.2975389815390468, + "learning_rate": 1.6162162162162163e-05, + "loss": 0.6597, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 1.2096404489865318, + "learning_rate": 1.6216216216216218e-05, + "loss": 0.5774, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.1813743688009573, + "learning_rate": 1.6270270270270274e-05, + "loss": 0.5426, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.1681240565989068, + "learning_rate": 1.6324324324324326e-05, + "loss": 0.5872, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 1.3077141773660117, + "learning_rate": 1.6378378378378378e-05, + "loss": 0.5684, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 1.302528145958617, + "learning_rate": 1.6432432432432434e-05, + "loss": 0.6124, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 1.0758705697342368, + "learning_rate": 1.648648648648649e-05, + "loss": 0.6116, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 1.4131860721941674, + "learning_rate": 1.654054054054054e-05, + "loss": 0.609, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 1.2851791287758079, + "learning_rate": 1.6594594594594597e-05, + "loss": 0.6528, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 1.366338400597577, + "learning_rate": 1.6648648648648652e-05, + "loss": 0.6228, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 1.497776907617428, + "learning_rate": 1.6702702702702704e-05, + "loss": 0.6945, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 1.1602743781779306, + "learning_rate": 1.6756756756756757e-05, + "loss": 0.5984, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 1.4018561667642373, + "learning_rate": 1.6810810810810812e-05, + "loss": 0.6094, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 1.4040754275179608, + "learning_rate": 1.6864864864864868e-05, + "loss": 0.6365, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 1.2110215218082543, + "learning_rate": 1.691891891891892e-05, + "loss": 0.5932, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 1.2478904633212702, + "learning_rate": 1.6972972972972975e-05, + "loss": 0.5906, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 1.219674052199782, + "learning_rate": 1.7027027027027028e-05, + "loss": 0.5978, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 1.1334469527342586, + "learning_rate": 1.7081081081081083e-05, + "loss": 0.3721, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 1.2986498696320505, + "learning_rate": 1.7135135135135135e-05, + "loss": 0.6508, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 1.432617075086687, + "learning_rate": 1.718918918918919e-05, + "loss": 0.6134, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 1.3291322605886884, + "learning_rate": 1.7243243243243243e-05, + "loss": 0.5531, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 1.250068899737447, + "learning_rate": 1.72972972972973e-05, + "loss": 0.6449, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 1.2273279396721362, + "learning_rate": 1.7351351351351354e-05, + "loss": 0.63, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 1.288258537341939, + "learning_rate": 1.7405405405405406e-05, + "loss": 0.614, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 1.207319610961464, + "learning_rate": 1.745945945945946e-05, + "loss": 0.6219, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 1.1602660996710994, + "learning_rate": 1.7513513513513517e-05, + "loss": 0.5795, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.3070055487401167, + "learning_rate": 1.756756756756757e-05, + "loss": 0.7466, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 1.1048493267116712, + "learning_rate": 1.7621621621621622e-05, + "loss": 0.5529, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 1.2579563285152189, + "learning_rate": 1.7675675675675677e-05, + "loss": 0.6631, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 1.3235404147237428, + "learning_rate": 1.7729729729729733e-05, + "loss": 0.6216, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 1.1827495846841425, + "learning_rate": 1.7783783783783785e-05, + "loss": 0.6543, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 1.2601381738043989, + "learning_rate": 1.783783783783784e-05, + "loss": 0.6269, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 1.2430301530941215, + "learning_rate": 1.7891891891891893e-05, + "loss": 0.5677, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 1.2692760414700273, + "learning_rate": 1.7945945945945948e-05, + "loss": 0.6412, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.2001150074501838, + "learning_rate": 1.8e-05, + "loss": 0.5676, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 1.3075487794158436, + "learning_rate": 1.8054054054054056e-05, + "loss": 0.6472, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 1.456133317177465, + "learning_rate": 1.8108108108108108e-05, + "loss": 0.6338, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 1.1939888671400902, + "learning_rate": 1.8162162162162164e-05, + "loss": 0.6184, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.1981881909785101, + "learning_rate": 1.821621621621622e-05, + "loss": 0.5323, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 1.20091981296182, + "learning_rate": 1.827027027027027e-05, + "loss": 0.618, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 1.41989949023168, + "learning_rate": 1.8324324324324324e-05, + "loss": 0.6336, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 1.3139662867417505, + "learning_rate": 1.8378378378378383e-05, + "loss": 0.5965, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.333158999654752, + "learning_rate": 1.8432432432432435e-05, + "loss": 0.6264, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 1.4531220569740995, + "learning_rate": 1.8486486486486487e-05, + "loss": 0.5439, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 1.3131403063055023, + "learning_rate": 1.8540540540540542e-05, + "loss": 0.6107, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 1.3701905301886614, + "learning_rate": 1.8594594594594598e-05, + "loss": 0.6123, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 1.3653791912501723, + "learning_rate": 1.864864864864865e-05, + "loss": 0.6613, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 1.1142972006714638, + "learning_rate": 1.8702702702702706e-05, + "loss": 0.5526, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 1.4299690140309247, + "learning_rate": 1.8756756756756758e-05, + "loss": 0.6966, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 1.2593444176121449, + "learning_rate": 1.8810810810810813e-05, + "loss": 0.5774, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 1.1765124465821895, + "learning_rate": 1.8864864864864866e-05, + "loss": 0.6137, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 1.3065008361798192, + "learning_rate": 1.891891891891892e-05, + "loss": 0.5976, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 1.4087498976721118, + "learning_rate": 1.8972972972972973e-05, + "loss": 0.6289, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 1.2341841476607793, + "learning_rate": 1.902702702702703e-05, + "loss": 0.6139, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 1.328079281513009, + "learning_rate": 1.9081081081081084e-05, + "loss": 0.6233, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 1.2477408998174193, + "learning_rate": 1.9135135135135137e-05, + "loss": 0.6108, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 1.2816918475775614, + "learning_rate": 1.918918918918919e-05, + "loss": 0.6245, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 1.2016024948487902, + "learning_rate": 1.9243243243243244e-05, + "loss": 0.5771, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 1.217828445501427, + "learning_rate": 1.92972972972973e-05, + "loss": 0.578, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 1.9731305095764395, + "learning_rate": 1.9351351351351352e-05, + "loss": 0.4656, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 1.3567485383775508, + "learning_rate": 1.9405405405405408e-05, + "loss": 0.6423, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 1.5361091360932055, + "learning_rate": 1.9459459459459463e-05, + "loss": 0.6508, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 1.1645320502210965, + "learning_rate": 1.9513513513513515e-05, + "loss": 0.5934, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 1.0958112327792273, + "learning_rate": 1.956756756756757e-05, + "loss": 0.575, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 1.3740750676813598, + "learning_rate": 1.9621621621621623e-05, + "loss": 0.6114, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 1.262910997447116, + "learning_rate": 1.967567567567568e-05, + "loss": 0.6542, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 1.3570416080679524, + "learning_rate": 1.972972972972973e-05, + "loss": 0.6802, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 1.1127799363489526, + "learning_rate": 1.9783783783783786e-05, + "loss": 0.5859, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 1.2463121588632178, + "learning_rate": 1.983783783783784e-05, + "loss": 0.6215, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 1.2379252903825178, + "learning_rate": 1.9891891891891894e-05, + "loss": 0.6024, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 1.1412956122460987, + "learning_rate": 1.994594594594595e-05, + "loss": 0.5442, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 1.1359940245417317, + "learning_rate": 2e-05, + "loss": 0.6442, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 1.1800301173417773, + "learning_rate": 1.9999999653504437e-05, + "loss": 0.6515, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 1.2781092218781143, + "learning_rate": 1.9999998614017768e-05, + "loss": 0.6085, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 1.172788213355845, + "learning_rate": 1.999999688154006e-05, + "loss": 0.5846, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 1.2016265117946634, + "learning_rate": 1.999999445607144e-05, + "loss": 0.6501, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 1.1201465902325103, + "learning_rate": 1.9999991337612076e-05, + "loss": 0.6075, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 1.27326710987819, + "learning_rate": 1.9999987526162182e-05, + "loss": 0.7154, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 1.2675834152794216, + "learning_rate": 1.9999983021722023e-05, + "loss": 0.6373, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 1.3527705851821266, + "learning_rate": 1.999997782429191e-05, + "loss": 0.6465, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 1.258748188700293, + "learning_rate": 1.99999719338722e-05, + "loss": 0.6348, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 1.213322715499661, + "learning_rate": 1.999996535046331e-05, + "loss": 0.6418, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 1.3321652569073361, + "learning_rate": 1.999995807406569e-05, + "loss": 0.722, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 1.245916973083892, + "learning_rate": 1.9999950104679847e-05, + "loss": 0.6258, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 1.1799491737624703, + "learning_rate": 1.9999941442306328e-05, + "loss": 0.6077, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 1.187391126708189, + "learning_rate": 1.9999932086945735e-05, + "loss": 0.641, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 1.2541712864332293, + "learning_rate": 1.9999922038598724e-05, + "loss": 0.5921, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 1.316135226338434, + "learning_rate": 1.9999911297265987e-05, + "loss": 0.5714, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 1.2810307718824059, + "learning_rate": 1.999989986294826e-05, + "loss": 0.612, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 1.2079749802503408, + "learning_rate": 1.999988773564635e-05, + "loss": 0.6461, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 1.221100438003557, + "learning_rate": 1.9999874915361083e-05, + "loss": 0.5395, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 1.2321815633471245, + "learning_rate": 1.999986140209336e-05, + "loss": 0.626, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 1.2602679527635106, + "learning_rate": 1.9999847195844104e-05, + "loss": 0.6293, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 1.1795010152484604, + "learning_rate": 1.999983229661431e-05, + "loss": 0.5701, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 1.2089494729842096, + "learning_rate": 1.999981670440501e-05, + "loss": 0.6006, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 1.3261960086679885, + "learning_rate": 1.9999800419217285e-05, + "loss": 0.5857, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 1.258321316496881, + "learning_rate": 1.999978344105226e-05, + "loss": 0.6662, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 1.1705754710200273, + "learning_rate": 1.9999765769911108e-05, + "loss": 0.647, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 1.1891364177409702, + "learning_rate": 1.9999747405795057e-05, + "loss": 0.5929, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 1.2102667886877647, + "learning_rate": 1.9999728348705386e-05, + "loss": 0.6013, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 1.2641137026960239, + "learning_rate": 1.9999708598643405e-05, + "loss": 0.6163, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 1.2937634579523318, + "learning_rate": 1.999968815561049e-05, + "loss": 0.6061, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 1.3553502126852537, + "learning_rate": 1.9999667019608058e-05, + "loss": 0.6649, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 1.1871908937053395, + "learning_rate": 1.999964519063757e-05, + "loss": 0.5435, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 1.2660823944720954, + "learning_rate": 1.999962266870054e-05, + "loss": 0.6907, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 1.2083772436641733, + "learning_rate": 1.9999599453798523e-05, + "loss": 0.6537, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 1.1642945571775338, + "learning_rate": 1.999957554593314e-05, + "loss": 0.6076, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 1.2599482071313974, + "learning_rate": 1.9999550945106038e-05, + "loss": 0.6471, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 1.3562495151348162, + "learning_rate": 1.999952565131893e-05, + "loss": 0.7162, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 1.0991869227445243, + "learning_rate": 1.999949966457356e-05, + "loss": 0.6373, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 1.4761791840477894, + "learning_rate": 1.9999472984871734e-05, + "loss": 0.681, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 1.2778912175636392, + "learning_rate": 1.99994456122153e-05, + "loss": 0.6335, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 1.3259903218416293, + "learning_rate": 1.9999417546606153e-05, + "loss": 0.6059, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 1.30223892918819, + "learning_rate": 1.9999388788046238e-05, + "loss": 0.6614, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 1.2694327949012507, + "learning_rate": 1.9999359336537552e-05, + "loss": 0.5949, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 1.2617795135910619, + "learning_rate": 1.9999329192082132e-05, + "loss": 0.6396, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 1.2451258095266837, + "learning_rate": 1.999929835468207e-05, + "loss": 0.6391, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 1.276674414464419, + "learning_rate": 1.9999266824339502e-05, + "loss": 0.7109, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 1.3589387839394185, + "learning_rate": 1.999923460105661e-05, + "loss": 0.6363, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 1.2702756200783802, + "learning_rate": 1.999920168483563e-05, + "loss": 0.6659, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 1.2708618223956263, + "learning_rate": 1.9999168075678842e-05, + "loss": 0.6351, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 1.2391455379672414, + "learning_rate": 1.999913377358858e-05, + "loss": 0.6541, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 1.1599886274827143, + "learning_rate": 1.999909877856721e-05, + "loss": 0.665, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 1.288184833339696, + "learning_rate": 1.999906309061717e-05, + "loss": 0.6596, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 1.2045342383158713, + "learning_rate": 1.999902670974092e-05, + "loss": 0.6208, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 1.1790516976577494, + "learning_rate": 1.9998989635940996e-05, + "loss": 0.6316, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 1.1881008635767092, + "learning_rate": 1.9998951869219954e-05, + "loss": 0.6216, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 1.2987033612753498, + "learning_rate": 1.9998913409580418e-05, + "loss": 0.641, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 1.1570461964852865, + "learning_rate": 1.9998874257025055e-05, + "loss": 0.6407, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 1.1249351566142112, + "learning_rate": 1.999883441155657e-05, + "loss": 0.6047, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 1.1065769182994274, + "learning_rate": 1.999879387317773e-05, + "loss": 0.6333, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 1.2437237331960347, + "learning_rate": 1.9998752641891347e-05, + "loss": 0.6552, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 1.3090764868276303, + "learning_rate": 1.9998710717700272e-05, + "loss": 0.6934, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 1.4015060478983412, + "learning_rate": 1.9998668100607414e-05, + "loss": 0.6688, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 1.1350594119836854, + "learning_rate": 1.9998624790615722e-05, + "loss": 0.5912, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 1.1184799612791112, + "learning_rate": 1.9998580787728207e-05, + "loss": 0.6357, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 1.2121276795334905, + "learning_rate": 1.9998536091947907e-05, + "loss": 0.6295, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 1.243908237449138, + "learning_rate": 1.999849070327793e-05, + "loss": 0.6512, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 1.1423620436885662, + "learning_rate": 1.9998444621721413e-05, + "loss": 0.5638, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 1.195628182158961, + "learning_rate": 1.9998397847281548e-05, + "loss": 0.6488, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 1.245661950095741, + "learning_rate": 1.9998350379961583e-05, + "loss": 0.6223, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 1.2069261086286653, + "learning_rate": 1.9998302219764806e-05, + "loss": 0.5926, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 1.1260911895558972, + "learning_rate": 1.9998253366694555e-05, + "loss": 0.6861, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 1.1454069464798424, + "learning_rate": 1.9998203820754213e-05, + "loss": 0.6685, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 1.1441632874889192, + "learning_rate": 1.9998153581947216e-05, + "loss": 0.5732, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 1.2796985229602802, + "learning_rate": 1.9998102650277046e-05, + "loss": 0.6827, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 1.259238092044345, + "learning_rate": 1.9998051025747223e-05, + "loss": 0.6325, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 1.1999964484464798, + "learning_rate": 1.999799870836134e-05, + "loss": 0.6663, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 1.2378447884686448, + "learning_rate": 1.999794569812301e-05, + "loss": 0.6847, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 1.180576325266745, + "learning_rate": 1.9997891995035914e-05, + "loss": 0.6204, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 1.1858681540882434, + "learning_rate": 1.9997837599103772e-05, + "loss": 0.6072, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 1.1677804559635043, + "learning_rate": 1.9997782510330352e-05, + "loss": 0.6739, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 1.1158809705378172, + "learning_rate": 1.9997726728719468e-05, + "loss": 0.5778, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 1.2044311500249878, + "learning_rate": 1.9997670254274992e-05, + "loss": 0.6658, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 1.1467332394365353, + "learning_rate": 1.9997613087000833e-05, + "loss": 0.6158, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 1.1366647161281065, + "learning_rate": 1.9997555226900957e-05, + "loss": 0.6127, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 1.3568063589428707, + "learning_rate": 1.9997496673979375e-05, + "loss": 0.6049, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 1.2692474254544384, + "learning_rate": 1.9997437428240136e-05, + "loss": 0.6066, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 1.359777571955867, + "learning_rate": 1.999737748968735e-05, + "loss": 0.7016, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 1.260913107073566, + "learning_rate": 1.9997316858325177e-05, + "loss": 0.6022, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 1.2523330977441332, + "learning_rate": 1.9997255534157814e-05, + "loss": 0.6213, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 1.1390397288959015, + "learning_rate": 1.9997193517189505e-05, + "loss": 0.6717, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 1.3578840331768134, + "learning_rate": 1.9997130807424556e-05, + "loss": 0.6383, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 1.441823833716857, + "learning_rate": 1.999706740486731e-05, + "loss": 0.5689, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 1.2290512975605155, + "learning_rate": 1.999700330952216e-05, + "loss": 0.5764, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 1.1972836304503411, + "learning_rate": 1.9996938521393542e-05, + "loss": 0.6016, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 1.190460722585969, + "learning_rate": 1.9996873040485957e-05, + "loss": 0.6477, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 1.1123058896187992, + "learning_rate": 1.9996806866803937e-05, + "loss": 0.615, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 1.0935102246732809, + "learning_rate": 1.9996740000352068e-05, + "loss": 0.5393, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 1.2682103650008913, + "learning_rate": 1.9996672441134987e-05, + "loss": 0.7129, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 1.1267973119694383, + "learning_rate": 1.999660418915737e-05, + "loss": 0.5966, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 1.2916713629818193, + "learning_rate": 1.9996535244423947e-05, + "loss": 0.5527, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 1.1237915877887323, + "learning_rate": 1.99964656069395e-05, + "loss": 0.5915, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 1.2113708220129957, + "learning_rate": 1.9996395276708856e-05, + "loss": 0.6178, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 1.2780784931519782, + "learning_rate": 1.9996324253736884e-05, + "loss": 0.6967, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 1.2129306566189202, + "learning_rate": 1.999625253802851e-05, + "loss": 0.6319, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 1.2176291595097406, + "learning_rate": 1.99961801295887e-05, + "loss": 0.602, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 1.1928293808417556, + "learning_rate": 1.9996107028422474e-05, + "loss": 0.6077, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 1.4041430903030365, + "learning_rate": 1.9996033234534895e-05, + "loss": 0.6844, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 1.1082709009194491, + "learning_rate": 1.9995958747931083e-05, + "loss": 0.6288, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 1.2591220583252387, + "learning_rate": 1.9995883568616195e-05, + "loss": 0.624, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 1.1472894687965085, + "learning_rate": 1.9995807696595442e-05, + "loss": 0.6267, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 1.282837857275939, + "learning_rate": 1.9995731131874082e-05, + "loss": 0.6904, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 1.2078256390579016, + "learning_rate": 1.9995653874457418e-05, + "loss": 0.6489, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 1.6494002270659436, + "learning_rate": 1.9995575924350813e-05, + "loss": 0.6999, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 1.2895378872657652, + "learning_rate": 1.9995497281559658e-05, + "loss": 0.689, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 1.350853543232127, + "learning_rate": 1.9995417946089407e-05, + "loss": 0.6829, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 1.1943843703640955, + "learning_rate": 1.999533791794556e-05, + "loss": 0.6556, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 1.3439418508960037, + "learning_rate": 1.999525719713366e-05, + "loss": 0.4694, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 1.5174981490776334, + "learning_rate": 1.9995175783659304e-05, + "loss": 0.5545, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 1.1823217060352558, + "learning_rate": 1.999509367752813e-05, + "loss": 0.58, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 1.3439366064000626, + "learning_rate": 1.999501087874583e-05, + "loss": 0.6784, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 1.1052186698208823, + "learning_rate": 1.9994927387318142e-05, + "loss": 0.6275, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 1.1759748514990198, + "learning_rate": 1.9994843203250853e-05, + "loss": 0.6861, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 1.3358621470858025, + "learning_rate": 1.9994758326549794e-05, + "loss": 0.6388, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 1.2111472045537026, + "learning_rate": 1.9994672757220845e-05, + "loss": 0.5908, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 1.3290851723488153, + "learning_rate": 1.9994586495269944e-05, + "loss": 0.6672, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 1.1800279999404348, + "learning_rate": 1.9994499540703062e-05, + "loss": 0.6758, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 1.187891209226948, + "learning_rate": 1.9994411893526226e-05, + "loss": 0.699, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 1.4327628101310725, + "learning_rate": 1.9994323553745515e-05, + "loss": 0.4593, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 1.2353310670768305, + "learning_rate": 1.9994234521367043e-05, + "loss": 0.6498, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 1.217220834744078, + "learning_rate": 1.9994144796396985e-05, + "loss": 0.637, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.1012119323943392, + "learning_rate": 1.9994054378841557e-05, + "loss": 0.5268, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 1.109010150374482, + "learning_rate": 1.999396326870702e-05, + "loss": 0.6481, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 1.3104432477385424, + "learning_rate": 1.99938714659997e-05, + "loss": 0.6435, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 1.0225947862526552, + "learning_rate": 1.9993778970725953e-05, + "loss": 0.5475, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 1.1807483240933856, + "learning_rate": 1.9993685782892184e-05, + "loss": 0.6164, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 1.1036461749929372, + "learning_rate": 1.9993591902504854e-05, + "loss": 0.6136, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 1.2879988587362776, + "learning_rate": 1.9993497329570473e-05, + "loss": 0.6551, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 1.1963291092976445, + "learning_rate": 1.999340206409559e-05, + "loss": 0.6794, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 1.1047181103321142, + "learning_rate": 1.9993306106086808e-05, + "loss": 0.5978, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 1.1150390655450892, + "learning_rate": 1.9993209455550773e-05, + "loss": 0.5817, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 1.233783965559147, + "learning_rate": 1.999311211249419e-05, + "loss": 0.6413, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 1.2297400715670368, + "learning_rate": 1.9993014076923803e-05, + "loss": 0.6877, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 1.2022050419955803, + "learning_rate": 1.9992915348846403e-05, + "loss": 0.6793, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 1.3398503987350763, + "learning_rate": 1.9992815928268832e-05, + "loss": 0.7163, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 1.0694563998559343, + "learning_rate": 1.999271581519798e-05, + "loss": 0.6376, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 1.0402449566393575, + "learning_rate": 1.999261500964079e-05, + "loss": 0.6029, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 1.227880870571574, + "learning_rate": 1.999251351160424e-05, + "loss": 0.5998, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 1.1310312614228673, + "learning_rate": 1.9992411321095366e-05, + "loss": 0.614, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 1.0086715856563895, + "learning_rate": 1.9992308438121253e-05, + "loss": 0.5827, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 1.240841306459145, + "learning_rate": 1.999220486268903e-05, + "loss": 0.5864, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 1.1996928015356496, + "learning_rate": 1.9992100594805866e-05, + "loss": 0.6422, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 1.4282724911086893, + "learning_rate": 1.9991995634479e-05, + "loss": 0.6399, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 1.1202283620976718, + "learning_rate": 1.9991889981715696e-05, + "loss": 0.5842, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 1.206185140830073, + "learning_rate": 1.9991783636523282e-05, + "loss": 0.6642, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 1.090229502573641, + "learning_rate": 1.9991676598909124e-05, + "loss": 0.6654, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 1.3103010963837258, + "learning_rate": 1.999156886888064e-05, + "loss": 0.6399, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 1.243129736124134, + "learning_rate": 1.9991460446445297e-05, + "loss": 0.5981, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 1.2535060220321637, + "learning_rate": 1.9991351331610606e-05, + "loss": 0.6845, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 1.0397220665110378, + "learning_rate": 1.999124152438413e-05, + "loss": 0.6322, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 1.119272269521874, + "learning_rate": 1.9991131024773478e-05, + "loss": 0.6278, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 1.176775643440753, + "learning_rate": 1.9991019832786308e-05, + "loss": 0.625, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 1.161217955544425, + "learning_rate": 1.9990907948430327e-05, + "loss": 0.6539, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 1.1368426586875748, + "learning_rate": 1.999079537171329e-05, + "loss": 0.6275, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 1.338855766945942, + "learning_rate": 1.9990682102642987e-05, + "loss": 0.6615, + "step": 534 + }, + { + "epoch": 0.04, + "grad_norm": 1.1168425819506358, + "learning_rate": 1.9990568141227284e-05, + "loss": 0.6824, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 1.1617301433270104, + "learning_rate": 1.9990453487474067e-05, + "loss": 0.6133, + "step": 536 + }, + { + "epoch": 0.04, + "grad_norm": 1.1564343856241175, + "learning_rate": 1.9990338141391284e-05, + "loss": 0.6341, + "step": 537 + }, + { + "epoch": 0.04, + "grad_norm": 1.2885256022165874, + "learning_rate": 1.9990222102986935e-05, + "loss": 0.6475, + "step": 538 + }, + { + "epoch": 0.04, + "grad_norm": 1.2400580863381165, + "learning_rate": 1.999010537226905e-05, + "loss": 0.5882, + "step": 539 + }, + { + "epoch": 0.04, + "grad_norm": 1.2885791998719807, + "learning_rate": 1.9989987949245725e-05, + "loss": 0.6527, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 1.177901799429232, + "learning_rate": 1.9989869833925094e-05, + "loss": 0.6551, + "step": 541 + }, + { + "epoch": 0.04, + "grad_norm": 1.7818280325226765, + "learning_rate": 1.9989751026315347e-05, + "loss": 0.723, + "step": 542 + }, + { + "epoch": 0.04, + "grad_norm": 1.3058762683856173, + "learning_rate": 1.9989631526424716e-05, + "loss": 0.6873, + "step": 543 + }, + { + "epoch": 0.04, + "grad_norm": 1.150498168071998, + "learning_rate": 1.998951133426148e-05, + "loss": 0.7088, + "step": 544 + }, + { + "epoch": 0.04, + "grad_norm": 1.1648183483500902, + "learning_rate": 1.9989390449833968e-05, + "loss": 0.634, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 1.242716598701499, + "learning_rate": 1.998926887315056e-05, + "loss": 0.6164, + "step": 546 + }, + { + "epoch": 0.04, + "grad_norm": 1.1423813392922217, + "learning_rate": 1.998914660421968e-05, + "loss": 0.6478, + "step": 547 + }, + { + "epoch": 0.04, + "grad_norm": 1.3749975258731517, + "learning_rate": 1.99890236430498e-05, + "loss": 0.618, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 1.1455085795035111, + "learning_rate": 1.9988899989649438e-05, + "loss": 0.5982, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 1.2543874159946773, + "learning_rate": 1.9988775644027172e-05, + "loss": 0.633, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 1.174490222564184, + "learning_rate": 1.9988650606191614e-05, + "loss": 0.6266, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 1.1832096627499953, + "learning_rate": 1.9988524876151425e-05, + "loss": 0.6144, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 1.21550652390854, + "learning_rate": 1.9988398453915322e-05, + "loss": 0.659, + "step": 553 + }, + { + "epoch": 0.05, + "grad_norm": 1.1543957372759528, + "learning_rate": 1.998827133949207e-05, + "loss": 0.6467, + "step": 554 + }, + { + "epoch": 0.05, + "grad_norm": 1.0967112499183245, + "learning_rate": 1.998814353289047e-05, + "loss": 0.6141, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 1.0267622312276812, + "learning_rate": 1.9988015034119385e-05, + "loss": 0.6064, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 1.1774243726685896, + "learning_rate": 1.9987885843187717e-05, + "loss": 0.6174, + "step": 557 + }, + { + "epoch": 0.05, + "grad_norm": 1.0625061356459073, + "learning_rate": 1.9987755960104418e-05, + "loss": 0.6006, + "step": 558 + }, + { + "epoch": 0.05, + "grad_norm": 1.084560871180758, + "learning_rate": 1.9987625384878493e-05, + "loss": 0.6063, + "step": 559 + }, + { + "epoch": 0.05, + "grad_norm": 1.1889027689490759, + "learning_rate": 1.9987494117518986e-05, + "loss": 0.6729, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 1.1335356972818682, + "learning_rate": 1.9987362158034996e-05, + "loss": 0.6896, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 1.4050654834507526, + "learning_rate": 1.9987229506435666e-05, + "loss": 0.5667, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 1.1973334017795196, + "learning_rate": 1.998709616273019e-05, + "loss": 0.6441, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 1.1948117517144539, + "learning_rate": 1.998696212692781e-05, + "loss": 0.7147, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 1.1921590585442832, + "learning_rate": 1.998682739903781e-05, + "loss": 0.6434, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 1.243769075857278, + "learning_rate": 1.9986691979069532e-05, + "loss": 0.6608, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 1.268764077048501, + "learning_rate": 1.9986555867032357e-05, + "loss": 0.6149, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 1.0901318091600034, + "learning_rate": 1.998641906293572e-05, + "loss": 0.6869, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 1.1808960933093477, + "learning_rate": 1.99862815667891e-05, + "loss": 0.6034, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 1.1064884659907313, + "learning_rate": 1.9986143378602026e-05, + "loss": 0.6145, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 1.2629968487327867, + "learning_rate": 1.998600449838407e-05, + "loss": 0.6354, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 1.1098239795799647, + "learning_rate": 1.998586492614486e-05, + "loss": 0.6446, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 1.1172317251983954, + "learning_rate": 1.998572466189407e-05, + "loss": 0.6059, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 1.1776210955533486, + "learning_rate": 1.9985583705641418e-05, + "loss": 0.6194, + "step": 574 + }, + { + "epoch": 0.05, + "grad_norm": 1.1421240125909013, + "learning_rate": 1.9985442057396675e-05, + "loss": 0.5688, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 1.3033078081535983, + "learning_rate": 1.9985299717169654e-05, + "loss": 0.6404, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 1.1946419016655512, + "learning_rate": 1.9985156684970214e-05, + "loss": 0.577, + "step": 577 + }, + { + "epoch": 0.05, + "grad_norm": 1.2188327655634794, + "learning_rate": 1.9985012960808275e-05, + "loss": 0.6336, + "step": 578 + }, + { + "epoch": 0.05, + "grad_norm": 1.2471832078397793, + "learning_rate": 1.9984868544693795e-05, + "loss": 0.6152, + "step": 579 + }, + { + "epoch": 0.05, + "grad_norm": 1.2827823919534669, + "learning_rate": 1.9984723436636785e-05, + "loss": 0.6819, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 1.154753356121878, + "learning_rate": 1.9984577636647292e-05, + "loss": 0.6452, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 1.2501698234663974, + "learning_rate": 1.9984431144735426e-05, + "loss": 0.6386, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 1.3338990753512137, + "learning_rate": 1.998428396091134e-05, + "loss": 0.6606, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 1.1873704742882054, + "learning_rate": 1.9984136085185232e-05, + "loss": 0.591, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 1.166414930342365, + "learning_rate": 1.9983987517567348e-05, + "loss": 0.6735, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 1.1362047762422045, + "learning_rate": 1.998383825806799e-05, + "loss": 0.5989, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 1.1785883612942007, + "learning_rate": 1.9983688306697488e-05, + "loss": 0.6682, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 1.1575253422468532, + "learning_rate": 1.9983537663466244e-05, + "loss": 0.5574, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 1.2171303979561474, + "learning_rate": 1.9983386328384696e-05, + "loss": 0.6653, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 1.1779477719716205, + "learning_rate": 1.998323430146333e-05, + "loss": 0.6468, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 1.0520298733113895, + "learning_rate": 1.9983081582712684e-05, + "loss": 0.5669, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 1.2413891317205863, + "learning_rate": 1.9982928172143337e-05, + "loss": 0.6552, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 1.0805023983063284, + "learning_rate": 1.9982774069765923e-05, + "loss": 0.5755, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 1.2462092352478134, + "learning_rate": 1.9982619275591124e-05, + "loss": 0.6539, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 1.2311934123337847, + "learning_rate": 1.998246378962966e-05, + "loss": 0.6769, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 1.2862807486957082, + "learning_rate": 1.9982307611892314e-05, + "loss": 0.7153, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 1.1501963851498185, + "learning_rate": 1.9982150742389897e-05, + "loss": 0.586, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 1.0185449240780169, + "learning_rate": 1.9981993181133297e-05, + "loss": 0.6049, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 1.390711835950105, + "learning_rate": 1.998183492813342e-05, + "loss": 0.6899, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 1.4417654309382693, + "learning_rate": 1.9981675983401234e-05, + "loss": 0.6118, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 1.0392600350000416, + "learning_rate": 1.9981516346947757e-05, + "loss": 0.5494, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 1.124642591279403, + "learning_rate": 1.998135601878405e-05, + "loss": 0.5698, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 1.1218889954717481, + "learning_rate": 1.9981194998921226e-05, + "loss": 0.6356, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 1.189137467170877, + "learning_rate": 1.9981033287370443e-05, + "loss": 0.6391, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 1.1882587920699188, + "learning_rate": 1.9980870884142906e-05, + "loss": 0.6681, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 1.255753276002731, + "learning_rate": 1.9980707789249866e-05, + "loss": 0.6781, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 1.2023510062253613, + "learning_rate": 1.9980544002702635e-05, + "loss": 0.7198, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 1.054688112742724, + "learning_rate": 1.998037952451255e-05, + "loss": 0.6138, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 1.107499007509298, + "learning_rate": 1.9980214354691022e-05, + "loss": 0.5985, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 1.0589520103441592, + "learning_rate": 1.998004849324949e-05, + "loss": 0.6205, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 1.1975815219590236, + "learning_rate": 1.997988194019945e-05, + "loss": 0.5926, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 1.2336481921456244, + "learning_rate": 1.9979714695552444e-05, + "loss": 0.6479, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 1.0394534716312003, + "learning_rate": 1.997954675932006e-05, + "loss": 0.5713, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 1.1298571096569328, + "learning_rate": 1.997937813151394e-05, + "loss": 0.606, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 1.1594385853901574, + "learning_rate": 1.9979208812145766e-05, + "loss": 0.6047, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 1.0480989117397763, + "learning_rate": 1.9979038801227273e-05, + "loss": 0.6042, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 1.11743684858994, + "learning_rate": 1.9978868098770244e-05, + "loss": 0.629, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 1.0789247877309214, + "learning_rate": 1.9978696704786505e-05, + "loss": 0.7054, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 1.214704874693797, + "learning_rate": 1.9978524619287937e-05, + "loss": 0.6837, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 1.2762010798985541, + "learning_rate": 1.997835184228646e-05, + "loss": 0.6524, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 1.036839257758854, + "learning_rate": 1.9978178373794055e-05, + "loss": 0.6761, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 1.0783286513719492, + "learning_rate": 1.9978004213822736e-05, + "loss": 0.6654, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 1.2133745178374113, + "learning_rate": 1.997782936238458e-05, + "loss": 0.6946, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 1.2098030928382038, + "learning_rate": 1.9977653819491696e-05, + "loss": 0.5995, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 1.1151817187068271, + "learning_rate": 1.9977477585156252e-05, + "loss": 0.586, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 1.3347118405295793, + "learning_rate": 1.9977300659390463e-05, + "loss": 0.6752, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 1.1815468771856832, + "learning_rate": 1.997712304220659e-05, + "loss": 0.5802, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 1.1188855784750988, + "learning_rate": 1.9976944733616935e-05, + "loss": 0.63, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 1.0985965917225944, + "learning_rate": 1.9976765733633866e-05, + "loss": 0.5863, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 1.2046569696689249, + "learning_rate": 1.9976586042269776e-05, + "loss": 0.4522, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 1.1672364801364987, + "learning_rate": 1.9976405659537123e-05, + "loss": 0.6388, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 1.1689539536673572, + "learning_rate": 1.9976224585448407e-05, + "loss": 0.6218, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 1.2637252555860425, + "learning_rate": 1.9976042820016176e-05, + "loss": 0.6143, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 1.1814014645815236, + "learning_rate": 1.997586036325303e-05, + "loss": 0.6382, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 1.222151385844291, + "learning_rate": 1.9975677215171606e-05, + "loss": 0.6453, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 1.0914309372643383, + "learning_rate": 1.9975493375784598e-05, + "loss": 0.6333, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 1.201970236345254, + "learning_rate": 1.997530884510475e-05, + "loss": 0.6586, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 1.054938735797658, + "learning_rate": 1.9975123623144847e-05, + "loss": 0.6288, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 1.3866748760527043, + "learning_rate": 1.9974937709917722e-05, + "loss": 0.6907, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 1.1652988956147512, + "learning_rate": 1.9974751105436266e-05, + "loss": 0.659, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 1.2370818741683758, + "learning_rate": 1.9974563809713406e-05, + "loss": 0.4051, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 1.2982928662518747, + "learning_rate": 1.9974375822762117e-05, + "loss": 0.6157, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 1.0479843603076193, + "learning_rate": 1.9974187144595433e-05, + "loss": 0.4918, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 1.2429258136089865, + "learning_rate": 1.9973997775226424e-05, + "loss": 0.653, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 1.2192984610891346, + "learning_rate": 1.9973807714668224e-05, + "loss": 0.6706, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 1.1251401683601023, + "learning_rate": 1.997361696293399e-05, + "loss": 0.6229, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 1.0708970840194627, + "learning_rate": 1.9973425520036948e-05, + "loss": 0.5686, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 1.1553407807232856, + "learning_rate": 1.9973233385990364e-05, + "loss": 0.6734, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 1.1466860232715055, + "learning_rate": 1.997304056080755e-05, + "loss": 0.5551, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 1.350493965217335, + "learning_rate": 1.9972847044501876e-05, + "loss": 0.5949, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 1.2454267382525472, + "learning_rate": 1.9972652837086746e-05, + "loss": 0.6507, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 1.029352302353856, + "learning_rate": 1.997245793857562e-05, + "loss": 0.5958, + "step": 652 + }, + { + "epoch": 0.05, + "grad_norm": 1.1497968726198924, + "learning_rate": 1.9972262348982e-05, + "loss": 0.67, + "step": 653 + }, + { + "epoch": 0.05, + "grad_norm": 1.088989161085879, + "learning_rate": 1.997206606831945e-05, + "loss": 0.5854, + "step": 654 + }, + { + "epoch": 0.05, + "grad_norm": 1.1790833944364503, + "learning_rate": 1.997186909660157e-05, + "loss": 0.6825, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 1.0384747944909554, + "learning_rate": 1.9971671433842e-05, + "loss": 0.6763, + "step": 656 + }, + { + "epoch": 0.05, + "grad_norm": 1.1213533648567533, + "learning_rate": 1.9971473080054445e-05, + "loss": 0.6894, + "step": 657 + }, + { + "epoch": 0.05, + "grad_norm": 1.1609933428042643, + "learning_rate": 1.9971274035252653e-05, + "loss": 0.5707, + "step": 658 + }, + { + "epoch": 0.05, + "grad_norm": 1.2081990423276776, + "learning_rate": 1.9971074299450414e-05, + "loss": 0.6806, + "step": 659 + }, + { + "epoch": 0.05, + "grad_norm": 1.0724208806076496, + "learning_rate": 1.9970873872661567e-05, + "loss": 0.5806, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 1.0610754332454024, + "learning_rate": 1.997067275490001e-05, + "loss": 0.6022, + "step": 661 + }, + { + "epoch": 0.05, + "grad_norm": 1.1629971401024213, + "learning_rate": 1.997047094617967e-05, + "loss": 0.6024, + "step": 662 + }, + { + "epoch": 0.05, + "grad_norm": 1.2436199105527355, + "learning_rate": 1.9970268446514543e-05, + "loss": 0.6317, + "step": 663 + }, + { + "epoch": 0.05, + "grad_norm": 1.1442220153552487, + "learning_rate": 1.997006525591865e-05, + "loss": 0.6532, + "step": 664 + }, + { + "epoch": 0.05, + "grad_norm": 1.3021218257030307, + "learning_rate": 1.9969861374406086e-05, + "loss": 0.6593, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 1.1111386090671853, + "learning_rate": 1.9969656801990967e-05, + "loss": 0.6495, + "step": 666 + }, + { + "epoch": 0.05, + "grad_norm": 1.1031034022345894, + "learning_rate": 1.9969451538687474e-05, + "loss": 0.6161, + "step": 667 + }, + { + "epoch": 0.05, + "grad_norm": 1.0933845844784758, + "learning_rate": 1.9969245584509832e-05, + "loss": 0.5477, + "step": 668 + }, + { + "epoch": 0.05, + "grad_norm": 1.1712771182304644, + "learning_rate": 1.9969038939472315e-05, + "loss": 0.6338, + "step": 669 + }, + { + "epoch": 0.05, + "grad_norm": 1.2155964784926518, + "learning_rate": 1.9968831603589243e-05, + "loss": 0.6716, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 1.1244215186538657, + "learning_rate": 1.9968623576874984e-05, + "loss": 0.6159, + "step": 671 + }, + { + "epoch": 0.05, + "grad_norm": 1.11184462359373, + "learning_rate": 1.996841485934395e-05, + "loss": 0.5756, + "step": 672 + }, + { + "epoch": 0.05, + "grad_norm": 1.136388764225708, + "learning_rate": 1.996820545101061e-05, + "loss": 0.6362, + "step": 673 + }, + { + "epoch": 0.05, + "grad_norm": 1.2594325771585668, + "learning_rate": 1.9967995351889476e-05, + "loss": 0.6782, + "step": 674 + }, + { + "epoch": 0.05, + "grad_norm": 1.077894031554686, + "learning_rate": 1.9967784561995103e-05, + "loss": 0.5933, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 1.085391720228807, + "learning_rate": 1.9967573081342103e-05, + "loss": 0.6247, + "step": 676 + }, + { + "epoch": 0.06, + "grad_norm": 1.092634209209323, + "learning_rate": 1.996736090994513e-05, + "loss": 0.5373, + "step": 677 + }, + { + "epoch": 0.06, + "grad_norm": 1.198269526162391, + "learning_rate": 1.9967148047818884e-05, + "loss": 0.5686, + "step": 678 + }, + { + "epoch": 0.06, + "grad_norm": 1.2183730301752702, + "learning_rate": 1.996693449497812e-05, + "loss": 0.6678, + "step": 679 + }, + { + "epoch": 0.06, + "grad_norm": 1.1399175734048983, + "learning_rate": 1.9966720251437635e-05, + "loss": 0.6017, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 1.0273161496250076, + "learning_rate": 1.996650531721228e-05, + "loss": 0.5534, + "step": 681 + }, + { + "epoch": 0.06, + "grad_norm": 1.047089911204804, + "learning_rate": 1.9966289692316944e-05, + "loss": 0.5956, + "step": 682 + }, + { + "epoch": 0.06, + "grad_norm": 1.217569905753707, + "learning_rate": 1.9966073376766575e-05, + "loss": 0.6095, + "step": 683 + }, + { + "epoch": 0.06, + "grad_norm": 1.1145304786124874, + "learning_rate": 1.9965856370576163e-05, + "loss": 0.613, + "step": 684 + }, + { + "epoch": 0.06, + "grad_norm": 1.1977722179345607, + "learning_rate": 1.9965638673760738e-05, + "loss": 0.6071, + "step": 685 + }, + { + "epoch": 0.06, + "grad_norm": 1.2160224513488096, + "learning_rate": 1.9965420286335397e-05, + "loss": 0.6376, + "step": 686 + }, + { + "epoch": 0.06, + "grad_norm": 1.3450712803411724, + "learning_rate": 1.996520120831527e-05, + "loss": 0.6046, + "step": 687 + }, + { + "epoch": 0.06, + "grad_norm": 1.1398839262388, + "learning_rate": 1.9964981439715532e-05, + "loss": 0.5862, + "step": 688 + }, + { + "epoch": 0.06, + "grad_norm": 1.1615547015849625, + "learning_rate": 1.9964760980551428e-05, + "loss": 0.5632, + "step": 689 + }, + { + "epoch": 0.06, + "grad_norm": 1.1095424326717258, + "learning_rate": 1.996453983083822e-05, + "loss": 0.6577, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 1.1245069632159537, + "learning_rate": 1.9964317990591243e-05, + "loss": 0.6358, + "step": 691 + }, + { + "epoch": 0.06, + "grad_norm": 1.0739036008737486, + "learning_rate": 1.9964095459825866e-05, + "loss": 0.6257, + "step": 692 + }, + { + "epoch": 0.06, + "grad_norm": 1.0889267792227462, + "learning_rate": 1.9963872238557516e-05, + "loss": 0.6612, + "step": 693 + }, + { + "epoch": 0.06, + "grad_norm": 1.101405073966227, + "learning_rate": 1.9963648326801653e-05, + "loss": 0.6636, + "step": 694 + }, + { + "epoch": 0.06, + "grad_norm": 1.061467009295685, + "learning_rate": 1.99634237245738e-05, + "loss": 0.4699, + "step": 695 + }, + { + "epoch": 0.06, + "grad_norm": 1.0916752453273684, + "learning_rate": 1.9963198431889523e-05, + "loss": 0.5637, + "step": 696 + }, + { + "epoch": 0.06, + "grad_norm": 1.0034377408449018, + "learning_rate": 1.996297244876443e-05, + "loss": 0.5266, + "step": 697 + }, + { + "epoch": 0.06, + "grad_norm": 1.1009331610859694, + "learning_rate": 1.9962745775214187e-05, + "loss": 0.6557, + "step": 698 + }, + { + "epoch": 0.06, + "grad_norm": 1.006040563282083, + "learning_rate": 1.9962518411254493e-05, + "loss": 0.571, + "step": 699 + }, + { + "epoch": 0.06, + "grad_norm": 1.2442846642824592, + "learning_rate": 1.996229035690111e-05, + "loss": 0.6507, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 1.0822018700447882, + "learning_rate": 1.9962061612169844e-05, + "loss": 0.648, + "step": 701 + }, + { + "epoch": 0.06, + "grad_norm": 1.2097081161212462, + "learning_rate": 1.9961832177076544e-05, + "loss": 0.6196, + "step": 702 + }, + { + "epoch": 0.06, + "grad_norm": 1.2030315390368393, + "learning_rate": 1.996160205163711e-05, + "loss": 0.6099, + "step": 703 + }, + { + "epoch": 0.06, + "grad_norm": 1.1297382460923315, + "learning_rate": 1.9961371235867494e-05, + "loss": 0.6596, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 1.1674304329946574, + "learning_rate": 1.9961139729783683e-05, + "loss": 0.6333, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 1.2304187056475613, + "learning_rate": 1.9960907533401722e-05, + "loss": 0.6619, + "step": 706 + }, + { + "epoch": 0.06, + "grad_norm": 1.1425239340299234, + "learning_rate": 1.996067464673771e-05, + "loss": 0.5948, + "step": 707 + }, + { + "epoch": 0.06, + "grad_norm": 1.2514604903606674, + "learning_rate": 1.9960441069807778e-05, + "loss": 0.6098, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 1.1006930003492308, + "learning_rate": 1.996020680262811e-05, + "loss": 0.6225, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 1.0897684407070278, + "learning_rate": 1.9959971845214953e-05, + "loss": 0.6381, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 1.0351947993478245, + "learning_rate": 1.9959736197584577e-05, + "loss": 0.5927, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 1.2403363792215993, + "learning_rate": 1.9959499859753317e-05, + "loss": 0.647, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 1.1981011259553083, + "learning_rate": 1.995926283173755e-05, + "loss": 0.6881, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 0.9841149549694405, + "learning_rate": 1.9959025113553706e-05, + "loss": 0.6494, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 1.0600192658138001, + "learning_rate": 1.9958786705218254e-05, + "loss": 0.6265, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 1.2811772952743912, + "learning_rate": 1.9958547606747715e-05, + "loss": 0.5239, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 1.1521958975518616, + "learning_rate": 1.9958307818158662e-05, + "loss": 0.6341, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 1.1179962043127805, + "learning_rate": 1.995806733946771e-05, + "loss": 0.6329, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 1.0146153054249425, + "learning_rate": 1.995782617069152e-05, + "loss": 0.5976, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 1.1645750270250357, + "learning_rate": 1.9957584311846814e-05, + "loss": 0.576, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 1.2491529995294561, + "learning_rate": 1.9957341762950346e-05, + "loss": 0.7102, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 1.1764586868629787, + "learning_rate": 1.9957098524018925e-05, + "loss": 0.6875, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 1.1389846638956975, + "learning_rate": 1.995685459506941e-05, + "loss": 0.6148, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 1.057344796664354, + "learning_rate": 1.9956609976118704e-05, + "loss": 0.5285, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 1.246786646040238, + "learning_rate": 1.9956364667183755e-05, + "loss": 0.5909, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 1.2410890769714888, + "learning_rate": 1.9956118668281568e-05, + "loss": 0.6763, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 1.1142586997360597, + "learning_rate": 1.9955871979429188e-05, + "loss": 0.6469, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 1.1276135732050507, + "learning_rate": 1.9955624600643712e-05, + "loss": 0.6592, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 1.066854522714197, + "learning_rate": 1.9955376531942278e-05, + "loss": 0.5611, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 1.08373527229625, + "learning_rate": 1.9955127773342086e-05, + "loss": 0.5927, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 1.2196226278884847, + "learning_rate": 1.9954878324860365e-05, + "loss": 0.6232, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 1.1409979528374246, + "learning_rate": 1.995462818651441e-05, + "loss": 0.6322, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 1.074608252737145, + "learning_rate": 1.9954377358321547e-05, + "loss": 0.5813, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 1.0325700029995777, + "learning_rate": 1.9954125840299165e-05, + "loss": 0.4502, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 1.1297723614367152, + "learning_rate": 1.995387363246469e-05, + "loss": 0.6199, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 1.0916886554647816, + "learning_rate": 1.9953620734835603e-05, + "loss": 0.6186, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 1.1978421868431721, + "learning_rate": 1.995336714742943e-05, + "loss": 0.6573, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 1.146234644774168, + "learning_rate": 1.9953112870263737e-05, + "loss": 0.6158, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 1.0833865452731306, + "learning_rate": 1.9952857903356155e-05, + "loss": 0.6236, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 1.0277210942729749, + "learning_rate": 1.9952602246724348e-05, + "loss": 0.6171, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 1.1046435282049112, + "learning_rate": 1.995234590038603e-05, + "loss": 0.6509, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 1.187550009575104, + "learning_rate": 1.995208886435897e-05, + "loss": 0.6832, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 1.1013173491692863, + "learning_rate": 1.995183113866098e-05, + "loss": 0.6326, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 1.0349313608491895, + "learning_rate": 1.9951572723309918e-05, + "loss": 0.5911, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 1.13623690688545, + "learning_rate": 1.9951313618323696e-05, + "loss": 0.6131, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 1.05623891586562, + "learning_rate": 1.9951053823720267e-05, + "loss": 0.6311, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 1.098775832907385, + "learning_rate": 1.9950793339517632e-05, + "loss": 0.6321, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 1.0964712573130224, + "learning_rate": 1.9950532165733847e-05, + "loss": 0.6329, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 1.1245652087511098, + "learning_rate": 1.995027030238701e-05, + "loss": 0.6432, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 1.2375929509587618, + "learning_rate": 1.9950007749495263e-05, + "loss": 0.611, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 1.080590540348563, + "learning_rate": 1.9949744507076806e-05, + "loss": 0.6639, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 1.106532099604561, + "learning_rate": 1.994948057514988e-05, + "loss": 0.6232, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 1.0356971574762561, + "learning_rate": 1.994921595373278e-05, + "loss": 0.5809, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 1.2054321702962159, + "learning_rate": 1.9948950642843836e-05, + "loss": 0.6959, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 1.1866427603796048, + "learning_rate": 1.9948684642501433e-05, + "loss": 0.5915, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 1.1568127923932405, + "learning_rate": 1.9948417952724014e-05, + "loss": 0.6201, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 1.0679478432065364, + "learning_rate": 1.9948150573530054e-05, + "loss": 0.6118, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 1.2389782363966395, + "learning_rate": 1.994788250493808e-05, + "loss": 0.6178, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 1.1320990625121634, + "learning_rate": 1.9947613746966678e-05, + "loss": 0.5965, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 1.055659570175554, + "learning_rate": 1.9947344299634464e-05, + "loss": 0.6197, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 1.0158590766537203, + "learning_rate": 1.9947074162960113e-05, + "loss": 0.6052, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 1.213723272660111, + "learning_rate": 1.9946803336962346e-05, + "loss": 0.5871, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 1.1870814088915298, + "learning_rate": 1.994653182165993e-05, + "loss": 0.6191, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 1.2498400851275093, + "learning_rate": 1.994625961707168e-05, + "loss": 0.7248, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 1.1683672596182015, + "learning_rate": 1.9945986723216463e-05, + "loss": 0.5657, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 1.1686459028088645, + "learning_rate": 1.9945713140113188e-05, + "loss": 0.6563, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 1.174814353567215, + "learning_rate": 1.9945438867780814e-05, + "loss": 0.5865, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 1.1195506080858777, + "learning_rate": 1.9945163906238347e-05, + "loss": 0.6169, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 1.2211466830279876, + "learning_rate": 1.9944888255504846e-05, + "loss": 0.6599, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 1.098094186565308, + "learning_rate": 1.994461191559941e-05, + "loss": 0.5711, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 1.0342907512334356, + "learning_rate": 1.9944334886541184e-05, + "loss": 0.6, + "step": 771 + }, + { + "epoch": 0.06, + "grad_norm": 1.1118274230511065, + "learning_rate": 1.9944057168349374e-05, + "loss": 0.5939, + "step": 772 + }, + { + "epoch": 0.06, + "grad_norm": 1.0468608125121714, + "learning_rate": 1.9943778761043223e-05, + "loss": 0.5908, + "step": 773 + }, + { + "epoch": 0.06, + "grad_norm": 1.0957138400050503, + "learning_rate": 1.994349966464202e-05, + "loss": 0.6157, + "step": 774 + }, + { + "epoch": 0.06, + "grad_norm": 1.1635606740555189, + "learning_rate": 1.9943219879165113e-05, + "loss": 0.6674, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 1.0570904827261942, + "learning_rate": 1.9942939404631893e-05, + "loss": 0.5812, + "step": 776 + }, + { + "epoch": 0.06, + "grad_norm": 1.1047314581338863, + "learning_rate": 1.9942658241061785e-05, + "loss": 0.6725, + "step": 777 + }, + { + "epoch": 0.06, + "grad_norm": 1.1209323619076677, + "learning_rate": 1.9942376388474282e-05, + "loss": 0.6099, + "step": 778 + }, + { + "epoch": 0.06, + "grad_norm": 1.0531072607859158, + "learning_rate": 1.9942093846888912e-05, + "loss": 0.654, + "step": 779 + }, + { + "epoch": 0.06, + "grad_norm": 1.1245284813161793, + "learning_rate": 1.9941810616325262e-05, + "loss": 0.6879, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 1.2806531327448978, + "learning_rate": 1.994152669680295e-05, + "loss": 0.6672, + "step": 781 + }, + { + "epoch": 0.06, + "grad_norm": 1.1367446672914951, + "learning_rate": 1.994124208834166e-05, + "loss": 0.6449, + "step": 782 + }, + { + "epoch": 0.06, + "grad_norm": 1.123488436414863, + "learning_rate": 1.9940956790961108e-05, + "loss": 0.6283, + "step": 783 + }, + { + "epoch": 0.06, + "grad_norm": 1.1621898434373996, + "learning_rate": 1.9940670804681068e-05, + "loss": 0.6227, + "step": 784 + }, + { + "epoch": 0.06, + "grad_norm": 1.1273964782620118, + "learning_rate": 1.994038412952136e-05, + "loss": 0.6902, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 1.1176649212729894, + "learning_rate": 1.994009676550185e-05, + "loss": 0.6174, + "step": 786 + }, + { + "epoch": 0.06, + "grad_norm": 1.1072199332026769, + "learning_rate": 1.993980871264245e-05, + "loss": 0.6046, + "step": 787 + }, + { + "epoch": 0.06, + "grad_norm": 1.0486363905651077, + "learning_rate": 1.993951997096312e-05, + "loss": 0.5663, + "step": 788 + }, + { + "epoch": 0.06, + "grad_norm": 1.033014159421138, + "learning_rate": 1.9939230540483873e-05, + "loss": 0.5875, + "step": 789 + }, + { + "epoch": 0.06, + "grad_norm": 1.1065308629200603, + "learning_rate": 1.9938940421224768e-05, + "loss": 0.6169, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 1.0786912963350155, + "learning_rate": 1.9938649613205907e-05, + "loss": 0.6306, + "step": 791 + }, + { + "epoch": 0.06, + "grad_norm": 1.129046654293702, + "learning_rate": 1.9938358116447444e-05, + "loss": 0.631, + "step": 792 + }, + { + "epoch": 0.06, + "grad_norm": 0.9863221302179781, + "learning_rate": 1.9938065930969578e-05, + "loss": 0.5559, + "step": 793 + }, + { + "epoch": 0.06, + "grad_norm": 1.089501085757703, + "learning_rate": 1.993777305679256e-05, + "loss": 0.6775, + "step": 794 + }, + { + "epoch": 0.06, + "grad_norm": 1.2022727340828454, + "learning_rate": 1.993747949393668e-05, + "loss": 0.6469, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 1.1368424852076264, + "learning_rate": 1.993718524242229e-05, + "loss": 0.6475, + "step": 796 + }, + { + "epoch": 0.06, + "grad_norm": 1.0820497403961937, + "learning_rate": 1.9936890302269773e-05, + "loss": 0.5831, + "step": 797 + }, + { + "epoch": 0.06, + "grad_norm": 1.0898352110514833, + "learning_rate": 1.9936594673499578e-05, + "loss": 0.6381, + "step": 798 + }, + { + "epoch": 0.06, + "grad_norm": 1.2294389908535341, + "learning_rate": 1.993629835613218e-05, + "loss": 0.4407, + "step": 799 + }, + { + "epoch": 0.07, + "grad_norm": 1.0918414115098216, + "learning_rate": 1.993600135018812e-05, + "loss": 0.6081, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 0.9537633730917028, + "learning_rate": 1.9935703655687982e-05, + "loss": 0.5166, + "step": 801 + }, + { + "epoch": 0.07, + "grad_norm": 1.0660382716002714, + "learning_rate": 1.993540527265239e-05, + "loss": 0.6609, + "step": 802 + }, + { + "epoch": 0.07, + "grad_norm": 1.1379226647047838, + "learning_rate": 1.9935106201102032e-05, + "loss": 0.6515, + "step": 803 + }, + { + "epoch": 0.07, + "grad_norm": 1.0589300124561416, + "learning_rate": 1.993480644105762e-05, + "loss": 0.5814, + "step": 804 + }, + { + "epoch": 0.07, + "grad_norm": 1.0175478881295723, + "learning_rate": 1.9934505992539934e-05, + "loss": 0.5841, + "step": 805 + }, + { + "epoch": 0.07, + "grad_norm": 1.1396476742438806, + "learning_rate": 1.99342048555698e-05, + "loss": 0.6405, + "step": 806 + }, + { + "epoch": 0.07, + "grad_norm": 1.1621349884764827, + "learning_rate": 1.9933903030168075e-05, + "loss": 0.7015, + "step": 807 + }, + { + "epoch": 0.07, + "grad_norm": 1.0632458504231408, + "learning_rate": 1.9933600516355684e-05, + "loss": 0.6147, + "step": 808 + }, + { + "epoch": 0.07, + "grad_norm": 1.06885761464636, + "learning_rate": 1.9933297314153593e-05, + "loss": 0.6383, + "step": 809 + }, + { + "epoch": 0.07, + "grad_norm": 1.0962532335479325, + "learning_rate": 1.99329934235828e-05, + "loss": 0.5765, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 1.1464346102920349, + "learning_rate": 1.993268884466438e-05, + "loss": 0.5723, + "step": 811 + }, + { + "epoch": 0.07, + "grad_norm": 1.2509669034212627, + "learning_rate": 1.9932383577419432e-05, + "loss": 0.5771, + "step": 812 + }, + { + "epoch": 0.07, + "grad_norm": 1.0414712734658653, + "learning_rate": 1.9932077621869112e-05, + "loss": 0.6552, + "step": 813 + }, + { + "epoch": 0.07, + "grad_norm": 1.2035876318057512, + "learning_rate": 1.993177097803462e-05, + "loss": 0.6744, + "step": 814 + }, + { + "epoch": 0.07, + "grad_norm": 1.118292884144844, + "learning_rate": 1.993146364593721e-05, + "loss": 0.6673, + "step": 815 + }, + { + "epoch": 0.07, + "grad_norm": 1.2133348072449912, + "learning_rate": 1.993115562559818e-05, + "loss": 0.6326, + "step": 816 + }, + { + "epoch": 0.07, + "grad_norm": 1.054058689703446, + "learning_rate": 1.9930846917038873e-05, + "loss": 0.6147, + "step": 817 + }, + { + "epoch": 0.07, + "grad_norm": 1.2012351073159693, + "learning_rate": 1.9930537520280684e-05, + "loss": 0.6592, + "step": 818 + }, + { + "epoch": 0.07, + "grad_norm": 1.164198742439426, + "learning_rate": 1.9930227435345053e-05, + "loss": 0.6111, + "step": 819 + }, + { + "epoch": 0.07, + "grad_norm": 1.0662066688946203, + "learning_rate": 1.992991666225347e-05, + "loss": 0.5349, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 0.999433802047172, + "learning_rate": 1.9929605201027468e-05, + "loss": 0.6664, + "step": 821 + }, + { + "epoch": 0.07, + "grad_norm": 1.1356669429058914, + "learning_rate": 1.9929293051688634e-05, + "loss": 0.5944, + "step": 822 + }, + { + "epoch": 0.07, + "grad_norm": 1.0915185271313848, + "learning_rate": 1.9928980214258597e-05, + "loss": 0.6321, + "step": 823 + }, + { + "epoch": 0.07, + "grad_norm": 1.1512780011586605, + "learning_rate": 1.992866668875904e-05, + "loss": 0.6711, + "step": 824 + }, + { + "epoch": 0.07, + "grad_norm": 1.211306666748613, + "learning_rate": 1.992835247521169e-05, + "loss": 0.6296, + "step": 825 + }, + { + "epoch": 0.07, + "grad_norm": 1.1157941995912357, + "learning_rate": 1.9928037573638316e-05, + "loss": 0.6819, + "step": 826 + }, + { + "epoch": 0.07, + "grad_norm": 1.1529125372179114, + "learning_rate": 1.9927721984060747e-05, + "loss": 0.6221, + "step": 827 + }, + { + "epoch": 0.07, + "grad_norm": 1.172281554294927, + "learning_rate": 1.992740570650085e-05, + "loss": 0.662, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 1.150850811381565, + "learning_rate": 1.992708874098054e-05, + "loss": 0.683, + "step": 829 + }, + { + "epoch": 0.07, + "grad_norm": 1.0835533671026325, + "learning_rate": 1.992677108752179e-05, + "loss": 0.6373, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 1.1311275755980512, + "learning_rate": 1.9926452746146605e-05, + "loss": 0.6024, + "step": 831 + }, + { + "epoch": 0.07, + "grad_norm": 1.0663614825308445, + "learning_rate": 1.992613371687705e-05, + "loss": 0.4717, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 1.122966022638328, + "learning_rate": 1.9925813999735238e-05, + "loss": 0.6189, + "step": 833 + }, + { + "epoch": 0.07, + "grad_norm": 1.2502857431361363, + "learning_rate": 1.992549359474332e-05, + "loss": 0.6694, + "step": 834 + }, + { + "epoch": 0.07, + "grad_norm": 0.9883676453300628, + "learning_rate": 1.992517250192349e-05, + "loss": 0.6629, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 1.0843568044317529, + "learning_rate": 1.9924850721298017e-05, + "loss": 0.5797, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 1.1302426860122095, + "learning_rate": 1.992452825288919e-05, + "loss": 0.6683, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 1.1279702636357174, + "learning_rate": 1.992420509671936e-05, + "loss": 0.6624, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 1.1548522444197566, + "learning_rate": 1.9923881252810917e-05, + "loss": 0.6606, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 1.2189793417281167, + "learning_rate": 1.9923556721186308e-05, + "loss": 0.6906, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 1.0057515507994603, + "learning_rate": 1.9923231501868018e-05, + "loss": 0.6407, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 1.03480201906667, + "learning_rate": 1.992290559487859e-05, + "loss": 0.6212, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 1.2312157692437504, + "learning_rate": 1.9922579000240602e-05, + "loss": 0.6539, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 1.1510651239803475, + "learning_rate": 1.9922251717976697e-05, + "loss": 0.6423, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 1.228234587897638, + "learning_rate": 1.992192374810954e-05, + "loss": 0.6523, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 1.3035456156567933, + "learning_rate": 1.9921595090661872e-05, + "loss": 0.6438, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 1.125010510695676, + "learning_rate": 1.9921265745656466e-05, + "loss": 0.6128, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 1.0702335652144521, + "learning_rate": 1.9920935713116144e-05, + "loss": 0.6311, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 1.1455341862560653, + "learning_rate": 1.9920604993063777e-05, + "loss": 0.6406, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 1.203676850522986, + "learning_rate": 1.992027358552228e-05, + "loss": 0.7314, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 1.0271676262087595, + "learning_rate": 1.991994149051463e-05, + "loss": 0.5971, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 1.205644875995286, + "learning_rate": 1.9919608708063826e-05, + "loss": 0.6107, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 1.2046978136025066, + "learning_rate": 1.991927523819294e-05, + "loss": 0.6406, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 1.0945845324462102, + "learning_rate": 1.991894108092508e-05, + "loss": 0.5819, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 1.1130735560807379, + "learning_rate": 1.99186062362834e-05, + "loss": 0.6191, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 1.0389898930015686, + "learning_rate": 1.9918270704291104e-05, + "loss": 0.6716, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 1.2716474232262376, + "learning_rate": 1.991793448497145e-05, + "loss": 0.6004, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 1.0389131172541453, + "learning_rate": 1.991759757834773e-05, + "loss": 0.5947, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 1.0950374146231157, + "learning_rate": 1.9917259984443295e-05, + "loss": 0.6049, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 1.0364117726822706, + "learning_rate": 1.991692170328154e-05, + "loss": 0.6233, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 1.1011938895926205, + "learning_rate": 1.9916582734885906e-05, + "loss": 0.7002, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 1.061748573780894, + "learning_rate": 1.991624307927989e-05, + "loss": 0.5697, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 1.0460051979027172, + "learning_rate": 1.991590273648702e-05, + "loss": 0.607, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 1.0497730613084617, + "learning_rate": 1.9915561706530882e-05, + "loss": 0.5858, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 1.0650382321114669, + "learning_rate": 1.9915219989435117e-05, + "loss": 0.6258, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 1.0583106709875194, + "learning_rate": 1.9914877585223403e-05, + "loss": 0.6135, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 1.073826426367363, + "learning_rate": 1.9914534493919464e-05, + "loss": 0.6618, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 1.0532675508825666, + "learning_rate": 1.991419071554708e-05, + "loss": 0.6091, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 1.126468696862115, + "learning_rate": 1.9913846250130074e-05, + "loss": 0.6568, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 1.2196257672975, + "learning_rate": 1.9913501097692312e-05, + "loss": 0.6105, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 1.1475489520819309, + "learning_rate": 1.9913155258257724e-05, + "loss": 0.5912, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 1.1808552294384747, + "learning_rate": 1.9912808731850265e-05, + "loss": 0.6991, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 1.1584108242615976, + "learning_rate": 1.991246151849396e-05, + "loss": 0.6977, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 1.0289436915103616, + "learning_rate": 1.9912113618212852e-05, + "loss": 0.6501, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 1.0439752354711855, + "learning_rate": 1.991176503103107e-05, + "loss": 0.6131, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 1.1188632994545695, + "learning_rate": 1.9911415756972764e-05, + "loss": 0.6592, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 1.2134771338604364, + "learning_rate": 1.9911065796062137e-05, + "loss": 0.6363, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 1.1070321810757595, + "learning_rate": 1.9910715148323438e-05, + "loss": 0.6432, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 1.1159201260679672, + "learning_rate": 1.9910363813780975e-05, + "loss": 0.5572, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 1.082698575626692, + "learning_rate": 1.9910011792459086e-05, + "loss": 0.5655, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 1.0551268128565945, + "learning_rate": 1.9909659084382172e-05, + "loss": 0.5494, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 1.1463091432748589, + "learning_rate": 1.9909305689574672e-05, + "loss": 0.6349, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 1.0274960646007008, + "learning_rate": 1.9908951608061078e-05, + "loss": 0.6146, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 1.0541607414609717, + "learning_rate": 1.9908596839865927e-05, + "loss": 0.5856, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 1.416168177554952, + "learning_rate": 1.9908241385013804e-05, + "loss": 0.4781, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 1.068463683699775, + "learning_rate": 1.990788524352934e-05, + "loss": 0.5868, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 1.1117140695290186, + "learning_rate": 1.990752841543722e-05, + "loss": 0.681, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 1.108831597329571, + "learning_rate": 1.9907170900762164e-05, + "loss": 0.6391, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 1.1082692126241613, + "learning_rate": 1.9906812699528956e-05, + "loss": 0.6302, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 1.084315446777715, + "learning_rate": 1.9906453811762415e-05, + "loss": 0.5957, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 1.0997233557871868, + "learning_rate": 1.990609423748741e-05, + "loss": 0.6264, + "step": 891 + }, + { + "epoch": 0.07, + "grad_norm": 1.1273421993059851, + "learning_rate": 1.9905733976728862e-05, + "loss": 0.6095, + "step": 892 + }, + { + "epoch": 0.07, + "grad_norm": 1.1057839655029467, + "learning_rate": 1.990537302951174e-05, + "loss": 0.571, + "step": 893 + }, + { + "epoch": 0.07, + "grad_norm": 1.0514643831580244, + "learning_rate": 1.9905011395861048e-05, + "loss": 0.5988, + "step": 894 + }, + { + "epoch": 0.07, + "grad_norm": 1.0361450310583142, + "learning_rate": 1.9904649075801852e-05, + "loss": 0.5558, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 1.150559012228703, + "learning_rate": 1.9904286069359263e-05, + "loss": 0.6765, + "step": 896 + }, + { + "epoch": 0.07, + "grad_norm": 1.005249808253615, + "learning_rate": 1.9903922376558432e-05, + "loss": 0.5878, + "step": 897 + }, + { + "epoch": 0.07, + "grad_norm": 0.9205219013292413, + "learning_rate": 1.9903557997424565e-05, + "loss": 0.5287, + "step": 898 + }, + { + "epoch": 0.07, + "grad_norm": 1.0380631233474868, + "learning_rate": 1.9903192931982916e-05, + "loss": 0.635, + "step": 899 + }, + { + "epoch": 0.07, + "grad_norm": 1.1151631244071423, + "learning_rate": 1.9902827180258778e-05, + "loss": 0.6371, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 1.1385580867661342, + "learning_rate": 1.99024607422775e-05, + "loss": 0.6381, + "step": 901 + }, + { + "epoch": 0.07, + "grad_norm": 1.1749410505707398, + "learning_rate": 1.9902093618064483e-05, + "loss": 0.676, + "step": 902 + }, + { + "epoch": 0.07, + "grad_norm": 1.0927232181593396, + "learning_rate": 1.9901725807645154e-05, + "loss": 0.6469, + "step": 903 + }, + { + "epoch": 0.07, + "grad_norm": 1.0974191506180526, + "learning_rate": 1.990135731104501e-05, + "loss": 0.5542, + "step": 904 + }, + { + "epoch": 0.07, + "grad_norm": 1.124203680324317, + "learning_rate": 1.9900988128289593e-05, + "loss": 0.6122, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 1.0088657889858017, + "learning_rate": 1.990061825940447e-05, + "loss": 0.6634, + "step": 906 + }, + { + "epoch": 0.07, + "grad_norm": 1.0044706318253844, + "learning_rate": 1.990024770441529e-05, + "loss": 0.5614, + "step": 907 + }, + { + "epoch": 0.07, + "grad_norm": 1.1291401742975522, + "learning_rate": 1.9899876463347727e-05, + "loss": 0.5792, + "step": 908 + }, + { + "epoch": 0.07, + "grad_norm": 1.1340186318427607, + "learning_rate": 1.9899504536227505e-05, + "loss": 0.6871, + "step": 909 + }, + { + "epoch": 0.07, + "grad_norm": 1.1230863530970432, + "learning_rate": 1.98991319230804e-05, + "loss": 0.6134, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 1.1644758801324757, + "learning_rate": 1.989875862393223e-05, + "loss": 0.5701, + "step": 911 + }, + { + "epoch": 0.07, + "grad_norm": 1.094310361279521, + "learning_rate": 1.989838463880887e-05, + "loss": 0.6342, + "step": 912 + }, + { + "epoch": 0.07, + "grad_norm": 1.0246166924294686, + "learning_rate": 1.9898009967736236e-05, + "loss": 0.5564, + "step": 913 + }, + { + "epoch": 0.07, + "grad_norm": 1.029422191382958, + "learning_rate": 1.989763461074029e-05, + "loss": 0.5958, + "step": 914 + }, + { + "epoch": 0.07, + "grad_norm": 1.107913272959802, + "learning_rate": 1.989725856784704e-05, + "loss": 0.5847, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 1.096318807524696, + "learning_rate": 1.9896881839082554e-05, + "loss": 0.6037, + "step": 916 + }, + { + "epoch": 0.07, + "grad_norm": 1.2018583652830437, + "learning_rate": 1.9896504424472936e-05, + "loss": 0.6224, + "step": 917 + }, + { + "epoch": 0.07, + "grad_norm": 1.1961833371889385, + "learning_rate": 1.9896126324044338e-05, + "loss": 0.6426, + "step": 918 + }, + { + "epoch": 0.07, + "grad_norm": 1.2817682573246538, + "learning_rate": 1.9895747537822965e-05, + "loss": 0.6955, + "step": 919 + }, + { + "epoch": 0.07, + "grad_norm": 1.14955929893788, + "learning_rate": 1.989536806583506e-05, + "loss": 0.6328, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 1.1540859308447506, + "learning_rate": 1.9894987908106933e-05, + "loss": 0.6459, + "step": 921 + }, + { + "epoch": 0.07, + "grad_norm": 1.0375748859320604, + "learning_rate": 1.9894607064664914e-05, + "loss": 0.6201, + "step": 922 + }, + { + "epoch": 0.08, + "grad_norm": 1.1298354214589668, + "learning_rate": 1.9894225535535407e-05, + "loss": 0.6913, + "step": 923 + }, + { + "epoch": 0.08, + "grad_norm": 1.0021066744486649, + "learning_rate": 1.9893843320744845e-05, + "loss": 0.5619, + "step": 924 + }, + { + "epoch": 0.08, + "grad_norm": 1.0168328583368194, + "learning_rate": 1.9893460420319716e-05, + "loss": 0.6607, + "step": 925 + }, + { + "epoch": 0.08, + "grad_norm": 0.9896091925300838, + "learning_rate": 1.9893076834286557e-05, + "loss": 0.5867, + "step": 926 + }, + { + "epoch": 0.08, + "grad_norm": 1.2502531797998122, + "learning_rate": 1.9892692562671944e-05, + "loss": 0.718, + "step": 927 + }, + { + "epoch": 0.08, + "grad_norm": 1.0353809825535139, + "learning_rate": 1.9892307605502514e-05, + "loss": 0.5753, + "step": 928 + }, + { + "epoch": 0.08, + "grad_norm": 1.0514084405024644, + "learning_rate": 1.9891921962804942e-05, + "loss": 0.5604, + "step": 929 + }, + { + "epoch": 0.08, + "grad_norm": 1.0772823283544717, + "learning_rate": 1.9891535634605954e-05, + "loss": 0.5712, + "step": 930 + }, + { + "epoch": 0.08, + "grad_norm": 1.0807441080391822, + "learning_rate": 1.989114862093232e-05, + "loss": 0.6179, + "step": 931 + }, + { + "epoch": 0.08, + "grad_norm": 1.0458888781155307, + "learning_rate": 1.9890760921810856e-05, + "loss": 0.6274, + "step": 932 + }, + { + "epoch": 0.08, + "grad_norm": 1.0511812832675538, + "learning_rate": 1.9890372537268433e-05, + "loss": 0.6443, + "step": 933 + }, + { + "epoch": 0.08, + "grad_norm": 1.078728574604796, + "learning_rate": 1.988998346733197e-05, + "loss": 0.6147, + "step": 934 + }, + { + "epoch": 0.08, + "grad_norm": 1.1166802869475498, + "learning_rate": 1.9889593712028422e-05, + "loss": 0.6403, + "step": 935 + }, + { + "epoch": 0.08, + "grad_norm": 1.0624234559060666, + "learning_rate": 1.9889203271384803e-05, + "loss": 0.5563, + "step": 936 + }, + { + "epoch": 0.08, + "grad_norm": 1.0577128119712451, + "learning_rate": 1.9888812145428172e-05, + "loss": 0.673, + "step": 937 + }, + { + "epoch": 0.08, + "grad_norm": 1.1039049361248896, + "learning_rate": 1.9888420334185627e-05, + "loss": 0.6499, + "step": 938 + }, + { + "epoch": 0.08, + "grad_norm": 1.2272573165212521, + "learning_rate": 1.9888027837684326e-05, + "loss": 0.6061, + "step": 939 + }, + { + "epoch": 0.08, + "grad_norm": 1.1160443934529196, + "learning_rate": 1.9887634655951464e-05, + "loss": 0.6287, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 1.104634072856478, + "learning_rate": 1.988724078901429e-05, + "loss": 0.6851, + "step": 941 + }, + { + "epoch": 0.08, + "grad_norm": 1.105786043699008, + "learning_rate": 1.9886846236900102e-05, + "loss": 0.7122, + "step": 942 + }, + { + "epoch": 0.08, + "grad_norm": 1.0171741574616147, + "learning_rate": 1.9886450999636243e-05, + "loss": 0.5775, + "step": 943 + }, + { + "epoch": 0.08, + "grad_norm": 1.2819694827608143, + "learning_rate": 1.9886055077250092e-05, + "loss": 0.7037, + "step": 944 + }, + { + "epoch": 0.08, + "grad_norm": 1.1271597552993238, + "learning_rate": 1.9885658469769094e-05, + "loss": 0.5979, + "step": 945 + }, + { + "epoch": 0.08, + "grad_norm": 1.1455028853874523, + "learning_rate": 1.9885261177220737e-05, + "loss": 0.5962, + "step": 946 + }, + { + "epoch": 0.08, + "grad_norm": 1.0639290716254242, + "learning_rate": 1.9884863199632546e-05, + "loss": 0.6347, + "step": 947 + }, + { + "epoch": 0.08, + "grad_norm": 1.0144440835622914, + "learning_rate": 1.9884464537032103e-05, + "loss": 0.6397, + "step": 948 + }, + { + "epoch": 0.08, + "grad_norm": 1.036460767711225, + "learning_rate": 1.9884065189447036e-05, + "loss": 0.6134, + "step": 949 + }, + { + "epoch": 0.08, + "grad_norm": 1.0006656100539892, + "learning_rate": 1.9883665156905015e-05, + "loss": 0.6241, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 1.058228820011543, + "learning_rate": 1.988326443943377e-05, + "loss": 0.6417, + "step": 951 + }, + { + "epoch": 0.08, + "grad_norm": 1.1734905485423708, + "learning_rate": 1.988286303706106e-05, + "loss": 0.626, + "step": 952 + }, + { + "epoch": 0.08, + "grad_norm": 1.1270162617667199, + "learning_rate": 1.9882460949814716e-05, + "loss": 0.5621, + "step": 953 + }, + { + "epoch": 0.08, + "grad_norm": 1.1408949836430282, + "learning_rate": 1.988205817772259e-05, + "loss": 0.6351, + "step": 954 + }, + { + "epoch": 0.08, + "grad_norm": 1.1396736021976535, + "learning_rate": 1.9881654720812594e-05, + "loss": 0.6667, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 1.0169098488256985, + "learning_rate": 1.9881250579112694e-05, + "loss": 0.6072, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 1.0907079731731821, + "learning_rate": 1.9880845752650896e-05, + "loss": 0.5655, + "step": 957 + }, + { + "epoch": 0.08, + "grad_norm": 1.169059792113472, + "learning_rate": 1.988044024145525e-05, + "loss": 0.6416, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 1.1156233349996894, + "learning_rate": 1.9880034045553858e-05, + "loss": 0.6269, + "step": 959 + }, + { + "epoch": 0.08, + "grad_norm": 1.1084136434129432, + "learning_rate": 1.9879627164974868e-05, + "loss": 0.6429, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 1.1176885510223709, + "learning_rate": 1.9879219599746486e-05, + "loss": 0.6685, + "step": 961 + }, + { + "epoch": 0.08, + "grad_norm": 0.8867395514211373, + "learning_rate": 1.987881134989694e-05, + "loss": 0.4933, + "step": 962 + }, + { + "epoch": 0.08, + "grad_norm": 0.990682541637075, + "learning_rate": 1.9878402415454534e-05, + "loss": 0.6724, + "step": 963 + }, + { + "epoch": 0.08, + "grad_norm": 1.0056387200473818, + "learning_rate": 1.9877992796447604e-05, + "loss": 0.5682, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 1.1467756136126264, + "learning_rate": 1.9877582492904533e-05, + "loss": 0.695, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 1.1088054224503603, + "learning_rate": 1.987717150485376e-05, + "loss": 0.6608, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 1.0436562408251457, + "learning_rate": 1.9876759832323756e-05, + "loss": 0.6355, + "step": 967 + }, + { + "epoch": 0.08, + "grad_norm": 1.024013032934522, + "learning_rate": 1.9876347475343062e-05, + "loss": 0.6029, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 1.123425037307103, + "learning_rate": 1.9875934433940248e-05, + "loss": 0.6084, + "step": 969 + }, + { + "epoch": 0.08, + "grad_norm": 1.0661739928360128, + "learning_rate": 1.9875520708143933e-05, + "loss": 0.6411, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 1.0218598836399087, + "learning_rate": 1.9875106297982798e-05, + "loss": 0.6291, + "step": 971 + }, + { + "epoch": 0.08, + "grad_norm": 1.1134477148440822, + "learning_rate": 1.987469120348555e-05, + "loss": 0.6534, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 1.1517210895453271, + "learning_rate": 1.9874275424680966e-05, + "loss": 0.6704, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 1.1140417226147281, + "learning_rate": 1.987385896159785e-05, + "loss": 0.621, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 1.015703441507511, + "learning_rate": 1.987344181426507e-05, + "loss": 0.6387, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 1.031557022872771, + "learning_rate": 1.987302398271153e-05, + "loss": 0.619, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 1.179749176771803, + "learning_rate": 1.987260546696618e-05, + "loss": 0.62, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 1.1224651935196714, + "learning_rate": 1.987218626705803e-05, + "loss": 0.677, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 1.0949746162974316, + "learning_rate": 1.9871766383016127e-05, + "loss": 0.6365, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 1.0828147418781917, + "learning_rate": 1.9871345814869575e-05, + "loss": 0.6582, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 1.0417686332976923, + "learning_rate": 1.9870924562647512e-05, + "loss": 0.6457, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 1.2736955707692121, + "learning_rate": 1.9870502626379127e-05, + "loss": 0.6502, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 1.0004273125024994, + "learning_rate": 1.9870080006093674e-05, + "loss": 0.578, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 1.1096665328492759, + "learning_rate": 1.9869656701820424e-05, + "loss": 0.5603, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 1.2125446185211894, + "learning_rate": 1.9869232713588724e-05, + "loss": 0.464, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 1.1221372062657058, + "learning_rate": 1.9868808041427948e-05, + "loss": 0.5887, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 0.996001225280882, + "learning_rate": 1.9868382685367533e-05, + "loss": 0.6238, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 1.1367037471800612, + "learning_rate": 1.9867956645436944e-05, + "loss": 0.6152, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 1.1129390935514154, + "learning_rate": 1.9867529921665713e-05, + "loss": 0.6287, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 1.2269397679391247, + "learning_rate": 1.9867102514083415e-05, + "loss": 0.6655, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 1.26827046275962, + "learning_rate": 1.9866674422719666e-05, + "loss": 0.6604, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 1.0594102839215263, + "learning_rate": 1.9866245647604128e-05, + "loss": 0.6356, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 1.0773242595576222, + "learning_rate": 1.9865816188766516e-05, + "loss": 0.6223, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 1.1461188807936942, + "learning_rate": 1.9865386046236597e-05, + "loss": 0.6325, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 1.2375769131659358, + "learning_rate": 1.9864955220044175e-05, + "loss": 0.6041, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 1.1529673106809384, + "learning_rate": 1.9864523710219107e-05, + "loss": 0.6545, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 1.0715029734939707, + "learning_rate": 1.986409151679129e-05, + "loss": 0.567, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 0.9652259721065576, + "learning_rate": 1.9863658639790686e-05, + "loss": 0.622, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 1.1205944623699782, + "learning_rate": 1.9863225079247286e-05, + "loss": 0.6425, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 1.251314945813738, + "learning_rate": 1.9862790835191137e-05, + "loss": 0.6876, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 0.9323839167997477, + "learning_rate": 1.9862355907652332e-05, + "loss": 0.5943, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 0.9863978580378259, + "learning_rate": 1.986192029666101e-05, + "loss": 0.5605, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 1.0486628796809216, + "learning_rate": 1.9861484002247357e-05, + "loss": 0.5606, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 1.1196085211902809, + "learning_rate": 1.9861047024441614e-05, + "loss": 0.6215, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 1.0658424922162064, + "learning_rate": 1.9860609363274056e-05, + "loss": 0.6589, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 1.0050420253517978, + "learning_rate": 1.9860171018775018e-05, + "loss": 0.5239, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 1.0584283466966222, + "learning_rate": 1.9859731990974867e-05, + "loss": 0.6244, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 1.0585855584799557, + "learning_rate": 1.9859292279904043e-05, + "loss": 0.5561, + "step": 1008 + }, + { + "epoch": 0.08, + "grad_norm": 0.9990564994313156, + "learning_rate": 1.9858851885593004e-05, + "loss": 0.4815, + "step": 1009 + }, + { + "epoch": 0.08, + "grad_norm": 1.0178981238404532, + "learning_rate": 1.9858410808072278e-05, + "loss": 0.58, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 0.9590816481827416, + "learning_rate": 1.9857969047372422e-05, + "loss": 0.5601, + "step": 1011 + }, + { + "epoch": 0.08, + "grad_norm": 1.0622275660970883, + "learning_rate": 1.985752660352406e-05, + "loss": 0.5345, + "step": 1012 + }, + { + "epoch": 0.08, + "grad_norm": 1.1863129467779967, + "learning_rate": 1.9857083476557846e-05, + "loss": 0.6909, + "step": 1013 + }, + { + "epoch": 0.08, + "grad_norm": 1.052063234705519, + "learning_rate": 1.9856639666504492e-05, + "loss": 0.5748, + "step": 1014 + }, + { + "epoch": 0.08, + "grad_norm": 1.002701005360324, + "learning_rate": 1.9856195173394754e-05, + "loss": 0.589, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 0.938763483372456, + "learning_rate": 1.985574999725943e-05, + "loss": 0.5357, + "step": 1016 + }, + { + "epoch": 0.08, + "grad_norm": 1.0966699932094541, + "learning_rate": 1.985530413812937e-05, + "loss": 0.6467, + "step": 1017 + }, + { + "epoch": 0.08, + "grad_norm": 0.9847762667527534, + "learning_rate": 1.9854857596035476e-05, + "loss": 0.636, + "step": 1018 + }, + { + "epoch": 0.08, + "grad_norm": 1.1229985693999278, + "learning_rate": 1.9854410371008693e-05, + "loss": 0.6408, + "step": 1019 + }, + { + "epoch": 0.08, + "grad_norm": 1.0914436618280507, + "learning_rate": 1.9853962463080013e-05, + "loss": 0.6649, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 1.063621268903449, + "learning_rate": 1.9853513872280476e-05, + "loss": 0.6554, + "step": 1021 + }, + { + "epoch": 0.08, + "grad_norm": 1.0312095056964494, + "learning_rate": 1.985306459864117e-05, + "loss": 0.6262, + "step": 1022 + }, + { + "epoch": 0.08, + "grad_norm": 1.117013859147517, + "learning_rate": 1.985261464219322e-05, + "loss": 0.6764, + "step": 1023 + }, + { + "epoch": 0.08, + "grad_norm": 1.036906745742879, + "learning_rate": 1.9852164002967818e-05, + "loss": 0.6228, + "step": 1024 + }, + { + "epoch": 0.08, + "grad_norm": 1.0208147244129753, + "learning_rate": 1.9851712680996188e-05, + "loss": 0.6193, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 0.9589392180387819, + "learning_rate": 1.985126067630961e-05, + "loss": 0.6018, + "step": 1026 + }, + { + "epoch": 0.08, + "grad_norm": 1.1675221592473077, + "learning_rate": 1.9850807988939405e-05, + "loss": 0.6075, + "step": 1027 + }, + { + "epoch": 0.08, + "grad_norm": 1.0544466663271506, + "learning_rate": 1.9850354618916942e-05, + "loss": 0.6635, + "step": 1028 + }, + { + "epoch": 0.08, + "grad_norm": 1.0481367773090586, + "learning_rate": 1.9849900566273642e-05, + "loss": 0.5868, + "step": 1029 + }, + { + "epoch": 0.08, + "grad_norm": 0.969942783915277, + "learning_rate": 1.984944583104097e-05, + "loss": 0.5484, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 0.9991758991063624, + "learning_rate": 1.9848990413250436e-05, + "loss": 0.5921, + "step": 1031 + }, + { + "epoch": 0.08, + "grad_norm": 0.9426679105129601, + "learning_rate": 1.9848534312933606e-05, + "loss": 0.5454, + "step": 1032 + }, + { + "epoch": 0.08, + "grad_norm": 1.108668062134207, + "learning_rate": 1.9848077530122083e-05, + "loss": 0.6393, + "step": 1033 + }, + { + "epoch": 0.08, + "grad_norm": 1.163748991275181, + "learning_rate": 1.9847620064847522e-05, + "loss": 0.6919, + "step": 1034 + }, + { + "epoch": 0.08, + "grad_norm": 1.1011142742465692, + "learning_rate": 1.9847161917141626e-05, + "loss": 0.6098, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 1.1603642028495122, + "learning_rate": 1.984670308703614e-05, + "loss": 0.6514, + "step": 1036 + }, + { + "epoch": 0.08, + "grad_norm": 0.9787388127791858, + "learning_rate": 1.9846243574562866e-05, + "loss": 0.5918, + "step": 1037 + }, + { + "epoch": 0.08, + "grad_norm": 1.1162496201649623, + "learning_rate": 1.9845783379753648e-05, + "loss": 0.627, + "step": 1038 + }, + { + "epoch": 0.08, + "grad_norm": 1.104476587435251, + "learning_rate": 1.9845322502640374e-05, + "loss": 0.5546, + "step": 1039 + }, + { + "epoch": 0.08, + "grad_norm": 1.1535168566665752, + "learning_rate": 1.9844860943254983e-05, + "loss": 0.6147, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 1.1166245608533367, + "learning_rate": 1.984439870162946e-05, + "loss": 0.6482, + "step": 1041 + }, + { + "epoch": 0.08, + "grad_norm": 1.0427963654809782, + "learning_rate": 1.984393577779584e-05, + "loss": 0.5811, + "step": 1042 + }, + { + "epoch": 0.08, + "grad_norm": 1.0767589535184312, + "learning_rate": 1.9843472171786204e-05, + "loss": 0.7003, + "step": 1043 + }, + { + "epoch": 0.08, + "grad_norm": 0.9479657664193446, + "learning_rate": 1.9843007883632674e-05, + "loss": 0.5707, + "step": 1044 + }, + { + "epoch": 0.08, + "grad_norm": 1.0463700626631678, + "learning_rate": 1.984254291336743e-05, + "loss": 0.6024, + "step": 1045 + }, + { + "epoch": 0.09, + "grad_norm": 1.0773084022527417, + "learning_rate": 1.984207726102269e-05, + "loss": 0.6155, + "step": 1046 + }, + { + "epoch": 0.09, + "grad_norm": 1.0902382782675282, + "learning_rate": 1.984161092663073e-05, + "loss": 0.5474, + "step": 1047 + }, + { + "epoch": 0.09, + "grad_norm": 0.9644531785730229, + "learning_rate": 1.984114391022386e-05, + "loss": 0.529, + "step": 1048 + }, + { + "epoch": 0.09, + "grad_norm": 1.127131720367706, + "learning_rate": 1.984067621183445e-05, + "loss": 0.5964, + "step": 1049 + }, + { + "epoch": 0.09, + "grad_norm": 1.094880066095062, + "learning_rate": 1.9840207831494903e-05, + "loss": 0.6238, + "step": 1050 + }, + { + "epoch": 0.09, + "grad_norm": 1.1393198512420912, + "learning_rate": 1.983973876923768e-05, + "loss": 0.7522, + "step": 1051 + }, + { + "epoch": 0.09, + "grad_norm": 1.0059937311188745, + "learning_rate": 1.9839269025095293e-05, + "loss": 0.5853, + "step": 1052 + }, + { + "epoch": 0.09, + "grad_norm": 1.4109933801849883, + "learning_rate": 1.9838798599100286e-05, + "loss": 0.6001, + "step": 1053 + }, + { + "epoch": 0.09, + "grad_norm": 1.1132444057463866, + "learning_rate": 1.9838327491285266e-05, + "loss": 0.5767, + "step": 1054 + }, + { + "epoch": 0.09, + "grad_norm": 0.9701633973894861, + "learning_rate": 1.9837855701682875e-05, + "loss": 0.5808, + "step": 1055 + }, + { + "epoch": 0.09, + "grad_norm": 1.1247541865700246, + "learning_rate": 1.983738323032581e-05, + "loss": 0.6039, + "step": 1056 + }, + { + "epoch": 0.09, + "grad_norm": 1.0092591983705888, + "learning_rate": 1.9836910077246813e-05, + "loss": 0.6334, + "step": 1057 + }, + { + "epoch": 0.09, + "grad_norm": 0.9705654534952596, + "learning_rate": 1.9836436242478676e-05, + "loss": 0.5847, + "step": 1058 + }, + { + "epoch": 0.09, + "grad_norm": 0.9904458846833634, + "learning_rate": 1.9835961726054228e-05, + "loss": 0.6517, + "step": 1059 + }, + { + "epoch": 0.09, + "grad_norm": 1.1197036023727012, + "learning_rate": 1.983548652800636e-05, + "loss": 0.6023, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 1.1771133490975412, + "learning_rate": 1.9835010648368e-05, + "loss": 0.6963, + "step": 1061 + }, + { + "epoch": 0.09, + "grad_norm": 1.0699175937730134, + "learning_rate": 1.9834534087172126e-05, + "loss": 0.6147, + "step": 1062 + }, + { + "epoch": 0.09, + "grad_norm": 1.1551718492888678, + "learning_rate": 1.983405684445176e-05, + "loss": 0.6549, + "step": 1063 + }, + { + "epoch": 0.09, + "grad_norm": 1.0603070951538438, + "learning_rate": 1.983357892023998e-05, + "loss": 0.6094, + "step": 1064 + }, + { + "epoch": 0.09, + "grad_norm": 1.0402062222116446, + "learning_rate": 1.98331003145699e-05, + "loss": 0.6219, + "step": 1065 + }, + { + "epoch": 0.09, + "grad_norm": 0.9939060911914949, + "learning_rate": 1.983262102747469e-05, + "loss": 0.6104, + "step": 1066 + }, + { + "epoch": 0.09, + "grad_norm": 0.9774031117289627, + "learning_rate": 1.983214105898757e-05, + "loss": 0.5856, + "step": 1067 + }, + { + "epoch": 0.09, + "grad_norm": 1.0914284577741291, + "learning_rate": 1.983166040914179e-05, + "loss": 0.5925, + "step": 1068 + }, + { + "epoch": 0.09, + "grad_norm": 1.0680432076381006, + "learning_rate": 1.983117907797067e-05, + "loss": 0.682, + "step": 1069 + }, + { + "epoch": 0.09, + "grad_norm": 1.0942550327326452, + "learning_rate": 1.9830697065507554e-05, + "loss": 0.5849, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 1.0321035757944577, + "learning_rate": 1.9830214371785858e-05, + "loss": 0.5567, + "step": 1071 + }, + { + "epoch": 0.09, + "grad_norm": 1.0419076031734429, + "learning_rate": 1.982973099683902e-05, + "loss": 0.6708, + "step": 1072 + }, + { + "epoch": 0.09, + "grad_norm": 1.023472160735429, + "learning_rate": 1.9829246940700543e-05, + "loss": 0.6217, + "step": 1073 + }, + { + "epoch": 0.09, + "grad_norm": 0.9986553016654867, + "learning_rate": 1.9828762203403973e-05, + "loss": 0.5932, + "step": 1074 + }, + { + "epoch": 0.09, + "grad_norm": 1.078259151860152, + "learning_rate": 1.98282767849829e-05, + "loss": 0.6501, + "step": 1075 + }, + { + "epoch": 0.09, + "grad_norm": 0.9435008967700403, + "learning_rate": 1.9827790685470963e-05, + "loss": 0.5278, + "step": 1076 + }, + { + "epoch": 0.09, + "grad_norm": 1.058301399489676, + "learning_rate": 1.9827303904901853e-05, + "loss": 0.6602, + "step": 1077 + }, + { + "epoch": 0.09, + "grad_norm": 1.1175064502389007, + "learning_rate": 1.9826816443309294e-05, + "loss": 0.6301, + "step": 1078 + }, + { + "epoch": 0.09, + "grad_norm": 0.99965556015855, + "learning_rate": 1.9826328300727074e-05, + "loss": 0.6088, + "step": 1079 + }, + { + "epoch": 0.09, + "grad_norm": 1.1386141728126398, + "learning_rate": 1.9825839477189017e-05, + "loss": 0.676, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 1.082303668746878, + "learning_rate": 1.9825349972729003e-05, + "loss": 0.6496, + "step": 1081 + }, + { + "epoch": 0.09, + "grad_norm": 0.9824070847379709, + "learning_rate": 1.982485978738095e-05, + "loss": 0.6093, + "step": 1082 + }, + { + "epoch": 0.09, + "grad_norm": 1.0204342899951846, + "learning_rate": 1.9824368921178825e-05, + "loss": 0.6787, + "step": 1083 + }, + { + "epoch": 0.09, + "grad_norm": 1.004722031213302, + "learning_rate": 1.9823877374156647e-05, + "loss": 0.6069, + "step": 1084 + }, + { + "epoch": 0.09, + "grad_norm": 1.036218483815217, + "learning_rate": 1.9823385146348485e-05, + "loss": 0.6353, + "step": 1085 + }, + { + "epoch": 0.09, + "grad_norm": 1.0611860755386595, + "learning_rate": 1.9822892237788448e-05, + "loss": 0.5991, + "step": 1086 + }, + { + "epoch": 0.09, + "grad_norm": 1.1588271079524333, + "learning_rate": 1.9822398648510684e-05, + "loss": 0.7502, + "step": 1087 + }, + { + "epoch": 0.09, + "grad_norm": 1.03100217892194, + "learning_rate": 1.9821904378549414e-05, + "loss": 0.5934, + "step": 1088 + }, + { + "epoch": 0.09, + "grad_norm": 1.052827521858649, + "learning_rate": 1.9821409427938878e-05, + "loss": 0.6325, + "step": 1089 + }, + { + "epoch": 0.09, + "grad_norm": 0.9408025909515155, + "learning_rate": 1.982091379671338e-05, + "loss": 0.5921, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 1.1543920923976936, + "learning_rate": 1.982041748490727e-05, + "loss": 0.6921, + "step": 1091 + }, + { + "epoch": 0.09, + "grad_norm": 1.0745995961987254, + "learning_rate": 1.9819920492554935e-05, + "loss": 0.651, + "step": 1092 + }, + { + "epoch": 0.09, + "grad_norm": 1.101414274214897, + "learning_rate": 1.9819422819690824e-05, + "loss": 0.6983, + "step": 1093 + }, + { + "epoch": 0.09, + "grad_norm": 1.0657813614519533, + "learning_rate": 1.9818924466349422e-05, + "loss": 0.6255, + "step": 1094 + }, + { + "epoch": 0.09, + "grad_norm": 1.0039151439925784, + "learning_rate": 1.981842543256526e-05, + "loss": 0.613, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 0.9727132720114751, + "learning_rate": 1.981792571837293e-05, + "loss": 0.6061, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 1.089952183620595, + "learning_rate": 1.981742532380705e-05, + "loss": 0.6576, + "step": 1097 + }, + { + "epoch": 0.09, + "grad_norm": 1.1378152254765532, + "learning_rate": 1.9816924248902304e-05, + "loss": 0.6077, + "step": 1098 + }, + { + "epoch": 0.09, + "grad_norm": 1.0000243008860976, + "learning_rate": 1.9816422493693417e-05, + "loss": 0.6, + "step": 1099 + }, + { + "epoch": 0.09, + "grad_norm": 1.0262601717430377, + "learning_rate": 1.9815920058215157e-05, + "loss": 0.5675, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 1.0424400127184634, + "learning_rate": 1.9815416942502346e-05, + "loss": 0.6893, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 1.1959462863993415, + "learning_rate": 1.9814913146589847e-05, + "loss": 0.621, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 1.0532552468562892, + "learning_rate": 1.9814408670512572e-05, + "loss": 0.6032, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 1.0106259099736372, + "learning_rate": 1.981390351430548e-05, + "loss": 0.5914, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 1.1403825524665405, + "learning_rate": 1.981339767800358e-05, + "loss": 0.6111, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 1.043511667204388, + "learning_rate": 1.9812891161641927e-05, + "loss": 0.6132, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 1.1476441506456099, + "learning_rate": 1.981238396525562e-05, + "loss": 0.6988, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 1.0601807753560835, + "learning_rate": 1.9811876088879808e-05, + "loss": 0.6176, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 1.1079517298243802, + "learning_rate": 1.9811367532549686e-05, + "loss": 0.5924, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 1.0216589393619098, + "learning_rate": 1.9810858296300496e-05, + "loss": 0.5809, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 1.1727244067131004, + "learning_rate": 1.9810348380167527e-05, + "loss": 0.666, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 1.0590507767162933, + "learning_rate": 1.9809837784186117e-05, + "loss": 0.5926, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 1.0800020752477553, + "learning_rate": 1.9809326508391653e-05, + "loss": 0.6388, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 1.113372749895697, + "learning_rate": 1.980881455281956e-05, + "loss": 0.6648, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 1.1485772029405572, + "learning_rate": 1.980830191750532e-05, + "loss": 0.6499, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 0.8997588895538492, + "learning_rate": 1.980778860248446e-05, + "loss": 0.533, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 1.027167278167877, + "learning_rate": 1.9807274607792545e-05, + "loss": 0.6515, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 1.0088579416146564, + "learning_rate": 1.98067599334652e-05, + "loss": 0.5844, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 1.1311769529954938, + "learning_rate": 1.980624457953809e-05, + "loss": 0.6711, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 1.0829367163761348, + "learning_rate": 1.980572854604693e-05, + "loss": 0.5857, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 1.1391803149878377, + "learning_rate": 1.980521183302748e-05, + "loss": 0.6258, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 1.125268403164557, + "learning_rate": 1.980469444051554e-05, + "loss": 0.6142, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 1.0778526344981947, + "learning_rate": 1.980417636854698e-05, + "loss": 0.5555, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 1.082392494479735, + "learning_rate": 1.9803657617157693e-05, + "loss": 0.603, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 1.0569189959865126, + "learning_rate": 1.9803138186383628e-05, + "loss": 0.6449, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 1.0852474738625402, + "learning_rate": 1.9802618076260784e-05, + "loss": 0.6242, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 1.1175750495002248, + "learning_rate": 1.9802097286825197e-05, + "loss": 0.6315, + "step": 1127 + }, + { + "epoch": 0.09, + "grad_norm": 1.0773717623972658, + "learning_rate": 1.9801575818112964e-05, + "loss": 0.664, + "step": 1128 + }, + { + "epoch": 0.09, + "grad_norm": 1.162007196070372, + "learning_rate": 1.980105367016022e-05, + "loss": 0.6763, + "step": 1129 + }, + { + "epoch": 0.09, + "grad_norm": 1.136601052174248, + "learning_rate": 1.9800530843003157e-05, + "loss": 0.6117, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 1.143985763323429, + "learning_rate": 1.9800007336677994e-05, + "loss": 0.6176, + "step": 1131 + }, + { + "epoch": 0.09, + "grad_norm": 1.0404379302337705, + "learning_rate": 1.979948315122102e-05, + "loss": 0.6334, + "step": 1132 + }, + { + "epoch": 0.09, + "grad_norm": 0.9958232338449807, + "learning_rate": 1.979895828666855e-05, + "loss": 0.6101, + "step": 1133 + }, + { + "epoch": 0.09, + "grad_norm": 1.0200571531476168, + "learning_rate": 1.9798432743056964e-05, + "loss": 0.5884, + "step": 1134 + }, + { + "epoch": 0.09, + "grad_norm": 1.0324428907512637, + "learning_rate": 1.979790652042268e-05, + "loss": 0.6089, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 1.1115915371843572, + "learning_rate": 1.9797379618802163e-05, + "loss": 0.6658, + "step": 1136 + }, + { + "epoch": 0.09, + "grad_norm": 1.1328477500696639, + "learning_rate": 1.9796852038231932e-05, + "loss": 0.6374, + "step": 1137 + }, + { + "epoch": 0.09, + "grad_norm": 1.2353642392222939, + "learning_rate": 1.9796323778748544e-05, + "loss": 0.6538, + "step": 1138 + }, + { + "epoch": 0.09, + "grad_norm": 1.053963539963818, + "learning_rate": 1.9795794840388605e-05, + "loss": 0.568, + "step": 1139 + }, + { + "epoch": 0.09, + "grad_norm": 1.1496538850517588, + "learning_rate": 1.9795265223188775e-05, + "loss": 0.5619, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 1.1157661180851675, + "learning_rate": 1.9794734927185756e-05, + "loss": 0.6437, + "step": 1141 + }, + { + "epoch": 0.09, + "grad_norm": 1.1599396320995672, + "learning_rate": 1.979420395241629e-05, + "loss": 0.6507, + "step": 1142 + }, + { + "epoch": 0.09, + "grad_norm": 1.112262206229225, + "learning_rate": 1.9793672298917178e-05, + "loss": 0.6211, + "step": 1143 + }, + { + "epoch": 0.09, + "grad_norm": 1.0729221065115775, + "learning_rate": 1.9793139966725264e-05, + "loss": 0.5873, + "step": 1144 + }, + { + "epoch": 0.09, + "grad_norm": 1.1470954186434454, + "learning_rate": 1.9792606955877437e-05, + "loss": 0.6533, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 1.0966433183148414, + "learning_rate": 1.979207326641063e-05, + "loss": 0.6198, + "step": 1146 + }, + { + "epoch": 0.09, + "grad_norm": 1.0887800674211725, + "learning_rate": 1.979153889836184e-05, + "loss": 0.6142, + "step": 1147 + }, + { + "epoch": 0.09, + "grad_norm": 1.0505608118632632, + "learning_rate": 1.979100385176808e-05, + "loss": 0.6201, + "step": 1148 + }, + { + "epoch": 0.09, + "grad_norm": 1.0637276276013699, + "learning_rate": 1.979046812666644e-05, + "loss": 0.6334, + "step": 1149 + }, + { + "epoch": 0.09, + "grad_norm": 1.0959557081058233, + "learning_rate": 1.9789931723094046e-05, + "loss": 0.5997, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 1.1145558485853704, + "learning_rate": 1.9789394641088068e-05, + "loss": 0.6324, + "step": 1151 + }, + { + "epoch": 0.09, + "grad_norm": 1.1470672684736019, + "learning_rate": 1.978885688068572e-05, + "loss": 0.6474, + "step": 1152 + }, + { + "epoch": 0.09, + "grad_norm": 1.059988951594, + "learning_rate": 1.9788318441924276e-05, + "loss": 0.6878, + "step": 1153 + }, + { + "epoch": 0.09, + "grad_norm": 1.0449670410814962, + "learning_rate": 1.9787779324841045e-05, + "loss": 0.6136, + "step": 1154 + }, + { + "epoch": 0.09, + "grad_norm": 1.1036223223053379, + "learning_rate": 1.978723952947339e-05, + "loss": 0.4832, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 0.9439899179136536, + "learning_rate": 1.9786699055858715e-05, + "loss": 0.5875, + "step": 1156 + }, + { + "epoch": 0.09, + "grad_norm": 1.1271381575003854, + "learning_rate": 1.9786157904034476e-05, + "loss": 0.6729, + "step": 1157 + }, + { + "epoch": 0.09, + "grad_norm": 1.1559246313644351, + "learning_rate": 1.9785616074038177e-05, + "loss": 0.6624, + "step": 1158 + }, + { + "epoch": 0.09, + "grad_norm": 1.1229234851699839, + "learning_rate": 1.978507356590736e-05, + "loss": 0.5696, + "step": 1159 + }, + { + "epoch": 0.09, + "grad_norm": 1.1417411033525822, + "learning_rate": 1.978453037967963e-05, + "loss": 0.6108, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 0.9829968484078301, + "learning_rate": 1.978398651539262e-05, + "loss": 0.6475, + "step": 1161 + }, + { + "epoch": 0.09, + "grad_norm": 1.0605076321999238, + "learning_rate": 1.9783441973084023e-05, + "loss": 0.61, + "step": 1162 + }, + { + "epoch": 0.09, + "grad_norm": 0.9719439884646509, + "learning_rate": 1.9782896752791576e-05, + "loss": 0.574, + "step": 1163 + }, + { + "epoch": 0.09, + "grad_norm": 1.0185093398635798, + "learning_rate": 1.978235085455306e-05, + "loss": 0.6139, + "step": 1164 + }, + { + "epoch": 0.09, + "grad_norm": 0.9513968903641321, + "learning_rate": 1.9781804278406308e-05, + "loss": 0.6326, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 1.0634305191214797, + "learning_rate": 1.9781257024389194e-05, + "loss": 0.6322, + "step": 1166 + }, + { + "epoch": 0.09, + "grad_norm": 0.9513957696631159, + "learning_rate": 1.9780709092539647e-05, + "loss": 0.6513, + "step": 1167 + }, + { + "epoch": 0.09, + "grad_norm": 0.899895870947211, + "learning_rate": 1.9780160482895633e-05, + "loss": 0.5656, + "step": 1168 + }, + { + "epoch": 0.1, + "grad_norm": 1.0053864478482981, + "learning_rate": 1.9779611195495177e-05, + "loss": 0.6136, + "step": 1169 + }, + { + "epoch": 0.1, + "grad_norm": 1.0556166213683182, + "learning_rate": 1.9779061230376334e-05, + "loss": 0.5876, + "step": 1170 + }, + { + "epoch": 0.1, + "grad_norm": 1.0620148974297947, + "learning_rate": 1.9778510587577226e-05, + "loss": 0.6286, + "step": 1171 + }, + { + "epoch": 0.1, + "grad_norm": 1.0944898098419875, + "learning_rate": 1.9777959267136005e-05, + "loss": 0.6408, + "step": 1172 + }, + { + "epoch": 0.1, + "grad_norm": 1.1262428474198718, + "learning_rate": 1.977740726909088e-05, + "loss": 0.5868, + "step": 1173 + }, + { + "epoch": 0.1, + "grad_norm": 1.1348189568565432, + "learning_rate": 1.9776854593480107e-05, + "loss": 0.6325, + "step": 1174 + }, + { + "epoch": 0.1, + "grad_norm": 1.0265007969572906, + "learning_rate": 1.977630124034198e-05, + "loss": 0.6002, + "step": 1175 + }, + { + "epoch": 0.1, + "grad_norm": 1.0045863640942072, + "learning_rate": 1.9775747209714847e-05, + "loss": 0.4731, + "step": 1176 + }, + { + "epoch": 0.1, + "grad_norm": 1.1767076857181433, + "learning_rate": 1.9775192501637104e-05, + "loss": 0.5284, + "step": 1177 + }, + { + "epoch": 0.1, + "grad_norm": 1.0981349566632057, + "learning_rate": 1.9774637116147194e-05, + "loss": 0.6136, + "step": 1178 + }, + { + "epoch": 0.1, + "grad_norm": 1.1161235671651721, + "learning_rate": 1.97740810532836e-05, + "loss": 0.6547, + "step": 1179 + }, + { + "epoch": 0.1, + "grad_norm": 1.0884324364235982, + "learning_rate": 1.9773524313084857e-05, + "loss": 0.6237, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 1.1129008869770969, + "learning_rate": 1.977296689558955e-05, + "loss": 0.6, + "step": 1181 + }, + { + "epoch": 0.1, + "grad_norm": 1.096193804673158, + "learning_rate": 1.9772408800836308e-05, + "loss": 0.6041, + "step": 1182 + }, + { + "epoch": 0.1, + "grad_norm": 0.9666265897405518, + "learning_rate": 1.9771850028863802e-05, + "loss": 0.5926, + "step": 1183 + }, + { + "epoch": 0.1, + "grad_norm": 1.2019818907270623, + "learning_rate": 1.977129057971076e-05, + "loss": 0.6472, + "step": 1184 + }, + { + "epoch": 0.1, + "grad_norm": 1.0893335099998267, + "learning_rate": 1.977073045341594e-05, + "loss": 0.6338, + "step": 1185 + }, + { + "epoch": 0.1, + "grad_norm": 1.0960683097540216, + "learning_rate": 1.977016965001817e-05, + "loss": 0.6508, + "step": 1186 + }, + { + "epoch": 0.1, + "grad_norm": 1.2142205404788615, + "learning_rate": 1.9769608169556314e-05, + "loss": 0.6288, + "step": 1187 + }, + { + "epoch": 0.1, + "grad_norm": 1.0610027884391453, + "learning_rate": 1.9769046012069273e-05, + "loss": 0.611, + "step": 1188 + }, + { + "epoch": 0.1, + "grad_norm": 1.0705797801046106, + "learning_rate": 1.9768483177596008e-05, + "loss": 0.6007, + "step": 1189 + }, + { + "epoch": 0.1, + "grad_norm": 0.9870195777939947, + "learning_rate": 1.9767919666175526e-05, + "loss": 0.579, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 1.0850603051142014, + "learning_rate": 1.976735547784687e-05, + "loss": 0.5486, + "step": 1191 + }, + { + "epoch": 0.1, + "grad_norm": 0.976803281526567, + "learning_rate": 1.976679061264915e-05, + "loss": 0.5985, + "step": 1192 + }, + { + "epoch": 0.1, + "grad_norm": 1.0568799468567787, + "learning_rate": 1.97662250706215e-05, + "loss": 0.5601, + "step": 1193 + }, + { + "epoch": 0.1, + "grad_norm": 1.0791769089869665, + "learning_rate": 1.9765658851803116e-05, + "loss": 0.593, + "step": 1194 + }, + { + "epoch": 0.1, + "grad_norm": 1.0301803244650218, + "learning_rate": 1.9765091956233235e-05, + "loss": 0.5781, + "step": 1195 + }, + { + "epoch": 0.1, + "grad_norm": 1.1232153943591479, + "learning_rate": 1.9764524383951147e-05, + "loss": 0.681, + "step": 1196 + }, + { + "epoch": 0.1, + "grad_norm": 0.978549446797339, + "learning_rate": 1.9763956134996176e-05, + "loss": 0.6041, + "step": 1197 + }, + { + "epoch": 0.1, + "grad_norm": 1.0988170899210346, + "learning_rate": 1.9763387209407706e-05, + "loss": 0.6253, + "step": 1198 + }, + { + "epoch": 0.1, + "grad_norm": 0.9992464023597344, + "learning_rate": 1.9762817607225163e-05, + "loss": 0.5952, + "step": 1199 + }, + { + "epoch": 0.1, + "grad_norm": 1.0648084957537778, + "learning_rate": 1.976224732848802e-05, + "loss": 0.6382, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 1.0094155962654219, + "learning_rate": 1.9761676373235797e-05, + "loss": 0.5395, + "step": 1201 + }, + { + "epoch": 0.1, + "grad_norm": 0.9813734130095451, + "learning_rate": 1.976110474150806e-05, + "loss": 0.6242, + "step": 1202 + }, + { + "epoch": 0.1, + "grad_norm": 0.9800418269518998, + "learning_rate": 1.976053243334442e-05, + "loss": 0.5816, + "step": 1203 + }, + { + "epoch": 0.1, + "grad_norm": 1.0461807540210366, + "learning_rate": 1.975995944878454e-05, + "loss": 0.6228, + "step": 1204 + }, + { + "epoch": 0.1, + "grad_norm": 1.1634591956582694, + "learning_rate": 1.9759385787868128e-05, + "loss": 0.642, + "step": 1205 + }, + { + "epoch": 0.1, + "grad_norm": 1.0139084190327168, + "learning_rate": 1.9758811450634936e-05, + "loss": 0.5179, + "step": 1206 + }, + { + "epoch": 0.1, + "grad_norm": 1.1264320315929353, + "learning_rate": 1.9758236437124768e-05, + "loss": 0.6243, + "step": 1207 + }, + { + "epoch": 0.1, + "grad_norm": 1.0325778222003517, + "learning_rate": 1.975766074737747e-05, + "loss": 0.5712, + "step": 1208 + }, + { + "epoch": 0.1, + "grad_norm": 1.0593457101808865, + "learning_rate": 1.975708438143294e-05, + "loss": 0.6367, + "step": 1209 + }, + { + "epoch": 0.1, + "grad_norm": 1.1923816644294367, + "learning_rate": 1.9756507339331115e-05, + "loss": 0.6276, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 0.9729967459849977, + "learning_rate": 1.9755929621111985e-05, + "loss": 0.6313, + "step": 1211 + }, + { + "epoch": 0.1, + "grad_norm": 1.0460689918701482, + "learning_rate": 1.9755351226815586e-05, + "loss": 0.6645, + "step": 1212 + }, + { + "epoch": 0.1, + "grad_norm": 0.9693482523850563, + "learning_rate": 1.9754772156482e-05, + "loss": 0.5295, + "step": 1213 + }, + { + "epoch": 0.1, + "grad_norm": 1.124750196318656, + "learning_rate": 1.9754192410151357e-05, + "loss": 0.5686, + "step": 1214 + }, + { + "epoch": 0.1, + "grad_norm": 1.073152082986142, + "learning_rate": 1.975361198786383e-05, + "loss": 0.6052, + "step": 1215 + }, + { + "epoch": 0.1, + "grad_norm": 1.023364933183271, + "learning_rate": 1.9753030889659644e-05, + "loss": 0.5647, + "step": 1216 + }, + { + "epoch": 0.1, + "grad_norm": 1.081775326478639, + "learning_rate": 1.975244911557907e-05, + "loss": 0.632, + "step": 1217 + }, + { + "epoch": 0.1, + "grad_norm": 1.1735100931930877, + "learning_rate": 1.9751866665662424e-05, + "loss": 0.5785, + "step": 1218 + }, + { + "epoch": 0.1, + "grad_norm": 1.0741437321863894, + "learning_rate": 1.9751283539950065e-05, + "loss": 0.6068, + "step": 1219 + }, + { + "epoch": 0.1, + "grad_norm": 1.0562432053843551, + "learning_rate": 1.9750699738482403e-05, + "loss": 0.636, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 1.0989013126993985, + "learning_rate": 1.9750115261299903e-05, + "loss": 0.6054, + "step": 1221 + }, + { + "epoch": 0.1, + "grad_norm": 1.0119053434260734, + "learning_rate": 1.9749530108443063e-05, + "loss": 0.6065, + "step": 1222 + }, + { + "epoch": 0.1, + "grad_norm": 1.079234448458603, + "learning_rate": 1.9748944279952433e-05, + "loss": 0.6835, + "step": 1223 + }, + { + "epoch": 0.1, + "grad_norm": 1.0915946403546992, + "learning_rate": 1.9748357775868615e-05, + "loss": 0.6293, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 1.0920504111740896, + "learning_rate": 1.9747770596232247e-05, + "loss": 0.7179, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 1.1510367657359035, + "learning_rate": 1.974718274108402e-05, + "loss": 0.6247, + "step": 1226 + }, + { + "epoch": 0.1, + "grad_norm": 0.9528179845476008, + "learning_rate": 1.974659421046468e-05, + "loss": 0.5738, + "step": 1227 + }, + { + "epoch": 0.1, + "grad_norm": 1.0980600804946157, + "learning_rate": 1.9746005004415004e-05, + "loss": 0.603, + "step": 1228 + }, + { + "epoch": 0.1, + "grad_norm": 1.112778681258694, + "learning_rate": 1.9745415122975825e-05, + "loss": 0.6356, + "step": 1229 + }, + { + "epoch": 0.1, + "grad_norm": 1.0704814670074079, + "learning_rate": 1.9744824566188027e-05, + "loss": 0.6157, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 1.0033594940133337, + "learning_rate": 1.9744233334092525e-05, + "loss": 0.6019, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 1.2024696465537366, + "learning_rate": 1.9743641426730297e-05, + "loss": 0.6621, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 0.9820245364504157, + "learning_rate": 1.9743048844142364e-05, + "loss": 0.6052, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 1.143158745985614, + "learning_rate": 1.9742455586369786e-05, + "loss": 0.6646, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 1.2942234573798506, + "learning_rate": 1.9741861653453672e-05, + "loss": 0.6221, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 1.2151223441956862, + "learning_rate": 1.9741267045435193e-05, + "loss": 0.6063, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 1.0487514310045238, + "learning_rate": 1.9740671762355548e-05, + "loss": 0.5818, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 1.0123848172684526, + "learning_rate": 1.9740075804255987e-05, + "loss": 0.6388, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 0.9570590630526808, + "learning_rate": 1.9739479171177816e-05, + "loss": 0.6237, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 1.156696361880616, + "learning_rate": 1.9738881863162372e-05, + "loss": 0.7091, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 1.0897508472632145, + "learning_rate": 1.973828388025106e-05, + "loss": 0.6423, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 1.1737063919986686, + "learning_rate": 1.9737685222485307e-05, + "loss": 0.6774, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 1.0938566735390922, + "learning_rate": 1.9737085889906608e-05, + "loss": 0.6219, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 1.1437393726188527, + "learning_rate": 1.9736485882556495e-05, + "loss": 0.6295, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 1.0791457714353598, + "learning_rate": 1.9735885200476545e-05, + "loss": 0.6326, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 1.105337478068499, + "learning_rate": 1.9735283843708384e-05, + "loss": 0.5601, + "step": 1246 + }, + { + "epoch": 0.1, + "grad_norm": 0.9900255732554479, + "learning_rate": 1.973468181229369e-05, + "loss": 0.612, + "step": 1247 + }, + { + "epoch": 0.1, + "grad_norm": 0.9638114049500947, + "learning_rate": 1.9734079106274185e-05, + "loss": 0.5639, + "step": 1248 + }, + { + "epoch": 0.1, + "grad_norm": 1.155607623256461, + "learning_rate": 1.9733475725691627e-05, + "loss": 0.656, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 0.8552961484865126, + "learning_rate": 1.9732871670587835e-05, + "loss": 0.5105, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 1.010264435160565, + "learning_rate": 1.973226694100467e-05, + "loss": 0.6195, + "step": 1251 + }, + { + "epoch": 0.1, + "grad_norm": 1.7056228909411109, + "learning_rate": 1.9731661536984038e-05, + "loss": 0.4986, + "step": 1252 + }, + { + "epoch": 0.1, + "grad_norm": 0.9955221005221241, + "learning_rate": 1.9731055458567895e-05, + "loss": 0.6386, + "step": 1253 + }, + { + "epoch": 0.1, + "grad_norm": 1.3420947012150832, + "learning_rate": 1.973044870579824e-05, + "loss": 0.5083, + "step": 1254 + }, + { + "epoch": 0.1, + "grad_norm": 1.0090464596512863, + "learning_rate": 1.972984127871712e-05, + "loss": 0.6675, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 1.0806608477579371, + "learning_rate": 1.972923317736663e-05, + "loss": 0.6017, + "step": 1256 + }, + { + "epoch": 0.1, + "grad_norm": 1.12514539614613, + "learning_rate": 1.9728624401788908e-05, + "loss": 0.625, + "step": 1257 + }, + { + "epoch": 0.1, + "grad_norm": 1.0903568183651684, + "learning_rate": 1.972801495202615e-05, + "loss": 0.6665, + "step": 1258 + }, + { + "epoch": 0.1, + "grad_norm": 0.9965450584629888, + "learning_rate": 1.972740482812058e-05, + "loss": 0.589, + "step": 1259 + }, + { + "epoch": 0.1, + "grad_norm": 1.0261512414679927, + "learning_rate": 1.9726794030114484e-05, + "loss": 0.4879, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 1.2427966939144413, + "learning_rate": 1.972618255805019e-05, + "loss": 0.6667, + "step": 1261 + }, + { + "epoch": 0.1, + "grad_norm": 1.101568478078574, + "learning_rate": 1.9725570411970074e-05, + "loss": 0.6222, + "step": 1262 + }, + { + "epoch": 0.1, + "grad_norm": 1.0828180407176138, + "learning_rate": 1.972495759191655e-05, + "loss": 0.6233, + "step": 1263 + }, + { + "epoch": 0.1, + "grad_norm": 1.0487641243984773, + "learning_rate": 1.9724344097932097e-05, + "loss": 0.6602, + "step": 1264 + }, + { + "epoch": 0.1, + "grad_norm": 1.1526684403152978, + "learning_rate": 1.972372993005922e-05, + "loss": 0.6448, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 1.029942884679848, + "learning_rate": 1.9723115088340483e-05, + "loss": 0.6205, + "step": 1266 + }, + { + "epoch": 0.1, + "grad_norm": 1.0887794032526703, + "learning_rate": 1.9722499572818496e-05, + "loss": 0.5804, + "step": 1267 + }, + { + "epoch": 0.1, + "grad_norm": 1.315334282162586, + "learning_rate": 1.972188338353591e-05, + "loss": 0.6214, + "step": 1268 + }, + { + "epoch": 0.1, + "grad_norm": 1.1271196813059783, + "learning_rate": 1.9721266520535435e-05, + "loss": 0.6682, + "step": 1269 + }, + { + "epoch": 0.1, + "grad_norm": 1.163495523822124, + "learning_rate": 1.972064898385981e-05, + "loss": 0.6164, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 1.0811227308269724, + "learning_rate": 1.972003077355183e-05, + "loss": 0.6423, + "step": 1271 + }, + { + "epoch": 0.1, + "grad_norm": 1.0236819263499997, + "learning_rate": 1.971941188965434e-05, + "loss": 0.6196, + "step": 1272 + }, + { + "epoch": 0.1, + "grad_norm": 1.0352641260022146, + "learning_rate": 1.971879233221023e-05, + "loss": 0.6194, + "step": 1273 + }, + { + "epoch": 0.1, + "grad_norm": 1.0848684317413644, + "learning_rate": 1.971817210126243e-05, + "loss": 0.6133, + "step": 1274 + }, + { + "epoch": 0.1, + "grad_norm": 1.0566291958722285, + "learning_rate": 1.9717551196853925e-05, + "loss": 0.5973, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 1.0355223511760434, + "learning_rate": 1.9716929619027734e-05, + "loss": 0.5948, + "step": 1276 + }, + { + "epoch": 0.1, + "grad_norm": 1.1732139139098121, + "learning_rate": 1.971630736782695e-05, + "loss": 0.7294, + "step": 1277 + }, + { + "epoch": 0.1, + "grad_norm": 0.9992577370321353, + "learning_rate": 1.9715684443294677e-05, + "loss": 0.6432, + "step": 1278 + }, + { + "epoch": 0.1, + "grad_norm": 1.0221808293116248, + "learning_rate": 1.971506084547409e-05, + "loss": 0.5805, + "step": 1279 + }, + { + "epoch": 0.1, + "grad_norm": 1.0768858195353592, + "learning_rate": 1.9714436574408408e-05, + "loss": 0.6353, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 0.9226845697587781, + "learning_rate": 1.9713811630140885e-05, + "loss": 0.592, + "step": 1281 + }, + { + "epoch": 0.1, + "grad_norm": 1.0268897107440507, + "learning_rate": 1.971318601271483e-05, + "loss": 0.6324, + "step": 1282 + }, + { + "epoch": 0.1, + "grad_norm": 1.0428076161671564, + "learning_rate": 1.9712559722173602e-05, + "loss": 0.6377, + "step": 1283 + }, + { + "epoch": 0.1, + "grad_norm": 1.0859639624879796, + "learning_rate": 1.9711932758560604e-05, + "loss": 0.6022, + "step": 1284 + }, + { + "epoch": 0.1, + "grad_norm": 1.0210349956207485, + "learning_rate": 1.971130512191928e-05, + "loss": 0.6474, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 1.0599033695812297, + "learning_rate": 1.971067681229312e-05, + "loss": 0.6067, + "step": 1286 + }, + { + "epoch": 0.1, + "grad_norm": 1.120979412441857, + "learning_rate": 1.971004782972567e-05, + "loss": 0.6269, + "step": 1287 + }, + { + "epoch": 0.1, + "grad_norm": 1.2802519696010277, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.547, + "step": 1288 + }, + { + "epoch": 0.1, + "grad_norm": 1.0022250775277506, + "learning_rate": 1.9708787845941306e-05, + "loss": 0.6117, + "step": 1289 + }, + { + "epoch": 0.1, + "grad_norm": 1.0787715179894646, + "learning_rate": 1.97081568448117e-05, + "loss": 0.6176, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 1.0037129554011988, + "learning_rate": 1.970752517091544e-05, + "loss": 0.6323, + "step": 1291 + }, + { + "epoch": 0.11, + "grad_norm": 1.0334591262363808, + "learning_rate": 1.9706892824296297e-05, + "loss": 0.6304, + "step": 1292 + }, + { + "epoch": 0.11, + "grad_norm": 1.0374628218351345, + "learning_rate": 1.9706259804998093e-05, + "loss": 0.4893, + "step": 1293 + }, + { + "epoch": 0.11, + "grad_norm": 0.9865355112514083, + "learning_rate": 1.970562611306469e-05, + "loss": 0.5467, + "step": 1294 + }, + { + "epoch": 0.11, + "grad_norm": 1.116948900462893, + "learning_rate": 1.9704991748540004e-05, + "loss": 0.611, + "step": 1295 + }, + { + "epoch": 0.11, + "grad_norm": 1.084751087051627, + "learning_rate": 1.9704356711468e-05, + "loss": 0.644, + "step": 1296 + }, + { + "epoch": 0.11, + "grad_norm": 1.1208899298642612, + "learning_rate": 1.9703721001892685e-05, + "loss": 0.6399, + "step": 1297 + }, + { + "epoch": 0.11, + "grad_norm": 1.0538062893584283, + "learning_rate": 1.9703084619858112e-05, + "loss": 0.6364, + "step": 1298 + }, + { + "epoch": 0.11, + "grad_norm": 0.9944067068990796, + "learning_rate": 1.9702447565408382e-05, + "loss": 0.6172, + "step": 1299 + }, + { + "epoch": 0.11, + "grad_norm": 1.022208238295205, + "learning_rate": 1.970180983858764e-05, + "loss": 0.6118, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 1.0199055126077872, + "learning_rate": 1.970117143944008e-05, + "loss": 0.6156, + "step": 1301 + }, + { + "epoch": 0.11, + "grad_norm": 0.982700829054324, + "learning_rate": 1.9700532368009947e-05, + "loss": 0.5647, + "step": 1302 + }, + { + "epoch": 0.11, + "grad_norm": 1.0121386373965509, + "learning_rate": 1.9699892624341527e-05, + "loss": 0.604, + "step": 1303 + }, + { + "epoch": 0.11, + "grad_norm": 0.9843522129415369, + "learning_rate": 1.9699252208479147e-05, + "loss": 0.4893, + "step": 1304 + }, + { + "epoch": 0.11, + "grad_norm": 1.0805374392292688, + "learning_rate": 1.9698611120467196e-05, + "loss": 0.6397, + "step": 1305 + }, + { + "epoch": 0.11, + "grad_norm": 1.0497480677687236, + "learning_rate": 1.9697969360350098e-05, + "loss": 0.5588, + "step": 1306 + }, + { + "epoch": 0.11, + "grad_norm": 1.0958726599081376, + "learning_rate": 1.9697326928172323e-05, + "loss": 0.6535, + "step": 1307 + }, + { + "epoch": 0.11, + "grad_norm": 0.9666448228023902, + "learning_rate": 1.9696683823978392e-05, + "loss": 0.576, + "step": 1308 + }, + { + "epoch": 0.11, + "grad_norm": 0.9946083522677421, + "learning_rate": 1.9696040047812874e-05, + "loss": 0.6022, + "step": 1309 + }, + { + "epoch": 0.11, + "grad_norm": 1.0237237273450226, + "learning_rate": 1.9695395599720385e-05, + "loss": 0.6396, + "step": 1310 + }, + { + "epoch": 0.11, + "grad_norm": 1.0853962147117462, + "learning_rate": 1.9694750479745573e-05, + "loss": 0.6019, + "step": 1311 + }, + { + "epoch": 0.11, + "grad_norm": 1.002745208425993, + "learning_rate": 1.969410468793316e-05, + "loss": 0.576, + "step": 1312 + }, + { + "epoch": 0.11, + "grad_norm": 1.033603915650126, + "learning_rate": 1.9693458224327886e-05, + "loss": 0.5707, + "step": 1313 + }, + { + "epoch": 0.11, + "grad_norm": 1.0369646792701839, + "learning_rate": 1.9692811088974556e-05, + "loss": 0.5698, + "step": 1314 + }, + { + "epoch": 0.11, + "grad_norm": 1.0388865698456857, + "learning_rate": 1.9692163281918016e-05, + "loss": 0.623, + "step": 1315 + }, + { + "epoch": 0.11, + "grad_norm": 0.9886888943099889, + "learning_rate": 1.9691514803203157e-05, + "loss": 0.6026, + "step": 1316 + }, + { + "epoch": 0.11, + "grad_norm": 1.0760978377547694, + "learning_rate": 1.969086565287492e-05, + "loss": 0.6156, + "step": 1317 + }, + { + "epoch": 0.11, + "grad_norm": 1.0024953962331788, + "learning_rate": 1.9690215830978286e-05, + "loss": 0.6297, + "step": 1318 + }, + { + "epoch": 0.11, + "grad_norm": 1.092907645473543, + "learning_rate": 1.968956533755829e-05, + "loss": 0.6497, + "step": 1319 + }, + { + "epoch": 0.11, + "grad_norm": 1.0441774948342657, + "learning_rate": 1.968891417266001e-05, + "loss": 0.6091, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 0.9898586379948117, + "learning_rate": 1.9688262336328576e-05, + "loss": 0.5755, + "step": 1321 + }, + { + "epoch": 0.11, + "grad_norm": 0.9397873413080308, + "learning_rate": 1.9687609828609156e-05, + "loss": 0.6226, + "step": 1322 + }, + { + "epoch": 0.11, + "grad_norm": 1.0412928068520495, + "learning_rate": 1.9686956649546964e-05, + "loss": 0.5881, + "step": 1323 + }, + { + "epoch": 0.11, + "grad_norm": 0.9954215004050639, + "learning_rate": 1.9686302799187272e-05, + "loss": 0.6102, + "step": 1324 + }, + { + "epoch": 0.11, + "grad_norm": 1.1638641323105938, + "learning_rate": 1.9685648277575385e-05, + "loss": 0.6323, + "step": 1325 + }, + { + "epoch": 0.11, + "grad_norm": 0.9463279359139796, + "learning_rate": 1.9684993084756664e-05, + "loss": 0.5582, + "step": 1326 + }, + { + "epoch": 0.11, + "grad_norm": 1.071442865518808, + "learning_rate": 1.9684337220776514e-05, + "loss": 0.5199, + "step": 1327 + }, + { + "epoch": 0.11, + "grad_norm": 1.0170811281607908, + "learning_rate": 1.9683680685680382e-05, + "loss": 0.5709, + "step": 1328 + }, + { + "epoch": 0.11, + "grad_norm": 0.9828891043357619, + "learning_rate": 1.9683023479513768e-05, + "loss": 0.6285, + "step": 1329 + }, + { + "epoch": 0.11, + "grad_norm": 0.9806341501252827, + "learning_rate": 1.968236560232222e-05, + "loss": 0.6161, + "step": 1330 + }, + { + "epoch": 0.11, + "grad_norm": 1.0613457807345021, + "learning_rate": 1.968170705415132e-05, + "loss": 0.6188, + "step": 1331 + }, + { + "epoch": 0.11, + "grad_norm": 1.0685738980115753, + "learning_rate": 1.9681047835046708e-05, + "loss": 0.6087, + "step": 1332 + }, + { + "epoch": 0.11, + "grad_norm": 0.9840438544734612, + "learning_rate": 1.9680387945054073e-05, + "loss": 0.5443, + "step": 1333 + }, + { + "epoch": 0.11, + "grad_norm": 1.140764435462584, + "learning_rate": 1.9679727384219137e-05, + "loss": 0.6209, + "step": 1334 + }, + { + "epoch": 0.11, + "grad_norm": 1.0182327607866783, + "learning_rate": 1.967906615258768e-05, + "loss": 0.6131, + "step": 1335 + }, + { + "epoch": 0.11, + "grad_norm": 1.0021196334036369, + "learning_rate": 1.9678404250205522e-05, + "loss": 0.6491, + "step": 1336 + }, + { + "epoch": 0.11, + "grad_norm": 0.9975411361548076, + "learning_rate": 1.9677741677118536e-05, + "loss": 0.5964, + "step": 1337 + }, + { + "epoch": 0.11, + "grad_norm": 0.924058375002181, + "learning_rate": 1.9677078433372635e-05, + "loss": 0.6155, + "step": 1338 + }, + { + "epoch": 0.11, + "grad_norm": 1.1148488713326163, + "learning_rate": 1.9676414519013782e-05, + "loss": 0.6719, + "step": 1339 + }, + { + "epoch": 0.11, + "grad_norm": 1.0508767522012055, + "learning_rate": 1.9675749934087988e-05, + "loss": 0.6367, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 1.0723392595965906, + "learning_rate": 1.9675084678641303e-05, + "loss": 0.615, + "step": 1341 + }, + { + "epoch": 0.11, + "grad_norm": 1.0466812905500391, + "learning_rate": 1.9674418752719835e-05, + "loss": 0.6187, + "step": 1342 + }, + { + "epoch": 0.11, + "grad_norm": 1.0192894391011744, + "learning_rate": 1.9673752156369726e-05, + "loss": 0.6268, + "step": 1343 + }, + { + "epoch": 0.11, + "grad_norm": 0.9594859291872142, + "learning_rate": 1.9673084889637172e-05, + "loss": 0.5834, + "step": 1344 + }, + { + "epoch": 0.11, + "grad_norm": 1.0881934969983387, + "learning_rate": 1.9672416952568416e-05, + "loss": 0.6063, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 1.1647734316504663, + "learning_rate": 1.9671748345209746e-05, + "loss": 0.6656, + "step": 1346 + }, + { + "epoch": 0.11, + "grad_norm": 1.1321357566390124, + "learning_rate": 1.9671079067607495e-05, + "loss": 0.683, + "step": 1347 + }, + { + "epoch": 0.11, + "grad_norm": 1.0163862126726073, + "learning_rate": 1.9670409119808042e-05, + "loss": 0.6353, + "step": 1348 + }, + { + "epoch": 0.11, + "grad_norm": 1.148257709769176, + "learning_rate": 1.9669738501857812e-05, + "loss": 0.6746, + "step": 1349 + }, + { + "epoch": 0.11, + "grad_norm": 1.0982853234518442, + "learning_rate": 1.9669067213803287e-05, + "loss": 0.6424, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 1.126143133493401, + "learning_rate": 1.9668395255690975e-05, + "loss": 0.686, + "step": 1351 + }, + { + "epoch": 0.11, + "grad_norm": 0.9792936415990227, + "learning_rate": 1.966772262756745e-05, + "loss": 0.6292, + "step": 1352 + }, + { + "epoch": 0.11, + "grad_norm": 1.1548354716465838, + "learning_rate": 1.966704932947932e-05, + "loss": 0.6186, + "step": 1353 + }, + { + "epoch": 0.11, + "grad_norm": 1.000836433337724, + "learning_rate": 1.966637536147325e-05, + "loss": 0.5942, + "step": 1354 + }, + { + "epoch": 0.11, + "grad_norm": 1.0117949869053557, + "learning_rate": 1.966570072359594e-05, + "loss": 0.5791, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 1.054083198075337, + "learning_rate": 1.966502541589414e-05, + "loss": 0.5827, + "step": 1356 + }, + { + "epoch": 0.11, + "grad_norm": 1.1281975366177772, + "learning_rate": 1.9664349438414656e-05, + "loss": 0.6375, + "step": 1357 + }, + { + "epoch": 0.11, + "grad_norm": 1.0217925796235703, + "learning_rate": 1.9663672791204328e-05, + "loss": 0.6147, + "step": 1358 + }, + { + "epoch": 0.11, + "grad_norm": 1.1560400317141977, + "learning_rate": 1.9662995474310042e-05, + "loss": 0.6497, + "step": 1359 + }, + { + "epoch": 0.11, + "grad_norm": 1.0824588534286652, + "learning_rate": 1.9662317487778745e-05, + "loss": 0.6342, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 1.066511214990864, + "learning_rate": 1.9661638831657414e-05, + "loss": 0.6056, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 1.1489912524465191, + "learning_rate": 1.9660959505993086e-05, + "loss": 0.6158, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 1.0074515766780978, + "learning_rate": 1.966027951083283e-05, + "loss": 0.6348, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 1.0830137168473368, + "learning_rate": 1.9659598846223775e-05, + "loss": 0.6652, + "step": 1364 + }, + { + "epoch": 0.11, + "grad_norm": 0.9800063722365412, + "learning_rate": 1.9658917512213084e-05, + "loss": 0.5851, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 0.9938348549484941, + "learning_rate": 1.9658235508847982e-05, + "loss": 0.6333, + "step": 1366 + }, + { + "epoch": 0.11, + "grad_norm": 0.9375863809230456, + "learning_rate": 1.9657552836175725e-05, + "loss": 0.6282, + "step": 1367 + }, + { + "epoch": 0.11, + "grad_norm": 1.2414875465032795, + "learning_rate": 1.965686949424362e-05, + "loss": 0.6691, + "step": 1368 + }, + { + "epoch": 0.11, + "grad_norm": 0.9846915989441325, + "learning_rate": 1.9656185483099027e-05, + "loss": 0.6228, + "step": 1369 + }, + { + "epoch": 0.11, + "grad_norm": 0.9855716559708518, + "learning_rate": 1.9655500802789342e-05, + "loss": 0.5688, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 0.9690623160434885, + "learning_rate": 1.9654815453362016e-05, + "loss": 0.5783, + "step": 1371 + }, + { + "epoch": 0.11, + "grad_norm": 1.0343685845795267, + "learning_rate": 1.9654129434864545e-05, + "loss": 0.6056, + "step": 1372 + }, + { + "epoch": 0.11, + "grad_norm": 1.0140780390900874, + "learning_rate": 1.965344274734447e-05, + "loss": 0.5836, + "step": 1373 + }, + { + "epoch": 0.11, + "grad_norm": 1.0009332862499594, + "learning_rate": 1.965275539084937e-05, + "loss": 0.6123, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 1.016702561351821, + "learning_rate": 1.9652067365426887e-05, + "loss": 0.5728, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 1.0162934957404974, + "learning_rate": 1.96513786711247e-05, + "loss": 0.6103, + "step": 1376 + }, + { + "epoch": 0.11, + "grad_norm": 1.0463167905694821, + "learning_rate": 1.9650689307990522e-05, + "loss": 0.5792, + "step": 1377 + }, + { + "epoch": 0.11, + "grad_norm": 1.0308461139411522, + "learning_rate": 1.964999927607214e-05, + "loss": 0.5638, + "step": 1378 + }, + { + "epoch": 0.11, + "grad_norm": 1.0748914011352861, + "learning_rate": 1.9649308575417372e-05, + "loss": 0.6312, + "step": 1379 + }, + { + "epoch": 0.11, + "grad_norm": 1.0594179627576494, + "learning_rate": 1.9648617206074073e-05, + "loss": 0.624, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 0.9715447357154672, + "learning_rate": 1.9647925168090162e-05, + "loss": 0.5731, + "step": 1381 + }, + { + "epoch": 0.11, + "grad_norm": 1.0014903173207736, + "learning_rate": 1.9647232461513597e-05, + "loss": 0.6465, + "step": 1382 + }, + { + "epoch": 0.11, + "grad_norm": 1.0461118034809385, + "learning_rate": 1.9646539086392376e-05, + "loss": 0.6042, + "step": 1383 + }, + { + "epoch": 0.11, + "grad_norm": 1.1278315412347864, + "learning_rate": 1.9645845042774555e-05, + "loss": 0.6311, + "step": 1384 + }, + { + "epoch": 0.11, + "grad_norm": 0.9916310533107556, + "learning_rate": 1.9645150330708225e-05, + "loss": 0.5808, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 1.1730449228013495, + "learning_rate": 1.9644454950241532e-05, + "loss": 0.6499, + "step": 1386 + }, + { + "epoch": 0.11, + "grad_norm": 1.0680519469964425, + "learning_rate": 1.9643758901422673e-05, + "loss": 0.6125, + "step": 1387 + }, + { + "epoch": 0.11, + "grad_norm": 1.0495618373555042, + "learning_rate": 1.964306218429987e-05, + "loss": 0.6534, + "step": 1388 + }, + { + "epoch": 0.11, + "grad_norm": 0.991210791391256, + "learning_rate": 1.964236479892141e-05, + "loss": 0.5843, + "step": 1389 + }, + { + "epoch": 0.11, + "grad_norm": 1.1017210183429609, + "learning_rate": 1.9641666745335626e-05, + "loss": 0.6282, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 1.1241495120151075, + "learning_rate": 1.9640968023590887e-05, + "loss": 0.6856, + "step": 1391 + }, + { + "epoch": 0.11, + "grad_norm": 1.02748779670214, + "learning_rate": 1.9640268633735616e-05, + "loss": 0.587, + "step": 1392 + }, + { + "epoch": 0.11, + "grad_norm": 1.1048813470548953, + "learning_rate": 1.963956857581828e-05, + "loss": 0.6463, + "step": 1393 + }, + { + "epoch": 0.11, + "grad_norm": 0.9678613319611744, + "learning_rate": 1.963886784988739e-05, + "loss": 0.6009, + "step": 1394 + }, + { + "epoch": 0.11, + "grad_norm": 0.9820811076689967, + "learning_rate": 1.9638166455991508e-05, + "loss": 0.6384, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 1.0235020554571306, + "learning_rate": 1.963746439417924e-05, + "loss": 0.6096, + "step": 1396 + }, + { + "epoch": 0.11, + "grad_norm": 1.0644057126732112, + "learning_rate": 1.963676166449924e-05, + "loss": 0.6501, + "step": 1397 + }, + { + "epoch": 0.11, + "grad_norm": 1.0743452283499784, + "learning_rate": 1.9636058267000203e-05, + "loss": 0.6937, + "step": 1398 + }, + { + "epoch": 0.11, + "grad_norm": 1.0666403475309791, + "learning_rate": 1.9635354201730874e-05, + "loss": 0.6369, + "step": 1399 + }, + { + "epoch": 0.11, + "grad_norm": 1.0795702053351024, + "learning_rate": 1.9634649468740048e-05, + "loss": 0.5609, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 1.0842891736824818, + "learning_rate": 1.963394406807656e-05, + "loss": 0.6054, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 1.232357196608936, + "learning_rate": 1.963323799978929e-05, + "loss": 0.7096, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 1.0627038461122897, + "learning_rate": 1.9632531263927173e-05, + "loss": 0.5423, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 1.158864262721317, + "learning_rate": 1.963182386053918e-05, + "loss": 0.6784, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 1.1047999928694838, + "learning_rate": 1.9631115789674343e-05, + "loss": 0.5742, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 1.0348252416708916, + "learning_rate": 1.963040705138172e-05, + "loss": 0.6467, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 1.100756898992012, + "learning_rate": 1.9629697645710432e-05, + "loss": 0.6564, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 1.0134805396263211, + "learning_rate": 1.962898757270964e-05, + "loss": 0.6013, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 1.1551860789638277, + "learning_rate": 1.9628276832428548e-05, + "loss": 0.6122, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 1.0523592965335977, + "learning_rate": 1.962756542491641e-05, + "loss": 0.6456, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 1.0072420388751542, + "learning_rate": 1.9626853350222535e-05, + "loss": 0.6206, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 0.9389508294024854, + "learning_rate": 1.962614060839626e-05, + "loss": 0.5318, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 0.9785971976060436, + "learning_rate": 1.9625427199486973e-05, + "loss": 0.5988, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 1.0565134094275008, + "learning_rate": 1.962471312354412e-05, + "loss": 0.6017, + "step": 1414 + }, + { + "epoch": 0.12, + "grad_norm": 1.0769950453386896, + "learning_rate": 1.9623998380617187e-05, + "loss": 0.5855, + "step": 1415 + }, + { + "epoch": 0.12, + "grad_norm": 1.0424660062990199, + "learning_rate": 1.9623282970755702e-05, + "loss": 0.6113, + "step": 1416 + }, + { + "epoch": 0.12, + "grad_norm": 0.9809188594461099, + "learning_rate": 1.9622566894009247e-05, + "loss": 0.6357, + "step": 1417 + }, + { + "epoch": 0.12, + "grad_norm": 1.0366035114434953, + "learning_rate": 1.962185015042744e-05, + "loss": 0.5987, + "step": 1418 + }, + { + "epoch": 0.12, + "grad_norm": 1.0288494452139783, + "learning_rate": 1.962113274005995e-05, + "loss": 0.5874, + "step": 1419 + }, + { + "epoch": 0.12, + "grad_norm": 1.0099375316753343, + "learning_rate": 1.9620414662956494e-05, + "loss": 0.5621, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 0.9666975626620397, + "learning_rate": 1.9619695919166836e-05, + "loss": 0.5965, + "step": 1421 + }, + { + "epoch": 0.12, + "grad_norm": 0.9862166627261453, + "learning_rate": 1.9618976508740782e-05, + "loss": 0.5448, + "step": 1422 + }, + { + "epoch": 0.12, + "grad_norm": 0.8706924255202901, + "learning_rate": 1.961825643172819e-05, + "loss": 0.4272, + "step": 1423 + }, + { + "epoch": 0.12, + "grad_norm": 0.9861330858786174, + "learning_rate": 1.961753568817896e-05, + "loss": 0.6266, + "step": 1424 + }, + { + "epoch": 0.12, + "grad_norm": 0.9746193528706869, + "learning_rate": 1.9616814278143038e-05, + "loss": 0.6087, + "step": 1425 + }, + { + "epoch": 0.12, + "grad_norm": 1.0273042877502343, + "learning_rate": 1.9616092201670415e-05, + "loss": 0.5207, + "step": 1426 + }, + { + "epoch": 0.12, + "grad_norm": 0.9975075185558374, + "learning_rate": 1.961536945881113e-05, + "loss": 0.5777, + "step": 1427 + }, + { + "epoch": 0.12, + "grad_norm": 1.074244733642685, + "learning_rate": 1.9614646049615273e-05, + "loss": 0.6868, + "step": 1428 + }, + { + "epoch": 0.12, + "grad_norm": 1.019900271457737, + "learning_rate": 1.961392197413297e-05, + "loss": 0.5854, + "step": 1429 + }, + { + "epoch": 0.12, + "grad_norm": 1.0169010906420888, + "learning_rate": 1.9613197232414405e-05, + "loss": 0.6737, + "step": 1430 + }, + { + "epoch": 0.12, + "grad_norm": 1.0616525197387021, + "learning_rate": 1.96124718245098e-05, + "loss": 0.6442, + "step": 1431 + }, + { + "epoch": 0.12, + "grad_norm": 1.0010044272938432, + "learning_rate": 1.961174575046942e-05, + "loss": 0.5957, + "step": 1432 + }, + { + "epoch": 0.12, + "grad_norm": 1.0575345130965779, + "learning_rate": 1.9611019010343585e-05, + "loss": 0.5818, + "step": 1433 + }, + { + "epoch": 0.12, + "grad_norm": 1.1147214676236437, + "learning_rate": 1.9610291604182658e-05, + "loss": 0.6263, + "step": 1434 + }, + { + "epoch": 0.12, + "grad_norm": 1.1387158998687177, + "learning_rate": 1.960956353203705e-05, + "loss": 0.6204, + "step": 1435 + }, + { + "epoch": 0.12, + "grad_norm": 1.2318473099005043, + "learning_rate": 1.960883479395721e-05, + "loss": 0.5864, + "step": 1436 + }, + { + "epoch": 0.12, + "grad_norm": 1.0377594896122029, + "learning_rate": 1.9608105389993644e-05, + "loss": 0.5611, + "step": 1437 + }, + { + "epoch": 0.12, + "grad_norm": 0.9970888793455268, + "learning_rate": 1.9607375320196892e-05, + "loss": 0.6519, + "step": 1438 + }, + { + "epoch": 0.12, + "grad_norm": 1.0175219908144566, + "learning_rate": 1.960664458461756e-05, + "loss": 0.6332, + "step": 1439 + }, + { + "epoch": 0.12, + "grad_norm": 1.0990453693654425, + "learning_rate": 1.9605913183306272e-05, + "loss": 0.4677, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 1.1027567360059916, + "learning_rate": 1.9605181116313725e-05, + "loss": 0.641, + "step": 1441 + }, + { + "epoch": 0.12, + "grad_norm": 0.9380109470025033, + "learning_rate": 1.9604448383690644e-05, + "loss": 0.6192, + "step": 1442 + }, + { + "epoch": 0.12, + "grad_norm": 0.9869363044160382, + "learning_rate": 1.9603714985487813e-05, + "loss": 0.6524, + "step": 1443 + }, + { + "epoch": 0.12, + "grad_norm": 1.013485737719734, + "learning_rate": 1.9602980921756046e-05, + "loss": 0.5573, + "step": 1444 + }, + { + "epoch": 0.12, + "grad_norm": 0.9702032466898539, + "learning_rate": 1.9602246192546224e-05, + "loss": 0.6087, + "step": 1445 + }, + { + "epoch": 0.12, + "grad_norm": 1.1239818572944578, + "learning_rate": 1.9601510797909257e-05, + "loss": 0.6198, + "step": 1446 + }, + { + "epoch": 0.12, + "grad_norm": 1.0349287078717646, + "learning_rate": 1.9600774737896106e-05, + "loss": 0.6169, + "step": 1447 + }, + { + "epoch": 0.12, + "grad_norm": 1.0023446170801873, + "learning_rate": 1.960003801255778e-05, + "loss": 0.5426, + "step": 1448 + }, + { + "epoch": 0.12, + "grad_norm": 1.0838020795025674, + "learning_rate": 1.959930062194534e-05, + "loss": 0.6661, + "step": 1449 + }, + { + "epoch": 0.12, + "grad_norm": 1.1445880774155552, + "learning_rate": 1.959856256610988e-05, + "loss": 0.6241, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 1.0756990406078983, + "learning_rate": 1.959782384510255e-05, + "loss": 0.596, + "step": 1451 + }, + { + "epoch": 0.12, + "grad_norm": 1.0238726201838086, + "learning_rate": 1.959708445897454e-05, + "loss": 0.6427, + "step": 1452 + }, + { + "epoch": 0.12, + "grad_norm": 1.1307731133424648, + "learning_rate": 1.9596344407777085e-05, + "loss": 0.643, + "step": 1453 + }, + { + "epoch": 0.12, + "grad_norm": 1.0285910561385603, + "learning_rate": 1.9595603691561477e-05, + "loss": 0.6756, + "step": 1454 + }, + { + "epoch": 0.12, + "grad_norm": 0.9949420844408791, + "learning_rate": 1.9594862310379046e-05, + "loss": 0.6079, + "step": 1455 + }, + { + "epoch": 0.12, + "grad_norm": 1.040586161086886, + "learning_rate": 1.959412026428117e-05, + "loss": 0.5854, + "step": 1456 + }, + { + "epoch": 0.12, + "grad_norm": 0.9126335616675043, + "learning_rate": 1.959337755331926e-05, + "loss": 0.5551, + "step": 1457 + }, + { + "epoch": 0.12, + "grad_norm": 1.0176160660649483, + "learning_rate": 1.9592634177544803e-05, + "loss": 0.5906, + "step": 1458 + }, + { + "epoch": 0.12, + "grad_norm": 1.086794686195696, + "learning_rate": 1.9591890137009308e-05, + "loss": 0.6345, + "step": 1459 + }, + { + "epoch": 0.12, + "grad_norm": 1.0365211797145388, + "learning_rate": 1.9591145431764327e-05, + "loss": 0.6051, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 1.1849633403345523, + "learning_rate": 1.959040006186148e-05, + "loss": 0.6648, + "step": 1461 + }, + { + "epoch": 0.12, + "grad_norm": 1.0349945536963618, + "learning_rate": 1.9589654027352412e-05, + "loss": 0.6387, + "step": 1462 + }, + { + "epoch": 0.12, + "grad_norm": 0.9929411170861682, + "learning_rate": 1.958890732828883e-05, + "loss": 0.5947, + "step": 1463 + }, + { + "epoch": 0.12, + "grad_norm": 0.9923671261831932, + "learning_rate": 1.9588159964722474e-05, + "loss": 0.5977, + "step": 1464 + }, + { + "epoch": 0.12, + "grad_norm": 1.0723403570973749, + "learning_rate": 1.9587411936705135e-05, + "loss": 0.6061, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 1.1309137191054779, + "learning_rate": 1.9586663244288655e-05, + "loss": 0.6172, + "step": 1466 + }, + { + "epoch": 0.12, + "grad_norm": 1.0228119546392276, + "learning_rate": 1.9585913887524914e-05, + "loss": 0.5978, + "step": 1467 + }, + { + "epoch": 0.12, + "grad_norm": 1.055575387496972, + "learning_rate": 1.9585163866465847e-05, + "loss": 0.6854, + "step": 1468 + }, + { + "epoch": 0.12, + "grad_norm": 1.0321171852377182, + "learning_rate": 1.958441318116342e-05, + "loss": 0.6108, + "step": 1469 + }, + { + "epoch": 0.12, + "grad_norm": 0.9265607906492737, + "learning_rate": 1.9583661831669664e-05, + "loss": 0.5967, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 1.1081882645274406, + "learning_rate": 1.9582909818036648e-05, + "loss": 0.5702, + "step": 1471 + }, + { + "epoch": 0.12, + "grad_norm": 1.03719923918571, + "learning_rate": 1.9582157140316472e-05, + "loss": 0.5178, + "step": 1472 + }, + { + "epoch": 0.12, + "grad_norm": 1.119059131475791, + "learning_rate": 1.9581403798561314e-05, + "loss": 0.5767, + "step": 1473 + }, + { + "epoch": 0.12, + "grad_norm": 1.0311596675673023, + "learning_rate": 1.9580649792823368e-05, + "loss": 0.61, + "step": 1474 + }, + { + "epoch": 0.12, + "grad_norm": 1.0583962918596255, + "learning_rate": 1.957989512315489e-05, + "loss": 0.579, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 1.084334762026287, + "learning_rate": 1.957913978960818e-05, + "loss": 0.6214, + "step": 1476 + }, + { + "epoch": 0.12, + "grad_norm": 1.0887905371592597, + "learning_rate": 1.9578383792235573e-05, + "loss": 0.5538, + "step": 1477 + }, + { + "epoch": 0.12, + "grad_norm": 1.057229505151521, + "learning_rate": 1.957762713108947e-05, + "loss": 0.5807, + "step": 1478 + }, + { + "epoch": 0.12, + "grad_norm": 1.081309748615656, + "learning_rate": 1.95768698062223e-05, + "loss": 0.7256, + "step": 1479 + }, + { + "epoch": 0.12, + "grad_norm": 1.0835482753095111, + "learning_rate": 1.957611181768655e-05, + "loss": 0.6008, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 0.9472949645396779, + "learning_rate": 1.957535316553474e-05, + "loss": 0.6263, + "step": 1481 + }, + { + "epoch": 0.12, + "grad_norm": 1.0463220262272768, + "learning_rate": 1.9574593849819453e-05, + "loss": 0.5526, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 1.0668226136034622, + "learning_rate": 1.9573833870593307e-05, + "loss": 0.6201, + "step": 1483 + }, + { + "epoch": 0.12, + "grad_norm": 1.2624562734003215, + "learning_rate": 1.957307322790896e-05, + "loss": 0.6532, + "step": 1484 + }, + { + "epoch": 0.12, + "grad_norm": 1.0860664742619095, + "learning_rate": 1.9572311921819135e-05, + "loss": 0.5966, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 1.0627301848870327, + "learning_rate": 1.957154995237658e-05, + "loss": 0.656, + "step": 1486 + }, + { + "epoch": 0.12, + "grad_norm": 1.0151482279119024, + "learning_rate": 1.9570787319634107e-05, + "loss": 0.6252, + "step": 1487 + }, + { + "epoch": 0.12, + "grad_norm": 1.0381758566758736, + "learning_rate": 1.957002402364456e-05, + "loss": 0.5607, + "step": 1488 + }, + { + "epoch": 0.12, + "grad_norm": 1.318043710299476, + "learning_rate": 1.9569260064460837e-05, + "loss": 0.7168, + "step": 1489 + }, + { + "epoch": 0.12, + "grad_norm": 1.0484950881712423, + "learning_rate": 1.9568495442135878e-05, + "loss": 0.5741, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 1.257735140002447, + "learning_rate": 1.9567730156722672e-05, + "loss": 0.6263, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 1.1015010819265607, + "learning_rate": 1.9566964208274254e-05, + "loss": 0.5963, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 0.9913609664622401, + "learning_rate": 1.9566197596843702e-05, + "loss": 0.5649, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 1.0272315395873561, + "learning_rate": 1.956543032248414e-05, + "loss": 0.5699, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 1.0427619537229365, + "learning_rate": 1.9564662385248743e-05, + "loss": 0.5767, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 1.0631391429171742, + "learning_rate": 1.9563893785190728e-05, + "loss": 0.6465, + "step": 1496 + }, + { + "epoch": 0.12, + "grad_norm": 0.9870702083748742, + "learning_rate": 1.9563124522363357e-05, + "loss": 0.6024, + "step": 1497 + }, + { + "epoch": 0.12, + "grad_norm": 1.2315661829412479, + "learning_rate": 1.9562354596819938e-05, + "loss": 0.6791, + "step": 1498 + }, + { + "epoch": 0.12, + "grad_norm": 1.1144355699428936, + "learning_rate": 1.9561584008613826e-05, + "loss": 0.6348, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 1.1130939812444343, + "learning_rate": 1.9560812757798423e-05, + "loss": 0.6291, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 1.2017247690800552, + "learning_rate": 1.956004084442718e-05, + "loss": 0.6411, + "step": 1501 + }, + { + "epoch": 0.12, + "grad_norm": 1.126123159735174, + "learning_rate": 1.955926826855358e-05, + "loss": 0.6034, + "step": 1502 + }, + { + "epoch": 0.12, + "grad_norm": 1.003296902988338, + "learning_rate": 1.9558495030231174e-05, + "loss": 0.6248, + "step": 1503 + }, + { + "epoch": 0.12, + "grad_norm": 0.9657714590592112, + "learning_rate": 1.9557721129513538e-05, + "loss": 0.5877, + "step": 1504 + }, + { + "epoch": 0.12, + "grad_norm": 0.9224512012672073, + "learning_rate": 1.9556946566454308e-05, + "loss": 0.5968, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 1.032329308432359, + "learning_rate": 1.9556171341107152e-05, + "loss": 0.5938, + "step": 1506 + }, + { + "epoch": 0.12, + "grad_norm": 1.0343115532513394, + "learning_rate": 1.9555395453525806e-05, + "loss": 0.5759, + "step": 1507 + }, + { + "epoch": 0.12, + "grad_norm": 1.1010501401367896, + "learning_rate": 1.9554618903764026e-05, + "loss": 0.6259, + "step": 1508 + }, + { + "epoch": 0.12, + "grad_norm": 1.0039742973115253, + "learning_rate": 1.9553841691875632e-05, + "loss": 0.655, + "step": 1509 + }, + { + "epoch": 0.12, + "grad_norm": 1.100021199901766, + "learning_rate": 1.9553063817914482e-05, + "loss": 0.4912, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 1.0913235913602368, + "learning_rate": 1.9552285281934484e-05, + "loss": 0.5687, + "step": 1511 + }, + { + "epoch": 0.12, + "grad_norm": 1.1085906438760087, + "learning_rate": 1.9551506083989592e-05, + "loss": 0.6476, + "step": 1512 + }, + { + "epoch": 0.12, + "grad_norm": 1.0239746332752369, + "learning_rate": 1.9550726224133795e-05, + "loss": 0.6609, + "step": 1513 + }, + { + "epoch": 0.12, + "grad_norm": 1.0811194986543498, + "learning_rate": 1.9549945702421144e-05, + "loss": 0.5572, + "step": 1514 + }, + { + "epoch": 0.12, + "grad_norm": 1.1327547585506779, + "learning_rate": 1.9549164518905727e-05, + "loss": 0.6541, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 1.1449440442891867, + "learning_rate": 1.954838267364168e-05, + "loss": 0.6387, + "step": 1516 + }, + { + "epoch": 0.12, + "grad_norm": 1.0080649164608988, + "learning_rate": 1.9547600166683184e-05, + "loss": 0.6361, + "step": 1517 + }, + { + "epoch": 0.12, + "grad_norm": 1.0634607963289726, + "learning_rate": 1.954681699808446e-05, + "loss": 0.58, + "step": 1518 + }, + { + "epoch": 0.12, + "grad_norm": 1.0450288464842377, + "learning_rate": 1.9546033167899788e-05, + "loss": 0.6228, + "step": 1519 + }, + { + "epoch": 0.12, + "grad_norm": 0.9794147398904083, + "learning_rate": 1.9545248676183486e-05, + "loss": 0.646, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 0.9439257970714194, + "learning_rate": 1.9544463522989917e-05, + "loss": 0.6007, + "step": 1521 + }, + { + "epoch": 0.12, + "grad_norm": 0.9593080154837881, + "learning_rate": 1.9543677708373496e-05, + "loss": 0.5715, + "step": 1522 + }, + { + "epoch": 0.12, + "grad_norm": 1.0550240084527236, + "learning_rate": 1.954289123238867e-05, + "loss": 0.698, + "step": 1523 + }, + { + "epoch": 0.12, + "grad_norm": 1.010980510854376, + "learning_rate": 1.9542104095089946e-05, + "loss": 0.5256, + "step": 1524 + }, + { + "epoch": 0.12, + "grad_norm": 1.0780155128709732, + "learning_rate": 1.9541316296531875e-05, + "loss": 0.6156, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 1.0783401810137738, + "learning_rate": 1.9540527836769047e-05, + "loss": 0.6011, + "step": 1526 + }, + { + "epoch": 0.12, + "grad_norm": 1.0220845234322393, + "learning_rate": 1.95397387158561e-05, + "loss": 0.6436, + "step": 1527 + }, + { + "epoch": 0.12, + "grad_norm": 1.006269151060652, + "learning_rate": 1.9538948933847727e-05, + "loss": 0.5765, + "step": 1528 + }, + { + "epoch": 0.12, + "grad_norm": 1.0502520128791262, + "learning_rate": 1.953815849079865e-05, + "loss": 0.5935, + "step": 1529 + }, + { + "epoch": 0.12, + "grad_norm": 1.0099151207552917, + "learning_rate": 1.953736738676365e-05, + "loss": 0.5783, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 1.0115633381309157, + "learning_rate": 1.9536575621797546e-05, + "loss": 0.7009, + "step": 1531 + }, + { + "epoch": 0.12, + "grad_norm": 1.0314261395753932, + "learning_rate": 1.9535783195955215e-05, + "loss": 0.625, + "step": 1532 + }, + { + "epoch": 0.12, + "grad_norm": 1.0305413122838198, + "learning_rate": 1.9534990109291568e-05, + "loss": 0.6119, + "step": 1533 + }, + { + "epoch": 0.12, + "grad_norm": 1.0589889413813205, + "learning_rate": 1.953419636186156e-05, + "loss": 0.6151, + "step": 1534 + }, + { + "epoch": 0.12, + "grad_norm": 1.0537572089897898, + "learning_rate": 1.9533401953720204e-05, + "loss": 0.732, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 1.1561489357870232, + "learning_rate": 1.9532606884922547e-05, + "loss": 0.6193, + "step": 1536 + }, + { + "epoch": 0.12, + "grad_norm": 1.036129485903809, + "learning_rate": 1.953181115552369e-05, + "loss": 0.6357, + "step": 1537 + }, + { + "epoch": 0.12, + "grad_norm": 0.9621713367722898, + "learning_rate": 1.9531014765578774e-05, + "loss": 0.5871, + "step": 1538 + }, + { + "epoch": 0.13, + "grad_norm": 1.094823343969492, + "learning_rate": 1.9530217715142987e-05, + "loss": 0.6485, + "step": 1539 + }, + { + "epoch": 0.13, + "grad_norm": 0.9785272655837435, + "learning_rate": 1.9529420004271568e-05, + "loss": 0.5039, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 1.083586772678874, + "learning_rate": 1.9528621633019792e-05, + "loss": 0.6447, + "step": 1541 + }, + { + "epoch": 0.13, + "grad_norm": 1.0002925976491284, + "learning_rate": 1.952782260144299e-05, + "loss": 0.536, + "step": 1542 + }, + { + "epoch": 0.13, + "grad_norm": 0.9262307923258066, + "learning_rate": 1.9527022909596537e-05, + "loss": 0.54, + "step": 1543 + }, + { + "epoch": 0.13, + "grad_norm": 0.9800418846378972, + "learning_rate": 1.9526222557535842e-05, + "loss": 0.6442, + "step": 1544 + }, + { + "epoch": 0.13, + "grad_norm": 0.9628400126389941, + "learning_rate": 1.9525421545316378e-05, + "loss": 0.5566, + "step": 1545 + }, + { + "epoch": 0.13, + "grad_norm": 0.9428595296385236, + "learning_rate": 1.9524619872993648e-05, + "loss": 0.5485, + "step": 1546 + }, + { + "epoch": 0.13, + "grad_norm": 1.1165341782891196, + "learning_rate": 1.9523817540623208e-05, + "loss": 0.5906, + "step": 1547 + }, + { + "epoch": 0.13, + "grad_norm": 1.0112460723323362, + "learning_rate": 1.9523014548260657e-05, + "loss": 0.635, + "step": 1548 + }, + { + "epoch": 0.13, + "grad_norm": 1.0185326086531783, + "learning_rate": 1.9522210895961648e-05, + "loss": 0.5875, + "step": 1549 + }, + { + "epoch": 0.13, + "grad_norm": 1.00351955353462, + "learning_rate": 1.9521406583781872e-05, + "loss": 0.6074, + "step": 1550 + }, + { + "epoch": 0.13, + "grad_norm": 1.0794945145826804, + "learning_rate": 1.9520601611777065e-05, + "loss": 0.6069, + "step": 1551 + }, + { + "epoch": 0.13, + "grad_norm": 0.9679784995016987, + "learning_rate": 1.9519795980003007e-05, + "loss": 0.614, + "step": 1552 + }, + { + "epoch": 0.13, + "grad_norm": 1.1043952426403454, + "learning_rate": 1.9518989688515533e-05, + "loss": 0.5914, + "step": 1553 + }, + { + "epoch": 0.13, + "grad_norm": 1.1527097920161586, + "learning_rate": 1.9518182737370515e-05, + "loss": 0.7452, + "step": 1554 + }, + { + "epoch": 0.13, + "grad_norm": 1.1059153106910997, + "learning_rate": 1.9517375126623882e-05, + "loss": 0.6206, + "step": 1555 + }, + { + "epoch": 0.13, + "grad_norm": 0.9977261438193731, + "learning_rate": 1.9516566856331593e-05, + "loss": 0.6321, + "step": 1556 + }, + { + "epoch": 0.13, + "grad_norm": 1.0201871612144144, + "learning_rate": 1.951575792654966e-05, + "loss": 0.5564, + "step": 1557 + }, + { + "epoch": 0.13, + "grad_norm": 0.9601699304044047, + "learning_rate": 1.9514948337334144e-05, + "loss": 0.4204, + "step": 1558 + }, + { + "epoch": 0.13, + "grad_norm": 0.9655527593749033, + "learning_rate": 1.9514138088741146e-05, + "loss": 0.5994, + "step": 1559 + }, + { + "epoch": 0.13, + "grad_norm": 1.021938738185002, + "learning_rate": 1.951332718082682e-05, + "loss": 0.6092, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 1.1164618252129554, + "learning_rate": 1.9512515613647358e-05, + "loss": 0.6469, + "step": 1561 + }, + { + "epoch": 0.13, + "grad_norm": 0.9661864554484725, + "learning_rate": 1.9511703387259e-05, + "loss": 0.543, + "step": 1562 + }, + { + "epoch": 0.13, + "grad_norm": 0.9647944075014523, + "learning_rate": 1.9510890501718037e-05, + "loss": 0.5907, + "step": 1563 + }, + { + "epoch": 0.13, + "grad_norm": 1.0424734503862871, + "learning_rate": 1.95100769570808e-05, + "loss": 0.6481, + "step": 1564 + }, + { + "epoch": 0.13, + "grad_norm": 1.0588923703277024, + "learning_rate": 1.9509262753403656e-05, + "loss": 0.65, + "step": 1565 + }, + { + "epoch": 0.13, + "grad_norm": 0.8810729326303413, + "learning_rate": 1.950844789074305e-05, + "loss": 0.5341, + "step": 1566 + }, + { + "epoch": 0.13, + "grad_norm": 1.0265294325695518, + "learning_rate": 1.950763236915543e-05, + "loss": 0.6191, + "step": 1567 + }, + { + "epoch": 0.13, + "grad_norm": 1.059898351367038, + "learning_rate": 1.9506816188697322e-05, + "loss": 0.6289, + "step": 1568 + }, + { + "epoch": 0.13, + "grad_norm": 1.0009322743834865, + "learning_rate": 1.950599934942529e-05, + "loss": 0.5897, + "step": 1569 + }, + { + "epoch": 0.13, + "grad_norm": 1.061526486833266, + "learning_rate": 1.9505181851395928e-05, + "loss": 0.6192, + "step": 1570 + }, + { + "epoch": 0.13, + "grad_norm": 1.081852926219257, + "learning_rate": 1.9504363694665897e-05, + "loss": 0.6464, + "step": 1571 + }, + { + "epoch": 0.13, + "grad_norm": 1.1260267561080988, + "learning_rate": 1.9503544879291893e-05, + "loss": 0.6535, + "step": 1572 + }, + { + "epoch": 0.13, + "grad_norm": 1.1258767221569566, + "learning_rate": 1.950272540533066e-05, + "loss": 0.6928, + "step": 1573 + }, + { + "epoch": 0.13, + "grad_norm": 1.0346108077510237, + "learning_rate": 1.9501905272838983e-05, + "loss": 0.5869, + "step": 1574 + }, + { + "epoch": 0.13, + "grad_norm": 1.149899484431447, + "learning_rate": 1.95010844818737e-05, + "loss": 0.664, + "step": 1575 + }, + { + "epoch": 0.13, + "grad_norm": 1.1020327223722928, + "learning_rate": 1.9500263032491688e-05, + "loss": 0.6623, + "step": 1576 + }, + { + "epoch": 0.13, + "grad_norm": 1.0297737960298794, + "learning_rate": 1.9499440924749878e-05, + "loss": 0.6437, + "step": 1577 + }, + { + "epoch": 0.13, + "grad_norm": 0.9343334803708131, + "learning_rate": 1.9498618158705235e-05, + "loss": 0.6093, + "step": 1578 + }, + { + "epoch": 0.13, + "grad_norm": 1.0287446925236747, + "learning_rate": 1.9497794734414782e-05, + "loss": 0.5599, + "step": 1579 + }, + { + "epoch": 0.13, + "grad_norm": 1.0489118928687258, + "learning_rate": 1.9496970651935575e-05, + "loss": 0.6262, + "step": 1580 + }, + { + "epoch": 0.13, + "grad_norm": 1.1295782786398594, + "learning_rate": 1.9496145911324724e-05, + "loss": 0.6088, + "step": 1581 + }, + { + "epoch": 0.13, + "grad_norm": 1.042843824083153, + "learning_rate": 1.949532051263939e-05, + "loss": 0.5938, + "step": 1582 + }, + { + "epoch": 0.13, + "grad_norm": 1.1030489445427343, + "learning_rate": 1.9494494455936763e-05, + "loss": 0.6502, + "step": 1583 + }, + { + "epoch": 0.13, + "grad_norm": 1.0155020298359885, + "learning_rate": 1.9493667741274093e-05, + "loss": 0.5839, + "step": 1584 + }, + { + "epoch": 0.13, + "grad_norm": 1.065572552200458, + "learning_rate": 1.9492840368708668e-05, + "loss": 0.5917, + "step": 1585 + }, + { + "epoch": 0.13, + "grad_norm": 1.311197032426637, + "learning_rate": 1.949201233829783e-05, + "loss": 0.5324, + "step": 1586 + }, + { + "epoch": 0.13, + "grad_norm": 3.3655083579169545, + "learning_rate": 1.9491183650098953e-05, + "loss": 0.5474, + "step": 1587 + }, + { + "epoch": 0.13, + "grad_norm": 1.179113938400328, + "learning_rate": 1.9490354304169467e-05, + "loss": 0.6661, + "step": 1588 + }, + { + "epoch": 0.13, + "grad_norm": 1.0957183409700058, + "learning_rate": 1.9489524300566845e-05, + "loss": 0.6467, + "step": 1589 + }, + { + "epoch": 0.13, + "grad_norm": 1.051786424467458, + "learning_rate": 1.948869363934861e-05, + "loss": 0.6013, + "step": 1590 + }, + { + "epoch": 0.13, + "grad_norm": 1.1924075929286637, + "learning_rate": 1.948786232057232e-05, + "loss": 0.6942, + "step": 1591 + }, + { + "epoch": 0.13, + "grad_norm": 1.0584652636189806, + "learning_rate": 1.9487030344295586e-05, + "loss": 0.6294, + "step": 1592 + }, + { + "epoch": 0.13, + "grad_norm": 0.9447168266155662, + "learning_rate": 1.9486197710576063e-05, + "loss": 0.5499, + "step": 1593 + }, + { + "epoch": 0.13, + "grad_norm": 0.972937373205438, + "learning_rate": 1.9485364419471454e-05, + "loss": 0.664, + "step": 1594 + }, + { + "epoch": 0.13, + "grad_norm": 0.9792901817040632, + "learning_rate": 1.948453047103951e-05, + "loss": 0.5629, + "step": 1595 + }, + { + "epoch": 0.13, + "grad_norm": 1.0196538962335693, + "learning_rate": 1.948369586533801e-05, + "loss": 0.61, + "step": 1596 + }, + { + "epoch": 0.13, + "grad_norm": 1.0479317330475613, + "learning_rate": 1.94828606024248e-05, + "loss": 0.6615, + "step": 1597 + }, + { + "epoch": 0.13, + "grad_norm": 1.0404865485674981, + "learning_rate": 1.948202468235776e-05, + "loss": 0.6868, + "step": 1598 + }, + { + "epoch": 0.13, + "grad_norm": 0.9845855698699403, + "learning_rate": 1.9481188105194827e-05, + "loss": 0.6699, + "step": 1599 + }, + { + "epoch": 0.13, + "grad_norm": 1.0453875361089522, + "learning_rate": 1.948035087099396e-05, + "loss": 0.6004, + "step": 1600 + }, + { + "epoch": 0.13, + "grad_norm": 0.9423906166078881, + "learning_rate": 1.9479512979813193e-05, + "loss": 0.5634, + "step": 1601 + }, + { + "epoch": 0.13, + "grad_norm": 1.0283084189763398, + "learning_rate": 1.947867443171058e-05, + "loss": 0.5786, + "step": 1602 + }, + { + "epoch": 0.13, + "grad_norm": 1.141933735094667, + "learning_rate": 1.9477835226744243e-05, + "loss": 0.5785, + "step": 1603 + }, + { + "epoch": 0.13, + "grad_norm": 1.1207636909607657, + "learning_rate": 1.9476995364972327e-05, + "loss": 0.6612, + "step": 1604 + }, + { + "epoch": 0.13, + "grad_norm": 1.1185052250128316, + "learning_rate": 1.9476154846453037e-05, + "loss": 0.5691, + "step": 1605 + }, + { + "epoch": 0.13, + "grad_norm": 1.0395024380287932, + "learning_rate": 1.9475313671244624e-05, + "loss": 0.6653, + "step": 1606 + }, + { + "epoch": 0.13, + "grad_norm": 1.1227322427576973, + "learning_rate": 1.9474471839405377e-05, + "loss": 0.6205, + "step": 1607 + }, + { + "epoch": 0.13, + "grad_norm": 1.0231221460248499, + "learning_rate": 1.9473629350993633e-05, + "loss": 0.5498, + "step": 1608 + }, + { + "epoch": 0.13, + "grad_norm": 0.9523212008119725, + "learning_rate": 1.947278620606778e-05, + "loss": 0.5849, + "step": 1609 + }, + { + "epoch": 0.13, + "grad_norm": 1.094776684086471, + "learning_rate": 1.9471942404686247e-05, + "loss": 0.6354, + "step": 1610 + }, + { + "epoch": 0.13, + "grad_norm": 1.0211375159615892, + "learning_rate": 1.9471097946907506e-05, + "loss": 0.6031, + "step": 1611 + }, + { + "epoch": 0.13, + "grad_norm": 1.2412258044393085, + "learning_rate": 1.947025283279008e-05, + "loss": 0.6913, + "step": 1612 + }, + { + "epoch": 0.13, + "grad_norm": 0.9912415664046444, + "learning_rate": 1.9469407062392528e-05, + "loss": 0.5713, + "step": 1613 + }, + { + "epoch": 0.13, + "grad_norm": 0.9588341015952317, + "learning_rate": 1.946856063577347e-05, + "loss": 0.5535, + "step": 1614 + }, + { + "epoch": 0.13, + "grad_norm": 1.0301367696445638, + "learning_rate": 1.9467713552991557e-05, + "loss": 0.6127, + "step": 1615 + }, + { + "epoch": 0.13, + "grad_norm": 0.9800192103314544, + "learning_rate": 1.9466865814105493e-05, + "loss": 0.6247, + "step": 1616 + }, + { + "epoch": 0.13, + "grad_norm": 0.9889924807240417, + "learning_rate": 1.9466017419174027e-05, + "loss": 0.5599, + "step": 1617 + }, + { + "epoch": 0.13, + "grad_norm": 1.0075065416062952, + "learning_rate": 1.9465168368255946e-05, + "loss": 0.6166, + "step": 1618 + }, + { + "epoch": 0.13, + "grad_norm": 1.0671090763975322, + "learning_rate": 1.9464318661410097e-05, + "loss": 0.6176, + "step": 1619 + }, + { + "epoch": 0.13, + "grad_norm": 1.053639753510594, + "learning_rate": 1.9463468298695357e-05, + "loss": 0.6008, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 1.043584000165112, + "learning_rate": 1.9462617280170657e-05, + "loss": 0.5603, + "step": 1621 + }, + { + "epoch": 0.13, + "grad_norm": 0.9855988604094579, + "learning_rate": 1.9461765605894974e-05, + "loss": 0.6136, + "step": 1622 + }, + { + "epoch": 0.13, + "grad_norm": 0.9785494268307853, + "learning_rate": 1.9460913275927326e-05, + "loss": 0.5665, + "step": 1623 + }, + { + "epoch": 0.13, + "grad_norm": 0.9185300086052195, + "learning_rate": 1.9460060290326784e-05, + "loss": 0.526, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 0.9419639343149278, + "learning_rate": 1.9459206649152452e-05, + "loss": 0.5603, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 1.0604969546819052, + "learning_rate": 1.945835235246349e-05, + "loss": 0.6027, + "step": 1626 + }, + { + "epoch": 0.13, + "grad_norm": 0.9976023956722005, + "learning_rate": 1.9457497400319097e-05, + "loss": 0.6019, + "step": 1627 + }, + { + "epoch": 0.13, + "grad_norm": 1.0103924418233958, + "learning_rate": 1.9456641792778527e-05, + "loss": 0.565, + "step": 1628 + }, + { + "epoch": 0.13, + "grad_norm": 1.0289771525148863, + "learning_rate": 1.9455785529901064e-05, + "loss": 0.6336, + "step": 1629 + }, + { + "epoch": 0.13, + "grad_norm": 0.958196944777394, + "learning_rate": 1.945492861174606e-05, + "loss": 0.4424, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 1.036359306845461, + "learning_rate": 1.945407103837288e-05, + "loss": 0.6501, + "step": 1631 + }, + { + "epoch": 0.13, + "grad_norm": 1.0229971933809134, + "learning_rate": 1.9453212809840965e-05, + "loss": 0.6873, + "step": 1632 + }, + { + "epoch": 0.13, + "grad_norm": 0.997114764638409, + "learning_rate": 1.945235392620979e-05, + "loss": 0.4905, + "step": 1633 + }, + { + "epoch": 0.13, + "grad_norm": 0.9633455772418739, + "learning_rate": 1.9451494387538873e-05, + "loss": 0.5667, + "step": 1634 + }, + { + "epoch": 0.13, + "grad_norm": 1.0761532151031492, + "learning_rate": 1.9450634193887776e-05, + "loss": 0.6529, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 1.1922896878104507, + "learning_rate": 1.9449773345316113e-05, + "loss": 0.6218, + "step": 1636 + }, + { + "epoch": 0.13, + "grad_norm": 1.000698930279899, + "learning_rate": 1.944891184188354e-05, + "loss": 0.5953, + "step": 1637 + }, + { + "epoch": 0.13, + "grad_norm": 1.0527551660365246, + "learning_rate": 1.9448049683649753e-05, + "loss": 0.6841, + "step": 1638 + }, + { + "epoch": 0.13, + "grad_norm": 1.1807838377158926, + "learning_rate": 1.9447186870674505e-05, + "loss": 0.6784, + "step": 1639 + }, + { + "epoch": 0.13, + "grad_norm": 1.2843482015093421, + "learning_rate": 1.944632340301759e-05, + "loss": 0.5106, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 0.9922564540621734, + "learning_rate": 1.9445459280738838e-05, + "loss": 0.5037, + "step": 1641 + }, + { + "epoch": 0.13, + "grad_norm": 0.952673106460145, + "learning_rate": 1.944459450389814e-05, + "loss": 0.5759, + "step": 1642 + }, + { + "epoch": 0.13, + "grad_norm": 1.0789735287269921, + "learning_rate": 1.9443729072555417e-05, + "loss": 0.5833, + "step": 1643 + }, + { + "epoch": 0.13, + "grad_norm": 0.9929128562914025, + "learning_rate": 1.9442862986770645e-05, + "loss": 0.6195, + "step": 1644 + }, + { + "epoch": 0.13, + "grad_norm": 0.9939666202490641, + "learning_rate": 1.9441996246603848e-05, + "loss": 0.6348, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 0.9146036701079484, + "learning_rate": 1.9441128852115083e-05, + "loss": 0.611, + "step": 1646 + }, + { + "epoch": 0.13, + "grad_norm": 0.9358084165527047, + "learning_rate": 1.9440260803364463e-05, + "loss": 0.603, + "step": 1647 + }, + { + "epoch": 0.13, + "grad_norm": 0.9996578346711003, + "learning_rate": 1.9439392100412145e-05, + "loss": 0.5301, + "step": 1648 + }, + { + "epoch": 0.13, + "grad_norm": 1.1106725835561553, + "learning_rate": 1.9438522743318327e-05, + "loss": 0.581, + "step": 1649 + }, + { + "epoch": 0.13, + "grad_norm": 1.0223341153440686, + "learning_rate": 1.9437652732143252e-05, + "loss": 0.594, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 1.061253275018964, + "learning_rate": 1.9436782066947215e-05, + "loss": 0.5873, + "step": 1651 + }, + { + "epoch": 0.13, + "grad_norm": 1.0961476565306238, + "learning_rate": 1.943591074779055e-05, + "loss": 0.6504, + "step": 1652 + }, + { + "epoch": 0.13, + "grad_norm": 1.1131513809579108, + "learning_rate": 1.9435038774733644e-05, + "loss": 0.6113, + "step": 1653 + }, + { + "epoch": 0.13, + "grad_norm": 0.9959591458602137, + "learning_rate": 1.9434166147836917e-05, + "loss": 0.6193, + "step": 1654 + }, + { + "epoch": 0.13, + "grad_norm": 1.0898356458340683, + "learning_rate": 1.9433292867160843e-05, + "loss": 0.6632, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 1.012741168027966, + "learning_rate": 1.9432418932765942e-05, + "loss": 0.6281, + "step": 1656 + }, + { + "epoch": 0.13, + "grad_norm": 0.9008564402084792, + "learning_rate": 1.9431544344712776e-05, + "loss": 0.5148, + "step": 1657 + }, + { + "epoch": 0.13, + "grad_norm": 0.9579588034766762, + "learning_rate": 1.9430669103061953e-05, + "loss": 0.5577, + "step": 1658 + }, + { + "epoch": 0.13, + "grad_norm": 0.9758486715067173, + "learning_rate": 1.9429793207874126e-05, + "loss": 0.5811, + "step": 1659 + }, + { + "epoch": 0.13, + "grad_norm": 0.9899522905676923, + "learning_rate": 1.9428916659209995e-05, + "loss": 0.6214, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 0.9120782629931777, + "learning_rate": 1.94280394571303e-05, + "loss": 0.5678, + "step": 1661 + }, + { + "epoch": 0.14, + "grad_norm": 0.9752034280998241, + "learning_rate": 1.9427161601695833e-05, + "loss": 0.6131, + "step": 1662 + }, + { + "epoch": 0.14, + "grad_norm": 1.0109589990611225, + "learning_rate": 1.942628309296743e-05, + "loss": 0.5384, + "step": 1663 + }, + { + "epoch": 0.14, + "grad_norm": 1.1014070080522274, + "learning_rate": 1.9425403931005968e-05, + "loss": 0.677, + "step": 1664 + }, + { + "epoch": 0.14, + "grad_norm": 1.009005362126033, + "learning_rate": 1.9424524115872375e-05, + "loss": 0.6068, + "step": 1665 + }, + { + "epoch": 0.14, + "grad_norm": 1.008399993710228, + "learning_rate": 1.9423643647627625e-05, + "loss": 0.6021, + "step": 1666 + }, + { + "epoch": 0.14, + "grad_norm": 0.9525007427813187, + "learning_rate": 1.9422762526332723e-05, + "loss": 0.5691, + "step": 1667 + }, + { + "epoch": 0.14, + "grad_norm": 1.0926859628696797, + "learning_rate": 1.942188075204874e-05, + "loss": 0.6686, + "step": 1668 + }, + { + "epoch": 0.14, + "grad_norm": 1.0722394254713838, + "learning_rate": 1.9420998324836777e-05, + "loss": 0.5931, + "step": 1669 + }, + { + "epoch": 0.14, + "grad_norm": 0.9650163359668696, + "learning_rate": 1.9420115244757985e-05, + "loss": 0.6437, + "step": 1670 + }, + { + "epoch": 0.14, + "grad_norm": 1.017397111933101, + "learning_rate": 1.941923151187356e-05, + "loss": 0.5942, + "step": 1671 + }, + { + "epoch": 0.14, + "grad_norm": 1.0942653548901369, + "learning_rate": 1.9418347126244754e-05, + "loss": 0.6195, + "step": 1672 + }, + { + "epoch": 0.14, + "grad_norm": 1.0256751446156467, + "learning_rate": 1.941746208793284e-05, + "loss": 0.5689, + "step": 1673 + }, + { + "epoch": 0.14, + "grad_norm": 1.0420200004454279, + "learning_rate": 1.9416576396999156e-05, + "loss": 0.5454, + "step": 1674 + }, + { + "epoch": 0.14, + "grad_norm": 1.1282627502666678, + "learning_rate": 1.941569005350508e-05, + "loss": 0.6112, + "step": 1675 + }, + { + "epoch": 0.14, + "grad_norm": 1.1223142256728877, + "learning_rate": 1.941480305751204e-05, + "loss": 0.5911, + "step": 1676 + }, + { + "epoch": 0.14, + "grad_norm": 0.9574998557547988, + "learning_rate": 1.9413915409081496e-05, + "loss": 0.5814, + "step": 1677 + }, + { + "epoch": 0.14, + "grad_norm": 0.9789856639171853, + "learning_rate": 1.9413027108274964e-05, + "loss": 0.6239, + "step": 1678 + }, + { + "epoch": 0.14, + "grad_norm": 0.9079658859319869, + "learning_rate": 1.9412138155154e-05, + "loss": 0.5424, + "step": 1679 + }, + { + "epoch": 0.14, + "grad_norm": 0.997675552438307, + "learning_rate": 1.941124854978022e-05, + "loss": 0.5783, + "step": 1680 + }, + { + "epoch": 0.14, + "grad_norm": 1.0240956770351866, + "learning_rate": 1.9410358292215252e-05, + "loss": 0.68, + "step": 1681 + }, + { + "epoch": 0.14, + "grad_norm": 1.0531747697737566, + "learning_rate": 1.9409467382520805e-05, + "loss": 0.5563, + "step": 1682 + }, + { + "epoch": 0.14, + "grad_norm": 1.0106078332022375, + "learning_rate": 1.9408575820758616e-05, + "loss": 0.6239, + "step": 1683 + }, + { + "epoch": 0.14, + "grad_norm": 1.1336686212019484, + "learning_rate": 1.940768360699047e-05, + "loss": 0.5812, + "step": 1684 + }, + { + "epoch": 0.14, + "grad_norm": 0.9427888254007594, + "learning_rate": 1.9406790741278188e-05, + "loss": 0.6002, + "step": 1685 + }, + { + "epoch": 0.14, + "grad_norm": 1.0708900224334923, + "learning_rate": 1.940589722368366e-05, + "loss": 0.6759, + "step": 1686 + }, + { + "epoch": 0.14, + "grad_norm": 1.0544731535821026, + "learning_rate": 1.940500305426879e-05, + "loss": 0.5946, + "step": 1687 + }, + { + "epoch": 0.14, + "grad_norm": 1.000572268213837, + "learning_rate": 1.9404108233095557e-05, + "loss": 0.5565, + "step": 1688 + }, + { + "epoch": 0.14, + "grad_norm": 0.9469861618458745, + "learning_rate": 1.940321276022596e-05, + "loss": 0.6116, + "step": 1689 + }, + { + "epoch": 0.14, + "grad_norm": 0.984815886481496, + "learning_rate": 1.9402316635722062e-05, + "loss": 0.6322, + "step": 1690 + }, + { + "epoch": 0.14, + "grad_norm": 1.0360745957580155, + "learning_rate": 1.9401419859645958e-05, + "loss": 0.5955, + "step": 1691 + }, + { + "epoch": 0.14, + "grad_norm": 1.0390331354965414, + "learning_rate": 1.9400522432059802e-05, + "loss": 0.4928, + "step": 1692 + }, + { + "epoch": 0.14, + "grad_norm": 1.095133833358097, + "learning_rate": 1.9399624353025774e-05, + "loss": 0.6053, + "step": 1693 + }, + { + "epoch": 0.14, + "grad_norm": 1.0481745310455743, + "learning_rate": 1.939872562260612e-05, + "loss": 0.5901, + "step": 1694 + }, + { + "epoch": 0.14, + "grad_norm": 0.9635325259605018, + "learning_rate": 1.9397826240863113e-05, + "loss": 0.5477, + "step": 1695 + }, + { + "epoch": 0.14, + "grad_norm": 1.056255123418143, + "learning_rate": 1.9396926207859085e-05, + "loss": 0.5865, + "step": 1696 + }, + { + "epoch": 0.14, + "grad_norm": 0.8758001421262852, + "learning_rate": 1.939602552365641e-05, + "loss": 0.5967, + "step": 1697 + }, + { + "epoch": 0.14, + "grad_norm": 1.0623347770055493, + "learning_rate": 1.9395124188317493e-05, + "loss": 0.5864, + "step": 1698 + }, + { + "epoch": 0.14, + "grad_norm": 1.046069421727186, + "learning_rate": 1.9394222201904806e-05, + "loss": 0.6322, + "step": 1699 + }, + { + "epoch": 0.14, + "grad_norm": 1.0635388569825572, + "learning_rate": 1.9393319564480854e-05, + "loss": 0.5859, + "step": 1700 + }, + { + "epoch": 0.14, + "grad_norm": 1.0396994085777351, + "learning_rate": 1.9392416276108192e-05, + "loss": 0.6369, + "step": 1701 + }, + { + "epoch": 0.14, + "grad_norm": 1.069197810429597, + "learning_rate": 1.9391512336849406e-05, + "loss": 0.6187, + "step": 1702 + }, + { + "epoch": 0.14, + "grad_norm": 0.9948691983607679, + "learning_rate": 1.939060774676715e-05, + "loss": 0.6283, + "step": 1703 + }, + { + "epoch": 0.14, + "grad_norm": 0.9369154927339403, + "learning_rate": 1.9389702505924106e-05, + "loss": 0.5582, + "step": 1704 + }, + { + "epoch": 0.14, + "grad_norm": 0.9110705801015708, + "learning_rate": 1.9388796614383008e-05, + "loss": 0.566, + "step": 1705 + }, + { + "epoch": 0.14, + "grad_norm": 0.9312148329941633, + "learning_rate": 1.938789007220663e-05, + "loss": 0.5327, + "step": 1706 + }, + { + "epoch": 0.14, + "grad_norm": 0.9391005657013416, + "learning_rate": 1.9386982879457795e-05, + "loss": 0.5459, + "step": 1707 + }, + { + "epoch": 0.14, + "grad_norm": 1.0106676478778127, + "learning_rate": 1.9386075036199378e-05, + "loss": 0.5732, + "step": 1708 + }, + { + "epoch": 0.14, + "grad_norm": 1.1020962008149404, + "learning_rate": 1.938516654249428e-05, + "loss": 0.6255, + "step": 1709 + }, + { + "epoch": 0.14, + "grad_norm": 1.161045643502798, + "learning_rate": 1.9384257398405473e-05, + "loss": 0.6387, + "step": 1710 + }, + { + "epoch": 0.14, + "grad_norm": 1.0899649970282215, + "learning_rate": 1.938334760399595e-05, + "loss": 0.6095, + "step": 1711 + }, + { + "epoch": 0.14, + "grad_norm": 0.9554662845585481, + "learning_rate": 1.9382437159328758e-05, + "loss": 0.5472, + "step": 1712 + }, + { + "epoch": 0.14, + "grad_norm": 1.0773572457764136, + "learning_rate": 1.9381526064466995e-05, + "loss": 0.4818, + "step": 1713 + }, + { + "epoch": 0.14, + "grad_norm": 0.9980357980012017, + "learning_rate": 1.9380614319473798e-05, + "loss": 0.5929, + "step": 1714 + }, + { + "epoch": 0.14, + "grad_norm": 1.0965656273453313, + "learning_rate": 1.9379701924412344e-05, + "loss": 0.6584, + "step": 1715 + }, + { + "epoch": 0.14, + "grad_norm": 0.9565851250803735, + "learning_rate": 1.937878887934587e-05, + "loss": 0.6129, + "step": 1716 + }, + { + "epoch": 0.14, + "grad_norm": 0.9284045836171315, + "learning_rate": 1.9377875184337647e-05, + "loss": 0.5907, + "step": 1717 + }, + { + "epoch": 0.14, + "grad_norm": 1.1134948767026893, + "learning_rate": 1.9376960839450988e-05, + "loss": 0.5701, + "step": 1718 + }, + { + "epoch": 0.14, + "grad_norm": 0.9792430599995569, + "learning_rate": 1.9376045844749267e-05, + "loss": 0.6148, + "step": 1719 + }, + { + "epoch": 0.14, + "grad_norm": 0.9237201545998925, + "learning_rate": 1.937513020029588e-05, + "loss": 0.5524, + "step": 1720 + }, + { + "epoch": 0.14, + "grad_norm": 0.9695000739742969, + "learning_rate": 1.9374213906154286e-05, + "loss": 0.6064, + "step": 1721 + }, + { + "epoch": 0.14, + "grad_norm": 1.0509066073915099, + "learning_rate": 1.9373296962387988e-05, + "loss": 0.5842, + "step": 1722 + }, + { + "epoch": 0.14, + "grad_norm": 1.0137941184215924, + "learning_rate": 1.937237936906052e-05, + "loss": 0.5337, + "step": 1723 + }, + { + "epoch": 0.14, + "grad_norm": 1.0478769449551844, + "learning_rate": 1.9371461126235474e-05, + "loss": 0.6209, + "step": 1724 + }, + { + "epoch": 0.14, + "grad_norm": 1.0231670900137617, + "learning_rate": 1.937054223397649e-05, + "loss": 0.6478, + "step": 1725 + }, + { + "epoch": 0.14, + "grad_norm": 1.0087171579611938, + "learning_rate": 1.9369622692347233e-05, + "loss": 0.5918, + "step": 1726 + }, + { + "epoch": 0.14, + "grad_norm": 1.0541550223771612, + "learning_rate": 1.936870250141144e-05, + "loss": 0.7013, + "step": 1727 + }, + { + "epoch": 0.14, + "grad_norm": 1.0331248553066827, + "learning_rate": 1.936778166123287e-05, + "loss": 0.5751, + "step": 1728 + }, + { + "epoch": 0.14, + "grad_norm": 1.0075349881213924, + "learning_rate": 1.9366860171875345e-05, + "loss": 0.5509, + "step": 1729 + }, + { + "epoch": 0.14, + "grad_norm": 0.975520975539256, + "learning_rate": 1.9365938033402715e-05, + "loss": 0.5705, + "step": 1730 + }, + { + "epoch": 0.14, + "grad_norm": 1.0499924966908039, + "learning_rate": 1.9365015245878892e-05, + "loss": 0.5684, + "step": 1731 + }, + { + "epoch": 0.14, + "grad_norm": 0.9711685551889372, + "learning_rate": 1.936409180936781e-05, + "loss": 0.6005, + "step": 1732 + }, + { + "epoch": 0.14, + "grad_norm": 1.052822654931391, + "learning_rate": 1.9363167723933477e-05, + "loss": 0.6444, + "step": 1733 + }, + { + "epoch": 0.14, + "grad_norm": 0.9489608663159473, + "learning_rate": 1.9362242989639926e-05, + "loss": 0.518, + "step": 1734 + }, + { + "epoch": 0.14, + "grad_norm": 1.0523021470731204, + "learning_rate": 1.936131760655124e-05, + "loss": 0.506, + "step": 1735 + }, + { + "epoch": 0.14, + "grad_norm": 0.9665533177577489, + "learning_rate": 1.9360391574731547e-05, + "loss": 0.5292, + "step": 1736 + }, + { + "epoch": 0.14, + "grad_norm": 1.005596907178615, + "learning_rate": 1.935946489424502e-05, + "loss": 0.6312, + "step": 1737 + }, + { + "epoch": 0.14, + "grad_norm": 1.0262942511052289, + "learning_rate": 1.935853756515588e-05, + "loss": 0.6343, + "step": 1738 + }, + { + "epoch": 0.14, + "grad_norm": 1.0438665597916685, + "learning_rate": 1.9357609587528385e-05, + "loss": 0.6709, + "step": 1739 + }, + { + "epoch": 0.14, + "grad_norm": 1.0675839441468322, + "learning_rate": 1.9356680961426847e-05, + "loss": 0.6619, + "step": 1740 + }, + { + "epoch": 0.14, + "grad_norm": 1.0499414830717126, + "learning_rate": 1.9355751686915617e-05, + "loss": 0.6456, + "step": 1741 + }, + { + "epoch": 0.14, + "grad_norm": 0.9769748381304898, + "learning_rate": 1.9354821764059094e-05, + "loss": 0.6537, + "step": 1742 + }, + { + "epoch": 0.14, + "grad_norm": 0.9619271758171696, + "learning_rate": 1.935389119292172e-05, + "loss": 0.6137, + "step": 1743 + }, + { + "epoch": 0.14, + "grad_norm": 1.0273752317519276, + "learning_rate": 1.9352959973567984e-05, + "loss": 0.6247, + "step": 1744 + }, + { + "epoch": 0.14, + "grad_norm": 1.0540513963093687, + "learning_rate": 1.9352028106062417e-05, + "loss": 0.6241, + "step": 1745 + }, + { + "epoch": 0.14, + "grad_norm": 1.019786109985117, + "learning_rate": 1.9351095590469596e-05, + "loss": 0.5784, + "step": 1746 + }, + { + "epoch": 0.14, + "grad_norm": 1.1593224796693433, + "learning_rate": 1.9350162426854152e-05, + "loss": 0.6446, + "step": 1747 + }, + { + "epoch": 0.14, + "grad_norm": 1.003402218113443, + "learning_rate": 1.9349228615280736e-05, + "loss": 0.6559, + "step": 1748 + }, + { + "epoch": 0.14, + "grad_norm": 0.9323779466330664, + "learning_rate": 1.9348294155814078e-05, + "loss": 0.5848, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 0.9649266600056559, + "learning_rate": 1.934735904851892e-05, + "loss": 0.5913, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 1.0189695057359012, + "learning_rate": 1.9346423293460078e-05, + "loss": 0.6156, + "step": 1751 + }, + { + "epoch": 0.14, + "grad_norm": 0.9508957159234627, + "learning_rate": 1.9345486890702386e-05, + "loss": 0.5786, + "step": 1752 + }, + { + "epoch": 0.14, + "grad_norm": 0.8771419924453374, + "learning_rate": 1.9344549840310743e-05, + "loss": 0.5926, + "step": 1753 + }, + { + "epoch": 0.14, + "grad_norm": 1.0763350268212386, + "learning_rate": 1.9343612142350085e-05, + "loss": 0.5966, + "step": 1754 + }, + { + "epoch": 0.14, + "grad_norm": 0.9946198344432027, + "learning_rate": 1.9342673796885395e-05, + "loss": 0.6017, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 0.9539611035425803, + "learning_rate": 1.93417348039817e-05, + "loss": 0.5984, + "step": 1756 + }, + { + "epoch": 0.14, + "grad_norm": 0.9344252040983774, + "learning_rate": 1.934079516370406e-05, + "loss": 0.6351, + "step": 1757 + }, + { + "epoch": 0.14, + "grad_norm": 1.0915301423903436, + "learning_rate": 1.933985487611761e-05, + "loss": 0.6586, + "step": 1758 + }, + { + "epoch": 0.14, + "grad_norm": 0.8947065911393781, + "learning_rate": 1.93389139412875e-05, + "loss": 0.5607, + "step": 1759 + }, + { + "epoch": 0.14, + "grad_norm": 1.1073120365984666, + "learning_rate": 1.9337972359278935e-05, + "loss": 0.5979, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 0.9693167721107211, + "learning_rate": 1.9337030130157166e-05, + "loss": 0.628, + "step": 1761 + }, + { + "epoch": 0.14, + "grad_norm": 0.9625722731315666, + "learning_rate": 1.9336087253987495e-05, + "loss": 0.5942, + "step": 1762 + }, + { + "epoch": 0.14, + "grad_norm": 0.9396578385873692, + "learning_rate": 1.9335143730835258e-05, + "loss": 0.5917, + "step": 1763 + }, + { + "epoch": 0.14, + "grad_norm": 0.9613382312549996, + "learning_rate": 1.933419956076584e-05, + "loss": 0.5517, + "step": 1764 + }, + { + "epoch": 0.14, + "grad_norm": 0.9495156501733618, + "learning_rate": 1.933325474384467e-05, + "loss": 0.5303, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 1.0290084286898358, + "learning_rate": 1.9332309280137227e-05, + "loss": 0.6439, + "step": 1766 + }, + { + "epoch": 0.14, + "grad_norm": 0.9716484435289793, + "learning_rate": 1.933136316970903e-05, + "loss": 0.5967, + "step": 1767 + }, + { + "epoch": 0.14, + "grad_norm": 0.968535746654822, + "learning_rate": 1.933041641262564e-05, + "loss": 0.603, + "step": 1768 + }, + { + "epoch": 0.14, + "grad_norm": 1.045616747126607, + "learning_rate": 1.9329469008952668e-05, + "loss": 0.6284, + "step": 1769 + }, + { + "epoch": 0.14, + "grad_norm": 1.069043035466113, + "learning_rate": 1.932852095875577e-05, + "loss": 0.6672, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 0.9707470847011743, + "learning_rate": 1.9327572262100642e-05, + "loss": 0.5747, + "step": 1771 + }, + { + "epoch": 0.14, + "grad_norm": 0.8752792859825459, + "learning_rate": 1.9326622919053034e-05, + "loss": 0.5589, + "step": 1772 + }, + { + "epoch": 0.14, + "grad_norm": 0.9614197721357719, + "learning_rate": 1.9325672929678728e-05, + "loss": 0.5549, + "step": 1773 + }, + { + "epoch": 0.14, + "grad_norm": 1.0074353729006507, + "learning_rate": 1.932472229404356e-05, + "loss": 0.5593, + "step": 1774 + }, + { + "epoch": 0.14, + "grad_norm": 0.9120956947478422, + "learning_rate": 1.932377101221341e-05, + "loss": 0.5694, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 0.9156991949273016, + "learning_rate": 1.9322819084254197e-05, + "loss": 0.5573, + "step": 1776 + }, + { + "epoch": 0.14, + "grad_norm": 0.9199254380244064, + "learning_rate": 1.9321866510231887e-05, + "loss": 0.6179, + "step": 1777 + }, + { + "epoch": 0.14, + "grad_norm": 1.0861265552584947, + "learning_rate": 1.93209132902125e-05, + "loss": 0.6715, + "step": 1778 + }, + { + "epoch": 0.14, + "grad_norm": 1.1236405448947937, + "learning_rate": 1.9319959424262092e-05, + "loss": 0.6493, + "step": 1779 + }, + { + "epoch": 0.14, + "grad_norm": 1.0467816961232732, + "learning_rate": 1.931900491244676e-05, + "loss": 0.621, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 1.0688593278210219, + "learning_rate": 1.9318049754832656e-05, + "loss": 0.6196, + "step": 1781 + }, + { + "epoch": 0.14, + "grad_norm": 0.9995772369930963, + "learning_rate": 1.9317093951485963e-05, + "loss": 0.5881, + "step": 1782 + }, + { + "epoch": 0.14, + "grad_norm": 1.0178102088596666, + "learning_rate": 1.931613750247293e-05, + "loss": 0.6829, + "step": 1783 + }, + { + "epoch": 0.14, + "grad_norm": 1.0075909234904905, + "learning_rate": 1.9315180407859828e-05, + "loss": 0.6581, + "step": 1784 + }, + { + "epoch": 0.15, + "grad_norm": 1.0256750596460658, + "learning_rate": 1.9314222667712988e-05, + "loss": 0.6293, + "step": 1785 + }, + { + "epoch": 0.15, + "grad_norm": 1.0177433162472909, + "learning_rate": 1.931326428209878e-05, + "loss": 0.6093, + "step": 1786 + }, + { + "epoch": 0.15, + "grad_norm": 1.0862864909273302, + "learning_rate": 1.9312305251083613e-05, + "loss": 0.5152, + "step": 1787 + }, + { + "epoch": 0.15, + "grad_norm": 0.9256285006173005, + "learning_rate": 1.9311345574733958e-05, + "loss": 0.5556, + "step": 1788 + }, + { + "epoch": 0.15, + "grad_norm": 1.0230687624511863, + "learning_rate": 1.9310385253116307e-05, + "loss": 0.6125, + "step": 1789 + }, + { + "epoch": 0.15, + "grad_norm": 1.020000318712854, + "learning_rate": 1.930942428629722e-05, + "loss": 0.593, + "step": 1790 + }, + { + "epoch": 0.15, + "grad_norm": 1.060391790298629, + "learning_rate": 1.9308462674343288e-05, + "loss": 0.6621, + "step": 1791 + }, + { + "epoch": 0.15, + "grad_norm": 0.96873478565776, + "learning_rate": 1.9307500417321154e-05, + "loss": 0.6247, + "step": 1792 + }, + { + "epoch": 0.15, + "grad_norm": 0.9259799452753706, + "learning_rate": 1.930653751529749e-05, + "loss": 0.6099, + "step": 1793 + }, + { + "epoch": 0.15, + "grad_norm": 1.1069947124278503, + "learning_rate": 1.9305573968339032e-05, + "loss": 0.6069, + "step": 1794 + }, + { + "epoch": 0.15, + "grad_norm": 1.058441133773948, + "learning_rate": 1.930460977651255e-05, + "loss": 0.6325, + "step": 1795 + }, + { + "epoch": 0.15, + "grad_norm": 0.977173212643966, + "learning_rate": 1.930364493988487e-05, + "loss": 0.6179, + "step": 1796 + }, + { + "epoch": 0.15, + "grad_norm": 1.0266606354391452, + "learning_rate": 1.9302679458522844e-05, + "loss": 0.5753, + "step": 1797 + }, + { + "epoch": 0.15, + "grad_norm": 0.9743030871356761, + "learning_rate": 1.9301713332493386e-05, + "loss": 0.5387, + "step": 1798 + }, + { + "epoch": 0.15, + "grad_norm": 0.9153132297294244, + "learning_rate": 1.930074656186344e-05, + "loss": 0.5802, + "step": 1799 + }, + { + "epoch": 0.15, + "grad_norm": 1.0851112564975114, + "learning_rate": 1.929977914670001e-05, + "loss": 0.6124, + "step": 1800 + }, + { + "epoch": 0.15, + "grad_norm": 1.0126714996986839, + "learning_rate": 1.9298811087070134e-05, + "loss": 0.6734, + "step": 1801 + }, + { + "epoch": 0.15, + "grad_norm": 1.1123818538124828, + "learning_rate": 1.9297842383040898e-05, + "loss": 0.7079, + "step": 1802 + }, + { + "epoch": 0.15, + "grad_norm": 1.0228876548907846, + "learning_rate": 1.9296873034679427e-05, + "loss": 0.6336, + "step": 1803 + }, + { + "epoch": 0.15, + "grad_norm": 0.9516639256057758, + "learning_rate": 1.9295903042052907e-05, + "loss": 0.5531, + "step": 1804 + }, + { + "epoch": 0.15, + "grad_norm": 0.9704534136451547, + "learning_rate": 1.929493240522855e-05, + "loss": 0.6174, + "step": 1805 + }, + { + "epoch": 0.15, + "grad_norm": 0.9801162091417275, + "learning_rate": 1.9293961124273623e-05, + "loss": 0.6326, + "step": 1806 + }, + { + "epoch": 0.15, + "grad_norm": 1.0425771118361655, + "learning_rate": 1.929298919925543e-05, + "loss": 0.6636, + "step": 1807 + }, + { + "epoch": 0.15, + "grad_norm": 0.9599469190171349, + "learning_rate": 1.9292016630241334e-05, + "loss": 0.6003, + "step": 1808 + }, + { + "epoch": 0.15, + "grad_norm": 0.8707841858381885, + "learning_rate": 1.9291043417298723e-05, + "loss": 0.5516, + "step": 1809 + }, + { + "epoch": 0.15, + "grad_norm": 1.0020013467161477, + "learning_rate": 1.9290069560495042e-05, + "loss": 0.5347, + "step": 1810 + }, + { + "epoch": 0.15, + "grad_norm": 1.095080749599255, + "learning_rate": 1.9289095059897787e-05, + "loss": 0.6277, + "step": 1811 + }, + { + "epoch": 0.15, + "grad_norm": 1.090773935898295, + "learning_rate": 1.9288119915574485e-05, + "loss": 0.6111, + "step": 1812 + }, + { + "epoch": 0.15, + "grad_norm": 0.978967808007286, + "learning_rate": 1.9287144127592704e-05, + "loss": 0.5623, + "step": 1813 + }, + { + "epoch": 0.15, + "grad_norm": 0.9302789832381554, + "learning_rate": 1.9286167696020076e-05, + "loss": 0.549, + "step": 1814 + }, + { + "epoch": 0.15, + "grad_norm": 0.974690334183085, + "learning_rate": 1.9285190620924267e-05, + "loss": 0.6169, + "step": 1815 + }, + { + "epoch": 0.15, + "grad_norm": 0.9767003754849144, + "learning_rate": 1.9284212902372978e-05, + "loss": 0.6241, + "step": 1816 + }, + { + "epoch": 0.15, + "grad_norm": 0.9907255241813573, + "learning_rate": 1.928323454043397e-05, + "loss": 0.6229, + "step": 1817 + }, + { + "epoch": 0.15, + "grad_norm": 1.1142070623511469, + "learning_rate": 1.9282255535175047e-05, + "loss": 0.6103, + "step": 1818 + }, + { + "epoch": 0.15, + "grad_norm": 1.0248511092891297, + "learning_rate": 1.928127588666405e-05, + "loss": 0.5623, + "step": 1819 + }, + { + "epoch": 0.15, + "grad_norm": 0.994991335298417, + "learning_rate": 1.9280295594968863e-05, + "loss": 0.5865, + "step": 1820 + }, + { + "epoch": 0.15, + "grad_norm": 0.9812727318761492, + "learning_rate": 1.9279314660157423e-05, + "loss": 0.6418, + "step": 1821 + }, + { + "epoch": 0.15, + "grad_norm": 0.9630743577652199, + "learning_rate": 1.927833308229771e-05, + "loss": 0.5103, + "step": 1822 + }, + { + "epoch": 0.15, + "grad_norm": 0.9996298075093891, + "learning_rate": 1.927735086145774e-05, + "loss": 0.6354, + "step": 1823 + }, + { + "epoch": 0.15, + "grad_norm": 0.9805994014100998, + "learning_rate": 1.9276367997705584e-05, + "loss": 0.5739, + "step": 1824 + }, + { + "epoch": 0.15, + "grad_norm": 0.9010497876657586, + "learning_rate": 1.927538449110936e-05, + "loss": 0.5258, + "step": 1825 + }, + { + "epoch": 0.15, + "grad_norm": 0.9976777723502999, + "learning_rate": 1.9274400341737214e-05, + "loss": 0.5761, + "step": 1826 + }, + { + "epoch": 0.15, + "grad_norm": 1.0130011150060951, + "learning_rate": 1.927341554965735e-05, + "loss": 0.5522, + "step": 1827 + }, + { + "epoch": 0.15, + "grad_norm": 1.0349919103598368, + "learning_rate": 1.9272430114938018e-05, + "loss": 0.5707, + "step": 1828 + }, + { + "epoch": 0.15, + "grad_norm": 1.0269075489815958, + "learning_rate": 1.92714440376475e-05, + "loss": 0.6424, + "step": 1829 + }, + { + "epoch": 0.15, + "grad_norm": 0.9245815228929377, + "learning_rate": 1.9270457317854135e-05, + "loss": 0.5445, + "step": 1830 + }, + { + "epoch": 0.15, + "grad_norm": 1.042268266511188, + "learning_rate": 1.92694699556263e-05, + "loss": 0.5685, + "step": 1831 + }, + { + "epoch": 0.15, + "grad_norm": 1.0755570703561435, + "learning_rate": 1.926848195103242e-05, + "loss": 0.7127, + "step": 1832 + }, + { + "epoch": 0.15, + "grad_norm": 0.8802799775002621, + "learning_rate": 1.926749330414096e-05, + "loss": 0.5497, + "step": 1833 + }, + { + "epoch": 0.15, + "grad_norm": 0.9749207683721478, + "learning_rate": 1.926650401502044e-05, + "loss": 0.6509, + "step": 1834 + }, + { + "epoch": 0.15, + "grad_norm": 0.9936837660861811, + "learning_rate": 1.9265514083739404e-05, + "loss": 0.6173, + "step": 1835 + }, + { + "epoch": 0.15, + "grad_norm": 0.8941460948798987, + "learning_rate": 1.9264523510366463e-05, + "loss": 0.5846, + "step": 1836 + }, + { + "epoch": 0.15, + "grad_norm": 0.9556447858034413, + "learning_rate": 1.9263532294970263e-05, + "loss": 0.5685, + "step": 1837 + }, + { + "epoch": 0.15, + "grad_norm": 0.9581469049694387, + "learning_rate": 1.9262540437619488e-05, + "loss": 0.608, + "step": 1838 + }, + { + "epoch": 0.15, + "grad_norm": 1.0159830950902855, + "learning_rate": 1.926154793838288e-05, + "loss": 0.6803, + "step": 1839 + }, + { + "epoch": 0.15, + "grad_norm": 1.006614746978762, + "learning_rate": 1.926055479732921e-05, + "loss": 0.5681, + "step": 1840 + }, + { + "epoch": 0.15, + "grad_norm": 0.9604027364713731, + "learning_rate": 1.925956101452731e-05, + "loss": 0.606, + "step": 1841 + }, + { + "epoch": 0.15, + "grad_norm": 1.0826894078754479, + "learning_rate": 1.9258566590046047e-05, + "loss": 0.5971, + "step": 1842 + }, + { + "epoch": 0.15, + "grad_norm": 1.0223462945685502, + "learning_rate": 1.9257571523954328e-05, + "loss": 0.5968, + "step": 1843 + }, + { + "epoch": 0.15, + "grad_norm": 0.9614839255093992, + "learning_rate": 1.9256575816321114e-05, + "loss": 0.5565, + "step": 1844 + }, + { + "epoch": 0.15, + "grad_norm": 0.924425557439055, + "learning_rate": 1.925557946721541e-05, + "loss": 0.5685, + "step": 1845 + }, + { + "epoch": 0.15, + "grad_norm": 1.056866996923417, + "learning_rate": 1.9254582476706254e-05, + "loss": 0.6002, + "step": 1846 + }, + { + "epoch": 0.15, + "grad_norm": 0.9334277386485584, + "learning_rate": 1.9253584844862745e-05, + "loss": 0.6184, + "step": 1847 + }, + { + "epoch": 0.15, + "grad_norm": 0.8767870072335794, + "learning_rate": 1.9252586571754013e-05, + "loss": 0.5253, + "step": 1848 + }, + { + "epoch": 0.15, + "grad_norm": 0.9113482795540835, + "learning_rate": 1.925158765744924e-05, + "loss": 0.5904, + "step": 1849 + }, + { + "epoch": 0.15, + "grad_norm": 0.964622683320886, + "learning_rate": 1.9250588102017643e-05, + "loss": 0.6002, + "step": 1850 + }, + { + "epoch": 0.15, + "grad_norm": 0.9209021189317579, + "learning_rate": 1.92495879055285e-05, + "loss": 0.5529, + "step": 1851 + }, + { + "epoch": 0.15, + "grad_norm": 1.0517115998700501, + "learning_rate": 1.924858706805112e-05, + "loss": 0.6084, + "step": 1852 + }, + { + "epoch": 0.15, + "grad_norm": 1.0121355327781139, + "learning_rate": 1.924758558965486e-05, + "loss": 0.5884, + "step": 1853 + }, + { + "epoch": 0.15, + "grad_norm": 1.035522236552528, + "learning_rate": 1.924658347040912e-05, + "loss": 0.6331, + "step": 1854 + }, + { + "epoch": 0.15, + "grad_norm": 0.9098667167283655, + "learning_rate": 1.9245580710383344e-05, + "loss": 0.5066, + "step": 1855 + }, + { + "epoch": 0.15, + "grad_norm": 0.9882838782668055, + "learning_rate": 1.924457730964703e-05, + "loss": 0.6573, + "step": 1856 + }, + { + "epoch": 0.15, + "grad_norm": 0.8170470779204368, + "learning_rate": 1.9243573268269706e-05, + "loss": 0.5192, + "step": 1857 + }, + { + "epoch": 0.15, + "grad_norm": 1.1153812484779042, + "learning_rate": 1.9242568586320956e-05, + "loss": 0.6242, + "step": 1858 + }, + { + "epoch": 0.15, + "grad_norm": 1.034925508619141, + "learning_rate": 1.92415632638704e-05, + "loss": 0.6048, + "step": 1859 + }, + { + "epoch": 0.15, + "grad_norm": 0.943648489456906, + "learning_rate": 1.9240557300987705e-05, + "loss": 0.5746, + "step": 1860 + }, + { + "epoch": 0.15, + "grad_norm": 1.0863408480572274, + "learning_rate": 1.923955069774259e-05, + "loss": 0.7041, + "step": 1861 + }, + { + "epoch": 0.15, + "grad_norm": 1.0642094533973365, + "learning_rate": 1.9238543454204802e-05, + "loss": 0.6411, + "step": 1862 + }, + { + "epoch": 0.15, + "grad_norm": 1.0069358328656293, + "learning_rate": 1.923753557044415e-05, + "loss": 0.6549, + "step": 1863 + }, + { + "epoch": 0.15, + "grad_norm": 0.9637795959365351, + "learning_rate": 1.9236527046530476e-05, + "loss": 0.6205, + "step": 1864 + }, + { + "epoch": 0.15, + "grad_norm": 0.9083232656443532, + "learning_rate": 1.923551788253367e-05, + "loss": 0.5804, + "step": 1865 + }, + { + "epoch": 0.15, + "grad_norm": 1.0283659794931066, + "learning_rate": 1.923450807852367e-05, + "loss": 0.581, + "step": 1866 + }, + { + "epoch": 0.15, + "grad_norm": 1.0104856417456372, + "learning_rate": 1.9233497634570446e-05, + "loss": 0.6503, + "step": 1867 + }, + { + "epoch": 0.15, + "grad_norm": 1.105383745788613, + "learning_rate": 1.923248655074403e-05, + "loss": 0.7004, + "step": 1868 + }, + { + "epoch": 0.15, + "grad_norm": 1.0774461890083162, + "learning_rate": 1.923147482711448e-05, + "loss": 0.6321, + "step": 1869 + }, + { + "epoch": 0.15, + "grad_norm": 1.0318851189457057, + "learning_rate": 1.923046246375192e-05, + "loss": 0.6791, + "step": 1870 + }, + { + "epoch": 0.15, + "grad_norm": 0.9327788418837317, + "learning_rate": 1.9229449460726495e-05, + "loss": 0.5738, + "step": 1871 + }, + { + "epoch": 0.15, + "grad_norm": 0.9803706876908969, + "learning_rate": 1.9228435818108408e-05, + "loss": 0.6639, + "step": 1872 + }, + { + "epoch": 0.15, + "grad_norm": 0.9657285086391221, + "learning_rate": 1.9227421535967906e-05, + "loss": 0.6011, + "step": 1873 + }, + { + "epoch": 0.15, + "grad_norm": 0.9644259081561086, + "learning_rate": 1.9226406614375276e-05, + "loss": 0.4687, + "step": 1874 + }, + { + "epoch": 0.15, + "grad_norm": 1.0937221730243076, + "learning_rate": 1.922539105340085e-05, + "loss": 0.5926, + "step": 1875 + }, + { + "epoch": 0.15, + "grad_norm": 1.0355689113799338, + "learning_rate": 1.922437485311501e-05, + "loss": 0.6252, + "step": 1876 + }, + { + "epoch": 0.15, + "grad_norm": 0.9493040784838319, + "learning_rate": 1.9223358013588172e-05, + "loss": 0.5499, + "step": 1877 + }, + { + "epoch": 0.15, + "grad_norm": 0.9474996681844716, + "learning_rate": 1.9222340534890803e-05, + "loss": 0.5851, + "step": 1878 + }, + { + "epoch": 0.15, + "grad_norm": 1.0338117204194743, + "learning_rate": 1.922132241709342e-05, + "loss": 0.6586, + "step": 1879 + }, + { + "epoch": 0.15, + "grad_norm": 0.9455740270706581, + "learning_rate": 1.9220303660266568e-05, + "loss": 0.5658, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 0.9766084713327681, + "learning_rate": 1.9219284264480854e-05, + "loss": 0.5583, + "step": 1881 + }, + { + "epoch": 0.15, + "grad_norm": 1.1407050220765134, + "learning_rate": 1.9218264229806917e-05, + "loss": 0.6909, + "step": 1882 + }, + { + "epoch": 0.15, + "grad_norm": 1.0189913748914254, + "learning_rate": 1.9217243556315445e-05, + "loss": 0.5941, + "step": 1883 + }, + { + "epoch": 0.15, + "grad_norm": 1.005943329132577, + "learning_rate": 1.9216222244077173e-05, + "loss": 0.6116, + "step": 1884 + }, + { + "epoch": 0.15, + "grad_norm": 1.0085992680953428, + "learning_rate": 1.921520029316287e-05, + "loss": 0.5959, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 0.9530355673098095, + "learning_rate": 1.9214177703643365e-05, + "loss": 0.5689, + "step": 1886 + }, + { + "epoch": 0.15, + "grad_norm": 0.9927108368990538, + "learning_rate": 1.9213154475589513e-05, + "loss": 0.4917, + "step": 1887 + }, + { + "epoch": 0.15, + "grad_norm": 1.145381087906433, + "learning_rate": 1.921213060907223e-05, + "loss": 0.6237, + "step": 1888 + }, + { + "epoch": 0.15, + "grad_norm": 0.9237290032126951, + "learning_rate": 1.921110610416247e-05, + "loss": 0.5401, + "step": 1889 + }, + { + "epoch": 0.15, + "grad_norm": 1.1816105930140846, + "learning_rate": 1.9210080960931224e-05, + "loss": 0.6396, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 1.045015765819876, + "learning_rate": 1.920905517944954e-05, + "loss": 0.5987, + "step": 1891 + }, + { + "epoch": 0.15, + "grad_norm": 1.0438281050104827, + "learning_rate": 1.9208028759788496e-05, + "loss": 0.6267, + "step": 1892 + }, + { + "epoch": 0.15, + "grad_norm": 1.1258432834265213, + "learning_rate": 1.920700170201923e-05, + "loss": 0.6134, + "step": 1893 + }, + { + "epoch": 0.15, + "grad_norm": 1.0806923013990861, + "learning_rate": 1.920597400621291e-05, + "loss": 0.7003, + "step": 1894 + }, + { + "epoch": 0.15, + "grad_norm": 1.0215362002067059, + "learning_rate": 1.9204945672440757e-05, + "loss": 0.5316, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 1.1174940694287256, + "learning_rate": 1.9203916700774035e-05, + "loss": 0.688, + "step": 1896 + }, + { + "epoch": 0.15, + "grad_norm": 1.0585002502030911, + "learning_rate": 1.920288709128405e-05, + "loss": 0.5828, + "step": 1897 + }, + { + "epoch": 0.15, + "grad_norm": 1.10077227527449, + "learning_rate": 1.920185684404215e-05, + "loss": 0.6515, + "step": 1898 + }, + { + "epoch": 0.15, + "grad_norm": 1.047385362560591, + "learning_rate": 1.9200825959119736e-05, + "loss": 0.6398, + "step": 1899 + }, + { + "epoch": 0.15, + "grad_norm": 1.0580131941502218, + "learning_rate": 1.9199794436588244e-05, + "loss": 0.6211, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 1.0182865825702039, + "learning_rate": 1.9198762276519156e-05, + "loss": 0.6557, + "step": 1901 + }, + { + "epoch": 0.15, + "grad_norm": 0.9618402593349183, + "learning_rate": 1.9197729478984003e-05, + "loss": 0.5659, + "step": 1902 + }, + { + "epoch": 0.15, + "grad_norm": 0.9881111937134388, + "learning_rate": 1.9196696044054354e-05, + "loss": 0.6162, + "step": 1903 + }, + { + "epoch": 0.15, + "grad_norm": 0.9798061834622409, + "learning_rate": 1.9195661971801825e-05, + "loss": 0.5748, + "step": 1904 + }, + { + "epoch": 0.15, + "grad_norm": 1.0083357679462275, + "learning_rate": 1.9194627262298082e-05, + "loss": 0.5276, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 0.979514797051948, + "learning_rate": 1.9193591915614824e-05, + "loss": 0.5788, + "step": 1906 + }, + { + "epoch": 0.15, + "grad_norm": 1.086160667609808, + "learning_rate": 1.9192555931823798e-05, + "loss": 0.6366, + "step": 1907 + }, + { + "epoch": 0.16, + "grad_norm": 0.9793396668250293, + "learning_rate": 1.9191519310996806e-05, + "loss": 0.5931, + "step": 1908 + }, + { + "epoch": 0.16, + "grad_norm": 0.9361137077940025, + "learning_rate": 1.9190482053205673e-05, + "loss": 0.5084, + "step": 1909 + }, + { + "epoch": 0.16, + "grad_norm": 0.9618308068511757, + "learning_rate": 1.9189444158522287e-05, + "loss": 0.6516, + "step": 1910 + }, + { + "epoch": 0.16, + "grad_norm": 0.9732343315606669, + "learning_rate": 1.9188405627018573e-05, + "loss": 0.5654, + "step": 1911 + }, + { + "epoch": 0.16, + "grad_norm": 1.132773160557045, + "learning_rate": 1.9187366458766497e-05, + "loss": 0.6753, + "step": 1912 + }, + { + "epoch": 0.16, + "grad_norm": 1.0402651050839569, + "learning_rate": 1.9186326653838075e-05, + "loss": 0.6445, + "step": 1913 + }, + { + "epoch": 0.16, + "grad_norm": 0.996823757390506, + "learning_rate": 1.918528621230537e-05, + "loss": 0.6372, + "step": 1914 + }, + { + "epoch": 0.16, + "grad_norm": 1.073328738789437, + "learning_rate": 1.918424513424047e-05, + "loss": 0.6432, + "step": 1915 + }, + { + "epoch": 0.16, + "grad_norm": 0.9053958985827631, + "learning_rate": 1.918320341971553e-05, + "loss": 0.5817, + "step": 1916 + }, + { + "epoch": 0.16, + "grad_norm": 0.9971311966062738, + "learning_rate": 1.9182161068802742e-05, + "loss": 0.548, + "step": 1917 + }, + { + "epoch": 0.16, + "grad_norm": 0.9791716632717283, + "learning_rate": 1.9181118081574336e-05, + "loss": 0.5843, + "step": 1918 + }, + { + "epoch": 0.16, + "grad_norm": 0.9691858920466895, + "learning_rate": 1.918007445810259e-05, + "loss": 0.5861, + "step": 1919 + }, + { + "epoch": 0.16, + "grad_norm": 1.0086233222980516, + "learning_rate": 1.9179030198459822e-05, + "loss": 0.6408, + "step": 1920 + }, + { + "epoch": 0.16, + "grad_norm": 0.9223756495219673, + "learning_rate": 1.917798530271841e-05, + "loss": 0.5596, + "step": 1921 + }, + { + "epoch": 0.16, + "grad_norm": 1.0383476005597099, + "learning_rate": 1.9176939770950753e-05, + "loss": 0.5413, + "step": 1922 + }, + { + "epoch": 0.16, + "grad_norm": 0.9672186642469593, + "learning_rate": 1.917589360322931e-05, + "loss": 0.5994, + "step": 1923 + }, + { + "epoch": 0.16, + "grad_norm": 1.0489479306416298, + "learning_rate": 1.9174846799626584e-05, + "loss": 0.5887, + "step": 1924 + }, + { + "epoch": 0.16, + "grad_norm": 1.0890574446451857, + "learning_rate": 1.9173799360215106e-05, + "loss": 0.5113, + "step": 1925 + }, + { + "epoch": 0.16, + "grad_norm": 0.8828386718344657, + "learning_rate": 1.917275128506747e-05, + "loss": 0.5213, + "step": 1926 + }, + { + "epoch": 0.16, + "grad_norm": 1.050030722223755, + "learning_rate": 1.9171702574256314e-05, + "loss": 0.6337, + "step": 1927 + }, + { + "epoch": 0.16, + "grad_norm": 0.9529536751310449, + "learning_rate": 1.91706532278543e-05, + "loss": 0.5547, + "step": 1928 + }, + { + "epoch": 0.16, + "grad_norm": 1.1090583641257683, + "learning_rate": 1.916960324593415e-05, + "loss": 0.697, + "step": 1929 + }, + { + "epoch": 0.16, + "grad_norm": 0.8848474899547881, + "learning_rate": 1.9168552628568632e-05, + "loss": 0.5998, + "step": 1930 + }, + { + "epoch": 0.16, + "grad_norm": 0.9502043834849424, + "learning_rate": 1.9167501375830543e-05, + "loss": 0.5589, + "step": 1931 + }, + { + "epoch": 0.16, + "grad_norm": 1.0050861666142, + "learning_rate": 1.9166449487792746e-05, + "loss": 0.584, + "step": 1932 + }, + { + "epoch": 0.16, + "grad_norm": 1.0436702439254533, + "learning_rate": 1.916539696452813e-05, + "loss": 0.634, + "step": 1933 + }, + { + "epoch": 0.16, + "grad_norm": 0.9459373406054806, + "learning_rate": 1.916434380610963e-05, + "loss": 0.5895, + "step": 1934 + }, + { + "epoch": 0.16, + "grad_norm": 1.054908650188775, + "learning_rate": 1.916329001261024e-05, + "loss": 0.568, + "step": 1935 + }, + { + "epoch": 0.16, + "grad_norm": 0.9330365256475838, + "learning_rate": 1.9162235584102973e-05, + "loss": 0.5557, + "step": 1936 + }, + { + "epoch": 0.16, + "grad_norm": 1.0055011485384573, + "learning_rate": 1.916118052066091e-05, + "loss": 0.6304, + "step": 1937 + }, + { + "epoch": 0.16, + "grad_norm": 1.02211232342174, + "learning_rate": 1.9160124822357162e-05, + "loss": 0.6059, + "step": 1938 + }, + { + "epoch": 0.16, + "grad_norm": 0.9944380434012641, + "learning_rate": 1.915906848926489e-05, + "loss": 0.6317, + "step": 1939 + }, + { + "epoch": 0.16, + "grad_norm": 0.9493365884245534, + "learning_rate": 1.9158011521457296e-05, + "loss": 0.6037, + "step": 1940 + }, + { + "epoch": 0.16, + "grad_norm": 1.0219440943167644, + "learning_rate": 1.9156953919007625e-05, + "loss": 0.6231, + "step": 1941 + }, + { + "epoch": 0.16, + "grad_norm": 1.135401366313444, + "learning_rate": 1.915589568198917e-05, + "loss": 0.5845, + "step": 1942 + }, + { + "epoch": 0.16, + "grad_norm": 0.9750445054134775, + "learning_rate": 1.9154836810475266e-05, + "loss": 0.5453, + "step": 1943 + }, + { + "epoch": 0.16, + "grad_norm": 0.9264585897691986, + "learning_rate": 1.9153777304539295e-05, + "loss": 0.5591, + "step": 1944 + }, + { + "epoch": 0.16, + "grad_norm": 1.0085873685393283, + "learning_rate": 1.9152717164254668e-05, + "loss": 0.664, + "step": 1945 + }, + { + "epoch": 0.16, + "grad_norm": 1.0278958093912496, + "learning_rate": 1.915165638969487e-05, + "loss": 0.5891, + "step": 1946 + }, + { + "epoch": 0.16, + "grad_norm": 1.064403603956213, + "learning_rate": 1.9150594980933392e-05, + "loss": 0.5634, + "step": 1947 + }, + { + "epoch": 0.16, + "grad_norm": 0.9931787664117413, + "learning_rate": 1.9149532938043803e-05, + "loss": 0.6159, + "step": 1948 + }, + { + "epoch": 0.16, + "grad_norm": 1.0329314277757113, + "learning_rate": 1.9148470261099698e-05, + "loss": 0.6072, + "step": 1949 + }, + { + "epoch": 0.16, + "grad_norm": 1.1039277621455925, + "learning_rate": 1.9147406950174715e-05, + "loss": 0.5811, + "step": 1950 + }, + { + "epoch": 0.16, + "grad_norm": 1.125424655598283, + "learning_rate": 1.9146343005342546e-05, + "loss": 0.6605, + "step": 1951 + }, + { + "epoch": 0.16, + "grad_norm": 1.0717462179634367, + "learning_rate": 1.914527842667692e-05, + "loss": 0.6044, + "step": 1952 + }, + { + "epoch": 0.16, + "grad_norm": 1.0645384893136742, + "learning_rate": 1.914421321425161e-05, + "loss": 0.6595, + "step": 1953 + }, + { + "epoch": 0.16, + "grad_norm": 0.9880743315660508, + "learning_rate": 1.914314736814044e-05, + "loss": 0.516, + "step": 1954 + }, + { + "epoch": 0.16, + "grad_norm": 0.990695340025184, + "learning_rate": 1.914208088841726e-05, + "loss": 0.5579, + "step": 1955 + }, + { + "epoch": 0.16, + "grad_norm": 1.0861237703124709, + "learning_rate": 1.914101377515599e-05, + "loss": 0.5526, + "step": 1956 + }, + { + "epoch": 0.16, + "grad_norm": 0.9343367407195712, + "learning_rate": 1.9139946028430568e-05, + "loss": 0.5441, + "step": 1957 + }, + { + "epoch": 0.16, + "grad_norm": 1.0557213241662768, + "learning_rate": 1.9138877648314994e-05, + "loss": 0.6169, + "step": 1958 + }, + { + "epoch": 0.16, + "grad_norm": 1.1145568689622156, + "learning_rate": 1.9137808634883307e-05, + "loss": 0.5841, + "step": 1959 + }, + { + "epoch": 0.16, + "grad_norm": 0.9547072242413849, + "learning_rate": 1.9136738988209585e-05, + "loss": 0.6044, + "step": 1960 + }, + { + "epoch": 0.16, + "grad_norm": 1.0031154617029014, + "learning_rate": 1.913566870836796e-05, + "loss": 0.6126, + "step": 1961 + }, + { + "epoch": 0.16, + "grad_norm": 0.9713050505949087, + "learning_rate": 1.913459779543259e-05, + "loss": 0.6042, + "step": 1962 + }, + { + "epoch": 0.16, + "grad_norm": 1.0593675597978944, + "learning_rate": 1.91335262494777e-05, + "loss": 0.579, + "step": 1963 + }, + { + "epoch": 0.16, + "grad_norm": 1.1283732724343774, + "learning_rate": 1.913245407057754e-05, + "loss": 0.6801, + "step": 1964 + }, + { + "epoch": 0.16, + "grad_norm": 0.8975735142461613, + "learning_rate": 1.9131381258806417e-05, + "loss": 0.5387, + "step": 1965 + }, + { + "epoch": 0.16, + "grad_norm": 0.9882741415747127, + "learning_rate": 1.9130307814238672e-05, + "loss": 0.5941, + "step": 1966 + }, + { + "epoch": 0.16, + "grad_norm": 0.9637366097998488, + "learning_rate": 1.912923373694869e-05, + "loss": 0.6374, + "step": 1967 + }, + { + "epoch": 0.16, + "grad_norm": 0.9766823600750104, + "learning_rate": 1.912815902701091e-05, + "loss": 0.5561, + "step": 1968 + }, + { + "epoch": 0.16, + "grad_norm": 0.9260778389900388, + "learning_rate": 1.9127083684499805e-05, + "loss": 0.5718, + "step": 1969 + }, + { + "epoch": 0.16, + "grad_norm": 1.091377532249139, + "learning_rate": 1.9126007709489896e-05, + "loss": 0.5721, + "step": 1970 + }, + { + "epoch": 0.16, + "grad_norm": 1.0418857534347066, + "learning_rate": 1.912493110205575e-05, + "loss": 0.6359, + "step": 1971 + }, + { + "epoch": 0.16, + "grad_norm": 0.9657653994400671, + "learning_rate": 1.912385386227197e-05, + "loss": 0.5676, + "step": 1972 + }, + { + "epoch": 0.16, + "grad_norm": 0.9697092686869903, + "learning_rate": 1.9122775990213212e-05, + "loss": 0.6102, + "step": 1973 + }, + { + "epoch": 0.16, + "grad_norm": 1.0133425995852425, + "learning_rate": 1.9121697485954168e-05, + "loss": 0.6055, + "step": 1974 + }, + { + "epoch": 0.16, + "grad_norm": 0.9679797921794574, + "learning_rate": 1.912061834956958e-05, + "loss": 0.581, + "step": 1975 + }, + { + "epoch": 0.16, + "grad_norm": 0.9154155156284548, + "learning_rate": 1.911953858113423e-05, + "loss": 0.546, + "step": 1976 + }, + { + "epoch": 0.16, + "grad_norm": 0.933799306983249, + "learning_rate": 1.9118458180722945e-05, + "loss": 0.6079, + "step": 1977 + }, + { + "epoch": 0.16, + "grad_norm": 1.053758507399398, + "learning_rate": 1.91173771484106e-05, + "loss": 0.5769, + "step": 1978 + }, + { + "epoch": 0.16, + "grad_norm": 1.138191845494953, + "learning_rate": 1.9116295484272102e-05, + "loss": 0.6154, + "step": 1979 + }, + { + "epoch": 0.16, + "grad_norm": 0.9134720259284926, + "learning_rate": 1.9115213188382413e-05, + "loss": 0.5892, + "step": 1980 + }, + { + "epoch": 0.16, + "grad_norm": 1.059907173398205, + "learning_rate": 1.9114130260816534e-05, + "loss": 0.6303, + "step": 1981 + }, + { + "epoch": 0.16, + "grad_norm": 0.9855166008746266, + "learning_rate": 1.9113046701649517e-05, + "loss": 0.5789, + "step": 1982 + }, + { + "epoch": 0.16, + "grad_norm": 1.0597633161782598, + "learning_rate": 1.9111962510956442e-05, + "loss": 0.5726, + "step": 1983 + }, + { + "epoch": 0.16, + "grad_norm": 0.8120227538838763, + "learning_rate": 1.9110877688812452e-05, + "loss": 0.5493, + "step": 1984 + }, + { + "epoch": 0.16, + "grad_norm": 1.022270679868322, + "learning_rate": 1.9109792235292715e-05, + "loss": 0.6067, + "step": 1985 + }, + { + "epoch": 0.16, + "grad_norm": 1.0319294759236322, + "learning_rate": 1.9108706150472457e-05, + "loss": 0.5586, + "step": 1986 + }, + { + "epoch": 0.16, + "grad_norm": 0.9043951748653178, + "learning_rate": 1.9107619434426944e-05, + "loss": 0.6206, + "step": 1987 + }, + { + "epoch": 0.16, + "grad_norm": 0.8981673775030506, + "learning_rate": 1.9106532087231483e-05, + "loss": 0.5786, + "step": 1988 + }, + { + "epoch": 0.16, + "grad_norm": 0.9107278405121401, + "learning_rate": 1.9105444108961423e-05, + "loss": 0.6036, + "step": 1989 + }, + { + "epoch": 0.16, + "grad_norm": 1.044523349649377, + "learning_rate": 1.9104355499692166e-05, + "loss": 0.633, + "step": 1990 + }, + { + "epoch": 0.16, + "grad_norm": 1.0117957050741717, + "learning_rate": 1.9103266259499146e-05, + "loss": 0.5685, + "step": 1991 + }, + { + "epoch": 0.16, + "grad_norm": 0.965600636959254, + "learning_rate": 1.910217638845785e-05, + "loss": 0.6139, + "step": 1992 + }, + { + "epoch": 0.16, + "grad_norm": 1.0093507104367392, + "learning_rate": 1.9101085886643804e-05, + "loss": 0.6291, + "step": 1993 + }, + { + "epoch": 0.16, + "grad_norm": 1.0634999781981085, + "learning_rate": 1.909999475413258e-05, + "loss": 0.6767, + "step": 1994 + }, + { + "epoch": 0.16, + "grad_norm": 0.9359622179269197, + "learning_rate": 1.909890299099979e-05, + "loss": 0.5753, + "step": 1995 + }, + { + "epoch": 0.16, + "grad_norm": 0.9781846611171376, + "learning_rate": 1.9097810597321095e-05, + "loss": 0.5734, + "step": 1996 + }, + { + "epoch": 0.16, + "grad_norm": 0.9658528853661609, + "learning_rate": 1.9096717573172192e-05, + "loss": 0.6279, + "step": 1997 + }, + { + "epoch": 0.16, + "grad_norm": 1.0353063421004602, + "learning_rate": 1.909562391862883e-05, + "loss": 0.6234, + "step": 1998 + }, + { + "epoch": 0.16, + "grad_norm": 0.8673949782339934, + "learning_rate": 1.90945296337668e-05, + "loss": 0.5474, + "step": 1999 + }, + { + "epoch": 0.16, + "grad_norm": 1.0514599651032068, + "learning_rate": 1.909343471866193e-05, + "loss": 0.6215, + "step": 2000 + }, + { + "epoch": 0.16, + "grad_norm": 1.0759403952207536, + "learning_rate": 1.9092339173390108e-05, + "loss": 0.6361, + "step": 2001 + }, + { + "epoch": 0.16, + "grad_norm": 1.1525565183994508, + "learning_rate": 1.909124299802724e-05, + "loss": 0.6463, + "step": 2002 + }, + { + "epoch": 0.16, + "grad_norm": 1.0991145737955368, + "learning_rate": 1.9090146192649293e-05, + "loss": 0.6291, + "step": 2003 + }, + { + "epoch": 0.16, + "grad_norm": 0.9114300270012013, + "learning_rate": 1.9089048757332285e-05, + "loss": 0.5504, + "step": 2004 + }, + { + "epoch": 0.16, + "grad_norm": 1.1271425098911323, + "learning_rate": 1.908795069215226e-05, + "loss": 0.6001, + "step": 2005 + }, + { + "epoch": 0.16, + "grad_norm": 0.9186969798037613, + "learning_rate": 1.9086851997185307e-05, + "loss": 0.5815, + "step": 2006 + }, + { + "epoch": 0.16, + "grad_norm": 0.9598657717472785, + "learning_rate": 1.908575267250757e-05, + "loss": 0.6459, + "step": 2007 + }, + { + "epoch": 0.16, + "grad_norm": 1.0136639599893646, + "learning_rate": 1.9084652718195237e-05, + "loss": 0.5951, + "step": 2008 + }, + { + "epoch": 0.16, + "grad_norm": 0.9345431131491013, + "learning_rate": 1.908355213432453e-05, + "loss": 0.5771, + "step": 2009 + }, + { + "epoch": 0.16, + "grad_norm": 0.8956818361321761, + "learning_rate": 1.9082450920971712e-05, + "loss": 0.593, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 0.9994855629040246, + "learning_rate": 1.9081349078213105e-05, + "loss": 0.6248, + "step": 2011 + }, + { + "epoch": 0.16, + "grad_norm": 1.2337518096291753, + "learning_rate": 1.908024660612506e-05, + "loss": 0.5699, + "step": 2012 + }, + { + "epoch": 0.16, + "grad_norm": 0.938035174519471, + "learning_rate": 1.9079143504783982e-05, + "loss": 0.6318, + "step": 2013 + }, + { + "epoch": 0.16, + "grad_norm": 0.9892323617279899, + "learning_rate": 1.9078039774266308e-05, + "loss": 0.5445, + "step": 2014 + }, + { + "epoch": 0.16, + "grad_norm": 0.922117987267186, + "learning_rate": 1.9076935414648533e-05, + "loss": 0.5127, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 0.9604587542539131, + "learning_rate": 1.9075830426007184e-05, + "loss": 0.5714, + "step": 2016 + }, + { + "epoch": 0.16, + "grad_norm": 0.9977282997590355, + "learning_rate": 1.9074724808418837e-05, + "loss": 0.537, + "step": 2017 + }, + { + "epoch": 0.16, + "grad_norm": 1.0716207428072122, + "learning_rate": 1.907361856196011e-05, + "loss": 0.6096, + "step": 2018 + }, + { + "epoch": 0.16, + "grad_norm": 1.0384083043576222, + "learning_rate": 1.9072511686707663e-05, + "loss": 0.5372, + "step": 2019 + }, + { + "epoch": 0.16, + "grad_norm": 0.9594529053049637, + "learning_rate": 1.9071404182738206e-05, + "loss": 0.5936, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 0.9420925394401806, + "learning_rate": 1.9070296050128486e-05, + "loss": 0.532, + "step": 2021 + }, + { + "epoch": 0.16, + "grad_norm": 1.0147759141041532, + "learning_rate": 1.9069187288955296e-05, + "loss": 0.5981, + "step": 2022 + }, + { + "epoch": 0.16, + "grad_norm": 0.9816266033078141, + "learning_rate": 1.9068077899295468e-05, + "loss": 0.6499, + "step": 2023 + }, + { + "epoch": 0.16, + "grad_norm": 0.9879841579914928, + "learning_rate": 1.9066967881225887e-05, + "loss": 0.5568, + "step": 2024 + }, + { + "epoch": 0.16, + "grad_norm": 1.058523530099737, + "learning_rate": 1.906585723482347e-05, + "loss": 0.6412, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 0.9878797709934289, + "learning_rate": 1.9064745960165196e-05, + "loss": 0.6247, + "step": 2026 + }, + { + "epoch": 0.16, + "grad_norm": 0.8936292649973512, + "learning_rate": 1.906363405732806e-05, + "loss": 0.5601, + "step": 2027 + }, + { + "epoch": 0.16, + "grad_norm": 1.011701229053685, + "learning_rate": 1.9062521526389126e-05, + "loss": 0.5808, + "step": 2028 + }, + { + "epoch": 0.16, + "grad_norm": 1.1187731519116415, + "learning_rate": 1.906140836742549e-05, + "loss": 0.6699, + "step": 2029 + }, + { + "epoch": 0.16, + "grad_norm": 1.0718544915843096, + "learning_rate": 1.9060294580514293e-05, + "loss": 0.6348, + "step": 2030 + }, + { + "epoch": 0.17, + "grad_norm": 0.9462997682760822, + "learning_rate": 1.905918016573271e-05, + "loss": 0.6116, + "step": 2031 + }, + { + "epoch": 0.17, + "grad_norm": 0.9663357125634172, + "learning_rate": 1.9058065123157985e-05, + "loss": 0.6202, + "step": 2032 + }, + { + "epoch": 0.17, + "grad_norm": 0.9246530545030645, + "learning_rate": 1.905694945286738e-05, + "loss": 0.5894, + "step": 2033 + }, + { + "epoch": 0.17, + "grad_norm": 0.9493007285658821, + "learning_rate": 1.9055833154938208e-05, + "loss": 0.5899, + "step": 2034 + }, + { + "epoch": 0.17, + "grad_norm": 0.9030284864206434, + "learning_rate": 1.9054716229447835e-05, + "loss": 0.5221, + "step": 2035 + }, + { + "epoch": 0.17, + "grad_norm": 0.9601465057030015, + "learning_rate": 1.9053598676473656e-05, + "loss": 0.6482, + "step": 2036 + }, + { + "epoch": 0.17, + "grad_norm": 1.0732783069225416, + "learning_rate": 1.905248049609312e-05, + "loss": 0.6566, + "step": 2037 + }, + { + "epoch": 0.17, + "grad_norm": 1.1640059200373662, + "learning_rate": 1.9051361688383715e-05, + "loss": 0.6609, + "step": 2038 + }, + { + "epoch": 0.17, + "grad_norm": 0.922087630723464, + "learning_rate": 1.9050242253422975e-05, + "loss": 0.5481, + "step": 2039 + }, + { + "epoch": 0.17, + "grad_norm": 1.0115892884144417, + "learning_rate": 1.9049122191288473e-05, + "loss": 0.5152, + "step": 2040 + }, + { + "epoch": 0.17, + "grad_norm": 0.9810015503795328, + "learning_rate": 1.9048001502057828e-05, + "loss": 0.5688, + "step": 2041 + }, + { + "epoch": 0.17, + "grad_norm": 0.9424808502813058, + "learning_rate": 1.9046880185808706e-05, + "loss": 0.629, + "step": 2042 + }, + { + "epoch": 0.17, + "grad_norm": 1.0318164669425995, + "learning_rate": 1.9045758242618813e-05, + "loss": 0.6051, + "step": 2043 + }, + { + "epoch": 0.17, + "grad_norm": 1.0273118739479803, + "learning_rate": 1.9044635672565898e-05, + "loss": 0.5613, + "step": 2044 + }, + { + "epoch": 0.17, + "grad_norm": 0.8830868072185081, + "learning_rate": 1.904351247572775e-05, + "loss": 0.5698, + "step": 2045 + }, + { + "epoch": 0.17, + "grad_norm": 1.1183889457477896, + "learning_rate": 1.904238865218221e-05, + "loss": 0.4665, + "step": 2046 + }, + { + "epoch": 0.17, + "grad_norm": 1.0225856305965528, + "learning_rate": 1.9041264202007158e-05, + "loss": 0.5748, + "step": 2047 + }, + { + "epoch": 0.17, + "grad_norm": 1.0694762111194591, + "learning_rate": 1.9040139125280517e-05, + "loss": 0.5748, + "step": 2048 + }, + { + "epoch": 0.17, + "grad_norm": 1.0199752555319277, + "learning_rate": 1.9039013422080255e-05, + "loss": 0.6081, + "step": 2049 + }, + { + "epoch": 0.17, + "grad_norm": 0.9842016088499462, + "learning_rate": 1.9037887092484377e-05, + "loss": 0.5637, + "step": 2050 + }, + { + "epoch": 0.17, + "grad_norm": 1.023273970692155, + "learning_rate": 1.903676013657094e-05, + "loss": 0.6317, + "step": 2051 + }, + { + "epoch": 0.17, + "grad_norm": 1.030403375603567, + "learning_rate": 1.9035632554418045e-05, + "loss": 0.6595, + "step": 2052 + }, + { + "epoch": 0.17, + "grad_norm": 0.9432884806250941, + "learning_rate": 1.9034504346103825e-05, + "loss": 0.6023, + "step": 2053 + }, + { + "epoch": 0.17, + "grad_norm": 1.0073480752386745, + "learning_rate": 1.9033375511706466e-05, + "loss": 0.6112, + "step": 2054 + }, + { + "epoch": 0.17, + "grad_norm": 1.068931655724002, + "learning_rate": 1.90322460513042e-05, + "loss": 0.6017, + "step": 2055 + }, + { + "epoch": 0.17, + "grad_norm": 1.0386163177149474, + "learning_rate": 1.9031115964975295e-05, + "loss": 0.6221, + "step": 2056 + }, + { + "epoch": 0.17, + "grad_norm": 1.0051235548300301, + "learning_rate": 1.9029985252798062e-05, + "loss": 0.6493, + "step": 2057 + }, + { + "epoch": 0.17, + "grad_norm": 1.0869761990452371, + "learning_rate": 1.902885391485086e-05, + "loss": 0.5659, + "step": 2058 + }, + { + "epoch": 0.17, + "grad_norm": 0.9535848607413672, + "learning_rate": 1.9027721951212092e-05, + "loss": 0.6128, + "step": 2059 + }, + { + "epoch": 0.17, + "grad_norm": 1.0412077321045292, + "learning_rate": 1.90265893619602e-05, + "loss": 0.6478, + "step": 2060 + }, + { + "epoch": 0.17, + "grad_norm": 1.0429919366838214, + "learning_rate": 1.9025456147173668e-05, + "loss": 0.6557, + "step": 2061 + }, + { + "epoch": 0.17, + "grad_norm": 0.9722372011141724, + "learning_rate": 1.9024322306931035e-05, + "loss": 0.585, + "step": 2062 + }, + { + "epoch": 0.17, + "grad_norm": 1.026797061134865, + "learning_rate": 1.902318784131087e-05, + "loss": 0.661, + "step": 2063 + }, + { + "epoch": 0.17, + "grad_norm": 0.9701379700065667, + "learning_rate": 1.902205275039179e-05, + "loss": 0.6493, + "step": 2064 + }, + { + "epoch": 0.17, + "grad_norm": 0.9532383610104006, + "learning_rate": 1.902091703425246e-05, + "loss": 0.5722, + "step": 2065 + }, + { + "epoch": 0.17, + "grad_norm": 1.0321123189440147, + "learning_rate": 1.901978069297158e-05, + "loss": 0.5949, + "step": 2066 + }, + { + "epoch": 0.17, + "grad_norm": 0.9594333455983534, + "learning_rate": 1.9018643726627894e-05, + "loss": 0.4708, + "step": 2067 + }, + { + "epoch": 0.17, + "grad_norm": 0.9797937635347074, + "learning_rate": 1.90175061353002e-05, + "loss": 0.4845, + "step": 2068 + }, + { + "epoch": 0.17, + "grad_norm": 1.029429760101533, + "learning_rate": 1.9016367919067332e-05, + "loss": 0.6088, + "step": 2069 + }, + { + "epoch": 0.17, + "grad_norm": 1.0400521070299615, + "learning_rate": 1.9015229078008163e-05, + "loss": 0.6772, + "step": 2070 + }, + { + "epoch": 0.17, + "grad_norm": 0.984440854452079, + "learning_rate": 1.9014089612201612e-05, + "loss": 0.5406, + "step": 2071 + }, + { + "epoch": 0.17, + "grad_norm": 1.0205451452855927, + "learning_rate": 1.901294952172665e-05, + "loss": 0.5767, + "step": 2072 + }, + { + "epoch": 0.17, + "grad_norm": 0.9840334858354977, + "learning_rate": 1.901180880666228e-05, + "loss": 0.6672, + "step": 2073 + }, + { + "epoch": 0.17, + "grad_norm": 0.9427674229758277, + "learning_rate": 1.9010667467087554e-05, + "loss": 0.5223, + "step": 2074 + }, + { + "epoch": 0.17, + "grad_norm": 1.1013136774216972, + "learning_rate": 1.9009525503081565e-05, + "loss": 0.5849, + "step": 2075 + }, + { + "epoch": 0.17, + "grad_norm": 1.0761601500099016, + "learning_rate": 1.900838291472345e-05, + "loss": 0.5946, + "step": 2076 + }, + { + "epoch": 0.17, + "grad_norm": 1.0176861590592503, + "learning_rate": 1.900723970209239e-05, + "loss": 0.5184, + "step": 2077 + }, + { + "epoch": 0.17, + "grad_norm": 1.0460873298295472, + "learning_rate": 1.9006095865267605e-05, + "loss": 0.5795, + "step": 2078 + }, + { + "epoch": 0.17, + "grad_norm": 0.9584819948779129, + "learning_rate": 1.9004951404328363e-05, + "loss": 0.6414, + "step": 2079 + }, + { + "epoch": 0.17, + "grad_norm": 1.087426505655538, + "learning_rate": 1.9003806319353985e-05, + "loss": 0.5963, + "step": 2080 + }, + { + "epoch": 0.17, + "grad_norm": 0.9346911893859755, + "learning_rate": 1.9002660610423808e-05, + "loss": 0.5711, + "step": 2081 + }, + { + "epoch": 0.17, + "grad_norm": 1.0277683323783546, + "learning_rate": 1.9001514277617236e-05, + "loss": 0.6062, + "step": 2082 + }, + { + "epoch": 0.17, + "grad_norm": 0.9992807356109709, + "learning_rate": 1.900036732101371e-05, + "loss": 0.6467, + "step": 2083 + }, + { + "epoch": 0.17, + "grad_norm": 0.8751132896048749, + "learning_rate": 1.8999219740692716e-05, + "loss": 0.5005, + "step": 2084 + }, + { + "epoch": 0.17, + "grad_norm": 0.9918946212928599, + "learning_rate": 1.899807153673377e-05, + "loss": 0.6396, + "step": 2085 + }, + { + "epoch": 0.17, + "grad_norm": 0.8375400677513595, + "learning_rate": 1.8996922709216456e-05, + "loss": 0.5189, + "step": 2086 + }, + { + "epoch": 0.17, + "grad_norm": 1.0006365110478812, + "learning_rate": 1.8995773258220374e-05, + "loss": 0.6274, + "step": 2087 + }, + { + "epoch": 0.17, + "grad_norm": 0.9937520191210955, + "learning_rate": 1.8994623183825183e-05, + "loss": 0.6189, + "step": 2088 + }, + { + "epoch": 0.17, + "grad_norm": 1.1116281752553856, + "learning_rate": 1.8993472486110586e-05, + "loss": 0.694, + "step": 2089 + }, + { + "epoch": 0.17, + "grad_norm": 1.0213234557429571, + "learning_rate": 1.899232116515632e-05, + "loss": 0.6026, + "step": 2090 + }, + { + "epoch": 0.17, + "grad_norm": 1.0023366265849354, + "learning_rate": 1.8991169221042173e-05, + "loss": 0.5995, + "step": 2091 + }, + { + "epoch": 0.17, + "grad_norm": 0.9910945945231302, + "learning_rate": 1.8990016653847978e-05, + "loss": 0.6022, + "step": 2092 + }, + { + "epoch": 0.17, + "grad_norm": 1.1200184181031805, + "learning_rate": 1.8988863463653603e-05, + "loss": 0.6562, + "step": 2093 + }, + { + "epoch": 0.17, + "grad_norm": 1.0155184591151794, + "learning_rate": 1.8987709650538958e-05, + "loss": 0.6389, + "step": 2094 + }, + { + "epoch": 0.17, + "grad_norm": 0.9650011456012725, + "learning_rate": 1.898655521458401e-05, + "loss": 0.5576, + "step": 2095 + }, + { + "epoch": 0.17, + "grad_norm": 1.0579642872444046, + "learning_rate": 1.8985400155868756e-05, + "loss": 0.5898, + "step": 2096 + }, + { + "epoch": 0.17, + "grad_norm": 0.968661935774855, + "learning_rate": 1.898424447447324e-05, + "loss": 0.5597, + "step": 2097 + }, + { + "epoch": 0.17, + "grad_norm": 1.0804058104288323, + "learning_rate": 1.8983088170477556e-05, + "loss": 0.6229, + "step": 2098 + }, + { + "epoch": 0.17, + "grad_norm": 1.1630361930612874, + "learning_rate": 1.8981931243961823e-05, + "loss": 0.6395, + "step": 2099 + }, + { + "epoch": 0.17, + "grad_norm": 0.9472132177706004, + "learning_rate": 1.8980773695006226e-05, + "loss": 0.5772, + "step": 2100 + }, + { + "epoch": 0.17, + "grad_norm": 0.9232106027800606, + "learning_rate": 1.897961552369098e-05, + "loss": 0.5748, + "step": 2101 + }, + { + "epoch": 0.17, + "grad_norm": 0.9375393267389712, + "learning_rate": 1.8978456730096336e-05, + "loss": 0.5782, + "step": 2102 + }, + { + "epoch": 0.17, + "grad_norm": 1.0526881739286673, + "learning_rate": 1.897729731430261e-05, + "loss": 0.5435, + "step": 2103 + }, + { + "epoch": 0.17, + "grad_norm": 1.1440929482692097, + "learning_rate": 1.8976137276390145e-05, + "loss": 0.6177, + "step": 2104 + }, + { + "epoch": 0.17, + "grad_norm": 0.9239252211596448, + "learning_rate": 1.897497661643932e-05, + "loss": 0.6119, + "step": 2105 + }, + { + "epoch": 0.17, + "grad_norm": 0.9664453721766127, + "learning_rate": 1.8973815334530583e-05, + "loss": 0.523, + "step": 2106 + }, + { + "epoch": 0.17, + "grad_norm": 0.9504949286899973, + "learning_rate": 1.8972653430744403e-05, + "loss": 0.6081, + "step": 2107 + }, + { + "epoch": 0.17, + "grad_norm": 1.0457199660560634, + "learning_rate": 1.8971490905161297e-05, + "loss": 0.6772, + "step": 2108 + }, + { + "epoch": 0.17, + "grad_norm": 1.0378069524788978, + "learning_rate": 1.897032775786183e-05, + "loss": 0.626, + "step": 2109 + }, + { + "epoch": 0.17, + "grad_norm": 0.9729500857706217, + "learning_rate": 1.8969163988926606e-05, + "loss": 0.5945, + "step": 2110 + }, + { + "epoch": 0.17, + "grad_norm": 1.533917801839161, + "learning_rate": 1.896799959843627e-05, + "loss": 0.6287, + "step": 2111 + }, + { + "epoch": 0.17, + "grad_norm": 1.0979853305051799, + "learning_rate": 1.8966834586471517e-05, + "loss": 0.656, + "step": 2112 + }, + { + "epoch": 0.17, + "grad_norm": 0.9436270666351112, + "learning_rate": 1.8965668953113083e-05, + "loss": 0.5879, + "step": 2113 + }, + { + "epoch": 0.17, + "grad_norm": 0.9454702686057739, + "learning_rate": 1.8964502698441745e-05, + "loss": 0.5991, + "step": 2114 + }, + { + "epoch": 0.17, + "grad_norm": 0.8873844998178761, + "learning_rate": 1.8963335822538317e-05, + "loss": 0.5835, + "step": 2115 + }, + { + "epoch": 0.17, + "grad_norm": 1.147471428553273, + "learning_rate": 1.896216832548367e-05, + "loss": 0.6603, + "step": 2116 + }, + { + "epoch": 0.17, + "grad_norm": 1.1057922519103787, + "learning_rate": 1.8961000207358707e-05, + "loss": 0.6675, + "step": 2117 + }, + { + "epoch": 0.17, + "grad_norm": 0.9412420554305279, + "learning_rate": 1.895983146824438e-05, + "loss": 0.6009, + "step": 2118 + }, + { + "epoch": 0.17, + "grad_norm": 0.9299496677178599, + "learning_rate": 1.8958662108221677e-05, + "loss": 0.5487, + "step": 2119 + }, + { + "epoch": 0.17, + "grad_norm": 1.0024697235170035, + "learning_rate": 1.8957492127371635e-05, + "loss": 0.644, + "step": 2120 + }, + { + "epoch": 0.17, + "grad_norm": 0.9525929588641228, + "learning_rate": 1.8956321525775337e-05, + "loss": 0.628, + "step": 2121 + }, + { + "epoch": 0.17, + "grad_norm": 1.0418909039513666, + "learning_rate": 1.8955150303513902e-05, + "loss": 0.5646, + "step": 2122 + }, + { + "epoch": 0.17, + "grad_norm": 0.9514053674277251, + "learning_rate": 1.895397846066849e-05, + "loss": 0.5671, + "step": 2123 + }, + { + "epoch": 0.17, + "grad_norm": 1.040858447434405, + "learning_rate": 1.8952805997320315e-05, + "loss": 0.6042, + "step": 2124 + }, + { + "epoch": 0.17, + "grad_norm": 1.0380436753980238, + "learning_rate": 1.8951632913550625e-05, + "loss": 0.6072, + "step": 2125 + }, + { + "epoch": 0.17, + "grad_norm": 1.0234302731992189, + "learning_rate": 1.8950459209440716e-05, + "loss": 0.6109, + "step": 2126 + }, + { + "epoch": 0.17, + "grad_norm": 1.0945225898261712, + "learning_rate": 1.8949284885071917e-05, + "loss": 0.6402, + "step": 2127 + }, + { + "epoch": 0.17, + "grad_norm": 0.9642259329102109, + "learning_rate": 1.8948109940525622e-05, + "loss": 0.5854, + "step": 2128 + }, + { + "epoch": 0.17, + "grad_norm": 0.9262640462813323, + "learning_rate": 1.894693437588324e-05, + "loss": 0.5699, + "step": 2129 + }, + { + "epoch": 0.17, + "grad_norm": 1.0546036035380781, + "learning_rate": 1.8945758191226242e-05, + "loss": 0.6399, + "step": 2130 + }, + { + "epoch": 0.17, + "grad_norm": 0.892279726784369, + "learning_rate": 1.8944581386636137e-05, + "loss": 0.5991, + "step": 2131 + }, + { + "epoch": 0.17, + "grad_norm": 0.9601886962094532, + "learning_rate": 1.8943403962194477e-05, + "loss": 0.5391, + "step": 2132 + }, + { + "epoch": 0.17, + "grad_norm": 0.9820769851632366, + "learning_rate": 1.8942225917982854e-05, + "loss": 0.6186, + "step": 2133 + }, + { + "epoch": 0.17, + "grad_norm": 1.0907957831446005, + "learning_rate": 1.8941047254082903e-05, + "loss": 0.5005, + "step": 2134 + }, + { + "epoch": 0.17, + "grad_norm": 0.8872934358107167, + "learning_rate": 1.8939867970576315e-05, + "loss": 0.5791, + "step": 2135 + }, + { + "epoch": 0.17, + "grad_norm": 1.012616664948051, + "learning_rate": 1.8938688067544802e-05, + "loss": 0.5794, + "step": 2136 + }, + { + "epoch": 0.17, + "grad_norm": 1.023609395455567, + "learning_rate": 1.893750754507014e-05, + "loss": 0.5418, + "step": 2137 + }, + { + "epoch": 0.17, + "grad_norm": 1.0245431610313795, + "learning_rate": 1.8936326403234125e-05, + "loss": 0.5804, + "step": 2138 + }, + { + "epoch": 0.17, + "grad_norm": 1.0381393955766731, + "learning_rate": 1.893514464211862e-05, + "loss": 0.617, + "step": 2139 + }, + { + "epoch": 0.17, + "grad_norm": 0.9191246443163674, + "learning_rate": 1.8933962261805515e-05, + "loss": 0.6222, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 1.0053856875035811, + "learning_rate": 1.893277926237675e-05, + "loss": 0.5887, + "step": 2141 + }, + { + "epoch": 0.17, + "grad_norm": 0.9937571856876615, + "learning_rate": 1.8931595643914307e-05, + "loss": 0.6551, + "step": 2142 + }, + { + "epoch": 0.17, + "grad_norm": 1.0103025400896615, + "learning_rate": 1.893041140650021e-05, + "loss": 0.4889, + "step": 2143 + }, + { + "epoch": 0.17, + "grad_norm": 1.1326283733191278, + "learning_rate": 1.8929226550216522e-05, + "loss": 0.6579, + "step": 2144 + }, + { + "epoch": 0.17, + "grad_norm": 1.0345882359058862, + "learning_rate": 1.8928041075145352e-05, + "loss": 0.6248, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 0.9638961572425462, + "learning_rate": 1.892685498136886e-05, + "loss": 0.5776, + "step": 2146 + }, + { + "epoch": 0.17, + "grad_norm": 0.9503035496039794, + "learning_rate": 1.892566826896923e-05, + "loss": 0.5758, + "step": 2147 + }, + { + "epoch": 0.17, + "grad_norm": 1.0265755834645953, + "learning_rate": 1.8924480938028708e-05, + "loss": 0.5906, + "step": 2148 + }, + { + "epoch": 0.17, + "grad_norm": 0.9772350507987435, + "learning_rate": 1.8923292988629575e-05, + "loss": 0.5762, + "step": 2149 + }, + { + "epoch": 0.17, + "grad_norm": 0.9678168956550066, + "learning_rate": 1.892210442085415e-05, + "loss": 0.5854, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 0.9550725284032997, + "learning_rate": 1.8920915234784805e-05, + "loss": 0.6434, + "step": 2151 + }, + { + "epoch": 0.17, + "grad_norm": 1.0418780540372863, + "learning_rate": 1.8919725430503946e-05, + "loss": 0.6381, + "step": 2152 + }, + { + "epoch": 0.17, + "grad_norm": 0.951117776669499, + "learning_rate": 1.8918535008094028e-05, + "loss": 0.5835, + "step": 2153 + }, + { + "epoch": 0.18, + "grad_norm": 0.8789893915475889, + "learning_rate": 1.891734396763754e-05, + "loss": 0.5071, + "step": 2154 + }, + { + "epoch": 0.18, + "grad_norm": 0.9637330021078095, + "learning_rate": 1.891615230921703e-05, + "loss": 0.602, + "step": 2155 + }, + { + "epoch": 0.18, + "grad_norm": 0.9730380922707319, + "learning_rate": 1.8914960032915072e-05, + "loss": 0.5574, + "step": 2156 + }, + { + "epoch": 0.18, + "grad_norm": 0.9745286624861224, + "learning_rate": 1.891376713881429e-05, + "loss": 0.6078, + "step": 2157 + }, + { + "epoch": 0.18, + "grad_norm": 0.9153110919261715, + "learning_rate": 1.8912573626997354e-05, + "loss": 0.5904, + "step": 2158 + }, + { + "epoch": 0.18, + "grad_norm": 1.011667820668252, + "learning_rate": 1.891137949754697e-05, + "loss": 0.6148, + "step": 2159 + }, + { + "epoch": 0.18, + "grad_norm": 1.0902155476345923, + "learning_rate": 1.891018475054589e-05, + "loss": 0.6056, + "step": 2160 + }, + { + "epoch": 0.18, + "grad_norm": 0.9472518475231407, + "learning_rate": 1.890898938607691e-05, + "loss": 0.544, + "step": 2161 + }, + { + "epoch": 0.18, + "grad_norm": 1.0552234696767666, + "learning_rate": 1.890779340422287e-05, + "loss": 0.6252, + "step": 2162 + }, + { + "epoch": 0.18, + "grad_norm": 1.0569263154227653, + "learning_rate": 1.8906596805066648e-05, + "loss": 0.5732, + "step": 2163 + }, + { + "epoch": 0.18, + "grad_norm": 1.0492075165471888, + "learning_rate": 1.8905399588691165e-05, + "loss": 0.621, + "step": 2164 + }, + { + "epoch": 0.18, + "grad_norm": 1.0089167595328195, + "learning_rate": 1.890420175517939e-05, + "loss": 0.6021, + "step": 2165 + }, + { + "epoch": 0.18, + "grad_norm": 1.0353057215864783, + "learning_rate": 1.8903003304614332e-05, + "loss": 0.5981, + "step": 2166 + }, + { + "epoch": 0.18, + "grad_norm": 0.9646737678180385, + "learning_rate": 1.8901804237079043e-05, + "loss": 0.6035, + "step": 2167 + }, + { + "epoch": 0.18, + "grad_norm": 1.0961600407946666, + "learning_rate": 1.8900604552656615e-05, + "loss": 0.6611, + "step": 2168 + }, + { + "epoch": 0.18, + "grad_norm": 0.9352215688772914, + "learning_rate": 1.889940425143019e-05, + "loss": 0.5734, + "step": 2169 + }, + { + "epoch": 0.18, + "grad_norm": 0.9609782533708203, + "learning_rate": 1.889820333348294e-05, + "loss": 0.569, + "step": 2170 + }, + { + "epoch": 0.18, + "grad_norm": 0.8452540176519757, + "learning_rate": 1.8897001798898093e-05, + "loss": 0.5334, + "step": 2171 + }, + { + "epoch": 0.18, + "grad_norm": 0.9851803544469124, + "learning_rate": 1.8895799647758912e-05, + "loss": 0.6187, + "step": 2172 + }, + { + "epoch": 0.18, + "grad_norm": 1.0293889942861165, + "learning_rate": 1.889459688014871e-05, + "loss": 0.6058, + "step": 2173 + }, + { + "epoch": 0.18, + "grad_norm": 1.0018336586296221, + "learning_rate": 1.8893393496150828e-05, + "loss": 0.6061, + "step": 2174 + }, + { + "epoch": 0.18, + "grad_norm": 1.008102851968431, + "learning_rate": 1.889218949584867e-05, + "loss": 0.6237, + "step": 2175 + }, + { + "epoch": 0.18, + "grad_norm": 1.0072450005393212, + "learning_rate": 1.8890984879325664e-05, + "loss": 0.6138, + "step": 2176 + }, + { + "epoch": 0.18, + "grad_norm": 1.187526031541943, + "learning_rate": 1.888977964666529e-05, + "loss": 0.6545, + "step": 2177 + }, + { + "epoch": 0.18, + "grad_norm": 0.9867953653591311, + "learning_rate": 1.8888573797951078e-05, + "loss": 0.6127, + "step": 2178 + }, + { + "epoch": 0.18, + "grad_norm": 0.9387087958810846, + "learning_rate": 1.888736733326658e-05, + "loss": 0.6117, + "step": 2179 + }, + { + "epoch": 0.18, + "grad_norm": 1.0722541801802747, + "learning_rate": 1.8886160252695413e-05, + "loss": 0.6302, + "step": 2180 + }, + { + "epoch": 0.18, + "grad_norm": 0.9779850206491939, + "learning_rate": 1.8884952556321223e-05, + "loss": 0.6469, + "step": 2181 + }, + { + "epoch": 0.18, + "grad_norm": 1.0439762284857736, + "learning_rate": 1.8883744244227697e-05, + "loss": 0.6531, + "step": 2182 + }, + { + "epoch": 0.18, + "grad_norm": 1.0267706789612425, + "learning_rate": 1.8882535316498577e-05, + "loss": 0.6373, + "step": 2183 + }, + { + "epoch": 0.18, + "grad_norm": 0.9898580400329446, + "learning_rate": 1.888132577321764e-05, + "loss": 0.6467, + "step": 2184 + }, + { + "epoch": 0.18, + "grad_norm": 0.8775000630268309, + "learning_rate": 1.8880115614468705e-05, + "loss": 0.5477, + "step": 2185 + }, + { + "epoch": 0.18, + "grad_norm": 0.9595056001380599, + "learning_rate": 1.8878904840335635e-05, + "loss": 0.669, + "step": 2186 + }, + { + "epoch": 0.18, + "grad_norm": 0.9219339134739535, + "learning_rate": 1.887769345090233e-05, + "loss": 0.5872, + "step": 2187 + }, + { + "epoch": 0.18, + "grad_norm": 1.034499138231239, + "learning_rate": 1.887648144625275e-05, + "loss": 0.6174, + "step": 2188 + }, + { + "epoch": 0.18, + "grad_norm": 0.9426548133498441, + "learning_rate": 1.8875268826470875e-05, + "loss": 0.5727, + "step": 2189 + }, + { + "epoch": 0.18, + "grad_norm": 1.0290765921044192, + "learning_rate": 1.8874055591640746e-05, + "loss": 0.5592, + "step": 2190 + }, + { + "epoch": 0.18, + "grad_norm": 0.8568541726447252, + "learning_rate": 1.887284174184643e-05, + "loss": 0.499, + "step": 2191 + }, + { + "epoch": 0.18, + "grad_norm": 1.009808118159287, + "learning_rate": 1.8871627277172058e-05, + "loss": 0.5931, + "step": 2192 + }, + { + "epoch": 0.18, + "grad_norm": 1.0151562984941591, + "learning_rate": 1.887041219770178e-05, + "loss": 0.6362, + "step": 2193 + }, + { + "epoch": 0.18, + "grad_norm": 0.9196707775669238, + "learning_rate": 1.8869196503519807e-05, + "loss": 0.5952, + "step": 2194 + }, + { + "epoch": 0.18, + "grad_norm": 0.9065759621006254, + "learning_rate": 1.8867980194710382e-05, + "loss": 0.6028, + "step": 2195 + }, + { + "epoch": 0.18, + "grad_norm": 1.027461981521237, + "learning_rate": 1.88667632713578e-05, + "loss": 0.5995, + "step": 2196 + }, + { + "epoch": 0.18, + "grad_norm": 0.9316606805491132, + "learning_rate": 1.886554573354638e-05, + "loss": 0.561, + "step": 2197 + }, + { + "epoch": 0.18, + "grad_norm": 1.0300501829703554, + "learning_rate": 1.886432758136051e-05, + "loss": 0.5255, + "step": 2198 + }, + { + "epoch": 0.18, + "grad_norm": 1.0239573905462445, + "learning_rate": 1.8863108814884602e-05, + "loss": 0.6205, + "step": 2199 + }, + { + "epoch": 0.18, + "grad_norm": 1.0534000129161385, + "learning_rate": 1.8861889434203112e-05, + "loss": 0.599, + "step": 2200 + }, + { + "epoch": 0.18, + "grad_norm": 1.0750868919378282, + "learning_rate": 1.8860669439400543e-05, + "loss": 0.5538, + "step": 2201 + }, + { + "epoch": 0.18, + "grad_norm": 0.9883194803662289, + "learning_rate": 1.8859448830561445e-05, + "loss": 0.6598, + "step": 2202 + }, + { + "epoch": 0.18, + "grad_norm": 0.9895234269568443, + "learning_rate": 1.8858227607770398e-05, + "loss": 0.668, + "step": 2203 + }, + { + "epoch": 0.18, + "grad_norm": 0.9954174791349467, + "learning_rate": 1.885700577111204e-05, + "loss": 0.632, + "step": 2204 + }, + { + "epoch": 0.18, + "grad_norm": 1.1474192952722595, + "learning_rate": 1.8855783320671034e-05, + "loss": 0.5958, + "step": 2205 + }, + { + "epoch": 0.18, + "grad_norm": 0.9192615979698529, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.5911, + "step": 2206 + }, + { + "epoch": 0.18, + "grad_norm": 1.013275376886477, + "learning_rate": 1.8853336578779994e-05, + "loss": 0.5722, + "step": 2207 + }, + { + "epoch": 0.18, + "grad_norm": 0.9358020044247859, + "learning_rate": 1.8852112287499518e-05, + "loss": 0.5594, + "step": 2208 + }, + { + "epoch": 0.18, + "grad_norm": 0.983445447661659, + "learning_rate": 1.8850887382775507e-05, + "loss": 0.5761, + "step": 2209 + }, + { + "epoch": 0.18, + "grad_norm": 1.026955046322064, + "learning_rate": 1.884966186469286e-05, + "loss": 0.5661, + "step": 2210 + }, + { + "epoch": 0.18, + "grad_norm": 1.159699459442207, + "learning_rate": 1.8848435733336487e-05, + "loss": 0.6511, + "step": 2211 + }, + { + "epoch": 0.18, + "grad_norm": 1.001777768746846, + "learning_rate": 1.884720898879137e-05, + "loss": 0.5746, + "step": 2212 + }, + { + "epoch": 0.18, + "grad_norm": 0.8785765469744956, + "learning_rate": 1.8845981631142518e-05, + "loss": 0.5533, + "step": 2213 + }, + { + "epoch": 0.18, + "grad_norm": 1.0559320675134651, + "learning_rate": 1.8844753660474985e-05, + "loss": 0.6484, + "step": 2214 + }, + { + "epoch": 0.18, + "grad_norm": 1.060208265156859, + "learning_rate": 1.8843525076873866e-05, + "loss": 0.6731, + "step": 2215 + }, + { + "epoch": 0.18, + "grad_norm": 0.9241440849844474, + "learning_rate": 1.8842295880424305e-05, + "loss": 0.5179, + "step": 2216 + }, + { + "epoch": 0.18, + "grad_norm": 0.960927482449649, + "learning_rate": 1.8841066071211485e-05, + "loss": 0.5848, + "step": 2217 + }, + { + "epoch": 0.18, + "grad_norm": 1.0117277009292722, + "learning_rate": 1.8839835649320622e-05, + "loss": 0.5871, + "step": 2218 + }, + { + "epoch": 0.18, + "grad_norm": 1.1051299188661905, + "learning_rate": 1.8838604614837e-05, + "loss": 0.6599, + "step": 2219 + }, + { + "epoch": 0.18, + "grad_norm": 2.1126022281978067, + "learning_rate": 1.8837372967845907e-05, + "loss": 0.6209, + "step": 2220 + }, + { + "epoch": 0.18, + "grad_norm": 1.0437512390828085, + "learning_rate": 1.883614070843271e-05, + "loss": 0.6129, + "step": 2221 + }, + { + "epoch": 0.18, + "grad_norm": 0.9347343174247541, + "learning_rate": 1.88349078366828e-05, + "loss": 0.5732, + "step": 2222 + }, + { + "epoch": 0.18, + "grad_norm": 0.978072948910801, + "learning_rate": 1.8833674352681613e-05, + "loss": 0.5862, + "step": 2223 + }, + { + "epoch": 0.18, + "grad_norm": 0.9035635392386544, + "learning_rate": 1.8832440256514633e-05, + "loss": 0.5817, + "step": 2224 + }, + { + "epoch": 0.18, + "grad_norm": 0.946716318845138, + "learning_rate": 1.8831205548267375e-05, + "loss": 0.6018, + "step": 2225 + }, + { + "epoch": 0.18, + "grad_norm": 0.9114345496424868, + "learning_rate": 1.8829970228025405e-05, + "loss": 0.5422, + "step": 2226 + }, + { + "epoch": 0.18, + "grad_norm": 0.947695890004665, + "learning_rate": 1.882873429587433e-05, + "loss": 0.6219, + "step": 2227 + }, + { + "epoch": 0.18, + "grad_norm": 0.9607336606853346, + "learning_rate": 1.8827497751899798e-05, + "loss": 0.5899, + "step": 2228 + }, + { + "epoch": 0.18, + "grad_norm": 1.056301862698836, + "learning_rate": 1.8826260596187505e-05, + "loss": 0.6012, + "step": 2229 + }, + { + "epoch": 0.18, + "grad_norm": 0.9839236383385103, + "learning_rate": 1.882502282882318e-05, + "loss": 0.5476, + "step": 2230 + }, + { + "epoch": 0.18, + "grad_norm": 0.9453761983415658, + "learning_rate": 1.88237844498926e-05, + "loss": 0.5613, + "step": 2231 + }, + { + "epoch": 0.18, + "grad_norm": 1.0769594253515593, + "learning_rate": 1.8822545459481585e-05, + "loss": 0.6289, + "step": 2232 + }, + { + "epoch": 0.18, + "grad_norm": 0.9504981321207882, + "learning_rate": 1.8821305857675997e-05, + "loss": 0.5588, + "step": 2233 + }, + { + "epoch": 0.18, + "grad_norm": 0.9703487793124386, + "learning_rate": 1.8820065644561736e-05, + "loss": 0.5812, + "step": 2234 + }, + { + "epoch": 0.18, + "grad_norm": 0.9863394180870134, + "learning_rate": 1.8818824820224747e-05, + "loss": 0.6304, + "step": 2235 + }, + { + "epoch": 0.18, + "grad_norm": 0.9529494000210472, + "learning_rate": 1.8817583384751023e-05, + "loss": 0.5819, + "step": 2236 + }, + { + "epoch": 0.18, + "grad_norm": 0.9677249493395993, + "learning_rate": 1.881634133822659e-05, + "loss": 0.6066, + "step": 2237 + }, + { + "epoch": 0.18, + "grad_norm": 0.9420220803818873, + "learning_rate": 1.8815098680737523e-05, + "loss": 0.5745, + "step": 2238 + }, + { + "epoch": 0.18, + "grad_norm": 1.016139132803753, + "learning_rate": 1.881385541236994e-05, + "loss": 0.6346, + "step": 2239 + }, + { + "epoch": 0.18, + "grad_norm": 1.1089714899910492, + "learning_rate": 1.881261153320999e-05, + "loss": 0.6067, + "step": 2240 + }, + { + "epoch": 0.18, + "grad_norm": 0.9853924873312063, + "learning_rate": 1.881136704334388e-05, + "loss": 0.6338, + "step": 2241 + }, + { + "epoch": 0.18, + "grad_norm": 1.8155046887597646, + "learning_rate": 1.8810121942857848e-05, + "loss": 0.6043, + "step": 2242 + }, + { + "epoch": 0.18, + "grad_norm": 0.9104861985868933, + "learning_rate": 1.880887623183818e-05, + "loss": 0.5181, + "step": 2243 + }, + { + "epoch": 0.18, + "grad_norm": 0.9228901883726971, + "learning_rate": 1.8807629910371203e-05, + "loss": 0.5156, + "step": 2244 + }, + { + "epoch": 0.18, + "grad_norm": 0.9801612127255177, + "learning_rate": 1.8806382978543283e-05, + "loss": 0.6033, + "step": 2245 + }, + { + "epoch": 0.18, + "grad_norm": 1.056989468750248, + "learning_rate": 1.8805135436440837e-05, + "loss": 0.6078, + "step": 2246 + }, + { + "epoch": 0.18, + "grad_norm": 1.1457163395568721, + "learning_rate": 1.8803887284150317e-05, + "loss": 0.6381, + "step": 2247 + }, + { + "epoch": 0.18, + "grad_norm": 1.0805771612792152, + "learning_rate": 1.8802638521758214e-05, + "loss": 0.6548, + "step": 2248 + }, + { + "epoch": 0.18, + "grad_norm": 1.3202820723326176, + "learning_rate": 1.880138914935107e-05, + "loss": 0.636, + "step": 2249 + }, + { + "epoch": 0.18, + "grad_norm": 0.9148686090902376, + "learning_rate": 1.8800139167015466e-05, + "loss": 0.5745, + "step": 2250 + }, + { + "epoch": 0.18, + "grad_norm": 1.0389049536285395, + "learning_rate": 1.8798888574838023e-05, + "loss": 0.6133, + "step": 2251 + }, + { + "epoch": 0.18, + "grad_norm": 1.030661980018216, + "learning_rate": 1.8797637372905407e-05, + "loss": 0.548, + "step": 2252 + }, + { + "epoch": 0.18, + "grad_norm": 1.0276577285974946, + "learning_rate": 1.8796385561304323e-05, + "loss": 0.6184, + "step": 2253 + }, + { + "epoch": 0.18, + "grad_norm": 1.0196070836335733, + "learning_rate": 1.8795133140121522e-05, + "loss": 0.5818, + "step": 2254 + }, + { + "epoch": 0.18, + "grad_norm": 1.0889379693225758, + "learning_rate": 1.8793880109443797e-05, + "loss": 0.6733, + "step": 2255 + }, + { + "epoch": 0.18, + "grad_norm": 0.9353604600412142, + "learning_rate": 1.8792626469357983e-05, + "loss": 0.5649, + "step": 2256 + }, + { + "epoch": 0.18, + "grad_norm": 0.9000891999282137, + "learning_rate": 1.879137221995095e-05, + "loss": 0.558, + "step": 2257 + }, + { + "epoch": 0.18, + "grad_norm": 0.9242703330932535, + "learning_rate": 1.879011736130962e-05, + "loss": 0.6322, + "step": 2258 + }, + { + "epoch": 0.18, + "grad_norm": 1.1666243279578408, + "learning_rate": 1.8788861893520954e-05, + "loss": 0.6027, + "step": 2259 + }, + { + "epoch": 0.18, + "grad_norm": 0.9173273086450375, + "learning_rate": 1.8787605816671956e-05, + "loss": 0.582, + "step": 2260 + }, + { + "epoch": 0.18, + "grad_norm": 0.9437307573005449, + "learning_rate": 1.8786349130849667e-05, + "loss": 0.5623, + "step": 2261 + }, + { + "epoch": 0.18, + "grad_norm": 0.8570905229082192, + "learning_rate": 1.8785091836141177e-05, + "loss": 0.522, + "step": 2262 + }, + { + "epoch": 0.18, + "grad_norm": 1.5564688988373234, + "learning_rate": 1.8783833932633617e-05, + "loss": 0.6621, + "step": 2263 + }, + { + "epoch": 0.18, + "grad_norm": 1.0150904173397395, + "learning_rate": 1.8782575420414155e-05, + "loss": 0.652, + "step": 2264 + }, + { + "epoch": 0.18, + "grad_norm": 0.9376520417247656, + "learning_rate": 1.8781316299570007e-05, + "loss": 0.5854, + "step": 2265 + }, + { + "epoch": 0.18, + "grad_norm": 0.9330936778529615, + "learning_rate": 1.878005657018843e-05, + "loss": 0.5916, + "step": 2266 + }, + { + "epoch": 0.18, + "grad_norm": 1.0036469337468876, + "learning_rate": 1.877879623235672e-05, + "loss": 0.6498, + "step": 2267 + }, + { + "epoch": 0.18, + "grad_norm": 1.0443264323120567, + "learning_rate": 1.8777535286162217e-05, + "loss": 0.678, + "step": 2268 + }, + { + "epoch": 0.18, + "grad_norm": 0.9581630196481067, + "learning_rate": 1.8776273731692306e-05, + "loss": 0.6016, + "step": 2269 + }, + { + "epoch": 0.18, + "grad_norm": 1.083876675418555, + "learning_rate": 1.8775011569034405e-05, + "loss": 0.6118, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 0.9873027524445195, + "learning_rate": 1.877374879827599e-05, + "loss": 0.629, + "step": 2271 + }, + { + "epoch": 0.18, + "grad_norm": 0.9486312587682523, + "learning_rate": 1.8772485419504566e-05, + "loss": 0.5635, + "step": 2272 + }, + { + "epoch": 0.18, + "grad_norm": 1.011383681981482, + "learning_rate": 1.877122143280768e-05, + "loss": 0.642, + "step": 2273 + }, + { + "epoch": 0.18, + "grad_norm": 1.054871890426257, + "learning_rate": 1.8769956838272937e-05, + "loss": 0.624, + "step": 2274 + }, + { + "epoch": 0.18, + "grad_norm": 0.9243443195958089, + "learning_rate": 1.8768691635987957e-05, + "loss": 0.6012, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 0.9137583522941478, + "learning_rate": 1.8767425826040426e-05, + "loss": 0.6101, + "step": 2276 + }, + { + "epoch": 0.19, + "grad_norm": 0.8732029787589334, + "learning_rate": 1.8766159408518062e-05, + "loss": 0.5436, + "step": 2277 + }, + { + "epoch": 0.19, + "grad_norm": 1.0490382062184178, + "learning_rate": 1.8764892383508626e-05, + "loss": 0.5305, + "step": 2278 + }, + { + "epoch": 0.19, + "grad_norm": 1.0350121153065304, + "learning_rate": 1.8763624751099924e-05, + "loss": 0.6107, + "step": 2279 + }, + { + "epoch": 0.19, + "grad_norm": 0.8036004247191376, + "learning_rate": 1.8762356511379796e-05, + "loss": 0.5169, + "step": 2280 + }, + { + "epoch": 0.19, + "grad_norm": 0.9817024833816537, + "learning_rate": 1.8761087664436137e-05, + "loss": 0.5483, + "step": 2281 + }, + { + "epoch": 0.19, + "grad_norm": 1.0869713946968007, + "learning_rate": 1.8759818210356874e-05, + "loss": 0.6059, + "step": 2282 + }, + { + "epoch": 0.19, + "grad_norm": 0.9333008852297026, + "learning_rate": 1.8758548149229978e-05, + "loss": 0.5559, + "step": 2283 + }, + { + "epoch": 0.19, + "grad_norm": 0.9797631089608451, + "learning_rate": 1.8757277481143467e-05, + "loss": 0.5846, + "step": 2284 + }, + { + "epoch": 0.19, + "grad_norm": 0.8580503736403456, + "learning_rate": 1.8756006206185388e-05, + "loss": 0.5484, + "step": 2285 + }, + { + "epoch": 0.19, + "grad_norm": 0.9459834566720147, + "learning_rate": 1.8754734324443853e-05, + "loss": 0.5814, + "step": 2286 + }, + { + "epoch": 0.19, + "grad_norm": 1.129380323888552, + "learning_rate": 1.875346183600699e-05, + "loss": 0.6183, + "step": 2287 + }, + { + "epoch": 0.19, + "grad_norm": 0.9513144228125862, + "learning_rate": 1.8752188740962986e-05, + "loss": 0.607, + "step": 2288 + }, + { + "epoch": 0.19, + "grad_norm": 1.0304282864628564, + "learning_rate": 1.8750915039400068e-05, + "loss": 0.6071, + "step": 2289 + }, + { + "epoch": 0.19, + "grad_norm": 0.9340505697364657, + "learning_rate": 1.87496407314065e-05, + "loss": 0.5504, + "step": 2290 + }, + { + "epoch": 0.19, + "grad_norm": 1.023533370754278, + "learning_rate": 1.8748365817070586e-05, + "loss": 0.6182, + "step": 2291 + }, + { + "epoch": 0.19, + "grad_norm": 0.9882283676462045, + "learning_rate": 1.8747090296480683e-05, + "loss": 0.5844, + "step": 2292 + }, + { + "epoch": 0.19, + "grad_norm": 0.9320546167398589, + "learning_rate": 1.8745814169725183e-05, + "loss": 0.5435, + "step": 2293 + }, + { + "epoch": 0.19, + "grad_norm": 0.9867593700841457, + "learning_rate": 1.8744537436892517e-05, + "loss": 0.5975, + "step": 2294 + }, + { + "epoch": 0.19, + "grad_norm": 1.0258212231425314, + "learning_rate": 1.8743260098071163e-05, + "loss": 0.5049, + "step": 2295 + }, + { + "epoch": 0.19, + "grad_norm": 1.0907723321403282, + "learning_rate": 1.8741982153349642e-05, + "loss": 0.6586, + "step": 2296 + }, + { + "epoch": 0.19, + "grad_norm": 0.950702288691101, + "learning_rate": 1.8740703602816506e-05, + "loss": 0.6177, + "step": 2297 + }, + { + "epoch": 0.19, + "grad_norm": 1.0254255423493217, + "learning_rate": 1.8739424446560365e-05, + "loss": 0.6089, + "step": 2298 + }, + { + "epoch": 0.19, + "grad_norm": 1.0918084848642433, + "learning_rate": 1.8738144684669867e-05, + "loss": 0.6807, + "step": 2299 + }, + { + "epoch": 0.19, + "grad_norm": 1.0255381404566133, + "learning_rate": 1.8736864317233688e-05, + "loss": 0.6707, + "step": 2300 + }, + { + "epoch": 0.19, + "grad_norm": 0.9287783578796915, + "learning_rate": 1.873558334434056e-05, + "loss": 0.6023, + "step": 2301 + }, + { + "epoch": 0.19, + "grad_norm": 0.9522040066938621, + "learning_rate": 1.873430176607926e-05, + "loss": 0.5756, + "step": 2302 + }, + { + "epoch": 0.19, + "grad_norm": 0.9664634365215445, + "learning_rate": 1.8733019582538595e-05, + "loss": 0.5544, + "step": 2303 + }, + { + "epoch": 0.19, + "grad_norm": 0.9438264204012479, + "learning_rate": 1.8731736793807417e-05, + "loss": 0.5802, + "step": 2304 + }, + { + "epoch": 0.19, + "grad_norm": 0.9450277499788748, + "learning_rate": 1.873045339997462e-05, + "loss": 0.5775, + "step": 2305 + }, + { + "epoch": 0.19, + "grad_norm": 0.9226491997621605, + "learning_rate": 1.872916940112915e-05, + "loss": 0.5655, + "step": 2306 + }, + { + "epoch": 0.19, + "grad_norm": 0.9316094898328231, + "learning_rate": 1.8727884797359984e-05, + "loss": 0.571, + "step": 2307 + }, + { + "epoch": 0.19, + "grad_norm": 0.9161774303898466, + "learning_rate": 1.8726599588756144e-05, + "loss": 0.5742, + "step": 2308 + }, + { + "epoch": 0.19, + "grad_norm": 1.0212729130725422, + "learning_rate": 1.8725313775406693e-05, + "loss": 0.6725, + "step": 2309 + }, + { + "epoch": 0.19, + "grad_norm": 1.0371994533029285, + "learning_rate": 1.8724027357400737e-05, + "loss": 0.6275, + "step": 2310 + }, + { + "epoch": 0.19, + "grad_norm": 0.9387275440330368, + "learning_rate": 1.872274033482742e-05, + "loss": 0.609, + "step": 2311 + }, + { + "epoch": 0.19, + "grad_norm": 1.018429708491077, + "learning_rate": 1.8721452707775935e-05, + "loss": 0.5579, + "step": 2312 + }, + { + "epoch": 0.19, + "grad_norm": 1.0025377413107226, + "learning_rate": 1.8720164476335516e-05, + "loss": 0.6228, + "step": 2313 + }, + { + "epoch": 0.19, + "grad_norm": 1.1300305737441967, + "learning_rate": 1.8718875640595432e-05, + "loss": 0.661, + "step": 2314 + }, + { + "epoch": 0.19, + "grad_norm": 1.0204356367555911, + "learning_rate": 1.8717586200645002e-05, + "loss": 0.5863, + "step": 2315 + }, + { + "epoch": 0.19, + "grad_norm": 0.9527375433742457, + "learning_rate": 1.8716296156573578e-05, + "loss": 0.6602, + "step": 2316 + }, + { + "epoch": 0.19, + "grad_norm": 0.9901798497245836, + "learning_rate": 1.8715005508470565e-05, + "loss": 0.5693, + "step": 2317 + }, + { + "epoch": 0.19, + "grad_norm": 1.053822223301016, + "learning_rate": 1.8713714256425396e-05, + "loss": 0.5913, + "step": 2318 + }, + { + "epoch": 0.19, + "grad_norm": 0.8833294760007623, + "learning_rate": 1.8712422400527556e-05, + "loss": 0.5931, + "step": 2319 + }, + { + "epoch": 0.19, + "grad_norm": 0.9639514016390054, + "learning_rate": 1.8711129940866577e-05, + "loss": 0.5981, + "step": 2320 + }, + { + "epoch": 0.19, + "grad_norm": 1.0823666757152655, + "learning_rate": 1.870983687753202e-05, + "loss": 0.6608, + "step": 2321 + }, + { + "epoch": 0.19, + "grad_norm": 0.8931959289690588, + "learning_rate": 1.8708543210613492e-05, + "loss": 0.5971, + "step": 2322 + }, + { + "epoch": 0.19, + "grad_norm": 0.9858619839716677, + "learning_rate": 1.8707248940200643e-05, + "loss": 0.5847, + "step": 2323 + }, + { + "epoch": 0.19, + "grad_norm": 0.931992339599603, + "learning_rate": 1.8705954066383166e-05, + "loss": 0.5913, + "step": 2324 + }, + { + "epoch": 0.19, + "grad_norm": 0.9600650161274669, + "learning_rate": 1.8704658589250795e-05, + "loss": 0.5694, + "step": 2325 + }, + { + "epoch": 0.19, + "grad_norm": 0.9139115608171103, + "learning_rate": 1.8703362508893302e-05, + "loss": 0.599, + "step": 2326 + }, + { + "epoch": 0.19, + "grad_norm": 0.9079993112354389, + "learning_rate": 1.870206582540051e-05, + "loss": 0.6328, + "step": 2327 + }, + { + "epoch": 0.19, + "grad_norm": 0.930373994652327, + "learning_rate": 1.8700768538862274e-05, + "loss": 0.6067, + "step": 2328 + }, + { + "epoch": 0.19, + "grad_norm": 0.935081675704654, + "learning_rate": 1.8699470649368496e-05, + "loss": 0.5887, + "step": 2329 + }, + { + "epoch": 0.19, + "grad_norm": 0.974106870385751, + "learning_rate": 1.8698172157009124e-05, + "loss": 0.5886, + "step": 2330 + }, + { + "epoch": 0.19, + "grad_norm": 0.9335074917711323, + "learning_rate": 1.8696873061874127e-05, + "loss": 0.556, + "step": 2331 + }, + { + "epoch": 0.19, + "grad_norm": 0.9289390886527041, + "learning_rate": 1.8695573364053548e-05, + "loss": 0.6101, + "step": 2332 + }, + { + "epoch": 0.19, + "grad_norm": 0.9494119915399611, + "learning_rate": 1.8694273063637444e-05, + "loss": 0.6225, + "step": 2333 + }, + { + "epoch": 0.19, + "grad_norm": 0.868203726273747, + "learning_rate": 1.869297216071593e-05, + "loss": 0.5634, + "step": 2334 + }, + { + "epoch": 0.19, + "grad_norm": 0.9645235066076571, + "learning_rate": 1.8691670655379157e-05, + "loss": 0.5783, + "step": 2335 + }, + { + "epoch": 0.19, + "grad_norm": 0.9291663000652597, + "learning_rate": 1.8690368547717313e-05, + "loss": 0.5383, + "step": 2336 + }, + { + "epoch": 0.19, + "grad_norm": 1.030978162010167, + "learning_rate": 1.8689065837820642e-05, + "loss": 0.6168, + "step": 2337 + }, + { + "epoch": 0.19, + "grad_norm": 1.01006563723956, + "learning_rate": 1.8687762525779412e-05, + "loss": 0.5955, + "step": 2338 + }, + { + "epoch": 0.19, + "grad_norm": 0.9572350447649162, + "learning_rate": 1.8686458611683948e-05, + "loss": 0.617, + "step": 2339 + }, + { + "epoch": 0.19, + "grad_norm": 1.0413589226973223, + "learning_rate": 1.8685154095624605e-05, + "loss": 0.7026, + "step": 2340 + }, + { + "epoch": 0.19, + "grad_norm": 1.006611036261011, + "learning_rate": 1.8683848977691784e-05, + "loss": 0.6215, + "step": 2341 + }, + { + "epoch": 0.19, + "grad_norm": 0.902856036151998, + "learning_rate": 1.868254325797594e-05, + "loss": 0.5864, + "step": 2342 + }, + { + "epoch": 0.19, + "grad_norm": 1.0432428298350762, + "learning_rate": 1.868123693656754e-05, + "loss": 0.5814, + "step": 2343 + }, + { + "epoch": 0.19, + "grad_norm": 1.0310284547691086, + "learning_rate": 1.8679930013557127e-05, + "loss": 0.5941, + "step": 2344 + }, + { + "epoch": 0.19, + "grad_norm": 0.9342025335462472, + "learning_rate": 1.867862248903526e-05, + "loss": 0.6058, + "step": 2345 + }, + { + "epoch": 0.19, + "grad_norm": 1.0046093700308836, + "learning_rate": 1.8677314363092555e-05, + "loss": 0.5298, + "step": 2346 + }, + { + "epoch": 0.19, + "grad_norm": 0.9711270887222404, + "learning_rate": 1.867600563581966e-05, + "loss": 0.5702, + "step": 2347 + }, + { + "epoch": 0.19, + "grad_norm": 1.077095212889763, + "learning_rate": 1.867469630730727e-05, + "loss": 0.5995, + "step": 2348 + }, + { + "epoch": 0.19, + "grad_norm": 0.8931131934883211, + "learning_rate": 1.867338637764612e-05, + "loss": 0.4927, + "step": 2349 + }, + { + "epoch": 0.19, + "grad_norm": 1.0369626372647212, + "learning_rate": 1.867207584692699e-05, + "loss": 0.577, + "step": 2350 + }, + { + "epoch": 0.19, + "grad_norm": 0.9567874811250195, + "learning_rate": 1.867076471524069e-05, + "loss": 0.4779, + "step": 2351 + }, + { + "epoch": 0.19, + "grad_norm": 0.9766997211054103, + "learning_rate": 1.866945298267809e-05, + "loss": 0.6276, + "step": 2352 + }, + { + "epoch": 0.19, + "grad_norm": 1.0546535855796144, + "learning_rate": 1.866814064933009e-05, + "loss": 0.6283, + "step": 2353 + }, + { + "epoch": 0.19, + "grad_norm": 1.0842469473548701, + "learning_rate": 1.8666827715287627e-05, + "loss": 0.66, + "step": 2354 + }, + { + "epoch": 0.19, + "grad_norm": 0.9133052767488332, + "learning_rate": 1.8665514180641697e-05, + "loss": 0.5781, + "step": 2355 + }, + { + "epoch": 0.19, + "grad_norm": 0.9991322639797208, + "learning_rate": 1.8664200045483314e-05, + "loss": 0.5336, + "step": 2356 + }, + { + "epoch": 0.19, + "grad_norm": 0.9699785492721534, + "learning_rate": 1.8662885309903558e-05, + "loss": 0.5715, + "step": 2357 + }, + { + "epoch": 0.19, + "grad_norm": 1.0024836346982164, + "learning_rate": 1.8661569973993533e-05, + "loss": 0.6319, + "step": 2358 + }, + { + "epoch": 0.19, + "grad_norm": 1.0298945721796273, + "learning_rate": 1.866025403784439e-05, + "loss": 0.6255, + "step": 2359 + }, + { + "epoch": 0.19, + "grad_norm": 0.8974998871243925, + "learning_rate": 1.865893750154732e-05, + "loss": 0.5178, + "step": 2360 + }, + { + "epoch": 0.19, + "grad_norm": 1.057894104135965, + "learning_rate": 1.8657620365193566e-05, + "loss": 0.6258, + "step": 2361 + }, + { + "epoch": 0.19, + "grad_norm": 0.9877141542623917, + "learning_rate": 1.8656302628874402e-05, + "loss": 0.6648, + "step": 2362 + }, + { + "epoch": 0.19, + "grad_norm": 0.9542372663196041, + "learning_rate": 1.8654984292681142e-05, + "loss": 0.5872, + "step": 2363 + }, + { + "epoch": 0.19, + "grad_norm": 0.8546229947901854, + "learning_rate": 1.8653665356705146e-05, + "loss": 0.5864, + "step": 2364 + }, + { + "epoch": 0.19, + "grad_norm": 0.9260957967141475, + "learning_rate": 1.865234582103782e-05, + "loss": 0.5882, + "step": 2365 + }, + { + "epoch": 0.19, + "grad_norm": 1.038963914135987, + "learning_rate": 1.86510256857706e-05, + "loss": 0.6273, + "step": 2366 + }, + { + "epoch": 0.19, + "grad_norm": 0.9630029696755222, + "learning_rate": 1.8649704950994976e-05, + "loss": 0.6044, + "step": 2367 + }, + { + "epoch": 0.19, + "grad_norm": 1.0408025228163054, + "learning_rate": 1.864838361680247e-05, + "loss": 0.5721, + "step": 2368 + }, + { + "epoch": 0.19, + "grad_norm": 0.9901450030781432, + "learning_rate": 1.864706168328465e-05, + "loss": 0.6231, + "step": 2369 + }, + { + "epoch": 0.19, + "grad_norm": 1.0128039127196022, + "learning_rate": 1.8645739150533123e-05, + "loss": 0.6232, + "step": 2370 + }, + { + "epoch": 0.19, + "grad_norm": 1.035782290703731, + "learning_rate": 1.8644416018639547e-05, + "loss": 0.6019, + "step": 2371 + }, + { + "epoch": 0.19, + "grad_norm": 1.0569343629525514, + "learning_rate": 1.8643092287695604e-05, + "loss": 0.6118, + "step": 2372 + }, + { + "epoch": 0.19, + "grad_norm": 0.9739875131987521, + "learning_rate": 1.8641767957793037e-05, + "loss": 0.6424, + "step": 2373 + }, + { + "epoch": 0.19, + "grad_norm": 0.9480212105192773, + "learning_rate": 1.864044302902361e-05, + "loss": 0.5794, + "step": 2374 + }, + { + "epoch": 0.19, + "grad_norm": 0.9633841127891988, + "learning_rate": 1.8639117501479143e-05, + "loss": 0.5918, + "step": 2375 + }, + { + "epoch": 0.19, + "grad_norm": 0.8875048063352914, + "learning_rate": 1.8637791375251505e-05, + "loss": 0.5661, + "step": 2376 + }, + { + "epoch": 0.19, + "grad_norm": 0.9553926429206897, + "learning_rate": 1.863646465043258e-05, + "loss": 0.6066, + "step": 2377 + }, + { + "epoch": 0.19, + "grad_norm": 0.9406156212474246, + "learning_rate": 1.8635137327114317e-05, + "loss": 0.5694, + "step": 2378 + }, + { + "epoch": 0.19, + "grad_norm": 1.0082468868263996, + "learning_rate": 1.8633809405388697e-05, + "loss": 0.584, + "step": 2379 + }, + { + "epoch": 0.19, + "grad_norm": 0.9697465631492217, + "learning_rate": 1.8632480885347744e-05, + "loss": 0.636, + "step": 2380 + }, + { + "epoch": 0.19, + "grad_norm": 0.9103226982286848, + "learning_rate": 1.863115176708352e-05, + "loss": 0.5912, + "step": 2381 + }, + { + "epoch": 0.19, + "grad_norm": 0.903505883017083, + "learning_rate": 1.8629822050688138e-05, + "loss": 0.5725, + "step": 2382 + }, + { + "epoch": 0.19, + "grad_norm": 0.9399914412587055, + "learning_rate": 1.862849173625374e-05, + "loss": 0.5703, + "step": 2383 + }, + { + "epoch": 0.19, + "grad_norm": 0.865161610202597, + "learning_rate": 1.862716082387252e-05, + "loss": 0.5792, + "step": 2384 + }, + { + "epoch": 0.19, + "grad_norm": 0.9859677288051738, + "learning_rate": 1.8625829313636707e-05, + "loss": 0.6196, + "step": 2385 + }, + { + "epoch": 0.19, + "grad_norm": 0.994656223718024, + "learning_rate": 1.862449720563857e-05, + "loss": 0.6377, + "step": 2386 + }, + { + "epoch": 0.19, + "grad_norm": 1.0886207567360782, + "learning_rate": 1.862316449997043e-05, + "loss": 0.5995, + "step": 2387 + }, + { + "epoch": 0.19, + "grad_norm": 0.9986334346192072, + "learning_rate": 1.862183119672464e-05, + "loss": 0.6196, + "step": 2388 + }, + { + "epoch": 0.19, + "grad_norm": 1.0365462981424516, + "learning_rate": 1.862049729599359e-05, + "loss": 0.5983, + "step": 2389 + }, + { + "epoch": 0.19, + "grad_norm": 1.0433892804940201, + "learning_rate": 1.8619162797869728e-05, + "loss": 0.5875, + "step": 2390 + }, + { + "epoch": 0.19, + "grad_norm": 0.956003796235037, + "learning_rate": 1.861782770244553e-05, + "loss": 0.5334, + "step": 2391 + }, + { + "epoch": 0.19, + "grad_norm": 0.8948968349280684, + "learning_rate": 1.8616492009813516e-05, + "loss": 0.5317, + "step": 2392 + }, + { + "epoch": 0.19, + "grad_norm": 0.967022027051535, + "learning_rate": 1.8615155720066247e-05, + "loss": 0.6288, + "step": 2393 + }, + { + "epoch": 0.19, + "grad_norm": 0.9790421801306229, + "learning_rate": 1.861381883329633e-05, + "loss": 0.6143, + "step": 2394 + }, + { + "epoch": 0.19, + "grad_norm": 1.1236596585523422, + "learning_rate": 1.8612481349596406e-05, + "loss": 0.6214, + "step": 2395 + }, + { + "epoch": 0.19, + "grad_norm": 0.946020311289503, + "learning_rate": 1.8611143269059165e-05, + "loss": 0.5774, + "step": 2396 + }, + { + "epoch": 0.19, + "grad_norm": 1.0809689030226681, + "learning_rate": 1.8609804591777333e-05, + "loss": 0.5975, + "step": 2397 + }, + { + "epoch": 0.19, + "grad_norm": 1.0135643705288075, + "learning_rate": 1.860846531784368e-05, + "loss": 0.6491, + "step": 2398 + }, + { + "epoch": 0.19, + "grad_norm": 0.9544748502585023, + "learning_rate": 1.8607125447351017e-05, + "loss": 0.6125, + "step": 2399 + }, + { + "epoch": 0.2, + "grad_norm": 0.9163164402222126, + "learning_rate": 1.8605784980392193e-05, + "loss": 0.5258, + "step": 2400 + }, + { + "epoch": 0.2, + "grad_norm": 1.14714289345333, + "learning_rate": 1.86044439170601e-05, + "loss": 0.5343, + "step": 2401 + }, + { + "epoch": 0.2, + "grad_norm": 0.9996593655533791, + "learning_rate": 1.8603102257447686e-05, + "loss": 0.6477, + "step": 2402 + }, + { + "epoch": 0.2, + "grad_norm": 1.0396410331618158, + "learning_rate": 1.860176000164791e-05, + "loss": 0.5779, + "step": 2403 + }, + { + "epoch": 0.2, + "grad_norm": 0.9694426587894854, + "learning_rate": 1.8600417149753794e-05, + "loss": 0.5503, + "step": 2404 + }, + { + "epoch": 0.2, + "grad_norm": 0.9399594621363393, + "learning_rate": 1.85990737018584e-05, + "loss": 0.5529, + "step": 2405 + }, + { + "epoch": 0.2, + "grad_norm": 0.9611673318625494, + "learning_rate": 1.8597729658054827e-05, + "loss": 0.577, + "step": 2406 + }, + { + "epoch": 0.2, + "grad_norm": 1.063842825267057, + "learning_rate": 1.8596385018436214e-05, + "loss": 0.6513, + "step": 2407 + }, + { + "epoch": 0.2, + "grad_norm": 0.9440600590318581, + "learning_rate": 1.8595039783095747e-05, + "loss": 0.522, + "step": 2408 + }, + { + "epoch": 0.2, + "grad_norm": 0.9333040800805044, + "learning_rate": 1.859369395212664e-05, + "loss": 0.6214, + "step": 2409 + }, + { + "epoch": 0.2, + "grad_norm": 1.0208532063205993, + "learning_rate": 1.859234752562217e-05, + "loss": 0.6271, + "step": 2410 + }, + { + "epoch": 0.2, + "grad_norm": 1.081775506940939, + "learning_rate": 1.8591000503675635e-05, + "loss": 0.5961, + "step": 2411 + }, + { + "epoch": 0.2, + "grad_norm": 0.9756559941848068, + "learning_rate": 1.8589652886380387e-05, + "loss": 0.6023, + "step": 2412 + }, + { + "epoch": 0.2, + "grad_norm": 1.061469168824822, + "learning_rate": 1.8588304673829814e-05, + "loss": 0.64, + "step": 2413 + }, + { + "epoch": 0.2, + "grad_norm": 0.9643883392364556, + "learning_rate": 1.8586955866117345e-05, + "loss": 0.6009, + "step": 2414 + }, + { + "epoch": 0.2, + "grad_norm": 0.9622948386380665, + "learning_rate": 1.8585606463336448e-05, + "loss": 0.5491, + "step": 2415 + }, + { + "epoch": 0.2, + "grad_norm": 0.9804391313646905, + "learning_rate": 1.8584256465580642e-05, + "loss": 0.6679, + "step": 2416 + }, + { + "epoch": 0.2, + "grad_norm": 1.1901230134706908, + "learning_rate": 1.8582905872943477e-05, + "loss": 0.5767, + "step": 2417 + }, + { + "epoch": 0.2, + "grad_norm": 1.0298855448133197, + "learning_rate": 1.8581554685518543e-05, + "loss": 0.5731, + "step": 2418 + }, + { + "epoch": 0.2, + "grad_norm": 0.9403946452528907, + "learning_rate": 1.8580202903399484e-05, + "loss": 0.5958, + "step": 2419 + }, + { + "epoch": 0.2, + "grad_norm": 1.0789858376109014, + "learning_rate": 1.8578850526679976e-05, + "loss": 0.6047, + "step": 2420 + }, + { + "epoch": 0.2, + "grad_norm": 0.9846049002431896, + "learning_rate": 1.8577497555453735e-05, + "loss": 0.5542, + "step": 2421 + }, + { + "epoch": 0.2, + "grad_norm": 0.9920273561796581, + "learning_rate": 1.8576143989814524e-05, + "loss": 0.5957, + "step": 2422 + }, + { + "epoch": 0.2, + "grad_norm": 1.012926254158362, + "learning_rate": 1.857478982985614e-05, + "loss": 0.5816, + "step": 2423 + }, + { + "epoch": 0.2, + "grad_norm": 1.1060199873713032, + "learning_rate": 1.8573435075672422e-05, + "loss": 0.6219, + "step": 2424 + }, + { + "epoch": 0.2, + "grad_norm": 0.9665465330730769, + "learning_rate": 1.8572079727357265e-05, + "loss": 0.5141, + "step": 2425 + }, + { + "epoch": 0.2, + "grad_norm": 1.010619313409472, + "learning_rate": 1.8570723785004583e-05, + "loss": 0.6223, + "step": 2426 + }, + { + "epoch": 0.2, + "grad_norm": 1.0701981612275158, + "learning_rate": 1.8569367248708343e-05, + "loss": 0.6207, + "step": 2427 + }, + { + "epoch": 0.2, + "grad_norm": 0.968653341581941, + "learning_rate": 1.8568010118562556e-05, + "loss": 0.6153, + "step": 2428 + }, + { + "epoch": 0.2, + "grad_norm": 1.0285147099330336, + "learning_rate": 1.8566652394661268e-05, + "loss": 0.5845, + "step": 2429 + }, + { + "epoch": 0.2, + "grad_norm": 1.0401009522193112, + "learning_rate": 1.856529407709857e-05, + "loss": 0.5614, + "step": 2430 + }, + { + "epoch": 0.2, + "grad_norm": 1.0312486719248375, + "learning_rate": 1.8563935165968584e-05, + "loss": 0.5535, + "step": 2431 + }, + { + "epoch": 0.2, + "grad_norm": 0.8939351482825008, + "learning_rate": 1.8562575661365493e-05, + "loss": 0.5404, + "step": 2432 + }, + { + "epoch": 0.2, + "grad_norm": 1.0252632002781987, + "learning_rate": 1.8561215563383496e-05, + "loss": 0.5902, + "step": 2433 + }, + { + "epoch": 0.2, + "grad_norm": 0.9425198617957646, + "learning_rate": 1.855985487211686e-05, + "loss": 0.579, + "step": 2434 + }, + { + "epoch": 0.2, + "grad_norm": 0.9890648746831118, + "learning_rate": 1.8558493587659874e-05, + "loss": 0.621, + "step": 2435 + }, + { + "epoch": 0.2, + "grad_norm": 0.9366563149047062, + "learning_rate": 1.8557131710106873e-05, + "loss": 0.5563, + "step": 2436 + }, + { + "epoch": 0.2, + "grad_norm": 0.9984953205723499, + "learning_rate": 1.8555769239552232e-05, + "loss": 0.6493, + "step": 2437 + }, + { + "epoch": 0.2, + "grad_norm": 0.980140534084269, + "learning_rate": 1.8554406176090377e-05, + "loss": 0.6212, + "step": 2438 + }, + { + "epoch": 0.2, + "grad_norm": 1.00668465629207, + "learning_rate": 1.8553042519815756e-05, + "loss": 0.6299, + "step": 2439 + }, + { + "epoch": 0.2, + "grad_norm": 0.9874709621095749, + "learning_rate": 1.8551678270822878e-05, + "loss": 0.6188, + "step": 2440 + }, + { + "epoch": 0.2, + "grad_norm": 0.9247042108253141, + "learning_rate": 1.8550313429206282e-05, + "loss": 0.557, + "step": 2441 + }, + { + "epoch": 0.2, + "grad_norm": 0.9952811181588312, + "learning_rate": 1.8548947995060547e-05, + "loss": 0.6042, + "step": 2442 + }, + { + "epoch": 0.2, + "grad_norm": 1.0020526733733572, + "learning_rate": 1.85475819684803e-05, + "loss": 0.5833, + "step": 2443 + }, + { + "epoch": 0.2, + "grad_norm": 0.955455293951105, + "learning_rate": 1.8546215349560204e-05, + "loss": 0.6151, + "step": 2444 + }, + { + "epoch": 0.2, + "grad_norm": 1.0341207506844388, + "learning_rate": 1.8544848138394965e-05, + "loss": 0.5803, + "step": 2445 + }, + { + "epoch": 0.2, + "grad_norm": 1.0264964629842621, + "learning_rate": 1.854348033507933e-05, + "loss": 0.6286, + "step": 2446 + }, + { + "epoch": 0.2, + "grad_norm": 0.9302590580265883, + "learning_rate": 1.8542111939708086e-05, + "loss": 0.5781, + "step": 2447 + }, + { + "epoch": 0.2, + "grad_norm": 0.8796676362843857, + "learning_rate": 1.854074295237606e-05, + "loss": 0.5421, + "step": 2448 + }, + { + "epoch": 0.2, + "grad_norm": 1.004870683626179, + "learning_rate": 1.8539373373178126e-05, + "loss": 0.5445, + "step": 2449 + }, + { + "epoch": 0.2, + "grad_norm": 1.0665094713061753, + "learning_rate": 1.8538003202209186e-05, + "loss": 0.6193, + "step": 2450 + }, + { + "epoch": 0.2, + "grad_norm": 1.0457962229818203, + "learning_rate": 1.8536632439564203e-05, + "loss": 0.5028, + "step": 2451 + }, + { + "epoch": 0.2, + "grad_norm": 0.9640419849222112, + "learning_rate": 1.853526108533816e-05, + "loss": 0.6291, + "step": 2452 + }, + { + "epoch": 0.2, + "grad_norm": 1.0268636636017952, + "learning_rate": 1.8533889139626096e-05, + "loss": 0.5408, + "step": 2453 + }, + { + "epoch": 0.2, + "grad_norm": 1.0358135887820035, + "learning_rate": 1.8532516602523087e-05, + "loss": 0.5789, + "step": 2454 + }, + { + "epoch": 0.2, + "grad_norm": 1.046182089719115, + "learning_rate": 1.853114347412424e-05, + "loss": 0.5674, + "step": 2455 + }, + { + "epoch": 0.2, + "grad_norm": 1.126374433619317, + "learning_rate": 1.8529769754524724e-05, + "loss": 0.6586, + "step": 2456 + }, + { + "epoch": 0.2, + "grad_norm": 0.9045955142000208, + "learning_rate": 1.8528395443819725e-05, + "loss": 0.5948, + "step": 2457 + }, + { + "epoch": 0.2, + "grad_norm": 0.8992249493330345, + "learning_rate": 1.8527020542104487e-05, + "loss": 0.5861, + "step": 2458 + }, + { + "epoch": 0.2, + "grad_norm": 1.0028727195091176, + "learning_rate": 1.852564504947429e-05, + "loss": 0.6158, + "step": 2459 + }, + { + "epoch": 0.2, + "grad_norm": 1.1317843702334225, + "learning_rate": 1.852426896602445e-05, + "loss": 0.632, + "step": 2460 + }, + { + "epoch": 0.2, + "grad_norm": 1.0718645139741845, + "learning_rate": 1.8522892291850335e-05, + "loss": 0.6324, + "step": 2461 + }, + { + "epoch": 0.2, + "grad_norm": 1.1507465013420966, + "learning_rate": 1.8521515027047344e-05, + "loss": 0.5917, + "step": 2462 + }, + { + "epoch": 0.2, + "grad_norm": 1.0390501368556815, + "learning_rate": 1.8520137171710923e-05, + "loss": 0.623, + "step": 2463 + }, + { + "epoch": 0.2, + "grad_norm": 1.0199145522160606, + "learning_rate": 1.851875872593655e-05, + "loss": 0.6655, + "step": 2464 + }, + { + "epoch": 0.2, + "grad_norm": 0.9765738987075874, + "learning_rate": 1.8517379689819752e-05, + "loss": 0.5973, + "step": 2465 + }, + { + "epoch": 0.2, + "grad_norm": 1.0897129702395667, + "learning_rate": 1.85160000634561e-05, + "loss": 0.5654, + "step": 2466 + }, + { + "epoch": 0.2, + "grad_norm": 0.9737301064012777, + "learning_rate": 1.8514619846941192e-05, + "loss": 0.589, + "step": 2467 + }, + { + "epoch": 0.2, + "grad_norm": 0.9419077307422697, + "learning_rate": 1.851323904037069e-05, + "loss": 0.5888, + "step": 2468 + }, + { + "epoch": 0.2, + "grad_norm": 1.0853730530182204, + "learning_rate": 1.8511857643840264e-05, + "loss": 0.6198, + "step": 2469 + }, + { + "epoch": 0.2, + "grad_norm": 0.992302096237278, + "learning_rate": 1.8510475657445656e-05, + "loss": 0.5962, + "step": 2470 + }, + { + "epoch": 0.2, + "grad_norm": 1.0140166287761607, + "learning_rate": 1.8509093081282636e-05, + "loss": 0.6112, + "step": 2471 + }, + { + "epoch": 0.2, + "grad_norm": 0.8910914912445789, + "learning_rate": 1.8507709915447013e-05, + "loss": 0.5705, + "step": 2472 + }, + { + "epoch": 0.2, + "grad_norm": 0.9660251973192866, + "learning_rate": 1.8506326160034638e-05, + "loss": 0.556, + "step": 2473 + }, + { + "epoch": 0.2, + "grad_norm": 0.9516083077316758, + "learning_rate": 1.8504941815141406e-05, + "loss": 0.622, + "step": 2474 + }, + { + "epoch": 0.2, + "grad_norm": 1.0276900810420124, + "learning_rate": 1.850355688086325e-05, + "loss": 0.6428, + "step": 2475 + }, + { + "epoch": 0.2, + "grad_norm": 1.0234009192262177, + "learning_rate": 1.8502171357296144e-05, + "loss": 0.521, + "step": 2476 + }, + { + "epoch": 0.2, + "grad_norm": 0.9300947641179861, + "learning_rate": 1.8500785244536104e-05, + "loss": 0.5487, + "step": 2477 + }, + { + "epoch": 0.2, + "grad_norm": 0.9803225475297762, + "learning_rate": 1.849939854267919e-05, + "loss": 0.5837, + "step": 2478 + }, + { + "epoch": 0.2, + "grad_norm": 0.9718620665209154, + "learning_rate": 1.849801125182149e-05, + "loss": 0.5954, + "step": 2479 + }, + { + "epoch": 0.2, + "grad_norm": 0.8568227724968096, + "learning_rate": 1.8496623372059152e-05, + "loss": 0.5528, + "step": 2480 + }, + { + "epoch": 0.2, + "grad_norm": 0.9975366955008645, + "learning_rate": 1.849523490348835e-05, + "loss": 0.6434, + "step": 2481 + }, + { + "epoch": 0.2, + "grad_norm": 0.9704820831780121, + "learning_rate": 1.8493845846205303e-05, + "loss": 0.6076, + "step": 2482 + }, + { + "epoch": 0.2, + "grad_norm": 1.0470017044262758, + "learning_rate": 1.8492456200306276e-05, + "loss": 0.6127, + "step": 2483 + }, + { + "epoch": 0.2, + "grad_norm": 1.0002178090901197, + "learning_rate": 1.8491065965887568e-05, + "loss": 0.5508, + "step": 2484 + }, + { + "epoch": 0.2, + "grad_norm": 0.9119469336380572, + "learning_rate": 1.8489675143045516e-05, + "loss": 0.5624, + "step": 2485 + }, + { + "epoch": 0.2, + "grad_norm": 1.0505339179736148, + "learning_rate": 1.8488283731876508e-05, + "loss": 0.6058, + "step": 2486 + }, + { + "epoch": 0.2, + "grad_norm": 0.8875674907841911, + "learning_rate": 1.848689173247697e-05, + "loss": 0.5268, + "step": 2487 + }, + { + "epoch": 0.2, + "grad_norm": 0.9865949243859709, + "learning_rate": 1.8485499144943358e-05, + "loss": 0.5906, + "step": 2488 + }, + { + "epoch": 0.2, + "grad_norm": 0.9625178791840348, + "learning_rate": 1.8484105969372184e-05, + "loss": 0.5549, + "step": 2489 + }, + { + "epoch": 0.2, + "grad_norm": 0.8948390139961174, + "learning_rate": 1.8482712205859992e-05, + "loss": 0.6188, + "step": 2490 + }, + { + "epoch": 0.2, + "grad_norm": 1.047076142564619, + "learning_rate": 1.848131785450337e-05, + "loss": 0.6543, + "step": 2491 + }, + { + "epoch": 0.2, + "grad_norm": 0.9444702882438062, + "learning_rate": 1.8479922915398937e-05, + "loss": 0.579, + "step": 2492 + }, + { + "epoch": 0.2, + "grad_norm": 1.0852998525665014, + "learning_rate": 1.8478527388643375e-05, + "loss": 0.6392, + "step": 2493 + }, + { + "epoch": 0.2, + "grad_norm": 0.951577608183831, + "learning_rate": 1.8477131274333383e-05, + "loss": 0.5678, + "step": 2494 + }, + { + "epoch": 0.2, + "grad_norm": 0.9536234809843884, + "learning_rate": 1.847573457256571e-05, + "loss": 0.5663, + "step": 2495 + }, + { + "epoch": 0.2, + "grad_norm": 1.0085519410123744, + "learning_rate": 1.8474337283437155e-05, + "loss": 0.6148, + "step": 2496 + }, + { + "epoch": 0.2, + "grad_norm": 0.9733488355019365, + "learning_rate": 1.8472939407044536e-05, + "loss": 0.6036, + "step": 2497 + }, + { + "epoch": 0.2, + "grad_norm": 0.9837751916587119, + "learning_rate": 1.847154094348474e-05, + "loss": 0.5707, + "step": 2498 + }, + { + "epoch": 0.2, + "grad_norm": 1.0040076270633222, + "learning_rate": 1.847014189285466e-05, + "loss": 0.5996, + "step": 2499 + }, + { + "epoch": 0.2, + "grad_norm": 0.8408621469655635, + "learning_rate": 1.8468742255251268e-05, + "loss": 0.5432, + "step": 2500 + }, + { + "epoch": 0.2, + "grad_norm": 1.0111470886873506, + "learning_rate": 1.846734203077155e-05, + "loss": 0.6538, + "step": 2501 + }, + { + "epoch": 0.2, + "grad_norm": 0.965137606513294, + "learning_rate": 1.8465941219512533e-05, + "loss": 0.581, + "step": 2502 + }, + { + "epoch": 0.2, + "grad_norm": 0.8809664551569435, + "learning_rate": 1.8464539821571302e-05, + "loss": 0.5743, + "step": 2503 + }, + { + "epoch": 0.2, + "grad_norm": 1.0831610501744091, + "learning_rate": 1.8463137837044973e-05, + "loss": 0.5693, + "step": 2504 + }, + { + "epoch": 0.2, + "grad_norm": 0.9377551417262783, + "learning_rate": 1.8461735266030696e-05, + "loss": 0.5784, + "step": 2505 + }, + { + "epoch": 0.2, + "grad_norm": 1.0527879433579985, + "learning_rate": 1.8460332108625668e-05, + "loss": 0.629, + "step": 2506 + }, + { + "epoch": 0.2, + "grad_norm": 0.992612826274164, + "learning_rate": 1.8458928364927137e-05, + "loss": 0.5982, + "step": 2507 + }, + { + "epoch": 0.2, + "grad_norm": 0.9554499534353595, + "learning_rate": 1.8457524035032364e-05, + "loss": 0.5667, + "step": 2508 + }, + { + "epoch": 0.2, + "grad_norm": 1.0322061914645393, + "learning_rate": 1.8456119119038683e-05, + "loss": 0.6673, + "step": 2509 + }, + { + "epoch": 0.2, + "grad_norm": 1.012298631254966, + "learning_rate": 1.8454713617043448e-05, + "loss": 0.6267, + "step": 2510 + }, + { + "epoch": 0.2, + "grad_norm": 0.9751534686903088, + "learning_rate": 1.8453307529144055e-05, + "loss": 0.641, + "step": 2511 + }, + { + "epoch": 0.2, + "grad_norm": 1.0246962915110827, + "learning_rate": 1.845190085543795e-05, + "loss": 0.6305, + "step": 2512 + }, + { + "epoch": 0.2, + "grad_norm": 0.9089309094696517, + "learning_rate": 1.845049359602261e-05, + "loss": 0.5929, + "step": 2513 + }, + { + "epoch": 0.2, + "grad_norm": 0.8998356265099821, + "learning_rate": 1.8449085750995564e-05, + "loss": 0.5446, + "step": 2514 + }, + { + "epoch": 0.2, + "grad_norm": 1.008345723946353, + "learning_rate": 1.8447677320454367e-05, + "loss": 0.596, + "step": 2515 + }, + { + "epoch": 0.2, + "grad_norm": 0.9908774036336532, + "learning_rate": 1.8446268304496624e-05, + "loss": 0.6383, + "step": 2516 + }, + { + "epoch": 0.2, + "grad_norm": 1.0072737095100066, + "learning_rate": 1.8444858703219982e-05, + "loss": 0.6442, + "step": 2517 + }, + { + "epoch": 0.2, + "grad_norm": 0.8652406980423714, + "learning_rate": 1.844344851672212e-05, + "loss": 0.5321, + "step": 2518 + }, + { + "epoch": 0.2, + "grad_norm": 0.9981622954590514, + "learning_rate": 1.844203774510077e-05, + "loss": 0.6602, + "step": 2519 + }, + { + "epoch": 0.2, + "grad_norm": 0.8713331836320304, + "learning_rate": 1.8440626388453686e-05, + "loss": 0.5823, + "step": 2520 + }, + { + "epoch": 0.2, + "grad_norm": 0.9236692999869184, + "learning_rate": 1.8439214446878685e-05, + "loss": 0.7023, + "step": 2521 + }, + { + "epoch": 0.2, + "grad_norm": 0.9117951641284442, + "learning_rate": 1.8437801920473605e-05, + "loss": 0.5547, + "step": 2522 + }, + { + "epoch": 0.21, + "grad_norm": 0.9483077352466959, + "learning_rate": 1.8436388809336338e-05, + "loss": 0.561, + "step": 2523 + }, + { + "epoch": 0.21, + "grad_norm": 0.9665947626085446, + "learning_rate": 1.8434975113564804e-05, + "loss": 0.6055, + "step": 2524 + }, + { + "epoch": 0.21, + "grad_norm": 0.9385491506378937, + "learning_rate": 1.8433560833256986e-05, + "loss": 0.6063, + "step": 2525 + }, + { + "epoch": 0.21, + "grad_norm": 0.9527023510611712, + "learning_rate": 1.8432145968510878e-05, + "loss": 0.5277, + "step": 2526 + }, + { + "epoch": 0.21, + "grad_norm": 0.9662826327001135, + "learning_rate": 1.8430730519424532e-05, + "loss": 0.5898, + "step": 2527 + }, + { + "epoch": 0.21, + "grad_norm": 1.034736667191442, + "learning_rate": 1.8429314486096042e-05, + "loss": 0.5866, + "step": 2528 + }, + { + "epoch": 0.21, + "grad_norm": 1.0689261718103882, + "learning_rate": 1.8427897868623535e-05, + "loss": 0.6371, + "step": 2529 + }, + { + "epoch": 0.21, + "grad_norm": 0.8653291692283432, + "learning_rate": 1.8426480667105178e-05, + "loss": 0.568, + "step": 2530 + }, + { + "epoch": 0.21, + "grad_norm": 0.9098599831792978, + "learning_rate": 1.842506288163919e-05, + "loss": 0.5838, + "step": 2531 + }, + { + "epoch": 0.21, + "grad_norm": 0.9191263895725162, + "learning_rate": 1.8423644512323814e-05, + "loss": 0.6342, + "step": 2532 + }, + { + "epoch": 0.21, + "grad_norm": 0.9569768590266543, + "learning_rate": 1.8422225559257345e-05, + "loss": 0.6281, + "step": 2533 + }, + { + "epoch": 0.21, + "grad_norm": 0.8285310527806168, + "learning_rate": 1.8420806022538115e-05, + "loss": 0.5172, + "step": 2534 + }, + { + "epoch": 0.21, + "grad_norm": 0.8930960260286901, + "learning_rate": 1.8419385902264497e-05, + "loss": 0.6172, + "step": 2535 + }, + { + "epoch": 0.21, + "grad_norm": 0.9269553677227064, + "learning_rate": 1.8417965198534907e-05, + "loss": 0.6097, + "step": 2536 + }, + { + "epoch": 0.21, + "grad_norm": 1.0740954878523123, + "learning_rate": 1.841654391144779e-05, + "loss": 0.6397, + "step": 2537 + }, + { + "epoch": 0.21, + "grad_norm": 0.9446598393823056, + "learning_rate": 1.841512204110165e-05, + "loss": 0.5599, + "step": 2538 + }, + { + "epoch": 0.21, + "grad_norm": 0.9103665763797646, + "learning_rate": 1.8413699587595016e-05, + "loss": 0.5865, + "step": 2539 + }, + { + "epoch": 0.21, + "grad_norm": 0.9054825790388551, + "learning_rate": 1.841227655102646e-05, + "loss": 0.5674, + "step": 2540 + }, + { + "epoch": 0.21, + "grad_norm": 0.8263852032716875, + "learning_rate": 1.8410852931494606e-05, + "loss": 0.6027, + "step": 2541 + }, + { + "epoch": 0.21, + "grad_norm": 1.0279982369799479, + "learning_rate": 1.84094287290981e-05, + "loss": 0.6128, + "step": 2542 + }, + { + "epoch": 0.21, + "grad_norm": 0.8627449323576147, + "learning_rate": 1.8408003943935643e-05, + "loss": 0.5885, + "step": 2543 + }, + { + "epoch": 0.21, + "grad_norm": 0.9312463352872257, + "learning_rate": 1.8406578576105973e-05, + "loss": 0.5843, + "step": 2544 + }, + { + "epoch": 0.21, + "grad_norm": 0.8876688206289071, + "learning_rate": 1.8405152625707863e-05, + "loss": 0.5971, + "step": 2545 + }, + { + "epoch": 0.21, + "grad_norm": 0.8334420024858418, + "learning_rate": 1.840372609284013e-05, + "loss": 0.5071, + "step": 2546 + }, + { + "epoch": 0.21, + "grad_norm": 1.0547438702977643, + "learning_rate": 1.8402298977601636e-05, + "loss": 0.6659, + "step": 2547 + }, + { + "epoch": 0.21, + "grad_norm": 0.9384283089727256, + "learning_rate": 1.8400871280091274e-05, + "loss": 0.62, + "step": 2548 + }, + { + "epoch": 0.21, + "grad_norm": 0.8741990456683056, + "learning_rate": 1.839944300040798e-05, + "loss": 0.5506, + "step": 2549 + }, + { + "epoch": 0.21, + "grad_norm": 0.9156548395431, + "learning_rate": 1.8398014138650742e-05, + "loss": 0.575, + "step": 2550 + }, + { + "epoch": 0.21, + "grad_norm": 0.9491831775479801, + "learning_rate": 1.839658469491857e-05, + "loss": 0.5716, + "step": 2551 + }, + { + "epoch": 0.21, + "grad_norm": 0.970076589441962, + "learning_rate": 1.839515466931053e-05, + "loss": 0.6099, + "step": 2552 + }, + { + "epoch": 0.21, + "grad_norm": 1.004362696843191, + "learning_rate": 1.8393724061925714e-05, + "loss": 0.6277, + "step": 2553 + }, + { + "epoch": 0.21, + "grad_norm": 0.9440776440870434, + "learning_rate": 1.839229287286327e-05, + "loss": 0.5533, + "step": 2554 + }, + { + "epoch": 0.21, + "grad_norm": 0.9078265991640199, + "learning_rate": 1.839086110222237e-05, + "loss": 0.5973, + "step": 2555 + }, + { + "epoch": 0.21, + "grad_norm": 0.9495101497226277, + "learning_rate": 1.8389428750102238e-05, + "loss": 0.5827, + "step": 2556 + }, + { + "epoch": 0.21, + "grad_norm": 0.9223923326893704, + "learning_rate": 1.8387995816602137e-05, + "loss": 0.5098, + "step": 2557 + }, + { + "epoch": 0.21, + "grad_norm": 0.8775065736224732, + "learning_rate": 1.8386562301821363e-05, + "loss": 0.5311, + "step": 2558 + }, + { + "epoch": 0.21, + "grad_norm": 0.9730830661108568, + "learning_rate": 1.8385128205859267e-05, + "loss": 0.5869, + "step": 2559 + }, + { + "epoch": 0.21, + "grad_norm": 0.9741033272488008, + "learning_rate": 1.8383693528815218e-05, + "loss": 0.5783, + "step": 2560 + }, + { + "epoch": 0.21, + "grad_norm": 0.9586616755962616, + "learning_rate": 1.8382258270788648e-05, + "loss": 0.5866, + "step": 2561 + }, + { + "epoch": 0.21, + "grad_norm": 0.9620923146243326, + "learning_rate": 1.8380822431879012e-05, + "loss": 0.5727, + "step": 2562 + }, + { + "epoch": 0.21, + "grad_norm": 0.9470891414477499, + "learning_rate": 1.8379386012185813e-05, + "loss": 0.5911, + "step": 2563 + }, + { + "epoch": 0.21, + "grad_norm": 1.1488547373729767, + "learning_rate": 1.83779490118086e-05, + "loss": 0.626, + "step": 2564 + }, + { + "epoch": 0.21, + "grad_norm": 1.0020985253821808, + "learning_rate": 1.837651143084695e-05, + "loss": 0.5889, + "step": 2565 + }, + { + "epoch": 0.21, + "grad_norm": 0.9519069969151185, + "learning_rate": 1.8375073269400488e-05, + "loss": 0.6057, + "step": 2566 + }, + { + "epoch": 0.21, + "grad_norm": 1.040029345268677, + "learning_rate": 1.8373634527568877e-05, + "loss": 0.4678, + "step": 2567 + }, + { + "epoch": 0.21, + "grad_norm": 1.0185322739046456, + "learning_rate": 1.8372195205451822e-05, + "loss": 0.6452, + "step": 2568 + }, + { + "epoch": 0.21, + "grad_norm": 0.9846593231833829, + "learning_rate": 1.8370755303149064e-05, + "loss": 0.5689, + "step": 2569 + }, + { + "epoch": 0.21, + "grad_norm": 1.0532067696810197, + "learning_rate": 1.8369314820760386e-05, + "loss": 0.6109, + "step": 2570 + }, + { + "epoch": 0.21, + "grad_norm": 1.062830205061241, + "learning_rate": 1.836787375838562e-05, + "loss": 0.6192, + "step": 2571 + }, + { + "epoch": 0.21, + "grad_norm": 0.8932031597560607, + "learning_rate": 1.836643211612462e-05, + "loss": 0.5726, + "step": 2572 + }, + { + "epoch": 0.21, + "grad_norm": 0.938840490483992, + "learning_rate": 1.8364989894077297e-05, + "loss": 0.6245, + "step": 2573 + }, + { + "epoch": 0.21, + "grad_norm": 0.9301251286808755, + "learning_rate": 1.8363547092343593e-05, + "loss": 0.5844, + "step": 2574 + }, + { + "epoch": 0.21, + "grad_norm": 0.9830204831340076, + "learning_rate": 1.8362103711023498e-05, + "loss": 0.5838, + "step": 2575 + }, + { + "epoch": 0.21, + "grad_norm": 0.8233563546163213, + "learning_rate": 1.836065975021703e-05, + "loss": 0.4613, + "step": 2576 + }, + { + "epoch": 0.21, + "grad_norm": 0.8804645354736946, + "learning_rate": 1.835921521002426e-05, + "loss": 0.6636, + "step": 2577 + }, + { + "epoch": 0.21, + "grad_norm": 0.9507404367699335, + "learning_rate": 1.8357770090545285e-05, + "loss": 0.5366, + "step": 2578 + }, + { + "epoch": 0.21, + "grad_norm": 3.932122035312336, + "learning_rate": 1.835632439188026e-05, + "loss": 0.6617, + "step": 2579 + }, + { + "epoch": 0.21, + "grad_norm": 0.9172867112989296, + "learning_rate": 1.8354878114129368e-05, + "loss": 0.5361, + "step": 2580 + }, + { + "epoch": 0.21, + "grad_norm": 0.9350228745493447, + "learning_rate": 1.835343125739283e-05, + "loss": 0.5781, + "step": 2581 + }, + { + "epoch": 0.21, + "grad_norm": 1.0337044836734066, + "learning_rate": 1.8351983821770915e-05, + "loss": 0.5823, + "step": 2582 + }, + { + "epoch": 0.21, + "grad_norm": 0.9051354658937525, + "learning_rate": 1.835053580736393e-05, + "loss": 0.5522, + "step": 2583 + }, + { + "epoch": 0.21, + "grad_norm": 1.081713649725828, + "learning_rate": 1.8349087214272222e-05, + "loss": 0.6442, + "step": 2584 + }, + { + "epoch": 0.21, + "grad_norm": 0.986286911076109, + "learning_rate": 1.8347638042596177e-05, + "loss": 0.5924, + "step": 2585 + }, + { + "epoch": 0.21, + "grad_norm": 0.9145313347195219, + "learning_rate": 1.834618829243622e-05, + "loss": 0.5414, + "step": 2586 + }, + { + "epoch": 0.21, + "grad_norm": 1.033848476051229, + "learning_rate": 1.8344737963892813e-05, + "loss": 0.6362, + "step": 2587 + }, + { + "epoch": 0.21, + "grad_norm": 0.9634793666586047, + "learning_rate": 1.834328705706647e-05, + "loss": 0.5892, + "step": 2588 + }, + { + "epoch": 0.21, + "grad_norm": 0.8135673361799275, + "learning_rate": 1.8341835572057735e-05, + "loss": 0.5019, + "step": 2589 + }, + { + "epoch": 0.21, + "grad_norm": 0.9544914367776864, + "learning_rate": 1.834038350896719e-05, + "loss": 0.5692, + "step": 2590 + }, + { + "epoch": 0.21, + "grad_norm": 0.9695356063078714, + "learning_rate": 1.833893086789547e-05, + "loss": 0.6074, + "step": 2591 + }, + { + "epoch": 0.21, + "grad_norm": 0.8833866555407887, + "learning_rate": 1.8337477648943236e-05, + "loss": 0.5801, + "step": 2592 + }, + { + "epoch": 0.21, + "grad_norm": 1.0225031754148959, + "learning_rate": 1.8336023852211197e-05, + "loss": 0.5934, + "step": 2593 + }, + { + "epoch": 0.21, + "grad_norm": 0.9512387406439328, + "learning_rate": 1.83345694778001e-05, + "loss": 0.6627, + "step": 2594 + }, + { + "epoch": 0.21, + "grad_norm": 0.9865220706641923, + "learning_rate": 1.8333114525810726e-05, + "loss": 0.5865, + "step": 2595 + }, + { + "epoch": 0.21, + "grad_norm": 1.0745395462324694, + "learning_rate": 1.833165899634391e-05, + "loss": 0.6437, + "step": 2596 + }, + { + "epoch": 0.21, + "grad_norm": 0.9406556691737249, + "learning_rate": 1.8330202889500518e-05, + "loss": 0.6085, + "step": 2597 + }, + { + "epoch": 0.21, + "grad_norm": 0.9666420944891609, + "learning_rate": 1.8328746205381453e-05, + "loss": 0.5894, + "step": 2598 + }, + { + "epoch": 0.21, + "grad_norm": 0.9022430872716137, + "learning_rate": 1.8327288944087663e-05, + "loss": 0.5847, + "step": 2599 + }, + { + "epoch": 0.21, + "grad_norm": 0.9809080782353568, + "learning_rate": 1.8325831105720135e-05, + "loss": 0.6222, + "step": 2600 + }, + { + "epoch": 0.21, + "grad_norm": 0.9673317294393174, + "learning_rate": 1.8324372690379896e-05, + "loss": 0.5582, + "step": 2601 + }, + { + "epoch": 0.21, + "grad_norm": 0.9761900184362405, + "learning_rate": 1.8322913698168014e-05, + "loss": 0.6216, + "step": 2602 + }, + { + "epoch": 0.21, + "grad_norm": 0.9661851831216987, + "learning_rate": 1.8321454129185597e-05, + "loss": 0.5772, + "step": 2603 + }, + { + "epoch": 0.21, + "grad_norm": 0.9906529066084958, + "learning_rate": 1.831999398353379e-05, + "loss": 0.5657, + "step": 2604 + }, + { + "epoch": 0.21, + "grad_norm": 1.0410741717992296, + "learning_rate": 1.831853326131378e-05, + "loss": 0.5796, + "step": 2605 + }, + { + "epoch": 0.21, + "grad_norm": 1.105172965112674, + "learning_rate": 1.831707196262679e-05, + "loss": 0.5994, + "step": 2606 + }, + { + "epoch": 0.21, + "grad_norm": 1.0392581992735035, + "learning_rate": 1.8315610087574088e-05, + "loss": 0.6008, + "step": 2607 + }, + { + "epoch": 0.21, + "grad_norm": 1.053043125787598, + "learning_rate": 1.831414763625699e-05, + "loss": 0.6217, + "step": 2608 + }, + { + "epoch": 0.21, + "grad_norm": 1.085606036923765, + "learning_rate": 1.831268460877683e-05, + "loss": 0.5928, + "step": 2609 + }, + { + "epoch": 0.21, + "grad_norm": 0.9618254873284892, + "learning_rate": 1.8311221005235e-05, + "loss": 0.566, + "step": 2610 + }, + { + "epoch": 0.21, + "grad_norm": 0.9879925417829869, + "learning_rate": 1.830975682573293e-05, + "loss": 0.6077, + "step": 2611 + }, + { + "epoch": 0.21, + "grad_norm": 0.9013566267896668, + "learning_rate": 1.8308292070372084e-05, + "loss": 0.6134, + "step": 2612 + }, + { + "epoch": 0.21, + "grad_norm": 0.9471939550715492, + "learning_rate": 1.8306826739253965e-05, + "loss": 0.554, + "step": 2613 + }, + { + "epoch": 0.21, + "grad_norm": 1.0117778301036309, + "learning_rate": 1.8305360832480118e-05, + "loss": 0.601, + "step": 2614 + }, + { + "epoch": 0.21, + "grad_norm": 0.8994234802143227, + "learning_rate": 1.8303894350152138e-05, + "loss": 0.4759, + "step": 2615 + }, + { + "epoch": 0.21, + "grad_norm": 1.009902132319506, + "learning_rate": 1.830242729237164e-05, + "loss": 0.6027, + "step": 2616 + }, + { + "epoch": 0.21, + "grad_norm": 0.9314631626290966, + "learning_rate": 1.8300959659240292e-05, + "loss": 0.5654, + "step": 2617 + }, + { + "epoch": 0.21, + "grad_norm": 0.9663152894675163, + "learning_rate": 1.829949145085981e-05, + "loss": 0.6001, + "step": 2618 + }, + { + "epoch": 0.21, + "grad_norm": 1.0527759807116104, + "learning_rate": 1.829802266733193e-05, + "loss": 0.554, + "step": 2619 + }, + { + "epoch": 0.21, + "grad_norm": 0.9441814027278527, + "learning_rate": 1.829655330875844e-05, + "loss": 0.5764, + "step": 2620 + }, + { + "epoch": 0.21, + "grad_norm": 0.9669943416679583, + "learning_rate": 1.829508337524116e-05, + "loss": 0.5185, + "step": 2621 + }, + { + "epoch": 0.21, + "grad_norm": 0.9532324404339418, + "learning_rate": 1.8293612866881965e-05, + "loss": 0.6193, + "step": 2622 + }, + { + "epoch": 0.21, + "grad_norm": 0.9122909261664486, + "learning_rate": 1.8292141783782754e-05, + "loss": 0.5344, + "step": 2623 + }, + { + "epoch": 0.21, + "grad_norm": 0.9184525412283985, + "learning_rate": 1.829067012604547e-05, + "loss": 0.6079, + "step": 2624 + }, + { + "epoch": 0.21, + "grad_norm": 1.0400039581612621, + "learning_rate": 1.8289197893772103e-05, + "loss": 0.6262, + "step": 2625 + }, + { + "epoch": 0.21, + "grad_norm": 1.0703975227084082, + "learning_rate": 1.8287725087064673e-05, + "loss": 0.5594, + "step": 2626 + }, + { + "epoch": 0.21, + "grad_norm": 0.9508531842550776, + "learning_rate": 1.8286251706025245e-05, + "loss": 0.5872, + "step": 2627 + }, + { + "epoch": 0.21, + "grad_norm": 1.0260098313678139, + "learning_rate": 1.828477775075592e-05, + "loss": 0.5755, + "step": 2628 + }, + { + "epoch": 0.21, + "grad_norm": 0.9910777135295307, + "learning_rate": 1.8283303221358854e-05, + "loss": 0.6457, + "step": 2629 + }, + { + "epoch": 0.21, + "grad_norm": 0.9249482186538088, + "learning_rate": 1.8281828117936217e-05, + "loss": 0.5589, + "step": 2630 + }, + { + "epoch": 0.21, + "grad_norm": 0.9602553560831182, + "learning_rate": 1.8280352440590236e-05, + "loss": 0.6608, + "step": 2631 + }, + { + "epoch": 0.21, + "grad_norm": 1.0260206878932465, + "learning_rate": 1.827887618942318e-05, + "loss": 0.5475, + "step": 2632 + }, + { + "epoch": 0.21, + "grad_norm": 1.0142523857260723, + "learning_rate": 1.8277399364537345e-05, + "loss": 0.6027, + "step": 2633 + }, + { + "epoch": 0.21, + "grad_norm": 0.9068347442951924, + "learning_rate": 1.8275921966035076e-05, + "loss": 0.5952, + "step": 2634 + }, + { + "epoch": 0.21, + "grad_norm": 0.9662920231000809, + "learning_rate": 1.8274443994018754e-05, + "loss": 0.608, + "step": 2635 + }, + { + "epoch": 0.21, + "grad_norm": 0.9831243383495013, + "learning_rate": 1.8272965448590807e-05, + "loss": 0.5982, + "step": 2636 + }, + { + "epoch": 0.21, + "grad_norm": 1.0072628424109062, + "learning_rate": 1.827148632985369e-05, + "loss": 0.5552, + "step": 2637 + }, + { + "epoch": 0.21, + "grad_norm": 0.9561275626706207, + "learning_rate": 1.8270006637909907e-05, + "loss": 0.5363, + "step": 2638 + }, + { + "epoch": 0.21, + "grad_norm": 0.9382299009178503, + "learning_rate": 1.8268526372862e-05, + "loss": 0.61, + "step": 2639 + }, + { + "epoch": 0.21, + "grad_norm": 1.036330966633798, + "learning_rate": 1.8267045534812547e-05, + "loss": 0.5729, + "step": 2640 + }, + { + "epoch": 0.21, + "grad_norm": 1.0591982746515018, + "learning_rate": 1.8265564123864174e-05, + "loss": 0.5942, + "step": 2641 + }, + { + "epoch": 0.21, + "grad_norm": 1.0280607493711271, + "learning_rate": 1.826408214011954e-05, + "loss": 0.6189, + "step": 2642 + }, + { + "epoch": 0.21, + "grad_norm": 0.9694580927695198, + "learning_rate": 1.826259958368134e-05, + "loss": 0.5646, + "step": 2643 + }, + { + "epoch": 0.21, + "grad_norm": 0.9148632149342607, + "learning_rate": 1.826111645465232e-05, + "loss": 0.5231, + "step": 2644 + }, + { + "epoch": 0.21, + "grad_norm": 1.0528796357090346, + "learning_rate": 1.8259632753135257e-05, + "loss": 0.6494, + "step": 2645 + }, + { + "epoch": 0.22, + "grad_norm": 0.8927493697457675, + "learning_rate": 1.825814847923297e-05, + "loss": 0.605, + "step": 2646 + }, + { + "epoch": 0.22, + "grad_norm": 0.9746220924690185, + "learning_rate": 1.825666363304832e-05, + "loss": 0.5596, + "step": 2647 + }, + { + "epoch": 0.22, + "grad_norm": 1.104502002989997, + "learning_rate": 1.82551782146842e-05, + "loss": 0.5217, + "step": 2648 + }, + { + "epoch": 0.22, + "grad_norm": 0.9522011695494644, + "learning_rate": 1.825369222424356e-05, + "loss": 0.5457, + "step": 2649 + }, + { + "epoch": 0.22, + "grad_norm": 0.9974276810353772, + "learning_rate": 1.8252205661829364e-05, + "loss": 0.6035, + "step": 2650 + }, + { + "epoch": 0.22, + "grad_norm": 0.8536988050512456, + "learning_rate": 1.8250718527544636e-05, + "loss": 0.561, + "step": 2651 + }, + { + "epoch": 0.22, + "grad_norm": 1.014452009587213, + "learning_rate": 1.824923082149243e-05, + "loss": 0.5939, + "step": 2652 + }, + { + "epoch": 0.22, + "grad_norm": 1.0286657701497812, + "learning_rate": 1.824774254377585e-05, + "loss": 0.6005, + "step": 2653 + }, + { + "epoch": 0.22, + "grad_norm": 0.8738102658423137, + "learning_rate": 1.8246253694498024e-05, + "loss": 0.5788, + "step": 2654 + }, + { + "epoch": 0.22, + "grad_norm": 0.9880182661920288, + "learning_rate": 1.8244764273762133e-05, + "loss": 0.632, + "step": 2655 + }, + { + "epoch": 0.22, + "grad_norm": 0.8950089128352056, + "learning_rate": 1.8243274281671392e-05, + "loss": 0.5739, + "step": 2656 + }, + { + "epoch": 0.22, + "grad_norm": 0.904422421895005, + "learning_rate": 1.824178371832905e-05, + "loss": 0.603, + "step": 2657 + }, + { + "epoch": 0.22, + "grad_norm": 0.9873253816907256, + "learning_rate": 1.824029258383841e-05, + "loss": 0.5945, + "step": 2658 + }, + { + "epoch": 0.22, + "grad_norm": 1.1132475701850797, + "learning_rate": 1.8238800878302804e-05, + "loss": 0.6235, + "step": 2659 + }, + { + "epoch": 0.22, + "grad_norm": 1.1216405221969532, + "learning_rate": 1.8237308601825604e-05, + "loss": 0.5969, + "step": 2660 + }, + { + "epoch": 0.22, + "grad_norm": 1.0266161431706164, + "learning_rate": 1.8235815754510227e-05, + "loss": 0.519, + "step": 2661 + }, + { + "epoch": 0.22, + "grad_norm": 0.9705185677800771, + "learning_rate": 1.823432233646012e-05, + "loss": 0.5804, + "step": 2662 + }, + { + "epoch": 0.22, + "grad_norm": 0.9579417158184198, + "learning_rate": 1.8232828347778778e-05, + "loss": 0.675, + "step": 2663 + }, + { + "epoch": 0.22, + "grad_norm": 0.999957902123899, + "learning_rate": 1.8231333788569737e-05, + "loss": 0.5755, + "step": 2664 + }, + { + "epoch": 0.22, + "grad_norm": 0.996708421437918, + "learning_rate": 1.8229838658936566e-05, + "loss": 0.6197, + "step": 2665 + }, + { + "epoch": 0.22, + "grad_norm": 0.8644195494317504, + "learning_rate": 1.8228342958982874e-05, + "loss": 0.5294, + "step": 2666 + }, + { + "epoch": 0.22, + "grad_norm": 0.9366598979644085, + "learning_rate": 1.8226846688812314e-05, + "loss": 0.6038, + "step": 2667 + }, + { + "epoch": 0.22, + "grad_norm": 0.8966307476883493, + "learning_rate": 1.8225349848528574e-05, + "loss": 0.5667, + "step": 2668 + }, + { + "epoch": 0.22, + "grad_norm": 1.0839760777914489, + "learning_rate": 1.822385243823539e-05, + "loss": 0.6408, + "step": 2669 + }, + { + "epoch": 0.22, + "grad_norm": 0.8627457075836641, + "learning_rate": 1.8222354458036523e-05, + "loss": 0.5858, + "step": 2670 + }, + { + "epoch": 0.22, + "grad_norm": 0.9506321931490626, + "learning_rate": 1.8220855908035783e-05, + "loss": 0.5744, + "step": 2671 + }, + { + "epoch": 0.22, + "grad_norm": 0.8980323219691284, + "learning_rate": 1.8219356788337027e-05, + "loss": 0.588, + "step": 2672 + }, + { + "epoch": 0.22, + "grad_norm": 0.9614101469490036, + "learning_rate": 1.8217857099044128e-05, + "loss": 0.5708, + "step": 2673 + }, + { + "epoch": 0.22, + "grad_norm": 0.9732045139725454, + "learning_rate": 1.8216356840261028e-05, + "loss": 0.6515, + "step": 2674 + }, + { + "epoch": 0.22, + "grad_norm": 0.9802795442791357, + "learning_rate": 1.8214856012091684e-05, + "loss": 0.634, + "step": 2675 + }, + { + "epoch": 0.22, + "grad_norm": 0.9608737963408859, + "learning_rate": 1.8213354614640105e-05, + "loss": 0.6326, + "step": 2676 + }, + { + "epoch": 0.22, + "grad_norm": 0.9140455271956307, + "learning_rate": 1.8211852648010338e-05, + "loss": 0.6295, + "step": 2677 + }, + { + "epoch": 0.22, + "grad_norm": 0.9696423375138519, + "learning_rate": 1.8210350112306466e-05, + "loss": 0.5804, + "step": 2678 + }, + { + "epoch": 0.22, + "grad_norm": 0.8442936328053501, + "learning_rate": 1.8208847007632613e-05, + "loss": 0.5968, + "step": 2679 + }, + { + "epoch": 0.22, + "grad_norm": 0.9691958315893788, + "learning_rate": 1.8207343334092944e-05, + "loss": 0.6059, + "step": 2680 + }, + { + "epoch": 0.22, + "grad_norm": 0.882560444021095, + "learning_rate": 1.820583909179166e-05, + "loss": 0.5787, + "step": 2681 + }, + { + "epoch": 0.22, + "grad_norm": 1.036455415023574, + "learning_rate": 1.8204334280833005e-05, + "loss": 0.5966, + "step": 2682 + }, + { + "epoch": 0.22, + "grad_norm": 1.0057190615511393, + "learning_rate": 1.8202828901321265e-05, + "loss": 0.6027, + "step": 2683 + }, + { + "epoch": 0.22, + "grad_norm": 1.020603493240722, + "learning_rate": 1.8201322953360758e-05, + "loss": 0.5888, + "step": 2684 + }, + { + "epoch": 0.22, + "grad_norm": 0.9371289052205584, + "learning_rate": 1.8199816437055843e-05, + "loss": 0.5487, + "step": 2685 + }, + { + "epoch": 0.22, + "grad_norm": 1.0491755776905296, + "learning_rate": 1.8198309352510924e-05, + "loss": 0.6072, + "step": 2686 + }, + { + "epoch": 0.22, + "grad_norm": 0.8660714216318303, + "learning_rate": 1.8196801699830437e-05, + "loss": 0.6076, + "step": 2687 + }, + { + "epoch": 0.22, + "grad_norm": 0.9862596290015903, + "learning_rate": 1.8195293479118863e-05, + "loss": 0.6277, + "step": 2688 + }, + { + "epoch": 0.22, + "grad_norm": 1.0437377771063885, + "learning_rate": 1.819378469048072e-05, + "loss": 0.5915, + "step": 2689 + }, + { + "epoch": 0.22, + "grad_norm": 0.9439824865798379, + "learning_rate": 1.8192275334020565e-05, + "loss": 0.5828, + "step": 2690 + }, + { + "epoch": 0.22, + "grad_norm": 1.0225402759808864, + "learning_rate": 1.8190765409842997e-05, + "loss": 0.6008, + "step": 2691 + }, + { + "epoch": 0.22, + "grad_norm": 0.9958776066891274, + "learning_rate": 1.818925491805265e-05, + "loss": 0.5392, + "step": 2692 + }, + { + "epoch": 0.22, + "grad_norm": 0.9329456271958139, + "learning_rate": 1.8187743858754206e-05, + "loss": 0.6347, + "step": 2693 + }, + { + "epoch": 0.22, + "grad_norm": 0.9906642944077385, + "learning_rate": 1.818623223205237e-05, + "loss": 0.6293, + "step": 2694 + }, + { + "epoch": 0.22, + "grad_norm": 0.940956884211602, + "learning_rate": 1.8184720038051905e-05, + "loss": 0.5864, + "step": 2695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9430103458543017, + "learning_rate": 1.8183207276857596e-05, + "loss": 0.5725, + "step": 2696 + }, + { + "epoch": 0.22, + "grad_norm": 0.9750056603913353, + "learning_rate": 1.8181693948574285e-05, + "loss": 0.6003, + "step": 2697 + }, + { + "epoch": 0.22, + "grad_norm": 0.936158816403639, + "learning_rate": 1.818018005330684e-05, + "loss": 0.5746, + "step": 2698 + }, + { + "epoch": 0.22, + "grad_norm": 1.0049390990235507, + "learning_rate": 1.817866559116017e-05, + "loss": 0.6166, + "step": 2699 + }, + { + "epoch": 0.22, + "grad_norm": 0.9979185376293706, + "learning_rate": 1.8177150562239236e-05, + "loss": 0.5793, + "step": 2700 + }, + { + "epoch": 0.22, + "grad_norm": 1.0203902925197001, + "learning_rate": 1.8175634966649015e-05, + "loss": 0.5755, + "step": 2701 + }, + { + "epoch": 0.22, + "grad_norm": 0.9142012279899165, + "learning_rate": 1.8174118804494548e-05, + "loss": 0.5524, + "step": 2702 + }, + { + "epoch": 0.22, + "grad_norm": 1.0338022613929347, + "learning_rate": 1.8172602075880893e-05, + "loss": 0.6003, + "step": 2703 + }, + { + "epoch": 0.22, + "grad_norm": 1.0650332972018386, + "learning_rate": 1.8171084780913165e-05, + "loss": 0.5795, + "step": 2704 + }, + { + "epoch": 0.22, + "grad_norm": 0.9413835902741449, + "learning_rate": 1.8169566919696512e-05, + "loss": 0.6581, + "step": 2705 + }, + { + "epoch": 0.22, + "grad_norm": 0.9301794167686281, + "learning_rate": 1.8168048492336116e-05, + "loss": 0.4732, + "step": 2706 + }, + { + "epoch": 0.22, + "grad_norm": 0.9971512309322477, + "learning_rate": 1.81665294989372e-05, + "loss": 0.5338, + "step": 2707 + }, + { + "epoch": 0.22, + "grad_norm": 0.9626054977861396, + "learning_rate": 1.8165009939605037e-05, + "loss": 0.575, + "step": 2708 + }, + { + "epoch": 0.22, + "grad_norm": 0.9686254044597287, + "learning_rate": 1.816348981444493e-05, + "loss": 0.6684, + "step": 2709 + }, + { + "epoch": 0.22, + "grad_norm": 0.817821965462608, + "learning_rate": 1.816196912356222e-05, + "loss": 0.5142, + "step": 2710 + }, + { + "epoch": 0.22, + "grad_norm": 0.9906203998916217, + "learning_rate": 1.8160447867062286e-05, + "loss": 0.6254, + "step": 2711 + }, + { + "epoch": 0.22, + "grad_norm": 1.0145536544226452, + "learning_rate": 1.8158926045050553e-05, + "loss": 0.6246, + "step": 2712 + }, + { + "epoch": 0.22, + "grad_norm": 0.8877817517453693, + "learning_rate": 1.8157403657632485e-05, + "loss": 0.5587, + "step": 2713 + }, + { + "epoch": 0.22, + "grad_norm": 0.9169924356992728, + "learning_rate": 1.8155880704913577e-05, + "loss": 0.5368, + "step": 2714 + }, + { + "epoch": 0.22, + "grad_norm": 0.7941288099049386, + "learning_rate": 1.8154357186999368e-05, + "loss": 0.5503, + "step": 2715 + }, + { + "epoch": 0.22, + "grad_norm": 0.9409239063898954, + "learning_rate": 1.8152833103995443e-05, + "loss": 0.6428, + "step": 2716 + }, + { + "epoch": 0.22, + "grad_norm": 0.9536368923356101, + "learning_rate": 1.8151308456007416e-05, + "loss": 0.6105, + "step": 2717 + }, + { + "epoch": 0.22, + "grad_norm": 0.9242821682058803, + "learning_rate": 1.814978324314094e-05, + "loss": 0.5262, + "step": 2718 + }, + { + "epoch": 0.22, + "grad_norm": 0.8923447734732718, + "learning_rate": 1.8148257465501718e-05, + "loss": 0.5968, + "step": 2719 + }, + { + "epoch": 0.22, + "grad_norm": 1.0346232613906077, + "learning_rate": 1.814673112319548e-05, + "loss": 0.6651, + "step": 2720 + }, + { + "epoch": 0.22, + "grad_norm": 0.8967208895801116, + "learning_rate": 1.8145204216327998e-05, + "loss": 0.5616, + "step": 2721 + }, + { + "epoch": 0.22, + "grad_norm": 0.9789527728091927, + "learning_rate": 1.8143676745005093e-05, + "loss": 0.5564, + "step": 2722 + }, + { + "epoch": 0.22, + "grad_norm": 0.9301175229831795, + "learning_rate": 1.814214870933261e-05, + "loss": 0.5414, + "step": 2723 + }, + { + "epoch": 0.22, + "grad_norm": 0.9279945540168463, + "learning_rate": 1.8140620109416445e-05, + "loss": 0.5891, + "step": 2724 + }, + { + "epoch": 0.22, + "grad_norm": 0.9583822050583957, + "learning_rate": 1.8139090945362525e-05, + "loss": 0.6199, + "step": 2725 + }, + { + "epoch": 0.22, + "grad_norm": 0.963837056971241, + "learning_rate": 1.8137561217276823e-05, + "loss": 0.5775, + "step": 2726 + }, + { + "epoch": 0.22, + "grad_norm": 0.9193008965088793, + "learning_rate": 1.8136030925265347e-05, + "loss": 0.6201, + "step": 2727 + }, + { + "epoch": 0.22, + "grad_norm": 0.9595001172487877, + "learning_rate": 1.8134500069434144e-05, + "loss": 0.5929, + "step": 2728 + }, + { + "epoch": 0.22, + "grad_norm": 0.8662619104759794, + "learning_rate": 1.81329686498893e-05, + "loss": 0.5501, + "step": 2729 + }, + { + "epoch": 0.22, + "grad_norm": 0.972240880760536, + "learning_rate": 1.8131436666736945e-05, + "loss": 0.5691, + "step": 2730 + }, + { + "epoch": 0.22, + "grad_norm": 0.9615177720700632, + "learning_rate": 1.8129904120083243e-05, + "loss": 0.5681, + "step": 2731 + }, + { + "epoch": 0.22, + "grad_norm": 0.9229624236657873, + "learning_rate": 1.8128371010034394e-05, + "loss": 0.555, + "step": 2732 + }, + { + "epoch": 0.22, + "grad_norm": 0.8890895179572698, + "learning_rate": 1.8126837336696645e-05, + "loss": 0.5756, + "step": 2733 + }, + { + "epoch": 0.22, + "grad_norm": 0.9067723189735549, + "learning_rate": 1.8125303100176275e-05, + "loss": 0.5928, + "step": 2734 + }, + { + "epoch": 0.22, + "grad_norm": 0.9007052662584437, + "learning_rate": 1.812376830057961e-05, + "loss": 0.6127, + "step": 2735 + }, + { + "epoch": 0.22, + "grad_norm": 0.9437405590348689, + "learning_rate": 1.812223293801301e-05, + "loss": 0.6169, + "step": 2736 + }, + { + "epoch": 0.22, + "grad_norm": 0.9646217361783188, + "learning_rate": 1.8120697012582863e-05, + "loss": 0.5928, + "step": 2737 + }, + { + "epoch": 0.22, + "grad_norm": 0.9682755714496285, + "learning_rate": 1.8119160524395622e-05, + "loss": 0.6008, + "step": 2738 + }, + { + "epoch": 0.22, + "grad_norm": 0.999055226351776, + "learning_rate": 1.8117623473557758e-05, + "loss": 0.6105, + "step": 2739 + }, + { + "epoch": 0.22, + "grad_norm": 0.9724026619999333, + "learning_rate": 1.8116085860175788e-05, + "loss": 0.5347, + "step": 2740 + }, + { + "epoch": 0.22, + "grad_norm": 1.0081974793755035, + "learning_rate": 1.8114547684356264e-05, + "loss": 0.5553, + "step": 2741 + }, + { + "epoch": 0.22, + "grad_norm": 1.0293986423368018, + "learning_rate": 1.8113008946205787e-05, + "loss": 0.6537, + "step": 2742 + }, + { + "epoch": 0.22, + "grad_norm": 0.9711812270768454, + "learning_rate": 1.8111469645830983e-05, + "loss": 0.5677, + "step": 2743 + }, + { + "epoch": 0.22, + "grad_norm": 0.8855189900145181, + "learning_rate": 1.810992978333853e-05, + "loss": 0.5616, + "step": 2744 + }, + { + "epoch": 0.22, + "grad_norm": 0.9973233513608902, + "learning_rate": 1.8108389358835135e-05, + "loss": 0.5849, + "step": 2745 + }, + { + "epoch": 0.22, + "grad_norm": 0.919178257973744, + "learning_rate": 1.810684837242755e-05, + "loss": 0.6009, + "step": 2746 + }, + { + "epoch": 0.22, + "grad_norm": 1.2479235413643202, + "learning_rate": 1.810530682422256e-05, + "loss": 0.612, + "step": 2747 + }, + { + "epoch": 0.22, + "grad_norm": 1.0580936700356294, + "learning_rate": 1.8103764714327004e-05, + "loss": 0.5988, + "step": 2748 + }, + { + "epoch": 0.22, + "grad_norm": 1.016274277429732, + "learning_rate": 1.8102222042847735e-05, + "loss": 0.5768, + "step": 2749 + }, + { + "epoch": 0.22, + "grad_norm": 1.0392968760654928, + "learning_rate": 1.8100678809891668e-05, + "loss": 0.6248, + "step": 2750 + }, + { + "epoch": 0.22, + "grad_norm": 0.9281711927515615, + "learning_rate": 1.8099135015565745e-05, + "loss": 0.58, + "step": 2751 + }, + { + "epoch": 0.22, + "grad_norm": 0.893762521635862, + "learning_rate": 1.8097590659976946e-05, + "loss": 0.5739, + "step": 2752 + }, + { + "epoch": 0.22, + "grad_norm": 0.8673575882784395, + "learning_rate": 1.8096045743232303e-05, + "loss": 0.5668, + "step": 2753 + }, + { + "epoch": 0.22, + "grad_norm": 0.9189763319220341, + "learning_rate": 1.8094500265438866e-05, + "loss": 0.5817, + "step": 2754 + }, + { + "epoch": 0.22, + "grad_norm": 0.8740099428626087, + "learning_rate": 1.8092954226703742e-05, + "loss": 0.5745, + "step": 2755 + }, + { + "epoch": 0.22, + "grad_norm": 1.0177735915260575, + "learning_rate": 1.8091407627134067e-05, + "loss": 0.6324, + "step": 2756 + }, + { + "epoch": 0.22, + "grad_norm": 0.8755765204054051, + "learning_rate": 1.8089860466837023e-05, + "loss": 0.591, + "step": 2757 + }, + { + "epoch": 0.22, + "grad_norm": 0.9844508074811542, + "learning_rate": 1.8088312745919823e-05, + "loss": 0.6863, + "step": 2758 + }, + { + "epoch": 0.22, + "grad_norm": 0.9178781550779036, + "learning_rate": 1.8086764464489723e-05, + "loss": 0.5362, + "step": 2759 + }, + { + "epoch": 0.22, + "grad_norm": 0.8438837000342646, + "learning_rate": 1.8085215622654023e-05, + "loss": 0.5502, + "step": 2760 + }, + { + "epoch": 0.22, + "grad_norm": 1.027995036155377, + "learning_rate": 1.8083666220520045e-05, + "loss": 0.6827, + "step": 2761 + }, + { + "epoch": 0.22, + "grad_norm": 1.0072627378618417, + "learning_rate": 1.8082116258195173e-05, + "loss": 0.609, + "step": 2762 + }, + { + "epoch": 0.22, + "grad_norm": 1.0404432889174489, + "learning_rate": 1.8080565735786813e-05, + "loss": 0.5405, + "step": 2763 + }, + { + "epoch": 0.22, + "grad_norm": 0.960224909807464, + "learning_rate": 1.8079014653402414e-05, + "loss": 0.5984, + "step": 2764 + }, + { + "epoch": 0.22, + "grad_norm": 0.8957444706981957, + "learning_rate": 1.8077463011149464e-05, + "loss": 0.5948, + "step": 2765 + }, + { + "epoch": 0.22, + "grad_norm": 0.8909499775341096, + "learning_rate": 1.80759108091355e-05, + "loss": 0.5498, + "step": 2766 + }, + { + "epoch": 0.22, + "grad_norm": 0.9552071296438046, + "learning_rate": 1.807435804746807e-05, + "loss": 0.5756, + "step": 2767 + }, + { + "epoch": 0.22, + "grad_norm": 1.0301741647540563, + "learning_rate": 1.8072804726254792e-05, + "loss": 0.6114, + "step": 2768 + }, + { + "epoch": 0.23, + "grad_norm": 0.9687543183786282, + "learning_rate": 1.807125084560331e-05, + "loss": 0.5714, + "step": 2769 + }, + { + "epoch": 0.23, + "grad_norm": 0.8673956813094961, + "learning_rate": 1.80696964056213e-05, + "loss": 0.572, + "step": 2770 + }, + { + "epoch": 0.23, + "grad_norm": 0.8849403306028029, + "learning_rate": 1.8068141406416487e-05, + "loss": 0.5341, + "step": 2771 + }, + { + "epoch": 0.23, + "grad_norm": 0.9106492006599914, + "learning_rate": 1.8066585848096637e-05, + "loss": 0.6088, + "step": 2772 + }, + { + "epoch": 0.23, + "grad_norm": 0.9742053568515937, + "learning_rate": 1.8065029730769534e-05, + "loss": 0.6063, + "step": 2773 + }, + { + "epoch": 0.23, + "grad_norm": 0.9903274449807491, + "learning_rate": 1.806347305454303e-05, + "loss": 0.5429, + "step": 2774 + }, + { + "epoch": 0.23, + "grad_norm": 0.9181460398525245, + "learning_rate": 1.8061915819524995e-05, + "loss": 0.5536, + "step": 2775 + }, + { + "epoch": 0.23, + "grad_norm": 1.0098508357388716, + "learning_rate": 1.8060358025823344e-05, + "loss": 0.6372, + "step": 2776 + }, + { + "epoch": 0.23, + "grad_norm": 0.9213271798021149, + "learning_rate": 1.8058799673546032e-05, + "loss": 0.5686, + "step": 2777 + }, + { + "epoch": 0.23, + "grad_norm": 0.9557490562689361, + "learning_rate": 1.805724076280105e-05, + "loss": 0.5554, + "step": 2778 + }, + { + "epoch": 0.23, + "grad_norm": 1.0084940234973645, + "learning_rate": 1.805568129369643e-05, + "loss": 0.665, + "step": 2779 + }, + { + "epoch": 0.23, + "grad_norm": 0.9126637077556393, + "learning_rate": 1.805412126634024e-05, + "loss": 0.5613, + "step": 2780 + }, + { + "epoch": 0.23, + "grad_norm": 0.9577382335710649, + "learning_rate": 1.8052560680840595e-05, + "loss": 0.6643, + "step": 2781 + }, + { + "epoch": 0.23, + "grad_norm": 1.0449965708611912, + "learning_rate": 1.8050999537305634e-05, + "loss": 0.5253, + "step": 2782 + }, + { + "epoch": 0.23, + "grad_norm": 0.9938981486681303, + "learning_rate": 1.8049437835843545e-05, + "loss": 0.5189, + "step": 2783 + }, + { + "epoch": 0.23, + "grad_norm": 0.9753113641821504, + "learning_rate": 1.8047875576562556e-05, + "loss": 0.6217, + "step": 2784 + }, + { + "epoch": 0.23, + "grad_norm": 0.9553770251097057, + "learning_rate": 1.8046312759570924e-05, + "loss": 0.5544, + "step": 2785 + }, + { + "epoch": 0.23, + "grad_norm": 0.9919726546590415, + "learning_rate": 1.804474938497696e-05, + "loss": 0.6089, + "step": 2786 + }, + { + "epoch": 0.23, + "grad_norm": 0.9808570279080948, + "learning_rate": 1.8043185452888997e-05, + "loss": 0.5294, + "step": 2787 + }, + { + "epoch": 0.23, + "grad_norm": 1.0518621525037852, + "learning_rate": 1.8041620963415418e-05, + "loss": 0.5776, + "step": 2788 + }, + { + "epoch": 0.23, + "grad_norm": 1.1022635177347593, + "learning_rate": 1.804005591666464e-05, + "loss": 0.6217, + "step": 2789 + }, + { + "epoch": 0.23, + "grad_norm": 0.9429626058610409, + "learning_rate": 1.8038490312745116e-05, + "loss": 0.6275, + "step": 2790 + }, + { + "epoch": 0.23, + "grad_norm": 0.930398967716542, + "learning_rate": 1.8036924151765345e-05, + "loss": 0.5554, + "step": 2791 + }, + { + "epoch": 0.23, + "grad_norm": 0.887025266457902, + "learning_rate": 1.803535743383386e-05, + "loss": 0.5701, + "step": 2792 + }, + { + "epoch": 0.23, + "grad_norm": 0.9402776215475784, + "learning_rate": 1.8033790159059224e-05, + "loss": 0.6054, + "step": 2793 + }, + { + "epoch": 0.23, + "grad_norm": 1.0326325235514582, + "learning_rate": 1.8032222327550063e-05, + "loss": 0.6075, + "step": 2794 + }, + { + "epoch": 0.23, + "grad_norm": 0.9775021711373921, + "learning_rate": 1.803065393941502e-05, + "loss": 0.5606, + "step": 2795 + }, + { + "epoch": 0.23, + "grad_norm": 0.9157898288063075, + "learning_rate": 1.802908499476278e-05, + "loss": 0.5374, + "step": 2796 + }, + { + "epoch": 0.23, + "grad_norm": 0.8959173642295047, + "learning_rate": 1.8027515493702075e-05, + "loss": 0.487, + "step": 2797 + }, + { + "epoch": 0.23, + "grad_norm": 0.9516732846513793, + "learning_rate": 1.8025945436341663e-05, + "loss": 0.5696, + "step": 2798 + }, + { + "epoch": 0.23, + "grad_norm": 0.9225055633764292, + "learning_rate": 1.8024374822790355e-05, + "loss": 0.5725, + "step": 2799 + }, + { + "epoch": 0.23, + "grad_norm": 0.9139056227761806, + "learning_rate": 1.8022803653156983e-05, + "loss": 0.5483, + "step": 2800 + }, + { + "epoch": 0.23, + "grad_norm": 0.9438666489422416, + "learning_rate": 1.802123192755044e-05, + "loss": 0.5711, + "step": 2801 + }, + { + "epoch": 0.23, + "grad_norm": 0.994929728112408, + "learning_rate": 1.8019659646079636e-05, + "loss": 0.5899, + "step": 2802 + }, + { + "epoch": 0.23, + "grad_norm": 0.9894663256980869, + "learning_rate": 1.8018086808853535e-05, + "loss": 0.6254, + "step": 2803 + }, + { + "epoch": 0.23, + "grad_norm": 1.0052732190279563, + "learning_rate": 1.8016513415981128e-05, + "loss": 0.5144, + "step": 2804 + }, + { + "epoch": 0.23, + "grad_norm": 1.0144845417320454, + "learning_rate": 1.801493946757145e-05, + "loss": 0.5835, + "step": 2805 + }, + { + "epoch": 0.23, + "grad_norm": 1.0252420292101474, + "learning_rate": 1.801336496373358e-05, + "loss": 0.5536, + "step": 2806 + }, + { + "epoch": 0.23, + "grad_norm": 0.8971116968992333, + "learning_rate": 1.8011789904576624e-05, + "loss": 0.5506, + "step": 2807 + }, + { + "epoch": 0.23, + "grad_norm": 0.8970316281501216, + "learning_rate": 1.8010214290209735e-05, + "loss": 0.6684, + "step": 2808 + }, + { + "epoch": 0.23, + "grad_norm": 0.9915460509347839, + "learning_rate": 1.80086381207421e-05, + "loss": 0.6473, + "step": 2809 + }, + { + "epoch": 0.23, + "grad_norm": 0.9679950558017996, + "learning_rate": 1.8007061396282944e-05, + "loss": 0.5946, + "step": 2810 + }, + { + "epoch": 0.23, + "grad_norm": 0.9297039333204747, + "learning_rate": 1.800548411694154e-05, + "loss": 0.5475, + "step": 2811 + }, + { + "epoch": 0.23, + "grad_norm": 0.8258902225497208, + "learning_rate": 1.8003906282827186e-05, + "loss": 0.5406, + "step": 2812 + }, + { + "epoch": 0.23, + "grad_norm": 0.7445951245744602, + "learning_rate": 1.8002327894049225e-05, + "loss": 0.5067, + "step": 2813 + }, + { + "epoch": 0.23, + "grad_norm": 0.8985526515890513, + "learning_rate": 1.800074895071704e-05, + "loss": 0.579, + "step": 2814 + }, + { + "epoch": 0.23, + "grad_norm": 1.0104156103033566, + "learning_rate": 1.799916945294005e-05, + "loss": 0.6262, + "step": 2815 + }, + { + "epoch": 0.23, + "grad_norm": 1.0213484101551884, + "learning_rate": 1.7997589400827712e-05, + "loss": 0.6357, + "step": 2816 + }, + { + "epoch": 0.23, + "grad_norm": 1.0685631612624997, + "learning_rate": 1.799600879448952e-05, + "loss": 0.5975, + "step": 2817 + }, + { + "epoch": 0.23, + "grad_norm": 0.9967993589986436, + "learning_rate": 1.7994427634035016e-05, + "loss": 0.5412, + "step": 2818 + }, + { + "epoch": 0.23, + "grad_norm": 0.9180677224628448, + "learning_rate": 1.799284591957376e-05, + "loss": 0.5347, + "step": 2819 + }, + { + "epoch": 0.23, + "grad_norm": 0.9747429803370978, + "learning_rate": 1.799126365121538e-05, + "loss": 0.6033, + "step": 2820 + }, + { + "epoch": 0.23, + "grad_norm": 0.9163259787114536, + "learning_rate": 1.798968082906951e-05, + "loss": 0.5792, + "step": 2821 + }, + { + "epoch": 0.23, + "grad_norm": 0.9211695025727105, + "learning_rate": 1.798809745324585e-05, + "loss": 0.5119, + "step": 2822 + }, + { + "epoch": 0.23, + "grad_norm": 0.8676048846449058, + "learning_rate": 1.798651352385412e-05, + "loss": 0.5284, + "step": 2823 + }, + { + "epoch": 0.23, + "grad_norm": 0.9015393452040311, + "learning_rate": 1.798492904100409e-05, + "loss": 0.4627, + "step": 2824 + }, + { + "epoch": 0.23, + "grad_norm": 0.9387971158929248, + "learning_rate": 1.7983344004805555e-05, + "loss": 0.6055, + "step": 2825 + }, + { + "epoch": 0.23, + "grad_norm": 0.8471959162297319, + "learning_rate": 1.7981758415368365e-05, + "loss": 0.5055, + "step": 2826 + }, + { + "epoch": 0.23, + "grad_norm": 0.988588879294854, + "learning_rate": 1.7980172272802398e-05, + "loss": 0.5528, + "step": 2827 + }, + { + "epoch": 0.23, + "grad_norm": 0.9326776260735447, + "learning_rate": 1.7978585577217568e-05, + "loss": 0.5196, + "step": 2828 + }, + { + "epoch": 0.23, + "grad_norm": 0.8966730811382917, + "learning_rate": 1.7976998328723833e-05, + "loss": 0.5468, + "step": 2829 + }, + { + "epoch": 0.23, + "grad_norm": 0.8490311284377158, + "learning_rate": 1.7975410527431195e-05, + "loss": 0.5447, + "step": 2830 + }, + { + "epoch": 0.23, + "grad_norm": 0.8813316008855758, + "learning_rate": 1.797382217344968e-05, + "loss": 0.4973, + "step": 2831 + }, + { + "epoch": 0.23, + "grad_norm": 1.089434215108231, + "learning_rate": 1.7972233266889356e-05, + "loss": 0.6135, + "step": 2832 + }, + { + "epoch": 0.23, + "grad_norm": 0.9659587366357253, + "learning_rate": 1.797064380786034e-05, + "loss": 0.5099, + "step": 2833 + }, + { + "epoch": 0.23, + "grad_norm": 0.9218666717431676, + "learning_rate": 1.7969053796472783e-05, + "loss": 0.5723, + "step": 2834 + }, + { + "epoch": 0.23, + "grad_norm": 0.9277864344934234, + "learning_rate": 1.796746323283686e-05, + "loss": 0.5605, + "step": 2835 + }, + { + "epoch": 0.23, + "grad_norm": 0.9776799469352457, + "learning_rate": 1.7965872117062806e-05, + "loss": 0.5077, + "step": 2836 + }, + { + "epoch": 0.23, + "grad_norm": 0.9318298969039761, + "learning_rate": 1.796428044926088e-05, + "loss": 0.6138, + "step": 2837 + }, + { + "epoch": 0.23, + "grad_norm": 0.9075948043353205, + "learning_rate": 1.7962688229541382e-05, + "loss": 0.5704, + "step": 2838 + }, + { + "epoch": 0.23, + "grad_norm": 0.9306145438415679, + "learning_rate": 1.7961095458014655e-05, + "loss": 0.6038, + "step": 2839 + }, + { + "epoch": 0.23, + "grad_norm": 0.8884000710243785, + "learning_rate": 1.795950213479107e-05, + "loss": 0.5054, + "step": 2840 + }, + { + "epoch": 0.23, + "grad_norm": 0.8569607622945014, + "learning_rate": 1.795790825998105e-05, + "loss": 0.5184, + "step": 2841 + }, + { + "epoch": 0.23, + "grad_norm": 0.9791079837049262, + "learning_rate": 1.7956313833695046e-05, + "loss": 0.5739, + "step": 2842 + }, + { + "epoch": 0.23, + "grad_norm": 0.9588531159827303, + "learning_rate": 1.795471885604355e-05, + "loss": 0.5694, + "step": 2843 + }, + { + "epoch": 0.23, + "grad_norm": 1.0297830187508645, + "learning_rate": 1.7953123327137093e-05, + "loss": 0.6476, + "step": 2844 + }, + { + "epoch": 0.23, + "grad_norm": 0.9288421640089412, + "learning_rate": 1.7951527247086243e-05, + "loss": 0.5351, + "step": 2845 + }, + { + "epoch": 0.23, + "grad_norm": 0.9602300038067393, + "learning_rate": 1.794993061600161e-05, + "loss": 0.596, + "step": 2846 + }, + { + "epoch": 0.23, + "grad_norm": 0.9075377212248501, + "learning_rate": 1.7948333433993833e-05, + "loss": 0.5389, + "step": 2847 + }, + { + "epoch": 0.23, + "grad_norm": 0.995373021003706, + "learning_rate": 1.7946735701173604e-05, + "loss": 0.6393, + "step": 2848 + }, + { + "epoch": 0.23, + "grad_norm": 0.9164049272640322, + "learning_rate": 1.7945137417651638e-05, + "loss": 0.6477, + "step": 2849 + }, + { + "epoch": 0.23, + "grad_norm": 0.9477115032759619, + "learning_rate": 1.7943538583538696e-05, + "loss": 0.6198, + "step": 2850 + }, + { + "epoch": 0.23, + "grad_norm": 0.90846432248897, + "learning_rate": 1.7941939198945574e-05, + "loss": 0.52, + "step": 2851 + }, + { + "epoch": 0.23, + "grad_norm": 0.9174204792362805, + "learning_rate": 1.7940339263983112e-05, + "loss": 0.5913, + "step": 2852 + }, + { + "epoch": 0.23, + "grad_norm": 0.9792924779362212, + "learning_rate": 1.7938738778762182e-05, + "loss": 0.5644, + "step": 2853 + }, + { + "epoch": 0.23, + "grad_norm": 0.9005528079322241, + "learning_rate": 1.7937137743393695e-05, + "loss": 0.5532, + "step": 2854 + }, + { + "epoch": 0.23, + "grad_norm": 0.9674891956317448, + "learning_rate": 1.7935536157988605e-05, + "loss": 0.5931, + "step": 2855 + }, + { + "epoch": 0.23, + "grad_norm": 1.0392442539723934, + "learning_rate": 1.79339340226579e-05, + "loss": 0.5858, + "step": 2856 + }, + { + "epoch": 0.23, + "grad_norm": 0.9010473425300569, + "learning_rate": 1.79323313375126e-05, + "loss": 0.5254, + "step": 2857 + }, + { + "epoch": 0.23, + "grad_norm": 0.908624006675908, + "learning_rate": 1.7930728102663775e-05, + "loss": 0.5139, + "step": 2858 + }, + { + "epoch": 0.23, + "grad_norm": 1.0041814177937274, + "learning_rate": 1.792912431822253e-05, + "loss": 0.591, + "step": 2859 + }, + { + "epoch": 0.23, + "grad_norm": 0.8424943230616456, + "learning_rate": 1.79275199843e-05, + "loss": 0.5597, + "step": 2860 + }, + { + "epoch": 0.23, + "grad_norm": 0.9827303241823689, + "learning_rate": 1.7925915101007366e-05, + "loss": 0.5951, + "step": 2861 + }, + { + "epoch": 0.23, + "grad_norm": 0.9747069232699177, + "learning_rate": 1.792430966845585e-05, + "loss": 0.5873, + "step": 2862 + }, + { + "epoch": 0.23, + "grad_norm": 1.0293522334126957, + "learning_rate": 1.7922703686756697e-05, + "loss": 0.6401, + "step": 2863 + }, + { + "epoch": 0.23, + "grad_norm": 1.011927611023172, + "learning_rate": 1.792109715602121e-05, + "loss": 0.5973, + "step": 2864 + }, + { + "epoch": 0.23, + "grad_norm": 0.9273786221962907, + "learning_rate": 1.7919490076360714e-05, + "loss": 0.6032, + "step": 2865 + }, + { + "epoch": 0.23, + "grad_norm": 1.093940323247069, + "learning_rate": 1.7917882447886585e-05, + "loss": 0.6462, + "step": 2866 + }, + { + "epoch": 0.23, + "grad_norm": 0.890019833154301, + "learning_rate": 1.7916274270710218e-05, + "loss": 0.6138, + "step": 2867 + }, + { + "epoch": 0.23, + "grad_norm": 0.9769921244269482, + "learning_rate": 1.7914665544943072e-05, + "loss": 0.6104, + "step": 2868 + }, + { + "epoch": 0.23, + "grad_norm": 0.9485632509377442, + "learning_rate": 1.791305627069662e-05, + "loss": 0.6246, + "step": 2869 + }, + { + "epoch": 0.23, + "grad_norm": 1.0018934058623412, + "learning_rate": 1.791144644808239e-05, + "loss": 0.6317, + "step": 2870 + }, + { + "epoch": 0.23, + "grad_norm": 0.9875398922579854, + "learning_rate": 1.7909836077211936e-05, + "loss": 0.6921, + "step": 2871 + }, + { + "epoch": 0.23, + "grad_norm": 0.8942446600706555, + "learning_rate": 1.790822515819686e-05, + "loss": 0.5872, + "step": 2872 + }, + { + "epoch": 0.23, + "grad_norm": 0.9604170179855209, + "learning_rate": 1.7906613691148796e-05, + "loss": 0.5829, + "step": 2873 + }, + { + "epoch": 0.23, + "grad_norm": 0.8942090505558472, + "learning_rate": 1.7905001676179414e-05, + "loss": 0.5426, + "step": 2874 + }, + { + "epoch": 0.23, + "grad_norm": 1.0487502187628917, + "learning_rate": 1.7903389113400427e-05, + "loss": 0.5667, + "step": 2875 + }, + { + "epoch": 0.23, + "grad_norm": 0.959663943230379, + "learning_rate": 1.790177600292359e-05, + "loss": 0.572, + "step": 2876 + }, + { + "epoch": 0.23, + "grad_norm": 0.9917149402947344, + "learning_rate": 1.790016234486068e-05, + "loss": 0.5906, + "step": 2877 + }, + { + "epoch": 0.23, + "grad_norm": 0.937418302425181, + "learning_rate": 1.789854813932353e-05, + "loss": 0.5202, + "step": 2878 + }, + { + "epoch": 0.23, + "grad_norm": 0.7861228868319374, + "learning_rate": 1.7896933386423998e-05, + "loss": 0.4862, + "step": 2879 + }, + { + "epoch": 0.23, + "grad_norm": 1.0141607091776144, + "learning_rate": 1.7895318086273986e-05, + "loss": 0.5141, + "step": 2880 + }, + { + "epoch": 0.23, + "grad_norm": 1.0273508303059105, + "learning_rate": 1.7893702238985433e-05, + "loss": 0.5389, + "step": 2881 + }, + { + "epoch": 0.23, + "grad_norm": 0.9399823550683523, + "learning_rate": 1.7892085844670318e-05, + "loss": 0.5422, + "step": 2882 + }, + { + "epoch": 0.23, + "grad_norm": 0.9049524508006479, + "learning_rate": 1.7890468903440656e-05, + "loss": 0.5719, + "step": 2883 + }, + { + "epoch": 0.23, + "grad_norm": 0.9956943417222794, + "learning_rate": 1.7888851415408495e-05, + "loss": 0.5924, + "step": 2884 + }, + { + "epoch": 0.23, + "grad_norm": 0.883186802287362, + "learning_rate": 1.788723338068593e-05, + "loss": 0.5544, + "step": 2885 + }, + { + "epoch": 0.23, + "grad_norm": 0.9971492225911742, + "learning_rate": 1.7885614799385086e-05, + "loss": 0.6009, + "step": 2886 + }, + { + "epoch": 0.23, + "grad_norm": 0.8355205506640969, + "learning_rate": 1.7883995671618133e-05, + "loss": 0.5485, + "step": 2887 + }, + { + "epoch": 0.23, + "grad_norm": 0.9074735160025519, + "learning_rate": 1.7882375997497273e-05, + "loss": 0.574, + "step": 2888 + }, + { + "epoch": 0.23, + "grad_norm": 1.0213868782570381, + "learning_rate": 1.788075577713475e-05, + "loss": 0.5807, + "step": 2889 + }, + { + "epoch": 0.23, + "grad_norm": 0.944087464291218, + "learning_rate": 1.7879135010642836e-05, + "loss": 0.6015, + "step": 2890 + }, + { + "epoch": 0.23, + "grad_norm": 0.9115513698895477, + "learning_rate": 1.787751369813386e-05, + "loss": 0.573, + "step": 2891 + }, + { + "epoch": 0.24, + "grad_norm": 0.937984094940414, + "learning_rate": 1.787589183972017e-05, + "loss": 0.6324, + "step": 2892 + }, + { + "epoch": 0.24, + "grad_norm": 1.1049632286846252, + "learning_rate": 1.787426943551416e-05, + "loss": 0.6556, + "step": 2893 + }, + { + "epoch": 0.24, + "grad_norm": 0.8892815904572252, + "learning_rate": 1.7872646485628266e-05, + "loss": 0.6171, + "step": 2894 + }, + { + "epoch": 0.24, + "grad_norm": 0.890347163387591, + "learning_rate": 1.7871022990174948e-05, + "loss": 0.5293, + "step": 2895 + }, + { + "epoch": 0.24, + "grad_norm": 0.8997722667227449, + "learning_rate": 1.7869398949266724e-05, + "loss": 0.565, + "step": 2896 + }, + { + "epoch": 0.24, + "grad_norm": 0.9329374331298593, + "learning_rate": 1.786777436301613e-05, + "loss": 0.5565, + "step": 2897 + }, + { + "epoch": 0.24, + "grad_norm": 0.9746227868782539, + "learning_rate": 1.7866149231535754e-05, + "loss": 0.5568, + "step": 2898 + }, + { + "epoch": 0.24, + "grad_norm": 0.8773883378129755, + "learning_rate": 1.786452355493821e-05, + "loss": 0.5434, + "step": 2899 + }, + { + "epoch": 0.24, + "grad_norm": 0.9631742937919332, + "learning_rate": 1.7862897333336162e-05, + "loss": 0.6124, + "step": 2900 + }, + { + "epoch": 0.24, + "grad_norm": 0.9779033162507079, + "learning_rate": 1.78612705668423e-05, + "loss": 0.6597, + "step": 2901 + }, + { + "epoch": 0.24, + "grad_norm": 0.9886719845976905, + "learning_rate": 1.7859643255569364e-05, + "loss": 0.5886, + "step": 2902 + }, + { + "epoch": 0.24, + "grad_norm": 0.9661761439948027, + "learning_rate": 1.785801539963012e-05, + "loss": 0.5725, + "step": 2903 + }, + { + "epoch": 0.24, + "grad_norm": 0.9641311236560048, + "learning_rate": 1.785638699913738e-05, + "loss": 0.5953, + "step": 2904 + }, + { + "epoch": 0.24, + "grad_norm": 0.9824821479543113, + "learning_rate": 1.785475805420399e-05, + "loss": 0.6451, + "step": 2905 + }, + { + "epoch": 0.24, + "grad_norm": 0.9574330850350337, + "learning_rate": 1.7853128564942834e-05, + "loss": 0.591, + "step": 2906 + }, + { + "epoch": 0.24, + "grad_norm": 1.002233316642695, + "learning_rate": 1.7851498531466833e-05, + "loss": 0.6056, + "step": 2907 + }, + { + "epoch": 0.24, + "grad_norm": 1.0215148904505245, + "learning_rate": 1.784986795388895e-05, + "loss": 0.6765, + "step": 2908 + }, + { + "epoch": 0.24, + "grad_norm": 0.976072703279664, + "learning_rate": 1.7848236832322175e-05, + "loss": 0.5366, + "step": 2909 + }, + { + "epoch": 0.24, + "grad_norm": 1.0272588509160951, + "learning_rate": 1.7846605166879555e-05, + "loss": 0.667, + "step": 2910 + }, + { + "epoch": 0.24, + "grad_norm": 0.9826121519395441, + "learning_rate": 1.7844972957674156e-05, + "loss": 0.5788, + "step": 2911 + }, + { + "epoch": 0.24, + "grad_norm": 0.978849061408577, + "learning_rate": 1.7843340204819087e-05, + "loss": 0.6177, + "step": 2912 + }, + { + "epoch": 0.24, + "grad_norm": 1.0479357254213726, + "learning_rate": 1.78417069084275e-05, + "loss": 0.5245, + "step": 2913 + }, + { + "epoch": 0.24, + "grad_norm": 0.9221183355541194, + "learning_rate": 1.784007306861258e-05, + "loss": 0.623, + "step": 2914 + }, + { + "epoch": 0.24, + "grad_norm": 0.9128945043954562, + "learning_rate": 1.783843868548755e-05, + "loss": 0.5235, + "step": 2915 + }, + { + "epoch": 0.24, + "grad_norm": 0.8916286639889864, + "learning_rate": 1.7836803759165673e-05, + "loss": 0.5629, + "step": 2916 + }, + { + "epoch": 0.24, + "grad_norm": 1.0138599166669626, + "learning_rate": 1.7835168289760248e-05, + "loss": 0.6256, + "step": 2917 + }, + { + "epoch": 0.24, + "grad_norm": 0.9382854752966578, + "learning_rate": 1.7833532277384607e-05, + "loss": 0.561, + "step": 2918 + }, + { + "epoch": 0.24, + "grad_norm": 0.8932203928499848, + "learning_rate": 1.783189572215213e-05, + "loss": 0.5669, + "step": 2919 + }, + { + "epoch": 0.24, + "grad_norm": 0.9635695529223409, + "learning_rate": 1.7830258624176224e-05, + "loss": 0.6151, + "step": 2920 + }, + { + "epoch": 0.24, + "grad_norm": 0.8869233883388015, + "learning_rate": 1.782862098357034e-05, + "loss": 0.5696, + "step": 2921 + }, + { + "epoch": 0.24, + "grad_norm": 1.0316281872406312, + "learning_rate": 1.782698280044797e-05, + "loss": 0.6051, + "step": 2922 + }, + { + "epoch": 0.24, + "grad_norm": 1.0455886074412064, + "learning_rate": 1.7825344074922633e-05, + "loss": 0.6081, + "step": 2923 + }, + { + "epoch": 0.24, + "grad_norm": 0.9910576483300347, + "learning_rate": 1.782370480710789e-05, + "loss": 0.6417, + "step": 2924 + }, + { + "epoch": 0.24, + "grad_norm": 0.9651470637576416, + "learning_rate": 1.7822064997117348e-05, + "loss": 0.5922, + "step": 2925 + }, + { + "epoch": 0.24, + "grad_norm": 0.9345754499780848, + "learning_rate": 1.7820424645064635e-05, + "loss": 0.513, + "step": 2926 + }, + { + "epoch": 0.24, + "grad_norm": 0.9903327113291599, + "learning_rate": 1.7818783751063433e-05, + "loss": 0.6042, + "step": 2927 + }, + { + "epoch": 0.24, + "grad_norm": 0.9698448430291365, + "learning_rate": 1.7817142315227452e-05, + "loss": 0.6555, + "step": 2928 + }, + { + "epoch": 0.24, + "grad_norm": 0.9449001506430976, + "learning_rate": 1.7815500337670442e-05, + "loss": 0.5328, + "step": 2929 + }, + { + "epoch": 0.24, + "grad_norm": 0.9499642731886148, + "learning_rate": 1.7813857818506194e-05, + "loss": 0.5382, + "step": 2930 + }, + { + "epoch": 0.24, + "grad_norm": 0.9114911296436924, + "learning_rate": 1.7812214757848523e-05, + "loss": 0.5718, + "step": 2931 + }, + { + "epoch": 0.24, + "grad_norm": 1.0555112155422826, + "learning_rate": 1.7810571155811307e-05, + "loss": 0.6417, + "step": 2932 + }, + { + "epoch": 0.24, + "grad_norm": 1.0012259749389651, + "learning_rate": 1.7808927012508436e-05, + "loss": 0.5963, + "step": 2933 + }, + { + "epoch": 0.24, + "grad_norm": 0.8588757511049373, + "learning_rate": 1.7807282328053847e-05, + "loss": 0.5725, + "step": 2934 + }, + { + "epoch": 0.24, + "grad_norm": 0.8962264917282546, + "learning_rate": 1.7805637102561516e-05, + "loss": 0.6398, + "step": 2935 + }, + { + "epoch": 0.24, + "grad_norm": 1.0126953458804542, + "learning_rate": 1.7803991336145462e-05, + "loss": 0.5541, + "step": 2936 + }, + { + "epoch": 0.24, + "grad_norm": 0.9485077698206339, + "learning_rate": 1.7802345028919728e-05, + "loss": 0.585, + "step": 2937 + }, + { + "epoch": 0.24, + "grad_norm": 0.9528169390915887, + "learning_rate": 1.7800698180998406e-05, + "loss": 0.6129, + "step": 2938 + }, + { + "epoch": 0.24, + "grad_norm": 0.9279286190560277, + "learning_rate": 1.7799050792495617e-05, + "loss": 0.5792, + "step": 2939 + }, + { + "epoch": 0.24, + "grad_norm": 0.907201226645678, + "learning_rate": 1.7797402863525528e-05, + "loss": 0.5317, + "step": 2940 + }, + { + "epoch": 0.24, + "grad_norm": 0.9794055147062374, + "learning_rate": 1.7795754394202334e-05, + "loss": 0.5912, + "step": 2941 + }, + { + "epoch": 0.24, + "grad_norm": 1.0109889092334905, + "learning_rate": 1.7794105384640277e-05, + "loss": 0.6199, + "step": 2942 + }, + { + "epoch": 0.24, + "grad_norm": 1.0500004928260396, + "learning_rate": 1.779245583495363e-05, + "loss": 0.5977, + "step": 2943 + }, + { + "epoch": 0.24, + "grad_norm": 0.9722339821250309, + "learning_rate": 1.7790805745256703e-05, + "loss": 0.5688, + "step": 2944 + }, + { + "epoch": 0.24, + "grad_norm": 0.9618036445187186, + "learning_rate": 1.7789155115663853e-05, + "loss": 0.5664, + "step": 2945 + }, + { + "epoch": 0.24, + "grad_norm": 0.8426791654221744, + "learning_rate": 1.778750394628946e-05, + "loss": 0.5497, + "step": 2946 + }, + { + "epoch": 0.24, + "grad_norm": 0.9612700706138941, + "learning_rate": 1.7785852237247952e-05, + "loss": 0.5915, + "step": 2947 + }, + { + "epoch": 0.24, + "grad_norm": 0.9261565886562699, + "learning_rate": 1.778419998865379e-05, + "loss": 0.5658, + "step": 2948 + }, + { + "epoch": 0.24, + "grad_norm": 0.9728962618141286, + "learning_rate": 1.7782547200621475e-05, + "loss": 0.5725, + "step": 2949 + }, + { + "epoch": 0.24, + "grad_norm": 1.045860367370037, + "learning_rate": 1.7780893873265536e-05, + "loss": 0.6454, + "step": 2950 + }, + { + "epoch": 0.24, + "grad_norm": 0.9788455225264904, + "learning_rate": 1.777924000670056e-05, + "loss": 0.6038, + "step": 2951 + }, + { + "epoch": 0.24, + "grad_norm": 1.0945387612010085, + "learning_rate": 1.777758560104115e-05, + "loss": 0.6038, + "step": 2952 + }, + { + "epoch": 0.24, + "grad_norm": 0.8957453512244588, + "learning_rate": 1.777593065640195e-05, + "loss": 0.5951, + "step": 2953 + }, + { + "epoch": 0.24, + "grad_norm": 0.9979474133002743, + "learning_rate": 1.777427517289766e-05, + "loss": 0.589, + "step": 2954 + }, + { + "epoch": 0.24, + "grad_norm": 0.9429662984163877, + "learning_rate": 1.7772619150642996e-05, + "loss": 0.5365, + "step": 2955 + }, + { + "epoch": 0.24, + "grad_norm": 0.9358474973512778, + "learning_rate": 1.777096258975272e-05, + "loss": 0.6543, + "step": 2956 + }, + { + "epoch": 0.24, + "grad_norm": 0.9096719871801495, + "learning_rate": 1.7769305490341623e-05, + "loss": 0.592, + "step": 2957 + }, + { + "epoch": 0.24, + "grad_norm": 0.9537438553085594, + "learning_rate": 1.776764785252455e-05, + "loss": 0.5968, + "step": 2958 + }, + { + "epoch": 0.24, + "grad_norm": 0.9563472444119449, + "learning_rate": 1.7765989676416374e-05, + "loss": 0.5488, + "step": 2959 + }, + { + "epoch": 0.24, + "grad_norm": 0.9289889239231146, + "learning_rate": 1.7764330962132e-05, + "loss": 0.6065, + "step": 2960 + }, + { + "epoch": 0.24, + "grad_norm": 0.9496563907877692, + "learning_rate": 1.7762671709786375e-05, + "loss": 0.5092, + "step": 2961 + }, + { + "epoch": 0.24, + "grad_norm": 0.9243170842255678, + "learning_rate": 1.776101191949449e-05, + "loss": 0.591, + "step": 2962 + }, + { + "epoch": 0.24, + "grad_norm": 1.0710754976090913, + "learning_rate": 1.775935159137136e-05, + "loss": 0.635, + "step": 2963 + }, + { + "epoch": 0.24, + "grad_norm": 0.9780586967963613, + "learning_rate": 1.7757690725532048e-05, + "loss": 0.5785, + "step": 2964 + }, + { + "epoch": 0.24, + "grad_norm": 0.9906049593399756, + "learning_rate": 1.7756029322091647e-05, + "loss": 0.5637, + "step": 2965 + }, + { + "epoch": 0.24, + "grad_norm": 0.91575458677578, + "learning_rate": 1.7754367381165298e-05, + "loss": 0.5285, + "step": 2966 + }, + { + "epoch": 0.24, + "grad_norm": 0.977131199479803, + "learning_rate": 1.7752704902868164e-05, + "loss": 0.6075, + "step": 2967 + }, + { + "epoch": 0.24, + "grad_norm": 0.9857150899066561, + "learning_rate": 1.775104188731546e-05, + "loss": 0.6261, + "step": 2968 + }, + { + "epoch": 0.24, + "grad_norm": 0.9576292313368313, + "learning_rate": 1.774937833462243e-05, + "loss": 0.5617, + "step": 2969 + }, + { + "epoch": 0.24, + "grad_norm": 0.8445171511314254, + "learning_rate": 1.7747714244904348e-05, + "loss": 0.5282, + "step": 2970 + }, + { + "epoch": 0.24, + "grad_norm": 0.9229718629347127, + "learning_rate": 1.7746049618276545e-05, + "loss": 0.5479, + "step": 2971 + }, + { + "epoch": 0.24, + "grad_norm": 0.8690650163482897, + "learning_rate": 1.7744384454854377e-05, + "loss": 0.5541, + "step": 2972 + }, + { + "epoch": 0.24, + "grad_norm": 0.9648710618900104, + "learning_rate": 1.7742718754753232e-05, + "loss": 0.6267, + "step": 2973 + }, + { + "epoch": 0.24, + "grad_norm": 0.9128519455250741, + "learning_rate": 1.774105251808855e-05, + "loss": 0.5086, + "step": 2974 + }, + { + "epoch": 0.24, + "grad_norm": 0.903480230508501, + "learning_rate": 1.7739385744975788e-05, + "loss": 0.5827, + "step": 2975 + }, + { + "epoch": 0.24, + "grad_norm": 0.9501921693907465, + "learning_rate": 1.773771843553046e-05, + "loss": 0.5899, + "step": 2976 + }, + { + "epoch": 0.24, + "grad_norm": 0.9375234127105827, + "learning_rate": 1.773605058986811e-05, + "loss": 0.5068, + "step": 2977 + }, + { + "epoch": 0.24, + "grad_norm": 1.0022123910340575, + "learning_rate": 1.7734382208104314e-05, + "loss": 0.6204, + "step": 2978 + }, + { + "epoch": 0.24, + "grad_norm": 0.9271276387355356, + "learning_rate": 1.7732713290354694e-05, + "loss": 0.5993, + "step": 2979 + }, + { + "epoch": 0.24, + "grad_norm": 0.8412067356706011, + "learning_rate": 1.77310438367349e-05, + "loss": 0.5356, + "step": 2980 + }, + { + "epoch": 0.24, + "grad_norm": 0.8963614258194879, + "learning_rate": 1.772937384736063e-05, + "loss": 0.5543, + "step": 2981 + }, + { + "epoch": 0.24, + "grad_norm": 0.941153800669109, + "learning_rate": 1.77277033223476e-05, + "loss": 0.5908, + "step": 2982 + }, + { + "epoch": 0.24, + "grad_norm": 0.8840202297399563, + "learning_rate": 1.772603226181159e-05, + "loss": 0.6028, + "step": 2983 + }, + { + "epoch": 0.24, + "grad_norm": 0.9729775167227305, + "learning_rate": 1.7724360665868395e-05, + "loss": 0.6125, + "step": 2984 + }, + { + "epoch": 0.24, + "grad_norm": 0.8596207995883371, + "learning_rate": 1.772268853463386e-05, + "loss": 0.5613, + "step": 2985 + }, + { + "epoch": 0.24, + "grad_norm": 1.0696073709578997, + "learning_rate": 1.7721015868223858e-05, + "loss": 0.596, + "step": 2986 + }, + { + "epoch": 0.24, + "grad_norm": 0.8680112938013765, + "learning_rate": 1.7719342666754307e-05, + "loss": 0.5516, + "step": 2987 + }, + { + "epoch": 0.24, + "grad_norm": 0.9696281293778938, + "learning_rate": 1.7717668930341152e-05, + "loss": 0.6295, + "step": 2988 + }, + { + "epoch": 0.24, + "grad_norm": 0.9755273934994584, + "learning_rate": 1.771599465910039e-05, + "loss": 0.6291, + "step": 2989 + }, + { + "epoch": 0.24, + "grad_norm": 0.9345644481453044, + "learning_rate": 1.771431985314804e-05, + "loss": 0.5621, + "step": 2990 + }, + { + "epoch": 0.24, + "grad_norm": 0.9037245373504185, + "learning_rate": 1.7712644512600163e-05, + "loss": 0.5962, + "step": 2991 + }, + { + "epoch": 0.24, + "grad_norm": 1.0638475033986836, + "learning_rate": 1.7710968637572866e-05, + "loss": 0.6574, + "step": 2992 + }, + { + "epoch": 0.24, + "grad_norm": 0.8344440667470702, + "learning_rate": 1.770929222818228e-05, + "loss": 0.4977, + "step": 2993 + }, + { + "epoch": 0.24, + "grad_norm": 0.8894745605787729, + "learning_rate": 1.7707615284544585e-05, + "loss": 0.5996, + "step": 2994 + }, + { + "epoch": 0.24, + "grad_norm": 0.9326281101476411, + "learning_rate": 1.7705937806775986e-05, + "loss": 0.5731, + "step": 2995 + }, + { + "epoch": 0.24, + "grad_norm": 0.8695247751218795, + "learning_rate": 1.7704259794992734e-05, + "loss": 0.5895, + "step": 2996 + }, + { + "epoch": 0.24, + "grad_norm": 1.0317406351652776, + "learning_rate": 1.7702581249311107e-05, + "loss": 0.6543, + "step": 2997 + }, + { + "epoch": 0.24, + "grad_norm": 0.8077967565584185, + "learning_rate": 1.7700902169847434e-05, + "loss": 0.518, + "step": 2998 + }, + { + "epoch": 0.24, + "grad_norm": 1.003810213369707, + "learning_rate": 1.769922255671807e-05, + "loss": 0.5289, + "step": 2999 + }, + { + "epoch": 0.24, + "grad_norm": 0.9748601712071312, + "learning_rate": 1.7697542410039413e-05, + "loss": 0.5974, + "step": 3000 + }, + { + "epoch": 0.24, + "grad_norm": 0.9220205485487152, + "learning_rate": 1.7695861729927896e-05, + "loss": 0.5275, + "step": 3001 + }, + { + "epoch": 0.24, + "grad_norm": 0.9939522843169762, + "learning_rate": 1.7694180516499986e-05, + "loss": 0.6003, + "step": 3002 + }, + { + "epoch": 0.24, + "grad_norm": 0.9431839294240236, + "learning_rate": 1.769249876987219e-05, + "loss": 0.529, + "step": 3003 + }, + { + "epoch": 0.24, + "grad_norm": 0.9163038128226331, + "learning_rate": 1.7690816490161054e-05, + "loss": 0.5529, + "step": 3004 + }, + { + "epoch": 0.24, + "grad_norm": 0.9612779979723355, + "learning_rate": 1.768913367748316e-05, + "loss": 0.5566, + "step": 3005 + }, + { + "epoch": 0.24, + "grad_norm": 0.9193225717062538, + "learning_rate": 1.7687450331955115e-05, + "loss": 0.5653, + "step": 3006 + }, + { + "epoch": 0.24, + "grad_norm": 0.8985009069505546, + "learning_rate": 1.7685766453693584e-05, + "loss": 0.5281, + "step": 3007 + }, + { + "epoch": 0.24, + "grad_norm": 0.9995819562166177, + "learning_rate": 1.7684082042815255e-05, + "loss": 0.5755, + "step": 3008 + }, + { + "epoch": 0.24, + "grad_norm": 1.0488660971736616, + "learning_rate": 1.768239709943686e-05, + "loss": 0.6405, + "step": 3009 + }, + { + "epoch": 0.24, + "grad_norm": 1.0060682914657895, + "learning_rate": 1.7680711623675155e-05, + "loss": 0.6043, + "step": 3010 + }, + { + "epoch": 0.24, + "grad_norm": 0.9685403613457841, + "learning_rate": 1.767902561564695e-05, + "loss": 0.5787, + "step": 3011 + }, + { + "epoch": 0.24, + "grad_norm": 0.8615980271016578, + "learning_rate": 1.767733907546908e-05, + "loss": 0.5722, + "step": 3012 + }, + { + "epoch": 0.24, + "grad_norm": 0.9853089580676793, + "learning_rate": 1.7675652003258427e-05, + "loss": 0.6401, + "step": 3013 + }, + { + "epoch": 0.24, + "grad_norm": 1.0187436271337882, + "learning_rate": 1.7673964399131895e-05, + "loss": 0.5397, + "step": 3014 + }, + { + "epoch": 0.25, + "grad_norm": 1.034056628820441, + "learning_rate": 1.7672276263206433e-05, + "loss": 0.5924, + "step": 3015 + }, + { + "epoch": 0.25, + "grad_norm": 0.8746264193882949, + "learning_rate": 1.7670587595599034e-05, + "loss": 0.4671, + "step": 3016 + }, + { + "epoch": 0.25, + "grad_norm": 1.0132891832260866, + "learning_rate": 1.7668898396426717e-05, + "loss": 0.5868, + "step": 3017 + }, + { + "epoch": 0.25, + "grad_norm": 0.8973560235274429, + "learning_rate": 1.766720866580655e-05, + "loss": 0.5662, + "step": 3018 + }, + { + "epoch": 0.25, + "grad_norm": 0.9830770810376366, + "learning_rate": 1.7665518403855614e-05, + "loss": 0.6166, + "step": 3019 + }, + { + "epoch": 0.25, + "grad_norm": 0.9692899867659497, + "learning_rate": 1.766382761069106e-05, + "loss": 0.6223, + "step": 3020 + }, + { + "epoch": 0.25, + "grad_norm": 0.9607598785885566, + "learning_rate": 1.7662136286430046e-05, + "loss": 0.6089, + "step": 3021 + }, + { + "epoch": 0.25, + "grad_norm": 0.9226827698337999, + "learning_rate": 1.766044443118978e-05, + "loss": 0.5055, + "step": 3022 + }, + { + "epoch": 0.25, + "grad_norm": 1.020928123282797, + "learning_rate": 1.7658752045087516e-05, + "loss": 0.6092, + "step": 3023 + }, + { + "epoch": 0.25, + "grad_norm": 0.996536739973853, + "learning_rate": 1.7657059128240526e-05, + "loss": 0.5707, + "step": 3024 + }, + { + "epoch": 0.25, + "grad_norm": 1.0197047633927752, + "learning_rate": 1.765536568076613e-05, + "loss": 0.6489, + "step": 3025 + }, + { + "epoch": 0.25, + "grad_norm": 0.9591301041949271, + "learning_rate": 1.7653671702781685e-05, + "loss": 0.5411, + "step": 3026 + }, + { + "epoch": 0.25, + "grad_norm": 0.8719742546833583, + "learning_rate": 1.7651977194404578e-05, + "loss": 0.5787, + "step": 3027 + }, + { + "epoch": 0.25, + "grad_norm": 1.046589072837998, + "learning_rate": 1.765028215575224e-05, + "loss": 0.5943, + "step": 3028 + }, + { + "epoch": 0.25, + "grad_norm": 0.93558659226997, + "learning_rate": 1.7648586586942134e-05, + "loss": 0.5325, + "step": 3029 + }, + { + "epoch": 0.25, + "grad_norm": 1.0084710284795635, + "learning_rate": 1.764689048809176e-05, + "loss": 0.5347, + "step": 3030 + }, + { + "epoch": 0.25, + "grad_norm": 0.9808260064257344, + "learning_rate": 1.7645193859318658e-05, + "loss": 0.614, + "step": 3031 + }, + { + "epoch": 0.25, + "grad_norm": 0.9556969370434116, + "learning_rate": 1.7643496700740407e-05, + "loss": 0.6143, + "step": 3032 + }, + { + "epoch": 0.25, + "grad_norm": 0.9066448474711181, + "learning_rate": 1.7641799012474608e-05, + "loss": 0.5614, + "step": 3033 + }, + { + "epoch": 0.25, + "grad_norm": 0.936506703901719, + "learning_rate": 1.764010079463892e-05, + "loss": 0.5505, + "step": 3034 + }, + { + "epoch": 0.25, + "grad_norm": 0.9555289651987778, + "learning_rate": 1.7638402047351025e-05, + "loss": 0.6526, + "step": 3035 + }, + { + "epoch": 0.25, + "grad_norm": 0.9611056933445485, + "learning_rate": 1.7636702770728637e-05, + "loss": 0.5742, + "step": 3036 + }, + { + "epoch": 0.25, + "grad_norm": 0.9059821946830916, + "learning_rate": 1.7635002964889527e-05, + "loss": 0.5577, + "step": 3037 + }, + { + "epoch": 0.25, + "grad_norm": 0.9573450830444244, + "learning_rate": 1.763330262995148e-05, + "loss": 0.6275, + "step": 3038 + }, + { + "epoch": 0.25, + "grad_norm": 0.9601163319268974, + "learning_rate": 1.7631601766032337e-05, + "loss": 0.5637, + "step": 3039 + }, + { + "epoch": 0.25, + "grad_norm": 1.002866210234754, + "learning_rate": 1.7629900373249956e-05, + "loss": 0.5704, + "step": 3040 + }, + { + "epoch": 0.25, + "grad_norm": 0.8760623142199797, + "learning_rate": 1.7628198451722247e-05, + "loss": 0.5525, + "step": 3041 + }, + { + "epoch": 0.25, + "grad_norm": 0.9523505020216828, + "learning_rate": 1.7626496001567154e-05, + "loss": 0.5804, + "step": 3042 + }, + { + "epoch": 0.25, + "grad_norm": 0.965206293258815, + "learning_rate": 1.7624793022902648e-05, + "loss": 0.58, + "step": 3043 + }, + { + "epoch": 0.25, + "grad_norm": 0.9022195843672679, + "learning_rate": 1.7623089515846752e-05, + "loss": 0.6024, + "step": 3044 + }, + { + "epoch": 0.25, + "grad_norm": 0.9534846451535837, + "learning_rate": 1.7621385480517514e-05, + "loss": 0.5893, + "step": 3045 + }, + { + "epoch": 0.25, + "grad_norm": 0.9509893948267736, + "learning_rate": 1.7619680917033023e-05, + "loss": 0.5674, + "step": 3046 + }, + { + "epoch": 0.25, + "grad_norm": 0.9667981591904946, + "learning_rate": 1.7617975825511403e-05, + "loss": 0.6137, + "step": 3047 + }, + { + "epoch": 0.25, + "grad_norm": 0.9457533097952052, + "learning_rate": 1.7616270206070814e-05, + "loss": 0.5797, + "step": 3048 + }, + { + "epoch": 0.25, + "grad_norm": 0.8800567240746717, + "learning_rate": 1.7614564058829454e-05, + "loss": 0.5534, + "step": 3049 + }, + { + "epoch": 0.25, + "grad_norm": 0.9306818061440629, + "learning_rate": 1.7612857383905565e-05, + "loss": 0.5664, + "step": 3050 + }, + { + "epoch": 0.25, + "grad_norm": 1.0394494750240004, + "learning_rate": 1.7611150181417406e-05, + "loss": 0.6443, + "step": 3051 + }, + { + "epoch": 0.25, + "grad_norm": 0.9143008362220608, + "learning_rate": 1.7609442451483292e-05, + "loss": 0.5357, + "step": 3052 + }, + { + "epoch": 0.25, + "grad_norm": 0.8459253561004924, + "learning_rate": 1.7607734194221565e-05, + "loss": 0.5381, + "step": 3053 + }, + { + "epoch": 0.25, + "grad_norm": 1.8342798033382928, + "learning_rate": 1.7606025409750608e-05, + "loss": 0.5503, + "step": 3054 + }, + { + "epoch": 0.25, + "grad_norm": 0.9497336635018657, + "learning_rate": 1.760431609818884e-05, + "loss": 0.5588, + "step": 3055 + }, + { + "epoch": 0.25, + "grad_norm": 0.9574074351131134, + "learning_rate": 1.7602606259654704e-05, + "loss": 0.5993, + "step": 3056 + }, + { + "epoch": 0.25, + "grad_norm": 1.2266619564048968, + "learning_rate": 1.7600895894266702e-05, + "loss": 0.5867, + "step": 3057 + }, + { + "epoch": 0.25, + "grad_norm": 0.9543613597242573, + "learning_rate": 1.7599185002143357e-05, + "loss": 0.6301, + "step": 3058 + }, + { + "epoch": 0.25, + "grad_norm": 0.9433524901855496, + "learning_rate": 1.759747358340323e-05, + "loss": 0.5699, + "step": 3059 + }, + { + "epoch": 0.25, + "grad_norm": 0.9989762059557455, + "learning_rate": 1.7595761638164924e-05, + "loss": 0.5463, + "step": 3060 + }, + { + "epoch": 0.25, + "grad_norm": 0.9912262872442057, + "learning_rate": 1.7594049166547073e-05, + "loss": 0.5611, + "step": 3061 + }, + { + "epoch": 0.25, + "grad_norm": 0.955654711601771, + "learning_rate": 1.7592336168668352e-05, + "loss": 0.6187, + "step": 3062 + }, + { + "epoch": 0.25, + "grad_norm": 0.9407381255668952, + "learning_rate": 1.7590622644647466e-05, + "loss": 0.571, + "step": 3063 + }, + { + "epoch": 0.25, + "grad_norm": 0.8974550887945183, + "learning_rate": 1.7588908594603165e-05, + "loss": 0.4889, + "step": 3064 + }, + { + "epoch": 0.25, + "grad_norm": 0.8719072751485014, + "learning_rate": 1.758719401865423e-05, + "loss": 0.5625, + "step": 3065 + }, + { + "epoch": 0.25, + "grad_norm": 0.8891693890171455, + "learning_rate": 1.758547891691948e-05, + "loss": 0.5671, + "step": 3066 + }, + { + "epoch": 0.25, + "grad_norm": 0.9430776803567622, + "learning_rate": 1.7583763289517767e-05, + "loss": 0.6137, + "step": 3067 + }, + { + "epoch": 0.25, + "grad_norm": 0.9764665774810648, + "learning_rate": 1.7582047136567987e-05, + "loss": 0.6012, + "step": 3068 + }, + { + "epoch": 0.25, + "grad_norm": 1.0107046063269252, + "learning_rate": 1.7580330458189066e-05, + "loss": 0.644, + "step": 3069 + }, + { + "epoch": 0.25, + "grad_norm": 0.9512873214249711, + "learning_rate": 1.757861325449997e-05, + "loss": 0.6462, + "step": 3070 + }, + { + "epoch": 0.25, + "grad_norm": 1.003399707071991, + "learning_rate": 1.7576895525619693e-05, + "loss": 0.6177, + "step": 3071 + }, + { + "epoch": 0.25, + "grad_norm": 0.9317498927261336, + "learning_rate": 1.757517727166728e-05, + "loss": 0.5163, + "step": 3072 + }, + { + "epoch": 0.25, + "grad_norm": 0.884093708449396, + "learning_rate": 1.7573458492761802e-05, + "loss": 0.473, + "step": 3073 + }, + { + "epoch": 0.25, + "grad_norm": 0.9546080085115011, + "learning_rate": 1.7571739189022365e-05, + "loss": 0.5957, + "step": 3074 + }, + { + "epoch": 0.25, + "grad_norm": 1.0330919558923133, + "learning_rate": 1.7570019360568117e-05, + "loss": 0.5716, + "step": 3075 + }, + { + "epoch": 0.25, + "grad_norm": 0.9623272014338214, + "learning_rate": 1.7568299007518247e-05, + "loss": 0.5663, + "step": 3076 + }, + { + "epoch": 0.25, + "grad_norm": 0.9888262495455644, + "learning_rate": 1.7566578129991966e-05, + "loss": 0.6225, + "step": 3077 + }, + { + "epoch": 0.25, + "grad_norm": 0.892592052293103, + "learning_rate": 1.756485672810853e-05, + "loss": 0.5885, + "step": 3078 + }, + { + "epoch": 0.25, + "grad_norm": 0.9551702736887312, + "learning_rate": 1.7563134801987235e-05, + "loss": 0.6147, + "step": 3079 + }, + { + "epoch": 0.25, + "grad_norm": 0.9954287831755133, + "learning_rate": 1.7561412351747406e-05, + "loss": 0.5876, + "step": 3080 + }, + { + "epoch": 0.25, + "grad_norm": 0.9158818809267135, + "learning_rate": 1.7559689377508413e-05, + "loss": 0.5339, + "step": 3081 + }, + { + "epoch": 0.25, + "grad_norm": 1.0174523252448435, + "learning_rate": 1.7557965879389644e-05, + "loss": 0.5493, + "step": 3082 + }, + { + "epoch": 0.25, + "grad_norm": 0.8947025622357997, + "learning_rate": 1.7556241857510547e-05, + "loss": 0.5544, + "step": 3083 + }, + { + "epoch": 0.25, + "grad_norm": 1.2270341670509768, + "learning_rate": 1.7554517311990592e-05, + "loss": 0.6078, + "step": 3084 + }, + { + "epoch": 0.25, + "grad_norm": 1.0170400035892802, + "learning_rate": 1.7552792242949287e-05, + "loss": 0.6123, + "step": 3085 + }, + { + "epoch": 0.25, + "grad_norm": 0.9139996199480314, + "learning_rate": 1.755106665050618e-05, + "loss": 0.5805, + "step": 3086 + }, + { + "epoch": 0.25, + "grad_norm": 0.9622769309264265, + "learning_rate": 1.7549340534780852e-05, + "loss": 0.6121, + "step": 3087 + }, + { + "epoch": 0.25, + "grad_norm": 0.9809266179096972, + "learning_rate": 1.754761389589292e-05, + "loss": 0.6003, + "step": 3088 + }, + { + "epoch": 0.25, + "grad_norm": 0.9361453357606391, + "learning_rate": 1.7545886733962044e-05, + "loss": 0.5877, + "step": 3089 + }, + { + "epoch": 0.25, + "grad_norm": 0.9521538902460409, + "learning_rate": 1.7544159049107902e-05, + "loss": 0.5804, + "step": 3090 + }, + { + "epoch": 0.25, + "grad_norm": 0.8638250061554852, + "learning_rate": 1.7542430841450236e-05, + "loss": 0.5138, + "step": 3091 + }, + { + "epoch": 0.25, + "grad_norm": 0.8724382049228834, + "learning_rate": 1.7540702111108803e-05, + "loss": 0.5124, + "step": 3092 + }, + { + "epoch": 0.25, + "grad_norm": 1.0254553888380893, + "learning_rate": 1.7538972858203397e-05, + "loss": 0.5292, + "step": 3093 + }, + { + "epoch": 0.25, + "grad_norm": 0.9330508544120452, + "learning_rate": 1.7537243082853866e-05, + "loss": 0.5537, + "step": 3094 + }, + { + "epoch": 0.25, + "grad_norm": 0.9432771599677009, + "learning_rate": 1.753551278518007e-05, + "loss": 0.579, + "step": 3095 + }, + { + "epoch": 0.25, + "grad_norm": 0.8919899667127843, + "learning_rate": 1.7533781965301924e-05, + "loss": 0.4923, + "step": 3096 + }, + { + "epoch": 0.25, + "grad_norm": 0.961077027530501, + "learning_rate": 1.753205062333937e-05, + "loss": 0.597, + "step": 3097 + }, + { + "epoch": 0.25, + "grad_norm": 0.9069964785933305, + "learning_rate": 1.753031875941239e-05, + "loss": 0.5856, + "step": 3098 + }, + { + "epoch": 0.25, + "grad_norm": 0.9659625727638037, + "learning_rate": 1.7528586373640997e-05, + "loss": 0.5876, + "step": 3099 + }, + { + "epoch": 0.25, + "grad_norm": 1.0598241751866497, + "learning_rate": 1.7526853466145248e-05, + "loss": 0.632, + "step": 3100 + }, + { + "epoch": 0.25, + "grad_norm": 1.0778620313469889, + "learning_rate": 1.7525120037045227e-05, + "loss": 0.5799, + "step": 3101 + }, + { + "epoch": 0.25, + "grad_norm": 0.8954798615992657, + "learning_rate": 1.7523386086461065e-05, + "loss": 0.6049, + "step": 3102 + }, + { + "epoch": 0.25, + "grad_norm": 0.935250732427432, + "learning_rate": 1.7521651614512918e-05, + "loss": 0.6179, + "step": 3103 + }, + { + "epoch": 0.25, + "grad_norm": 1.0023476270439846, + "learning_rate": 1.751991662132099e-05, + "loss": 0.5978, + "step": 3104 + }, + { + "epoch": 0.25, + "grad_norm": 0.9002821778685863, + "learning_rate": 1.751818110700551e-05, + "loss": 0.5656, + "step": 3105 + }, + { + "epoch": 0.25, + "grad_norm": 0.8800925304899958, + "learning_rate": 1.751644507168674e-05, + "loss": 0.5569, + "step": 3106 + }, + { + "epoch": 0.25, + "grad_norm": 0.9330859593957906, + "learning_rate": 1.7514708515485002e-05, + "loss": 0.6456, + "step": 3107 + }, + { + "epoch": 0.25, + "grad_norm": 0.9532235450165991, + "learning_rate": 1.7512971438520626e-05, + "loss": 0.5808, + "step": 3108 + }, + { + "epoch": 0.25, + "grad_norm": 0.8998905570101068, + "learning_rate": 1.7511233840913994e-05, + "loss": 0.5211, + "step": 3109 + }, + { + "epoch": 0.25, + "grad_norm": 0.9128687245400994, + "learning_rate": 1.7509495722785518e-05, + "loss": 0.5159, + "step": 3110 + }, + { + "epoch": 0.25, + "grad_norm": 1.075402357728703, + "learning_rate": 1.7507757084255652e-05, + "loss": 0.6144, + "step": 3111 + }, + { + "epoch": 0.25, + "grad_norm": 0.9429480058993237, + "learning_rate": 1.750601792544488e-05, + "loss": 0.5949, + "step": 3112 + }, + { + "epoch": 0.25, + "grad_norm": 0.9129715759556368, + "learning_rate": 1.750427824647372e-05, + "loss": 0.5553, + "step": 3113 + }, + { + "epoch": 0.25, + "grad_norm": 0.9127366900187914, + "learning_rate": 1.7502538047462737e-05, + "loss": 0.5545, + "step": 3114 + }, + { + "epoch": 0.25, + "grad_norm": 0.9397256695327718, + "learning_rate": 1.750079732853252e-05, + "loss": 0.5552, + "step": 3115 + }, + { + "epoch": 0.25, + "grad_norm": 1.0024073480566253, + "learning_rate": 1.74990560898037e-05, + "loss": 0.6514, + "step": 3116 + }, + { + "epoch": 0.25, + "grad_norm": 0.9620678706748186, + "learning_rate": 1.7497314331396946e-05, + "loss": 0.616, + "step": 3117 + }, + { + "epoch": 0.25, + "grad_norm": 0.980921040306892, + "learning_rate": 1.7495572053432962e-05, + "loss": 0.6389, + "step": 3118 + }, + { + "epoch": 0.25, + "grad_norm": 0.93016075302683, + "learning_rate": 1.749382925603248e-05, + "loss": 0.5954, + "step": 3119 + }, + { + "epoch": 0.25, + "grad_norm": 0.9513139305328434, + "learning_rate": 1.749208593931628e-05, + "loss": 0.5899, + "step": 3120 + }, + { + "epoch": 0.25, + "grad_norm": 0.9098228343772, + "learning_rate": 1.7490342103405168e-05, + "loss": 0.6347, + "step": 3121 + }, + { + "epoch": 0.25, + "grad_norm": 0.9982505813028052, + "learning_rate": 1.748859774841999e-05, + "loss": 0.6127, + "step": 3122 + }, + { + "epoch": 0.25, + "grad_norm": 0.9229136534442328, + "learning_rate": 1.748685287448163e-05, + "loss": 0.6267, + "step": 3123 + }, + { + "epoch": 0.25, + "grad_norm": 0.8754740252477136, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.6227, + "step": 3124 + }, + { + "epoch": 0.25, + "grad_norm": 0.9122049883525993, + "learning_rate": 1.748336157022908e-05, + "loss": 0.5726, + "step": 3125 + }, + { + "epoch": 0.25, + "grad_norm": 0.8813794257915903, + "learning_rate": 1.7481615140156837e-05, + "loss": 0.5837, + "step": 3126 + }, + { + "epoch": 0.25, + "grad_norm": 1.0392588455372822, + "learning_rate": 1.747986819161529e-05, + "loss": 0.5378, + "step": 3127 + }, + { + "epoch": 0.25, + "grad_norm": 1.063738607824746, + "learning_rate": 1.747812072472552e-05, + "loss": 0.6057, + "step": 3128 + }, + { + "epoch": 0.25, + "grad_norm": 1.0799665881792668, + "learning_rate": 1.7476372739608615e-05, + "loss": 0.6201, + "step": 3129 + }, + { + "epoch": 0.25, + "grad_norm": 0.922842839007906, + "learning_rate": 1.7474624236385706e-05, + "loss": 0.5879, + "step": 3130 + }, + { + "epoch": 0.25, + "grad_norm": 0.9427134960414176, + "learning_rate": 1.747287521517797e-05, + "loss": 0.5887, + "step": 3131 + }, + { + "epoch": 0.25, + "grad_norm": 0.9182269254202275, + "learning_rate": 1.7471125676106613e-05, + "loss": 0.6023, + "step": 3132 + }, + { + "epoch": 0.25, + "grad_norm": 0.992985791336242, + "learning_rate": 1.7469375619292873e-05, + "loss": 0.5516, + "step": 3133 + }, + { + "epoch": 0.25, + "grad_norm": 1.0000627224938892, + "learning_rate": 1.7467625044858025e-05, + "loss": 0.5806, + "step": 3134 + }, + { + "epoch": 0.25, + "grad_norm": 1.0464342928375443, + "learning_rate": 1.7465873952923386e-05, + "loss": 0.6352, + "step": 3135 + }, + { + "epoch": 0.25, + "grad_norm": 0.8489039252817476, + "learning_rate": 1.7464122343610307e-05, + "loss": 0.5423, + "step": 3136 + }, + { + "epoch": 0.25, + "grad_norm": 0.8344056548884692, + "learning_rate": 1.7462370217040167e-05, + "loss": 0.6058, + "step": 3137 + }, + { + "epoch": 0.26, + "grad_norm": 0.9413618313258731, + "learning_rate": 1.7460617573334393e-05, + "loss": 0.588, + "step": 3138 + }, + { + "epoch": 0.26, + "grad_norm": 1.0008486902893026, + "learning_rate": 1.7458864412614436e-05, + "loss": 0.6089, + "step": 3139 + }, + { + "epoch": 0.26, + "grad_norm": 0.9376106409264786, + "learning_rate": 1.745711073500179e-05, + "loss": 0.571, + "step": 3140 + }, + { + "epoch": 0.26, + "grad_norm": 0.887158908415457, + "learning_rate": 1.7455356540617988e-05, + "loss": 0.5797, + "step": 3141 + }, + { + "epoch": 0.26, + "grad_norm": 0.9221618609988214, + "learning_rate": 1.745360182958459e-05, + "loss": 0.5654, + "step": 3142 + }, + { + "epoch": 0.26, + "grad_norm": 0.9051428050775634, + "learning_rate": 1.7451846602023196e-05, + "loss": 0.5332, + "step": 3143 + }, + { + "epoch": 0.26, + "grad_norm": 0.972614082330035, + "learning_rate": 1.745009085805544e-05, + "loss": 0.5517, + "step": 3144 + }, + { + "epoch": 0.26, + "grad_norm": 1.0267964133100653, + "learning_rate": 1.7448334597803e-05, + "loss": 0.5996, + "step": 3145 + }, + { + "epoch": 0.26, + "grad_norm": 0.8669897880071976, + "learning_rate": 1.7446577821387575e-05, + "loss": 0.5437, + "step": 3146 + }, + { + "epoch": 0.26, + "grad_norm": 0.9444554670732166, + "learning_rate": 1.7444820528930914e-05, + "loss": 0.5424, + "step": 3147 + }, + { + "epoch": 0.26, + "grad_norm": 1.1496699362879719, + "learning_rate": 1.7443062720554796e-05, + "loss": 0.5784, + "step": 3148 + }, + { + "epoch": 0.26, + "grad_norm": 1.0157973115543528, + "learning_rate": 1.744130439638103e-05, + "loss": 0.634, + "step": 3149 + }, + { + "epoch": 0.26, + "grad_norm": 0.8366603399421715, + "learning_rate": 1.7439545556531473e-05, + "loss": 0.5062, + "step": 3150 + }, + { + "epoch": 0.26, + "grad_norm": 0.97088446679413, + "learning_rate": 1.7437786201128003e-05, + "loss": 0.5379, + "step": 3151 + }, + { + "epoch": 0.26, + "grad_norm": 0.9939309659717451, + "learning_rate": 1.743602633029255e-05, + "loss": 0.5846, + "step": 3152 + }, + { + "epoch": 0.26, + "grad_norm": 0.8990135518929009, + "learning_rate": 1.7434265944147068e-05, + "loss": 0.5164, + "step": 3153 + }, + { + "epoch": 0.26, + "grad_norm": 1.019817941557951, + "learning_rate": 1.743250504281355e-05, + "loss": 0.6309, + "step": 3154 + }, + { + "epoch": 0.26, + "grad_norm": 0.8928122311880572, + "learning_rate": 1.7430743626414024e-05, + "loss": 0.4856, + "step": 3155 + }, + { + "epoch": 0.26, + "grad_norm": 0.9782714945133913, + "learning_rate": 1.7428981695070558e-05, + "loss": 0.6257, + "step": 3156 + }, + { + "epoch": 0.26, + "grad_norm": 0.8879693848442041, + "learning_rate": 1.7427219248905246e-05, + "loss": 0.5216, + "step": 3157 + }, + { + "epoch": 0.26, + "grad_norm": 1.089049966696691, + "learning_rate": 1.7425456288040236e-05, + "loss": 0.6037, + "step": 3158 + }, + { + "epoch": 0.26, + "grad_norm": 0.9056987893721266, + "learning_rate": 1.7423692812597682e-05, + "loss": 0.5546, + "step": 3159 + }, + { + "epoch": 0.26, + "grad_norm": 1.0574500523825905, + "learning_rate": 1.7421928822699805e-05, + "loss": 0.5797, + "step": 3160 + }, + { + "epoch": 0.26, + "grad_norm": 1.0050064235457727, + "learning_rate": 1.7420164318468845e-05, + "loss": 0.6173, + "step": 3161 + }, + { + "epoch": 0.26, + "grad_norm": 0.9148162925439718, + "learning_rate": 1.741839930002708e-05, + "loss": 0.5662, + "step": 3162 + }, + { + "epoch": 0.26, + "grad_norm": 0.8762180869341742, + "learning_rate": 1.741663376749682e-05, + "loss": 0.5885, + "step": 3163 + }, + { + "epoch": 0.26, + "grad_norm": 0.8594642521189727, + "learning_rate": 1.7414867721000423e-05, + "loss": 0.5508, + "step": 3164 + }, + { + "epoch": 0.26, + "grad_norm": 0.9794014670832364, + "learning_rate": 1.7413101160660267e-05, + "loss": 0.5757, + "step": 3165 + }, + { + "epoch": 0.26, + "grad_norm": 0.9907022513665806, + "learning_rate": 1.7411334086598775e-05, + "loss": 0.5629, + "step": 3166 + }, + { + "epoch": 0.26, + "grad_norm": 0.8991676755470959, + "learning_rate": 1.7409566498938405e-05, + "loss": 0.5953, + "step": 3167 + }, + { + "epoch": 0.26, + "grad_norm": 1.091552960265873, + "learning_rate": 1.740779839780165e-05, + "loss": 0.5151, + "step": 3168 + }, + { + "epoch": 0.26, + "grad_norm": 1.0550607969869947, + "learning_rate": 1.7406029783311036e-05, + "loss": 0.6001, + "step": 3169 + }, + { + "epoch": 0.26, + "grad_norm": 1.023053375591623, + "learning_rate": 1.7404260655589128e-05, + "loss": 0.6253, + "step": 3170 + }, + { + "epoch": 0.26, + "grad_norm": 1.0158274844468704, + "learning_rate": 1.7402491014758526e-05, + "loss": 0.565, + "step": 3171 + }, + { + "epoch": 0.26, + "grad_norm": 0.8856276226898813, + "learning_rate": 1.740072086094186e-05, + "loss": 0.5784, + "step": 3172 + }, + { + "epoch": 0.26, + "grad_norm": 0.9297964339077929, + "learning_rate": 1.7398950194261803e-05, + "loss": 0.4888, + "step": 3173 + }, + { + "epoch": 0.26, + "grad_norm": 1.0394334239483127, + "learning_rate": 1.739717901484106e-05, + "loss": 0.6169, + "step": 3174 + }, + { + "epoch": 0.26, + "grad_norm": 1.000153477231492, + "learning_rate": 1.7395407322802374e-05, + "loss": 0.581, + "step": 3175 + }, + { + "epoch": 0.26, + "grad_norm": 0.9310772778163637, + "learning_rate": 1.739363511826852e-05, + "loss": 0.5245, + "step": 3176 + }, + { + "epoch": 0.26, + "grad_norm": 0.9003978982215697, + "learning_rate": 1.739186240136231e-05, + "loss": 0.5652, + "step": 3177 + }, + { + "epoch": 0.26, + "grad_norm": 1.0088358143669862, + "learning_rate": 1.7390089172206594e-05, + "loss": 0.6085, + "step": 3178 + }, + { + "epoch": 0.26, + "grad_norm": 0.9211698202843335, + "learning_rate": 1.7388315430924253e-05, + "loss": 0.5584, + "step": 3179 + }, + { + "epoch": 0.26, + "grad_norm": 0.8748172476650534, + "learning_rate": 1.73865411776382e-05, + "loss": 0.5751, + "step": 3180 + }, + { + "epoch": 0.26, + "grad_norm": 0.8417660121346013, + "learning_rate": 1.7384766412471405e-05, + "loss": 0.5499, + "step": 3181 + }, + { + "epoch": 0.26, + "grad_norm": 0.9408868417098869, + "learning_rate": 1.7382991135546842e-05, + "loss": 0.6425, + "step": 3182 + }, + { + "epoch": 0.26, + "grad_norm": 0.940025872410451, + "learning_rate": 1.7381215346987538e-05, + "loss": 0.5604, + "step": 3183 + }, + { + "epoch": 0.26, + "grad_norm": 0.8701094523091236, + "learning_rate": 1.7379439046916564e-05, + "loss": 0.5617, + "step": 3184 + }, + { + "epoch": 0.26, + "grad_norm": 0.9589378666465185, + "learning_rate": 1.737766223545701e-05, + "loss": 0.5457, + "step": 3185 + }, + { + "epoch": 0.26, + "grad_norm": 0.8717363773584366, + "learning_rate": 1.7375884912732004e-05, + "loss": 0.5277, + "step": 3186 + }, + { + "epoch": 0.26, + "grad_norm": 0.9283899269806596, + "learning_rate": 1.7374107078864716e-05, + "loss": 0.6118, + "step": 3187 + }, + { + "epoch": 0.26, + "grad_norm": 0.9544424219775307, + "learning_rate": 1.7372328733978348e-05, + "loss": 0.5704, + "step": 3188 + }, + { + "epoch": 0.26, + "grad_norm": 0.9255212308304944, + "learning_rate": 1.737054987819614e-05, + "loss": 0.6063, + "step": 3189 + }, + { + "epoch": 0.26, + "grad_norm": 0.9358163965712596, + "learning_rate": 1.7368770511641365e-05, + "loss": 0.5423, + "step": 3190 + }, + { + "epoch": 0.26, + "grad_norm": 0.9010266984747565, + "learning_rate": 1.7366990634437328e-05, + "loss": 0.6014, + "step": 3191 + }, + { + "epoch": 0.26, + "grad_norm": 0.9571620756129031, + "learning_rate": 1.736521024670737e-05, + "loss": 0.5905, + "step": 3192 + }, + { + "epoch": 0.26, + "grad_norm": 0.9217502531680845, + "learning_rate": 1.736342934857488e-05, + "loss": 0.6337, + "step": 3193 + }, + { + "epoch": 0.26, + "grad_norm": 0.8466450609798067, + "learning_rate": 1.7361647940163266e-05, + "loss": 0.5326, + "step": 3194 + }, + { + "epoch": 0.26, + "grad_norm": 0.9985771457839538, + "learning_rate": 1.735986602159598e-05, + "loss": 0.6586, + "step": 3195 + }, + { + "epoch": 0.26, + "grad_norm": 0.9246511819018729, + "learning_rate": 1.7358083592996507e-05, + "loss": 0.5962, + "step": 3196 + }, + { + "epoch": 0.26, + "grad_norm": 0.9695534053036614, + "learning_rate": 1.7356300654488367e-05, + "loss": 0.6388, + "step": 3197 + }, + { + "epoch": 0.26, + "grad_norm": 0.8667598153295203, + "learning_rate": 1.7354517206195115e-05, + "loss": 0.561, + "step": 3198 + }, + { + "epoch": 0.26, + "grad_norm": 0.8842929810545559, + "learning_rate": 1.7352733248240347e-05, + "loss": 0.543, + "step": 3199 + }, + { + "epoch": 0.26, + "grad_norm": 0.893497524895569, + "learning_rate": 1.7350948780747684e-05, + "loss": 0.5941, + "step": 3200 + }, + { + "epoch": 0.26, + "grad_norm": 0.8583532551872464, + "learning_rate": 1.734916380384079e-05, + "loss": 0.512, + "step": 3201 + }, + { + "epoch": 0.26, + "grad_norm": 1.0429301416865169, + "learning_rate": 1.7347378317643368e-05, + "loss": 0.6715, + "step": 3202 + }, + { + "epoch": 0.26, + "grad_norm": 0.9002956701588745, + "learning_rate": 1.7345592322279143e-05, + "loss": 0.5906, + "step": 3203 + }, + { + "epoch": 0.26, + "grad_norm": 1.0766543533538397, + "learning_rate": 1.7343805817871885e-05, + "loss": 0.614, + "step": 3204 + }, + { + "epoch": 0.26, + "grad_norm": 0.9758413415717146, + "learning_rate": 1.73420188045454e-05, + "loss": 0.6033, + "step": 3205 + }, + { + "epoch": 0.26, + "grad_norm": 1.0511316691987547, + "learning_rate": 1.734023128242352e-05, + "loss": 0.5785, + "step": 3206 + }, + { + "epoch": 0.26, + "grad_norm": 0.8933442945857915, + "learning_rate": 1.7338443251630125e-05, + "loss": 0.5391, + "step": 3207 + }, + { + "epoch": 0.26, + "grad_norm": 0.8938768216765625, + "learning_rate": 1.7336654712289125e-05, + "loss": 0.5729, + "step": 3208 + }, + { + "epoch": 0.26, + "grad_norm": 0.9178495968780668, + "learning_rate": 1.733486566452446e-05, + "loss": 0.6117, + "step": 3209 + }, + { + "epoch": 0.26, + "grad_norm": 1.0081580578635072, + "learning_rate": 1.733307610846011e-05, + "loss": 0.549, + "step": 3210 + }, + { + "epoch": 0.26, + "grad_norm": 0.9144248732928212, + "learning_rate": 1.7331286044220086e-05, + "loss": 0.5454, + "step": 3211 + }, + { + "epoch": 0.26, + "grad_norm": 1.0079526594157142, + "learning_rate": 1.7329495471928446e-05, + "loss": 0.5632, + "step": 3212 + }, + { + "epoch": 0.26, + "grad_norm": 0.9673535210182386, + "learning_rate": 1.732770439170927e-05, + "loss": 0.5216, + "step": 3213 + }, + { + "epoch": 0.26, + "grad_norm": 0.8366252919513592, + "learning_rate": 1.732591280368668e-05, + "loss": 0.5358, + "step": 3214 + }, + { + "epoch": 0.26, + "grad_norm": 0.8997963514421738, + "learning_rate": 1.732412070798483e-05, + "loss": 0.544, + "step": 3215 + }, + { + "epoch": 0.26, + "grad_norm": 1.0674135355568224, + "learning_rate": 1.732232810472791e-05, + "loss": 0.6194, + "step": 3216 + }, + { + "epoch": 0.26, + "grad_norm": 0.9130503380839831, + "learning_rate": 1.7320534994040148e-05, + "loss": 0.5702, + "step": 3217 + }, + { + "epoch": 0.26, + "grad_norm": 0.9125939393730079, + "learning_rate": 1.7318741376045806e-05, + "loss": 0.5232, + "step": 3218 + }, + { + "epoch": 0.26, + "grad_norm": 0.9152888605297167, + "learning_rate": 1.731694725086918e-05, + "loss": 0.5589, + "step": 3219 + }, + { + "epoch": 0.26, + "grad_norm": 0.918053811287748, + "learning_rate": 1.7315152618634594e-05, + "loss": 0.5515, + "step": 3220 + }, + { + "epoch": 0.26, + "grad_norm": 0.9517138283797776, + "learning_rate": 1.731335747946642e-05, + "loss": 0.491, + "step": 3221 + }, + { + "epoch": 0.26, + "grad_norm": 0.9695937631844701, + "learning_rate": 1.7311561833489065e-05, + "loss": 0.6387, + "step": 3222 + }, + { + "epoch": 0.26, + "grad_norm": 0.9344378625169966, + "learning_rate": 1.730976568082696e-05, + "loss": 0.5961, + "step": 3223 + }, + { + "epoch": 0.26, + "grad_norm": 0.9918672455564193, + "learning_rate": 1.7307969021604574e-05, + "loss": 0.5898, + "step": 3224 + }, + { + "epoch": 0.26, + "grad_norm": 0.9510075197303961, + "learning_rate": 1.730617185594642e-05, + "loss": 0.6031, + "step": 3225 + }, + { + "epoch": 0.26, + "grad_norm": 0.9366114096543621, + "learning_rate": 1.7304374183977032e-05, + "loss": 0.6217, + "step": 3226 + }, + { + "epoch": 0.26, + "grad_norm": 1.0382255870102077, + "learning_rate": 1.7302576005820997e-05, + "loss": 0.6121, + "step": 3227 + }, + { + "epoch": 0.26, + "grad_norm": 0.8940002944032726, + "learning_rate": 1.730077732160292e-05, + "loss": 0.5346, + "step": 3228 + }, + { + "epoch": 0.26, + "grad_norm": 0.9389001006992637, + "learning_rate": 1.729897813144745e-05, + "loss": 0.6312, + "step": 3229 + }, + { + "epoch": 0.26, + "grad_norm": 1.027925431969123, + "learning_rate": 1.729717843547927e-05, + "loss": 0.6487, + "step": 3230 + }, + { + "epoch": 0.26, + "grad_norm": 0.973280328221155, + "learning_rate": 1.7295378233823096e-05, + "loss": 0.5505, + "step": 3231 + }, + { + "epoch": 0.26, + "grad_norm": 0.8950161903045436, + "learning_rate": 1.7293577526603684e-05, + "loss": 0.5313, + "step": 3232 + }, + { + "epoch": 0.26, + "grad_norm": 1.0185304119690635, + "learning_rate": 1.7291776313945817e-05, + "loss": 0.5965, + "step": 3233 + }, + { + "epoch": 0.26, + "grad_norm": 0.9786576159121729, + "learning_rate": 1.728997459597432e-05, + "loss": 0.5804, + "step": 3234 + }, + { + "epoch": 0.26, + "grad_norm": 1.0669721345581529, + "learning_rate": 1.728817237281405e-05, + "loss": 0.5765, + "step": 3235 + }, + { + "epoch": 0.26, + "grad_norm": 0.9704256322697749, + "learning_rate": 1.7286369644589897e-05, + "loss": 0.631, + "step": 3236 + }, + { + "epoch": 0.26, + "grad_norm": 0.9319412904056394, + "learning_rate": 1.728456641142679e-05, + "loss": 0.5408, + "step": 3237 + }, + { + "epoch": 0.26, + "grad_norm": 1.0092120792907255, + "learning_rate": 1.7282762673449695e-05, + "loss": 0.6245, + "step": 3238 + }, + { + "epoch": 0.26, + "grad_norm": 0.9394081643734624, + "learning_rate": 1.7280958430783608e-05, + "loss": 0.6109, + "step": 3239 + }, + { + "epoch": 0.26, + "grad_norm": 0.9187040965972737, + "learning_rate": 1.7279153683553556e-05, + "loss": 0.574, + "step": 3240 + }, + { + "epoch": 0.26, + "grad_norm": 0.890777039494145, + "learning_rate": 1.7277348431884613e-05, + "loss": 0.5282, + "step": 3241 + }, + { + "epoch": 0.26, + "grad_norm": 0.9816871876859573, + "learning_rate": 1.7275542675901876e-05, + "loss": 0.5562, + "step": 3242 + }, + { + "epoch": 0.26, + "grad_norm": 1.0160416140142166, + "learning_rate": 1.7273736415730488e-05, + "loss": 0.6288, + "step": 3243 + }, + { + "epoch": 0.26, + "grad_norm": 0.976610433292306, + "learning_rate": 1.7271929651495617e-05, + "loss": 0.5985, + "step": 3244 + }, + { + "epoch": 0.26, + "grad_norm": 0.9581355875399314, + "learning_rate": 1.7270122383322473e-05, + "loss": 0.6334, + "step": 3245 + }, + { + "epoch": 0.26, + "grad_norm": 1.1556542224283102, + "learning_rate": 1.7268314611336296e-05, + "loss": 0.5319, + "step": 3246 + }, + { + "epoch": 0.26, + "grad_norm": 1.0106194182617843, + "learning_rate": 1.726650633566236e-05, + "loss": 0.6098, + "step": 3247 + }, + { + "epoch": 0.26, + "grad_norm": 0.9072489025762995, + "learning_rate": 1.726469755642598e-05, + "loss": 0.5464, + "step": 3248 + }, + { + "epoch": 0.26, + "grad_norm": 0.8495528879265195, + "learning_rate": 1.7262888273752505e-05, + "loss": 0.5391, + "step": 3249 + }, + { + "epoch": 0.26, + "grad_norm": 0.9946732239643433, + "learning_rate": 1.7261078487767317e-05, + "loss": 0.5863, + "step": 3250 + }, + { + "epoch": 0.26, + "grad_norm": 0.9445316998047976, + "learning_rate": 1.7259268198595828e-05, + "loss": 0.5619, + "step": 3251 + }, + { + "epoch": 0.26, + "grad_norm": 0.9169313514431681, + "learning_rate": 1.7257457406363495e-05, + "loss": 0.5631, + "step": 3252 + }, + { + "epoch": 0.26, + "grad_norm": 0.9302146760899365, + "learning_rate": 1.72556461111958e-05, + "loss": 0.5708, + "step": 3253 + }, + { + "epoch": 0.26, + "grad_norm": 0.9320385142628118, + "learning_rate": 1.725383431321826e-05, + "loss": 0.6121, + "step": 3254 + }, + { + "epoch": 0.26, + "grad_norm": 0.9053121367203344, + "learning_rate": 1.725202201255644e-05, + "loss": 0.5386, + "step": 3255 + }, + { + "epoch": 0.26, + "grad_norm": 0.9140031954486985, + "learning_rate": 1.725020920933593e-05, + "loss": 0.6397, + "step": 3256 + }, + { + "epoch": 0.26, + "grad_norm": 1.0485591789305269, + "learning_rate": 1.7248395903682347e-05, + "loss": 0.6717, + "step": 3257 + }, + { + "epoch": 0.26, + "grad_norm": 0.9622445974231173, + "learning_rate": 1.724658209572136e-05, + "loss": 0.5676, + "step": 3258 + }, + { + "epoch": 0.26, + "grad_norm": 0.9008820906676879, + "learning_rate": 1.724476778557866e-05, + "loss": 0.5995, + "step": 3259 + }, + { + "epoch": 0.26, + "grad_norm": 0.8493129216065038, + "learning_rate": 1.7242952973379983e-05, + "loss": 0.536, + "step": 3260 + }, + { + "epoch": 0.27, + "grad_norm": 1.0225594519850092, + "learning_rate": 1.7241137659251087e-05, + "loss": 0.5215, + "step": 3261 + }, + { + "epoch": 0.27, + "grad_norm": 0.9949755006324897, + "learning_rate": 1.723932184331777e-05, + "loss": 0.5143, + "step": 3262 + }, + { + "epoch": 0.27, + "grad_norm": 1.0648617924766293, + "learning_rate": 1.7237505525705875e-05, + "loss": 0.6225, + "step": 3263 + }, + { + "epoch": 0.27, + "grad_norm": 0.9205169832526819, + "learning_rate": 1.7235688706541266e-05, + "loss": 0.5073, + "step": 3264 + }, + { + "epoch": 0.27, + "grad_norm": 1.0317464947048756, + "learning_rate": 1.723387138594985e-05, + "loss": 0.586, + "step": 3265 + }, + { + "epoch": 0.27, + "grad_norm": 0.9378484870577685, + "learning_rate": 1.723205356405756e-05, + "loss": 0.5489, + "step": 3266 + }, + { + "epoch": 0.27, + "grad_norm": 0.9899383610437812, + "learning_rate": 1.7230235240990373e-05, + "loss": 0.565, + "step": 3267 + }, + { + "epoch": 0.27, + "grad_norm": 1.0498755184573376, + "learning_rate": 1.72284164168743e-05, + "loss": 0.5998, + "step": 3268 + }, + { + "epoch": 0.27, + "grad_norm": 0.9073391433511635, + "learning_rate": 1.7226597091835377e-05, + "loss": 0.5493, + "step": 3269 + }, + { + "epoch": 0.27, + "grad_norm": 0.8451787420231008, + "learning_rate": 1.7224777265999688e-05, + "loss": 0.5525, + "step": 3270 + }, + { + "epoch": 0.27, + "grad_norm": 0.9488853496030325, + "learning_rate": 1.722295693949334e-05, + "loss": 0.5827, + "step": 3271 + }, + { + "epoch": 0.27, + "grad_norm": 1.0495687319964833, + "learning_rate": 1.7221136112442487e-05, + "loss": 0.6238, + "step": 3272 + }, + { + "epoch": 0.27, + "grad_norm": 0.94853336956771, + "learning_rate": 1.7219314784973304e-05, + "loss": 0.523, + "step": 3273 + }, + { + "epoch": 0.27, + "grad_norm": 0.9628064262893254, + "learning_rate": 1.721749295721201e-05, + "loss": 0.5904, + "step": 3274 + }, + { + "epoch": 0.27, + "grad_norm": 0.9345929252013101, + "learning_rate": 1.7215670629284856e-05, + "loss": 0.5494, + "step": 3275 + }, + { + "epoch": 0.27, + "grad_norm": 0.8830597865479197, + "learning_rate": 1.7213847801318128e-05, + "loss": 0.5605, + "step": 3276 + }, + { + "epoch": 0.27, + "grad_norm": 0.9572622089997017, + "learning_rate": 1.7212024473438145e-05, + "loss": 0.6316, + "step": 3277 + }, + { + "epoch": 0.27, + "grad_norm": 0.9562194162392834, + "learning_rate": 1.7210200645771268e-05, + "loss": 0.5326, + "step": 3278 + }, + { + "epoch": 0.27, + "grad_norm": 1.0553393174265637, + "learning_rate": 1.7208376318443877e-05, + "loss": 0.634, + "step": 3279 + }, + { + "epoch": 0.27, + "grad_norm": 0.9970413860899692, + "learning_rate": 1.72065514915824e-05, + "loss": 0.6207, + "step": 3280 + }, + { + "epoch": 0.27, + "grad_norm": 0.9318499998745525, + "learning_rate": 1.72047261653133e-05, + "loss": 0.5454, + "step": 3281 + }, + { + "epoch": 0.27, + "grad_norm": 0.9223364403973837, + "learning_rate": 1.7202900339763066e-05, + "loss": 0.6575, + "step": 3282 + }, + { + "epoch": 0.27, + "grad_norm": 0.929898055654162, + "learning_rate": 1.7201074015058226e-05, + "loss": 0.5548, + "step": 3283 + }, + { + "epoch": 0.27, + "grad_norm": 0.9324984663522081, + "learning_rate": 1.7199247191325347e-05, + "loss": 0.6458, + "step": 3284 + }, + { + "epoch": 0.27, + "grad_norm": 0.9534813809660702, + "learning_rate": 1.7197419868691022e-05, + "loss": 0.5562, + "step": 3285 + }, + { + "epoch": 0.27, + "grad_norm": 0.953733613695882, + "learning_rate": 1.719559204728188e-05, + "loss": 0.6061, + "step": 3286 + }, + { + "epoch": 0.27, + "grad_norm": 0.9526278381395721, + "learning_rate": 1.7193763727224596e-05, + "loss": 0.6133, + "step": 3287 + }, + { + "epoch": 0.27, + "grad_norm": 0.9974347688291391, + "learning_rate": 1.719193490864587e-05, + "loss": 0.611, + "step": 3288 + }, + { + "epoch": 0.27, + "grad_norm": 0.9452782503892461, + "learning_rate": 1.719010559167243e-05, + "loss": 0.5824, + "step": 3289 + }, + { + "epoch": 0.27, + "grad_norm": 0.8358480245034036, + "learning_rate": 1.7188275776431048e-05, + "loss": 0.5364, + "step": 3290 + }, + { + "epoch": 0.27, + "grad_norm": 0.918407324135863, + "learning_rate": 1.7186445463048533e-05, + "loss": 0.5966, + "step": 3291 + }, + { + "epoch": 0.27, + "grad_norm": 0.9047942682262397, + "learning_rate": 1.7184614651651723e-05, + "loss": 0.5637, + "step": 3292 + }, + { + "epoch": 0.27, + "grad_norm": 0.9728046891966552, + "learning_rate": 1.718278334236749e-05, + "loss": 0.6377, + "step": 3293 + }, + { + "epoch": 0.27, + "grad_norm": 0.9405103120350403, + "learning_rate": 1.7180951535322742e-05, + "loss": 0.5708, + "step": 3294 + }, + { + "epoch": 0.27, + "grad_norm": 0.8591298849891674, + "learning_rate": 1.717911923064442e-05, + "loss": 0.5249, + "step": 3295 + }, + { + "epoch": 0.27, + "grad_norm": 0.8752323427602121, + "learning_rate": 1.7177286428459505e-05, + "loss": 0.5671, + "step": 3296 + }, + { + "epoch": 0.27, + "grad_norm": 0.9414564811464446, + "learning_rate": 1.717545312889501e-05, + "loss": 0.5671, + "step": 3297 + }, + { + "epoch": 0.27, + "grad_norm": 1.02907718626476, + "learning_rate": 1.7173619332077972e-05, + "loss": 0.6245, + "step": 3298 + }, + { + "epoch": 0.27, + "grad_norm": 0.883595641347089, + "learning_rate": 1.717178503813548e-05, + "loss": 0.58, + "step": 3299 + }, + { + "epoch": 0.27, + "grad_norm": 0.9260784685712767, + "learning_rate": 1.7169950247194646e-05, + "loss": 0.5683, + "step": 3300 + }, + { + "epoch": 0.27, + "grad_norm": 0.9018370832687636, + "learning_rate": 1.716811495938262e-05, + "loss": 0.6049, + "step": 3301 + }, + { + "epoch": 0.27, + "grad_norm": 0.9249549260329984, + "learning_rate": 1.716627917482658e-05, + "loss": 0.5928, + "step": 3302 + }, + { + "epoch": 0.27, + "grad_norm": 0.950760435439383, + "learning_rate": 1.716444289365376e-05, + "loss": 0.592, + "step": 3303 + }, + { + "epoch": 0.27, + "grad_norm": 1.0057267824279712, + "learning_rate": 1.7162606115991395e-05, + "loss": 0.6063, + "step": 3304 + }, + { + "epoch": 0.27, + "grad_norm": 0.9109873091350366, + "learning_rate": 1.7160768841966785e-05, + "loss": 0.5169, + "step": 3305 + }, + { + "epoch": 0.27, + "grad_norm": 0.9393355629622872, + "learning_rate": 1.7158931071707242e-05, + "loss": 0.6203, + "step": 3306 + }, + { + "epoch": 0.27, + "grad_norm": 0.9332032789248998, + "learning_rate": 1.7157092805340126e-05, + "loss": 0.5535, + "step": 3307 + }, + { + "epoch": 0.27, + "grad_norm": 0.9745364105146304, + "learning_rate": 1.7155254042992827e-05, + "loss": 0.554, + "step": 3308 + }, + { + "epoch": 0.27, + "grad_norm": 0.8993080723884361, + "learning_rate": 1.715341478479277e-05, + "loss": 0.5725, + "step": 3309 + }, + { + "epoch": 0.27, + "grad_norm": 0.9404230807613553, + "learning_rate": 1.715157503086741e-05, + "loss": 0.5964, + "step": 3310 + }, + { + "epoch": 0.27, + "grad_norm": 1.0544615187533903, + "learning_rate": 1.7149734781344247e-05, + "loss": 0.6241, + "step": 3311 + }, + { + "epoch": 0.27, + "grad_norm": 0.9740583185354336, + "learning_rate": 1.7147894036350804e-05, + "loss": 0.559, + "step": 3312 + }, + { + "epoch": 0.27, + "grad_norm": 0.906313386144417, + "learning_rate": 1.7146052796014646e-05, + "loss": 0.5712, + "step": 3313 + }, + { + "epoch": 0.27, + "grad_norm": 0.922204166116946, + "learning_rate": 1.7144211060463368e-05, + "loss": 0.5948, + "step": 3314 + }, + { + "epoch": 0.27, + "grad_norm": 0.9019297900829841, + "learning_rate": 1.7142368829824602e-05, + "loss": 0.5747, + "step": 3315 + }, + { + "epoch": 0.27, + "grad_norm": 0.8758645949867254, + "learning_rate": 1.714052610422601e-05, + "loss": 0.575, + "step": 3316 + }, + { + "epoch": 0.27, + "grad_norm": 0.9339216019045296, + "learning_rate": 1.7138682883795292e-05, + "loss": 0.5893, + "step": 3317 + }, + { + "epoch": 0.27, + "grad_norm": 0.892853688298582, + "learning_rate": 1.713683916866018e-05, + "loss": 0.5385, + "step": 3318 + }, + { + "epoch": 0.27, + "grad_norm": 0.8869662452289221, + "learning_rate": 1.7134994958948444e-05, + "loss": 0.5819, + "step": 3319 + }, + { + "epoch": 0.27, + "grad_norm": 0.9363151281743876, + "learning_rate": 1.713315025478789e-05, + "loss": 0.5378, + "step": 3320 + }, + { + "epoch": 0.27, + "grad_norm": 0.9454884558940029, + "learning_rate": 1.713130505630635e-05, + "loss": 0.5796, + "step": 3321 + }, + { + "epoch": 0.27, + "grad_norm": 0.9600187437555181, + "learning_rate": 1.7129459363631692e-05, + "loss": 0.5741, + "step": 3322 + }, + { + "epoch": 0.27, + "grad_norm": 0.9129181561719604, + "learning_rate": 1.7127613176891824e-05, + "loss": 0.5531, + "step": 3323 + }, + { + "epoch": 0.27, + "grad_norm": 0.8436607312653893, + "learning_rate": 1.7125766496214687e-05, + "loss": 0.5581, + "step": 3324 + }, + { + "epoch": 0.27, + "grad_norm": 0.9634247002821723, + "learning_rate": 1.712391932172825e-05, + "loss": 0.5728, + "step": 3325 + }, + { + "epoch": 0.27, + "grad_norm": 0.9110565883221894, + "learning_rate": 1.712207165356053e-05, + "loss": 0.6041, + "step": 3326 + }, + { + "epoch": 0.27, + "grad_norm": 0.8991361572067563, + "learning_rate": 1.7120223491839553e-05, + "loss": 0.5818, + "step": 3327 + }, + { + "epoch": 0.27, + "grad_norm": 0.9050015040122568, + "learning_rate": 1.7118374836693407e-05, + "loss": 0.6224, + "step": 3328 + }, + { + "epoch": 0.27, + "grad_norm": 0.8981409776838637, + "learning_rate": 1.71165256882502e-05, + "loss": 0.616, + "step": 3329 + }, + { + "epoch": 0.27, + "grad_norm": 0.9038761710482833, + "learning_rate": 1.7114676046638076e-05, + "loss": 0.577, + "step": 3330 + }, + { + "epoch": 0.27, + "grad_norm": 0.9331379208091041, + "learning_rate": 1.7112825911985207e-05, + "loss": 0.5678, + "step": 3331 + }, + { + "epoch": 0.27, + "grad_norm": 0.9866541584168005, + "learning_rate": 1.7110975284419814e-05, + "loss": 0.6266, + "step": 3332 + }, + { + "epoch": 0.27, + "grad_norm": 0.938458776562073, + "learning_rate": 1.7109124164070144e-05, + "loss": 0.5473, + "step": 3333 + }, + { + "epoch": 0.27, + "grad_norm": 0.9282242071375044, + "learning_rate": 1.710727255106447e-05, + "loss": 0.5782, + "step": 3334 + }, + { + "epoch": 0.27, + "grad_norm": 0.9217922793401815, + "learning_rate": 1.710542044553112e-05, + "loss": 0.5536, + "step": 3335 + }, + { + "epoch": 0.27, + "grad_norm": 0.8435229885105086, + "learning_rate": 1.710356784759843e-05, + "loss": 0.5292, + "step": 3336 + }, + { + "epoch": 0.27, + "grad_norm": 0.8888554609533063, + "learning_rate": 1.7101714757394792e-05, + "loss": 0.6026, + "step": 3337 + }, + { + "epoch": 0.27, + "grad_norm": 0.9747928015600499, + "learning_rate": 1.7099861175048617e-05, + "loss": 0.6023, + "step": 3338 + }, + { + "epoch": 0.27, + "grad_norm": 1.0846321842053752, + "learning_rate": 1.7098007100688362e-05, + "loss": 0.5917, + "step": 3339 + }, + { + "epoch": 0.27, + "grad_norm": 0.9185007365976529, + "learning_rate": 1.7096152534442515e-05, + "loss": 0.5267, + "step": 3340 + }, + { + "epoch": 0.27, + "grad_norm": 0.8376651511974664, + "learning_rate": 1.7094297476439585e-05, + "loss": 0.5167, + "step": 3341 + }, + { + "epoch": 0.27, + "grad_norm": 0.8460925887281229, + "learning_rate": 1.7092441926808138e-05, + "loss": 0.5659, + "step": 3342 + }, + { + "epoch": 0.27, + "grad_norm": 0.9936106466309771, + "learning_rate": 1.7090585885676753e-05, + "loss": 0.5279, + "step": 3343 + }, + { + "epoch": 0.27, + "grad_norm": 1.064099435320851, + "learning_rate": 1.7088729353174054e-05, + "loss": 0.5633, + "step": 3344 + }, + { + "epoch": 0.27, + "grad_norm": 0.9457408843760768, + "learning_rate": 1.7086872329428702e-05, + "loss": 0.6306, + "step": 3345 + }, + { + "epoch": 0.27, + "grad_norm": 0.8350392579479472, + "learning_rate": 1.708501481456938e-05, + "loss": 0.5424, + "step": 3346 + }, + { + "epoch": 0.27, + "grad_norm": 0.9524852783334381, + "learning_rate": 1.7083156808724817e-05, + "loss": 0.6225, + "step": 3347 + }, + { + "epoch": 0.27, + "grad_norm": 0.9605946312330363, + "learning_rate": 1.7081298312023773e-05, + "loss": 0.5867, + "step": 3348 + }, + { + "epoch": 0.27, + "grad_norm": 1.0630205296787159, + "learning_rate": 1.7079439324595038e-05, + "loss": 0.5759, + "step": 3349 + }, + { + "epoch": 0.27, + "grad_norm": 0.8757051239915971, + "learning_rate": 1.7077579846567435e-05, + "loss": 0.5896, + "step": 3350 + }, + { + "epoch": 0.27, + "grad_norm": 0.911951669277883, + "learning_rate": 1.7075719878069822e-05, + "loss": 0.5148, + "step": 3351 + }, + { + "epoch": 0.27, + "grad_norm": 0.9017122704150898, + "learning_rate": 1.7073859419231104e-05, + "loss": 0.5533, + "step": 3352 + }, + { + "epoch": 0.27, + "grad_norm": 1.130210862193503, + "learning_rate": 1.70719984701802e-05, + "loss": 0.6358, + "step": 3353 + }, + { + "epoch": 0.27, + "grad_norm": 1.015208620424021, + "learning_rate": 1.7070137031046074e-05, + "loss": 0.6288, + "step": 3354 + }, + { + "epoch": 0.27, + "grad_norm": 0.98786674028703, + "learning_rate": 1.7068275101957724e-05, + "loss": 0.5977, + "step": 3355 + }, + { + "epoch": 0.27, + "grad_norm": 0.9258362716954789, + "learning_rate": 1.7066412683044176e-05, + "loss": 0.5598, + "step": 3356 + }, + { + "epoch": 0.27, + "grad_norm": 0.9371702688369251, + "learning_rate": 1.7064549774434502e-05, + "loss": 0.5474, + "step": 3357 + }, + { + "epoch": 0.27, + "grad_norm": 0.9476739249381467, + "learning_rate": 1.7062686376257792e-05, + "loss": 0.6134, + "step": 3358 + }, + { + "epoch": 0.27, + "grad_norm": 0.8500421461497814, + "learning_rate": 1.706082248864318e-05, + "loss": 0.538, + "step": 3359 + }, + { + "epoch": 0.27, + "grad_norm": 0.8320273702343286, + "learning_rate": 1.7058958111719836e-05, + "loss": 0.5137, + "step": 3360 + }, + { + "epoch": 0.27, + "grad_norm": 0.9597941408747482, + "learning_rate": 1.7057093245616953e-05, + "loss": 0.6073, + "step": 3361 + }, + { + "epoch": 0.27, + "grad_norm": 0.9336154644932648, + "learning_rate": 1.705522789046377e-05, + "loss": 0.5306, + "step": 3362 + }, + { + "epoch": 0.27, + "grad_norm": 0.9007035249997226, + "learning_rate": 1.7053362046389553e-05, + "loss": 0.5615, + "step": 3363 + }, + { + "epoch": 0.27, + "grad_norm": 0.8606210782971025, + "learning_rate": 1.7051495713523598e-05, + "loss": 0.5333, + "step": 3364 + }, + { + "epoch": 0.27, + "grad_norm": 1.037431116748183, + "learning_rate": 1.7049628891995245e-05, + "loss": 0.6167, + "step": 3365 + }, + { + "epoch": 0.27, + "grad_norm": 0.8689014841123899, + "learning_rate": 1.7047761581933867e-05, + "loss": 0.5588, + "step": 3366 + }, + { + "epoch": 0.27, + "grad_norm": 0.8812212259999989, + "learning_rate": 1.704589378346886e-05, + "loss": 0.5603, + "step": 3367 + }, + { + "epoch": 0.27, + "grad_norm": 0.9829983554137566, + "learning_rate": 1.7044025496729665e-05, + "loss": 0.5547, + "step": 3368 + }, + { + "epoch": 0.27, + "grad_norm": 0.9032978495659435, + "learning_rate": 1.7042156721845754e-05, + "loss": 0.497, + "step": 3369 + }, + { + "epoch": 0.27, + "grad_norm": 0.8688514896023216, + "learning_rate": 1.7040287458946623e-05, + "loss": 0.5186, + "step": 3370 + }, + { + "epoch": 0.27, + "grad_norm": 0.8515062816030871, + "learning_rate": 1.7038417708161817e-05, + "loss": 0.58, + "step": 3371 + }, + { + "epoch": 0.27, + "grad_norm": 0.879654908280097, + "learning_rate": 1.7036547469620908e-05, + "loss": 0.5517, + "step": 3372 + }, + { + "epoch": 0.27, + "grad_norm": 0.9111200905537216, + "learning_rate": 1.70346767434535e-05, + "loss": 0.5802, + "step": 3373 + }, + { + "epoch": 0.27, + "grad_norm": 0.9598633626437251, + "learning_rate": 1.7032805529789233e-05, + "loss": 0.5801, + "step": 3374 + }, + { + "epoch": 0.27, + "grad_norm": 0.9596570514252659, + "learning_rate": 1.7030933828757785e-05, + "loss": 0.5795, + "step": 3375 + }, + { + "epoch": 0.27, + "grad_norm": 0.961433589710115, + "learning_rate": 1.7029061640488855e-05, + "loss": 0.5757, + "step": 3376 + }, + { + "epoch": 0.27, + "grad_norm": 0.9407600851727718, + "learning_rate": 1.702718896511219e-05, + "loss": 0.5584, + "step": 3377 + }, + { + "epoch": 0.27, + "grad_norm": 0.8088236897988393, + "learning_rate": 1.7025315802757558e-05, + "loss": 0.5237, + "step": 3378 + }, + { + "epoch": 0.27, + "grad_norm": 0.9058206622747288, + "learning_rate": 1.7023442153554776e-05, + "loss": 0.554, + "step": 3379 + }, + { + "epoch": 0.27, + "grad_norm": 0.9935709406567976, + "learning_rate": 1.7021568017633683e-05, + "loss": 0.6106, + "step": 3380 + }, + { + "epoch": 0.27, + "grad_norm": 0.9189552590843816, + "learning_rate": 1.7019693395124153e-05, + "loss": 0.5729, + "step": 3381 + }, + { + "epoch": 0.27, + "grad_norm": 0.9358733733184447, + "learning_rate": 1.70178182861561e-05, + "loss": 0.5921, + "step": 3382 + }, + { + "epoch": 0.27, + "grad_norm": 0.8672815001532652, + "learning_rate": 1.701594269085946e-05, + "loss": 0.6447, + "step": 3383 + }, + { + "epoch": 0.28, + "grad_norm": 0.9180923664460106, + "learning_rate": 1.701406660936422e-05, + "loss": 0.6046, + "step": 3384 + }, + { + "epoch": 0.28, + "grad_norm": 1.0033780298482418, + "learning_rate": 1.7012190041800384e-05, + "loss": 0.5508, + "step": 3385 + }, + { + "epoch": 0.28, + "grad_norm": 0.9146946932023774, + "learning_rate": 1.7010312988297993e-05, + "loss": 0.589, + "step": 3386 + }, + { + "epoch": 0.28, + "grad_norm": 0.9725045335715133, + "learning_rate": 1.7008435448987134e-05, + "loss": 0.6529, + "step": 3387 + }, + { + "epoch": 0.28, + "grad_norm": 0.9226018976731254, + "learning_rate": 1.7006557423997917e-05, + "loss": 0.5691, + "step": 3388 + }, + { + "epoch": 0.28, + "grad_norm": 0.9765084315182954, + "learning_rate": 1.7004678913460483e-05, + "loss": 0.6344, + "step": 3389 + }, + { + "epoch": 0.28, + "grad_norm": 1.072975921776912, + "learning_rate": 1.7002799917505014e-05, + "loss": 0.5671, + "step": 3390 + }, + { + "epoch": 0.28, + "grad_norm": 0.9540991904709908, + "learning_rate": 1.700092043626172e-05, + "loss": 0.5818, + "step": 3391 + }, + { + "epoch": 0.28, + "grad_norm": 0.9795979551718766, + "learning_rate": 1.6999040469860852e-05, + "loss": 0.5807, + "step": 3392 + }, + { + "epoch": 0.28, + "grad_norm": 0.8628947844434518, + "learning_rate": 1.6997160018432688e-05, + "loss": 0.5442, + "step": 3393 + }, + { + "epoch": 0.28, + "grad_norm": 0.8811809802727653, + "learning_rate": 1.6995279082107537e-05, + "loss": 0.5267, + "step": 3394 + }, + { + "epoch": 0.28, + "grad_norm": 0.8814467993180549, + "learning_rate": 1.6993397661015754e-05, + "loss": 0.5939, + "step": 3395 + }, + { + "epoch": 0.28, + "grad_norm": 0.892270166357497, + "learning_rate": 1.6991515755287715e-05, + "loss": 0.5586, + "step": 3396 + }, + { + "epoch": 0.28, + "grad_norm": 0.9363609939411836, + "learning_rate": 1.6989633365053837e-05, + "loss": 0.5826, + "step": 3397 + }, + { + "epoch": 0.28, + "grad_norm": 0.9460263488365313, + "learning_rate": 1.6987750490444565e-05, + "loss": 0.5738, + "step": 3398 + }, + { + "epoch": 0.28, + "grad_norm": 0.8830829644992519, + "learning_rate": 1.6985867131590383e-05, + "loss": 0.6229, + "step": 3399 + }, + { + "epoch": 0.28, + "grad_norm": 1.0074862329859187, + "learning_rate": 1.6983983288621807e-05, + "loss": 0.6148, + "step": 3400 + }, + { + "epoch": 0.28, + "grad_norm": 0.8830992932928085, + "learning_rate": 1.6982098961669383e-05, + "loss": 0.5447, + "step": 3401 + }, + { + "epoch": 0.28, + "grad_norm": 0.9669231327429048, + "learning_rate": 1.6980214150863692e-05, + "loss": 0.5977, + "step": 3402 + }, + { + "epoch": 0.28, + "grad_norm": 0.9375796919681045, + "learning_rate": 1.6978328856335354e-05, + "loss": 0.548, + "step": 3403 + }, + { + "epoch": 0.28, + "grad_norm": 0.8929206405304951, + "learning_rate": 1.6976443078215015e-05, + "loss": 0.5252, + "step": 3404 + }, + { + "epoch": 0.28, + "grad_norm": 0.8971001962743499, + "learning_rate": 1.697455681663336e-05, + "loss": 0.5643, + "step": 3405 + }, + { + "epoch": 0.28, + "grad_norm": 0.8973473047538455, + "learning_rate": 1.69726700717211e-05, + "loss": 0.5758, + "step": 3406 + }, + { + "epoch": 0.28, + "grad_norm": 1.0046297232651533, + "learning_rate": 1.6970782843608994e-05, + "loss": 0.5857, + "step": 3407 + }, + { + "epoch": 0.28, + "grad_norm": 0.991860327566595, + "learning_rate": 1.6968895132427817e-05, + "loss": 0.5205, + "step": 3408 + }, + { + "epoch": 0.28, + "grad_norm": 0.9344982001418214, + "learning_rate": 1.696700693830839e-05, + "loss": 0.5794, + "step": 3409 + }, + { + "epoch": 0.28, + "grad_norm": 0.8598062778766361, + "learning_rate": 1.6965118261381557e-05, + "loss": 0.5485, + "step": 3410 + }, + { + "epoch": 0.28, + "grad_norm": 0.9926075241949736, + "learning_rate": 1.6963229101778215e-05, + "loss": 0.5576, + "step": 3411 + }, + { + "epoch": 0.28, + "grad_norm": 0.8706724751709342, + "learning_rate": 1.696133945962927e-05, + "loss": 0.5495, + "step": 3412 + }, + { + "epoch": 0.28, + "grad_norm": 0.950973621277761, + "learning_rate": 1.695944933506567e-05, + "loss": 0.5053, + "step": 3413 + }, + { + "epoch": 0.28, + "grad_norm": 1.0268360922092827, + "learning_rate": 1.695755872821841e-05, + "loss": 0.666, + "step": 3414 + }, + { + "epoch": 0.28, + "grad_norm": 0.9053834277128515, + "learning_rate": 1.6955667639218497e-05, + "loss": 0.5685, + "step": 3415 + }, + { + "epoch": 0.28, + "grad_norm": 1.082944483643275, + "learning_rate": 1.695377606819699e-05, + "loss": 0.5695, + "step": 3416 + }, + { + "epoch": 0.28, + "grad_norm": 0.8542060396820743, + "learning_rate": 1.6951884015284966e-05, + "loss": 0.5983, + "step": 3417 + }, + { + "epoch": 0.28, + "grad_norm": 0.9432598922230787, + "learning_rate": 1.694999148061355e-05, + "loss": 0.5653, + "step": 3418 + }, + { + "epoch": 0.28, + "grad_norm": 0.9270649827866421, + "learning_rate": 1.6948098464313886e-05, + "loss": 0.5379, + "step": 3419 + }, + { + "epoch": 0.28, + "grad_norm": 1.075767072500345, + "learning_rate": 1.6946204966517165e-05, + "loss": 0.5268, + "step": 3420 + }, + { + "epoch": 0.28, + "grad_norm": 0.8705734453427241, + "learning_rate": 1.6944310987354597e-05, + "loss": 0.5886, + "step": 3421 + }, + { + "epoch": 0.28, + "grad_norm": 0.9925730390794844, + "learning_rate": 1.6942416526957438e-05, + "loss": 0.6483, + "step": 3422 + }, + { + "epoch": 0.28, + "grad_norm": 0.907248303985114, + "learning_rate": 1.694052158545697e-05, + "loss": 0.5032, + "step": 3423 + }, + { + "epoch": 0.28, + "grad_norm": 0.9043231411925035, + "learning_rate": 1.6938626162984516e-05, + "loss": 0.5614, + "step": 3424 + }, + { + "epoch": 0.28, + "grad_norm": 0.9461336445647923, + "learning_rate": 1.6936730259671423e-05, + "loss": 0.5063, + "step": 3425 + }, + { + "epoch": 0.28, + "grad_norm": 0.9913856016305469, + "learning_rate": 1.6934833875649074e-05, + "loss": 0.5747, + "step": 3426 + }, + { + "epoch": 0.28, + "grad_norm": 0.9253569554243637, + "learning_rate": 1.693293701104889e-05, + "loss": 0.5865, + "step": 3427 + }, + { + "epoch": 0.28, + "grad_norm": 0.8644437828373465, + "learning_rate": 1.693103966600232e-05, + "loss": 0.5782, + "step": 3428 + }, + { + "epoch": 0.28, + "grad_norm": 0.925353380941438, + "learning_rate": 1.692914184064085e-05, + "loss": 0.5667, + "step": 3429 + }, + { + "epoch": 0.28, + "grad_norm": 0.9749291183376588, + "learning_rate": 1.6927243535095995e-05, + "loss": 0.5387, + "step": 3430 + }, + { + "epoch": 0.28, + "grad_norm": 0.8819179929608888, + "learning_rate": 1.6925344749499308e-05, + "loss": 0.5265, + "step": 3431 + }, + { + "epoch": 0.28, + "grad_norm": 0.8728789653879663, + "learning_rate": 1.6923445483982376e-05, + "loss": 0.5412, + "step": 3432 + }, + { + "epoch": 0.28, + "grad_norm": 0.9042520287309848, + "learning_rate": 1.6921545738676807e-05, + "loss": 0.5677, + "step": 3433 + }, + { + "epoch": 0.28, + "grad_norm": 0.9097494183903341, + "learning_rate": 1.691964551371426e-05, + "loss": 0.5969, + "step": 3434 + }, + { + "epoch": 0.28, + "grad_norm": 0.9853158768766778, + "learning_rate": 1.691774480922642e-05, + "loss": 0.5842, + "step": 3435 + }, + { + "epoch": 0.28, + "grad_norm": 0.8166520864997642, + "learning_rate": 1.6915843625344997e-05, + "loss": 0.5148, + "step": 3436 + }, + { + "epoch": 0.28, + "grad_norm": 0.8609842442951521, + "learning_rate": 1.6913941962201747e-05, + "loss": 0.5372, + "step": 3437 + }, + { + "epoch": 0.28, + "grad_norm": 0.8727021376495987, + "learning_rate": 1.691203981992845e-05, + "loss": 0.6025, + "step": 3438 + }, + { + "epoch": 0.28, + "grad_norm": 0.9009716445148898, + "learning_rate": 1.6910137198656925e-05, + "loss": 0.5617, + "step": 3439 + }, + { + "epoch": 0.28, + "grad_norm": 0.9791987469560515, + "learning_rate": 1.6908234098519024e-05, + "loss": 0.5147, + "step": 3440 + }, + { + "epoch": 0.28, + "grad_norm": 0.9245555800454982, + "learning_rate": 1.6906330519646622e-05, + "loss": 0.6286, + "step": 3441 + }, + { + "epoch": 0.28, + "grad_norm": 0.9655064426884022, + "learning_rate": 1.6904426462171647e-05, + "loss": 0.6019, + "step": 3442 + }, + { + "epoch": 0.28, + "grad_norm": 0.8731797030694629, + "learning_rate": 1.690252192622604e-05, + "loss": 0.5452, + "step": 3443 + }, + { + "epoch": 0.28, + "grad_norm": 0.930713438796544, + "learning_rate": 1.6900616911941783e-05, + "loss": 0.5488, + "step": 3444 + }, + { + "epoch": 0.28, + "grad_norm": 0.9807267518738538, + "learning_rate": 1.6898711419450897e-05, + "loss": 0.5992, + "step": 3445 + }, + { + "epoch": 0.28, + "grad_norm": 0.9244327054086132, + "learning_rate": 1.689680544888543e-05, + "loss": 0.5108, + "step": 3446 + }, + { + "epoch": 0.28, + "grad_norm": 0.8835917131112488, + "learning_rate": 1.6894899000377462e-05, + "loss": 0.536, + "step": 3447 + }, + { + "epoch": 0.28, + "grad_norm": 0.9100881276299515, + "learning_rate": 1.689299207405911e-05, + "loss": 0.6109, + "step": 3448 + }, + { + "epoch": 0.28, + "grad_norm": 0.9843100087872552, + "learning_rate": 1.6891084670062517e-05, + "loss": 0.5497, + "step": 3449 + }, + { + "epoch": 0.28, + "grad_norm": 0.9185017074722286, + "learning_rate": 1.6889176788519876e-05, + "loss": 0.5741, + "step": 3450 + }, + { + "epoch": 0.28, + "grad_norm": 0.8697110048282695, + "learning_rate": 1.6887268429563387e-05, + "loss": 0.5425, + "step": 3451 + }, + { + "epoch": 0.28, + "grad_norm": 1.0280757547635149, + "learning_rate": 1.688535959332531e-05, + "loss": 0.6179, + "step": 3452 + }, + { + "epoch": 0.28, + "grad_norm": 1.0022941901330085, + "learning_rate": 1.688345027993792e-05, + "loss": 0.6289, + "step": 3453 + }, + { + "epoch": 0.28, + "grad_norm": 1.0396302382621723, + "learning_rate": 1.6881540489533527e-05, + "loss": 0.6032, + "step": 3454 + }, + { + "epoch": 0.28, + "grad_norm": 0.9853194918842026, + "learning_rate": 1.6879630222244487e-05, + "loss": 0.6601, + "step": 3455 + }, + { + "epoch": 0.28, + "grad_norm": 0.9947368061129188, + "learning_rate": 1.6877719478203172e-05, + "loss": 0.4907, + "step": 3456 + }, + { + "epoch": 0.28, + "grad_norm": 0.8532268987285451, + "learning_rate": 1.6875808257541998e-05, + "loss": 0.5378, + "step": 3457 + }, + { + "epoch": 0.28, + "grad_norm": 0.9071315028978325, + "learning_rate": 1.6873896560393413e-05, + "loss": 0.5627, + "step": 3458 + }, + { + "epoch": 0.28, + "grad_norm": 0.9136588626603805, + "learning_rate": 1.687198438688989e-05, + "loss": 0.5592, + "step": 3459 + }, + { + "epoch": 0.28, + "grad_norm": 0.9719371861308499, + "learning_rate": 1.6870071737163948e-05, + "loss": 0.5675, + "step": 3460 + }, + { + "epoch": 0.28, + "grad_norm": 0.9505609502388157, + "learning_rate": 1.6868158611348124e-05, + "loss": 0.6057, + "step": 3461 + }, + { + "epoch": 0.28, + "grad_norm": 0.8039359178601575, + "learning_rate": 1.6866245009575e-05, + "loss": 0.4808, + "step": 3462 + }, + { + "epoch": 0.28, + "grad_norm": 0.9390886461339968, + "learning_rate": 1.686433093197719e-05, + "loss": 0.6191, + "step": 3463 + }, + { + "epoch": 0.28, + "grad_norm": 0.9434824431556162, + "learning_rate": 1.686241637868734e-05, + "loss": 0.5357, + "step": 3464 + }, + { + "epoch": 0.28, + "grad_norm": 0.9336655413258358, + "learning_rate": 1.6860501349838114e-05, + "loss": 0.545, + "step": 3465 + }, + { + "epoch": 0.28, + "grad_norm": 0.9940741569523487, + "learning_rate": 1.685858584556223e-05, + "loss": 0.5828, + "step": 3466 + }, + { + "epoch": 0.28, + "grad_norm": 0.9147782411036046, + "learning_rate": 1.6856669865992437e-05, + "loss": 0.5497, + "step": 3467 + }, + { + "epoch": 0.28, + "grad_norm": 0.846982347796428, + "learning_rate": 1.68547534112615e-05, + "loss": 0.5163, + "step": 3468 + }, + { + "epoch": 0.28, + "grad_norm": 0.9498171544263488, + "learning_rate": 1.685283648150223e-05, + "loss": 0.6197, + "step": 3469 + }, + { + "epoch": 0.28, + "grad_norm": 0.9199470650867926, + "learning_rate": 1.6850919076847474e-05, + "loss": 0.6108, + "step": 3470 + }, + { + "epoch": 0.28, + "grad_norm": 1.0394969040093853, + "learning_rate": 1.68490011974301e-05, + "loss": 0.5443, + "step": 3471 + }, + { + "epoch": 0.28, + "grad_norm": 0.8709749528807426, + "learning_rate": 1.684708284338302e-05, + "loss": 0.5751, + "step": 3472 + }, + { + "epoch": 0.28, + "grad_norm": 0.855026890162834, + "learning_rate": 1.684516401483917e-05, + "loss": 0.5543, + "step": 3473 + }, + { + "epoch": 0.28, + "grad_norm": 0.9374281824638894, + "learning_rate": 1.6843244711931526e-05, + "loss": 0.6044, + "step": 3474 + }, + { + "epoch": 0.28, + "grad_norm": 0.9238721362463175, + "learning_rate": 1.6841324934793096e-05, + "loss": 0.558, + "step": 3475 + }, + { + "epoch": 0.28, + "grad_norm": 0.9072585266971033, + "learning_rate": 1.6839404683556914e-05, + "loss": 0.4736, + "step": 3476 + }, + { + "epoch": 0.28, + "grad_norm": 0.9143915587444463, + "learning_rate": 1.6837483958356054e-05, + "loss": 0.6134, + "step": 3477 + }, + { + "epoch": 0.28, + "grad_norm": 0.983794665294586, + "learning_rate": 1.6835562759323622e-05, + "loss": 0.5457, + "step": 3478 + }, + { + "epoch": 0.28, + "grad_norm": 0.9357786768311311, + "learning_rate": 1.683364108659275e-05, + "loss": 0.6289, + "step": 3479 + }, + { + "epoch": 0.28, + "grad_norm": 0.8728876774010064, + "learning_rate": 1.6831718940296617e-05, + "loss": 0.5035, + "step": 3480 + }, + { + "epoch": 0.28, + "grad_norm": 0.8528829004574374, + "learning_rate": 1.6829796320568416e-05, + "loss": 0.6151, + "step": 3481 + }, + { + "epoch": 0.28, + "grad_norm": 0.954694895551522, + "learning_rate": 1.6827873227541393e-05, + "loss": 0.6269, + "step": 3482 + }, + { + "epoch": 0.28, + "grad_norm": 0.9438303988784065, + "learning_rate": 1.6825949661348812e-05, + "loss": 0.5902, + "step": 3483 + }, + { + "epoch": 0.28, + "grad_norm": 0.9450631524045531, + "learning_rate": 1.682402562212397e-05, + "loss": 0.5956, + "step": 3484 + }, + { + "epoch": 0.28, + "grad_norm": 1.0221615000328568, + "learning_rate": 1.6822101110000207e-05, + "loss": 0.6432, + "step": 3485 + }, + { + "epoch": 0.28, + "grad_norm": 0.9340166290868366, + "learning_rate": 1.6820176125110886e-05, + "loss": 0.6274, + "step": 3486 + }, + { + "epoch": 0.28, + "grad_norm": 0.9159751842545095, + "learning_rate": 1.681825066758941e-05, + "loss": 0.5367, + "step": 3487 + }, + { + "epoch": 0.28, + "grad_norm": 0.8341778683720619, + "learning_rate": 1.6816324737569215e-05, + "loss": 0.5904, + "step": 3488 + }, + { + "epoch": 0.28, + "grad_norm": 0.9721777267639525, + "learning_rate": 1.681439833518376e-05, + "loss": 0.5366, + "step": 3489 + }, + { + "epoch": 0.28, + "grad_norm": 0.8214085641113914, + "learning_rate": 1.681247146056654e-05, + "loss": 0.6098, + "step": 3490 + }, + { + "epoch": 0.28, + "grad_norm": 0.8811256460597657, + "learning_rate": 1.6810544113851096e-05, + "loss": 0.5111, + "step": 3491 + }, + { + "epoch": 0.28, + "grad_norm": 0.9484985396662438, + "learning_rate": 1.6808616295170983e-05, + "loss": 0.5921, + "step": 3492 + }, + { + "epoch": 0.28, + "grad_norm": 0.8813089051357941, + "learning_rate": 1.6806688004659803e-05, + "loss": 0.4919, + "step": 3493 + }, + { + "epoch": 0.28, + "grad_norm": 0.9816279349466661, + "learning_rate": 1.6804759242451177e-05, + "loss": 0.5854, + "step": 3494 + }, + { + "epoch": 0.28, + "grad_norm": 0.862825517856782, + "learning_rate": 1.6802830008678777e-05, + "loss": 0.5217, + "step": 3495 + }, + { + "epoch": 0.28, + "grad_norm": 0.9204080613604179, + "learning_rate": 1.6800900303476286e-05, + "loss": 0.5527, + "step": 3496 + }, + { + "epoch": 0.28, + "grad_norm": 0.9825728162668876, + "learning_rate": 1.679897012697744e-05, + "loss": 0.5336, + "step": 3497 + }, + { + "epoch": 0.28, + "grad_norm": 0.9982188434933679, + "learning_rate": 1.6797039479315994e-05, + "loss": 0.6479, + "step": 3498 + }, + { + "epoch": 0.28, + "grad_norm": 0.913708551786946, + "learning_rate": 1.679510836062574e-05, + "loss": 0.5342, + "step": 3499 + }, + { + "epoch": 0.28, + "grad_norm": 0.9274094365407106, + "learning_rate": 1.6793176771040504e-05, + "loss": 0.5949, + "step": 3500 + }, + { + "epoch": 0.28, + "grad_norm": 0.9273663856313644, + "learning_rate": 1.6791244710694144e-05, + "loss": 0.5546, + "step": 3501 + }, + { + "epoch": 0.28, + "grad_norm": 0.98301986004304, + "learning_rate": 1.678931217972055e-05, + "loss": 0.4835, + "step": 3502 + }, + { + "epoch": 0.28, + "grad_norm": 0.84103745533568, + "learning_rate": 1.6787379178253642e-05, + "loss": 0.4985, + "step": 3503 + }, + { + "epoch": 0.28, + "grad_norm": 0.9698588777438701, + "learning_rate": 1.678544570642738e-05, + "loss": 0.5881, + "step": 3504 + }, + { + "epoch": 0.28, + "grad_norm": 0.951776115340784, + "learning_rate": 1.6783511764375745e-05, + "loss": 0.6007, + "step": 3505 + }, + { + "epoch": 0.28, + "grad_norm": 0.9741451637827968, + "learning_rate": 1.678157735223277e-05, + "loss": 0.5965, + "step": 3506 + }, + { + "epoch": 0.29, + "grad_norm": 0.9065524129003147, + "learning_rate": 1.6779642470132487e-05, + "loss": 0.5405, + "step": 3507 + }, + { + "epoch": 0.29, + "grad_norm": 0.9250130238196568, + "learning_rate": 1.6777707118209004e-05, + "loss": 0.6035, + "step": 3508 + }, + { + "epoch": 0.29, + "grad_norm": 0.9393499737202737, + "learning_rate": 1.6775771296596427e-05, + "loss": 0.5909, + "step": 3509 + }, + { + "epoch": 0.29, + "grad_norm": 0.8589539160299299, + "learning_rate": 1.677383500542891e-05, + "loss": 0.5071, + "step": 3510 + }, + { + "epoch": 0.29, + "grad_norm": 0.8916436165229539, + "learning_rate": 1.6771898244840636e-05, + "loss": 0.5841, + "step": 3511 + }, + { + "epoch": 0.29, + "grad_norm": 0.8795522853651708, + "learning_rate": 1.676996101496582e-05, + "loss": 0.5698, + "step": 3512 + }, + { + "epoch": 0.29, + "grad_norm": 0.9140986536382631, + "learning_rate": 1.6768023315938708e-05, + "loss": 0.6178, + "step": 3513 + }, + { + "epoch": 0.29, + "grad_norm": 1.0834371963750848, + "learning_rate": 1.6766085147893583e-05, + "loss": 0.6471, + "step": 3514 + }, + { + "epoch": 0.29, + "grad_norm": 0.935683335427111, + "learning_rate": 1.6764146510964762e-05, + "loss": 0.6731, + "step": 3515 + }, + { + "epoch": 0.29, + "grad_norm": 0.8334877532242587, + "learning_rate": 1.676220740528659e-05, + "loss": 0.4929, + "step": 3516 + }, + { + "epoch": 0.29, + "grad_norm": 0.8479848617399672, + "learning_rate": 1.676026783099344e-05, + "loss": 0.5383, + "step": 3517 + }, + { + "epoch": 0.29, + "grad_norm": 0.9399644753508193, + "learning_rate": 1.6758327788219722e-05, + "loss": 0.5711, + "step": 3518 + }, + { + "epoch": 0.29, + "grad_norm": 0.8976978401914122, + "learning_rate": 1.6756387277099885e-05, + "loss": 0.5118, + "step": 3519 + }, + { + "epoch": 0.29, + "grad_norm": 0.9096358011034394, + "learning_rate": 1.6754446297768404e-05, + "loss": 0.5343, + "step": 3520 + }, + { + "epoch": 0.29, + "grad_norm": 0.8756936077029496, + "learning_rate": 1.6752504850359785e-05, + "loss": 0.5871, + "step": 3521 + }, + { + "epoch": 0.29, + "grad_norm": 0.8481418927380738, + "learning_rate": 1.6750562935008572e-05, + "loss": 0.5126, + "step": 3522 + }, + { + "epoch": 0.29, + "grad_norm": 0.875692294080384, + "learning_rate": 1.6748620551849333e-05, + "loss": 0.6195, + "step": 3523 + }, + { + "epoch": 0.29, + "grad_norm": 0.9289899034178508, + "learning_rate": 1.6746677701016675e-05, + "loss": 0.6014, + "step": 3524 + }, + { + "epoch": 0.29, + "grad_norm": 0.798198379662321, + "learning_rate": 1.674473438264524e-05, + "loss": 0.5155, + "step": 3525 + }, + { + "epoch": 0.29, + "grad_norm": 0.8364506278435189, + "learning_rate": 1.674279059686969e-05, + "loss": 0.5638, + "step": 3526 + }, + { + "epoch": 0.29, + "grad_norm": 0.9674610184024626, + "learning_rate": 1.6740846343824734e-05, + "loss": 0.5908, + "step": 3527 + }, + { + "epoch": 0.29, + "grad_norm": 0.8897752692194084, + "learning_rate": 1.6738901623645107e-05, + "loss": 0.5301, + "step": 3528 + }, + { + "epoch": 0.29, + "grad_norm": 1.0070755438073646, + "learning_rate": 1.6736956436465573e-05, + "loss": 0.6214, + "step": 3529 + }, + { + "epoch": 0.29, + "grad_norm": 0.853310413688308, + "learning_rate": 1.6735010782420934e-05, + "loss": 0.5819, + "step": 3530 + }, + { + "epoch": 0.29, + "grad_norm": 1.002604659974855, + "learning_rate": 1.6733064661646023e-05, + "loss": 0.5757, + "step": 3531 + }, + { + "epoch": 0.29, + "grad_norm": 0.8825996930380876, + "learning_rate": 1.67311180742757e-05, + "loss": 0.5913, + "step": 3532 + }, + { + "epoch": 0.29, + "grad_norm": 0.9185858297292431, + "learning_rate": 1.672917102044487e-05, + "loss": 0.6152, + "step": 3533 + }, + { + "epoch": 0.29, + "grad_norm": 0.898173400038949, + "learning_rate": 1.6727223500288458e-05, + "loss": 0.6118, + "step": 3534 + }, + { + "epoch": 0.29, + "grad_norm": 1.0034091029734553, + "learning_rate": 1.672527551394142e-05, + "loss": 0.5808, + "step": 3535 + }, + { + "epoch": 0.29, + "grad_norm": 0.882816022561754, + "learning_rate": 1.6723327061538753e-05, + "loss": 0.5979, + "step": 3536 + }, + { + "epoch": 0.29, + "grad_norm": 0.8911535124949506, + "learning_rate": 1.672137814321549e-05, + "loss": 0.5396, + "step": 3537 + }, + { + "epoch": 0.29, + "grad_norm": 0.8796186206450183, + "learning_rate": 1.6719428759106676e-05, + "loss": 0.5201, + "step": 3538 + }, + { + "epoch": 0.29, + "grad_norm": 0.9169457655302597, + "learning_rate": 1.6717478909347417e-05, + "loss": 0.5862, + "step": 3539 + }, + { + "epoch": 0.29, + "grad_norm": 0.9814266566832116, + "learning_rate": 1.671552859407282e-05, + "loss": 0.6225, + "step": 3540 + }, + { + "epoch": 0.29, + "grad_norm": 0.8622223722404974, + "learning_rate": 1.6713577813418058e-05, + "loss": 0.5983, + "step": 3541 + }, + { + "epoch": 0.29, + "grad_norm": 0.9993934642157488, + "learning_rate": 1.67116265675183e-05, + "loss": 0.5768, + "step": 3542 + }, + { + "epoch": 0.29, + "grad_norm": 0.9092726740215555, + "learning_rate": 1.6709674856508775e-05, + "loss": 0.565, + "step": 3543 + }, + { + "epoch": 0.29, + "grad_norm": 0.9232881931366909, + "learning_rate": 1.6707722680524735e-05, + "loss": 0.5864, + "step": 3544 + }, + { + "epoch": 0.29, + "grad_norm": 0.9530394049860118, + "learning_rate": 1.6705770039701464e-05, + "loss": 0.5408, + "step": 3545 + }, + { + "epoch": 0.29, + "grad_norm": 0.9506612173293659, + "learning_rate": 1.670381693417428e-05, + "loss": 0.6284, + "step": 3546 + }, + { + "epoch": 0.29, + "grad_norm": 0.8434394278434862, + "learning_rate": 1.6701863364078524e-05, + "loss": 0.5427, + "step": 3547 + }, + { + "epoch": 0.29, + "grad_norm": 0.9724131503049097, + "learning_rate": 1.6699909329549583e-05, + "loss": 0.4966, + "step": 3548 + }, + { + "epoch": 0.29, + "grad_norm": 0.9893191331651435, + "learning_rate": 1.669795483072287e-05, + "loss": 0.5897, + "step": 3549 + }, + { + "epoch": 0.29, + "grad_norm": 0.8701298054577694, + "learning_rate": 1.6695999867733824e-05, + "loss": 0.5424, + "step": 3550 + }, + { + "epoch": 0.29, + "grad_norm": 0.9389970593482257, + "learning_rate": 1.669404444071793e-05, + "loss": 0.6622, + "step": 3551 + }, + { + "epoch": 0.29, + "grad_norm": 0.9024202866939837, + "learning_rate": 1.6692088549810695e-05, + "loss": 0.4986, + "step": 3552 + }, + { + "epoch": 0.29, + "grad_norm": 0.9196638626707861, + "learning_rate": 1.6690132195147655e-05, + "loss": 0.5415, + "step": 3553 + }, + { + "epoch": 0.29, + "grad_norm": 0.9315837796528372, + "learning_rate": 1.668817537686439e-05, + "loss": 0.5866, + "step": 3554 + }, + { + "epoch": 0.29, + "grad_norm": 0.9883947924463582, + "learning_rate": 1.6686218095096506e-05, + "loss": 0.5928, + "step": 3555 + }, + { + "epoch": 0.29, + "grad_norm": 0.9627628774229461, + "learning_rate": 1.6684260349979637e-05, + "loss": 0.5831, + "step": 3556 + }, + { + "epoch": 0.29, + "grad_norm": 0.9476768527246094, + "learning_rate": 1.6682302141649452e-05, + "loss": 0.5994, + "step": 3557 + }, + { + "epoch": 0.29, + "grad_norm": 0.9485565562128317, + "learning_rate": 1.668034347024166e-05, + "loss": 0.5723, + "step": 3558 + }, + { + "epoch": 0.29, + "grad_norm": 0.9760330608078868, + "learning_rate": 1.667838433589199e-05, + "loss": 0.5349, + "step": 3559 + }, + { + "epoch": 0.29, + "grad_norm": 0.8864957985018198, + "learning_rate": 1.6676424738736208e-05, + "loss": 0.5838, + "step": 3560 + }, + { + "epoch": 0.29, + "grad_norm": 0.9307026791978568, + "learning_rate": 1.6674464678910117e-05, + "loss": 0.5658, + "step": 3561 + }, + { + "epoch": 0.29, + "grad_norm": 0.9044244733439744, + "learning_rate": 1.667250415654954e-05, + "loss": 0.5329, + "step": 3562 + }, + { + "epoch": 0.29, + "grad_norm": 0.9358310455208178, + "learning_rate": 1.6670543171790347e-05, + "loss": 0.6028, + "step": 3563 + }, + { + "epoch": 0.29, + "grad_norm": 0.9945180142864156, + "learning_rate": 1.6668581724768423e-05, + "loss": 0.5394, + "step": 3564 + }, + { + "epoch": 0.29, + "grad_norm": 0.8602807961739487, + "learning_rate": 1.6666619815619703e-05, + "loss": 0.539, + "step": 3565 + }, + { + "epoch": 0.29, + "grad_norm": 0.8948693345748759, + "learning_rate": 1.6664657444480145e-05, + "loss": 0.6245, + "step": 3566 + }, + { + "epoch": 0.29, + "grad_norm": 0.8565794868024554, + "learning_rate": 1.666269461148574e-05, + "loss": 0.4878, + "step": 3567 + }, + { + "epoch": 0.29, + "grad_norm": 0.866446067109164, + "learning_rate": 1.6660731316772503e-05, + "loss": 0.5559, + "step": 3568 + }, + { + "epoch": 0.29, + "grad_norm": 0.9300978205223425, + "learning_rate": 1.6658767560476494e-05, + "loss": 0.5421, + "step": 3569 + }, + { + "epoch": 0.29, + "grad_norm": 0.8743919086457418, + "learning_rate": 1.6656803342733804e-05, + "loss": 0.6016, + "step": 3570 + }, + { + "epoch": 0.29, + "grad_norm": 0.9887159020971205, + "learning_rate": 1.6654838663680542e-05, + "loss": 0.5941, + "step": 3571 + }, + { + "epoch": 0.29, + "grad_norm": 0.9935905572056851, + "learning_rate": 1.6652873523452867e-05, + "loss": 0.5825, + "step": 3572 + }, + { + "epoch": 0.29, + "grad_norm": 0.9917578015678681, + "learning_rate": 1.6650907922186958e-05, + "loss": 0.5464, + "step": 3573 + }, + { + "epoch": 0.29, + "grad_norm": 0.8836985763212661, + "learning_rate": 1.6648941860019028e-05, + "loss": 0.5667, + "step": 3574 + }, + { + "epoch": 0.29, + "grad_norm": 0.8837790954049046, + "learning_rate": 1.6646975337085323e-05, + "loss": 0.5275, + "step": 3575 + }, + { + "epoch": 0.29, + "grad_norm": 1.0543057172385015, + "learning_rate": 1.6645008353522122e-05, + "loss": 0.6152, + "step": 3576 + }, + { + "epoch": 0.29, + "grad_norm": 0.9161591991044842, + "learning_rate": 1.6643040909465743e-05, + "loss": 0.5601, + "step": 3577 + }, + { + "epoch": 0.29, + "grad_norm": 0.9601951864694586, + "learning_rate": 1.6641073005052516e-05, + "loss": 0.5857, + "step": 3578 + }, + { + "epoch": 0.29, + "grad_norm": 0.9765714977543812, + "learning_rate": 1.663910464041882e-05, + "loss": 0.5903, + "step": 3579 + }, + { + "epoch": 0.29, + "grad_norm": 0.8749919554487148, + "learning_rate": 1.6637135815701066e-05, + "loss": 0.5547, + "step": 3580 + }, + { + "epoch": 0.29, + "grad_norm": 0.943868588563792, + "learning_rate": 1.663516653103568e-05, + "loss": 0.5624, + "step": 3581 + }, + { + "epoch": 0.29, + "grad_norm": 0.9406057711354047, + "learning_rate": 1.6633196786559143e-05, + "loss": 0.5454, + "step": 3582 + }, + { + "epoch": 0.29, + "grad_norm": 0.9602637556293583, + "learning_rate": 1.6631226582407954e-05, + "loss": 0.6411, + "step": 3583 + }, + { + "epoch": 0.29, + "grad_norm": 0.8818966099422427, + "learning_rate": 1.662925591871864e-05, + "loss": 0.5759, + "step": 3584 + }, + { + "epoch": 0.29, + "grad_norm": 1.0002142171935597, + "learning_rate": 1.6627284795627777e-05, + "loss": 0.5652, + "step": 3585 + }, + { + "epoch": 0.29, + "grad_norm": 0.8726073504452184, + "learning_rate": 1.6625313213271953e-05, + "loss": 0.5773, + "step": 3586 + }, + { + "epoch": 0.29, + "grad_norm": 0.9105240364055169, + "learning_rate": 1.66233411717878e-05, + "loss": 0.4951, + "step": 3587 + }, + { + "epoch": 0.29, + "grad_norm": 0.8467521769576934, + "learning_rate": 1.6621368671311973e-05, + "loss": 0.5547, + "step": 3588 + }, + { + "epoch": 0.29, + "grad_norm": 0.9618991888429478, + "learning_rate": 1.6619395711981183e-05, + "loss": 0.64, + "step": 3589 + }, + { + "epoch": 0.29, + "grad_norm": 0.9131903387125504, + "learning_rate": 1.661742229393213e-05, + "loss": 0.6045, + "step": 3590 + }, + { + "epoch": 0.29, + "grad_norm": 0.9734975634683273, + "learning_rate": 1.6615448417301588e-05, + "loss": 0.6041, + "step": 3591 + }, + { + "epoch": 0.29, + "grad_norm": 0.9554196630977829, + "learning_rate": 1.6613474082226338e-05, + "loss": 0.6134, + "step": 3592 + }, + { + "epoch": 0.29, + "grad_norm": 1.0722452324867415, + "learning_rate": 1.6611499288843194e-05, + "loss": 0.6594, + "step": 3593 + }, + { + "epoch": 0.29, + "grad_norm": 0.8659926589103515, + "learning_rate": 1.660952403728902e-05, + "loss": 0.5241, + "step": 3594 + }, + { + "epoch": 0.29, + "grad_norm": 1.0027663664973614, + "learning_rate": 1.6607548327700694e-05, + "loss": 0.671, + "step": 3595 + }, + { + "epoch": 0.29, + "grad_norm": 0.8707474663912872, + "learning_rate": 1.6605572160215126e-05, + "loss": 0.5783, + "step": 3596 + }, + { + "epoch": 0.29, + "grad_norm": 1.0087074986539963, + "learning_rate": 1.6603595534969266e-05, + "loss": 0.638, + "step": 3597 + }, + { + "epoch": 0.29, + "grad_norm": 0.9712820924532131, + "learning_rate": 1.6601618452100098e-05, + "loss": 0.6031, + "step": 3598 + }, + { + "epoch": 0.29, + "grad_norm": 0.9282173136440852, + "learning_rate": 1.659964091174462e-05, + "loss": 0.5563, + "step": 3599 + }, + { + "epoch": 0.29, + "grad_norm": 0.9858593134307094, + "learning_rate": 1.6597662914039885e-05, + "loss": 0.5695, + "step": 3600 + }, + { + "epoch": 0.29, + "grad_norm": 0.9790081161566879, + "learning_rate": 1.659568445912296e-05, + "loss": 0.6717, + "step": 3601 + }, + { + "epoch": 0.29, + "grad_norm": 0.92396838455597, + "learning_rate": 1.6593705547130955e-05, + "loss": 0.5798, + "step": 3602 + }, + { + "epoch": 0.29, + "grad_norm": 0.8781891791299089, + "learning_rate": 1.6591726178201e-05, + "loss": 0.5568, + "step": 3603 + }, + { + "epoch": 0.29, + "grad_norm": 0.9532094188863465, + "learning_rate": 1.6589746352470275e-05, + "loss": 0.5711, + "step": 3604 + }, + { + "epoch": 0.29, + "grad_norm": 0.9605308425776762, + "learning_rate": 1.6587766070075965e-05, + "loss": 0.5674, + "step": 3605 + }, + { + "epoch": 0.29, + "grad_norm": 0.9075877004257134, + "learning_rate": 1.6585785331155312e-05, + "loss": 0.5982, + "step": 3606 + }, + { + "epoch": 0.29, + "grad_norm": 0.9778240081915002, + "learning_rate": 1.6583804135845582e-05, + "loss": 0.6316, + "step": 3607 + }, + { + "epoch": 0.29, + "grad_norm": 0.8063905073185061, + "learning_rate": 1.658182248428406e-05, + "loss": 0.5393, + "step": 3608 + }, + { + "epoch": 0.29, + "grad_norm": 0.9095242303970597, + "learning_rate": 1.6579840376608076e-05, + "loss": 0.5845, + "step": 3609 + }, + { + "epoch": 0.29, + "grad_norm": 0.9260816695897505, + "learning_rate": 1.6577857812954994e-05, + "loss": 0.5652, + "step": 3610 + }, + { + "epoch": 0.29, + "grad_norm": 0.998827823223787, + "learning_rate": 1.65758747934622e-05, + "loss": 0.528, + "step": 3611 + }, + { + "epoch": 0.29, + "grad_norm": 0.9114557389191057, + "learning_rate": 1.6573891318267113e-05, + "loss": 0.629, + "step": 3612 + }, + { + "epoch": 0.29, + "grad_norm": 0.9823821174946192, + "learning_rate": 1.6571907387507194e-05, + "loss": 0.6058, + "step": 3613 + }, + { + "epoch": 0.29, + "grad_norm": 1.0511732864639134, + "learning_rate": 1.6569923001319916e-05, + "loss": 0.5967, + "step": 3614 + }, + { + "epoch": 0.29, + "grad_norm": 0.9686847202243349, + "learning_rate": 1.6567938159842807e-05, + "loss": 0.6398, + "step": 3615 + }, + { + "epoch": 0.29, + "grad_norm": 0.8933691431087123, + "learning_rate": 1.6565952863213407e-05, + "loss": 0.5597, + "step": 3616 + }, + { + "epoch": 0.29, + "grad_norm": 0.8945158188311984, + "learning_rate": 1.65639671115693e-05, + "loss": 0.5664, + "step": 3617 + }, + { + "epoch": 0.29, + "grad_norm": 0.8154853016245658, + "learning_rate": 1.6561980905048087e-05, + "loss": 0.5101, + "step": 3618 + }, + { + "epoch": 0.29, + "grad_norm": 0.8273776997836692, + "learning_rate": 1.6559994243787427e-05, + "loss": 0.4892, + "step": 3619 + }, + { + "epoch": 0.29, + "grad_norm": 1.0302635716889421, + "learning_rate": 1.655800712792498e-05, + "loss": 0.5976, + "step": 3620 + }, + { + "epoch": 0.29, + "grad_norm": 1.0043279125071218, + "learning_rate": 1.6556019557598453e-05, + "loss": 0.5788, + "step": 3621 + }, + { + "epoch": 0.29, + "grad_norm": 0.9753388668576959, + "learning_rate": 1.6554031532945588e-05, + "loss": 0.6504, + "step": 3622 + }, + { + "epoch": 0.29, + "grad_norm": 0.9597580412921606, + "learning_rate": 1.6552043054104153e-05, + "loss": 0.5883, + "step": 3623 + }, + { + "epoch": 0.29, + "grad_norm": 1.0242495713982198, + "learning_rate": 1.6550054121211946e-05, + "loss": 0.5753, + "step": 3624 + }, + { + "epoch": 0.29, + "grad_norm": 0.7932872760131129, + "learning_rate": 1.6548064734406798e-05, + "loss": 0.4609, + "step": 3625 + }, + { + "epoch": 0.29, + "grad_norm": 0.9463995135467075, + "learning_rate": 1.654607489382657e-05, + "loss": 0.588, + "step": 3626 + }, + { + "epoch": 0.29, + "grad_norm": 0.9566650907230213, + "learning_rate": 1.654408459960916e-05, + "loss": 0.6569, + "step": 3627 + }, + { + "epoch": 0.29, + "grad_norm": 0.9198226238212484, + "learning_rate": 1.6542093851892493e-05, + "loss": 0.5615, + "step": 3628 + }, + { + "epoch": 0.29, + "grad_norm": 0.9137048670369079, + "learning_rate": 1.654010265081452e-05, + "loss": 0.5889, + "step": 3629 + }, + { + "epoch": 0.3, + "grad_norm": 1.0686054851272107, + "learning_rate": 1.653811099651324e-05, + "loss": 0.5917, + "step": 3630 + }, + { + "epoch": 0.3, + "grad_norm": 0.8316623722805385, + "learning_rate": 1.6536118889126665e-05, + "loss": 0.5438, + "step": 3631 + }, + { + "epoch": 0.3, + "grad_norm": 0.900597140469948, + "learning_rate": 1.6534126328792846e-05, + "loss": 0.5737, + "step": 3632 + }, + { + "epoch": 0.3, + "grad_norm": 0.983236693684268, + "learning_rate": 1.653213331564987e-05, + "loss": 0.5542, + "step": 3633 + }, + { + "epoch": 0.3, + "grad_norm": 0.8923703352786861, + "learning_rate": 1.653013984983585e-05, + "loss": 0.5854, + "step": 3634 + }, + { + "epoch": 0.3, + "grad_norm": 0.860331550092371, + "learning_rate": 1.6528145931488934e-05, + "loss": 0.4834, + "step": 3635 + }, + { + "epoch": 0.3, + "grad_norm": 0.934322417674138, + "learning_rate": 1.6526151560747294e-05, + "loss": 0.5552, + "step": 3636 + }, + { + "epoch": 0.3, + "grad_norm": 1.0653721164248258, + "learning_rate": 1.6524156737749132e-05, + "loss": 0.6018, + "step": 3637 + }, + { + "epoch": 0.3, + "grad_norm": 0.8718089486691982, + "learning_rate": 1.6522161462632705e-05, + "loss": 0.5718, + "step": 3638 + }, + { + "epoch": 0.3, + "grad_norm": 0.9025461147431527, + "learning_rate": 1.6520165735536268e-05, + "loss": 0.4984, + "step": 3639 + }, + { + "epoch": 0.3, + "grad_norm": 1.0448045557872034, + "learning_rate": 1.651816955659813e-05, + "loss": 0.6191, + "step": 3640 + }, + { + "epoch": 0.3, + "grad_norm": 0.9643104817920352, + "learning_rate": 1.6516172925956624e-05, + "loss": 0.5801, + "step": 3641 + }, + { + "epoch": 0.3, + "grad_norm": 0.8573755137208006, + "learning_rate": 1.6514175843750112e-05, + "loss": 0.5469, + "step": 3642 + }, + { + "epoch": 0.3, + "grad_norm": 0.9168281111480824, + "learning_rate": 1.6512178310116994e-05, + "loss": 0.498, + "step": 3643 + }, + { + "epoch": 0.3, + "grad_norm": 0.9980497424146102, + "learning_rate": 1.6510180325195696e-05, + "loss": 0.6049, + "step": 3644 + }, + { + "epoch": 0.3, + "grad_norm": 0.8352927441471474, + "learning_rate": 1.6508181889124678e-05, + "loss": 0.562, + "step": 3645 + }, + { + "epoch": 0.3, + "grad_norm": 0.9098235155117534, + "learning_rate": 1.650618300204242e-05, + "loss": 0.5712, + "step": 3646 + }, + { + "epoch": 0.3, + "grad_norm": 0.9318970570266037, + "learning_rate": 1.6504183664087458e-05, + "loss": 0.5857, + "step": 3647 + }, + { + "epoch": 0.3, + "grad_norm": 0.9322556307904534, + "learning_rate": 1.6502183875398335e-05, + "loss": 0.4988, + "step": 3648 + }, + { + "epoch": 0.3, + "grad_norm": 0.8689832946583307, + "learning_rate": 1.6500183636113637e-05, + "loss": 0.5214, + "step": 3649 + }, + { + "epoch": 0.3, + "grad_norm": 0.9994019497128986, + "learning_rate": 1.649818294637198e-05, + "loss": 0.589, + "step": 3650 + }, + { + "epoch": 0.3, + "grad_norm": 0.9892900604091126, + "learning_rate": 1.6496181806312005e-05, + "loss": 0.5811, + "step": 3651 + }, + { + "epoch": 0.3, + "grad_norm": 0.8117548953306063, + "learning_rate": 1.6494180216072397e-05, + "loss": 0.5841, + "step": 3652 + }, + { + "epoch": 0.3, + "grad_norm": 0.9213653596995112, + "learning_rate": 1.649217817579186e-05, + "loss": 0.5614, + "step": 3653 + }, + { + "epoch": 0.3, + "grad_norm": 0.988994948049542, + "learning_rate": 1.6490175685609133e-05, + "loss": 0.5912, + "step": 3654 + }, + { + "epoch": 0.3, + "grad_norm": 0.9457260692079452, + "learning_rate": 1.6488172745662984e-05, + "loss": 0.5493, + "step": 3655 + }, + { + "epoch": 0.3, + "grad_norm": 0.8973934121192513, + "learning_rate": 1.6486169356092224e-05, + "loss": 0.6416, + "step": 3656 + }, + { + "epoch": 0.3, + "grad_norm": 1.0003701542148136, + "learning_rate": 1.648416551703568e-05, + "loss": 0.601, + "step": 3657 + }, + { + "epoch": 0.3, + "grad_norm": 0.9375819350879385, + "learning_rate": 1.6482161228632217e-05, + "loss": 0.5633, + "step": 3658 + }, + { + "epoch": 0.3, + "grad_norm": 0.8715810769204608, + "learning_rate": 1.648015649102073e-05, + "loss": 0.5143, + "step": 3659 + }, + { + "epoch": 0.3, + "grad_norm": 0.956343085905747, + "learning_rate": 1.6478151304340144e-05, + "loss": 0.598, + "step": 3660 + }, + { + "epoch": 0.3, + "grad_norm": 0.8466436290465038, + "learning_rate": 1.647614566872942e-05, + "loss": 0.5448, + "step": 3661 + }, + { + "epoch": 0.3, + "grad_norm": 1.0018889533958015, + "learning_rate": 1.6474139584327548e-05, + "loss": 0.6558, + "step": 3662 + }, + { + "epoch": 0.3, + "grad_norm": 0.9768304608041964, + "learning_rate": 1.647213305127354e-05, + "loss": 0.4928, + "step": 3663 + }, + { + "epoch": 0.3, + "grad_norm": 0.9476325871815199, + "learning_rate": 1.6470126069706456e-05, + "loss": 0.6763, + "step": 3664 + }, + { + "epoch": 0.3, + "grad_norm": 0.9116612300802762, + "learning_rate": 1.6468118639765376e-05, + "loss": 0.5577, + "step": 3665 + }, + { + "epoch": 0.3, + "grad_norm": 0.9705610218173512, + "learning_rate": 1.646611076158941e-05, + "loss": 0.6202, + "step": 3666 + }, + { + "epoch": 0.3, + "grad_norm": 0.9044189603954035, + "learning_rate": 1.6464102435317702e-05, + "loss": 0.5508, + "step": 3667 + }, + { + "epoch": 0.3, + "grad_norm": 0.8682864121536775, + "learning_rate": 1.6462093661089432e-05, + "loss": 0.643, + "step": 3668 + }, + { + "epoch": 0.3, + "grad_norm": 0.9042250301625764, + "learning_rate": 1.64600844390438e-05, + "loss": 0.614, + "step": 3669 + }, + { + "epoch": 0.3, + "grad_norm": 0.915033666481628, + "learning_rate": 1.6458074769320046e-05, + "loss": 0.5764, + "step": 3670 + }, + { + "epoch": 0.3, + "grad_norm": 0.8218929072923022, + "learning_rate": 1.6456064652057443e-05, + "loss": 0.5159, + "step": 3671 + }, + { + "epoch": 0.3, + "grad_norm": 0.8778388677902503, + "learning_rate": 1.6454054087395284e-05, + "loss": 0.5133, + "step": 3672 + }, + { + "epoch": 0.3, + "grad_norm": 0.8389863239156715, + "learning_rate": 1.6452043075472898e-05, + "loss": 0.5176, + "step": 3673 + }, + { + "epoch": 0.3, + "grad_norm": 0.9435807760681507, + "learning_rate": 1.6450031616429655e-05, + "loss": 0.6082, + "step": 3674 + }, + { + "epoch": 0.3, + "grad_norm": 0.9169409518546056, + "learning_rate": 1.6448019710404938e-05, + "loss": 0.554, + "step": 3675 + }, + { + "epoch": 0.3, + "grad_norm": 0.9801572359431846, + "learning_rate": 1.6446007357538178e-05, + "loss": 0.5588, + "step": 3676 + }, + { + "epoch": 0.3, + "grad_norm": 0.9262934877911382, + "learning_rate": 1.6443994557968826e-05, + "loss": 0.5349, + "step": 3677 + }, + { + "epoch": 0.3, + "grad_norm": 0.9778690714497104, + "learning_rate": 1.6441981311836363e-05, + "loss": 0.5439, + "step": 3678 + }, + { + "epoch": 0.3, + "grad_norm": 0.9550533350922927, + "learning_rate": 1.643996761928031e-05, + "loss": 0.603, + "step": 3679 + }, + { + "epoch": 0.3, + "grad_norm": 0.9514029700405338, + "learning_rate": 1.6437953480440217e-05, + "loss": 0.6207, + "step": 3680 + }, + { + "epoch": 0.3, + "grad_norm": 0.9208468742014804, + "learning_rate": 1.6435938895455653e-05, + "loss": 0.5565, + "step": 3681 + }, + { + "epoch": 0.3, + "grad_norm": 0.9614563981238538, + "learning_rate": 1.6433923864466235e-05, + "loss": 0.5365, + "step": 3682 + }, + { + "epoch": 0.3, + "grad_norm": 1.0040116203536171, + "learning_rate": 1.6431908387611604e-05, + "loss": 0.5948, + "step": 3683 + }, + { + "epoch": 0.3, + "grad_norm": 0.8657093495585827, + "learning_rate": 1.642989246503142e-05, + "loss": 0.5876, + "step": 3684 + }, + { + "epoch": 0.3, + "grad_norm": 0.9683313092980244, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.5575, + "step": 3685 + }, + { + "epoch": 0.3, + "grad_norm": 0.9735112189429884, + "learning_rate": 1.6425859283253255e-05, + "loss": 0.6037, + "step": 3686 + }, + { + "epoch": 0.3, + "grad_norm": 0.8483864927774368, + "learning_rate": 1.642384202433477e-05, + "loss": 0.5197, + "step": 3687 + }, + { + "epoch": 0.3, + "grad_norm": 1.0289501655311173, + "learning_rate": 1.6421824320249732e-05, + "loss": 0.5883, + "step": 3688 + }, + { + "epoch": 0.3, + "grad_norm": 0.8065027388887831, + "learning_rate": 1.641980617113796e-05, + "loss": 0.5542, + "step": 3689 + }, + { + "epoch": 0.3, + "grad_norm": 0.9721774427241994, + "learning_rate": 1.6417787577139317e-05, + "loss": 0.5892, + "step": 3690 + }, + { + "epoch": 0.3, + "grad_norm": 0.903531584027664, + "learning_rate": 1.641576853839369e-05, + "loss": 0.5987, + "step": 3691 + }, + { + "epoch": 0.3, + "grad_norm": 0.9247698353933245, + "learning_rate": 1.641374905504099e-05, + "loss": 0.5728, + "step": 3692 + }, + { + "epoch": 0.3, + "grad_norm": 0.9114264057369905, + "learning_rate": 1.641172912722117e-05, + "loss": 0.5938, + "step": 3693 + }, + { + "epoch": 0.3, + "grad_norm": 0.9351345133128272, + "learning_rate": 1.640970875507421e-05, + "loss": 0.5918, + "step": 3694 + }, + { + "epoch": 0.3, + "grad_norm": 0.847331838532848, + "learning_rate": 1.640768793874012e-05, + "loss": 0.5788, + "step": 3695 + }, + { + "epoch": 0.3, + "grad_norm": 0.9834974607562615, + "learning_rate": 1.6405666678358934e-05, + "loss": 0.553, + "step": 3696 + }, + { + "epoch": 0.3, + "grad_norm": 0.9635080252497199, + "learning_rate": 1.6403644974070732e-05, + "loss": 0.5437, + "step": 3697 + }, + { + "epoch": 0.3, + "grad_norm": 0.9081995856472028, + "learning_rate": 1.6401622826015616e-05, + "loss": 0.5518, + "step": 3698 + }, + { + "epoch": 0.3, + "grad_norm": 0.8448121229095162, + "learning_rate": 1.6399600234333716e-05, + "loss": 0.5327, + "step": 3699 + }, + { + "epoch": 0.3, + "grad_norm": 0.9396638616959802, + "learning_rate": 1.6397577199165192e-05, + "loss": 0.6707, + "step": 3700 + }, + { + "epoch": 0.3, + "grad_norm": 0.8932577772825789, + "learning_rate": 1.639555372065025e-05, + "loss": 0.5025, + "step": 3701 + }, + { + "epoch": 0.3, + "grad_norm": 0.9081954188369703, + "learning_rate": 1.6393529798929103e-05, + "loss": 0.5871, + "step": 3702 + }, + { + "epoch": 0.3, + "grad_norm": 0.8853290866239732, + "learning_rate": 1.639150543414201e-05, + "loss": 0.5313, + "step": 3703 + }, + { + "epoch": 0.3, + "grad_norm": 0.9233280166174495, + "learning_rate": 1.6389480626429262e-05, + "loss": 0.5942, + "step": 3704 + }, + { + "epoch": 0.3, + "grad_norm": 0.9463063765768046, + "learning_rate": 1.6387455375931174e-05, + "loss": 0.5392, + "step": 3705 + }, + { + "epoch": 0.3, + "grad_norm": 0.8730763067649788, + "learning_rate": 1.6385429682788095e-05, + "loss": 0.544, + "step": 3706 + }, + { + "epoch": 0.3, + "grad_norm": 0.9316457082221453, + "learning_rate": 1.63834035471404e-05, + "loss": 0.5758, + "step": 3707 + }, + { + "epoch": 0.3, + "grad_norm": 0.9542199542196371, + "learning_rate": 1.6381376969128508e-05, + "loss": 0.6123, + "step": 3708 + }, + { + "epoch": 0.3, + "grad_norm": 0.9416443659815187, + "learning_rate": 1.6379349948892845e-05, + "loss": 0.4821, + "step": 3709 + }, + { + "epoch": 0.3, + "grad_norm": 0.9333823224097688, + "learning_rate": 1.6377322486573892e-05, + "loss": 0.602, + "step": 3710 + }, + { + "epoch": 0.3, + "grad_norm": 0.8409452002423059, + "learning_rate": 1.637529458231215e-05, + "loss": 0.5059, + "step": 3711 + }, + { + "epoch": 0.3, + "grad_norm": 0.9280610886053118, + "learning_rate": 1.637326623624814e-05, + "loss": 0.6287, + "step": 3712 + }, + { + "epoch": 0.3, + "grad_norm": 0.9096302289339779, + "learning_rate": 1.637123744852244e-05, + "loss": 0.5049, + "step": 3713 + }, + { + "epoch": 0.3, + "grad_norm": 1.0049544036616702, + "learning_rate": 1.6369208219275635e-05, + "loss": 0.6629, + "step": 3714 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013101190306692, + "learning_rate": 1.6367178548648347e-05, + "loss": 0.5969, + "step": 3715 + }, + { + "epoch": 0.3, + "grad_norm": 1.0162996039363488, + "learning_rate": 1.6365148436781235e-05, + "loss": 0.5464, + "step": 3716 + }, + { + "epoch": 0.3, + "grad_norm": 1.0263918742129827, + "learning_rate": 1.6363117883814986e-05, + "loss": 0.6456, + "step": 3717 + }, + { + "epoch": 0.3, + "grad_norm": 0.860333941053944, + "learning_rate": 1.6361086889890307e-05, + "loss": 0.5763, + "step": 3718 + }, + { + "epoch": 0.3, + "grad_norm": 0.8528743046913408, + "learning_rate": 1.635905545514795e-05, + "loss": 0.5548, + "step": 3719 + }, + { + "epoch": 0.3, + "grad_norm": 0.9660427612544795, + "learning_rate": 1.635702357972869e-05, + "loss": 0.6462, + "step": 3720 + }, + { + "epoch": 0.3, + "grad_norm": 0.9134307436287689, + "learning_rate": 1.6354991263773338e-05, + "loss": 0.5735, + "step": 3721 + }, + { + "epoch": 0.3, + "grad_norm": 0.902943882911617, + "learning_rate": 1.6352958507422727e-05, + "loss": 0.53, + "step": 3722 + }, + { + "epoch": 0.3, + "grad_norm": 0.9348713243516186, + "learning_rate": 1.635092531081772e-05, + "loss": 0.6143, + "step": 3723 + }, + { + "epoch": 0.3, + "grad_norm": 0.8468682556085794, + "learning_rate": 1.634889167409923e-05, + "loss": 0.4976, + "step": 3724 + }, + { + "epoch": 0.3, + "grad_norm": 0.8311154744443643, + "learning_rate": 1.6346857597408174e-05, + "loss": 0.455, + "step": 3725 + }, + { + "epoch": 0.3, + "grad_norm": 0.8780250093580937, + "learning_rate": 1.634482308088552e-05, + "loss": 0.579, + "step": 3726 + }, + { + "epoch": 0.3, + "grad_norm": 0.8297776086320948, + "learning_rate": 1.6342788124672255e-05, + "loss": 0.5381, + "step": 3727 + }, + { + "epoch": 0.3, + "grad_norm": 0.9539167630401393, + "learning_rate": 1.634075272890939e-05, + "loss": 0.5416, + "step": 3728 + }, + { + "epoch": 0.3, + "grad_norm": 1.0019019974902386, + "learning_rate": 1.6338716893737995e-05, + "loss": 0.6171, + "step": 3729 + }, + { + "epoch": 0.3, + "grad_norm": 0.9788484051442959, + "learning_rate": 1.6336680619299138e-05, + "loss": 0.5867, + "step": 3730 + }, + { + "epoch": 0.3, + "grad_norm": 0.9222090771321823, + "learning_rate": 1.6334643905733932e-05, + "loss": 0.5908, + "step": 3731 + }, + { + "epoch": 0.3, + "grad_norm": 0.876664958229384, + "learning_rate": 1.633260675318353e-05, + "loss": 0.5143, + "step": 3732 + }, + { + "epoch": 0.3, + "grad_norm": 0.8933905718102726, + "learning_rate": 1.633056916178909e-05, + "loss": 0.5128, + "step": 3733 + }, + { + "epoch": 0.3, + "grad_norm": 0.8924163368771747, + "learning_rate": 1.6328531131691823e-05, + "loss": 0.5776, + "step": 3734 + }, + { + "epoch": 0.3, + "grad_norm": 0.8375540811736663, + "learning_rate": 1.6326492663032964e-05, + "loss": 0.5809, + "step": 3735 + }, + { + "epoch": 0.3, + "grad_norm": 0.9864446004257903, + "learning_rate": 1.6324453755953772e-05, + "loss": 0.6347, + "step": 3736 + }, + { + "epoch": 0.3, + "grad_norm": 0.9735173503758445, + "learning_rate": 1.6322414410595548e-05, + "loss": 0.5306, + "step": 3737 + }, + { + "epoch": 0.3, + "grad_norm": 0.9369967945864461, + "learning_rate": 1.6320374627099612e-05, + "loss": 0.5496, + "step": 3738 + }, + { + "epoch": 0.3, + "grad_norm": 0.8931827555344554, + "learning_rate": 1.631833440560732e-05, + "loss": 0.5092, + "step": 3739 + }, + { + "epoch": 0.3, + "grad_norm": 1.0429319179996712, + "learning_rate": 1.631629374626006e-05, + "loss": 0.5573, + "step": 3740 + }, + { + "epoch": 0.3, + "grad_norm": 1.0092123786527019, + "learning_rate": 1.6314252649199244e-05, + "loss": 0.5645, + "step": 3741 + }, + { + "epoch": 0.3, + "grad_norm": 0.9221786043515009, + "learning_rate": 1.6312211114566322e-05, + "loss": 0.5584, + "step": 3742 + }, + { + "epoch": 0.3, + "grad_norm": 0.8863761142956672, + "learning_rate": 1.6310169142502767e-05, + "loss": 0.525, + "step": 3743 + }, + { + "epoch": 0.3, + "grad_norm": 0.9347174132990639, + "learning_rate": 1.6308126733150088e-05, + "loss": 0.6126, + "step": 3744 + }, + { + "epoch": 0.3, + "grad_norm": 1.0632686311470156, + "learning_rate": 1.6306083886649823e-05, + "loss": 0.6301, + "step": 3745 + }, + { + "epoch": 0.3, + "grad_norm": 1.0074442161292392, + "learning_rate": 1.6304040603143537e-05, + "loss": 0.5749, + "step": 3746 + }, + { + "epoch": 0.3, + "grad_norm": 0.9485310866229572, + "learning_rate": 1.6301996882772828e-05, + "loss": 0.6036, + "step": 3747 + }, + { + "epoch": 0.3, + "grad_norm": 0.9259055405982872, + "learning_rate": 1.6299952725679325e-05, + "loss": 0.6138, + "step": 3748 + }, + { + "epoch": 0.3, + "grad_norm": 0.9216543019525134, + "learning_rate": 1.6297908132004688e-05, + "loss": 0.5322, + "step": 3749 + }, + { + "epoch": 0.3, + "grad_norm": 0.8867586367561826, + "learning_rate": 1.6295863101890603e-05, + "loss": 0.5266, + "step": 3750 + }, + { + "epoch": 0.3, + "grad_norm": 0.930621927448145, + "learning_rate": 1.6293817635478787e-05, + "loss": 0.5562, + "step": 3751 + }, + { + "epoch": 0.3, + "grad_norm": 0.834710674438063, + "learning_rate": 1.629177173291099e-05, + "loss": 0.5912, + "step": 3752 + }, + { + "epoch": 0.31, + "grad_norm": 0.945039577387616, + "learning_rate": 1.6289725394328998e-05, + "loss": 0.5675, + "step": 3753 + }, + { + "epoch": 0.31, + "grad_norm": 1.0046881856885677, + "learning_rate": 1.6287678619874614e-05, + "loss": 0.5822, + "step": 3754 + }, + { + "epoch": 0.31, + "grad_norm": 0.9444310869381672, + "learning_rate": 1.628563140968968e-05, + "loss": 0.6171, + "step": 3755 + }, + { + "epoch": 0.31, + "grad_norm": 0.8936053185510102, + "learning_rate": 1.6283583763916062e-05, + "loss": 0.5566, + "step": 3756 + }, + { + "epoch": 0.31, + "grad_norm": 0.851601255245641, + "learning_rate": 1.6281535682695663e-05, + "loss": 0.5427, + "step": 3757 + }, + { + "epoch": 0.31, + "grad_norm": 0.9061461756195821, + "learning_rate": 1.6279487166170412e-05, + "loss": 0.4949, + "step": 3758 + }, + { + "epoch": 0.31, + "grad_norm": 0.9271870381103675, + "learning_rate": 1.627743821448227e-05, + "loss": 0.5722, + "step": 3759 + }, + { + "epoch": 0.31, + "grad_norm": 1.0091095183053302, + "learning_rate": 1.6275388827773235e-05, + "loss": 0.5942, + "step": 3760 + }, + { + "epoch": 0.31, + "grad_norm": 0.8860061586258776, + "learning_rate": 1.627333900618531e-05, + "loss": 0.5878, + "step": 3761 + }, + { + "epoch": 0.31, + "grad_norm": 0.9550515256655273, + "learning_rate": 1.627128874986056e-05, + "loss": 0.534, + "step": 3762 + }, + { + "epoch": 0.31, + "grad_norm": 0.9466939591667332, + "learning_rate": 1.626923805894107e-05, + "loss": 0.5818, + "step": 3763 + }, + { + "epoch": 0.31, + "grad_norm": 0.8593634170820906, + "learning_rate": 1.6267186933568934e-05, + "loss": 0.623, + "step": 3764 + }, + { + "epoch": 0.31, + "grad_norm": 0.9055110244963029, + "learning_rate": 1.6265135373886303e-05, + "loss": 0.5561, + "step": 3765 + }, + { + "epoch": 0.31, + "grad_norm": 0.9522349882853491, + "learning_rate": 1.6263083380035352e-05, + "loss": 0.6138, + "step": 3766 + }, + { + "epoch": 0.31, + "grad_norm": 0.9804021740033947, + "learning_rate": 1.6261030952158275e-05, + "loss": 0.6136, + "step": 3767 + }, + { + "epoch": 0.31, + "grad_norm": 0.9771039368379923, + "learning_rate": 1.625897809039731e-05, + "loss": 0.5797, + "step": 3768 + }, + { + "epoch": 0.31, + "grad_norm": 0.993286275378227, + "learning_rate": 1.625692479489471e-05, + "loss": 0.6008, + "step": 3769 + }, + { + "epoch": 0.31, + "grad_norm": 0.9382597208111868, + "learning_rate": 1.6254871065792776e-05, + "loss": 0.6317, + "step": 3770 + }, + { + "epoch": 0.31, + "grad_norm": 0.9130802421175084, + "learning_rate": 1.6252816903233822e-05, + "loss": 0.6397, + "step": 3771 + }, + { + "epoch": 0.31, + "grad_norm": 0.8870309436502534, + "learning_rate": 1.6250762307360206e-05, + "loss": 0.5487, + "step": 3772 + }, + { + "epoch": 0.31, + "grad_norm": 0.8630507613282848, + "learning_rate": 1.6248707278314303e-05, + "loss": 0.6063, + "step": 3773 + }, + { + "epoch": 0.31, + "grad_norm": 0.8204465669044464, + "learning_rate": 1.6246651816238533e-05, + "loss": 0.5359, + "step": 3774 + }, + { + "epoch": 0.31, + "grad_norm": 0.8791486581655621, + "learning_rate": 1.6244595921275327e-05, + "loss": 0.5696, + "step": 3775 + }, + { + "epoch": 0.31, + "grad_norm": 0.8884869209713185, + "learning_rate": 1.624253959356717e-05, + "loss": 0.5646, + "step": 3776 + }, + { + "epoch": 0.31, + "grad_norm": 0.8825282203386668, + "learning_rate": 1.6240482833256548e-05, + "loss": 0.5236, + "step": 3777 + }, + { + "epoch": 0.31, + "grad_norm": 0.9366188651664406, + "learning_rate": 1.6238425640486005e-05, + "loss": 0.5926, + "step": 3778 + }, + { + "epoch": 0.31, + "grad_norm": 0.9339163870662345, + "learning_rate": 1.62363680153981e-05, + "loss": 0.5729, + "step": 3779 + }, + { + "epoch": 0.31, + "grad_norm": 0.9361957987894465, + "learning_rate": 1.623430995813542e-05, + "loss": 0.6206, + "step": 3780 + }, + { + "epoch": 0.31, + "grad_norm": 0.9142015976316188, + "learning_rate": 1.6232251468840593e-05, + "loss": 0.6301, + "step": 3781 + }, + { + "epoch": 0.31, + "grad_norm": 0.8585683173377849, + "learning_rate": 1.6230192547656264e-05, + "loss": 0.5282, + "step": 3782 + }, + { + "epoch": 0.31, + "grad_norm": 0.8833783575926998, + "learning_rate": 1.622813319472512e-05, + "loss": 0.5974, + "step": 3783 + }, + { + "epoch": 0.31, + "grad_norm": 0.8608490199464518, + "learning_rate": 1.622607341018987e-05, + "loss": 0.5524, + "step": 3784 + }, + { + "epoch": 0.31, + "grad_norm": 0.8848172429593518, + "learning_rate": 1.622401319419325e-05, + "loss": 0.5606, + "step": 3785 + }, + { + "epoch": 0.31, + "grad_norm": 0.8042328269827678, + "learning_rate": 1.6221952546878044e-05, + "loss": 0.5154, + "step": 3786 + }, + { + "epoch": 0.31, + "grad_norm": 0.8958394235761792, + "learning_rate": 1.621989146838704e-05, + "loss": 0.6145, + "step": 3787 + }, + { + "epoch": 0.31, + "grad_norm": 0.8922850703740579, + "learning_rate": 1.6217829958863077e-05, + "loss": 0.5611, + "step": 3788 + }, + { + "epoch": 0.31, + "grad_norm": 0.9013966060877183, + "learning_rate": 1.6215768018449015e-05, + "loss": 0.5549, + "step": 3789 + }, + { + "epoch": 0.31, + "grad_norm": 0.9854311867726121, + "learning_rate": 1.621370564728774e-05, + "loss": 0.5854, + "step": 3790 + }, + { + "epoch": 0.31, + "grad_norm": 0.9686953885478913, + "learning_rate": 1.6211642845522173e-05, + "loss": 0.5728, + "step": 3791 + }, + { + "epoch": 0.31, + "grad_norm": 0.848322047870251, + "learning_rate": 1.620957961329527e-05, + "loss": 0.573, + "step": 3792 + }, + { + "epoch": 0.31, + "grad_norm": 0.9168811248245282, + "learning_rate": 1.620751595075001e-05, + "loss": 0.5687, + "step": 3793 + }, + { + "epoch": 0.31, + "grad_norm": 0.8563048407779456, + "learning_rate": 1.6205451858029392e-05, + "loss": 0.5997, + "step": 3794 + }, + { + "epoch": 0.31, + "grad_norm": 0.9474923997403427, + "learning_rate": 1.620338733527647e-05, + "loss": 0.5352, + "step": 3795 + }, + { + "epoch": 0.31, + "grad_norm": 0.9299568257108631, + "learning_rate": 1.6201322382634307e-05, + "loss": 0.5198, + "step": 3796 + }, + { + "epoch": 0.31, + "grad_norm": 0.9148308383657838, + "learning_rate": 1.6199257000246004e-05, + "loss": 0.5742, + "step": 3797 + }, + { + "epoch": 0.31, + "grad_norm": 0.9363226570594413, + "learning_rate": 1.6197191188254692e-05, + "loss": 0.5975, + "step": 3798 + }, + { + "epoch": 0.31, + "grad_norm": 1.0230584582962952, + "learning_rate": 1.6195124946803527e-05, + "loss": 0.6454, + "step": 3799 + }, + { + "epoch": 0.31, + "grad_norm": 0.8742058995001382, + "learning_rate": 1.6193058276035696e-05, + "loss": 0.545, + "step": 3800 + }, + { + "epoch": 0.31, + "grad_norm": 0.8711637158069439, + "learning_rate": 1.6190991176094416e-05, + "loss": 0.6298, + "step": 3801 + }, + { + "epoch": 0.31, + "grad_norm": 0.9418798072740513, + "learning_rate": 1.6188923647122946e-05, + "loss": 0.5593, + "step": 3802 + }, + { + "epoch": 0.31, + "grad_norm": 1.0387832078071524, + "learning_rate": 1.6186855689264556e-05, + "loss": 0.6113, + "step": 3803 + }, + { + "epoch": 0.31, + "grad_norm": 0.926162906855895, + "learning_rate": 1.618478730266255e-05, + "loss": 0.572, + "step": 3804 + }, + { + "epoch": 0.31, + "grad_norm": 0.885613716903195, + "learning_rate": 1.6182718487460274e-05, + "loss": 0.5771, + "step": 3805 + }, + { + "epoch": 0.31, + "grad_norm": 0.943864432630467, + "learning_rate": 1.618064924380109e-05, + "loss": 0.5278, + "step": 3806 + }, + { + "epoch": 0.31, + "grad_norm": 0.9306896144782546, + "learning_rate": 1.6178579571828392e-05, + "loss": 0.5818, + "step": 3807 + }, + { + "epoch": 0.31, + "grad_norm": 0.9218870472956118, + "learning_rate": 1.6176509471685616e-05, + "loss": 0.5568, + "step": 3808 + }, + { + "epoch": 0.31, + "grad_norm": 0.8969001171613205, + "learning_rate": 1.6174438943516206e-05, + "loss": 0.5828, + "step": 3809 + }, + { + "epoch": 0.31, + "grad_norm": 0.9669693971910587, + "learning_rate": 1.617236798746366e-05, + "loss": 0.58, + "step": 3810 + }, + { + "epoch": 0.31, + "grad_norm": 0.9136607722559579, + "learning_rate": 1.6170296603671483e-05, + "loss": 0.5508, + "step": 3811 + }, + { + "epoch": 0.31, + "grad_norm": 0.8605010059912521, + "learning_rate": 1.6168224792283226e-05, + "loss": 0.5363, + "step": 3812 + }, + { + "epoch": 0.31, + "grad_norm": 0.9501176650973622, + "learning_rate": 1.616615255344246e-05, + "loss": 0.5826, + "step": 3813 + }, + { + "epoch": 0.31, + "grad_norm": 1.0011978284197116, + "learning_rate": 1.6164079887292795e-05, + "loss": 0.6175, + "step": 3814 + }, + { + "epoch": 0.31, + "grad_norm": 0.8646956502967698, + "learning_rate": 1.6162006793977858e-05, + "loss": 0.5554, + "step": 3815 + }, + { + "epoch": 0.31, + "grad_norm": 0.8299241484026281, + "learning_rate": 1.615993327364132e-05, + "loss": 0.57, + "step": 3816 + }, + { + "epoch": 0.31, + "grad_norm": 0.9461613392861088, + "learning_rate": 1.6157859326426865e-05, + "loss": 0.6507, + "step": 3817 + }, + { + "epoch": 0.31, + "grad_norm": 0.9065221382423563, + "learning_rate": 1.6155784952478227e-05, + "loss": 0.544, + "step": 3818 + }, + { + "epoch": 0.31, + "grad_norm": 0.9547433861128523, + "learning_rate": 1.6153710151939145e-05, + "loss": 0.5617, + "step": 3819 + }, + { + "epoch": 0.31, + "grad_norm": 0.9843060940520709, + "learning_rate": 1.615163492495341e-05, + "loss": 0.6266, + "step": 3820 + }, + { + "epoch": 0.31, + "grad_norm": 0.9216513794765518, + "learning_rate": 1.6149559271664835e-05, + "loss": 0.5406, + "step": 3821 + }, + { + "epoch": 0.31, + "grad_norm": 0.8997307743019719, + "learning_rate": 1.6147483192217252e-05, + "loss": 0.5437, + "step": 3822 + }, + { + "epoch": 0.31, + "grad_norm": 0.8841110559575263, + "learning_rate": 1.614540668675454e-05, + "loss": 0.5412, + "step": 3823 + }, + { + "epoch": 0.31, + "grad_norm": 0.8190597944819852, + "learning_rate": 1.6143329755420592e-05, + "loss": 0.5199, + "step": 3824 + }, + { + "epoch": 0.31, + "grad_norm": 0.8811575545591398, + "learning_rate": 1.6141252398359347e-05, + "loss": 0.5517, + "step": 3825 + }, + { + "epoch": 0.31, + "grad_norm": 0.9495478226528136, + "learning_rate": 1.6139174615714753e-05, + "loss": 0.6203, + "step": 3826 + }, + { + "epoch": 0.31, + "grad_norm": 0.8797998768085065, + "learning_rate": 1.6137096407630805e-05, + "loss": 0.5824, + "step": 3827 + }, + { + "epoch": 0.31, + "grad_norm": 0.9478838526700144, + "learning_rate": 1.613501777425152e-05, + "loss": 0.5624, + "step": 3828 + }, + { + "epoch": 0.31, + "grad_norm": 0.8509726001493284, + "learning_rate": 1.6132938715720946e-05, + "loss": 0.5516, + "step": 3829 + }, + { + "epoch": 0.31, + "grad_norm": 0.9303984729844625, + "learning_rate": 1.6130859232183155e-05, + "loss": 0.5794, + "step": 3830 + }, + { + "epoch": 0.31, + "grad_norm": 0.8570119246962986, + "learning_rate": 1.612877932378226e-05, + "loss": 0.5966, + "step": 3831 + }, + { + "epoch": 0.31, + "grad_norm": 0.903863068393604, + "learning_rate": 1.6126698990662393e-05, + "loss": 0.6146, + "step": 3832 + }, + { + "epoch": 0.31, + "grad_norm": 0.910826964802132, + "learning_rate": 1.6124618232967722e-05, + "loss": 0.5759, + "step": 3833 + }, + { + "epoch": 0.31, + "grad_norm": 0.8503801686254258, + "learning_rate": 1.6122537050842443e-05, + "loss": 0.5785, + "step": 3834 + }, + { + "epoch": 0.31, + "grad_norm": 0.8646395305483849, + "learning_rate": 1.612045544443077e-05, + "loss": 0.5956, + "step": 3835 + }, + { + "epoch": 0.31, + "grad_norm": 0.8979625542240427, + "learning_rate": 1.611837341387697e-05, + "loss": 0.5682, + "step": 3836 + }, + { + "epoch": 0.31, + "grad_norm": 0.877739368460374, + "learning_rate": 1.6116290959325318e-05, + "loss": 0.567, + "step": 3837 + }, + { + "epoch": 0.31, + "grad_norm": 0.8941178925109462, + "learning_rate": 1.6114208080920125e-05, + "loss": 0.4819, + "step": 3838 + }, + { + "epoch": 0.31, + "grad_norm": 0.9798634018310648, + "learning_rate": 1.6112124778805734e-05, + "loss": 0.6096, + "step": 3839 + }, + { + "epoch": 0.31, + "grad_norm": 0.9032849513071829, + "learning_rate": 1.611004105312652e-05, + "loss": 0.4802, + "step": 3840 + }, + { + "epoch": 0.31, + "grad_norm": 0.9156165559397718, + "learning_rate": 1.610795690402688e-05, + "loss": 0.5825, + "step": 3841 + }, + { + "epoch": 0.31, + "grad_norm": 0.9257049329284944, + "learning_rate": 1.6105872331651245e-05, + "loss": 0.5467, + "step": 3842 + }, + { + "epoch": 0.31, + "grad_norm": 0.9240778610764073, + "learning_rate": 1.610378733614407e-05, + "loss": 0.6246, + "step": 3843 + }, + { + "epoch": 0.31, + "grad_norm": 0.9901370283969468, + "learning_rate": 1.6101701917649852e-05, + "loss": 0.6262, + "step": 3844 + }, + { + "epoch": 0.31, + "grad_norm": 1.0349073072890402, + "learning_rate": 1.60996160763131e-05, + "loss": 0.5461, + "step": 3845 + }, + { + "epoch": 0.31, + "grad_norm": 0.8941247919258477, + "learning_rate": 1.6097529812278364e-05, + "loss": 0.5748, + "step": 3846 + }, + { + "epoch": 0.31, + "grad_norm": 0.8401322694110076, + "learning_rate": 1.6095443125690222e-05, + "loss": 0.4761, + "step": 3847 + }, + { + "epoch": 0.31, + "grad_norm": 0.9457057504482168, + "learning_rate": 1.609335601669328e-05, + "loss": 0.6018, + "step": 3848 + }, + { + "epoch": 0.31, + "grad_norm": 0.898059756360942, + "learning_rate": 1.6091268485432165e-05, + "loss": 0.6018, + "step": 3849 + }, + { + "epoch": 0.31, + "grad_norm": 0.9574558404478161, + "learning_rate": 1.6089180532051552e-05, + "loss": 0.5048, + "step": 3850 + }, + { + "epoch": 0.31, + "grad_norm": 0.9188939124978395, + "learning_rate": 1.6087092156696127e-05, + "loss": 0.5522, + "step": 3851 + }, + { + "epoch": 0.31, + "grad_norm": 0.9691554158650906, + "learning_rate": 1.6085003359510616e-05, + "loss": 0.5651, + "step": 3852 + }, + { + "epoch": 0.31, + "grad_norm": 0.9953360714177077, + "learning_rate": 1.6082914140639768e-05, + "loss": 0.5312, + "step": 3853 + }, + { + "epoch": 0.31, + "grad_norm": 0.9516548297556112, + "learning_rate": 1.6080824500228367e-05, + "loss": 0.5831, + "step": 3854 + }, + { + "epoch": 0.31, + "grad_norm": 0.9744591097344041, + "learning_rate": 1.607873443842122e-05, + "loss": 0.5357, + "step": 3855 + }, + { + "epoch": 0.31, + "grad_norm": 0.9916786491593069, + "learning_rate": 1.607664395536317e-05, + "loss": 0.5184, + "step": 3856 + }, + { + "epoch": 0.31, + "grad_norm": 1.0783830257128157, + "learning_rate": 1.6074553051199084e-05, + "loss": 0.589, + "step": 3857 + }, + { + "epoch": 0.31, + "grad_norm": 0.9327083156351761, + "learning_rate": 1.6072461726073856e-05, + "loss": 0.6027, + "step": 3858 + }, + { + "epoch": 0.31, + "grad_norm": 1.072289447924501, + "learning_rate": 1.6070369980132425e-05, + "loss": 0.6064, + "step": 3859 + }, + { + "epoch": 0.31, + "grad_norm": 0.8299534240728982, + "learning_rate": 1.6068277813519733e-05, + "loss": 0.5044, + "step": 3860 + }, + { + "epoch": 0.31, + "grad_norm": 0.9522751323222246, + "learning_rate": 1.606618522638077e-05, + "loss": 0.537, + "step": 3861 + }, + { + "epoch": 0.31, + "grad_norm": 0.9331646723727852, + "learning_rate": 1.6064092218860553e-05, + "loss": 0.5846, + "step": 3862 + }, + { + "epoch": 0.31, + "grad_norm": 0.9322832535527018, + "learning_rate": 1.6061998791104125e-05, + "loss": 0.5319, + "step": 3863 + }, + { + "epoch": 0.31, + "grad_norm": 0.8936974228375246, + "learning_rate": 1.6059904943256557e-05, + "loss": 0.5557, + "step": 3864 + }, + { + "epoch": 0.31, + "grad_norm": 0.8527992025507102, + "learning_rate": 1.605781067546295e-05, + "loss": 0.5803, + "step": 3865 + }, + { + "epoch": 0.31, + "grad_norm": 0.858642960030146, + "learning_rate": 1.605571598786844e-05, + "loss": 0.5272, + "step": 3866 + }, + { + "epoch": 0.31, + "grad_norm": 0.7971785297359876, + "learning_rate": 1.605362088061818e-05, + "loss": 0.5018, + "step": 3867 + }, + { + "epoch": 0.31, + "grad_norm": 0.8986389122506443, + "learning_rate": 1.6051525353857364e-05, + "loss": 0.5706, + "step": 3868 + }, + { + "epoch": 0.31, + "grad_norm": 0.9365303829334433, + "learning_rate": 1.604942940773121e-05, + "loss": 0.6165, + "step": 3869 + }, + { + "epoch": 0.31, + "grad_norm": 1.0017125459415919, + "learning_rate": 1.604733304238496e-05, + "loss": 0.5739, + "step": 3870 + }, + { + "epoch": 0.31, + "grad_norm": 0.9031465462671923, + "learning_rate": 1.60452362579639e-05, + "loss": 0.6438, + "step": 3871 + }, + { + "epoch": 0.31, + "grad_norm": 1.0046701734849914, + "learning_rate": 1.6043139054613326e-05, + "loss": 0.5714, + "step": 3872 + }, + { + "epoch": 0.31, + "grad_norm": 0.9879187604061295, + "learning_rate": 1.6041041432478573e-05, + "loss": 0.5833, + "step": 3873 + }, + { + "epoch": 0.31, + "grad_norm": 0.9316863553413524, + "learning_rate": 1.603894339170501e-05, + "loss": 0.6159, + "step": 3874 + }, + { + "epoch": 0.31, + "grad_norm": 0.946948808147859, + "learning_rate": 1.6036844932438028e-05, + "loss": 0.5334, + "step": 3875 + }, + { + "epoch": 0.32, + "grad_norm": 0.997715143378706, + "learning_rate": 1.603474605482305e-05, + "loss": 0.6068, + "step": 3876 + }, + { + "epoch": 0.32, + "grad_norm": 1.0769994222963795, + "learning_rate": 1.6032646759005515e-05, + "loss": 0.6482, + "step": 3877 + }, + { + "epoch": 0.32, + "grad_norm": 0.9817520043293302, + "learning_rate": 1.6030547045130912e-05, + "loss": 0.5853, + "step": 3878 + }, + { + "epoch": 0.32, + "grad_norm": 0.9648721660602143, + "learning_rate": 1.6028446913344754e-05, + "loss": 0.5616, + "step": 3879 + }, + { + "epoch": 0.32, + "grad_norm": 0.9278261013834327, + "learning_rate": 1.6026346363792565e-05, + "loss": 0.5881, + "step": 3880 + }, + { + "epoch": 0.32, + "grad_norm": 0.976882951080119, + "learning_rate": 1.6024245396619923e-05, + "loss": 0.5802, + "step": 3881 + }, + { + "epoch": 0.32, + "grad_norm": 0.982550166661968, + "learning_rate": 1.6022144011972415e-05, + "loss": 0.588, + "step": 3882 + }, + { + "epoch": 0.32, + "grad_norm": 0.8778654274821055, + "learning_rate": 1.6020042209995674e-05, + "loss": 0.5717, + "step": 3883 + }, + { + "epoch": 0.32, + "grad_norm": 0.9761246649548841, + "learning_rate": 1.601793999083534e-05, + "loss": 0.5979, + "step": 3884 + }, + { + "epoch": 0.32, + "grad_norm": 0.9127386804169086, + "learning_rate": 1.601583735463711e-05, + "loss": 0.5232, + "step": 3885 + }, + { + "epoch": 0.32, + "grad_norm": 0.943428272432976, + "learning_rate": 1.6013734301546682e-05, + "loss": 0.596, + "step": 3886 + }, + { + "epoch": 0.32, + "grad_norm": 0.8873528540453057, + "learning_rate": 1.6011630831709802e-05, + "loss": 0.4994, + "step": 3887 + }, + { + "epoch": 0.32, + "grad_norm": 1.0020297517768617, + "learning_rate": 1.6009526945272243e-05, + "loss": 0.6594, + "step": 3888 + }, + { + "epoch": 0.32, + "grad_norm": 0.969974182360959, + "learning_rate": 1.600742264237979e-05, + "loss": 0.4831, + "step": 3889 + }, + { + "epoch": 0.32, + "grad_norm": 0.8565424953400176, + "learning_rate": 1.600531792317828e-05, + "loss": 0.5725, + "step": 3890 + }, + { + "epoch": 0.32, + "grad_norm": 0.943954280881943, + "learning_rate": 1.6003212787813566e-05, + "loss": 0.5465, + "step": 3891 + }, + { + "epoch": 0.32, + "grad_norm": 0.8430972074442779, + "learning_rate": 1.6001107236431525e-05, + "loss": 0.5275, + "step": 3892 + }, + { + "epoch": 0.32, + "grad_norm": 0.8757053789419513, + "learning_rate": 1.5999001269178082e-05, + "loss": 0.5166, + "step": 3893 + }, + { + "epoch": 0.32, + "grad_norm": 1.0103937818297317, + "learning_rate": 1.5996894886199167e-05, + "loss": 0.578, + "step": 3894 + }, + { + "epoch": 0.32, + "grad_norm": 0.9004178001640264, + "learning_rate": 1.599478808764076e-05, + "loss": 0.6012, + "step": 3895 + }, + { + "epoch": 0.32, + "grad_norm": 0.9064418582095709, + "learning_rate": 1.5992680873648852e-05, + "loss": 0.5688, + "step": 3896 + }, + { + "epoch": 0.32, + "grad_norm": 0.913659760784911, + "learning_rate": 1.5990573244369478e-05, + "loss": 0.5659, + "step": 3897 + }, + { + "epoch": 0.32, + "grad_norm": 1.0124956917343046, + "learning_rate": 1.5988465199948692e-05, + "loss": 0.6485, + "step": 3898 + }, + { + "epoch": 0.32, + "grad_norm": 0.8901834527319934, + "learning_rate": 1.5986356740532577e-05, + "loss": 0.5982, + "step": 3899 + }, + { + "epoch": 0.32, + "grad_norm": 0.8764696457969312, + "learning_rate": 1.5984247866267253e-05, + "loss": 0.5469, + "step": 3900 + }, + { + "epoch": 0.32, + "grad_norm": 0.9406358564079779, + "learning_rate": 1.5982138577298857e-05, + "loss": 0.5012, + "step": 3901 + }, + { + "epoch": 0.32, + "grad_norm": 0.8906184319109551, + "learning_rate": 1.5980028873773563e-05, + "loss": 0.5248, + "step": 3902 + }, + { + "epoch": 0.32, + "grad_norm": 0.9257365052996489, + "learning_rate": 1.5977918755837576e-05, + "loss": 0.5771, + "step": 3903 + }, + { + "epoch": 0.32, + "grad_norm": 0.8697721075412118, + "learning_rate": 1.5975808223637117e-05, + "loss": 0.6153, + "step": 3904 + }, + { + "epoch": 0.32, + "grad_norm": 0.9582866753626065, + "learning_rate": 1.5973697277318452e-05, + "loss": 0.6421, + "step": 3905 + }, + { + "epoch": 0.32, + "grad_norm": 0.9999765624853517, + "learning_rate": 1.5971585917027864e-05, + "loss": 0.6416, + "step": 3906 + }, + { + "epoch": 0.32, + "grad_norm": 0.9084148944426773, + "learning_rate": 1.596947414291167e-05, + "loss": 0.5444, + "step": 3907 + }, + { + "epoch": 0.32, + "grad_norm": 0.9706443582125923, + "learning_rate": 1.5967361955116207e-05, + "loss": 0.5201, + "step": 3908 + }, + { + "epoch": 0.32, + "grad_norm": 0.9972933163735651, + "learning_rate": 1.596524935378786e-05, + "loss": 0.5934, + "step": 3909 + }, + { + "epoch": 0.32, + "grad_norm": 0.8746715492924646, + "learning_rate": 1.5963136339073023e-05, + "loss": 0.5472, + "step": 3910 + }, + { + "epoch": 0.32, + "grad_norm": 0.8610502255050615, + "learning_rate": 1.5961022911118124e-05, + "loss": 0.6129, + "step": 3911 + }, + { + "epoch": 0.32, + "grad_norm": 0.908968711863703, + "learning_rate": 1.5958909070069627e-05, + "loss": 0.5596, + "step": 3912 + }, + { + "epoch": 0.32, + "grad_norm": 0.8760887786049828, + "learning_rate": 1.5956794816074015e-05, + "loss": 0.5631, + "step": 3913 + }, + { + "epoch": 0.32, + "grad_norm": 0.9159331172803, + "learning_rate": 1.5954680149277807e-05, + "loss": 0.5787, + "step": 3914 + }, + { + "epoch": 0.32, + "grad_norm": 0.8659525657404382, + "learning_rate": 1.5952565069827544e-05, + "loss": 0.47, + "step": 3915 + }, + { + "epoch": 0.32, + "grad_norm": 0.8250608098886822, + "learning_rate": 1.5950449577869807e-05, + "loss": 0.5444, + "step": 3916 + }, + { + "epoch": 0.32, + "grad_norm": 0.8310286356585561, + "learning_rate": 1.594833367355119e-05, + "loss": 0.4955, + "step": 3917 + }, + { + "epoch": 0.32, + "grad_norm": 0.940905437018659, + "learning_rate": 1.5946217357018322e-05, + "loss": 0.61, + "step": 3918 + }, + { + "epoch": 0.32, + "grad_norm": 0.9313010060308121, + "learning_rate": 1.594410062841787e-05, + "loss": 0.5724, + "step": 3919 + }, + { + "epoch": 0.32, + "grad_norm": 0.9309610664870516, + "learning_rate": 1.5941983487896515e-05, + "loss": 0.5331, + "step": 3920 + }, + { + "epoch": 0.32, + "grad_norm": 0.8901671550634485, + "learning_rate": 1.5939865935600976e-05, + "loss": 0.5906, + "step": 3921 + }, + { + "epoch": 0.32, + "grad_norm": 0.9543630353086977, + "learning_rate": 1.5937747971677996e-05, + "loss": 0.5637, + "step": 3922 + }, + { + "epoch": 0.32, + "grad_norm": 0.8939676646410317, + "learning_rate": 1.5935629596274345e-05, + "loss": 0.5744, + "step": 3923 + }, + { + "epoch": 0.32, + "grad_norm": 0.8888549200288502, + "learning_rate": 1.593351080953683e-05, + "loss": 0.5727, + "step": 3924 + }, + { + "epoch": 0.32, + "grad_norm": 0.8968565334675708, + "learning_rate": 1.5931391611612283e-05, + "loss": 0.5727, + "step": 3925 + }, + { + "epoch": 0.32, + "grad_norm": 0.9970235070283097, + "learning_rate": 1.5929272002647554e-05, + "loss": 0.6307, + "step": 3926 + }, + { + "epoch": 0.32, + "grad_norm": 0.8795770579019754, + "learning_rate": 1.5927151982789535e-05, + "loss": 0.5426, + "step": 3927 + }, + { + "epoch": 0.32, + "grad_norm": 0.8724285002585467, + "learning_rate": 1.592503155218514e-05, + "loss": 0.5284, + "step": 3928 + }, + { + "epoch": 0.32, + "grad_norm": 0.909354024769601, + "learning_rate": 1.592291071098132e-05, + "loss": 0.5683, + "step": 3929 + }, + { + "epoch": 0.32, + "grad_norm": 1.004209989415013, + "learning_rate": 1.5920789459325034e-05, + "loss": 0.563, + "step": 3930 + }, + { + "epoch": 0.32, + "grad_norm": 0.9043095354424483, + "learning_rate": 1.5918667797363295e-05, + "loss": 0.5364, + "step": 3931 + }, + { + "epoch": 0.32, + "grad_norm": 0.9581734823349979, + "learning_rate": 1.5916545725243124e-05, + "loss": 0.5751, + "step": 3932 + }, + { + "epoch": 0.32, + "grad_norm": 0.9672287773164598, + "learning_rate": 1.5914423243111582e-05, + "loss": 0.5979, + "step": 3933 + }, + { + "epoch": 0.32, + "grad_norm": 0.9256755293346672, + "learning_rate": 1.591230035111576e-05, + "loss": 0.5826, + "step": 3934 + }, + { + "epoch": 0.32, + "grad_norm": 0.9089934348766084, + "learning_rate": 1.5910177049402762e-05, + "loss": 0.579, + "step": 3935 + }, + { + "epoch": 0.32, + "grad_norm": 0.9235722253896989, + "learning_rate": 1.5908053338119743e-05, + "loss": 0.5909, + "step": 3936 + }, + { + "epoch": 0.32, + "grad_norm": 0.9773177163752066, + "learning_rate": 1.590592921741386e-05, + "loss": 0.6234, + "step": 3937 + }, + { + "epoch": 0.32, + "grad_norm": 0.9177701352719017, + "learning_rate": 1.5903804687432325e-05, + "loss": 0.5361, + "step": 3938 + }, + { + "epoch": 0.32, + "grad_norm": 0.8916835330713223, + "learning_rate": 1.5901679748322367e-05, + "loss": 0.5138, + "step": 3939 + }, + { + "epoch": 0.32, + "grad_norm": 1.0161600179707424, + "learning_rate": 1.5899554400231233e-05, + "loss": 0.545, + "step": 3940 + }, + { + "epoch": 0.32, + "grad_norm": 0.9160973920565795, + "learning_rate": 1.5897428643306207e-05, + "loss": 0.5696, + "step": 3941 + }, + { + "epoch": 0.32, + "grad_norm": 0.9168375963244876, + "learning_rate": 1.5895302477694614e-05, + "loss": 0.5171, + "step": 3942 + }, + { + "epoch": 0.32, + "grad_norm": 0.9596387856101649, + "learning_rate": 1.5893175903543788e-05, + "loss": 0.557, + "step": 3943 + }, + { + "epoch": 0.32, + "grad_norm": 0.8806056535691328, + "learning_rate": 1.5891048921001094e-05, + "loss": 0.5787, + "step": 3944 + }, + { + "epoch": 0.32, + "grad_norm": 0.9184353595815069, + "learning_rate": 1.5888921530213938e-05, + "loss": 0.5489, + "step": 3945 + }, + { + "epoch": 0.32, + "grad_norm": 0.9008051585907707, + "learning_rate": 1.5886793731329743e-05, + "loss": 0.5742, + "step": 3946 + }, + { + "epoch": 0.32, + "grad_norm": 0.9214844580354372, + "learning_rate": 1.5884665524495965e-05, + "loss": 0.633, + "step": 3947 + }, + { + "epoch": 0.32, + "grad_norm": 1.0020293010722414, + "learning_rate": 1.5882536909860086e-05, + "loss": 0.5728, + "step": 3948 + }, + { + "epoch": 0.32, + "grad_norm": 0.8578081842433017, + "learning_rate": 1.5880407887569617e-05, + "loss": 0.5171, + "step": 3949 + }, + { + "epoch": 0.32, + "grad_norm": 0.8587555597349382, + "learning_rate": 1.5878278457772095e-05, + "loss": 0.5575, + "step": 3950 + }, + { + "epoch": 0.32, + "grad_norm": 1.0361111128796203, + "learning_rate": 1.5876148620615094e-05, + "loss": 0.5906, + "step": 3951 + }, + { + "epoch": 0.32, + "grad_norm": 0.8893338118813109, + "learning_rate": 1.5874018376246204e-05, + "loss": 0.5418, + "step": 3952 + }, + { + "epoch": 0.32, + "grad_norm": 0.9951994215390055, + "learning_rate": 1.587188772481305e-05, + "loss": 0.6434, + "step": 3953 + }, + { + "epoch": 0.32, + "grad_norm": 0.8760987314596554, + "learning_rate": 1.586975666646328e-05, + "loss": 0.5719, + "step": 3954 + }, + { + "epoch": 0.32, + "grad_norm": 0.9741708146173176, + "learning_rate": 1.586762520134459e-05, + "loss": 0.5114, + "step": 3955 + }, + { + "epoch": 0.32, + "grad_norm": 0.9053015926217912, + "learning_rate": 1.586549332960467e-05, + "loss": 0.5896, + "step": 3956 + }, + { + "epoch": 0.32, + "grad_norm": 1.0322490493095118, + "learning_rate": 1.586336105139127e-05, + "loss": 0.6103, + "step": 3957 + }, + { + "epoch": 0.32, + "grad_norm": 0.912987608954962, + "learning_rate": 1.5861228366852148e-05, + "loss": 0.5358, + "step": 3958 + }, + { + "epoch": 0.32, + "grad_norm": 0.945625272219544, + "learning_rate": 1.58590952761351e-05, + "loss": 0.547, + "step": 3959 + }, + { + "epoch": 0.32, + "grad_norm": 1.0679542684563517, + "learning_rate": 1.5856961779387945e-05, + "loss": 0.5971, + "step": 3960 + }, + { + "epoch": 0.32, + "grad_norm": 0.8642824221120665, + "learning_rate": 1.5854827876758535e-05, + "loss": 0.5196, + "step": 3961 + }, + { + "epoch": 0.32, + "grad_norm": 0.990179303335577, + "learning_rate": 1.5852693568394743e-05, + "loss": 0.5959, + "step": 3962 + }, + { + "epoch": 0.32, + "grad_norm": 0.8666422528701844, + "learning_rate": 1.585055885444448e-05, + "loss": 0.4666, + "step": 3963 + }, + { + "epoch": 0.32, + "grad_norm": 0.9884825573976187, + "learning_rate": 1.584842373505568e-05, + "loss": 0.5883, + "step": 3964 + }, + { + "epoch": 0.32, + "grad_norm": 0.8598450286175446, + "learning_rate": 1.58462882103763e-05, + "loss": 0.5026, + "step": 3965 + }, + { + "epoch": 0.32, + "grad_norm": 0.8665646932315791, + "learning_rate": 1.5844152280554333e-05, + "loss": 0.4779, + "step": 3966 + }, + { + "epoch": 0.32, + "grad_norm": 0.9650599543426119, + "learning_rate": 1.5842015945737798e-05, + "loss": 0.6127, + "step": 3967 + }, + { + "epoch": 0.32, + "grad_norm": 0.8378359622706862, + "learning_rate": 1.583987920607474e-05, + "loss": 0.5244, + "step": 3968 + }, + { + "epoch": 0.32, + "grad_norm": 0.9662255575979611, + "learning_rate": 1.583774206171323e-05, + "loss": 0.6374, + "step": 3969 + }, + { + "epoch": 0.32, + "grad_norm": 0.8429884326613644, + "learning_rate": 1.5835604512801375e-05, + "loss": 0.553, + "step": 3970 + }, + { + "epoch": 0.32, + "grad_norm": 0.9376564970901118, + "learning_rate": 1.5833466559487305e-05, + "loss": 0.5976, + "step": 3971 + }, + { + "epoch": 0.32, + "grad_norm": 0.9324971037844922, + "learning_rate": 1.5831328201919175e-05, + "loss": 0.6169, + "step": 3972 + }, + { + "epoch": 0.32, + "grad_norm": 1.0128425702126547, + "learning_rate": 1.5829189440245175e-05, + "loss": 0.6142, + "step": 3973 + }, + { + "epoch": 0.32, + "grad_norm": 1.0372645855687352, + "learning_rate": 1.5827050274613512e-05, + "loss": 0.6401, + "step": 3974 + }, + { + "epoch": 0.32, + "grad_norm": 0.9380399027238666, + "learning_rate": 1.5824910705172437e-05, + "loss": 0.5905, + "step": 3975 + }, + { + "epoch": 0.32, + "grad_norm": 0.9415163524530699, + "learning_rate": 1.5822770732070222e-05, + "loss": 0.5514, + "step": 3976 + }, + { + "epoch": 0.32, + "grad_norm": 0.866600935516856, + "learning_rate": 1.5820630355455155e-05, + "loss": 0.5763, + "step": 3977 + }, + { + "epoch": 0.32, + "grad_norm": 0.9152366905386166, + "learning_rate": 1.5818489575475564e-05, + "loss": 0.5363, + "step": 3978 + }, + { + "epoch": 0.32, + "grad_norm": 0.9422393689041517, + "learning_rate": 1.5816348392279814e-05, + "loss": 0.6887, + "step": 3979 + }, + { + "epoch": 0.32, + "grad_norm": 0.9066783559458047, + "learning_rate": 1.5814206806016273e-05, + "loss": 0.6047, + "step": 3980 + }, + { + "epoch": 0.32, + "grad_norm": 0.9600494430271688, + "learning_rate": 1.581206481683336e-05, + "loss": 0.6092, + "step": 3981 + }, + { + "epoch": 0.32, + "grad_norm": 0.8981869604826622, + "learning_rate": 1.580992242487951e-05, + "loss": 0.5736, + "step": 3982 + }, + { + "epoch": 0.32, + "grad_norm": 0.8496609966141786, + "learning_rate": 1.580777963030319e-05, + "loss": 0.5108, + "step": 3983 + }, + { + "epoch": 0.32, + "grad_norm": 0.8995294057437336, + "learning_rate": 1.5805636433252892e-05, + "loss": 0.5595, + "step": 3984 + }, + { + "epoch": 0.32, + "grad_norm": 0.8649395089955623, + "learning_rate": 1.5803492833877143e-05, + "loss": 0.5506, + "step": 3985 + }, + { + "epoch": 0.32, + "grad_norm": 0.9194681120880075, + "learning_rate": 1.5801348832324483e-05, + "loss": 0.5092, + "step": 3986 + }, + { + "epoch": 0.32, + "grad_norm": 0.9022113242229857, + "learning_rate": 1.5799204428743497e-05, + "loss": 0.5777, + "step": 3987 + }, + { + "epoch": 0.32, + "grad_norm": 0.9521786178423841, + "learning_rate": 1.5797059623282787e-05, + "loss": 0.5979, + "step": 3988 + }, + { + "epoch": 0.32, + "grad_norm": 0.975208854083481, + "learning_rate": 1.5794914416090988e-05, + "loss": 0.5444, + "step": 3989 + }, + { + "epoch": 0.32, + "grad_norm": 1.0034484704900803, + "learning_rate": 1.579276880731676e-05, + "loss": 0.6227, + "step": 3990 + }, + { + "epoch": 0.32, + "grad_norm": 1.0597278667233871, + "learning_rate": 1.579062279710879e-05, + "loss": 0.6034, + "step": 3991 + }, + { + "epoch": 0.32, + "grad_norm": 0.8784580768653654, + "learning_rate": 1.57884763856158e-05, + "loss": 0.5405, + "step": 3992 + }, + { + "epoch": 0.32, + "grad_norm": 0.9531035237183869, + "learning_rate": 1.5786329572986527e-05, + "loss": 0.6006, + "step": 3993 + }, + { + "epoch": 0.32, + "grad_norm": 0.9197563734914485, + "learning_rate": 1.578418235936975e-05, + "loss": 0.5324, + "step": 3994 + }, + { + "epoch": 0.32, + "grad_norm": 0.9454280511184845, + "learning_rate": 1.5782034744914264e-05, + "loss": 0.5458, + "step": 3995 + }, + { + "epoch": 0.32, + "grad_norm": 0.8474779750455188, + "learning_rate": 1.57798867297689e-05, + "loss": 0.5221, + "step": 3996 + }, + { + "epoch": 0.32, + "grad_norm": 0.9766094107559433, + "learning_rate": 1.5777738314082514e-05, + "loss": 0.6148, + "step": 3997 + }, + { + "epoch": 0.32, + "grad_norm": 0.9217892458525768, + "learning_rate": 1.5775589498003984e-05, + "loss": 0.5567, + "step": 3998 + }, + { + "epoch": 0.33, + "grad_norm": 0.9409235290685977, + "learning_rate": 1.5773440281682226e-05, + "loss": 0.5966, + "step": 3999 + }, + { + "epoch": 0.33, + "grad_norm": 0.904426711240461, + "learning_rate": 1.577129066526618e-05, + "loss": 0.553, + "step": 4000 + }, + { + "epoch": 0.33, + "grad_norm": 0.9370662463188123, + "learning_rate": 1.5769140648904806e-05, + "loss": 0.5454, + "step": 4001 + }, + { + "epoch": 0.33, + "grad_norm": 0.8846935856523952, + "learning_rate": 1.5766990232747106e-05, + "loss": 0.5365, + "step": 4002 + }, + { + "epoch": 0.33, + "grad_norm": 0.9434664117402615, + "learning_rate": 1.5764839416942097e-05, + "loss": 0.5992, + "step": 4003 + }, + { + "epoch": 0.33, + "grad_norm": 1.3096888551499557, + "learning_rate": 1.576268820163883e-05, + "loss": 0.521, + "step": 4004 + }, + { + "epoch": 0.33, + "grad_norm": 0.9698155276466377, + "learning_rate": 1.576053658698638e-05, + "loss": 0.6299, + "step": 4005 + }, + { + "epoch": 0.33, + "grad_norm": 0.9371335851184125, + "learning_rate": 1.5758384573133857e-05, + "loss": 0.5855, + "step": 4006 + }, + { + "epoch": 0.33, + "grad_norm": 1.0166311336574105, + "learning_rate": 1.5756232160230388e-05, + "loss": 0.6351, + "step": 4007 + }, + { + "epoch": 0.33, + "grad_norm": 0.9017351451582668, + "learning_rate": 1.5754079348425137e-05, + "loss": 0.5309, + "step": 4008 + }, + { + "epoch": 0.33, + "grad_norm": 1.0872595226025266, + "learning_rate": 1.575192613786729e-05, + "loss": 0.5589, + "step": 4009 + }, + { + "epoch": 0.33, + "grad_norm": 0.9429036522959239, + "learning_rate": 1.574977252870607e-05, + "loss": 0.5597, + "step": 4010 + }, + { + "epoch": 0.33, + "grad_norm": 1.006473815437657, + "learning_rate": 1.5747618521090706e-05, + "loss": 0.6188, + "step": 4011 + }, + { + "epoch": 0.33, + "grad_norm": 0.9831280828790899, + "learning_rate": 1.574546411517048e-05, + "loss": 0.5073, + "step": 4012 + }, + { + "epoch": 0.33, + "grad_norm": 0.9614882582761742, + "learning_rate": 1.5743309311094687e-05, + "loss": 0.5916, + "step": 4013 + }, + { + "epoch": 0.33, + "grad_norm": 0.9834440770393784, + "learning_rate": 1.574115410901265e-05, + "loss": 0.5796, + "step": 4014 + }, + { + "epoch": 0.33, + "grad_norm": 0.862865878132095, + "learning_rate": 1.573899850907373e-05, + "loss": 0.5184, + "step": 4015 + }, + { + "epoch": 0.33, + "grad_norm": 0.9369647382877944, + "learning_rate": 1.5736842511427302e-05, + "loss": 0.5914, + "step": 4016 + }, + { + "epoch": 0.33, + "grad_norm": 0.8918627312384875, + "learning_rate": 1.5734686116222775e-05, + "loss": 0.502, + "step": 4017 + }, + { + "epoch": 0.33, + "grad_norm": 0.976668862121526, + "learning_rate": 1.573252932360959e-05, + "loss": 0.5887, + "step": 4018 + }, + { + "epoch": 0.33, + "grad_norm": 0.8604324291777963, + "learning_rate": 1.5730372133737206e-05, + "loss": 0.5676, + "step": 4019 + }, + { + "epoch": 0.33, + "grad_norm": 0.9355398542457567, + "learning_rate": 1.5728214546755117e-05, + "loss": 0.5784, + "step": 4020 + }, + { + "epoch": 0.33, + "grad_norm": 1.0053396355524888, + "learning_rate": 1.572605656281284e-05, + "loss": 0.5561, + "step": 4021 + }, + { + "epoch": 0.33, + "grad_norm": 0.8865428977414068, + "learning_rate": 1.572389818205992e-05, + "loss": 0.5344, + "step": 4022 + }, + { + "epoch": 0.33, + "grad_norm": 0.917616140853694, + "learning_rate": 1.5721739404645937e-05, + "loss": 0.5448, + "step": 4023 + }, + { + "epoch": 0.33, + "grad_norm": 2.0039270053413536, + "learning_rate": 1.5719580230720485e-05, + "loss": 0.5648, + "step": 4024 + }, + { + "epoch": 0.33, + "grad_norm": 0.8715238705686509, + "learning_rate": 1.57174206604332e-05, + "loss": 0.6044, + "step": 4025 + }, + { + "epoch": 0.33, + "grad_norm": 0.9609013329719558, + "learning_rate": 1.571526069393373e-05, + "loss": 0.5953, + "step": 4026 + }, + { + "epoch": 0.33, + "grad_norm": 0.9475604172383607, + "learning_rate": 1.5713100331371768e-05, + "loss": 0.5605, + "step": 4027 + }, + { + "epoch": 0.33, + "grad_norm": 0.8078213940067273, + "learning_rate": 1.5710939572897018e-05, + "loss": 0.5468, + "step": 4028 + }, + { + "epoch": 0.33, + "grad_norm": 0.9691697196000543, + "learning_rate": 1.570877841865922e-05, + "loss": 0.5711, + "step": 4029 + }, + { + "epoch": 0.33, + "grad_norm": 1.005924058700349, + "learning_rate": 1.5706616868808142e-05, + "loss": 0.5627, + "step": 4030 + }, + { + "epoch": 0.33, + "grad_norm": 0.8603428183105295, + "learning_rate": 1.5704454923493577e-05, + "loss": 0.5344, + "step": 4031 + }, + { + "epoch": 0.33, + "grad_norm": 1.0494686065684906, + "learning_rate": 1.570229258286535e-05, + "loss": 0.5895, + "step": 4032 + }, + { + "epoch": 0.33, + "grad_norm": 0.8563575513133033, + "learning_rate": 1.5700129847073298e-05, + "loss": 0.5631, + "step": 4033 + }, + { + "epoch": 0.33, + "grad_norm": 1.0982265801325355, + "learning_rate": 1.5697966716267308e-05, + "loss": 0.6157, + "step": 4034 + }, + { + "epoch": 0.33, + "grad_norm": 0.8426234383113499, + "learning_rate": 1.5695803190597275e-05, + "loss": 0.5765, + "step": 4035 + }, + { + "epoch": 0.33, + "grad_norm": 0.9077511704024396, + "learning_rate": 1.5693639270213138e-05, + "loss": 0.5276, + "step": 4036 + }, + { + "epoch": 0.33, + "grad_norm": 0.8654428007933888, + "learning_rate": 1.5691474955264848e-05, + "loss": 0.4938, + "step": 4037 + }, + { + "epoch": 0.33, + "grad_norm": 0.9792980677325999, + "learning_rate": 1.568931024590239e-05, + "loss": 0.5882, + "step": 4038 + }, + { + "epoch": 0.33, + "grad_norm": 1.0353408848047134, + "learning_rate": 1.568714514227578e-05, + "loss": 0.6413, + "step": 4039 + }, + { + "epoch": 0.33, + "grad_norm": 0.8556489487154524, + "learning_rate": 1.5684979644535053e-05, + "loss": 0.5847, + "step": 4040 + }, + { + "epoch": 0.33, + "grad_norm": 0.9468726518340129, + "learning_rate": 1.5682813752830284e-05, + "loss": 0.5498, + "step": 4041 + }, + { + "epoch": 0.33, + "grad_norm": 0.9710687607873638, + "learning_rate": 1.568064746731156e-05, + "loss": 0.5876, + "step": 4042 + }, + { + "epoch": 0.33, + "grad_norm": 0.8165640547630525, + "learning_rate": 1.5678480788129003e-05, + "loss": 0.5817, + "step": 4043 + }, + { + "epoch": 0.33, + "grad_norm": 0.8358262932296227, + "learning_rate": 1.567631371543277e-05, + "loss": 0.5138, + "step": 4044 + }, + { + "epoch": 0.33, + "grad_norm": 0.8043492548198713, + "learning_rate": 1.5674146249373027e-05, + "loss": 0.5326, + "step": 4045 + }, + { + "epoch": 0.33, + "grad_norm": 0.8762669905406324, + "learning_rate": 1.5671978390099985e-05, + "loss": 0.5032, + "step": 4046 + }, + { + "epoch": 0.33, + "grad_norm": 0.962549549006086, + "learning_rate": 1.566981013776387e-05, + "loss": 0.5345, + "step": 4047 + }, + { + "epoch": 0.33, + "grad_norm": 0.8584925617245548, + "learning_rate": 1.5667641492514942e-05, + "loss": 0.4845, + "step": 4048 + }, + { + "epoch": 0.33, + "grad_norm": 0.9768208555080496, + "learning_rate": 1.5665472454503484e-05, + "loss": 0.6085, + "step": 4049 + }, + { + "epoch": 0.33, + "grad_norm": 0.8910712523051837, + "learning_rate": 1.566330302387981e-05, + "loss": 0.5695, + "step": 4050 + }, + { + "epoch": 0.33, + "grad_norm": 0.8993957926471542, + "learning_rate": 1.566113320079426e-05, + "loss": 0.5671, + "step": 4051 + }, + { + "epoch": 0.33, + "grad_norm": 0.9509042340172726, + "learning_rate": 1.5658962985397202e-05, + "loss": 0.5668, + "step": 4052 + }, + { + "epoch": 0.33, + "grad_norm": 0.8274785334369242, + "learning_rate": 1.565679237783903e-05, + "loss": 0.5048, + "step": 4053 + }, + { + "epoch": 0.33, + "grad_norm": 0.8602365961998485, + "learning_rate": 1.565462137827016e-05, + "loss": 0.5589, + "step": 4054 + }, + { + "epoch": 0.33, + "grad_norm": 0.9312739718943414, + "learning_rate": 1.5652449986841048e-05, + "loss": 0.574, + "step": 4055 + }, + { + "epoch": 0.33, + "grad_norm": 0.8972888462688227, + "learning_rate": 1.5650278203702162e-05, + "loss": 0.5738, + "step": 4056 + }, + { + "epoch": 0.33, + "grad_norm": 0.9849922783424293, + "learning_rate": 1.564810602900401e-05, + "loss": 0.6366, + "step": 4057 + }, + { + "epoch": 0.33, + "grad_norm": 0.9266568698973598, + "learning_rate": 1.564593346289712e-05, + "loss": 0.5807, + "step": 4058 + }, + { + "epoch": 0.33, + "grad_norm": 0.9689728059389319, + "learning_rate": 1.564376050553205e-05, + "loss": 0.6002, + "step": 4059 + }, + { + "epoch": 0.33, + "grad_norm": 0.9657283266055987, + "learning_rate": 1.564158715705938e-05, + "loss": 0.5452, + "step": 4060 + }, + { + "epoch": 0.33, + "grad_norm": 0.9252710654263362, + "learning_rate": 1.563941341762973e-05, + "loss": 0.6325, + "step": 4061 + }, + { + "epoch": 0.33, + "grad_norm": 0.9067245498427078, + "learning_rate": 1.5637239287393725e-05, + "loss": 0.601, + "step": 4062 + }, + { + "epoch": 0.33, + "grad_norm": 0.9155849496185144, + "learning_rate": 1.5635064766502042e-05, + "loss": 0.5615, + "step": 4063 + }, + { + "epoch": 0.33, + "grad_norm": 0.8936140240745216, + "learning_rate": 1.563288985510537e-05, + "loss": 0.5471, + "step": 4064 + }, + { + "epoch": 0.33, + "grad_norm": 0.942245882106811, + "learning_rate": 1.5630714553354425e-05, + "loss": 0.5919, + "step": 4065 + }, + { + "epoch": 0.33, + "grad_norm": 0.9363613460777342, + "learning_rate": 1.5628538861399956e-05, + "loss": 0.5929, + "step": 4066 + }, + { + "epoch": 0.33, + "grad_norm": 1.0024484438838643, + "learning_rate": 1.5626362779392738e-05, + "loss": 0.6062, + "step": 4067 + }, + { + "epoch": 0.33, + "grad_norm": 0.8589020510416536, + "learning_rate": 1.562418630748357e-05, + "loss": 0.5201, + "step": 4068 + }, + { + "epoch": 0.33, + "grad_norm": 0.859399789929067, + "learning_rate": 1.5622009445823274e-05, + "loss": 0.5157, + "step": 4069 + }, + { + "epoch": 0.33, + "grad_norm": 0.9655223138192177, + "learning_rate": 1.5619832194562716e-05, + "loss": 0.5716, + "step": 4070 + }, + { + "epoch": 0.33, + "grad_norm": 0.9628342084902699, + "learning_rate": 1.561765455385277e-05, + "loss": 0.5767, + "step": 4071 + }, + { + "epoch": 0.33, + "grad_norm": 0.9572679965465991, + "learning_rate": 1.5615476523844346e-05, + "loss": 0.5719, + "step": 4072 + }, + { + "epoch": 0.33, + "grad_norm": 0.8314228385114546, + "learning_rate": 1.5613298104688383e-05, + "loss": 0.4915, + "step": 4073 + }, + { + "epoch": 0.33, + "grad_norm": 0.8856555588832423, + "learning_rate": 1.5611119296535836e-05, + "loss": 0.4953, + "step": 4074 + }, + { + "epoch": 0.33, + "grad_norm": 0.8825697950973398, + "learning_rate": 1.56089400995377e-05, + "loss": 0.5186, + "step": 4075 + }, + { + "epoch": 0.33, + "grad_norm": 1.0123348773261922, + "learning_rate": 1.560676051384499e-05, + "loss": 0.6382, + "step": 4076 + }, + { + "epoch": 0.33, + "grad_norm": 0.9949471643486758, + "learning_rate": 1.560458053960875e-05, + "loss": 0.5926, + "step": 4077 + }, + { + "epoch": 0.33, + "grad_norm": 0.9208384971441007, + "learning_rate": 1.560240017698005e-05, + "loss": 0.5815, + "step": 4078 + }, + { + "epoch": 0.33, + "grad_norm": 1.0898988852025966, + "learning_rate": 1.5600219426109986e-05, + "loss": 0.6395, + "step": 4079 + }, + { + "epoch": 0.33, + "grad_norm": 0.9118273390877075, + "learning_rate": 1.5598038287149684e-05, + "loss": 0.6092, + "step": 4080 + }, + { + "epoch": 0.33, + "grad_norm": 0.8525027875549489, + "learning_rate": 1.5595856760250296e-05, + "loss": 0.5162, + "step": 4081 + }, + { + "epoch": 0.33, + "grad_norm": 0.9278775945269585, + "learning_rate": 1.5593674845562994e-05, + "loss": 0.5954, + "step": 4082 + }, + { + "epoch": 0.33, + "grad_norm": 1.330704959252747, + "learning_rate": 1.559149254323899e-05, + "loss": 0.6494, + "step": 4083 + }, + { + "epoch": 0.33, + "grad_norm": 0.9244176454658858, + "learning_rate": 1.558930985342951e-05, + "loss": 0.5549, + "step": 4084 + }, + { + "epoch": 0.33, + "grad_norm": 0.9455312227753272, + "learning_rate": 1.5587126776285818e-05, + "loss": 0.6124, + "step": 4085 + }, + { + "epoch": 0.33, + "grad_norm": 0.9137367270037585, + "learning_rate": 1.5584943311959197e-05, + "loss": 0.5225, + "step": 4086 + }, + { + "epoch": 0.33, + "grad_norm": 0.9787600920662352, + "learning_rate": 1.5582759460600952e-05, + "loss": 0.5767, + "step": 4087 + }, + { + "epoch": 0.33, + "grad_norm": 0.8621461831215322, + "learning_rate": 1.5580575222362435e-05, + "loss": 0.5406, + "step": 4088 + }, + { + "epoch": 0.33, + "grad_norm": 0.8804137554755774, + "learning_rate": 1.5578390597395e-05, + "loss": 0.5362, + "step": 4089 + }, + { + "epoch": 0.33, + "grad_norm": 0.8723709928624246, + "learning_rate": 1.5576205585850052e-05, + "loss": 0.4771, + "step": 4090 + }, + { + "epoch": 0.33, + "grad_norm": 0.9936210472826803, + "learning_rate": 1.5574020187878994e-05, + "loss": 0.603, + "step": 4091 + }, + { + "epoch": 0.33, + "grad_norm": 1.0090338812951476, + "learning_rate": 1.557183440363329e-05, + "loss": 0.6171, + "step": 4092 + }, + { + "epoch": 0.33, + "grad_norm": 0.8932855373353723, + "learning_rate": 1.5569648233264395e-05, + "loss": 0.5652, + "step": 4093 + }, + { + "epoch": 0.33, + "grad_norm": 0.8737280526971734, + "learning_rate": 1.556746167692382e-05, + "loss": 0.5322, + "step": 4094 + }, + { + "epoch": 0.33, + "grad_norm": 0.9708220775517294, + "learning_rate": 1.5565274734763094e-05, + "loss": 0.5867, + "step": 4095 + }, + { + "epoch": 0.33, + "grad_norm": 0.9697497442178686, + "learning_rate": 1.5563087406933762e-05, + "loss": 0.5511, + "step": 4096 + }, + { + "epoch": 0.33, + "grad_norm": 0.9903770214845385, + "learning_rate": 1.5560899693587405e-05, + "loss": 0.6059, + "step": 4097 + }, + { + "epoch": 0.33, + "grad_norm": 0.9203756944219564, + "learning_rate": 1.5558711594875634e-05, + "loss": 0.4967, + "step": 4098 + }, + { + "epoch": 0.33, + "grad_norm": 0.942205169285279, + "learning_rate": 1.555652311095008e-05, + "loss": 0.5684, + "step": 4099 + }, + { + "epoch": 0.33, + "grad_norm": 0.9600421552335939, + "learning_rate": 1.5554334241962403e-05, + "loss": 0.5922, + "step": 4100 + }, + { + "epoch": 0.33, + "grad_norm": 0.9174980936365247, + "learning_rate": 1.5552144988064292e-05, + "loss": 0.5991, + "step": 4101 + }, + { + "epoch": 0.33, + "grad_norm": 0.8450449715915833, + "learning_rate": 1.5549955349407456e-05, + "loss": 0.5217, + "step": 4102 + }, + { + "epoch": 0.33, + "grad_norm": 0.8690691429197966, + "learning_rate": 1.5547765326143634e-05, + "loss": 0.5026, + "step": 4103 + }, + { + "epoch": 0.33, + "grad_norm": 1.0294790378362721, + "learning_rate": 1.5545574918424602e-05, + "loss": 0.6469, + "step": 4104 + }, + { + "epoch": 0.33, + "grad_norm": 0.8553897917778106, + "learning_rate": 1.5543384126402144e-05, + "loss": 0.5178, + "step": 4105 + }, + { + "epoch": 0.33, + "grad_norm": 0.963476645302054, + "learning_rate": 1.554119295022808e-05, + "loss": 0.5804, + "step": 4106 + }, + { + "epoch": 0.33, + "grad_norm": 0.8640671675579383, + "learning_rate": 1.5539001390054265e-05, + "loss": 0.5261, + "step": 4107 + }, + { + "epoch": 0.33, + "grad_norm": 0.9682608000185056, + "learning_rate": 1.5536809446032562e-05, + "loss": 0.6721, + "step": 4108 + }, + { + "epoch": 0.33, + "grad_norm": 0.8860882499843543, + "learning_rate": 1.5534617118314882e-05, + "loss": 0.5743, + "step": 4109 + }, + { + "epoch": 0.33, + "grad_norm": 0.9069502189222868, + "learning_rate": 1.553242440705314e-05, + "loss": 0.5516, + "step": 4110 + }, + { + "epoch": 0.33, + "grad_norm": 0.8936217039676828, + "learning_rate": 1.5530231312399294e-05, + "loss": 0.5435, + "step": 4111 + }, + { + "epoch": 0.33, + "grad_norm": 0.9706173889045864, + "learning_rate": 1.5528037834505322e-05, + "loss": 0.5701, + "step": 4112 + }, + { + "epoch": 0.33, + "grad_norm": 0.9178913160396679, + "learning_rate": 1.5525843973523237e-05, + "loss": 0.5524, + "step": 4113 + }, + { + "epoch": 0.33, + "grad_norm": 0.8145499993607632, + "learning_rate": 1.552364972960506e-05, + "loss": 0.5285, + "step": 4114 + }, + { + "epoch": 0.33, + "grad_norm": 1.016269762503253, + "learning_rate": 1.552145510290286e-05, + "loss": 0.5321, + "step": 4115 + }, + { + "epoch": 0.33, + "grad_norm": 0.908990601168826, + "learning_rate": 1.5519260093568717e-05, + "loss": 0.5547, + "step": 4116 + }, + { + "epoch": 0.33, + "grad_norm": 0.9328424886518241, + "learning_rate": 1.5517064701754744e-05, + "loss": 0.6041, + "step": 4117 + }, + { + "epoch": 0.33, + "grad_norm": 0.9016063684659464, + "learning_rate": 1.5514868927613084e-05, + "loss": 0.5758, + "step": 4118 + }, + { + "epoch": 0.33, + "grad_norm": 0.8420443749720986, + "learning_rate": 1.5512672771295898e-05, + "loss": 0.5029, + "step": 4119 + }, + { + "epoch": 0.33, + "grad_norm": 0.9588642379929366, + "learning_rate": 1.5510476232955376e-05, + "loss": 0.588, + "step": 4120 + }, + { + "epoch": 0.33, + "grad_norm": 0.9003302000099962, + "learning_rate": 1.5508279312743742e-05, + "loss": 0.5722, + "step": 4121 + }, + { + "epoch": 0.34, + "grad_norm": 0.877570962504461, + "learning_rate": 1.5506082010813237e-05, + "loss": 0.4978, + "step": 4122 + }, + { + "epoch": 0.34, + "grad_norm": 0.9989799140340428, + "learning_rate": 1.550388432731613e-05, + "loss": 0.5816, + "step": 4123 + }, + { + "epoch": 0.34, + "grad_norm": 0.8644779147210184, + "learning_rate": 1.550168626240472e-05, + "loss": 0.5078, + "step": 4124 + }, + { + "epoch": 0.34, + "grad_norm": 0.9713792974934381, + "learning_rate": 1.549948781623134e-05, + "loss": 0.6444, + "step": 4125 + }, + { + "epoch": 0.34, + "grad_norm": 0.9157006453630713, + "learning_rate": 1.5497288988948326e-05, + "loss": 0.5919, + "step": 4126 + }, + { + "epoch": 0.34, + "grad_norm": 0.9740797882912834, + "learning_rate": 1.5495089780708062e-05, + "loss": 0.6244, + "step": 4127 + }, + { + "epoch": 0.34, + "grad_norm": 0.9011107822348161, + "learning_rate": 1.5492890191662954e-05, + "loss": 0.6247, + "step": 4128 + }, + { + "epoch": 0.34, + "grad_norm": 1.0004936320232625, + "learning_rate": 1.5490690221965424e-05, + "loss": 0.5899, + "step": 4129 + }, + { + "epoch": 0.34, + "grad_norm": 0.9769522662893291, + "learning_rate": 1.5488489871767928e-05, + "loss": 0.5603, + "step": 4130 + }, + { + "epoch": 0.34, + "grad_norm": 0.9531619489136708, + "learning_rate": 1.5486289141222955e-05, + "loss": 0.546, + "step": 4131 + }, + { + "epoch": 0.34, + "grad_norm": 0.8588746366213443, + "learning_rate": 1.5484088030483015e-05, + "loss": 0.548, + "step": 4132 + }, + { + "epoch": 0.34, + "grad_norm": 0.9155465841984228, + "learning_rate": 1.5481886539700636e-05, + "loss": 0.4894, + "step": 4133 + }, + { + "epoch": 0.34, + "grad_norm": 0.924147240967755, + "learning_rate": 1.5479684669028384e-05, + "loss": 0.5814, + "step": 4134 + }, + { + "epoch": 0.34, + "grad_norm": 0.8825588133700512, + "learning_rate": 1.5477482418618844e-05, + "loss": 0.625, + "step": 4135 + }, + { + "epoch": 0.34, + "grad_norm": 1.0442236829760536, + "learning_rate": 1.547527978862463e-05, + "loss": 0.6079, + "step": 4136 + }, + { + "epoch": 0.34, + "grad_norm": 0.9227304530173703, + "learning_rate": 1.5473076779198385e-05, + "loss": 0.5721, + "step": 4137 + }, + { + "epoch": 0.34, + "grad_norm": 1.0268902487716691, + "learning_rate": 1.547087339049277e-05, + "loss": 0.6532, + "step": 4138 + }, + { + "epoch": 0.34, + "grad_norm": 0.9282329564893024, + "learning_rate": 1.5468669622660487e-05, + "loss": 0.5388, + "step": 4139 + }, + { + "epoch": 0.34, + "grad_norm": 0.9878857633469456, + "learning_rate": 1.5466465475854246e-05, + "loss": 0.5986, + "step": 4140 + }, + { + "epoch": 0.34, + "grad_norm": 1.0095349326177143, + "learning_rate": 1.54642609502268e-05, + "loss": 0.5704, + "step": 4141 + }, + { + "epoch": 0.34, + "grad_norm": 0.8335990919695064, + "learning_rate": 1.5462056045930912e-05, + "loss": 0.5412, + "step": 4142 + }, + { + "epoch": 0.34, + "grad_norm": 0.9715117775274271, + "learning_rate": 1.5459850763119386e-05, + "loss": 0.5278, + "step": 4143 + }, + { + "epoch": 0.34, + "grad_norm": 0.9114149090614846, + "learning_rate": 1.5457645101945046e-05, + "loss": 0.5223, + "step": 4144 + }, + { + "epoch": 0.34, + "grad_norm": 0.858592720845518, + "learning_rate": 1.5455439062560743e-05, + "loss": 0.5791, + "step": 4145 + }, + { + "epoch": 0.34, + "grad_norm": 0.8942255917626747, + "learning_rate": 1.5453232645119348e-05, + "loss": 0.5552, + "step": 4146 + }, + { + "epoch": 0.34, + "grad_norm": 0.8826034287798566, + "learning_rate": 1.5451025849773773e-05, + "loss": 0.5712, + "step": 4147 + }, + { + "epoch": 0.34, + "grad_norm": 0.9978211602085963, + "learning_rate": 1.544881867667694e-05, + "loss": 0.587, + "step": 4148 + }, + { + "epoch": 0.34, + "grad_norm": 0.9534289695842948, + "learning_rate": 1.5446611125981804e-05, + "loss": 0.5829, + "step": 4149 + }, + { + "epoch": 0.34, + "grad_norm": 0.9171045438073293, + "learning_rate": 1.5444403197841345e-05, + "loss": 0.6322, + "step": 4150 + }, + { + "epoch": 0.34, + "grad_norm": 0.9103267468648931, + "learning_rate": 1.5442194892408583e-05, + "loss": 0.5276, + "step": 4151 + }, + { + "epoch": 0.34, + "grad_norm": 0.9318835958199996, + "learning_rate": 1.5439986209836532e-05, + "loss": 0.5889, + "step": 4152 + }, + { + "epoch": 0.34, + "grad_norm": 0.9935823989772233, + "learning_rate": 1.5437777150278268e-05, + "loss": 0.5639, + "step": 4153 + }, + { + "epoch": 0.34, + "grad_norm": 1.0297272006608487, + "learning_rate": 1.543556771388687e-05, + "loss": 0.6397, + "step": 4154 + }, + { + "epoch": 0.34, + "grad_norm": 0.9775512865777397, + "learning_rate": 1.543335790081545e-05, + "loss": 0.5728, + "step": 4155 + }, + { + "epoch": 0.34, + "grad_norm": 0.8264711311001698, + "learning_rate": 1.5431147711217147e-05, + "loss": 0.5647, + "step": 4156 + }, + { + "epoch": 0.34, + "grad_norm": 0.7969675893425409, + "learning_rate": 1.5428937145245126e-05, + "loss": 0.5264, + "step": 4157 + }, + { + "epoch": 0.34, + "grad_norm": 0.9149958151946387, + "learning_rate": 1.542672620305257e-05, + "loss": 0.5749, + "step": 4158 + }, + { + "epoch": 0.34, + "grad_norm": 0.8931961491044399, + "learning_rate": 1.542451488479271e-05, + "loss": 0.5328, + "step": 4159 + }, + { + "epoch": 0.34, + "grad_norm": 0.8109905646646205, + "learning_rate": 1.5422303190618776e-05, + "loss": 0.5047, + "step": 4160 + }, + { + "epoch": 0.34, + "grad_norm": 0.9536185768596526, + "learning_rate": 1.5420091120684042e-05, + "loss": 0.5677, + "step": 4161 + }, + { + "epoch": 0.34, + "grad_norm": 0.9340327406436755, + "learning_rate": 1.54178786751418e-05, + "loss": 0.5517, + "step": 4162 + }, + { + "epoch": 0.34, + "grad_norm": 0.8801478055544735, + "learning_rate": 1.541566585414537e-05, + "loss": 0.5112, + "step": 4163 + }, + { + "epoch": 0.34, + "grad_norm": 0.9203855454083592, + "learning_rate": 1.5413452657848104e-05, + "loss": 0.5836, + "step": 4164 + }, + { + "epoch": 0.34, + "grad_norm": 0.9008530027903892, + "learning_rate": 1.5411239086403367e-05, + "loss": 0.5972, + "step": 4165 + }, + { + "epoch": 0.34, + "grad_norm": 1.0356456643420995, + "learning_rate": 1.540902513996456e-05, + "loss": 0.597, + "step": 4166 + }, + { + "epoch": 0.34, + "grad_norm": 0.8433055793215191, + "learning_rate": 1.5406810818685113e-05, + "loss": 0.5037, + "step": 4167 + }, + { + "epoch": 0.34, + "grad_norm": 1.0015286153463043, + "learning_rate": 1.5404596122718473e-05, + "loss": 0.581, + "step": 4168 + }, + { + "epoch": 0.34, + "grad_norm": 0.9712135995909356, + "learning_rate": 1.540238105221811e-05, + "loss": 0.5599, + "step": 4169 + }, + { + "epoch": 0.34, + "grad_norm": 0.9707839063979106, + "learning_rate": 1.5400165607337534e-05, + "loss": 0.5304, + "step": 4170 + }, + { + "epoch": 0.34, + "grad_norm": 0.9938465618035094, + "learning_rate": 1.5397949788230275e-05, + "loss": 0.575, + "step": 4171 + }, + { + "epoch": 0.34, + "grad_norm": 1.1246282226906894, + "learning_rate": 1.539573359504988e-05, + "loss": 0.5929, + "step": 4172 + }, + { + "epoch": 0.34, + "grad_norm": 0.952586657296879, + "learning_rate": 1.539351702794993e-05, + "loss": 0.5333, + "step": 4173 + }, + { + "epoch": 0.34, + "grad_norm": 0.976592921038152, + "learning_rate": 1.539130008708404e-05, + "loss": 0.6206, + "step": 4174 + }, + { + "epoch": 0.34, + "grad_norm": 0.8593233700589394, + "learning_rate": 1.538908277260583e-05, + "loss": 0.5927, + "step": 4175 + }, + { + "epoch": 0.34, + "grad_norm": 0.9321633186484158, + "learning_rate": 1.538686508466897e-05, + "loss": 0.6076, + "step": 4176 + }, + { + "epoch": 0.34, + "grad_norm": 0.9736653930607452, + "learning_rate": 1.5384647023427136e-05, + "loss": 0.6351, + "step": 4177 + }, + { + "epoch": 0.34, + "grad_norm": 1.0073840250099277, + "learning_rate": 1.538242858903404e-05, + "loss": 0.5281, + "step": 4178 + }, + { + "epoch": 0.34, + "grad_norm": 0.9670824565967566, + "learning_rate": 1.538020978164341e-05, + "loss": 0.5519, + "step": 4179 + }, + { + "epoch": 0.34, + "grad_norm": 0.9003903583858006, + "learning_rate": 1.5377990601409022e-05, + "loss": 0.5621, + "step": 4180 + }, + { + "epoch": 0.34, + "grad_norm": 1.07375811358448, + "learning_rate": 1.5375771048484657e-05, + "loss": 0.5801, + "step": 4181 + }, + { + "epoch": 0.34, + "grad_norm": 0.9829663735472544, + "learning_rate": 1.5373551123024123e-05, + "loss": 0.6094, + "step": 4182 + }, + { + "epoch": 0.34, + "grad_norm": 0.8686452093382149, + "learning_rate": 1.537133082518126e-05, + "loss": 0.5506, + "step": 4183 + }, + { + "epoch": 0.34, + "grad_norm": 0.9804770241218727, + "learning_rate": 1.536911015510994e-05, + "loss": 0.5503, + "step": 4184 + }, + { + "epoch": 0.34, + "grad_norm": 0.8689620504507469, + "learning_rate": 1.5366889112964044e-05, + "loss": 0.5606, + "step": 4185 + }, + { + "epoch": 0.34, + "grad_norm": 0.880493881682588, + "learning_rate": 1.5364667698897498e-05, + "loss": 0.5309, + "step": 4186 + }, + { + "epoch": 0.34, + "grad_norm": 0.9549420150814182, + "learning_rate": 1.5362445913064238e-05, + "loss": 0.5554, + "step": 4187 + }, + { + "epoch": 0.34, + "grad_norm": 0.9255714148261335, + "learning_rate": 1.536022375561823e-05, + "loss": 0.5805, + "step": 4188 + }, + { + "epoch": 0.34, + "grad_norm": 0.9160529514074018, + "learning_rate": 1.535800122671347e-05, + "loss": 0.5463, + "step": 4189 + }, + { + "epoch": 0.34, + "grad_norm": 0.9220363895496618, + "learning_rate": 1.535577832650398e-05, + "loss": 0.4973, + "step": 4190 + }, + { + "epoch": 0.34, + "grad_norm": 0.8529340143381954, + "learning_rate": 1.53535550551438e-05, + "loss": 0.5442, + "step": 4191 + }, + { + "epoch": 0.34, + "grad_norm": 0.9997574057268795, + "learning_rate": 1.5351331412787004e-05, + "loss": 0.6065, + "step": 4192 + }, + { + "epoch": 0.34, + "grad_norm": 0.9291109892458387, + "learning_rate": 1.534910739958769e-05, + "loss": 0.5558, + "step": 4193 + }, + { + "epoch": 0.34, + "grad_norm": 0.9131668497103859, + "learning_rate": 1.5346883015699976e-05, + "loss": 0.5957, + "step": 4194 + }, + { + "epoch": 0.34, + "grad_norm": 0.9013737322445415, + "learning_rate": 1.5344658261278013e-05, + "loss": 0.6395, + "step": 4195 + }, + { + "epoch": 0.34, + "grad_norm": 0.8451928896140083, + "learning_rate": 1.5342433136475972e-05, + "loss": 0.5138, + "step": 4196 + }, + { + "epoch": 0.34, + "grad_norm": 0.9204352581964403, + "learning_rate": 1.5340207641448054e-05, + "loss": 0.5535, + "step": 4197 + }, + { + "epoch": 0.34, + "grad_norm": 0.9285526995779052, + "learning_rate": 1.5337981776348484e-05, + "loss": 0.52, + "step": 4198 + }, + { + "epoch": 0.34, + "grad_norm": 0.96092597621691, + "learning_rate": 1.533575554133151e-05, + "loss": 0.5847, + "step": 4199 + }, + { + "epoch": 0.34, + "grad_norm": 0.8841256267260578, + "learning_rate": 1.533352893655141e-05, + "loss": 0.565, + "step": 4200 + }, + { + "epoch": 0.34, + "grad_norm": 0.9820667038604786, + "learning_rate": 1.5331301962162485e-05, + "loss": 0.6118, + "step": 4201 + }, + { + "epoch": 0.34, + "grad_norm": 0.90455636810397, + "learning_rate": 1.5329074618319063e-05, + "loss": 0.5453, + "step": 4202 + }, + { + "epoch": 0.34, + "grad_norm": 1.0119288125177617, + "learning_rate": 1.5326846905175497e-05, + "loss": 0.5862, + "step": 4203 + }, + { + "epoch": 0.34, + "grad_norm": 0.9461879744824403, + "learning_rate": 1.5324618822886167e-05, + "loss": 0.6043, + "step": 4204 + }, + { + "epoch": 0.34, + "grad_norm": 0.939134668597495, + "learning_rate": 1.5322390371605473e-05, + "loss": 0.577, + "step": 4205 + }, + { + "epoch": 0.34, + "grad_norm": 0.9843968714772094, + "learning_rate": 1.532016155148785e-05, + "loss": 0.5525, + "step": 4206 + }, + { + "epoch": 0.34, + "grad_norm": 0.9416378125162804, + "learning_rate": 1.531793236268775e-05, + "loss": 0.561, + "step": 4207 + }, + { + "epoch": 0.34, + "grad_norm": 0.9043162067384275, + "learning_rate": 1.531570280535965e-05, + "loss": 0.4951, + "step": 4208 + }, + { + "epoch": 0.34, + "grad_norm": 0.953135119133098, + "learning_rate": 1.5313472879658066e-05, + "loss": 0.5997, + "step": 4209 + }, + { + "epoch": 0.34, + "grad_norm": 0.9904782885873062, + "learning_rate": 1.531124258573752e-05, + "loss": 0.5398, + "step": 4210 + }, + { + "epoch": 0.34, + "grad_norm": 0.9141887774274177, + "learning_rate": 1.530901192375258e-05, + "loss": 0.5488, + "step": 4211 + }, + { + "epoch": 0.34, + "grad_norm": 0.9307386669857576, + "learning_rate": 1.530678089385782e-05, + "loss": 0.5841, + "step": 4212 + }, + { + "epoch": 0.34, + "grad_norm": 0.9567495658547867, + "learning_rate": 1.5304549496207848e-05, + "loss": 0.5479, + "step": 4213 + }, + { + "epoch": 0.34, + "grad_norm": 0.9676728012807723, + "learning_rate": 1.5302317730957305e-05, + "loss": 0.5915, + "step": 4214 + }, + { + "epoch": 0.34, + "grad_norm": 0.9185599117650312, + "learning_rate": 1.5300085598260843e-05, + "loss": 0.5809, + "step": 4215 + }, + { + "epoch": 0.34, + "grad_norm": 0.8203698746978695, + "learning_rate": 1.5297853098273148e-05, + "loss": 0.5176, + "step": 4216 + }, + { + "epoch": 0.34, + "grad_norm": 0.9929516274827571, + "learning_rate": 1.529562023114894e-05, + "loss": 0.5683, + "step": 4217 + }, + { + "epoch": 0.34, + "grad_norm": 0.9217222912479962, + "learning_rate": 1.5293386997042943e-05, + "loss": 0.5301, + "step": 4218 + }, + { + "epoch": 0.34, + "grad_norm": 0.8681487591067317, + "learning_rate": 1.5291153396109925e-05, + "loss": 0.5411, + "step": 4219 + }, + { + "epoch": 0.34, + "grad_norm": 0.9882312511442278, + "learning_rate": 1.5288919428504668e-05, + "loss": 0.6358, + "step": 4220 + }, + { + "epoch": 0.34, + "grad_norm": 0.9066244268708632, + "learning_rate": 1.5286685094381984e-05, + "loss": 0.5119, + "step": 4221 + }, + { + "epoch": 0.34, + "grad_norm": 0.9038436185292014, + "learning_rate": 1.5284450393896713e-05, + "loss": 0.5951, + "step": 4222 + }, + { + "epoch": 0.34, + "grad_norm": 0.9642323326452018, + "learning_rate": 1.528221532720372e-05, + "loss": 0.546, + "step": 4223 + }, + { + "epoch": 0.34, + "grad_norm": 0.9956160521476128, + "learning_rate": 1.5279979894457887e-05, + "loss": 0.6047, + "step": 4224 + }, + { + "epoch": 0.34, + "grad_norm": 0.8354483533906525, + "learning_rate": 1.5277744095814132e-05, + "loss": 0.5119, + "step": 4225 + }, + { + "epoch": 0.34, + "grad_norm": 0.9604418200300491, + "learning_rate": 1.5275507931427392e-05, + "loss": 0.5881, + "step": 4226 + }, + { + "epoch": 0.34, + "grad_norm": 0.9005288726215351, + "learning_rate": 1.5273271401452633e-05, + "loss": 0.5507, + "step": 4227 + }, + { + "epoch": 0.34, + "grad_norm": 0.9390955908421621, + "learning_rate": 1.5271034506044838e-05, + "loss": 0.5362, + "step": 4228 + }, + { + "epoch": 0.34, + "grad_norm": 1.004379660532751, + "learning_rate": 1.5268797245359035e-05, + "loss": 0.5816, + "step": 4229 + }, + { + "epoch": 0.34, + "grad_norm": 0.988809361388353, + "learning_rate": 1.5266559619550254e-05, + "loss": 0.5677, + "step": 4230 + }, + { + "epoch": 0.34, + "grad_norm": 0.8965461566573829, + "learning_rate": 1.526432162877356e-05, + "loss": 0.5972, + "step": 4231 + }, + { + "epoch": 0.34, + "grad_norm": 0.8899439036923005, + "learning_rate": 1.526208327318405e-05, + "loss": 0.5549, + "step": 4232 + }, + { + "epoch": 0.34, + "grad_norm": 0.8831555105843113, + "learning_rate": 1.5259844552936833e-05, + "loss": 0.5199, + "step": 4233 + }, + { + "epoch": 0.34, + "grad_norm": 0.9053002337578933, + "learning_rate": 1.5257605468187056e-05, + "loss": 0.5575, + "step": 4234 + }, + { + "epoch": 0.34, + "grad_norm": 1.076438001507276, + "learning_rate": 1.5255366019089883e-05, + "loss": 0.4907, + "step": 4235 + }, + { + "epoch": 0.34, + "grad_norm": 0.9969810516684846, + "learning_rate": 1.5253126205800506e-05, + "loss": 0.6066, + "step": 4236 + }, + { + "epoch": 0.34, + "grad_norm": 0.9872274573817507, + "learning_rate": 1.525088602847414e-05, + "loss": 0.5654, + "step": 4237 + }, + { + "epoch": 0.34, + "grad_norm": 0.8572405337777793, + "learning_rate": 1.5248645487266036e-05, + "loss": 0.5443, + "step": 4238 + }, + { + "epoch": 0.34, + "grad_norm": 0.8459905204744106, + "learning_rate": 1.5246404582331451e-05, + "loss": 0.5223, + "step": 4239 + }, + { + "epoch": 0.34, + "grad_norm": 0.8089099757270066, + "learning_rate": 1.5244163313825684e-05, + "loss": 0.5629, + "step": 4240 + }, + { + "epoch": 0.34, + "grad_norm": 0.8868275652220692, + "learning_rate": 1.524192168190405e-05, + "loss": 0.4984, + "step": 4241 + }, + { + "epoch": 0.34, + "grad_norm": 0.9299705286995178, + "learning_rate": 1.5239679686721892e-05, + "loss": 0.5544, + "step": 4242 + }, + { + "epoch": 0.34, + "grad_norm": 1.032961198809958, + "learning_rate": 1.5237437328434581e-05, + "loss": 0.6112, + "step": 4243 + }, + { + "epoch": 0.34, + "grad_norm": 0.8999846365877913, + "learning_rate": 1.5235194607197508e-05, + "loss": 0.545, + "step": 4244 + }, + { + "epoch": 0.35, + "grad_norm": 0.9422703636135591, + "learning_rate": 1.5232951523166092e-05, + "loss": 0.5203, + "step": 4245 + }, + { + "epoch": 0.35, + "grad_norm": 0.8820713677107922, + "learning_rate": 1.5230708076495777e-05, + "loss": 0.5232, + "step": 4246 + }, + { + "epoch": 0.35, + "grad_norm": 0.9241143950002292, + "learning_rate": 1.5228464267342036e-05, + "loss": 0.5773, + "step": 4247 + }, + { + "epoch": 0.35, + "grad_norm": 0.9115901855811518, + "learning_rate": 1.5226220095860353e-05, + "loss": 0.5443, + "step": 4248 + }, + { + "epoch": 0.35, + "grad_norm": 0.9692553383520067, + "learning_rate": 1.5223975562206255e-05, + "loss": 0.5583, + "step": 4249 + }, + { + "epoch": 0.35, + "grad_norm": 1.0601455956061403, + "learning_rate": 1.5221730666535285e-05, + "loss": 0.6298, + "step": 4250 + }, + { + "epoch": 0.35, + "grad_norm": 0.7742548362033347, + "learning_rate": 1.5219485409003013e-05, + "loss": 0.5086, + "step": 4251 + }, + { + "epoch": 0.35, + "grad_norm": 1.0254340546298504, + "learning_rate": 1.5217239789765028e-05, + "loss": 0.5832, + "step": 4252 + }, + { + "epoch": 0.35, + "grad_norm": 0.8661718359978385, + "learning_rate": 1.5214993808976956e-05, + "loss": 0.5454, + "step": 4253 + }, + { + "epoch": 0.35, + "grad_norm": 0.9429372489488378, + "learning_rate": 1.5212747466794437e-05, + "loss": 0.5834, + "step": 4254 + }, + { + "epoch": 0.35, + "grad_norm": 0.9280200691809544, + "learning_rate": 1.5210500763373142e-05, + "loss": 0.5302, + "step": 4255 + }, + { + "epoch": 0.35, + "grad_norm": 0.9484388180375084, + "learning_rate": 1.5208253698868766e-05, + "loss": 0.6313, + "step": 4256 + }, + { + "epoch": 0.35, + "grad_norm": 0.9170638988041517, + "learning_rate": 1.5206006273437031e-05, + "loss": 0.6159, + "step": 4257 + }, + { + "epoch": 0.35, + "grad_norm": 1.0063274313357442, + "learning_rate": 1.5203758487233677e-05, + "loss": 0.5816, + "step": 4258 + }, + { + "epoch": 0.35, + "grad_norm": 0.8113862998367966, + "learning_rate": 1.5201510340414473e-05, + "loss": 0.5125, + "step": 4259 + }, + { + "epoch": 0.35, + "grad_norm": 0.958709181326403, + "learning_rate": 1.5199261833135222e-05, + "loss": 0.5564, + "step": 4260 + }, + { + "epoch": 0.35, + "grad_norm": 0.8211454039458304, + "learning_rate": 1.5197012965551733e-05, + "loss": 0.5896, + "step": 4261 + }, + { + "epoch": 0.35, + "grad_norm": 0.8304508772583704, + "learning_rate": 1.5194763737819856e-05, + "loss": 0.5553, + "step": 4262 + }, + { + "epoch": 0.35, + "grad_norm": 0.8899457437890497, + "learning_rate": 1.519251415009546e-05, + "loss": 0.5524, + "step": 4263 + }, + { + "epoch": 0.35, + "grad_norm": 0.9641008384200902, + "learning_rate": 1.5190264202534442e-05, + "loss": 0.5872, + "step": 4264 + }, + { + "epoch": 0.35, + "grad_norm": 1.0481063603542773, + "learning_rate": 1.5188013895292715e-05, + "loss": 0.6124, + "step": 4265 + }, + { + "epoch": 0.35, + "grad_norm": 1.0437724166588191, + "learning_rate": 1.5185763228526226e-05, + "loss": 0.6339, + "step": 4266 + }, + { + "epoch": 0.35, + "grad_norm": 0.8858211683890729, + "learning_rate": 1.5183512202390951e-05, + "loss": 0.5898, + "step": 4267 + }, + { + "epoch": 0.35, + "grad_norm": 0.8905028036724305, + "learning_rate": 1.518126081704287e-05, + "loss": 0.5172, + "step": 4268 + }, + { + "epoch": 0.35, + "grad_norm": 0.8816742044266888, + "learning_rate": 1.5179009072638016e-05, + "loss": 0.5368, + "step": 4269 + }, + { + "epoch": 0.35, + "grad_norm": 0.8944448914022917, + "learning_rate": 1.5176756969332428e-05, + "loss": 0.554, + "step": 4270 + }, + { + "epoch": 0.35, + "grad_norm": 0.8512773616826556, + "learning_rate": 1.5174504507282168e-05, + "loss": 0.485, + "step": 4271 + }, + { + "epoch": 0.35, + "grad_norm": 0.9369207232240409, + "learning_rate": 1.517225168664334e-05, + "loss": 0.5814, + "step": 4272 + }, + { + "epoch": 0.35, + "grad_norm": 0.8926504243414912, + "learning_rate": 1.5169998507572057e-05, + "loss": 0.5155, + "step": 4273 + }, + { + "epoch": 0.35, + "grad_norm": 0.9501844007981542, + "learning_rate": 1.5167744970224463e-05, + "loss": 0.5562, + "step": 4274 + }, + { + "epoch": 0.35, + "grad_norm": 0.8999480245495015, + "learning_rate": 1.5165491074756723e-05, + "loss": 0.5411, + "step": 4275 + }, + { + "epoch": 0.35, + "grad_norm": 0.8527653264746171, + "learning_rate": 1.5163236821325037e-05, + "loss": 0.5883, + "step": 4276 + }, + { + "epoch": 0.35, + "grad_norm": 0.8691186223661405, + "learning_rate": 1.5160982210085621e-05, + "loss": 0.5269, + "step": 4277 + }, + { + "epoch": 0.35, + "grad_norm": 0.933540473664518, + "learning_rate": 1.515872724119471e-05, + "loss": 0.5769, + "step": 4278 + }, + { + "epoch": 0.35, + "grad_norm": 0.9157217785033557, + "learning_rate": 1.5156471914808582e-05, + "loss": 0.542, + "step": 4279 + }, + { + "epoch": 0.35, + "grad_norm": 0.9699683221638261, + "learning_rate": 1.5154216231083522e-05, + "loss": 0.5454, + "step": 4280 + }, + { + "epoch": 0.35, + "grad_norm": 0.8620056534267265, + "learning_rate": 1.515196019017585e-05, + "loss": 0.4944, + "step": 4281 + }, + { + "epoch": 0.35, + "grad_norm": 0.8463023180790658, + "learning_rate": 1.5149703792241903e-05, + "loss": 0.509, + "step": 4282 + }, + { + "epoch": 0.35, + "grad_norm": 0.7887007205212211, + "learning_rate": 1.5147447037438055e-05, + "loss": 0.4653, + "step": 4283 + }, + { + "epoch": 0.35, + "grad_norm": 0.896276367645115, + "learning_rate": 1.5145189925920694e-05, + "loss": 0.5098, + "step": 4284 + }, + { + "epoch": 0.35, + "grad_norm": 0.9270094024131633, + "learning_rate": 1.514293245784623e-05, + "loss": 0.5909, + "step": 4285 + }, + { + "epoch": 0.35, + "grad_norm": 0.9105702606018491, + "learning_rate": 1.514067463337111e-05, + "loss": 0.5257, + "step": 4286 + }, + { + "epoch": 0.35, + "grad_norm": 0.9046292918014648, + "learning_rate": 1.5138416452651803e-05, + "loss": 0.5459, + "step": 4287 + }, + { + "epoch": 0.35, + "grad_norm": 0.8185620723801201, + "learning_rate": 1.5136157915844787e-05, + "loss": 0.4844, + "step": 4288 + }, + { + "epoch": 0.35, + "grad_norm": 0.9212215102695174, + "learning_rate": 1.5133899023106584e-05, + "loss": 0.551, + "step": 4289 + }, + { + "epoch": 0.35, + "grad_norm": 0.8689911637720217, + "learning_rate": 1.5131639774593737e-05, + "loss": 0.535, + "step": 4290 + }, + { + "epoch": 0.35, + "grad_norm": 0.9015688037502202, + "learning_rate": 1.5129380170462802e-05, + "loss": 0.5815, + "step": 4291 + }, + { + "epoch": 0.35, + "grad_norm": 0.8854452628551268, + "learning_rate": 1.512712021087037e-05, + "loss": 0.5552, + "step": 4292 + }, + { + "epoch": 0.35, + "grad_norm": 0.8375236940962522, + "learning_rate": 1.5124859895973058e-05, + "loss": 0.5811, + "step": 4293 + }, + { + "epoch": 0.35, + "grad_norm": 1.0396346487059804, + "learning_rate": 1.51225992259275e-05, + "loss": 0.548, + "step": 4294 + }, + { + "epoch": 0.35, + "grad_norm": 0.9574274447907284, + "learning_rate": 1.5120338200890356e-05, + "loss": 0.5502, + "step": 4295 + }, + { + "epoch": 0.35, + "grad_norm": 0.86965100414926, + "learning_rate": 1.5118076821018322e-05, + "loss": 0.5442, + "step": 4296 + }, + { + "epoch": 0.35, + "grad_norm": 0.8883625419684474, + "learning_rate": 1.5115815086468103e-05, + "loss": 0.5568, + "step": 4297 + }, + { + "epoch": 0.35, + "grad_norm": 0.8950422831418536, + "learning_rate": 1.511355299739643e-05, + "loss": 0.5023, + "step": 4298 + }, + { + "epoch": 0.35, + "grad_norm": 1.0147768322049326, + "learning_rate": 1.511129055396008e-05, + "loss": 0.5961, + "step": 4299 + }, + { + "epoch": 0.35, + "grad_norm": 0.905224107471028, + "learning_rate": 1.510902775631582e-05, + "loss": 0.542, + "step": 4300 + }, + { + "epoch": 0.35, + "grad_norm": 0.9008359298426322, + "learning_rate": 1.510676460462047e-05, + "loss": 0.5406, + "step": 4301 + }, + { + "epoch": 0.35, + "grad_norm": 1.0409019164334707, + "learning_rate": 1.5104501099030864e-05, + "loss": 0.6519, + "step": 4302 + }, + { + "epoch": 0.35, + "grad_norm": 0.92608839006507, + "learning_rate": 1.5102237239703858e-05, + "loss": 0.4852, + "step": 4303 + }, + { + "epoch": 0.35, + "grad_norm": 0.8558223846325838, + "learning_rate": 1.509997302679634e-05, + "loss": 0.545, + "step": 4304 + }, + { + "epoch": 0.35, + "grad_norm": 0.9071792434994643, + "learning_rate": 1.5097708460465214e-05, + "loss": 0.5647, + "step": 4305 + }, + { + "epoch": 0.35, + "grad_norm": 0.8243596727202004, + "learning_rate": 1.5095443540867412e-05, + "loss": 0.4917, + "step": 4306 + }, + { + "epoch": 0.35, + "grad_norm": 0.8528546266985911, + "learning_rate": 1.5093178268159892e-05, + "loss": 0.5034, + "step": 4307 + }, + { + "epoch": 0.35, + "grad_norm": 0.8606279197401389, + "learning_rate": 1.5090912642499635e-05, + "loss": 0.5733, + "step": 4308 + }, + { + "epoch": 0.35, + "grad_norm": 0.9107650353024779, + "learning_rate": 1.5088646664043652e-05, + "loss": 0.5562, + "step": 4309 + }, + { + "epoch": 0.35, + "grad_norm": 0.8475845684717342, + "learning_rate": 1.5086380332948962e-05, + "loss": 0.5049, + "step": 4310 + }, + { + "epoch": 0.35, + "grad_norm": 0.900875716155614, + "learning_rate": 1.5084113649372634e-05, + "loss": 0.5144, + "step": 4311 + }, + { + "epoch": 0.35, + "grad_norm": 0.9098977051510746, + "learning_rate": 1.5081846613471736e-05, + "loss": 0.5301, + "step": 4312 + }, + { + "epoch": 0.35, + "grad_norm": 0.9173494547586318, + "learning_rate": 1.5079579225403373e-05, + "loss": 0.5734, + "step": 4313 + }, + { + "epoch": 0.35, + "grad_norm": 0.8997686442804093, + "learning_rate": 1.507731148532468e-05, + "loss": 0.6031, + "step": 4314 + }, + { + "epoch": 0.35, + "grad_norm": 1.0035543282799402, + "learning_rate": 1.5075043393392799e-05, + "loss": 0.594, + "step": 4315 + }, + { + "epoch": 0.35, + "grad_norm": 0.9416183936432486, + "learning_rate": 1.5072774949764916e-05, + "loss": 0.563, + "step": 4316 + }, + { + "epoch": 0.35, + "grad_norm": 1.111424103703773, + "learning_rate": 1.5070506154598228e-05, + "loss": 0.6619, + "step": 4317 + }, + { + "epoch": 0.35, + "grad_norm": 0.9708532135401428, + "learning_rate": 1.5068237008049963e-05, + "loss": 0.5821, + "step": 4318 + }, + { + "epoch": 0.35, + "grad_norm": 0.914665472381504, + "learning_rate": 1.5065967510277366e-05, + "loss": 0.5453, + "step": 4319 + }, + { + "epoch": 0.35, + "grad_norm": 0.8798308983590678, + "learning_rate": 1.5063697661437713e-05, + "loss": 0.5403, + "step": 4320 + }, + { + "epoch": 0.35, + "grad_norm": 0.8131661194438965, + "learning_rate": 1.5061427461688306e-05, + "loss": 0.4882, + "step": 4321 + }, + { + "epoch": 0.35, + "grad_norm": 0.9820694718040851, + "learning_rate": 1.5059156911186465e-05, + "loss": 0.5487, + "step": 4322 + }, + { + "epoch": 0.35, + "grad_norm": 1.0046413759018036, + "learning_rate": 1.5056886010089536e-05, + "loss": 0.6133, + "step": 4323 + }, + { + "epoch": 0.35, + "grad_norm": 0.8878118116815175, + "learning_rate": 1.5054614758554896e-05, + "loss": 0.653, + "step": 4324 + }, + { + "epoch": 0.35, + "grad_norm": 1.2206966797071999, + "learning_rate": 1.5052343156739933e-05, + "loss": 0.6036, + "step": 4325 + }, + { + "epoch": 0.35, + "grad_norm": 0.9180573056939928, + "learning_rate": 1.5050071204802073e-05, + "loss": 0.5868, + "step": 4326 + }, + { + "epoch": 0.35, + "grad_norm": 1.0217753319934397, + "learning_rate": 1.5047798902898756e-05, + "loss": 0.622, + "step": 4327 + }, + { + "epoch": 0.35, + "grad_norm": 0.8974867336494516, + "learning_rate": 1.5045526251187452e-05, + "loss": 0.5798, + "step": 4328 + }, + { + "epoch": 0.35, + "grad_norm": 0.9152499171753111, + "learning_rate": 1.5043253249825656e-05, + "loss": 0.6113, + "step": 4329 + }, + { + "epoch": 0.35, + "grad_norm": 0.9093691066885794, + "learning_rate": 1.5040979898970883e-05, + "loss": 0.588, + "step": 4330 + }, + { + "epoch": 0.35, + "grad_norm": 1.1133396282471304, + "learning_rate": 1.5038706198780673e-05, + "loss": 0.5973, + "step": 4331 + }, + { + "epoch": 0.35, + "grad_norm": 0.9198753057184388, + "learning_rate": 1.5036432149412592e-05, + "loss": 0.5804, + "step": 4332 + }, + { + "epoch": 0.35, + "grad_norm": 1.0969343843678707, + "learning_rate": 1.5034157751024232e-05, + "loss": 0.6763, + "step": 4333 + }, + { + "epoch": 0.35, + "grad_norm": 0.9186493768039867, + "learning_rate": 1.5031883003773206e-05, + "loss": 0.5646, + "step": 4334 + }, + { + "epoch": 0.35, + "grad_norm": 0.9582047355056563, + "learning_rate": 1.502960790781715e-05, + "loss": 0.5473, + "step": 4335 + }, + { + "epoch": 0.35, + "grad_norm": 0.8762162500326096, + "learning_rate": 1.5027332463313729e-05, + "loss": 0.622, + "step": 4336 + }, + { + "epoch": 0.35, + "grad_norm": 0.8232637771036523, + "learning_rate": 1.5025056670420624e-05, + "loss": 0.5395, + "step": 4337 + }, + { + "epoch": 0.35, + "grad_norm": 0.90982048874615, + "learning_rate": 1.502278052929555e-05, + "loss": 0.5762, + "step": 4338 + }, + { + "epoch": 0.35, + "grad_norm": 0.9655525218679801, + "learning_rate": 1.5020504040096241e-05, + "loss": 0.5324, + "step": 4339 + }, + { + "epoch": 0.35, + "grad_norm": 0.7943921727751669, + "learning_rate": 1.5018227202980455e-05, + "loss": 0.4902, + "step": 4340 + }, + { + "epoch": 0.35, + "grad_norm": 1.0548661431756876, + "learning_rate": 1.5015950018105976e-05, + "loss": 0.4857, + "step": 4341 + }, + { + "epoch": 0.35, + "grad_norm": 0.9307571470045382, + "learning_rate": 1.5013672485630611e-05, + "loss": 0.6182, + "step": 4342 + }, + { + "epoch": 0.35, + "grad_norm": 0.929896560472335, + "learning_rate": 1.5011394605712188e-05, + "loss": 0.4729, + "step": 4343 + }, + { + "epoch": 0.35, + "grad_norm": 0.8867515976963221, + "learning_rate": 1.5009116378508564e-05, + "loss": 0.5139, + "step": 4344 + }, + { + "epoch": 0.35, + "grad_norm": 1.02226268665367, + "learning_rate": 1.5006837804177618e-05, + "loss": 0.556, + "step": 4345 + }, + { + "epoch": 0.35, + "grad_norm": 0.9191762225421793, + "learning_rate": 1.5004558882877254e-05, + "loss": 0.6007, + "step": 4346 + }, + { + "epoch": 0.35, + "grad_norm": 0.9514399880147593, + "learning_rate": 1.5002279614765396e-05, + "loss": 0.5309, + "step": 4347 + }, + { + "epoch": 0.35, + "grad_norm": 0.9494607283066798, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.4908, + "step": 4348 + }, + { + "epoch": 0.35, + "grad_norm": 1.1928182550584672, + "learning_rate": 1.499772003873904e-05, + "loss": 0.5622, + "step": 4349 + }, + { + "epoch": 0.35, + "grad_norm": 0.9683394095413308, + "learning_rate": 1.4995439731140512e-05, + "loss": 0.548, + "step": 4350 + }, + { + "epoch": 0.35, + "grad_norm": 1.0536520054444802, + "learning_rate": 1.4993159077362445e-05, + "loss": 0.6328, + "step": 4351 + }, + { + "epoch": 0.35, + "grad_norm": 0.9355912661637906, + "learning_rate": 1.499087807756288e-05, + "loss": 0.5965, + "step": 4352 + }, + { + "epoch": 0.35, + "grad_norm": 0.9159385022666104, + "learning_rate": 1.4988596731899889e-05, + "loss": 0.6053, + "step": 4353 + }, + { + "epoch": 0.35, + "grad_norm": 0.9298009190003068, + "learning_rate": 1.4986315040531574e-05, + "loss": 0.5606, + "step": 4354 + }, + { + "epoch": 0.35, + "grad_norm": 0.9000952798543325, + "learning_rate": 1.4984033003616047e-05, + "loss": 0.603, + "step": 4355 + }, + { + "epoch": 0.35, + "grad_norm": 0.98380076433322, + "learning_rate": 1.4981750621311453e-05, + "loss": 0.5949, + "step": 4356 + }, + { + "epoch": 0.35, + "grad_norm": 0.9435040964843473, + "learning_rate": 1.4979467893775963e-05, + "loss": 0.534, + "step": 4357 + }, + { + "epoch": 0.35, + "grad_norm": 0.9903874881684613, + "learning_rate": 1.4977184821167764e-05, + "loss": 0.545, + "step": 4358 + }, + { + "epoch": 0.35, + "grad_norm": 0.9306842224405125, + "learning_rate": 1.4974901403645068e-05, + "loss": 0.4705, + "step": 4359 + }, + { + "epoch": 0.35, + "grad_norm": 0.879005834633366, + "learning_rate": 1.497261764136612e-05, + "loss": 0.5459, + "step": 4360 + }, + { + "epoch": 0.35, + "grad_norm": 0.8701678877283088, + "learning_rate": 1.4970333534489179e-05, + "loss": 0.6014, + "step": 4361 + }, + { + "epoch": 0.35, + "grad_norm": 0.7909788159248554, + "learning_rate": 1.4968049083172534e-05, + "loss": 0.5239, + "step": 4362 + }, + { + "epoch": 0.35, + "grad_norm": 0.8666354990733092, + "learning_rate": 1.4965764287574494e-05, + "loss": 0.5672, + "step": 4363 + }, + { + "epoch": 0.35, + "grad_norm": 0.9548946836991843, + "learning_rate": 1.4963479147853393e-05, + "loss": 0.5818, + "step": 4364 + }, + { + "epoch": 0.35, + "grad_norm": 1.0469311919480322, + "learning_rate": 1.496119366416759e-05, + "loss": 0.6307, + "step": 4365 + }, + { + "epoch": 0.35, + "grad_norm": 0.9131536614559036, + "learning_rate": 1.4958907836675467e-05, + "loss": 0.5407, + "step": 4366 + }, + { + "epoch": 0.35, + "grad_norm": 0.9119903308980676, + "learning_rate": 1.495662166553543e-05, + "loss": 0.5048, + "step": 4367 + }, + { + "epoch": 0.36, + "grad_norm": 1.139918394904237, + "learning_rate": 1.4954335150905905e-05, + "loss": 0.5712, + "step": 4368 + }, + { + "epoch": 0.36, + "grad_norm": 0.982964987239106, + "learning_rate": 1.4952048292945352e-05, + "loss": 0.5777, + "step": 4369 + }, + { + "epoch": 0.36, + "grad_norm": 0.9613085225608329, + "learning_rate": 1.4949761091812243e-05, + "loss": 0.5306, + "step": 4370 + }, + { + "epoch": 0.36, + "grad_norm": 1.0194065477967498, + "learning_rate": 1.4947473547665081e-05, + "loss": 0.6194, + "step": 4371 + }, + { + "epoch": 0.36, + "grad_norm": 0.8825081634776647, + "learning_rate": 1.4945185660662391e-05, + "loss": 0.5097, + "step": 4372 + }, + { + "epoch": 0.36, + "grad_norm": 0.9650839534634333, + "learning_rate": 1.4942897430962722e-05, + "loss": 0.4978, + "step": 4373 + }, + { + "epoch": 0.36, + "grad_norm": 1.0318679026761992, + "learning_rate": 1.494060885872464e-05, + "loss": 0.5238, + "step": 4374 + }, + { + "epoch": 0.36, + "grad_norm": 0.8864586334831703, + "learning_rate": 1.4938319944106754e-05, + "loss": 0.5226, + "step": 4375 + }, + { + "epoch": 0.36, + "grad_norm": 1.0151435374926832, + "learning_rate": 1.4936030687267672e-05, + "loss": 0.5641, + "step": 4376 + }, + { + "epoch": 0.36, + "grad_norm": 0.9680320885205316, + "learning_rate": 1.4933741088366043e-05, + "loss": 0.6272, + "step": 4377 + }, + { + "epoch": 0.36, + "grad_norm": 1.0063656243311683, + "learning_rate": 1.4931451147560534e-05, + "loss": 0.5885, + "step": 4378 + }, + { + "epoch": 0.36, + "grad_norm": 0.8969813806528744, + "learning_rate": 1.492916086500983e-05, + "loss": 0.5366, + "step": 4379 + }, + { + "epoch": 0.36, + "grad_norm": 0.89570113690597, + "learning_rate": 1.4926870240872652e-05, + "loss": 0.5287, + "step": 4380 + }, + { + "epoch": 0.36, + "grad_norm": 1.6806229089415208, + "learning_rate": 1.4924579275307738e-05, + "loss": 0.5136, + "step": 4381 + }, + { + "epoch": 0.36, + "grad_norm": 0.8772278715634582, + "learning_rate": 1.492228796847385e-05, + "loss": 0.5823, + "step": 4382 + }, + { + "epoch": 0.36, + "grad_norm": 1.0148013614453049, + "learning_rate": 1.4919996320529768e-05, + "loss": 0.6213, + "step": 4383 + }, + { + "epoch": 0.36, + "grad_norm": 0.9820137070849825, + "learning_rate": 1.4917704331634306e-05, + "loss": 0.5012, + "step": 4384 + }, + { + "epoch": 0.36, + "grad_norm": 0.8983543389621446, + "learning_rate": 1.49154120019463e-05, + "loss": 0.5577, + "step": 4385 + }, + { + "epoch": 0.36, + "grad_norm": 0.839493694424055, + "learning_rate": 1.4913119331624597e-05, + "loss": 0.5441, + "step": 4386 + }, + { + "epoch": 0.36, + "grad_norm": 0.9193981006433436, + "learning_rate": 1.4910826320828085e-05, + "loss": 0.5411, + "step": 4387 + }, + { + "epoch": 0.36, + "grad_norm": 1.0090728948785002, + "learning_rate": 1.4908532969715663e-05, + "loss": 0.5255, + "step": 4388 + }, + { + "epoch": 0.36, + "grad_norm": 0.9836631247147275, + "learning_rate": 1.490623927844626e-05, + "loss": 0.5497, + "step": 4389 + }, + { + "epoch": 0.36, + "grad_norm": 0.8860894415981879, + "learning_rate": 1.4903945247178828e-05, + "loss": 0.5318, + "step": 4390 + }, + { + "epoch": 0.36, + "grad_norm": 0.8670465037749358, + "learning_rate": 1.4901650876072342e-05, + "loss": 0.5369, + "step": 4391 + }, + { + "epoch": 0.36, + "grad_norm": 0.9953631170549834, + "learning_rate": 1.4899356165285794e-05, + "loss": 0.5545, + "step": 4392 + }, + { + "epoch": 0.36, + "grad_norm": 0.9063420930863816, + "learning_rate": 1.489706111497821e-05, + "loss": 0.5357, + "step": 4393 + }, + { + "epoch": 0.36, + "grad_norm": 0.881755479709398, + "learning_rate": 1.489476572530864e-05, + "loss": 0.5343, + "step": 4394 + }, + { + "epoch": 0.36, + "grad_norm": 0.8193129452548926, + "learning_rate": 1.489246999643614e-05, + "loss": 0.4722, + "step": 4395 + }, + { + "epoch": 0.36, + "grad_norm": 0.8783440703065661, + "learning_rate": 1.489017392851981e-05, + "loss": 0.5433, + "step": 4396 + }, + { + "epoch": 0.36, + "grad_norm": 0.8547465151818406, + "learning_rate": 1.488787752171877e-05, + "loss": 0.5481, + "step": 4397 + }, + { + "epoch": 0.36, + "grad_norm": 0.9133780786690582, + "learning_rate": 1.4885580776192149e-05, + "loss": 0.5353, + "step": 4398 + }, + { + "epoch": 0.36, + "grad_norm": 0.8782420674843047, + "learning_rate": 1.4883283692099114e-05, + "loss": 0.5447, + "step": 4399 + }, + { + "epoch": 0.36, + "grad_norm": 0.9650825987663942, + "learning_rate": 1.488098626959885e-05, + "loss": 0.6096, + "step": 4400 + }, + { + "epoch": 0.36, + "grad_norm": 0.9457461408325257, + "learning_rate": 1.4878688508850567e-05, + "loss": 0.5685, + "step": 4401 + }, + { + "epoch": 0.36, + "grad_norm": 0.8396109468560583, + "learning_rate": 1.4876390410013498e-05, + "loss": 0.5542, + "step": 4402 + }, + { + "epoch": 0.36, + "grad_norm": 1.10276618772219, + "learning_rate": 1.48740919732469e-05, + "loss": 0.6523, + "step": 4403 + }, + { + "epoch": 0.36, + "grad_norm": 0.8469604534554713, + "learning_rate": 1.487179319871005e-05, + "loss": 0.5672, + "step": 4404 + }, + { + "epoch": 0.36, + "grad_norm": 0.9119790850830539, + "learning_rate": 1.4869494086562253e-05, + "loss": 0.6194, + "step": 4405 + }, + { + "epoch": 0.36, + "grad_norm": 0.887038406000676, + "learning_rate": 1.4867194636962836e-05, + "loss": 0.5182, + "step": 4406 + }, + { + "epoch": 0.36, + "grad_norm": 1.0079039365291105, + "learning_rate": 1.4864894850071147e-05, + "loss": 0.5612, + "step": 4407 + }, + { + "epoch": 0.36, + "grad_norm": 0.8390473208179717, + "learning_rate": 1.486259472604656e-05, + "loss": 0.5347, + "step": 4408 + }, + { + "epoch": 0.36, + "grad_norm": 0.9686207645890165, + "learning_rate": 1.4860294265048474e-05, + "loss": 0.5913, + "step": 4409 + }, + { + "epoch": 0.36, + "grad_norm": 1.0015965610679087, + "learning_rate": 1.4857993467236303e-05, + "loss": 0.6163, + "step": 4410 + }, + { + "epoch": 0.36, + "grad_norm": 0.9101404073900543, + "learning_rate": 1.4855692332769494e-05, + "loss": 0.537, + "step": 4411 + }, + { + "epoch": 0.36, + "grad_norm": 0.9229731168365843, + "learning_rate": 1.4853390861807518e-05, + "loss": 0.5235, + "step": 4412 + }, + { + "epoch": 0.36, + "grad_norm": 0.8095758622279768, + "learning_rate": 1.4851089054509852e-05, + "loss": 0.5163, + "step": 4413 + }, + { + "epoch": 0.36, + "grad_norm": 0.9142132578654567, + "learning_rate": 1.4848786911036023e-05, + "loss": 0.5656, + "step": 4414 + }, + { + "epoch": 0.36, + "grad_norm": 1.0076384074827356, + "learning_rate": 1.4846484431545562e-05, + "loss": 0.6286, + "step": 4415 + }, + { + "epoch": 0.36, + "grad_norm": 1.0666728554711093, + "learning_rate": 1.4844181616198028e-05, + "loss": 0.598, + "step": 4416 + }, + { + "epoch": 0.36, + "grad_norm": 0.9561431209856557, + "learning_rate": 1.4841878465153006e-05, + "loss": 0.6087, + "step": 4417 + }, + { + "epoch": 0.36, + "grad_norm": 0.8399907289741158, + "learning_rate": 1.4839574978570098e-05, + "loss": 0.5377, + "step": 4418 + }, + { + "epoch": 0.36, + "grad_norm": 0.9088906220770147, + "learning_rate": 1.4837271156608938e-05, + "loss": 0.5189, + "step": 4419 + }, + { + "epoch": 0.36, + "grad_norm": 0.907192125612477, + "learning_rate": 1.4834966999429179e-05, + "loss": 0.5094, + "step": 4420 + }, + { + "epoch": 0.36, + "grad_norm": 0.9549375328800763, + "learning_rate": 1.4832662507190493e-05, + "loss": 0.5861, + "step": 4421 + }, + { + "epoch": 0.36, + "grad_norm": 0.8195915273699955, + "learning_rate": 1.4830357680052586e-05, + "loss": 0.448, + "step": 4422 + }, + { + "epoch": 0.36, + "grad_norm": 0.8457199060127676, + "learning_rate": 1.4828052518175172e-05, + "loss": 0.5053, + "step": 4423 + }, + { + "epoch": 0.36, + "grad_norm": 0.9655078403174225, + "learning_rate": 1.4825747021718002e-05, + "loss": 0.5754, + "step": 4424 + }, + { + "epoch": 0.36, + "grad_norm": 0.8993455107136055, + "learning_rate": 1.4823441190840844e-05, + "loss": 0.5951, + "step": 4425 + }, + { + "epoch": 0.36, + "grad_norm": 0.9695646262062783, + "learning_rate": 1.4821135025703491e-05, + "loss": 0.5539, + "step": 4426 + }, + { + "epoch": 0.36, + "grad_norm": 0.9340908557152009, + "learning_rate": 1.4818828526465755e-05, + "loss": 0.5463, + "step": 4427 + }, + { + "epoch": 0.36, + "grad_norm": 0.9484184703205448, + "learning_rate": 1.4816521693287477e-05, + "loss": 0.5511, + "step": 4428 + }, + { + "epoch": 0.36, + "grad_norm": 0.8090461904484498, + "learning_rate": 1.481421452632852e-05, + "loss": 0.5391, + "step": 4429 + }, + { + "epoch": 0.36, + "grad_norm": 0.9068706093760909, + "learning_rate": 1.4811907025748764e-05, + "loss": 0.531, + "step": 4430 + }, + { + "epoch": 0.36, + "grad_norm": 0.8404838158009921, + "learning_rate": 1.480959919170812e-05, + "loss": 0.5492, + "step": 4431 + }, + { + "epoch": 0.36, + "grad_norm": 0.8289162524796344, + "learning_rate": 1.4807291024366519e-05, + "loss": 0.5513, + "step": 4432 + }, + { + "epoch": 0.36, + "grad_norm": 0.8394841803055377, + "learning_rate": 1.4804982523883915e-05, + "loss": 0.5078, + "step": 4433 + }, + { + "epoch": 0.36, + "grad_norm": 0.9250383568107076, + "learning_rate": 1.4802673690420281e-05, + "loss": 0.5885, + "step": 4434 + }, + { + "epoch": 0.36, + "grad_norm": 0.9495083386718011, + "learning_rate": 1.4800364524135622e-05, + "loss": 0.5778, + "step": 4435 + }, + { + "epoch": 0.36, + "grad_norm": 0.9536242637144737, + "learning_rate": 1.4798055025189962e-05, + "loss": 0.6067, + "step": 4436 + }, + { + "epoch": 0.36, + "grad_norm": 0.8677917692119125, + "learning_rate": 1.4795745193743341e-05, + "loss": 0.5587, + "step": 4437 + }, + { + "epoch": 0.36, + "grad_norm": 0.8866934244445475, + "learning_rate": 1.4793435029955832e-05, + "loss": 0.5103, + "step": 4438 + }, + { + "epoch": 0.36, + "grad_norm": 1.0042081691754168, + "learning_rate": 1.4791124533987529e-05, + "loss": 0.6197, + "step": 4439 + }, + { + "epoch": 0.36, + "grad_norm": 0.869082561205368, + "learning_rate": 1.4788813705998543e-05, + "loss": 0.6103, + "step": 4440 + }, + { + "epoch": 0.36, + "grad_norm": 0.9407422204488816, + "learning_rate": 1.4786502546149015e-05, + "loss": 0.5694, + "step": 4441 + }, + { + "epoch": 0.36, + "grad_norm": 0.9278704826539301, + "learning_rate": 1.4784191054599109e-05, + "loss": 0.5968, + "step": 4442 + }, + { + "epoch": 0.36, + "grad_norm": 0.9588874501973558, + "learning_rate": 1.4781879231509005e-05, + "loss": 0.5519, + "step": 4443 + }, + { + "epoch": 0.36, + "grad_norm": 0.8591868818411956, + "learning_rate": 1.4779567077038912e-05, + "loss": 0.5607, + "step": 4444 + }, + { + "epoch": 0.36, + "grad_norm": 0.8701437695360021, + "learning_rate": 1.477725459134906e-05, + "loss": 0.5065, + "step": 4445 + }, + { + "epoch": 0.36, + "grad_norm": 0.9023479514656566, + "learning_rate": 1.4774941774599703e-05, + "loss": 0.5855, + "step": 4446 + }, + { + "epoch": 0.36, + "grad_norm": 0.8904185731123726, + "learning_rate": 1.4772628626951114e-05, + "loss": 0.5713, + "step": 4447 + }, + { + "epoch": 0.36, + "grad_norm": 0.8535558813189821, + "learning_rate": 1.4770315148563596e-05, + "loss": 0.5409, + "step": 4448 + }, + { + "epoch": 0.36, + "grad_norm": 0.962305319537762, + "learning_rate": 1.4768001339597471e-05, + "loss": 0.5573, + "step": 4449 + }, + { + "epoch": 0.36, + "grad_norm": 0.9126805318858925, + "learning_rate": 1.4765687200213079e-05, + "loss": 0.5475, + "step": 4450 + }, + { + "epoch": 0.36, + "grad_norm": 0.9893873429771054, + "learning_rate": 1.4763372730570793e-05, + "loss": 0.6411, + "step": 4451 + }, + { + "epoch": 0.36, + "grad_norm": 0.935005237685736, + "learning_rate": 1.4761057930831002e-05, + "loss": 0.5588, + "step": 4452 + }, + { + "epoch": 0.36, + "grad_norm": 0.9159587474165702, + "learning_rate": 1.475874280115412e-05, + "loss": 0.5548, + "step": 4453 + }, + { + "epoch": 0.36, + "grad_norm": 0.9045849178952995, + "learning_rate": 1.475642734170058e-05, + "loss": 0.5059, + "step": 4454 + }, + { + "epoch": 0.36, + "grad_norm": 0.9272647834317185, + "learning_rate": 1.475411155263085e-05, + "loss": 0.5701, + "step": 4455 + }, + { + "epoch": 0.36, + "grad_norm": 0.8679504825088333, + "learning_rate": 1.47517954341054e-05, + "loss": 0.5504, + "step": 4456 + }, + { + "epoch": 0.36, + "grad_norm": 0.8784123628733355, + "learning_rate": 1.4749478986284743e-05, + "loss": 0.5651, + "step": 4457 + }, + { + "epoch": 0.36, + "grad_norm": 0.7613363838766739, + "learning_rate": 1.4747162209329408e-05, + "loss": 0.5165, + "step": 4458 + }, + { + "epoch": 0.36, + "grad_norm": 1.014684584701171, + "learning_rate": 1.474484510339994e-05, + "loss": 0.6104, + "step": 4459 + }, + { + "epoch": 0.36, + "grad_norm": 0.8678082577950834, + "learning_rate": 1.4742527668656915e-05, + "loss": 0.5358, + "step": 4460 + }, + { + "epoch": 0.36, + "grad_norm": 0.9801279027102768, + "learning_rate": 1.474020990526093e-05, + "loss": 0.5773, + "step": 4461 + }, + { + "epoch": 0.36, + "grad_norm": 0.829818263982647, + "learning_rate": 1.4737891813372605e-05, + "loss": 0.5223, + "step": 4462 + }, + { + "epoch": 0.36, + "grad_norm": 0.8525757596323112, + "learning_rate": 1.4735573393152576e-05, + "loss": 0.5602, + "step": 4463 + }, + { + "epoch": 0.36, + "grad_norm": 0.9103375069043098, + "learning_rate": 1.4733254644761514e-05, + "loss": 0.5933, + "step": 4464 + }, + { + "epoch": 0.36, + "grad_norm": 0.9087295841386293, + "learning_rate": 1.4730935568360103e-05, + "loss": 0.6359, + "step": 4465 + }, + { + "epoch": 0.36, + "grad_norm": 0.9403445729528365, + "learning_rate": 1.4728616164109051e-05, + "loss": 0.5755, + "step": 4466 + }, + { + "epoch": 0.36, + "grad_norm": 0.879590804392101, + "learning_rate": 1.4726296432169095e-05, + "loss": 0.5774, + "step": 4467 + }, + { + "epoch": 0.36, + "grad_norm": 0.9251101238642484, + "learning_rate": 1.472397637270099e-05, + "loss": 0.5488, + "step": 4468 + }, + { + "epoch": 0.36, + "grad_norm": 0.9440786095944539, + "learning_rate": 1.472165598586551e-05, + "loss": 0.5501, + "step": 4469 + }, + { + "epoch": 0.36, + "grad_norm": 0.892311824110475, + "learning_rate": 1.4719335271823461e-05, + "loss": 0.5545, + "step": 4470 + }, + { + "epoch": 0.36, + "grad_norm": 0.9222735252882099, + "learning_rate": 1.4717014230735661e-05, + "loss": 0.5671, + "step": 4471 + }, + { + "epoch": 0.36, + "grad_norm": 0.8774550769864838, + "learning_rate": 1.471469286276296e-05, + "loss": 0.5856, + "step": 4472 + }, + { + "epoch": 0.36, + "grad_norm": 0.8945744492384955, + "learning_rate": 1.4712371168066227e-05, + "loss": 0.5554, + "step": 4473 + }, + { + "epoch": 0.36, + "grad_norm": 0.8632486488540376, + "learning_rate": 1.4710049146806348e-05, + "loss": 0.5571, + "step": 4474 + }, + { + "epoch": 0.36, + "grad_norm": 0.9075970561265309, + "learning_rate": 1.4707726799144245e-05, + "loss": 0.5557, + "step": 4475 + }, + { + "epoch": 0.36, + "grad_norm": 0.8623183406655154, + "learning_rate": 1.4705404125240849e-05, + "loss": 0.562, + "step": 4476 + }, + { + "epoch": 0.36, + "grad_norm": 0.8993540931201318, + "learning_rate": 1.470308112525712e-05, + "loss": 0.5011, + "step": 4477 + }, + { + "epoch": 0.36, + "grad_norm": 0.9070876790888006, + "learning_rate": 1.470075779935404e-05, + "loss": 0.5386, + "step": 4478 + }, + { + "epoch": 0.36, + "grad_norm": 0.894769666418227, + "learning_rate": 1.4698434147692618e-05, + "loss": 0.6098, + "step": 4479 + }, + { + "epoch": 0.36, + "grad_norm": 0.89583961043636, + "learning_rate": 1.4696110170433873e-05, + "loss": 0.5813, + "step": 4480 + }, + { + "epoch": 0.36, + "grad_norm": 0.8861955251651111, + "learning_rate": 1.469378586773886e-05, + "loss": 0.565, + "step": 4481 + }, + { + "epoch": 0.36, + "grad_norm": 0.9275619282158948, + "learning_rate": 1.4691461239768649e-05, + "loss": 0.5474, + "step": 4482 + }, + { + "epoch": 0.36, + "grad_norm": 0.9046505136392399, + "learning_rate": 1.4689136286684335e-05, + "loss": 0.5759, + "step": 4483 + }, + { + "epoch": 0.36, + "grad_norm": 0.9305391975015446, + "learning_rate": 1.4686811008647037e-05, + "loss": 0.5531, + "step": 4484 + }, + { + "epoch": 0.36, + "grad_norm": 0.9903532887324388, + "learning_rate": 1.4684485405817897e-05, + "loss": 0.578, + "step": 4485 + }, + { + "epoch": 0.36, + "grad_norm": 0.905053324048193, + "learning_rate": 1.4682159478358067e-05, + "loss": 0.5344, + "step": 4486 + }, + { + "epoch": 0.36, + "grad_norm": 0.9706423919522855, + "learning_rate": 1.467983322642874e-05, + "loss": 0.5948, + "step": 4487 + }, + { + "epoch": 0.36, + "grad_norm": 0.9127425851510094, + "learning_rate": 1.4677506650191124e-05, + "loss": 0.5651, + "step": 4488 + }, + { + "epoch": 0.36, + "grad_norm": 0.9640169198932111, + "learning_rate": 1.4675179749806444e-05, + "loss": 0.5849, + "step": 4489 + }, + { + "epoch": 0.36, + "grad_norm": 0.8409429176257024, + "learning_rate": 1.4672852525435954e-05, + "loss": 0.5098, + "step": 4490 + }, + { + "epoch": 0.37, + "grad_norm": 0.9086225149968713, + "learning_rate": 1.4670524977240929e-05, + "loss": 0.5602, + "step": 4491 + }, + { + "epoch": 0.37, + "grad_norm": 0.9895660646217311, + "learning_rate": 1.4668197105382667e-05, + "loss": 0.6056, + "step": 4492 + }, + { + "epoch": 0.37, + "grad_norm": 0.9110056402144131, + "learning_rate": 1.4665868910022485e-05, + "loss": 0.569, + "step": 4493 + }, + { + "epoch": 0.37, + "grad_norm": 0.964291622484286, + "learning_rate": 1.4663540391321726e-05, + "loss": 0.5913, + "step": 4494 + }, + { + "epoch": 0.37, + "grad_norm": 1.0321238359078417, + "learning_rate": 1.4661211549441756e-05, + "loss": 0.6189, + "step": 4495 + }, + { + "epoch": 0.37, + "grad_norm": 0.8883996584082586, + "learning_rate": 1.465888238454396e-05, + "loss": 0.5703, + "step": 4496 + }, + { + "epoch": 0.37, + "grad_norm": 0.9221703556073723, + "learning_rate": 1.4656552896789746e-05, + "loss": 0.5558, + "step": 4497 + }, + { + "epoch": 0.37, + "grad_norm": 0.8746129298054462, + "learning_rate": 1.4654223086340547e-05, + "loss": 0.488, + "step": 4498 + }, + { + "epoch": 0.37, + "grad_norm": 0.9292851665842938, + "learning_rate": 1.4651892953357816e-05, + "loss": 0.5466, + "step": 4499 + }, + { + "epoch": 0.37, + "grad_norm": 0.9489222789168605, + "learning_rate": 1.4649562498003032e-05, + "loss": 0.6406, + "step": 4500 + }, + { + "epoch": 0.37, + "grad_norm": 0.9472256638031281, + "learning_rate": 1.4647231720437687e-05, + "loss": 0.5931, + "step": 4501 + }, + { + "epoch": 0.37, + "grad_norm": 0.8927187601605701, + "learning_rate": 1.4644900620823308e-05, + "loss": 0.5251, + "step": 4502 + }, + { + "epoch": 0.37, + "grad_norm": 0.9092235192734804, + "learning_rate": 1.4642569199321436e-05, + "loss": 0.5596, + "step": 4503 + }, + { + "epoch": 0.37, + "grad_norm": 0.9911171371540515, + "learning_rate": 1.4640237456093636e-05, + "loss": 0.6254, + "step": 4504 + }, + { + "epoch": 0.37, + "grad_norm": 0.8902682685340019, + "learning_rate": 1.4637905391301496e-05, + "loss": 0.5597, + "step": 4505 + }, + { + "epoch": 0.37, + "grad_norm": 0.9779813845697346, + "learning_rate": 1.4635573005106627e-05, + "loss": 0.5155, + "step": 4506 + }, + { + "epoch": 0.37, + "grad_norm": 0.8593153982172275, + "learning_rate": 1.4633240297670661e-05, + "loss": 0.5848, + "step": 4507 + }, + { + "epoch": 0.37, + "grad_norm": 0.9417037319181273, + "learning_rate": 1.463090726915525e-05, + "loss": 0.5036, + "step": 4508 + }, + { + "epoch": 0.37, + "grad_norm": 0.826959279039463, + "learning_rate": 1.4628573919722073e-05, + "loss": 0.5068, + "step": 4509 + }, + { + "epoch": 0.37, + "grad_norm": 0.9122443267097082, + "learning_rate": 1.4626240249532833e-05, + "loss": 0.5697, + "step": 4510 + }, + { + "epoch": 0.37, + "grad_norm": 0.911386237979581, + "learning_rate": 1.4623906258749243e-05, + "loss": 0.5511, + "step": 4511 + }, + { + "epoch": 0.37, + "grad_norm": 0.9836468674217567, + "learning_rate": 1.462157194753305e-05, + "loss": 0.6439, + "step": 4512 + }, + { + "epoch": 0.37, + "grad_norm": 0.9245030210290315, + "learning_rate": 1.4619237316046024e-05, + "loss": 0.5938, + "step": 4513 + }, + { + "epoch": 0.37, + "grad_norm": 0.8823041615405315, + "learning_rate": 1.4616902364449947e-05, + "loss": 0.5385, + "step": 4514 + }, + { + "epoch": 0.37, + "grad_norm": 0.9282146018943377, + "learning_rate": 1.4614567092906631e-05, + "loss": 0.5289, + "step": 4515 + }, + { + "epoch": 0.37, + "grad_norm": 0.8513409879459296, + "learning_rate": 1.4612231501577912e-05, + "loss": 0.4712, + "step": 4516 + }, + { + "epoch": 0.37, + "grad_norm": 0.8970738325052732, + "learning_rate": 1.4609895590625635e-05, + "loss": 0.5656, + "step": 4517 + }, + { + "epoch": 0.37, + "grad_norm": 0.8977821484073438, + "learning_rate": 1.4607559360211688e-05, + "loss": 0.5297, + "step": 4518 + }, + { + "epoch": 0.37, + "grad_norm": 0.8660669647290293, + "learning_rate": 1.460522281049796e-05, + "loss": 0.5524, + "step": 4519 + }, + { + "epoch": 0.37, + "grad_norm": 0.8910709052637866, + "learning_rate": 1.460288594164638e-05, + "loss": 0.5052, + "step": 4520 + }, + { + "epoch": 0.37, + "grad_norm": 0.8727890717084368, + "learning_rate": 1.4600548753818884e-05, + "loss": 0.4904, + "step": 4521 + }, + { + "epoch": 0.37, + "grad_norm": 0.9238031222868925, + "learning_rate": 1.4598211247177443e-05, + "loss": 0.5652, + "step": 4522 + }, + { + "epoch": 0.37, + "grad_norm": 0.9175522190299712, + "learning_rate": 1.459587342188404e-05, + "loss": 0.5368, + "step": 4523 + }, + { + "epoch": 0.37, + "grad_norm": 0.9529440524959538, + "learning_rate": 1.4593535278100684e-05, + "loss": 0.5888, + "step": 4524 + }, + { + "epoch": 0.37, + "grad_norm": 0.8746329567698374, + "learning_rate": 1.4591196815989407e-05, + "loss": 0.5749, + "step": 4525 + }, + { + "epoch": 0.37, + "grad_norm": 0.9699853208903029, + "learning_rate": 1.4588858035712266e-05, + "loss": 0.5508, + "step": 4526 + }, + { + "epoch": 0.37, + "grad_norm": 0.9555329839997029, + "learning_rate": 1.4586518937431332e-05, + "loss": 0.5743, + "step": 4527 + }, + { + "epoch": 0.37, + "grad_norm": 0.9458608579519454, + "learning_rate": 1.4584179521308703e-05, + "loss": 0.608, + "step": 4528 + }, + { + "epoch": 0.37, + "grad_norm": 0.8527041686893861, + "learning_rate": 1.45818397875065e-05, + "loss": 0.473, + "step": 4529 + }, + { + "epoch": 0.37, + "grad_norm": 0.8928812741342821, + "learning_rate": 1.4579499736186864e-05, + "loss": 0.5538, + "step": 4530 + }, + { + "epoch": 0.37, + "grad_norm": 0.9120715539821165, + "learning_rate": 1.4577159367511959e-05, + "loss": 0.4966, + "step": 4531 + }, + { + "epoch": 0.37, + "grad_norm": 1.0404997402846254, + "learning_rate": 1.457481868164397e-05, + "loss": 0.5259, + "step": 4532 + }, + { + "epoch": 0.37, + "grad_norm": 0.9213843354801686, + "learning_rate": 1.45724776787451e-05, + "loss": 0.5097, + "step": 4533 + }, + { + "epoch": 0.37, + "grad_norm": 1.1301394046908915, + "learning_rate": 1.4570136358977589e-05, + "loss": 0.5866, + "step": 4534 + }, + { + "epoch": 0.37, + "grad_norm": 0.8993396951953636, + "learning_rate": 1.456779472250368e-05, + "loss": 0.6042, + "step": 4535 + }, + { + "epoch": 0.37, + "grad_norm": 0.9597726253126289, + "learning_rate": 1.4565452769485644e-05, + "loss": 0.6088, + "step": 4536 + }, + { + "epoch": 0.37, + "grad_norm": 0.9236890364028195, + "learning_rate": 1.4563110500085786e-05, + "loss": 0.5624, + "step": 4537 + }, + { + "epoch": 0.37, + "grad_norm": 0.8956434628330934, + "learning_rate": 1.4560767914466417e-05, + "loss": 0.5511, + "step": 4538 + }, + { + "epoch": 0.37, + "grad_norm": 0.9639572643026926, + "learning_rate": 1.4558425012789873e-05, + "loss": 0.5579, + "step": 4539 + }, + { + "epoch": 0.37, + "grad_norm": 0.9794697866313931, + "learning_rate": 1.4556081795218525e-05, + "loss": 0.5269, + "step": 4540 + }, + { + "epoch": 0.37, + "grad_norm": 0.8887634982521235, + "learning_rate": 1.4553738261914742e-05, + "loss": 0.5336, + "step": 4541 + }, + { + "epoch": 0.37, + "grad_norm": 0.9542512904237881, + "learning_rate": 1.4551394413040942e-05, + "loss": 0.5485, + "step": 4542 + }, + { + "epoch": 0.37, + "grad_norm": 0.994243489296905, + "learning_rate": 1.4549050248759546e-05, + "loss": 0.5648, + "step": 4543 + }, + { + "epoch": 0.37, + "grad_norm": 0.973290658485551, + "learning_rate": 1.4546705769233003e-05, + "loss": 0.5819, + "step": 4544 + }, + { + "epoch": 0.37, + "grad_norm": 0.9707765308575302, + "learning_rate": 1.4544360974623781e-05, + "loss": 0.5097, + "step": 4545 + }, + { + "epoch": 0.37, + "grad_norm": 0.9736719646068369, + "learning_rate": 1.4542015865094377e-05, + "loss": 0.5682, + "step": 4546 + }, + { + "epoch": 0.37, + "grad_norm": 0.9026646459372545, + "learning_rate": 1.4539670440807298e-05, + "loss": 0.5721, + "step": 4547 + }, + { + "epoch": 0.37, + "grad_norm": 0.9847250280289744, + "learning_rate": 1.4537324701925088e-05, + "loss": 0.632, + "step": 4548 + }, + { + "epoch": 0.37, + "grad_norm": 0.9816038435971146, + "learning_rate": 1.4534978648610301e-05, + "loss": 0.5988, + "step": 4549 + }, + { + "epoch": 0.37, + "grad_norm": 1.0238723489848807, + "learning_rate": 1.4532632281025514e-05, + "loss": 0.6407, + "step": 4550 + }, + { + "epoch": 0.37, + "grad_norm": 0.8774092216491454, + "learning_rate": 1.4530285599333332e-05, + "loss": 0.5169, + "step": 4551 + }, + { + "epoch": 0.37, + "grad_norm": 0.8977191065810496, + "learning_rate": 1.4527938603696376e-05, + "loss": 0.5758, + "step": 4552 + }, + { + "epoch": 0.37, + "grad_norm": 0.8679359206799987, + "learning_rate": 1.452559129427729e-05, + "loss": 0.5466, + "step": 4553 + }, + { + "epoch": 0.37, + "grad_norm": 0.8862085480729563, + "learning_rate": 1.4523243671238741e-05, + "loss": 0.5008, + "step": 4554 + }, + { + "epoch": 0.37, + "grad_norm": 0.8560466744924718, + "learning_rate": 1.4520895734743419e-05, + "loss": 0.5498, + "step": 4555 + }, + { + "epoch": 0.37, + "grad_norm": 0.870445799003857, + "learning_rate": 1.4518547484954033e-05, + "loss": 0.5446, + "step": 4556 + }, + { + "epoch": 0.37, + "grad_norm": 0.9915550536321615, + "learning_rate": 1.4516198922033313e-05, + "loss": 0.5891, + "step": 4557 + }, + { + "epoch": 0.37, + "grad_norm": 0.9766747506463757, + "learning_rate": 1.4513850046144015e-05, + "loss": 0.5856, + "step": 4558 + }, + { + "epoch": 0.37, + "grad_norm": 1.1299468463134879, + "learning_rate": 1.451150085744891e-05, + "loss": 0.5327, + "step": 4559 + }, + { + "epoch": 0.37, + "grad_norm": 0.876075346471893, + "learning_rate": 1.45091513561108e-05, + "loss": 0.5676, + "step": 4560 + }, + { + "epoch": 0.37, + "grad_norm": 0.9026216340846099, + "learning_rate": 1.4506801542292501e-05, + "loss": 0.4733, + "step": 4561 + }, + { + "epoch": 0.37, + "grad_norm": 0.9271284281161375, + "learning_rate": 1.450445141615685e-05, + "loss": 0.6201, + "step": 4562 + }, + { + "epoch": 0.37, + "grad_norm": 0.9643881947320083, + "learning_rate": 1.4502100977866713e-05, + "loss": 0.63, + "step": 4563 + }, + { + "epoch": 0.37, + "grad_norm": 0.9016319613024337, + "learning_rate": 1.449975022758497e-05, + "loss": 0.5579, + "step": 4564 + }, + { + "epoch": 0.37, + "grad_norm": 0.8301182624895908, + "learning_rate": 1.449739916547453e-05, + "loss": 0.5391, + "step": 4565 + }, + { + "epoch": 0.37, + "grad_norm": 1.0885222691950676, + "learning_rate": 1.4495047791698316e-05, + "loss": 0.5193, + "step": 4566 + }, + { + "epoch": 0.37, + "grad_norm": 0.8532968406074554, + "learning_rate": 1.4492696106419275e-05, + "loss": 0.5216, + "step": 4567 + }, + { + "epoch": 0.37, + "grad_norm": 0.9434837561969038, + "learning_rate": 1.4490344109800382e-05, + "loss": 0.5765, + "step": 4568 + }, + { + "epoch": 0.37, + "grad_norm": 0.8803884583946759, + "learning_rate": 1.4487991802004625e-05, + "loss": 0.486, + "step": 4569 + }, + { + "epoch": 0.37, + "grad_norm": 0.8000039279991835, + "learning_rate": 1.4485639183195014e-05, + "loss": 0.4766, + "step": 4570 + }, + { + "epoch": 0.37, + "grad_norm": 0.9594388992920403, + "learning_rate": 1.448328625353459e-05, + "loss": 0.5695, + "step": 4571 + }, + { + "epoch": 0.37, + "grad_norm": 0.8484072973197494, + "learning_rate": 1.4480933013186403e-05, + "loss": 0.5692, + "step": 4572 + }, + { + "epoch": 0.37, + "grad_norm": 1.1050447422521597, + "learning_rate": 1.4478579462313533e-05, + "loss": 0.5268, + "step": 4573 + }, + { + "epoch": 0.37, + "grad_norm": 0.8681889570895173, + "learning_rate": 1.4476225601079078e-05, + "loss": 0.5614, + "step": 4574 + }, + { + "epoch": 0.37, + "grad_norm": 0.9827528961085784, + "learning_rate": 1.447387142964616e-05, + "loss": 0.6487, + "step": 4575 + }, + { + "epoch": 0.37, + "grad_norm": 0.9192293189477148, + "learning_rate": 1.4471516948177921e-05, + "loss": 0.5596, + "step": 4576 + }, + { + "epoch": 0.37, + "grad_norm": 0.8507728172540402, + "learning_rate": 1.4469162156837521e-05, + "loss": 0.458, + "step": 4577 + }, + { + "epoch": 0.37, + "grad_norm": 0.9116255169315678, + "learning_rate": 1.4466807055788152e-05, + "loss": 0.543, + "step": 4578 + }, + { + "epoch": 0.37, + "grad_norm": 0.8972062853518864, + "learning_rate": 1.4464451645193013e-05, + "loss": 0.5706, + "step": 4579 + }, + { + "epoch": 0.37, + "grad_norm": 0.8844699891420403, + "learning_rate": 1.4462095925215336e-05, + "loss": 0.5486, + "step": 4580 + }, + { + "epoch": 0.37, + "grad_norm": 0.8469427622135454, + "learning_rate": 1.445973989601837e-05, + "loss": 0.5295, + "step": 4581 + }, + { + "epoch": 0.37, + "grad_norm": 1.0531060450779732, + "learning_rate": 1.4457383557765385e-05, + "loss": 0.6393, + "step": 4582 + }, + { + "epoch": 0.37, + "grad_norm": 0.8996525214835874, + "learning_rate": 1.4455026910619672e-05, + "loss": 0.5354, + "step": 4583 + }, + { + "epoch": 0.37, + "grad_norm": 0.8842873124765084, + "learning_rate": 1.4452669954744545e-05, + "loss": 0.5528, + "step": 4584 + }, + { + "epoch": 0.37, + "grad_norm": 0.8787924380403497, + "learning_rate": 1.4450312690303342e-05, + "loss": 0.629, + "step": 4585 + }, + { + "epoch": 0.37, + "grad_norm": 0.8125932722275822, + "learning_rate": 1.4447955117459414e-05, + "loss": 0.4836, + "step": 4586 + }, + { + "epoch": 0.37, + "grad_norm": 0.8355003410967605, + "learning_rate": 1.4445597236376143e-05, + "loss": 0.5161, + "step": 4587 + }, + { + "epoch": 0.37, + "grad_norm": 0.890935498183146, + "learning_rate": 1.4443239047216928e-05, + "loss": 0.5175, + "step": 4588 + }, + { + "epoch": 0.37, + "grad_norm": 0.9152569502319765, + "learning_rate": 1.4440880550145187e-05, + "loss": 0.5548, + "step": 4589 + }, + { + "epoch": 0.37, + "grad_norm": 0.9338050629782046, + "learning_rate": 1.4438521745324363e-05, + "loss": 0.605, + "step": 4590 + }, + { + "epoch": 0.37, + "grad_norm": 0.8610689574026248, + "learning_rate": 1.4436162632917918e-05, + "loss": 0.5304, + "step": 4591 + }, + { + "epoch": 0.37, + "grad_norm": 0.8372614133811489, + "learning_rate": 1.4433803213089341e-05, + "loss": 0.5228, + "step": 4592 + }, + { + "epoch": 0.37, + "grad_norm": 0.8240098780319006, + "learning_rate": 1.443144348600213e-05, + "loss": 0.5509, + "step": 4593 + }, + { + "epoch": 0.37, + "grad_norm": 0.9114321836325133, + "learning_rate": 1.442908345181982e-05, + "loss": 0.5135, + "step": 4594 + }, + { + "epoch": 0.37, + "grad_norm": 0.8340969162814056, + "learning_rate": 1.442672311070595e-05, + "loss": 0.5039, + "step": 4595 + }, + { + "epoch": 0.37, + "grad_norm": 0.886391196202956, + "learning_rate": 1.44243624628241e-05, + "loss": 0.5364, + "step": 4596 + }, + { + "epoch": 0.37, + "grad_norm": 0.9536427603728262, + "learning_rate": 1.4422001508337853e-05, + "loss": 0.5255, + "step": 4597 + }, + { + "epoch": 0.37, + "grad_norm": 0.9276882420801554, + "learning_rate": 1.4419640247410827e-05, + "loss": 0.5701, + "step": 4598 + }, + { + "epoch": 0.37, + "grad_norm": 0.8960716066915557, + "learning_rate": 1.4417278680206647e-05, + "loss": 0.594, + "step": 4599 + }, + { + "epoch": 0.37, + "grad_norm": 0.9832722742030179, + "learning_rate": 1.4414916806888976e-05, + "loss": 0.6219, + "step": 4600 + }, + { + "epoch": 0.37, + "grad_norm": 0.9796662164059784, + "learning_rate": 1.4412554627621487e-05, + "loss": 0.5509, + "step": 4601 + }, + { + "epoch": 0.37, + "grad_norm": 0.9263311552853005, + "learning_rate": 1.4410192142567874e-05, + "loss": 0.4821, + "step": 4602 + }, + { + "epoch": 0.37, + "grad_norm": 0.9106880002039524, + "learning_rate": 1.4407829351891858e-05, + "loss": 0.5703, + "step": 4603 + }, + { + "epoch": 0.37, + "grad_norm": 0.8117373702202885, + "learning_rate": 1.4405466255757178e-05, + "loss": 0.5324, + "step": 4604 + }, + { + "epoch": 0.37, + "grad_norm": 0.9051324540084974, + "learning_rate": 1.4403102854327595e-05, + "loss": 0.5474, + "step": 4605 + }, + { + "epoch": 0.37, + "grad_norm": 0.8988441607793508, + "learning_rate": 1.4400739147766887e-05, + "loss": 0.5718, + "step": 4606 + }, + { + "epoch": 0.37, + "grad_norm": 0.9418428015077235, + "learning_rate": 1.4398375136238864e-05, + "loss": 0.5627, + "step": 4607 + }, + { + "epoch": 0.37, + "grad_norm": 0.8659909136750779, + "learning_rate": 1.439601081990734e-05, + "loss": 0.5206, + "step": 4608 + }, + { + "epoch": 0.37, + "grad_norm": 0.9068140592385949, + "learning_rate": 1.4393646198936169e-05, + "loss": 0.5892, + "step": 4609 + }, + { + "epoch": 0.37, + "grad_norm": 0.9967771784960222, + "learning_rate": 1.4391281273489216e-05, + "loss": 0.6452, + "step": 4610 + }, + { + "epoch": 0.37, + "grad_norm": 0.8349913229312583, + "learning_rate": 1.438891604373036e-05, + "loss": 0.5053, + "step": 4611 + }, + { + "epoch": 0.37, + "grad_norm": 0.8885153877992091, + "learning_rate": 1.4386550509823515e-05, + "loss": 0.5386, + "step": 4612 + }, + { + "epoch": 0.37, + "grad_norm": 0.8717848449955791, + "learning_rate": 1.4384184671932616e-05, + "loss": 0.4851, + "step": 4613 + }, + { + "epoch": 0.38, + "grad_norm": 0.962862508614908, + "learning_rate": 1.4381818530221604e-05, + "loss": 0.6124, + "step": 4614 + }, + { + "epoch": 0.38, + "grad_norm": 0.8910489915347034, + "learning_rate": 1.4379452084854455e-05, + "loss": 0.538, + "step": 4615 + }, + { + "epoch": 0.38, + "grad_norm": 0.90704637704644, + "learning_rate": 1.4377085335995165e-05, + "loss": 0.5425, + "step": 4616 + }, + { + "epoch": 0.38, + "grad_norm": 0.8258031697206105, + "learning_rate": 1.4374718283807738e-05, + "loss": 0.5118, + "step": 4617 + }, + { + "epoch": 0.38, + "grad_norm": 0.8838282368452399, + "learning_rate": 1.4372350928456218e-05, + "loss": 0.5603, + "step": 4618 + }, + { + "epoch": 0.38, + "grad_norm": 0.8717929142089527, + "learning_rate": 1.4369983270104654e-05, + "loss": 0.5141, + "step": 4619 + }, + { + "epoch": 0.38, + "grad_norm": 0.9120544658922455, + "learning_rate": 1.436761530891713e-05, + "loss": 0.6108, + "step": 4620 + }, + { + "epoch": 0.38, + "grad_norm": 0.9513750558154765, + "learning_rate": 1.4365247045057732e-05, + "loss": 0.5394, + "step": 4621 + }, + { + "epoch": 0.38, + "grad_norm": 0.9271504423453096, + "learning_rate": 1.4362878478690595e-05, + "loss": 0.6059, + "step": 4622 + }, + { + "epoch": 0.38, + "grad_norm": 0.935788851587579, + "learning_rate": 1.4360509609979842e-05, + "loss": 0.5753, + "step": 4623 + }, + { + "epoch": 0.38, + "grad_norm": 0.9640035939735568, + "learning_rate": 1.4358140439089644e-05, + "loss": 0.5396, + "step": 4624 + }, + { + "epoch": 0.38, + "grad_norm": 1.0144514931115078, + "learning_rate": 1.435577096618418e-05, + "loss": 0.5673, + "step": 4625 + }, + { + "epoch": 0.38, + "grad_norm": 1.0271339489351563, + "learning_rate": 1.435340119142765e-05, + "loss": 0.5648, + "step": 4626 + }, + { + "epoch": 0.38, + "grad_norm": 0.8403509294262873, + "learning_rate": 1.4351031114984277e-05, + "loss": 0.4805, + "step": 4627 + }, + { + "epoch": 0.38, + "grad_norm": 0.9565545155982719, + "learning_rate": 1.434866073701831e-05, + "loss": 0.553, + "step": 4628 + }, + { + "epoch": 0.38, + "grad_norm": 0.9695005048984443, + "learning_rate": 1.434629005769401e-05, + "loss": 0.5715, + "step": 4629 + }, + { + "epoch": 0.38, + "grad_norm": 0.977899291178307, + "learning_rate": 1.4343919077175662e-05, + "loss": 0.5849, + "step": 4630 + }, + { + "epoch": 0.38, + "grad_norm": 0.884291550415101, + "learning_rate": 1.434154779562758e-05, + "loss": 0.5444, + "step": 4631 + }, + { + "epoch": 0.38, + "grad_norm": 0.8531258142757846, + "learning_rate": 1.4339176213214084e-05, + "loss": 0.5753, + "step": 4632 + }, + { + "epoch": 0.38, + "grad_norm": 0.8577345206288213, + "learning_rate": 1.4336804330099525e-05, + "loss": 0.5059, + "step": 4633 + }, + { + "epoch": 0.38, + "grad_norm": 0.7924570235333922, + "learning_rate": 1.4334432146448272e-05, + "loss": 0.5033, + "step": 4634 + }, + { + "epoch": 0.38, + "grad_norm": 1.014824293404601, + "learning_rate": 1.433205966242472e-05, + "loss": 0.6218, + "step": 4635 + }, + { + "epoch": 0.38, + "grad_norm": 0.9407577361679119, + "learning_rate": 1.4329686878193271e-05, + "loss": 0.5733, + "step": 4636 + }, + { + "epoch": 0.38, + "grad_norm": 0.8702144654770304, + "learning_rate": 1.4327313793918362e-05, + "loss": 0.602, + "step": 4637 + }, + { + "epoch": 0.38, + "grad_norm": 0.8761690276292466, + "learning_rate": 1.432494040976445e-05, + "loss": 0.5269, + "step": 4638 + }, + { + "epoch": 0.38, + "grad_norm": 0.929781433069658, + "learning_rate": 1.4322566725895998e-05, + "loss": 0.5445, + "step": 4639 + }, + { + "epoch": 0.38, + "grad_norm": 0.7992557146672608, + "learning_rate": 1.432019274247751e-05, + "loss": 0.5139, + "step": 4640 + }, + { + "epoch": 0.38, + "grad_norm": 0.773596192547856, + "learning_rate": 1.4317818459673496e-05, + "loss": 0.4513, + "step": 4641 + }, + { + "epoch": 0.38, + "grad_norm": 0.9862725969212202, + "learning_rate": 1.4315443877648494e-05, + "loss": 0.6312, + "step": 4642 + }, + { + "epoch": 0.38, + "grad_norm": 0.9346377617043576, + "learning_rate": 1.4313068996567054e-05, + "loss": 0.5526, + "step": 4643 + }, + { + "epoch": 0.38, + "grad_norm": 0.9096616854418217, + "learning_rate": 1.4310693816593766e-05, + "loss": 0.5035, + "step": 4644 + }, + { + "epoch": 0.38, + "grad_norm": 0.8896567937140818, + "learning_rate": 1.4308318337893214e-05, + "loss": 0.5757, + "step": 4645 + }, + { + "epoch": 0.38, + "grad_norm": 0.8989146692626034, + "learning_rate": 1.4305942560630025e-05, + "loss": 0.5482, + "step": 4646 + }, + { + "epoch": 0.38, + "grad_norm": 0.9001044151809969, + "learning_rate": 1.4303566484968836e-05, + "loss": 0.5518, + "step": 4647 + }, + { + "epoch": 0.38, + "grad_norm": 0.8284432728019371, + "learning_rate": 1.4301190111074306e-05, + "loss": 0.5221, + "step": 4648 + }, + { + "epoch": 0.38, + "grad_norm": 0.8823025857272053, + "learning_rate": 1.4298813439111116e-05, + "loss": 0.5092, + "step": 4649 + }, + { + "epoch": 0.38, + "grad_norm": 0.9638802531589524, + "learning_rate": 1.4296436469243968e-05, + "loss": 0.5593, + "step": 4650 + }, + { + "epoch": 0.38, + "grad_norm": 0.8957597475760447, + "learning_rate": 1.4294059201637584e-05, + "loss": 0.6216, + "step": 4651 + }, + { + "epoch": 0.38, + "grad_norm": 0.9922406505863793, + "learning_rate": 1.4291681636456706e-05, + "loss": 0.585, + "step": 4652 + }, + { + "epoch": 0.38, + "grad_norm": 0.9067841463591434, + "learning_rate": 1.42893037738661e-05, + "loss": 0.5457, + "step": 4653 + }, + { + "epoch": 0.38, + "grad_norm": 0.8603834578523885, + "learning_rate": 1.4286925614030542e-05, + "loss": 0.5728, + "step": 4654 + }, + { + "epoch": 0.38, + "grad_norm": 0.9463071681212694, + "learning_rate": 1.4284547157114846e-05, + "loss": 0.5982, + "step": 4655 + }, + { + "epoch": 0.38, + "grad_norm": 0.8547064534994039, + "learning_rate": 1.4282168403283829e-05, + "loss": 0.5284, + "step": 4656 + }, + { + "epoch": 0.38, + "grad_norm": 0.883673974873926, + "learning_rate": 1.4279789352702342e-05, + "loss": 0.5759, + "step": 4657 + }, + { + "epoch": 0.38, + "grad_norm": 0.9929381579362356, + "learning_rate": 1.4277410005535249e-05, + "loss": 0.5804, + "step": 4658 + }, + { + "epoch": 0.38, + "grad_norm": 0.8075944223992195, + "learning_rate": 1.4275030361947438e-05, + "loss": 0.5404, + "step": 4659 + }, + { + "epoch": 0.38, + "grad_norm": 0.9575072735956454, + "learning_rate": 1.427265042210381e-05, + "loss": 0.598, + "step": 4660 + }, + { + "epoch": 0.38, + "grad_norm": 0.9730461139752252, + "learning_rate": 1.4270270186169301e-05, + "loss": 0.5552, + "step": 4661 + }, + { + "epoch": 0.38, + "grad_norm": 0.8627026160167858, + "learning_rate": 1.4267889654308858e-05, + "loss": 0.5816, + "step": 4662 + }, + { + "epoch": 0.38, + "grad_norm": 0.8134697842892554, + "learning_rate": 1.4265508826687442e-05, + "loss": 0.5051, + "step": 4663 + }, + { + "epoch": 0.38, + "grad_norm": 0.8604152441105043, + "learning_rate": 1.4263127703470053e-05, + "loss": 0.5473, + "step": 4664 + }, + { + "epoch": 0.38, + "grad_norm": 0.8854386728519589, + "learning_rate": 1.4260746284821694e-05, + "loss": 0.5699, + "step": 4665 + }, + { + "epoch": 0.38, + "grad_norm": 0.8920693498069167, + "learning_rate": 1.4258364570907395e-05, + "loss": 0.5398, + "step": 4666 + }, + { + "epoch": 0.38, + "grad_norm": 0.9962386036036195, + "learning_rate": 1.4255982561892207e-05, + "loss": 0.6245, + "step": 4667 + }, + { + "epoch": 0.38, + "grad_norm": 0.8323716865619504, + "learning_rate": 1.4253600257941208e-05, + "loss": 0.5058, + "step": 4668 + }, + { + "epoch": 0.38, + "grad_norm": 0.8699456667642135, + "learning_rate": 1.4251217659219481e-05, + "loss": 0.6105, + "step": 4669 + }, + { + "epoch": 0.38, + "grad_norm": 0.9889583687179783, + "learning_rate": 1.4248834765892139e-05, + "loss": 0.5819, + "step": 4670 + }, + { + "epoch": 0.38, + "grad_norm": 0.9126491520550216, + "learning_rate": 1.4246451578124321e-05, + "loss": 0.56, + "step": 4671 + }, + { + "epoch": 0.38, + "grad_norm": 0.9134335213125362, + "learning_rate": 1.4244068096081172e-05, + "loss": 0.5301, + "step": 4672 + }, + { + "epoch": 0.38, + "grad_norm": 0.9273990704818516, + "learning_rate": 1.4241684319927869e-05, + "loss": 0.4937, + "step": 4673 + }, + { + "epoch": 0.38, + "grad_norm": 0.9268357832113618, + "learning_rate": 1.4239300249829606e-05, + "loss": 0.5696, + "step": 4674 + }, + { + "epoch": 0.38, + "grad_norm": 0.8527064171200383, + "learning_rate": 1.4236915885951592e-05, + "loss": 0.553, + "step": 4675 + }, + { + "epoch": 0.38, + "grad_norm": 0.9562426729410712, + "learning_rate": 1.4234531228459069e-05, + "loss": 0.5627, + "step": 4676 + }, + { + "epoch": 0.38, + "grad_norm": 0.9183753162569681, + "learning_rate": 1.4232146277517289e-05, + "loss": 0.5514, + "step": 4677 + }, + { + "epoch": 0.38, + "grad_norm": 0.9248669788204029, + "learning_rate": 1.4229761033291523e-05, + "loss": 0.5248, + "step": 4678 + }, + { + "epoch": 0.38, + "grad_norm": 0.9321721977272228, + "learning_rate": 1.422737549594707e-05, + "loss": 0.537, + "step": 4679 + }, + { + "epoch": 0.38, + "grad_norm": 0.8532668316951978, + "learning_rate": 1.4224989665649248e-05, + "loss": 0.5675, + "step": 4680 + }, + { + "epoch": 0.38, + "grad_norm": 0.8601920380266586, + "learning_rate": 1.4222603542563385e-05, + "loss": 0.4955, + "step": 4681 + }, + { + "epoch": 0.38, + "grad_norm": 0.869794347959696, + "learning_rate": 1.4220217126854842e-05, + "loss": 0.5749, + "step": 4682 + }, + { + "epoch": 0.38, + "grad_norm": 0.972823849878353, + "learning_rate": 1.4217830418689e-05, + "loss": 0.558, + "step": 4683 + }, + { + "epoch": 0.38, + "grad_norm": 0.8801348435436541, + "learning_rate": 1.4215443418231248e-05, + "loss": 0.5038, + "step": 4684 + }, + { + "epoch": 0.38, + "grad_norm": 0.8921011812497122, + "learning_rate": 1.4213056125647005e-05, + "loss": 0.5297, + "step": 4685 + }, + { + "epoch": 0.38, + "grad_norm": 0.9004852636949913, + "learning_rate": 1.4210668541101713e-05, + "loss": 0.529, + "step": 4686 + }, + { + "epoch": 0.38, + "grad_norm": 1.1573939656494636, + "learning_rate": 1.4208280664760823e-05, + "loss": 0.5374, + "step": 4687 + }, + { + "epoch": 0.38, + "grad_norm": 0.8283523390582402, + "learning_rate": 1.4205892496789816e-05, + "loss": 0.5117, + "step": 4688 + }, + { + "epoch": 0.38, + "grad_norm": 0.8123582346827237, + "learning_rate": 1.4203504037354192e-05, + "loss": 0.5004, + "step": 4689 + }, + { + "epoch": 0.38, + "grad_norm": 1.0285817266191963, + "learning_rate": 1.4201115286619464e-05, + "loss": 0.6204, + "step": 4690 + }, + { + "epoch": 0.38, + "grad_norm": 0.8752059281230673, + "learning_rate": 1.4198726244751173e-05, + "loss": 0.5893, + "step": 4691 + }, + { + "epoch": 0.38, + "grad_norm": 0.9099369702348551, + "learning_rate": 1.4196336911914878e-05, + "loss": 0.5702, + "step": 4692 + }, + { + "epoch": 0.38, + "grad_norm": 0.8990956246038558, + "learning_rate": 1.419394728827616e-05, + "loss": 0.5603, + "step": 4693 + }, + { + "epoch": 0.38, + "grad_norm": 0.9622419853620431, + "learning_rate": 1.419155737400061e-05, + "loss": 0.5678, + "step": 4694 + }, + { + "epoch": 0.38, + "grad_norm": 0.8860431919599383, + "learning_rate": 1.4189167169253855e-05, + "loss": 0.4666, + "step": 4695 + }, + { + "epoch": 0.38, + "grad_norm": 0.8895897423278988, + "learning_rate": 1.4186776674201533e-05, + "loss": 0.5501, + "step": 4696 + }, + { + "epoch": 0.38, + "grad_norm": 0.920359064516808, + "learning_rate": 1.4184385889009298e-05, + "loss": 0.5521, + "step": 4697 + }, + { + "epoch": 0.38, + "grad_norm": 0.8188258847505276, + "learning_rate": 1.4181994813842831e-05, + "loss": 0.54, + "step": 4698 + }, + { + "epoch": 0.38, + "grad_norm": 0.9810026176463054, + "learning_rate": 1.4179603448867836e-05, + "loss": 0.5882, + "step": 4699 + }, + { + "epoch": 0.38, + "grad_norm": 0.9442240534749564, + "learning_rate": 1.4177211794250027e-05, + "loss": 0.5584, + "step": 4700 + }, + { + "epoch": 0.38, + "grad_norm": 0.9776295396882877, + "learning_rate": 1.4174819850155148e-05, + "loss": 0.5444, + "step": 4701 + }, + { + "epoch": 0.38, + "grad_norm": 0.8815069781560079, + "learning_rate": 1.4172427616748955e-05, + "loss": 0.521, + "step": 4702 + }, + { + "epoch": 0.38, + "grad_norm": 0.9598488602858573, + "learning_rate": 1.417003509419723e-05, + "loss": 0.5725, + "step": 4703 + }, + { + "epoch": 0.38, + "grad_norm": 0.88162059281092, + "learning_rate": 1.416764228266577e-05, + "loss": 0.5713, + "step": 4704 + }, + { + "epoch": 0.38, + "grad_norm": 0.9011421926797095, + "learning_rate": 1.4165249182320401e-05, + "loss": 0.5621, + "step": 4705 + }, + { + "epoch": 0.38, + "grad_norm": 0.8956175379198454, + "learning_rate": 1.4162855793326955e-05, + "loss": 0.5187, + "step": 4706 + }, + { + "epoch": 0.38, + "grad_norm": 0.8036258385169617, + "learning_rate": 1.4160462115851292e-05, + "loss": 0.5419, + "step": 4707 + }, + { + "epoch": 0.38, + "grad_norm": 0.9202754450019062, + "learning_rate": 1.4158068150059302e-05, + "loss": 0.5333, + "step": 4708 + }, + { + "epoch": 0.38, + "grad_norm": 0.896061684455917, + "learning_rate": 1.415567389611687e-05, + "loss": 0.5587, + "step": 4709 + }, + { + "epoch": 0.38, + "grad_norm": 0.9454485617565561, + "learning_rate": 1.4153279354189927e-05, + "loss": 0.5858, + "step": 4710 + }, + { + "epoch": 0.38, + "grad_norm": 0.9237005211467425, + "learning_rate": 1.415088452444441e-05, + "loss": 0.527, + "step": 4711 + }, + { + "epoch": 0.38, + "grad_norm": 0.9193006748691501, + "learning_rate": 1.4148489407046274e-05, + "loss": 0.5338, + "step": 4712 + }, + { + "epoch": 0.38, + "grad_norm": 0.8406689889011487, + "learning_rate": 1.4146094002161501e-05, + "loss": 0.5876, + "step": 4713 + }, + { + "epoch": 0.38, + "grad_norm": 0.8588860468374754, + "learning_rate": 1.4143698309956096e-05, + "loss": 0.5727, + "step": 4714 + }, + { + "epoch": 0.38, + "grad_norm": 0.9456318467829123, + "learning_rate": 1.414130233059607e-05, + "loss": 0.5408, + "step": 4715 + }, + { + "epoch": 0.38, + "grad_norm": 0.8965301895023075, + "learning_rate": 1.4138906064247467e-05, + "loss": 0.5793, + "step": 4716 + }, + { + "epoch": 0.38, + "grad_norm": 0.9239643757250254, + "learning_rate": 1.4136509511076347e-05, + "loss": 0.6473, + "step": 4717 + }, + { + "epoch": 0.38, + "grad_norm": 0.9022228686323259, + "learning_rate": 1.4134112671248783e-05, + "loss": 0.4977, + "step": 4718 + }, + { + "epoch": 0.38, + "grad_norm": 0.7949190841588801, + "learning_rate": 1.4131715544930878e-05, + "loss": 0.5347, + "step": 4719 + }, + { + "epoch": 0.38, + "grad_norm": 0.9740355679714173, + "learning_rate": 1.4129318132288752e-05, + "loss": 0.5923, + "step": 4720 + }, + { + "epoch": 0.38, + "grad_norm": 0.7708498171790932, + "learning_rate": 1.4126920433488542e-05, + "loss": 0.5214, + "step": 4721 + }, + { + "epoch": 0.38, + "grad_norm": 0.7865502977843916, + "learning_rate": 1.4124522448696407e-05, + "loss": 0.5338, + "step": 4722 + }, + { + "epoch": 0.38, + "grad_norm": 0.8597659959517485, + "learning_rate": 1.4122124178078522e-05, + "loss": 0.56, + "step": 4723 + }, + { + "epoch": 0.38, + "grad_norm": 0.8978989392823783, + "learning_rate": 1.4119725621801093e-05, + "loss": 0.5677, + "step": 4724 + }, + { + "epoch": 0.38, + "grad_norm": 0.9157028799721416, + "learning_rate": 1.411732678003033e-05, + "loss": 0.6368, + "step": 4725 + }, + { + "epoch": 0.38, + "grad_norm": 0.9925066776480296, + "learning_rate": 1.411492765293247e-05, + "loss": 0.5487, + "step": 4726 + }, + { + "epoch": 0.38, + "grad_norm": 0.9113747897282498, + "learning_rate": 1.4112528240673779e-05, + "loss": 0.5318, + "step": 4727 + }, + { + "epoch": 0.38, + "grad_norm": 0.8589278502208879, + "learning_rate": 1.4110128543420527e-05, + "loss": 0.555, + "step": 4728 + }, + { + "epoch": 0.38, + "grad_norm": 0.950191960101158, + "learning_rate": 1.410772856133901e-05, + "loss": 0.5285, + "step": 4729 + }, + { + "epoch": 0.38, + "grad_norm": 0.9338525505422465, + "learning_rate": 1.4105328294595549e-05, + "loss": 0.4941, + "step": 4730 + }, + { + "epoch": 0.38, + "grad_norm": 0.9615369635942083, + "learning_rate": 1.4102927743356481e-05, + "loss": 0.6274, + "step": 4731 + }, + { + "epoch": 0.38, + "grad_norm": 0.953732288802404, + "learning_rate": 1.4100526907788157e-05, + "loss": 0.5737, + "step": 4732 + }, + { + "epoch": 0.38, + "grad_norm": 1.0032491436975954, + "learning_rate": 1.4098125788056955e-05, + "loss": 0.5614, + "step": 4733 + }, + { + "epoch": 0.38, + "grad_norm": 0.889558640015762, + "learning_rate": 1.4095724384329272e-05, + "loss": 0.5449, + "step": 4734 + }, + { + "epoch": 0.38, + "grad_norm": 0.8956467014404496, + "learning_rate": 1.4093322696771523e-05, + "loss": 0.561, + "step": 4735 + }, + { + "epoch": 0.38, + "grad_norm": 0.8551706931065691, + "learning_rate": 1.409092072555014e-05, + "loss": 0.598, + "step": 4736 + }, + { + "epoch": 0.38, + "grad_norm": 0.8548839800087088, + "learning_rate": 1.408851847083158e-05, + "loss": 0.5593, + "step": 4737 + }, + { + "epoch": 0.39, + "grad_norm": 0.8901266047668741, + "learning_rate": 1.4086115932782316e-05, + "loss": 0.5516, + "step": 4738 + }, + { + "epoch": 0.39, + "grad_norm": 0.8561960107371271, + "learning_rate": 1.4083713111568841e-05, + "loss": 0.5536, + "step": 4739 + }, + { + "epoch": 0.39, + "grad_norm": 0.9031494970498228, + "learning_rate": 1.4081310007357673e-05, + "loss": 0.5372, + "step": 4740 + }, + { + "epoch": 0.39, + "grad_norm": 0.9312334703767707, + "learning_rate": 1.4078906620315343e-05, + "loss": 0.5603, + "step": 4741 + }, + { + "epoch": 0.39, + "grad_norm": 0.8599402331636063, + "learning_rate": 1.4076502950608397e-05, + "loss": 0.544, + "step": 4742 + }, + { + "epoch": 0.39, + "grad_norm": 0.8292929383194493, + "learning_rate": 1.4074098998403414e-05, + "loss": 0.4634, + "step": 4743 + }, + { + "epoch": 0.39, + "grad_norm": 0.815490200853265, + "learning_rate": 1.4071694763866988e-05, + "loss": 0.538, + "step": 4744 + }, + { + "epoch": 0.39, + "grad_norm": 0.894925411565493, + "learning_rate": 1.406929024716572e-05, + "loss": 0.6092, + "step": 4745 + }, + { + "epoch": 0.39, + "grad_norm": 0.963522106511423, + "learning_rate": 1.4066885448466252e-05, + "loss": 0.5206, + "step": 4746 + }, + { + "epoch": 0.39, + "grad_norm": 0.8876060513326824, + "learning_rate": 1.406448036793523e-05, + "loss": 0.5827, + "step": 4747 + }, + { + "epoch": 0.39, + "grad_norm": 0.9248326157942703, + "learning_rate": 1.406207500573932e-05, + "loss": 0.5604, + "step": 4748 + }, + { + "epoch": 0.39, + "grad_norm": 0.8549469819980957, + "learning_rate": 1.4059669362045216e-05, + "loss": 0.5983, + "step": 4749 + }, + { + "epoch": 0.39, + "grad_norm": 0.9122826272105434, + "learning_rate": 1.4057263437019631e-05, + "loss": 0.5094, + "step": 4750 + }, + { + "epoch": 0.39, + "grad_norm": 0.841912287988438, + "learning_rate": 1.4054857230829284e-05, + "loss": 0.526, + "step": 4751 + }, + { + "epoch": 0.39, + "grad_norm": 0.8528346965856147, + "learning_rate": 1.4052450743640926e-05, + "loss": 0.5588, + "step": 4752 + }, + { + "epoch": 0.39, + "grad_norm": 0.9629513938454854, + "learning_rate": 1.4050043975621328e-05, + "loss": 0.5635, + "step": 4753 + }, + { + "epoch": 0.39, + "grad_norm": 0.9684907374115393, + "learning_rate": 1.4047636926937278e-05, + "loss": 0.5735, + "step": 4754 + }, + { + "epoch": 0.39, + "grad_norm": 0.9225498384798198, + "learning_rate": 1.4045229597755574e-05, + "loss": 0.5369, + "step": 4755 + }, + { + "epoch": 0.39, + "grad_norm": 0.8387551385225303, + "learning_rate": 1.404282198824305e-05, + "loss": 0.5656, + "step": 4756 + }, + { + "epoch": 0.39, + "grad_norm": 0.9722381067509114, + "learning_rate": 1.4040414098566548e-05, + "loss": 0.6086, + "step": 4757 + }, + { + "epoch": 0.39, + "grad_norm": 0.8585789480518193, + "learning_rate": 1.4038005928892932e-05, + "loss": 0.5008, + "step": 4758 + }, + { + "epoch": 0.39, + "grad_norm": 0.9809877521334563, + "learning_rate": 1.4035597479389088e-05, + "loss": 0.5988, + "step": 4759 + }, + { + "epoch": 0.39, + "grad_norm": 0.9476898779708328, + "learning_rate": 1.4033188750221918e-05, + "loss": 0.595, + "step": 4760 + }, + { + "epoch": 0.39, + "grad_norm": 0.9190154556053546, + "learning_rate": 1.4030779741558345e-05, + "loss": 0.5272, + "step": 4761 + }, + { + "epoch": 0.39, + "grad_norm": 0.8986826250041086, + "learning_rate": 1.402837045356531e-05, + "loss": 0.6074, + "step": 4762 + }, + { + "epoch": 0.39, + "grad_norm": 0.9669448931268747, + "learning_rate": 1.4025960886409777e-05, + "loss": 0.5329, + "step": 4763 + }, + { + "epoch": 0.39, + "grad_norm": 0.8597820710103147, + "learning_rate": 1.4023551040258726e-05, + "loss": 0.5572, + "step": 4764 + }, + { + "epoch": 0.39, + "grad_norm": 0.9632057604242157, + "learning_rate": 1.4021140915279157e-05, + "loss": 0.5526, + "step": 4765 + }, + { + "epoch": 0.39, + "grad_norm": 0.8111326793433378, + "learning_rate": 1.4018730511638087e-05, + "loss": 0.4572, + "step": 4766 + }, + { + "epoch": 0.39, + "grad_norm": 0.8599006225258523, + "learning_rate": 1.4016319829502559e-05, + "loss": 0.5486, + "step": 4767 + }, + { + "epoch": 0.39, + "grad_norm": 0.9054438326651731, + "learning_rate": 1.4013908869039627e-05, + "loss": 0.585, + "step": 4768 + }, + { + "epoch": 0.39, + "grad_norm": 0.815284889579051, + "learning_rate": 1.4011497630416375e-05, + "loss": 0.5256, + "step": 4769 + }, + { + "epoch": 0.39, + "grad_norm": 1.0681117562803168, + "learning_rate": 1.4009086113799892e-05, + "loss": 0.5563, + "step": 4770 + }, + { + "epoch": 0.39, + "grad_norm": 0.9004269746499685, + "learning_rate": 1.4006674319357298e-05, + "loss": 0.5582, + "step": 4771 + }, + { + "epoch": 0.39, + "grad_norm": 0.9434517262368302, + "learning_rate": 1.400426224725573e-05, + "loss": 0.5648, + "step": 4772 + }, + { + "epoch": 0.39, + "grad_norm": 0.8505463208839589, + "learning_rate": 1.4001849897662337e-05, + "loss": 0.5259, + "step": 4773 + }, + { + "epoch": 0.39, + "grad_norm": 0.8630515762899552, + "learning_rate": 1.3999437270744296e-05, + "loss": 0.5718, + "step": 4774 + }, + { + "epoch": 0.39, + "grad_norm": 0.9791660948704696, + "learning_rate": 1.3997024366668802e-05, + "loss": 0.6139, + "step": 4775 + }, + { + "epoch": 0.39, + "grad_norm": 0.9069769270170313, + "learning_rate": 1.3994611185603062e-05, + "loss": 0.5569, + "step": 4776 + }, + { + "epoch": 0.39, + "grad_norm": 0.9980723952054046, + "learning_rate": 1.399219772771431e-05, + "loss": 0.5669, + "step": 4777 + }, + { + "epoch": 0.39, + "grad_norm": 0.8644689973208419, + "learning_rate": 1.3989783993169798e-05, + "loss": 0.536, + "step": 4778 + }, + { + "epoch": 0.39, + "grad_norm": 0.879292682310879, + "learning_rate": 1.3987369982136794e-05, + "loss": 0.5866, + "step": 4779 + }, + { + "epoch": 0.39, + "grad_norm": 0.9060036514914707, + "learning_rate": 1.3984955694782584e-05, + "loss": 0.5524, + "step": 4780 + }, + { + "epoch": 0.39, + "grad_norm": 0.8978917858742707, + "learning_rate": 1.3982541131274485e-05, + "loss": 0.5106, + "step": 4781 + }, + { + "epoch": 0.39, + "grad_norm": 0.8626340658742684, + "learning_rate": 1.3980126291779814e-05, + "loss": 0.5919, + "step": 4782 + }, + { + "epoch": 0.39, + "grad_norm": 0.8853907390891486, + "learning_rate": 1.3977711176465923e-05, + "loss": 0.5066, + "step": 4783 + }, + { + "epoch": 0.39, + "grad_norm": 0.9719894539559228, + "learning_rate": 1.3975295785500176e-05, + "loss": 0.6066, + "step": 4784 + }, + { + "epoch": 0.39, + "grad_norm": 0.8420668650366185, + "learning_rate": 1.3972880119049954e-05, + "loss": 0.5569, + "step": 4785 + }, + { + "epoch": 0.39, + "grad_norm": 1.0819288026025051, + "learning_rate": 1.3970464177282665e-05, + "loss": 0.6061, + "step": 4786 + }, + { + "epoch": 0.39, + "grad_norm": 0.8661264684608802, + "learning_rate": 1.3968047960365733e-05, + "loss": 0.529, + "step": 4787 + }, + { + "epoch": 0.39, + "grad_norm": 0.9256001465669449, + "learning_rate": 1.3965631468466593e-05, + "loss": 0.5234, + "step": 4788 + }, + { + "epoch": 0.39, + "grad_norm": 0.9031084374098787, + "learning_rate": 1.3963214701752714e-05, + "loss": 0.5153, + "step": 4789 + }, + { + "epoch": 0.39, + "grad_norm": 0.9398534523407612, + "learning_rate": 1.396079766039157e-05, + "loss": 0.5374, + "step": 4790 + }, + { + "epoch": 0.39, + "grad_norm": 0.9908903392902311, + "learning_rate": 1.3958380344550659e-05, + "loss": 0.5367, + "step": 4791 + }, + { + "epoch": 0.39, + "grad_norm": 0.953551175148032, + "learning_rate": 1.3955962754397505e-05, + "loss": 0.4969, + "step": 4792 + }, + { + "epoch": 0.39, + "grad_norm": 0.908420529771096, + "learning_rate": 1.395354489009964e-05, + "loss": 0.5562, + "step": 4793 + }, + { + "epoch": 0.39, + "grad_norm": 0.8407068398016209, + "learning_rate": 1.3951126751824618e-05, + "loss": 0.4925, + "step": 4794 + }, + { + "epoch": 0.39, + "grad_norm": 1.0293422320380614, + "learning_rate": 1.3948708339740019e-05, + "loss": 0.5454, + "step": 4795 + }, + { + "epoch": 0.39, + "grad_norm": 0.9554664617714606, + "learning_rate": 1.3946289654013435e-05, + "loss": 0.6564, + "step": 4796 + }, + { + "epoch": 0.39, + "grad_norm": 1.0877753621417259, + "learning_rate": 1.3943870694812475e-05, + "loss": 0.5433, + "step": 4797 + }, + { + "epoch": 0.39, + "grad_norm": 0.8856524373788587, + "learning_rate": 1.3941451462304778e-05, + "loss": 0.553, + "step": 4798 + }, + { + "epoch": 0.39, + "grad_norm": 0.9286230008806331, + "learning_rate": 1.393903195665799e-05, + "loss": 0.5859, + "step": 4799 + }, + { + "epoch": 0.39, + "grad_norm": 0.9510367799250085, + "learning_rate": 1.393661217803978e-05, + "loss": 0.5666, + "step": 4800 + }, + { + "epoch": 0.39, + "grad_norm": 0.9448143716639735, + "learning_rate": 1.3934192126617838e-05, + "loss": 0.5982, + "step": 4801 + }, + { + "epoch": 0.39, + "grad_norm": 0.9379935538051243, + "learning_rate": 1.3931771802559875e-05, + "loss": 0.5737, + "step": 4802 + }, + { + "epoch": 0.39, + "grad_norm": 0.7776618529565054, + "learning_rate": 1.3929351206033607e-05, + "loss": 0.4976, + "step": 4803 + }, + { + "epoch": 0.39, + "grad_norm": 0.8568590847033114, + "learning_rate": 1.392693033720679e-05, + "loss": 0.4904, + "step": 4804 + }, + { + "epoch": 0.39, + "grad_norm": 0.8834314638024736, + "learning_rate": 1.3924509196247185e-05, + "loss": 0.579, + "step": 4805 + }, + { + "epoch": 0.39, + "grad_norm": 0.8940184195213572, + "learning_rate": 1.392208778332257e-05, + "loss": 0.6216, + "step": 4806 + }, + { + "epoch": 0.39, + "grad_norm": 0.9112868877658168, + "learning_rate": 1.3919666098600753e-05, + "loss": 0.5378, + "step": 4807 + }, + { + "epoch": 0.39, + "grad_norm": 0.9216500150943909, + "learning_rate": 1.3917244142249551e-05, + "loss": 0.5763, + "step": 4808 + }, + { + "epoch": 0.39, + "grad_norm": 0.893654843861322, + "learning_rate": 1.3914821914436805e-05, + "loss": 0.5533, + "step": 4809 + }, + { + "epoch": 0.39, + "grad_norm": 0.8614911309355482, + "learning_rate": 1.3912399415330371e-05, + "loss": 0.5161, + "step": 4810 + }, + { + "epoch": 0.39, + "grad_norm": 0.9230499012788027, + "learning_rate": 1.3909976645098131e-05, + "loss": 0.5516, + "step": 4811 + }, + { + "epoch": 0.39, + "grad_norm": 0.9496706748160366, + "learning_rate": 1.3907553603907974e-05, + "loss": 0.5344, + "step": 4812 + }, + { + "epoch": 0.39, + "grad_norm": 0.8650379902912981, + "learning_rate": 1.3905130291927822e-05, + "loss": 0.5882, + "step": 4813 + }, + { + "epoch": 0.39, + "grad_norm": 0.998972087470307, + "learning_rate": 1.3902706709325603e-05, + "loss": 0.5186, + "step": 4814 + }, + { + "epoch": 0.39, + "grad_norm": 0.8508144263902477, + "learning_rate": 1.3900282856269271e-05, + "loss": 0.5418, + "step": 4815 + }, + { + "epoch": 0.39, + "grad_norm": 1.0025157638908624, + "learning_rate": 1.3897858732926794e-05, + "loss": 0.5818, + "step": 4816 + }, + { + "epoch": 0.39, + "grad_norm": 1.0516850697205595, + "learning_rate": 1.3895434339466167e-05, + "loss": 0.6107, + "step": 4817 + }, + { + "epoch": 0.39, + "grad_norm": 0.9658348227769601, + "learning_rate": 1.3893009676055395e-05, + "loss": 0.6161, + "step": 4818 + }, + { + "epoch": 0.39, + "grad_norm": 1.0289714416989588, + "learning_rate": 1.3890584742862508e-05, + "loss": 0.5631, + "step": 4819 + }, + { + "epoch": 0.39, + "grad_norm": 0.9460224670710317, + "learning_rate": 1.3888159540055544e-05, + "loss": 0.5635, + "step": 4820 + }, + { + "epoch": 0.39, + "grad_norm": 0.9800985935542305, + "learning_rate": 1.3885734067802576e-05, + "loss": 0.5972, + "step": 4821 + }, + { + "epoch": 0.39, + "grad_norm": 0.9354319412514424, + "learning_rate": 1.3883308326271682e-05, + "loss": 0.546, + "step": 4822 + }, + { + "epoch": 0.39, + "grad_norm": 0.9247901332190117, + "learning_rate": 1.3880882315630968e-05, + "loss": 0.6057, + "step": 4823 + }, + { + "epoch": 0.39, + "grad_norm": 0.8687178724760034, + "learning_rate": 1.387845603604855e-05, + "loss": 0.5477, + "step": 4824 + }, + { + "epoch": 0.39, + "grad_norm": 0.9721356384986916, + "learning_rate": 1.387602948769257e-05, + "loss": 0.5753, + "step": 4825 + }, + { + "epoch": 0.39, + "grad_norm": 0.9264169462238878, + "learning_rate": 1.3873602670731184e-05, + "loss": 0.5709, + "step": 4826 + }, + { + "epoch": 0.39, + "grad_norm": 0.8798247430588303, + "learning_rate": 1.387117558533257e-05, + "loss": 0.5449, + "step": 4827 + }, + { + "epoch": 0.39, + "grad_norm": 1.027001023657605, + "learning_rate": 1.3868748231664918e-05, + "loss": 0.5988, + "step": 4828 + }, + { + "epoch": 0.39, + "grad_norm": 0.8553348754174525, + "learning_rate": 1.3866320609896449e-05, + "loss": 0.5224, + "step": 4829 + }, + { + "epoch": 0.39, + "grad_norm": 0.9102274530812277, + "learning_rate": 1.3863892720195389e-05, + "loss": 0.6143, + "step": 4830 + }, + { + "epoch": 0.39, + "grad_norm": 0.8638981385026545, + "learning_rate": 1.3861464562729992e-05, + "loss": 0.4728, + "step": 4831 + }, + { + "epoch": 0.39, + "grad_norm": 0.8269196063569016, + "learning_rate": 1.3859036137668525e-05, + "loss": 0.5418, + "step": 4832 + }, + { + "epoch": 0.39, + "grad_norm": 1.0036783596030001, + "learning_rate": 1.3856607445179278e-05, + "loss": 0.6036, + "step": 4833 + }, + { + "epoch": 0.39, + "grad_norm": 0.9065362519579458, + "learning_rate": 1.3854178485430554e-05, + "loss": 0.5078, + "step": 4834 + }, + { + "epoch": 0.39, + "grad_norm": 1.048824179044158, + "learning_rate": 1.3851749258590679e-05, + "loss": 0.6005, + "step": 4835 + }, + { + "epoch": 0.39, + "grad_norm": 0.8302691187789433, + "learning_rate": 1.3849319764828e-05, + "loss": 0.4975, + "step": 4836 + }, + { + "epoch": 0.39, + "grad_norm": 0.9031582569220298, + "learning_rate": 1.3846890004310873e-05, + "loss": 0.5631, + "step": 4837 + }, + { + "epoch": 0.39, + "grad_norm": 0.9177195645049511, + "learning_rate": 1.3844459977207683e-05, + "loss": 0.5383, + "step": 4838 + }, + { + "epoch": 0.39, + "grad_norm": 0.949269225994463, + "learning_rate": 1.3842029683686826e-05, + "loss": 0.5748, + "step": 4839 + }, + { + "epoch": 0.39, + "grad_norm": 0.937659934137604, + "learning_rate": 1.3839599123916718e-05, + "loss": 0.5762, + "step": 4840 + }, + { + "epoch": 0.39, + "grad_norm": 0.8996926326469921, + "learning_rate": 1.3837168298065798e-05, + "loss": 0.61, + "step": 4841 + }, + { + "epoch": 0.39, + "grad_norm": 0.94884621874138, + "learning_rate": 1.3834737206302519e-05, + "loss": 0.6057, + "step": 4842 + }, + { + "epoch": 0.39, + "grad_norm": 0.923036476443756, + "learning_rate": 1.3832305848795352e-05, + "loss": 0.5301, + "step": 4843 + }, + { + "epoch": 0.39, + "grad_norm": 0.8541275293194943, + "learning_rate": 1.382987422571279e-05, + "loss": 0.5602, + "step": 4844 + }, + { + "epoch": 0.39, + "grad_norm": 0.9501069778376648, + "learning_rate": 1.382744233722334e-05, + "loss": 0.5255, + "step": 4845 + }, + { + "epoch": 0.39, + "grad_norm": 0.9630480388424695, + "learning_rate": 1.382501018349553e-05, + "loss": 0.5897, + "step": 4846 + }, + { + "epoch": 0.39, + "grad_norm": 0.8762902360405572, + "learning_rate": 1.3822577764697908e-05, + "loss": 0.5017, + "step": 4847 + }, + { + "epoch": 0.39, + "grad_norm": 1.043217051840068, + "learning_rate": 1.3820145080999038e-05, + "loss": 0.575, + "step": 4848 + }, + { + "epoch": 0.39, + "grad_norm": 0.9439300127928049, + "learning_rate": 1.3817712132567503e-05, + "loss": 0.5904, + "step": 4849 + }, + { + "epoch": 0.39, + "grad_norm": 0.8881934762338176, + "learning_rate": 1.3815278919571901e-05, + "loss": 0.5236, + "step": 4850 + }, + { + "epoch": 0.39, + "grad_norm": 0.9482363818913019, + "learning_rate": 1.3812845442180857e-05, + "loss": 0.6188, + "step": 4851 + }, + { + "epoch": 0.39, + "grad_norm": 1.0714832359599773, + "learning_rate": 1.3810411700563005e-05, + "loss": 0.5725, + "step": 4852 + }, + { + "epoch": 0.39, + "grad_norm": 0.9972613620841716, + "learning_rate": 1.3807977694887003e-05, + "loss": 0.6182, + "step": 4853 + }, + { + "epoch": 0.39, + "grad_norm": 0.9624229426930314, + "learning_rate": 1.3805543425321524e-05, + "loss": 0.5063, + "step": 4854 + }, + { + "epoch": 0.39, + "grad_norm": 0.8981685605321771, + "learning_rate": 1.3803108892035259e-05, + "loss": 0.522, + "step": 4855 + }, + { + "epoch": 0.39, + "grad_norm": 0.8918230664294677, + "learning_rate": 1.3800674095196922e-05, + "loss": 0.5192, + "step": 4856 + }, + { + "epoch": 0.39, + "grad_norm": 1.021530752522361, + "learning_rate": 1.3798239034975243e-05, + "loss": 0.5427, + "step": 4857 + }, + { + "epoch": 0.39, + "grad_norm": 0.8679200195770674, + "learning_rate": 1.3795803711538966e-05, + "loss": 0.5067, + "step": 4858 + }, + { + "epoch": 0.39, + "grad_norm": 1.0076351564977803, + "learning_rate": 1.3793368125056859e-05, + "loss": 0.6147, + "step": 4859 + }, + { + "epoch": 0.39, + "grad_norm": 0.9477492129643318, + "learning_rate": 1.3790932275697708e-05, + "loss": 0.5777, + "step": 4860 + }, + { + "epoch": 0.4, + "grad_norm": 0.9912531451536267, + "learning_rate": 1.378849616363031e-05, + "loss": 0.5575, + "step": 4861 + }, + { + "epoch": 0.4, + "grad_norm": 1.008150805432256, + "learning_rate": 1.3786059789023487e-05, + "loss": 0.5991, + "step": 4862 + }, + { + "epoch": 0.4, + "grad_norm": 0.967448144120769, + "learning_rate": 1.3783623152046084e-05, + "loss": 0.5237, + "step": 4863 + }, + { + "epoch": 0.4, + "grad_norm": 1.0518336967776385, + "learning_rate": 1.3781186252866948e-05, + "loss": 0.5971, + "step": 4864 + }, + { + "epoch": 0.4, + "grad_norm": 0.9082645655876715, + "learning_rate": 1.377874909165496e-05, + "loss": 0.5647, + "step": 4865 + }, + { + "epoch": 0.4, + "grad_norm": 0.9465929099003216, + "learning_rate": 1.3776311668579012e-05, + "loss": 0.5546, + "step": 4866 + }, + { + "epoch": 0.4, + "grad_norm": 0.8694075318857712, + "learning_rate": 1.3773873983808014e-05, + "loss": 0.5255, + "step": 4867 + }, + { + "epoch": 0.4, + "grad_norm": 0.9269426482677092, + "learning_rate": 1.3771436037510897e-05, + "loss": 0.5775, + "step": 4868 + }, + { + "epoch": 0.4, + "grad_norm": 0.9833980689058247, + "learning_rate": 1.3768997829856608e-05, + "loss": 0.5893, + "step": 4869 + }, + { + "epoch": 0.4, + "grad_norm": 0.9336236085410238, + "learning_rate": 1.3766559361014113e-05, + "loss": 0.5903, + "step": 4870 + }, + { + "epoch": 0.4, + "grad_norm": 0.8591684244273873, + "learning_rate": 1.3764120631152395e-05, + "loss": 0.5636, + "step": 4871 + }, + { + "epoch": 0.4, + "grad_norm": 0.896987510351119, + "learning_rate": 1.3761681640440455e-05, + "loss": 0.5898, + "step": 4872 + }, + { + "epoch": 0.4, + "grad_norm": 0.9260583363210217, + "learning_rate": 1.3759242389047315e-05, + "loss": 0.5162, + "step": 4873 + }, + { + "epoch": 0.4, + "grad_norm": 0.9281921574947163, + "learning_rate": 1.375680287714201e-05, + "loss": 0.6638, + "step": 4874 + }, + { + "epoch": 0.4, + "grad_norm": 0.9548686164779019, + "learning_rate": 1.37543631048936e-05, + "loss": 0.5717, + "step": 4875 + }, + { + "epoch": 0.4, + "grad_norm": 0.939319463753251, + "learning_rate": 1.3751923072471159e-05, + "loss": 0.6109, + "step": 4876 + }, + { + "epoch": 0.4, + "grad_norm": 0.9370066407879878, + "learning_rate": 1.3749482780043773e-05, + "loss": 0.4715, + "step": 4877 + }, + { + "epoch": 0.4, + "grad_norm": 0.9853199068327506, + "learning_rate": 1.3747042227780557e-05, + "loss": 0.5326, + "step": 4878 + }, + { + "epoch": 0.4, + "grad_norm": 0.9239784176479834, + "learning_rate": 1.3744601415850637e-05, + "loss": 0.5714, + "step": 4879 + }, + { + "epoch": 0.4, + "grad_norm": 0.910935042749798, + "learning_rate": 1.3742160344423164e-05, + "loss": 0.5666, + "step": 4880 + }, + { + "epoch": 0.4, + "grad_norm": 0.9811225149068671, + "learning_rate": 1.3739719013667297e-05, + "loss": 0.6076, + "step": 4881 + }, + { + "epoch": 0.4, + "grad_norm": 0.9149100233787881, + "learning_rate": 1.3737277423752218e-05, + "loss": 0.5563, + "step": 4882 + }, + { + "epoch": 0.4, + "grad_norm": 0.9053166838456309, + "learning_rate": 1.373483557484713e-05, + "loss": 0.4929, + "step": 4883 + }, + { + "epoch": 0.4, + "grad_norm": 0.7872734502680776, + "learning_rate": 1.3732393467121247e-05, + "loss": 0.4891, + "step": 4884 + }, + { + "epoch": 0.4, + "grad_norm": 0.8742286431117344, + "learning_rate": 1.372995110074381e-05, + "loss": 0.5329, + "step": 4885 + }, + { + "epoch": 0.4, + "grad_norm": 0.8672043564902082, + "learning_rate": 1.3727508475884071e-05, + "loss": 0.5895, + "step": 4886 + }, + { + "epoch": 0.4, + "grad_norm": 0.8414643393353751, + "learning_rate": 1.3725065592711299e-05, + "loss": 0.4682, + "step": 4887 + }, + { + "epoch": 0.4, + "grad_norm": 0.8875840210510944, + "learning_rate": 1.3722622451394784e-05, + "loss": 0.6033, + "step": 4888 + }, + { + "epoch": 0.4, + "grad_norm": 0.8779128007665025, + "learning_rate": 1.3720179052103836e-05, + "loss": 0.5105, + "step": 4889 + }, + { + "epoch": 0.4, + "grad_norm": 0.9046433815894597, + "learning_rate": 1.3717735395007786e-05, + "loss": 0.4746, + "step": 4890 + }, + { + "epoch": 0.4, + "grad_norm": 0.9050851117821455, + "learning_rate": 1.3715291480275963e-05, + "loss": 0.5518, + "step": 4891 + }, + { + "epoch": 0.4, + "grad_norm": 0.9681695345357252, + "learning_rate": 1.3712847308077737e-05, + "loss": 0.5641, + "step": 4892 + }, + { + "epoch": 0.4, + "grad_norm": 0.9480998717859649, + "learning_rate": 1.3710402878582487e-05, + "loss": 0.5638, + "step": 4893 + }, + { + "epoch": 0.4, + "grad_norm": 1.010665403165591, + "learning_rate": 1.3707958191959609e-05, + "loss": 0.5386, + "step": 4894 + }, + { + "epoch": 0.4, + "grad_norm": 0.9721875042908399, + "learning_rate": 1.3705513248378517e-05, + "loss": 0.605, + "step": 4895 + }, + { + "epoch": 0.4, + "grad_norm": 0.8790064569896765, + "learning_rate": 1.3703068048008645e-05, + "loss": 0.4673, + "step": 4896 + }, + { + "epoch": 0.4, + "grad_norm": 0.8314633413004586, + "learning_rate": 1.3700622591019439e-05, + "loss": 0.5181, + "step": 4897 + }, + { + "epoch": 0.4, + "grad_norm": 0.8243806242658588, + "learning_rate": 1.3698176877580372e-05, + "loss": 0.5087, + "step": 4898 + }, + { + "epoch": 0.4, + "grad_norm": 0.889784596263977, + "learning_rate": 1.3695730907860925e-05, + "loss": 0.5393, + "step": 4899 + }, + { + "epoch": 0.4, + "grad_norm": 0.9543436360355445, + "learning_rate": 1.3693284682030608e-05, + "loss": 0.5735, + "step": 4900 + }, + { + "epoch": 0.4, + "grad_norm": 0.8393976604033532, + "learning_rate": 1.3690838200258936e-05, + "loss": 0.5944, + "step": 4901 + }, + { + "epoch": 0.4, + "grad_norm": 1.0040665844972214, + "learning_rate": 1.368839146271545e-05, + "loss": 0.6325, + "step": 4902 + }, + { + "epoch": 0.4, + "grad_norm": 0.8661069998555928, + "learning_rate": 1.368594446956971e-05, + "loss": 0.549, + "step": 4903 + }, + { + "epoch": 0.4, + "grad_norm": 0.980762907582011, + "learning_rate": 1.3683497220991286e-05, + "loss": 0.5668, + "step": 4904 + }, + { + "epoch": 0.4, + "grad_norm": 0.8303767586905748, + "learning_rate": 1.3681049717149773e-05, + "loss": 0.5261, + "step": 4905 + }, + { + "epoch": 0.4, + "grad_norm": 1.0368179040107803, + "learning_rate": 1.3678601958214779e-05, + "loss": 0.527, + "step": 4906 + }, + { + "epoch": 0.4, + "grad_norm": 0.9249813464997794, + "learning_rate": 1.367615394435593e-05, + "loss": 0.6134, + "step": 4907 + }, + { + "epoch": 0.4, + "grad_norm": 0.8358321020913861, + "learning_rate": 1.3673705675742875e-05, + "loss": 0.5125, + "step": 4908 + }, + { + "epoch": 0.4, + "grad_norm": 1.0496005259301522, + "learning_rate": 1.3671257152545277e-05, + "loss": 0.6041, + "step": 4909 + }, + { + "epoch": 0.4, + "grad_norm": 0.946126940263957, + "learning_rate": 1.3668808374932812e-05, + "loss": 0.572, + "step": 4910 + }, + { + "epoch": 0.4, + "grad_norm": 0.848440255461486, + "learning_rate": 1.3666359343075182e-05, + "loss": 0.5319, + "step": 4911 + }, + { + "epoch": 0.4, + "grad_norm": 0.8872846916857009, + "learning_rate": 1.3663910057142102e-05, + "loss": 0.5519, + "step": 4912 + }, + { + "epoch": 0.4, + "grad_norm": 1.0799104995159567, + "learning_rate": 1.3661460517303304e-05, + "loss": 0.6399, + "step": 4913 + }, + { + "epoch": 0.4, + "grad_norm": 0.8940584447880352, + "learning_rate": 1.3659010723728542e-05, + "loss": 0.5479, + "step": 4914 + }, + { + "epoch": 0.4, + "grad_norm": 0.9056599511334902, + "learning_rate": 1.3656560676587583e-05, + "loss": 0.623, + "step": 4915 + }, + { + "epoch": 0.4, + "grad_norm": 0.8597632452723915, + "learning_rate": 1.3654110376050209e-05, + "loss": 0.5268, + "step": 4916 + }, + { + "epoch": 0.4, + "grad_norm": 0.9576599634253601, + "learning_rate": 1.3651659822286227e-05, + "loss": 0.5411, + "step": 4917 + }, + { + "epoch": 0.4, + "grad_norm": 0.9237904664858961, + "learning_rate": 1.364920901546546e-05, + "loss": 0.5251, + "step": 4918 + }, + { + "epoch": 0.4, + "grad_norm": 0.8414244460916807, + "learning_rate": 1.3646757955757746e-05, + "loss": 0.533, + "step": 4919 + }, + { + "epoch": 0.4, + "grad_norm": 0.9113629315058073, + "learning_rate": 1.3644306643332939e-05, + "loss": 0.5311, + "step": 4920 + }, + { + "epoch": 0.4, + "grad_norm": 0.903990767127771, + "learning_rate": 1.3641855078360914e-05, + "loss": 0.5323, + "step": 4921 + }, + { + "epoch": 0.4, + "grad_norm": 0.9785840692406184, + "learning_rate": 1.3639403261011563e-05, + "loss": 0.5502, + "step": 4922 + }, + { + "epoch": 0.4, + "grad_norm": 0.9387542745252239, + "learning_rate": 1.3636951191454792e-05, + "loss": 0.5561, + "step": 4923 + }, + { + "epoch": 0.4, + "grad_norm": 0.899595616734336, + "learning_rate": 1.3634498869860533e-05, + "loss": 0.4931, + "step": 4924 + }, + { + "epoch": 0.4, + "grad_norm": 0.8787075359867942, + "learning_rate": 1.3632046296398724e-05, + "loss": 0.5138, + "step": 4925 + }, + { + "epoch": 0.4, + "grad_norm": 0.8436660881527684, + "learning_rate": 1.3629593471239328e-05, + "loss": 0.5069, + "step": 4926 + }, + { + "epoch": 0.4, + "grad_norm": 0.9229983052966406, + "learning_rate": 1.3627140394552326e-05, + "loss": 0.5417, + "step": 4927 + }, + { + "epoch": 0.4, + "grad_norm": 0.9687480936383686, + "learning_rate": 1.3624687066507709e-05, + "loss": 0.6037, + "step": 4928 + }, + { + "epoch": 0.4, + "grad_norm": 0.862557622655597, + "learning_rate": 1.3622233487275493e-05, + "loss": 0.5621, + "step": 4929 + }, + { + "epoch": 0.4, + "grad_norm": 0.8746701383679512, + "learning_rate": 1.3619779657025714e-05, + "loss": 0.574, + "step": 4930 + }, + { + "epoch": 0.4, + "grad_norm": 0.9638263263332979, + "learning_rate": 1.3617325575928414e-05, + "loss": 0.5793, + "step": 4931 + }, + { + "epoch": 0.4, + "grad_norm": 0.8753934118629156, + "learning_rate": 1.3614871244153655e-05, + "loss": 0.506, + "step": 4932 + }, + { + "epoch": 0.4, + "grad_norm": 0.9135589434822865, + "learning_rate": 1.3612416661871532e-05, + "loss": 0.5127, + "step": 4933 + }, + { + "epoch": 0.4, + "grad_norm": 0.9604594234952577, + "learning_rate": 1.3609961829252133e-05, + "loss": 0.5363, + "step": 4934 + }, + { + "epoch": 0.4, + "grad_norm": 1.028473474567092, + "learning_rate": 1.3607506746465584e-05, + "loss": 0.5779, + "step": 4935 + }, + { + "epoch": 0.4, + "grad_norm": 0.902538042974246, + "learning_rate": 1.360505141368202e-05, + "loss": 0.5267, + "step": 4936 + }, + { + "epoch": 0.4, + "grad_norm": 0.8992938909760883, + "learning_rate": 1.3602595831071586e-05, + "loss": 0.5842, + "step": 4937 + }, + { + "epoch": 0.4, + "grad_norm": 0.8780589847799097, + "learning_rate": 1.3600139998804459e-05, + "loss": 0.5513, + "step": 4938 + }, + { + "epoch": 0.4, + "grad_norm": 0.9142510765861755, + "learning_rate": 1.359768391705082e-05, + "loss": 0.5406, + "step": 4939 + }, + { + "epoch": 0.4, + "grad_norm": 0.9567959566085759, + "learning_rate": 1.3595227585980881e-05, + "loss": 0.5539, + "step": 4940 + }, + { + "epoch": 0.4, + "grad_norm": 0.8807388652451061, + "learning_rate": 1.3592771005764857e-05, + "loss": 0.5587, + "step": 4941 + }, + { + "epoch": 0.4, + "grad_norm": 0.8651113785797417, + "learning_rate": 1.3590314176572989e-05, + "loss": 0.5546, + "step": 4942 + }, + { + "epoch": 0.4, + "grad_norm": 0.8171458610105073, + "learning_rate": 1.3587857098575534e-05, + "loss": 0.5267, + "step": 4943 + }, + { + "epoch": 0.4, + "grad_norm": 1.0893608287859775, + "learning_rate": 1.3585399771942764e-05, + "loss": 0.5818, + "step": 4944 + }, + { + "epoch": 0.4, + "grad_norm": 0.9605917712592748, + "learning_rate": 1.358294219684497e-05, + "loss": 0.5289, + "step": 4945 + }, + { + "epoch": 0.4, + "grad_norm": 0.7972262798819203, + "learning_rate": 1.3580484373452462e-05, + "loss": 0.5292, + "step": 4946 + }, + { + "epoch": 0.4, + "grad_norm": 0.9419043795441053, + "learning_rate": 1.357802630193556e-05, + "loss": 0.5731, + "step": 4947 + }, + { + "epoch": 0.4, + "grad_norm": 0.8941534978079186, + "learning_rate": 1.357556798246461e-05, + "loss": 0.51, + "step": 4948 + }, + { + "epoch": 0.4, + "grad_norm": 0.9127635202406887, + "learning_rate": 1.357310941520997e-05, + "loss": 0.5553, + "step": 4949 + }, + { + "epoch": 0.4, + "grad_norm": 0.9002276388779009, + "learning_rate": 1.3570650600342017e-05, + "loss": 0.5921, + "step": 4950 + }, + { + "epoch": 0.4, + "grad_norm": 0.8717763667596604, + "learning_rate": 1.3568191538031146e-05, + "loss": 0.5518, + "step": 4951 + }, + { + "epoch": 0.4, + "grad_norm": 0.8191098322900104, + "learning_rate": 1.3565732228447766e-05, + "loss": 0.4965, + "step": 4952 + }, + { + "epoch": 0.4, + "grad_norm": 0.9373761418635452, + "learning_rate": 1.3563272671762304e-05, + "loss": 0.5527, + "step": 4953 + }, + { + "epoch": 0.4, + "grad_norm": 0.9116380184176788, + "learning_rate": 1.3560812868145206e-05, + "loss": 0.5502, + "step": 4954 + }, + { + "epoch": 0.4, + "grad_norm": 0.9782949235050161, + "learning_rate": 1.3558352817766935e-05, + "loss": 0.5888, + "step": 4955 + }, + { + "epoch": 0.4, + "grad_norm": 0.8729097488069214, + "learning_rate": 1.355589252079797e-05, + "loss": 0.585, + "step": 4956 + }, + { + "epoch": 0.4, + "grad_norm": 0.92953206689005, + "learning_rate": 1.3553431977408809e-05, + "loss": 0.6116, + "step": 4957 + }, + { + "epoch": 0.4, + "grad_norm": 0.9161281687717558, + "learning_rate": 1.3550971187769964e-05, + "loss": 0.582, + "step": 4958 + }, + { + "epoch": 0.4, + "grad_norm": 0.9544317740681652, + "learning_rate": 1.3548510152051963e-05, + "loss": 0.5308, + "step": 4959 + }, + { + "epoch": 0.4, + "grad_norm": 0.9300834006161792, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.5908, + "step": 4960 + }, + { + "epoch": 0.4, + "grad_norm": 0.8682564450672268, + "learning_rate": 1.3543587343060712e-05, + "loss": 0.5659, + "step": 4961 + }, + { + "epoch": 0.4, + "grad_norm": 0.8507907896472288, + "learning_rate": 1.3541125570128603e-05, + "loss": 0.5334, + "step": 4962 + }, + { + "epoch": 0.4, + "grad_norm": 0.8841622079095046, + "learning_rate": 1.3538663551799636e-05, + "loss": 0.6271, + "step": 4963 + }, + { + "epoch": 0.4, + "grad_norm": 0.8433659287109949, + "learning_rate": 1.3536201288244425e-05, + "loss": 0.5682, + "step": 4964 + }, + { + "epoch": 0.4, + "grad_norm": 0.9244820516318176, + "learning_rate": 1.3533738779633597e-05, + "loss": 0.5897, + "step": 4965 + }, + { + "epoch": 0.4, + "grad_norm": 0.7981635507567023, + "learning_rate": 1.3531276026137807e-05, + "loss": 0.4975, + "step": 4966 + }, + { + "epoch": 0.4, + "grad_norm": 1.234573810119623, + "learning_rate": 1.3528813027927724e-05, + "loss": 0.56, + "step": 4967 + }, + { + "epoch": 0.4, + "grad_norm": 0.9720942100727602, + "learning_rate": 1.3526349785174025e-05, + "loss": 0.5927, + "step": 4968 + }, + { + "epoch": 0.4, + "grad_norm": 0.9249900333619757, + "learning_rate": 1.3523886298047412e-05, + "loss": 0.5049, + "step": 4969 + }, + { + "epoch": 0.4, + "grad_norm": 0.8817644529264543, + "learning_rate": 1.3521422566718609e-05, + "loss": 0.5317, + "step": 4970 + }, + { + "epoch": 0.4, + "grad_norm": 0.8806141034443261, + "learning_rate": 1.3518958591358345e-05, + "loss": 0.4908, + "step": 4971 + }, + { + "epoch": 0.4, + "grad_norm": 0.9637460271329971, + "learning_rate": 1.3516494372137368e-05, + "loss": 0.5397, + "step": 4972 + }, + { + "epoch": 0.4, + "grad_norm": 0.9061409021519361, + "learning_rate": 1.3514029909226454e-05, + "loss": 0.5826, + "step": 4973 + }, + { + "epoch": 0.4, + "grad_norm": 0.9483299745746715, + "learning_rate": 1.3511565202796381e-05, + "loss": 0.5759, + "step": 4974 + }, + { + "epoch": 0.4, + "grad_norm": 0.9451773146461331, + "learning_rate": 1.3509100253017958e-05, + "loss": 0.5814, + "step": 4975 + }, + { + "epoch": 0.4, + "grad_norm": 0.8567772541508121, + "learning_rate": 1.3506635060062e-05, + "loss": 0.5104, + "step": 4976 + }, + { + "epoch": 0.4, + "grad_norm": 0.8383479350018068, + "learning_rate": 1.350416962409934e-05, + "loss": 0.5064, + "step": 4977 + }, + { + "epoch": 0.4, + "grad_norm": 0.8912113016113629, + "learning_rate": 1.3501703945300832e-05, + "loss": 0.5262, + "step": 4978 + }, + { + "epoch": 0.4, + "grad_norm": 0.9870956538767125, + "learning_rate": 1.349923802383735e-05, + "loss": 0.6047, + "step": 4979 + }, + { + "epoch": 0.4, + "grad_norm": 0.977475323915746, + "learning_rate": 1.3496771859879774e-05, + "loss": 0.5306, + "step": 4980 + }, + { + "epoch": 0.4, + "grad_norm": 0.9128506647381485, + "learning_rate": 1.349430545359901e-05, + "loss": 0.5356, + "step": 4981 + }, + { + "epoch": 0.4, + "grad_norm": 0.9323166542632595, + "learning_rate": 1.349183880516598e-05, + "loss": 0.5505, + "step": 4982 + }, + { + "epoch": 0.4, + "grad_norm": 0.9276052946168218, + "learning_rate": 1.3489371914751616e-05, + "loss": 0.5412, + "step": 4983 + }, + { + "epoch": 0.41, + "grad_norm": 0.9888331579532674, + "learning_rate": 1.3486904782526876e-05, + "loss": 0.5327, + "step": 4984 + }, + { + "epoch": 0.41, + "grad_norm": 0.8900191506817302, + "learning_rate": 1.3484437408662725e-05, + "loss": 0.5338, + "step": 4985 + }, + { + "epoch": 0.41, + "grad_norm": 0.8606918204173645, + "learning_rate": 1.3481969793330151e-05, + "loss": 0.5094, + "step": 4986 + }, + { + "epoch": 0.41, + "grad_norm": 0.9423971372459556, + "learning_rate": 1.3479501936700161e-05, + "loss": 0.5849, + "step": 4987 + }, + { + "epoch": 0.41, + "grad_norm": 0.9037119495482759, + "learning_rate": 1.3477033838943774e-05, + "loss": 0.5204, + "step": 4988 + }, + { + "epoch": 0.41, + "grad_norm": 1.0464546656121712, + "learning_rate": 1.3474565500232025e-05, + "loss": 0.6279, + "step": 4989 + }, + { + "epoch": 0.41, + "grad_norm": 1.0047842505728581, + "learning_rate": 1.3472096920735966e-05, + "loss": 0.6017, + "step": 4990 + }, + { + "epoch": 0.41, + "grad_norm": 0.9207006697256435, + "learning_rate": 1.3469628100626678e-05, + "loss": 0.5511, + "step": 4991 + }, + { + "epoch": 0.41, + "grad_norm": 0.9042762572590564, + "learning_rate": 1.3467159040075233e-05, + "loss": 0.5697, + "step": 4992 + }, + { + "epoch": 0.41, + "grad_norm": 0.9258256902752167, + "learning_rate": 1.3464689739252741e-05, + "loss": 0.5919, + "step": 4993 + }, + { + "epoch": 0.41, + "grad_norm": 0.8510889422046063, + "learning_rate": 1.346222019833033e-05, + "loss": 0.5575, + "step": 4994 + }, + { + "epoch": 0.41, + "grad_norm": 0.923790834710621, + "learning_rate": 1.3459750417479125e-05, + "loss": 0.5767, + "step": 4995 + }, + { + "epoch": 0.41, + "grad_norm": 0.9655024203049409, + "learning_rate": 1.3457280396870285e-05, + "loss": 0.5502, + "step": 4996 + }, + { + "epoch": 0.41, + "grad_norm": 0.8978053659559004, + "learning_rate": 1.3454810136674983e-05, + "loss": 0.5429, + "step": 4997 + }, + { + "epoch": 0.41, + "grad_norm": 0.9173149006811787, + "learning_rate": 1.34523396370644e-05, + "loss": 0.5388, + "step": 4998 + }, + { + "epoch": 0.41, + "grad_norm": 0.8803052233081673, + "learning_rate": 1.3449868898209743e-05, + "loss": 0.5308, + "step": 4999 + }, + { + "epoch": 0.41, + "grad_norm": 0.9213276267952032, + "learning_rate": 1.3447397920282232e-05, + "loss": 0.5615, + "step": 5000 + }, + { + "epoch": 0.41, + "grad_norm": 1.0679262404551646, + "learning_rate": 1.3444926703453102e-05, + "loss": 0.5619, + "step": 5001 + }, + { + "epoch": 0.41, + "grad_norm": 0.8507200211182417, + "learning_rate": 1.3442455247893608e-05, + "loss": 0.5086, + "step": 5002 + }, + { + "epoch": 0.41, + "grad_norm": 0.9397594122523698, + "learning_rate": 1.3439983553775018e-05, + "loss": 0.5929, + "step": 5003 + }, + { + "epoch": 0.41, + "grad_norm": 0.8575371004987608, + "learning_rate": 1.3437511621268622e-05, + "loss": 0.4855, + "step": 5004 + }, + { + "epoch": 0.41, + "grad_norm": 0.8893823279593794, + "learning_rate": 1.3435039450545718e-05, + "loss": 0.5046, + "step": 5005 + }, + { + "epoch": 0.41, + "grad_norm": 0.9879627215267409, + "learning_rate": 1.3432567041777624e-05, + "loss": 0.5308, + "step": 5006 + }, + { + "epoch": 0.41, + "grad_norm": 0.8223979075210116, + "learning_rate": 1.3430094395135682e-05, + "loss": 0.4693, + "step": 5007 + }, + { + "epoch": 0.41, + "grad_norm": 1.0213441059322579, + "learning_rate": 1.342762151079124e-05, + "loss": 0.5125, + "step": 5008 + }, + { + "epoch": 0.41, + "grad_norm": 0.8893830497058866, + "learning_rate": 1.3425148388915668e-05, + "loss": 0.529, + "step": 5009 + }, + { + "epoch": 0.41, + "grad_norm": 0.8453904170689711, + "learning_rate": 1.3422675029680352e-05, + "loss": 0.5243, + "step": 5010 + }, + { + "epoch": 0.41, + "grad_norm": 0.9235327456897732, + "learning_rate": 1.342020143325669e-05, + "loss": 0.5161, + "step": 5011 + }, + { + "epoch": 0.41, + "grad_norm": 0.934246128185342, + "learning_rate": 1.3417727599816101e-05, + "loss": 0.5572, + "step": 5012 + }, + { + "epoch": 0.41, + "grad_norm": 0.843513392770215, + "learning_rate": 1.3415253529530026e-05, + "loss": 0.5877, + "step": 5013 + }, + { + "epoch": 0.41, + "grad_norm": 0.8979152627392312, + "learning_rate": 1.3412779222569907e-05, + "loss": 0.5576, + "step": 5014 + }, + { + "epoch": 0.41, + "grad_norm": 0.9027225679116367, + "learning_rate": 1.3410304679107214e-05, + "loss": 0.5798, + "step": 5015 + }, + { + "epoch": 0.41, + "grad_norm": 0.9785640983126853, + "learning_rate": 1.3407829899313435e-05, + "loss": 0.4948, + "step": 5016 + }, + { + "epoch": 0.41, + "grad_norm": 0.9026415340866438, + "learning_rate": 1.3405354883360064e-05, + "loss": 0.6247, + "step": 5017 + }, + { + "epoch": 0.41, + "grad_norm": 0.9659391475419029, + "learning_rate": 1.3402879631418621e-05, + "loss": 0.5754, + "step": 5018 + }, + { + "epoch": 0.41, + "grad_norm": 0.909795574321967, + "learning_rate": 1.3400404143660639e-05, + "loss": 0.5855, + "step": 5019 + }, + { + "epoch": 0.41, + "grad_norm": 0.9365576954027569, + "learning_rate": 1.3397928420257664e-05, + "loss": 0.572, + "step": 5020 + }, + { + "epoch": 0.41, + "grad_norm": 0.9481969842800917, + "learning_rate": 1.3395452461381265e-05, + "loss": 0.5493, + "step": 5021 + }, + { + "epoch": 0.41, + "grad_norm": 0.9159633447540382, + "learning_rate": 1.3392976267203024e-05, + "loss": 0.6018, + "step": 5022 + }, + { + "epoch": 0.41, + "grad_norm": 0.929746076040423, + "learning_rate": 1.3390499837894533e-05, + "loss": 0.5801, + "step": 5023 + }, + { + "epoch": 0.41, + "grad_norm": 0.9262633274121936, + "learning_rate": 1.3388023173627413e-05, + "loss": 0.5262, + "step": 5024 + }, + { + "epoch": 0.41, + "grad_norm": 0.9079531784297293, + "learning_rate": 1.3385546274573294e-05, + "loss": 0.5131, + "step": 5025 + }, + { + "epoch": 0.41, + "grad_norm": 1.161142792927715, + "learning_rate": 1.3383069140903816e-05, + "loss": 0.5189, + "step": 5026 + }, + { + "epoch": 0.41, + "grad_norm": 0.9383572230897598, + "learning_rate": 1.3380591772790652e-05, + "loss": 0.5646, + "step": 5027 + }, + { + "epoch": 0.41, + "grad_norm": 0.8725547375928622, + "learning_rate": 1.3378114170405473e-05, + "loss": 0.536, + "step": 5028 + }, + { + "epoch": 0.41, + "grad_norm": 0.9961487700807296, + "learning_rate": 1.3375636333919981e-05, + "loss": 0.5026, + "step": 5029 + }, + { + "epoch": 0.41, + "grad_norm": 0.9560730427420159, + "learning_rate": 1.3373158263505886e-05, + "loss": 0.5708, + "step": 5030 + }, + { + "epoch": 0.41, + "grad_norm": 0.889332135064333, + "learning_rate": 1.3370679959334911e-05, + "loss": 0.4974, + "step": 5031 + }, + { + "epoch": 0.41, + "grad_norm": 0.9112265983457682, + "learning_rate": 1.336820142157881e-05, + "loss": 0.5586, + "step": 5032 + }, + { + "epoch": 0.41, + "grad_norm": 1.0140920595535092, + "learning_rate": 1.3365722650409336e-05, + "loss": 0.6051, + "step": 5033 + }, + { + "epoch": 0.41, + "grad_norm": 0.8265136838840981, + "learning_rate": 1.3363243645998265e-05, + "loss": 0.5112, + "step": 5034 + }, + { + "epoch": 0.41, + "grad_norm": 0.8664416233893417, + "learning_rate": 1.3360764408517398e-05, + "loss": 0.5771, + "step": 5035 + }, + { + "epoch": 0.41, + "grad_norm": 0.8928211824327998, + "learning_rate": 1.3358284938138532e-05, + "loss": 0.552, + "step": 5036 + }, + { + "epoch": 0.41, + "grad_norm": 0.8708038249180292, + "learning_rate": 1.3355805235033503e-05, + "loss": 0.5487, + "step": 5037 + }, + { + "epoch": 0.41, + "grad_norm": 0.8648901980307919, + "learning_rate": 1.3353325299374147e-05, + "loss": 0.5156, + "step": 5038 + }, + { + "epoch": 0.41, + "grad_norm": 0.9253251558176558, + "learning_rate": 1.3350845131332322e-05, + "loss": 0.5946, + "step": 5039 + }, + { + "epoch": 0.41, + "grad_norm": 0.9087741111085298, + "learning_rate": 1.33483647310799e-05, + "loss": 0.5366, + "step": 5040 + }, + { + "epoch": 0.41, + "grad_norm": 0.9031770607268709, + "learning_rate": 1.3345884098788775e-05, + "loss": 0.5449, + "step": 5041 + }, + { + "epoch": 0.41, + "grad_norm": 0.9048945632273513, + "learning_rate": 1.334340323463085e-05, + "loss": 0.5902, + "step": 5042 + }, + { + "epoch": 0.41, + "grad_norm": 0.928909059359663, + "learning_rate": 1.3340922138778042e-05, + "loss": 0.5314, + "step": 5043 + }, + { + "epoch": 0.41, + "grad_norm": 0.9285281521455355, + "learning_rate": 1.3338440811402298e-05, + "loss": 0.5399, + "step": 5044 + }, + { + "epoch": 0.41, + "grad_norm": 1.0519660440082719, + "learning_rate": 1.3335959252675566e-05, + "loss": 0.6227, + "step": 5045 + }, + { + "epoch": 0.41, + "grad_norm": 0.8580949517974473, + "learning_rate": 1.3333477462769814e-05, + "loss": 0.5642, + "step": 5046 + }, + { + "epoch": 0.41, + "grad_norm": 0.9121537302597602, + "learning_rate": 1.333099544185703e-05, + "loss": 0.5169, + "step": 5047 + }, + { + "epoch": 0.41, + "grad_norm": 0.8995134871690074, + "learning_rate": 1.332851319010922e-05, + "loss": 0.5968, + "step": 5048 + }, + { + "epoch": 0.41, + "grad_norm": 0.9135632096680434, + "learning_rate": 1.3326030707698399e-05, + "loss": 0.5906, + "step": 5049 + }, + { + "epoch": 0.41, + "grad_norm": 0.9046364651883315, + "learning_rate": 1.3323547994796597e-05, + "loss": 0.6055, + "step": 5050 + }, + { + "epoch": 0.41, + "grad_norm": 0.9203931898005878, + "learning_rate": 1.3321065051575868e-05, + "loss": 0.5704, + "step": 5051 + }, + { + "epoch": 0.41, + "grad_norm": 0.8673245017777986, + "learning_rate": 1.3318581878208279e-05, + "loss": 0.4961, + "step": 5052 + }, + { + "epoch": 0.41, + "grad_norm": 0.9076849454851069, + "learning_rate": 1.3316098474865905e-05, + "loss": 0.4916, + "step": 5053 + }, + { + "epoch": 0.41, + "grad_norm": 0.9212244712246338, + "learning_rate": 1.331361484172085e-05, + "loss": 0.5526, + "step": 5054 + }, + { + "epoch": 0.41, + "grad_norm": 1.046665168224382, + "learning_rate": 1.3311130978945228e-05, + "loss": 0.5865, + "step": 5055 + }, + { + "epoch": 0.41, + "grad_norm": 0.8503823922686983, + "learning_rate": 1.3308646886711163e-05, + "loss": 0.449, + "step": 5056 + }, + { + "epoch": 0.41, + "grad_norm": 0.9493204815760115, + "learning_rate": 1.3306162565190805e-05, + "loss": 0.6048, + "step": 5057 + }, + { + "epoch": 0.41, + "grad_norm": 0.9107820615188169, + "learning_rate": 1.3303678014556316e-05, + "loss": 0.5662, + "step": 5058 + }, + { + "epoch": 0.41, + "grad_norm": 0.9714108115018177, + "learning_rate": 1.3301193234979865e-05, + "loss": 0.5288, + "step": 5059 + }, + { + "epoch": 0.41, + "grad_norm": 0.9419033164293819, + "learning_rate": 1.3298708226633657e-05, + "loss": 0.6086, + "step": 5060 + }, + { + "epoch": 0.41, + "grad_norm": 0.907506779452863, + "learning_rate": 1.3296222989689892e-05, + "loss": 0.549, + "step": 5061 + }, + { + "epoch": 0.41, + "grad_norm": 0.9435728748431338, + "learning_rate": 1.3293737524320798e-05, + "loss": 0.6021, + "step": 5062 + }, + { + "epoch": 0.41, + "grad_norm": 0.9155414972518737, + "learning_rate": 1.3291251830698615e-05, + "loss": 0.5394, + "step": 5063 + }, + { + "epoch": 0.41, + "grad_norm": 1.016370420589111, + "learning_rate": 1.3288765908995598e-05, + "loss": 0.5631, + "step": 5064 + }, + { + "epoch": 0.41, + "grad_norm": 0.9096054349595017, + "learning_rate": 1.3286279759384022e-05, + "loss": 0.543, + "step": 5065 + }, + { + "epoch": 0.41, + "grad_norm": 0.8356030266153411, + "learning_rate": 1.3283793382036175e-05, + "loss": 0.5312, + "step": 5066 + }, + { + "epoch": 0.41, + "grad_norm": 0.9494496729847453, + "learning_rate": 1.3281306777124356e-05, + "loss": 0.5967, + "step": 5067 + }, + { + "epoch": 0.41, + "grad_norm": 1.0309264218563654, + "learning_rate": 1.3278819944820893e-05, + "loss": 0.5769, + "step": 5068 + }, + { + "epoch": 0.41, + "grad_norm": 0.8414918365636198, + "learning_rate": 1.327633288529811e-05, + "loss": 0.5492, + "step": 5069 + }, + { + "epoch": 0.41, + "grad_norm": 0.8900427211041783, + "learning_rate": 1.3273845598728367e-05, + "loss": 0.5579, + "step": 5070 + }, + { + "epoch": 0.41, + "grad_norm": 0.9043752020683701, + "learning_rate": 1.3271358085284029e-05, + "loss": 0.53, + "step": 5071 + }, + { + "epoch": 0.41, + "grad_norm": 0.9876593193228603, + "learning_rate": 1.3268870345137476e-05, + "loss": 0.6066, + "step": 5072 + }, + { + "epoch": 0.41, + "grad_norm": 0.9081892965471735, + "learning_rate": 1.3266382378461109e-05, + "loss": 0.5406, + "step": 5073 + }, + { + "epoch": 0.41, + "grad_norm": 0.8612314037801095, + "learning_rate": 1.3263894185427339e-05, + "loss": 0.5397, + "step": 5074 + }, + { + "epoch": 0.41, + "grad_norm": 0.9374695390411759, + "learning_rate": 1.3261405766208598e-05, + "loss": 0.4895, + "step": 5075 + }, + { + "epoch": 0.41, + "grad_norm": 0.9984321444647333, + "learning_rate": 1.3258917120977327e-05, + "loss": 0.6127, + "step": 5076 + }, + { + "epoch": 0.41, + "grad_norm": 0.9179346377204336, + "learning_rate": 1.3256428249905998e-05, + "loss": 0.5219, + "step": 5077 + }, + { + "epoch": 0.41, + "grad_norm": 0.9518656711900425, + "learning_rate": 1.3253939153167072e-05, + "loss": 0.6394, + "step": 5078 + }, + { + "epoch": 0.41, + "grad_norm": 0.8958542762619648, + "learning_rate": 1.3251449830933052e-05, + "loss": 0.5355, + "step": 5079 + }, + { + "epoch": 0.41, + "grad_norm": 0.9534508359693109, + "learning_rate": 1.3248960283376441e-05, + "loss": 0.579, + "step": 5080 + }, + { + "epoch": 0.41, + "grad_norm": 0.9344960582822963, + "learning_rate": 1.3246470510669766e-05, + "loss": 0.5403, + "step": 5081 + }, + { + "epoch": 0.41, + "grad_norm": 0.9374250380548944, + "learning_rate": 1.3243980512985563e-05, + "loss": 0.5178, + "step": 5082 + }, + { + "epoch": 0.41, + "grad_norm": 0.8228686351928819, + "learning_rate": 1.3241490290496391e-05, + "loss": 0.5492, + "step": 5083 + }, + { + "epoch": 0.41, + "grad_norm": 0.8978557739748596, + "learning_rate": 1.3238999843374814e-05, + "loss": 0.5082, + "step": 5084 + }, + { + "epoch": 0.41, + "grad_norm": 0.9340313155095844, + "learning_rate": 1.323650917179342e-05, + "loss": 0.5216, + "step": 5085 + }, + { + "epoch": 0.41, + "grad_norm": 0.8916719591757329, + "learning_rate": 1.3234018275924814e-05, + "loss": 0.5377, + "step": 5086 + }, + { + "epoch": 0.41, + "grad_norm": 1.0150853993724218, + "learning_rate": 1.3231527155941607e-05, + "loss": 0.6204, + "step": 5087 + }, + { + "epoch": 0.41, + "grad_norm": 0.9273769401626929, + "learning_rate": 1.3229035812016438e-05, + "loss": 0.5609, + "step": 5088 + }, + { + "epoch": 0.41, + "grad_norm": 0.9169044311151517, + "learning_rate": 1.322654424432195e-05, + "loss": 0.5639, + "step": 5089 + }, + { + "epoch": 0.41, + "grad_norm": 0.9112610005312264, + "learning_rate": 1.3224052453030806e-05, + "loss": 0.5845, + "step": 5090 + }, + { + "epoch": 0.41, + "grad_norm": 0.9805041567073909, + "learning_rate": 1.3221560438315689e-05, + "loss": 0.6059, + "step": 5091 + }, + { + "epoch": 0.41, + "grad_norm": 0.9047437596079848, + "learning_rate": 1.3219068200349292e-05, + "loss": 0.5612, + "step": 5092 + }, + { + "epoch": 0.41, + "grad_norm": 0.9327788511751683, + "learning_rate": 1.321657573930432e-05, + "loss": 0.5585, + "step": 5093 + }, + { + "epoch": 0.41, + "grad_norm": 0.906266310194228, + "learning_rate": 1.3214083055353504e-05, + "loss": 0.5256, + "step": 5094 + }, + { + "epoch": 0.41, + "grad_norm": 0.9780421675103058, + "learning_rate": 1.3211590148669586e-05, + "loss": 0.5435, + "step": 5095 + }, + { + "epoch": 0.41, + "grad_norm": 0.8590529244903038, + "learning_rate": 1.3209097019425317e-05, + "loss": 0.5427, + "step": 5096 + }, + { + "epoch": 0.41, + "grad_norm": 0.871310828995196, + "learning_rate": 1.3206603667793472e-05, + "loss": 0.5254, + "step": 5097 + }, + { + "epoch": 0.41, + "grad_norm": 0.889387767827606, + "learning_rate": 1.3204110093946835e-05, + "loss": 0.4817, + "step": 5098 + }, + { + "epoch": 0.41, + "grad_norm": 0.8515472409039916, + "learning_rate": 1.3201616298058214e-05, + "loss": 0.5029, + "step": 5099 + }, + { + "epoch": 0.41, + "grad_norm": 0.9682710945847225, + "learning_rate": 1.3199122280300418e-05, + "loss": 0.6037, + "step": 5100 + }, + { + "epoch": 0.41, + "grad_norm": 0.9251328046898974, + "learning_rate": 1.319662804084629e-05, + "loss": 0.5449, + "step": 5101 + }, + { + "epoch": 0.41, + "grad_norm": 0.8627728010761364, + "learning_rate": 1.3194133579868672e-05, + "loss": 0.537, + "step": 5102 + }, + { + "epoch": 0.41, + "grad_norm": 0.8675483717514128, + "learning_rate": 1.319163889754043e-05, + "loss": 0.5234, + "step": 5103 + }, + { + "epoch": 0.41, + "grad_norm": 0.8891470238901393, + "learning_rate": 1.3189143994034448e-05, + "loss": 0.5632, + "step": 5104 + }, + { + "epoch": 0.41, + "grad_norm": 0.878464155700256, + "learning_rate": 1.318664886952361e-05, + "loss": 0.483, + "step": 5105 + }, + { + "epoch": 0.41, + "grad_norm": 0.9519110168495648, + "learning_rate": 1.3184153524180837e-05, + "loss": 0.5674, + "step": 5106 + }, + { + "epoch": 0.42, + "grad_norm": 0.8828085357873369, + "learning_rate": 1.3181657958179046e-05, + "loss": 0.5326, + "step": 5107 + }, + { + "epoch": 0.42, + "grad_norm": 1.0078717687703735, + "learning_rate": 1.317916217169118e-05, + "loss": 0.6089, + "step": 5108 + }, + { + "epoch": 0.42, + "grad_norm": 0.8520385880997858, + "learning_rate": 1.3176666164890195e-05, + "loss": 0.5646, + "step": 5109 + }, + { + "epoch": 0.42, + "grad_norm": 0.871313113477867, + "learning_rate": 1.3174169937949066e-05, + "loss": 0.4969, + "step": 5110 + }, + { + "epoch": 0.42, + "grad_norm": 0.8740222761172795, + "learning_rate": 1.3171673491040772e-05, + "loss": 0.5236, + "step": 5111 + }, + { + "epoch": 0.42, + "grad_norm": 0.9805578927608394, + "learning_rate": 1.3169176824338321e-05, + "loss": 0.5786, + "step": 5112 + }, + { + "epoch": 0.42, + "grad_norm": 0.8763258975337161, + "learning_rate": 1.3166679938014728e-05, + "loss": 0.5381, + "step": 5113 + }, + { + "epoch": 0.42, + "grad_norm": 0.8597644001374837, + "learning_rate": 1.316418283224302e-05, + "loss": 0.5418, + "step": 5114 + }, + { + "epoch": 0.42, + "grad_norm": 0.97008825068724, + "learning_rate": 1.3161685507196251e-05, + "loss": 0.5815, + "step": 5115 + }, + { + "epoch": 0.42, + "grad_norm": 0.9828993498197132, + "learning_rate": 1.3159187963047481e-05, + "loss": 0.571, + "step": 5116 + }, + { + "epoch": 0.42, + "grad_norm": 0.8992665461979235, + "learning_rate": 1.3156690199969786e-05, + "loss": 0.562, + "step": 5117 + }, + { + "epoch": 0.42, + "grad_norm": 0.9991378355301007, + "learning_rate": 1.3154192218136261e-05, + "loss": 0.6009, + "step": 5118 + }, + { + "epoch": 0.42, + "grad_norm": 0.8981685770096789, + "learning_rate": 1.3151694017720016e-05, + "loss": 0.5287, + "step": 5119 + }, + { + "epoch": 0.42, + "grad_norm": 0.9229520921845562, + "learning_rate": 1.3149195598894167e-05, + "loss": 0.5432, + "step": 5120 + }, + { + "epoch": 0.42, + "grad_norm": 1.31688240689969, + "learning_rate": 1.3146696961831858e-05, + "loss": 0.526, + "step": 5121 + }, + { + "epoch": 0.42, + "grad_norm": 0.8872179410218345, + "learning_rate": 1.314419810670624e-05, + "loss": 0.5431, + "step": 5122 + }, + { + "epoch": 0.42, + "grad_norm": 0.7965128643545243, + "learning_rate": 1.314169903369048e-05, + "loss": 0.465, + "step": 5123 + }, + { + "epoch": 0.42, + "grad_norm": 0.9228116133064066, + "learning_rate": 1.3139199742957767e-05, + "loss": 0.583, + "step": 5124 + }, + { + "epoch": 0.42, + "grad_norm": 0.9708210120494751, + "learning_rate": 1.3136700234681294e-05, + "loss": 0.6015, + "step": 5125 + }, + { + "epoch": 0.42, + "grad_norm": 0.9379346186917816, + "learning_rate": 1.313420050903428e-05, + "loss": 0.5597, + "step": 5126 + }, + { + "epoch": 0.42, + "grad_norm": 0.8583394317632376, + "learning_rate": 1.313170056618995e-05, + "loss": 0.511, + "step": 5127 + }, + { + "epoch": 0.42, + "grad_norm": 0.9664397262334126, + "learning_rate": 1.3129200406321545e-05, + "loss": 0.5993, + "step": 5128 + }, + { + "epoch": 0.42, + "grad_norm": 0.937342255366917, + "learning_rate": 1.312670002960233e-05, + "loss": 0.5576, + "step": 5129 + }, + { + "epoch": 0.42, + "grad_norm": 0.8938685765950013, + "learning_rate": 1.3124199436205575e-05, + "loss": 0.6095, + "step": 5130 + }, + { + "epoch": 0.42, + "grad_norm": 0.8795874008243144, + "learning_rate": 1.3121698626304574e-05, + "loss": 0.5268, + "step": 5131 + }, + { + "epoch": 0.42, + "grad_norm": 0.9572529534621174, + "learning_rate": 1.3119197600072624e-05, + "loss": 0.5539, + "step": 5132 + }, + { + "epoch": 0.42, + "grad_norm": 0.7892642422145666, + "learning_rate": 1.3116696357683047e-05, + "loss": 0.5267, + "step": 5133 + }, + { + "epoch": 0.42, + "grad_norm": 0.9124039323834103, + "learning_rate": 1.3114194899309175e-05, + "loss": 0.5872, + "step": 5134 + }, + { + "epoch": 0.42, + "grad_norm": 0.9034469389534122, + "learning_rate": 1.3111693225124365e-05, + "loss": 0.5655, + "step": 5135 + }, + { + "epoch": 0.42, + "grad_norm": 0.980671478125758, + "learning_rate": 1.310919133530197e-05, + "loss": 0.5409, + "step": 5136 + }, + { + "epoch": 0.42, + "grad_norm": 0.7970800799824627, + "learning_rate": 1.3106689230015372e-05, + "loss": 0.4885, + "step": 5137 + }, + { + "epoch": 0.42, + "grad_norm": 0.8885303582931813, + "learning_rate": 1.310418690943797e-05, + "loss": 0.5902, + "step": 5138 + }, + { + "epoch": 0.42, + "grad_norm": 0.8250161927876943, + "learning_rate": 1.3101684373743166e-05, + "loss": 0.5207, + "step": 5139 + }, + { + "epoch": 0.42, + "grad_norm": 0.809632259881539, + "learning_rate": 1.3099181623104386e-05, + "loss": 0.5252, + "step": 5140 + }, + { + "epoch": 0.42, + "grad_norm": 0.9036939639535884, + "learning_rate": 1.3096678657695072e-05, + "loss": 0.5677, + "step": 5141 + }, + { + "epoch": 0.42, + "grad_norm": 0.8522063526840421, + "learning_rate": 1.3094175477688671e-05, + "loss": 0.4812, + "step": 5142 + }, + { + "epoch": 0.42, + "grad_norm": 0.9591812262443219, + "learning_rate": 1.3091672083258653e-05, + "loss": 0.5718, + "step": 5143 + }, + { + "epoch": 0.42, + "grad_norm": 0.8796134146527177, + "learning_rate": 1.3089168474578504e-05, + "loss": 0.5509, + "step": 5144 + }, + { + "epoch": 0.42, + "grad_norm": 0.9637346590878184, + "learning_rate": 1.3086664651821719e-05, + "loss": 0.5226, + "step": 5145 + }, + { + "epoch": 0.42, + "grad_norm": 0.8654354312013556, + "learning_rate": 1.308416061516181e-05, + "loss": 0.5672, + "step": 5146 + }, + { + "epoch": 0.42, + "grad_norm": 1.0063143032868171, + "learning_rate": 1.3081656364772308e-05, + "loss": 0.5448, + "step": 5147 + }, + { + "epoch": 0.42, + "grad_norm": 0.9170705999289325, + "learning_rate": 1.3079151900826752e-05, + "loss": 0.5176, + "step": 5148 + }, + { + "epoch": 0.42, + "grad_norm": 0.9637334144792751, + "learning_rate": 1.3076647223498703e-05, + "loss": 0.553, + "step": 5149 + }, + { + "epoch": 0.42, + "grad_norm": 0.9671932172736195, + "learning_rate": 1.3074142332961729e-05, + "loss": 0.5069, + "step": 5150 + }, + { + "epoch": 0.42, + "grad_norm": 0.9363712624459859, + "learning_rate": 1.3071637229389416e-05, + "loss": 0.5368, + "step": 5151 + }, + { + "epoch": 0.42, + "grad_norm": 0.9080314781366527, + "learning_rate": 1.3069131912955368e-05, + "loss": 0.5389, + "step": 5152 + }, + { + "epoch": 0.42, + "grad_norm": 0.848854023646483, + "learning_rate": 1.3066626383833203e-05, + "loss": 0.5374, + "step": 5153 + }, + { + "epoch": 0.42, + "grad_norm": 0.991351160584258, + "learning_rate": 1.3064120642196549e-05, + "loss": 0.5621, + "step": 5154 + }, + { + "epoch": 0.42, + "grad_norm": 0.9563550446244192, + "learning_rate": 1.306161468821905e-05, + "loss": 0.572, + "step": 5155 + }, + { + "epoch": 0.42, + "grad_norm": 0.9125008511353809, + "learning_rate": 1.3059108522074373e-05, + "loss": 0.5362, + "step": 5156 + }, + { + "epoch": 0.42, + "grad_norm": 0.8805060313587676, + "learning_rate": 1.3056602143936185e-05, + "loss": 0.5868, + "step": 5157 + }, + { + "epoch": 0.42, + "grad_norm": 0.8926377582854529, + "learning_rate": 1.3054095553978181e-05, + "loss": 0.5372, + "step": 5158 + }, + { + "epoch": 0.42, + "grad_norm": 0.8709372652470595, + "learning_rate": 1.3051588752374067e-05, + "loss": 0.5799, + "step": 5159 + }, + { + "epoch": 0.42, + "grad_norm": 0.9118354309326878, + "learning_rate": 1.3049081739297556e-05, + "loss": 0.5731, + "step": 5160 + }, + { + "epoch": 0.42, + "grad_norm": 0.8113643930294744, + "learning_rate": 1.3046574514922386e-05, + "loss": 0.5097, + "step": 5161 + }, + { + "epoch": 0.42, + "grad_norm": 0.8474405953530317, + "learning_rate": 1.3044067079422304e-05, + "loss": 0.5331, + "step": 5162 + }, + { + "epoch": 0.42, + "grad_norm": 0.9439939773612503, + "learning_rate": 1.304155943297107e-05, + "loss": 0.5656, + "step": 5163 + }, + { + "epoch": 0.42, + "grad_norm": 0.8037282697874863, + "learning_rate": 1.303905157574247e-05, + "loss": 0.5426, + "step": 5164 + }, + { + "epoch": 0.42, + "grad_norm": 0.879080006172946, + "learning_rate": 1.303654350791029e-05, + "loss": 0.6089, + "step": 5165 + }, + { + "epoch": 0.42, + "grad_norm": 0.931439091942248, + "learning_rate": 1.3034035229648338e-05, + "loss": 0.5938, + "step": 5166 + }, + { + "epoch": 0.42, + "grad_norm": 0.897235722501806, + "learning_rate": 1.3031526741130435e-05, + "loss": 0.6158, + "step": 5167 + }, + { + "epoch": 0.42, + "grad_norm": 0.9778046778570904, + "learning_rate": 1.3029018042530421e-05, + "loss": 0.616, + "step": 5168 + }, + { + "epoch": 0.42, + "grad_norm": 0.8938904176131726, + "learning_rate": 1.3026509134022143e-05, + "loss": 0.5148, + "step": 5169 + }, + { + "epoch": 0.42, + "grad_norm": 0.7637438153298557, + "learning_rate": 1.3024000015779462e-05, + "loss": 0.5162, + "step": 5170 + }, + { + "epoch": 0.42, + "grad_norm": 0.8970184303722865, + "learning_rate": 1.3021490687976269e-05, + "loss": 0.5411, + "step": 5171 + }, + { + "epoch": 0.42, + "grad_norm": 0.9379911755413544, + "learning_rate": 1.3018981150786445e-05, + "loss": 0.5198, + "step": 5172 + }, + { + "epoch": 0.42, + "grad_norm": 0.9159176187237429, + "learning_rate": 1.3016471404383907e-05, + "loss": 0.5677, + "step": 5173 + }, + { + "epoch": 0.42, + "grad_norm": 0.8686715488022813, + "learning_rate": 1.3013961448942578e-05, + "loss": 0.5728, + "step": 5174 + }, + { + "epoch": 0.42, + "grad_norm": 1.0833391331559723, + "learning_rate": 1.301145128463639e-05, + "loss": 0.6127, + "step": 5175 + }, + { + "epoch": 0.42, + "grad_norm": 0.9097788336027876, + "learning_rate": 1.3008940911639302e-05, + "loss": 0.5475, + "step": 5176 + }, + { + "epoch": 0.42, + "grad_norm": 0.9956793165985215, + "learning_rate": 1.3006430330125279e-05, + "loss": 0.511, + "step": 5177 + }, + { + "epoch": 0.42, + "grad_norm": 0.9377355682219023, + "learning_rate": 1.30039195402683e-05, + "loss": 0.5849, + "step": 5178 + }, + { + "epoch": 0.42, + "grad_norm": 0.875112847377846, + "learning_rate": 1.300140854224236e-05, + "loss": 0.5154, + "step": 5179 + }, + { + "epoch": 0.42, + "grad_norm": 0.8801229189319658, + "learning_rate": 1.299889733622147e-05, + "loss": 0.4734, + "step": 5180 + }, + { + "epoch": 0.42, + "grad_norm": 0.968504878800029, + "learning_rate": 1.2996385922379657e-05, + "loss": 0.5744, + "step": 5181 + }, + { + "epoch": 0.42, + "grad_norm": 0.7574635362509162, + "learning_rate": 1.2993874300890956e-05, + "loss": 0.4807, + "step": 5182 + }, + { + "epoch": 0.42, + "grad_norm": 1.0783731403307208, + "learning_rate": 1.2991362471929421e-05, + "loss": 0.6033, + "step": 5183 + }, + { + "epoch": 0.42, + "grad_norm": 0.9783506627178276, + "learning_rate": 1.2988850435669123e-05, + "loss": 0.5147, + "step": 5184 + }, + { + "epoch": 0.42, + "grad_norm": 0.9580822811329507, + "learning_rate": 1.2986338192284136e-05, + "loss": 0.6557, + "step": 5185 + }, + { + "epoch": 0.42, + "grad_norm": 0.953670429295701, + "learning_rate": 1.2983825741948564e-05, + "loss": 0.5506, + "step": 5186 + }, + { + "epoch": 0.42, + "grad_norm": 0.9522694105450515, + "learning_rate": 1.2981313084836514e-05, + "loss": 0.5054, + "step": 5187 + }, + { + "epoch": 0.42, + "grad_norm": 0.8276032219953149, + "learning_rate": 1.2978800221122112e-05, + "loss": 0.5091, + "step": 5188 + }, + { + "epoch": 0.42, + "grad_norm": 0.915831507762252, + "learning_rate": 1.2976287150979497e-05, + "loss": 0.5395, + "step": 5189 + }, + { + "epoch": 0.42, + "grad_norm": 0.89869095528428, + "learning_rate": 1.297377387458282e-05, + "loss": 0.6037, + "step": 5190 + }, + { + "epoch": 0.42, + "grad_norm": 0.8638883544541025, + "learning_rate": 1.2971260392106255e-05, + "loss": 0.5695, + "step": 5191 + }, + { + "epoch": 0.42, + "grad_norm": 0.8361552108274168, + "learning_rate": 1.296874670372398e-05, + "loss": 0.4965, + "step": 5192 + }, + { + "epoch": 0.42, + "grad_norm": 0.78892219953772, + "learning_rate": 1.2966232809610189e-05, + "loss": 0.459, + "step": 5193 + }, + { + "epoch": 0.42, + "grad_norm": 1.0306619602849194, + "learning_rate": 1.2963718709939098e-05, + "loss": 0.6183, + "step": 5194 + }, + { + "epoch": 0.42, + "grad_norm": 0.9698810524692302, + "learning_rate": 1.2961204404884928e-05, + "loss": 0.6101, + "step": 5195 + }, + { + "epoch": 0.42, + "grad_norm": 0.8714946620693301, + "learning_rate": 1.2958689894621918e-05, + "loss": 0.5364, + "step": 5196 + }, + { + "epoch": 0.42, + "grad_norm": 0.9102783335520639, + "learning_rate": 1.2956175179324323e-05, + "loss": 0.5288, + "step": 5197 + }, + { + "epoch": 0.42, + "grad_norm": 0.9513914469151453, + "learning_rate": 1.2953660259166413e-05, + "loss": 0.5882, + "step": 5198 + }, + { + "epoch": 0.42, + "grad_norm": 0.9080645066236228, + "learning_rate": 1.2951145134322465e-05, + "loss": 0.5207, + "step": 5199 + }, + { + "epoch": 0.42, + "grad_norm": 0.9749830303692215, + "learning_rate": 1.2948629804966776e-05, + "loss": 0.5629, + "step": 5200 + }, + { + "epoch": 0.42, + "grad_norm": 1.0249364407270003, + "learning_rate": 1.294611427127366e-05, + "loss": 0.5952, + "step": 5201 + }, + { + "epoch": 0.42, + "grad_norm": 0.9204881999222911, + "learning_rate": 1.2943598533417437e-05, + "loss": 0.5396, + "step": 5202 + }, + { + "epoch": 0.42, + "grad_norm": 0.8352575903111814, + "learning_rate": 1.2941082591572443e-05, + "loss": 0.494, + "step": 5203 + }, + { + "epoch": 0.42, + "grad_norm": 1.0853393787202654, + "learning_rate": 1.2938566445913037e-05, + "loss": 0.6347, + "step": 5204 + }, + { + "epoch": 0.42, + "grad_norm": 0.9185748013824929, + "learning_rate": 1.2936050096613584e-05, + "loss": 0.5317, + "step": 5205 + }, + { + "epoch": 0.42, + "grad_norm": 0.9245617764417169, + "learning_rate": 1.2933533543848462e-05, + "loss": 0.5411, + "step": 5206 + }, + { + "epoch": 0.42, + "grad_norm": 1.0003230199061517, + "learning_rate": 1.2931016787792069e-05, + "loss": 0.5995, + "step": 5207 + }, + { + "epoch": 0.42, + "grad_norm": 0.9833957390509364, + "learning_rate": 1.292849982861881e-05, + "loss": 0.604, + "step": 5208 + }, + { + "epoch": 0.42, + "grad_norm": 0.9909115252326485, + "learning_rate": 1.2925982666503111e-05, + "loss": 0.5742, + "step": 5209 + }, + { + "epoch": 0.42, + "grad_norm": 0.9407003727079608, + "learning_rate": 1.2923465301619408e-05, + "loss": 0.5392, + "step": 5210 + }, + { + "epoch": 0.42, + "grad_norm": 0.9458924993894788, + "learning_rate": 1.2920947734142155e-05, + "loss": 0.5189, + "step": 5211 + }, + { + "epoch": 0.42, + "grad_norm": 0.9561205767333625, + "learning_rate": 1.2918429964245813e-05, + "loss": 0.5466, + "step": 5212 + }, + { + "epoch": 0.42, + "grad_norm": 0.8589889002194464, + "learning_rate": 1.2915911992104864e-05, + "loss": 0.5233, + "step": 5213 + }, + { + "epoch": 0.42, + "grad_norm": 0.8304101043631497, + "learning_rate": 1.2913393817893803e-05, + "loss": 0.4897, + "step": 5214 + }, + { + "epoch": 0.42, + "grad_norm": 0.9015121090208729, + "learning_rate": 1.291087544178713e-05, + "loss": 0.5162, + "step": 5215 + }, + { + "epoch": 0.42, + "grad_norm": 0.8932814665787939, + "learning_rate": 1.2908356863959372e-05, + "loss": 0.5288, + "step": 5216 + }, + { + "epoch": 0.42, + "grad_norm": 0.8710258804583972, + "learning_rate": 1.2905838084585066e-05, + "loss": 0.4831, + "step": 5217 + }, + { + "epoch": 0.42, + "grad_norm": 0.8989285859062396, + "learning_rate": 1.2903319103838756e-05, + "loss": 0.5019, + "step": 5218 + }, + { + "epoch": 0.42, + "grad_norm": 0.9187626899309298, + "learning_rate": 1.2900799921895004e-05, + "loss": 0.5805, + "step": 5219 + }, + { + "epoch": 0.42, + "grad_norm": 0.9328702855013249, + "learning_rate": 1.2898280538928396e-05, + "loss": 0.5927, + "step": 5220 + }, + { + "epoch": 0.42, + "grad_norm": 0.9985233117601223, + "learning_rate": 1.2895760955113514e-05, + "loss": 0.5583, + "step": 5221 + }, + { + "epoch": 0.42, + "grad_norm": 0.9419826483791731, + "learning_rate": 1.2893241170624968e-05, + "loss": 0.5928, + "step": 5222 + }, + { + "epoch": 0.42, + "grad_norm": 0.9519831626411481, + "learning_rate": 1.2890721185637376e-05, + "loss": 0.5869, + "step": 5223 + }, + { + "epoch": 0.42, + "grad_norm": 0.9777805383448769, + "learning_rate": 1.2888201000325368e-05, + "loss": 0.5647, + "step": 5224 + }, + { + "epoch": 0.42, + "grad_norm": 0.8718493503514608, + "learning_rate": 1.2885680614863591e-05, + "loss": 0.532, + "step": 5225 + }, + { + "epoch": 0.42, + "grad_norm": 0.9404180060319389, + "learning_rate": 1.2883160029426712e-05, + "loss": 0.5258, + "step": 5226 + }, + { + "epoch": 0.42, + "grad_norm": 0.9516912980108595, + "learning_rate": 1.2880639244189397e-05, + "loss": 0.5553, + "step": 5227 + }, + { + "epoch": 0.42, + "grad_norm": 0.920813450561122, + "learning_rate": 1.2878118259326335e-05, + "loss": 0.5373, + "step": 5228 + }, + { + "epoch": 0.42, + "grad_norm": 0.8585172444562801, + "learning_rate": 1.2875597075012236e-05, + "loss": 0.5208, + "step": 5229 + }, + { + "epoch": 0.43, + "grad_norm": 0.8584243934134526, + "learning_rate": 1.2873075691421808e-05, + "loss": 0.4836, + "step": 5230 + }, + { + "epoch": 0.43, + "grad_norm": 0.8292944955247089, + "learning_rate": 1.2870554108729783e-05, + "loss": 0.5396, + "step": 5231 + }, + { + "epoch": 0.43, + "grad_norm": 0.8223913490116747, + "learning_rate": 1.2868032327110904e-05, + "loss": 0.4598, + "step": 5232 + }, + { + "epoch": 0.43, + "grad_norm": 0.8055596776849586, + "learning_rate": 1.2865510346739928e-05, + "loss": 0.5109, + "step": 5233 + }, + { + "epoch": 0.43, + "grad_norm": 0.817052218239722, + "learning_rate": 1.2862988167791627e-05, + "loss": 0.5415, + "step": 5234 + }, + { + "epoch": 0.43, + "grad_norm": 1.0810268067366446, + "learning_rate": 1.2860465790440788e-05, + "loss": 0.5753, + "step": 5235 + }, + { + "epoch": 0.43, + "grad_norm": 0.8402617458059244, + "learning_rate": 1.2857943214862205e-05, + "loss": 0.4776, + "step": 5236 + }, + { + "epoch": 0.43, + "grad_norm": 0.8276994257844174, + "learning_rate": 1.285542044123069e-05, + "loss": 0.5087, + "step": 5237 + }, + { + "epoch": 0.43, + "grad_norm": 0.8298950139953116, + "learning_rate": 1.2852897469721074e-05, + "loss": 0.5485, + "step": 5238 + }, + { + "epoch": 0.43, + "grad_norm": 0.9031430098759592, + "learning_rate": 1.2850374300508195e-05, + "loss": 0.5088, + "step": 5239 + }, + { + "epoch": 0.43, + "grad_norm": 0.950323481245847, + "learning_rate": 1.2847850933766901e-05, + "loss": 0.5373, + "step": 5240 + }, + { + "epoch": 0.43, + "grad_norm": 1.005940982880705, + "learning_rate": 1.2845327369672069e-05, + "loss": 0.6226, + "step": 5241 + }, + { + "epoch": 0.43, + "grad_norm": 1.0628170200163112, + "learning_rate": 1.2842803608398568e-05, + "loss": 0.5751, + "step": 5242 + }, + { + "epoch": 0.43, + "grad_norm": 0.9088578999699106, + "learning_rate": 1.2840279650121301e-05, + "loss": 0.5189, + "step": 5243 + }, + { + "epoch": 0.43, + "grad_norm": 0.9250960010536801, + "learning_rate": 1.2837755495015176e-05, + "loss": 0.5714, + "step": 5244 + }, + { + "epoch": 0.43, + "grad_norm": 0.9393467217764105, + "learning_rate": 1.283523114325511e-05, + "loss": 0.546, + "step": 5245 + }, + { + "epoch": 0.43, + "grad_norm": 1.0273850999637157, + "learning_rate": 1.283270659501604e-05, + "loss": 0.5864, + "step": 5246 + }, + { + "epoch": 0.43, + "grad_norm": 0.8594473911869244, + "learning_rate": 1.2830181850472918e-05, + "loss": 0.5368, + "step": 5247 + }, + { + "epoch": 0.43, + "grad_norm": 0.8807421155459568, + "learning_rate": 1.2827656909800701e-05, + "loss": 0.5712, + "step": 5248 + }, + { + "epoch": 0.43, + "grad_norm": 0.8545283767021522, + "learning_rate": 1.2825131773174371e-05, + "loss": 0.5173, + "step": 5249 + }, + { + "epoch": 0.43, + "grad_norm": 0.8419064676592511, + "learning_rate": 1.2822606440768911e-05, + "loss": 0.5258, + "step": 5250 + }, + { + "epoch": 0.43, + "grad_norm": 0.8260318829592953, + "learning_rate": 1.2820080912759334e-05, + "loss": 0.4934, + "step": 5251 + }, + { + "epoch": 0.43, + "grad_norm": 0.9071781523737736, + "learning_rate": 1.2817555189320647e-05, + "loss": 0.5202, + "step": 5252 + }, + { + "epoch": 0.43, + "grad_norm": 0.9027757793662681, + "learning_rate": 1.2815029270627885e-05, + "loss": 0.5303, + "step": 5253 + }, + { + "epoch": 0.43, + "grad_norm": 1.0212430197446423, + "learning_rate": 1.2812503156856093e-05, + "loss": 0.6029, + "step": 5254 + }, + { + "epoch": 0.43, + "grad_norm": 0.8681516837710173, + "learning_rate": 1.2809976848180328e-05, + "loss": 0.5677, + "step": 5255 + }, + { + "epoch": 0.43, + "grad_norm": 0.93728669923876, + "learning_rate": 1.2807450344775656e-05, + "loss": 0.5703, + "step": 5256 + }, + { + "epoch": 0.43, + "grad_norm": 0.8472921948892896, + "learning_rate": 1.2804923646817169e-05, + "loss": 0.4963, + "step": 5257 + }, + { + "epoch": 0.43, + "grad_norm": 0.9384456367903224, + "learning_rate": 1.2802396754479958e-05, + "loss": 0.5729, + "step": 5258 + }, + { + "epoch": 0.43, + "grad_norm": 0.9159751306971758, + "learning_rate": 1.279986966793914e-05, + "loss": 0.5178, + "step": 5259 + }, + { + "epoch": 0.43, + "grad_norm": 0.9505984994518845, + "learning_rate": 1.2797342387369837e-05, + "loss": 0.5697, + "step": 5260 + }, + { + "epoch": 0.43, + "grad_norm": 0.8650825044597112, + "learning_rate": 1.279481491294719e-05, + "loss": 0.5482, + "step": 5261 + }, + { + "epoch": 0.43, + "grad_norm": 0.9432578608303249, + "learning_rate": 1.2792287244846345e-05, + "loss": 0.5622, + "step": 5262 + }, + { + "epoch": 0.43, + "grad_norm": 0.9072994162122723, + "learning_rate": 1.2789759383242471e-05, + "loss": 0.5521, + "step": 5263 + }, + { + "epoch": 0.43, + "grad_norm": 0.927937356858074, + "learning_rate": 1.2787231328310744e-05, + "loss": 0.5355, + "step": 5264 + }, + { + "epoch": 0.43, + "grad_norm": 0.8077970985617918, + "learning_rate": 1.2784703080226364e-05, + "loss": 0.5006, + "step": 5265 + }, + { + "epoch": 0.43, + "grad_norm": 0.9341490586832795, + "learning_rate": 1.2782174639164528e-05, + "loss": 0.5911, + "step": 5266 + }, + { + "epoch": 0.43, + "grad_norm": 0.8463801416391185, + "learning_rate": 1.2779646005300457e-05, + "loss": 0.5753, + "step": 5267 + }, + { + "epoch": 0.43, + "grad_norm": 0.8251393824383413, + "learning_rate": 1.2777117178809383e-05, + "loss": 0.52, + "step": 5268 + }, + { + "epoch": 0.43, + "grad_norm": 0.9756413025958026, + "learning_rate": 1.2774588159866554e-05, + "loss": 0.5064, + "step": 5269 + }, + { + "epoch": 0.43, + "grad_norm": 0.9333713823028544, + "learning_rate": 1.2772058948647224e-05, + "loss": 0.5589, + "step": 5270 + }, + { + "epoch": 0.43, + "grad_norm": 0.8904186468309239, + "learning_rate": 1.2769529545326669e-05, + "loss": 0.5401, + "step": 5271 + }, + { + "epoch": 0.43, + "grad_norm": 0.7971808339481724, + "learning_rate": 1.2766999950080172e-05, + "loss": 0.5002, + "step": 5272 + }, + { + "epoch": 0.43, + "grad_norm": 0.9132419627921222, + "learning_rate": 1.2764470163083034e-05, + "loss": 0.5205, + "step": 5273 + }, + { + "epoch": 0.43, + "grad_norm": 0.9284686013582293, + "learning_rate": 1.2761940184510564e-05, + "loss": 0.6319, + "step": 5274 + }, + { + "epoch": 0.43, + "grad_norm": 0.8241780076529768, + "learning_rate": 1.2759410014538092e-05, + "loss": 0.5005, + "step": 5275 + }, + { + "epoch": 0.43, + "grad_norm": 0.9026954517606373, + "learning_rate": 1.275687965334095e-05, + "loss": 0.503, + "step": 5276 + }, + { + "epoch": 0.43, + "grad_norm": 0.8621089733539753, + "learning_rate": 1.2754349101094493e-05, + "loss": 0.5055, + "step": 5277 + }, + { + "epoch": 0.43, + "grad_norm": 0.876532065494681, + "learning_rate": 1.2751818357974092e-05, + "loss": 0.5718, + "step": 5278 + }, + { + "epoch": 0.43, + "grad_norm": 0.9111254296857755, + "learning_rate": 1.2749287424155114e-05, + "loss": 0.5633, + "step": 5279 + }, + { + "epoch": 0.43, + "grad_norm": 0.8105020558905384, + "learning_rate": 1.2746756299812959e-05, + "loss": 0.5318, + "step": 5280 + }, + { + "epoch": 0.43, + "grad_norm": 0.8516231862863062, + "learning_rate": 1.2744224985123031e-05, + "loss": 0.5428, + "step": 5281 + }, + { + "epoch": 0.43, + "grad_norm": 0.89583325399951, + "learning_rate": 1.2741693480260742e-05, + "loss": 0.56, + "step": 5282 + }, + { + "epoch": 0.43, + "grad_norm": 0.9014276458558959, + "learning_rate": 1.2739161785401525e-05, + "loss": 0.5501, + "step": 5283 + }, + { + "epoch": 0.43, + "grad_norm": 0.9709594700786953, + "learning_rate": 1.2736629900720832e-05, + "loss": 0.6068, + "step": 5284 + }, + { + "epoch": 0.43, + "grad_norm": 0.8386333307614217, + "learning_rate": 1.273409782639411e-05, + "loss": 0.5139, + "step": 5285 + }, + { + "epoch": 0.43, + "grad_norm": 0.9843303001964966, + "learning_rate": 1.2731565562596833e-05, + "loss": 0.5859, + "step": 5286 + }, + { + "epoch": 0.43, + "grad_norm": 0.9895222868007922, + "learning_rate": 1.2729033109504489e-05, + "loss": 0.5086, + "step": 5287 + }, + { + "epoch": 0.43, + "grad_norm": 0.8602660281687846, + "learning_rate": 1.2726500467292569e-05, + "loss": 0.5303, + "step": 5288 + }, + { + "epoch": 0.43, + "grad_norm": 0.9087743403337459, + "learning_rate": 1.2723967636136582e-05, + "loss": 0.5649, + "step": 5289 + }, + { + "epoch": 0.43, + "grad_norm": 0.841464399939494, + "learning_rate": 1.272143461621206e-05, + "loss": 0.5266, + "step": 5290 + }, + { + "epoch": 0.43, + "grad_norm": 0.945482006218386, + "learning_rate": 1.2718901407694529e-05, + "loss": 0.5942, + "step": 5291 + }, + { + "epoch": 0.43, + "grad_norm": 1.0264055245686943, + "learning_rate": 1.2716368010759541e-05, + "loss": 0.625, + "step": 5292 + }, + { + "epoch": 0.43, + "grad_norm": 0.8925968532334795, + "learning_rate": 1.2713834425582665e-05, + "loss": 0.5599, + "step": 5293 + }, + { + "epoch": 0.43, + "grad_norm": 0.9076586798006714, + "learning_rate": 1.2711300652339466e-05, + "loss": 0.5792, + "step": 5294 + }, + { + "epoch": 0.43, + "grad_norm": 0.8088359146950926, + "learning_rate": 1.2708766691205536e-05, + "loss": 0.542, + "step": 5295 + }, + { + "epoch": 0.43, + "grad_norm": 0.8453489931750089, + "learning_rate": 1.270623254235648e-05, + "loss": 0.4916, + "step": 5296 + }, + { + "epoch": 0.43, + "grad_norm": 0.8287437918320928, + "learning_rate": 1.2703698205967907e-05, + "loss": 0.4938, + "step": 5297 + }, + { + "epoch": 0.43, + "grad_norm": 1.0493865520195256, + "learning_rate": 1.2701163682215447e-05, + "loss": 0.5401, + "step": 5298 + }, + { + "epoch": 0.43, + "grad_norm": 0.8536139904524773, + "learning_rate": 1.2698628971274743e-05, + "loss": 0.5139, + "step": 5299 + }, + { + "epoch": 0.43, + "grad_norm": 0.89012269240296, + "learning_rate": 1.269609407332144e-05, + "loss": 0.5295, + "step": 5300 + }, + { + "epoch": 0.43, + "grad_norm": 0.9291127537817483, + "learning_rate": 1.2693558988531209e-05, + "loss": 0.5729, + "step": 5301 + }, + { + "epoch": 0.43, + "grad_norm": 0.9126294441308344, + "learning_rate": 1.2691023717079735e-05, + "loss": 0.5836, + "step": 5302 + }, + { + "epoch": 0.43, + "grad_norm": 0.9426586461545383, + "learning_rate": 1.26884882591427e-05, + "loss": 0.5436, + "step": 5303 + }, + { + "epoch": 0.43, + "grad_norm": 0.9323610742444876, + "learning_rate": 1.2685952614895813e-05, + "loss": 0.5832, + "step": 5304 + }, + { + "epoch": 0.43, + "grad_norm": 0.8529103894723162, + "learning_rate": 1.2683416784514796e-05, + "loss": 0.6131, + "step": 5305 + }, + { + "epoch": 0.43, + "grad_norm": 0.8558498414608312, + "learning_rate": 1.2680880768175372e-05, + "loss": 0.5317, + "step": 5306 + }, + { + "epoch": 0.43, + "grad_norm": 0.8504492314757207, + "learning_rate": 1.267834456605329e-05, + "loss": 0.5051, + "step": 5307 + }, + { + "epoch": 0.43, + "grad_norm": 0.8464138513156245, + "learning_rate": 1.2675808178324305e-05, + "loss": 0.4996, + "step": 5308 + }, + { + "epoch": 0.43, + "grad_norm": 0.8419298654246816, + "learning_rate": 1.2673271605164189e-05, + "loss": 0.5133, + "step": 5309 + }, + { + "epoch": 0.43, + "grad_norm": 0.9322900796698717, + "learning_rate": 1.2670734846748717e-05, + "loss": 0.5407, + "step": 5310 + }, + { + "epoch": 0.43, + "grad_norm": 0.8430730919042455, + "learning_rate": 1.2668197903253694e-05, + "loss": 0.5238, + "step": 5311 + }, + { + "epoch": 0.43, + "grad_norm": 0.9579902652026588, + "learning_rate": 1.266566077485492e-05, + "loss": 0.5807, + "step": 5312 + }, + { + "epoch": 0.43, + "grad_norm": 0.9361797772639265, + "learning_rate": 1.2663123461728219e-05, + "loss": 0.5349, + "step": 5313 + }, + { + "epoch": 0.43, + "grad_norm": 0.9216298671386612, + "learning_rate": 1.2660585964049425e-05, + "loss": 0.5433, + "step": 5314 + }, + { + "epoch": 0.43, + "grad_norm": 0.895439295800641, + "learning_rate": 1.2658048281994386e-05, + "loss": 0.4908, + "step": 5315 + }, + { + "epoch": 0.43, + "grad_norm": 0.8042110357612069, + "learning_rate": 1.2655510415738954e-05, + "loss": 0.459, + "step": 5316 + }, + { + "epoch": 0.43, + "grad_norm": 0.9505226398868347, + "learning_rate": 1.2652972365459008e-05, + "loss": 0.5766, + "step": 5317 + }, + { + "epoch": 0.43, + "grad_norm": 1.0448013742685371, + "learning_rate": 1.2650434131330434e-05, + "loss": 0.6055, + "step": 5318 + }, + { + "epoch": 0.43, + "grad_norm": 0.9055457061171901, + "learning_rate": 1.2647895713529119e-05, + "loss": 0.4885, + "step": 5319 + }, + { + "epoch": 0.43, + "grad_norm": 0.9932989807705419, + "learning_rate": 1.2645357112230983e-05, + "loss": 0.5372, + "step": 5320 + }, + { + "epoch": 0.43, + "grad_norm": 0.8511979663476752, + "learning_rate": 1.2642818327611947e-05, + "loss": 0.4758, + "step": 5321 + }, + { + "epoch": 0.43, + "grad_norm": 0.8423055609806502, + "learning_rate": 1.2640279359847942e-05, + "loss": 0.4833, + "step": 5322 + }, + { + "epoch": 0.43, + "grad_norm": 0.909840172729127, + "learning_rate": 1.2637740209114918e-05, + "loss": 0.5762, + "step": 5323 + }, + { + "epoch": 0.43, + "grad_norm": 0.8666441465222685, + "learning_rate": 1.2635200875588843e-05, + "loss": 0.5552, + "step": 5324 + }, + { + "epoch": 0.43, + "grad_norm": 0.8048930938774788, + "learning_rate": 1.2632661359445682e-05, + "loss": 0.4874, + "step": 5325 + }, + { + "epoch": 0.43, + "grad_norm": 0.8658373056426494, + "learning_rate": 1.2630121660861421e-05, + "loss": 0.5751, + "step": 5326 + }, + { + "epoch": 0.43, + "grad_norm": 0.873960183814025, + "learning_rate": 1.2627581780012066e-05, + "loss": 0.5404, + "step": 5327 + }, + { + "epoch": 0.43, + "grad_norm": 0.8694573208813933, + "learning_rate": 1.2625041717073623e-05, + "loss": 0.5388, + "step": 5328 + }, + { + "epoch": 0.43, + "grad_norm": 0.8367705185934514, + "learning_rate": 1.2622501472222116e-05, + "loss": 0.4857, + "step": 5329 + }, + { + "epoch": 0.43, + "grad_norm": 1.021461286000572, + "learning_rate": 1.2619961045633584e-05, + "loss": 0.5045, + "step": 5330 + }, + { + "epoch": 0.43, + "grad_norm": 0.9150106164172199, + "learning_rate": 1.2617420437484076e-05, + "loss": 0.5622, + "step": 5331 + }, + { + "epoch": 0.43, + "grad_norm": 0.9566331795578452, + "learning_rate": 1.2614879647949652e-05, + "loss": 0.5835, + "step": 5332 + }, + { + "epoch": 0.43, + "grad_norm": 0.860789775611763, + "learning_rate": 1.261233867720639e-05, + "loss": 0.5326, + "step": 5333 + }, + { + "epoch": 0.43, + "grad_norm": 0.8448972384896664, + "learning_rate": 1.2609797525430374e-05, + "loss": 0.4775, + "step": 5334 + }, + { + "epoch": 0.43, + "grad_norm": 0.847748902308439, + "learning_rate": 1.2607256192797702e-05, + "loss": 0.5381, + "step": 5335 + }, + { + "epoch": 0.43, + "grad_norm": 0.8675705368591691, + "learning_rate": 1.260471467948449e-05, + "loss": 0.4952, + "step": 5336 + }, + { + "epoch": 0.43, + "grad_norm": 0.8005689733352995, + "learning_rate": 1.2602172985666863e-05, + "loss": 0.553, + "step": 5337 + }, + { + "epoch": 0.43, + "grad_norm": 1.074567676112369, + "learning_rate": 1.2599631111520956e-05, + "loss": 0.4664, + "step": 5338 + }, + { + "epoch": 0.43, + "grad_norm": 0.8500137656374482, + "learning_rate": 1.2597089057222915e-05, + "loss": 0.5165, + "step": 5339 + }, + { + "epoch": 0.43, + "grad_norm": 0.9387099432086206, + "learning_rate": 1.2594546822948909e-05, + "loss": 0.549, + "step": 5340 + }, + { + "epoch": 0.43, + "grad_norm": 0.8832865858693016, + "learning_rate": 1.259200440887511e-05, + "loss": 0.5153, + "step": 5341 + }, + { + "epoch": 0.43, + "grad_norm": 0.8929926334581008, + "learning_rate": 1.2589461815177702e-05, + "loss": 0.4826, + "step": 5342 + }, + { + "epoch": 0.43, + "grad_norm": 1.0259069016436662, + "learning_rate": 1.2586919042032889e-05, + "loss": 0.5732, + "step": 5343 + }, + { + "epoch": 0.43, + "grad_norm": 1.0308709360387325, + "learning_rate": 1.258437608961688e-05, + "loss": 0.555, + "step": 5344 + }, + { + "epoch": 0.43, + "grad_norm": 0.9248441205920206, + "learning_rate": 1.2581832958105902e-05, + "loss": 0.5604, + "step": 5345 + }, + { + "epoch": 0.43, + "grad_norm": 1.0011778449614308, + "learning_rate": 1.257928964767619e-05, + "loss": 0.5902, + "step": 5346 + }, + { + "epoch": 0.43, + "grad_norm": 0.9170889642768937, + "learning_rate": 1.2576746158503992e-05, + "loss": 0.5381, + "step": 5347 + }, + { + "epoch": 0.43, + "grad_norm": 0.8797931764427254, + "learning_rate": 1.257420249076557e-05, + "loss": 0.5294, + "step": 5348 + }, + { + "epoch": 0.43, + "grad_norm": 0.835885065953182, + "learning_rate": 1.25716586446372e-05, + "loss": 0.4715, + "step": 5349 + }, + { + "epoch": 0.43, + "grad_norm": 0.8538701496862996, + "learning_rate": 1.2569114620295166e-05, + "loss": 0.4901, + "step": 5350 + }, + { + "epoch": 0.43, + "grad_norm": 0.9695748609946032, + "learning_rate": 1.2566570417915769e-05, + "loss": 0.6038, + "step": 5351 + }, + { + "epoch": 0.43, + "grad_norm": 0.9125029822928001, + "learning_rate": 1.2564026037675317e-05, + "loss": 0.5378, + "step": 5352 + }, + { + "epoch": 0.44, + "grad_norm": 0.8841389114546176, + "learning_rate": 1.2561481479750135e-05, + "loss": 0.5214, + "step": 5353 + }, + { + "epoch": 0.44, + "grad_norm": 0.9362021878627022, + "learning_rate": 1.2558936744316561e-05, + "loss": 0.5825, + "step": 5354 + }, + { + "epoch": 0.44, + "grad_norm": 0.917462682069863, + "learning_rate": 1.2556391831550938e-05, + "loss": 0.5607, + "step": 5355 + }, + { + "epoch": 0.44, + "grad_norm": 0.9527731739824842, + "learning_rate": 1.255384674162963e-05, + "loss": 0.5483, + "step": 5356 + }, + { + "epoch": 0.44, + "grad_norm": 0.9007801694808039, + "learning_rate": 1.2551301474729008e-05, + "loss": 0.5005, + "step": 5357 + }, + { + "epoch": 0.44, + "grad_norm": 0.8961448090023717, + "learning_rate": 1.2548756031025455e-05, + "loss": 0.5556, + "step": 5358 + }, + { + "epoch": 0.44, + "grad_norm": 0.999360309272564, + "learning_rate": 1.254621041069537e-05, + "loss": 0.5881, + "step": 5359 + }, + { + "epoch": 0.44, + "grad_norm": 0.8481061045768453, + "learning_rate": 1.2543664613915165e-05, + "loss": 0.4995, + "step": 5360 + }, + { + "epoch": 0.44, + "grad_norm": 0.947731835355656, + "learning_rate": 1.2541118640861255e-05, + "loss": 0.539, + "step": 5361 + }, + { + "epoch": 0.44, + "grad_norm": 0.8498245240875937, + "learning_rate": 1.2538572491710079e-05, + "loss": 0.5049, + "step": 5362 + }, + { + "epoch": 0.44, + "grad_norm": 0.769815234398063, + "learning_rate": 1.2536026166638082e-05, + "loss": 0.4527, + "step": 5363 + }, + { + "epoch": 0.44, + "grad_norm": 0.9020547480424845, + "learning_rate": 1.2533479665821719e-05, + "loss": 0.5664, + "step": 5364 + }, + { + "epoch": 0.44, + "grad_norm": 0.9948695645265985, + "learning_rate": 1.2530932989437463e-05, + "loss": 0.5468, + "step": 5365 + }, + { + "epoch": 0.44, + "grad_norm": 0.8738954752780324, + "learning_rate": 1.2528386137661797e-05, + "loss": 0.5222, + "step": 5366 + }, + { + "epoch": 0.44, + "grad_norm": 0.9995522905076301, + "learning_rate": 1.2525839110671212e-05, + "loss": 0.6052, + "step": 5367 + }, + { + "epoch": 0.44, + "grad_norm": 1.0371681322258024, + "learning_rate": 1.2523291908642219e-05, + "loss": 0.5546, + "step": 5368 + }, + { + "epoch": 0.44, + "grad_norm": 0.8721378928964585, + "learning_rate": 1.2520744531751334e-05, + "loss": 0.51, + "step": 5369 + }, + { + "epoch": 0.44, + "grad_norm": 0.8634386973360564, + "learning_rate": 1.251819698017509e-05, + "loss": 0.5024, + "step": 5370 + }, + { + "epoch": 0.44, + "grad_norm": 0.9503165783798369, + "learning_rate": 1.2515649254090025e-05, + "loss": 0.5597, + "step": 5371 + }, + { + "epoch": 0.44, + "grad_norm": 0.8720813638454793, + "learning_rate": 1.2513101353672703e-05, + "loss": 0.5894, + "step": 5372 + }, + { + "epoch": 0.44, + "grad_norm": 0.9695051345654175, + "learning_rate": 1.2510553279099684e-05, + "loss": 0.5444, + "step": 5373 + }, + { + "epoch": 0.44, + "grad_norm": 0.8972858005853297, + "learning_rate": 1.250800503054755e-05, + "loss": 0.556, + "step": 5374 + }, + { + "epoch": 0.44, + "grad_norm": 0.8021491655858767, + "learning_rate": 1.2505456608192889e-05, + "loss": 0.5289, + "step": 5375 + }, + { + "epoch": 0.44, + "grad_norm": 0.9734012631123989, + "learning_rate": 1.2502908012212313e-05, + "loss": 0.5637, + "step": 5376 + }, + { + "epoch": 0.44, + "grad_norm": 0.9441552924743251, + "learning_rate": 1.2500359242782429e-05, + "loss": 0.6188, + "step": 5377 + }, + { + "epoch": 0.44, + "grad_norm": 0.899529468266094, + "learning_rate": 1.2497810300079866e-05, + "loss": 0.5009, + "step": 5378 + }, + { + "epoch": 0.44, + "grad_norm": 0.8728144939932249, + "learning_rate": 1.249526118428127e-05, + "loss": 0.5664, + "step": 5379 + }, + { + "epoch": 0.44, + "grad_norm": 0.8267144567594343, + "learning_rate": 1.2492711895563281e-05, + "loss": 0.5369, + "step": 5380 + }, + { + "epoch": 0.44, + "grad_norm": 0.873087845840797, + "learning_rate": 1.249016243410257e-05, + "loss": 0.5279, + "step": 5381 + }, + { + "epoch": 0.44, + "grad_norm": 0.858248198037963, + "learning_rate": 1.2487612800075814e-05, + "loss": 0.5452, + "step": 5382 + }, + { + "epoch": 0.44, + "grad_norm": 0.9861992051149057, + "learning_rate": 1.2485062993659696e-05, + "loss": 0.5358, + "step": 5383 + }, + { + "epoch": 0.44, + "grad_norm": 0.8845727809369965, + "learning_rate": 1.2482513015030915e-05, + "loss": 0.4948, + "step": 5384 + }, + { + "epoch": 0.44, + "grad_norm": 0.9039360878072947, + "learning_rate": 1.2479962864366186e-05, + "loss": 0.5673, + "step": 5385 + }, + { + "epoch": 0.44, + "grad_norm": 0.8615365882769283, + "learning_rate": 1.2477412541842231e-05, + "loss": 0.5202, + "step": 5386 + }, + { + "epoch": 0.44, + "grad_norm": 0.7911663726867374, + "learning_rate": 1.247486204763578e-05, + "loss": 0.4393, + "step": 5387 + }, + { + "epoch": 0.44, + "grad_norm": 0.9019516455908042, + "learning_rate": 1.247231138192359e-05, + "loss": 0.5104, + "step": 5388 + }, + { + "epoch": 0.44, + "grad_norm": 0.9517601590994649, + "learning_rate": 1.246976054488241e-05, + "loss": 0.5931, + "step": 5389 + }, + { + "epoch": 0.44, + "grad_norm": 0.8783075827940406, + "learning_rate": 1.2467209536689016e-05, + "loss": 0.512, + "step": 5390 + }, + { + "epoch": 0.44, + "grad_norm": 0.9801083493781189, + "learning_rate": 1.2464658357520192e-05, + "loss": 0.5946, + "step": 5391 + }, + { + "epoch": 0.44, + "grad_norm": 0.9940969062462093, + "learning_rate": 1.2462107007552726e-05, + "loss": 0.6072, + "step": 5392 + }, + { + "epoch": 0.44, + "grad_norm": 0.9240346316696333, + "learning_rate": 1.2459555486963431e-05, + "loss": 0.5583, + "step": 5393 + }, + { + "epoch": 0.44, + "grad_norm": 0.9381190189854962, + "learning_rate": 1.2457003795929121e-05, + "loss": 0.6326, + "step": 5394 + }, + { + "epoch": 0.44, + "grad_norm": 1.0090297957220815, + "learning_rate": 1.2454451934626628e-05, + "loss": 0.5168, + "step": 5395 + }, + { + "epoch": 0.44, + "grad_norm": 0.833215564045747, + "learning_rate": 1.2451899903232793e-05, + "loss": 0.5483, + "step": 5396 + }, + { + "epoch": 0.44, + "grad_norm": 0.9284695649603013, + "learning_rate": 1.244934770192447e-05, + "loss": 0.596, + "step": 5397 + }, + { + "epoch": 0.44, + "grad_norm": 0.8845855956442529, + "learning_rate": 1.2446795330878522e-05, + "loss": 0.5107, + "step": 5398 + }, + { + "epoch": 0.44, + "grad_norm": 1.002581956309397, + "learning_rate": 1.244424279027183e-05, + "loss": 0.493, + "step": 5399 + }, + { + "epoch": 0.44, + "grad_norm": 0.8829187422959885, + "learning_rate": 1.244169008028128e-05, + "loss": 0.4969, + "step": 5400 + }, + { + "epoch": 0.44, + "grad_norm": 0.9295869401823778, + "learning_rate": 1.2439137201083772e-05, + "loss": 0.5593, + "step": 5401 + }, + { + "epoch": 0.44, + "grad_norm": 0.9731943915057911, + "learning_rate": 1.243658415285622e-05, + "loss": 0.5267, + "step": 5402 + }, + { + "epoch": 0.44, + "grad_norm": 0.9063916835917486, + "learning_rate": 1.243403093577555e-05, + "loss": 0.5136, + "step": 5403 + }, + { + "epoch": 0.44, + "grad_norm": 0.865307935840072, + "learning_rate": 1.2431477550018691e-05, + "loss": 0.5174, + "step": 5404 + }, + { + "epoch": 0.44, + "grad_norm": 0.9523780286387016, + "learning_rate": 1.2428923995762597e-05, + "loss": 0.5624, + "step": 5405 + }, + { + "epoch": 0.44, + "grad_norm": 0.9867060620553869, + "learning_rate": 1.2426370273184226e-05, + "loss": 0.5949, + "step": 5406 + }, + { + "epoch": 0.44, + "grad_norm": 0.9363530752198639, + "learning_rate": 1.2423816382460544e-05, + "loss": 0.5715, + "step": 5407 + }, + { + "epoch": 0.44, + "grad_norm": 0.8694286672573258, + "learning_rate": 1.2421262323768537e-05, + "loss": 0.5034, + "step": 5408 + }, + { + "epoch": 0.44, + "grad_norm": 0.8299654192646883, + "learning_rate": 1.2418708097285202e-05, + "loss": 0.4707, + "step": 5409 + }, + { + "epoch": 0.44, + "grad_norm": 0.9104100825566318, + "learning_rate": 1.2416153703187537e-05, + "loss": 0.5178, + "step": 5410 + }, + { + "epoch": 0.44, + "grad_norm": 0.9194277608599328, + "learning_rate": 1.2413599141652565e-05, + "loss": 0.4901, + "step": 5411 + }, + { + "epoch": 0.44, + "grad_norm": 0.9041818758818694, + "learning_rate": 1.2411044412857317e-05, + "loss": 0.54, + "step": 5412 + }, + { + "epoch": 0.44, + "grad_norm": 1.016449033785623, + "learning_rate": 1.2408489516978824e-05, + "loss": 0.5814, + "step": 5413 + }, + { + "epoch": 0.44, + "grad_norm": 0.9805254090857588, + "learning_rate": 1.2405934454194146e-05, + "loss": 0.5594, + "step": 5414 + }, + { + "epoch": 0.44, + "grad_norm": 0.9510567274523963, + "learning_rate": 1.2403379224680346e-05, + "loss": 0.5203, + "step": 5415 + }, + { + "epoch": 0.44, + "grad_norm": 0.9394727691546123, + "learning_rate": 1.2400823828614495e-05, + "loss": 0.5802, + "step": 5416 + }, + { + "epoch": 0.44, + "grad_norm": 0.8411449916095526, + "learning_rate": 1.2398268266173683e-05, + "loss": 0.4941, + "step": 5417 + }, + { + "epoch": 0.44, + "grad_norm": 0.975070662691036, + "learning_rate": 1.239571253753501e-05, + "loss": 0.5484, + "step": 5418 + }, + { + "epoch": 0.44, + "grad_norm": 0.9109850157859793, + "learning_rate": 1.2393156642875579e-05, + "loss": 0.4947, + "step": 5419 + }, + { + "epoch": 0.44, + "grad_norm": 0.8938766973316515, + "learning_rate": 1.2390600582372517e-05, + "loss": 0.5626, + "step": 5420 + }, + { + "epoch": 0.44, + "grad_norm": 0.9179313851288792, + "learning_rate": 1.2388044356202958e-05, + "loss": 0.572, + "step": 5421 + }, + { + "epoch": 0.44, + "grad_norm": 0.8698575621096661, + "learning_rate": 1.2385487964544038e-05, + "loss": 0.5427, + "step": 5422 + }, + { + "epoch": 0.44, + "grad_norm": 0.8987838459695859, + "learning_rate": 1.238293140757292e-05, + "loss": 0.505, + "step": 5423 + }, + { + "epoch": 0.44, + "grad_norm": 0.9269681022844656, + "learning_rate": 1.2380374685466772e-05, + "loss": 0.537, + "step": 5424 + }, + { + "epoch": 0.44, + "grad_norm": 0.9014675995828588, + "learning_rate": 1.2377817798402767e-05, + "loss": 0.4887, + "step": 5425 + }, + { + "epoch": 0.44, + "grad_norm": 0.9120011897073391, + "learning_rate": 1.2375260746558098e-05, + "loss": 0.5395, + "step": 5426 + }, + { + "epoch": 0.44, + "grad_norm": 0.934333432226395, + "learning_rate": 1.2372703530109967e-05, + "loss": 0.5583, + "step": 5427 + }, + { + "epoch": 0.44, + "grad_norm": 0.990985907641905, + "learning_rate": 1.2370146149235585e-05, + "loss": 0.5498, + "step": 5428 + }, + { + "epoch": 0.44, + "grad_norm": 0.9623084732384362, + "learning_rate": 1.2367588604112177e-05, + "loss": 0.5172, + "step": 5429 + }, + { + "epoch": 0.44, + "grad_norm": 0.838520536415113, + "learning_rate": 1.236503089491698e-05, + "loss": 0.5648, + "step": 5430 + }, + { + "epoch": 0.44, + "grad_norm": 0.8518399557849459, + "learning_rate": 1.236247302182724e-05, + "loss": 0.5575, + "step": 5431 + }, + { + "epoch": 0.44, + "grad_norm": 0.8947014247499556, + "learning_rate": 1.2359914985020212e-05, + "loss": 0.5625, + "step": 5432 + }, + { + "epoch": 0.44, + "grad_norm": 0.9468389551013499, + "learning_rate": 1.2357356784673171e-05, + "loss": 0.6043, + "step": 5433 + }, + { + "epoch": 0.44, + "grad_norm": 0.8053680832226762, + "learning_rate": 1.2354798420963396e-05, + "loss": 0.4899, + "step": 5434 + }, + { + "epoch": 0.44, + "grad_norm": 0.9694037917733627, + "learning_rate": 1.2352239894068179e-05, + "loss": 0.6325, + "step": 5435 + }, + { + "epoch": 0.44, + "grad_norm": 0.8632305139677751, + "learning_rate": 1.2349681204164823e-05, + "loss": 0.5214, + "step": 5436 + }, + { + "epoch": 0.44, + "grad_norm": 0.8563381614510859, + "learning_rate": 1.2347122351430645e-05, + "loss": 0.528, + "step": 5437 + }, + { + "epoch": 0.44, + "grad_norm": 0.8623448064496342, + "learning_rate": 1.2344563336042967e-05, + "loss": 0.5471, + "step": 5438 + }, + { + "epoch": 0.44, + "grad_norm": 0.8587003201570016, + "learning_rate": 1.2342004158179133e-05, + "loss": 0.5409, + "step": 5439 + }, + { + "epoch": 0.44, + "grad_norm": 0.8868847939450765, + "learning_rate": 1.2339444818016488e-05, + "loss": 0.5818, + "step": 5440 + }, + { + "epoch": 0.44, + "grad_norm": 0.9258515474662156, + "learning_rate": 1.233688531573239e-05, + "loss": 0.4862, + "step": 5441 + }, + { + "epoch": 0.44, + "grad_norm": 0.8677169571698846, + "learning_rate": 1.2334325651504214e-05, + "loss": 0.5151, + "step": 5442 + }, + { + "epoch": 0.44, + "grad_norm": 0.870121746459327, + "learning_rate": 1.233176582550934e-05, + "loss": 0.5004, + "step": 5443 + }, + { + "epoch": 0.44, + "grad_norm": 0.8453323949166696, + "learning_rate": 1.2329205837925162e-05, + "loss": 0.4571, + "step": 5444 + }, + { + "epoch": 0.44, + "grad_norm": 0.8392697669827083, + "learning_rate": 1.2326645688929087e-05, + "loss": 0.4784, + "step": 5445 + }, + { + "epoch": 0.44, + "grad_norm": 0.8743550979582336, + "learning_rate": 1.2324085378698529e-05, + "loss": 0.5231, + "step": 5446 + }, + { + "epoch": 0.44, + "grad_norm": 0.8375402911183744, + "learning_rate": 1.2321524907410916e-05, + "loss": 0.5185, + "step": 5447 + }, + { + "epoch": 0.44, + "grad_norm": 0.90586681912078, + "learning_rate": 1.2318964275243683e-05, + "loss": 0.5223, + "step": 5448 + }, + { + "epoch": 0.44, + "grad_norm": 0.9029561290748873, + "learning_rate": 1.2316403482374289e-05, + "loss": 0.5531, + "step": 5449 + }, + { + "epoch": 0.44, + "grad_norm": 0.9179206919093462, + "learning_rate": 1.2313842528980184e-05, + "loss": 0.5461, + "step": 5450 + }, + { + "epoch": 0.44, + "grad_norm": 0.9248238397243361, + "learning_rate": 1.2311281415238842e-05, + "loss": 0.5494, + "step": 5451 + }, + { + "epoch": 0.44, + "grad_norm": 0.8941957605560482, + "learning_rate": 1.2308720141327753e-05, + "loss": 0.5751, + "step": 5452 + }, + { + "epoch": 0.44, + "grad_norm": 0.8724227657559777, + "learning_rate": 1.2306158707424402e-05, + "loss": 0.4947, + "step": 5453 + }, + { + "epoch": 0.44, + "grad_norm": 0.9910439980193736, + "learning_rate": 1.2303597113706301e-05, + "loss": 0.5735, + "step": 5454 + }, + { + "epoch": 0.44, + "grad_norm": 0.764935426291101, + "learning_rate": 1.2301035360350964e-05, + "loss": 0.4984, + "step": 5455 + }, + { + "epoch": 0.44, + "grad_norm": 0.9037221761184959, + "learning_rate": 1.2298473447535914e-05, + "loss": 0.503, + "step": 5456 + }, + { + "epoch": 0.44, + "grad_norm": 0.8955191193698375, + "learning_rate": 1.2295911375438694e-05, + "loss": 0.5683, + "step": 5457 + }, + { + "epoch": 0.44, + "grad_norm": 0.8596766302284218, + "learning_rate": 1.2293349144236855e-05, + "loss": 0.5369, + "step": 5458 + }, + { + "epoch": 0.44, + "grad_norm": 0.9915006897890639, + "learning_rate": 1.229078675410795e-05, + "loss": 0.5778, + "step": 5459 + }, + { + "epoch": 0.44, + "grad_norm": 0.8705018308535379, + "learning_rate": 1.2288224205229557e-05, + "loss": 0.4935, + "step": 5460 + }, + { + "epoch": 0.44, + "grad_norm": 0.8800565338043739, + "learning_rate": 1.228566149777926e-05, + "loss": 0.5087, + "step": 5461 + }, + { + "epoch": 0.44, + "grad_norm": 0.9986894910833827, + "learning_rate": 1.2283098631934642e-05, + "loss": 0.5404, + "step": 5462 + }, + { + "epoch": 0.44, + "grad_norm": 0.8912962051173339, + "learning_rate": 1.2280535607873318e-05, + "loss": 0.5032, + "step": 5463 + }, + { + "epoch": 0.44, + "grad_norm": 0.8038436091058793, + "learning_rate": 1.22779724257729e-05, + "loss": 0.5227, + "step": 5464 + }, + { + "epoch": 0.44, + "grad_norm": 0.938614221484701, + "learning_rate": 1.227540908581101e-05, + "loss": 0.5578, + "step": 5465 + }, + { + "epoch": 0.44, + "grad_norm": 0.8750121548910289, + "learning_rate": 1.227284558816529e-05, + "loss": 0.5488, + "step": 5466 + }, + { + "epoch": 0.44, + "grad_norm": 0.9057698650290454, + "learning_rate": 1.2270281933013388e-05, + "loss": 0.5608, + "step": 5467 + }, + { + "epoch": 0.44, + "grad_norm": 0.8328101242823822, + "learning_rate": 1.2267718120532958e-05, + "loss": 0.5489, + "step": 5468 + }, + { + "epoch": 0.44, + "grad_norm": 0.9559423602996424, + "learning_rate": 1.2265154150901677e-05, + "loss": 0.5267, + "step": 5469 + }, + { + "epoch": 0.44, + "grad_norm": 1.0768815635170421, + "learning_rate": 1.2262590024297226e-05, + "loss": 0.576, + "step": 5470 + }, + { + "epoch": 0.44, + "grad_norm": 1.017951933254484, + "learning_rate": 1.2260025740897286e-05, + "loss": 0.5198, + "step": 5471 + }, + { + "epoch": 0.44, + "grad_norm": 0.9203291906948043, + "learning_rate": 1.225746130087957e-05, + "loss": 0.5603, + "step": 5472 + }, + { + "epoch": 0.44, + "grad_norm": 1.0045960023358238, + "learning_rate": 1.2254896704421789e-05, + "loss": 0.5984, + "step": 5473 + }, + { + "epoch": 0.44, + "grad_norm": 0.9114512728903359, + "learning_rate": 1.2252331951701665e-05, + "loss": 0.5265, + "step": 5474 + }, + { + "epoch": 0.44, + "grad_norm": 0.8752647547925406, + "learning_rate": 1.2249767042896934e-05, + "loss": 0.5302, + "step": 5475 + }, + { + "epoch": 0.45, + "grad_norm": 0.8801933966164635, + "learning_rate": 1.2247201978185346e-05, + "loss": 0.5267, + "step": 5476 + }, + { + "epoch": 0.45, + "grad_norm": 0.992872988262125, + "learning_rate": 1.224463675774465e-05, + "loss": 0.5711, + "step": 5477 + }, + { + "epoch": 0.45, + "grad_norm": 0.9604426674753421, + "learning_rate": 1.224207138175262e-05, + "loss": 0.5276, + "step": 5478 + }, + { + "epoch": 0.45, + "grad_norm": 0.8640035052898114, + "learning_rate": 1.2239505850387032e-05, + "loss": 0.618, + "step": 5479 + }, + { + "epoch": 0.45, + "grad_norm": 0.9810432720165322, + "learning_rate": 1.2236940163825675e-05, + "loss": 0.5615, + "step": 5480 + }, + { + "epoch": 0.45, + "grad_norm": 0.8961399600071736, + "learning_rate": 1.2234374322246348e-05, + "loss": 0.5542, + "step": 5481 + }, + { + "epoch": 0.45, + "grad_norm": 0.879129224347014, + "learning_rate": 1.2231808325826862e-05, + "loss": 0.548, + "step": 5482 + }, + { + "epoch": 0.45, + "grad_norm": 0.8514413978104286, + "learning_rate": 1.222924217474504e-05, + "loss": 0.5043, + "step": 5483 + }, + { + "epoch": 0.45, + "grad_norm": 1.0162949354532185, + "learning_rate": 1.2226675869178713e-05, + "loss": 0.6186, + "step": 5484 + }, + { + "epoch": 0.45, + "grad_norm": 0.9010809840353643, + "learning_rate": 1.222410940930572e-05, + "loss": 0.4572, + "step": 5485 + }, + { + "epoch": 0.45, + "grad_norm": 0.9070348230842781, + "learning_rate": 1.2221542795303921e-05, + "loss": 0.5338, + "step": 5486 + }, + { + "epoch": 0.45, + "grad_norm": 0.905376480185276, + "learning_rate": 1.2218976027351177e-05, + "loss": 0.4919, + "step": 5487 + }, + { + "epoch": 0.45, + "grad_norm": 0.9351701239895585, + "learning_rate": 1.221640910562536e-05, + "loss": 0.5618, + "step": 5488 + }, + { + "epoch": 0.45, + "grad_norm": 0.87105543303774, + "learning_rate": 1.2213842030304358e-05, + "loss": 0.5298, + "step": 5489 + }, + { + "epoch": 0.45, + "grad_norm": 1.0073370084141626, + "learning_rate": 1.221127480156607e-05, + "loss": 0.5373, + "step": 5490 + }, + { + "epoch": 0.45, + "grad_norm": 0.877935805226189, + "learning_rate": 1.2208707419588397e-05, + "loss": 0.5437, + "step": 5491 + }, + { + "epoch": 0.45, + "grad_norm": 0.9243438906000147, + "learning_rate": 1.220613988454926e-05, + "loss": 0.5324, + "step": 5492 + }, + { + "epoch": 0.45, + "grad_norm": 0.9477932541999349, + "learning_rate": 1.2203572196626587e-05, + "loss": 0.5556, + "step": 5493 + }, + { + "epoch": 0.45, + "grad_norm": 0.9184085706505207, + "learning_rate": 1.2201004355998312e-05, + "loss": 0.5626, + "step": 5494 + }, + { + "epoch": 0.45, + "grad_norm": 0.8732092279319581, + "learning_rate": 1.2198436362842389e-05, + "loss": 0.5584, + "step": 5495 + }, + { + "epoch": 0.45, + "grad_norm": 0.8818231916647266, + "learning_rate": 1.2195868217336778e-05, + "loss": 0.4974, + "step": 5496 + }, + { + "epoch": 0.45, + "grad_norm": 0.9253509177877544, + "learning_rate": 1.2193299919659444e-05, + "loss": 0.507, + "step": 5497 + }, + { + "epoch": 0.45, + "grad_norm": 0.9976868248539731, + "learning_rate": 1.2190731469988372e-05, + "loss": 0.6117, + "step": 5498 + }, + { + "epoch": 0.45, + "grad_norm": 0.8668678379356317, + "learning_rate": 1.2188162868501557e-05, + "loss": 0.5028, + "step": 5499 + }, + { + "epoch": 0.45, + "grad_norm": 0.9165726909760753, + "learning_rate": 1.2185594115376991e-05, + "loss": 0.5579, + "step": 5500 + }, + { + "epoch": 0.45, + "grad_norm": 0.9154946875925669, + "learning_rate": 1.2183025210792692e-05, + "loss": 0.5251, + "step": 5501 + }, + { + "epoch": 0.45, + "grad_norm": 0.8404918378432502, + "learning_rate": 1.218045615492668e-05, + "loss": 0.5286, + "step": 5502 + }, + { + "epoch": 0.45, + "grad_norm": 0.8712436361695308, + "learning_rate": 1.2177886947956997e-05, + "loss": 0.5304, + "step": 5503 + }, + { + "epoch": 0.45, + "grad_norm": 0.8286927110237194, + "learning_rate": 1.2175317590061676e-05, + "loss": 0.5094, + "step": 5504 + }, + { + "epoch": 0.45, + "grad_norm": 0.866565081858643, + "learning_rate": 1.2172748081418775e-05, + "loss": 0.5399, + "step": 5505 + }, + { + "epoch": 0.45, + "grad_norm": 0.9016013035316106, + "learning_rate": 1.2170178422206362e-05, + "loss": 0.5273, + "step": 5506 + }, + { + "epoch": 0.45, + "grad_norm": 0.8433212719566922, + "learning_rate": 1.2167608612602507e-05, + "loss": 0.4995, + "step": 5507 + }, + { + "epoch": 0.45, + "grad_norm": 0.8611469548274269, + "learning_rate": 1.2165038652785297e-05, + "loss": 0.5434, + "step": 5508 + }, + { + "epoch": 0.45, + "grad_norm": 0.9387736783283651, + "learning_rate": 1.2162468542932832e-05, + "loss": 0.5529, + "step": 5509 + }, + { + "epoch": 0.45, + "grad_norm": 0.7884933758412176, + "learning_rate": 1.2159898283223213e-05, + "loss": 0.4898, + "step": 5510 + }, + { + "epoch": 0.45, + "grad_norm": 0.8797535779005713, + "learning_rate": 1.2157327873834559e-05, + "loss": 0.5357, + "step": 5511 + }, + { + "epoch": 0.45, + "grad_norm": 0.7845933710931979, + "learning_rate": 1.2154757314944997e-05, + "loss": 0.4465, + "step": 5512 + }, + { + "epoch": 0.45, + "grad_norm": 0.8716534801191331, + "learning_rate": 1.2152186606732665e-05, + "loss": 0.4596, + "step": 5513 + }, + { + "epoch": 0.45, + "grad_norm": 0.8806006921198266, + "learning_rate": 1.2149615749375707e-05, + "loss": 0.5238, + "step": 5514 + }, + { + "epoch": 0.45, + "grad_norm": 0.9519784857876017, + "learning_rate": 1.2147044743052288e-05, + "loss": 0.5458, + "step": 5515 + }, + { + "epoch": 0.45, + "grad_norm": 0.8809363718552123, + "learning_rate": 1.2144473587940573e-05, + "loss": 0.5031, + "step": 5516 + }, + { + "epoch": 0.45, + "grad_norm": 0.9694116837326615, + "learning_rate": 1.2141902284218738e-05, + "loss": 0.4856, + "step": 5517 + }, + { + "epoch": 0.45, + "grad_norm": 1.005998546729219, + "learning_rate": 1.2139330832064975e-05, + "loss": 0.5647, + "step": 5518 + }, + { + "epoch": 0.45, + "grad_norm": 0.9363523110506934, + "learning_rate": 1.2136759231657485e-05, + "loss": 0.514, + "step": 5519 + }, + { + "epoch": 0.45, + "grad_norm": 0.8834684878319838, + "learning_rate": 1.2134187483174474e-05, + "loss": 0.5651, + "step": 5520 + }, + { + "epoch": 0.45, + "grad_norm": 0.9199792993437208, + "learning_rate": 1.2131615586794162e-05, + "loss": 0.5119, + "step": 5521 + }, + { + "epoch": 0.45, + "grad_norm": 0.9083253044159368, + "learning_rate": 1.2129043542694783e-05, + "loss": 0.5347, + "step": 5522 + }, + { + "epoch": 0.45, + "grad_norm": 0.9079207141814368, + "learning_rate": 1.2126471351054574e-05, + "loss": 0.5234, + "step": 5523 + }, + { + "epoch": 0.45, + "grad_norm": 0.8996213537502855, + "learning_rate": 1.2123899012051785e-05, + "loss": 0.5154, + "step": 5524 + }, + { + "epoch": 0.45, + "grad_norm": 0.8167759767195947, + "learning_rate": 1.212132652586468e-05, + "loss": 0.5262, + "step": 5525 + }, + { + "epoch": 0.45, + "grad_norm": 0.8599043700048763, + "learning_rate": 1.211875389267153e-05, + "loss": 0.5309, + "step": 5526 + }, + { + "epoch": 0.45, + "grad_norm": 0.8820657466473959, + "learning_rate": 1.211618111265061e-05, + "loss": 0.4873, + "step": 5527 + }, + { + "epoch": 0.45, + "grad_norm": 0.8665427619337003, + "learning_rate": 1.2113608185980221e-05, + "loss": 0.5374, + "step": 5528 + }, + { + "epoch": 0.45, + "grad_norm": 0.8908908247445878, + "learning_rate": 1.2111035112838657e-05, + "loss": 0.4539, + "step": 5529 + }, + { + "epoch": 0.45, + "grad_norm": 0.8837991806660707, + "learning_rate": 1.2108461893404231e-05, + "loss": 0.5449, + "step": 5530 + }, + { + "epoch": 0.45, + "grad_norm": 0.9001160427372054, + "learning_rate": 1.210588852785527e-05, + "loss": 0.608, + "step": 5531 + }, + { + "epoch": 0.45, + "grad_norm": 0.9406825613037999, + "learning_rate": 1.2103315016370098e-05, + "loss": 0.5248, + "step": 5532 + }, + { + "epoch": 0.45, + "grad_norm": 0.9492003060158064, + "learning_rate": 1.2100741359127062e-05, + "loss": 0.6211, + "step": 5533 + }, + { + "epoch": 0.45, + "grad_norm": 0.8165428959386474, + "learning_rate": 1.2098167556304514e-05, + "loss": 0.5004, + "step": 5534 + }, + { + "epoch": 0.45, + "grad_norm": 0.9547917538163686, + "learning_rate": 1.2095593608080815e-05, + "loss": 0.5788, + "step": 5535 + }, + { + "epoch": 0.45, + "grad_norm": 0.8694750633823777, + "learning_rate": 1.2093019514634337e-05, + "loss": 0.5144, + "step": 5536 + }, + { + "epoch": 0.45, + "grad_norm": 0.8783697437957985, + "learning_rate": 1.2090445276143466e-05, + "loss": 0.4806, + "step": 5537 + }, + { + "epoch": 0.45, + "grad_norm": 0.8783928877709485, + "learning_rate": 1.2087870892786588e-05, + "loss": 0.5376, + "step": 5538 + }, + { + "epoch": 0.45, + "grad_norm": 0.949269064846333, + "learning_rate": 1.208529636474211e-05, + "loss": 0.5852, + "step": 5539 + }, + { + "epoch": 0.45, + "grad_norm": 0.9646333868024378, + "learning_rate": 1.2082721692188446e-05, + "loss": 0.5591, + "step": 5540 + }, + { + "epoch": 0.45, + "grad_norm": 0.901401626566398, + "learning_rate": 1.2080146875304012e-05, + "loss": 0.5857, + "step": 5541 + }, + { + "epoch": 0.45, + "grad_norm": 0.8454219718524448, + "learning_rate": 1.2077571914267248e-05, + "loss": 0.5336, + "step": 5542 + }, + { + "epoch": 0.45, + "grad_norm": 0.8808345197786775, + "learning_rate": 1.2074996809256594e-05, + "loss": 0.5246, + "step": 5543 + }, + { + "epoch": 0.45, + "grad_norm": 0.8775532655927482, + "learning_rate": 1.2072421560450497e-05, + "loss": 0.5933, + "step": 5544 + }, + { + "epoch": 0.45, + "grad_norm": 0.9150474033054725, + "learning_rate": 1.2069846168027427e-05, + "loss": 0.5064, + "step": 5545 + }, + { + "epoch": 0.45, + "grad_norm": 0.9070112198834746, + "learning_rate": 1.2067270632165856e-05, + "loss": 0.5646, + "step": 5546 + }, + { + "epoch": 0.45, + "grad_norm": 0.8538367633178499, + "learning_rate": 1.2064694953044259e-05, + "loss": 0.5207, + "step": 5547 + }, + { + "epoch": 0.45, + "grad_norm": 0.9279054018383129, + "learning_rate": 1.2062119130841135e-05, + "loss": 0.6147, + "step": 5548 + }, + { + "epoch": 0.45, + "grad_norm": 1.0040818800527798, + "learning_rate": 1.2059543165734986e-05, + "loss": 0.6258, + "step": 5549 + }, + { + "epoch": 0.45, + "grad_norm": 0.9316998246149493, + "learning_rate": 1.2056967057904319e-05, + "loss": 0.5779, + "step": 5550 + }, + { + "epoch": 0.45, + "grad_norm": 0.9190557347896983, + "learning_rate": 1.2054390807527661e-05, + "loss": 0.5837, + "step": 5551 + }, + { + "epoch": 0.45, + "grad_norm": 0.9606163824300413, + "learning_rate": 1.2051814414783544e-05, + "loss": 0.501, + "step": 5552 + }, + { + "epoch": 0.45, + "grad_norm": 0.9640744408170316, + "learning_rate": 1.2049237879850506e-05, + "loss": 0.535, + "step": 5553 + }, + { + "epoch": 0.45, + "grad_norm": 0.8788815642734494, + "learning_rate": 1.2046661202907101e-05, + "loss": 0.4897, + "step": 5554 + }, + { + "epoch": 0.45, + "grad_norm": 0.7727154304321403, + "learning_rate": 1.2044084384131891e-05, + "loss": 0.5226, + "step": 5555 + }, + { + "epoch": 0.45, + "grad_norm": 1.0129556356194376, + "learning_rate": 1.2041507423703445e-05, + "loss": 0.6603, + "step": 5556 + }, + { + "epoch": 0.45, + "grad_norm": 0.9331842751781539, + "learning_rate": 1.2038930321800346e-05, + "loss": 0.5507, + "step": 5557 + }, + { + "epoch": 0.45, + "grad_norm": 0.9206369409583717, + "learning_rate": 1.2036353078601187e-05, + "loss": 0.5849, + "step": 5558 + }, + { + "epoch": 0.45, + "grad_norm": 0.9268985407098875, + "learning_rate": 1.2033775694284562e-05, + "loss": 0.5296, + "step": 5559 + }, + { + "epoch": 0.45, + "grad_norm": 0.8851565422818308, + "learning_rate": 1.2031198169029084e-05, + "loss": 0.5511, + "step": 5560 + }, + { + "epoch": 0.45, + "grad_norm": 0.7522171063517573, + "learning_rate": 1.2028620503013377e-05, + "loss": 0.5037, + "step": 5561 + }, + { + "epoch": 0.45, + "grad_norm": 0.9237884426494588, + "learning_rate": 1.2026042696416069e-05, + "loss": 0.4642, + "step": 5562 + }, + { + "epoch": 0.45, + "grad_norm": 0.8747303264360102, + "learning_rate": 1.20234647494158e-05, + "loss": 0.5071, + "step": 5563 + }, + { + "epoch": 0.45, + "grad_norm": 0.9304691768041673, + "learning_rate": 1.2020886662191216e-05, + "loss": 0.5432, + "step": 5564 + }, + { + "epoch": 0.45, + "grad_norm": 0.983912008294001, + "learning_rate": 1.2018308434920983e-05, + "loss": 0.628, + "step": 5565 + }, + { + "epoch": 0.45, + "grad_norm": 0.908796074695498, + "learning_rate": 1.201573006778376e-05, + "loss": 0.5621, + "step": 5566 + }, + { + "epoch": 0.45, + "grad_norm": 0.9866490956856787, + "learning_rate": 1.2013151560958233e-05, + "loss": 0.5905, + "step": 5567 + }, + { + "epoch": 0.45, + "grad_norm": 0.958082254827949, + "learning_rate": 1.2010572914623091e-05, + "loss": 0.5013, + "step": 5568 + }, + { + "epoch": 0.45, + "grad_norm": 0.9013354679959612, + "learning_rate": 1.2007994128957029e-05, + "loss": 0.4822, + "step": 5569 + }, + { + "epoch": 0.45, + "grad_norm": 0.8861126057246507, + "learning_rate": 1.2005415204138753e-05, + "loss": 0.5551, + "step": 5570 + }, + { + "epoch": 0.45, + "grad_norm": 0.8977421349365613, + "learning_rate": 1.2002836140346984e-05, + "loss": 0.5817, + "step": 5571 + }, + { + "epoch": 0.45, + "grad_norm": 0.9163167902421183, + "learning_rate": 1.2000256937760446e-05, + "loss": 0.5701, + "step": 5572 + }, + { + "epoch": 0.45, + "grad_norm": 0.8061350723006823, + "learning_rate": 1.1997677596557875e-05, + "loss": 0.4702, + "step": 5573 + }, + { + "epoch": 0.45, + "grad_norm": 0.8525210977763026, + "learning_rate": 1.1995098116918022e-05, + "loss": 0.4862, + "step": 5574 + }, + { + "epoch": 0.45, + "grad_norm": 0.9739495505934435, + "learning_rate": 1.1992518499019637e-05, + "loss": 0.5159, + "step": 5575 + }, + { + "epoch": 0.45, + "grad_norm": 0.8864361775922159, + "learning_rate": 1.1989938743041487e-05, + "loss": 0.5461, + "step": 5576 + }, + { + "epoch": 0.45, + "grad_norm": 1.004216155887953, + "learning_rate": 1.1987358849162349e-05, + "loss": 0.5517, + "step": 5577 + }, + { + "epoch": 0.45, + "grad_norm": 0.9796249965818342, + "learning_rate": 1.1984778817561002e-05, + "loss": 0.5026, + "step": 5578 + }, + { + "epoch": 0.45, + "grad_norm": 0.8592763662836835, + "learning_rate": 1.1982198648416245e-05, + "loss": 0.5431, + "step": 5579 + }, + { + "epoch": 0.45, + "grad_norm": 0.8886989571121534, + "learning_rate": 1.1979618341906884e-05, + "loss": 0.5394, + "step": 5580 + }, + { + "epoch": 0.45, + "grad_norm": 0.9863795713903801, + "learning_rate": 1.1977037898211723e-05, + "loss": 0.5857, + "step": 5581 + }, + { + "epoch": 0.45, + "grad_norm": 0.8654981222560337, + "learning_rate": 1.1974457317509591e-05, + "loss": 0.5435, + "step": 5582 + }, + { + "epoch": 0.45, + "grad_norm": 0.8571963875092631, + "learning_rate": 1.197187659997932e-05, + "loss": 0.5196, + "step": 5583 + }, + { + "epoch": 0.45, + "grad_norm": 0.9245798277364339, + "learning_rate": 1.1969295745799746e-05, + "loss": 0.548, + "step": 5584 + }, + { + "epoch": 0.45, + "grad_norm": 0.8977120983327718, + "learning_rate": 1.1966714755149724e-05, + "loss": 0.5561, + "step": 5585 + }, + { + "epoch": 0.45, + "grad_norm": 0.911986785371461, + "learning_rate": 1.1964133628208116e-05, + "loss": 0.5717, + "step": 5586 + }, + { + "epoch": 0.45, + "grad_norm": 0.8219590372461145, + "learning_rate": 1.196155236515379e-05, + "loss": 0.4861, + "step": 5587 + }, + { + "epoch": 0.45, + "grad_norm": 0.8681933932768588, + "learning_rate": 1.1958970966165622e-05, + "loss": 0.512, + "step": 5588 + }, + { + "epoch": 0.45, + "grad_norm": 1.0060914497630464, + "learning_rate": 1.1956389431422508e-05, + "loss": 0.6282, + "step": 5589 + }, + { + "epoch": 0.45, + "grad_norm": 1.028934286984717, + "learning_rate": 1.1953807761103338e-05, + "loss": 0.5839, + "step": 5590 + }, + { + "epoch": 0.45, + "grad_norm": 0.8952935651557266, + "learning_rate": 1.1951225955387025e-05, + "loss": 0.5875, + "step": 5591 + }, + { + "epoch": 0.45, + "grad_norm": 1.1605839344131503, + "learning_rate": 1.1948644014452484e-05, + "loss": 0.538, + "step": 5592 + }, + { + "epoch": 0.45, + "grad_norm": 0.9642560605768177, + "learning_rate": 1.1946061938478638e-05, + "loss": 0.5382, + "step": 5593 + }, + { + "epoch": 0.45, + "grad_norm": 0.9704106218133949, + "learning_rate": 1.1943479727644429e-05, + "loss": 0.5499, + "step": 5594 + }, + { + "epoch": 0.45, + "grad_norm": 0.940747211343841, + "learning_rate": 1.19408973821288e-05, + "loss": 0.5168, + "step": 5595 + }, + { + "epoch": 0.45, + "grad_norm": 0.9471035641500342, + "learning_rate": 1.1938314902110701e-05, + "loss": 0.5431, + "step": 5596 + }, + { + "epoch": 0.45, + "grad_norm": 0.8100368901211942, + "learning_rate": 1.1935732287769099e-05, + "loss": 0.5276, + "step": 5597 + }, + { + "epoch": 0.45, + "grad_norm": 0.9216492266864472, + "learning_rate": 1.193314953928297e-05, + "loss": 0.5553, + "step": 5598 + }, + { + "epoch": 0.46, + "grad_norm": 0.9298432323689065, + "learning_rate": 1.1930566656831288e-05, + "loss": 0.5584, + "step": 5599 + }, + { + "epoch": 0.46, + "grad_norm": 0.9101378356734033, + "learning_rate": 1.1927983640593053e-05, + "loss": 0.4693, + "step": 5600 + }, + { + "epoch": 0.46, + "grad_norm": 0.854826184502737, + "learning_rate": 1.192540049074726e-05, + "loss": 0.5558, + "step": 5601 + }, + { + "epoch": 0.46, + "grad_norm": 1.029647115535759, + "learning_rate": 1.1922817207472921e-05, + "loss": 0.5951, + "step": 5602 + }, + { + "epoch": 0.46, + "grad_norm": 0.8843012858921222, + "learning_rate": 1.1920233790949051e-05, + "loss": 0.5116, + "step": 5603 + }, + { + "epoch": 0.46, + "grad_norm": 0.8733868563514845, + "learning_rate": 1.191765024135469e-05, + "loss": 0.5402, + "step": 5604 + }, + { + "epoch": 0.46, + "grad_norm": 0.8751251911535606, + "learning_rate": 1.1915066558868865e-05, + "loss": 0.4786, + "step": 5605 + }, + { + "epoch": 0.46, + "grad_norm": 0.8603668761491104, + "learning_rate": 1.1912482743670624e-05, + "loss": 0.4809, + "step": 5606 + }, + { + "epoch": 0.46, + "grad_norm": 0.8454520161691437, + "learning_rate": 1.1909898795939028e-05, + "loss": 0.5249, + "step": 5607 + }, + { + "epoch": 0.46, + "grad_norm": 0.9231626415447299, + "learning_rate": 1.1907314715853138e-05, + "loss": 0.4975, + "step": 5608 + }, + { + "epoch": 0.46, + "grad_norm": 0.9305487251057392, + "learning_rate": 1.190473050359203e-05, + "loss": 0.5702, + "step": 5609 + }, + { + "epoch": 0.46, + "grad_norm": 0.9161739295244067, + "learning_rate": 1.1902146159334788e-05, + "loss": 0.5287, + "step": 5610 + }, + { + "epoch": 0.46, + "grad_norm": 0.9175136570243649, + "learning_rate": 1.1899561683260506e-05, + "loss": 0.5298, + "step": 5611 + }, + { + "epoch": 0.46, + "grad_norm": 0.9511351394030586, + "learning_rate": 1.189697707554828e-05, + "loss": 0.4931, + "step": 5612 + }, + { + "epoch": 0.46, + "grad_norm": 0.9041144006702436, + "learning_rate": 1.189439233637723e-05, + "loss": 0.5236, + "step": 5613 + }, + { + "epoch": 0.46, + "grad_norm": 0.9583396369583367, + "learning_rate": 1.1891807465926467e-05, + "loss": 0.5616, + "step": 5614 + }, + { + "epoch": 0.46, + "grad_norm": 0.8930178278090726, + "learning_rate": 1.1889222464375127e-05, + "loss": 0.5447, + "step": 5615 + }, + { + "epoch": 0.46, + "grad_norm": 0.8980531194959175, + "learning_rate": 1.1886637331902349e-05, + "loss": 0.4901, + "step": 5616 + }, + { + "epoch": 0.46, + "grad_norm": 0.8984989642229662, + "learning_rate": 1.1884052068687273e-05, + "loss": 0.5205, + "step": 5617 + }, + { + "epoch": 0.46, + "grad_norm": 0.9111022679999827, + "learning_rate": 1.188146667490906e-05, + "loss": 0.5738, + "step": 5618 + }, + { + "epoch": 0.46, + "grad_norm": 0.9693879175100667, + "learning_rate": 1.1878881150746878e-05, + "loss": 0.5605, + "step": 5619 + }, + { + "epoch": 0.46, + "grad_norm": 0.8351788339485661, + "learning_rate": 1.1876295496379894e-05, + "loss": 0.5249, + "step": 5620 + }, + { + "epoch": 0.46, + "grad_norm": 0.8617744689704865, + "learning_rate": 1.18737097119873e-05, + "loss": 0.5436, + "step": 5621 + }, + { + "epoch": 0.46, + "grad_norm": 0.8774024363332183, + "learning_rate": 1.1871123797748285e-05, + "loss": 0.5631, + "step": 5622 + }, + { + "epoch": 0.46, + "grad_norm": 0.8808367572990808, + "learning_rate": 1.1868537753842052e-05, + "loss": 0.515, + "step": 5623 + }, + { + "epoch": 0.46, + "grad_norm": 0.9095122786929855, + "learning_rate": 1.1865951580447805e-05, + "loss": 0.5841, + "step": 5624 + }, + { + "epoch": 0.46, + "grad_norm": 0.8600399569527147, + "learning_rate": 1.1863365277744771e-05, + "loss": 0.5722, + "step": 5625 + }, + { + "epoch": 0.46, + "grad_norm": 1.0620943004279169, + "learning_rate": 1.1860778845912177e-05, + "loss": 0.6108, + "step": 5626 + }, + { + "epoch": 0.46, + "grad_norm": 0.9001555709351374, + "learning_rate": 1.185819228512926e-05, + "loss": 0.5784, + "step": 5627 + }, + { + "epoch": 0.46, + "grad_norm": 0.8901095149384635, + "learning_rate": 1.1855605595575263e-05, + "loss": 0.5315, + "step": 5628 + }, + { + "epoch": 0.46, + "grad_norm": 0.8751412997045999, + "learning_rate": 1.1853018777429449e-05, + "loss": 0.5046, + "step": 5629 + }, + { + "epoch": 0.46, + "grad_norm": 0.9685659253082625, + "learning_rate": 1.1850431830871075e-05, + "loss": 0.6441, + "step": 5630 + }, + { + "epoch": 0.46, + "grad_norm": 0.8627187869655145, + "learning_rate": 1.1847844756079414e-05, + "loss": 0.4905, + "step": 5631 + }, + { + "epoch": 0.46, + "grad_norm": 0.9491405505682985, + "learning_rate": 1.1845257553233753e-05, + "loss": 0.4991, + "step": 5632 + }, + { + "epoch": 0.46, + "grad_norm": 0.8990900153819141, + "learning_rate": 1.1842670222513379e-05, + "loss": 0.5544, + "step": 5633 + }, + { + "epoch": 0.46, + "grad_norm": 0.8231690363328513, + "learning_rate": 1.1840082764097593e-05, + "loss": 0.5093, + "step": 5634 + }, + { + "epoch": 0.46, + "grad_norm": 0.8340209505703706, + "learning_rate": 1.1837495178165706e-05, + "loss": 0.5191, + "step": 5635 + }, + { + "epoch": 0.46, + "grad_norm": 0.8824202838144878, + "learning_rate": 1.183490746489703e-05, + "loss": 0.5046, + "step": 5636 + }, + { + "epoch": 0.46, + "grad_norm": 0.9022132591342349, + "learning_rate": 1.1832319624470895e-05, + "loss": 0.5241, + "step": 5637 + }, + { + "epoch": 0.46, + "grad_norm": 0.8615562704331575, + "learning_rate": 1.1829731657066638e-05, + "loss": 0.5016, + "step": 5638 + }, + { + "epoch": 0.46, + "grad_norm": 0.8257423829771974, + "learning_rate": 1.1827143562863597e-05, + "loss": 0.526, + "step": 5639 + }, + { + "epoch": 0.46, + "grad_norm": 0.9633143307811811, + "learning_rate": 1.1824555342041129e-05, + "loss": 0.5566, + "step": 5640 + }, + { + "epoch": 0.46, + "grad_norm": 0.8663038774802817, + "learning_rate": 1.1821966994778594e-05, + "loss": 0.4996, + "step": 5641 + }, + { + "epoch": 0.46, + "grad_norm": 0.7574394729770879, + "learning_rate": 1.1819378521255362e-05, + "loss": 0.4978, + "step": 5642 + }, + { + "epoch": 0.46, + "grad_norm": 0.874450535779544, + "learning_rate": 1.181678992165081e-05, + "loss": 0.5369, + "step": 5643 + }, + { + "epoch": 0.46, + "grad_norm": 0.8685484962413709, + "learning_rate": 1.1814201196144332e-05, + "loss": 0.5223, + "step": 5644 + }, + { + "epoch": 0.46, + "grad_norm": 0.9004285750051082, + "learning_rate": 1.181161234491532e-05, + "loss": 0.5838, + "step": 5645 + }, + { + "epoch": 0.46, + "grad_norm": 0.8438898141326269, + "learning_rate": 1.1809023368143178e-05, + "loss": 0.5334, + "step": 5646 + }, + { + "epoch": 0.46, + "grad_norm": 0.9248851968313923, + "learning_rate": 1.180643426600732e-05, + "loss": 0.5461, + "step": 5647 + }, + { + "epoch": 0.46, + "grad_norm": 0.937478179130208, + "learning_rate": 1.1803845038687171e-05, + "loss": 0.5556, + "step": 5648 + }, + { + "epoch": 0.46, + "grad_norm": 0.8211910043207973, + "learning_rate": 1.1801255686362161e-05, + "loss": 0.4998, + "step": 5649 + }, + { + "epoch": 0.46, + "grad_norm": 0.9253745602627443, + "learning_rate": 1.1798666209211729e-05, + "loss": 0.5692, + "step": 5650 + }, + { + "epoch": 0.46, + "grad_norm": 0.8931107878011122, + "learning_rate": 1.1796076607415324e-05, + "loss": 0.5769, + "step": 5651 + }, + { + "epoch": 0.46, + "grad_norm": 1.032722335210612, + "learning_rate": 1.1793486881152405e-05, + "loss": 0.6256, + "step": 5652 + }, + { + "epoch": 0.46, + "grad_norm": 0.8894699649744721, + "learning_rate": 1.1790897030602436e-05, + "loss": 0.461, + "step": 5653 + }, + { + "epoch": 0.46, + "grad_norm": 0.864199629548567, + "learning_rate": 1.1788307055944887e-05, + "loss": 0.4806, + "step": 5654 + }, + { + "epoch": 0.46, + "grad_norm": 0.9561792929043047, + "learning_rate": 1.178571695735925e-05, + "loss": 0.527, + "step": 5655 + }, + { + "epoch": 0.46, + "grad_norm": 1.0988366775659597, + "learning_rate": 1.178312673502501e-05, + "loss": 0.5619, + "step": 5656 + }, + { + "epoch": 0.46, + "grad_norm": 0.809626546189783, + "learning_rate": 1.1780536389121668e-05, + "loss": 0.505, + "step": 5657 + }, + { + "epoch": 0.46, + "grad_norm": 0.8827486792369572, + "learning_rate": 1.1777945919828735e-05, + "loss": 0.4705, + "step": 5658 + }, + { + "epoch": 0.46, + "grad_norm": 0.9744348262600504, + "learning_rate": 1.1775355327325726e-05, + "loss": 0.5681, + "step": 5659 + }, + { + "epoch": 0.46, + "grad_norm": 0.9503573694547869, + "learning_rate": 1.1772764611792167e-05, + "loss": 0.588, + "step": 5660 + }, + { + "epoch": 0.46, + "grad_norm": 0.855926523059355, + "learning_rate": 1.1770173773407594e-05, + "loss": 0.5295, + "step": 5661 + }, + { + "epoch": 0.46, + "grad_norm": 0.8577754564930304, + "learning_rate": 1.176758281235155e-05, + "loss": 0.5658, + "step": 5662 + }, + { + "epoch": 0.46, + "grad_norm": 0.8640342306312921, + "learning_rate": 1.1764991728803582e-05, + "loss": 0.4698, + "step": 5663 + }, + { + "epoch": 0.46, + "grad_norm": 0.8347933853968565, + "learning_rate": 1.1762400522943254e-05, + "loss": 0.5503, + "step": 5664 + }, + { + "epoch": 0.46, + "grad_norm": 0.9123725024563369, + "learning_rate": 1.1759809194950134e-05, + "loss": 0.5921, + "step": 5665 + }, + { + "epoch": 0.46, + "grad_norm": 0.8766709410074994, + "learning_rate": 1.1757217745003797e-05, + "loss": 0.5239, + "step": 5666 + }, + { + "epoch": 0.46, + "grad_norm": 1.0059369165290584, + "learning_rate": 1.1754626173283827e-05, + "loss": 0.5633, + "step": 5667 + }, + { + "epoch": 0.46, + "grad_norm": 0.8504046043846838, + "learning_rate": 1.1752034479969822e-05, + "loss": 0.5828, + "step": 5668 + }, + { + "epoch": 0.46, + "grad_norm": 0.9058726927790431, + "learning_rate": 1.1749442665241382e-05, + "loss": 0.5252, + "step": 5669 + }, + { + "epoch": 0.46, + "grad_norm": 0.862169907036434, + "learning_rate": 1.1746850729278114e-05, + "loss": 0.5506, + "step": 5670 + }, + { + "epoch": 0.46, + "grad_norm": 0.9637561441997063, + "learning_rate": 1.1744258672259642e-05, + "loss": 0.5342, + "step": 5671 + }, + { + "epoch": 0.46, + "grad_norm": 1.1554538855075298, + "learning_rate": 1.174166649436559e-05, + "loss": 0.602, + "step": 5672 + }, + { + "epoch": 0.46, + "grad_norm": 1.1545314272880411, + "learning_rate": 1.1739074195775597e-05, + "loss": 0.6155, + "step": 5673 + }, + { + "epoch": 0.46, + "grad_norm": 0.8747730766584026, + "learning_rate": 1.1736481776669307e-05, + "loss": 0.5385, + "step": 5674 + }, + { + "epoch": 0.46, + "grad_norm": 0.8670802659680884, + "learning_rate": 1.1733889237226364e-05, + "loss": 0.5082, + "step": 5675 + }, + { + "epoch": 0.46, + "grad_norm": 0.8223266345411775, + "learning_rate": 1.1731296577626437e-05, + "loss": 0.5325, + "step": 5676 + }, + { + "epoch": 0.46, + "grad_norm": 0.872649722870166, + "learning_rate": 1.1728703798049194e-05, + "loss": 0.5906, + "step": 5677 + }, + { + "epoch": 0.46, + "grad_norm": 0.8882721443314949, + "learning_rate": 1.172611089867431e-05, + "loss": 0.6095, + "step": 5678 + }, + { + "epoch": 0.46, + "grad_norm": 0.8946066834933611, + "learning_rate": 1.1723517879681472e-05, + "loss": 0.5391, + "step": 5679 + }, + { + "epoch": 0.46, + "grad_norm": 0.9320314319421376, + "learning_rate": 1.1720924741250378e-05, + "loss": 0.5406, + "step": 5680 + }, + { + "epoch": 0.46, + "grad_norm": 0.8376780591969047, + "learning_rate": 1.1718331483560719e-05, + "loss": 0.5445, + "step": 5681 + }, + { + "epoch": 0.46, + "grad_norm": 1.0277180139899027, + "learning_rate": 1.1715738106792214e-05, + "loss": 0.5892, + "step": 5682 + }, + { + "epoch": 0.46, + "grad_norm": 1.0513463702008126, + "learning_rate": 1.1713144611124583e-05, + "loss": 0.573, + "step": 5683 + }, + { + "epoch": 0.46, + "grad_norm": 0.9765226894666977, + "learning_rate": 1.1710550996737548e-05, + "loss": 0.5579, + "step": 5684 + }, + { + "epoch": 0.46, + "grad_norm": 0.860387461075728, + "learning_rate": 1.1707957263810845e-05, + "loss": 0.5598, + "step": 5685 + }, + { + "epoch": 0.46, + "grad_norm": 1.0143344858243026, + "learning_rate": 1.170536341252422e-05, + "loss": 0.579, + "step": 5686 + }, + { + "epoch": 0.46, + "grad_norm": 0.8562956547995043, + "learning_rate": 1.1702769443057425e-05, + "loss": 0.4799, + "step": 5687 + }, + { + "epoch": 0.46, + "grad_norm": 0.8056493289664926, + "learning_rate": 1.1700175355590215e-05, + "loss": 0.5143, + "step": 5688 + }, + { + "epoch": 0.46, + "grad_norm": 0.9927643119593819, + "learning_rate": 1.1697581150302362e-05, + "loss": 0.5658, + "step": 5689 + }, + { + "epoch": 0.46, + "grad_norm": 0.9375696945093996, + "learning_rate": 1.1694986827373642e-05, + "loss": 0.5483, + "step": 5690 + }, + { + "epoch": 0.46, + "grad_norm": 0.9725306054037465, + "learning_rate": 1.1692392386983837e-05, + "loss": 0.59, + "step": 5691 + }, + { + "epoch": 0.46, + "grad_norm": 0.9917244244980913, + "learning_rate": 1.1689797829312741e-05, + "loss": 0.5406, + "step": 5692 + }, + { + "epoch": 0.46, + "grad_norm": 0.9484530592426267, + "learning_rate": 1.1687203154540154e-05, + "loss": 0.5242, + "step": 5693 + }, + { + "epoch": 0.46, + "grad_norm": 0.9394009906033737, + "learning_rate": 1.1684608362845886e-05, + "loss": 0.5825, + "step": 5694 + }, + { + "epoch": 0.46, + "grad_norm": 0.8359560568490447, + "learning_rate": 1.168201345440975e-05, + "loss": 0.4697, + "step": 5695 + }, + { + "epoch": 0.46, + "grad_norm": 0.8926529091421872, + "learning_rate": 1.1679418429411577e-05, + "loss": 0.5752, + "step": 5696 + }, + { + "epoch": 0.46, + "grad_norm": 0.8486560830613838, + "learning_rate": 1.1676823288031197e-05, + "loss": 0.5451, + "step": 5697 + }, + { + "epoch": 0.46, + "grad_norm": 0.8495140880211136, + "learning_rate": 1.1674228030448447e-05, + "loss": 0.5324, + "step": 5698 + }, + { + "epoch": 0.46, + "grad_norm": 1.0059261820258834, + "learning_rate": 1.167163265684318e-05, + "loss": 0.6143, + "step": 5699 + }, + { + "epoch": 0.46, + "grad_norm": 0.9051695413093083, + "learning_rate": 1.1669037167395256e-05, + "loss": 0.6137, + "step": 5700 + }, + { + "epoch": 0.46, + "grad_norm": 0.988907191699513, + "learning_rate": 1.1666441562284534e-05, + "loss": 0.5691, + "step": 5701 + }, + { + "epoch": 0.46, + "grad_norm": 0.8473370068987643, + "learning_rate": 1.166384584169089e-05, + "loss": 0.4898, + "step": 5702 + }, + { + "epoch": 0.46, + "grad_norm": 1.0623212023511857, + "learning_rate": 1.1661250005794206e-05, + "loss": 0.5344, + "step": 5703 + }, + { + "epoch": 0.46, + "grad_norm": 0.860164057627655, + "learning_rate": 1.1658654054774368e-05, + "loss": 0.478, + "step": 5704 + }, + { + "epoch": 0.46, + "grad_norm": 0.97579313832456, + "learning_rate": 1.1656057988811278e-05, + "loss": 0.5864, + "step": 5705 + }, + { + "epoch": 0.46, + "grad_norm": 0.9823768671863748, + "learning_rate": 1.1653461808084839e-05, + "loss": 0.5735, + "step": 5706 + }, + { + "epoch": 0.46, + "grad_norm": 0.9273988931839644, + "learning_rate": 1.1650865512774959e-05, + "loss": 0.4811, + "step": 5707 + }, + { + "epoch": 0.46, + "grad_norm": 0.955658161290628, + "learning_rate": 1.1648269103061567e-05, + "loss": 0.5759, + "step": 5708 + }, + { + "epoch": 0.46, + "grad_norm": 0.9437176531862602, + "learning_rate": 1.1645672579124586e-05, + "loss": 0.6016, + "step": 5709 + }, + { + "epoch": 0.46, + "grad_norm": 0.8760405278095192, + "learning_rate": 1.1643075941143956e-05, + "loss": 0.5191, + "step": 5710 + }, + { + "epoch": 0.46, + "grad_norm": 0.8942711976734233, + "learning_rate": 1.164047918929962e-05, + "loss": 0.5556, + "step": 5711 + }, + { + "epoch": 0.46, + "grad_norm": 0.902951537057684, + "learning_rate": 1.1637882323771532e-05, + "loss": 0.5103, + "step": 5712 + }, + { + "epoch": 0.46, + "grad_norm": 0.8573316626768663, + "learning_rate": 1.163528534473965e-05, + "loss": 0.5217, + "step": 5713 + }, + { + "epoch": 0.46, + "grad_norm": 1.0027552259507533, + "learning_rate": 1.1632688252383948e-05, + "loss": 0.5788, + "step": 5714 + }, + { + "epoch": 0.46, + "grad_norm": 0.8367820712338532, + "learning_rate": 1.1630091046884394e-05, + "loss": 0.5012, + "step": 5715 + }, + { + "epoch": 0.46, + "grad_norm": 0.8006736737941973, + "learning_rate": 1.1627493728420978e-05, + "loss": 0.4689, + "step": 5716 + }, + { + "epoch": 0.46, + "grad_norm": 0.9051130579150752, + "learning_rate": 1.1624896297173693e-05, + "loss": 0.6383, + "step": 5717 + }, + { + "epoch": 0.46, + "grad_norm": 0.8939363733073373, + "learning_rate": 1.1622298753322531e-05, + "loss": 0.5842, + "step": 5718 + }, + { + "epoch": 0.46, + "grad_norm": 0.9497539058849697, + "learning_rate": 1.1619701097047507e-05, + "loss": 0.4973, + "step": 5719 + }, + { + "epoch": 0.46, + "grad_norm": 0.8778289148182196, + "learning_rate": 1.1617103328528634e-05, + "loss": 0.6009, + "step": 5720 + }, + { + "epoch": 0.46, + "grad_norm": 0.8498858417717557, + "learning_rate": 1.1614505447945935e-05, + "loss": 0.4794, + "step": 5721 + }, + { + "epoch": 0.47, + "grad_norm": 0.891479859871484, + "learning_rate": 1.1611907455479439e-05, + "loss": 0.5115, + "step": 5722 + }, + { + "epoch": 0.47, + "grad_norm": 0.8737329204498719, + "learning_rate": 1.1609309351309185e-05, + "loss": 0.5272, + "step": 5723 + }, + { + "epoch": 0.47, + "grad_norm": 0.8746927578986288, + "learning_rate": 1.1606711135615223e-05, + "loss": 0.5308, + "step": 5724 + }, + { + "epoch": 0.47, + "grad_norm": 0.8336535581674581, + "learning_rate": 1.1604112808577603e-05, + "loss": 0.5128, + "step": 5725 + }, + { + "epoch": 0.47, + "grad_norm": 0.8336680782694602, + "learning_rate": 1.160151437037639e-05, + "loss": 0.5382, + "step": 5726 + }, + { + "epoch": 0.47, + "grad_norm": 1.0068426087236186, + "learning_rate": 1.159891582119165e-05, + "loss": 0.5948, + "step": 5727 + }, + { + "epoch": 0.47, + "grad_norm": 0.8861976744403234, + "learning_rate": 1.159631716120346e-05, + "loss": 0.5187, + "step": 5728 + }, + { + "epoch": 0.47, + "grad_norm": 0.8648814082644428, + "learning_rate": 1.1593718390591913e-05, + "loss": 0.535, + "step": 5729 + }, + { + "epoch": 0.47, + "grad_norm": 0.8212720183780278, + "learning_rate": 1.159111950953709e-05, + "loss": 0.4522, + "step": 5730 + }, + { + "epoch": 0.47, + "grad_norm": 0.8500083762981806, + "learning_rate": 1.1588520518219095e-05, + "loss": 0.5048, + "step": 5731 + }, + { + "epoch": 0.47, + "grad_norm": 0.9552823604332356, + "learning_rate": 1.1585921416818042e-05, + "loss": 0.5432, + "step": 5732 + }, + { + "epoch": 0.47, + "grad_norm": 0.9879107801336218, + "learning_rate": 1.1583322205514039e-05, + "loss": 0.5447, + "step": 5733 + }, + { + "epoch": 0.47, + "grad_norm": 0.9424763879235039, + "learning_rate": 1.158072288448721e-05, + "loss": 0.5644, + "step": 5734 + }, + { + "epoch": 0.47, + "grad_norm": 0.9149517629325111, + "learning_rate": 1.1578123453917692e-05, + "loss": 0.5157, + "step": 5735 + }, + { + "epoch": 0.47, + "grad_norm": 0.8057691249709181, + "learning_rate": 1.1575523913985614e-05, + "loss": 0.5248, + "step": 5736 + }, + { + "epoch": 0.47, + "grad_norm": 0.8919661942830389, + "learning_rate": 1.1572924264871126e-05, + "loss": 0.4596, + "step": 5737 + }, + { + "epoch": 0.47, + "grad_norm": 0.9432369794674959, + "learning_rate": 1.1570324506754385e-05, + "loss": 0.5393, + "step": 5738 + }, + { + "epoch": 0.47, + "grad_norm": 0.9501011722952974, + "learning_rate": 1.1567724639815546e-05, + "loss": 0.597, + "step": 5739 + }, + { + "epoch": 0.47, + "grad_norm": 1.0192209841846716, + "learning_rate": 1.156512466423478e-05, + "loss": 0.5543, + "step": 5740 + }, + { + "epoch": 0.47, + "grad_norm": 0.9118793885972419, + "learning_rate": 1.1562524580192265e-05, + "loss": 0.5442, + "step": 5741 + }, + { + "epoch": 0.47, + "grad_norm": 0.8681686671544112, + "learning_rate": 1.155992438786818e-05, + "loss": 0.5597, + "step": 5742 + }, + { + "epoch": 0.47, + "grad_norm": 0.8270316553924332, + "learning_rate": 1.1557324087442719e-05, + "loss": 0.5338, + "step": 5743 + }, + { + "epoch": 0.47, + "grad_norm": 0.8938029173929268, + "learning_rate": 1.1554723679096083e-05, + "loss": 0.5791, + "step": 5744 + }, + { + "epoch": 0.47, + "grad_norm": 1.0399905111718846, + "learning_rate": 1.155212316300847e-05, + "loss": 0.5945, + "step": 5745 + }, + { + "epoch": 0.47, + "grad_norm": 0.8980299501605497, + "learning_rate": 1.1549522539360103e-05, + "loss": 0.5261, + "step": 5746 + }, + { + "epoch": 0.47, + "grad_norm": 0.9107191322138124, + "learning_rate": 1.1546921808331196e-05, + "loss": 0.5396, + "step": 5747 + }, + { + "epoch": 0.47, + "grad_norm": 0.8837694344631738, + "learning_rate": 1.1544320970101981e-05, + "loss": 0.5647, + "step": 5748 + }, + { + "epoch": 0.47, + "grad_norm": 1.1697362226170598, + "learning_rate": 1.1541720024852692e-05, + "loss": 0.5716, + "step": 5749 + }, + { + "epoch": 0.47, + "grad_norm": 0.9325464531274007, + "learning_rate": 1.1539118972763572e-05, + "loss": 0.549, + "step": 5750 + }, + { + "epoch": 0.47, + "grad_norm": 0.910145331273149, + "learning_rate": 1.1536517814014876e-05, + "loss": 0.5646, + "step": 5751 + }, + { + "epoch": 0.47, + "grad_norm": 0.9314768239082533, + "learning_rate": 1.1533916548786856e-05, + "loss": 0.5403, + "step": 5752 + }, + { + "epoch": 0.47, + "grad_norm": 0.8545528383721573, + "learning_rate": 1.153131517725978e-05, + "loss": 0.5583, + "step": 5753 + }, + { + "epoch": 0.47, + "grad_norm": 0.9390530857052009, + "learning_rate": 1.1528713699613921e-05, + "loss": 0.5674, + "step": 5754 + }, + { + "epoch": 0.47, + "grad_norm": 0.9354357281115419, + "learning_rate": 1.1526112116029555e-05, + "loss": 0.5262, + "step": 5755 + }, + { + "epoch": 0.47, + "grad_norm": 0.8490438361271319, + "learning_rate": 1.1523510426686977e-05, + "loss": 0.53, + "step": 5756 + }, + { + "epoch": 0.47, + "grad_norm": 0.8766802437406609, + "learning_rate": 1.1520908631766476e-05, + "loss": 0.6403, + "step": 5757 + }, + { + "epoch": 0.47, + "grad_norm": 0.8991187659471015, + "learning_rate": 1.1518306731448357e-05, + "loss": 0.5231, + "step": 5758 + }, + { + "epoch": 0.47, + "grad_norm": 0.9545109160032564, + "learning_rate": 1.1515704725912926e-05, + "loss": 0.5326, + "step": 5759 + }, + { + "epoch": 0.47, + "grad_norm": 1.010803460178925, + "learning_rate": 1.1513102615340505e-05, + "loss": 0.5481, + "step": 5760 + }, + { + "epoch": 0.47, + "grad_norm": 0.9448566756878536, + "learning_rate": 1.1510500399911413e-05, + "loss": 0.53, + "step": 5761 + }, + { + "epoch": 0.47, + "grad_norm": 0.889478480629088, + "learning_rate": 1.1507898079805984e-05, + "loss": 0.5635, + "step": 5762 + }, + { + "epoch": 0.47, + "grad_norm": 0.7889905138028853, + "learning_rate": 1.1505295655204557e-05, + "loss": 0.5205, + "step": 5763 + }, + { + "epoch": 0.47, + "grad_norm": 0.9482501001206173, + "learning_rate": 1.1502693126287473e-05, + "loss": 0.5474, + "step": 5764 + }, + { + "epoch": 0.47, + "grad_norm": 0.8962124749836796, + "learning_rate": 1.1500090493235088e-05, + "loss": 0.5721, + "step": 5765 + }, + { + "epoch": 0.47, + "grad_norm": 0.9142381851003678, + "learning_rate": 1.1497487756227765e-05, + "loss": 0.5591, + "step": 5766 + }, + { + "epoch": 0.47, + "grad_norm": 0.9397027105849795, + "learning_rate": 1.1494884915445867e-05, + "loss": 0.5431, + "step": 5767 + }, + { + "epoch": 0.47, + "grad_norm": 0.931798640912055, + "learning_rate": 1.1492281971069772e-05, + "loss": 0.6008, + "step": 5768 + }, + { + "epoch": 0.47, + "grad_norm": 0.8497883566831262, + "learning_rate": 1.148967892327986e-05, + "loss": 0.4807, + "step": 5769 + }, + { + "epoch": 0.47, + "grad_norm": 1.0284837210866395, + "learning_rate": 1.1487075772256517e-05, + "loss": 0.5645, + "step": 5770 + }, + { + "epoch": 0.47, + "grad_norm": 0.9293165286236764, + "learning_rate": 1.1484472518180146e-05, + "loss": 0.5455, + "step": 5771 + }, + { + "epoch": 0.47, + "grad_norm": 0.8389218359292604, + "learning_rate": 1.1481869161231146e-05, + "loss": 0.5453, + "step": 5772 + }, + { + "epoch": 0.47, + "grad_norm": 0.9159749563956316, + "learning_rate": 1.1479265701589924e-05, + "loss": 0.5282, + "step": 5773 + }, + { + "epoch": 0.47, + "grad_norm": 0.9070787010145015, + "learning_rate": 1.1476662139436903e-05, + "loss": 0.5471, + "step": 5774 + }, + { + "epoch": 0.47, + "grad_norm": 1.010745817868924, + "learning_rate": 1.1474058474952505e-05, + "loss": 0.4959, + "step": 5775 + }, + { + "epoch": 0.47, + "grad_norm": 0.8536666744909506, + "learning_rate": 1.1471454708317163e-05, + "loss": 0.5393, + "step": 5776 + }, + { + "epoch": 0.47, + "grad_norm": 0.9417959392783372, + "learning_rate": 1.1468850839711314e-05, + "loss": 0.5515, + "step": 5777 + }, + { + "epoch": 0.47, + "grad_norm": 0.9056937867738876, + "learning_rate": 1.1466246869315407e-05, + "loss": 0.5208, + "step": 5778 + }, + { + "epoch": 0.47, + "grad_norm": 0.9178472792346383, + "learning_rate": 1.1463642797309889e-05, + "loss": 0.54, + "step": 5779 + }, + { + "epoch": 0.47, + "grad_norm": 0.9127826094034939, + "learning_rate": 1.1461038623875224e-05, + "loss": 0.5335, + "step": 5780 + }, + { + "epoch": 0.47, + "grad_norm": 0.9142747269421215, + "learning_rate": 1.145843434919188e-05, + "loss": 0.5424, + "step": 5781 + }, + { + "epoch": 0.47, + "grad_norm": 0.9521292000322145, + "learning_rate": 1.1455829973440328e-05, + "loss": 0.4752, + "step": 5782 + }, + { + "epoch": 0.47, + "grad_norm": 0.9023297612335568, + "learning_rate": 1.1453225496801052e-05, + "loss": 0.5146, + "step": 5783 + }, + { + "epoch": 0.47, + "grad_norm": 0.8767471354615514, + "learning_rate": 1.1450620919454538e-05, + "loss": 0.5128, + "step": 5784 + }, + { + "epoch": 0.47, + "grad_norm": 0.9506251594156782, + "learning_rate": 1.144801624158128e-05, + "loss": 0.6153, + "step": 5785 + }, + { + "epoch": 0.47, + "grad_norm": 0.8936628203180962, + "learning_rate": 1.144541146336178e-05, + "loss": 0.5134, + "step": 5786 + }, + { + "epoch": 0.47, + "grad_norm": 0.9560096879745319, + "learning_rate": 1.1442806584976549e-05, + "loss": 0.547, + "step": 5787 + }, + { + "epoch": 0.47, + "grad_norm": 0.9683698526031369, + "learning_rate": 1.14402016066061e-05, + "loss": 0.5176, + "step": 5788 + }, + { + "epoch": 0.47, + "grad_norm": 0.8838884747303756, + "learning_rate": 1.1437596528430956e-05, + "loss": 0.5551, + "step": 5789 + }, + { + "epoch": 0.47, + "grad_norm": 1.0051779630328477, + "learning_rate": 1.143499135063165e-05, + "loss": 0.6128, + "step": 5790 + }, + { + "epoch": 0.47, + "grad_norm": 0.9116566361869124, + "learning_rate": 1.1432386073388718e-05, + "loss": 0.4915, + "step": 5791 + }, + { + "epoch": 0.47, + "grad_norm": 0.9302519835107923, + "learning_rate": 1.1429780696882697e-05, + "loss": 0.5214, + "step": 5792 + }, + { + "epoch": 0.47, + "grad_norm": 0.8898580580646048, + "learning_rate": 1.1427175221294145e-05, + "loss": 0.524, + "step": 5793 + }, + { + "epoch": 0.47, + "grad_norm": 0.9166491607210097, + "learning_rate": 1.1424569646803616e-05, + "loss": 0.5383, + "step": 5794 + }, + { + "epoch": 0.47, + "grad_norm": 0.9242787204585731, + "learning_rate": 1.1421963973591674e-05, + "loss": 0.466, + "step": 5795 + }, + { + "epoch": 0.47, + "grad_norm": 0.896400261572487, + "learning_rate": 1.1419358201838888e-05, + "loss": 0.5314, + "step": 5796 + }, + { + "epoch": 0.47, + "grad_norm": 0.9348660459482668, + "learning_rate": 1.1416752331725842e-05, + "loss": 0.486, + "step": 5797 + }, + { + "epoch": 0.47, + "grad_norm": 0.9553987352915291, + "learning_rate": 1.1414146363433112e-05, + "loss": 0.4985, + "step": 5798 + }, + { + "epoch": 0.47, + "grad_norm": 0.9954151620187609, + "learning_rate": 1.1411540297141293e-05, + "loss": 0.5526, + "step": 5799 + }, + { + "epoch": 0.47, + "grad_norm": 0.8631823698760575, + "learning_rate": 1.1408934133030985e-05, + "loss": 0.5711, + "step": 5800 + }, + { + "epoch": 0.47, + "grad_norm": 0.8611002496151663, + "learning_rate": 1.1406327871282792e-05, + "loss": 0.5432, + "step": 5801 + }, + { + "epoch": 0.47, + "grad_norm": 0.9373509114087041, + "learning_rate": 1.1403721512077324e-05, + "loss": 0.5104, + "step": 5802 + }, + { + "epoch": 0.47, + "grad_norm": 0.8651715229849188, + "learning_rate": 1.14011150555952e-05, + "loss": 0.5173, + "step": 5803 + }, + { + "epoch": 0.47, + "grad_norm": 0.8591973900203775, + "learning_rate": 1.1398508502017047e-05, + "loss": 0.5324, + "step": 5804 + }, + { + "epoch": 0.47, + "grad_norm": 0.9105118528653173, + "learning_rate": 1.1395901851523494e-05, + "loss": 0.5249, + "step": 5805 + }, + { + "epoch": 0.47, + "grad_norm": 0.9613722495446257, + "learning_rate": 1.1393295104295178e-05, + "loss": 0.5782, + "step": 5806 + }, + { + "epoch": 0.47, + "grad_norm": 0.7954537842824811, + "learning_rate": 1.1390688260512755e-05, + "loss": 0.5114, + "step": 5807 + }, + { + "epoch": 0.47, + "grad_norm": 0.9248260565773117, + "learning_rate": 1.1388081320356861e-05, + "loss": 0.5524, + "step": 5808 + }, + { + "epoch": 0.47, + "grad_norm": 0.8469594823476866, + "learning_rate": 1.1385474284008167e-05, + "loss": 0.4896, + "step": 5809 + }, + { + "epoch": 0.47, + "grad_norm": 0.8069975623735556, + "learning_rate": 1.1382867151647333e-05, + "loss": 0.5172, + "step": 5810 + }, + { + "epoch": 0.47, + "grad_norm": 0.9064033476636228, + "learning_rate": 1.1380259923455033e-05, + "loss": 0.5351, + "step": 5811 + }, + { + "epoch": 0.47, + "grad_norm": 0.8459661189916762, + "learning_rate": 1.1377652599611942e-05, + "loss": 0.5321, + "step": 5812 + }, + { + "epoch": 0.47, + "grad_norm": 0.8653354047603307, + "learning_rate": 1.1375045180298749e-05, + "loss": 0.5899, + "step": 5813 + }, + { + "epoch": 0.47, + "grad_norm": 0.9715947135905709, + "learning_rate": 1.1372437665696145e-05, + "loss": 0.5819, + "step": 5814 + }, + { + "epoch": 0.47, + "grad_norm": 0.8273124149020723, + "learning_rate": 1.136983005598483e-05, + "loss": 0.4911, + "step": 5815 + }, + { + "epoch": 0.47, + "grad_norm": 0.9421996751596458, + "learning_rate": 1.1367222351345504e-05, + "loss": 0.5739, + "step": 5816 + }, + { + "epoch": 0.47, + "grad_norm": 0.9107314035512659, + "learning_rate": 1.136461455195888e-05, + "loss": 0.4639, + "step": 5817 + }, + { + "epoch": 0.47, + "grad_norm": 0.8399480002609003, + "learning_rate": 1.1362006658005684e-05, + "loss": 0.4968, + "step": 5818 + }, + { + "epoch": 0.47, + "grad_norm": 0.9050184961256491, + "learning_rate": 1.135939866966663e-05, + "loss": 0.5679, + "step": 5819 + }, + { + "epoch": 0.47, + "grad_norm": 0.8902604436710996, + "learning_rate": 1.1356790587122457e-05, + "loss": 0.5179, + "step": 5820 + }, + { + "epoch": 0.47, + "grad_norm": 0.8722050269762409, + "learning_rate": 1.13541824105539e-05, + "loss": 0.543, + "step": 5821 + }, + { + "epoch": 0.47, + "grad_norm": 0.9447918438261024, + "learning_rate": 1.1351574140141701e-05, + "loss": 0.5653, + "step": 5822 + }, + { + "epoch": 0.47, + "grad_norm": 0.874738767797678, + "learning_rate": 1.1348965776066611e-05, + "loss": 0.5667, + "step": 5823 + }, + { + "epoch": 0.47, + "grad_norm": 0.974237757914514, + "learning_rate": 1.1346357318509395e-05, + "loss": 0.5381, + "step": 5824 + }, + { + "epoch": 0.47, + "grad_norm": 0.8495673811717946, + "learning_rate": 1.1343748767650806e-05, + "loss": 0.5473, + "step": 5825 + }, + { + "epoch": 0.47, + "grad_norm": 0.941721385466632, + "learning_rate": 1.1341140123671621e-05, + "loss": 0.5479, + "step": 5826 + }, + { + "epoch": 0.47, + "grad_norm": 0.9913803444659351, + "learning_rate": 1.1338531386752618e-05, + "loss": 0.5276, + "step": 5827 + }, + { + "epoch": 0.47, + "grad_norm": 0.8792161364554925, + "learning_rate": 1.1335922557074572e-05, + "loss": 0.5012, + "step": 5828 + }, + { + "epoch": 0.47, + "grad_norm": 0.8561321256615947, + "learning_rate": 1.133331363481828e-05, + "loss": 0.5516, + "step": 5829 + }, + { + "epoch": 0.47, + "grad_norm": 0.8714825042279836, + "learning_rate": 1.133070462016454e-05, + "loss": 0.4823, + "step": 5830 + }, + { + "epoch": 0.47, + "grad_norm": 0.8371557202330407, + "learning_rate": 1.1328095513294143e-05, + "loss": 0.5371, + "step": 5831 + }, + { + "epoch": 0.47, + "grad_norm": 0.8041356268035228, + "learning_rate": 1.1325486314387908e-05, + "loss": 0.4643, + "step": 5832 + }, + { + "epoch": 0.47, + "grad_norm": 0.8817325273916904, + "learning_rate": 1.1322877023626647e-05, + "loss": 0.4926, + "step": 5833 + }, + { + "epoch": 0.47, + "grad_norm": 0.8390037601845575, + "learning_rate": 1.1320267641191183e-05, + "loss": 0.5356, + "step": 5834 + }, + { + "epoch": 0.47, + "grad_norm": 0.9465123705146559, + "learning_rate": 1.131765816726234e-05, + "loss": 0.5277, + "step": 5835 + }, + { + "epoch": 0.47, + "grad_norm": 0.8709013251621589, + "learning_rate": 1.1315048602020956e-05, + "loss": 0.5292, + "step": 5836 + }, + { + "epoch": 0.47, + "grad_norm": 0.9327073017044094, + "learning_rate": 1.1312438945647873e-05, + "loss": 0.5589, + "step": 5837 + }, + { + "epoch": 0.47, + "grad_norm": 0.913930006801104, + "learning_rate": 1.1309829198323929e-05, + "loss": 0.5097, + "step": 5838 + }, + { + "epoch": 0.47, + "grad_norm": 0.9536570039795854, + "learning_rate": 1.1307219360229991e-05, + "loss": 0.5517, + "step": 5839 + }, + { + "epoch": 0.47, + "grad_norm": 0.8559047225531874, + "learning_rate": 1.1304609431546905e-05, + "loss": 0.4756, + "step": 5840 + }, + { + "epoch": 0.47, + "grad_norm": 0.8773984137628708, + "learning_rate": 1.1301999412455545e-05, + "loss": 0.5227, + "step": 5841 + }, + { + "epoch": 0.47, + "grad_norm": 1.056976234813652, + "learning_rate": 1.129938930313678e-05, + "loss": 0.5863, + "step": 5842 + }, + { + "epoch": 0.47, + "grad_norm": 0.8246643505599478, + "learning_rate": 1.129677910377149e-05, + "loss": 0.4886, + "step": 5843 + }, + { + "epoch": 0.47, + "grad_norm": 0.949542374607715, + "learning_rate": 1.1294168814540554e-05, + "loss": 0.5693, + "step": 5844 + }, + { + "epoch": 0.48, + "grad_norm": 1.0296550313854473, + "learning_rate": 1.1291558435624871e-05, + "loss": 0.5254, + "step": 5845 + }, + { + "epoch": 0.48, + "grad_norm": 0.8437049883098626, + "learning_rate": 1.1288947967205335e-05, + "loss": 0.5596, + "step": 5846 + }, + { + "epoch": 0.48, + "grad_norm": 0.8585532394054722, + "learning_rate": 1.1286337409462844e-05, + "loss": 0.5473, + "step": 5847 + }, + { + "epoch": 0.48, + "grad_norm": 0.9820576693821286, + "learning_rate": 1.1283726762578316e-05, + "loss": 0.6178, + "step": 5848 + }, + { + "epoch": 0.48, + "grad_norm": 0.9108796836809981, + "learning_rate": 1.128111602673266e-05, + "loss": 0.5125, + "step": 5849 + }, + { + "epoch": 0.48, + "grad_norm": 0.8617572776326249, + "learning_rate": 1.1278505202106797e-05, + "loss": 0.5172, + "step": 5850 + }, + { + "epoch": 0.48, + "grad_norm": 0.886056947138033, + "learning_rate": 1.1275894288881664e-05, + "loss": 0.498, + "step": 5851 + }, + { + "epoch": 0.48, + "grad_norm": 0.9296123160821662, + "learning_rate": 1.1273283287238184e-05, + "loss": 0.5488, + "step": 5852 + }, + { + "epoch": 0.48, + "grad_norm": 0.9310583665236684, + "learning_rate": 1.12706721973573e-05, + "loss": 0.5318, + "step": 5853 + }, + { + "epoch": 0.48, + "grad_norm": 0.8348396049829269, + "learning_rate": 1.1268061019419965e-05, + "loss": 0.5751, + "step": 5854 + }, + { + "epoch": 0.48, + "grad_norm": 0.8750748024173053, + "learning_rate": 1.1265449753607122e-05, + "loss": 0.5463, + "step": 5855 + }, + { + "epoch": 0.48, + "grad_norm": 0.8049671125117622, + "learning_rate": 1.1262838400099733e-05, + "loss": 0.4689, + "step": 5856 + }, + { + "epoch": 0.48, + "grad_norm": 0.8821628993628409, + "learning_rate": 1.1260226959078766e-05, + "loss": 0.4792, + "step": 5857 + }, + { + "epoch": 0.48, + "grad_norm": 0.8499335731309763, + "learning_rate": 1.1257615430725188e-05, + "loss": 0.5666, + "step": 5858 + }, + { + "epoch": 0.48, + "grad_norm": 0.8016705903840738, + "learning_rate": 1.1255003815219973e-05, + "loss": 0.4629, + "step": 5859 + }, + { + "epoch": 0.48, + "grad_norm": 0.8575923821384105, + "learning_rate": 1.1252392112744113e-05, + "loss": 0.5538, + "step": 5860 + }, + { + "epoch": 0.48, + "grad_norm": 0.9091042763411217, + "learning_rate": 1.1249780323478585e-05, + "loss": 0.5528, + "step": 5861 + }, + { + "epoch": 0.48, + "grad_norm": 0.879419120033352, + "learning_rate": 1.124716844760439e-05, + "loss": 0.5088, + "step": 5862 + }, + { + "epoch": 0.48, + "grad_norm": 0.9238669229492489, + "learning_rate": 1.1244556485302532e-05, + "loss": 0.5262, + "step": 5863 + }, + { + "epoch": 0.48, + "grad_norm": 0.9622603224438493, + "learning_rate": 1.1241944436754008e-05, + "loss": 0.6327, + "step": 5864 + }, + { + "epoch": 0.48, + "grad_norm": 0.8709572099514622, + "learning_rate": 1.1239332302139839e-05, + "loss": 0.6088, + "step": 5865 + }, + { + "epoch": 0.48, + "grad_norm": 0.8728800808030194, + "learning_rate": 1.1236720081641042e-05, + "loss": 0.5195, + "step": 5866 + }, + { + "epoch": 0.48, + "grad_norm": 0.9157664222436604, + "learning_rate": 1.1234107775438637e-05, + "loss": 0.521, + "step": 5867 + }, + { + "epoch": 0.48, + "grad_norm": 0.9421227582596474, + "learning_rate": 1.1231495383713657e-05, + "loss": 0.5419, + "step": 5868 + }, + { + "epoch": 0.48, + "grad_norm": 0.8996081993865935, + "learning_rate": 1.1228882906647142e-05, + "loss": 0.4896, + "step": 5869 + }, + { + "epoch": 0.48, + "grad_norm": 0.8783857047148886, + "learning_rate": 1.1226270344420131e-05, + "loss": 0.516, + "step": 5870 + }, + { + "epoch": 0.48, + "grad_norm": 0.7932167932457265, + "learning_rate": 1.1223657697213672e-05, + "loss": 0.4892, + "step": 5871 + }, + { + "epoch": 0.48, + "grad_norm": 0.9248614707298407, + "learning_rate": 1.1221044965208821e-05, + "loss": 0.5565, + "step": 5872 + }, + { + "epoch": 0.48, + "grad_norm": 0.8308674630147707, + "learning_rate": 1.1218432148586638e-05, + "loss": 0.5883, + "step": 5873 + }, + { + "epoch": 0.48, + "grad_norm": 0.8704808487120602, + "learning_rate": 1.1215819247528186e-05, + "loss": 0.5364, + "step": 5874 + }, + { + "epoch": 0.48, + "grad_norm": 0.9057993331161298, + "learning_rate": 1.121320626221454e-05, + "loss": 0.5342, + "step": 5875 + }, + { + "epoch": 0.48, + "grad_norm": 0.9622876164726756, + "learning_rate": 1.1210593192826776e-05, + "loss": 0.6037, + "step": 5876 + }, + { + "epoch": 0.48, + "grad_norm": 0.8488925867142036, + "learning_rate": 1.1207980039545976e-05, + "loss": 0.5314, + "step": 5877 + }, + { + "epoch": 0.48, + "grad_norm": 0.8730947745792921, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.5412, + "step": 5878 + }, + { + "epoch": 0.48, + "grad_norm": 0.9506959222264564, + "learning_rate": 1.1202753482029639e-05, + "loss": 0.6111, + "step": 5879 + }, + { + "epoch": 0.48, + "grad_norm": 0.8871307714612382, + "learning_rate": 1.1200140078156293e-05, + "loss": 0.5414, + "step": 5880 + }, + { + "epoch": 0.48, + "grad_norm": 0.9470858331126085, + "learning_rate": 1.1197526591114306e-05, + "loss": 0.5387, + "step": 5881 + }, + { + "epoch": 0.48, + "grad_norm": 0.968646981582715, + "learning_rate": 1.119491302108479e-05, + "loss": 0.6078, + "step": 5882 + }, + { + "epoch": 0.48, + "grad_norm": 0.9271943209176021, + "learning_rate": 1.1192299368248858e-05, + "loss": 0.569, + "step": 5883 + }, + { + "epoch": 0.48, + "grad_norm": 0.899014706576815, + "learning_rate": 1.1189685632787638e-05, + "loss": 0.5318, + "step": 5884 + }, + { + "epoch": 0.48, + "grad_norm": 0.8912231576345668, + "learning_rate": 1.1187071814882262e-05, + "loss": 0.5458, + "step": 5885 + }, + { + "epoch": 0.48, + "grad_norm": 0.9244677150779027, + "learning_rate": 1.118445791471386e-05, + "loss": 0.5491, + "step": 5886 + }, + { + "epoch": 0.48, + "grad_norm": 0.7735443209972106, + "learning_rate": 1.1181843932463577e-05, + "loss": 0.4898, + "step": 5887 + }, + { + "epoch": 0.48, + "grad_norm": 0.8876592892316715, + "learning_rate": 1.1179229868312555e-05, + "loss": 0.4807, + "step": 5888 + }, + { + "epoch": 0.48, + "grad_norm": 0.8108220261166168, + "learning_rate": 1.117661572244195e-05, + "loss": 0.4873, + "step": 5889 + }, + { + "epoch": 0.48, + "grad_norm": 0.9695562013180646, + "learning_rate": 1.117400149503292e-05, + "loss": 0.6291, + "step": 5890 + }, + { + "epoch": 0.48, + "grad_norm": 1.0105840919169105, + "learning_rate": 1.1171387186266628e-05, + "loss": 0.6225, + "step": 5891 + }, + { + "epoch": 0.48, + "grad_norm": 0.8261616963980778, + "learning_rate": 1.1168772796324241e-05, + "loss": 0.4972, + "step": 5892 + }, + { + "epoch": 0.48, + "grad_norm": 0.938537542798799, + "learning_rate": 1.1166158325386938e-05, + "loss": 0.5204, + "step": 5893 + }, + { + "epoch": 0.48, + "grad_norm": 0.8614582593712691, + "learning_rate": 1.1163543773635896e-05, + "loss": 0.5719, + "step": 5894 + }, + { + "epoch": 0.48, + "grad_norm": 0.9374411897404901, + "learning_rate": 1.1160929141252303e-05, + "loss": 0.491, + "step": 5895 + }, + { + "epoch": 0.48, + "grad_norm": 0.9359501939977077, + "learning_rate": 1.115831442841735e-05, + "loss": 0.4821, + "step": 5896 + }, + { + "epoch": 0.48, + "grad_norm": 0.9385210570409233, + "learning_rate": 1.1155699635312235e-05, + "loss": 0.5466, + "step": 5897 + }, + { + "epoch": 0.48, + "grad_norm": 0.8988786435675817, + "learning_rate": 1.1153084762118163e-05, + "loss": 0.5296, + "step": 5898 + }, + { + "epoch": 0.48, + "grad_norm": 0.9405919058361396, + "learning_rate": 1.1150469809016336e-05, + "loss": 0.5517, + "step": 5899 + }, + { + "epoch": 0.48, + "grad_norm": 0.9164295167852962, + "learning_rate": 1.1147854776187973e-05, + "loss": 0.6041, + "step": 5900 + }, + { + "epoch": 0.48, + "grad_norm": 1.0104231088332736, + "learning_rate": 1.1145239663814291e-05, + "loss": 0.5689, + "step": 5901 + }, + { + "epoch": 0.48, + "grad_norm": 0.9624897296538513, + "learning_rate": 1.1142624472076518e-05, + "loss": 0.643, + "step": 5902 + }, + { + "epoch": 0.48, + "grad_norm": 0.9035371445616356, + "learning_rate": 1.1140009201155881e-05, + "loss": 0.5335, + "step": 5903 + }, + { + "epoch": 0.48, + "grad_norm": 0.8552416264952798, + "learning_rate": 1.1137393851233618e-05, + "loss": 0.5205, + "step": 5904 + }, + { + "epoch": 0.48, + "grad_norm": 0.8690331556336678, + "learning_rate": 1.1134778422490971e-05, + "loss": 0.5088, + "step": 5905 + }, + { + "epoch": 0.48, + "grad_norm": 0.8007198775976306, + "learning_rate": 1.1132162915109186e-05, + "loss": 0.5099, + "step": 5906 + }, + { + "epoch": 0.48, + "grad_norm": 0.8398136874931055, + "learning_rate": 1.1129547329269512e-05, + "loss": 0.4875, + "step": 5907 + }, + { + "epoch": 0.48, + "grad_norm": 0.968349450845184, + "learning_rate": 1.1126931665153213e-05, + "loss": 0.6041, + "step": 5908 + }, + { + "epoch": 0.48, + "grad_norm": 0.8786642078065504, + "learning_rate": 1.1124315922941549e-05, + "loss": 0.4978, + "step": 5909 + }, + { + "epoch": 0.48, + "grad_norm": 0.9614973758016057, + "learning_rate": 1.1121700102815787e-05, + "loss": 0.5669, + "step": 5910 + }, + { + "epoch": 0.48, + "grad_norm": 0.9050042577946916, + "learning_rate": 1.1119084204957204e-05, + "loss": 0.5286, + "step": 5911 + }, + { + "epoch": 0.48, + "grad_norm": 0.8748269695827618, + "learning_rate": 1.1116468229547079e-05, + "loss": 0.4821, + "step": 5912 + }, + { + "epoch": 0.48, + "grad_norm": 0.9758458472696362, + "learning_rate": 1.1113852176766695e-05, + "loss": 0.5561, + "step": 5913 + }, + { + "epoch": 0.48, + "grad_norm": 0.968997489808725, + "learning_rate": 1.1111236046797342e-05, + "loss": 0.5863, + "step": 5914 + }, + { + "epoch": 0.48, + "grad_norm": 0.8520026964412684, + "learning_rate": 1.110861983982032e-05, + "loss": 0.5155, + "step": 5915 + }, + { + "epoch": 0.48, + "grad_norm": 0.9705553198685964, + "learning_rate": 1.1106003556016924e-05, + "loss": 0.5326, + "step": 5916 + }, + { + "epoch": 0.48, + "grad_norm": 0.942824468226576, + "learning_rate": 1.1103387195568463e-05, + "loss": 0.5439, + "step": 5917 + }, + { + "epoch": 0.48, + "grad_norm": 1.002773840554435, + "learning_rate": 1.110077075865625e-05, + "loss": 0.6232, + "step": 5918 + }, + { + "epoch": 0.48, + "grad_norm": 0.9123206687630057, + "learning_rate": 1.1098154245461597e-05, + "loss": 0.5355, + "step": 5919 + }, + { + "epoch": 0.48, + "grad_norm": 0.8978286343392776, + "learning_rate": 1.109553765616583e-05, + "loss": 0.5299, + "step": 5920 + }, + { + "epoch": 0.48, + "grad_norm": 0.9972585079973012, + "learning_rate": 1.1092920990950276e-05, + "loss": 0.5836, + "step": 5921 + }, + { + "epoch": 0.48, + "grad_norm": 0.8419429273808501, + "learning_rate": 1.1090304249996264e-05, + "loss": 0.4881, + "step": 5922 + }, + { + "epoch": 0.48, + "grad_norm": 0.9619008015768903, + "learning_rate": 1.1087687433485135e-05, + "loss": 0.4746, + "step": 5923 + }, + { + "epoch": 0.48, + "grad_norm": 0.9965086736539188, + "learning_rate": 1.1085070541598235e-05, + "loss": 0.5737, + "step": 5924 + }, + { + "epoch": 0.48, + "grad_norm": 0.8223959891987548, + "learning_rate": 1.1082453574516907e-05, + "loss": 0.5213, + "step": 5925 + }, + { + "epoch": 0.48, + "grad_norm": 0.9194358573261691, + "learning_rate": 1.1079836532422505e-05, + "loss": 0.5685, + "step": 5926 + }, + { + "epoch": 0.48, + "grad_norm": 0.8433036241482647, + "learning_rate": 1.1077219415496391e-05, + "loss": 0.5119, + "step": 5927 + }, + { + "epoch": 0.48, + "grad_norm": 0.8369149997767066, + "learning_rate": 1.1074602223919925e-05, + "loss": 0.5315, + "step": 5928 + }, + { + "epoch": 0.48, + "grad_norm": 0.962608668637937, + "learning_rate": 1.107198495787448e-05, + "loss": 0.5197, + "step": 5929 + }, + { + "epoch": 0.48, + "grad_norm": 0.9299123924527506, + "learning_rate": 1.106936761754143e-05, + "loss": 0.4922, + "step": 5930 + }, + { + "epoch": 0.48, + "grad_norm": 0.9346185712137653, + "learning_rate": 1.1066750203102148e-05, + "loss": 0.5297, + "step": 5931 + }, + { + "epoch": 0.48, + "grad_norm": 0.8796465411194002, + "learning_rate": 1.1064132714738024e-05, + "loss": 0.5336, + "step": 5932 + }, + { + "epoch": 0.48, + "grad_norm": 0.9216539178960147, + "learning_rate": 1.1061515152630448e-05, + "loss": 0.6052, + "step": 5933 + }, + { + "epoch": 0.48, + "grad_norm": 0.9549392231390016, + "learning_rate": 1.1058897516960817e-05, + "loss": 0.6129, + "step": 5934 + }, + { + "epoch": 0.48, + "grad_norm": 0.9376318659921885, + "learning_rate": 1.1056279807910522e-05, + "loss": 0.6062, + "step": 5935 + }, + { + "epoch": 0.48, + "grad_norm": 0.9601200523129949, + "learning_rate": 1.1053662025660973e-05, + "loss": 0.4865, + "step": 5936 + }, + { + "epoch": 0.48, + "grad_norm": 0.9388283502974828, + "learning_rate": 1.1051044170393586e-05, + "loss": 0.5551, + "step": 5937 + }, + { + "epoch": 0.48, + "grad_norm": 0.9367217738098178, + "learning_rate": 1.1048426242289767e-05, + "loss": 0.5412, + "step": 5938 + }, + { + "epoch": 0.48, + "grad_norm": 0.9095242772845795, + "learning_rate": 1.1045808241530937e-05, + "loss": 0.5014, + "step": 5939 + }, + { + "epoch": 0.48, + "grad_norm": 0.9201435611696015, + "learning_rate": 1.1043190168298527e-05, + "loss": 0.5581, + "step": 5940 + }, + { + "epoch": 0.48, + "grad_norm": 0.942430264175165, + "learning_rate": 1.104057202277396e-05, + "loss": 0.5376, + "step": 5941 + }, + { + "epoch": 0.48, + "grad_norm": 0.9284963591597374, + "learning_rate": 1.1037953805138679e-05, + "loss": 0.5984, + "step": 5942 + }, + { + "epoch": 0.48, + "grad_norm": 0.9806126690798201, + "learning_rate": 1.103533551557412e-05, + "loss": 0.5994, + "step": 5943 + }, + { + "epoch": 0.48, + "grad_norm": 0.9852923229857014, + "learning_rate": 1.1032717154261725e-05, + "loss": 0.5086, + "step": 5944 + }, + { + "epoch": 0.48, + "grad_norm": 0.8857293632409518, + "learning_rate": 1.1030098721382947e-05, + "loss": 0.5406, + "step": 5945 + }, + { + "epoch": 0.48, + "grad_norm": 0.9089245246997715, + "learning_rate": 1.1027480217119245e-05, + "loss": 0.5156, + "step": 5946 + }, + { + "epoch": 0.48, + "grad_norm": 0.8422760311162406, + "learning_rate": 1.102486164165207e-05, + "loss": 0.5645, + "step": 5947 + }, + { + "epoch": 0.48, + "grad_norm": 0.8867799252180668, + "learning_rate": 1.1022242995162895e-05, + "loss": 0.5605, + "step": 5948 + }, + { + "epoch": 0.48, + "grad_norm": 0.8775600612421792, + "learning_rate": 1.1019624277833188e-05, + "loss": 0.6068, + "step": 5949 + }, + { + "epoch": 0.48, + "grad_norm": 0.919899703948138, + "learning_rate": 1.1017005489844424e-05, + "loss": 0.4996, + "step": 5950 + }, + { + "epoch": 0.48, + "grad_norm": 0.9793631542392197, + "learning_rate": 1.1014386631378079e-05, + "loss": 0.5301, + "step": 5951 + }, + { + "epoch": 0.48, + "grad_norm": 0.8502436939737742, + "learning_rate": 1.101176770261564e-05, + "loss": 0.5321, + "step": 5952 + }, + { + "epoch": 0.48, + "grad_norm": 0.9908848621988976, + "learning_rate": 1.1009148703738599e-05, + "loss": 0.5246, + "step": 5953 + }, + { + "epoch": 0.48, + "grad_norm": 0.8478840248580101, + "learning_rate": 1.1006529634928446e-05, + "loss": 0.503, + "step": 5954 + }, + { + "epoch": 0.48, + "grad_norm": 0.9149829104564555, + "learning_rate": 1.1003910496366683e-05, + "loss": 0.5766, + "step": 5955 + }, + { + "epoch": 0.48, + "grad_norm": 1.3735137099920995, + "learning_rate": 1.1001291288234812e-05, + "loss": 0.5335, + "step": 5956 + }, + { + "epoch": 0.48, + "grad_norm": 1.0109614737297976, + "learning_rate": 1.0998672010714344e-05, + "loss": 0.5876, + "step": 5957 + }, + { + "epoch": 0.48, + "grad_norm": 0.9525877879244522, + "learning_rate": 1.0996052663986791e-05, + "loss": 0.5306, + "step": 5958 + }, + { + "epoch": 0.48, + "grad_norm": 0.872468204745674, + "learning_rate": 1.0993433248233672e-05, + "loss": 0.5275, + "step": 5959 + }, + { + "epoch": 0.48, + "grad_norm": 0.9800315769595653, + "learning_rate": 1.0990813763636511e-05, + "loss": 0.5752, + "step": 5960 + }, + { + "epoch": 0.48, + "grad_norm": 0.9484721067744775, + "learning_rate": 1.0988194210376834e-05, + "loss": 0.5565, + "step": 5961 + }, + { + "epoch": 0.48, + "grad_norm": 0.970274479323285, + "learning_rate": 1.0985574588636174e-05, + "loss": 0.5795, + "step": 5962 + }, + { + "epoch": 0.48, + "grad_norm": 0.8814469628462033, + "learning_rate": 1.0982954898596072e-05, + "loss": 0.4921, + "step": 5963 + }, + { + "epoch": 0.48, + "grad_norm": 0.9531102160371265, + "learning_rate": 1.0980335140438066e-05, + "loss": 0.5659, + "step": 5964 + }, + { + "epoch": 0.48, + "grad_norm": 0.8076295124586604, + "learning_rate": 1.0977715314343702e-05, + "loss": 0.5268, + "step": 5965 + }, + { + "epoch": 0.48, + "grad_norm": 0.9189308585416595, + "learning_rate": 1.0975095420494537e-05, + "loss": 0.529, + "step": 5966 + }, + { + "epoch": 0.48, + "grad_norm": 0.8835362326913786, + "learning_rate": 1.0972475459072124e-05, + "loss": 0.5095, + "step": 5967 + }, + { + "epoch": 0.49, + "grad_norm": 0.9200688922262339, + "learning_rate": 1.0969855430258022e-05, + "loss": 0.5085, + "step": 5968 + }, + { + "epoch": 0.49, + "grad_norm": 1.0103589244278846, + "learning_rate": 1.0967235334233802e-05, + "loss": 0.6143, + "step": 5969 + }, + { + "epoch": 0.49, + "grad_norm": 0.8747556366743678, + "learning_rate": 1.096461517118103e-05, + "loss": 0.5439, + "step": 5970 + }, + { + "epoch": 0.49, + "grad_norm": 0.8383655597267304, + "learning_rate": 1.096199494128128e-05, + "loss": 0.5071, + "step": 5971 + }, + { + "epoch": 0.49, + "grad_norm": 0.9352976550371722, + "learning_rate": 1.0959374644716137e-05, + "loss": 0.5272, + "step": 5972 + }, + { + "epoch": 0.49, + "grad_norm": 0.9736793571068132, + "learning_rate": 1.0956754281667182e-05, + "loss": 0.5717, + "step": 5973 + }, + { + "epoch": 0.49, + "grad_norm": 0.8320012054898668, + "learning_rate": 1.0954133852316003e-05, + "loss": 0.4946, + "step": 5974 + }, + { + "epoch": 0.49, + "grad_norm": 0.9549832251089269, + "learning_rate": 1.0951513356844192e-05, + "loss": 0.5193, + "step": 5975 + }, + { + "epoch": 0.49, + "grad_norm": 0.7905276524529273, + "learning_rate": 1.0948892795433353e-05, + "loss": 0.4973, + "step": 5976 + }, + { + "epoch": 0.49, + "grad_norm": 0.9168115233955988, + "learning_rate": 1.0946272168265081e-05, + "loss": 0.5515, + "step": 5977 + }, + { + "epoch": 0.49, + "grad_norm": 0.9139760132751583, + "learning_rate": 1.094365147552099e-05, + "loss": 0.5441, + "step": 5978 + }, + { + "epoch": 0.49, + "grad_norm": 0.9405480838687121, + "learning_rate": 1.094103071738269e-05, + "loss": 0.5967, + "step": 5979 + }, + { + "epoch": 0.49, + "grad_norm": 0.8594264268924838, + "learning_rate": 1.0938409894031793e-05, + "loss": 0.5401, + "step": 5980 + }, + { + "epoch": 0.49, + "grad_norm": 0.8756876468757003, + "learning_rate": 1.0935789005649924e-05, + "loss": 0.5171, + "step": 5981 + }, + { + "epoch": 0.49, + "grad_norm": 0.8974310052441253, + "learning_rate": 1.0933168052418708e-05, + "loss": 0.5579, + "step": 5982 + }, + { + "epoch": 0.49, + "grad_norm": 0.9251336556455075, + "learning_rate": 1.0930547034519772e-05, + "loss": 0.5695, + "step": 5983 + }, + { + "epoch": 0.49, + "grad_norm": 0.9314886487604079, + "learning_rate": 1.0927925952134753e-05, + "loss": 0.5214, + "step": 5984 + }, + { + "epoch": 0.49, + "grad_norm": 0.869125763237957, + "learning_rate": 1.092530480544529e-05, + "loss": 0.501, + "step": 5985 + }, + { + "epoch": 0.49, + "grad_norm": 0.915804992331489, + "learning_rate": 1.092268359463302e-05, + "loss": 0.5217, + "step": 5986 + }, + { + "epoch": 0.49, + "grad_norm": 0.8962623903964275, + "learning_rate": 1.0920062319879599e-05, + "loss": 0.5452, + "step": 5987 + }, + { + "epoch": 0.49, + "grad_norm": 0.8486741066180117, + "learning_rate": 1.0917440981366677e-05, + "loss": 0.5229, + "step": 5988 + }, + { + "epoch": 0.49, + "grad_norm": 0.9043838061637083, + "learning_rate": 1.0914819579275903e-05, + "loss": 0.5257, + "step": 5989 + }, + { + "epoch": 0.49, + "grad_norm": 0.8980008090414996, + "learning_rate": 1.0912198113788947e-05, + "loss": 0.539, + "step": 5990 + }, + { + "epoch": 0.49, + "grad_norm": 0.8991497597361532, + "learning_rate": 1.0909576585087472e-05, + "loss": 0.5007, + "step": 5991 + }, + { + "epoch": 0.49, + "grad_norm": 0.8883674978924093, + "learning_rate": 1.0906954993353145e-05, + "loss": 0.5107, + "step": 5992 + }, + { + "epoch": 0.49, + "grad_norm": 0.8998564171217721, + "learning_rate": 1.0904333338767641e-05, + "loss": 0.5625, + "step": 5993 + }, + { + "epoch": 0.49, + "grad_norm": 0.8363802061981552, + "learning_rate": 1.090171162151264e-05, + "loss": 0.5212, + "step": 5994 + }, + { + "epoch": 0.49, + "grad_norm": 0.9017747043223552, + "learning_rate": 1.0899089841769824e-05, + "loss": 0.5528, + "step": 5995 + }, + { + "epoch": 0.49, + "grad_norm": 0.9231025436302872, + "learning_rate": 1.0896467999720876e-05, + "loss": 0.5525, + "step": 5996 + }, + { + "epoch": 0.49, + "grad_norm": 0.8912850378764711, + "learning_rate": 1.0893846095547493e-05, + "loss": 0.5891, + "step": 5997 + }, + { + "epoch": 0.49, + "grad_norm": 0.9500971063396939, + "learning_rate": 1.0891224129431368e-05, + "loss": 0.5333, + "step": 5998 + }, + { + "epoch": 0.49, + "grad_norm": 0.8674435355298813, + "learning_rate": 1.0888602101554202e-05, + "loss": 0.4728, + "step": 5999 + }, + { + "epoch": 0.49, + "grad_norm": 0.8477710168689605, + "learning_rate": 1.0885980012097698e-05, + "loss": 0.4872, + "step": 6000 + }, + { + "epoch": 0.49, + "grad_norm": 0.9681405308198463, + "learning_rate": 1.0883357861243567e-05, + "loss": 0.5292, + "step": 6001 + }, + { + "epoch": 0.49, + "grad_norm": 0.8159825384348709, + "learning_rate": 1.0880735649173518e-05, + "loss": 0.5195, + "step": 6002 + }, + { + "epoch": 0.49, + "grad_norm": 0.8594649461848198, + "learning_rate": 1.0878113376069268e-05, + "loss": 0.5064, + "step": 6003 + }, + { + "epoch": 0.49, + "grad_norm": 0.8755507296784738, + "learning_rate": 1.0875491042112543e-05, + "loss": 0.4985, + "step": 6004 + }, + { + "epoch": 0.49, + "grad_norm": 1.1053674630111627, + "learning_rate": 1.0872868647485064e-05, + "loss": 0.5157, + "step": 6005 + }, + { + "epoch": 0.49, + "grad_norm": 0.8650612305952349, + "learning_rate": 1.087024619236856e-05, + "loss": 0.5791, + "step": 6006 + }, + { + "epoch": 0.49, + "grad_norm": 0.9342489013084267, + "learning_rate": 1.0867623676944771e-05, + "loss": 0.5481, + "step": 6007 + }, + { + "epoch": 0.49, + "grad_norm": 0.8953188110291666, + "learning_rate": 1.0865001101395429e-05, + "loss": 0.4963, + "step": 6008 + }, + { + "epoch": 0.49, + "grad_norm": 0.9266589925253965, + "learning_rate": 1.0862378465902276e-05, + "loss": 0.5699, + "step": 6009 + }, + { + "epoch": 0.49, + "grad_norm": 0.876106966122857, + "learning_rate": 1.0859755770647063e-05, + "loss": 0.5398, + "step": 6010 + }, + { + "epoch": 0.49, + "grad_norm": 0.9158713362331197, + "learning_rate": 1.0857133015811537e-05, + "loss": 0.5676, + "step": 6011 + }, + { + "epoch": 0.49, + "grad_norm": 0.8439366191962501, + "learning_rate": 1.0854510201577451e-05, + "loss": 0.4834, + "step": 6012 + }, + { + "epoch": 0.49, + "grad_norm": 0.9129278894147207, + "learning_rate": 1.0851887328126569e-05, + "loss": 0.4796, + "step": 6013 + }, + { + "epoch": 0.49, + "grad_norm": 0.9651114390249649, + "learning_rate": 1.084926439564065e-05, + "loss": 0.519, + "step": 6014 + }, + { + "epoch": 0.49, + "grad_norm": 0.9133574311556374, + "learning_rate": 1.084664140430146e-05, + "loss": 0.569, + "step": 6015 + }, + { + "epoch": 0.49, + "grad_norm": 0.9137297031652001, + "learning_rate": 1.0844018354290776e-05, + "loss": 0.5623, + "step": 6016 + }, + { + "epoch": 0.49, + "grad_norm": 0.8509472903986172, + "learning_rate": 1.0841395245790363e-05, + "loss": 0.5037, + "step": 6017 + }, + { + "epoch": 0.49, + "grad_norm": 0.9168976366504523, + "learning_rate": 1.0838772078982008e-05, + "loss": 0.5698, + "step": 6018 + }, + { + "epoch": 0.49, + "grad_norm": 0.902456982349151, + "learning_rate": 1.0836148854047494e-05, + "loss": 0.5128, + "step": 6019 + }, + { + "epoch": 0.49, + "grad_norm": 0.922315991787359, + "learning_rate": 1.0833525571168603e-05, + "loss": 0.4852, + "step": 6020 + }, + { + "epoch": 0.49, + "grad_norm": 0.9847818345055375, + "learning_rate": 1.0830902230527129e-05, + "loss": 0.5451, + "step": 6021 + }, + { + "epoch": 0.49, + "grad_norm": 0.9403531190902872, + "learning_rate": 1.082827883230487e-05, + "loss": 0.5087, + "step": 6022 + }, + { + "epoch": 0.49, + "grad_norm": 0.9536798293761942, + "learning_rate": 1.0825655376683621e-05, + "loss": 0.5691, + "step": 6023 + }, + { + "epoch": 0.49, + "grad_norm": 0.8687923617162312, + "learning_rate": 1.0823031863845189e-05, + "loss": 0.5028, + "step": 6024 + }, + { + "epoch": 0.49, + "grad_norm": 0.9344505134076202, + "learning_rate": 1.082040829397138e-05, + "loss": 0.5811, + "step": 6025 + }, + { + "epoch": 0.49, + "grad_norm": 1.0892474648248411, + "learning_rate": 1.0817784667243998e-05, + "loss": 0.5767, + "step": 6026 + }, + { + "epoch": 0.49, + "grad_norm": 0.9631439527676332, + "learning_rate": 1.0815160983844865e-05, + "loss": 0.5175, + "step": 6027 + }, + { + "epoch": 0.49, + "grad_norm": 0.9986721453630223, + "learning_rate": 1.0812537243955804e-05, + "loss": 0.5998, + "step": 6028 + }, + { + "epoch": 0.49, + "grad_norm": 0.9578778424630721, + "learning_rate": 1.0809913447758628e-05, + "loss": 0.5775, + "step": 6029 + }, + { + "epoch": 0.49, + "grad_norm": 0.9161777185614014, + "learning_rate": 1.080728959543517e-05, + "loss": 0.5776, + "step": 6030 + }, + { + "epoch": 0.49, + "grad_norm": 0.9106523123320278, + "learning_rate": 1.0804665687167262e-05, + "loss": 0.5458, + "step": 6031 + }, + { + "epoch": 0.49, + "grad_norm": 0.8102666418535739, + "learning_rate": 1.0802041723136731e-05, + "loss": 0.5212, + "step": 6032 + }, + { + "epoch": 0.49, + "grad_norm": 0.9620209665623649, + "learning_rate": 1.079941770352542e-05, + "loss": 0.5381, + "step": 6033 + }, + { + "epoch": 0.49, + "grad_norm": 0.9765426156604666, + "learning_rate": 1.0796793628515176e-05, + "loss": 0.5639, + "step": 6034 + }, + { + "epoch": 0.49, + "grad_norm": 0.9410479105661026, + "learning_rate": 1.0794169498287837e-05, + "loss": 0.5848, + "step": 6035 + }, + { + "epoch": 0.49, + "grad_norm": 0.9258676956045075, + "learning_rate": 1.0791545313025255e-05, + "loss": 0.5677, + "step": 6036 + }, + { + "epoch": 0.49, + "grad_norm": 0.9773392276289621, + "learning_rate": 1.078892107290929e-05, + "loss": 0.634, + "step": 6037 + }, + { + "epoch": 0.49, + "grad_norm": 0.9256631034571904, + "learning_rate": 1.0786296778121787e-05, + "loss": 0.5795, + "step": 6038 + }, + { + "epoch": 0.49, + "grad_norm": 0.933137084813143, + "learning_rate": 1.078367242884462e-05, + "loss": 0.5872, + "step": 6039 + }, + { + "epoch": 0.49, + "grad_norm": 0.9953572621075247, + "learning_rate": 1.0781048025259648e-05, + "loss": 0.6192, + "step": 6040 + }, + { + "epoch": 0.49, + "grad_norm": 1.0050752288603593, + "learning_rate": 1.0778423567548739e-05, + "loss": 0.5837, + "step": 6041 + }, + { + "epoch": 0.49, + "grad_norm": 0.8950958764206185, + "learning_rate": 1.0775799055893768e-05, + "loss": 0.5146, + "step": 6042 + }, + { + "epoch": 0.49, + "grad_norm": 0.9012573457259359, + "learning_rate": 1.0773174490476613e-05, + "loss": 0.5663, + "step": 6043 + }, + { + "epoch": 0.49, + "grad_norm": 0.9568547546834592, + "learning_rate": 1.0770549871479149e-05, + "loss": 0.5659, + "step": 6044 + }, + { + "epoch": 0.49, + "grad_norm": 0.921074163124563, + "learning_rate": 1.0767925199083262e-05, + "loss": 0.5878, + "step": 6045 + }, + { + "epoch": 0.49, + "grad_norm": 0.8790758009068457, + "learning_rate": 1.0765300473470841e-05, + "loss": 0.4825, + "step": 6046 + }, + { + "epoch": 0.49, + "grad_norm": 0.9692177897991965, + "learning_rate": 1.0762675694823777e-05, + "loss": 0.5294, + "step": 6047 + }, + { + "epoch": 0.49, + "grad_norm": 0.7912159684386474, + "learning_rate": 1.0760050863323961e-05, + "loss": 0.4587, + "step": 6048 + }, + { + "epoch": 0.49, + "grad_norm": 0.902387535550519, + "learning_rate": 1.0757425979153297e-05, + "loss": 0.5185, + "step": 6049 + }, + { + "epoch": 0.49, + "grad_norm": 0.90900599928075, + "learning_rate": 1.0754801042493683e-05, + "loss": 0.5249, + "step": 6050 + }, + { + "epoch": 0.49, + "grad_norm": 0.9150820410769867, + "learning_rate": 1.0752176053527025e-05, + "loss": 0.5712, + "step": 6051 + }, + { + "epoch": 0.49, + "grad_norm": 0.9762071829758882, + "learning_rate": 1.0749551012435237e-05, + "loss": 0.573, + "step": 6052 + }, + { + "epoch": 0.49, + "grad_norm": 0.8573612056669551, + "learning_rate": 1.0746925919400226e-05, + "loss": 0.4773, + "step": 6053 + }, + { + "epoch": 0.49, + "grad_norm": 0.911707052115374, + "learning_rate": 1.0744300774603914e-05, + "loss": 0.5026, + "step": 6054 + }, + { + "epoch": 0.49, + "grad_norm": 0.9979792020943502, + "learning_rate": 1.0741675578228216e-05, + "loss": 0.6148, + "step": 6055 + }, + { + "epoch": 0.49, + "grad_norm": 0.9553815644356218, + "learning_rate": 1.073905033045506e-05, + "loss": 0.5726, + "step": 6056 + }, + { + "epoch": 0.49, + "grad_norm": 0.9136612647564725, + "learning_rate": 1.0736425031466369e-05, + "loss": 0.5387, + "step": 6057 + }, + { + "epoch": 0.49, + "grad_norm": 0.973117546702641, + "learning_rate": 1.0733799681444077e-05, + "loss": 0.5942, + "step": 6058 + }, + { + "epoch": 0.49, + "grad_norm": 0.9243498996981262, + "learning_rate": 1.073117428057012e-05, + "loss": 0.5464, + "step": 6059 + }, + { + "epoch": 0.49, + "grad_norm": 0.9654957940954922, + "learning_rate": 1.0728548829026433e-05, + "loss": 0.5123, + "step": 6060 + }, + { + "epoch": 0.49, + "grad_norm": 0.8545999154255652, + "learning_rate": 1.0725923326994958e-05, + "loss": 0.502, + "step": 6061 + }, + { + "epoch": 0.49, + "grad_norm": 0.9459939415916986, + "learning_rate": 1.0723297774657642e-05, + "loss": 0.6115, + "step": 6062 + }, + { + "epoch": 0.49, + "grad_norm": 0.8484583387652496, + "learning_rate": 1.0720672172196432e-05, + "loss": 0.5328, + "step": 6063 + }, + { + "epoch": 0.49, + "grad_norm": 0.827531492547263, + "learning_rate": 1.0718046519793276e-05, + "loss": 0.4541, + "step": 6064 + }, + { + "epoch": 0.49, + "grad_norm": 0.8574412032251646, + "learning_rate": 1.0715420817630137e-05, + "loss": 0.4967, + "step": 6065 + }, + { + "epoch": 0.49, + "grad_norm": 0.8550222014048962, + "learning_rate": 1.0712795065888968e-05, + "loss": 0.4633, + "step": 6066 + }, + { + "epoch": 0.49, + "grad_norm": 0.9021045314787502, + "learning_rate": 1.0710169264751733e-05, + "loss": 0.5815, + "step": 6067 + }, + { + "epoch": 0.49, + "grad_norm": 0.9894222824218805, + "learning_rate": 1.0707543414400398e-05, + "loss": 0.528, + "step": 6068 + }, + { + "epoch": 0.49, + "grad_norm": 0.9728184096575895, + "learning_rate": 1.0704917515016933e-05, + "loss": 0.5035, + "step": 6069 + }, + { + "epoch": 0.49, + "grad_norm": 0.9641583065696216, + "learning_rate": 1.0702291566783307e-05, + "loss": 0.5558, + "step": 6070 + }, + { + "epoch": 0.49, + "grad_norm": 0.8391781018752039, + "learning_rate": 1.0699665569881503e-05, + "loss": 0.5121, + "step": 6071 + }, + { + "epoch": 0.49, + "grad_norm": 0.8621647843592861, + "learning_rate": 1.0697039524493492e-05, + "loss": 0.5277, + "step": 6072 + }, + { + "epoch": 0.49, + "grad_norm": 0.7567533685287966, + "learning_rate": 1.069441343080126e-05, + "loss": 0.4858, + "step": 6073 + }, + { + "epoch": 0.49, + "grad_norm": 0.8648137761316245, + "learning_rate": 1.0691787288986795e-05, + "loss": 0.532, + "step": 6074 + }, + { + "epoch": 0.49, + "grad_norm": 0.9042486379215297, + "learning_rate": 1.0689161099232084e-05, + "loss": 0.526, + "step": 6075 + }, + { + "epoch": 0.49, + "grad_norm": 0.906851550856305, + "learning_rate": 1.0686534861719118e-05, + "loss": 0.5236, + "step": 6076 + }, + { + "epoch": 0.49, + "grad_norm": 0.9131446706717494, + "learning_rate": 1.06839085766299e-05, + "loss": 0.4741, + "step": 6077 + }, + { + "epoch": 0.49, + "grad_norm": 0.8710047907608442, + "learning_rate": 1.068128224414642e-05, + "loss": 0.5528, + "step": 6078 + }, + { + "epoch": 0.49, + "grad_norm": 0.9268905719223248, + "learning_rate": 1.0678655864450684e-05, + "loss": 0.5913, + "step": 6079 + }, + { + "epoch": 0.49, + "grad_norm": 0.9435191680227256, + "learning_rate": 1.0676029437724703e-05, + "loss": 0.5458, + "step": 6080 + }, + { + "epoch": 0.49, + "grad_norm": 0.8724092493981516, + "learning_rate": 1.0673402964150479e-05, + "loss": 0.5784, + "step": 6081 + }, + { + "epoch": 0.49, + "grad_norm": 0.8601665421495636, + "learning_rate": 1.0670776443910024e-05, + "loss": 0.5473, + "step": 6082 + }, + { + "epoch": 0.49, + "grad_norm": 0.9680710017013635, + "learning_rate": 1.0668149877185361e-05, + "loss": 0.6032, + "step": 6083 + }, + { + "epoch": 0.49, + "grad_norm": 0.9492639664421051, + "learning_rate": 1.0665523264158501e-05, + "loss": 0.5389, + "step": 6084 + }, + { + "epoch": 0.49, + "grad_norm": 0.8455152863436715, + "learning_rate": 1.0662896605011472e-05, + "loss": 0.566, + "step": 6085 + }, + { + "epoch": 0.49, + "grad_norm": 0.85243477804133, + "learning_rate": 1.0660269899926296e-05, + "loss": 0.5244, + "step": 6086 + }, + { + "epoch": 0.49, + "grad_norm": 0.9435897995784096, + "learning_rate": 1.0657643149084999e-05, + "loss": 0.5268, + "step": 6087 + }, + { + "epoch": 0.49, + "grad_norm": 0.9004172884401473, + "learning_rate": 1.0655016352669616e-05, + "loss": 0.5125, + "step": 6088 + }, + { + "epoch": 0.49, + "grad_norm": 0.8696831235448563, + "learning_rate": 1.0652389510862182e-05, + "loss": 0.4969, + "step": 6089 + }, + { + "epoch": 0.49, + "grad_norm": 0.8757254452087241, + "learning_rate": 1.0649762623844733e-05, + "loss": 0.5422, + "step": 6090 + }, + { + "epoch": 0.5, + "grad_norm": 0.8533339193252824, + "learning_rate": 1.064713569179931e-05, + "loss": 0.49, + "step": 6091 + }, + { + "epoch": 0.5, + "grad_norm": 0.929141233600302, + "learning_rate": 1.064450871490796e-05, + "loss": 0.5793, + "step": 6092 + }, + { + "epoch": 0.5, + "grad_norm": 0.9853060175197331, + "learning_rate": 1.0641881693352724e-05, + "loss": 0.5836, + "step": 6093 + }, + { + "epoch": 0.5, + "grad_norm": 0.8969091374661617, + "learning_rate": 1.0639254627315658e-05, + "loss": 0.572, + "step": 6094 + }, + { + "epoch": 0.5, + "grad_norm": 0.9445842218624012, + "learning_rate": 1.0636627516978815e-05, + "loss": 0.5491, + "step": 6095 + }, + { + "epoch": 0.5, + "grad_norm": 0.8521417788675616, + "learning_rate": 1.0634000362524247e-05, + "loss": 0.4567, + "step": 6096 + }, + { + "epoch": 0.5, + "grad_norm": 0.8243800789666074, + "learning_rate": 1.0631373164134015e-05, + "loss": 0.5237, + "step": 6097 + }, + { + "epoch": 0.5, + "grad_norm": 0.8972677822447758, + "learning_rate": 1.0628745921990184e-05, + "loss": 0.5347, + "step": 6098 + }, + { + "epoch": 0.5, + "grad_norm": 0.9096471053000708, + "learning_rate": 1.062611863627482e-05, + "loss": 0.5355, + "step": 6099 + }, + { + "epoch": 0.5, + "grad_norm": 0.8451782292679187, + "learning_rate": 1.062349130716999e-05, + "loss": 0.4855, + "step": 6100 + }, + { + "epoch": 0.5, + "grad_norm": 0.8535857129649261, + "learning_rate": 1.0620863934857764e-05, + "loss": 0.5551, + "step": 6101 + }, + { + "epoch": 0.5, + "grad_norm": 0.816157477591498, + "learning_rate": 1.0618236519520219e-05, + "loss": 0.501, + "step": 6102 + }, + { + "epoch": 0.5, + "grad_norm": 1.0310068024642354, + "learning_rate": 1.0615609061339431e-05, + "loss": 0.6114, + "step": 6103 + }, + { + "epoch": 0.5, + "grad_norm": 0.9423701264828378, + "learning_rate": 1.061298156049748e-05, + "loss": 0.5538, + "step": 6104 + }, + { + "epoch": 0.5, + "grad_norm": 0.8732552391809214, + "learning_rate": 1.061035401717645e-05, + "loss": 0.5382, + "step": 6105 + }, + { + "epoch": 0.5, + "grad_norm": 0.8797775091115982, + "learning_rate": 1.0607726431558431e-05, + "loss": 0.5089, + "step": 6106 + }, + { + "epoch": 0.5, + "grad_norm": 0.9400596223756281, + "learning_rate": 1.060509880382551e-05, + "loss": 0.5566, + "step": 6107 + }, + { + "epoch": 0.5, + "grad_norm": 0.8467700720411315, + "learning_rate": 1.0602471134159773e-05, + "loss": 0.5317, + "step": 6108 + }, + { + "epoch": 0.5, + "grad_norm": 0.822881476771488, + "learning_rate": 1.0599843422743328e-05, + "loss": 0.505, + "step": 6109 + }, + { + "epoch": 0.5, + "grad_norm": 0.945363657168463, + "learning_rate": 1.059721566975826e-05, + "loss": 0.5576, + "step": 6110 + }, + { + "epoch": 0.5, + "grad_norm": 0.8550973491019075, + "learning_rate": 1.0594587875386677e-05, + "loss": 0.4685, + "step": 6111 + }, + { + "epoch": 0.5, + "grad_norm": 0.8978924311350094, + "learning_rate": 1.0591960039810684e-05, + "loss": 0.5041, + "step": 6112 + }, + { + "epoch": 0.5, + "grad_norm": 0.9659817672441282, + "learning_rate": 1.0589332163212384e-05, + "loss": 0.5497, + "step": 6113 + }, + { + "epoch": 0.5, + "grad_norm": 0.9426550282970327, + "learning_rate": 1.0586704245773886e-05, + "loss": 0.5407, + "step": 6114 + }, + { + "epoch": 0.5, + "grad_norm": 0.9234953615378141, + "learning_rate": 1.0584076287677307e-05, + "loss": 0.6009, + "step": 6115 + }, + { + "epoch": 0.5, + "grad_norm": 0.8702469617913521, + "learning_rate": 1.0581448289104759e-05, + "loss": 0.493, + "step": 6116 + }, + { + "epoch": 0.5, + "grad_norm": 0.9994597487559939, + "learning_rate": 1.057882025023836e-05, + "loss": 0.5883, + "step": 6117 + }, + { + "epoch": 0.5, + "grad_norm": 0.918411318404071, + "learning_rate": 1.0576192171260228e-05, + "loss": 0.5317, + "step": 6118 + }, + { + "epoch": 0.5, + "grad_norm": 0.9373647802629542, + "learning_rate": 1.0573564052352496e-05, + "loss": 0.5399, + "step": 6119 + }, + { + "epoch": 0.5, + "grad_norm": 0.8050241018619839, + "learning_rate": 1.0570935893697278e-05, + "loss": 0.4442, + "step": 6120 + }, + { + "epoch": 0.5, + "grad_norm": 0.9022679662284697, + "learning_rate": 1.0568307695476712e-05, + "loss": 0.5033, + "step": 6121 + }, + { + "epoch": 0.5, + "grad_norm": 0.8846901204793555, + "learning_rate": 1.0565679457872928e-05, + "loss": 0.5182, + "step": 6122 + }, + { + "epoch": 0.5, + "grad_norm": 1.0059532477165203, + "learning_rate": 1.0563051181068056e-05, + "loss": 0.5464, + "step": 6123 + }, + { + "epoch": 0.5, + "grad_norm": 0.9327245880379895, + "learning_rate": 1.0560422865244237e-05, + "loss": 0.5133, + "step": 6124 + }, + { + "epoch": 0.5, + "grad_norm": 0.8496071103162126, + "learning_rate": 1.0557794510583611e-05, + "loss": 0.4938, + "step": 6125 + }, + { + "epoch": 0.5, + "grad_norm": 0.8298240451208877, + "learning_rate": 1.0555166117268322e-05, + "loss": 0.5385, + "step": 6126 + }, + { + "epoch": 0.5, + "grad_norm": 0.934187978424719, + "learning_rate": 1.0552537685480512e-05, + "loss": 0.5143, + "step": 6127 + }, + { + "epoch": 0.5, + "grad_norm": 0.8674548058152174, + "learning_rate": 1.054990921540233e-05, + "loss": 0.5794, + "step": 6128 + }, + { + "epoch": 0.5, + "grad_norm": 1.0370693329998053, + "learning_rate": 1.054728070721593e-05, + "loss": 0.5977, + "step": 6129 + }, + { + "epoch": 0.5, + "grad_norm": 0.8819244109215885, + "learning_rate": 1.0544652161103459e-05, + "loss": 0.5054, + "step": 6130 + }, + { + "epoch": 0.5, + "grad_norm": 0.8960578144446292, + "learning_rate": 1.0542023577247076e-05, + "loss": 0.545, + "step": 6131 + }, + { + "epoch": 0.5, + "grad_norm": 0.886580731545749, + "learning_rate": 1.0539394955828944e-05, + "loss": 0.5429, + "step": 6132 + }, + { + "epoch": 0.5, + "grad_norm": 0.9374761540391363, + "learning_rate": 1.0536766297031216e-05, + "loss": 0.6374, + "step": 6133 + }, + { + "epoch": 0.5, + "grad_norm": 0.9103004242142594, + "learning_rate": 1.053413760103606e-05, + "loss": 0.5159, + "step": 6134 + }, + { + "epoch": 0.5, + "grad_norm": 0.8968055272655576, + "learning_rate": 1.0531508868025647e-05, + "loss": 0.5283, + "step": 6135 + }, + { + "epoch": 0.5, + "grad_norm": 0.970055594503045, + "learning_rate": 1.0528880098182136e-05, + "loss": 0.5435, + "step": 6136 + }, + { + "epoch": 0.5, + "grad_norm": 0.9389369412572518, + "learning_rate": 1.0526251291687703e-05, + "loss": 0.5747, + "step": 6137 + }, + { + "epoch": 0.5, + "grad_norm": 0.8078187197158875, + "learning_rate": 1.0523622448724524e-05, + "loss": 0.5405, + "step": 6138 + }, + { + "epoch": 0.5, + "grad_norm": 0.9190980092900863, + "learning_rate": 1.0520993569474773e-05, + "loss": 0.5321, + "step": 6139 + }, + { + "epoch": 0.5, + "grad_norm": 0.958215322494185, + "learning_rate": 1.051836465412063e-05, + "loss": 0.5699, + "step": 6140 + }, + { + "epoch": 0.5, + "grad_norm": 0.8721689328952731, + "learning_rate": 1.051573570284428e-05, + "loss": 0.6109, + "step": 6141 + }, + { + "epoch": 0.5, + "grad_norm": 1.0429424389942195, + "learning_rate": 1.0513106715827897e-05, + "loss": 0.5949, + "step": 6142 + }, + { + "epoch": 0.5, + "grad_norm": 0.8826173376580875, + "learning_rate": 1.0510477693253676e-05, + "loss": 0.4822, + "step": 6143 + }, + { + "epoch": 0.5, + "grad_norm": 0.9558074179782857, + "learning_rate": 1.0507848635303805e-05, + "loss": 0.5409, + "step": 6144 + }, + { + "epoch": 0.5, + "grad_norm": 1.0047521941779742, + "learning_rate": 1.0505219542160474e-05, + "loss": 0.5804, + "step": 6145 + }, + { + "epoch": 0.5, + "grad_norm": 1.0319590843517332, + "learning_rate": 1.0502590414005875e-05, + "loss": 0.5394, + "step": 6146 + }, + { + "epoch": 0.5, + "grad_norm": 0.8462689043839046, + "learning_rate": 1.0499961251022208e-05, + "loss": 0.5235, + "step": 6147 + }, + { + "epoch": 0.5, + "grad_norm": 0.9645087832836281, + "learning_rate": 1.049733205339167e-05, + "loss": 0.5744, + "step": 6148 + }, + { + "epoch": 0.5, + "grad_norm": 0.9931263205618819, + "learning_rate": 1.0494702821296458e-05, + "loss": 0.5615, + "step": 6149 + }, + { + "epoch": 0.5, + "grad_norm": 0.8977359164608016, + "learning_rate": 1.0492073554918782e-05, + "loss": 0.5124, + "step": 6150 + }, + { + "epoch": 0.5, + "grad_norm": 0.9580917800214114, + "learning_rate": 1.0489444254440846e-05, + "loss": 0.556, + "step": 6151 + }, + { + "epoch": 0.5, + "grad_norm": 0.9868313720168621, + "learning_rate": 1.0486814920044857e-05, + "loss": 0.6069, + "step": 6152 + }, + { + "epoch": 0.5, + "grad_norm": 0.9786912759940932, + "learning_rate": 1.0484185551913027e-05, + "loss": 0.5963, + "step": 6153 + }, + { + "epoch": 0.5, + "grad_norm": 0.8802270262465605, + "learning_rate": 1.0481556150227562e-05, + "loss": 0.4737, + "step": 6154 + }, + { + "epoch": 0.5, + "grad_norm": 0.9904658437694336, + "learning_rate": 1.0478926715170687e-05, + "loss": 0.5483, + "step": 6155 + }, + { + "epoch": 0.5, + "grad_norm": 0.922169756232924, + "learning_rate": 1.0476297246924619e-05, + "loss": 0.5358, + "step": 6156 + }, + { + "epoch": 0.5, + "grad_norm": 0.9532812373990939, + "learning_rate": 1.047366774567157e-05, + "loss": 0.4799, + "step": 6157 + }, + { + "epoch": 0.5, + "grad_norm": 0.9210650459076102, + "learning_rate": 1.0471038211593764e-05, + "loss": 0.5301, + "step": 6158 + }, + { + "epoch": 0.5, + "grad_norm": 0.7225895551992169, + "learning_rate": 1.0468408644873433e-05, + "loss": 0.4377, + "step": 6159 + }, + { + "epoch": 0.5, + "grad_norm": 0.9545247474276838, + "learning_rate": 1.0465779045692796e-05, + "loss": 0.564, + "step": 6160 + }, + { + "epoch": 0.5, + "grad_norm": 0.9428918239243667, + "learning_rate": 1.0463149414234084e-05, + "loss": 0.5187, + "step": 6161 + }, + { + "epoch": 0.5, + "grad_norm": 0.9060269955149981, + "learning_rate": 1.046051975067953e-05, + "loss": 0.5088, + "step": 6162 + }, + { + "epoch": 0.5, + "grad_norm": 0.9264511973693403, + "learning_rate": 1.0457890055211364e-05, + "loss": 0.5766, + "step": 6163 + }, + { + "epoch": 0.5, + "grad_norm": 0.899604789095249, + "learning_rate": 1.0455260328011822e-05, + "loss": 0.516, + "step": 6164 + }, + { + "epoch": 0.5, + "grad_norm": 0.9773994581290497, + "learning_rate": 1.0452630569263147e-05, + "loss": 0.6503, + "step": 6165 + }, + { + "epoch": 0.5, + "grad_norm": 0.9720121111533556, + "learning_rate": 1.0450000779147573e-05, + "loss": 0.5527, + "step": 6166 + }, + { + "epoch": 0.5, + "grad_norm": 0.9737789980852884, + "learning_rate": 1.0447370957847343e-05, + "loss": 0.5923, + "step": 6167 + }, + { + "epoch": 0.5, + "grad_norm": 0.8662034958602722, + "learning_rate": 1.0444741105544705e-05, + "loss": 0.5748, + "step": 6168 + }, + { + "epoch": 0.5, + "grad_norm": 0.9015025314707872, + "learning_rate": 1.04421112224219e-05, + "loss": 0.5418, + "step": 6169 + }, + { + "epoch": 0.5, + "grad_norm": 0.8663857088672641, + "learning_rate": 1.0439481308661181e-05, + "loss": 0.515, + "step": 6170 + }, + { + "epoch": 0.5, + "grad_norm": 0.9607331905556366, + "learning_rate": 1.0436851364444798e-05, + "loss": 0.5785, + "step": 6171 + }, + { + "epoch": 0.5, + "grad_norm": 0.8698730300041885, + "learning_rate": 1.0434221389955002e-05, + "loss": 0.4768, + "step": 6172 + }, + { + "epoch": 0.5, + "grad_norm": 0.9302907035593343, + "learning_rate": 1.0431591385374047e-05, + "loss": 0.5422, + "step": 6173 + }, + { + "epoch": 0.5, + "grad_norm": 0.8590944033288993, + "learning_rate": 1.0428961350884194e-05, + "loss": 0.5405, + "step": 6174 + }, + { + "epoch": 0.5, + "grad_norm": 0.8604175708088144, + "learning_rate": 1.0426331286667701e-05, + "loss": 0.5365, + "step": 6175 + }, + { + "epoch": 0.5, + "grad_norm": 0.8882444903167186, + "learning_rate": 1.0423701192906825e-05, + "loss": 0.5045, + "step": 6176 + }, + { + "epoch": 0.5, + "grad_norm": 0.8505136672359528, + "learning_rate": 1.0421071069783834e-05, + "loss": 0.4987, + "step": 6177 + }, + { + "epoch": 0.5, + "grad_norm": 0.8571941770141829, + "learning_rate": 1.0418440917480992e-05, + "loss": 0.5007, + "step": 6178 + }, + { + "epoch": 0.5, + "grad_norm": 0.9102254227045845, + "learning_rate": 1.0415810736180563e-05, + "loss": 0.5232, + "step": 6179 + }, + { + "epoch": 0.5, + "grad_norm": 0.7864199735738805, + "learning_rate": 1.0413180526064824e-05, + "loss": 0.4712, + "step": 6180 + }, + { + "epoch": 0.5, + "grad_norm": 0.8072441676513875, + "learning_rate": 1.0410550287316035e-05, + "loss": 0.5144, + "step": 6181 + }, + { + "epoch": 0.5, + "grad_norm": 0.850410157333761, + "learning_rate": 1.0407920020116477e-05, + "loss": 0.5598, + "step": 6182 + }, + { + "epoch": 0.5, + "grad_norm": 0.9830887682458233, + "learning_rate": 1.0405289724648425e-05, + "loss": 0.5918, + "step": 6183 + }, + { + "epoch": 0.5, + "grad_norm": 0.9114690900097432, + "learning_rate": 1.0402659401094154e-05, + "loss": 0.525, + "step": 6184 + }, + { + "epoch": 0.5, + "grad_norm": 0.8534813523391873, + "learning_rate": 1.0400029049635942e-05, + "loss": 0.4923, + "step": 6185 + }, + { + "epoch": 0.5, + "grad_norm": 0.877842763272256, + "learning_rate": 1.039739867045607e-05, + "loss": 0.5255, + "step": 6186 + }, + { + "epoch": 0.5, + "grad_norm": 1.0236133388921236, + "learning_rate": 1.039476826373683e-05, + "loss": 0.5916, + "step": 6187 + }, + { + "epoch": 0.5, + "grad_norm": 0.8590065023812046, + "learning_rate": 1.0392137829660494e-05, + "loss": 0.4965, + "step": 6188 + }, + { + "epoch": 0.5, + "grad_norm": 0.8639027295230077, + "learning_rate": 1.0389507368409356e-05, + "loss": 0.5561, + "step": 6189 + }, + { + "epoch": 0.5, + "grad_norm": 0.8836484623587807, + "learning_rate": 1.0386876880165701e-05, + "loss": 0.5385, + "step": 6190 + }, + { + "epoch": 0.5, + "grad_norm": 0.8632103809318213, + "learning_rate": 1.0384246365111823e-05, + "loss": 0.5109, + "step": 6191 + }, + { + "epoch": 0.5, + "grad_norm": 0.9279412725843376, + "learning_rate": 1.0381615823430012e-05, + "loss": 0.4993, + "step": 6192 + }, + { + "epoch": 0.5, + "grad_norm": 0.8684010140359392, + "learning_rate": 1.0378985255302565e-05, + "loss": 0.5207, + "step": 6193 + }, + { + "epoch": 0.5, + "grad_norm": 0.8880388535422927, + "learning_rate": 1.0376354660911772e-05, + "loss": 0.5221, + "step": 6194 + }, + { + "epoch": 0.5, + "grad_norm": 1.0301038902267186, + "learning_rate": 1.0373724040439936e-05, + "loss": 0.5964, + "step": 6195 + }, + { + "epoch": 0.5, + "grad_norm": 0.7284968100578956, + "learning_rate": 1.0371093394069359e-05, + "loss": 0.4624, + "step": 6196 + }, + { + "epoch": 0.5, + "grad_norm": 0.859546061851474, + "learning_rate": 1.0368462721982336e-05, + "loss": 0.4492, + "step": 6197 + }, + { + "epoch": 0.5, + "grad_norm": 0.8540284446590735, + "learning_rate": 1.0365832024361173e-05, + "loss": 0.5058, + "step": 6198 + }, + { + "epoch": 0.5, + "grad_norm": 0.9311871787223283, + "learning_rate": 1.0363201301388177e-05, + "loss": 0.5237, + "step": 6199 + }, + { + "epoch": 0.5, + "grad_norm": 0.8783458652186021, + "learning_rate": 1.036057055324565e-05, + "loss": 0.5155, + "step": 6200 + }, + { + "epoch": 0.5, + "grad_norm": 0.9133086616533669, + "learning_rate": 1.0357939780115906e-05, + "loss": 0.5134, + "step": 6201 + }, + { + "epoch": 0.5, + "grad_norm": 0.93527568205224, + "learning_rate": 1.0355308982181254e-05, + "loss": 0.5286, + "step": 6202 + }, + { + "epoch": 0.5, + "grad_norm": 0.9748174070640031, + "learning_rate": 1.0352678159624e-05, + "loss": 0.4804, + "step": 6203 + }, + { + "epoch": 0.5, + "grad_norm": 0.8304324868783131, + "learning_rate": 1.0350047312626465e-05, + "loss": 0.5307, + "step": 6204 + }, + { + "epoch": 0.5, + "grad_norm": 0.737183332887448, + "learning_rate": 1.0347416441370963e-05, + "loss": 0.4458, + "step": 6205 + }, + { + "epoch": 0.5, + "grad_norm": 0.8193594305614337, + "learning_rate": 1.0344785546039808e-05, + "loss": 0.4981, + "step": 6206 + }, + { + "epoch": 0.5, + "grad_norm": 0.9325176197238431, + "learning_rate": 1.0342154626815321e-05, + "loss": 0.5829, + "step": 6207 + }, + { + "epoch": 0.5, + "grad_norm": 0.9374309225936001, + "learning_rate": 1.0339523683879824e-05, + "loss": 0.5537, + "step": 6208 + }, + { + "epoch": 0.5, + "grad_norm": 0.8479987941359138, + "learning_rate": 1.0336892717415635e-05, + "loss": 0.5356, + "step": 6209 + }, + { + "epoch": 0.5, + "grad_norm": 0.7980171217781268, + "learning_rate": 1.0334261727605076e-05, + "loss": 0.5199, + "step": 6210 + }, + { + "epoch": 0.5, + "grad_norm": 0.8531021371446437, + "learning_rate": 1.0331630714630481e-05, + "loss": 0.5068, + "step": 6211 + }, + { + "epoch": 0.5, + "grad_norm": 0.8625990920480998, + "learning_rate": 1.032899967867417e-05, + "loss": 0.4964, + "step": 6212 + }, + { + "epoch": 0.5, + "grad_norm": 0.9672841283305325, + "learning_rate": 1.0326368619918471e-05, + "loss": 0.5782, + "step": 6213 + }, + { + "epoch": 0.51, + "grad_norm": 1.0735602156729935, + "learning_rate": 1.032373753854572e-05, + "loss": 0.5248, + "step": 6214 + }, + { + "epoch": 0.51, + "grad_norm": 0.9113137478830905, + "learning_rate": 1.0321106434738242e-05, + "loss": 0.569, + "step": 6215 + }, + { + "epoch": 0.51, + "grad_norm": 0.811414298943247, + "learning_rate": 1.0318475308678374e-05, + "loss": 0.5266, + "step": 6216 + }, + { + "epoch": 0.51, + "grad_norm": 0.9225312761086015, + "learning_rate": 1.031584416054845e-05, + "loss": 0.5044, + "step": 6217 + }, + { + "epoch": 0.51, + "grad_norm": 0.8248517090127305, + "learning_rate": 1.0313212990530804e-05, + "loss": 0.4785, + "step": 6218 + }, + { + "epoch": 0.51, + "grad_norm": 0.9024948796769673, + "learning_rate": 1.0310581798807776e-05, + "loss": 0.5658, + "step": 6219 + }, + { + "epoch": 0.51, + "grad_norm": 0.9641619822749372, + "learning_rate": 1.0307950585561705e-05, + "loss": 0.5675, + "step": 6220 + }, + { + "epoch": 0.51, + "grad_norm": 0.8721024784016015, + "learning_rate": 1.0305319350974932e-05, + "loss": 0.5326, + "step": 6221 + }, + { + "epoch": 0.51, + "grad_norm": 0.9505797840996292, + "learning_rate": 1.0302688095229798e-05, + "loss": 0.5678, + "step": 6222 + }, + { + "epoch": 0.51, + "grad_norm": 0.8774987959531362, + "learning_rate": 1.030005681850865e-05, + "loss": 0.5067, + "step": 6223 + }, + { + "epoch": 0.51, + "grad_norm": 0.9024372804219885, + "learning_rate": 1.0297425520993829e-05, + "loss": 0.5454, + "step": 6224 + }, + { + "epoch": 0.51, + "grad_norm": 0.9296782390888202, + "learning_rate": 1.0294794202867681e-05, + "loss": 0.5238, + "step": 6225 + }, + { + "epoch": 0.51, + "grad_norm": 0.8898639473811754, + "learning_rate": 1.029216286431256e-05, + "loss": 0.5284, + "step": 6226 + }, + { + "epoch": 0.51, + "grad_norm": 0.9559196004726771, + "learning_rate": 1.028953150551081e-05, + "loss": 0.606, + "step": 6227 + }, + { + "epoch": 0.51, + "grad_norm": 0.9758923851832058, + "learning_rate": 1.0286900126644783e-05, + "loss": 0.6068, + "step": 6228 + }, + { + "epoch": 0.51, + "grad_norm": 0.8429290545153291, + "learning_rate": 1.0284268727896833e-05, + "loss": 0.5196, + "step": 6229 + }, + { + "epoch": 0.51, + "grad_norm": 0.9241137844044234, + "learning_rate": 1.028163730944931e-05, + "loss": 0.5021, + "step": 6230 + }, + { + "epoch": 0.51, + "grad_norm": 0.8844502814224067, + "learning_rate": 1.0279005871484572e-05, + "loss": 0.5981, + "step": 6231 + }, + { + "epoch": 0.51, + "grad_norm": 0.7940233993428926, + "learning_rate": 1.0276374414184977e-05, + "loss": 0.4836, + "step": 6232 + }, + { + "epoch": 0.51, + "grad_norm": 0.8540121985206928, + "learning_rate": 1.0273742937732877e-05, + "loss": 0.5212, + "step": 6233 + }, + { + "epoch": 0.51, + "grad_norm": 0.8411679107015873, + "learning_rate": 1.0271111442310638e-05, + "loss": 0.4643, + "step": 6234 + }, + { + "epoch": 0.51, + "grad_norm": 0.843190016884267, + "learning_rate": 1.0268479928100615e-05, + "loss": 0.4889, + "step": 6235 + }, + { + "epoch": 0.51, + "grad_norm": 0.9298687033475067, + "learning_rate": 1.026584839528517e-05, + "loss": 0.4891, + "step": 6236 + }, + { + "epoch": 0.51, + "grad_norm": 0.8314982900576823, + "learning_rate": 1.0263216844046666e-05, + "loss": 0.5256, + "step": 6237 + }, + { + "epoch": 0.51, + "grad_norm": 0.9635576321688502, + "learning_rate": 1.026058527456747e-05, + "loss": 0.5927, + "step": 6238 + }, + { + "epoch": 0.51, + "grad_norm": 0.9399777954113789, + "learning_rate": 1.0257953687029945e-05, + "loss": 0.5366, + "step": 6239 + }, + { + "epoch": 0.51, + "grad_norm": 0.9062288398497633, + "learning_rate": 1.0255322081616456e-05, + "loss": 0.4792, + "step": 6240 + }, + { + "epoch": 0.51, + "grad_norm": 0.9000858459834027, + "learning_rate": 1.025269045850938e-05, + "loss": 0.4799, + "step": 6241 + }, + { + "epoch": 0.51, + "grad_norm": 0.9559802820060481, + "learning_rate": 1.0250058817891074e-05, + "loss": 0.5548, + "step": 6242 + }, + { + "epoch": 0.51, + "grad_norm": 0.9062852807311451, + "learning_rate": 1.0247427159943912e-05, + "loss": 0.4895, + "step": 6243 + }, + { + "epoch": 0.51, + "grad_norm": 0.9464049844786903, + "learning_rate": 1.0244795484850272e-05, + "loss": 0.5534, + "step": 6244 + }, + { + "epoch": 0.51, + "grad_norm": 0.8847921545422844, + "learning_rate": 1.024216379279252e-05, + "loss": 0.5079, + "step": 6245 + }, + { + "epoch": 0.51, + "grad_norm": 0.8978136301515469, + "learning_rate": 1.0239532083953032e-05, + "loss": 0.5608, + "step": 6246 + }, + { + "epoch": 0.51, + "grad_norm": 0.9463582730856492, + "learning_rate": 1.0236900358514181e-05, + "loss": 0.5277, + "step": 6247 + }, + { + "epoch": 0.51, + "grad_norm": 0.8067583675753313, + "learning_rate": 1.023426861665835e-05, + "loss": 0.4961, + "step": 6248 + }, + { + "epoch": 0.51, + "grad_norm": 0.8883255946762623, + "learning_rate": 1.0231636858567909e-05, + "loss": 0.5654, + "step": 6249 + }, + { + "epoch": 0.51, + "grad_norm": 0.9190681551860245, + "learning_rate": 1.022900508442524e-05, + "loss": 0.562, + "step": 6250 + }, + { + "epoch": 0.51, + "grad_norm": 0.9494150603533084, + "learning_rate": 1.0226373294412718e-05, + "loss": 0.5676, + "step": 6251 + }, + { + "epoch": 0.51, + "grad_norm": 0.9061517922527588, + "learning_rate": 1.0223741488712732e-05, + "loss": 0.5428, + "step": 6252 + }, + { + "epoch": 0.51, + "grad_norm": 0.9275782262305359, + "learning_rate": 1.0221109667507656e-05, + "loss": 0.5184, + "step": 6253 + }, + { + "epoch": 0.51, + "grad_norm": 0.9189579629915223, + "learning_rate": 1.0218477830979878e-05, + "loss": 0.5439, + "step": 6254 + }, + { + "epoch": 0.51, + "grad_norm": 0.9182833448414004, + "learning_rate": 1.0215845979311783e-05, + "loss": 0.5327, + "step": 6255 + }, + { + "epoch": 0.51, + "grad_norm": 0.8902344636216859, + "learning_rate": 1.0213214112685747e-05, + "loss": 0.4828, + "step": 6256 + }, + { + "epoch": 0.51, + "grad_norm": 0.897277648623057, + "learning_rate": 1.0210582231284165e-05, + "loss": 0.5015, + "step": 6257 + }, + { + "epoch": 0.51, + "grad_norm": 0.9182502828960334, + "learning_rate": 1.0207950335289423e-05, + "loss": 0.552, + "step": 6258 + }, + { + "epoch": 0.51, + "grad_norm": 0.885910996981317, + "learning_rate": 1.0205318424883906e-05, + "loss": 0.4874, + "step": 6259 + }, + { + "epoch": 0.51, + "grad_norm": 0.922634853593885, + "learning_rate": 1.0202686500250003e-05, + "loss": 0.529, + "step": 6260 + }, + { + "epoch": 0.51, + "grad_norm": 0.8720759553847209, + "learning_rate": 1.0200054561570108e-05, + "loss": 0.5301, + "step": 6261 + }, + { + "epoch": 0.51, + "grad_norm": 0.9239527432654803, + "learning_rate": 1.0197422609026606e-05, + "loss": 0.5539, + "step": 6262 + }, + { + "epoch": 0.51, + "grad_norm": 0.8890671690015547, + "learning_rate": 1.0194790642801893e-05, + "loss": 0.5476, + "step": 6263 + }, + { + "epoch": 0.51, + "grad_norm": 0.8569505481042301, + "learning_rate": 1.0192158663078362e-05, + "loss": 0.5216, + "step": 6264 + }, + { + "epoch": 0.51, + "grad_norm": 0.8738179518705772, + "learning_rate": 1.0189526670038407e-05, + "loss": 0.5254, + "step": 6265 + }, + { + "epoch": 0.51, + "grad_norm": 0.8517879706535724, + "learning_rate": 1.0186894663864421e-05, + "loss": 0.5613, + "step": 6266 + }, + { + "epoch": 0.51, + "grad_norm": 0.9467237412597023, + "learning_rate": 1.01842626447388e-05, + "loss": 0.565, + "step": 6267 + }, + { + "epoch": 0.51, + "grad_norm": 0.9372933446510155, + "learning_rate": 1.0181630612843943e-05, + "loss": 0.5869, + "step": 6268 + }, + { + "epoch": 0.51, + "grad_norm": 0.8379813367737269, + "learning_rate": 1.0178998568362243e-05, + "loss": 0.4953, + "step": 6269 + }, + { + "epoch": 0.51, + "grad_norm": 0.8807616083485671, + "learning_rate": 1.0176366511476102e-05, + "loss": 0.5605, + "step": 6270 + }, + { + "epoch": 0.51, + "grad_norm": 0.909667415705031, + "learning_rate": 1.0173734442367919e-05, + "loss": 0.5494, + "step": 6271 + }, + { + "epoch": 0.51, + "grad_norm": 0.9180665855259821, + "learning_rate": 1.0171102361220093e-05, + "loss": 0.5776, + "step": 6272 + }, + { + "epoch": 0.51, + "grad_norm": 0.8866368103793839, + "learning_rate": 1.0168470268215025e-05, + "loss": 0.5162, + "step": 6273 + }, + { + "epoch": 0.51, + "grad_norm": 0.9331207432565604, + "learning_rate": 1.0165838163535115e-05, + "loss": 0.5799, + "step": 6274 + }, + { + "epoch": 0.51, + "grad_norm": 0.9299756848302255, + "learning_rate": 1.0163206047362773e-05, + "loss": 0.5631, + "step": 6275 + }, + { + "epoch": 0.51, + "grad_norm": 0.9153501532529058, + "learning_rate": 1.016057391988039e-05, + "loss": 0.5746, + "step": 6276 + }, + { + "epoch": 0.51, + "grad_norm": 0.9106064921432052, + "learning_rate": 1.015794178127038e-05, + "loss": 0.4931, + "step": 6277 + }, + { + "epoch": 0.51, + "grad_norm": 0.8583263808738999, + "learning_rate": 1.0155309631715145e-05, + "loss": 0.5349, + "step": 6278 + }, + { + "epoch": 0.51, + "grad_norm": 0.9011920327335722, + "learning_rate": 1.015267747139709e-05, + "loss": 0.5075, + "step": 6279 + }, + { + "epoch": 0.51, + "grad_norm": 0.8793178650201761, + "learning_rate": 1.0150045300498618e-05, + "loss": 0.5156, + "step": 6280 + }, + { + "epoch": 0.51, + "grad_norm": 0.8115799752291708, + "learning_rate": 1.0147413119202145e-05, + "loss": 0.4846, + "step": 6281 + }, + { + "epoch": 0.51, + "grad_norm": 0.9177489139524344, + "learning_rate": 1.0144780927690072e-05, + "loss": 0.5457, + "step": 6282 + }, + { + "epoch": 0.51, + "grad_norm": 0.8929122588141614, + "learning_rate": 1.0142148726144807e-05, + "loss": 0.5309, + "step": 6283 + }, + { + "epoch": 0.51, + "grad_norm": 0.804018811198706, + "learning_rate": 1.0139516514748767e-05, + "loss": 0.4608, + "step": 6284 + }, + { + "epoch": 0.51, + "grad_norm": 0.9347609164966086, + "learning_rate": 1.013688429368435e-05, + "loss": 0.4832, + "step": 6285 + }, + { + "epoch": 0.51, + "grad_norm": 0.9217079376541866, + "learning_rate": 1.0134252063133976e-05, + "loss": 0.5749, + "step": 6286 + }, + { + "epoch": 0.51, + "grad_norm": 0.8506565309327903, + "learning_rate": 1.0131619823280053e-05, + "loss": 0.5394, + "step": 6287 + }, + { + "epoch": 0.51, + "grad_norm": 0.8415600271925963, + "learning_rate": 1.0128987574304991e-05, + "loss": 0.5208, + "step": 6288 + }, + { + "epoch": 0.51, + "grad_norm": 0.8758726806842629, + "learning_rate": 1.0126355316391206e-05, + "loss": 0.5892, + "step": 6289 + }, + { + "epoch": 0.51, + "grad_norm": 0.9039543673993381, + "learning_rate": 1.012372304972111e-05, + "loss": 0.572, + "step": 6290 + }, + { + "epoch": 0.51, + "grad_norm": 0.8837831665094975, + "learning_rate": 1.0121090774477116e-05, + "loss": 0.5164, + "step": 6291 + }, + { + "epoch": 0.51, + "grad_norm": 0.7969372582312041, + "learning_rate": 1.0118458490841639e-05, + "loss": 0.4687, + "step": 6292 + }, + { + "epoch": 0.51, + "grad_norm": 0.9144472654901419, + "learning_rate": 1.0115826198997094e-05, + "loss": 0.4895, + "step": 6293 + }, + { + "epoch": 0.51, + "grad_norm": 0.8546462905815698, + "learning_rate": 1.0113193899125895e-05, + "loss": 0.4768, + "step": 6294 + }, + { + "epoch": 0.51, + "grad_norm": 0.8571567901864389, + "learning_rate": 1.0110561591410456e-05, + "loss": 0.5441, + "step": 6295 + }, + { + "epoch": 0.51, + "grad_norm": 0.8062069436489295, + "learning_rate": 1.0107929276033204e-05, + "loss": 0.4905, + "step": 6296 + }, + { + "epoch": 0.51, + "grad_norm": 0.8690171443782525, + "learning_rate": 1.0105296953176544e-05, + "loss": 0.4848, + "step": 6297 + }, + { + "epoch": 0.51, + "grad_norm": 0.8929739364675837, + "learning_rate": 1.01026646230229e-05, + "loss": 0.5132, + "step": 6298 + }, + { + "epoch": 0.51, + "grad_norm": 0.8985021936347624, + "learning_rate": 1.010003228575469e-05, + "loss": 0.5021, + "step": 6299 + }, + { + "epoch": 0.51, + "grad_norm": 0.8734985699165543, + "learning_rate": 1.009739994155433e-05, + "loss": 0.4902, + "step": 6300 + }, + { + "epoch": 0.51, + "grad_norm": 0.9145764779595406, + "learning_rate": 1.0094767590604238e-05, + "loss": 0.5366, + "step": 6301 + }, + { + "epoch": 0.51, + "grad_norm": 0.938111655768843, + "learning_rate": 1.009213523308684e-05, + "loss": 0.5352, + "step": 6302 + }, + { + "epoch": 0.51, + "grad_norm": 0.9187062301830011, + "learning_rate": 1.0089502869184549e-05, + "loss": 0.5064, + "step": 6303 + }, + { + "epoch": 0.51, + "grad_norm": 0.8909106006968533, + "learning_rate": 1.0086870499079791e-05, + "loss": 0.526, + "step": 6304 + }, + { + "epoch": 0.51, + "grad_norm": 0.9891112088673406, + "learning_rate": 1.0084238122954984e-05, + "loss": 0.4361, + "step": 6305 + }, + { + "epoch": 0.51, + "grad_norm": 0.9986087397097535, + "learning_rate": 1.0081605740992548e-05, + "loss": 0.5481, + "step": 6306 + }, + { + "epoch": 0.51, + "grad_norm": 0.9478586520651313, + "learning_rate": 1.0078973353374908e-05, + "loss": 0.5567, + "step": 6307 + }, + { + "epoch": 0.51, + "grad_norm": 0.889552514423573, + "learning_rate": 1.0076340960284483e-05, + "loss": 0.5208, + "step": 6308 + }, + { + "epoch": 0.51, + "grad_norm": 0.8643499230620482, + "learning_rate": 1.0073708561903702e-05, + "loss": 0.5414, + "step": 6309 + }, + { + "epoch": 0.51, + "grad_norm": 0.976399247172449, + "learning_rate": 1.0071076158414977e-05, + "loss": 0.6492, + "step": 6310 + }, + { + "epoch": 0.51, + "grad_norm": 0.9571144105482331, + "learning_rate": 1.006844375000074e-05, + "loss": 0.6006, + "step": 6311 + }, + { + "epoch": 0.51, + "grad_norm": 0.8677112699372111, + "learning_rate": 1.0065811336843412e-05, + "loss": 0.5269, + "step": 6312 + }, + { + "epoch": 0.51, + "grad_norm": 0.8705189403238094, + "learning_rate": 1.0063178919125416e-05, + "loss": 0.5816, + "step": 6313 + }, + { + "epoch": 0.51, + "grad_norm": 0.936745039273025, + "learning_rate": 1.0060546497029178e-05, + "loss": 0.519, + "step": 6314 + }, + { + "epoch": 0.51, + "grad_norm": 1.2267196954416242, + "learning_rate": 1.0057914070737123e-05, + "loss": 0.6564, + "step": 6315 + }, + { + "epoch": 0.51, + "grad_norm": 0.8408749871775812, + "learning_rate": 1.0055281640431669e-05, + "loss": 0.5499, + "step": 6316 + }, + { + "epoch": 0.51, + "grad_norm": 0.9485173287734637, + "learning_rate": 1.005264920629525e-05, + "loss": 0.5244, + "step": 6317 + }, + { + "epoch": 0.51, + "grad_norm": 0.8744806225444277, + "learning_rate": 1.0050016768510288e-05, + "loss": 0.4819, + "step": 6318 + }, + { + "epoch": 0.51, + "grad_norm": 0.9837730314596066, + "learning_rate": 1.0047384327259207e-05, + "loss": 0.5324, + "step": 6319 + }, + { + "epoch": 0.51, + "grad_norm": 0.9897169442511425, + "learning_rate": 1.0044751882724436e-05, + "loss": 0.5116, + "step": 6320 + }, + { + "epoch": 0.51, + "grad_norm": 0.8355025962859695, + "learning_rate": 1.0042119435088397e-05, + "loss": 0.5085, + "step": 6321 + }, + { + "epoch": 0.51, + "grad_norm": 0.8822374522103449, + "learning_rate": 1.003948698453352e-05, + "loss": 0.5093, + "step": 6322 + }, + { + "epoch": 0.51, + "grad_norm": 0.8868032716019236, + "learning_rate": 1.0036854531242228e-05, + "loss": 0.5216, + "step": 6323 + }, + { + "epoch": 0.51, + "grad_norm": 1.0298087875025894, + "learning_rate": 1.0034222075396954e-05, + "loss": 0.5794, + "step": 6324 + }, + { + "epoch": 0.51, + "grad_norm": 0.859260370043238, + "learning_rate": 1.0031589617180115e-05, + "loss": 0.4686, + "step": 6325 + }, + { + "epoch": 0.51, + "grad_norm": 0.8158325936763585, + "learning_rate": 1.0028957156774146e-05, + "loss": 0.4922, + "step": 6326 + }, + { + "epoch": 0.51, + "grad_norm": 0.9242530488088277, + "learning_rate": 1.0026324694361474e-05, + "loss": 0.5227, + "step": 6327 + }, + { + "epoch": 0.51, + "grad_norm": 1.0280646267210969, + "learning_rate": 1.002369223012452e-05, + "loss": 0.5422, + "step": 6328 + }, + { + "epoch": 0.51, + "grad_norm": 0.848472981714665, + "learning_rate": 1.0021059764245718e-05, + "loss": 0.514, + "step": 6329 + }, + { + "epoch": 0.51, + "grad_norm": 0.8569811344917727, + "learning_rate": 1.0018427296907494e-05, + "loss": 0.5174, + "step": 6330 + }, + { + "epoch": 0.51, + "grad_norm": 1.042013096707966, + "learning_rate": 1.001579482829227e-05, + "loss": 0.5577, + "step": 6331 + }, + { + "epoch": 0.51, + "grad_norm": 1.0091968409285097, + "learning_rate": 1.0013162358582483e-05, + "loss": 0.5897, + "step": 6332 + }, + { + "epoch": 0.51, + "grad_norm": 0.8723519274260123, + "learning_rate": 1.0010529887960554e-05, + "loss": 0.4901, + "step": 6333 + }, + { + "epoch": 0.51, + "grad_norm": 0.9537740574635022, + "learning_rate": 1.0007897416608914e-05, + "loss": 0.5724, + "step": 6334 + }, + { + "epoch": 0.51, + "grad_norm": 0.9789588220022455, + "learning_rate": 1.0005264944709989e-05, + "loss": 0.5557, + "step": 6335 + }, + { + "epoch": 0.51, + "grad_norm": 0.9737723535686139, + "learning_rate": 1.000263247244621e-05, + "loss": 0.5868, + "step": 6336 + }, + { + "epoch": 0.52, + "grad_norm": 0.9126233242598474, + "learning_rate": 1e-05, + "loss": 0.5436, + "step": 6337 + }, + { + "epoch": 0.52, + "grad_norm": 0.9065250250113098, + "learning_rate": 9.997367527553795e-06, + "loss": 0.5473, + "step": 6338 + }, + { + "epoch": 0.52, + "grad_norm": 0.7824478859233489, + "learning_rate": 9.994735055290011e-06, + "loss": 0.4108, + "step": 6339 + }, + { + "epoch": 0.52, + "grad_norm": 0.9394386325341539, + "learning_rate": 9.992102583391089e-06, + "loss": 0.5323, + "step": 6340 + }, + { + "epoch": 0.52, + "grad_norm": 0.9901071026908101, + "learning_rate": 9.98947011203945e-06, + "loss": 0.5428, + "step": 6341 + }, + { + "epoch": 0.52, + "grad_norm": 0.8936489106322111, + "learning_rate": 9.986837641417519e-06, + "loss": 0.5059, + "step": 6342 + }, + { + "epoch": 0.52, + "grad_norm": 0.8990739204643596, + "learning_rate": 9.984205171707731e-06, + "loss": 0.5022, + "step": 6343 + }, + { + "epoch": 0.52, + "grad_norm": 0.9451398467900175, + "learning_rate": 9.981572703092513e-06, + "loss": 0.5639, + "step": 6344 + }, + { + "epoch": 0.52, + "grad_norm": 0.9201894358093854, + "learning_rate": 9.978940235754283e-06, + "loss": 0.5315, + "step": 6345 + }, + { + "epoch": 0.52, + "grad_norm": 0.9482135020604635, + "learning_rate": 9.976307769875483e-06, + "loss": 0.5399, + "step": 6346 + }, + { + "epoch": 0.52, + "grad_norm": 0.9473339865184615, + "learning_rate": 9.973675305638531e-06, + "loss": 0.5061, + "step": 6347 + }, + { + "epoch": 0.52, + "grad_norm": 0.9599685256552569, + "learning_rate": 9.971042843225856e-06, + "loss": 0.5604, + "step": 6348 + }, + { + "epoch": 0.52, + "grad_norm": 0.8559541252886981, + "learning_rate": 9.968410382819888e-06, + "loss": 0.5574, + "step": 6349 + }, + { + "epoch": 0.52, + "grad_norm": 0.8868070586988529, + "learning_rate": 9.965777924603053e-06, + "loss": 0.5052, + "step": 6350 + }, + { + "epoch": 0.52, + "grad_norm": 0.9358125147616042, + "learning_rate": 9.963145468757773e-06, + "loss": 0.5475, + "step": 6351 + }, + { + "epoch": 0.52, + "grad_norm": 0.9305033132415802, + "learning_rate": 9.960513015466484e-06, + "loss": 0.5354, + "step": 6352 + }, + { + "epoch": 0.52, + "grad_norm": 0.9508925128715102, + "learning_rate": 9.957880564911608e-06, + "loss": 0.5395, + "step": 6353 + }, + { + "epoch": 0.52, + "grad_norm": 0.8046306445228953, + "learning_rate": 9.955248117275566e-06, + "loss": 0.4776, + "step": 6354 + }, + { + "epoch": 0.52, + "grad_norm": 0.8810642297736707, + "learning_rate": 9.952615672740795e-06, + "loss": 0.465, + "step": 6355 + }, + { + "epoch": 0.52, + "grad_norm": 0.8271357839574274, + "learning_rate": 9.949983231489717e-06, + "loss": 0.5456, + "step": 6356 + }, + { + "epoch": 0.52, + "grad_norm": 0.9932580719434206, + "learning_rate": 9.947350793704751e-06, + "loss": 0.5465, + "step": 6357 + }, + { + "epoch": 0.52, + "grad_norm": 0.8911737003658028, + "learning_rate": 9.944718359568333e-06, + "loss": 0.4734, + "step": 6358 + }, + { + "epoch": 0.52, + "grad_norm": 0.9414596037214545, + "learning_rate": 9.942085929262884e-06, + "loss": 0.5159, + "step": 6359 + }, + { + "epoch": 0.52, + "grad_norm": 0.9720304123433685, + "learning_rate": 9.939453502970824e-06, + "loss": 0.5785, + "step": 6360 + }, + { + "epoch": 0.52, + "grad_norm": 0.9146124769744249, + "learning_rate": 9.936821080874587e-06, + "loss": 0.5701, + "step": 6361 + }, + { + "epoch": 0.52, + "grad_norm": 0.9294401348339756, + "learning_rate": 9.934188663156592e-06, + "loss": 0.5041, + "step": 6362 + }, + { + "epoch": 0.52, + "grad_norm": 0.896786916401712, + "learning_rate": 9.931556249999262e-06, + "loss": 0.4878, + "step": 6363 + }, + { + "epoch": 0.52, + "grad_norm": 0.9214512261006776, + "learning_rate": 9.928923841585025e-06, + "loss": 0.5577, + "step": 6364 + }, + { + "epoch": 0.52, + "grad_norm": 0.952653505975468, + "learning_rate": 9.926291438096305e-06, + "loss": 0.5745, + "step": 6365 + }, + { + "epoch": 0.52, + "grad_norm": 0.894555231533999, + "learning_rate": 9.923659039715517e-06, + "loss": 0.538, + "step": 6366 + }, + { + "epoch": 0.52, + "grad_norm": 0.9779553837225547, + "learning_rate": 9.921026646625094e-06, + "loss": 0.5475, + "step": 6367 + }, + { + "epoch": 0.52, + "grad_norm": 0.9825468080035322, + "learning_rate": 9.918394259007458e-06, + "loss": 0.5964, + "step": 6368 + }, + { + "epoch": 0.52, + "grad_norm": 0.9816450220972703, + "learning_rate": 9.91576187704502e-06, + "loss": 0.5483, + "step": 6369 + }, + { + "epoch": 0.52, + "grad_norm": 0.9936843969441377, + "learning_rate": 9.913129500920214e-06, + "loss": 0.5398, + "step": 6370 + }, + { + "epoch": 0.52, + "grad_norm": 0.8657007233111466, + "learning_rate": 9.910497130815454e-06, + "loss": 0.4922, + "step": 6371 + }, + { + "epoch": 0.52, + "grad_norm": 0.8313299803368648, + "learning_rate": 9.907864766913162e-06, + "loss": 0.5318, + "step": 6372 + }, + { + "epoch": 0.52, + "grad_norm": 0.9539489317761674, + "learning_rate": 9.905232409395764e-06, + "loss": 0.5689, + "step": 6373 + }, + { + "epoch": 0.52, + "grad_norm": 0.8829599504299749, + "learning_rate": 9.902600058445676e-06, + "loss": 0.5331, + "step": 6374 + }, + { + "epoch": 0.52, + "grad_norm": 1.0323815487828643, + "learning_rate": 9.899967714245313e-06, + "loss": 0.6064, + "step": 6375 + }, + { + "epoch": 0.52, + "grad_norm": 0.8874248857833198, + "learning_rate": 9.897335376977104e-06, + "loss": 0.5538, + "step": 6376 + }, + { + "epoch": 0.52, + "grad_norm": 0.9590342091526441, + "learning_rate": 9.894703046823461e-06, + "loss": 0.571, + "step": 6377 + }, + { + "epoch": 0.52, + "grad_norm": 0.9608727054645129, + "learning_rate": 9.8920707239668e-06, + "loss": 0.5436, + "step": 6378 + }, + { + "epoch": 0.52, + "grad_norm": 0.9303455966981541, + "learning_rate": 9.889438408589545e-06, + "loss": 0.4842, + "step": 6379 + }, + { + "epoch": 0.52, + "grad_norm": 0.9910314393804825, + "learning_rate": 9.88680610087411e-06, + "loss": 0.528, + "step": 6380 + }, + { + "epoch": 0.52, + "grad_norm": 0.9227435341657937, + "learning_rate": 9.884173801002909e-06, + "loss": 0.5307, + "step": 6381 + }, + { + "epoch": 0.52, + "grad_norm": 0.8368401603641216, + "learning_rate": 9.881541509158366e-06, + "loss": 0.4937, + "step": 6382 + }, + { + "epoch": 0.52, + "grad_norm": 0.8543858991076968, + "learning_rate": 9.878909225522889e-06, + "loss": 0.4891, + "step": 6383 + }, + { + "epoch": 0.52, + "grad_norm": 0.9324945143635585, + "learning_rate": 9.876276950278893e-06, + "loss": 0.5354, + "step": 6384 + }, + { + "epoch": 0.52, + "grad_norm": 0.8203979580170194, + "learning_rate": 9.873644683608798e-06, + "loss": 0.4777, + "step": 6385 + }, + { + "epoch": 0.52, + "grad_norm": 0.8715151945909194, + "learning_rate": 9.87101242569501e-06, + "loss": 0.5123, + "step": 6386 + }, + { + "epoch": 0.52, + "grad_norm": 0.954708756603573, + "learning_rate": 9.86838017671995e-06, + "loss": 0.5242, + "step": 6387 + }, + { + "epoch": 0.52, + "grad_norm": 0.9317365832888832, + "learning_rate": 9.865747936866027e-06, + "loss": 0.5395, + "step": 6388 + }, + { + "epoch": 0.52, + "grad_norm": 0.8770081131433685, + "learning_rate": 9.863115706315652e-06, + "loss": 0.5174, + "step": 6389 + }, + { + "epoch": 0.52, + "grad_norm": 0.8598260750153, + "learning_rate": 9.860483485251238e-06, + "loss": 0.4591, + "step": 6390 + }, + { + "epoch": 0.52, + "grad_norm": 0.9755980531605132, + "learning_rate": 9.857851273855195e-06, + "loss": 0.5801, + "step": 6391 + }, + { + "epoch": 0.52, + "grad_norm": 1.0087368311921079, + "learning_rate": 9.855219072309931e-06, + "loss": 0.498, + "step": 6392 + }, + { + "epoch": 0.52, + "grad_norm": 0.9230860988633534, + "learning_rate": 9.852586880797857e-06, + "loss": 0.5284, + "step": 6393 + }, + { + "epoch": 0.52, + "grad_norm": 0.9058880053248296, + "learning_rate": 9.849954699501383e-06, + "loss": 0.5242, + "step": 6394 + }, + { + "epoch": 0.52, + "grad_norm": 0.9677734450221844, + "learning_rate": 9.847322528602913e-06, + "loss": 0.5892, + "step": 6395 + }, + { + "epoch": 0.52, + "grad_norm": 0.9367373557695239, + "learning_rate": 9.844690368284857e-06, + "loss": 0.56, + "step": 6396 + }, + { + "epoch": 0.52, + "grad_norm": 0.878603950726151, + "learning_rate": 9.842058218729623e-06, + "loss": 0.4691, + "step": 6397 + }, + { + "epoch": 0.52, + "grad_norm": 0.8616635511939935, + "learning_rate": 9.839426080119612e-06, + "loss": 0.5389, + "step": 6398 + }, + { + "epoch": 0.52, + "grad_norm": 0.9986338396517531, + "learning_rate": 9.836793952637232e-06, + "loss": 0.552, + "step": 6399 + }, + { + "epoch": 0.52, + "grad_norm": 0.9511148684059743, + "learning_rate": 9.834161836464888e-06, + "loss": 0.5818, + "step": 6400 + }, + { + "epoch": 0.52, + "grad_norm": 1.0105896126209348, + "learning_rate": 9.831529731784975e-06, + "loss": 0.5089, + "step": 6401 + }, + { + "epoch": 0.52, + "grad_norm": 0.8717280136431198, + "learning_rate": 9.828897638779909e-06, + "loss": 0.5305, + "step": 6402 + }, + { + "epoch": 0.52, + "grad_norm": 0.9162669287847024, + "learning_rate": 9.826265557632083e-06, + "loss": 0.5128, + "step": 6403 + }, + { + "epoch": 0.52, + "grad_norm": 0.8920982133737825, + "learning_rate": 9.823633488523898e-06, + "loss": 0.5622, + "step": 6404 + }, + { + "epoch": 0.52, + "grad_norm": 0.9054418294786118, + "learning_rate": 9.821001431637759e-06, + "loss": 0.5298, + "step": 6405 + }, + { + "epoch": 0.52, + "grad_norm": 0.8805665336617836, + "learning_rate": 9.81836938715606e-06, + "loss": 0.5627, + "step": 6406 + }, + { + "epoch": 0.52, + "grad_norm": 0.9678033064998812, + "learning_rate": 9.815737355261201e-06, + "loss": 0.5519, + "step": 6407 + }, + { + "epoch": 0.52, + "grad_norm": 0.8733129307408266, + "learning_rate": 9.813105336135582e-06, + "loss": 0.5598, + "step": 6408 + }, + { + "epoch": 0.52, + "grad_norm": 0.9086165997165361, + "learning_rate": 9.810473329961595e-06, + "loss": 0.524, + "step": 6409 + }, + { + "epoch": 0.52, + "grad_norm": 0.9492530342714538, + "learning_rate": 9.807841336921639e-06, + "loss": 0.4998, + "step": 6410 + }, + { + "epoch": 0.52, + "grad_norm": 0.8393818681658488, + "learning_rate": 9.80520935719811e-06, + "loss": 0.544, + "step": 6411 + }, + { + "epoch": 0.52, + "grad_norm": 0.9381276538883282, + "learning_rate": 9.802577390973397e-06, + "loss": 0.5205, + "step": 6412 + }, + { + "epoch": 0.52, + "grad_norm": 0.8978597929624021, + "learning_rate": 9.799945438429895e-06, + "loss": 0.5191, + "step": 6413 + }, + { + "epoch": 0.52, + "grad_norm": 0.9130042053056334, + "learning_rate": 9.79731349975e-06, + "loss": 0.52, + "step": 6414 + }, + { + "epoch": 0.52, + "grad_norm": 0.909046664883525, + "learning_rate": 9.794681575116097e-06, + "loss": 0.5285, + "step": 6415 + }, + { + "epoch": 0.52, + "grad_norm": 0.9534951232598357, + "learning_rate": 9.792049664710579e-06, + "loss": 0.5247, + "step": 6416 + }, + { + "epoch": 0.52, + "grad_norm": 0.93802065691835, + "learning_rate": 9.789417768715837e-06, + "loss": 0.4866, + "step": 6417 + }, + { + "epoch": 0.52, + "grad_norm": 0.7893135352036715, + "learning_rate": 9.786785887314255e-06, + "loss": 0.412, + "step": 6418 + }, + { + "epoch": 0.52, + "grad_norm": 0.9111844420461667, + "learning_rate": 9.784154020688222e-06, + "loss": 0.5623, + "step": 6419 + }, + { + "epoch": 0.52, + "grad_norm": 0.9935656482433806, + "learning_rate": 9.781522169020125e-06, + "loss": 0.5779, + "step": 6420 + }, + { + "epoch": 0.52, + "grad_norm": 0.8351481116157773, + "learning_rate": 9.778890332492346e-06, + "loss": 0.4643, + "step": 6421 + }, + { + "epoch": 0.52, + "grad_norm": 0.9588974823514563, + "learning_rate": 9.776258511287271e-06, + "loss": 0.5176, + "step": 6422 + }, + { + "epoch": 0.52, + "grad_norm": 0.828264983941755, + "learning_rate": 9.773626705587283e-06, + "loss": 0.4941, + "step": 6423 + }, + { + "epoch": 0.52, + "grad_norm": 0.9097675795507923, + "learning_rate": 9.770994915574766e-06, + "loss": 0.5653, + "step": 6424 + }, + { + "epoch": 0.52, + "grad_norm": 0.8263116395093373, + "learning_rate": 9.768363141432095e-06, + "loss": 0.5402, + "step": 6425 + }, + { + "epoch": 0.52, + "grad_norm": 0.8329323971338786, + "learning_rate": 9.765731383341654e-06, + "loss": 0.4887, + "step": 6426 + }, + { + "epoch": 0.52, + "grad_norm": 0.9319485658181705, + "learning_rate": 9.76309964148582e-06, + "loss": 0.5301, + "step": 6427 + }, + { + "epoch": 0.52, + "grad_norm": 0.9109510386316052, + "learning_rate": 9.760467916046971e-06, + "loss": 0.5702, + "step": 6428 + }, + { + "epoch": 0.52, + "grad_norm": 0.9192063800704567, + "learning_rate": 9.757836207207483e-06, + "loss": 0.5517, + "step": 6429 + }, + { + "epoch": 0.52, + "grad_norm": 0.8938122304610931, + "learning_rate": 9.755204515149731e-06, + "loss": 0.5129, + "step": 6430 + }, + { + "epoch": 0.52, + "grad_norm": 0.8885073683990292, + "learning_rate": 9.75257284005609e-06, + "loss": 0.4888, + "step": 6431 + }, + { + "epoch": 0.52, + "grad_norm": 0.9694255751986594, + "learning_rate": 9.74994118210893e-06, + "loss": 0.5145, + "step": 6432 + }, + { + "epoch": 0.52, + "grad_norm": 0.9905287843588529, + "learning_rate": 9.747309541490627e-06, + "loss": 0.5592, + "step": 6433 + }, + { + "epoch": 0.52, + "grad_norm": 0.9468510667397407, + "learning_rate": 9.744677918383546e-06, + "loss": 0.4493, + "step": 6434 + }, + { + "epoch": 0.52, + "grad_norm": 0.8159532262872364, + "learning_rate": 9.742046312970058e-06, + "loss": 0.4615, + "step": 6435 + }, + { + "epoch": 0.52, + "grad_norm": 0.9604668106872334, + "learning_rate": 9.739414725432535e-06, + "loss": 0.5214, + "step": 6436 + }, + { + "epoch": 0.52, + "grad_norm": 0.9556439331531814, + "learning_rate": 9.736783155953338e-06, + "loss": 0.5053, + "step": 6437 + }, + { + "epoch": 0.52, + "grad_norm": 0.916486932919353, + "learning_rate": 9.734151604714834e-06, + "loss": 0.5413, + "step": 6438 + }, + { + "epoch": 0.52, + "grad_norm": 0.8681676499631316, + "learning_rate": 9.73152007189939e-06, + "loss": 0.4963, + "step": 6439 + }, + { + "epoch": 0.52, + "grad_norm": 0.8529159346600007, + "learning_rate": 9.728888557689364e-06, + "loss": 0.5299, + "step": 6440 + }, + { + "epoch": 0.52, + "grad_norm": 0.8944587393736042, + "learning_rate": 9.726257062267124e-06, + "loss": 0.523, + "step": 6441 + }, + { + "epoch": 0.52, + "grad_norm": 0.8535702354962332, + "learning_rate": 9.723625585815028e-06, + "loss": 0.5471, + "step": 6442 + }, + { + "epoch": 0.52, + "grad_norm": 0.9894346512705576, + "learning_rate": 9.720994128515428e-06, + "loss": 0.5799, + "step": 6443 + }, + { + "epoch": 0.52, + "grad_norm": 0.8912292132663773, + "learning_rate": 9.718362690550693e-06, + "loss": 0.5419, + "step": 6444 + }, + { + "epoch": 0.52, + "grad_norm": 0.8431590559324045, + "learning_rate": 9.715731272103172e-06, + "loss": 0.5395, + "step": 6445 + }, + { + "epoch": 0.52, + "grad_norm": 0.8296842798941493, + "learning_rate": 9.713099873355219e-06, + "loss": 0.5027, + "step": 6446 + }, + { + "epoch": 0.52, + "grad_norm": 1.049682332775139, + "learning_rate": 9.710468494489194e-06, + "loss": 0.5357, + "step": 6447 + }, + { + "epoch": 0.52, + "grad_norm": 0.9346713890089724, + "learning_rate": 9.707837135687444e-06, + "loss": 0.5126, + "step": 6448 + }, + { + "epoch": 0.52, + "grad_norm": 0.9759963987028532, + "learning_rate": 9.705205797132319e-06, + "loss": 0.5742, + "step": 6449 + }, + { + "epoch": 0.52, + "grad_norm": 0.878814498064152, + "learning_rate": 9.702574479006174e-06, + "loss": 0.5266, + "step": 6450 + }, + { + "epoch": 0.52, + "grad_norm": 0.8991362672949392, + "learning_rate": 9.699943181491355e-06, + "loss": 0.4653, + "step": 6451 + }, + { + "epoch": 0.52, + "grad_norm": 0.8413370445817436, + "learning_rate": 9.697311904770202e-06, + "loss": 0.469, + "step": 6452 + }, + { + "epoch": 0.52, + "grad_norm": 0.8689595579603493, + "learning_rate": 9.69468064902507e-06, + "loss": 0.5227, + "step": 6453 + }, + { + "epoch": 0.52, + "grad_norm": 0.9697152793274977, + "learning_rate": 9.692049414438298e-06, + "loss": 0.5844, + "step": 6454 + }, + { + "epoch": 0.52, + "grad_norm": 0.8949486024463249, + "learning_rate": 9.689418201192226e-06, + "loss": 0.5068, + "step": 6455 + }, + { + "epoch": 0.52, + "grad_norm": 0.9773141105296431, + "learning_rate": 9.6867870094692e-06, + "loss": 0.5538, + "step": 6456 + }, + { + "epoch": 0.52, + "grad_norm": 0.857328825009141, + "learning_rate": 9.684155839451555e-06, + "loss": 0.4758, + "step": 6457 + }, + { + "epoch": 0.52, + "grad_norm": 0.9429415025858808, + "learning_rate": 9.681524691321628e-06, + "loss": 0.6401, + "step": 6458 + }, + { + "epoch": 0.52, + "grad_norm": 1.0145411858316762, + "learning_rate": 9.678893565261761e-06, + "loss": 0.5969, + "step": 6459 + }, + { + "epoch": 0.53, + "grad_norm": 0.9661312013835944, + "learning_rate": 9.676262461454285e-06, + "loss": 0.5137, + "step": 6460 + }, + { + "epoch": 0.53, + "grad_norm": 0.9337415348932296, + "learning_rate": 9.67363138008153e-06, + "loss": 0.5644, + "step": 6461 + }, + { + "epoch": 0.53, + "grad_norm": 0.8901844897600891, + "learning_rate": 9.671000321325832e-06, + "loss": 0.6057, + "step": 6462 + }, + { + "epoch": 0.53, + "grad_norm": 0.9016225379646999, + "learning_rate": 9.668369285369524e-06, + "loss": 0.498, + "step": 6463 + }, + { + "epoch": 0.53, + "grad_norm": 0.9297387666303769, + "learning_rate": 9.665738272394924e-06, + "loss": 0.5294, + "step": 6464 + }, + { + "epoch": 0.53, + "grad_norm": 0.9734669398659537, + "learning_rate": 9.66310728258437e-06, + "loss": 0.5146, + "step": 6465 + }, + { + "epoch": 0.53, + "grad_norm": 0.9545143524091929, + "learning_rate": 9.660476316120181e-06, + "loss": 0.5714, + "step": 6466 + }, + { + "epoch": 0.53, + "grad_norm": 0.8922757284332623, + "learning_rate": 9.65784537318468e-06, + "loss": 0.542, + "step": 6467 + }, + { + "epoch": 0.53, + "grad_norm": 0.8573370579926729, + "learning_rate": 9.655214453960195e-06, + "loss": 0.4861, + "step": 6468 + }, + { + "epoch": 0.53, + "grad_norm": 0.8977961285324865, + "learning_rate": 9.652583558629042e-06, + "loss": 0.5144, + "step": 6469 + }, + { + "epoch": 0.53, + "grad_norm": 0.9579689428849033, + "learning_rate": 9.649952687373535e-06, + "loss": 0.5472, + "step": 6470 + }, + { + "epoch": 0.53, + "grad_norm": 0.8805879235281294, + "learning_rate": 9.647321840376001e-06, + "loss": 0.5174, + "step": 6471 + }, + { + "epoch": 0.53, + "grad_norm": 1.051855667302005, + "learning_rate": 9.644691017818752e-06, + "loss": 0.5926, + "step": 6472 + }, + { + "epoch": 0.53, + "grad_norm": 0.8989815009031404, + "learning_rate": 9.642060219884096e-06, + "loss": 0.4875, + "step": 6473 + }, + { + "epoch": 0.53, + "grad_norm": 0.9211094147866692, + "learning_rate": 9.639429446754352e-06, + "loss": 0.5151, + "step": 6474 + }, + { + "epoch": 0.53, + "grad_norm": 0.9399000526890481, + "learning_rate": 9.636798698611828e-06, + "loss": 0.554, + "step": 6475 + }, + { + "epoch": 0.53, + "grad_norm": 0.8830150022783372, + "learning_rate": 9.634167975638828e-06, + "loss": 0.5276, + "step": 6476 + }, + { + "epoch": 0.53, + "grad_norm": 0.858616695781131, + "learning_rate": 9.631537278017667e-06, + "loss": 0.4538, + "step": 6477 + }, + { + "epoch": 0.53, + "grad_norm": 0.9342039254589489, + "learning_rate": 9.628906605930647e-06, + "loss": 0.5417, + "step": 6478 + }, + { + "epoch": 0.53, + "grad_norm": 0.8611122539892918, + "learning_rate": 9.626275959560064e-06, + "loss": 0.536, + "step": 6479 + }, + { + "epoch": 0.53, + "grad_norm": 0.857609366638501, + "learning_rate": 9.62364533908823e-06, + "loss": 0.489, + "step": 6480 + }, + { + "epoch": 0.53, + "grad_norm": 0.825074690220345, + "learning_rate": 9.621014744697442e-06, + "loss": 0.4701, + "step": 6481 + }, + { + "epoch": 0.53, + "grad_norm": 0.934085548351372, + "learning_rate": 9.61838417656999e-06, + "loss": 0.4784, + "step": 6482 + }, + { + "epoch": 0.53, + "grad_norm": 0.9527486975769726, + "learning_rate": 9.615753634888179e-06, + "loss": 0.5408, + "step": 6483 + }, + { + "epoch": 0.53, + "grad_norm": 0.84672859944973, + "learning_rate": 9.613123119834304e-06, + "loss": 0.5066, + "step": 6484 + }, + { + "epoch": 0.53, + "grad_norm": 0.8997458292305717, + "learning_rate": 9.610492631590646e-06, + "loss": 0.5416, + "step": 6485 + }, + { + "epoch": 0.53, + "grad_norm": 0.9626727248227898, + "learning_rate": 9.60786217033951e-06, + "loss": 0.5657, + "step": 6486 + }, + { + "epoch": 0.53, + "grad_norm": 0.764459498528756, + "learning_rate": 9.605231736263176e-06, + "loss": 0.4537, + "step": 6487 + }, + { + "epoch": 0.53, + "grad_norm": 0.9296132740721262, + "learning_rate": 9.602601329543928e-06, + "loss": 0.4775, + "step": 6488 + }, + { + "epoch": 0.53, + "grad_norm": 0.8278080916393366, + "learning_rate": 9.599970950364061e-06, + "loss": 0.4814, + "step": 6489 + }, + { + "epoch": 0.53, + "grad_norm": 0.9615444489092069, + "learning_rate": 9.597340598905851e-06, + "loss": 0.591, + "step": 6490 + }, + { + "epoch": 0.53, + "grad_norm": 0.9166588125726522, + "learning_rate": 9.594710275351577e-06, + "loss": 0.5656, + "step": 6491 + }, + { + "epoch": 0.53, + "grad_norm": 0.9398183643638113, + "learning_rate": 9.592079979883526e-06, + "loss": 0.5495, + "step": 6492 + }, + { + "epoch": 0.53, + "grad_norm": 0.9101662724536649, + "learning_rate": 9.58944971268397e-06, + "loss": 0.575, + "step": 6493 + }, + { + "epoch": 0.53, + "grad_norm": 0.9027671688552364, + "learning_rate": 9.586819473935181e-06, + "loss": 0.5345, + "step": 6494 + }, + { + "epoch": 0.53, + "grad_norm": 0.9741851024488997, + "learning_rate": 9.58418926381944e-06, + "loss": 0.6057, + "step": 6495 + }, + { + "epoch": 0.53, + "grad_norm": 0.8528086376805016, + "learning_rate": 9.581559082519015e-06, + "loss": 0.4635, + "step": 6496 + }, + { + "epoch": 0.53, + "grad_norm": 0.9158825951760049, + "learning_rate": 9.578928930216167e-06, + "loss": 0.5513, + "step": 6497 + }, + { + "epoch": 0.53, + "grad_norm": 0.9498083322382603, + "learning_rate": 9.576298807093177e-06, + "loss": 0.5602, + "step": 6498 + }, + { + "epoch": 0.53, + "grad_norm": 0.8378768819215866, + "learning_rate": 9.573668713332305e-06, + "loss": 0.4934, + "step": 6499 + }, + { + "epoch": 0.53, + "grad_norm": 0.849447929846769, + "learning_rate": 9.571038649115807e-06, + "loss": 0.5089, + "step": 6500 + }, + { + "epoch": 0.53, + "grad_norm": 0.9094774639747515, + "learning_rate": 9.568408614625956e-06, + "loss": 0.5415, + "step": 6501 + }, + { + "epoch": 0.53, + "grad_norm": 0.8683994871193035, + "learning_rate": 9.565778610045003e-06, + "loss": 0.4996, + "step": 6502 + }, + { + "epoch": 0.53, + "grad_norm": 0.8416512392944917, + "learning_rate": 9.563148635555205e-06, + "loss": 0.4769, + "step": 6503 + }, + { + "epoch": 0.53, + "grad_norm": 0.8417091367157625, + "learning_rate": 9.560518691338822e-06, + "loss": 0.5091, + "step": 6504 + }, + { + "epoch": 0.53, + "grad_norm": 0.9307177969399728, + "learning_rate": 9.557888777578105e-06, + "loss": 0.5736, + "step": 6505 + }, + { + "epoch": 0.53, + "grad_norm": 0.9148668617918116, + "learning_rate": 9.555258894455298e-06, + "loss": 0.4671, + "step": 6506 + }, + { + "epoch": 0.53, + "grad_norm": 0.933610510661884, + "learning_rate": 9.55262904215266e-06, + "loss": 0.5215, + "step": 6507 + }, + { + "epoch": 0.53, + "grad_norm": 0.8493681051755907, + "learning_rate": 9.549999220852432e-06, + "loss": 0.4692, + "step": 6508 + }, + { + "epoch": 0.53, + "grad_norm": 0.9008032770859512, + "learning_rate": 9.547369430736857e-06, + "loss": 0.5602, + "step": 6509 + }, + { + "epoch": 0.53, + "grad_norm": 0.8313031359121507, + "learning_rate": 9.54473967198818e-06, + "loss": 0.4551, + "step": 6510 + }, + { + "epoch": 0.53, + "grad_norm": 2.0024548477030657, + "learning_rate": 9.542109944788643e-06, + "loss": 0.4755, + "step": 6511 + }, + { + "epoch": 0.53, + "grad_norm": 0.9424448090376205, + "learning_rate": 9.539480249320473e-06, + "loss": 0.5853, + "step": 6512 + }, + { + "epoch": 0.53, + "grad_norm": 0.8858742800185914, + "learning_rate": 9.53685058576592e-06, + "loss": 0.5339, + "step": 6513 + }, + { + "epoch": 0.53, + "grad_norm": 0.9352935076731403, + "learning_rate": 9.53422095430721e-06, + "loss": 0.5547, + "step": 6514 + }, + { + "epoch": 0.53, + "grad_norm": 1.0227461291170266, + "learning_rate": 9.53159135512657e-06, + "loss": 0.5612, + "step": 6515 + }, + { + "epoch": 0.53, + "grad_norm": 0.9646149787645724, + "learning_rate": 9.528961788406237e-06, + "loss": 0.5668, + "step": 6516 + }, + { + "epoch": 0.53, + "grad_norm": 0.9089296205705552, + "learning_rate": 9.526332254328437e-06, + "loss": 0.5376, + "step": 6517 + }, + { + "epoch": 0.53, + "grad_norm": 0.8838124970501346, + "learning_rate": 9.523702753075386e-06, + "loss": 0.5065, + "step": 6518 + }, + { + "epoch": 0.53, + "grad_norm": 0.9206866517683869, + "learning_rate": 9.521073284829315e-06, + "loss": 0.4834, + "step": 6519 + }, + { + "epoch": 0.53, + "grad_norm": 0.912784923782488, + "learning_rate": 9.518443849772441e-06, + "loss": 0.512, + "step": 6520 + }, + { + "epoch": 0.53, + "grad_norm": 0.9058344101233732, + "learning_rate": 9.515814448086978e-06, + "loss": 0.5461, + "step": 6521 + }, + { + "epoch": 0.53, + "grad_norm": 0.8793164453057128, + "learning_rate": 9.513185079955148e-06, + "loss": 0.4845, + "step": 6522 + }, + { + "epoch": 0.53, + "grad_norm": 0.8984635639989134, + "learning_rate": 9.51055574555916e-06, + "loss": 0.5754, + "step": 6523 + }, + { + "epoch": 0.53, + "grad_norm": 0.9293547040387503, + "learning_rate": 9.50792644508122e-06, + "loss": 0.5819, + "step": 6524 + }, + { + "epoch": 0.53, + "grad_norm": 0.784032342081321, + "learning_rate": 9.505297178703546e-06, + "loss": 0.5305, + "step": 6525 + }, + { + "epoch": 0.53, + "grad_norm": 0.883478994718842, + "learning_rate": 9.502667946608332e-06, + "loss": 0.556, + "step": 6526 + }, + { + "epoch": 0.53, + "grad_norm": 0.8416266967465563, + "learning_rate": 9.500038748977794e-06, + "loss": 0.5466, + "step": 6527 + }, + { + "epoch": 0.53, + "grad_norm": 1.0312877433341645, + "learning_rate": 9.497409585994128e-06, + "loss": 0.5922, + "step": 6528 + }, + { + "epoch": 0.53, + "grad_norm": 0.9451565220427494, + "learning_rate": 9.494780457839527e-06, + "loss": 0.4781, + "step": 6529 + }, + { + "epoch": 0.53, + "grad_norm": 0.9527771032047712, + "learning_rate": 9.492151364696196e-06, + "loss": 0.5277, + "step": 6530 + }, + { + "epoch": 0.53, + "grad_norm": 0.8612304851836948, + "learning_rate": 9.489522306746327e-06, + "loss": 0.487, + "step": 6531 + }, + { + "epoch": 0.53, + "grad_norm": 0.8498362511354313, + "learning_rate": 9.486893284172103e-06, + "loss": 0.5537, + "step": 6532 + }, + { + "epoch": 0.53, + "grad_norm": 0.9141501671369846, + "learning_rate": 9.484264297155724e-06, + "loss": 0.521, + "step": 6533 + }, + { + "epoch": 0.53, + "grad_norm": 0.8471454028075913, + "learning_rate": 9.481635345879373e-06, + "loss": 0.5362, + "step": 6534 + }, + { + "epoch": 0.53, + "grad_norm": 0.8502542534921609, + "learning_rate": 9.479006430525227e-06, + "loss": 0.5177, + "step": 6535 + }, + { + "epoch": 0.53, + "grad_norm": 0.8558504290560964, + "learning_rate": 9.476377551275478e-06, + "loss": 0.5179, + "step": 6536 + }, + { + "epoch": 0.53, + "grad_norm": 0.9149921933622068, + "learning_rate": 9.4737487083123e-06, + "loss": 0.4804, + "step": 6537 + }, + { + "epoch": 0.53, + "grad_norm": 0.8624426856664629, + "learning_rate": 9.471119901817866e-06, + "loss": 0.5192, + "step": 6538 + }, + { + "epoch": 0.53, + "grad_norm": 0.9475697368614927, + "learning_rate": 9.468491131974358e-06, + "loss": 0.5907, + "step": 6539 + }, + { + "epoch": 0.53, + "grad_norm": 0.8597657353172048, + "learning_rate": 9.465862398963943e-06, + "loss": 0.4512, + "step": 6540 + }, + { + "epoch": 0.53, + "grad_norm": 0.791808646314481, + "learning_rate": 9.463233702968784e-06, + "loss": 0.4951, + "step": 6541 + }, + { + "epoch": 0.53, + "grad_norm": 0.9190383831550103, + "learning_rate": 9.46060504417106e-06, + "loss": 0.5751, + "step": 6542 + }, + { + "epoch": 0.53, + "grad_norm": 0.9219461465709242, + "learning_rate": 9.457976422752925e-06, + "loss": 0.5326, + "step": 6543 + }, + { + "epoch": 0.53, + "grad_norm": 0.8887657703238295, + "learning_rate": 9.455347838896541e-06, + "loss": 0.4661, + "step": 6544 + }, + { + "epoch": 0.53, + "grad_norm": 0.8733624243772576, + "learning_rate": 9.452719292784074e-06, + "loss": 0.5464, + "step": 6545 + }, + { + "epoch": 0.53, + "grad_norm": 0.9112115008584419, + "learning_rate": 9.450090784597673e-06, + "loss": 0.5647, + "step": 6546 + }, + { + "epoch": 0.53, + "grad_norm": 1.7311921806377226, + "learning_rate": 9.44746231451949e-06, + "loss": 0.5059, + "step": 6547 + }, + { + "epoch": 0.53, + "grad_norm": 0.8463909442796727, + "learning_rate": 9.444833882731681e-06, + "loss": 0.5476, + "step": 6548 + }, + { + "epoch": 0.53, + "grad_norm": 0.9461525435875031, + "learning_rate": 9.442205489416392e-06, + "loss": 0.5637, + "step": 6549 + }, + { + "epoch": 0.53, + "grad_norm": 0.8698853022532326, + "learning_rate": 9.439577134755763e-06, + "loss": 0.476, + "step": 6550 + }, + { + "epoch": 0.53, + "grad_norm": 0.8221458505598757, + "learning_rate": 9.436948818931947e-06, + "loss": 0.4741, + "step": 6551 + }, + { + "epoch": 0.53, + "grad_norm": 0.8672724748753476, + "learning_rate": 9.434320542127075e-06, + "loss": 0.5206, + "step": 6552 + }, + { + "epoch": 0.53, + "grad_norm": 0.9586388160464261, + "learning_rate": 9.43169230452329e-06, + "loss": 0.5331, + "step": 6553 + }, + { + "epoch": 0.53, + "grad_norm": 0.9699403966085434, + "learning_rate": 9.429064106302724e-06, + "loss": 0.5506, + "step": 6554 + }, + { + "epoch": 0.53, + "grad_norm": 0.9896460820121302, + "learning_rate": 9.426435947647508e-06, + "loss": 0.5081, + "step": 6555 + }, + { + "epoch": 0.53, + "grad_norm": 0.8609140681024577, + "learning_rate": 9.42380782873977e-06, + "loss": 0.4972, + "step": 6556 + }, + { + "epoch": 0.53, + "grad_norm": 0.9419865663383613, + "learning_rate": 9.421179749761643e-06, + "loss": 0.5552, + "step": 6557 + }, + { + "epoch": 0.53, + "grad_norm": 0.8792446463968906, + "learning_rate": 9.418551710895243e-06, + "loss": 0.4424, + "step": 6558 + }, + { + "epoch": 0.53, + "grad_norm": 0.9401380367951486, + "learning_rate": 9.415923712322693e-06, + "loss": 0.4913, + "step": 6559 + }, + { + "epoch": 0.53, + "grad_norm": 0.9564624625255295, + "learning_rate": 9.413295754226115e-06, + "loss": 0.4873, + "step": 6560 + }, + { + "epoch": 0.53, + "grad_norm": 0.9038480801805764, + "learning_rate": 9.410667836787619e-06, + "loss": 0.5728, + "step": 6561 + }, + { + "epoch": 0.53, + "grad_norm": 0.8070238910164431, + "learning_rate": 9.408039960189317e-06, + "loss": 0.4674, + "step": 6562 + }, + { + "epoch": 0.53, + "grad_norm": 0.9804935597251521, + "learning_rate": 9.405412124613325e-06, + "loss": 0.5187, + "step": 6563 + }, + { + "epoch": 0.53, + "grad_norm": 0.8857730008887388, + "learning_rate": 9.402784330241743e-06, + "loss": 0.5002, + "step": 6564 + }, + { + "epoch": 0.53, + "grad_norm": 0.8739355689761309, + "learning_rate": 9.400156577256675e-06, + "loss": 0.5278, + "step": 6565 + }, + { + "epoch": 0.53, + "grad_norm": 0.9579620085111423, + "learning_rate": 9.397528865840229e-06, + "loss": 0.5324, + "step": 6566 + }, + { + "epoch": 0.53, + "grad_norm": 0.8396241787978719, + "learning_rate": 9.394901196174496e-06, + "loss": 0.5279, + "step": 6567 + }, + { + "epoch": 0.53, + "grad_norm": 0.9116099647277938, + "learning_rate": 9.39227356844157e-06, + "loss": 0.4807, + "step": 6568 + }, + { + "epoch": 0.53, + "grad_norm": 0.9085824599768635, + "learning_rate": 9.389645982823552e-06, + "loss": 0.5559, + "step": 6569 + }, + { + "epoch": 0.53, + "grad_norm": 0.946189269407832, + "learning_rate": 9.387018439502524e-06, + "loss": 0.5012, + "step": 6570 + }, + { + "epoch": 0.53, + "grad_norm": 0.8718190611058443, + "learning_rate": 9.384390938660572e-06, + "loss": 0.5061, + "step": 6571 + }, + { + "epoch": 0.53, + "grad_norm": 0.8547334865321193, + "learning_rate": 9.381763480479784e-06, + "loss": 0.5166, + "step": 6572 + }, + { + "epoch": 0.53, + "grad_norm": 1.0652378374846714, + "learning_rate": 9.379136065142241e-06, + "loss": 0.5806, + "step": 6573 + }, + { + "epoch": 0.53, + "grad_norm": 0.8143736521167203, + "learning_rate": 9.376508692830012e-06, + "loss": 0.5379, + "step": 6574 + }, + { + "epoch": 0.53, + "grad_norm": 0.9218503896969388, + "learning_rate": 9.373881363725182e-06, + "loss": 0.5158, + "step": 6575 + }, + { + "epoch": 0.53, + "grad_norm": 0.9985452463714473, + "learning_rate": 9.371254078009819e-06, + "loss": 0.5931, + "step": 6576 + }, + { + "epoch": 0.53, + "grad_norm": 0.9356941082925173, + "learning_rate": 9.368626835865987e-06, + "loss": 0.4867, + "step": 6577 + }, + { + "epoch": 0.53, + "grad_norm": 0.9597854538853388, + "learning_rate": 9.365999637475756e-06, + "loss": 0.609, + "step": 6578 + }, + { + "epoch": 0.53, + "grad_norm": 0.8403719685382467, + "learning_rate": 9.363372483021191e-06, + "loss": 0.4493, + "step": 6579 + }, + { + "epoch": 0.53, + "grad_norm": 0.989244981681324, + "learning_rate": 9.360745372684346e-06, + "loss": 0.6439, + "step": 6580 + }, + { + "epoch": 0.53, + "grad_norm": 0.9416956127576758, + "learning_rate": 9.358118306647278e-06, + "loss": 0.5829, + "step": 6581 + }, + { + "epoch": 0.53, + "grad_norm": 0.9294458423197391, + "learning_rate": 9.355491285092045e-06, + "loss": 0.597, + "step": 6582 + }, + { + "epoch": 0.54, + "grad_norm": 0.8599793380974954, + "learning_rate": 9.352864308200693e-06, + "loss": 0.5003, + "step": 6583 + }, + { + "epoch": 0.54, + "grad_norm": 0.8562934086264922, + "learning_rate": 9.350237376155269e-06, + "loss": 0.4935, + "step": 6584 + }, + { + "epoch": 0.54, + "grad_norm": 0.9499034838164225, + "learning_rate": 9.347610489137821e-06, + "loss": 0.5657, + "step": 6585 + }, + { + "epoch": 0.54, + "grad_norm": 0.9886652459642102, + "learning_rate": 9.344983647330386e-06, + "loss": 0.4546, + "step": 6586 + }, + { + "epoch": 0.54, + "grad_norm": 0.8408184507746581, + "learning_rate": 9.342356850915003e-06, + "loss": 0.4653, + "step": 6587 + }, + { + "epoch": 0.54, + "grad_norm": 0.9170817507583063, + "learning_rate": 9.339730100073709e-06, + "loss": 0.5481, + "step": 6588 + }, + { + "epoch": 0.54, + "grad_norm": 0.8354998641142983, + "learning_rate": 9.33710339498853e-06, + "loss": 0.4542, + "step": 6589 + }, + { + "epoch": 0.54, + "grad_norm": 0.8346026834004256, + "learning_rate": 9.3344767358415e-06, + "loss": 0.4872, + "step": 6590 + }, + { + "epoch": 0.54, + "grad_norm": 0.9166805271898558, + "learning_rate": 9.331850122814644e-06, + "loss": 0.5649, + "step": 6591 + }, + { + "epoch": 0.54, + "grad_norm": 0.9016451210187113, + "learning_rate": 9.329223556089976e-06, + "loss": 0.5336, + "step": 6592 + }, + { + "epoch": 0.54, + "grad_norm": 0.8541527908291205, + "learning_rate": 9.326597035849524e-06, + "loss": 0.514, + "step": 6593 + }, + { + "epoch": 0.54, + "grad_norm": 0.8920893747093328, + "learning_rate": 9.323970562275302e-06, + "loss": 0.5288, + "step": 6594 + }, + { + "epoch": 0.54, + "grad_norm": 0.887453410935998, + "learning_rate": 9.321344135549316e-06, + "loss": 0.5056, + "step": 6595 + }, + { + "epoch": 0.54, + "grad_norm": 0.8794368276693035, + "learning_rate": 9.318717755853583e-06, + "loss": 0.4963, + "step": 6596 + }, + { + "epoch": 0.54, + "grad_norm": 0.8652655941315313, + "learning_rate": 9.316091423370105e-06, + "loss": 0.5329, + "step": 6597 + }, + { + "epoch": 0.54, + "grad_norm": 0.9208644302407073, + "learning_rate": 9.313465138280882e-06, + "loss": 0.5108, + "step": 6598 + }, + { + "epoch": 0.54, + "grad_norm": 0.963722543617233, + "learning_rate": 9.31083890076792e-06, + "loss": 0.575, + "step": 6599 + }, + { + "epoch": 0.54, + "grad_norm": 0.9541286094340493, + "learning_rate": 9.30821271101321e-06, + "loss": 0.5455, + "step": 6600 + }, + { + "epoch": 0.54, + "grad_norm": 1.0065486795180587, + "learning_rate": 9.305586569198742e-06, + "loss": 0.5707, + "step": 6601 + }, + { + "epoch": 0.54, + "grad_norm": 0.9259839776751105, + "learning_rate": 9.30296047550651e-06, + "loss": 0.4889, + "step": 6602 + }, + { + "epoch": 0.54, + "grad_norm": 0.9726764156210098, + "learning_rate": 9.300334430118504e-06, + "loss": 0.5617, + "step": 6603 + }, + { + "epoch": 0.54, + "grad_norm": 0.884838643280822, + "learning_rate": 9.297708433216693e-06, + "loss": 0.503, + "step": 6604 + }, + { + "epoch": 0.54, + "grad_norm": 0.8280384803844053, + "learning_rate": 9.29508248498307e-06, + "loss": 0.4996, + "step": 6605 + }, + { + "epoch": 0.54, + "grad_norm": 0.8906426897381897, + "learning_rate": 9.292456585599607e-06, + "loss": 0.5115, + "step": 6606 + }, + { + "epoch": 0.54, + "grad_norm": 0.996972130518505, + "learning_rate": 9.289830735248269e-06, + "loss": 0.5234, + "step": 6607 + }, + { + "epoch": 0.54, + "grad_norm": 0.8291707987913881, + "learning_rate": 9.287204934111035e-06, + "loss": 0.5578, + "step": 6608 + }, + { + "epoch": 0.54, + "grad_norm": 0.915568083458182, + "learning_rate": 9.284579182369868e-06, + "loss": 0.5419, + "step": 6609 + }, + { + "epoch": 0.54, + "grad_norm": 0.9954268850193402, + "learning_rate": 9.281953480206725e-06, + "loss": 0.522, + "step": 6610 + }, + { + "epoch": 0.54, + "grad_norm": 0.9047193985797033, + "learning_rate": 9.279327827803573e-06, + "loss": 0.4784, + "step": 6611 + }, + { + "epoch": 0.54, + "grad_norm": 0.8701895692193299, + "learning_rate": 9.276702225342363e-06, + "loss": 0.4848, + "step": 6612 + }, + { + "epoch": 0.54, + "grad_norm": 0.9950557491152223, + "learning_rate": 9.274076673005042e-06, + "loss": 0.5792, + "step": 6613 + }, + { + "epoch": 0.54, + "grad_norm": 0.9822586130445328, + "learning_rate": 9.271451170973568e-06, + "loss": 0.4991, + "step": 6614 + }, + { + "epoch": 0.54, + "grad_norm": 0.9107882853370662, + "learning_rate": 9.268825719429884e-06, + "loss": 0.5307, + "step": 6615 + }, + { + "epoch": 0.54, + "grad_norm": 0.9603488113009204, + "learning_rate": 9.266200318555923e-06, + "loss": 0.5128, + "step": 6616 + }, + { + "epoch": 0.54, + "grad_norm": 0.9931853248124778, + "learning_rate": 9.263574968533635e-06, + "loss": 0.5775, + "step": 6617 + }, + { + "epoch": 0.54, + "grad_norm": 0.9562773602607122, + "learning_rate": 9.260949669544946e-06, + "loss": 0.4898, + "step": 6618 + }, + { + "epoch": 0.54, + "grad_norm": 0.9262129155659268, + "learning_rate": 9.258324421771785e-06, + "loss": 0.5486, + "step": 6619 + }, + { + "epoch": 0.54, + "grad_norm": 0.9512134810062653, + "learning_rate": 9.255699225396091e-06, + "loss": 0.5639, + "step": 6620 + }, + { + "epoch": 0.54, + "grad_norm": 0.8827005731153343, + "learning_rate": 9.25307408059978e-06, + "loss": 0.5139, + "step": 6621 + }, + { + "epoch": 0.54, + "grad_norm": 0.8965142343662935, + "learning_rate": 9.250448987564765e-06, + "loss": 0.5929, + "step": 6622 + }, + { + "epoch": 0.54, + "grad_norm": 0.9594934835087743, + "learning_rate": 9.247823946472978e-06, + "loss": 0.5294, + "step": 6623 + }, + { + "epoch": 0.54, + "grad_norm": 0.9948381063542425, + "learning_rate": 9.245198957506324e-06, + "loss": 0.5583, + "step": 6624 + }, + { + "epoch": 0.54, + "grad_norm": 0.9747739230563899, + "learning_rate": 9.242574020846706e-06, + "loss": 0.596, + "step": 6625 + }, + { + "epoch": 0.54, + "grad_norm": 0.9866782142190802, + "learning_rate": 9.239949136676042e-06, + "loss": 0.5165, + "step": 6626 + }, + { + "epoch": 0.54, + "grad_norm": 0.836617718413994, + "learning_rate": 9.23732430517623e-06, + "loss": 0.4931, + "step": 6627 + }, + { + "epoch": 0.54, + "grad_norm": 0.9969240754051573, + "learning_rate": 9.23469952652916e-06, + "loss": 0.5709, + "step": 6628 + }, + { + "epoch": 0.54, + "grad_norm": 1.084925424323057, + "learning_rate": 9.232074800916741e-06, + "loss": 0.5608, + "step": 6629 + }, + { + "epoch": 0.54, + "grad_norm": 0.8014165391119704, + "learning_rate": 9.229450128520856e-06, + "loss": 0.4672, + "step": 6630 + }, + { + "epoch": 0.54, + "grad_norm": 0.9275634877793757, + "learning_rate": 9.22682550952339e-06, + "loss": 0.5796, + "step": 6631 + }, + { + "epoch": 0.54, + "grad_norm": 0.9943343152879219, + "learning_rate": 9.224200944106234e-06, + "loss": 0.5677, + "step": 6632 + }, + { + "epoch": 0.54, + "grad_norm": 0.875222035331371, + "learning_rate": 9.221576432451266e-06, + "loss": 0.5456, + "step": 6633 + }, + { + "epoch": 0.54, + "grad_norm": 0.7647523024721653, + "learning_rate": 9.218951974740354e-06, + "loss": 0.4952, + "step": 6634 + }, + { + "epoch": 0.54, + "grad_norm": 0.8870884362180773, + "learning_rate": 9.216327571155384e-06, + "loss": 0.4923, + "step": 6635 + }, + { + "epoch": 0.54, + "grad_norm": 0.8953762772911025, + "learning_rate": 9.213703221878217e-06, + "loss": 0.5387, + "step": 6636 + }, + { + "epoch": 0.54, + "grad_norm": 0.7947655687162931, + "learning_rate": 9.211078927090714e-06, + "loss": 0.536, + "step": 6637 + }, + { + "epoch": 0.54, + "grad_norm": 0.9651351409257324, + "learning_rate": 9.208454686974748e-06, + "loss": 0.4714, + "step": 6638 + }, + { + "epoch": 0.54, + "grad_norm": 0.9095238031554407, + "learning_rate": 9.205830501712168e-06, + "loss": 0.5529, + "step": 6639 + }, + { + "epoch": 0.54, + "grad_norm": 0.8954728476730376, + "learning_rate": 9.203206371484827e-06, + "loss": 0.4868, + "step": 6640 + }, + { + "epoch": 0.54, + "grad_norm": 0.9236020847975327, + "learning_rate": 9.200582296474581e-06, + "loss": 0.5553, + "step": 6641 + }, + { + "epoch": 0.54, + "grad_norm": 0.973573370807308, + "learning_rate": 9.197958276863274e-06, + "loss": 0.6337, + "step": 6642 + }, + { + "epoch": 0.54, + "grad_norm": 0.8218184879634992, + "learning_rate": 9.195334312832742e-06, + "loss": 0.5422, + "step": 6643 + }, + { + "epoch": 0.54, + "grad_norm": 0.8703081984309136, + "learning_rate": 9.192710404564833e-06, + "loss": 0.5375, + "step": 6644 + }, + { + "epoch": 0.54, + "grad_norm": 0.868266999358897, + "learning_rate": 9.190086552241375e-06, + "loss": 0.5186, + "step": 6645 + }, + { + "epoch": 0.54, + "grad_norm": 0.9387964986316296, + "learning_rate": 9.187462756044198e-06, + "loss": 0.5584, + "step": 6646 + }, + { + "epoch": 0.54, + "grad_norm": 0.9318884814426076, + "learning_rate": 9.184839016155136e-06, + "loss": 0.5393, + "step": 6647 + }, + { + "epoch": 0.54, + "grad_norm": 0.894118676292748, + "learning_rate": 9.182215332756003e-06, + "loss": 0.5073, + "step": 6648 + }, + { + "epoch": 0.54, + "grad_norm": 0.9782985369401567, + "learning_rate": 9.179591706028626e-06, + "loss": 0.5251, + "step": 6649 + }, + { + "epoch": 0.54, + "grad_norm": 0.9914198352405411, + "learning_rate": 9.176968136154815e-06, + "loss": 0.5461, + "step": 6650 + }, + { + "epoch": 0.54, + "grad_norm": 0.8639055680751122, + "learning_rate": 9.174344623316377e-06, + "loss": 0.5449, + "step": 6651 + }, + { + "epoch": 0.54, + "grad_norm": 0.9484262400017193, + "learning_rate": 9.171721167695132e-06, + "loss": 0.4951, + "step": 6652 + }, + { + "epoch": 0.54, + "grad_norm": 0.897854575462643, + "learning_rate": 9.169097769472873e-06, + "loss": 0.5389, + "step": 6653 + }, + { + "epoch": 0.54, + "grad_norm": 0.9094567346832353, + "learning_rate": 9.166474428831399e-06, + "loss": 0.5056, + "step": 6654 + }, + { + "epoch": 0.54, + "grad_norm": 0.8990709731930566, + "learning_rate": 9.16385114595251e-06, + "loss": 0.4614, + "step": 6655 + }, + { + "epoch": 0.54, + "grad_norm": 0.8562388383715867, + "learning_rate": 9.161227921017996e-06, + "loss": 0.5157, + "step": 6656 + }, + { + "epoch": 0.54, + "grad_norm": 0.8713141936842353, + "learning_rate": 9.158604754209637e-06, + "loss": 0.4757, + "step": 6657 + }, + { + "epoch": 0.54, + "grad_norm": 0.8443143174087063, + "learning_rate": 9.15598164570923e-06, + "loss": 0.5069, + "step": 6658 + }, + { + "epoch": 0.54, + "grad_norm": 0.92136601553196, + "learning_rate": 9.153358595698542e-06, + "loss": 0.5045, + "step": 6659 + }, + { + "epoch": 0.54, + "grad_norm": 0.827326202667017, + "learning_rate": 9.15073560435935e-06, + "loss": 0.5152, + "step": 6660 + }, + { + "epoch": 0.54, + "grad_norm": 0.9489168131505338, + "learning_rate": 9.148112671873433e-06, + "loss": 0.5234, + "step": 6661 + }, + { + "epoch": 0.54, + "grad_norm": 0.9148988968297469, + "learning_rate": 9.14548979842255e-06, + "loss": 0.5703, + "step": 6662 + }, + { + "epoch": 0.54, + "grad_norm": 0.9248357145748127, + "learning_rate": 9.142866984188465e-06, + "loss": 0.5427, + "step": 6663 + }, + { + "epoch": 0.54, + "grad_norm": 0.9191461067370301, + "learning_rate": 9.140244229352939e-06, + "loss": 0.5771, + "step": 6664 + }, + { + "epoch": 0.54, + "grad_norm": 0.8586547352456922, + "learning_rate": 9.137621534097727e-06, + "loss": 0.5078, + "step": 6665 + }, + { + "epoch": 0.54, + "grad_norm": 0.9672594179057945, + "learning_rate": 9.134998898604573e-06, + "loss": 0.5568, + "step": 6666 + }, + { + "epoch": 0.54, + "grad_norm": 0.9385908631281702, + "learning_rate": 9.13237632305523e-06, + "loss": 0.5224, + "step": 6667 + }, + { + "epoch": 0.54, + "grad_norm": 0.8288168870807058, + "learning_rate": 9.129753807631441e-06, + "loss": 0.4726, + "step": 6668 + }, + { + "epoch": 0.54, + "grad_norm": 0.822299693879274, + "learning_rate": 9.127131352514936e-06, + "loss": 0.4879, + "step": 6669 + }, + { + "epoch": 0.54, + "grad_norm": 0.9248585728305219, + "learning_rate": 9.124508957887458e-06, + "loss": 0.5317, + "step": 6670 + }, + { + "epoch": 0.54, + "grad_norm": 0.8571133889927011, + "learning_rate": 9.121886623930735e-06, + "loss": 0.55, + "step": 6671 + }, + { + "epoch": 0.54, + "grad_norm": 0.9419444582984893, + "learning_rate": 9.119264350826484e-06, + "loss": 0.538, + "step": 6672 + }, + { + "epoch": 0.54, + "grad_norm": 0.9999021530257328, + "learning_rate": 9.116642138756436e-06, + "loss": 0.5301, + "step": 6673 + }, + { + "epoch": 0.54, + "grad_norm": 0.9350978949571519, + "learning_rate": 9.114019987902305e-06, + "loss": 0.5313, + "step": 6674 + }, + { + "epoch": 0.54, + "grad_norm": 0.8767162700759692, + "learning_rate": 9.111397898445798e-06, + "loss": 0.4861, + "step": 6675 + }, + { + "epoch": 0.54, + "grad_norm": 0.811548256992899, + "learning_rate": 9.108775870568633e-06, + "loss": 0.4757, + "step": 6676 + }, + { + "epoch": 0.54, + "grad_norm": 0.8172579337528821, + "learning_rate": 9.10615390445251e-06, + "loss": 0.4537, + "step": 6677 + }, + { + "epoch": 0.54, + "grad_norm": 0.8618640332615486, + "learning_rate": 9.103532000279126e-06, + "loss": 0.473, + "step": 6678 + }, + { + "epoch": 0.54, + "grad_norm": 0.8147714207686093, + "learning_rate": 9.100910158230181e-06, + "loss": 0.4793, + "step": 6679 + }, + { + "epoch": 0.54, + "grad_norm": 0.8892794701271312, + "learning_rate": 9.098288378487365e-06, + "loss": 0.4554, + "step": 6680 + }, + { + "epoch": 0.54, + "grad_norm": 0.9288749782454486, + "learning_rate": 9.095666661232359e-06, + "loss": 0.5453, + "step": 6681 + }, + { + "epoch": 0.54, + "grad_norm": 0.9337429815153883, + "learning_rate": 9.093045006646858e-06, + "loss": 0.5492, + "step": 6682 + }, + { + "epoch": 0.54, + "grad_norm": 0.8679010196336439, + "learning_rate": 9.090423414912533e-06, + "loss": 0.5516, + "step": 6683 + }, + { + "epoch": 0.54, + "grad_norm": 1.0241778177487386, + "learning_rate": 9.087801886211054e-06, + "loss": 0.5558, + "step": 6684 + }, + { + "epoch": 0.54, + "grad_norm": 0.9616016252490783, + "learning_rate": 9.085180420724098e-06, + "loss": 0.5434, + "step": 6685 + }, + { + "epoch": 0.54, + "grad_norm": 1.0637639240727763, + "learning_rate": 9.08255901863333e-06, + "loss": 0.5941, + "step": 6686 + }, + { + "epoch": 0.54, + "grad_norm": 0.8857949184792532, + "learning_rate": 9.079937680120403e-06, + "loss": 0.513, + "step": 6687 + }, + { + "epoch": 0.54, + "grad_norm": 0.8920117605122396, + "learning_rate": 9.07731640536698e-06, + "loss": 0.5177, + "step": 6688 + }, + { + "epoch": 0.54, + "grad_norm": 0.9771864597037313, + "learning_rate": 9.074695194554716e-06, + "loss": 0.5003, + "step": 6689 + }, + { + "epoch": 0.54, + "grad_norm": 0.8469320043425077, + "learning_rate": 9.072074047865249e-06, + "loss": 0.5068, + "step": 6690 + }, + { + "epoch": 0.54, + "grad_norm": 0.9443738087006439, + "learning_rate": 9.06945296548023e-06, + "loss": 0.5157, + "step": 6691 + }, + { + "epoch": 0.54, + "grad_norm": 0.9300489829476154, + "learning_rate": 9.066831947581297e-06, + "loss": 0.5403, + "step": 6692 + }, + { + "epoch": 0.54, + "grad_norm": 1.053327155730105, + "learning_rate": 9.064210994350077e-06, + "loss": 0.6007, + "step": 6693 + }, + { + "epoch": 0.54, + "grad_norm": 1.4474967110123402, + "learning_rate": 9.061590105968208e-06, + "loss": 0.575, + "step": 6694 + }, + { + "epoch": 0.54, + "grad_norm": 0.8526689997310141, + "learning_rate": 9.058969282617314e-06, + "loss": 0.5209, + "step": 6695 + }, + { + "epoch": 0.54, + "grad_norm": 0.8097094716494864, + "learning_rate": 9.056348524479011e-06, + "loss": 0.4076, + "step": 6696 + }, + { + "epoch": 0.54, + "grad_norm": 0.8654787700636426, + "learning_rate": 9.05372783173492e-06, + "loss": 0.4703, + "step": 6697 + }, + { + "epoch": 0.54, + "grad_norm": 0.8186544312924194, + "learning_rate": 9.051107204566652e-06, + "loss": 0.4329, + "step": 6698 + }, + { + "epoch": 0.54, + "grad_norm": 0.8674267898300603, + "learning_rate": 9.04848664315581e-06, + "loss": 0.5081, + "step": 6699 + }, + { + "epoch": 0.54, + "grad_norm": 0.925496440387974, + "learning_rate": 9.045866147684002e-06, + "loss": 0.6014, + "step": 6700 + }, + { + "epoch": 0.54, + "grad_norm": 0.974342361846761, + "learning_rate": 9.043245718332821e-06, + "loss": 0.5809, + "step": 6701 + }, + { + "epoch": 0.54, + "grad_norm": 0.8923396073658344, + "learning_rate": 9.040625355283865e-06, + "loss": 0.5784, + "step": 6702 + }, + { + "epoch": 0.54, + "grad_norm": 0.861738870749732, + "learning_rate": 9.038005058718722e-06, + "loss": 0.5445, + "step": 6703 + }, + { + "epoch": 0.54, + "grad_norm": 0.8438303457500101, + "learning_rate": 9.035384828818974e-06, + "loss": 0.5452, + "step": 6704 + }, + { + "epoch": 0.54, + "grad_norm": 0.8829006208766645, + "learning_rate": 9.0327646657662e-06, + "loss": 0.5218, + "step": 6705 + }, + { + "epoch": 0.55, + "grad_norm": 0.8418172476199486, + "learning_rate": 9.03014456974198e-06, + "loss": 0.4835, + "step": 6706 + }, + { + "epoch": 0.55, + "grad_norm": 1.008223396610821, + "learning_rate": 9.027524540927878e-06, + "loss": 0.5586, + "step": 6707 + }, + { + "epoch": 0.55, + "grad_norm": 0.8712725958245666, + "learning_rate": 9.024904579505465e-06, + "loss": 0.4877, + "step": 6708 + }, + { + "epoch": 0.55, + "grad_norm": 0.9502419365005647, + "learning_rate": 9.0222846856563e-06, + "loss": 0.5817, + "step": 6709 + }, + { + "epoch": 0.55, + "grad_norm": 0.951721220447067, + "learning_rate": 9.019664859561938e-06, + "loss": 0.5443, + "step": 6710 + }, + { + "epoch": 0.55, + "grad_norm": 0.8979820465371918, + "learning_rate": 9.01704510140393e-06, + "loss": 0.5145, + "step": 6711 + }, + { + "epoch": 0.55, + "grad_norm": 0.9145141115880605, + "learning_rate": 9.014425411363827e-06, + "loss": 0.5198, + "step": 6712 + }, + { + "epoch": 0.55, + "grad_norm": 0.891068921265497, + "learning_rate": 9.011805789623168e-06, + "loss": 0.5452, + "step": 6713 + }, + { + "epoch": 0.55, + "grad_norm": 0.8933266887513471, + "learning_rate": 9.00918623636349e-06, + "loss": 0.5038, + "step": 6714 + }, + { + "epoch": 0.55, + "grad_norm": 0.8983927826301715, + "learning_rate": 9.00656675176633e-06, + "loss": 0.5516, + "step": 6715 + }, + { + "epoch": 0.55, + "grad_norm": 1.145812936653427, + "learning_rate": 9.003947336013212e-06, + "loss": 0.5339, + "step": 6716 + }, + { + "epoch": 0.55, + "grad_norm": 0.800954853444465, + "learning_rate": 9.001327989285658e-06, + "loss": 0.4313, + "step": 6717 + }, + { + "epoch": 0.55, + "grad_norm": 0.9599757274661858, + "learning_rate": 8.99870871176519e-06, + "loss": 0.5393, + "step": 6718 + }, + { + "epoch": 0.55, + "grad_norm": 0.8637559886276752, + "learning_rate": 8.99608950363332e-06, + "loss": 0.4987, + "step": 6719 + }, + { + "epoch": 0.55, + "grad_norm": 0.9177154870428762, + "learning_rate": 8.993470365071557e-06, + "loss": 0.4984, + "step": 6720 + }, + { + "epoch": 0.55, + "grad_norm": 0.8671382758403665, + "learning_rate": 8.990851296261403e-06, + "loss": 0.4806, + "step": 6721 + }, + { + "epoch": 0.55, + "grad_norm": 0.8485265776210158, + "learning_rate": 8.988232297384363e-06, + "loss": 0.5311, + "step": 6722 + }, + { + "epoch": 0.55, + "grad_norm": 0.8996283623951162, + "learning_rate": 8.985613368621923e-06, + "loss": 0.479, + "step": 6723 + }, + { + "epoch": 0.55, + "grad_norm": 0.8339680584043668, + "learning_rate": 8.98299451015558e-06, + "loss": 0.4592, + "step": 6724 + }, + { + "epoch": 0.55, + "grad_norm": 0.911429171245189, + "learning_rate": 8.980375722166816e-06, + "loss": 0.5446, + "step": 6725 + }, + { + "epoch": 0.55, + "grad_norm": 0.9296998668878459, + "learning_rate": 8.977757004837107e-06, + "loss": 0.526, + "step": 6726 + }, + { + "epoch": 0.55, + "grad_norm": 0.8934152401740832, + "learning_rate": 8.975138358347931e-06, + "loss": 0.5168, + "step": 6727 + }, + { + "epoch": 0.55, + "grad_norm": 0.8699340458300051, + "learning_rate": 8.97251978288076e-06, + "loss": 0.5115, + "step": 6728 + }, + { + "epoch": 0.55, + "grad_norm": 0.8042443273524694, + "learning_rate": 8.969901278617056e-06, + "loss": 0.493, + "step": 6729 + }, + { + "epoch": 0.55, + "grad_norm": 0.9139150214408609, + "learning_rate": 8.967282845738278e-06, + "loss": 0.5782, + "step": 6730 + }, + { + "epoch": 0.55, + "grad_norm": 0.9087431719596738, + "learning_rate": 8.964664484425887e-06, + "loss": 0.4987, + "step": 6731 + }, + { + "epoch": 0.55, + "grad_norm": 0.9863678036432829, + "learning_rate": 8.962046194861324e-06, + "loss": 0.5613, + "step": 6732 + }, + { + "epoch": 0.55, + "grad_norm": 0.8027492964740189, + "learning_rate": 8.959427977226041e-06, + "loss": 0.5256, + "step": 6733 + }, + { + "epoch": 0.55, + "grad_norm": 0.9706281749582049, + "learning_rate": 8.956809831701478e-06, + "loss": 0.5961, + "step": 6734 + }, + { + "epoch": 0.55, + "grad_norm": 0.837992334486424, + "learning_rate": 8.954191758469065e-06, + "loss": 0.5812, + "step": 6735 + }, + { + "epoch": 0.55, + "grad_norm": 0.9209016693375767, + "learning_rate": 8.951573757710237e-06, + "loss": 0.5021, + "step": 6736 + }, + { + "epoch": 0.55, + "grad_norm": 0.8954100318518413, + "learning_rate": 8.948955829606419e-06, + "loss": 0.5376, + "step": 6737 + }, + { + "epoch": 0.55, + "grad_norm": 0.8414160352133793, + "learning_rate": 8.946337974339025e-06, + "loss": 0.4799, + "step": 6738 + }, + { + "epoch": 0.55, + "grad_norm": 0.9261969375567992, + "learning_rate": 8.94372019208948e-06, + "loss": 0.5435, + "step": 6739 + }, + { + "epoch": 0.55, + "grad_norm": 0.8862438885604225, + "learning_rate": 8.941102483039188e-06, + "loss": 0.4679, + "step": 6740 + }, + { + "epoch": 0.55, + "grad_norm": 0.998787432571941, + "learning_rate": 8.938484847369552e-06, + "loss": 0.529, + "step": 6741 + }, + { + "epoch": 0.55, + "grad_norm": 0.933973434550398, + "learning_rate": 8.935867285261977e-06, + "loss": 0.5466, + "step": 6742 + }, + { + "epoch": 0.55, + "grad_norm": 0.8179520104205413, + "learning_rate": 8.933249796897857e-06, + "loss": 0.5482, + "step": 6743 + }, + { + "epoch": 0.55, + "grad_norm": 1.0059179488657666, + "learning_rate": 8.930632382458574e-06, + "loss": 0.5231, + "step": 6744 + }, + { + "epoch": 0.55, + "grad_norm": 0.8559062638771029, + "learning_rate": 8.928015042125523e-06, + "loss": 0.5056, + "step": 6745 + }, + { + "epoch": 0.55, + "grad_norm": 0.9097764966615546, + "learning_rate": 8.92539777608008e-06, + "loss": 0.5607, + "step": 6746 + }, + { + "epoch": 0.55, + "grad_norm": 0.9397337848518229, + "learning_rate": 8.92278058450361e-06, + "loss": 0.5203, + "step": 6747 + }, + { + "epoch": 0.55, + "grad_norm": 0.9255883182156407, + "learning_rate": 8.920163467577498e-06, + "loss": 0.5321, + "step": 6748 + }, + { + "epoch": 0.55, + "grad_norm": 1.0030233998599511, + "learning_rate": 8.9175464254831e-06, + "loss": 0.5644, + "step": 6749 + }, + { + "epoch": 0.55, + "grad_norm": 0.8286218798653401, + "learning_rate": 8.914929458401767e-06, + "loss": 0.5238, + "step": 6750 + }, + { + "epoch": 0.55, + "grad_norm": 0.8400491207198404, + "learning_rate": 8.912312566514867e-06, + "loss": 0.5077, + "step": 6751 + }, + { + "epoch": 0.55, + "grad_norm": 1.0080073977252997, + "learning_rate": 8.909695750003741e-06, + "loss": 0.5366, + "step": 6752 + }, + { + "epoch": 0.55, + "grad_norm": 0.9119849927606701, + "learning_rate": 8.907079009049728e-06, + "loss": 0.4794, + "step": 6753 + }, + { + "epoch": 0.55, + "grad_norm": 0.9238807903194287, + "learning_rate": 8.904462343834174e-06, + "loss": 0.4932, + "step": 6754 + }, + { + "epoch": 0.55, + "grad_norm": 0.8953783979565785, + "learning_rate": 8.901845754538408e-06, + "loss": 0.5132, + "step": 6755 + }, + { + "epoch": 0.55, + "grad_norm": 0.8881835413458181, + "learning_rate": 8.899229241343753e-06, + "loss": 0.5388, + "step": 6756 + }, + { + "epoch": 0.55, + "grad_norm": 0.8059493226002098, + "learning_rate": 8.89661280443154e-06, + "loss": 0.4756, + "step": 6757 + }, + { + "epoch": 0.55, + "grad_norm": 0.8589226325513761, + "learning_rate": 8.89399644398308e-06, + "loss": 0.5012, + "step": 6758 + }, + { + "epoch": 0.55, + "grad_norm": 0.8661157764886962, + "learning_rate": 8.891380160179683e-06, + "loss": 0.5472, + "step": 6759 + }, + { + "epoch": 0.55, + "grad_norm": 0.8802847133080646, + "learning_rate": 8.88876395320266e-06, + "loss": 0.5278, + "step": 6760 + }, + { + "epoch": 0.55, + "grad_norm": 0.839132635416687, + "learning_rate": 8.88614782323331e-06, + "loss": 0.5119, + "step": 6761 + }, + { + "epoch": 0.55, + "grad_norm": 0.8662673702579419, + "learning_rate": 8.883531770452924e-06, + "loss": 0.5376, + "step": 6762 + }, + { + "epoch": 0.55, + "grad_norm": 0.8629463522649556, + "learning_rate": 8.880915795042798e-06, + "loss": 0.4509, + "step": 6763 + }, + { + "epoch": 0.55, + "grad_norm": 0.9198570897646432, + "learning_rate": 8.878299897184218e-06, + "loss": 0.5309, + "step": 6764 + }, + { + "epoch": 0.55, + "grad_norm": 0.9682401080681828, + "learning_rate": 8.875684077058453e-06, + "loss": 0.5539, + "step": 6765 + }, + { + "epoch": 0.55, + "grad_norm": 0.9131512395292984, + "learning_rate": 8.87306833484679e-06, + "loss": 0.5804, + "step": 6766 + }, + { + "epoch": 0.55, + "grad_norm": 0.9043883538180063, + "learning_rate": 8.870452670730491e-06, + "loss": 0.5376, + "step": 6767 + }, + { + "epoch": 0.55, + "grad_norm": 0.8842007847462771, + "learning_rate": 8.867837084890817e-06, + "loss": 0.5603, + "step": 6768 + }, + { + "epoch": 0.55, + "grad_norm": 0.8917436434495278, + "learning_rate": 8.865221577509034e-06, + "loss": 0.5391, + "step": 6769 + }, + { + "epoch": 0.55, + "grad_norm": 0.8969528174452776, + "learning_rate": 8.862606148766386e-06, + "loss": 0.5521, + "step": 6770 + }, + { + "epoch": 0.55, + "grad_norm": 0.8549046038065036, + "learning_rate": 8.85999079884412e-06, + "loss": 0.5104, + "step": 6771 + }, + { + "epoch": 0.55, + "grad_norm": 0.8370046492826245, + "learning_rate": 8.857375527923487e-06, + "loss": 0.4694, + "step": 6772 + }, + { + "epoch": 0.55, + "grad_norm": 0.9201091933672658, + "learning_rate": 8.854760336185709e-06, + "loss": 0.5549, + "step": 6773 + }, + { + "epoch": 0.55, + "grad_norm": 0.9192420832976912, + "learning_rate": 8.85214522381203e-06, + "loss": 0.5098, + "step": 6774 + }, + { + "epoch": 0.55, + "grad_norm": 0.8566796001040332, + "learning_rate": 8.849530190983669e-06, + "loss": 0.4605, + "step": 6775 + }, + { + "epoch": 0.55, + "grad_norm": 0.8527635064927266, + "learning_rate": 8.846915237881838e-06, + "loss": 0.5575, + "step": 6776 + }, + { + "epoch": 0.55, + "grad_norm": 0.8049186625770314, + "learning_rate": 8.844300364687766e-06, + "loss": 0.4864, + "step": 6777 + }, + { + "epoch": 0.55, + "grad_norm": 0.874725545010611, + "learning_rate": 8.841685571582652e-06, + "loss": 0.5308, + "step": 6778 + }, + { + "epoch": 0.55, + "grad_norm": 0.9224938208424457, + "learning_rate": 8.839070858747697e-06, + "loss": 0.5409, + "step": 6779 + }, + { + "epoch": 0.55, + "grad_norm": 0.8814961698465333, + "learning_rate": 8.836456226364106e-06, + "loss": 0.5138, + "step": 6780 + }, + { + "epoch": 0.55, + "grad_norm": 0.9087348939049014, + "learning_rate": 8.833841674613066e-06, + "loss": 0.548, + "step": 6781 + }, + { + "epoch": 0.55, + "grad_norm": 0.9069806988718064, + "learning_rate": 8.831227203675759e-06, + "loss": 0.5685, + "step": 6782 + }, + { + "epoch": 0.55, + "grad_norm": 0.9541666664847532, + "learning_rate": 8.828612813733375e-06, + "loss": 0.5086, + "step": 6783 + }, + { + "epoch": 0.55, + "grad_norm": 0.8871536751057871, + "learning_rate": 8.825998504967083e-06, + "loss": 0.5631, + "step": 6784 + }, + { + "epoch": 0.55, + "grad_norm": 0.8795996020980996, + "learning_rate": 8.823384277558049e-06, + "loss": 0.5313, + "step": 6785 + }, + { + "epoch": 0.55, + "grad_norm": 0.9247932552615178, + "learning_rate": 8.820770131687447e-06, + "loss": 0.5294, + "step": 6786 + }, + { + "epoch": 0.55, + "grad_norm": 0.8857003101974371, + "learning_rate": 8.818156067536428e-06, + "loss": 0.5431, + "step": 6787 + }, + { + "epoch": 0.55, + "grad_norm": 0.9214931834118468, + "learning_rate": 8.81554208528614e-06, + "loss": 0.5022, + "step": 6788 + }, + { + "epoch": 0.55, + "grad_norm": 0.9891148743193596, + "learning_rate": 8.81292818511774e-06, + "loss": 0.5028, + "step": 6789 + }, + { + "epoch": 0.55, + "grad_norm": 0.8078361977439683, + "learning_rate": 8.810314367212363e-06, + "loss": 0.4507, + "step": 6790 + }, + { + "epoch": 0.55, + "grad_norm": 0.8869455383110579, + "learning_rate": 8.807700631751142e-06, + "loss": 0.5118, + "step": 6791 + }, + { + "epoch": 0.55, + "grad_norm": 0.9101666591005333, + "learning_rate": 8.805086978915215e-06, + "loss": 0.5276, + "step": 6792 + }, + { + "epoch": 0.55, + "grad_norm": 0.8699626001339014, + "learning_rate": 8.802473408885698e-06, + "loss": 0.5151, + "step": 6793 + }, + { + "epoch": 0.55, + "grad_norm": 0.8984143344812208, + "learning_rate": 8.799859921843708e-06, + "loss": 0.5043, + "step": 6794 + }, + { + "epoch": 0.55, + "grad_norm": 0.8903818770283506, + "learning_rate": 8.797246517970365e-06, + "loss": 0.4752, + "step": 6795 + }, + { + "epoch": 0.55, + "grad_norm": 0.8499060192340044, + "learning_rate": 8.79463319744677e-06, + "loss": 0.5005, + "step": 6796 + }, + { + "epoch": 0.55, + "grad_norm": 0.8716575656378632, + "learning_rate": 8.792019960454025e-06, + "loss": 0.5114, + "step": 6797 + }, + { + "epoch": 0.55, + "grad_norm": 0.9138479375865836, + "learning_rate": 8.789406807173226e-06, + "loss": 0.5285, + "step": 6798 + }, + { + "epoch": 0.55, + "grad_norm": 0.9210455881917426, + "learning_rate": 8.786793737785465e-06, + "loss": 0.5623, + "step": 6799 + }, + { + "epoch": 0.55, + "grad_norm": 1.059953054962172, + "learning_rate": 8.784180752471814e-06, + "loss": 0.6021, + "step": 6800 + }, + { + "epoch": 0.55, + "grad_norm": 0.9384117653547553, + "learning_rate": 8.781567851413363e-06, + "loss": 0.4719, + "step": 6801 + }, + { + "epoch": 0.55, + "grad_norm": 0.9469162552609387, + "learning_rate": 8.77895503479118e-06, + "loss": 0.544, + "step": 6802 + }, + { + "epoch": 0.55, + "grad_norm": 0.918780292574788, + "learning_rate": 8.776342302786327e-06, + "loss": 0.5458, + "step": 6803 + }, + { + "epoch": 0.55, + "grad_norm": 0.8163978896404098, + "learning_rate": 8.773729655579872e-06, + "loss": 0.4747, + "step": 6804 + }, + { + "epoch": 0.55, + "grad_norm": 0.8581993144879657, + "learning_rate": 8.771117093352861e-06, + "loss": 0.4709, + "step": 6805 + }, + { + "epoch": 0.55, + "grad_norm": 0.7416031093198225, + "learning_rate": 8.768504616286343e-06, + "loss": 0.4403, + "step": 6806 + }, + { + "epoch": 0.55, + "grad_norm": 0.9415202265596095, + "learning_rate": 8.765892224561367e-06, + "loss": 0.4747, + "step": 6807 + }, + { + "epoch": 0.55, + "grad_norm": 0.8392199576195705, + "learning_rate": 8.763279918358965e-06, + "loss": 0.4821, + "step": 6808 + }, + { + "epoch": 0.55, + "grad_norm": 0.9156867689783389, + "learning_rate": 8.760667697860163e-06, + "loss": 0.5173, + "step": 6809 + }, + { + "epoch": 0.55, + "grad_norm": 0.8526700674513049, + "learning_rate": 8.758055563245994e-06, + "loss": 0.4999, + "step": 6810 + }, + { + "epoch": 0.55, + "grad_norm": 0.9897941428051377, + "learning_rate": 8.755443514697475e-06, + "loss": 0.5442, + "step": 6811 + }, + { + "epoch": 0.55, + "grad_norm": 0.9994090842074854, + "learning_rate": 8.75283155239561e-06, + "loss": 0.5672, + "step": 6812 + }, + { + "epoch": 0.55, + "grad_norm": 0.9643497009203205, + "learning_rate": 8.750219676521417e-06, + "loss": 0.5924, + "step": 6813 + }, + { + "epoch": 0.55, + "grad_norm": 0.9052958275216263, + "learning_rate": 8.747607887255892e-06, + "loss": 0.4995, + "step": 6814 + }, + { + "epoch": 0.55, + "grad_norm": 0.9110981702917157, + "learning_rate": 8.744996184780027e-06, + "loss": 0.5191, + "step": 6815 + }, + { + "epoch": 0.55, + "grad_norm": 0.9418260441700346, + "learning_rate": 8.742384569274815e-06, + "loss": 0.5403, + "step": 6816 + }, + { + "epoch": 0.55, + "grad_norm": 0.9570874072452588, + "learning_rate": 8.739773040921237e-06, + "loss": 0.5515, + "step": 6817 + }, + { + "epoch": 0.55, + "grad_norm": 0.8969656147692997, + "learning_rate": 8.737161599900267e-06, + "loss": 0.5075, + "step": 6818 + }, + { + "epoch": 0.55, + "grad_norm": 0.8998191286869733, + "learning_rate": 8.734550246392881e-06, + "loss": 0.5951, + "step": 6819 + }, + { + "epoch": 0.55, + "grad_norm": 0.8362595923787626, + "learning_rate": 8.73193898058004e-06, + "loss": 0.4867, + "step": 6820 + }, + { + "epoch": 0.55, + "grad_norm": 0.8519217677907318, + "learning_rate": 8.7293278026427e-06, + "loss": 0.545, + "step": 6821 + }, + { + "epoch": 0.55, + "grad_norm": 0.9234059581598991, + "learning_rate": 8.726716712761821e-06, + "loss": 0.5394, + "step": 6822 + }, + { + "epoch": 0.55, + "grad_norm": 0.8558200276209291, + "learning_rate": 8.724105711118342e-06, + "loss": 0.5496, + "step": 6823 + }, + { + "epoch": 0.55, + "grad_norm": 0.8377667534966279, + "learning_rate": 8.721494797893201e-06, + "loss": 0.5268, + "step": 6824 + }, + { + "epoch": 0.55, + "grad_norm": 0.8517243807535965, + "learning_rate": 8.718883973267344e-06, + "loss": 0.5228, + "step": 6825 + }, + { + "epoch": 0.55, + "grad_norm": 0.8573727687125215, + "learning_rate": 8.716273237421688e-06, + "loss": 0.4513, + "step": 6826 + }, + { + "epoch": 0.55, + "grad_norm": 0.8949363228005951, + "learning_rate": 8.713662590537155e-06, + "loss": 0.5142, + "step": 6827 + }, + { + "epoch": 0.55, + "grad_norm": 0.9823612055364075, + "learning_rate": 8.711052032794668e-06, + "loss": 0.6011, + "step": 6828 + }, + { + "epoch": 0.56, + "grad_norm": 0.9043280010894629, + "learning_rate": 8.708441564375132e-06, + "loss": 0.5257, + "step": 6829 + }, + { + "epoch": 0.56, + "grad_norm": 0.8884025036511776, + "learning_rate": 8.705831185459446e-06, + "loss": 0.5371, + "step": 6830 + }, + { + "epoch": 0.56, + "grad_norm": 0.8851961931117828, + "learning_rate": 8.703220896228515e-06, + "loss": 0.5296, + "step": 6831 + }, + { + "epoch": 0.56, + "grad_norm": 0.9496141722730437, + "learning_rate": 8.700610696863225e-06, + "loss": 0.5642, + "step": 6832 + }, + { + "epoch": 0.56, + "grad_norm": 0.8771895138899715, + "learning_rate": 8.698000587544457e-06, + "loss": 0.5325, + "step": 6833 + }, + { + "epoch": 0.56, + "grad_norm": 0.8952605449602975, + "learning_rate": 8.695390568453099e-06, + "loss": 0.5094, + "step": 6834 + }, + { + "epoch": 0.56, + "grad_norm": 0.9544184892925124, + "learning_rate": 8.692780639770015e-06, + "loss": 0.5366, + "step": 6835 + }, + { + "epoch": 0.56, + "grad_norm": 0.9309194420917352, + "learning_rate": 8.69017080167607e-06, + "loss": 0.541, + "step": 6836 + }, + { + "epoch": 0.56, + "grad_norm": 0.8723271018988956, + "learning_rate": 8.687561054352132e-06, + "loss": 0.4398, + "step": 6837 + }, + { + "epoch": 0.56, + "grad_norm": 0.8901099533404632, + "learning_rate": 8.684951397979049e-06, + "loss": 0.4952, + "step": 6838 + }, + { + "epoch": 0.56, + "grad_norm": 0.9158614565675948, + "learning_rate": 8.68234183273766e-06, + "loss": 0.498, + "step": 6839 + }, + { + "epoch": 0.56, + "grad_norm": 0.9034404937210446, + "learning_rate": 8.679732358808822e-06, + "loss": 0.5241, + "step": 6840 + }, + { + "epoch": 0.56, + "grad_norm": 0.91821176168655, + "learning_rate": 8.677122976373356e-06, + "loss": 0.5759, + "step": 6841 + }, + { + "epoch": 0.56, + "grad_norm": 0.9047712664252271, + "learning_rate": 8.674513685612093e-06, + "loss": 0.5376, + "step": 6842 + }, + { + "epoch": 0.56, + "grad_norm": 0.9452668129583185, + "learning_rate": 8.67190448670586e-06, + "loss": 0.5454, + "step": 6843 + }, + { + "epoch": 0.56, + "grad_norm": 0.9045047109963618, + "learning_rate": 8.669295379835467e-06, + "loss": 0.4366, + "step": 6844 + }, + { + "epoch": 0.56, + "grad_norm": 0.848893201507573, + "learning_rate": 8.66668636518172e-06, + "loss": 0.5043, + "step": 6845 + }, + { + "epoch": 0.56, + "grad_norm": 0.9522331654771756, + "learning_rate": 8.66407744292543e-06, + "loss": 0.5705, + "step": 6846 + }, + { + "epoch": 0.56, + "grad_norm": 0.8362035815919093, + "learning_rate": 8.661468613247387e-06, + "loss": 0.528, + "step": 6847 + }, + { + "epoch": 0.56, + "grad_norm": 0.8439852570234441, + "learning_rate": 8.658859876328379e-06, + "loss": 0.5723, + "step": 6848 + }, + { + "epoch": 0.56, + "grad_norm": 0.8729192359407367, + "learning_rate": 8.656251232349196e-06, + "loss": 0.5041, + "step": 6849 + }, + { + "epoch": 0.56, + "grad_norm": 0.9365431033561334, + "learning_rate": 8.653642681490608e-06, + "loss": 0.541, + "step": 6850 + }, + { + "epoch": 0.56, + "grad_norm": 0.9090828954084572, + "learning_rate": 8.651034223933387e-06, + "loss": 0.5001, + "step": 6851 + }, + { + "epoch": 0.56, + "grad_norm": 0.9040470072628123, + "learning_rate": 8.648425859858302e-06, + "loss": 0.5238, + "step": 6852 + }, + { + "epoch": 0.56, + "grad_norm": 0.8724924522448079, + "learning_rate": 8.645817589446104e-06, + "loss": 0.5102, + "step": 6853 + }, + { + "epoch": 0.56, + "grad_norm": 0.9094503790788749, + "learning_rate": 8.643209412877545e-06, + "loss": 0.4854, + "step": 6854 + }, + { + "epoch": 0.56, + "grad_norm": 0.8456235151710461, + "learning_rate": 8.640601330333372e-06, + "loss": 0.4561, + "step": 6855 + }, + { + "epoch": 0.56, + "grad_norm": 0.847154330493836, + "learning_rate": 8.637993341994318e-06, + "loss": 0.4553, + "step": 6856 + }, + { + "epoch": 0.56, + "grad_norm": 0.9621209098974626, + "learning_rate": 8.63538544804112e-06, + "loss": 0.6182, + "step": 6857 + }, + { + "epoch": 0.56, + "grad_norm": 0.8996509203921742, + "learning_rate": 8.6327776486545e-06, + "loss": 0.557, + "step": 6858 + }, + { + "epoch": 0.56, + "grad_norm": 0.9084173699307313, + "learning_rate": 8.630169944015175e-06, + "loss": 0.5676, + "step": 6859 + }, + { + "epoch": 0.56, + "grad_norm": 0.9108279490107368, + "learning_rate": 8.627562334303856e-06, + "loss": 0.5245, + "step": 6860 + }, + { + "epoch": 0.56, + "grad_norm": 0.951437155317072, + "learning_rate": 8.624954819701254e-06, + "loss": 0.5329, + "step": 6861 + }, + { + "epoch": 0.56, + "grad_norm": 0.8850829837968626, + "learning_rate": 8.62234740038806e-06, + "loss": 0.5477, + "step": 6862 + }, + { + "epoch": 0.56, + "grad_norm": 0.9464888994757814, + "learning_rate": 8.61974007654497e-06, + "loss": 0.5163, + "step": 6863 + }, + { + "epoch": 0.56, + "grad_norm": 0.8720404929484166, + "learning_rate": 8.617132848352672e-06, + "loss": 0.5445, + "step": 6864 + }, + { + "epoch": 0.56, + "grad_norm": 0.8728133761361274, + "learning_rate": 8.614525715991838e-06, + "loss": 0.4891, + "step": 6865 + }, + { + "epoch": 0.56, + "grad_norm": 0.8127871687528759, + "learning_rate": 8.61191867964314e-06, + "loss": 0.5541, + "step": 6866 + }, + { + "epoch": 0.56, + "grad_norm": 0.9500036866001732, + "learning_rate": 8.60931173948725e-06, + "loss": 0.5357, + "step": 6867 + }, + { + "epoch": 0.56, + "grad_norm": 0.9133395294343943, + "learning_rate": 8.606704895704824e-06, + "loss": 0.5075, + "step": 6868 + }, + { + "epoch": 0.56, + "grad_norm": 0.8935830929853792, + "learning_rate": 8.60409814847651e-06, + "loss": 0.5594, + "step": 6869 + }, + { + "epoch": 0.56, + "grad_norm": 0.8317842659008998, + "learning_rate": 8.601491497982956e-06, + "loss": 0.4818, + "step": 6870 + }, + { + "epoch": 0.56, + "grad_norm": 0.8450371891233477, + "learning_rate": 8.598884944404803e-06, + "loss": 0.5497, + "step": 6871 + }, + { + "epoch": 0.56, + "grad_norm": 0.8318739755346117, + "learning_rate": 8.59627848792268e-06, + "loss": 0.4475, + "step": 6872 + }, + { + "epoch": 0.56, + "grad_norm": 0.8811775776785117, + "learning_rate": 8.593672128717211e-06, + "loss": 0.549, + "step": 6873 + }, + { + "epoch": 0.56, + "grad_norm": 0.9664851965942245, + "learning_rate": 8.591065866969019e-06, + "loss": 0.5226, + "step": 6874 + }, + { + "epoch": 0.56, + "grad_norm": 0.896869716587723, + "learning_rate": 8.588459702858709e-06, + "loss": 0.5001, + "step": 6875 + }, + { + "epoch": 0.56, + "grad_norm": 0.8946735161087283, + "learning_rate": 8.585853636566891e-06, + "loss": 0.5486, + "step": 6876 + }, + { + "epoch": 0.56, + "grad_norm": 0.9655935262527788, + "learning_rate": 8.583247668274163e-06, + "loss": 0.5092, + "step": 6877 + }, + { + "epoch": 0.56, + "grad_norm": 0.8410789878708989, + "learning_rate": 8.580641798161114e-06, + "loss": 0.544, + "step": 6878 + }, + { + "epoch": 0.56, + "grad_norm": 0.8277452055258614, + "learning_rate": 8.578036026408329e-06, + "loss": 0.4597, + "step": 6879 + }, + { + "epoch": 0.56, + "grad_norm": 0.815244774195561, + "learning_rate": 8.575430353196388e-06, + "loss": 0.444, + "step": 6880 + }, + { + "epoch": 0.56, + "grad_norm": 0.9093735584737653, + "learning_rate": 8.572824778705858e-06, + "loss": 0.5305, + "step": 6881 + }, + { + "epoch": 0.56, + "grad_norm": 0.824061186847819, + "learning_rate": 8.570219303117305e-06, + "loss": 0.5214, + "step": 6882 + }, + { + "epoch": 0.56, + "grad_norm": 0.8606650396604164, + "learning_rate": 8.567613926611287e-06, + "loss": 0.5188, + "step": 6883 + }, + { + "epoch": 0.56, + "grad_norm": 0.8166098225852793, + "learning_rate": 8.565008649368353e-06, + "loss": 0.4376, + "step": 6884 + }, + { + "epoch": 0.56, + "grad_norm": 0.8347231452093975, + "learning_rate": 8.562403471569045e-06, + "loss": 0.547, + "step": 6885 + }, + { + "epoch": 0.56, + "grad_norm": 0.8097988214796327, + "learning_rate": 8.559798393393905e-06, + "loss": 0.5447, + "step": 6886 + }, + { + "epoch": 0.56, + "grad_norm": 0.9244774758950597, + "learning_rate": 8.557193415023453e-06, + "loss": 0.5247, + "step": 6887 + }, + { + "epoch": 0.56, + "grad_norm": 0.9112277686677142, + "learning_rate": 8.554588536638223e-06, + "loss": 0.5273, + "step": 6888 + }, + { + "epoch": 0.56, + "grad_norm": 0.9646248263987668, + "learning_rate": 8.551983758418726e-06, + "loss": 0.5838, + "step": 6889 + }, + { + "epoch": 0.56, + "grad_norm": 0.9010237227640583, + "learning_rate": 8.549379080545465e-06, + "loss": 0.5373, + "step": 6890 + }, + { + "epoch": 0.56, + "grad_norm": 0.8840857549032455, + "learning_rate": 8.546774503198952e-06, + "loss": 0.5212, + "step": 6891 + }, + { + "epoch": 0.56, + "grad_norm": 0.8703836152904741, + "learning_rate": 8.544170026559675e-06, + "loss": 0.515, + "step": 6892 + }, + { + "epoch": 0.56, + "grad_norm": 0.9110363190568741, + "learning_rate": 8.541565650808121e-06, + "loss": 0.536, + "step": 6893 + }, + { + "epoch": 0.56, + "grad_norm": 0.8667711655341527, + "learning_rate": 8.538961376124778e-06, + "loss": 0.4941, + "step": 6894 + }, + { + "epoch": 0.56, + "grad_norm": 0.8986411060576323, + "learning_rate": 8.536357202690115e-06, + "loss": 0.5344, + "step": 6895 + }, + { + "epoch": 0.56, + "grad_norm": 0.8943813528753555, + "learning_rate": 8.533753130684596e-06, + "loss": 0.5101, + "step": 6896 + }, + { + "epoch": 0.56, + "grad_norm": 0.7868350464166751, + "learning_rate": 8.531149160288689e-06, + "loss": 0.4988, + "step": 6897 + }, + { + "epoch": 0.56, + "grad_norm": 0.8812378069066342, + "learning_rate": 8.528545291682839e-06, + "loss": 0.4645, + "step": 6898 + }, + { + "epoch": 0.56, + "grad_norm": 0.8133762449585381, + "learning_rate": 8.525941525047497e-06, + "loss": 0.4826, + "step": 6899 + }, + { + "epoch": 0.56, + "grad_norm": 0.9905503171322216, + "learning_rate": 8.5233378605631e-06, + "loss": 0.5461, + "step": 6900 + }, + { + "epoch": 0.56, + "grad_norm": 0.9258311041322348, + "learning_rate": 8.520734298410078e-06, + "loss": 0.5587, + "step": 6901 + }, + { + "epoch": 0.56, + "grad_norm": 0.8495388917943362, + "learning_rate": 8.51813083876886e-06, + "loss": 0.5298, + "step": 6902 + }, + { + "epoch": 0.56, + "grad_norm": 0.9334580528775865, + "learning_rate": 8.51552748181986e-06, + "loss": 0.4874, + "step": 6903 + }, + { + "epoch": 0.56, + "grad_norm": 0.8939242695409655, + "learning_rate": 8.512924227743482e-06, + "loss": 0.5229, + "step": 6904 + }, + { + "epoch": 0.56, + "grad_norm": 0.8807139695796883, + "learning_rate": 8.510321076720143e-06, + "loss": 0.5246, + "step": 6905 + }, + { + "epoch": 0.56, + "grad_norm": 0.8559006349127389, + "learning_rate": 8.507718028930232e-06, + "loss": 0.4901, + "step": 6906 + }, + { + "epoch": 0.56, + "grad_norm": 1.0000213916080734, + "learning_rate": 8.505115084554133e-06, + "loss": 0.5512, + "step": 6907 + }, + { + "epoch": 0.56, + "grad_norm": 0.8732842596234691, + "learning_rate": 8.502512243772238e-06, + "loss": 0.5052, + "step": 6908 + }, + { + "epoch": 0.56, + "grad_norm": 0.883491030341863, + "learning_rate": 8.499909506764914e-06, + "loss": 0.4797, + "step": 6909 + }, + { + "epoch": 0.56, + "grad_norm": 0.8306560563324287, + "learning_rate": 8.497306873712529e-06, + "loss": 0.5345, + "step": 6910 + }, + { + "epoch": 0.56, + "grad_norm": 0.9538067482071597, + "learning_rate": 8.494704344795447e-06, + "loss": 0.5573, + "step": 6911 + }, + { + "epoch": 0.56, + "grad_norm": 0.8159645915961217, + "learning_rate": 8.49210192019402e-06, + "loss": 0.4658, + "step": 6912 + }, + { + "epoch": 0.56, + "grad_norm": 0.8417783364151031, + "learning_rate": 8.489499600088587e-06, + "loss": 0.4694, + "step": 6913 + }, + { + "epoch": 0.56, + "grad_norm": 0.9480484593522762, + "learning_rate": 8.486897384659496e-06, + "loss": 0.5814, + "step": 6914 + }, + { + "epoch": 0.56, + "grad_norm": 0.9328104862498665, + "learning_rate": 8.484295274087077e-06, + "loss": 0.5421, + "step": 6915 + }, + { + "epoch": 0.56, + "grad_norm": 0.9775408046991969, + "learning_rate": 8.481693268551645e-06, + "loss": 0.5591, + "step": 6916 + }, + { + "epoch": 0.56, + "grad_norm": 0.9216249536740916, + "learning_rate": 8.479091368233527e-06, + "loss": 0.5456, + "step": 6917 + }, + { + "epoch": 0.56, + "grad_norm": 0.8600506186216511, + "learning_rate": 8.476489573313026e-06, + "loss": 0.4522, + "step": 6918 + }, + { + "epoch": 0.56, + "grad_norm": 0.9424247242351319, + "learning_rate": 8.473887883970444e-06, + "loss": 0.5218, + "step": 6919 + }, + { + "epoch": 0.56, + "grad_norm": 0.9545641299517474, + "learning_rate": 8.471286300386084e-06, + "loss": 0.5171, + "step": 6920 + }, + { + "epoch": 0.56, + "grad_norm": 0.9165517930847911, + "learning_rate": 8.468684822740226e-06, + "loss": 0.554, + "step": 6921 + }, + { + "epoch": 0.56, + "grad_norm": 0.8727313173381859, + "learning_rate": 8.466083451213145e-06, + "loss": 0.4691, + "step": 6922 + }, + { + "epoch": 0.56, + "grad_norm": 0.8684377015353146, + "learning_rate": 8.463482185985127e-06, + "loss": 0.5302, + "step": 6923 + }, + { + "epoch": 0.56, + "grad_norm": 0.85865382887651, + "learning_rate": 8.46088102723643e-06, + "loss": 0.5485, + "step": 6924 + }, + { + "epoch": 0.56, + "grad_norm": 0.9178264914623333, + "learning_rate": 8.458279975147308e-06, + "loss": 0.5013, + "step": 6925 + }, + { + "epoch": 0.56, + "grad_norm": 0.8364225548664053, + "learning_rate": 8.45567902989802e-06, + "loss": 0.4822, + "step": 6926 + }, + { + "epoch": 0.56, + "grad_norm": 0.8802465512047095, + "learning_rate": 8.453078191668806e-06, + "loss": 0.4854, + "step": 6927 + }, + { + "epoch": 0.56, + "grad_norm": 0.9392464025402688, + "learning_rate": 8.450477460639898e-06, + "loss": 0.5663, + "step": 6928 + }, + { + "epoch": 0.56, + "grad_norm": 0.9006510750755666, + "learning_rate": 8.447876836991531e-06, + "loss": 0.5447, + "step": 6929 + }, + { + "epoch": 0.56, + "grad_norm": 0.8320821905591601, + "learning_rate": 8.445276320903922e-06, + "loss": 0.4939, + "step": 6930 + }, + { + "epoch": 0.56, + "grad_norm": 0.8839694642333762, + "learning_rate": 8.442675912557281e-06, + "loss": 0.506, + "step": 6931 + }, + { + "epoch": 0.56, + "grad_norm": 0.9395330291731121, + "learning_rate": 8.440075612131823e-06, + "loss": 0.5072, + "step": 6932 + }, + { + "epoch": 0.56, + "grad_norm": 0.8635009440795924, + "learning_rate": 8.437475419807742e-06, + "loss": 0.5385, + "step": 6933 + }, + { + "epoch": 0.56, + "grad_norm": 0.9849878980592961, + "learning_rate": 8.434875335765222e-06, + "loss": 0.6095, + "step": 6934 + }, + { + "epoch": 0.56, + "grad_norm": 0.9255841893242462, + "learning_rate": 8.432275360184458e-06, + "loss": 0.5174, + "step": 6935 + }, + { + "epoch": 0.56, + "grad_norm": 0.9090364558178271, + "learning_rate": 8.42967549324562e-06, + "loss": 0.5509, + "step": 6936 + }, + { + "epoch": 0.56, + "grad_norm": 1.0063817065902216, + "learning_rate": 8.427075735128874e-06, + "loss": 0.5363, + "step": 6937 + }, + { + "epoch": 0.56, + "grad_norm": 0.9496204700450661, + "learning_rate": 8.42447608601439e-06, + "loss": 0.4875, + "step": 6938 + }, + { + "epoch": 0.56, + "grad_norm": 0.7609053421255205, + "learning_rate": 8.421876546082315e-06, + "loss": 0.4664, + "step": 6939 + }, + { + "epoch": 0.56, + "grad_norm": 0.915322118497608, + "learning_rate": 8.419277115512791e-06, + "loss": 0.5448, + "step": 6940 + }, + { + "epoch": 0.56, + "grad_norm": 0.9743354645399194, + "learning_rate": 8.416677794485965e-06, + "loss": 0.483, + "step": 6941 + }, + { + "epoch": 0.56, + "grad_norm": 0.8938767929650255, + "learning_rate": 8.414078583181963e-06, + "loss": 0.4847, + "step": 6942 + }, + { + "epoch": 0.56, + "grad_norm": 0.8624235874313937, + "learning_rate": 8.411479481780904e-06, + "loss": 0.4361, + "step": 6943 + }, + { + "epoch": 0.56, + "grad_norm": 0.8460237552503656, + "learning_rate": 8.408880490462914e-06, + "loss": 0.5013, + "step": 6944 + }, + { + "epoch": 0.56, + "grad_norm": 0.9505478379663955, + "learning_rate": 8.406281609408094e-06, + "loss": 0.5043, + "step": 6945 + }, + { + "epoch": 0.56, + "grad_norm": 1.0443733816143181, + "learning_rate": 8.403682838796539e-06, + "loss": 0.6496, + "step": 6946 + }, + { + "epoch": 0.56, + "grad_norm": 0.8187946528356171, + "learning_rate": 8.401084178808353e-06, + "loss": 0.529, + "step": 6947 + }, + { + "epoch": 0.56, + "grad_norm": 1.0118247077047577, + "learning_rate": 8.398485629623613e-06, + "loss": 0.6163, + "step": 6948 + }, + { + "epoch": 0.56, + "grad_norm": 0.9438440213096889, + "learning_rate": 8.395887191422397e-06, + "loss": 0.5318, + "step": 6949 + }, + { + "epoch": 0.56, + "grad_norm": 0.8133948811582256, + "learning_rate": 8.393288864384778e-06, + "loss": 0.491, + "step": 6950 + }, + { + "epoch": 0.56, + "grad_norm": 0.8383840381164659, + "learning_rate": 8.390690648690818e-06, + "loss": 0.4626, + "step": 6951 + }, + { + "epoch": 0.57, + "grad_norm": 0.9563077923567581, + "learning_rate": 8.388092544520563e-06, + "loss": 0.4867, + "step": 6952 + }, + { + "epoch": 0.57, + "grad_norm": 0.9220258557852953, + "learning_rate": 8.385494552054069e-06, + "loss": 0.4902, + "step": 6953 + }, + { + "epoch": 0.57, + "grad_norm": 0.8819023030230523, + "learning_rate": 8.38289667147137e-06, + "loss": 0.5122, + "step": 6954 + }, + { + "epoch": 0.57, + "grad_norm": 0.9896701536921275, + "learning_rate": 8.380298902952493e-06, + "loss": 0.4423, + "step": 6955 + }, + { + "epoch": 0.57, + "grad_norm": 0.8947119579675652, + "learning_rate": 8.37770124667747e-06, + "loss": 0.4862, + "step": 6956 + }, + { + "epoch": 0.57, + "grad_norm": 0.884022859472568, + "learning_rate": 8.375103702826313e-06, + "loss": 0.5322, + "step": 6957 + }, + { + "epoch": 0.57, + "grad_norm": 0.8643526429122288, + "learning_rate": 8.372506271579022e-06, + "loss": 0.4419, + "step": 6958 + }, + { + "epoch": 0.57, + "grad_norm": 0.836333281072999, + "learning_rate": 8.369908953115609e-06, + "loss": 0.4877, + "step": 6959 + }, + { + "epoch": 0.57, + "grad_norm": 0.8772669558464992, + "learning_rate": 8.367311747616057e-06, + "loss": 0.5038, + "step": 6960 + }, + { + "epoch": 0.57, + "grad_norm": 0.902382786223706, + "learning_rate": 8.36471465526035e-06, + "loss": 0.5299, + "step": 6961 + }, + { + "epoch": 0.57, + "grad_norm": 0.7776272809054607, + "learning_rate": 8.362117676228471e-06, + "loss": 0.4358, + "step": 6962 + }, + { + "epoch": 0.57, + "grad_norm": 0.8457410241726842, + "learning_rate": 8.359520810700384e-06, + "loss": 0.5294, + "step": 6963 + }, + { + "epoch": 0.57, + "grad_norm": 0.8787627448587005, + "learning_rate": 8.356924058856046e-06, + "loss": 0.5382, + "step": 6964 + }, + { + "epoch": 0.57, + "grad_norm": 0.8641307163943069, + "learning_rate": 8.354327420875416e-06, + "loss": 0.5088, + "step": 6965 + }, + { + "epoch": 0.57, + "grad_norm": 0.9338969195084026, + "learning_rate": 8.351730896938438e-06, + "loss": 0.5405, + "step": 6966 + }, + { + "epoch": 0.57, + "grad_norm": 0.9571853366920003, + "learning_rate": 8.349134487225041e-06, + "loss": 0.5288, + "step": 6967 + }, + { + "epoch": 0.57, + "grad_norm": 0.9484888179387783, + "learning_rate": 8.346538191915166e-06, + "loss": 0.5038, + "step": 6968 + }, + { + "epoch": 0.57, + "grad_norm": 0.9092391023046498, + "learning_rate": 8.343942011188726e-06, + "loss": 0.5155, + "step": 6969 + }, + { + "epoch": 0.57, + "grad_norm": 0.9628755470864837, + "learning_rate": 8.341345945225632e-06, + "loss": 0.5141, + "step": 6970 + }, + { + "epoch": 0.57, + "grad_norm": 0.8925177180147768, + "learning_rate": 8.338749994205797e-06, + "loss": 0.5277, + "step": 6971 + }, + { + "epoch": 0.57, + "grad_norm": 0.8013471751031944, + "learning_rate": 8.336154158309114e-06, + "loss": 0.4862, + "step": 6972 + }, + { + "epoch": 0.57, + "grad_norm": 0.9451259573313375, + "learning_rate": 8.333558437715468e-06, + "loss": 0.6033, + "step": 6973 + }, + { + "epoch": 0.57, + "grad_norm": 0.9367395661597856, + "learning_rate": 8.330962832604747e-06, + "loss": 0.5249, + "step": 6974 + }, + { + "epoch": 0.57, + "grad_norm": 0.9432032175641137, + "learning_rate": 8.328367343156823e-06, + "loss": 0.5391, + "step": 6975 + }, + { + "epoch": 0.57, + "grad_norm": 0.8479723797015235, + "learning_rate": 8.325771969551553e-06, + "loss": 0.5147, + "step": 6976 + }, + { + "epoch": 0.57, + "grad_norm": 0.893248762027732, + "learning_rate": 8.323176711968807e-06, + "loss": 0.5, + "step": 6977 + }, + { + "epoch": 0.57, + "grad_norm": 0.8770357987840728, + "learning_rate": 8.320581570588426e-06, + "loss": 0.5198, + "step": 6978 + }, + { + "epoch": 0.57, + "grad_norm": 0.9602752488044517, + "learning_rate": 8.31798654559025e-06, + "loss": 0.4744, + "step": 6979 + }, + { + "epoch": 0.57, + "grad_norm": 0.9535097302063359, + "learning_rate": 8.315391637154116e-06, + "loss": 0.4729, + "step": 6980 + }, + { + "epoch": 0.57, + "grad_norm": 0.9705726891103388, + "learning_rate": 8.31279684545985e-06, + "loss": 0.5509, + "step": 6981 + }, + { + "epoch": 0.57, + "grad_norm": 0.8861562625840504, + "learning_rate": 8.31020217068726e-06, + "loss": 0.5101, + "step": 6982 + }, + { + "epoch": 0.57, + "grad_norm": 0.9006660475549908, + "learning_rate": 8.307607613016166e-06, + "loss": 0.505, + "step": 6983 + }, + { + "epoch": 0.57, + "grad_norm": 0.9894974040605742, + "learning_rate": 8.305013172626363e-06, + "loss": 0.4981, + "step": 6984 + }, + { + "epoch": 0.57, + "grad_norm": 0.8933836425811029, + "learning_rate": 8.30241884969764e-06, + "loss": 0.5277, + "step": 6985 + }, + { + "epoch": 0.57, + "grad_norm": 0.8515700939920732, + "learning_rate": 8.299824644409787e-06, + "loss": 0.521, + "step": 6986 + }, + { + "epoch": 0.57, + "grad_norm": 0.941090038946153, + "learning_rate": 8.29723055694258e-06, + "loss": 0.5387, + "step": 6987 + }, + { + "epoch": 0.57, + "grad_norm": 0.896364200664156, + "learning_rate": 8.294636587475781e-06, + "loss": 0.609, + "step": 6988 + }, + { + "epoch": 0.57, + "grad_norm": 1.03393686900745, + "learning_rate": 8.292042736189156e-06, + "loss": 0.5685, + "step": 6989 + }, + { + "epoch": 0.57, + "grad_norm": 0.9231576064543081, + "learning_rate": 8.289449003262457e-06, + "loss": 0.5319, + "step": 6990 + }, + { + "epoch": 0.57, + "grad_norm": 0.9946567275629944, + "learning_rate": 8.286855388875418e-06, + "loss": 0.5518, + "step": 6991 + }, + { + "epoch": 0.57, + "grad_norm": 0.886001255462769, + "learning_rate": 8.284261893207788e-06, + "loss": 0.5029, + "step": 6992 + }, + { + "epoch": 0.57, + "grad_norm": 0.9284596693800119, + "learning_rate": 8.281668516439286e-06, + "loss": 0.5258, + "step": 6993 + }, + { + "epoch": 0.57, + "grad_norm": 0.8278539413461194, + "learning_rate": 8.279075258749627e-06, + "loss": 0.4514, + "step": 6994 + }, + { + "epoch": 0.57, + "grad_norm": 0.9555577959427803, + "learning_rate": 8.276482120318532e-06, + "loss": 0.6038, + "step": 6995 + }, + { + "epoch": 0.57, + "grad_norm": 0.8387302642220866, + "learning_rate": 8.273889101325693e-06, + "loss": 0.4807, + "step": 6996 + }, + { + "epoch": 0.57, + "grad_norm": 0.8962507721279136, + "learning_rate": 8.271296201950809e-06, + "loss": 0.5234, + "step": 6997 + }, + { + "epoch": 0.57, + "grad_norm": 0.8712383728560765, + "learning_rate": 8.268703422373564e-06, + "loss": 0.4448, + "step": 6998 + }, + { + "epoch": 0.57, + "grad_norm": 1.0123153230611763, + "learning_rate": 8.266110762773638e-06, + "loss": 0.643, + "step": 6999 + }, + { + "epoch": 0.57, + "grad_norm": 0.935456479745211, + "learning_rate": 8.263518223330698e-06, + "loss": 0.4819, + "step": 7000 + }, + { + "epoch": 0.57, + "grad_norm": 0.8840558016114327, + "learning_rate": 8.260925804224406e-06, + "loss": 0.5238, + "step": 7001 + }, + { + "epoch": 0.57, + "grad_norm": 0.9027702652673076, + "learning_rate": 8.25833350563441e-06, + "loss": 0.4374, + "step": 7002 + }, + { + "epoch": 0.57, + "grad_norm": 0.9705104517200848, + "learning_rate": 8.25574132774036e-06, + "loss": 0.5247, + "step": 7003 + }, + { + "epoch": 0.57, + "grad_norm": 0.93427552403838, + "learning_rate": 8.253149270721889e-06, + "loss": 0.5646, + "step": 7004 + }, + { + "epoch": 0.57, + "grad_norm": 0.9734270631354048, + "learning_rate": 8.250557334758623e-06, + "loss": 0.5581, + "step": 7005 + }, + { + "epoch": 0.57, + "grad_norm": 0.8741399160481841, + "learning_rate": 8.24796552003018e-06, + "loss": 0.5323, + "step": 7006 + }, + { + "epoch": 0.57, + "grad_norm": 0.9273933255218394, + "learning_rate": 8.245373826716176e-06, + "loss": 0.4716, + "step": 7007 + }, + { + "epoch": 0.57, + "grad_norm": 0.8434637395989865, + "learning_rate": 8.242782254996207e-06, + "loss": 0.5463, + "step": 7008 + }, + { + "epoch": 0.57, + "grad_norm": 0.9336333731635436, + "learning_rate": 8.240190805049868e-06, + "loss": 0.548, + "step": 7009 + }, + { + "epoch": 0.57, + "grad_norm": 0.8929443364357782, + "learning_rate": 8.23759947705675e-06, + "loss": 0.5514, + "step": 7010 + }, + { + "epoch": 0.57, + "grad_norm": 0.9201455293723961, + "learning_rate": 8.235008271196421e-06, + "loss": 0.559, + "step": 7011 + }, + { + "epoch": 0.57, + "grad_norm": 0.9102701414183574, + "learning_rate": 8.232417187648454e-06, + "loss": 0.509, + "step": 7012 + }, + { + "epoch": 0.57, + "grad_norm": 0.9264031657521252, + "learning_rate": 8.22982622659241e-06, + "loss": 0.5409, + "step": 7013 + }, + { + "epoch": 0.57, + "grad_norm": 1.0630764564563944, + "learning_rate": 8.227235388207835e-06, + "loss": 0.4508, + "step": 7014 + }, + { + "epoch": 0.57, + "grad_norm": 0.8822532387155293, + "learning_rate": 8.224644672674276e-06, + "loss": 0.5393, + "step": 7015 + }, + { + "epoch": 0.57, + "grad_norm": 0.9174219199288024, + "learning_rate": 8.222054080171267e-06, + "loss": 0.5642, + "step": 7016 + }, + { + "epoch": 0.57, + "grad_norm": 0.864390451108849, + "learning_rate": 8.219463610878336e-06, + "loss": 0.4691, + "step": 7017 + }, + { + "epoch": 0.57, + "grad_norm": 0.8587854032206618, + "learning_rate": 8.216873264974993e-06, + "loss": 0.4795, + "step": 7018 + }, + { + "epoch": 0.57, + "grad_norm": 0.9363362878742622, + "learning_rate": 8.214283042640752e-06, + "loss": 0.5474, + "step": 7019 + }, + { + "epoch": 0.57, + "grad_norm": 0.9144074402642741, + "learning_rate": 8.211692944055116e-06, + "loss": 0.5136, + "step": 7020 + }, + { + "epoch": 0.57, + "grad_norm": 1.032086763222279, + "learning_rate": 8.20910296939757e-06, + "loss": 0.4673, + "step": 7021 + }, + { + "epoch": 0.57, + "grad_norm": 0.917707524800078, + "learning_rate": 8.206513118847598e-06, + "loss": 0.5868, + "step": 7022 + }, + { + "epoch": 0.57, + "grad_norm": 0.922625636653506, + "learning_rate": 8.203923392584676e-06, + "loss": 0.4888, + "step": 7023 + }, + { + "epoch": 0.57, + "grad_norm": 0.8764509529934924, + "learning_rate": 8.201333790788275e-06, + "loss": 0.5328, + "step": 7024 + }, + { + "epoch": 0.57, + "grad_norm": 0.859695349317226, + "learning_rate": 8.198744313637842e-06, + "loss": 0.5474, + "step": 7025 + }, + { + "epoch": 0.57, + "grad_norm": 1.0344486716589807, + "learning_rate": 8.19615496131283e-06, + "loss": 0.5053, + "step": 7026 + }, + { + "epoch": 0.57, + "grad_norm": 0.8837125709121174, + "learning_rate": 8.193565733992684e-06, + "loss": 0.5018, + "step": 7027 + }, + { + "epoch": 0.57, + "grad_norm": 0.8819851186881112, + "learning_rate": 8.190976631856827e-06, + "loss": 0.5101, + "step": 7028 + }, + { + "epoch": 0.57, + "grad_norm": 0.798697172712139, + "learning_rate": 8.188387655084684e-06, + "loss": 0.4678, + "step": 7029 + }, + { + "epoch": 0.57, + "grad_norm": 0.8911852719708158, + "learning_rate": 8.185798803855671e-06, + "loss": 0.5498, + "step": 7030 + }, + { + "epoch": 0.57, + "grad_norm": 0.9231967644330206, + "learning_rate": 8.183210078349191e-06, + "loss": 0.5421, + "step": 7031 + }, + { + "epoch": 0.57, + "grad_norm": 0.8819993157677812, + "learning_rate": 8.180621478744641e-06, + "loss": 0.5196, + "step": 7032 + }, + { + "epoch": 0.57, + "grad_norm": 0.9075262050015743, + "learning_rate": 8.178033005221412e-06, + "loss": 0.4782, + "step": 7033 + }, + { + "epoch": 0.57, + "grad_norm": 0.8737630080614599, + "learning_rate": 8.175444657958875e-06, + "loss": 0.487, + "step": 7034 + }, + { + "epoch": 0.57, + "grad_norm": 0.9154320954425816, + "learning_rate": 8.172856437136407e-06, + "loss": 0.5353, + "step": 7035 + }, + { + "epoch": 0.57, + "grad_norm": 0.9871118216349929, + "learning_rate": 8.170268342933365e-06, + "loss": 0.5067, + "step": 7036 + }, + { + "epoch": 0.57, + "grad_norm": 0.8265507952904249, + "learning_rate": 8.167680375529108e-06, + "loss": 0.531, + "step": 7037 + }, + { + "epoch": 0.57, + "grad_norm": 0.9496077446671609, + "learning_rate": 8.165092535102972e-06, + "loss": 0.5114, + "step": 7038 + }, + { + "epoch": 0.57, + "grad_norm": 0.8783081821659814, + "learning_rate": 8.162504821834296e-06, + "loss": 0.4619, + "step": 7039 + }, + { + "epoch": 0.57, + "grad_norm": 0.8717227626487613, + "learning_rate": 8.159917235902409e-06, + "loss": 0.5163, + "step": 7040 + }, + { + "epoch": 0.57, + "grad_norm": 0.8932338295571399, + "learning_rate": 8.157329777486623e-06, + "loss": 0.5319, + "step": 7041 + }, + { + "epoch": 0.57, + "grad_norm": 0.9199211873496858, + "learning_rate": 8.154742446766249e-06, + "loss": 0.498, + "step": 7042 + }, + { + "epoch": 0.57, + "grad_norm": 0.891227190168023, + "learning_rate": 8.15215524392059e-06, + "loss": 0.4943, + "step": 7043 + }, + { + "epoch": 0.57, + "grad_norm": 0.9412199201992494, + "learning_rate": 8.14956816912893e-06, + "loss": 0.5129, + "step": 7044 + }, + { + "epoch": 0.57, + "grad_norm": 0.8556676533700766, + "learning_rate": 8.146981222570553e-06, + "loss": 0.5195, + "step": 7045 + }, + { + "epoch": 0.57, + "grad_norm": 0.9710161830339895, + "learning_rate": 8.144394404424739e-06, + "loss": 0.5701, + "step": 7046 + }, + { + "epoch": 0.57, + "grad_norm": 0.9812013727215241, + "learning_rate": 8.141807714870743e-06, + "loss": 0.5025, + "step": 7047 + }, + { + "epoch": 0.57, + "grad_norm": 1.6689600561603462, + "learning_rate": 8.139221154087825e-06, + "loss": 0.4416, + "step": 7048 + }, + { + "epoch": 0.57, + "grad_norm": 0.8059420540732499, + "learning_rate": 8.136634722255232e-06, + "loss": 0.4449, + "step": 7049 + }, + { + "epoch": 0.57, + "grad_norm": 0.9333593303331403, + "learning_rate": 8.134048419552197e-06, + "loss": 0.568, + "step": 7050 + }, + { + "epoch": 0.57, + "grad_norm": 0.8472084290656892, + "learning_rate": 8.131462246157953e-06, + "loss": 0.5321, + "step": 7051 + }, + { + "epoch": 0.57, + "grad_norm": 0.9126154074678705, + "learning_rate": 8.128876202251719e-06, + "loss": 0.5109, + "step": 7052 + }, + { + "epoch": 0.57, + "grad_norm": 0.8763009264392355, + "learning_rate": 8.126290288012701e-06, + "loss": 0.5216, + "step": 7053 + }, + { + "epoch": 0.57, + "grad_norm": 0.8264540746375304, + "learning_rate": 8.123704503620107e-06, + "loss": 0.485, + "step": 7054 + }, + { + "epoch": 0.57, + "grad_norm": 0.8936189165840086, + "learning_rate": 8.121118849253127e-06, + "loss": 0.5393, + "step": 7055 + }, + { + "epoch": 0.57, + "grad_norm": 0.8849486877355847, + "learning_rate": 8.11853332509094e-06, + "loss": 0.5256, + "step": 7056 + }, + { + "epoch": 0.57, + "grad_norm": 0.874095324515099, + "learning_rate": 8.11594793131273e-06, + "loss": 0.5495, + "step": 7057 + }, + { + "epoch": 0.57, + "grad_norm": 0.8441694949243681, + "learning_rate": 8.113362668097658e-06, + "loss": 0.4437, + "step": 7058 + }, + { + "epoch": 0.57, + "grad_norm": 0.902798856936396, + "learning_rate": 8.110777535624875e-06, + "loss": 0.5449, + "step": 7059 + }, + { + "epoch": 0.57, + "grad_norm": 0.8255384853373445, + "learning_rate": 8.108192534073534e-06, + "loss": 0.5014, + "step": 7060 + }, + { + "epoch": 0.57, + "grad_norm": 0.8517134480254044, + "learning_rate": 8.105607663622775e-06, + "loss": 0.5029, + "step": 7061 + }, + { + "epoch": 0.57, + "grad_norm": 0.8067488124368722, + "learning_rate": 8.10302292445172e-06, + "loss": 0.452, + "step": 7062 + }, + { + "epoch": 0.57, + "grad_norm": 0.9532445683932202, + "learning_rate": 8.100438316739499e-06, + "loss": 0.5633, + "step": 7063 + }, + { + "epoch": 0.57, + "grad_norm": 0.8412583157335193, + "learning_rate": 8.097853840665217e-06, + "loss": 0.4861, + "step": 7064 + }, + { + "epoch": 0.57, + "grad_norm": 0.9635981222701807, + "learning_rate": 8.095269496407972e-06, + "loss": 0.5369, + "step": 7065 + }, + { + "epoch": 0.57, + "grad_norm": 0.9723101800664249, + "learning_rate": 8.092685284146865e-06, + "loss": 0.5529, + "step": 7066 + }, + { + "epoch": 0.57, + "grad_norm": 0.882965623304785, + "learning_rate": 8.090101204060977e-06, + "loss": 0.548, + "step": 7067 + }, + { + "epoch": 0.57, + "grad_norm": 0.8898000120431133, + "learning_rate": 8.087517256329376e-06, + "loss": 0.5369, + "step": 7068 + }, + { + "epoch": 0.57, + "grad_norm": 0.9130973916070677, + "learning_rate": 8.08493344113114e-06, + "loss": 0.557, + "step": 7069 + }, + { + "epoch": 0.57, + "grad_norm": 0.8234264567707733, + "learning_rate": 8.082349758645316e-06, + "loss": 0.525, + "step": 7070 + }, + { + "epoch": 0.57, + "grad_norm": 0.7937641653928239, + "learning_rate": 8.079766209050947e-06, + "loss": 0.4841, + "step": 7071 + }, + { + "epoch": 0.57, + "grad_norm": 0.875824564412514, + "learning_rate": 8.077182792527082e-06, + "loss": 0.4847, + "step": 7072 + }, + { + "epoch": 0.57, + "grad_norm": 0.9159490950540292, + "learning_rate": 8.074599509252745e-06, + "loss": 0.5131, + "step": 7073 + }, + { + "epoch": 0.57, + "grad_norm": 0.8557200095988629, + "learning_rate": 8.072016359406949e-06, + "loss": 0.511, + "step": 7074 + }, + { + "epoch": 0.58, + "grad_norm": 0.860909775478447, + "learning_rate": 8.069433343168713e-06, + "loss": 0.4982, + "step": 7075 + }, + { + "epoch": 0.58, + "grad_norm": 0.9084085645848757, + "learning_rate": 8.066850460717035e-06, + "loss": 0.4887, + "step": 7076 + }, + { + "epoch": 0.58, + "grad_norm": 0.9552739657622884, + "learning_rate": 8.0642677122309e-06, + "loss": 0.5326, + "step": 7077 + }, + { + "epoch": 0.58, + "grad_norm": 0.8301615599182511, + "learning_rate": 8.0616850978893e-06, + "loss": 0.4942, + "step": 7078 + }, + { + "epoch": 0.58, + "grad_norm": 0.8730009479479265, + "learning_rate": 8.059102617871203e-06, + "loss": 0.4616, + "step": 7079 + }, + { + "epoch": 0.58, + "grad_norm": 0.9619992822577931, + "learning_rate": 8.056520272355571e-06, + "loss": 0.492, + "step": 7080 + }, + { + "epoch": 0.58, + "grad_norm": 0.9328794065066321, + "learning_rate": 8.053938061521363e-06, + "loss": 0.5819, + "step": 7081 + }, + { + "epoch": 0.58, + "grad_norm": 0.8940979839317937, + "learning_rate": 8.05135598554752e-06, + "loss": 0.5396, + "step": 7082 + }, + { + "epoch": 0.58, + "grad_norm": 0.9114092619314533, + "learning_rate": 8.048774044612977e-06, + "loss": 0.4878, + "step": 7083 + }, + { + "epoch": 0.58, + "grad_norm": 0.9038805767427703, + "learning_rate": 8.046192238896665e-06, + "loss": 0.5246, + "step": 7084 + }, + { + "epoch": 0.58, + "grad_norm": 0.9281520876936268, + "learning_rate": 8.043610568577497e-06, + "loss": 0.5246, + "step": 7085 + }, + { + "epoch": 0.58, + "grad_norm": 0.8755547684924566, + "learning_rate": 8.041029033834378e-06, + "loss": 0.5387, + "step": 7086 + }, + { + "epoch": 0.58, + "grad_norm": 1.0413527560484352, + "learning_rate": 8.038447634846214e-06, + "loss": 0.4924, + "step": 7087 + }, + { + "epoch": 0.58, + "grad_norm": 0.9365148241845588, + "learning_rate": 8.035866371791889e-06, + "loss": 0.5573, + "step": 7088 + }, + { + "epoch": 0.58, + "grad_norm": 0.8583013464946238, + "learning_rate": 8.033285244850276e-06, + "loss": 0.4931, + "step": 7089 + }, + { + "epoch": 0.58, + "grad_norm": 0.9798987553897932, + "learning_rate": 8.030704254200256e-06, + "loss": 0.5488, + "step": 7090 + }, + { + "epoch": 0.58, + "grad_norm": 0.8481462707609014, + "learning_rate": 8.028123400020686e-06, + "loss": 0.5436, + "step": 7091 + }, + { + "epoch": 0.58, + "grad_norm": 0.8908057979277204, + "learning_rate": 8.02554268249041e-06, + "loss": 0.5143, + "step": 7092 + }, + { + "epoch": 0.58, + "grad_norm": 0.9223070040189832, + "learning_rate": 8.02296210178828e-06, + "loss": 0.5308, + "step": 7093 + }, + { + "epoch": 0.58, + "grad_norm": 0.8816963595479083, + "learning_rate": 8.02038165809312e-06, + "loss": 0.5327, + "step": 7094 + }, + { + "epoch": 0.58, + "grad_norm": 0.8929250749843264, + "learning_rate": 8.017801351583753e-06, + "loss": 0.4613, + "step": 7095 + }, + { + "epoch": 0.58, + "grad_norm": 0.7983790901220064, + "learning_rate": 8.015221182439e-06, + "loss": 0.4302, + "step": 7096 + }, + { + "epoch": 0.58, + "grad_norm": 0.8384414175374417, + "learning_rate": 8.012641150837656e-06, + "loss": 0.431, + "step": 7097 + }, + { + "epoch": 0.58, + "grad_norm": 0.8930745141481806, + "learning_rate": 8.010061256958515e-06, + "loss": 0.517, + "step": 7098 + }, + { + "epoch": 0.58, + "grad_norm": 0.9267017949110046, + "learning_rate": 8.007481500980366e-06, + "loss": 0.4581, + "step": 7099 + }, + { + "epoch": 0.58, + "grad_norm": 0.8493740892753433, + "learning_rate": 8.004901883081983e-06, + "loss": 0.4934, + "step": 7100 + }, + { + "epoch": 0.58, + "grad_norm": 0.885590654760843, + "learning_rate": 8.002322403442125e-06, + "loss": 0.5524, + "step": 7101 + }, + { + "epoch": 0.58, + "grad_norm": 0.9324619489542131, + "learning_rate": 7.999743062239557e-06, + "loss": 0.5681, + "step": 7102 + }, + { + "epoch": 0.58, + "grad_norm": 0.9608122058291328, + "learning_rate": 7.99716385965302e-06, + "loss": 0.5449, + "step": 7103 + }, + { + "epoch": 0.58, + "grad_norm": 0.8549468729838979, + "learning_rate": 7.994584795861248e-06, + "loss": 0.5272, + "step": 7104 + }, + { + "epoch": 0.58, + "grad_norm": 0.9065242963654259, + "learning_rate": 7.992005871042975e-06, + "loss": 0.5188, + "step": 7105 + }, + { + "epoch": 0.58, + "grad_norm": 0.8642241252773025, + "learning_rate": 7.989427085376914e-06, + "loss": 0.4868, + "step": 7106 + }, + { + "epoch": 0.58, + "grad_norm": 0.8191305164843538, + "learning_rate": 7.986848439041767e-06, + "loss": 0.5231, + "step": 7107 + }, + { + "epoch": 0.58, + "grad_norm": 0.9345831959630203, + "learning_rate": 7.984269932216241e-06, + "loss": 0.5122, + "step": 7108 + }, + { + "epoch": 0.58, + "grad_norm": 2.0591280413854145, + "learning_rate": 7.981691565079024e-06, + "loss": 0.5526, + "step": 7109 + }, + { + "epoch": 0.58, + "grad_norm": 1.0004936234543034, + "learning_rate": 7.979113337808786e-06, + "loss": 0.4942, + "step": 7110 + }, + { + "epoch": 0.58, + "grad_norm": 0.816574338598874, + "learning_rate": 7.976535250584204e-06, + "loss": 0.5098, + "step": 7111 + }, + { + "epoch": 0.58, + "grad_norm": 0.8664437742943795, + "learning_rate": 7.973957303583936e-06, + "loss": 0.5368, + "step": 7112 + }, + { + "epoch": 0.58, + "grad_norm": 0.91862171457887, + "learning_rate": 7.971379496986625e-06, + "loss": 0.5826, + "step": 7113 + }, + { + "epoch": 0.58, + "grad_norm": 0.9417244042887671, + "learning_rate": 7.968801830970917e-06, + "loss": 0.5015, + "step": 7114 + }, + { + "epoch": 0.58, + "grad_norm": 0.9634260358824619, + "learning_rate": 7.966224305715443e-06, + "loss": 0.5266, + "step": 7115 + }, + { + "epoch": 0.58, + "grad_norm": 0.8893242016477338, + "learning_rate": 7.963646921398818e-06, + "loss": 0.4669, + "step": 7116 + }, + { + "epoch": 0.58, + "grad_norm": 0.9680052559082771, + "learning_rate": 7.961069678199658e-06, + "loss": 0.5176, + "step": 7117 + }, + { + "epoch": 0.58, + "grad_norm": 0.8865734175274613, + "learning_rate": 7.95849257629656e-06, + "loss": 0.5046, + "step": 7118 + }, + { + "epoch": 0.58, + "grad_norm": 0.9284345145026058, + "learning_rate": 7.95591561586811e-06, + "loss": 0.5279, + "step": 7119 + }, + { + "epoch": 0.58, + "grad_norm": 0.8619392433080499, + "learning_rate": 7.953338797092902e-06, + "loss": 0.4835, + "step": 7120 + }, + { + "epoch": 0.58, + "grad_norm": 1.0180767658901928, + "learning_rate": 7.950762120149499e-06, + "loss": 0.5636, + "step": 7121 + }, + { + "epoch": 0.58, + "grad_norm": 0.8686763777204012, + "learning_rate": 7.94818558521646e-06, + "loss": 0.5189, + "step": 7122 + }, + { + "epoch": 0.58, + "grad_norm": 0.93918029784971, + "learning_rate": 7.94560919247234e-06, + "loss": 0.4968, + "step": 7123 + }, + { + "epoch": 0.58, + "grad_norm": 0.9442326161843043, + "learning_rate": 7.943032942095685e-06, + "loss": 0.5392, + "step": 7124 + }, + { + "epoch": 0.58, + "grad_norm": 0.9935891972254188, + "learning_rate": 7.940456834265017e-06, + "loss": 0.5106, + "step": 7125 + }, + { + "epoch": 0.58, + "grad_norm": 0.8666080937052979, + "learning_rate": 7.937880869158868e-06, + "loss": 0.4962, + "step": 7126 + }, + { + "epoch": 0.58, + "grad_norm": 0.849261195585096, + "learning_rate": 7.935305046955746e-06, + "loss": 0.5083, + "step": 7127 + }, + { + "epoch": 0.58, + "grad_norm": 0.8464975330500006, + "learning_rate": 7.93272936783415e-06, + "loss": 0.5107, + "step": 7128 + }, + { + "epoch": 0.58, + "grad_norm": 0.9556219695413329, + "learning_rate": 7.930153831972575e-06, + "loss": 0.5619, + "step": 7129 + }, + { + "epoch": 0.58, + "grad_norm": 0.9412515314641338, + "learning_rate": 7.927578439549506e-06, + "loss": 0.5491, + "step": 7130 + }, + { + "epoch": 0.58, + "grad_norm": 0.8079216989915801, + "learning_rate": 7.92500319074341e-06, + "loss": 0.5057, + "step": 7131 + }, + { + "epoch": 0.58, + "grad_norm": 0.9025189985626063, + "learning_rate": 7.922428085732755e-06, + "loss": 0.5127, + "step": 7132 + }, + { + "epoch": 0.58, + "grad_norm": 0.9482220895231857, + "learning_rate": 7.919853124695993e-06, + "loss": 0.4783, + "step": 7133 + }, + { + "epoch": 0.58, + "grad_norm": 0.8388943639401142, + "learning_rate": 7.917278307811557e-06, + "loss": 0.4902, + "step": 7134 + }, + { + "epoch": 0.58, + "grad_norm": 0.8968964756970629, + "learning_rate": 7.914703635257892e-06, + "loss": 0.4887, + "step": 7135 + }, + { + "epoch": 0.58, + "grad_norm": 0.8833263700487893, + "learning_rate": 7.912129107213417e-06, + "loss": 0.5087, + "step": 7136 + }, + { + "epoch": 0.58, + "grad_norm": 0.8963174813537905, + "learning_rate": 7.909554723856537e-06, + "loss": 0.5032, + "step": 7137 + }, + { + "epoch": 0.58, + "grad_norm": 0.9074153912009865, + "learning_rate": 7.906980485365665e-06, + "loss": 0.5232, + "step": 7138 + }, + { + "epoch": 0.58, + "grad_norm": 0.8283767399063273, + "learning_rate": 7.90440639191919e-06, + "loss": 0.4602, + "step": 7139 + }, + { + "epoch": 0.58, + "grad_norm": 0.8903407901604158, + "learning_rate": 7.901832443695487e-06, + "loss": 0.5257, + "step": 7140 + }, + { + "epoch": 0.58, + "grad_norm": 0.9091972597395084, + "learning_rate": 7.899258640872942e-06, + "loss": 0.5404, + "step": 7141 + }, + { + "epoch": 0.58, + "grad_norm": 1.0976786999602899, + "learning_rate": 7.896684983629907e-06, + "loss": 0.5162, + "step": 7142 + }, + { + "epoch": 0.58, + "grad_norm": 0.8003995289691711, + "learning_rate": 7.894111472144733e-06, + "loss": 0.4649, + "step": 7143 + }, + { + "epoch": 0.58, + "grad_norm": 0.8420642146592251, + "learning_rate": 7.89153810659577e-06, + "loss": 0.5088, + "step": 7144 + }, + { + "epoch": 0.58, + "grad_norm": 1.0426511293723346, + "learning_rate": 7.888964887161348e-06, + "loss": 0.576, + "step": 7145 + }, + { + "epoch": 0.58, + "grad_norm": 0.9027421728823426, + "learning_rate": 7.886391814019782e-06, + "loss": 0.5263, + "step": 7146 + }, + { + "epoch": 0.58, + "grad_norm": 0.9367759837162823, + "learning_rate": 7.883818887349391e-06, + "loss": 0.5385, + "step": 7147 + }, + { + "epoch": 0.58, + "grad_norm": 0.8860146610850669, + "learning_rate": 7.881246107328472e-06, + "loss": 0.4902, + "step": 7148 + }, + { + "epoch": 0.58, + "grad_norm": 0.9387255568652029, + "learning_rate": 7.878673474135321e-06, + "loss": 0.5192, + "step": 7149 + }, + { + "epoch": 0.58, + "grad_norm": 0.9864023313146227, + "learning_rate": 7.876100987948217e-06, + "loss": 0.588, + "step": 7150 + }, + { + "epoch": 0.58, + "grad_norm": 0.994880359320626, + "learning_rate": 7.873528648945428e-06, + "loss": 0.5507, + "step": 7151 + }, + { + "epoch": 0.58, + "grad_norm": 0.8755218895848427, + "learning_rate": 7.87095645730522e-06, + "loss": 0.5207, + "step": 7152 + }, + { + "epoch": 0.58, + "grad_norm": 0.8424039240765442, + "learning_rate": 7.868384413205842e-06, + "loss": 0.5149, + "step": 7153 + }, + { + "epoch": 0.58, + "grad_norm": 0.9298680000054024, + "learning_rate": 7.865812516825528e-06, + "loss": 0.4513, + "step": 7154 + }, + { + "epoch": 0.58, + "grad_norm": 0.9378542402912848, + "learning_rate": 7.863240768342518e-06, + "loss": 0.56, + "step": 7155 + }, + { + "epoch": 0.58, + "grad_norm": 0.941553399591606, + "learning_rate": 7.860669167935028e-06, + "loss": 0.5182, + "step": 7156 + }, + { + "epoch": 0.58, + "grad_norm": 0.9906634051079232, + "learning_rate": 7.858097715781264e-06, + "loss": 0.5827, + "step": 7157 + }, + { + "epoch": 0.58, + "grad_norm": 0.8681871023458261, + "learning_rate": 7.85552641205943e-06, + "loss": 0.5365, + "step": 7158 + }, + { + "epoch": 0.58, + "grad_norm": 0.8604262563644228, + "learning_rate": 7.852955256947717e-06, + "loss": 0.4941, + "step": 7159 + }, + { + "epoch": 0.58, + "grad_norm": 0.9035245788633994, + "learning_rate": 7.850384250624293e-06, + "loss": 0.4967, + "step": 7160 + }, + { + "epoch": 0.58, + "grad_norm": 0.9448437606155281, + "learning_rate": 7.847813393267338e-06, + "loss": 0.5439, + "step": 7161 + }, + { + "epoch": 0.58, + "grad_norm": 0.8518107595401718, + "learning_rate": 7.845242685055008e-06, + "loss": 0.4615, + "step": 7162 + }, + { + "epoch": 0.58, + "grad_norm": 0.9074697637525523, + "learning_rate": 7.842672126165443e-06, + "loss": 0.545, + "step": 7163 + }, + { + "epoch": 0.58, + "grad_norm": 0.8956320738778516, + "learning_rate": 7.84010171677679e-06, + "loss": 0.5295, + "step": 7164 + }, + { + "epoch": 0.58, + "grad_norm": 0.9930685920080908, + "learning_rate": 7.837531457067171e-06, + "loss": 0.5653, + "step": 7165 + }, + { + "epoch": 0.58, + "grad_norm": 0.9672444191250434, + "learning_rate": 7.834961347214704e-06, + "loss": 0.496, + "step": 7166 + }, + { + "epoch": 0.58, + "grad_norm": 0.9470240702819805, + "learning_rate": 7.832391387397495e-06, + "loss": 0.5262, + "step": 7167 + }, + { + "epoch": 0.58, + "grad_norm": 0.890014574596134, + "learning_rate": 7.829821577793642e-06, + "loss": 0.5246, + "step": 7168 + }, + { + "epoch": 0.58, + "grad_norm": 0.9901783082879074, + "learning_rate": 7.827251918581225e-06, + "loss": 0.6147, + "step": 7169 + }, + { + "epoch": 0.58, + "grad_norm": 0.8810342910057687, + "learning_rate": 7.824682409938328e-06, + "loss": 0.5329, + "step": 7170 + }, + { + "epoch": 0.58, + "grad_norm": 0.9177398262818202, + "learning_rate": 7.822113052043007e-06, + "loss": 0.5194, + "step": 7171 + }, + { + "epoch": 0.58, + "grad_norm": 0.9514360356116424, + "learning_rate": 7.819543845073319e-06, + "loss": 0.5121, + "step": 7172 + }, + { + "epoch": 0.58, + "grad_norm": 0.9172146908279984, + "learning_rate": 7.816974789207311e-06, + "loss": 0.4944, + "step": 7173 + }, + { + "epoch": 0.58, + "grad_norm": 0.8682178687211466, + "learning_rate": 7.814405884623012e-06, + "loss": 0.5039, + "step": 7174 + }, + { + "epoch": 0.58, + "grad_norm": 0.8708333791973569, + "learning_rate": 7.811837131498448e-06, + "loss": 0.5385, + "step": 7175 + }, + { + "epoch": 0.58, + "grad_norm": 0.8474069428549672, + "learning_rate": 7.80926853001163e-06, + "loss": 0.4809, + "step": 7176 + }, + { + "epoch": 0.58, + "grad_norm": 0.9659314208908955, + "learning_rate": 7.806700080340558e-06, + "loss": 0.5581, + "step": 7177 + }, + { + "epoch": 0.58, + "grad_norm": 1.0070230808243348, + "learning_rate": 7.804131782663224e-06, + "loss": 0.5707, + "step": 7178 + }, + { + "epoch": 0.58, + "grad_norm": 0.9345471374137605, + "learning_rate": 7.801563637157614e-06, + "loss": 0.5631, + "step": 7179 + }, + { + "epoch": 0.58, + "grad_norm": 0.8770225209942285, + "learning_rate": 7.79899564400169e-06, + "loss": 0.5538, + "step": 7180 + }, + { + "epoch": 0.58, + "grad_norm": 0.9830268920995888, + "learning_rate": 7.796427803373416e-06, + "loss": 0.5664, + "step": 7181 + }, + { + "epoch": 0.58, + "grad_norm": 0.9248689772553825, + "learning_rate": 7.793860115450744e-06, + "loss": 0.5054, + "step": 7182 + }, + { + "epoch": 0.58, + "grad_norm": 0.9004370145255842, + "learning_rate": 7.791292580411606e-06, + "loss": 0.4964, + "step": 7183 + }, + { + "epoch": 0.58, + "grad_norm": 0.8716515992647562, + "learning_rate": 7.788725198433933e-06, + "loss": 0.4919, + "step": 7184 + }, + { + "epoch": 0.58, + "grad_norm": 0.8169934980297398, + "learning_rate": 7.786157969695643e-06, + "loss": 0.4821, + "step": 7185 + }, + { + "epoch": 0.58, + "grad_norm": 0.8647980879138384, + "learning_rate": 7.783590894374642e-06, + "loss": 0.5286, + "step": 7186 + }, + { + "epoch": 0.58, + "grad_norm": 0.8614129816503232, + "learning_rate": 7.781023972648826e-06, + "loss": 0.4908, + "step": 7187 + }, + { + "epoch": 0.58, + "grad_norm": 0.9324921763996864, + "learning_rate": 7.778457204696082e-06, + "loss": 0.5547, + "step": 7188 + }, + { + "epoch": 0.58, + "grad_norm": 0.9111081523908714, + "learning_rate": 7.775890590694283e-06, + "loss": 0.4669, + "step": 7189 + }, + { + "epoch": 0.58, + "grad_norm": 0.9610565102640668, + "learning_rate": 7.77332413082129e-06, + "loss": 0.5295, + "step": 7190 + }, + { + "epoch": 0.58, + "grad_norm": 0.9293235516497907, + "learning_rate": 7.770757825254962e-06, + "loss": 0.5041, + "step": 7191 + }, + { + "epoch": 0.58, + "grad_norm": 0.9121050505057278, + "learning_rate": 7.768191674173142e-06, + "loss": 0.5348, + "step": 7192 + }, + { + "epoch": 0.58, + "grad_norm": 0.7854722354449046, + "learning_rate": 7.765625677753656e-06, + "loss": 0.4922, + "step": 7193 + }, + { + "epoch": 0.58, + "grad_norm": 0.9010995980188043, + "learning_rate": 7.763059836174329e-06, + "loss": 0.5002, + "step": 7194 + }, + { + "epoch": 0.58, + "grad_norm": 0.8549641547968623, + "learning_rate": 7.760494149612971e-06, + "loss": 0.4898, + "step": 7195 + }, + { + "epoch": 0.58, + "grad_norm": 0.8698049730542157, + "learning_rate": 7.757928618247384e-06, + "loss": 0.4968, + "step": 7196 + }, + { + "epoch": 0.58, + "grad_norm": 0.9109125156970205, + "learning_rate": 7.755363242255352e-06, + "loss": 0.5399, + "step": 7197 + }, + { + "epoch": 0.59, + "grad_norm": 0.9878217858400418, + "learning_rate": 7.752798021814659e-06, + "loss": 0.56, + "step": 7198 + }, + { + "epoch": 0.59, + "grad_norm": 0.9468859034587981, + "learning_rate": 7.750232957103068e-06, + "loss": 0.4919, + "step": 7199 + }, + { + "epoch": 0.59, + "grad_norm": 0.9303700707294289, + "learning_rate": 7.747668048298338e-06, + "loss": 0.572, + "step": 7200 + }, + { + "epoch": 0.59, + "grad_norm": 0.9098675738960599, + "learning_rate": 7.745103295578216e-06, + "loss": 0.5466, + "step": 7201 + }, + { + "epoch": 0.59, + "grad_norm": 0.9343802851406701, + "learning_rate": 7.74253869912043e-06, + "loss": 0.5386, + "step": 7202 + }, + { + "epoch": 0.59, + "grad_norm": 0.9476829141758246, + "learning_rate": 7.739974259102716e-06, + "loss": 0.5189, + "step": 7203 + }, + { + "epoch": 0.59, + "grad_norm": 0.7783194872808666, + "learning_rate": 7.73740997570278e-06, + "loss": 0.4538, + "step": 7204 + }, + { + "epoch": 0.59, + "grad_norm": 1.1009354505492426, + "learning_rate": 7.734845849098324e-06, + "loss": 0.5546, + "step": 7205 + }, + { + "epoch": 0.59, + "grad_norm": 0.9266294852249874, + "learning_rate": 7.732281879467043e-06, + "loss": 0.5376, + "step": 7206 + }, + { + "epoch": 0.59, + "grad_norm": 0.8711521298481469, + "learning_rate": 7.729718066986617e-06, + "loss": 0.496, + "step": 7207 + }, + { + "epoch": 0.59, + "grad_norm": 0.8892162259369395, + "learning_rate": 7.727154411834712e-06, + "loss": 0.483, + "step": 7208 + }, + { + "epoch": 0.59, + "grad_norm": 0.9028078827519032, + "learning_rate": 7.724590914188994e-06, + "loss": 0.5044, + "step": 7209 + }, + { + "epoch": 0.59, + "grad_norm": 0.9815763452473252, + "learning_rate": 7.722027574227107e-06, + "loss": 0.5594, + "step": 7210 + }, + { + "epoch": 0.59, + "grad_norm": 0.8967294073556648, + "learning_rate": 7.719464392126684e-06, + "loss": 0.5169, + "step": 7211 + }, + { + "epoch": 0.59, + "grad_norm": 0.9065002695688444, + "learning_rate": 7.71690136806536e-06, + "loss": 0.5255, + "step": 7212 + }, + { + "epoch": 0.59, + "grad_norm": 0.9222558351013072, + "learning_rate": 7.714338502220746e-06, + "loss": 0.5538, + "step": 7213 + }, + { + "epoch": 0.59, + "grad_norm": 0.8671006891727058, + "learning_rate": 7.711775794770443e-06, + "loss": 0.5086, + "step": 7214 + }, + { + "epoch": 0.59, + "grad_norm": 1.0232282025989965, + "learning_rate": 7.709213245892051e-06, + "loss": 0.548, + "step": 7215 + }, + { + "epoch": 0.59, + "grad_norm": 0.9529100879092692, + "learning_rate": 7.70665085576315e-06, + "loss": 0.5342, + "step": 7216 + }, + { + "epoch": 0.59, + "grad_norm": 0.9734892020915824, + "learning_rate": 7.704088624561306e-06, + "loss": 0.5127, + "step": 7217 + }, + { + "epoch": 0.59, + "grad_norm": 0.9270597288383303, + "learning_rate": 7.701526552464087e-06, + "loss": 0.5173, + "step": 7218 + }, + { + "epoch": 0.59, + "grad_norm": 0.8720870889009098, + "learning_rate": 7.698964639649041e-06, + "loss": 0.4809, + "step": 7219 + }, + { + "epoch": 0.59, + "grad_norm": 0.923839288776839, + "learning_rate": 7.6964028862937e-06, + "loss": 0.5046, + "step": 7220 + }, + { + "epoch": 0.59, + "grad_norm": 1.02966338890196, + "learning_rate": 7.6938412925756e-06, + "loss": 0.5881, + "step": 7221 + }, + { + "epoch": 0.59, + "grad_norm": 0.9594420898093694, + "learning_rate": 7.691279858672252e-06, + "loss": 0.5048, + "step": 7222 + }, + { + "epoch": 0.59, + "grad_norm": 0.9115683816055996, + "learning_rate": 7.688718584761158e-06, + "loss": 0.4904, + "step": 7223 + }, + { + "epoch": 0.59, + "grad_norm": 0.9054139733472941, + "learning_rate": 7.68615747101982e-06, + "loss": 0.5241, + "step": 7224 + }, + { + "epoch": 0.59, + "grad_norm": 0.7437088104314823, + "learning_rate": 7.683596517625716e-06, + "loss": 0.4348, + "step": 7225 + }, + { + "epoch": 0.59, + "grad_norm": 0.8783371843681202, + "learning_rate": 7.681035724756317e-06, + "loss": 0.5548, + "step": 7226 + }, + { + "epoch": 0.59, + "grad_norm": 0.9830512757805387, + "learning_rate": 7.678475092589088e-06, + "loss": 0.5338, + "step": 7227 + }, + { + "epoch": 0.59, + "grad_norm": 0.8493114744220931, + "learning_rate": 7.675914621301476e-06, + "loss": 0.4998, + "step": 7228 + }, + { + "epoch": 0.59, + "grad_norm": 0.9992033936519503, + "learning_rate": 7.673354311070914e-06, + "loss": 0.5458, + "step": 7229 + }, + { + "epoch": 0.59, + "grad_norm": 0.8176998664379581, + "learning_rate": 7.67079416207484e-06, + "loss": 0.4448, + "step": 7230 + }, + { + "epoch": 0.59, + "grad_norm": 0.8844593418859039, + "learning_rate": 7.668234174490664e-06, + "loss": 0.508, + "step": 7231 + }, + { + "epoch": 0.59, + "grad_norm": 0.8417371432578538, + "learning_rate": 7.665674348495788e-06, + "loss": 0.4771, + "step": 7232 + }, + { + "epoch": 0.59, + "grad_norm": 0.9956057436389267, + "learning_rate": 7.663114684267612e-06, + "loss": 0.5216, + "step": 7233 + }, + { + "epoch": 0.59, + "grad_norm": 0.9428429486313148, + "learning_rate": 7.660555181983517e-06, + "loss": 0.4689, + "step": 7234 + }, + { + "epoch": 0.59, + "grad_norm": 0.9375806767617715, + "learning_rate": 7.657995841820869e-06, + "loss": 0.5296, + "step": 7235 + }, + { + "epoch": 0.59, + "grad_norm": 0.8450786498494435, + "learning_rate": 7.655436663957035e-06, + "loss": 0.4782, + "step": 7236 + }, + { + "epoch": 0.59, + "grad_norm": 0.9589117881742945, + "learning_rate": 7.65287764856936e-06, + "loss": 0.5631, + "step": 7237 + }, + { + "epoch": 0.59, + "grad_norm": 0.9600384970168856, + "learning_rate": 7.650318795835179e-06, + "loss": 0.5248, + "step": 7238 + }, + { + "epoch": 0.59, + "grad_norm": 0.9766270180046419, + "learning_rate": 7.647760105931825e-06, + "loss": 0.5195, + "step": 7239 + }, + { + "epoch": 0.59, + "grad_norm": 1.0108489371038865, + "learning_rate": 7.64520157903661e-06, + "loss": 0.5331, + "step": 7240 + }, + { + "epoch": 0.59, + "grad_norm": 0.9379986049299248, + "learning_rate": 7.64264321532683e-06, + "loss": 0.4585, + "step": 7241 + }, + { + "epoch": 0.59, + "grad_norm": 0.9131980397882025, + "learning_rate": 7.640085014979792e-06, + "loss": 0.5732, + "step": 7242 + }, + { + "epoch": 0.59, + "grad_norm": 0.9529997529146212, + "learning_rate": 7.637526978172767e-06, + "loss": 0.5263, + "step": 7243 + }, + { + "epoch": 0.59, + "grad_norm": 0.8851984451396883, + "learning_rate": 7.634969105083023e-06, + "loss": 0.5099, + "step": 7244 + }, + { + "epoch": 0.59, + "grad_norm": 0.9883433976165381, + "learning_rate": 7.632411395887826e-06, + "loss": 0.553, + "step": 7245 + }, + { + "epoch": 0.59, + "grad_norm": 0.9934441008382426, + "learning_rate": 7.62985385076442e-06, + "loss": 0.543, + "step": 7246 + }, + { + "epoch": 0.59, + "grad_norm": 0.9492112256678701, + "learning_rate": 7.6272964698900356e-06, + "loss": 0.5126, + "step": 7247 + }, + { + "epoch": 0.59, + "grad_norm": 0.8564922898626104, + "learning_rate": 7.624739253441905e-06, + "loss": 0.5169, + "step": 7248 + }, + { + "epoch": 0.59, + "grad_norm": 1.0115163408161787, + "learning_rate": 7.622182201597238e-06, + "loss": 0.5686, + "step": 7249 + }, + { + "epoch": 0.59, + "grad_norm": 0.9392761542496402, + "learning_rate": 7.619625314533231e-06, + "loss": 0.5365, + "step": 7250 + }, + { + "epoch": 0.59, + "grad_norm": 0.8739873176329055, + "learning_rate": 7.6170685924270815e-06, + "loss": 0.5418, + "step": 7251 + }, + { + "epoch": 0.59, + "grad_norm": 0.8558631122985492, + "learning_rate": 7.6145120354559666e-06, + "loss": 0.5435, + "step": 7252 + }, + { + "epoch": 0.59, + "grad_norm": 0.9014633753616382, + "learning_rate": 7.611955643797046e-06, + "loss": 0.5492, + "step": 7253 + }, + { + "epoch": 0.59, + "grad_norm": 1.0011582929357539, + "learning_rate": 7.609399417627486e-06, + "loss": 0.6088, + "step": 7254 + }, + { + "epoch": 0.59, + "grad_norm": 0.875694028910975, + "learning_rate": 7.606843357124426e-06, + "loss": 0.534, + "step": 7255 + }, + { + "epoch": 0.59, + "grad_norm": 0.8900901373620113, + "learning_rate": 7.604287462464995e-06, + "loss": 0.5299, + "step": 7256 + }, + { + "epoch": 0.59, + "grad_norm": 0.8280069475864995, + "learning_rate": 7.60173173382632e-06, + "loss": 0.4966, + "step": 7257 + }, + { + "epoch": 0.59, + "grad_norm": 0.880686332922401, + "learning_rate": 7.599176171385509e-06, + "loss": 0.5073, + "step": 7258 + }, + { + "epoch": 0.59, + "grad_norm": 0.8840388220644066, + "learning_rate": 7.5966207753196574e-06, + "loss": 0.5394, + "step": 7259 + }, + { + "epoch": 0.59, + "grad_norm": 0.8961429378494427, + "learning_rate": 7.5940655458058575e-06, + "loss": 0.4807, + "step": 7260 + }, + { + "epoch": 0.59, + "grad_norm": 0.9821194016091872, + "learning_rate": 7.59151048302118e-06, + "loss": 0.5458, + "step": 7261 + }, + { + "epoch": 0.59, + "grad_norm": 0.9752484428211466, + "learning_rate": 7.588955587142688e-06, + "loss": 0.516, + "step": 7262 + }, + { + "epoch": 0.59, + "grad_norm": 0.9843824414210166, + "learning_rate": 7.586400858347438e-06, + "loss": 0.5626, + "step": 7263 + }, + { + "epoch": 0.59, + "grad_norm": 0.8707842938023541, + "learning_rate": 7.583846296812467e-06, + "loss": 0.4788, + "step": 7264 + }, + { + "epoch": 0.59, + "grad_norm": 0.8926431378262193, + "learning_rate": 7.581291902714801e-06, + "loss": 0.4816, + "step": 7265 + }, + { + "epoch": 0.59, + "grad_norm": 0.8237824939681814, + "learning_rate": 7.578737676231466e-06, + "loss": 0.508, + "step": 7266 + }, + { + "epoch": 0.59, + "grad_norm": 0.9437865429552902, + "learning_rate": 7.576183617539461e-06, + "loss": 0.5317, + "step": 7267 + }, + { + "epoch": 0.59, + "grad_norm": 0.9322937825461017, + "learning_rate": 7.573629726815778e-06, + "loss": 0.5568, + "step": 7268 + }, + { + "epoch": 0.59, + "grad_norm": 0.8211316207432359, + "learning_rate": 7.5710760042374056e-06, + "loss": 0.4647, + "step": 7269 + }, + { + "epoch": 0.59, + "grad_norm": 0.8637140877102417, + "learning_rate": 7.568522449981308e-06, + "loss": 0.4874, + "step": 7270 + }, + { + "epoch": 0.59, + "grad_norm": 0.8855791028344088, + "learning_rate": 7.565969064224453e-06, + "loss": 0.5189, + "step": 7271 + }, + { + "epoch": 0.59, + "grad_norm": 1.014135661037069, + "learning_rate": 7.563415847143782e-06, + "loss": 0.5775, + "step": 7272 + }, + { + "epoch": 0.59, + "grad_norm": 0.9497931596687522, + "learning_rate": 7.560862798916229e-06, + "loss": 0.5442, + "step": 7273 + }, + { + "epoch": 0.59, + "grad_norm": 0.9961094952611982, + "learning_rate": 7.558309919718723e-06, + "loss": 0.5498, + "step": 7274 + }, + { + "epoch": 0.59, + "grad_norm": 0.8910273913576999, + "learning_rate": 7.555757209728174e-06, + "loss": 0.5109, + "step": 7275 + }, + { + "epoch": 0.59, + "grad_norm": 0.8401278429671708, + "learning_rate": 7.553204669121478e-06, + "loss": 0.5287, + "step": 7276 + }, + { + "epoch": 0.59, + "grad_norm": 1.0258306091345222, + "learning_rate": 7.550652298075532e-06, + "loss": 0.5642, + "step": 7277 + }, + { + "epoch": 0.59, + "grad_norm": 1.0388928356690446, + "learning_rate": 7.54810009676721e-06, + "loss": 0.5273, + "step": 7278 + }, + { + "epoch": 0.59, + "grad_norm": 0.894137035048821, + "learning_rate": 7.545548065373372e-06, + "loss": 0.5418, + "step": 7279 + }, + { + "epoch": 0.59, + "grad_norm": 0.8520592226061066, + "learning_rate": 7.54299620407088e-06, + "loss": 0.5039, + "step": 7280 + }, + { + "epoch": 0.59, + "grad_norm": 0.9289674724415184, + "learning_rate": 7.540444513036572e-06, + "loss": 0.5908, + "step": 7281 + }, + { + "epoch": 0.59, + "grad_norm": 0.8688189308375063, + "learning_rate": 7.5378929924472735e-06, + "loss": 0.5631, + "step": 7282 + }, + { + "epoch": 0.59, + "grad_norm": 0.9200999552756037, + "learning_rate": 7.535341642479811e-06, + "loss": 0.5082, + "step": 7283 + }, + { + "epoch": 0.59, + "grad_norm": 0.9210146899679073, + "learning_rate": 7.532790463310986e-06, + "loss": 0.5396, + "step": 7284 + }, + { + "epoch": 0.59, + "grad_norm": 0.9137799113440104, + "learning_rate": 7.530239455117589e-06, + "loss": 0.5432, + "step": 7285 + }, + { + "epoch": 0.59, + "grad_norm": 0.9363505809898554, + "learning_rate": 7.527688618076413e-06, + "loss": 0.5339, + "step": 7286 + }, + { + "epoch": 0.59, + "grad_norm": 0.9763535749511509, + "learning_rate": 7.525137952364222e-06, + "loss": 0.5225, + "step": 7287 + }, + { + "epoch": 0.59, + "grad_norm": 0.8645863285917511, + "learning_rate": 7.522587458157771e-06, + "loss": 0.518, + "step": 7288 + }, + { + "epoch": 0.59, + "grad_norm": 0.8407841151958855, + "learning_rate": 7.520037135633817e-06, + "loss": 0.4463, + "step": 7289 + }, + { + "epoch": 0.59, + "grad_norm": 0.9285178402786097, + "learning_rate": 7.517486984969088e-06, + "loss": 0.5605, + "step": 7290 + }, + { + "epoch": 0.59, + "grad_norm": 0.8945561346406458, + "learning_rate": 7.514937006340306e-06, + "loss": 0.5643, + "step": 7291 + }, + { + "epoch": 0.59, + "grad_norm": 0.8963075832585806, + "learning_rate": 7.512387199924189e-06, + "loss": 0.5199, + "step": 7292 + }, + { + "epoch": 0.59, + "grad_norm": 0.9563988735599971, + "learning_rate": 7.509837565897432e-06, + "loss": 0.5658, + "step": 7293 + }, + { + "epoch": 0.59, + "grad_norm": 0.9423292581170268, + "learning_rate": 7.507288104436719e-06, + "loss": 0.5352, + "step": 7294 + }, + { + "epoch": 0.59, + "grad_norm": 0.992898315830082, + "learning_rate": 7.504738815718734e-06, + "loss": 0.5415, + "step": 7295 + }, + { + "epoch": 0.59, + "grad_norm": 0.9020731368058453, + "learning_rate": 7.502189699920136e-06, + "loss": 0.5312, + "step": 7296 + }, + { + "epoch": 0.59, + "grad_norm": 0.9363841774035592, + "learning_rate": 7.499640757217572e-06, + "loss": 0.5127, + "step": 7297 + }, + { + "epoch": 0.59, + "grad_norm": 0.8631830942622, + "learning_rate": 7.497091987787689e-06, + "loss": 0.4652, + "step": 7298 + }, + { + "epoch": 0.59, + "grad_norm": 0.9029696039606694, + "learning_rate": 7.494543391807112e-06, + "loss": 0.469, + "step": 7299 + }, + { + "epoch": 0.59, + "grad_norm": 1.0395322395699418, + "learning_rate": 7.4919949694524506e-06, + "loss": 0.5787, + "step": 7300 + }, + { + "epoch": 0.59, + "grad_norm": 0.9576457627425172, + "learning_rate": 7.489446720900319e-06, + "loss": 0.541, + "step": 7301 + }, + { + "epoch": 0.59, + "grad_norm": 0.9116151576314977, + "learning_rate": 7.486898646327301e-06, + "loss": 0.514, + "step": 7302 + }, + { + "epoch": 0.59, + "grad_norm": 0.9491659654888721, + "learning_rate": 7.484350745909974e-06, + "loss": 0.497, + "step": 7303 + }, + { + "epoch": 0.59, + "grad_norm": 0.9257407304664244, + "learning_rate": 7.481803019824914e-06, + "loss": 0.5212, + "step": 7304 + }, + { + "epoch": 0.59, + "grad_norm": 0.830204046262912, + "learning_rate": 7.47925546824867e-06, + "loss": 0.5119, + "step": 7305 + }, + { + "epoch": 0.59, + "grad_norm": 0.9099262225306468, + "learning_rate": 7.476708091357783e-06, + "loss": 0.5123, + "step": 7306 + }, + { + "epoch": 0.59, + "grad_norm": 0.9658265435756943, + "learning_rate": 7.47416088932879e-06, + "loss": 0.5334, + "step": 7307 + }, + { + "epoch": 0.59, + "grad_norm": 0.9570047689135044, + "learning_rate": 7.471613862338207e-06, + "loss": 0.5118, + "step": 7308 + }, + { + "epoch": 0.59, + "grad_norm": 0.8938157106055836, + "learning_rate": 7.469067010562538e-06, + "loss": 0.5066, + "step": 7309 + }, + { + "epoch": 0.59, + "grad_norm": 0.9216259117988101, + "learning_rate": 7.466520334178284e-06, + "loss": 0.4809, + "step": 7310 + }, + { + "epoch": 0.59, + "grad_norm": 0.9052948098015442, + "learning_rate": 7.463973833361923e-06, + "loss": 0.5192, + "step": 7311 + }, + { + "epoch": 0.59, + "grad_norm": 0.9010973814962012, + "learning_rate": 7.461427508289922e-06, + "loss": 0.4918, + "step": 7312 + }, + { + "epoch": 0.59, + "grad_norm": 0.8954927965727761, + "learning_rate": 7.458881359138746e-06, + "loss": 0.4777, + "step": 7313 + }, + { + "epoch": 0.59, + "grad_norm": 0.9163133645892921, + "learning_rate": 7.4563353860848375e-06, + "loss": 0.5013, + "step": 7314 + }, + { + "epoch": 0.59, + "grad_norm": 0.9584330383366784, + "learning_rate": 7.453789589304629e-06, + "loss": 0.5731, + "step": 7315 + }, + { + "epoch": 0.59, + "grad_norm": 0.8238821401967524, + "learning_rate": 7.451243968974547e-06, + "loss": 0.4768, + "step": 7316 + }, + { + "epoch": 0.59, + "grad_norm": 0.9103939120287774, + "learning_rate": 7.448698525270995e-06, + "loss": 0.54, + "step": 7317 + }, + { + "epoch": 0.59, + "grad_norm": 0.8802397289849867, + "learning_rate": 7.446153258370372e-06, + "loss": 0.489, + "step": 7318 + }, + { + "epoch": 0.59, + "grad_norm": 0.9323534952947754, + "learning_rate": 7.443608168449063e-06, + "loss": 0.5387, + "step": 7319 + }, + { + "epoch": 0.59, + "grad_norm": 0.9041005321484384, + "learning_rate": 7.44106325568344e-06, + "loss": 0.5025, + "step": 7320 + }, + { + "epoch": 0.6, + "grad_norm": 0.9351837814636498, + "learning_rate": 7.438518520249865e-06, + "loss": 0.5344, + "step": 7321 + }, + { + "epoch": 0.6, + "grad_norm": 0.9896753959713007, + "learning_rate": 7.435973962324685e-06, + "loss": 0.509, + "step": 7322 + }, + { + "epoch": 0.6, + "grad_norm": 0.9217534565109436, + "learning_rate": 7.433429582084233e-06, + "loss": 0.527, + "step": 7323 + }, + { + "epoch": 0.6, + "grad_norm": 0.8924443816109849, + "learning_rate": 7.4308853797048355e-06, + "loss": 0.493, + "step": 7324 + }, + { + "epoch": 0.6, + "grad_norm": 0.790774098801519, + "learning_rate": 7.428341355362803e-06, + "loss": 0.5064, + "step": 7325 + }, + { + "epoch": 0.6, + "grad_norm": 0.895239464915871, + "learning_rate": 7.425797509234433e-06, + "loss": 0.5247, + "step": 7326 + }, + { + "epoch": 0.6, + "grad_norm": 0.8337451552078113, + "learning_rate": 7.423253841496011e-06, + "loss": 0.4385, + "step": 7327 + }, + { + "epoch": 0.6, + "grad_norm": 0.9388895050134273, + "learning_rate": 7.420710352323814e-06, + "loss": 0.5099, + "step": 7328 + }, + { + "epoch": 0.6, + "grad_norm": 0.8729235534099536, + "learning_rate": 7.418167041894101e-06, + "loss": 0.5312, + "step": 7329 + }, + { + "epoch": 0.6, + "grad_norm": 0.8780973606573884, + "learning_rate": 7.415623910383121e-06, + "loss": 0.5222, + "step": 7330 + }, + { + "epoch": 0.6, + "grad_norm": 0.8900258292874804, + "learning_rate": 7.413080957967114e-06, + "loss": 0.5311, + "step": 7331 + }, + { + "epoch": 0.6, + "grad_norm": 0.8735285883101643, + "learning_rate": 7.4105381848223005e-06, + "loss": 0.4678, + "step": 7332 + }, + { + "epoch": 0.6, + "grad_norm": 0.9108584288618684, + "learning_rate": 7.407995591124892e-06, + "loss": 0.5694, + "step": 7333 + }, + { + "epoch": 0.6, + "grad_norm": 0.8409007811031063, + "learning_rate": 7.405453177051092e-06, + "loss": 0.5019, + "step": 7334 + }, + { + "epoch": 0.6, + "grad_norm": 0.9763197336086359, + "learning_rate": 7.4029109427770875e-06, + "loss": 0.5829, + "step": 7335 + }, + { + "epoch": 0.6, + "grad_norm": 0.8358779487127997, + "learning_rate": 7.400368888479048e-06, + "loss": 0.4701, + "step": 7336 + }, + { + "epoch": 0.6, + "grad_norm": 0.9256093445064898, + "learning_rate": 7.3978270143331386e-06, + "loss": 0.4672, + "step": 7337 + }, + { + "epoch": 0.6, + "grad_norm": 0.8703907207903545, + "learning_rate": 7.395285320515513e-06, + "loss": 0.4854, + "step": 7338 + }, + { + "epoch": 0.6, + "grad_norm": 0.8341471873689289, + "learning_rate": 7.392743807202301e-06, + "loss": 0.4957, + "step": 7339 + }, + { + "epoch": 0.6, + "grad_norm": 0.9060027701674533, + "learning_rate": 7.39020247456963e-06, + "loss": 0.5042, + "step": 7340 + }, + { + "epoch": 0.6, + "grad_norm": 0.8764931059475367, + "learning_rate": 7.3876613227936145e-06, + "loss": 0.4678, + "step": 7341 + }, + { + "epoch": 0.6, + "grad_norm": 1.0120630716408041, + "learning_rate": 7.38512035205035e-06, + "loss": 0.5496, + "step": 7342 + }, + { + "epoch": 0.6, + "grad_norm": 0.8493623592295444, + "learning_rate": 7.382579562515926e-06, + "loss": 0.4452, + "step": 7343 + }, + { + "epoch": 0.6, + "grad_norm": 0.7699481233724054, + "learning_rate": 7.38003895436642e-06, + "loss": 0.4266, + "step": 7344 + }, + { + "epoch": 0.6, + "grad_norm": 0.8349390281702176, + "learning_rate": 7.377498527777887e-06, + "loss": 0.5024, + "step": 7345 + }, + { + "epoch": 0.6, + "grad_norm": 0.8471489722827498, + "learning_rate": 7.374958282926381e-06, + "loss": 0.5316, + "step": 7346 + }, + { + "epoch": 0.6, + "grad_norm": 0.9221474469176514, + "learning_rate": 7.372418219987938e-06, + "loss": 0.5334, + "step": 7347 + }, + { + "epoch": 0.6, + "grad_norm": 0.8966209510688218, + "learning_rate": 7.369878339138581e-06, + "loss": 0.5412, + "step": 7348 + }, + { + "epoch": 0.6, + "grad_norm": 0.9422004357531978, + "learning_rate": 7.367338640554322e-06, + "loss": 0.5665, + "step": 7349 + }, + { + "epoch": 0.6, + "grad_norm": 0.9077745442266718, + "learning_rate": 7.364799124411162e-06, + "loss": 0.4843, + "step": 7350 + }, + { + "epoch": 0.6, + "grad_norm": 0.8639359821414474, + "learning_rate": 7.36225979088508e-06, + "loss": 0.5359, + "step": 7351 + }, + { + "epoch": 0.6, + "grad_norm": 0.9410610289961132, + "learning_rate": 7.359720640152061e-06, + "loss": 0.5008, + "step": 7352 + }, + { + "epoch": 0.6, + "grad_norm": 0.7901229733572144, + "learning_rate": 7.357181672388059e-06, + "loss": 0.5045, + "step": 7353 + }, + { + "epoch": 0.6, + "grad_norm": 0.9425112865880872, + "learning_rate": 7.354642887769018e-06, + "loss": 0.5639, + "step": 7354 + }, + { + "epoch": 0.6, + "grad_norm": 0.8782766252811707, + "learning_rate": 7.3521042864708825e-06, + "loss": 0.4998, + "step": 7355 + }, + { + "epoch": 0.6, + "grad_norm": 0.9183737268629781, + "learning_rate": 7.349565868669573e-06, + "loss": 0.5099, + "step": 7356 + }, + { + "epoch": 0.6, + "grad_norm": 0.9348792780089465, + "learning_rate": 7.347027634540993e-06, + "loss": 0.5533, + "step": 7357 + }, + { + "epoch": 0.6, + "grad_norm": 0.9344656577875735, + "learning_rate": 7.344489584261047e-06, + "loss": 0.5298, + "step": 7358 + }, + { + "epoch": 0.6, + "grad_norm": 0.8642903378823802, + "learning_rate": 7.34195171800562e-06, + "loss": 0.4594, + "step": 7359 + }, + { + "epoch": 0.6, + "grad_norm": 0.981632696955239, + "learning_rate": 7.339414035950576e-06, + "loss": 0.5647, + "step": 7360 + }, + { + "epoch": 0.6, + "grad_norm": 0.9097634668291024, + "learning_rate": 7.3368765382717835e-06, + "loss": 0.5358, + "step": 7361 + }, + { + "epoch": 0.6, + "grad_norm": 0.8220863835173063, + "learning_rate": 7.334339225145084e-06, + "loss": 0.5158, + "step": 7362 + }, + { + "epoch": 0.6, + "grad_norm": 0.8641456239026523, + "learning_rate": 7.331802096746309e-06, + "loss": 0.484, + "step": 7363 + }, + { + "epoch": 0.6, + "grad_norm": 0.8618023808624063, + "learning_rate": 7.329265153251285e-06, + "loss": 0.4845, + "step": 7364 + }, + { + "epoch": 0.6, + "grad_norm": 0.8504294393521689, + "learning_rate": 7.326728394835818e-06, + "loss": 0.4895, + "step": 7365 + }, + { + "epoch": 0.6, + "grad_norm": 0.8267207028527193, + "learning_rate": 7.324191821675697e-06, + "loss": 0.4765, + "step": 7366 + }, + { + "epoch": 0.6, + "grad_norm": 0.8853464997118534, + "learning_rate": 7.321655433946714e-06, + "loss": 0.4995, + "step": 7367 + }, + { + "epoch": 0.6, + "grad_norm": 0.8718350723134894, + "learning_rate": 7.319119231824633e-06, + "loss": 0.5163, + "step": 7368 + }, + { + "epoch": 0.6, + "grad_norm": 0.9512318446047324, + "learning_rate": 7.316583215485208e-06, + "loss": 0.5818, + "step": 7369 + }, + { + "epoch": 0.6, + "grad_norm": 0.8815002766107952, + "learning_rate": 7.314047385104189e-06, + "loss": 0.5589, + "step": 7370 + }, + { + "epoch": 0.6, + "grad_norm": 0.8286685998102985, + "learning_rate": 7.311511740857304e-06, + "loss": 0.4836, + "step": 7371 + }, + { + "epoch": 0.6, + "grad_norm": 0.9162307895735335, + "learning_rate": 7.308976282920268e-06, + "loss": 0.5594, + "step": 7372 + }, + { + "epoch": 0.6, + "grad_norm": 1.0199676957259531, + "learning_rate": 7.306441011468792e-06, + "loss": 0.5708, + "step": 7373 + }, + { + "epoch": 0.6, + "grad_norm": 0.8913322632852547, + "learning_rate": 7.303905926678565e-06, + "loss": 0.5483, + "step": 7374 + }, + { + "epoch": 0.6, + "grad_norm": 0.8926702359397981, + "learning_rate": 7.301371028725261e-06, + "loss": 0.6005, + "step": 7375 + }, + { + "epoch": 0.6, + "grad_norm": 0.8996125260292345, + "learning_rate": 7.298836317784556e-06, + "loss": 0.5087, + "step": 7376 + }, + { + "epoch": 0.6, + "grad_norm": 0.8145218099302595, + "learning_rate": 7.296301794032097e-06, + "loss": 0.5221, + "step": 7377 + }, + { + "epoch": 0.6, + "grad_norm": 0.9077749787424425, + "learning_rate": 7.293767457643523e-06, + "loss": 0.4831, + "step": 7378 + }, + { + "epoch": 0.6, + "grad_norm": 1.011921708629939, + "learning_rate": 7.291233308794467e-06, + "loss": 0.5276, + "step": 7379 + }, + { + "epoch": 0.6, + "grad_norm": 0.8568360633757683, + "learning_rate": 7.28869934766054e-06, + "loss": 0.4962, + "step": 7380 + }, + { + "epoch": 0.6, + "grad_norm": 0.8952349900165589, + "learning_rate": 7.286165574417339e-06, + "loss": 0.5249, + "step": 7381 + }, + { + "epoch": 0.6, + "grad_norm": 0.944935733661351, + "learning_rate": 7.283631989240461e-06, + "loss": 0.5662, + "step": 7382 + }, + { + "epoch": 0.6, + "grad_norm": 0.9099120859981927, + "learning_rate": 7.281098592305475e-06, + "loss": 0.4988, + "step": 7383 + }, + { + "epoch": 0.6, + "grad_norm": 0.8401807741972126, + "learning_rate": 7.2785653837879435e-06, + "loss": 0.4836, + "step": 7384 + }, + { + "epoch": 0.6, + "grad_norm": 0.9144781640878747, + "learning_rate": 7.276032363863419e-06, + "loss": 0.551, + "step": 7385 + }, + { + "epoch": 0.6, + "grad_norm": 0.8775209203998569, + "learning_rate": 7.273499532707438e-06, + "loss": 0.5175, + "step": 7386 + }, + { + "epoch": 0.6, + "grad_norm": 0.9255115313293777, + "learning_rate": 7.270966890495515e-06, + "loss": 0.5155, + "step": 7387 + }, + { + "epoch": 0.6, + "grad_norm": 0.7901310404074847, + "learning_rate": 7.268434437403169e-06, + "loss": 0.3836, + "step": 7388 + }, + { + "epoch": 0.6, + "grad_norm": 0.911734446879483, + "learning_rate": 7.2659021736058966e-06, + "loss": 0.5051, + "step": 7389 + }, + { + "epoch": 0.6, + "grad_norm": 0.8213031260282923, + "learning_rate": 7.263370099279173e-06, + "loss": 0.4969, + "step": 7390 + }, + { + "epoch": 0.6, + "grad_norm": 0.8756849290520115, + "learning_rate": 7.260838214598475e-06, + "loss": 0.5081, + "step": 7391 + }, + { + "epoch": 0.6, + "grad_norm": 0.9154177602786717, + "learning_rate": 7.258306519739263e-06, + "loss": 0.5225, + "step": 7392 + }, + { + "epoch": 0.6, + "grad_norm": 0.8133074922872229, + "learning_rate": 7.2557750148769724e-06, + "loss": 0.4734, + "step": 7393 + }, + { + "epoch": 0.6, + "grad_norm": 0.9995526440978737, + "learning_rate": 7.253243700187043e-06, + "loss": 0.5527, + "step": 7394 + }, + { + "epoch": 0.6, + "grad_norm": 0.9495255233213361, + "learning_rate": 7.250712575844885e-06, + "loss": 0.5328, + "step": 7395 + }, + { + "epoch": 0.6, + "grad_norm": 0.8704164475758905, + "learning_rate": 7.248181642025911e-06, + "loss": 0.5269, + "step": 7396 + }, + { + "epoch": 0.6, + "grad_norm": 0.9126078437554483, + "learning_rate": 7.245650898905507e-06, + "loss": 0.5635, + "step": 7397 + }, + { + "epoch": 0.6, + "grad_norm": 0.852380419372046, + "learning_rate": 7.243120346659049e-06, + "loss": 0.4722, + "step": 7398 + }, + { + "epoch": 0.6, + "grad_norm": 0.9106044444571901, + "learning_rate": 7.240589985461911e-06, + "loss": 0.4851, + "step": 7399 + }, + { + "epoch": 0.6, + "grad_norm": 0.850767141256381, + "learning_rate": 7.238059815489439e-06, + "loss": 0.4833, + "step": 7400 + }, + { + "epoch": 0.6, + "grad_norm": 0.9655676492891794, + "learning_rate": 7.235529836916968e-06, + "loss": 0.4686, + "step": 7401 + }, + { + "epoch": 0.6, + "grad_norm": 0.9151038100861985, + "learning_rate": 7.233000049919829e-06, + "loss": 0.5022, + "step": 7402 + }, + { + "epoch": 0.6, + "grad_norm": 0.8971978008944801, + "learning_rate": 7.230470454673335e-06, + "loss": 0.5701, + "step": 7403 + }, + { + "epoch": 0.6, + "grad_norm": 0.9944557059493082, + "learning_rate": 7.227941051352777e-06, + "loss": 0.5367, + "step": 7404 + }, + { + "epoch": 0.6, + "grad_norm": 0.9504825452251051, + "learning_rate": 7.225411840133449e-06, + "loss": 0.5686, + "step": 7405 + }, + { + "epoch": 0.6, + "grad_norm": 1.0065511172114605, + "learning_rate": 7.22288282119062e-06, + "loss": 0.5309, + "step": 7406 + }, + { + "epoch": 0.6, + "grad_norm": 0.9250351141978376, + "learning_rate": 7.2203539946995435e-06, + "loss": 0.4719, + "step": 7407 + }, + { + "epoch": 0.6, + "grad_norm": 0.8965608857066834, + "learning_rate": 7.217825360835475e-06, + "loss": 0.5639, + "step": 7408 + }, + { + "epoch": 0.6, + "grad_norm": 0.877393505298854, + "learning_rate": 7.21529691977364e-06, + "loss": 0.4718, + "step": 7409 + }, + { + "epoch": 0.6, + "grad_norm": 0.9556307511449684, + "learning_rate": 7.212768671689255e-06, + "loss": 0.4729, + "step": 7410 + }, + { + "epoch": 0.6, + "grad_norm": 0.9377000629117607, + "learning_rate": 7.210240616757531e-06, + "loss": 0.4944, + "step": 7411 + }, + { + "epoch": 0.6, + "grad_norm": 0.8263931357979006, + "learning_rate": 7.20771275515366e-06, + "loss": 0.4764, + "step": 7412 + }, + { + "epoch": 0.6, + "grad_norm": 0.9111012936356778, + "learning_rate": 7.205185087052813e-06, + "loss": 0.5189, + "step": 7413 + }, + { + "epoch": 0.6, + "grad_norm": 1.3651652953137372, + "learning_rate": 7.202657612630165e-06, + "loss": 0.5264, + "step": 7414 + }, + { + "epoch": 0.6, + "grad_norm": 0.9200714563305669, + "learning_rate": 7.200130332060864e-06, + "loss": 0.5478, + "step": 7415 + }, + { + "epoch": 0.6, + "grad_norm": 0.9002547184998756, + "learning_rate": 7.197603245520042e-06, + "loss": 0.5124, + "step": 7416 + }, + { + "epoch": 0.6, + "grad_norm": 1.013949035151007, + "learning_rate": 7.195076353182834e-06, + "loss": 0.4858, + "step": 7417 + }, + { + "epoch": 0.6, + "grad_norm": 0.824976407945111, + "learning_rate": 7.192549655224346e-06, + "loss": 0.4967, + "step": 7418 + }, + { + "epoch": 0.6, + "grad_norm": 0.8838276237041307, + "learning_rate": 7.190023151819674e-06, + "loss": 0.5368, + "step": 7419 + }, + { + "epoch": 0.6, + "grad_norm": 0.8606988929514742, + "learning_rate": 7.187496843143908e-06, + "loss": 0.4799, + "step": 7420 + }, + { + "epoch": 0.6, + "grad_norm": 0.8872973984439431, + "learning_rate": 7.184970729372118e-06, + "loss": 0.4783, + "step": 7421 + }, + { + "epoch": 0.6, + "grad_norm": 0.9381768183168121, + "learning_rate": 7.182444810679354e-06, + "loss": 0.5491, + "step": 7422 + }, + { + "epoch": 0.6, + "grad_norm": 0.9379227502504542, + "learning_rate": 7.17991908724067e-06, + "loss": 0.556, + "step": 7423 + }, + { + "epoch": 0.6, + "grad_norm": 0.9137485386338625, + "learning_rate": 7.17739355923109e-06, + "loss": 0.51, + "step": 7424 + }, + { + "epoch": 0.6, + "grad_norm": 1.2644241726683547, + "learning_rate": 7.174868226825631e-06, + "loss": 0.5416, + "step": 7425 + }, + { + "epoch": 0.6, + "grad_norm": 0.8958335672404645, + "learning_rate": 7.172343090199301e-06, + "loss": 0.5207, + "step": 7426 + }, + { + "epoch": 0.6, + "grad_norm": 0.8590210296481723, + "learning_rate": 7.169818149527087e-06, + "loss": 0.5337, + "step": 7427 + }, + { + "epoch": 0.6, + "grad_norm": 0.8384171957758327, + "learning_rate": 7.167293404983962e-06, + "loss": 0.4613, + "step": 7428 + }, + { + "epoch": 0.6, + "grad_norm": 0.954130281660937, + "learning_rate": 7.164768856744893e-06, + "loss": 0.5646, + "step": 7429 + }, + { + "epoch": 0.6, + "grad_norm": 1.0030607106468707, + "learning_rate": 7.1622445049848286e-06, + "loss": 0.5386, + "step": 7430 + }, + { + "epoch": 0.6, + "grad_norm": 0.8139381902091727, + "learning_rate": 7.159720349878698e-06, + "loss": 0.4671, + "step": 7431 + }, + { + "epoch": 0.6, + "grad_norm": 0.8966599226454, + "learning_rate": 7.157196391601433e-06, + "loss": 0.5602, + "step": 7432 + }, + { + "epoch": 0.6, + "grad_norm": 0.9522741156653227, + "learning_rate": 7.154672630327937e-06, + "loss": 0.5023, + "step": 7433 + }, + { + "epoch": 0.6, + "grad_norm": 0.9261395171564893, + "learning_rate": 7.152149066233098e-06, + "loss": 0.5739, + "step": 7434 + }, + { + "epoch": 0.6, + "grad_norm": 0.9771922952299095, + "learning_rate": 7.149625699491809e-06, + "loss": 0.5441, + "step": 7435 + }, + { + "epoch": 0.6, + "grad_norm": 0.8530678821927267, + "learning_rate": 7.147102530278929e-06, + "loss": 0.4926, + "step": 7436 + }, + { + "epoch": 0.6, + "grad_norm": 0.926699880774957, + "learning_rate": 7.14457955876931e-06, + "loss": 0.5174, + "step": 7437 + }, + { + "epoch": 0.6, + "grad_norm": 0.9079063732597161, + "learning_rate": 7.142056785137799e-06, + "loss": 0.475, + "step": 7438 + }, + { + "epoch": 0.6, + "grad_norm": 1.0016805486137383, + "learning_rate": 7.139534209559217e-06, + "loss": 0.6123, + "step": 7439 + }, + { + "epoch": 0.6, + "grad_norm": 0.926003799746655, + "learning_rate": 7.137011832208374e-06, + "loss": 0.5129, + "step": 7440 + }, + { + "epoch": 0.6, + "grad_norm": 0.8901597233303454, + "learning_rate": 7.134489653260075e-06, + "loss": 0.4922, + "step": 7441 + }, + { + "epoch": 0.6, + "grad_norm": 0.9039581597904545, + "learning_rate": 7.131967672889101e-06, + "loss": 0.494, + "step": 7442 + }, + { + "epoch": 0.6, + "grad_norm": 0.7942695446448926, + "learning_rate": 7.129445891270219e-06, + "loss": 0.4672, + "step": 7443 + }, + { + "epoch": 0.61, + "grad_norm": 0.8833734306634364, + "learning_rate": 7.126924308578196e-06, + "loss": 0.5106, + "step": 7444 + }, + { + "epoch": 0.61, + "grad_norm": 0.8939098610259423, + "learning_rate": 7.124402924987767e-06, + "loss": 0.5146, + "step": 7445 + }, + { + "epoch": 0.61, + "grad_norm": 0.8382061476262879, + "learning_rate": 7.121881740673664e-06, + "loss": 0.4823, + "step": 7446 + }, + { + "epoch": 0.61, + "grad_norm": 0.7778752928864608, + "learning_rate": 7.119360755810607e-06, + "loss": 0.4548, + "step": 7447 + }, + { + "epoch": 0.61, + "grad_norm": 0.8011529749679672, + "learning_rate": 7.116839970573292e-06, + "loss": 0.4825, + "step": 7448 + }, + { + "epoch": 0.61, + "grad_norm": 0.9036417498542302, + "learning_rate": 7.114319385136408e-06, + "loss": 0.5491, + "step": 7449 + }, + { + "epoch": 0.61, + "grad_norm": 0.8761401707218572, + "learning_rate": 7.111798999674635e-06, + "loss": 0.4814, + "step": 7450 + }, + { + "epoch": 0.61, + "grad_norm": 0.8523161458806872, + "learning_rate": 7.109278814362629e-06, + "loss": 0.543, + "step": 7451 + }, + { + "epoch": 0.61, + "grad_norm": 0.9528197182425885, + "learning_rate": 7.106758829375033e-06, + "loss": 0.5567, + "step": 7452 + }, + { + "epoch": 0.61, + "grad_norm": 0.8899638209596322, + "learning_rate": 7.104239044886487e-06, + "loss": 0.5432, + "step": 7453 + }, + { + "epoch": 0.61, + "grad_norm": 0.870345394746601, + "learning_rate": 7.101719461071608e-06, + "loss": 0.4923, + "step": 7454 + }, + { + "epoch": 0.61, + "grad_norm": 0.9432026321304644, + "learning_rate": 7.099200078104995e-06, + "loss": 0.4855, + "step": 7455 + }, + { + "epoch": 0.61, + "grad_norm": 0.909353097814008, + "learning_rate": 7.0966808961612475e-06, + "loss": 0.5445, + "step": 7456 + }, + { + "epoch": 0.61, + "grad_norm": 0.8222116703513183, + "learning_rate": 7.094161915414939e-06, + "loss": 0.5132, + "step": 7457 + }, + { + "epoch": 0.61, + "grad_norm": 0.8998287656530938, + "learning_rate": 7.091643136040629e-06, + "loss": 0.5051, + "step": 7458 + }, + { + "epoch": 0.61, + "grad_norm": 0.909655368823903, + "learning_rate": 7.089124558212872e-06, + "loss": 0.5384, + "step": 7459 + }, + { + "epoch": 0.61, + "grad_norm": 0.9122072478770556, + "learning_rate": 7.0866061821062025e-06, + "loss": 0.4496, + "step": 7460 + }, + { + "epoch": 0.61, + "grad_norm": 0.9958858222651449, + "learning_rate": 7.084088007895136e-06, + "loss": 0.5486, + "step": 7461 + }, + { + "epoch": 0.61, + "grad_norm": 0.8786152819765978, + "learning_rate": 7.081570035754189e-06, + "loss": 0.4973, + "step": 7462 + }, + { + "epoch": 0.61, + "grad_norm": 0.895279929473395, + "learning_rate": 7.079052265857847e-06, + "loss": 0.5163, + "step": 7463 + }, + { + "epoch": 0.61, + "grad_norm": 0.9016037453401397, + "learning_rate": 7.0765346983805925e-06, + "loss": 0.5418, + "step": 7464 + }, + { + "epoch": 0.61, + "grad_norm": 0.9594257408814107, + "learning_rate": 7.074017333496892e-06, + "loss": 0.4872, + "step": 7465 + }, + { + "epoch": 0.61, + "grad_norm": 0.9198839250326495, + "learning_rate": 7.071500171381193e-06, + "loss": 0.5639, + "step": 7466 + }, + { + "epoch": 0.61, + "grad_norm": 0.9113176502875616, + "learning_rate": 7.068983212207934e-06, + "loss": 0.5025, + "step": 7467 + }, + { + "epoch": 0.61, + "grad_norm": 0.8920345474434166, + "learning_rate": 7.066466456151541e-06, + "loss": 0.5027, + "step": 7468 + }, + { + "epoch": 0.61, + "grad_norm": 0.926683305043765, + "learning_rate": 7.063949903386419e-06, + "loss": 0.478, + "step": 7469 + }, + { + "epoch": 0.61, + "grad_norm": 0.8747561455025346, + "learning_rate": 7.061433554086964e-06, + "loss": 0.5093, + "step": 7470 + }, + { + "epoch": 0.61, + "grad_norm": 0.8233419034552449, + "learning_rate": 7.058917408427559e-06, + "loss": 0.5061, + "step": 7471 + }, + { + "epoch": 0.61, + "grad_norm": 0.968703122551061, + "learning_rate": 7.056401466582567e-06, + "loss": 0.5174, + "step": 7472 + }, + { + "epoch": 0.61, + "grad_norm": 0.98020612835273, + "learning_rate": 7.053885728726343e-06, + "loss": 0.4846, + "step": 7473 + }, + { + "epoch": 0.61, + "grad_norm": 0.9245074098344465, + "learning_rate": 7.051370195033227e-06, + "loss": 0.5086, + "step": 7474 + }, + { + "epoch": 0.61, + "grad_norm": 0.8498280153892569, + "learning_rate": 7.048854865677538e-06, + "loss": 0.4974, + "step": 7475 + }, + { + "epoch": 0.61, + "grad_norm": 0.9584978657105867, + "learning_rate": 7.04633974083359e-06, + "loss": 0.553, + "step": 7476 + }, + { + "epoch": 0.61, + "grad_norm": 0.9813074133240363, + "learning_rate": 7.04382482067568e-06, + "loss": 0.5338, + "step": 7477 + }, + { + "epoch": 0.61, + "grad_norm": 0.8597723255573146, + "learning_rate": 7.041310105378085e-06, + "loss": 0.5018, + "step": 7478 + }, + { + "epoch": 0.61, + "grad_norm": 0.9700494913363266, + "learning_rate": 7.038795595115076e-06, + "loss": 0.5172, + "step": 7479 + }, + { + "epoch": 0.61, + "grad_norm": 0.9476074585160819, + "learning_rate": 7.036281290060907e-06, + "loss": 0.5126, + "step": 7480 + }, + { + "epoch": 0.61, + "grad_norm": 0.9883722573627236, + "learning_rate": 7.033767190389814e-06, + "loss": 0.5522, + "step": 7481 + }, + { + "epoch": 0.61, + "grad_norm": 0.8750535173574526, + "learning_rate": 7.031253296276024e-06, + "loss": 0.4966, + "step": 7482 + }, + { + "epoch": 0.61, + "grad_norm": 0.8808199103635561, + "learning_rate": 7.028739607893746e-06, + "loss": 0.4345, + "step": 7483 + }, + { + "epoch": 0.61, + "grad_norm": 0.9292995866899433, + "learning_rate": 7.026226125417182e-06, + "loss": 0.5172, + "step": 7484 + }, + { + "epoch": 0.61, + "grad_norm": 0.8682760191868, + "learning_rate": 7.023712849020506e-06, + "loss": 0.5119, + "step": 7485 + }, + { + "epoch": 0.61, + "grad_norm": 0.9130486211387956, + "learning_rate": 7.021199778877891e-06, + "loss": 0.508, + "step": 7486 + }, + { + "epoch": 0.61, + "grad_norm": 0.8989745028238334, + "learning_rate": 7.01868691516349e-06, + "loss": 0.5389, + "step": 7487 + }, + { + "epoch": 0.61, + "grad_norm": 0.8711288089074521, + "learning_rate": 7.016174258051441e-06, + "loss": 0.5308, + "step": 7488 + }, + { + "epoch": 0.61, + "grad_norm": 0.8887074836595515, + "learning_rate": 7.013661807715866e-06, + "loss": 0.5039, + "step": 7489 + }, + { + "epoch": 0.61, + "grad_norm": 0.915092893637171, + "learning_rate": 7.0111495643308836e-06, + "loss": 0.5086, + "step": 7490 + }, + { + "epoch": 0.61, + "grad_norm": 0.9629412883487775, + "learning_rate": 7.008637528070583e-06, + "loss": 0.5252, + "step": 7491 + }, + { + "epoch": 0.61, + "grad_norm": 0.8843063699819966, + "learning_rate": 7.006125699109048e-06, + "loss": 0.4488, + "step": 7492 + }, + { + "epoch": 0.61, + "grad_norm": 0.9509376929761509, + "learning_rate": 7.003614077620348e-06, + "loss": 0.525, + "step": 7493 + }, + { + "epoch": 0.61, + "grad_norm": 0.8383261728774117, + "learning_rate": 7.001102663778533e-06, + "loss": 0.5241, + "step": 7494 + }, + { + "epoch": 0.61, + "grad_norm": 0.8631840695371249, + "learning_rate": 6.998591457757643e-06, + "loss": 0.4911, + "step": 7495 + }, + { + "epoch": 0.61, + "grad_norm": 0.8323660725814972, + "learning_rate": 6.9960804597317045e-06, + "loss": 0.421, + "step": 7496 + }, + { + "epoch": 0.61, + "grad_norm": 0.7882803380832858, + "learning_rate": 6.993569669874724e-06, + "loss": 0.4342, + "step": 7497 + }, + { + "epoch": 0.61, + "grad_norm": 0.9324318208303033, + "learning_rate": 6.9910590883607e-06, + "loss": 0.477, + "step": 7498 + }, + { + "epoch": 0.61, + "grad_norm": 0.8485127165181465, + "learning_rate": 6.9885487153636125e-06, + "loss": 0.4926, + "step": 7499 + }, + { + "epoch": 0.61, + "grad_norm": 0.9585992901038118, + "learning_rate": 6.986038551057426e-06, + "loss": 0.4797, + "step": 7500 + }, + { + "epoch": 0.61, + "grad_norm": 0.9353390797402146, + "learning_rate": 6.983528595616096e-06, + "loss": 0.505, + "step": 7501 + }, + { + "epoch": 0.61, + "grad_norm": 0.8813237756519612, + "learning_rate": 6.98101884921356e-06, + "loss": 0.4647, + "step": 7502 + }, + { + "epoch": 0.61, + "grad_norm": 0.8437093853065601, + "learning_rate": 6.978509312023736e-06, + "loss": 0.4573, + "step": 7503 + }, + { + "epoch": 0.61, + "grad_norm": 0.8652761631005595, + "learning_rate": 6.975999984220541e-06, + "loss": 0.4656, + "step": 7504 + }, + { + "epoch": 0.61, + "grad_norm": 0.9284862732710436, + "learning_rate": 6.9734908659778636e-06, + "loss": 0.5179, + "step": 7505 + }, + { + "epoch": 0.61, + "grad_norm": 0.8519841381992954, + "learning_rate": 6.97098195746958e-06, + "loss": 0.4691, + "step": 7506 + }, + { + "epoch": 0.61, + "grad_norm": 0.9432823008748794, + "learning_rate": 6.968473258869566e-06, + "loss": 0.5224, + "step": 7507 + }, + { + "epoch": 0.61, + "grad_norm": 0.9314459449139519, + "learning_rate": 6.965964770351665e-06, + "loss": 0.5125, + "step": 7508 + }, + { + "epoch": 0.61, + "grad_norm": 0.9017714642380549, + "learning_rate": 6.963456492089711e-06, + "loss": 0.5316, + "step": 7509 + }, + { + "epoch": 0.61, + "grad_norm": 0.9322043966660608, + "learning_rate": 6.960948424257532e-06, + "loss": 0.4958, + "step": 7510 + }, + { + "epoch": 0.61, + "grad_norm": 1.0163089194858204, + "learning_rate": 6.9584405670289326e-06, + "loss": 0.5121, + "step": 7511 + }, + { + "epoch": 0.61, + "grad_norm": 0.9067873188043927, + "learning_rate": 6.955932920577699e-06, + "loss": 0.576, + "step": 7512 + }, + { + "epoch": 0.61, + "grad_norm": 0.8910120630030297, + "learning_rate": 6.953425485077618e-06, + "loss": 0.5537, + "step": 7513 + }, + { + "epoch": 0.61, + "grad_norm": 0.9099219769405817, + "learning_rate": 6.950918260702449e-06, + "loss": 0.5537, + "step": 7514 + }, + { + "epoch": 0.61, + "grad_norm": 0.9177750168038247, + "learning_rate": 6.948411247625937e-06, + "loss": 0.5237, + "step": 7515 + }, + { + "epoch": 0.61, + "grad_norm": 0.8678464941195639, + "learning_rate": 6.9459044460218205e-06, + "loss": 0.5501, + "step": 7516 + }, + { + "epoch": 0.61, + "grad_norm": 0.8816036797700124, + "learning_rate": 6.943397856063818e-06, + "loss": 0.5234, + "step": 7517 + }, + { + "epoch": 0.61, + "grad_norm": 0.9188057188168129, + "learning_rate": 6.9408914779256285e-06, + "loss": 0.5016, + "step": 7518 + }, + { + "epoch": 0.61, + "grad_norm": 0.8878831599490864, + "learning_rate": 6.938385311780951e-06, + "loss": 0.4565, + "step": 7519 + }, + { + "epoch": 0.61, + "grad_norm": 0.9888578876329767, + "learning_rate": 6.935879357803453e-06, + "loss": 0.5835, + "step": 7520 + }, + { + "epoch": 0.61, + "grad_norm": 0.993637491505397, + "learning_rate": 6.933373616166799e-06, + "loss": 0.5403, + "step": 7521 + }, + { + "epoch": 0.61, + "grad_norm": 0.8068368590318049, + "learning_rate": 6.930868087044634e-06, + "loss": 0.4891, + "step": 7522 + }, + { + "epoch": 0.61, + "grad_norm": 0.8173237797717122, + "learning_rate": 6.9283627706105836e-06, + "loss": 0.4959, + "step": 7523 + }, + { + "epoch": 0.61, + "grad_norm": 0.8868903526963838, + "learning_rate": 6.925857667038274e-06, + "loss": 0.5011, + "step": 7524 + }, + { + "epoch": 0.61, + "grad_norm": 0.882363978650632, + "learning_rate": 6.923352776501302e-06, + "loss": 0.5676, + "step": 7525 + }, + { + "epoch": 0.61, + "grad_norm": 0.8759337087339624, + "learning_rate": 6.920848099173247e-06, + "loss": 0.5372, + "step": 7526 + }, + { + "epoch": 0.61, + "grad_norm": 0.9091568143961809, + "learning_rate": 6.918343635227694e-06, + "loss": 0.5224, + "step": 7527 + }, + { + "epoch": 0.61, + "grad_norm": 0.9492937811993646, + "learning_rate": 6.915839384838192e-06, + "loss": 0.5301, + "step": 7528 + }, + { + "epoch": 0.61, + "grad_norm": 0.9466146485903585, + "learning_rate": 6.913335348178283e-06, + "loss": 0.4828, + "step": 7529 + }, + { + "epoch": 0.61, + "grad_norm": 0.8734794349166198, + "learning_rate": 6.910831525421499e-06, + "loss": 0.5246, + "step": 7530 + }, + { + "epoch": 0.61, + "grad_norm": 0.9151217895547622, + "learning_rate": 6.90832791674135e-06, + "loss": 0.5339, + "step": 7531 + }, + { + "epoch": 0.61, + "grad_norm": 0.8742496126873698, + "learning_rate": 6.905824522311331e-06, + "loss": 0.4919, + "step": 7532 + }, + { + "epoch": 0.61, + "grad_norm": 0.9385429607671967, + "learning_rate": 6.90332134230493e-06, + "loss": 0.5338, + "step": 7533 + }, + { + "epoch": 0.61, + "grad_norm": 0.9194053424538637, + "learning_rate": 6.900818376895615e-06, + "loss": 0.5629, + "step": 7534 + }, + { + "epoch": 0.61, + "grad_norm": 0.9612350119647755, + "learning_rate": 6.898315626256833e-06, + "loss": 0.5202, + "step": 7535 + }, + { + "epoch": 0.61, + "grad_norm": 0.9415097629294374, + "learning_rate": 6.895813090562031e-06, + "loss": 0.5008, + "step": 7536 + }, + { + "epoch": 0.61, + "grad_norm": 0.8088669541501505, + "learning_rate": 6.893310769984629e-06, + "loss": 0.5369, + "step": 7537 + }, + { + "epoch": 0.61, + "grad_norm": 0.826545654887671, + "learning_rate": 6.890808664698031e-06, + "loss": 0.468, + "step": 7538 + }, + { + "epoch": 0.61, + "grad_norm": 0.8972849914206232, + "learning_rate": 6.888306774875638e-06, + "loss": 0.4912, + "step": 7539 + }, + { + "epoch": 0.61, + "grad_norm": 0.8815680247579892, + "learning_rate": 6.885805100690825e-06, + "loss": 0.5126, + "step": 7540 + }, + { + "epoch": 0.61, + "grad_norm": 0.9884025324541118, + "learning_rate": 6.883303642316954e-06, + "loss": 0.5923, + "step": 7541 + }, + { + "epoch": 0.61, + "grad_norm": 0.8249574853963665, + "learning_rate": 6.8808023999273784e-06, + "loss": 0.5241, + "step": 7542 + }, + { + "epoch": 0.61, + "grad_norm": 0.8637034169688512, + "learning_rate": 6.878301373695431e-06, + "loss": 0.488, + "step": 7543 + }, + { + "epoch": 0.61, + "grad_norm": 0.930755552156985, + "learning_rate": 6.8758005637944245e-06, + "loss": 0.5284, + "step": 7544 + }, + { + "epoch": 0.61, + "grad_norm": 0.9909722910723238, + "learning_rate": 6.873299970397672e-06, + "loss": 0.5452, + "step": 7545 + }, + { + "epoch": 0.61, + "grad_norm": 0.9544814719075744, + "learning_rate": 6.870799593678459e-06, + "loss": 0.5359, + "step": 7546 + }, + { + "epoch": 0.61, + "grad_norm": 0.8329864765301852, + "learning_rate": 6.868299433810053e-06, + "loss": 0.4407, + "step": 7547 + }, + { + "epoch": 0.61, + "grad_norm": 0.9008973754613913, + "learning_rate": 6.8657994909657235e-06, + "loss": 0.5463, + "step": 7548 + }, + { + "epoch": 0.61, + "grad_norm": 0.8973104532423516, + "learning_rate": 6.86329976531871e-06, + "loss": 0.5159, + "step": 7549 + }, + { + "epoch": 0.61, + "grad_norm": 0.8675670232532094, + "learning_rate": 6.860800257042235e-06, + "loss": 0.4741, + "step": 7550 + }, + { + "epoch": 0.61, + "grad_norm": 0.9491734938139588, + "learning_rate": 6.8583009663095215e-06, + "loss": 0.5078, + "step": 7551 + }, + { + "epoch": 0.61, + "grad_norm": 1.036963655326414, + "learning_rate": 6.855801893293765e-06, + "loss": 0.6484, + "step": 7552 + }, + { + "epoch": 0.61, + "grad_norm": 0.9071145539618047, + "learning_rate": 6.853303038168144e-06, + "loss": 0.5096, + "step": 7553 + }, + { + "epoch": 0.61, + "grad_norm": 1.0233484930312005, + "learning_rate": 6.8508044011058375e-06, + "loss": 0.5484, + "step": 7554 + }, + { + "epoch": 0.61, + "grad_norm": 0.941587849993408, + "learning_rate": 6.84830598227999e-06, + "loss": 0.5463, + "step": 7555 + }, + { + "epoch": 0.61, + "grad_norm": 0.9071870766619743, + "learning_rate": 6.845807781863739e-06, + "loss": 0.5312, + "step": 7556 + }, + { + "epoch": 0.61, + "grad_norm": 0.8802657720651078, + "learning_rate": 6.8433098000302155e-06, + "loss": 0.5114, + "step": 7557 + }, + { + "epoch": 0.61, + "grad_norm": 0.8716345359948411, + "learning_rate": 6.840812036952522e-06, + "loss": 0.43, + "step": 7558 + }, + { + "epoch": 0.61, + "grad_norm": 0.9806567088859193, + "learning_rate": 6.83831449280375e-06, + "loss": 0.4978, + "step": 7559 + }, + { + "epoch": 0.61, + "grad_norm": 0.927207704508905, + "learning_rate": 6.8358171677569814e-06, + "loss": 0.5402, + "step": 7560 + }, + { + "epoch": 0.61, + "grad_norm": 0.9100483563196377, + "learning_rate": 6.833320061985278e-06, + "loss": 0.5295, + "step": 7561 + }, + { + "epoch": 0.61, + "grad_norm": 0.9003269433900096, + "learning_rate": 6.830823175661681e-06, + "loss": 0.5654, + "step": 7562 + }, + { + "epoch": 0.61, + "grad_norm": 0.8768709859948077, + "learning_rate": 6.828326508959229e-06, + "loss": 0.4975, + "step": 7563 + }, + { + "epoch": 0.61, + "grad_norm": 0.9148195812629236, + "learning_rate": 6.825830062050939e-06, + "loss": 0.5429, + "step": 7564 + }, + { + "epoch": 0.61, + "grad_norm": 0.8353913853922079, + "learning_rate": 6.823333835109805e-06, + "loss": 0.5226, + "step": 7565 + }, + { + "epoch": 0.61, + "grad_norm": 0.9198354109819284, + "learning_rate": 6.820837828308823e-06, + "loss": 0.519, + "step": 7566 + }, + { + "epoch": 0.62, + "grad_norm": 0.9656566943225391, + "learning_rate": 6.818342041820959e-06, + "loss": 0.5811, + "step": 7567 + }, + { + "epoch": 0.62, + "grad_norm": 0.9093452706591472, + "learning_rate": 6.815846475819166e-06, + "loss": 0.5197, + "step": 7568 + }, + { + "epoch": 0.62, + "grad_norm": 0.9813198921313696, + "learning_rate": 6.813351130476391e-06, + "loss": 0.4577, + "step": 7569 + }, + { + "epoch": 0.62, + "grad_norm": 1.0110714074684641, + "learning_rate": 6.810856005965558e-06, + "loss": 0.5877, + "step": 7570 + }, + { + "epoch": 0.62, + "grad_norm": 0.9041099671510405, + "learning_rate": 6.808361102459568e-06, + "loss": 0.5148, + "step": 7571 + }, + { + "epoch": 0.62, + "grad_norm": 0.8462419013566584, + "learning_rate": 6.80586642013133e-06, + "loss": 0.4424, + "step": 7572 + }, + { + "epoch": 0.62, + "grad_norm": 0.954344805260286, + "learning_rate": 6.803371959153714e-06, + "loss": 0.5495, + "step": 7573 + }, + { + "epoch": 0.62, + "grad_norm": 0.8968157028231019, + "learning_rate": 6.800877719699581e-06, + "loss": 0.5173, + "step": 7574 + }, + { + "epoch": 0.62, + "grad_norm": 0.8759713185900999, + "learning_rate": 6.798383701941791e-06, + "loss": 0.5261, + "step": 7575 + }, + { + "epoch": 0.62, + "grad_norm": 0.9532005915781344, + "learning_rate": 6.795889906053168e-06, + "loss": 0.5458, + "step": 7576 + }, + { + "epoch": 0.62, + "grad_norm": 0.958076507362562, + "learning_rate": 6.79339633220653e-06, + "loss": 0.4898, + "step": 7577 + }, + { + "epoch": 0.62, + "grad_norm": 0.9654309823547377, + "learning_rate": 6.7909029805746855e-06, + "loss": 0.4927, + "step": 7578 + }, + { + "epoch": 0.62, + "grad_norm": 0.8362198322373752, + "learning_rate": 6.788409851330419e-06, + "loss": 0.4931, + "step": 7579 + }, + { + "epoch": 0.62, + "grad_norm": 0.8885475948416547, + "learning_rate": 6.7859169446464955e-06, + "loss": 0.464, + "step": 7580 + }, + { + "epoch": 0.62, + "grad_norm": 0.8621341356587938, + "learning_rate": 6.783424260695681e-06, + "loss": 0.5048, + "step": 7581 + }, + { + "epoch": 0.62, + "grad_norm": 0.9836189609295446, + "learning_rate": 6.780931799650714e-06, + "loss": 0.5756, + "step": 7582 + }, + { + "epoch": 0.62, + "grad_norm": 0.9043346316869034, + "learning_rate": 6.778439561684311e-06, + "loss": 0.5557, + "step": 7583 + }, + { + "epoch": 0.62, + "grad_norm": 0.9260466410600587, + "learning_rate": 6.775947546969195e-06, + "loss": 0.5034, + "step": 7584 + }, + { + "epoch": 0.62, + "grad_norm": 0.9557467442673623, + "learning_rate": 6.773455755678054e-06, + "loss": 0.5385, + "step": 7585 + }, + { + "epoch": 0.62, + "grad_norm": 0.8211146342579827, + "learning_rate": 6.770964187983563e-06, + "loss": 0.4589, + "step": 7586 + }, + { + "epoch": 0.62, + "grad_norm": 0.9688902980547851, + "learning_rate": 6.7684728440583934e-06, + "loss": 0.5368, + "step": 7587 + }, + { + "epoch": 0.62, + "grad_norm": 0.9309423999874568, + "learning_rate": 6.7659817240751906e-06, + "loss": 0.5372, + "step": 7588 + }, + { + "epoch": 0.62, + "grad_norm": 0.8931467289113518, + "learning_rate": 6.76349082820658e-06, + "loss": 0.4692, + "step": 7589 + }, + { + "epoch": 0.62, + "grad_norm": 0.8857663662886142, + "learning_rate": 6.7610001566251885e-06, + "loss": 0.4539, + "step": 7590 + }, + { + "epoch": 0.62, + "grad_norm": 0.8738241729867862, + "learning_rate": 6.758509709503614e-06, + "loss": 0.5054, + "step": 7591 + }, + { + "epoch": 0.62, + "grad_norm": 0.85924221496039, + "learning_rate": 6.756019487014437e-06, + "loss": 0.4874, + "step": 7592 + }, + { + "epoch": 0.62, + "grad_norm": 0.8547022449248107, + "learning_rate": 6.753529489330235e-06, + "loss": 0.5146, + "step": 7593 + }, + { + "epoch": 0.62, + "grad_norm": 0.7745590974676932, + "learning_rate": 6.751039716623562e-06, + "loss": 0.4518, + "step": 7594 + }, + { + "epoch": 0.62, + "grad_norm": 0.9500988557366958, + "learning_rate": 6.7485501690669495e-06, + "loss": 0.5643, + "step": 7595 + }, + { + "epoch": 0.62, + "grad_norm": 0.9158231254949393, + "learning_rate": 6.74606084683293e-06, + "loss": 0.4598, + "step": 7596 + }, + { + "epoch": 0.62, + "grad_norm": 0.9059197379989498, + "learning_rate": 6.743571750094009e-06, + "loss": 0.534, + "step": 7597 + }, + { + "epoch": 0.62, + "grad_norm": 0.9930882474280531, + "learning_rate": 6.741082879022671e-06, + "loss": 0.6028, + "step": 7598 + }, + { + "epoch": 0.62, + "grad_norm": 0.9398046854732968, + "learning_rate": 6.738594233791405e-06, + "loss": 0.5218, + "step": 7599 + }, + { + "epoch": 0.62, + "grad_norm": 0.9324822671783136, + "learning_rate": 6.7361058145726645e-06, + "loss": 0.5388, + "step": 7600 + }, + { + "epoch": 0.62, + "grad_norm": 0.9652578493993295, + "learning_rate": 6.733617621538893e-06, + "loss": 0.6204, + "step": 7601 + }, + { + "epoch": 0.62, + "grad_norm": 0.9326608597566178, + "learning_rate": 6.731129654862526e-06, + "loss": 0.5349, + "step": 7602 + }, + { + "epoch": 0.62, + "grad_norm": 0.874818064011275, + "learning_rate": 6.7286419147159745e-06, + "loss": 0.5201, + "step": 7603 + }, + { + "epoch": 0.62, + "grad_norm": 0.8026460981939355, + "learning_rate": 6.726154401271633e-06, + "loss": 0.452, + "step": 7604 + }, + { + "epoch": 0.62, + "grad_norm": 0.9147355097876252, + "learning_rate": 6.723667114701892e-06, + "loss": 0.512, + "step": 7605 + }, + { + "epoch": 0.62, + "grad_norm": 0.8944621682572889, + "learning_rate": 6.721180055179113e-06, + "loss": 0.4557, + "step": 7606 + }, + { + "epoch": 0.62, + "grad_norm": 1.0302155219494231, + "learning_rate": 6.718693222875644e-06, + "loss": 0.4877, + "step": 7607 + }, + { + "epoch": 0.62, + "grad_norm": 0.8921531524431237, + "learning_rate": 6.7162066179638286e-06, + "loss": 0.5354, + "step": 7608 + }, + { + "epoch": 0.62, + "grad_norm": 0.8902878385006948, + "learning_rate": 6.713720240615982e-06, + "loss": 0.4891, + "step": 7609 + }, + { + "epoch": 0.62, + "grad_norm": 0.8666985623979152, + "learning_rate": 6.711234091004404e-06, + "loss": 0.4978, + "step": 7610 + }, + { + "epoch": 0.62, + "grad_norm": 0.8367718524021635, + "learning_rate": 6.708748169301389e-06, + "loss": 0.5615, + "step": 7611 + }, + { + "epoch": 0.62, + "grad_norm": 0.9411993131581817, + "learning_rate": 6.706262475679205e-06, + "loss": 0.5926, + "step": 7612 + }, + { + "epoch": 0.62, + "grad_norm": 0.9335725128206275, + "learning_rate": 6.703777010310111e-06, + "loss": 0.5449, + "step": 7613 + }, + { + "epoch": 0.62, + "grad_norm": 0.9219406710574435, + "learning_rate": 6.701291773366347e-06, + "loss": 0.5564, + "step": 7614 + }, + { + "epoch": 0.62, + "grad_norm": 0.9183979061357251, + "learning_rate": 6.698806765020136e-06, + "loss": 0.5195, + "step": 7615 + }, + { + "epoch": 0.62, + "grad_norm": 0.9554532988160243, + "learning_rate": 6.696321985443688e-06, + "loss": 0.5166, + "step": 7616 + }, + { + "epoch": 0.62, + "grad_norm": 0.8800060318983304, + "learning_rate": 6.693837434809199e-06, + "loss": 0.4841, + "step": 7617 + }, + { + "epoch": 0.62, + "grad_norm": 0.9066257671231671, + "learning_rate": 6.691353113288839e-06, + "loss": 0.4821, + "step": 7618 + }, + { + "epoch": 0.62, + "grad_norm": 0.8696221704487445, + "learning_rate": 6.688869021054773e-06, + "loss": 0.4696, + "step": 7619 + }, + { + "epoch": 0.62, + "grad_norm": 0.810305875465549, + "learning_rate": 6.686385158279151e-06, + "loss": 0.4334, + "step": 7620 + }, + { + "epoch": 0.62, + "grad_norm": 0.9236041534274468, + "learning_rate": 6.683901525134096e-06, + "loss": 0.5369, + "step": 7621 + }, + { + "epoch": 0.62, + "grad_norm": 0.9437786609905792, + "learning_rate": 6.681418121791725e-06, + "loss": 0.525, + "step": 7622 + }, + { + "epoch": 0.62, + "grad_norm": 0.9462694496156745, + "learning_rate": 6.678934948424134e-06, + "loss": 0.4898, + "step": 7623 + }, + { + "epoch": 0.62, + "grad_norm": 0.9412840607698836, + "learning_rate": 6.6764520052034054e-06, + "loss": 0.5509, + "step": 7624 + }, + { + "epoch": 0.62, + "grad_norm": 0.8845126699011401, + "learning_rate": 6.673969292301604e-06, + "loss": 0.4877, + "step": 7625 + }, + { + "epoch": 0.62, + "grad_norm": 0.8569357062286083, + "learning_rate": 6.6714868098907825e-06, + "loss": 0.5046, + "step": 7626 + }, + { + "epoch": 0.62, + "grad_norm": 0.9405002337042057, + "learning_rate": 6.6690045581429705e-06, + "loss": 0.5373, + "step": 7627 + }, + { + "epoch": 0.62, + "grad_norm": 0.8880358306525056, + "learning_rate": 6.666522537230189e-06, + "loss": 0.5047, + "step": 7628 + }, + { + "epoch": 0.62, + "grad_norm": 0.9598858005780319, + "learning_rate": 6.664040747324437e-06, + "loss": 0.5166, + "step": 7629 + }, + { + "epoch": 0.62, + "grad_norm": 0.8742479010742386, + "learning_rate": 6.661559188597706e-06, + "loss": 0.5402, + "step": 7630 + }, + { + "epoch": 0.62, + "grad_norm": 0.894172077733656, + "learning_rate": 6.659077861221959e-06, + "loss": 0.4528, + "step": 7631 + }, + { + "epoch": 0.62, + "grad_norm": 0.8451101514996602, + "learning_rate": 6.656596765369153e-06, + "loss": 0.4539, + "step": 7632 + }, + { + "epoch": 0.62, + "grad_norm": 0.9342223311005059, + "learning_rate": 6.654115901211229e-06, + "loss": 0.5558, + "step": 7633 + }, + { + "epoch": 0.62, + "grad_norm": 0.9352876875867336, + "learning_rate": 6.651635268920101e-06, + "loss": 0.5217, + "step": 7634 + }, + { + "epoch": 0.62, + "grad_norm": 0.9387669753786152, + "learning_rate": 6.64915486866768e-06, + "loss": 0.4883, + "step": 7635 + }, + { + "epoch": 0.62, + "grad_norm": 0.7909913569761613, + "learning_rate": 6.646674700625857e-06, + "loss": 0.4587, + "step": 7636 + }, + { + "epoch": 0.62, + "grad_norm": 0.940002808704583, + "learning_rate": 6.644194764966499e-06, + "loss": 0.5573, + "step": 7637 + }, + { + "epoch": 0.62, + "grad_norm": 0.8798200273870914, + "learning_rate": 6.641715061861469e-06, + "loss": 0.5314, + "step": 7638 + }, + { + "epoch": 0.62, + "grad_norm": 0.9118097465638171, + "learning_rate": 6.639235591482608e-06, + "loss": 0.4951, + "step": 7639 + }, + { + "epoch": 0.62, + "grad_norm": 0.9265393540958253, + "learning_rate": 6.636756354001737e-06, + "loss": 0.5746, + "step": 7640 + }, + { + "epoch": 0.62, + "grad_norm": 0.8996667634118599, + "learning_rate": 6.6342773495906675e-06, + "loss": 0.5058, + "step": 7641 + }, + { + "epoch": 0.62, + "grad_norm": 0.9338778709815375, + "learning_rate": 6.631798578421195e-06, + "loss": 0.5082, + "step": 7642 + }, + { + "epoch": 0.62, + "grad_norm": 0.8415158595072395, + "learning_rate": 6.62932004066509e-06, + "loss": 0.467, + "step": 7643 + }, + { + "epoch": 0.62, + "grad_norm": 0.9406207341615731, + "learning_rate": 6.626841736494119e-06, + "loss": 0.551, + "step": 7644 + }, + { + "epoch": 0.62, + "grad_norm": 0.9956040687979807, + "learning_rate": 6.624363666080021e-06, + "loss": 0.5501, + "step": 7645 + }, + { + "epoch": 0.62, + "grad_norm": 0.984348172818565, + "learning_rate": 6.62188582959453e-06, + "loss": 0.5825, + "step": 7646 + }, + { + "epoch": 0.62, + "grad_norm": 0.8693191073068617, + "learning_rate": 6.619408227209352e-06, + "loss": 0.5317, + "step": 7647 + }, + { + "epoch": 0.62, + "grad_norm": 0.9489546286652094, + "learning_rate": 6.616930859096185e-06, + "loss": 0.5389, + "step": 7648 + }, + { + "epoch": 0.62, + "grad_norm": 0.9362386586362307, + "learning_rate": 6.61445372542671e-06, + "loss": 0.4964, + "step": 7649 + }, + { + "epoch": 0.62, + "grad_norm": 0.8275225693781739, + "learning_rate": 6.61197682637259e-06, + "loss": 0.5144, + "step": 7650 + }, + { + "epoch": 0.62, + "grad_norm": 0.8801001351782364, + "learning_rate": 6.609500162105469e-06, + "loss": 0.5059, + "step": 7651 + }, + { + "epoch": 0.62, + "grad_norm": 0.8904305060344985, + "learning_rate": 6.60702373279698e-06, + "loss": 0.5316, + "step": 7652 + }, + { + "epoch": 0.62, + "grad_norm": 0.8565058285424657, + "learning_rate": 6.6045475386187376e-06, + "loss": 0.4461, + "step": 7653 + }, + { + "epoch": 0.62, + "grad_norm": 0.9101220847706999, + "learning_rate": 6.602071579742337e-06, + "loss": 0.5361, + "step": 7654 + }, + { + "epoch": 0.62, + "grad_norm": 0.9587323995861344, + "learning_rate": 6.599595856339363e-06, + "loss": 0.4869, + "step": 7655 + }, + { + "epoch": 0.62, + "grad_norm": 0.976018429112491, + "learning_rate": 6.597120368581382e-06, + "loss": 0.518, + "step": 7656 + }, + { + "epoch": 0.62, + "grad_norm": 0.7956454072227547, + "learning_rate": 6.594645116639939e-06, + "loss": 0.4691, + "step": 7657 + }, + { + "epoch": 0.62, + "grad_norm": 0.9794199560151343, + "learning_rate": 6.592170100686568e-06, + "loss": 0.5548, + "step": 7658 + }, + { + "epoch": 0.62, + "grad_norm": 0.9088047741398635, + "learning_rate": 6.5896953208927886e-06, + "loss": 0.5546, + "step": 7659 + }, + { + "epoch": 0.62, + "grad_norm": 0.8844084202928726, + "learning_rate": 6.587220777430097e-06, + "loss": 0.5246, + "step": 7660 + }, + { + "epoch": 0.62, + "grad_norm": 0.9737946962118639, + "learning_rate": 6.584746470469978e-06, + "loss": 0.4907, + "step": 7661 + }, + { + "epoch": 0.62, + "grad_norm": 0.8894751424608862, + "learning_rate": 6.582272400183901e-06, + "loss": 0.5086, + "step": 7662 + }, + { + "epoch": 0.62, + "grad_norm": 0.9085894752314807, + "learning_rate": 6.579798566743314e-06, + "loss": 0.4848, + "step": 7663 + }, + { + "epoch": 0.62, + "grad_norm": 0.8982538680909244, + "learning_rate": 6.577324970319652e-06, + "loss": 0.5328, + "step": 7664 + }, + { + "epoch": 0.62, + "grad_norm": 0.8584185228163657, + "learning_rate": 6.574851611084335e-06, + "loss": 0.5389, + "step": 7665 + }, + { + "epoch": 0.62, + "grad_norm": 0.9102759729627776, + "learning_rate": 6.572378489208762e-06, + "loss": 0.5237, + "step": 7666 + }, + { + "epoch": 0.62, + "grad_norm": 0.8677201197600535, + "learning_rate": 6.569905604864319e-06, + "loss": 0.4592, + "step": 7667 + }, + { + "epoch": 0.62, + "grad_norm": 1.2005174417117501, + "learning_rate": 6.567432958222379e-06, + "loss": 0.537, + "step": 7668 + }, + { + "epoch": 0.62, + "grad_norm": 0.8504096257899675, + "learning_rate": 6.564960549454285e-06, + "loss": 0.5129, + "step": 7669 + }, + { + "epoch": 0.62, + "grad_norm": 0.9084205797953231, + "learning_rate": 6.562488378731381e-06, + "loss": 0.5082, + "step": 7670 + }, + { + "epoch": 0.62, + "grad_norm": 0.961813736450623, + "learning_rate": 6.560016446224983e-06, + "loss": 0.5459, + "step": 7671 + }, + { + "epoch": 0.62, + "grad_norm": 0.8893401843439549, + "learning_rate": 6.557544752106392e-06, + "loss": 0.5062, + "step": 7672 + }, + { + "epoch": 0.62, + "grad_norm": 0.8730833336776954, + "learning_rate": 6.5550732965468985e-06, + "loss": 0.4991, + "step": 7673 + }, + { + "epoch": 0.62, + "grad_norm": 0.96633050431716, + "learning_rate": 6.552602079717772e-06, + "loss": 0.5265, + "step": 7674 + }, + { + "epoch": 0.62, + "grad_norm": 0.9610131291886921, + "learning_rate": 6.550131101790258e-06, + "loss": 0.5188, + "step": 7675 + }, + { + "epoch": 0.62, + "grad_norm": 0.9082109466001308, + "learning_rate": 6.547660362935603e-06, + "loss": 0.542, + "step": 7676 + }, + { + "epoch": 0.62, + "grad_norm": 0.9664922066319224, + "learning_rate": 6.545189863325023e-06, + "loss": 0.5302, + "step": 7677 + }, + { + "epoch": 0.62, + "grad_norm": 0.8572812606217288, + "learning_rate": 6.542719603129716e-06, + "loss": 0.5131, + "step": 7678 + }, + { + "epoch": 0.62, + "grad_norm": 1.0805563703213321, + "learning_rate": 6.540249582520879e-06, + "loss": 0.5196, + "step": 7679 + }, + { + "epoch": 0.62, + "grad_norm": 0.9355592376625326, + "learning_rate": 6.537779801669677e-06, + "loss": 0.51, + "step": 7680 + }, + { + "epoch": 0.62, + "grad_norm": 1.0094796867058717, + "learning_rate": 6.535310260747259e-06, + "loss": 0.5075, + "step": 7681 + }, + { + "epoch": 0.62, + "grad_norm": 0.9183654901634732, + "learning_rate": 6.5328409599247715e-06, + "loss": 0.4783, + "step": 7682 + }, + { + "epoch": 0.62, + "grad_norm": 0.918630109918347, + "learning_rate": 6.530371899373329e-06, + "loss": 0.5179, + "step": 7683 + }, + { + "epoch": 0.62, + "grad_norm": 0.8697336708812505, + "learning_rate": 6.527903079264033e-06, + "loss": 0.4734, + "step": 7684 + }, + { + "epoch": 0.62, + "grad_norm": 1.0264393557260938, + "learning_rate": 6.525434499767978e-06, + "loss": 0.5394, + "step": 7685 + }, + { + "epoch": 0.62, + "grad_norm": 0.8638538677983133, + "learning_rate": 6.52296616105623e-06, + "loss": 0.4703, + "step": 7686 + }, + { + "epoch": 0.62, + "grad_norm": 0.9552971643499772, + "learning_rate": 6.5204980632998394e-06, + "loss": 0.5097, + "step": 7687 + }, + { + "epoch": 0.62, + "grad_norm": 0.8846975559947114, + "learning_rate": 6.5180302066698495e-06, + "loss": 0.5423, + "step": 7688 + }, + { + "epoch": 0.62, + "grad_norm": 0.8952890587377771, + "learning_rate": 6.515562591337279e-06, + "loss": 0.5103, + "step": 7689 + }, + { + "epoch": 0.62, + "grad_norm": 0.9432984337915332, + "learning_rate": 6.513095217473127e-06, + "loss": 0.5504, + "step": 7690 + }, + { + "epoch": 0.63, + "grad_norm": 0.9411424831312857, + "learning_rate": 6.510628085248385e-06, + "loss": 0.4815, + "step": 7691 + }, + { + "epoch": 0.63, + "grad_norm": 0.93391599034221, + "learning_rate": 6.508161194834024e-06, + "loss": 0.5194, + "step": 7692 + }, + { + "epoch": 0.63, + "grad_norm": 0.8209924490621792, + "learning_rate": 6.505694546400989e-06, + "loss": 0.4406, + "step": 7693 + }, + { + "epoch": 0.63, + "grad_norm": 0.9632496572429818, + "learning_rate": 6.503228140120228e-06, + "loss": 0.5211, + "step": 7694 + }, + { + "epoch": 0.63, + "grad_norm": 0.9636864179124074, + "learning_rate": 6.500761976162655e-06, + "loss": 0.544, + "step": 7695 + }, + { + "epoch": 0.63, + "grad_norm": 0.9628458753847853, + "learning_rate": 6.498296054699169e-06, + "loss": 0.4921, + "step": 7696 + }, + { + "epoch": 0.63, + "grad_norm": 0.8605164541806277, + "learning_rate": 6.495830375900665e-06, + "loss": 0.5045, + "step": 7697 + }, + { + "epoch": 0.63, + "grad_norm": 0.8357418989520136, + "learning_rate": 6.493364939938007e-06, + "loss": 0.5184, + "step": 7698 + }, + { + "epoch": 0.63, + "grad_norm": 0.8775988577494073, + "learning_rate": 6.490899746982045e-06, + "loss": 0.5773, + "step": 7699 + }, + { + "epoch": 0.63, + "grad_norm": 0.8507036855449437, + "learning_rate": 6.48843479720362e-06, + "loss": 0.5532, + "step": 7700 + }, + { + "epoch": 0.63, + "grad_norm": 0.885247279447743, + "learning_rate": 6.48597009077355e-06, + "loss": 0.4917, + "step": 7701 + }, + { + "epoch": 0.63, + "grad_norm": 0.8213151157416119, + "learning_rate": 6.483505627862632e-06, + "loss": 0.4146, + "step": 7702 + }, + { + "epoch": 0.63, + "grad_norm": 0.9521487130066696, + "learning_rate": 6.481041408641659e-06, + "loss": 0.4853, + "step": 7703 + }, + { + "epoch": 0.63, + "grad_norm": 0.9047442954374255, + "learning_rate": 6.478577433281394e-06, + "loss": 0.501, + "step": 7704 + }, + { + "epoch": 0.63, + "grad_norm": 0.9123953950706234, + "learning_rate": 6.476113701952587e-06, + "loss": 0.505, + "step": 7705 + }, + { + "epoch": 0.63, + "grad_norm": 0.8918355614489641, + "learning_rate": 6.473650214825979e-06, + "loss": 0.5052, + "step": 7706 + }, + { + "epoch": 0.63, + "grad_norm": 0.8460692223732004, + "learning_rate": 6.4711869720722804e-06, + "loss": 0.5007, + "step": 7707 + }, + { + "epoch": 0.63, + "grad_norm": 0.9514163634580783, + "learning_rate": 6.468723973862194e-06, + "loss": 0.4888, + "step": 7708 + }, + { + "epoch": 0.63, + "grad_norm": 0.8407870858699927, + "learning_rate": 6.466261220366406e-06, + "loss": 0.5123, + "step": 7709 + }, + { + "epoch": 0.63, + "grad_norm": 1.0438409207420924, + "learning_rate": 6.463798711755582e-06, + "loss": 0.5079, + "step": 7710 + }, + { + "epoch": 0.63, + "grad_norm": 0.8393621192093546, + "learning_rate": 6.461336448200366e-06, + "loss": 0.5016, + "step": 7711 + }, + { + "epoch": 0.63, + "grad_norm": 0.8504075861887, + "learning_rate": 6.458874429871399e-06, + "loss": 0.5265, + "step": 7712 + }, + { + "epoch": 0.63, + "grad_norm": 0.8860413988426402, + "learning_rate": 6.456412656939293e-06, + "loss": 0.5535, + "step": 7713 + }, + { + "epoch": 0.63, + "grad_norm": 1.0156180242304687, + "learning_rate": 6.453951129574644e-06, + "loss": 0.5815, + "step": 7714 + }, + { + "epoch": 0.63, + "grad_norm": 1.0120525970975776, + "learning_rate": 6.451489847948039e-06, + "loss": 0.5458, + "step": 7715 + }, + { + "epoch": 0.63, + "grad_norm": 0.9642455156600728, + "learning_rate": 6.44902881223004e-06, + "loss": 0.5594, + "step": 7716 + }, + { + "epoch": 0.63, + "grad_norm": 0.8316070911321247, + "learning_rate": 6.446568022591192e-06, + "loss": 0.4743, + "step": 7717 + }, + { + "epoch": 0.63, + "grad_norm": 0.9448781886763189, + "learning_rate": 6.4441074792020305e-06, + "loss": 0.5924, + "step": 7718 + }, + { + "epoch": 0.63, + "grad_norm": 0.8313716764000463, + "learning_rate": 6.4416471822330684e-06, + "loss": 0.4905, + "step": 7719 + }, + { + "epoch": 0.63, + "grad_norm": 0.9063248870520652, + "learning_rate": 6.439187131854796e-06, + "loss": 0.5645, + "step": 7720 + }, + { + "epoch": 0.63, + "grad_norm": 0.9106218132909022, + "learning_rate": 6.436727328237699e-06, + "loss": 0.5104, + "step": 7721 + }, + { + "epoch": 0.63, + "grad_norm": 0.9383859538169318, + "learning_rate": 6.43426777155224e-06, + "loss": 0.5514, + "step": 7722 + }, + { + "epoch": 0.63, + "grad_norm": 0.8098969358101791, + "learning_rate": 6.431808461968856e-06, + "loss": 0.4979, + "step": 7723 + }, + { + "epoch": 0.63, + "grad_norm": 0.9013036702084408, + "learning_rate": 6.429349399657985e-06, + "loss": 0.4835, + "step": 7724 + }, + { + "epoch": 0.63, + "grad_norm": 0.939866858909316, + "learning_rate": 6.4268905847900335e-06, + "loss": 0.505, + "step": 7725 + }, + { + "epoch": 0.63, + "grad_norm": 0.84843342037553, + "learning_rate": 6.424432017535391e-06, + "loss": 0.4922, + "step": 7726 + }, + { + "epoch": 0.63, + "grad_norm": 0.9328549244243041, + "learning_rate": 6.421973698064443e-06, + "loss": 0.5338, + "step": 7727 + }, + { + "epoch": 0.63, + "grad_norm": 0.9802417980606807, + "learning_rate": 6.419515626547543e-06, + "loss": 0.5675, + "step": 7728 + }, + { + "epoch": 0.63, + "grad_norm": 0.8478739144926289, + "learning_rate": 6.41705780315503e-06, + "loss": 0.4788, + "step": 7729 + }, + { + "epoch": 0.63, + "grad_norm": 0.8969066595806814, + "learning_rate": 6.414600228057237e-06, + "loss": 0.5265, + "step": 7730 + }, + { + "epoch": 0.63, + "grad_norm": 0.961301932687817, + "learning_rate": 6.41214290142447e-06, + "loss": 0.5468, + "step": 7731 + }, + { + "epoch": 0.63, + "grad_norm": 0.9124040441320103, + "learning_rate": 6.409685823427012e-06, + "loss": 0.5271, + "step": 7732 + }, + { + "epoch": 0.63, + "grad_norm": 0.9247331515893926, + "learning_rate": 6.407228994235146e-06, + "loss": 0.4851, + "step": 7733 + }, + { + "epoch": 0.63, + "grad_norm": 0.8764915994820545, + "learning_rate": 6.404772414019124e-06, + "loss": 0.5494, + "step": 7734 + }, + { + "epoch": 0.63, + "grad_norm": 0.8456362246353507, + "learning_rate": 6.40231608294918e-06, + "loss": 0.4967, + "step": 7735 + }, + { + "epoch": 0.63, + "grad_norm": 0.8166377742541707, + "learning_rate": 6.399860001195546e-06, + "loss": 0.4253, + "step": 7736 + }, + { + "epoch": 0.63, + "grad_norm": 0.8533838510419213, + "learning_rate": 6.397404168928418e-06, + "loss": 0.5479, + "step": 7737 + }, + { + "epoch": 0.63, + "grad_norm": 0.85629970661516, + "learning_rate": 6.394948586317984e-06, + "loss": 0.4668, + "step": 7738 + }, + { + "epoch": 0.63, + "grad_norm": 0.9226611146431447, + "learning_rate": 6.392493253534418e-06, + "loss": 0.4656, + "step": 7739 + }, + { + "epoch": 0.63, + "grad_norm": 0.9290966735283805, + "learning_rate": 6.39003817074787e-06, + "loss": 0.5628, + "step": 7740 + }, + { + "epoch": 0.63, + "grad_norm": 0.9762580395364672, + "learning_rate": 6.387583338128471e-06, + "loss": 0.5663, + "step": 7741 + }, + { + "epoch": 0.63, + "grad_norm": 0.9107305782629377, + "learning_rate": 6.385128755846346e-06, + "loss": 0.5754, + "step": 7742 + }, + { + "epoch": 0.63, + "grad_norm": 0.8881321158916198, + "learning_rate": 6.382674424071593e-06, + "loss": 0.4683, + "step": 7743 + }, + { + "epoch": 0.63, + "grad_norm": 0.8133542061284578, + "learning_rate": 6.3802203429742884e-06, + "loss": 0.4527, + "step": 7744 + }, + { + "epoch": 0.63, + "grad_norm": 0.8277818013192928, + "learning_rate": 6.377766512724508e-06, + "loss": 0.4517, + "step": 7745 + }, + { + "epoch": 0.63, + "grad_norm": 0.8749312544643651, + "learning_rate": 6.375312933492295e-06, + "loss": 0.493, + "step": 7746 + }, + { + "epoch": 0.63, + "grad_norm": 0.8712980229852738, + "learning_rate": 6.372859605447677e-06, + "loss": 0.4727, + "step": 7747 + }, + { + "epoch": 0.63, + "grad_norm": 0.8990200778286532, + "learning_rate": 6.370406528760675e-06, + "loss": 0.4291, + "step": 7748 + }, + { + "epoch": 0.63, + "grad_norm": 0.9048743821787182, + "learning_rate": 6.367953703601282e-06, + "loss": 0.5218, + "step": 7749 + }, + { + "epoch": 0.63, + "grad_norm": 0.8505330860159265, + "learning_rate": 6.36550113013947e-06, + "loss": 0.4798, + "step": 7750 + }, + { + "epoch": 0.63, + "grad_norm": 0.9402198572741421, + "learning_rate": 6.3630488085452105e-06, + "loss": 0.5336, + "step": 7751 + }, + { + "epoch": 0.63, + "grad_norm": 0.9729586284896738, + "learning_rate": 6.360596738988443e-06, + "loss": 0.5871, + "step": 7752 + }, + { + "epoch": 0.63, + "grad_norm": 0.8969176170147588, + "learning_rate": 6.358144921639089e-06, + "loss": 0.549, + "step": 7753 + }, + { + "epoch": 0.63, + "grad_norm": 0.8885249861439682, + "learning_rate": 6.3556933566670656e-06, + "loss": 0.5263, + "step": 7754 + }, + { + "epoch": 0.63, + "grad_norm": 0.9578092824346056, + "learning_rate": 6.353242044242261e-06, + "loss": 0.5295, + "step": 7755 + }, + { + "epoch": 0.63, + "grad_norm": 0.8080942797341487, + "learning_rate": 6.350790984534543e-06, + "loss": 0.4604, + "step": 7756 + }, + { + "epoch": 0.63, + "grad_norm": 0.9077725823304328, + "learning_rate": 6.348340177713776e-06, + "loss": 0.4722, + "step": 7757 + }, + { + "epoch": 0.63, + "grad_norm": 0.9109428713389269, + "learning_rate": 6.3458896239497965e-06, + "loss": 0.5195, + "step": 7758 + }, + { + "epoch": 0.63, + "grad_norm": 0.9463265119734536, + "learning_rate": 6.343439323412422e-06, + "loss": 0.4988, + "step": 7759 + }, + { + "epoch": 0.63, + "grad_norm": 0.9059028499902411, + "learning_rate": 6.340989276271462e-06, + "loss": 0.4988, + "step": 7760 + }, + { + "epoch": 0.63, + "grad_norm": 0.8568743425472681, + "learning_rate": 6.3385394826966975e-06, + "loss": 0.5055, + "step": 7761 + }, + { + "epoch": 0.63, + "grad_norm": 0.942925223145349, + "learning_rate": 6.336089942857899e-06, + "loss": 0.519, + "step": 7762 + }, + { + "epoch": 0.63, + "grad_norm": 0.9032602036621082, + "learning_rate": 6.33364065692482e-06, + "loss": 0.4703, + "step": 7763 + }, + { + "epoch": 0.63, + "grad_norm": 0.8792975094902231, + "learning_rate": 6.33119162506719e-06, + "loss": 0.4999, + "step": 7764 + }, + { + "epoch": 0.63, + "grad_norm": 0.9501182269012853, + "learning_rate": 6.3287428474547256e-06, + "loss": 0.5199, + "step": 7765 + }, + { + "epoch": 0.63, + "grad_norm": 0.9448877264739883, + "learning_rate": 6.326294324257127e-06, + "loss": 0.4954, + "step": 7766 + }, + { + "epoch": 0.63, + "grad_norm": 0.8565240003077423, + "learning_rate": 6.32384605564407e-06, + "loss": 0.4208, + "step": 7767 + }, + { + "epoch": 0.63, + "grad_norm": 0.8486055449790287, + "learning_rate": 6.321398041785225e-06, + "loss": 0.5115, + "step": 7768 + }, + { + "epoch": 0.63, + "grad_norm": 0.9117738135586135, + "learning_rate": 6.318950282850231e-06, + "loss": 0.5145, + "step": 7769 + }, + { + "epoch": 0.63, + "grad_norm": 0.8913146224618086, + "learning_rate": 6.3165027790087156e-06, + "loss": 0.4775, + "step": 7770 + }, + { + "epoch": 0.63, + "grad_norm": 0.7365979070890599, + "learning_rate": 6.3140555304302915e-06, + "loss": 0.3933, + "step": 7771 + }, + { + "epoch": 0.63, + "grad_norm": 0.9768804673095871, + "learning_rate": 6.311608537284553e-06, + "loss": 0.4881, + "step": 7772 + }, + { + "epoch": 0.63, + "grad_norm": 1.016542017230885, + "learning_rate": 6.309161799741064e-06, + "loss": 0.526, + "step": 7773 + }, + { + "epoch": 0.63, + "grad_norm": 0.8925076055403189, + "learning_rate": 6.306715317969394e-06, + "loss": 0.4922, + "step": 7774 + }, + { + "epoch": 0.63, + "grad_norm": 0.8267041800491832, + "learning_rate": 6.304269092139077e-06, + "loss": 0.4777, + "step": 7775 + }, + { + "epoch": 0.63, + "grad_norm": 1.0650334644803787, + "learning_rate": 6.3018231224196305e-06, + "loss": 0.5939, + "step": 7776 + }, + { + "epoch": 0.63, + "grad_norm": 0.8943182460099235, + "learning_rate": 6.299377408980563e-06, + "loss": 0.5152, + "step": 7777 + }, + { + "epoch": 0.63, + "grad_norm": 0.888006560881075, + "learning_rate": 6.296931951991358e-06, + "loss": 0.4804, + "step": 7778 + }, + { + "epoch": 0.63, + "grad_norm": 0.9545552190672068, + "learning_rate": 6.2944867516214845e-06, + "loss": 0.5142, + "step": 7779 + }, + { + "epoch": 0.63, + "grad_norm": 0.9929717798897725, + "learning_rate": 6.292041808040393e-06, + "loss": 0.5967, + "step": 7780 + }, + { + "epoch": 0.63, + "grad_norm": 0.8714173763064299, + "learning_rate": 6.289597121417514e-06, + "loss": 0.4446, + "step": 7781 + }, + { + "epoch": 0.63, + "grad_norm": 0.9078441721534038, + "learning_rate": 6.287152691922264e-06, + "loss": 0.5224, + "step": 7782 + }, + { + "epoch": 0.63, + "grad_norm": 0.996162174356343, + "learning_rate": 6.284708519724041e-06, + "loss": 0.5057, + "step": 7783 + }, + { + "epoch": 0.63, + "grad_norm": 0.8774138005700622, + "learning_rate": 6.2822646049922185e-06, + "loss": 0.4563, + "step": 7784 + }, + { + "epoch": 0.63, + "grad_norm": 0.8856112477861796, + "learning_rate": 6.279820947896163e-06, + "loss": 0.5064, + "step": 7785 + }, + { + "epoch": 0.63, + "grad_norm": 0.9295493307030581, + "learning_rate": 6.277377548605217e-06, + "loss": 0.5247, + "step": 7786 + }, + { + "epoch": 0.63, + "grad_norm": 0.8765121315853294, + "learning_rate": 6.274934407288704e-06, + "loss": 0.4566, + "step": 7787 + }, + { + "epoch": 0.63, + "grad_norm": 0.9399025860311087, + "learning_rate": 6.2724915241159315e-06, + "loss": 0.5375, + "step": 7788 + }, + { + "epoch": 0.63, + "grad_norm": 0.8557716593867082, + "learning_rate": 6.2700488992561925e-06, + "loss": 0.4419, + "step": 7789 + }, + { + "epoch": 0.63, + "grad_norm": 0.9658120983337019, + "learning_rate": 6.267606532878754e-06, + "loss": 0.5243, + "step": 7790 + }, + { + "epoch": 0.63, + "grad_norm": 0.958679780460244, + "learning_rate": 6.265164425152872e-06, + "loss": 0.5179, + "step": 7791 + }, + { + "epoch": 0.63, + "grad_norm": 0.9658173133446795, + "learning_rate": 6.262722576247785e-06, + "loss": 0.5464, + "step": 7792 + }, + { + "epoch": 0.63, + "grad_norm": 0.8716878684699769, + "learning_rate": 6.260280986332707e-06, + "loss": 0.4832, + "step": 7793 + }, + { + "epoch": 0.63, + "grad_norm": 0.9349477383809048, + "learning_rate": 6.257839655576839e-06, + "loss": 0.4898, + "step": 7794 + }, + { + "epoch": 0.63, + "grad_norm": 0.9180139743849978, + "learning_rate": 6.255398584149366e-06, + "loss": 0.55, + "step": 7795 + }, + { + "epoch": 0.63, + "grad_norm": 0.9231079645761651, + "learning_rate": 6.252957772219446e-06, + "loss": 0.5134, + "step": 7796 + }, + { + "epoch": 0.63, + "grad_norm": 0.9223827123007751, + "learning_rate": 6.25051721995623e-06, + "loss": 0.5552, + "step": 7797 + }, + { + "epoch": 0.63, + "grad_norm": 1.0099215371331756, + "learning_rate": 6.248076927528845e-06, + "loss": 0.5718, + "step": 7798 + }, + { + "epoch": 0.63, + "grad_norm": 0.8269613087050963, + "learning_rate": 6.245636895106403e-06, + "loss": 0.4471, + "step": 7799 + }, + { + "epoch": 0.63, + "grad_norm": 0.8713953691636022, + "learning_rate": 6.243197122857991e-06, + "loss": 0.5009, + "step": 7800 + }, + { + "epoch": 0.63, + "grad_norm": 1.1097324395029895, + "learning_rate": 6.240757610952688e-06, + "loss": 0.4947, + "step": 7801 + }, + { + "epoch": 0.63, + "grad_norm": 0.9103097554000824, + "learning_rate": 6.238318359559548e-06, + "loss": 0.4672, + "step": 7802 + }, + { + "epoch": 0.63, + "grad_norm": 0.9089885001686926, + "learning_rate": 6.2358793688476085e-06, + "loss": 0.4802, + "step": 7803 + }, + { + "epoch": 0.63, + "grad_norm": 0.9365668706441963, + "learning_rate": 6.233440638985889e-06, + "loss": 0.509, + "step": 7804 + }, + { + "epoch": 0.63, + "grad_norm": 1.0305363078095187, + "learning_rate": 6.231002170143395e-06, + "loss": 0.5831, + "step": 7805 + }, + { + "epoch": 0.63, + "grad_norm": 0.8677823936257921, + "learning_rate": 6.228563962489106e-06, + "loss": 0.5054, + "step": 7806 + }, + { + "epoch": 0.63, + "grad_norm": 0.8956157333665741, + "learning_rate": 6.226126016191989e-06, + "loss": 0.4789, + "step": 7807 + }, + { + "epoch": 0.63, + "grad_norm": 0.8960203160039782, + "learning_rate": 6.223688331420992e-06, + "loss": 0.4975, + "step": 7808 + }, + { + "epoch": 0.63, + "grad_norm": 0.809149890697131, + "learning_rate": 6.221250908345043e-06, + "loss": 0.4941, + "step": 7809 + }, + { + "epoch": 0.63, + "grad_norm": 0.943687691157147, + "learning_rate": 6.218813747133054e-06, + "loss": 0.4947, + "step": 7810 + }, + { + "epoch": 0.63, + "grad_norm": 0.8036480414231428, + "learning_rate": 6.2163768479539224e-06, + "loss": 0.4487, + "step": 7811 + }, + { + "epoch": 0.63, + "grad_norm": 0.9941996127619828, + "learning_rate": 6.2139402109765145e-06, + "loss": 0.4801, + "step": 7812 + }, + { + "epoch": 0.63, + "grad_norm": 0.9008355927023047, + "learning_rate": 6.211503836369695e-06, + "loss": 0.4602, + "step": 7813 + }, + { + "epoch": 0.64, + "grad_norm": 0.9154622606923055, + "learning_rate": 6.209067724302298e-06, + "loss": 0.5009, + "step": 7814 + }, + { + "epoch": 0.64, + "grad_norm": 0.8444425530184236, + "learning_rate": 6.206631874943142e-06, + "loss": 0.4654, + "step": 7815 + }, + { + "epoch": 0.64, + "grad_norm": 0.8723556650932853, + "learning_rate": 6.204196288461037e-06, + "loss": 0.4781, + "step": 7816 + }, + { + "epoch": 0.64, + "grad_norm": 0.868682917707013, + "learning_rate": 6.2017609650247616e-06, + "loss": 0.4906, + "step": 7817 + }, + { + "epoch": 0.64, + "grad_norm": 0.9050595246873593, + "learning_rate": 6.19932590480308e-06, + "loss": 0.5166, + "step": 7818 + }, + { + "epoch": 0.64, + "grad_norm": 0.9716180252733098, + "learning_rate": 6.196891107964744e-06, + "loss": 0.5476, + "step": 7819 + }, + { + "epoch": 0.64, + "grad_norm": 0.8566243321600501, + "learning_rate": 6.194456574678481e-06, + "loss": 0.5152, + "step": 7820 + }, + { + "epoch": 0.64, + "grad_norm": 0.9824046976752693, + "learning_rate": 6.192022305112999e-06, + "loss": 0.544, + "step": 7821 + }, + { + "epoch": 0.64, + "grad_norm": 0.9305953427523077, + "learning_rate": 6.189588299436997e-06, + "loss": 0.5713, + "step": 7822 + }, + { + "epoch": 0.64, + "grad_norm": 0.8070106072056493, + "learning_rate": 6.187154557819146e-06, + "loss": 0.4906, + "step": 7823 + }, + { + "epoch": 0.64, + "grad_norm": 0.9498681934374452, + "learning_rate": 6.184721080428098e-06, + "loss": 0.5671, + "step": 7824 + }, + { + "epoch": 0.64, + "grad_norm": 0.820186983805444, + "learning_rate": 6.1822878674324995e-06, + "loss": 0.4313, + "step": 7825 + }, + { + "epoch": 0.64, + "grad_norm": 0.9463991323254897, + "learning_rate": 6.179854919000965e-06, + "loss": 0.507, + "step": 7826 + }, + { + "epoch": 0.64, + "grad_norm": 0.9640186853955188, + "learning_rate": 6.177422235302093e-06, + "loss": 0.5476, + "step": 7827 + }, + { + "epoch": 0.64, + "grad_norm": 0.9492051998384852, + "learning_rate": 6.174989816504472e-06, + "loss": 0.4895, + "step": 7828 + }, + { + "epoch": 0.64, + "grad_norm": 0.8160052673463781, + "learning_rate": 6.172557662776665e-06, + "loss": 0.4485, + "step": 7829 + }, + { + "epoch": 0.64, + "grad_norm": 0.832072497682488, + "learning_rate": 6.170125774287212e-06, + "loss": 0.4871, + "step": 7830 + }, + { + "epoch": 0.64, + "grad_norm": 0.9821687629742838, + "learning_rate": 6.167694151204651e-06, + "loss": 0.5267, + "step": 7831 + }, + { + "epoch": 0.64, + "grad_norm": 0.8475254419513637, + "learning_rate": 6.165262793697486e-06, + "loss": 0.4286, + "step": 7832 + }, + { + "epoch": 0.64, + "grad_norm": 1.0397427201090619, + "learning_rate": 6.162831701934203e-06, + "loss": 0.5719, + "step": 7833 + }, + { + "epoch": 0.64, + "grad_norm": 0.9163483237401832, + "learning_rate": 6.160400876083283e-06, + "loss": 0.4926, + "step": 7834 + }, + { + "epoch": 0.64, + "grad_norm": 0.8156428618841962, + "learning_rate": 6.157970316313179e-06, + "loss": 0.4452, + "step": 7835 + }, + { + "epoch": 0.64, + "grad_norm": 0.8687730375391427, + "learning_rate": 6.155540022792319e-06, + "loss": 0.543, + "step": 7836 + }, + { + "epoch": 0.64, + "grad_norm": 0.7983037783374483, + "learning_rate": 6.153109995689129e-06, + "loss": 0.4492, + "step": 7837 + }, + { + "epoch": 0.64, + "grad_norm": 0.9766388620578832, + "learning_rate": 6.150680235172004e-06, + "loss": 0.4917, + "step": 7838 + }, + { + "epoch": 0.64, + "grad_norm": 0.8463802416443318, + "learning_rate": 6.148250741409321e-06, + "loss": 0.4741, + "step": 7839 + }, + { + "epoch": 0.64, + "grad_norm": 0.894432537815772, + "learning_rate": 6.145821514569449e-06, + "loss": 0.4996, + "step": 7840 + }, + { + "epoch": 0.64, + "grad_norm": 0.8991657851225787, + "learning_rate": 6.143392554820726e-06, + "loss": 0.4578, + "step": 7841 + }, + { + "epoch": 0.64, + "grad_norm": 0.9054030277853536, + "learning_rate": 6.140963862331476e-06, + "loss": 0.4742, + "step": 7842 + }, + { + "epoch": 0.64, + "grad_norm": 0.8797646494203251, + "learning_rate": 6.13853543727001e-06, + "loss": 0.5034, + "step": 7843 + }, + { + "epoch": 0.64, + "grad_norm": 0.9484979833940284, + "learning_rate": 6.1361072798046155e-06, + "loss": 0.513, + "step": 7844 + }, + { + "epoch": 0.64, + "grad_norm": 0.9038890293909475, + "learning_rate": 6.1336793901035526e-06, + "loss": 0.4885, + "step": 7845 + }, + { + "epoch": 0.64, + "grad_norm": 0.9380943990182781, + "learning_rate": 6.131251768335083e-06, + "loss": 0.5606, + "step": 7846 + }, + { + "epoch": 0.64, + "grad_norm": 0.8525723409370285, + "learning_rate": 6.128824414667436e-06, + "loss": 0.481, + "step": 7847 + }, + { + "epoch": 0.64, + "grad_norm": 0.8900496438478617, + "learning_rate": 6.126397329268817e-06, + "loss": 0.5184, + "step": 7848 + }, + { + "epoch": 0.64, + "grad_norm": 0.924435430812342, + "learning_rate": 6.123970512307433e-06, + "loss": 0.5249, + "step": 7849 + }, + { + "epoch": 0.64, + "grad_norm": 1.0436953649832952, + "learning_rate": 6.121543963951453e-06, + "loss": 0.5956, + "step": 7850 + }, + { + "epoch": 0.64, + "grad_norm": 0.967753485718807, + "learning_rate": 6.119117684369033e-06, + "loss": 0.5847, + "step": 7851 + }, + { + "epoch": 0.64, + "grad_norm": 0.9268366254061351, + "learning_rate": 6.116691673728319e-06, + "loss": 0.5063, + "step": 7852 + }, + { + "epoch": 0.64, + "grad_norm": 1.0184125987174175, + "learning_rate": 6.114265932197427e-06, + "loss": 0.5789, + "step": 7853 + }, + { + "epoch": 0.64, + "grad_norm": 0.9061381924635158, + "learning_rate": 6.111840459944456e-06, + "loss": 0.4939, + "step": 7854 + }, + { + "epoch": 0.64, + "grad_norm": 0.8112643985533705, + "learning_rate": 6.109415257137496e-06, + "loss": 0.4545, + "step": 7855 + }, + { + "epoch": 0.64, + "grad_norm": 0.8822508528963398, + "learning_rate": 6.1069903239446085e-06, + "loss": 0.4922, + "step": 7856 + }, + { + "epoch": 0.64, + "grad_norm": 0.9376464388034186, + "learning_rate": 6.104565660533834e-06, + "loss": 0.5353, + "step": 7857 + }, + { + "epoch": 0.64, + "grad_norm": 1.1579114301426976, + "learning_rate": 6.102141267073207e-06, + "loss": 0.586, + "step": 7858 + }, + { + "epoch": 0.64, + "grad_norm": 0.9165096943932053, + "learning_rate": 6.099717143730735e-06, + "loss": 0.4738, + "step": 7859 + }, + { + "epoch": 0.64, + "grad_norm": 0.9121514176411212, + "learning_rate": 6.0972932906744e-06, + "loss": 0.5212, + "step": 7860 + }, + { + "epoch": 0.64, + "grad_norm": 0.9866894805187648, + "learning_rate": 6.094869708072182e-06, + "loss": 0.5184, + "step": 7861 + }, + { + "epoch": 0.64, + "grad_norm": 0.9143118932789693, + "learning_rate": 6.092446396092029e-06, + "loss": 0.5419, + "step": 7862 + }, + { + "epoch": 0.64, + "grad_norm": 0.8796210889513394, + "learning_rate": 6.0900233549018715e-06, + "loss": 0.5014, + "step": 7863 + }, + { + "epoch": 0.64, + "grad_norm": 0.9600479927730894, + "learning_rate": 6.087600584669631e-06, + "loss": 0.5166, + "step": 7864 + }, + { + "epoch": 0.64, + "grad_norm": 0.8497125065287534, + "learning_rate": 6.0851780855632005e-06, + "loss": 0.4715, + "step": 7865 + }, + { + "epoch": 0.64, + "grad_norm": 0.8294242835797552, + "learning_rate": 6.082755857750451e-06, + "loss": 0.4082, + "step": 7866 + }, + { + "epoch": 0.64, + "grad_norm": 0.8411002825465396, + "learning_rate": 6.080333901399252e-06, + "loss": 0.4416, + "step": 7867 + }, + { + "epoch": 0.64, + "grad_norm": 1.0470602635891193, + "learning_rate": 6.077912216677435e-06, + "loss": 0.5246, + "step": 7868 + }, + { + "epoch": 0.64, + "grad_norm": 0.9070343840890479, + "learning_rate": 6.075490803752818e-06, + "loss": 0.4806, + "step": 7869 + }, + { + "epoch": 0.64, + "grad_norm": 0.9051386050346829, + "learning_rate": 6.073069662793213e-06, + "loss": 0.4905, + "step": 7870 + }, + { + "epoch": 0.64, + "grad_norm": 0.8752448696259255, + "learning_rate": 6.070648793966396e-06, + "loss": 0.4699, + "step": 7871 + }, + { + "epoch": 0.64, + "grad_norm": 0.8925887090586246, + "learning_rate": 6.068228197440129e-06, + "loss": 0.5167, + "step": 7872 + }, + { + "epoch": 0.64, + "grad_norm": 0.964748588002339, + "learning_rate": 6.065807873382163e-06, + "loss": 0.5169, + "step": 7873 + }, + { + "epoch": 0.64, + "grad_norm": 0.9275308110947283, + "learning_rate": 6.063387821960224e-06, + "loss": 0.5562, + "step": 7874 + }, + { + "epoch": 0.64, + "grad_norm": 0.8429438517721336, + "learning_rate": 6.060968043342013e-06, + "loss": 0.4673, + "step": 7875 + }, + { + "epoch": 0.64, + "grad_norm": 0.8850378488006732, + "learning_rate": 6.058548537695225e-06, + "loss": 0.4752, + "step": 7876 + }, + { + "epoch": 0.64, + "grad_norm": 0.9524802145799047, + "learning_rate": 6.056129305187528e-06, + "loss": 0.5744, + "step": 7877 + }, + { + "epoch": 0.64, + "grad_norm": 0.9361706523073291, + "learning_rate": 6.053710345986568e-06, + "loss": 0.6133, + "step": 7878 + }, + { + "epoch": 0.64, + "grad_norm": 0.9839367384874946, + "learning_rate": 6.051291660259984e-06, + "loss": 0.5047, + "step": 7879 + }, + { + "epoch": 0.64, + "grad_norm": 0.7923563256741162, + "learning_rate": 6.048873248175387e-06, + "loss": 0.4301, + "step": 7880 + }, + { + "epoch": 0.64, + "grad_norm": 0.8936789644649209, + "learning_rate": 6.046455109900364e-06, + "loss": 0.4995, + "step": 7881 + }, + { + "epoch": 0.64, + "grad_norm": 0.868032478797101, + "learning_rate": 6.044037245602498e-06, + "loss": 0.5239, + "step": 7882 + }, + { + "epoch": 0.64, + "grad_norm": 0.98929414338559, + "learning_rate": 6.041619655449345e-06, + "loss": 0.5709, + "step": 7883 + }, + { + "epoch": 0.64, + "grad_norm": 0.8506950731385884, + "learning_rate": 6.039202339608432e-06, + "loss": 0.5399, + "step": 7884 + }, + { + "epoch": 0.64, + "grad_norm": 0.8806249635423963, + "learning_rate": 6.03678529824729e-06, + "loss": 0.4679, + "step": 7885 + }, + { + "epoch": 0.64, + "grad_norm": 0.8525185718459264, + "learning_rate": 6.03436853153341e-06, + "loss": 0.4969, + "step": 7886 + }, + { + "epoch": 0.64, + "grad_norm": 0.9305266634188015, + "learning_rate": 6.031952039634269e-06, + "loss": 0.4984, + "step": 7887 + }, + { + "epoch": 0.64, + "grad_norm": 0.902035425375862, + "learning_rate": 6.029535822717336e-06, + "loss": 0.443, + "step": 7888 + }, + { + "epoch": 0.64, + "grad_norm": 0.8470784591374255, + "learning_rate": 6.0271198809500495e-06, + "loss": 0.5191, + "step": 7889 + }, + { + "epoch": 0.64, + "grad_norm": 0.974557691623203, + "learning_rate": 6.024704214499828e-06, + "loss": 0.5018, + "step": 7890 + }, + { + "epoch": 0.64, + "grad_norm": 0.787087408132767, + "learning_rate": 6.02228882353408e-06, + "loss": 0.5078, + "step": 7891 + }, + { + "epoch": 0.64, + "grad_norm": 0.8542335974387377, + "learning_rate": 6.019873708220187e-06, + "loss": 0.4908, + "step": 7892 + }, + { + "epoch": 0.64, + "grad_norm": 0.9474423848025191, + "learning_rate": 6.0174588687255175e-06, + "loss": 0.5605, + "step": 7893 + }, + { + "epoch": 0.64, + "grad_norm": 0.8575669120750463, + "learning_rate": 6.0150443052174165e-06, + "loss": 0.5251, + "step": 7894 + }, + { + "epoch": 0.64, + "grad_norm": 0.8612373472694415, + "learning_rate": 6.012630017863207e-06, + "loss": 0.4617, + "step": 7895 + }, + { + "epoch": 0.64, + "grad_norm": 0.8586947151074552, + "learning_rate": 6.010216006830204e-06, + "loss": 0.4587, + "step": 7896 + }, + { + "epoch": 0.64, + "grad_norm": 0.8632841159755757, + "learning_rate": 6.007802272285693e-06, + "loss": 0.5079, + "step": 7897 + }, + { + "epoch": 0.64, + "grad_norm": 0.8379444445979347, + "learning_rate": 6.0053888143969395e-06, + "loss": 0.4695, + "step": 7898 + }, + { + "epoch": 0.64, + "grad_norm": 1.0050526074923891, + "learning_rate": 6.002975633331202e-06, + "loss": 0.4807, + "step": 7899 + }, + { + "epoch": 0.64, + "grad_norm": 0.9495555894351163, + "learning_rate": 6.000562729255708e-06, + "loss": 0.5104, + "step": 7900 + }, + { + "epoch": 0.64, + "grad_norm": 0.8893561306967496, + "learning_rate": 5.998150102337665e-06, + "loss": 0.4987, + "step": 7901 + }, + { + "epoch": 0.64, + "grad_norm": 0.9049187018469663, + "learning_rate": 5.995737752744274e-06, + "loss": 0.3982, + "step": 7902 + }, + { + "epoch": 0.64, + "grad_norm": 0.9645879156768508, + "learning_rate": 5.9933256806427056e-06, + "loss": 0.52, + "step": 7903 + }, + { + "epoch": 0.64, + "grad_norm": 0.8483830405355759, + "learning_rate": 5.990913886200109e-06, + "loss": 0.5376, + "step": 7904 + }, + { + "epoch": 0.64, + "grad_norm": 0.8176645223893816, + "learning_rate": 5.9885023695836285e-06, + "loss": 0.4675, + "step": 7905 + }, + { + "epoch": 0.64, + "grad_norm": 0.8683650406947293, + "learning_rate": 5.986091130960374e-06, + "loss": 0.4619, + "step": 7906 + }, + { + "epoch": 0.64, + "grad_norm": 0.8638758477258518, + "learning_rate": 5.983680170497441e-06, + "loss": 0.4918, + "step": 7907 + }, + { + "epoch": 0.64, + "grad_norm": 0.9072700579393461, + "learning_rate": 5.981269488361915e-06, + "loss": 0.5199, + "step": 7908 + }, + { + "epoch": 0.64, + "grad_norm": 0.8330566847213845, + "learning_rate": 5.978859084720847e-06, + "loss": 0.4817, + "step": 7909 + }, + { + "epoch": 0.64, + "grad_norm": 0.853580850835999, + "learning_rate": 5.9764489597412744e-06, + "loss": 0.4865, + "step": 7910 + }, + { + "epoch": 0.64, + "grad_norm": 0.9350521083805324, + "learning_rate": 5.974039113590224e-06, + "loss": 0.5252, + "step": 7911 + }, + { + "epoch": 0.64, + "grad_norm": 0.8370042228867535, + "learning_rate": 5.971629546434692e-06, + "loss": 0.4613, + "step": 7912 + }, + { + "epoch": 0.64, + "grad_norm": 0.9258955551707068, + "learning_rate": 5.969220258441656e-06, + "loss": 0.464, + "step": 7913 + }, + { + "epoch": 0.64, + "grad_norm": 1.004101425937656, + "learning_rate": 5.966811249778084e-06, + "loss": 0.5862, + "step": 7914 + }, + { + "epoch": 0.64, + "grad_norm": 1.0387617616671656, + "learning_rate": 5.964402520610915e-06, + "loss": 0.5051, + "step": 7915 + }, + { + "epoch": 0.64, + "grad_norm": 0.9856894626875013, + "learning_rate": 5.961994071107067e-06, + "loss": 0.5665, + "step": 7916 + }, + { + "epoch": 0.64, + "grad_norm": 0.9105291820348035, + "learning_rate": 5.959585901433453e-06, + "loss": 0.4826, + "step": 7917 + }, + { + "epoch": 0.64, + "grad_norm": 0.8617063645840858, + "learning_rate": 5.957178011756952e-06, + "loss": 0.4357, + "step": 7918 + }, + { + "epoch": 0.64, + "grad_norm": 0.9170271796296493, + "learning_rate": 5.954770402244425e-06, + "loss": 0.5619, + "step": 7919 + }, + { + "epoch": 0.64, + "grad_norm": 0.9411333802566632, + "learning_rate": 5.9523630730627255e-06, + "loss": 0.4756, + "step": 7920 + }, + { + "epoch": 0.64, + "grad_norm": 0.7903009111273711, + "learning_rate": 5.949956024378673e-06, + "loss": 0.5194, + "step": 7921 + }, + { + "epoch": 0.64, + "grad_norm": 0.8558134713436262, + "learning_rate": 5.947549256359074e-06, + "loss": 0.493, + "step": 7922 + }, + { + "epoch": 0.64, + "grad_norm": 0.8332628028313226, + "learning_rate": 5.94514276917072e-06, + "loss": 0.488, + "step": 7923 + }, + { + "epoch": 0.64, + "grad_norm": 0.9005806018388123, + "learning_rate": 5.9427365629803756e-06, + "loss": 0.5588, + "step": 7924 + }, + { + "epoch": 0.64, + "grad_norm": 0.9971919710537726, + "learning_rate": 5.940330637954783e-06, + "loss": 0.478, + "step": 7925 + }, + { + "epoch": 0.64, + "grad_norm": 0.8456390472224813, + "learning_rate": 5.937924994260682e-06, + "loss": 0.4745, + "step": 7926 + }, + { + "epoch": 0.64, + "grad_norm": 0.8541660700367373, + "learning_rate": 5.9355196320647745e-06, + "loss": 0.4937, + "step": 7927 + }, + { + "epoch": 0.64, + "grad_norm": 0.8553699609380877, + "learning_rate": 5.933114551533749e-06, + "loss": 0.4747, + "step": 7928 + }, + { + "epoch": 0.64, + "grad_norm": 0.9374042630967032, + "learning_rate": 5.930709752834281e-06, + "loss": 0.4991, + "step": 7929 + }, + { + "epoch": 0.64, + "grad_norm": 0.9049318778548395, + "learning_rate": 5.928305236133016e-06, + "loss": 0.5303, + "step": 7930 + }, + { + "epoch": 0.64, + "grad_norm": 0.9678237556822508, + "learning_rate": 5.925901001596586e-06, + "loss": 0.5392, + "step": 7931 + }, + { + "epoch": 0.64, + "grad_norm": 0.8766681404745948, + "learning_rate": 5.923497049391605e-06, + "loss": 0.4992, + "step": 7932 + }, + { + "epoch": 0.64, + "grad_norm": 1.0277711153649378, + "learning_rate": 5.9210933796846616e-06, + "loss": 0.5993, + "step": 7933 + }, + { + "epoch": 0.64, + "grad_norm": 0.9128322651045907, + "learning_rate": 5.918689992642328e-06, + "loss": 0.5476, + "step": 7934 + }, + { + "epoch": 0.64, + "grad_norm": 0.835028262630942, + "learning_rate": 5.9162868884311596e-06, + "loss": 0.448, + "step": 7935 + }, + { + "epoch": 0.64, + "grad_norm": 0.9091933878596423, + "learning_rate": 5.913884067217686e-06, + "loss": 0.4761, + "step": 7936 + }, + { + "epoch": 0.65, + "grad_norm": 0.8631763279005022, + "learning_rate": 5.911481529168421e-06, + "loss": 0.4818, + "step": 7937 + }, + { + "epoch": 0.65, + "grad_norm": 0.9481818857199612, + "learning_rate": 5.9090792744498625e-06, + "loss": 0.5641, + "step": 7938 + }, + { + "epoch": 0.65, + "grad_norm": 0.8160053258398908, + "learning_rate": 5.9066773032284804e-06, + "loss": 0.448, + "step": 7939 + }, + { + "epoch": 0.65, + "grad_norm": 0.9511732542896206, + "learning_rate": 5.9042756156707295e-06, + "loss": 0.5564, + "step": 7940 + }, + { + "epoch": 0.65, + "grad_norm": 0.929582203893334, + "learning_rate": 5.901874211943048e-06, + "loss": 0.5347, + "step": 7941 + }, + { + "epoch": 0.65, + "grad_norm": 0.8946384425547035, + "learning_rate": 5.899473092211847e-06, + "loss": 0.5131, + "step": 7942 + }, + { + "epoch": 0.65, + "grad_norm": 0.8740876502947456, + "learning_rate": 5.897072256643522e-06, + "loss": 0.4932, + "step": 7943 + }, + { + "epoch": 0.65, + "grad_norm": 0.8753668882927068, + "learning_rate": 5.894671705404453e-06, + "loss": 0.495, + "step": 7944 + }, + { + "epoch": 0.65, + "grad_norm": 0.9465339505107648, + "learning_rate": 5.892271438660993e-06, + "loss": 0.5105, + "step": 7945 + }, + { + "epoch": 0.65, + "grad_norm": 0.8751984352332609, + "learning_rate": 5.889871456579477e-06, + "loss": 0.5559, + "step": 7946 + }, + { + "epoch": 0.65, + "grad_norm": 1.0290060841753401, + "learning_rate": 5.887471759326223e-06, + "loss": 0.5856, + "step": 7947 + }, + { + "epoch": 0.65, + "grad_norm": 0.8780230357458085, + "learning_rate": 5.885072347067531e-06, + "loss": 0.5009, + "step": 7948 + }, + { + "epoch": 0.65, + "grad_norm": 0.880023849643744, + "learning_rate": 5.882673219969673e-06, + "loss": 0.5137, + "step": 7949 + }, + { + "epoch": 0.65, + "grad_norm": 0.865486218156423, + "learning_rate": 5.880274378198909e-06, + "loss": 0.4774, + "step": 7950 + }, + { + "epoch": 0.65, + "grad_norm": 0.8907322492856702, + "learning_rate": 5.877875821921479e-06, + "loss": 0.4895, + "step": 7951 + }, + { + "epoch": 0.65, + "grad_norm": 0.8918944661195479, + "learning_rate": 5.875477551303596e-06, + "loss": 0.5237, + "step": 7952 + }, + { + "epoch": 0.65, + "grad_norm": 0.9662505920258414, + "learning_rate": 5.873079566511459e-06, + "loss": 0.5327, + "step": 7953 + }, + { + "epoch": 0.65, + "grad_norm": 0.9372929223735276, + "learning_rate": 5.870681867711252e-06, + "loss": 0.4707, + "step": 7954 + }, + { + "epoch": 0.65, + "grad_norm": 0.9801069798337949, + "learning_rate": 5.868284455069124e-06, + "loss": 0.4903, + "step": 7955 + }, + { + "epoch": 0.65, + "grad_norm": 0.8265909750429461, + "learning_rate": 5.865887328751221e-06, + "loss": 0.5053, + "step": 7956 + }, + { + "epoch": 0.65, + "grad_norm": 0.8773827747816701, + "learning_rate": 5.86349048892366e-06, + "loss": 0.5323, + "step": 7957 + }, + { + "epoch": 0.65, + "grad_norm": 0.9125462407522994, + "learning_rate": 5.8610939357525365e-06, + "loss": 0.457, + "step": 7958 + }, + { + "epoch": 0.65, + "grad_norm": 0.9009283341814897, + "learning_rate": 5.8586976694039325e-06, + "loss": 0.4907, + "step": 7959 + }, + { + "epoch": 0.65, + "grad_norm": 0.8862610572080457, + "learning_rate": 5.856301690043909e-06, + "loss": 0.4928, + "step": 7960 + }, + { + "epoch": 0.65, + "grad_norm": 0.8271397349442127, + "learning_rate": 5.8539059978385e-06, + "loss": 0.4825, + "step": 7961 + }, + { + "epoch": 0.65, + "grad_norm": 1.0027140353575406, + "learning_rate": 5.851510592953729e-06, + "loss": 0.5539, + "step": 7962 + }, + { + "epoch": 0.65, + "grad_norm": 0.9227278978093961, + "learning_rate": 5.849115475555596e-06, + "loss": 0.4464, + "step": 7963 + }, + { + "epoch": 0.65, + "grad_norm": 0.872548999788522, + "learning_rate": 5.846720645810073e-06, + "loss": 0.4953, + "step": 7964 + }, + { + "epoch": 0.65, + "grad_norm": 0.8580393334113791, + "learning_rate": 5.844326103883131e-06, + "loss": 0.4775, + "step": 7965 + }, + { + "epoch": 0.65, + "grad_norm": 1.0315902936759194, + "learning_rate": 5.841931849940704e-06, + "loss": 0.494, + "step": 7966 + }, + { + "epoch": 0.65, + "grad_norm": 1.0272260657495227, + "learning_rate": 5.839537884148707e-06, + "loss": 0.5617, + "step": 7967 + }, + { + "epoch": 0.65, + "grad_norm": 0.8998302210454385, + "learning_rate": 5.837144206673049e-06, + "loss": 0.4609, + "step": 7968 + }, + { + "epoch": 0.65, + "grad_norm": 0.9546931742771414, + "learning_rate": 5.834750817679606e-06, + "loss": 0.5295, + "step": 7969 + }, + { + "epoch": 0.65, + "grad_norm": 0.9181160691313232, + "learning_rate": 5.832357717334229e-06, + "loss": 0.5299, + "step": 7970 + }, + { + "epoch": 0.65, + "grad_norm": 0.866694492393031, + "learning_rate": 5.829964905802774e-06, + "loss": 0.4898, + "step": 7971 + }, + { + "epoch": 0.65, + "grad_norm": 0.829496100447018, + "learning_rate": 5.827572383251048e-06, + "loss": 0.5281, + "step": 7972 + }, + { + "epoch": 0.65, + "grad_norm": 0.8751261025322412, + "learning_rate": 5.825180149844856e-06, + "loss": 0.4339, + "step": 7973 + }, + { + "epoch": 0.65, + "grad_norm": 0.8704829205465353, + "learning_rate": 5.822788205749974e-06, + "loss": 0.4777, + "step": 7974 + }, + { + "epoch": 0.65, + "grad_norm": 0.849680191816542, + "learning_rate": 5.82039655113217e-06, + "loss": 0.4442, + "step": 7975 + }, + { + "epoch": 0.65, + "grad_norm": 0.8376653229107791, + "learning_rate": 5.8180051861571695e-06, + "loss": 0.4852, + "step": 7976 + }, + { + "epoch": 0.65, + "grad_norm": 0.8389294368790021, + "learning_rate": 5.815614110990708e-06, + "loss": 0.462, + "step": 7977 + }, + { + "epoch": 0.65, + "grad_norm": 0.8807585814155449, + "learning_rate": 5.813223325798473e-06, + "loss": 0.5042, + "step": 7978 + }, + { + "epoch": 0.65, + "grad_norm": 1.000691488112414, + "learning_rate": 5.810832830746147e-06, + "loss": 0.4713, + "step": 7979 + }, + { + "epoch": 0.65, + "grad_norm": 0.8532742444248469, + "learning_rate": 5.8084426259993905e-06, + "loss": 0.4491, + "step": 7980 + }, + { + "epoch": 0.65, + "grad_norm": 0.9100918922284562, + "learning_rate": 5.8060527117238475e-06, + "loss": 0.5095, + "step": 7981 + }, + { + "epoch": 0.65, + "grad_norm": 0.8929274143229312, + "learning_rate": 5.803663088085123e-06, + "loss": 0.5257, + "step": 7982 + }, + { + "epoch": 0.65, + "grad_norm": 0.8455248193661312, + "learning_rate": 5.801273755248831e-06, + "loss": 0.477, + "step": 7983 + }, + { + "epoch": 0.65, + "grad_norm": 0.8811988221328768, + "learning_rate": 5.798884713380542e-06, + "loss": 0.5042, + "step": 7984 + }, + { + "epoch": 0.65, + "grad_norm": 0.967260471731767, + "learning_rate": 5.796495962645814e-06, + "loss": 0.5802, + "step": 7985 + }, + { + "epoch": 0.65, + "grad_norm": 0.98700331385454, + "learning_rate": 5.794107503210187e-06, + "loss": 0.5205, + "step": 7986 + }, + { + "epoch": 0.65, + "grad_norm": 0.902464167796891, + "learning_rate": 5.791719335239185e-06, + "loss": 0.5258, + "step": 7987 + }, + { + "epoch": 0.65, + "grad_norm": 0.852618831364867, + "learning_rate": 5.7893314588982905e-06, + "loss": 0.4333, + "step": 7988 + }, + { + "epoch": 0.65, + "grad_norm": 0.9179412089940903, + "learning_rate": 5.7869438743529994e-06, + "loss": 0.5615, + "step": 7989 + }, + { + "epoch": 0.65, + "grad_norm": 0.9335365738514484, + "learning_rate": 5.784556581768757e-06, + "loss": 0.5715, + "step": 7990 + }, + { + "epoch": 0.65, + "grad_norm": 0.7787335792129625, + "learning_rate": 5.782169581311006e-06, + "loss": 0.478, + "step": 7991 + }, + { + "epoch": 0.65, + "grad_norm": 0.9464878407470988, + "learning_rate": 5.7797828731451596e-06, + "loss": 0.4992, + "step": 7992 + }, + { + "epoch": 0.65, + "grad_norm": 0.9479587984475946, + "learning_rate": 5.7773964574366185e-06, + "loss": 0.5334, + "step": 7993 + }, + { + "epoch": 0.65, + "grad_norm": 0.8369870158085331, + "learning_rate": 5.7750103343507565e-06, + "loss": 0.4922, + "step": 7994 + }, + { + "epoch": 0.65, + "grad_norm": 0.8797202532805338, + "learning_rate": 5.772624504052935e-06, + "loss": 0.5185, + "step": 7995 + }, + { + "epoch": 0.65, + "grad_norm": 0.8525431115226239, + "learning_rate": 5.770238966708482e-06, + "loss": 0.4396, + "step": 7996 + }, + { + "epoch": 0.65, + "grad_norm": 0.8919277953759788, + "learning_rate": 5.767853722482717e-06, + "loss": 0.4865, + "step": 7997 + }, + { + "epoch": 0.65, + "grad_norm": 0.924505401141746, + "learning_rate": 5.765468771540934e-06, + "loss": 0.5014, + "step": 7998 + }, + { + "epoch": 0.65, + "grad_norm": 0.9132203193112512, + "learning_rate": 5.763084114048409e-06, + "loss": 0.4474, + "step": 7999 + }, + { + "epoch": 0.65, + "grad_norm": 0.9793395084081031, + "learning_rate": 5.7606997501703975e-06, + "loss": 0.5358, + "step": 8000 + }, + { + "epoch": 0.65, + "grad_norm": 0.9415267497989768, + "learning_rate": 5.758315680072137e-06, + "loss": 0.4871, + "step": 8001 + }, + { + "epoch": 0.65, + "grad_norm": 0.899446816115677, + "learning_rate": 5.755931903918835e-06, + "loss": 0.4913, + "step": 8002 + }, + { + "epoch": 0.65, + "grad_norm": 0.8433015727474659, + "learning_rate": 5.753548421875686e-06, + "loss": 0.4995, + "step": 8003 + }, + { + "epoch": 0.65, + "grad_norm": 0.928114233175902, + "learning_rate": 5.751165234107864e-06, + "loss": 0.5294, + "step": 8004 + }, + { + "epoch": 0.65, + "grad_norm": 0.7961058992253185, + "learning_rate": 5.748782340780523e-06, + "loss": 0.4393, + "step": 8005 + }, + { + "epoch": 0.65, + "grad_norm": 0.9787936172926667, + "learning_rate": 5.746399742058796e-06, + "loss": 0.5387, + "step": 8006 + }, + { + "epoch": 0.65, + "grad_norm": 0.8751511295227653, + "learning_rate": 5.744017438107796e-06, + "loss": 0.4633, + "step": 8007 + }, + { + "epoch": 0.65, + "grad_norm": 0.9077415186497616, + "learning_rate": 5.741635429092611e-06, + "loss": 0.5377, + "step": 8008 + }, + { + "epoch": 0.65, + "grad_norm": 0.8952564586058097, + "learning_rate": 5.7392537151783125e-06, + "loss": 0.5049, + "step": 8009 + }, + { + "epoch": 0.65, + "grad_norm": 0.8646877275625106, + "learning_rate": 5.736872296529952e-06, + "loss": 0.4616, + "step": 8010 + }, + { + "epoch": 0.65, + "grad_norm": 1.022240202103064, + "learning_rate": 5.734491173312559e-06, + "loss": 0.5524, + "step": 8011 + }, + { + "epoch": 0.65, + "grad_norm": 0.8915803775655604, + "learning_rate": 5.732110345691146e-06, + "loss": 0.511, + "step": 8012 + }, + { + "epoch": 0.65, + "grad_norm": 0.8479179390848613, + "learning_rate": 5.7297298138307e-06, + "loss": 0.4507, + "step": 8013 + }, + { + "epoch": 0.65, + "grad_norm": 0.9476076758828081, + "learning_rate": 5.727349577896194e-06, + "loss": 0.4865, + "step": 8014 + }, + { + "epoch": 0.65, + "grad_norm": 0.8676077389176223, + "learning_rate": 5.724969638052569e-06, + "loss": 0.4834, + "step": 8015 + }, + { + "epoch": 0.65, + "grad_norm": 0.9255473966779431, + "learning_rate": 5.722589994464754e-06, + "loss": 0.5681, + "step": 8016 + }, + { + "epoch": 0.65, + "grad_norm": 0.8470769391341288, + "learning_rate": 5.72021064729766e-06, + "loss": 0.5295, + "step": 8017 + }, + { + "epoch": 0.65, + "grad_norm": 0.8572510515489944, + "learning_rate": 5.717831596716173e-06, + "loss": 0.487, + "step": 8018 + }, + { + "epoch": 0.65, + "grad_norm": 1.0118769672041892, + "learning_rate": 5.715452842885157e-06, + "loss": 0.5201, + "step": 8019 + }, + { + "epoch": 0.65, + "grad_norm": 0.9132906756275612, + "learning_rate": 5.713074385969457e-06, + "loss": 0.5172, + "step": 8020 + }, + { + "epoch": 0.65, + "grad_norm": 0.8406249617881854, + "learning_rate": 5.710696226133905e-06, + "loss": 0.4563, + "step": 8021 + }, + { + "epoch": 0.65, + "grad_norm": 0.9288896517388993, + "learning_rate": 5.708318363543297e-06, + "loss": 0.4847, + "step": 8022 + }, + { + "epoch": 0.65, + "grad_norm": 0.9714000918656338, + "learning_rate": 5.705940798362417e-06, + "loss": 0.5423, + "step": 8023 + }, + { + "epoch": 0.65, + "grad_norm": 0.7738297426860237, + "learning_rate": 5.703563530756033e-06, + "loss": 0.4568, + "step": 8024 + }, + { + "epoch": 0.65, + "grad_norm": 0.9015248782470349, + "learning_rate": 5.701186560888885e-06, + "loss": 0.4834, + "step": 8025 + }, + { + "epoch": 0.65, + "grad_norm": 0.9615545542578514, + "learning_rate": 5.698809888925696e-06, + "loss": 0.4691, + "step": 8026 + }, + { + "epoch": 0.65, + "grad_norm": 0.8640647567153519, + "learning_rate": 5.696433515031169e-06, + "loss": 0.4209, + "step": 8027 + }, + { + "epoch": 0.65, + "grad_norm": 0.8546615520462587, + "learning_rate": 5.694057439369979e-06, + "loss": 0.4583, + "step": 8028 + }, + { + "epoch": 0.65, + "grad_norm": 0.8921814771572696, + "learning_rate": 5.6916816621067895e-06, + "loss": 0.5259, + "step": 8029 + }, + { + "epoch": 0.65, + "grad_norm": 0.8644158011998065, + "learning_rate": 5.689306183406238e-06, + "loss": 0.5409, + "step": 8030 + }, + { + "epoch": 0.65, + "grad_norm": 0.9855746769335014, + "learning_rate": 5.686931003432945e-06, + "loss": 0.498, + "step": 8031 + }, + { + "epoch": 0.65, + "grad_norm": 0.8982014572232642, + "learning_rate": 5.684556122351508e-06, + "loss": 0.4861, + "step": 8032 + }, + { + "epoch": 0.65, + "grad_norm": 0.925016604280845, + "learning_rate": 5.682181540326503e-06, + "loss": 0.4698, + "step": 8033 + }, + { + "epoch": 0.65, + "grad_norm": 0.9717081180975732, + "learning_rate": 5.679807257522493e-06, + "loss": 0.4723, + "step": 8034 + }, + { + "epoch": 0.65, + "grad_norm": 0.8842071042921347, + "learning_rate": 5.677433274104003e-06, + "loss": 0.546, + "step": 8035 + }, + { + "epoch": 0.65, + "grad_norm": 0.9502961755883366, + "learning_rate": 5.675059590235553e-06, + "loss": 0.5469, + "step": 8036 + }, + { + "epoch": 0.65, + "grad_norm": 0.9981482851768781, + "learning_rate": 5.672686206081638e-06, + "loss": 0.5291, + "step": 8037 + }, + { + "epoch": 0.65, + "grad_norm": 0.9835649935761642, + "learning_rate": 5.67031312180673e-06, + "loss": 0.5477, + "step": 8038 + }, + { + "epoch": 0.65, + "grad_norm": 0.8502872797613393, + "learning_rate": 5.6679403375752816e-06, + "loss": 0.5428, + "step": 8039 + }, + { + "epoch": 0.65, + "grad_norm": 0.9344665940312018, + "learning_rate": 5.66556785355173e-06, + "loss": 0.5195, + "step": 8040 + }, + { + "epoch": 0.65, + "grad_norm": 0.9192075507681627, + "learning_rate": 5.663195669900479e-06, + "loss": 0.4879, + "step": 8041 + }, + { + "epoch": 0.65, + "grad_norm": 0.986064051073003, + "learning_rate": 5.6608237867859184e-06, + "loss": 0.5824, + "step": 8042 + }, + { + "epoch": 0.65, + "grad_norm": 0.8066618730198377, + "learning_rate": 5.6584522043724226e-06, + "loss": 0.4839, + "step": 8043 + }, + { + "epoch": 0.65, + "grad_norm": 0.9818638450965491, + "learning_rate": 5.656080922824337e-06, + "loss": 0.5366, + "step": 8044 + }, + { + "epoch": 0.65, + "grad_norm": 0.8365237997024432, + "learning_rate": 5.65370994230599e-06, + "loss": 0.4576, + "step": 8045 + }, + { + "epoch": 0.65, + "grad_norm": 0.9445525770789083, + "learning_rate": 5.651339262981694e-06, + "loss": 0.5432, + "step": 8046 + }, + { + "epoch": 0.65, + "grad_norm": 0.8473995922041537, + "learning_rate": 5.648968885015726e-06, + "loss": 0.4734, + "step": 8047 + }, + { + "epoch": 0.65, + "grad_norm": 0.9149444795504126, + "learning_rate": 5.646598808572355e-06, + "loss": 0.5067, + "step": 8048 + }, + { + "epoch": 0.65, + "grad_norm": 0.9382812760027435, + "learning_rate": 5.644229033815823e-06, + "loss": 0.5029, + "step": 8049 + }, + { + "epoch": 0.65, + "grad_norm": 0.9131763915953596, + "learning_rate": 5.641859560910356e-06, + "loss": 0.5207, + "step": 8050 + }, + { + "epoch": 0.65, + "grad_norm": 0.9083095218043912, + "learning_rate": 5.639490390020158e-06, + "loss": 0.4829, + "step": 8051 + }, + { + "epoch": 0.65, + "grad_norm": 0.9217744465084563, + "learning_rate": 5.637121521309411e-06, + "loss": 0.5367, + "step": 8052 + }, + { + "epoch": 0.65, + "grad_norm": 0.8950244905018768, + "learning_rate": 5.634752954942264e-06, + "loss": 0.4848, + "step": 8053 + }, + { + "epoch": 0.65, + "grad_norm": 0.9220101284527779, + "learning_rate": 5.632384691082874e-06, + "loss": 0.4886, + "step": 8054 + }, + { + "epoch": 0.65, + "grad_norm": 0.8510786318647385, + "learning_rate": 5.630016729895346e-06, + "loss": 0.4604, + "step": 8055 + }, + { + "epoch": 0.65, + "grad_norm": 1.0006639059946054, + "learning_rate": 5.627649071543784e-06, + "loss": 0.5222, + "step": 8056 + }, + { + "epoch": 0.65, + "grad_norm": 0.9580343958505725, + "learning_rate": 5.6252817161922616e-06, + "loss": 0.5535, + "step": 8057 + }, + { + "epoch": 0.65, + "grad_norm": 0.9379452625036485, + "learning_rate": 5.6229146640048415e-06, + "loss": 0.5168, + "step": 8058 + }, + { + "epoch": 0.65, + "grad_norm": 0.872992477532868, + "learning_rate": 5.620547915145542e-06, + "loss": 0.5281, + "step": 8059 + }, + { + "epoch": 0.66, + "grad_norm": 0.7956540694611413, + "learning_rate": 5.618181469778399e-06, + "loss": 0.4395, + "step": 8060 + }, + { + "epoch": 0.66, + "grad_norm": 0.8921079894021173, + "learning_rate": 5.615815328067387e-06, + "loss": 0.4747, + "step": 8061 + }, + { + "epoch": 0.66, + "grad_norm": 0.8262492322669378, + "learning_rate": 5.613449490176484e-06, + "loss": 0.4567, + "step": 8062 + }, + { + "epoch": 0.66, + "grad_norm": 0.8583148399648072, + "learning_rate": 5.6110839562696404e-06, + "loss": 0.4694, + "step": 8063 + }, + { + "epoch": 0.66, + "grad_norm": 0.9328997683673895, + "learning_rate": 5.608718726510791e-06, + "loss": 0.5111, + "step": 8064 + }, + { + "epoch": 0.66, + "grad_norm": 0.8097909207413837, + "learning_rate": 5.60635380106383e-06, + "loss": 0.4543, + "step": 8065 + }, + { + "epoch": 0.66, + "grad_norm": 0.863117123650207, + "learning_rate": 5.603989180092661e-06, + "loss": 0.4795, + "step": 8066 + }, + { + "epoch": 0.66, + "grad_norm": 0.9068547480911131, + "learning_rate": 5.6016248637611395e-06, + "loss": 0.4863, + "step": 8067 + }, + { + "epoch": 0.66, + "grad_norm": 0.8251512618816547, + "learning_rate": 5.5992608522331126e-06, + "loss": 0.4601, + "step": 8068 + }, + { + "epoch": 0.66, + "grad_norm": 0.9337837799868443, + "learning_rate": 5.596897145672407e-06, + "loss": 0.537, + "step": 8069 + }, + { + "epoch": 0.66, + "grad_norm": 0.8894318690847395, + "learning_rate": 5.594533744242825e-06, + "loss": 0.4685, + "step": 8070 + }, + { + "epoch": 0.66, + "grad_norm": 0.8633650959858853, + "learning_rate": 5.5921706481081405e-06, + "loss": 0.536, + "step": 8071 + }, + { + "epoch": 0.66, + "grad_norm": 0.9743246388023774, + "learning_rate": 5.589807857432128e-06, + "loss": 0.5396, + "step": 8072 + }, + { + "epoch": 0.66, + "grad_norm": 0.9548473977649928, + "learning_rate": 5.587445372378515e-06, + "loss": 0.5391, + "step": 8073 + }, + { + "epoch": 0.66, + "grad_norm": 0.8957082635019312, + "learning_rate": 5.585083193111025e-06, + "loss": 0.4466, + "step": 8074 + }, + { + "epoch": 0.66, + "grad_norm": 0.9149137254325227, + "learning_rate": 5.582721319793351e-06, + "loss": 0.4543, + "step": 8075 + }, + { + "epoch": 0.66, + "grad_norm": 0.9338244908287366, + "learning_rate": 5.580359752589178e-06, + "loss": 0.4991, + "step": 8076 + }, + { + "epoch": 0.66, + "grad_norm": 0.9282335254284353, + "learning_rate": 5.5779984916621455e-06, + "loss": 0.5303, + "step": 8077 + }, + { + "epoch": 0.66, + "grad_norm": 0.9862482780242752, + "learning_rate": 5.575637537175902e-06, + "loss": 0.4957, + "step": 8078 + }, + { + "epoch": 0.66, + "grad_norm": 0.7528909278795783, + "learning_rate": 5.57327688929405e-06, + "loss": 0.4195, + "step": 8079 + }, + { + "epoch": 0.66, + "grad_norm": 0.9557139196501997, + "learning_rate": 5.570916548180183e-06, + "loss": 0.4914, + "step": 8080 + }, + { + "epoch": 0.66, + "grad_norm": 0.8315133258006441, + "learning_rate": 5.568556513997869e-06, + "loss": 0.4918, + "step": 8081 + }, + { + "epoch": 0.66, + "grad_norm": 1.0828275890412324, + "learning_rate": 5.566196786910665e-06, + "loss": 0.564, + "step": 8082 + }, + { + "epoch": 0.66, + "grad_norm": 0.9521834237171527, + "learning_rate": 5.56383736708208e-06, + "loss": 0.4365, + "step": 8083 + }, + { + "epoch": 0.66, + "grad_norm": 0.8505018319958945, + "learning_rate": 5.561478254675639e-06, + "loss": 0.4845, + "step": 8084 + }, + { + "epoch": 0.66, + "grad_norm": 0.9605289024212322, + "learning_rate": 5.559119449854815e-06, + "loss": 0.5306, + "step": 8085 + }, + { + "epoch": 0.66, + "grad_norm": 1.0344634196486486, + "learning_rate": 5.556760952783073e-06, + "loss": 0.4886, + "step": 8086 + }, + { + "epoch": 0.66, + "grad_norm": 0.9630866882564898, + "learning_rate": 5.554402763623857e-06, + "loss": 0.4603, + "step": 8087 + }, + { + "epoch": 0.66, + "grad_norm": 0.8845755841128609, + "learning_rate": 5.55204488254059e-06, + "loss": 0.4963, + "step": 8088 + }, + { + "epoch": 0.66, + "grad_norm": 0.927336189724174, + "learning_rate": 5.549687309696658e-06, + "loss": 0.5091, + "step": 8089 + }, + { + "epoch": 0.66, + "grad_norm": 0.9453205589847619, + "learning_rate": 5.547330045255458e-06, + "loss": 0.5375, + "step": 8090 + }, + { + "epoch": 0.66, + "grad_norm": 0.9241048097039017, + "learning_rate": 5.5449730893803326e-06, + "loss": 0.4709, + "step": 8091 + }, + { + "epoch": 0.66, + "grad_norm": 0.8701604477982022, + "learning_rate": 5.542616442234618e-06, + "loss": 0.4896, + "step": 8092 + }, + { + "epoch": 0.66, + "grad_norm": 1.0247372373451231, + "learning_rate": 5.5402601039816315e-06, + "loss": 0.5078, + "step": 8093 + }, + { + "epoch": 0.66, + "grad_norm": 0.9368300927082287, + "learning_rate": 5.537904074784668e-06, + "loss": 0.5267, + "step": 8094 + }, + { + "epoch": 0.66, + "grad_norm": 0.8953371671011336, + "learning_rate": 5.5355483548069866e-06, + "loss": 0.4929, + "step": 8095 + }, + { + "epoch": 0.66, + "grad_norm": 0.8632314786605364, + "learning_rate": 5.533192944211852e-06, + "loss": 0.4403, + "step": 8096 + }, + { + "epoch": 0.66, + "grad_norm": 0.9249710661785733, + "learning_rate": 5.53083784316248e-06, + "loss": 0.5305, + "step": 8097 + }, + { + "epoch": 0.66, + "grad_norm": 0.9478487611876848, + "learning_rate": 5.528483051822082e-06, + "loss": 0.485, + "step": 8098 + }, + { + "epoch": 0.66, + "grad_norm": 1.1189668918359563, + "learning_rate": 5.526128570353842e-06, + "loss": 0.5108, + "step": 8099 + }, + { + "epoch": 0.66, + "grad_norm": 1.011156273732329, + "learning_rate": 5.523774398920927e-06, + "loss": 0.5158, + "step": 8100 + }, + { + "epoch": 0.66, + "grad_norm": 0.8582598832505969, + "learning_rate": 5.521420537686468e-06, + "loss": 0.5109, + "step": 8101 + }, + { + "epoch": 0.66, + "grad_norm": 0.890431587765962, + "learning_rate": 5.519066986813602e-06, + "loss": 0.5406, + "step": 8102 + }, + { + "epoch": 0.66, + "grad_norm": 0.9234301484449461, + "learning_rate": 5.5167137464654155e-06, + "loss": 0.5037, + "step": 8103 + }, + { + "epoch": 0.66, + "grad_norm": 0.8827748934064557, + "learning_rate": 5.514360816804989e-06, + "loss": 0.5078, + "step": 8104 + }, + { + "epoch": 0.66, + "grad_norm": 0.9646521521805869, + "learning_rate": 5.512008197995379e-06, + "loss": 0.5017, + "step": 8105 + }, + { + "epoch": 0.66, + "grad_norm": 0.9690885937124468, + "learning_rate": 5.5096558901996235e-06, + "loss": 0.5432, + "step": 8106 + }, + { + "epoch": 0.66, + "grad_norm": 0.8448217293276843, + "learning_rate": 5.507303893580724e-06, + "loss": 0.4992, + "step": 8107 + }, + { + "epoch": 0.66, + "grad_norm": 0.9631252639189662, + "learning_rate": 5.5049522083016895e-06, + "loss": 0.5089, + "step": 8108 + }, + { + "epoch": 0.66, + "grad_norm": 0.9643236838561009, + "learning_rate": 5.502600834525475e-06, + "loss": 0.5508, + "step": 8109 + }, + { + "epoch": 0.66, + "grad_norm": 0.9953018292367835, + "learning_rate": 5.500249772415033e-06, + "loss": 0.5355, + "step": 8110 + }, + { + "epoch": 0.66, + "grad_norm": 0.8863125003391487, + "learning_rate": 5.49789902213329e-06, + "loss": 0.543, + "step": 8111 + }, + { + "epoch": 0.66, + "grad_norm": 0.8268057293234355, + "learning_rate": 5.495548583843155e-06, + "loss": 0.4978, + "step": 8112 + }, + { + "epoch": 0.66, + "grad_norm": 0.8258851400525931, + "learning_rate": 5.493198457707503e-06, + "loss": 0.4619, + "step": 8113 + }, + { + "epoch": 0.66, + "grad_norm": 0.9138777270458119, + "learning_rate": 5.490848643889205e-06, + "loss": 0.4819, + "step": 8114 + }, + { + "epoch": 0.66, + "grad_norm": 0.8789485096300262, + "learning_rate": 5.488499142551094e-06, + "loss": 0.4584, + "step": 8115 + }, + { + "epoch": 0.66, + "grad_norm": 0.9067069193255343, + "learning_rate": 5.48614995385599e-06, + "loss": 0.5312, + "step": 8116 + }, + { + "epoch": 0.66, + "grad_norm": 0.9639711693883865, + "learning_rate": 5.48380107796669e-06, + "loss": 0.474, + "step": 8117 + }, + { + "epoch": 0.66, + "grad_norm": 0.9871386684830713, + "learning_rate": 5.481452515045974e-06, + "loss": 0.5496, + "step": 8118 + }, + { + "epoch": 0.66, + "grad_norm": 0.856246256936417, + "learning_rate": 5.479104265256583e-06, + "loss": 0.4469, + "step": 8119 + }, + { + "epoch": 0.66, + "grad_norm": 0.9666843510279574, + "learning_rate": 5.476756328761264e-06, + "loss": 0.4889, + "step": 8120 + }, + { + "epoch": 0.66, + "grad_norm": 0.8589689713812814, + "learning_rate": 5.474408705722716e-06, + "loss": 0.4909, + "step": 8121 + }, + { + "epoch": 0.66, + "grad_norm": 0.8871792197882404, + "learning_rate": 5.47206139630363e-06, + "loss": 0.4712, + "step": 8122 + }, + { + "epoch": 0.66, + "grad_norm": 0.8684447485140943, + "learning_rate": 5.469714400666673e-06, + "loss": 0.4664, + "step": 8123 + }, + { + "epoch": 0.66, + "grad_norm": 0.9112182769895082, + "learning_rate": 5.467367718974492e-06, + "loss": 0.5288, + "step": 8124 + }, + { + "epoch": 0.66, + "grad_norm": 0.8856249596176711, + "learning_rate": 5.465021351389702e-06, + "loss": 0.5061, + "step": 8125 + }, + { + "epoch": 0.66, + "grad_norm": 1.0244913612133435, + "learning_rate": 5.462675298074918e-06, + "loss": 0.5295, + "step": 8126 + }, + { + "epoch": 0.66, + "grad_norm": 0.8781385330939273, + "learning_rate": 5.460329559192705e-06, + "loss": 0.4622, + "step": 8127 + }, + { + "epoch": 0.66, + "grad_norm": 0.9235091415547136, + "learning_rate": 5.4579841349056285e-06, + "loss": 0.548, + "step": 8128 + }, + { + "epoch": 0.66, + "grad_norm": 0.8837638906811883, + "learning_rate": 5.455639025376223e-06, + "loss": 0.4918, + "step": 8129 + }, + { + "epoch": 0.66, + "grad_norm": 0.9490852904400812, + "learning_rate": 5.453294230767005e-06, + "loss": 0.5258, + "step": 8130 + }, + { + "epoch": 0.66, + "grad_norm": 0.8936978817795787, + "learning_rate": 5.450949751240456e-06, + "loss": 0.5531, + "step": 8131 + }, + { + "epoch": 0.66, + "grad_norm": 0.8136657702496216, + "learning_rate": 5.448605586959063e-06, + "loss": 0.419, + "step": 8132 + }, + { + "epoch": 0.66, + "grad_norm": 0.8734233583149454, + "learning_rate": 5.446261738085261e-06, + "loss": 0.4688, + "step": 8133 + }, + { + "epoch": 0.66, + "grad_norm": 0.9808667220509426, + "learning_rate": 5.443918204781482e-06, + "loss": 0.5776, + "step": 8134 + }, + { + "epoch": 0.66, + "grad_norm": 0.9728198474455235, + "learning_rate": 5.44157498721013e-06, + "loss": 0.4703, + "step": 8135 + }, + { + "epoch": 0.66, + "grad_norm": 0.8547436340944682, + "learning_rate": 5.439232085533592e-06, + "loss": 0.4995, + "step": 8136 + }, + { + "epoch": 0.66, + "grad_norm": 1.037319351927126, + "learning_rate": 5.436889499914218e-06, + "loss": 0.5527, + "step": 8137 + }, + { + "epoch": 0.66, + "grad_norm": 0.9336300856188522, + "learning_rate": 5.43454723051436e-06, + "loss": 0.5334, + "step": 8138 + }, + { + "epoch": 0.66, + "grad_norm": 0.91337608051943, + "learning_rate": 5.432205277496327e-06, + "loss": 0.4993, + "step": 8139 + }, + { + "epoch": 0.66, + "grad_norm": 0.9486914224331876, + "learning_rate": 5.429863641022416e-06, + "loss": 0.5073, + "step": 8140 + }, + { + "epoch": 0.66, + "grad_norm": 0.9595276005602759, + "learning_rate": 5.427522321254901e-06, + "loss": 0.5485, + "step": 8141 + }, + { + "epoch": 0.66, + "grad_norm": 0.9541951421866232, + "learning_rate": 5.425181318356035e-06, + "loss": 0.5148, + "step": 8142 + }, + { + "epoch": 0.66, + "grad_norm": 0.8624392948766714, + "learning_rate": 5.4228406324880434e-06, + "loss": 0.503, + "step": 8143 + }, + { + "epoch": 0.66, + "grad_norm": 0.9007306472206367, + "learning_rate": 5.420500263813141e-06, + "loss": 0.5872, + "step": 8144 + }, + { + "epoch": 0.66, + "grad_norm": 0.9184533974463028, + "learning_rate": 5.418160212493501e-06, + "loss": 0.5086, + "step": 8145 + }, + { + "epoch": 0.66, + "grad_norm": 0.9301255956721959, + "learning_rate": 5.415820478691301e-06, + "loss": 0.6116, + "step": 8146 + }, + { + "epoch": 0.66, + "grad_norm": 0.8777358719757851, + "learning_rate": 5.413481062568672e-06, + "loss": 0.4959, + "step": 8147 + }, + { + "epoch": 0.66, + "grad_norm": 0.944307831775758, + "learning_rate": 5.411141964287737e-06, + "loss": 0.5362, + "step": 8148 + }, + { + "epoch": 0.66, + "grad_norm": 0.9121351906003211, + "learning_rate": 5.408803184010593e-06, + "loss": 0.544, + "step": 8149 + }, + { + "epoch": 0.66, + "grad_norm": 0.8415452275491403, + "learning_rate": 5.406464721899323e-06, + "loss": 0.497, + "step": 8150 + }, + { + "epoch": 0.66, + "grad_norm": 0.9062392888284309, + "learning_rate": 5.404126578115962e-06, + "loss": 0.5131, + "step": 8151 + }, + { + "epoch": 0.66, + "grad_norm": 0.98075689579646, + "learning_rate": 5.401788752822562e-06, + "loss": 0.5472, + "step": 8152 + }, + { + "epoch": 0.66, + "grad_norm": 0.970371527601638, + "learning_rate": 5.399451246181118e-06, + "loss": 0.5397, + "step": 8153 + }, + { + "epoch": 0.66, + "grad_norm": 0.9526438115570307, + "learning_rate": 5.397114058353623e-06, + "loss": 0.5376, + "step": 8154 + }, + { + "epoch": 0.66, + "grad_norm": 0.8734375021626799, + "learning_rate": 5.39477718950204e-06, + "loss": 0.4983, + "step": 8155 + }, + { + "epoch": 0.66, + "grad_norm": 0.9938007613913521, + "learning_rate": 5.3924406397883174e-06, + "loss": 0.536, + "step": 8156 + }, + { + "epoch": 0.66, + "grad_norm": 1.0030332858642188, + "learning_rate": 5.390104409374364e-06, + "loss": 0.5175, + "step": 8157 + }, + { + "epoch": 0.66, + "grad_norm": 0.9435118882877143, + "learning_rate": 5.3877684984220945e-06, + "loss": 0.5013, + "step": 8158 + }, + { + "epoch": 0.66, + "grad_norm": 0.9512703874552919, + "learning_rate": 5.385432907093371e-06, + "loss": 0.5388, + "step": 8159 + }, + { + "epoch": 0.66, + "grad_norm": 0.873374334708962, + "learning_rate": 5.383097635550057e-06, + "loss": 0.5086, + "step": 8160 + }, + { + "epoch": 0.66, + "grad_norm": 1.0269739332825232, + "learning_rate": 5.380762683953978e-06, + "loss": 0.464, + "step": 8161 + }, + { + "epoch": 0.66, + "grad_norm": 0.8972458312913809, + "learning_rate": 5.37842805246695e-06, + "loss": 0.5318, + "step": 8162 + }, + { + "epoch": 0.66, + "grad_norm": 0.987479872006018, + "learning_rate": 5.376093741250758e-06, + "loss": 0.522, + "step": 8163 + }, + { + "epoch": 0.66, + "grad_norm": 0.8746417642079718, + "learning_rate": 5.373759750467173e-06, + "loss": 0.4816, + "step": 8164 + }, + { + "epoch": 0.66, + "grad_norm": 0.9936009159063266, + "learning_rate": 5.371426080277928e-06, + "loss": 0.5823, + "step": 8165 + }, + { + "epoch": 0.66, + "grad_norm": 0.8921448928229553, + "learning_rate": 5.369092730844752e-06, + "loss": 0.5245, + "step": 8166 + }, + { + "epoch": 0.66, + "grad_norm": 0.8386162750995261, + "learning_rate": 5.366759702329343e-06, + "loss": 0.4778, + "step": 8167 + }, + { + "epoch": 0.66, + "grad_norm": 0.8330787084771702, + "learning_rate": 5.364426994893375e-06, + "loss": 0.483, + "step": 8168 + }, + { + "epoch": 0.66, + "grad_norm": 0.9387651680510244, + "learning_rate": 5.362094608698505e-06, + "loss": 0.5306, + "step": 8169 + }, + { + "epoch": 0.66, + "grad_norm": 0.8747296879461391, + "learning_rate": 5.3597625439063685e-06, + "loss": 0.486, + "step": 8170 + }, + { + "epoch": 0.66, + "grad_norm": 0.9628154040176343, + "learning_rate": 5.357430800678568e-06, + "loss": 0.5462, + "step": 8171 + }, + { + "epoch": 0.66, + "grad_norm": 0.8612974069576437, + "learning_rate": 5.3550993791766955e-06, + "loss": 0.4286, + "step": 8172 + }, + { + "epoch": 0.66, + "grad_norm": 1.026048945583325, + "learning_rate": 5.352768279562315e-06, + "loss": 0.5305, + "step": 8173 + }, + { + "epoch": 0.66, + "grad_norm": 0.852733927613396, + "learning_rate": 5.350437501996972e-06, + "loss": 0.5464, + "step": 8174 + }, + { + "epoch": 0.66, + "grad_norm": 0.9309496207123703, + "learning_rate": 5.348107046642186e-06, + "loss": 0.535, + "step": 8175 + }, + { + "epoch": 0.66, + "grad_norm": 0.9011431585683592, + "learning_rate": 5.345776913659458e-06, + "loss": 0.4597, + "step": 8176 + }, + { + "epoch": 0.66, + "grad_norm": 0.9514530227899883, + "learning_rate": 5.343447103210257e-06, + "loss": 0.4818, + "step": 8177 + }, + { + "epoch": 0.66, + "grad_norm": 0.9556001417671066, + "learning_rate": 5.341117615456044e-06, + "loss": 0.482, + "step": 8178 + }, + { + "epoch": 0.66, + "grad_norm": 0.9221886068743322, + "learning_rate": 5.338788450558246e-06, + "loss": 0.4494, + "step": 8179 + }, + { + "epoch": 0.66, + "grad_norm": 0.9579423098386334, + "learning_rate": 5.336459608678275e-06, + "loss": 0.5036, + "step": 8180 + }, + { + "epoch": 0.66, + "grad_norm": 0.9695762173510455, + "learning_rate": 5.334131089977516e-06, + "loss": 0.5367, + "step": 8181 + }, + { + "epoch": 0.66, + "grad_norm": 0.9761452544087317, + "learning_rate": 5.331802894617333e-06, + "loss": 0.5363, + "step": 8182 + }, + { + "epoch": 0.67, + "grad_norm": 0.9141851363212332, + "learning_rate": 5.329475022759074e-06, + "loss": 0.5249, + "step": 8183 + }, + { + "epoch": 0.67, + "grad_norm": 0.9093173516423304, + "learning_rate": 5.32714747456405e-06, + "loss": 0.4814, + "step": 8184 + }, + { + "epoch": 0.67, + "grad_norm": 0.8240847030191094, + "learning_rate": 5.324820250193559e-06, + "loss": 0.4586, + "step": 8185 + }, + { + "epoch": 0.67, + "grad_norm": 0.9576069962900918, + "learning_rate": 5.322493349808878e-06, + "loss": 0.5164, + "step": 8186 + }, + { + "epoch": 0.67, + "grad_norm": 0.8550129467805008, + "learning_rate": 5.32016677357126e-06, + "loss": 0.4515, + "step": 8187 + }, + { + "epoch": 0.67, + "grad_norm": 0.989239559909075, + "learning_rate": 5.3178405216419325e-06, + "loss": 0.5694, + "step": 8188 + }, + { + "epoch": 0.67, + "grad_norm": 0.8402449147196775, + "learning_rate": 5.31551459418211e-06, + "loss": 0.4815, + "step": 8189 + }, + { + "epoch": 0.67, + "grad_norm": 0.929650535088433, + "learning_rate": 5.313188991352964e-06, + "loss": 0.4851, + "step": 8190 + }, + { + "epoch": 0.67, + "grad_norm": 0.8608747172187374, + "learning_rate": 5.310863713315666e-06, + "loss": 0.5316, + "step": 8191 + }, + { + "epoch": 0.67, + "grad_norm": 0.8601414518555403, + "learning_rate": 5.308538760231352e-06, + "loss": 0.48, + "step": 8192 + }, + { + "epoch": 0.67, + "grad_norm": 0.8621843116643283, + "learning_rate": 5.306214132261141e-06, + "loss": 0.5108, + "step": 8193 + }, + { + "epoch": 0.67, + "grad_norm": 0.9873625363606519, + "learning_rate": 5.303889829566128e-06, + "loss": 0.5105, + "step": 8194 + }, + { + "epoch": 0.67, + "grad_norm": 0.9533788876320213, + "learning_rate": 5.301565852307388e-06, + "loss": 0.562, + "step": 8195 + }, + { + "epoch": 0.67, + "grad_norm": 0.9941344908746977, + "learning_rate": 5.299242200645959e-06, + "loss": 0.5008, + "step": 8196 + }, + { + "epoch": 0.67, + "grad_norm": 0.8709593447651172, + "learning_rate": 5.296918874742882e-06, + "loss": 0.3924, + "step": 8197 + }, + { + "epoch": 0.67, + "grad_norm": 0.889954834129633, + "learning_rate": 5.294595874759154e-06, + "loss": 0.483, + "step": 8198 + }, + { + "epoch": 0.67, + "grad_norm": 0.8876593394083251, + "learning_rate": 5.292273200855758e-06, + "loss": 0.4491, + "step": 8199 + }, + { + "epoch": 0.67, + "grad_norm": 0.8524918627473157, + "learning_rate": 5.2899508531936526e-06, + "loss": 0.5124, + "step": 8200 + }, + { + "epoch": 0.67, + "grad_norm": 0.9321934019395298, + "learning_rate": 5.2876288319337785e-06, + "loss": 0.5354, + "step": 8201 + }, + { + "epoch": 0.67, + "grad_norm": 0.9706937675078717, + "learning_rate": 5.285307137237039e-06, + "loss": 0.5392, + "step": 8202 + }, + { + "epoch": 0.67, + "grad_norm": 0.888202310462275, + "learning_rate": 5.282985769264342e-06, + "loss": 0.4573, + "step": 8203 + }, + { + "epoch": 0.67, + "grad_norm": 0.9661150146788169, + "learning_rate": 5.280664728176542e-06, + "loss": 0.5915, + "step": 8204 + }, + { + "epoch": 0.67, + "grad_norm": 0.8687295099552259, + "learning_rate": 5.278344014134491e-06, + "loss": 0.4291, + "step": 8205 + }, + { + "epoch": 0.67, + "grad_norm": 0.9305137050651606, + "learning_rate": 5.276023627299011e-06, + "loss": 0.545, + "step": 8206 + }, + { + "epoch": 0.67, + "grad_norm": 0.9249269297925938, + "learning_rate": 5.273703567830908e-06, + "loss": 0.5775, + "step": 8207 + }, + { + "epoch": 0.67, + "grad_norm": 0.8719197817568556, + "learning_rate": 5.271383835890947e-06, + "loss": 0.5072, + "step": 8208 + }, + { + "epoch": 0.67, + "grad_norm": 1.035278658434689, + "learning_rate": 5.269064431639901e-06, + "loss": 0.5533, + "step": 8209 + }, + { + "epoch": 0.67, + "grad_norm": 0.8978940252138314, + "learning_rate": 5.266745355238489e-06, + "loss": 0.5206, + "step": 8210 + }, + { + "epoch": 0.67, + "grad_norm": 0.9912583159987712, + "learning_rate": 5.264426606847426e-06, + "loss": 0.5126, + "step": 8211 + }, + { + "epoch": 0.67, + "grad_norm": 0.9517846552457596, + "learning_rate": 5.262108186627397e-06, + "loss": 0.5587, + "step": 8212 + }, + { + "epoch": 0.67, + "grad_norm": 0.8812396536972811, + "learning_rate": 5.259790094739073e-06, + "loss": 0.4878, + "step": 8213 + }, + { + "epoch": 0.67, + "grad_norm": 0.9195417195160543, + "learning_rate": 5.257472331343083e-06, + "loss": 0.5044, + "step": 8214 + }, + { + "epoch": 0.67, + "grad_norm": 0.8872421603933094, + "learning_rate": 5.2551548966000635e-06, + "loss": 0.5249, + "step": 8215 + }, + { + "epoch": 0.67, + "grad_norm": 0.9310784935184504, + "learning_rate": 5.252837790670595e-06, + "loss": 0.5033, + "step": 8216 + }, + { + "epoch": 0.67, + "grad_norm": 0.997219302064515, + "learning_rate": 5.250521013715257e-06, + "loss": 0.5841, + "step": 8217 + }, + { + "epoch": 0.67, + "grad_norm": 0.9227493754424385, + "learning_rate": 5.2482045658946e-06, + "loss": 0.5427, + "step": 8218 + }, + { + "epoch": 0.67, + "grad_norm": 0.8252507468536766, + "learning_rate": 5.245888447369157e-06, + "loss": 0.4472, + "step": 8219 + }, + { + "epoch": 0.67, + "grad_norm": 0.8750538145933262, + "learning_rate": 5.243572658299418e-06, + "loss": 0.5236, + "step": 8220 + }, + { + "epoch": 0.67, + "grad_norm": 0.8899187586334513, + "learning_rate": 5.241257198845884e-06, + "loss": 0.497, + "step": 8221 + }, + { + "epoch": 0.67, + "grad_norm": 0.9948214678414993, + "learning_rate": 5.238942069169e-06, + "loss": 0.5945, + "step": 8222 + }, + { + "epoch": 0.67, + "grad_norm": 0.9692887420393211, + "learning_rate": 5.236627269429208e-06, + "loss": 0.535, + "step": 8223 + }, + { + "epoch": 0.67, + "grad_norm": 0.990515988397188, + "learning_rate": 5.234312799786921e-06, + "loss": 0.4657, + "step": 8224 + }, + { + "epoch": 0.67, + "grad_norm": 0.9534115892704664, + "learning_rate": 5.231998660402535e-06, + "loss": 0.5732, + "step": 8225 + }, + { + "epoch": 0.67, + "grad_norm": 1.0066116250736143, + "learning_rate": 5.229684851436403e-06, + "loss": 0.4618, + "step": 8226 + }, + { + "epoch": 0.67, + "grad_norm": 1.0698888040079477, + "learning_rate": 5.2273713730488886e-06, + "loss": 0.4996, + "step": 8227 + }, + { + "epoch": 0.67, + "grad_norm": 0.9414620310160213, + "learning_rate": 5.2250582254003016e-06, + "loss": 0.4974, + "step": 8228 + }, + { + "epoch": 0.67, + "grad_norm": 0.9323123866798388, + "learning_rate": 5.222745408650942e-06, + "loss": 0.5358, + "step": 8229 + }, + { + "epoch": 0.67, + "grad_norm": 0.9696890580161104, + "learning_rate": 5.220432922961089e-06, + "loss": 0.552, + "step": 8230 + }, + { + "epoch": 0.67, + "grad_norm": 0.7697669490610657, + "learning_rate": 5.218120768491e-06, + "loss": 0.3773, + "step": 8231 + }, + { + "epoch": 0.67, + "grad_norm": 0.822697108368092, + "learning_rate": 5.215808945400891e-06, + "loss": 0.5099, + "step": 8232 + }, + { + "epoch": 0.67, + "grad_norm": 0.9367819561071206, + "learning_rate": 5.213497453850986e-06, + "loss": 0.4753, + "step": 8233 + }, + { + "epoch": 0.67, + "grad_norm": 0.9265513771122531, + "learning_rate": 5.21118629400146e-06, + "loss": 0.4978, + "step": 8234 + }, + { + "epoch": 0.67, + "grad_norm": 0.8839751231544198, + "learning_rate": 5.208875466012475e-06, + "loss": 0.4512, + "step": 8235 + }, + { + "epoch": 0.67, + "grad_norm": 0.853117385808319, + "learning_rate": 5.20656497004417e-06, + "loss": 0.4687, + "step": 8236 + }, + { + "epoch": 0.67, + "grad_norm": 0.8691450740963526, + "learning_rate": 5.2042548062566654e-06, + "loss": 0.4995, + "step": 8237 + }, + { + "epoch": 0.67, + "grad_norm": 0.9239210980343396, + "learning_rate": 5.2019449748100405e-06, + "loss": 0.4635, + "step": 8238 + }, + { + "epoch": 0.67, + "grad_norm": 0.8869835721887451, + "learning_rate": 5.19963547586438e-06, + "loss": 0.4768, + "step": 8239 + }, + { + "epoch": 0.67, + "grad_norm": 0.8739268679710483, + "learning_rate": 5.197326309579721e-06, + "loss": 0.5038, + "step": 8240 + }, + { + "epoch": 0.67, + "grad_norm": 0.7983910758111474, + "learning_rate": 5.195017476116089e-06, + "loss": 0.4672, + "step": 8241 + }, + { + "epoch": 0.67, + "grad_norm": 0.9395625394613684, + "learning_rate": 5.192708975633483e-06, + "loss": 0.5245, + "step": 8242 + }, + { + "epoch": 0.67, + "grad_norm": 0.8647830622368983, + "learning_rate": 5.190400808291884e-06, + "loss": 0.4765, + "step": 8243 + }, + { + "epoch": 0.67, + "grad_norm": 0.8477915105418694, + "learning_rate": 5.1880929742512355e-06, + "loss": 0.4986, + "step": 8244 + }, + { + "epoch": 0.67, + "grad_norm": 0.9637514843118539, + "learning_rate": 5.185785473671484e-06, + "loss": 0.5349, + "step": 8245 + }, + { + "epoch": 0.67, + "grad_norm": 0.8844263857942123, + "learning_rate": 5.183478306712525e-06, + "loss": 0.4652, + "step": 8246 + }, + { + "epoch": 0.67, + "grad_norm": 0.9204972924084194, + "learning_rate": 5.181171473534248e-06, + "loss": 0.5243, + "step": 8247 + }, + { + "epoch": 0.67, + "grad_norm": 0.987097280515057, + "learning_rate": 5.178864974296511e-06, + "loss": 0.5505, + "step": 8248 + }, + { + "epoch": 0.67, + "grad_norm": 0.8715329192132406, + "learning_rate": 5.176558809159161e-06, + "loss": 0.473, + "step": 8249 + }, + { + "epoch": 0.67, + "grad_norm": 0.7745699602973506, + "learning_rate": 5.174252978281999e-06, + "loss": 0.449, + "step": 8250 + }, + { + "epoch": 0.67, + "grad_norm": 0.9377200872321961, + "learning_rate": 5.171947481824832e-06, + "loss": 0.4785, + "step": 8251 + }, + { + "epoch": 0.67, + "grad_norm": 1.0250797101577, + "learning_rate": 5.16964231994742e-06, + "loss": 0.527, + "step": 8252 + }, + { + "epoch": 0.67, + "grad_norm": 0.9173853321475266, + "learning_rate": 5.16733749280951e-06, + "loss": 0.5303, + "step": 8253 + }, + { + "epoch": 0.67, + "grad_norm": 0.8971832660446327, + "learning_rate": 5.165033000570825e-06, + "loss": 0.5138, + "step": 8254 + }, + { + "epoch": 0.67, + "grad_norm": 0.9360969938703607, + "learning_rate": 5.162728843391067e-06, + "loss": 0.532, + "step": 8255 + }, + { + "epoch": 0.67, + "grad_norm": 0.8989978474119811, + "learning_rate": 5.160425021429904e-06, + "loss": 0.4812, + "step": 8256 + }, + { + "epoch": 0.67, + "grad_norm": 1.0203611815836584, + "learning_rate": 5.158121534847e-06, + "loss": 0.4886, + "step": 8257 + }, + { + "epoch": 0.67, + "grad_norm": 0.9387756661800578, + "learning_rate": 5.155818383801976e-06, + "loss": 0.4807, + "step": 8258 + }, + { + "epoch": 0.67, + "grad_norm": 0.9140901137656047, + "learning_rate": 5.153515568454441e-06, + "loss": 0.4608, + "step": 8259 + }, + { + "epoch": 0.67, + "grad_norm": 0.8470184521435462, + "learning_rate": 5.1512130889639785e-06, + "loss": 0.4343, + "step": 8260 + }, + { + "epoch": 0.67, + "grad_norm": 0.9058316582478934, + "learning_rate": 5.148910945490152e-06, + "loss": 0.5013, + "step": 8261 + }, + { + "epoch": 0.67, + "grad_norm": 0.9659521040360318, + "learning_rate": 5.1466091381924864e-06, + "loss": 0.5635, + "step": 8262 + }, + { + "epoch": 0.67, + "grad_norm": 0.8385312611403691, + "learning_rate": 5.144307667230511e-06, + "loss": 0.4924, + "step": 8263 + }, + { + "epoch": 0.67, + "grad_norm": 0.8757450526978969, + "learning_rate": 5.142006532763698e-06, + "loss": 0.4793, + "step": 8264 + }, + { + "epoch": 0.67, + "grad_norm": 0.9538110402970061, + "learning_rate": 5.139705734951532e-06, + "loss": 0.5155, + "step": 8265 + }, + { + "epoch": 0.67, + "grad_norm": 0.9180442618190727, + "learning_rate": 5.137405273953443e-06, + "loss": 0.4495, + "step": 8266 + }, + { + "epoch": 0.67, + "grad_norm": 0.9031867790761441, + "learning_rate": 5.1351051499288565e-06, + "loss": 0.462, + "step": 8267 + }, + { + "epoch": 0.67, + "grad_norm": 0.8721581631393269, + "learning_rate": 5.1328053630371656e-06, + "loss": 0.4448, + "step": 8268 + }, + { + "epoch": 0.67, + "grad_norm": 0.9379064954460689, + "learning_rate": 5.130505913437751e-06, + "loss": 0.4893, + "step": 8269 + }, + { + "epoch": 0.67, + "grad_norm": 0.914567607388558, + "learning_rate": 5.12820680128995e-06, + "loss": 0.4544, + "step": 8270 + }, + { + "epoch": 0.67, + "grad_norm": 0.9786738451139384, + "learning_rate": 5.125908026753105e-06, + "loss": 0.5393, + "step": 8271 + }, + { + "epoch": 0.67, + "grad_norm": 0.9384087179971159, + "learning_rate": 5.123609589986505e-06, + "loss": 0.5539, + "step": 8272 + }, + { + "epoch": 0.67, + "grad_norm": 0.874501938753754, + "learning_rate": 5.121311491149437e-06, + "loss": 0.4775, + "step": 8273 + }, + { + "epoch": 0.67, + "grad_norm": 0.9130947014954597, + "learning_rate": 5.119013730401152e-06, + "loss": 0.4939, + "step": 8274 + }, + { + "epoch": 0.67, + "grad_norm": 0.8959183379839931, + "learning_rate": 5.116716307900893e-06, + "loss": 0.4896, + "step": 8275 + }, + { + "epoch": 0.67, + "grad_norm": 0.8765212972837662, + "learning_rate": 5.114419223807854e-06, + "loss": 0.4999, + "step": 8276 + }, + { + "epoch": 0.67, + "grad_norm": 0.9158320029828727, + "learning_rate": 5.112122478281236e-06, + "loss": 0.4995, + "step": 8277 + }, + { + "epoch": 0.67, + "grad_norm": 0.9464418970680727, + "learning_rate": 5.109826071480191e-06, + "loss": 0.4647, + "step": 8278 + }, + { + "epoch": 0.67, + "grad_norm": 0.9579586965573048, + "learning_rate": 5.107530003563862e-06, + "loss": 0.4866, + "step": 8279 + }, + { + "epoch": 0.67, + "grad_norm": 0.8705674475250073, + "learning_rate": 5.105234274691364e-06, + "loss": 0.4887, + "step": 8280 + }, + { + "epoch": 0.67, + "grad_norm": 0.8972048362169255, + "learning_rate": 5.1029388850217935e-06, + "loss": 0.4719, + "step": 8281 + }, + { + "epoch": 0.67, + "grad_norm": 0.9114081984949848, + "learning_rate": 5.100643834714206e-06, + "loss": 0.461, + "step": 8282 + }, + { + "epoch": 0.67, + "grad_norm": 1.0612571737985634, + "learning_rate": 5.098349123927664e-06, + "loss": 0.5402, + "step": 8283 + }, + { + "epoch": 0.67, + "grad_norm": 0.9419682913709696, + "learning_rate": 5.096054752821174e-06, + "loss": 0.4755, + "step": 8284 + }, + { + "epoch": 0.67, + "grad_norm": 0.9728192014206865, + "learning_rate": 5.093760721553742e-06, + "loss": 0.4933, + "step": 8285 + }, + { + "epoch": 0.67, + "grad_norm": 0.9092408942177812, + "learning_rate": 5.091467030284339e-06, + "loss": 0.5053, + "step": 8286 + }, + { + "epoch": 0.67, + "grad_norm": 0.9806559724074531, + "learning_rate": 5.089173679171922e-06, + "loss": 0.5024, + "step": 8287 + }, + { + "epoch": 0.67, + "grad_norm": 0.8201955970089637, + "learning_rate": 5.086880668375404e-06, + "loss": 0.44, + "step": 8288 + }, + { + "epoch": 0.67, + "grad_norm": 0.8804686334163445, + "learning_rate": 5.084587998053706e-06, + "loss": 0.4512, + "step": 8289 + }, + { + "epoch": 0.67, + "grad_norm": 0.9055907430616575, + "learning_rate": 5.082295668365695e-06, + "loss": 0.5823, + "step": 8290 + }, + { + "epoch": 0.67, + "grad_norm": 0.9471393593487665, + "learning_rate": 5.080003679470234e-06, + "loss": 0.5007, + "step": 8291 + }, + { + "epoch": 0.67, + "grad_norm": 0.9592584808386907, + "learning_rate": 5.077712031526153e-06, + "loss": 0.4993, + "step": 8292 + }, + { + "epoch": 0.67, + "grad_norm": 0.969919058632812, + "learning_rate": 5.075420724692266e-06, + "loss": 0.5404, + "step": 8293 + }, + { + "epoch": 0.67, + "grad_norm": 0.9528343829077746, + "learning_rate": 5.073129759127346e-06, + "loss": 0.4887, + "step": 8294 + }, + { + "epoch": 0.67, + "grad_norm": 0.9123961547566919, + "learning_rate": 5.070839134990173e-06, + "loss": 0.5215, + "step": 8295 + }, + { + "epoch": 0.67, + "grad_norm": 0.9159772154630276, + "learning_rate": 5.0685488524394725e-06, + "loss": 0.5017, + "step": 8296 + }, + { + "epoch": 0.67, + "grad_norm": 1.0016846413150464, + "learning_rate": 5.06625891163396e-06, + "loss": 0.5566, + "step": 8297 + }, + { + "epoch": 0.67, + "grad_norm": 0.8644745935064114, + "learning_rate": 5.063969312732331e-06, + "loss": 0.484, + "step": 8298 + }, + { + "epoch": 0.67, + "grad_norm": 0.9849624018056626, + "learning_rate": 5.0616800558932525e-06, + "loss": 0.5766, + "step": 8299 + }, + { + "epoch": 0.67, + "grad_norm": 0.740789537902068, + "learning_rate": 5.059391141275358e-06, + "loss": 0.4458, + "step": 8300 + }, + { + "epoch": 0.67, + "grad_norm": 0.9474555248146016, + "learning_rate": 5.057102569037284e-06, + "loss": 0.4613, + "step": 8301 + }, + { + "epoch": 0.67, + "grad_norm": 0.9662314646834963, + "learning_rate": 5.054814339337613e-06, + "loss": 0.5123, + "step": 8302 + }, + { + "epoch": 0.67, + "grad_norm": 0.8568849751085164, + "learning_rate": 5.052526452334923e-06, + "loss": 0.4924, + "step": 8303 + }, + { + "epoch": 0.67, + "grad_norm": 0.8870863599394765, + "learning_rate": 5.050238908187759e-06, + "loss": 0.5371, + "step": 8304 + }, + { + "epoch": 0.67, + "grad_norm": 0.9210411495904444, + "learning_rate": 5.047951707054655e-06, + "loss": 0.5217, + "step": 8305 + }, + { + "epoch": 0.68, + "grad_norm": 0.8181478158838038, + "learning_rate": 5.0456648490940966e-06, + "loss": 0.4833, + "step": 8306 + }, + { + "epoch": 0.68, + "grad_norm": 1.0159089504437258, + "learning_rate": 5.043378334464576e-06, + "loss": 0.5152, + "step": 8307 + }, + { + "epoch": 0.68, + "grad_norm": 0.9093263342954216, + "learning_rate": 5.041092163324537e-06, + "loss": 0.4997, + "step": 8308 + }, + { + "epoch": 0.68, + "grad_norm": 0.7751798119901546, + "learning_rate": 5.038806335832414e-06, + "loss": 0.4602, + "step": 8309 + }, + { + "epoch": 0.68, + "grad_norm": 0.94501594818902, + "learning_rate": 5.036520852146609e-06, + "loss": 0.481, + "step": 8310 + }, + { + "epoch": 0.68, + "grad_norm": 0.8825734086585589, + "learning_rate": 5.034235712425508e-06, + "loss": 0.4882, + "step": 8311 + }, + { + "epoch": 0.68, + "grad_norm": 0.9411659546887424, + "learning_rate": 5.031950916827467e-06, + "loss": 0.5628, + "step": 8312 + }, + { + "epoch": 0.68, + "grad_norm": 0.7861532836500162, + "learning_rate": 5.029666465510825e-06, + "loss": 0.4508, + "step": 8313 + }, + { + "epoch": 0.68, + "grad_norm": 0.8417521207534142, + "learning_rate": 5.027382358633884e-06, + "loss": 0.4638, + "step": 8314 + }, + { + "epoch": 0.68, + "grad_norm": 0.9312435206636756, + "learning_rate": 5.0250985963549356e-06, + "loss": 0.5199, + "step": 8315 + }, + { + "epoch": 0.68, + "grad_norm": 0.8892045252922159, + "learning_rate": 5.0228151788322414e-06, + "loss": 0.454, + "step": 8316 + }, + { + "epoch": 0.68, + "grad_norm": 0.8297226180785294, + "learning_rate": 5.020532106224041e-06, + "loss": 0.4403, + "step": 8317 + }, + { + "epoch": 0.68, + "grad_norm": 0.8955990912624906, + "learning_rate": 5.018249378688547e-06, + "loss": 0.4802, + "step": 8318 + }, + { + "epoch": 0.68, + "grad_norm": 0.8838311608237159, + "learning_rate": 5.0159669963839575e-06, + "loss": 0.4684, + "step": 8319 + }, + { + "epoch": 0.68, + "grad_norm": 0.9126526422811626, + "learning_rate": 5.01368495946843e-06, + "loss": 0.4477, + "step": 8320 + }, + { + "epoch": 0.68, + "grad_norm": 0.9647439971830661, + "learning_rate": 5.011403268100112e-06, + "loss": 0.5284, + "step": 8321 + }, + { + "epoch": 0.68, + "grad_norm": 0.9468468051520715, + "learning_rate": 5.009121922437124e-06, + "loss": 0.492, + "step": 8322 + }, + { + "epoch": 0.68, + "grad_norm": 0.9635052588870198, + "learning_rate": 5.006840922637559e-06, + "loss": 0.5155, + "step": 8323 + }, + { + "epoch": 0.68, + "grad_norm": 0.858159908275672, + "learning_rate": 5.004560268859488e-06, + "loss": 0.4753, + "step": 8324 + }, + { + "epoch": 0.68, + "grad_norm": 0.8783926198443746, + "learning_rate": 5.002279961260965e-06, + "loss": 0.463, + "step": 8325 + }, + { + "epoch": 0.68, + "grad_norm": 0.9444190542938617, + "learning_rate": 5.000000000000003e-06, + "loss": 0.5454, + "step": 8326 + }, + { + "epoch": 0.68, + "grad_norm": 0.8049607873816147, + "learning_rate": 4.9977203852346054e-06, + "loss": 0.4467, + "step": 8327 + }, + { + "epoch": 0.68, + "grad_norm": 0.8703985735991734, + "learning_rate": 4.995441117122749e-06, + "loss": 0.4628, + "step": 8328 + }, + { + "epoch": 0.68, + "grad_norm": 0.8775384023172903, + "learning_rate": 4.993162195822383e-06, + "loss": 0.5367, + "step": 8329 + }, + { + "epoch": 0.68, + "grad_norm": 0.9282200217981799, + "learning_rate": 4.990883621491437e-06, + "loss": 0.5325, + "step": 8330 + }, + { + "epoch": 0.68, + "grad_norm": 0.9705331260749077, + "learning_rate": 4.988605394287813e-06, + "loss": 0.5236, + "step": 8331 + }, + { + "epoch": 0.68, + "grad_norm": 0.8734560259788998, + "learning_rate": 4.986327514369393e-06, + "loss": 0.5211, + "step": 8332 + }, + { + "epoch": 0.68, + "grad_norm": 0.8882083363563135, + "learning_rate": 4.9840499818940255e-06, + "loss": 0.478, + "step": 8333 + }, + { + "epoch": 0.68, + "grad_norm": 0.9391766553099113, + "learning_rate": 4.981772797019546e-06, + "loss": 0.4778, + "step": 8334 + }, + { + "epoch": 0.68, + "grad_norm": 0.8743280481822453, + "learning_rate": 4.979495959903759e-06, + "loss": 0.5005, + "step": 8335 + }, + { + "epoch": 0.68, + "grad_norm": 1.0280600118997267, + "learning_rate": 4.977219470704451e-06, + "loss": 0.5358, + "step": 8336 + }, + { + "epoch": 0.68, + "grad_norm": 0.8990860486061055, + "learning_rate": 4.974943329579377e-06, + "loss": 0.4788, + "step": 8337 + }, + { + "epoch": 0.68, + "grad_norm": 0.8991675776632321, + "learning_rate": 4.972667536686276e-06, + "loss": 0.4737, + "step": 8338 + }, + { + "epoch": 0.68, + "grad_norm": 1.0160340949846591, + "learning_rate": 4.970392092182853e-06, + "loss": 0.5754, + "step": 8339 + }, + { + "epoch": 0.68, + "grad_norm": 0.8822072783767629, + "learning_rate": 4.9681169962267975e-06, + "loss": 0.4731, + "step": 8340 + }, + { + "epoch": 0.68, + "grad_norm": 0.8928340994522213, + "learning_rate": 4.96584224897577e-06, + "loss": 0.5183, + "step": 8341 + }, + { + "epoch": 0.68, + "grad_norm": 0.8628952774436416, + "learning_rate": 4.963567850587408e-06, + "loss": 0.483, + "step": 8342 + }, + { + "epoch": 0.68, + "grad_norm": 0.894242138765183, + "learning_rate": 4.961293801219328e-06, + "loss": 0.5149, + "step": 8343 + }, + { + "epoch": 0.68, + "grad_norm": 0.9176497901536859, + "learning_rate": 4.959020101029122e-06, + "loss": 0.5161, + "step": 8344 + }, + { + "epoch": 0.68, + "grad_norm": 0.9679310502922771, + "learning_rate": 4.956746750174344e-06, + "loss": 0.5017, + "step": 8345 + }, + { + "epoch": 0.68, + "grad_norm": 0.9750828264808324, + "learning_rate": 4.954473748812551e-06, + "loss": 0.4805, + "step": 8346 + }, + { + "epoch": 0.68, + "grad_norm": 0.8381740441734827, + "learning_rate": 4.9522010971012465e-06, + "loss": 0.4963, + "step": 8347 + }, + { + "epoch": 0.68, + "grad_norm": 0.8706129568474391, + "learning_rate": 4.949928795197931e-06, + "loss": 0.5171, + "step": 8348 + }, + { + "epoch": 0.68, + "grad_norm": 0.8983833831441445, + "learning_rate": 4.947656843260068e-06, + "loss": 0.5004, + "step": 8349 + }, + { + "epoch": 0.68, + "grad_norm": 0.893544769279935, + "learning_rate": 4.9453852414451085e-06, + "loss": 0.454, + "step": 8350 + }, + { + "epoch": 0.68, + "grad_norm": 0.9605367288494783, + "learning_rate": 4.943113989910462e-06, + "loss": 0.5191, + "step": 8351 + }, + { + "epoch": 0.68, + "grad_norm": 0.8768862838630231, + "learning_rate": 4.940843088813537e-06, + "loss": 0.4355, + "step": 8352 + }, + { + "epoch": 0.68, + "grad_norm": 0.9872960457295382, + "learning_rate": 4.938572538311696e-06, + "loss": 0.4885, + "step": 8353 + }, + { + "epoch": 0.68, + "grad_norm": 0.9488391552128462, + "learning_rate": 4.936302338562288e-06, + "loss": 0.5248, + "step": 8354 + }, + { + "epoch": 0.68, + "grad_norm": 0.8430325210263615, + "learning_rate": 4.934032489722637e-06, + "loss": 0.4987, + "step": 8355 + }, + { + "epoch": 0.68, + "grad_norm": 0.9887525415033993, + "learning_rate": 4.931762991950043e-06, + "loss": 0.5418, + "step": 8356 + }, + { + "epoch": 0.68, + "grad_norm": 0.8868405672461215, + "learning_rate": 4.929493845401772e-06, + "loss": 0.512, + "step": 8357 + }, + { + "epoch": 0.68, + "grad_norm": 0.9305561141732096, + "learning_rate": 4.927225050235087e-06, + "loss": 0.4856, + "step": 8358 + }, + { + "epoch": 0.68, + "grad_norm": 0.9093606578240055, + "learning_rate": 4.924956606607203e-06, + "loss": 0.4728, + "step": 8359 + }, + { + "epoch": 0.68, + "grad_norm": 0.9255114452581292, + "learning_rate": 4.922688514675325e-06, + "loss": 0.5543, + "step": 8360 + }, + { + "epoch": 0.68, + "grad_norm": 0.9108840803155676, + "learning_rate": 4.9204207745966285e-06, + "loss": 0.4961, + "step": 8361 + }, + { + "epoch": 0.68, + "grad_norm": 0.9470093265948771, + "learning_rate": 4.918153386528271e-06, + "loss": 0.5079, + "step": 8362 + }, + { + "epoch": 0.68, + "grad_norm": 0.9337221774130737, + "learning_rate": 4.915886350627368e-06, + "loss": 0.5819, + "step": 8363 + }, + { + "epoch": 0.68, + "grad_norm": 0.8493640279177445, + "learning_rate": 4.91361966705104e-06, + "loss": 0.4399, + "step": 8364 + }, + { + "epoch": 0.68, + "grad_norm": 0.896803616770482, + "learning_rate": 4.911353335956353e-06, + "loss": 0.4911, + "step": 8365 + }, + { + "epoch": 0.68, + "grad_norm": 0.9681357654854811, + "learning_rate": 4.909087357500366e-06, + "loss": 0.5298, + "step": 8366 + }, + { + "epoch": 0.68, + "grad_norm": 0.9894347696243155, + "learning_rate": 4.906821731840109e-06, + "loss": 0.5156, + "step": 8367 + }, + { + "epoch": 0.68, + "grad_norm": 1.0480262385449974, + "learning_rate": 4.904556459132593e-06, + "loss": 0.5048, + "step": 8368 + }, + { + "epoch": 0.68, + "grad_norm": 0.9904266056570253, + "learning_rate": 4.902291539534787e-06, + "loss": 0.5019, + "step": 8369 + }, + { + "epoch": 0.68, + "grad_norm": 0.8826399168957701, + "learning_rate": 4.900026973203663e-06, + "loss": 0.4766, + "step": 8370 + }, + { + "epoch": 0.68, + "grad_norm": 0.9321355444330982, + "learning_rate": 4.897762760296143e-06, + "loss": 0.4984, + "step": 8371 + }, + { + "epoch": 0.68, + "grad_norm": 0.9711002397447057, + "learning_rate": 4.895498900969138e-06, + "loss": 0.5301, + "step": 8372 + }, + { + "epoch": 0.68, + "grad_norm": 0.8176926757979365, + "learning_rate": 4.893235395379531e-06, + "loss": 0.4271, + "step": 8373 + }, + { + "epoch": 0.68, + "grad_norm": 0.9430190210323978, + "learning_rate": 4.890972243684185e-06, + "loss": 0.4597, + "step": 8374 + }, + { + "epoch": 0.68, + "grad_norm": 0.8833110729017636, + "learning_rate": 4.888709446039923e-06, + "loss": 0.4718, + "step": 8375 + }, + { + "epoch": 0.68, + "grad_norm": 0.93005358908339, + "learning_rate": 4.8864470026035715e-06, + "loss": 0.5218, + "step": 8376 + }, + { + "epoch": 0.68, + "grad_norm": 0.8287871369359837, + "learning_rate": 4.8841849135319015e-06, + "loss": 0.4328, + "step": 8377 + }, + { + "epoch": 0.68, + "grad_norm": 0.9779055683662576, + "learning_rate": 4.881923178981681e-06, + "loss": 0.4185, + "step": 8378 + }, + { + "epoch": 0.68, + "grad_norm": 1.0164807398807587, + "learning_rate": 4.879661799109644e-06, + "loss": 0.4969, + "step": 8379 + }, + { + "epoch": 0.68, + "grad_norm": 0.9724315338703285, + "learning_rate": 4.877400774072506e-06, + "loss": 0.5074, + "step": 8380 + }, + { + "epoch": 0.68, + "grad_norm": 1.039312544558339, + "learning_rate": 4.875140104026943e-06, + "loss": 0.5116, + "step": 8381 + }, + { + "epoch": 0.68, + "grad_norm": 0.9431211589333425, + "learning_rate": 4.872879789129632e-06, + "loss": 0.5218, + "step": 8382 + }, + { + "epoch": 0.68, + "grad_norm": 0.8790106929243705, + "learning_rate": 4.870619829537201e-06, + "loss": 0.4876, + "step": 8383 + }, + { + "epoch": 0.68, + "grad_norm": 1.0114065824703244, + "learning_rate": 4.8683602254062665e-06, + "loss": 0.5155, + "step": 8384 + }, + { + "epoch": 0.68, + "grad_norm": 0.8819030896745794, + "learning_rate": 4.866100976893416e-06, + "loss": 0.5177, + "step": 8385 + }, + { + "epoch": 0.68, + "grad_norm": 0.9055194541725219, + "learning_rate": 4.863842084155217e-06, + "loss": 0.4772, + "step": 8386 + }, + { + "epoch": 0.68, + "grad_norm": 0.8911414355791749, + "learning_rate": 4.8615835473482e-06, + "loss": 0.5097, + "step": 8387 + }, + { + "epoch": 0.68, + "grad_norm": 0.8504547122828845, + "learning_rate": 4.859325366628892e-06, + "loss": 0.4843, + "step": 8388 + }, + { + "epoch": 0.68, + "grad_norm": 0.8963887036025582, + "learning_rate": 4.8570675421537685e-06, + "loss": 0.5021, + "step": 8389 + }, + { + "epoch": 0.68, + "grad_norm": 0.9273190455880552, + "learning_rate": 4.854810074079311e-06, + "loss": 0.5287, + "step": 8390 + }, + { + "epoch": 0.68, + "grad_norm": 0.8755536442112973, + "learning_rate": 4.852552962561946e-06, + "loss": 0.5171, + "step": 8391 + }, + { + "epoch": 0.68, + "grad_norm": 0.8631816572604099, + "learning_rate": 4.8502962077580965e-06, + "loss": 0.4588, + "step": 8392 + }, + { + "epoch": 0.68, + "grad_norm": 0.7915483046368798, + "learning_rate": 4.848039809824151e-06, + "loss": 0.4609, + "step": 8393 + }, + { + "epoch": 0.68, + "grad_norm": 0.8926376451506014, + "learning_rate": 4.845783768916482e-06, + "loss": 0.5605, + "step": 8394 + }, + { + "epoch": 0.68, + "grad_norm": 0.8692178271144266, + "learning_rate": 4.843528085191418e-06, + "loss": 0.4615, + "step": 8395 + }, + { + "epoch": 0.68, + "grad_norm": 0.8715520588557414, + "learning_rate": 4.841272758805291e-06, + "loss": 0.4289, + "step": 8396 + }, + { + "epoch": 0.68, + "grad_norm": 0.8749040524844592, + "learning_rate": 4.839017789914382e-06, + "loss": 0.4879, + "step": 8397 + }, + { + "epoch": 0.68, + "grad_norm": 0.8892874514883294, + "learning_rate": 4.836763178674963e-06, + "loss": 0.5299, + "step": 8398 + }, + { + "epoch": 0.68, + "grad_norm": 0.9678418266925191, + "learning_rate": 4.8345089252432765e-06, + "loss": 0.4949, + "step": 8399 + }, + { + "epoch": 0.68, + "grad_norm": 1.0388938109902268, + "learning_rate": 4.832255029775542e-06, + "loss": 0.5974, + "step": 8400 + }, + { + "epoch": 0.68, + "grad_norm": 0.883887908245481, + "learning_rate": 4.830001492427943e-06, + "loss": 0.4758, + "step": 8401 + }, + { + "epoch": 0.68, + "grad_norm": 0.8568929953037968, + "learning_rate": 4.827748313356664e-06, + "loss": 0.4778, + "step": 8402 + }, + { + "epoch": 0.68, + "grad_norm": 0.84966750040475, + "learning_rate": 4.825495492717833e-06, + "loss": 0.433, + "step": 8403 + }, + { + "epoch": 0.68, + "grad_norm": 0.8802543271044627, + "learning_rate": 4.823243030667576e-06, + "loss": 0.5092, + "step": 8404 + }, + { + "epoch": 0.68, + "grad_norm": 0.8394079105908062, + "learning_rate": 4.8209909273619845e-06, + "loss": 0.448, + "step": 8405 + }, + { + "epoch": 0.68, + "grad_norm": 0.9688607104012111, + "learning_rate": 4.818739182957131e-06, + "loss": 0.5156, + "step": 8406 + }, + { + "epoch": 0.68, + "grad_norm": 0.9174760388846913, + "learning_rate": 4.816487797609051e-06, + "loss": 0.5143, + "step": 8407 + }, + { + "epoch": 0.68, + "grad_norm": 0.8252282392609436, + "learning_rate": 4.814236771473774e-06, + "loss": 0.4832, + "step": 8408 + }, + { + "epoch": 0.68, + "grad_norm": 0.9581235332847905, + "learning_rate": 4.811986104707288e-06, + "loss": 0.4909, + "step": 8409 + }, + { + "epoch": 0.68, + "grad_norm": 0.9153656263449683, + "learning_rate": 4.8097357974655615e-06, + "loss": 0.5051, + "step": 8410 + }, + { + "epoch": 0.68, + "grad_norm": 0.8799875134221885, + "learning_rate": 4.8074858499045405e-06, + "loss": 0.4633, + "step": 8411 + }, + { + "epoch": 0.68, + "grad_norm": 0.9366573540901902, + "learning_rate": 4.8052362621801484e-06, + "loss": 0.5065, + "step": 8412 + }, + { + "epoch": 0.68, + "grad_norm": 0.8184587541847153, + "learning_rate": 4.802987034448267e-06, + "loss": 0.4333, + "step": 8413 + }, + { + "epoch": 0.68, + "grad_norm": 0.8854510262214775, + "learning_rate": 4.800738166864784e-06, + "loss": 0.5147, + "step": 8414 + }, + { + "epoch": 0.68, + "grad_norm": 0.8934024712409966, + "learning_rate": 4.798489659585529e-06, + "loss": 0.4496, + "step": 8415 + }, + { + "epoch": 0.68, + "grad_norm": 0.8190161995760137, + "learning_rate": 4.7962415127663265e-06, + "loss": 0.4901, + "step": 8416 + }, + { + "epoch": 0.68, + "grad_norm": 0.9948716753673862, + "learning_rate": 4.7939937265629725e-06, + "loss": 0.5582, + "step": 8417 + }, + { + "epoch": 0.68, + "grad_norm": 0.9359470085761421, + "learning_rate": 4.79174630113124e-06, + "loss": 0.5093, + "step": 8418 + }, + { + "epoch": 0.68, + "grad_norm": 0.8856683904774265, + "learning_rate": 4.789499236626859e-06, + "loss": 0.4876, + "step": 8419 + }, + { + "epoch": 0.68, + "grad_norm": 0.8698069429816057, + "learning_rate": 4.7872525332055685e-06, + "loss": 0.4606, + "step": 8420 + }, + { + "epoch": 0.68, + "grad_norm": 0.951276069893926, + "learning_rate": 4.7850061910230495e-06, + "loss": 0.5241, + "step": 8421 + }, + { + "epoch": 0.68, + "grad_norm": 0.9954898094083914, + "learning_rate": 4.782760210234976e-06, + "loss": 0.5543, + "step": 8422 + }, + { + "epoch": 0.68, + "grad_norm": 0.8824398322853885, + "learning_rate": 4.780514590996992e-06, + "loss": 0.4602, + "step": 8423 + }, + { + "epoch": 0.68, + "grad_norm": 0.8618284654301703, + "learning_rate": 4.778269333464721e-06, + "loss": 0.4473, + "step": 8424 + }, + { + "epoch": 0.68, + "grad_norm": 0.8492717540748159, + "learning_rate": 4.776024437793746e-06, + "loss": 0.4279, + "step": 8425 + }, + { + "epoch": 0.68, + "grad_norm": 0.9246816255673895, + "learning_rate": 4.773779904139652e-06, + "loss": 0.4564, + "step": 8426 + }, + { + "epoch": 0.68, + "grad_norm": 0.919388251924666, + "learning_rate": 4.7715357326579705e-06, + "loss": 0.4705, + "step": 8427 + }, + { + "epoch": 0.68, + "grad_norm": 0.8265756691259849, + "learning_rate": 4.769291923504226e-06, + "loss": 0.4794, + "step": 8428 + }, + { + "epoch": 0.69, + "grad_norm": 0.8666817154876048, + "learning_rate": 4.76704847683391e-06, + "loss": 0.4728, + "step": 8429 + }, + { + "epoch": 0.69, + "grad_norm": 0.9208247524740416, + "learning_rate": 4.764805392802497e-06, + "loss": 0.4782, + "step": 8430 + }, + { + "epoch": 0.69, + "grad_norm": 0.8166297428420718, + "learning_rate": 4.7625626715654205e-06, + "loss": 0.5053, + "step": 8431 + }, + { + "epoch": 0.69, + "grad_norm": 0.9197309838095528, + "learning_rate": 4.760320313278112e-06, + "loss": 0.4857, + "step": 8432 + }, + { + "epoch": 0.69, + "grad_norm": 0.8730053148337936, + "learning_rate": 4.758078318095953e-06, + "loss": 0.5088, + "step": 8433 + }, + { + "epoch": 0.69, + "grad_norm": 0.9183706154217698, + "learning_rate": 4.755836686174319e-06, + "loss": 0.4954, + "step": 8434 + }, + { + "epoch": 0.69, + "grad_norm": 0.7651626575366501, + "learning_rate": 4.753595417668551e-06, + "loss": 0.4731, + "step": 8435 + }, + { + "epoch": 0.69, + "grad_norm": 0.8658156966163371, + "learning_rate": 4.75135451273397e-06, + "loss": 0.4705, + "step": 8436 + }, + { + "epoch": 0.69, + "grad_norm": 0.8977597260123354, + "learning_rate": 4.749113971525858e-06, + "loss": 0.515, + "step": 8437 + }, + { + "epoch": 0.69, + "grad_norm": 1.0343449984371662, + "learning_rate": 4.746873794199498e-06, + "loss": 0.569, + "step": 8438 + }, + { + "epoch": 0.69, + "grad_norm": 0.8206778279220518, + "learning_rate": 4.744633980910122e-06, + "loss": 0.4751, + "step": 8439 + }, + { + "epoch": 0.69, + "grad_norm": 0.9921478178561506, + "learning_rate": 4.742394531812949e-06, + "loss": 0.5699, + "step": 8440 + }, + { + "epoch": 0.69, + "grad_norm": 0.9236598458945292, + "learning_rate": 4.74015544706317e-06, + "loss": 0.5102, + "step": 8441 + }, + { + "epoch": 0.69, + "grad_norm": 0.9029506730016744, + "learning_rate": 4.737916726815958e-06, + "loss": 0.4348, + "step": 8442 + }, + { + "epoch": 0.69, + "grad_norm": 0.8466552613302887, + "learning_rate": 4.7356783712264405e-06, + "loss": 0.503, + "step": 8443 + }, + { + "epoch": 0.69, + "grad_norm": 0.8989842391438065, + "learning_rate": 4.733440380449752e-06, + "loss": 0.4691, + "step": 8444 + }, + { + "epoch": 0.69, + "grad_norm": 0.9437254308309423, + "learning_rate": 4.731202754640969e-06, + "loss": 0.4926, + "step": 8445 + }, + { + "epoch": 0.69, + "grad_norm": 0.8748975753828981, + "learning_rate": 4.728965493955162e-06, + "loss": 0.4764, + "step": 8446 + }, + { + "epoch": 0.69, + "grad_norm": 0.8641293354033068, + "learning_rate": 4.72672859854737e-06, + "loss": 0.4868, + "step": 8447 + }, + { + "epoch": 0.69, + "grad_norm": 0.9008577718286481, + "learning_rate": 4.724492068572614e-06, + "loss": 0.5056, + "step": 8448 + }, + { + "epoch": 0.69, + "grad_norm": 0.947620089849962, + "learning_rate": 4.722255904185869e-06, + "loss": 0.5184, + "step": 8449 + }, + { + "epoch": 0.69, + "grad_norm": 0.9941821471275278, + "learning_rate": 4.720020105542117e-06, + "loss": 0.5347, + "step": 8450 + }, + { + "epoch": 0.69, + "grad_norm": 0.8432679680464656, + "learning_rate": 4.717784672796285e-06, + "loss": 0.5014, + "step": 8451 + }, + { + "epoch": 0.69, + "grad_norm": 0.8928988386363494, + "learning_rate": 4.715549606103289e-06, + "loss": 0.4781, + "step": 8452 + }, + { + "epoch": 0.69, + "grad_norm": 1.0339335317195717, + "learning_rate": 4.7133149056180185e-06, + "loss": 0.5745, + "step": 8453 + }, + { + "epoch": 0.69, + "grad_norm": 0.8003162042838572, + "learning_rate": 4.7110805714953385e-06, + "loss": 0.4694, + "step": 8454 + }, + { + "epoch": 0.69, + "grad_norm": 0.8461276414515637, + "learning_rate": 4.708846603890077e-06, + "loss": 0.4899, + "step": 8455 + }, + { + "epoch": 0.69, + "grad_norm": 0.8958768222352202, + "learning_rate": 4.70661300295706e-06, + "loss": 0.4872, + "step": 8456 + }, + { + "epoch": 0.69, + "grad_norm": 0.9505494510294067, + "learning_rate": 4.704379768851063e-06, + "loss": 0.5211, + "step": 8457 + }, + { + "epoch": 0.69, + "grad_norm": 0.8700792444455523, + "learning_rate": 4.702146901726851e-06, + "loss": 0.4902, + "step": 8458 + }, + { + "epoch": 0.69, + "grad_norm": 0.9200429978046982, + "learning_rate": 4.69991440173916e-06, + "loss": 0.5211, + "step": 8459 + }, + { + "epoch": 0.69, + "grad_norm": 0.8938147931028365, + "learning_rate": 4.697682269042698e-06, + "loss": 0.5195, + "step": 8460 + }, + { + "epoch": 0.69, + "grad_norm": 0.8550642884636459, + "learning_rate": 4.695450503792153e-06, + "loss": 0.4923, + "step": 8461 + }, + { + "epoch": 0.69, + "grad_norm": 1.0662928591320688, + "learning_rate": 4.693219106142186e-06, + "loss": 0.577, + "step": 8462 + }, + { + "epoch": 0.69, + "grad_norm": 0.8924981782282854, + "learning_rate": 4.690988076247425e-06, + "loss": 0.4816, + "step": 8463 + }, + { + "epoch": 0.69, + "grad_norm": 0.903820363407162, + "learning_rate": 4.68875741426248e-06, + "loss": 0.5177, + "step": 8464 + }, + { + "epoch": 0.69, + "grad_norm": 1.0387163445050478, + "learning_rate": 4.686527120341936e-06, + "loss": 0.5164, + "step": 8465 + }, + { + "epoch": 0.69, + "grad_norm": 0.9481820813313033, + "learning_rate": 4.68429719464035e-06, + "loss": 0.5185, + "step": 8466 + }, + { + "epoch": 0.69, + "grad_norm": 0.8594449777348103, + "learning_rate": 4.6820676373122535e-06, + "loss": 0.4656, + "step": 8467 + }, + { + "epoch": 0.69, + "grad_norm": 0.8502875164783491, + "learning_rate": 4.679838448512155e-06, + "loss": 0.4476, + "step": 8468 + }, + { + "epoch": 0.69, + "grad_norm": 0.9414262894799519, + "learning_rate": 4.677609628394529e-06, + "loss": 0.4923, + "step": 8469 + }, + { + "epoch": 0.69, + "grad_norm": 0.8068791737650673, + "learning_rate": 4.675381177113837e-06, + "loss": 0.4124, + "step": 8470 + }, + { + "epoch": 0.69, + "grad_norm": 0.909949833574802, + "learning_rate": 4.673153094824505e-06, + "loss": 0.4937, + "step": 8471 + }, + { + "epoch": 0.69, + "grad_norm": 0.9170348465696564, + "learning_rate": 4.670925381680938e-06, + "loss": 0.5086, + "step": 8472 + }, + { + "epoch": 0.69, + "grad_norm": 0.9315108992160059, + "learning_rate": 4.668698037837517e-06, + "loss": 0.5393, + "step": 8473 + }, + { + "epoch": 0.69, + "grad_norm": 0.873085812567832, + "learning_rate": 4.666471063448595e-06, + "loss": 0.5344, + "step": 8474 + }, + { + "epoch": 0.69, + "grad_norm": 1.0079423729232555, + "learning_rate": 4.664244458668496e-06, + "loss": 0.5647, + "step": 8475 + }, + { + "epoch": 0.69, + "grad_norm": 0.8207324962622179, + "learning_rate": 4.662018223651521e-06, + "loss": 0.4896, + "step": 8476 + }, + { + "epoch": 0.69, + "grad_norm": 0.9446632078134728, + "learning_rate": 4.65979235855195e-06, + "loss": 0.5198, + "step": 8477 + }, + { + "epoch": 0.69, + "grad_norm": 0.8965188809159295, + "learning_rate": 4.65756686352403e-06, + "loss": 0.467, + "step": 8478 + }, + { + "epoch": 0.69, + "grad_norm": 0.8649659172553711, + "learning_rate": 4.655341738721989e-06, + "loss": 0.4583, + "step": 8479 + }, + { + "epoch": 0.69, + "grad_norm": 0.9028168849235375, + "learning_rate": 4.653116984300024e-06, + "loss": 0.5207, + "step": 8480 + }, + { + "epoch": 0.69, + "grad_norm": 0.8951444423118171, + "learning_rate": 4.6508926004123145e-06, + "loss": 0.4682, + "step": 8481 + }, + { + "epoch": 0.69, + "grad_norm": 0.8852410562883132, + "learning_rate": 4.648668587212998e-06, + "loss": 0.4784, + "step": 8482 + }, + { + "epoch": 0.69, + "grad_norm": 0.9167657976916679, + "learning_rate": 4.646444944856202e-06, + "loss": 0.5031, + "step": 8483 + }, + { + "epoch": 0.69, + "grad_norm": 0.9260651434554181, + "learning_rate": 4.644221673496023e-06, + "loss": 0.5693, + "step": 8484 + }, + { + "epoch": 0.69, + "grad_norm": 1.0223575682469483, + "learning_rate": 4.641998773286531e-06, + "loss": 0.4936, + "step": 8485 + }, + { + "epoch": 0.69, + "grad_norm": 0.8672998551519863, + "learning_rate": 4.639776244381772e-06, + "loss": 0.491, + "step": 8486 + }, + { + "epoch": 0.69, + "grad_norm": 0.8243916096199051, + "learning_rate": 4.637554086935768e-06, + "loss": 0.49, + "step": 8487 + }, + { + "epoch": 0.69, + "grad_norm": 0.8811146797443113, + "learning_rate": 4.635332301102507e-06, + "loss": 0.4524, + "step": 8488 + }, + { + "epoch": 0.69, + "grad_norm": 1.0189452727877433, + "learning_rate": 4.633110887035957e-06, + "loss": 0.5636, + "step": 8489 + }, + { + "epoch": 0.69, + "grad_norm": 0.9832362112575355, + "learning_rate": 4.630889844890063e-06, + "loss": 0.4915, + "step": 8490 + }, + { + "epoch": 0.69, + "grad_norm": 0.9609507067698836, + "learning_rate": 4.628669174818741e-06, + "loss": 0.5044, + "step": 8491 + }, + { + "epoch": 0.69, + "grad_norm": 0.8886354571444008, + "learning_rate": 4.6264488769758795e-06, + "loss": 0.4868, + "step": 8492 + }, + { + "epoch": 0.69, + "grad_norm": 0.9127612427732726, + "learning_rate": 4.6242289515153495e-06, + "loss": 0.5326, + "step": 8493 + }, + { + "epoch": 0.69, + "grad_norm": 0.9293163321558532, + "learning_rate": 4.622009398590976e-06, + "loss": 0.4987, + "step": 8494 + }, + { + "epoch": 0.69, + "grad_norm": 0.8595017051048842, + "learning_rate": 4.619790218356589e-06, + "loss": 0.4478, + "step": 8495 + }, + { + "epoch": 0.69, + "grad_norm": 0.932634641983612, + "learning_rate": 4.617571410965964e-06, + "loss": 0.5015, + "step": 8496 + }, + { + "epoch": 0.69, + "grad_norm": 0.9429550807269602, + "learning_rate": 4.615352976572867e-06, + "loss": 0.4838, + "step": 8497 + }, + { + "epoch": 0.69, + "grad_norm": 0.9593082113581994, + "learning_rate": 4.613134915331031e-06, + "loss": 0.4767, + "step": 8498 + }, + { + "epoch": 0.69, + "grad_norm": 0.8782623572538939, + "learning_rate": 4.610917227394172e-06, + "loss": 0.4841, + "step": 8499 + }, + { + "epoch": 0.69, + "grad_norm": 0.9823091850599716, + "learning_rate": 4.608699912915961e-06, + "loss": 0.5373, + "step": 8500 + }, + { + "epoch": 0.69, + "grad_norm": 0.9240946447050321, + "learning_rate": 4.606482972050072e-06, + "loss": 0.493, + "step": 8501 + }, + { + "epoch": 0.69, + "grad_norm": 0.9995884422788283, + "learning_rate": 4.604266404950124e-06, + "loss": 0.4942, + "step": 8502 + }, + { + "epoch": 0.69, + "grad_norm": 0.9315818382316459, + "learning_rate": 4.60205021176973e-06, + "loss": 0.5339, + "step": 8503 + }, + { + "epoch": 0.69, + "grad_norm": 0.9313904947633198, + "learning_rate": 4.599834392662467e-06, + "loss": 0.5309, + "step": 8504 + }, + { + "epoch": 0.69, + "grad_norm": 0.9234364248702774, + "learning_rate": 4.5976189477818945e-06, + "loss": 0.5381, + "step": 8505 + }, + { + "epoch": 0.69, + "grad_norm": 0.9642841600311677, + "learning_rate": 4.5954038772815305e-06, + "loss": 0.5149, + "step": 8506 + }, + { + "epoch": 0.69, + "grad_norm": 0.9758857960131119, + "learning_rate": 4.5931891813148895e-06, + "loss": 0.5616, + "step": 8507 + }, + { + "epoch": 0.69, + "grad_norm": 0.9939874774025128, + "learning_rate": 4.5909748600354395e-06, + "loss": 0.5817, + "step": 8508 + }, + { + "epoch": 0.69, + "grad_norm": 0.883662600989568, + "learning_rate": 4.588760913596635e-06, + "loss": 0.4887, + "step": 8509 + }, + { + "epoch": 0.69, + "grad_norm": 1.0144586260482555, + "learning_rate": 4.586547342151898e-06, + "loss": 0.5689, + "step": 8510 + }, + { + "epoch": 0.69, + "grad_norm": 0.9213786833800294, + "learning_rate": 4.584334145854633e-06, + "loss": 0.5044, + "step": 8511 + }, + { + "epoch": 0.69, + "grad_norm": 0.8710235090027155, + "learning_rate": 4.582121324858201e-06, + "loss": 0.475, + "step": 8512 + }, + { + "epoch": 0.69, + "grad_norm": 0.9984563099572079, + "learning_rate": 4.579908879315962e-06, + "loss": 0.5219, + "step": 8513 + }, + { + "epoch": 0.69, + "grad_norm": 0.907665700346533, + "learning_rate": 4.577696809381222e-06, + "loss": 0.5097, + "step": 8514 + }, + { + "epoch": 0.69, + "grad_norm": 0.8598035995585397, + "learning_rate": 4.5754851152072935e-06, + "loss": 0.4853, + "step": 8515 + }, + { + "epoch": 0.69, + "grad_norm": 0.9093082298469464, + "learning_rate": 4.573273796947429e-06, + "loss": 0.5327, + "step": 8516 + }, + { + "epoch": 0.69, + "grad_norm": 0.8808884117504743, + "learning_rate": 4.571062854754878e-06, + "loss": 0.5258, + "step": 8517 + }, + { + "epoch": 0.69, + "grad_norm": 0.8533527701048154, + "learning_rate": 4.568852288782854e-06, + "loss": 0.4881, + "step": 8518 + }, + { + "epoch": 0.69, + "grad_norm": 0.9163197742017514, + "learning_rate": 4.566642099184555e-06, + "loss": 0.5018, + "step": 8519 + }, + { + "epoch": 0.69, + "grad_norm": 0.8215761882741016, + "learning_rate": 4.564432286113131e-06, + "loss": 0.4834, + "step": 8520 + }, + { + "epoch": 0.69, + "grad_norm": 0.7933050125474624, + "learning_rate": 4.562222849721735e-06, + "loss": 0.4094, + "step": 8521 + }, + { + "epoch": 0.69, + "grad_norm": 0.8897154668309718, + "learning_rate": 4.560013790163469e-06, + "loss": 0.5105, + "step": 8522 + }, + { + "epoch": 0.69, + "grad_norm": 0.855198442809487, + "learning_rate": 4.557805107591421e-06, + "loss": 0.455, + "step": 8523 + }, + { + "epoch": 0.69, + "grad_norm": 0.9524596667633722, + "learning_rate": 4.555596802158653e-06, + "loss": 0.5091, + "step": 8524 + }, + { + "epoch": 0.69, + "grad_norm": 0.950194938204538, + "learning_rate": 4.553388874018201e-06, + "loss": 0.5583, + "step": 8525 + }, + { + "epoch": 0.69, + "grad_norm": 0.9400851744243489, + "learning_rate": 4.551181323323062e-06, + "loss": 0.4941, + "step": 8526 + }, + { + "epoch": 0.69, + "grad_norm": 0.9146266111978486, + "learning_rate": 4.548974150226231e-06, + "loss": 0.4986, + "step": 8527 + }, + { + "epoch": 0.69, + "grad_norm": 0.9660153299413448, + "learning_rate": 4.546767354880653e-06, + "loss": 0.5393, + "step": 8528 + }, + { + "epoch": 0.69, + "grad_norm": 0.9297055371592443, + "learning_rate": 4.544560937439258e-06, + "loss": 0.4953, + "step": 8529 + }, + { + "epoch": 0.69, + "grad_norm": 0.9696637231242728, + "learning_rate": 4.542354898054953e-06, + "loss": 0.5172, + "step": 8530 + }, + { + "epoch": 0.69, + "grad_norm": 0.9218306554310379, + "learning_rate": 4.540149236880616e-06, + "loss": 0.4773, + "step": 8531 + }, + { + "epoch": 0.69, + "grad_norm": 0.9486063599967517, + "learning_rate": 4.537943954069088e-06, + "loss": 0.546, + "step": 8532 + }, + { + "epoch": 0.69, + "grad_norm": 0.9924791760385479, + "learning_rate": 4.535739049773206e-06, + "loss": 0.5536, + "step": 8533 + }, + { + "epoch": 0.69, + "grad_norm": 0.8468685421929781, + "learning_rate": 4.533534524145756e-06, + "loss": 0.4782, + "step": 8534 + }, + { + "epoch": 0.69, + "grad_norm": 0.9512118915905358, + "learning_rate": 4.531330377339516e-06, + "loss": 0.4956, + "step": 8535 + }, + { + "epoch": 0.69, + "grad_norm": 0.9716515137110536, + "learning_rate": 4.529126609507229e-06, + "loss": 0.5597, + "step": 8536 + }, + { + "epoch": 0.69, + "grad_norm": 0.8938265448169714, + "learning_rate": 4.52692322080162e-06, + "loss": 0.4546, + "step": 8537 + }, + { + "epoch": 0.69, + "grad_norm": 0.8921292523190358, + "learning_rate": 4.52472021137537e-06, + "loss": 0.4938, + "step": 8538 + }, + { + "epoch": 0.69, + "grad_norm": 0.977254514957181, + "learning_rate": 4.5225175813811585e-06, + "loss": 0.4989, + "step": 8539 + }, + { + "epoch": 0.69, + "grad_norm": 0.9901426180974761, + "learning_rate": 4.520315330971617e-06, + "loss": 0.523, + "step": 8540 + }, + { + "epoch": 0.69, + "grad_norm": 0.9762476438745803, + "learning_rate": 4.518113460299364e-06, + "loss": 0.5033, + "step": 8541 + }, + { + "epoch": 0.69, + "grad_norm": 0.9511493492501386, + "learning_rate": 4.515911969516985e-06, + "loss": 0.5201, + "step": 8542 + }, + { + "epoch": 0.69, + "grad_norm": 0.9544821754504096, + "learning_rate": 4.513710858777045e-06, + "loss": 0.5167, + "step": 8543 + }, + { + "epoch": 0.69, + "grad_norm": 1.0477136437004189, + "learning_rate": 4.5115101282320695e-06, + "loss": 0.5641, + "step": 8544 + }, + { + "epoch": 0.69, + "grad_norm": 0.9212158833631946, + "learning_rate": 4.509309778034582e-06, + "loss": 0.4567, + "step": 8545 + }, + { + "epoch": 0.69, + "grad_norm": 0.9859534975875366, + "learning_rate": 4.507109808337051e-06, + "loss": 0.5537, + "step": 8546 + }, + { + "epoch": 0.69, + "grad_norm": 0.8521980359414838, + "learning_rate": 4.504910219291941e-06, + "loss": 0.4825, + "step": 8547 + }, + { + "epoch": 0.69, + "grad_norm": 0.8726401564149688, + "learning_rate": 4.502711011051676e-06, + "loss": 0.4641, + "step": 8548 + }, + { + "epoch": 0.69, + "grad_norm": 0.8850970445322484, + "learning_rate": 4.500512183768666e-06, + "loss": 0.4365, + "step": 8549 + }, + { + "epoch": 0.69, + "grad_norm": 0.8528034962174915, + "learning_rate": 4.498313737595277e-06, + "loss": 0.4914, + "step": 8550 + }, + { + "epoch": 0.69, + "grad_norm": 0.8943513341597491, + "learning_rate": 4.4961156726838725e-06, + "loss": 0.5079, + "step": 8551 + }, + { + "epoch": 0.7, + "grad_norm": 0.8238291990363437, + "learning_rate": 4.493917989186768e-06, + "loss": 0.5233, + "step": 8552 + }, + { + "epoch": 0.7, + "grad_norm": 0.935896226406557, + "learning_rate": 4.491720687256261e-06, + "loss": 0.492, + "step": 8553 + }, + { + "epoch": 0.7, + "grad_norm": 0.8779507327285243, + "learning_rate": 4.489523767044625e-06, + "loss": 0.4805, + "step": 8554 + }, + { + "epoch": 0.7, + "grad_norm": 0.998133203070941, + "learning_rate": 4.487327228704108e-06, + "loss": 0.5005, + "step": 8555 + }, + { + "epoch": 0.7, + "grad_norm": 0.9016653665148767, + "learning_rate": 4.4851310723869176e-06, + "loss": 0.4611, + "step": 8556 + }, + { + "epoch": 0.7, + "grad_norm": 0.9136562820143127, + "learning_rate": 4.482935298245259e-06, + "loss": 0.4689, + "step": 8557 + }, + { + "epoch": 0.7, + "grad_norm": 0.990290997862627, + "learning_rate": 4.480739906431287e-06, + "loss": 0.5298, + "step": 8558 + }, + { + "epoch": 0.7, + "grad_norm": 0.9447926331667892, + "learning_rate": 4.478544897097144e-06, + "loss": 0.521, + "step": 8559 + }, + { + "epoch": 0.7, + "grad_norm": 0.8593961200741635, + "learning_rate": 4.476350270394942e-06, + "loss": 0.4391, + "step": 8560 + }, + { + "epoch": 0.7, + "grad_norm": 0.8770524066755371, + "learning_rate": 4.47415602647677e-06, + "loss": 0.5104, + "step": 8561 + }, + { + "epoch": 0.7, + "grad_norm": 0.9107096575839747, + "learning_rate": 4.471962165494678e-06, + "loss": 0.5393, + "step": 8562 + }, + { + "epoch": 0.7, + "grad_norm": 0.9230544531802135, + "learning_rate": 4.469768687600711e-06, + "loss": 0.4659, + "step": 8563 + }, + { + "epoch": 0.7, + "grad_norm": 0.9017244828244761, + "learning_rate": 4.467575592946865e-06, + "loss": 0.5099, + "step": 8564 + }, + { + "epoch": 0.7, + "grad_norm": 0.9072309195504163, + "learning_rate": 4.465382881685122e-06, + "loss": 0.4922, + "step": 8565 + }, + { + "epoch": 0.7, + "grad_norm": 0.9127834367641511, + "learning_rate": 4.463190553967438e-06, + "loss": 0.531, + "step": 8566 + }, + { + "epoch": 0.7, + "grad_norm": 0.9630119394437752, + "learning_rate": 4.46099860994574e-06, + "loss": 0.4919, + "step": 8567 + }, + { + "epoch": 0.7, + "grad_norm": 1.402515032284407, + "learning_rate": 4.458807049771918e-06, + "loss": 0.5254, + "step": 8568 + }, + { + "epoch": 0.7, + "grad_norm": 0.979726220736822, + "learning_rate": 4.4566158735978605e-06, + "loss": 0.5463, + "step": 8569 + }, + { + "epoch": 0.7, + "grad_norm": 0.9475301973882784, + "learning_rate": 4.454425081575402e-06, + "loss": 0.5353, + "step": 8570 + }, + { + "epoch": 0.7, + "grad_norm": 0.939575507723578, + "learning_rate": 4.452234673856366e-06, + "loss": 0.5089, + "step": 8571 + }, + { + "epoch": 0.7, + "grad_norm": 0.9487876723711443, + "learning_rate": 4.450044650592547e-06, + "loss": 0.5164, + "step": 8572 + }, + { + "epoch": 0.7, + "grad_norm": 0.9522260285452261, + "learning_rate": 4.447855011935714e-06, + "loss": 0.5307, + "step": 8573 + }, + { + "epoch": 0.7, + "grad_norm": 0.8633787236123219, + "learning_rate": 4.4456657580375966e-06, + "loss": 0.4582, + "step": 8574 + }, + { + "epoch": 0.7, + "grad_norm": 0.9049310398365067, + "learning_rate": 4.443476889049923e-06, + "loss": 0.5123, + "step": 8575 + }, + { + "epoch": 0.7, + "grad_norm": 0.9042445194944134, + "learning_rate": 4.441288405124368e-06, + "loss": 0.4971, + "step": 8576 + }, + { + "epoch": 0.7, + "grad_norm": 0.9061722854430694, + "learning_rate": 4.439100306412596e-06, + "loss": 0.5561, + "step": 8577 + }, + { + "epoch": 0.7, + "grad_norm": 0.9681196865332937, + "learning_rate": 4.436912593066241e-06, + "loss": 0.5445, + "step": 8578 + }, + { + "epoch": 0.7, + "grad_norm": 0.9045333714159708, + "learning_rate": 4.434725265236912e-06, + "loss": 0.4744, + "step": 8579 + }, + { + "epoch": 0.7, + "grad_norm": 0.9791358579263623, + "learning_rate": 4.4325383230761785e-06, + "loss": 0.5647, + "step": 8580 + }, + { + "epoch": 0.7, + "grad_norm": 0.9190795854611128, + "learning_rate": 4.430351766735609e-06, + "loss": 0.4574, + "step": 8581 + }, + { + "epoch": 0.7, + "grad_norm": 0.9443589825222185, + "learning_rate": 4.428165596366717e-06, + "loss": 0.4894, + "step": 8582 + }, + { + "epoch": 0.7, + "grad_norm": 1.010699088324618, + "learning_rate": 4.425979812121008e-06, + "loss": 0.5239, + "step": 8583 + }, + { + "epoch": 0.7, + "grad_norm": 1.0005524797056764, + "learning_rate": 4.423794414149953e-06, + "loss": 0.5694, + "step": 8584 + }, + { + "epoch": 0.7, + "grad_norm": 0.9028625873596521, + "learning_rate": 4.421609402605003e-06, + "loss": 0.5101, + "step": 8585 + }, + { + "epoch": 0.7, + "grad_norm": 0.9555413133270083, + "learning_rate": 4.419424777637565e-06, + "loss": 0.4983, + "step": 8586 + }, + { + "epoch": 0.7, + "grad_norm": 1.018225022093693, + "learning_rate": 4.4172405393990495e-06, + "loss": 0.4761, + "step": 8587 + }, + { + "epoch": 0.7, + "grad_norm": 0.9019231468353708, + "learning_rate": 4.415056688040807e-06, + "loss": 0.509, + "step": 8588 + }, + { + "epoch": 0.7, + "grad_norm": 0.8944092264707802, + "learning_rate": 4.412873223714184e-06, + "loss": 0.4492, + "step": 8589 + }, + { + "epoch": 0.7, + "grad_norm": 1.0205877351895996, + "learning_rate": 4.41069014657049e-06, + "loss": 0.5679, + "step": 8590 + }, + { + "epoch": 0.7, + "grad_norm": 0.9175443585321001, + "learning_rate": 4.408507456761014e-06, + "loss": 0.4681, + "step": 8591 + }, + { + "epoch": 0.7, + "grad_norm": 0.941166738333424, + "learning_rate": 4.4063251544370055e-06, + "loss": 0.5679, + "step": 8592 + }, + { + "epoch": 0.7, + "grad_norm": 0.9257517601389404, + "learning_rate": 4.404143239749709e-06, + "loss": 0.5314, + "step": 8593 + }, + { + "epoch": 0.7, + "grad_norm": 0.9041427478022955, + "learning_rate": 4.401961712850318e-06, + "loss": 0.4892, + "step": 8594 + }, + { + "epoch": 0.7, + "grad_norm": 0.8909288731891772, + "learning_rate": 4.399780573890016e-06, + "loss": 0.4575, + "step": 8595 + }, + { + "epoch": 0.7, + "grad_norm": 0.9788352851978374, + "learning_rate": 4.397599823019953e-06, + "loss": 0.4932, + "step": 8596 + }, + { + "epoch": 0.7, + "grad_norm": 0.9628780507902193, + "learning_rate": 4.395419460391256e-06, + "loss": 0.5098, + "step": 8597 + }, + { + "epoch": 0.7, + "grad_norm": 0.9626038196010682, + "learning_rate": 4.393239486155011e-06, + "loss": 0.5357, + "step": 8598 + }, + { + "epoch": 0.7, + "grad_norm": 0.9024459623483713, + "learning_rate": 4.391059900462305e-06, + "loss": 0.5328, + "step": 8599 + }, + { + "epoch": 0.7, + "grad_norm": 0.9620848702073997, + "learning_rate": 4.3888807034641686e-06, + "loss": 0.4771, + "step": 8600 + }, + { + "epoch": 0.7, + "grad_norm": 0.973843994968412, + "learning_rate": 4.386701895311622e-06, + "loss": 0.4848, + "step": 8601 + }, + { + "epoch": 0.7, + "grad_norm": 0.9457771770198199, + "learning_rate": 4.384523476155657e-06, + "loss": 0.5132, + "step": 8602 + }, + { + "epoch": 0.7, + "grad_norm": 0.929037612513199, + "learning_rate": 4.382345446147236e-06, + "loss": 0.5633, + "step": 8603 + }, + { + "epoch": 0.7, + "grad_norm": 0.8646828981519963, + "learning_rate": 4.380167805437285e-06, + "loss": 0.4819, + "step": 8604 + }, + { + "epoch": 0.7, + "grad_norm": 0.9185236496323183, + "learning_rate": 4.377990554176729e-06, + "loss": 0.499, + "step": 8605 + }, + { + "epoch": 0.7, + "grad_norm": 0.9021456393842846, + "learning_rate": 4.375813692516437e-06, + "loss": 0.493, + "step": 8606 + }, + { + "epoch": 0.7, + "grad_norm": 0.9828541683033288, + "learning_rate": 4.3736372206072666e-06, + "loss": 0.5731, + "step": 8607 + }, + { + "epoch": 0.7, + "grad_norm": 0.9002309994632519, + "learning_rate": 4.371461138600047e-06, + "loss": 0.554, + "step": 8608 + }, + { + "epoch": 0.7, + "grad_norm": 0.960763045179828, + "learning_rate": 4.369285446645578e-06, + "loss": 0.5608, + "step": 8609 + }, + { + "epoch": 0.7, + "grad_norm": 0.9058200343498363, + "learning_rate": 4.367110144894633e-06, + "loss": 0.5084, + "step": 8610 + }, + { + "epoch": 0.7, + "grad_norm": 0.9395399780793992, + "learning_rate": 4.364935233497962e-06, + "loss": 0.5184, + "step": 8611 + }, + { + "epoch": 0.7, + "grad_norm": 0.9215093750756229, + "learning_rate": 4.362760712606278e-06, + "loss": 0.4846, + "step": 8612 + }, + { + "epoch": 0.7, + "grad_norm": 0.8610037956075174, + "learning_rate": 4.360586582370275e-06, + "loss": 0.4386, + "step": 8613 + }, + { + "epoch": 0.7, + "grad_norm": 0.882746044843972, + "learning_rate": 4.35841284294062e-06, + "loss": 0.4848, + "step": 8614 + }, + { + "epoch": 0.7, + "grad_norm": 0.8953735482621343, + "learning_rate": 4.356239494467952e-06, + "loss": 0.4674, + "step": 8615 + }, + { + "epoch": 0.7, + "grad_norm": 0.943203974943108, + "learning_rate": 4.35406653710288e-06, + "loss": 0.492, + "step": 8616 + }, + { + "epoch": 0.7, + "grad_norm": 0.8983064274161381, + "learning_rate": 4.351893970995994e-06, + "loss": 0.4969, + "step": 8617 + }, + { + "epoch": 0.7, + "grad_norm": 0.9973530547501277, + "learning_rate": 4.349721796297841e-06, + "loss": 0.5303, + "step": 8618 + }, + { + "epoch": 0.7, + "grad_norm": 0.9117438767939997, + "learning_rate": 4.347550013158956e-06, + "loss": 0.4307, + "step": 8619 + }, + { + "epoch": 0.7, + "grad_norm": 1.0095913131582344, + "learning_rate": 4.345378621729842e-06, + "loss": 0.5226, + "step": 8620 + }, + { + "epoch": 0.7, + "grad_norm": 0.8285377532732942, + "learning_rate": 4.343207622160973e-06, + "loss": 0.4997, + "step": 8621 + }, + { + "epoch": 0.7, + "grad_norm": 0.9936775538811486, + "learning_rate": 4.341037014602799e-06, + "loss": 0.5532, + "step": 8622 + }, + { + "epoch": 0.7, + "grad_norm": 0.8750442052146093, + "learning_rate": 4.338866799205744e-06, + "loss": 0.4762, + "step": 8623 + }, + { + "epoch": 0.7, + "grad_norm": 1.012281747560933, + "learning_rate": 4.3366969761201935e-06, + "loss": 0.5108, + "step": 8624 + }, + { + "epoch": 0.7, + "grad_norm": 0.9792624934234849, + "learning_rate": 4.334527545496521e-06, + "loss": 0.5235, + "step": 8625 + }, + { + "epoch": 0.7, + "grad_norm": 0.9781064179748855, + "learning_rate": 4.332358507485064e-06, + "loss": 0.5202, + "step": 8626 + }, + { + "epoch": 0.7, + "grad_norm": 0.9132213170356942, + "learning_rate": 4.330189862236134e-06, + "loss": 0.5103, + "step": 8627 + }, + { + "epoch": 0.7, + "grad_norm": 0.8954379103279083, + "learning_rate": 4.328021609900018e-06, + "loss": 0.5055, + "step": 8628 + }, + { + "epoch": 0.7, + "grad_norm": 0.9164004427300606, + "learning_rate": 4.3258537506269735e-06, + "loss": 0.4858, + "step": 8629 + }, + { + "epoch": 0.7, + "grad_norm": 0.951325657558114, + "learning_rate": 4.3236862845672355e-06, + "loss": 0.5028, + "step": 8630 + }, + { + "epoch": 0.7, + "grad_norm": 0.8636461833908563, + "learning_rate": 4.3215192118709984e-06, + "loss": 0.5028, + "step": 8631 + }, + { + "epoch": 0.7, + "grad_norm": 1.1844359900170125, + "learning_rate": 4.319352532688444e-06, + "loss": 0.4569, + "step": 8632 + }, + { + "epoch": 0.7, + "grad_norm": 0.9495900125986644, + "learning_rate": 4.317186247169719e-06, + "loss": 0.5037, + "step": 8633 + }, + { + "epoch": 0.7, + "grad_norm": 0.9685745342821419, + "learning_rate": 4.315020355464947e-06, + "loss": 0.5319, + "step": 8634 + }, + { + "epoch": 0.7, + "grad_norm": 0.8690042344461989, + "learning_rate": 4.312854857724222e-06, + "loss": 0.4636, + "step": 8635 + }, + { + "epoch": 0.7, + "grad_norm": 0.9177292653684901, + "learning_rate": 4.3106897540976154e-06, + "loss": 0.5345, + "step": 8636 + }, + { + "epoch": 0.7, + "grad_norm": 0.9735025676432103, + "learning_rate": 4.308525044735158e-06, + "loss": 0.5682, + "step": 8637 + }, + { + "epoch": 0.7, + "grad_norm": 0.9660018093666137, + "learning_rate": 4.306360729786867e-06, + "loss": 0.5515, + "step": 8638 + }, + { + "epoch": 0.7, + "grad_norm": 0.9595466170331142, + "learning_rate": 4.304196809402726e-06, + "loss": 0.4858, + "step": 8639 + }, + { + "epoch": 0.7, + "grad_norm": 0.9330526134964297, + "learning_rate": 4.302033283732695e-06, + "loss": 0.4753, + "step": 8640 + }, + { + "epoch": 0.7, + "grad_norm": 0.9228600220437362, + "learning_rate": 4.299870152926703e-06, + "loss": 0.4527, + "step": 8641 + }, + { + "epoch": 0.7, + "grad_norm": 0.946059941678235, + "learning_rate": 4.297707417134653e-06, + "loss": 0.5222, + "step": 8642 + }, + { + "epoch": 0.7, + "grad_norm": 0.9401672436336672, + "learning_rate": 4.295545076506422e-06, + "loss": 0.5587, + "step": 8643 + }, + { + "epoch": 0.7, + "grad_norm": 0.9230695826194234, + "learning_rate": 4.293383131191861e-06, + "loss": 0.5095, + "step": 8644 + }, + { + "epoch": 0.7, + "grad_norm": 0.9264723424024507, + "learning_rate": 4.291221581340783e-06, + "loss": 0.5024, + "step": 8645 + }, + { + "epoch": 0.7, + "grad_norm": 0.7676869641906288, + "learning_rate": 4.2890604271029855e-06, + "loss": 0.444, + "step": 8646 + }, + { + "epoch": 0.7, + "grad_norm": 0.9593290265798053, + "learning_rate": 4.286899668628235e-06, + "loss": 0.5213, + "step": 8647 + }, + { + "epoch": 0.7, + "grad_norm": 0.9010995858897504, + "learning_rate": 4.28473930606627e-06, + "loss": 0.5134, + "step": 8648 + }, + { + "epoch": 0.7, + "grad_norm": 0.9418499445234443, + "learning_rate": 4.282579339566802e-06, + "loss": 0.509, + "step": 8649 + }, + { + "epoch": 0.7, + "grad_norm": 0.9286350494558199, + "learning_rate": 4.280419769279518e-06, + "loss": 0.5012, + "step": 8650 + }, + { + "epoch": 0.7, + "grad_norm": 0.8983920940568924, + "learning_rate": 4.278260595354067e-06, + "loss": 0.5336, + "step": 8651 + }, + { + "epoch": 0.7, + "grad_norm": 0.8294771280671885, + "learning_rate": 4.276101817940082e-06, + "loss": 0.4598, + "step": 8652 + }, + { + "epoch": 0.7, + "grad_norm": 0.8855833075909199, + "learning_rate": 4.273943437187163e-06, + "loss": 0.4714, + "step": 8653 + }, + { + "epoch": 0.7, + "grad_norm": 0.9034299559561075, + "learning_rate": 4.271785453244886e-06, + "loss": 0.4397, + "step": 8654 + }, + { + "epoch": 0.7, + "grad_norm": 0.9608987870079996, + "learning_rate": 4.269627866262794e-06, + "loss": 0.5578, + "step": 8655 + }, + { + "epoch": 0.7, + "grad_norm": 0.8949249243243168, + "learning_rate": 4.267470676390414e-06, + "loss": 0.4699, + "step": 8656 + }, + { + "epoch": 0.7, + "grad_norm": 0.8901792843107094, + "learning_rate": 4.2653138837772265e-06, + "loss": 0.5048, + "step": 8657 + }, + { + "epoch": 0.7, + "grad_norm": 0.9389581626819746, + "learning_rate": 4.2631574885727e-06, + "loss": 0.5082, + "step": 8658 + }, + { + "epoch": 0.7, + "grad_norm": 0.9186421948955802, + "learning_rate": 4.261001490926272e-06, + "loss": 0.52, + "step": 8659 + }, + { + "epoch": 0.7, + "grad_norm": 0.9401889077750639, + "learning_rate": 4.25884589098735e-06, + "loss": 0.4897, + "step": 8660 + }, + { + "epoch": 0.7, + "grad_norm": 0.9039686624889018, + "learning_rate": 4.256690688905315e-06, + "loss": 0.5232, + "step": 8661 + }, + { + "epoch": 0.7, + "grad_norm": 0.9773294716052752, + "learning_rate": 4.254535884829524e-06, + "loss": 0.5253, + "step": 8662 + }, + { + "epoch": 0.7, + "grad_norm": 0.7950138750683371, + "learning_rate": 4.252381478909293e-06, + "loss": 0.4714, + "step": 8663 + }, + { + "epoch": 0.7, + "grad_norm": 1.0925343616353147, + "learning_rate": 4.2502274712939355e-06, + "loss": 0.5992, + "step": 8664 + }, + { + "epoch": 0.7, + "grad_norm": 0.8961489935419249, + "learning_rate": 4.24807386213271e-06, + "loss": 0.4854, + "step": 8665 + }, + { + "epoch": 0.7, + "grad_norm": 0.8461885044508616, + "learning_rate": 4.245920651574864e-06, + "loss": 0.4316, + "step": 8666 + }, + { + "epoch": 0.7, + "grad_norm": 0.8998814032194048, + "learning_rate": 4.243767839769612e-06, + "loss": 0.4834, + "step": 8667 + }, + { + "epoch": 0.7, + "grad_norm": 0.8852003798729836, + "learning_rate": 4.241615426866148e-06, + "loss": 0.4867, + "step": 8668 + }, + { + "epoch": 0.7, + "grad_norm": 0.9462793357999046, + "learning_rate": 4.239463413013619e-06, + "loss": 0.5006, + "step": 8669 + }, + { + "epoch": 0.7, + "grad_norm": 0.8870614184576234, + "learning_rate": 4.237311798361175e-06, + "loss": 0.5095, + "step": 8670 + }, + { + "epoch": 0.7, + "grad_norm": 0.8429600883646559, + "learning_rate": 4.235160583057905e-06, + "loss": 0.5015, + "step": 8671 + }, + { + "epoch": 0.7, + "grad_norm": 0.948909062706813, + "learning_rate": 4.233009767252896e-06, + "loss": 0.5001, + "step": 8672 + }, + { + "epoch": 0.7, + "grad_norm": 0.9127038362884151, + "learning_rate": 4.230859351095193e-06, + "loss": 0.5077, + "step": 8673 + }, + { + "epoch": 0.7, + "grad_norm": 0.9305603826139454, + "learning_rate": 4.2287093347338245e-06, + "loss": 0.5122, + "step": 8674 + }, + { + "epoch": 0.71, + "grad_norm": 0.8877865608484397, + "learning_rate": 4.226559718317773e-06, + "loss": 0.4166, + "step": 8675 + }, + { + "epoch": 0.71, + "grad_norm": 0.9078446718113264, + "learning_rate": 4.224410501996018e-06, + "loss": 0.5177, + "step": 8676 + }, + { + "epoch": 0.71, + "grad_norm": 0.8757235975774688, + "learning_rate": 4.222261685917489e-06, + "loss": 0.4597, + "step": 8677 + }, + { + "epoch": 0.71, + "grad_norm": 0.9691724908290217, + "learning_rate": 4.220113270231101e-06, + "loss": 0.5702, + "step": 8678 + }, + { + "epoch": 0.71, + "grad_norm": 0.9633912836541412, + "learning_rate": 4.217965255085737e-06, + "loss": 0.5597, + "step": 8679 + }, + { + "epoch": 0.71, + "grad_norm": 0.8084054151127114, + "learning_rate": 4.215817640630254e-06, + "loss": 0.4092, + "step": 8680 + }, + { + "epoch": 0.71, + "grad_norm": 0.9928820511774611, + "learning_rate": 4.2136704270134725e-06, + "loss": 0.5033, + "step": 8681 + }, + { + "epoch": 0.71, + "grad_norm": 0.9771886848828312, + "learning_rate": 4.2115236143842046e-06, + "loss": 0.522, + "step": 8682 + }, + { + "epoch": 0.71, + "grad_norm": 0.9075441822519663, + "learning_rate": 4.209377202891212e-06, + "loss": 0.4654, + "step": 8683 + }, + { + "epoch": 0.71, + "grad_norm": 0.9183635386993099, + "learning_rate": 4.207231192683243e-06, + "loss": 0.5257, + "step": 8684 + }, + { + "epoch": 0.71, + "grad_norm": 0.8588744030939618, + "learning_rate": 4.205085583909014e-06, + "loss": 0.4934, + "step": 8685 + }, + { + "epoch": 0.71, + "grad_norm": 0.9081390911988166, + "learning_rate": 4.2029403767172175e-06, + "loss": 0.4586, + "step": 8686 + }, + { + "epoch": 0.71, + "grad_norm": 0.8660306671818201, + "learning_rate": 4.200795571256504e-06, + "loss": 0.4527, + "step": 8687 + }, + { + "epoch": 0.71, + "grad_norm": 0.9407829774136649, + "learning_rate": 4.19865116767552e-06, + "loss": 0.528, + "step": 8688 + }, + { + "epoch": 0.71, + "grad_norm": 0.8590314558726129, + "learning_rate": 4.196507166122862e-06, + "loss": 0.4732, + "step": 8689 + }, + { + "epoch": 0.71, + "grad_norm": 0.8731436134748567, + "learning_rate": 4.1943635667471095e-06, + "loss": 0.4915, + "step": 8690 + }, + { + "epoch": 0.71, + "grad_norm": 0.8511358601352428, + "learning_rate": 4.192220369696811e-06, + "loss": 0.4785, + "step": 8691 + }, + { + "epoch": 0.71, + "grad_norm": 1.068335180497299, + "learning_rate": 4.190077575120493e-06, + "loss": 0.5777, + "step": 8692 + }, + { + "epoch": 0.71, + "grad_norm": 1.0273366727449977, + "learning_rate": 4.187935183166641e-06, + "loss": 0.5265, + "step": 8693 + }, + { + "epoch": 0.71, + "grad_norm": 0.9019956570633716, + "learning_rate": 4.1857931939837305e-06, + "loss": 0.4879, + "step": 8694 + }, + { + "epoch": 0.71, + "grad_norm": 0.8957925039835313, + "learning_rate": 4.18365160772019e-06, + "loss": 0.4607, + "step": 8695 + }, + { + "epoch": 0.71, + "grad_norm": 0.8908531131530597, + "learning_rate": 4.1815104245244364e-06, + "loss": 0.5171, + "step": 8696 + }, + { + "epoch": 0.71, + "grad_norm": 0.9426170847860915, + "learning_rate": 4.179369644544849e-06, + "loss": 0.4745, + "step": 8697 + }, + { + "epoch": 0.71, + "grad_norm": 0.9223070695579061, + "learning_rate": 4.177229267929785e-06, + "loss": 0.5021, + "step": 8698 + }, + { + "epoch": 0.71, + "grad_norm": 1.0669590657342318, + "learning_rate": 4.17508929482756e-06, + "loss": 0.5615, + "step": 8699 + }, + { + "epoch": 0.71, + "grad_norm": 0.9170041445449919, + "learning_rate": 4.172949725386488e-06, + "loss": 0.4175, + "step": 8700 + }, + { + "epoch": 0.71, + "grad_norm": 0.8346287732255635, + "learning_rate": 4.170810559754829e-06, + "loss": 0.4755, + "step": 8701 + }, + { + "epoch": 0.71, + "grad_norm": 0.825095119982811, + "learning_rate": 4.168671798080826e-06, + "loss": 0.4639, + "step": 8702 + }, + { + "epoch": 0.71, + "grad_norm": 0.9463481324513929, + "learning_rate": 4.166533440512696e-06, + "loss": 0.4778, + "step": 8703 + }, + { + "epoch": 0.71, + "grad_norm": 1.1810244437305535, + "learning_rate": 4.164395487198628e-06, + "loss": 0.5336, + "step": 8704 + }, + { + "epoch": 0.71, + "grad_norm": 1.0187738412895564, + "learning_rate": 4.1622579382867686e-06, + "loss": 0.5458, + "step": 8705 + }, + { + "epoch": 0.71, + "grad_norm": 0.9659152585832806, + "learning_rate": 4.160120793925264e-06, + "loss": 0.5135, + "step": 8706 + }, + { + "epoch": 0.71, + "grad_norm": 0.9033350716520201, + "learning_rate": 4.157984054262205e-06, + "loss": 0.5468, + "step": 8707 + }, + { + "epoch": 0.71, + "grad_norm": 1.01327241202795, + "learning_rate": 4.155847719445669e-06, + "loss": 0.5685, + "step": 8708 + }, + { + "epoch": 0.71, + "grad_norm": 0.8969467151084107, + "learning_rate": 4.1537117896237026e-06, + "loss": 0.4953, + "step": 8709 + }, + { + "epoch": 0.71, + "grad_norm": 0.8190720523141264, + "learning_rate": 4.151576264944326e-06, + "loss": 0.5048, + "step": 8710 + }, + { + "epoch": 0.71, + "grad_norm": 0.8853197781726923, + "learning_rate": 4.14944114555552e-06, + "loss": 0.511, + "step": 8711 + }, + { + "epoch": 0.71, + "grad_norm": 0.9285788886413802, + "learning_rate": 4.14730643160526e-06, + "loss": 0.5235, + "step": 8712 + }, + { + "epoch": 0.71, + "grad_norm": 0.9257368811326977, + "learning_rate": 4.14517212324147e-06, + "loss": 0.4874, + "step": 8713 + }, + { + "epoch": 0.71, + "grad_norm": 0.881845676784708, + "learning_rate": 4.143038220612058e-06, + "loss": 0.4813, + "step": 8714 + }, + { + "epoch": 0.71, + "grad_norm": 0.9902780238114853, + "learning_rate": 4.140904723864903e-06, + "loss": 0.5445, + "step": 8715 + }, + { + "epoch": 0.71, + "grad_norm": 1.04868441240058, + "learning_rate": 4.138771633147856e-06, + "loss": 0.508, + "step": 8716 + }, + { + "epoch": 0.71, + "grad_norm": 0.8659370475684836, + "learning_rate": 4.13663894860873e-06, + "loss": 0.5388, + "step": 8717 + }, + { + "epoch": 0.71, + "grad_norm": 0.8893485951346831, + "learning_rate": 4.13450667039533e-06, + "loss": 0.4901, + "step": 8718 + }, + { + "epoch": 0.71, + "grad_norm": 0.8054311327775409, + "learning_rate": 4.132374798655413e-06, + "loss": 0.4823, + "step": 8719 + }, + { + "epoch": 0.71, + "grad_norm": 1.0250186548566913, + "learning_rate": 4.130243333536718e-06, + "loss": 0.5844, + "step": 8720 + }, + { + "epoch": 0.71, + "grad_norm": 0.9198453007877438, + "learning_rate": 4.128112275186952e-06, + "loss": 0.5061, + "step": 8721 + }, + { + "epoch": 0.71, + "grad_norm": 0.9542831606083931, + "learning_rate": 4.125981623753801e-06, + "loss": 0.5177, + "step": 8722 + }, + { + "epoch": 0.71, + "grad_norm": 0.9138929453969917, + "learning_rate": 4.1238513793849065e-06, + "loss": 0.4643, + "step": 8723 + }, + { + "epoch": 0.71, + "grad_norm": 1.008733247589399, + "learning_rate": 4.121721542227906e-06, + "loss": 0.4846, + "step": 8724 + }, + { + "epoch": 0.71, + "grad_norm": 0.9343475870206165, + "learning_rate": 4.1195921124303864e-06, + "loss": 0.5228, + "step": 8725 + }, + { + "epoch": 0.71, + "grad_norm": 0.9956146315203084, + "learning_rate": 4.117463090139916e-06, + "loss": 0.5167, + "step": 8726 + }, + { + "epoch": 0.71, + "grad_norm": 0.9457657258776898, + "learning_rate": 4.1153344755040355e-06, + "loss": 0.5152, + "step": 8727 + }, + { + "epoch": 0.71, + "grad_norm": 0.8830292824760735, + "learning_rate": 4.11320626867026e-06, + "loss": 0.4626, + "step": 8728 + }, + { + "epoch": 0.71, + "grad_norm": 0.9490427326245271, + "learning_rate": 4.111078469786062e-06, + "loss": 0.4836, + "step": 8729 + }, + { + "epoch": 0.71, + "grad_norm": 0.9092871656552273, + "learning_rate": 4.10895107899891e-06, + "loss": 0.458, + "step": 8730 + }, + { + "epoch": 0.71, + "grad_norm": 0.93005774304165, + "learning_rate": 4.106824096456217e-06, + "loss": 0.4641, + "step": 8731 + }, + { + "epoch": 0.71, + "grad_norm": 0.8860861090356404, + "learning_rate": 4.104697522305388e-06, + "loss": 0.5386, + "step": 8732 + }, + { + "epoch": 0.71, + "grad_norm": 0.846867975757357, + "learning_rate": 4.102571356693793e-06, + "loss": 0.4506, + "step": 8733 + }, + { + "epoch": 0.71, + "grad_norm": 0.9016018547200599, + "learning_rate": 4.100445599768774e-06, + "loss": 0.4755, + "step": 8734 + }, + { + "epoch": 0.71, + "grad_norm": 0.9656100627050762, + "learning_rate": 4.098320251677637e-06, + "loss": 0.5042, + "step": 8735 + }, + { + "epoch": 0.71, + "grad_norm": 0.8760587893762447, + "learning_rate": 4.096195312567677e-06, + "loss": 0.5041, + "step": 8736 + }, + { + "epoch": 0.71, + "grad_norm": 0.9051519692958128, + "learning_rate": 4.094070782586141e-06, + "loss": 0.4741, + "step": 8737 + }, + { + "epoch": 0.71, + "grad_norm": 0.9101216832063238, + "learning_rate": 4.091946661880262e-06, + "loss": 0.5247, + "step": 8738 + }, + { + "epoch": 0.71, + "grad_norm": 0.9338495810228892, + "learning_rate": 4.089822950597239e-06, + "loss": 0.4601, + "step": 8739 + }, + { + "epoch": 0.71, + "grad_norm": 0.9623678853244182, + "learning_rate": 4.087699648884248e-06, + "loss": 0.5257, + "step": 8740 + }, + { + "epoch": 0.71, + "grad_norm": 1.3681873907347593, + "learning_rate": 4.085576756888418e-06, + "loss": 0.49, + "step": 8741 + }, + { + "epoch": 0.71, + "grad_norm": 0.8947410753937106, + "learning_rate": 4.083454274756881e-06, + "loss": 0.5437, + "step": 8742 + }, + { + "epoch": 0.71, + "grad_norm": 0.8560471468894103, + "learning_rate": 4.081332202636711e-06, + "loss": 0.553, + "step": 8743 + }, + { + "epoch": 0.71, + "grad_norm": 0.8614050583496167, + "learning_rate": 4.07921054067497e-06, + "loss": 0.5101, + "step": 8744 + }, + { + "epoch": 0.71, + "grad_norm": 0.8858829368669646, + "learning_rate": 4.0770892890186854e-06, + "loss": 0.5019, + "step": 8745 + }, + { + "epoch": 0.71, + "grad_norm": 0.8358142772584893, + "learning_rate": 4.074968447814865e-06, + "loss": 0.4682, + "step": 8746 + }, + { + "epoch": 0.71, + "grad_norm": 1.008198929727442, + "learning_rate": 4.072848017210467e-06, + "loss": 0.4819, + "step": 8747 + }, + { + "epoch": 0.71, + "grad_norm": 0.831564582604135, + "learning_rate": 4.070727997352451e-06, + "loss": 0.4379, + "step": 8748 + }, + { + "epoch": 0.71, + "grad_norm": 0.9825779377624262, + "learning_rate": 4.068608388387722e-06, + "loss": 0.5053, + "step": 8749 + }, + { + "epoch": 0.71, + "grad_norm": 1.0133746817461498, + "learning_rate": 4.066489190463171e-06, + "loss": 0.4938, + "step": 8750 + }, + { + "epoch": 0.71, + "grad_norm": 1.0040275358581465, + "learning_rate": 4.0643704037256556e-06, + "loss": 0.5035, + "step": 8751 + }, + { + "epoch": 0.71, + "grad_norm": 0.8856134943574181, + "learning_rate": 4.0622520283220115e-06, + "loss": 0.491, + "step": 8752 + }, + { + "epoch": 0.71, + "grad_norm": 0.8965321999288322, + "learning_rate": 4.060134064399026e-06, + "loss": 0.4345, + "step": 8753 + }, + { + "epoch": 0.71, + "grad_norm": 0.8186774390161647, + "learning_rate": 4.05801651210349e-06, + "loss": 0.4117, + "step": 8754 + }, + { + "epoch": 0.71, + "grad_norm": 1.0365583157244873, + "learning_rate": 4.0558993715821335e-06, + "loss": 0.5259, + "step": 8755 + }, + { + "epoch": 0.71, + "grad_norm": 0.8711117721422899, + "learning_rate": 4.053782642981679e-06, + "loss": 0.4699, + "step": 8756 + }, + { + "epoch": 0.71, + "grad_norm": 0.8808213604888707, + "learning_rate": 4.0516663264488145e-06, + "loss": 0.4718, + "step": 8757 + }, + { + "epoch": 0.71, + "grad_norm": 0.9377918559791827, + "learning_rate": 4.049550422130196e-06, + "loss": 0.4345, + "step": 8758 + }, + { + "epoch": 0.71, + "grad_norm": 0.8839879672109973, + "learning_rate": 4.047434930172456e-06, + "loss": 0.5011, + "step": 8759 + }, + { + "epoch": 0.71, + "grad_norm": 0.8693425125764165, + "learning_rate": 4.045319850722198e-06, + "loss": 0.4625, + "step": 8760 + }, + { + "epoch": 0.71, + "grad_norm": 0.9119674025694322, + "learning_rate": 4.04320518392599e-06, + "loss": 0.5342, + "step": 8761 + }, + { + "epoch": 0.71, + "grad_norm": 1.7183687244172643, + "learning_rate": 4.041090929930378e-06, + "loss": 0.5476, + "step": 8762 + }, + { + "epoch": 0.71, + "grad_norm": 0.9880235645483642, + "learning_rate": 4.03897708888188e-06, + "loss": 0.5248, + "step": 8763 + }, + { + "epoch": 0.71, + "grad_norm": 0.871714760467819, + "learning_rate": 4.036863660926982e-06, + "loss": 0.4717, + "step": 8764 + }, + { + "epoch": 0.71, + "grad_norm": 0.9458534642219324, + "learning_rate": 4.0347506462121434e-06, + "loss": 0.5022, + "step": 8765 + }, + { + "epoch": 0.71, + "grad_norm": 0.893834833687743, + "learning_rate": 4.032638044883796e-06, + "loss": 0.4513, + "step": 8766 + }, + { + "epoch": 0.71, + "grad_norm": 0.9164768678852234, + "learning_rate": 4.0305258570883336e-06, + "loss": 0.4428, + "step": 8767 + }, + { + "epoch": 0.71, + "grad_norm": 0.9575711407404988, + "learning_rate": 4.028414082972141e-06, + "loss": 0.4718, + "step": 8768 + }, + { + "epoch": 0.71, + "grad_norm": 1.1116004691640229, + "learning_rate": 4.026302722681551e-06, + "loss": 0.498, + "step": 8769 + }, + { + "epoch": 0.71, + "grad_norm": 0.8936242693860171, + "learning_rate": 4.024191776362884e-06, + "loss": 0.4714, + "step": 8770 + }, + { + "epoch": 0.71, + "grad_norm": 0.9006703505518066, + "learning_rate": 4.022081244162428e-06, + "loss": 0.5088, + "step": 8771 + }, + { + "epoch": 0.71, + "grad_norm": 0.9518530865834943, + "learning_rate": 4.019971126226442e-06, + "loss": 0.5148, + "step": 8772 + }, + { + "epoch": 0.71, + "grad_norm": 0.8983615821416384, + "learning_rate": 4.017861422701144e-06, + "loss": 0.5111, + "step": 8773 + }, + { + "epoch": 0.71, + "grad_norm": 0.9018554754900465, + "learning_rate": 4.015752133732752e-06, + "loss": 0.5303, + "step": 8774 + }, + { + "epoch": 0.71, + "grad_norm": 0.9615507125371773, + "learning_rate": 4.013643259467426e-06, + "loss": 0.5036, + "step": 8775 + }, + { + "epoch": 0.71, + "grad_norm": 0.9221345131864341, + "learning_rate": 4.011534800051311e-06, + "loss": 0.509, + "step": 8776 + }, + { + "epoch": 0.71, + "grad_norm": 0.9346443887263562, + "learning_rate": 4.0094267556305236e-06, + "loss": 0.5485, + "step": 8777 + }, + { + "epoch": 0.71, + "grad_norm": 0.8919882717782461, + "learning_rate": 4.0073191263511475e-06, + "loss": 0.4993, + "step": 8778 + }, + { + "epoch": 0.71, + "grad_norm": 0.9302969938319966, + "learning_rate": 4.005211912359241e-06, + "loss": 0.4702, + "step": 8779 + }, + { + "epoch": 0.71, + "grad_norm": 0.9522390605496788, + "learning_rate": 4.003105113800835e-06, + "loss": 0.5209, + "step": 8780 + }, + { + "epoch": 0.71, + "grad_norm": 0.8053579275810232, + "learning_rate": 4.000998730821922e-06, + "loss": 0.454, + "step": 8781 + }, + { + "epoch": 0.71, + "grad_norm": 0.9899449056444444, + "learning_rate": 3.998892763568476e-06, + "loss": 0.5019, + "step": 8782 + }, + { + "epoch": 0.71, + "grad_norm": 0.9943263088056388, + "learning_rate": 3.996787212186438e-06, + "loss": 0.5298, + "step": 8783 + }, + { + "epoch": 0.71, + "grad_norm": 0.9242626375286663, + "learning_rate": 3.994682076821721e-06, + "loss": 0.501, + "step": 8784 + }, + { + "epoch": 0.71, + "grad_norm": 0.9675664947615202, + "learning_rate": 3.99257735762021e-06, + "loss": 0.515, + "step": 8785 + }, + { + "epoch": 0.71, + "grad_norm": 0.8811697102442329, + "learning_rate": 3.990473054727764e-06, + "loss": 0.4832, + "step": 8786 + }, + { + "epoch": 0.71, + "grad_norm": 0.9777855258544456, + "learning_rate": 3.988369168290199e-06, + "loss": 0.5334, + "step": 8787 + }, + { + "epoch": 0.71, + "grad_norm": 1.0989247905108661, + "learning_rate": 3.98626569845332e-06, + "loss": 0.5236, + "step": 8788 + }, + { + "epoch": 0.71, + "grad_norm": 1.0143168723303095, + "learning_rate": 3.984162645362893e-06, + "loss": 0.5851, + "step": 8789 + }, + { + "epoch": 0.71, + "grad_norm": 0.8706688945273519, + "learning_rate": 3.98206000916466e-06, + "loss": 0.5129, + "step": 8790 + }, + { + "epoch": 0.71, + "grad_norm": 1.0117501135253928, + "learning_rate": 3.97995779000433e-06, + "loss": 0.4835, + "step": 8791 + }, + { + "epoch": 0.71, + "grad_norm": 0.9361724235863483, + "learning_rate": 3.977855988027585e-06, + "loss": 0.5167, + "step": 8792 + }, + { + "epoch": 0.71, + "grad_norm": 0.9441390871206049, + "learning_rate": 3.975754603380082e-06, + "loss": 0.52, + "step": 8793 + }, + { + "epoch": 0.71, + "grad_norm": 0.9325045022867406, + "learning_rate": 3.973653636207437e-06, + "loss": 0.501, + "step": 8794 + }, + { + "epoch": 0.71, + "grad_norm": 0.8737033160786428, + "learning_rate": 3.971553086655251e-06, + "loss": 0.5012, + "step": 8795 + }, + { + "epoch": 0.71, + "grad_norm": 0.8631823070347903, + "learning_rate": 3.969452954869089e-06, + "loss": 0.5286, + "step": 8796 + }, + { + "epoch": 0.71, + "grad_norm": 0.8817665001884151, + "learning_rate": 3.967353240994487e-06, + "loss": 0.4844, + "step": 8797 + }, + { + "epoch": 0.72, + "grad_norm": 0.9297465007856393, + "learning_rate": 3.9652539451769554e-06, + "loss": 0.4972, + "step": 8798 + }, + { + "epoch": 0.72, + "grad_norm": 1.0959093161662927, + "learning_rate": 3.963155067561976e-06, + "loss": 0.4475, + "step": 8799 + }, + { + "epoch": 0.72, + "grad_norm": 0.8978766746333792, + "learning_rate": 3.961056608294992e-06, + "loss": 0.4925, + "step": 8800 + }, + { + "epoch": 0.72, + "grad_norm": 0.9327436321983399, + "learning_rate": 3.958958567521428e-06, + "loss": 0.4779, + "step": 8801 + }, + { + "epoch": 0.72, + "grad_norm": 0.923248650172493, + "learning_rate": 3.956860945386677e-06, + "loss": 0.4828, + "step": 8802 + }, + { + "epoch": 0.72, + "grad_norm": 0.9605785520965624, + "learning_rate": 3.954763742036103e-06, + "loss": 0.5256, + "step": 8803 + }, + { + "epoch": 0.72, + "grad_norm": 0.8488022308694663, + "learning_rate": 3.952666957615039e-06, + "loss": 0.4822, + "step": 8804 + }, + { + "epoch": 0.72, + "grad_norm": 1.012607013833751, + "learning_rate": 3.950570592268794e-06, + "loss": 0.4769, + "step": 8805 + }, + { + "epoch": 0.72, + "grad_norm": 0.8621331876085766, + "learning_rate": 3.948474646142638e-06, + "loss": 0.4088, + "step": 8806 + }, + { + "epoch": 0.72, + "grad_norm": 0.9353129675040686, + "learning_rate": 3.946379119381822e-06, + "loss": 0.4666, + "step": 8807 + }, + { + "epoch": 0.72, + "grad_norm": 0.928181887357087, + "learning_rate": 3.9442840121315625e-06, + "loss": 0.5957, + "step": 8808 + }, + { + "epoch": 0.72, + "grad_norm": 0.8097859720618114, + "learning_rate": 3.94218932453705e-06, + "loss": 0.3988, + "step": 8809 + }, + { + "epoch": 0.72, + "grad_norm": 0.8555817269549301, + "learning_rate": 3.940095056743444e-06, + "loss": 0.4944, + "step": 8810 + }, + { + "epoch": 0.72, + "grad_norm": 0.8688428830218954, + "learning_rate": 3.938001208895878e-06, + "loss": 0.5357, + "step": 8811 + }, + { + "epoch": 0.72, + "grad_norm": 0.8943758455626842, + "learning_rate": 3.935907781139446e-06, + "loss": 0.495, + "step": 8812 + }, + { + "epoch": 0.72, + "grad_norm": 0.9526621111614739, + "learning_rate": 3.933814773619232e-06, + "loss": 0.4879, + "step": 8813 + }, + { + "epoch": 0.72, + "grad_norm": 0.8697916155567965, + "learning_rate": 3.93172218648027e-06, + "loss": 0.474, + "step": 8814 + }, + { + "epoch": 0.72, + "grad_norm": 1.009523382502148, + "learning_rate": 3.929630019867579e-06, + "loss": 0.4285, + "step": 8815 + }, + { + "epoch": 0.72, + "grad_norm": 1.0902489236515984, + "learning_rate": 3.927538273926141e-06, + "loss": 0.4748, + "step": 8816 + }, + { + "epoch": 0.72, + "grad_norm": 0.9224848262996478, + "learning_rate": 3.92544694880092e-06, + "loss": 0.4925, + "step": 8817 + }, + { + "epoch": 0.72, + "grad_norm": 0.928690974890286, + "learning_rate": 3.923356044636829e-06, + "loss": 0.449, + "step": 8818 + }, + { + "epoch": 0.72, + "grad_norm": 1.0201833658121, + "learning_rate": 3.921265561578781e-06, + "loss": 0.548, + "step": 8819 + }, + { + "epoch": 0.72, + "grad_norm": 0.9215773732856094, + "learning_rate": 3.919175499771635e-06, + "loss": 0.4728, + "step": 8820 + }, + { + "epoch": 0.72, + "grad_norm": 0.9183799946338584, + "learning_rate": 3.917085859360234e-06, + "loss": 0.4792, + "step": 8821 + }, + { + "epoch": 0.72, + "grad_norm": 0.9020914043322761, + "learning_rate": 3.9149966404893854e-06, + "loss": 0.4599, + "step": 8822 + }, + { + "epoch": 0.72, + "grad_norm": 1.0179660059920808, + "learning_rate": 3.912907843303877e-06, + "loss": 0.5157, + "step": 8823 + }, + { + "epoch": 0.72, + "grad_norm": 0.8934013813007406, + "learning_rate": 3.910819467948448e-06, + "loss": 0.4946, + "step": 8824 + }, + { + "epoch": 0.72, + "grad_norm": 0.9137460578656958, + "learning_rate": 3.908731514567836e-06, + "loss": 0.4284, + "step": 8825 + }, + { + "epoch": 0.72, + "grad_norm": 0.8983840220601542, + "learning_rate": 3.906643983306724e-06, + "loss": 0.4622, + "step": 8826 + }, + { + "epoch": 0.72, + "grad_norm": 0.8299218289902504, + "learning_rate": 3.904556874309779e-06, + "loss": 0.4446, + "step": 8827 + }, + { + "epoch": 0.72, + "grad_norm": 0.8824793357068349, + "learning_rate": 3.902470187721636e-06, + "loss": 0.4705, + "step": 8828 + }, + { + "epoch": 0.72, + "grad_norm": 0.9958803430976835, + "learning_rate": 3.900383923686905e-06, + "loss": 0.5064, + "step": 8829 + }, + { + "epoch": 0.72, + "grad_norm": 0.908448281142226, + "learning_rate": 3.898298082350149e-06, + "loss": 0.5247, + "step": 8830 + }, + { + "epoch": 0.72, + "grad_norm": 0.9807265834771793, + "learning_rate": 3.896212663855932e-06, + "loss": 0.5566, + "step": 8831 + }, + { + "epoch": 0.72, + "grad_norm": 0.918857203642734, + "learning_rate": 3.894127668348759e-06, + "loss": 0.4687, + "step": 8832 + }, + { + "epoch": 0.72, + "grad_norm": 0.9019988377097433, + "learning_rate": 3.892043095973123e-06, + "loss": 0.4991, + "step": 8833 + }, + { + "epoch": 0.72, + "grad_norm": 0.9049263772974447, + "learning_rate": 3.889958946873482e-06, + "loss": 0.4594, + "step": 8834 + }, + { + "epoch": 0.72, + "grad_norm": 1.1347811419110758, + "learning_rate": 3.887875221194271e-06, + "loss": 0.5158, + "step": 8835 + }, + { + "epoch": 0.72, + "grad_norm": 0.827489198544077, + "learning_rate": 3.885791919079878e-06, + "loss": 0.4375, + "step": 8836 + }, + { + "epoch": 0.72, + "grad_norm": 0.8444464095083356, + "learning_rate": 3.883709040674688e-06, + "loss": 0.4174, + "step": 8837 + }, + { + "epoch": 0.72, + "grad_norm": 0.9260301120795823, + "learning_rate": 3.881626586123034e-06, + "loss": 0.4922, + "step": 8838 + }, + { + "epoch": 0.72, + "grad_norm": 0.9377917778876699, + "learning_rate": 3.8795445555692305e-06, + "loss": 0.5823, + "step": 8839 + }, + { + "epoch": 0.72, + "grad_norm": 0.9395718084838521, + "learning_rate": 3.87746294915756e-06, + "loss": 0.5049, + "step": 8840 + }, + { + "epoch": 0.72, + "grad_norm": 0.9314406826381335, + "learning_rate": 3.87538176703228e-06, + "loss": 0.4928, + "step": 8841 + }, + { + "epoch": 0.72, + "grad_norm": 0.9542956906849946, + "learning_rate": 3.873301009337604e-06, + "loss": 0.4691, + "step": 8842 + }, + { + "epoch": 0.72, + "grad_norm": 0.8688695931633011, + "learning_rate": 3.871220676217742e-06, + "loss": 0.4676, + "step": 8843 + }, + { + "epoch": 0.72, + "grad_norm": 0.986603792422116, + "learning_rate": 3.869140767816846e-06, + "loss": 0.5367, + "step": 8844 + }, + { + "epoch": 0.72, + "grad_norm": 0.9600355737891579, + "learning_rate": 3.867061284279058e-06, + "loss": 0.5112, + "step": 8845 + }, + { + "epoch": 0.72, + "grad_norm": 0.9573543631323532, + "learning_rate": 3.864982225748481e-06, + "loss": 0.5086, + "step": 8846 + }, + { + "epoch": 0.72, + "grad_norm": 0.8153846929151147, + "learning_rate": 3.862903592369199e-06, + "loss": 0.455, + "step": 8847 + }, + { + "epoch": 0.72, + "grad_norm": 0.9317872841841982, + "learning_rate": 3.860825384285247e-06, + "loss": 0.537, + "step": 8848 + }, + { + "epoch": 0.72, + "grad_norm": 0.9100160868925283, + "learning_rate": 3.858747601640658e-06, + "loss": 0.4817, + "step": 8849 + }, + { + "epoch": 0.72, + "grad_norm": 0.8958109692096803, + "learning_rate": 3.856670244579409e-06, + "loss": 0.4736, + "step": 8850 + }, + { + "epoch": 0.72, + "grad_norm": 0.8956550140399889, + "learning_rate": 3.854593313245463e-06, + "loss": 0.4674, + "step": 8851 + }, + { + "epoch": 0.72, + "grad_norm": 0.9936883431069679, + "learning_rate": 3.852516807782749e-06, + "loss": 0.5141, + "step": 8852 + }, + { + "epoch": 0.72, + "grad_norm": 0.9475637588463383, + "learning_rate": 3.850440728335171e-06, + "loss": 0.4938, + "step": 8853 + }, + { + "epoch": 0.72, + "grad_norm": 0.9297870299422121, + "learning_rate": 3.848365075046589e-06, + "loss": 0.526, + "step": 8854 + }, + { + "epoch": 0.72, + "grad_norm": 0.8497992598755991, + "learning_rate": 3.846289848060858e-06, + "loss": 0.4832, + "step": 8855 + }, + { + "epoch": 0.72, + "grad_norm": 0.9046064998492238, + "learning_rate": 3.844215047521779e-06, + "loss": 0.5173, + "step": 8856 + }, + { + "epoch": 0.72, + "grad_norm": 1.0890530082437015, + "learning_rate": 3.842140673573136e-06, + "loss": 0.5348, + "step": 8857 + }, + { + "epoch": 0.72, + "grad_norm": 1.002209227608526, + "learning_rate": 3.840066726358683e-06, + "loss": 0.4426, + "step": 8858 + }, + { + "epoch": 0.72, + "grad_norm": 0.8119903241278402, + "learning_rate": 3.837993206022146e-06, + "loss": 0.4799, + "step": 8859 + }, + { + "epoch": 0.72, + "grad_norm": 0.9668063740346428, + "learning_rate": 3.8359201127072065e-06, + "loss": 0.524, + "step": 8860 + }, + { + "epoch": 0.72, + "grad_norm": 0.8836622588003586, + "learning_rate": 3.8338474465575425e-06, + "loss": 0.5279, + "step": 8861 + }, + { + "epoch": 0.72, + "grad_norm": 0.9555312989641757, + "learning_rate": 3.831775207716778e-06, + "loss": 0.4856, + "step": 8862 + }, + { + "epoch": 0.72, + "grad_norm": 0.9065472201631909, + "learning_rate": 3.82970339632852e-06, + "loss": 0.5127, + "step": 8863 + }, + { + "epoch": 0.72, + "grad_norm": 0.8870497018587068, + "learning_rate": 3.827632012536344e-06, + "loss": 0.4403, + "step": 8864 + }, + { + "epoch": 0.72, + "grad_norm": 0.8824030645406618, + "learning_rate": 3.825561056483798e-06, + "loss": 0.5422, + "step": 8865 + }, + { + "epoch": 0.72, + "grad_norm": 0.8760984152703735, + "learning_rate": 3.823490528314387e-06, + "loss": 0.4916, + "step": 8866 + }, + { + "epoch": 0.72, + "grad_norm": 0.9023598070151023, + "learning_rate": 3.821420428171611e-06, + "loss": 0.5103, + "step": 8867 + }, + { + "epoch": 0.72, + "grad_norm": 0.9606187506813286, + "learning_rate": 3.819350756198915e-06, + "loss": 0.5248, + "step": 8868 + }, + { + "epoch": 0.72, + "grad_norm": 0.9106250445068397, + "learning_rate": 3.81728151253973e-06, + "loss": 0.3984, + "step": 8869 + }, + { + "epoch": 0.72, + "grad_norm": 0.8406825255854373, + "learning_rate": 3.815212697337451e-06, + "loss": 0.533, + "step": 8870 + }, + { + "epoch": 0.72, + "grad_norm": 0.8989819504597925, + "learning_rate": 3.8131443107354503e-06, + "loss": 0.5037, + "step": 8871 + }, + { + "epoch": 0.72, + "grad_norm": 0.7929612535636658, + "learning_rate": 3.8110763528770543e-06, + "loss": 0.3884, + "step": 8872 + }, + { + "epoch": 0.72, + "grad_norm": 0.814849985403055, + "learning_rate": 3.8090088239055843e-06, + "loss": 0.4299, + "step": 8873 + }, + { + "epoch": 0.72, + "grad_norm": 0.857194615734218, + "learning_rate": 3.8069417239643082e-06, + "loss": 0.4467, + "step": 8874 + }, + { + "epoch": 0.72, + "grad_norm": 0.8907631497574886, + "learning_rate": 3.804875053196477e-06, + "loss": 0.4932, + "step": 8875 + }, + { + "epoch": 0.72, + "grad_norm": 0.8830049019992933, + "learning_rate": 3.80280881174531e-06, + "loss": 0.486, + "step": 8876 + }, + { + "epoch": 0.72, + "grad_norm": 0.9476380674101876, + "learning_rate": 3.800742999753999e-06, + "loss": 0.4813, + "step": 8877 + }, + { + "epoch": 0.72, + "grad_norm": 0.9612638714238807, + "learning_rate": 3.7986776173656927e-06, + "loss": 0.4955, + "step": 8878 + }, + { + "epoch": 0.72, + "grad_norm": 0.9275579157713626, + "learning_rate": 3.7966126647235326e-06, + "loss": 0.4744, + "step": 8879 + }, + { + "epoch": 0.72, + "grad_norm": 0.9605038116462703, + "learning_rate": 3.79454814197061e-06, + "loss": 0.4821, + "step": 8880 + }, + { + "epoch": 0.72, + "grad_norm": 0.871832545094725, + "learning_rate": 3.792484049249996e-06, + "loss": 0.5148, + "step": 8881 + }, + { + "epoch": 0.72, + "grad_norm": 1.0045689213531521, + "learning_rate": 3.790420386704733e-06, + "loss": 0.5913, + "step": 8882 + }, + { + "epoch": 0.72, + "grad_norm": 0.8229711090759823, + "learning_rate": 3.788357154477831e-06, + "loss": 0.4665, + "step": 8883 + }, + { + "epoch": 0.72, + "grad_norm": 0.92957969362573, + "learning_rate": 3.786294352712262e-06, + "loss": 0.4311, + "step": 8884 + }, + { + "epoch": 0.72, + "grad_norm": 0.8949166464230011, + "learning_rate": 3.784231981550991e-06, + "loss": 0.4892, + "step": 8885 + }, + { + "epoch": 0.72, + "grad_norm": 0.9962607277158397, + "learning_rate": 3.782170041136922e-06, + "loss": 0.5495, + "step": 8886 + }, + { + "epoch": 0.72, + "grad_norm": 0.958883598870349, + "learning_rate": 3.7801085316129615e-06, + "loss": 0.5129, + "step": 8887 + }, + { + "epoch": 0.72, + "grad_norm": 0.9222815789807826, + "learning_rate": 3.778047453121958e-06, + "loss": 0.5116, + "step": 8888 + }, + { + "epoch": 0.72, + "grad_norm": 1.030209109733231, + "learning_rate": 3.7759868058067483e-06, + "loss": 0.5329, + "step": 8889 + }, + { + "epoch": 0.72, + "grad_norm": 1.0035102607125828, + "learning_rate": 3.773926589810133e-06, + "loss": 0.4956, + "step": 8890 + }, + { + "epoch": 0.72, + "grad_norm": 0.8946525169233979, + "learning_rate": 3.7718668052748842e-06, + "loss": 0.4211, + "step": 8891 + }, + { + "epoch": 0.72, + "grad_norm": 0.7977790198384506, + "learning_rate": 3.7698074523437355e-06, + "loss": 0.4643, + "step": 8892 + }, + { + "epoch": 0.72, + "grad_norm": 1.0180235740982717, + "learning_rate": 3.7677485311594107e-06, + "loss": 0.5146, + "step": 8893 + }, + { + "epoch": 0.72, + "grad_norm": 0.9956941872006838, + "learning_rate": 3.7656900418645826e-06, + "loss": 0.5284, + "step": 8894 + }, + { + "epoch": 0.72, + "grad_norm": 0.8680531550447771, + "learning_rate": 3.763631984601903e-06, + "loss": 0.4795, + "step": 8895 + }, + { + "epoch": 0.72, + "grad_norm": 0.8941870294776818, + "learning_rate": 3.7615743595139965e-06, + "loss": 0.4795, + "step": 8896 + }, + { + "epoch": 0.72, + "grad_norm": 0.911033857549809, + "learning_rate": 3.759517166743456e-06, + "loss": 0.5178, + "step": 8897 + }, + { + "epoch": 0.72, + "grad_norm": 0.9492113344650935, + "learning_rate": 3.7574604064328336e-06, + "loss": 0.4921, + "step": 8898 + }, + { + "epoch": 0.72, + "grad_norm": 0.877478301058727, + "learning_rate": 3.7554040787246746e-06, + "loss": 0.4545, + "step": 8899 + }, + { + "epoch": 0.72, + "grad_norm": 0.9549658963774811, + "learning_rate": 3.7533481837614717e-06, + "loss": 0.5422, + "step": 8900 + }, + { + "epoch": 0.72, + "grad_norm": 0.8911457825497752, + "learning_rate": 3.7512927216856987e-06, + "loss": 0.5134, + "step": 8901 + }, + { + "epoch": 0.72, + "grad_norm": 0.8854910809568246, + "learning_rate": 3.7492376926397966e-06, + "loss": 0.4683, + "step": 8902 + }, + { + "epoch": 0.72, + "grad_norm": 0.9407344486002542, + "learning_rate": 3.7471830967661815e-06, + "loss": 0.489, + "step": 8903 + }, + { + "epoch": 0.72, + "grad_norm": 0.8656497076194637, + "learning_rate": 3.745128934207225e-06, + "loss": 0.4724, + "step": 8904 + }, + { + "epoch": 0.72, + "grad_norm": 0.8441131056004215, + "learning_rate": 3.743075205105292e-06, + "loss": 0.4291, + "step": 8905 + }, + { + "epoch": 0.72, + "grad_norm": 0.9530959715206083, + "learning_rate": 3.7410219096026944e-06, + "loss": 0.5696, + "step": 8906 + }, + { + "epoch": 0.72, + "grad_norm": 0.9786935110086086, + "learning_rate": 3.7389690478417273e-06, + "loss": 0.4892, + "step": 8907 + }, + { + "epoch": 0.72, + "grad_norm": 0.9564410548228618, + "learning_rate": 3.7369166199646502e-06, + "loss": 0.492, + "step": 8908 + }, + { + "epoch": 0.72, + "grad_norm": 0.9658684056753355, + "learning_rate": 3.7348646261137e-06, + "loss": 0.5069, + "step": 8909 + }, + { + "epoch": 0.72, + "grad_norm": 0.8958230878728628, + "learning_rate": 3.732813066431068e-06, + "loss": 0.4948, + "step": 8910 + }, + { + "epoch": 0.72, + "grad_norm": 0.9869591315367217, + "learning_rate": 3.730761941058938e-06, + "loss": 0.5447, + "step": 8911 + }, + { + "epoch": 0.72, + "grad_norm": 0.92782933804282, + "learning_rate": 3.7287112501394406e-06, + "loss": 0.5225, + "step": 8912 + }, + { + "epoch": 0.72, + "grad_norm": 0.9217797291013208, + "learning_rate": 3.7266609938146912e-06, + "loss": 0.4856, + "step": 8913 + }, + { + "epoch": 0.72, + "grad_norm": 0.9822645901036696, + "learning_rate": 3.724611172226771e-06, + "loss": 0.457, + "step": 8914 + }, + { + "epoch": 0.72, + "grad_norm": 0.8908297362107798, + "learning_rate": 3.722561785517732e-06, + "loss": 0.4382, + "step": 8915 + }, + { + "epoch": 0.72, + "grad_norm": 1.0721806990810707, + "learning_rate": 3.7205128338295884e-06, + "loss": 0.4757, + "step": 8916 + }, + { + "epoch": 0.72, + "grad_norm": 0.8454484712366738, + "learning_rate": 3.718464317304341e-06, + "loss": 0.455, + "step": 8917 + }, + { + "epoch": 0.72, + "grad_norm": 0.8733536681919967, + "learning_rate": 3.716416236083942e-06, + "loss": 0.4972, + "step": 8918 + }, + { + "epoch": 0.72, + "grad_norm": 0.9702493580649202, + "learning_rate": 3.7143685903103242e-06, + "loss": 0.5163, + "step": 8919 + }, + { + "epoch": 0.72, + "grad_norm": 0.8858387912709786, + "learning_rate": 3.7123213801253876e-06, + "loss": 0.4982, + "step": 8920 + }, + { + "epoch": 0.73, + "grad_norm": 0.9127452804890583, + "learning_rate": 3.7102746056710025e-06, + "loss": 0.4876, + "step": 8921 + }, + { + "epoch": 0.73, + "grad_norm": 0.8992842307594613, + "learning_rate": 3.708228267089008e-06, + "loss": 0.5092, + "step": 8922 + }, + { + "epoch": 0.73, + "grad_norm": 0.8843837452095896, + "learning_rate": 3.706182364521217e-06, + "loss": 0.4668, + "step": 8923 + }, + { + "epoch": 0.73, + "grad_norm": 1.0186044909447514, + "learning_rate": 3.704136898109403e-06, + "loss": 0.5457, + "step": 8924 + }, + { + "epoch": 0.73, + "grad_norm": 0.9663204881717411, + "learning_rate": 3.7020918679953166e-06, + "loss": 0.5576, + "step": 8925 + }, + { + "epoch": 0.73, + "grad_norm": 0.9004867321863538, + "learning_rate": 3.7000472743206773e-06, + "loss": 0.4949, + "step": 8926 + }, + { + "epoch": 0.73, + "grad_norm": 0.8848182522515886, + "learning_rate": 3.698003117227175e-06, + "loss": 0.5097, + "step": 8927 + }, + { + "epoch": 0.73, + "grad_norm": 0.9150649446475405, + "learning_rate": 3.6959593968564654e-06, + "loss": 0.454, + "step": 8928 + }, + { + "epoch": 0.73, + "grad_norm": 0.8919034910686628, + "learning_rate": 3.6939161133501823e-06, + "loss": 0.5015, + "step": 8929 + }, + { + "epoch": 0.73, + "grad_norm": 0.953865934457871, + "learning_rate": 3.691873266849916e-06, + "loss": 0.5266, + "step": 8930 + }, + { + "epoch": 0.73, + "grad_norm": 0.9061766912229933, + "learning_rate": 3.6898308574972365e-06, + "loss": 0.5061, + "step": 8931 + }, + { + "epoch": 0.73, + "grad_norm": 1.1822247801890384, + "learning_rate": 3.6877888854336808e-06, + "loss": 0.5964, + "step": 8932 + }, + { + "epoch": 0.73, + "grad_norm": 0.8468554061541979, + "learning_rate": 3.6857473508007567e-06, + "loss": 0.4568, + "step": 8933 + }, + { + "epoch": 0.73, + "grad_norm": 0.9397863791139718, + "learning_rate": 3.6837062537399414e-06, + "loss": 0.5087, + "step": 8934 + }, + { + "epoch": 0.73, + "grad_norm": 0.9095373378132872, + "learning_rate": 3.6816655943926825e-06, + "loss": 0.498, + "step": 8935 + }, + { + "epoch": 0.73, + "grad_norm": 0.885010856329452, + "learning_rate": 3.6796253729003905e-06, + "loss": 0.5331, + "step": 8936 + }, + { + "epoch": 0.73, + "grad_norm": 1.0204269253203586, + "learning_rate": 3.6775855894044543e-06, + "loss": 0.5541, + "step": 8937 + }, + { + "epoch": 0.73, + "grad_norm": 0.947069376220401, + "learning_rate": 3.6755462440462288e-06, + "loss": 0.4936, + "step": 8938 + }, + { + "epoch": 0.73, + "grad_norm": 0.9047457184730575, + "learning_rate": 3.673507336967038e-06, + "loss": 0.5206, + "step": 8939 + }, + { + "epoch": 0.73, + "grad_norm": 1.045100952717416, + "learning_rate": 3.6714688683081778e-06, + "loss": 0.4995, + "step": 8940 + }, + { + "epoch": 0.73, + "grad_norm": 0.9339554233023288, + "learning_rate": 3.669430838210911e-06, + "loss": 0.4846, + "step": 8941 + }, + { + "epoch": 0.73, + "grad_norm": 0.9311600655124447, + "learning_rate": 3.6673932468164763e-06, + "loss": 0.5243, + "step": 8942 + }, + { + "epoch": 0.73, + "grad_norm": 0.9600652156787634, + "learning_rate": 3.6653560942660694e-06, + "loss": 0.5025, + "step": 8943 + }, + { + "epoch": 0.73, + "grad_norm": 0.875907664889498, + "learning_rate": 3.663319380700865e-06, + "loss": 0.4649, + "step": 8944 + }, + { + "epoch": 0.73, + "grad_norm": 0.9481821720808001, + "learning_rate": 3.661283106262008e-06, + "loss": 0.4747, + "step": 8945 + }, + { + "epoch": 0.73, + "grad_norm": 1.0168558120772304, + "learning_rate": 3.659247271090609e-06, + "loss": 0.4481, + "step": 8946 + }, + { + "epoch": 0.73, + "grad_norm": 0.9162316079435071, + "learning_rate": 3.6572118753277495e-06, + "loss": 0.4684, + "step": 8947 + }, + { + "epoch": 0.73, + "grad_norm": 0.9225979331389788, + "learning_rate": 3.655176919114485e-06, + "loss": 0.4973, + "step": 8948 + }, + { + "epoch": 0.73, + "grad_norm": 0.9014094862874904, + "learning_rate": 3.6531424025918284e-06, + "loss": 0.5053, + "step": 8949 + }, + { + "epoch": 0.73, + "grad_norm": 0.970179655664484, + "learning_rate": 3.651108325900773e-06, + "loss": 0.466, + "step": 8950 + }, + { + "epoch": 0.73, + "grad_norm": 1.032032574074345, + "learning_rate": 3.6490746891822806e-06, + "loss": 0.5263, + "step": 8951 + }, + { + "epoch": 0.73, + "grad_norm": 0.9129906619640558, + "learning_rate": 3.647041492577278e-06, + "loss": 0.5223, + "step": 8952 + }, + { + "epoch": 0.73, + "grad_norm": 0.8550106008853208, + "learning_rate": 3.645008736226664e-06, + "loss": 0.4532, + "step": 8953 + }, + { + "epoch": 0.73, + "grad_norm": 0.801384404024512, + "learning_rate": 3.6429764202713124e-06, + "loss": 0.4851, + "step": 8954 + }, + { + "epoch": 0.73, + "grad_norm": 0.9889555676924645, + "learning_rate": 3.6409445448520533e-06, + "loss": 0.5049, + "step": 8955 + }, + { + "epoch": 0.73, + "grad_norm": 0.9403531883891941, + "learning_rate": 3.6389131101096953e-06, + "loss": 0.536, + "step": 8956 + }, + { + "epoch": 0.73, + "grad_norm": 1.0502677111502992, + "learning_rate": 3.6368821161850176e-06, + "loss": 0.5614, + "step": 8957 + }, + { + "epoch": 0.73, + "grad_norm": 0.8630171779379783, + "learning_rate": 3.6348515632187643e-06, + "loss": 0.4034, + "step": 8958 + }, + { + "epoch": 0.73, + "grad_norm": 0.9047027156195153, + "learning_rate": 3.6328214513516523e-06, + "loss": 0.4744, + "step": 8959 + }, + { + "epoch": 0.73, + "grad_norm": 0.916757086523125, + "learning_rate": 3.6307917807243697e-06, + "loss": 0.4842, + "step": 8960 + }, + { + "epoch": 0.73, + "grad_norm": 0.9940495108617984, + "learning_rate": 3.6287625514775602e-06, + "loss": 0.5406, + "step": 8961 + }, + { + "epoch": 0.73, + "grad_norm": 0.9253795932773297, + "learning_rate": 3.626733763751861e-06, + "loss": 0.4978, + "step": 8962 + }, + { + "epoch": 0.73, + "grad_norm": 0.9150118745581817, + "learning_rate": 3.624705417687856e-06, + "loss": 0.4843, + "step": 8963 + }, + { + "epoch": 0.73, + "grad_norm": 0.8946597973175461, + "learning_rate": 3.6226775134261106e-06, + "loss": 0.455, + "step": 8964 + }, + { + "epoch": 0.73, + "grad_norm": 0.8485633591778271, + "learning_rate": 3.620650051107156e-06, + "loss": 0.5026, + "step": 8965 + }, + { + "epoch": 0.73, + "grad_norm": 0.8989951788879078, + "learning_rate": 3.6186230308714985e-06, + "loss": 0.4281, + "step": 8966 + }, + { + "epoch": 0.73, + "grad_norm": 0.9932172971365646, + "learning_rate": 3.6165964528595988e-06, + "loss": 0.4487, + "step": 8967 + }, + { + "epoch": 0.73, + "grad_norm": 0.9419138818036944, + "learning_rate": 3.6145703172119085e-06, + "loss": 0.5094, + "step": 8968 + }, + { + "epoch": 0.73, + "grad_norm": 1.0242264441554165, + "learning_rate": 3.6125446240688276e-06, + "loss": 0.533, + "step": 8969 + }, + { + "epoch": 0.73, + "grad_norm": 0.9880175132776277, + "learning_rate": 3.61051937357074e-06, + "loss": 0.5336, + "step": 8970 + }, + { + "epoch": 0.73, + "grad_norm": 0.9623432157150716, + "learning_rate": 3.6084945658579918e-06, + "loss": 0.5216, + "step": 8971 + }, + { + "epoch": 0.73, + "grad_norm": 0.9547697834135409, + "learning_rate": 3.606470201070904e-06, + "loss": 0.4688, + "step": 8972 + }, + { + "epoch": 0.73, + "grad_norm": 0.9583453259256163, + "learning_rate": 3.6044462793497526e-06, + "loss": 0.5612, + "step": 8973 + }, + { + "epoch": 0.73, + "grad_norm": 0.9720258126741251, + "learning_rate": 3.6024228008348096e-06, + "loss": 0.4992, + "step": 8974 + }, + { + "epoch": 0.73, + "grad_norm": 0.9474217582347544, + "learning_rate": 3.600399765666287e-06, + "loss": 0.5311, + "step": 8975 + }, + { + "epoch": 0.73, + "grad_norm": 0.9902690886560699, + "learning_rate": 3.5983771739843855e-06, + "loss": 0.5334, + "step": 8976 + }, + { + "epoch": 0.73, + "grad_norm": 0.9613087078021019, + "learning_rate": 3.596355025929267e-06, + "loss": 0.4961, + "step": 8977 + }, + { + "epoch": 0.73, + "grad_norm": 0.9757830970674216, + "learning_rate": 3.594333321641068e-06, + "loss": 0.5468, + "step": 8978 + }, + { + "epoch": 0.73, + "grad_norm": 0.9347073297449352, + "learning_rate": 3.5923120612598828e-06, + "loss": 0.4568, + "step": 8979 + }, + { + "epoch": 0.73, + "grad_norm": 0.9029241676866938, + "learning_rate": 3.590291244925793e-06, + "loss": 0.4688, + "step": 8980 + }, + { + "epoch": 0.73, + "grad_norm": 0.8488345626658037, + "learning_rate": 3.588270872778833e-06, + "loss": 0.4678, + "step": 8981 + }, + { + "epoch": 0.73, + "grad_norm": 0.9033708830753971, + "learning_rate": 3.5862509449590135e-06, + "loss": 0.4801, + "step": 8982 + }, + { + "epoch": 0.73, + "grad_norm": 0.9888788145320088, + "learning_rate": 3.5842314616063134e-06, + "loss": 0.5105, + "step": 8983 + }, + { + "epoch": 0.73, + "grad_norm": 0.8813975859723068, + "learning_rate": 3.582212422860687e-06, + "loss": 0.4764, + "step": 8984 + }, + { + "epoch": 0.73, + "grad_norm": 1.0035285428927048, + "learning_rate": 3.5801938288620395e-06, + "loss": 0.4795, + "step": 8985 + }, + { + "epoch": 0.73, + "grad_norm": 0.9402018886597426, + "learning_rate": 3.5781756797502733e-06, + "loss": 0.4551, + "step": 8986 + }, + { + "epoch": 0.73, + "grad_norm": 0.9275798940011335, + "learning_rate": 3.576157975665232e-06, + "loss": 0.5709, + "step": 8987 + }, + { + "epoch": 0.73, + "grad_norm": 0.88546274771075, + "learning_rate": 3.5741407167467444e-06, + "loss": 0.4388, + "step": 8988 + }, + { + "epoch": 0.73, + "grad_norm": 0.8964212413484605, + "learning_rate": 3.5721239031346067e-06, + "loss": 0.4633, + "step": 8989 + }, + { + "epoch": 0.73, + "grad_norm": 0.8716362942550768, + "learning_rate": 3.5701075349685842e-06, + "loss": 0.4732, + "step": 8990 + }, + { + "epoch": 0.73, + "grad_norm": 0.872765870527491, + "learning_rate": 3.568091612388399e-06, + "loss": 0.4433, + "step": 8991 + }, + { + "epoch": 0.73, + "grad_norm": 0.8806035542732507, + "learning_rate": 3.566076135533767e-06, + "loss": 0.4762, + "step": 8992 + }, + { + "epoch": 0.73, + "grad_norm": 0.9063579768886735, + "learning_rate": 3.5640611045443485e-06, + "loss": 0.5312, + "step": 8993 + }, + { + "epoch": 0.73, + "grad_norm": 1.1252243359435834, + "learning_rate": 3.5620465195597865e-06, + "loss": 0.4534, + "step": 8994 + }, + { + "epoch": 0.73, + "grad_norm": 0.9411728999230131, + "learning_rate": 3.5600323807196912e-06, + "loss": 0.4822, + "step": 8995 + }, + { + "epoch": 0.73, + "grad_norm": 0.9028802160371076, + "learning_rate": 3.5580186881636414e-06, + "loss": 0.519, + "step": 8996 + }, + { + "epoch": 0.73, + "grad_norm": 0.918337987256864, + "learning_rate": 3.5560054420311776e-06, + "loss": 0.4788, + "step": 8997 + }, + { + "epoch": 0.73, + "grad_norm": 0.9989192754355186, + "learning_rate": 3.5539926424618265e-06, + "loss": 0.5386, + "step": 8998 + }, + { + "epoch": 0.73, + "grad_norm": 0.838803831436354, + "learning_rate": 3.551980289595064e-06, + "loss": 0.4483, + "step": 8999 + }, + { + "epoch": 0.73, + "grad_norm": 0.8445468941893491, + "learning_rate": 3.5499683835703493e-06, + "loss": 0.4687, + "step": 9000 + }, + { + "epoch": 0.73, + "grad_norm": 0.848857920123972, + "learning_rate": 3.547956924527103e-06, + "loss": 0.5277, + "step": 9001 + }, + { + "epoch": 0.73, + "grad_norm": 0.8680873155519754, + "learning_rate": 3.5459459126047226e-06, + "loss": 0.4873, + "step": 9002 + }, + { + "epoch": 0.73, + "grad_norm": 0.9469250103942338, + "learning_rate": 3.5439353479425597e-06, + "loss": 0.5117, + "step": 9003 + }, + { + "epoch": 0.73, + "grad_norm": 1.0264133283896932, + "learning_rate": 3.5419252306799567e-06, + "loss": 0.5101, + "step": 9004 + }, + { + "epoch": 0.73, + "grad_norm": 0.8621328186046997, + "learning_rate": 3.539915560956204e-06, + "loss": 0.4558, + "step": 9005 + }, + { + "epoch": 0.73, + "grad_norm": 0.8616535313317472, + "learning_rate": 3.5379063389105727e-06, + "loss": 0.4193, + "step": 9006 + }, + { + "epoch": 0.73, + "grad_norm": 0.8940689750239936, + "learning_rate": 3.5358975646823002e-06, + "loss": 0.5044, + "step": 9007 + }, + { + "epoch": 0.73, + "grad_norm": 0.9983698752868682, + "learning_rate": 3.533889238410596e-06, + "loss": 0.5353, + "step": 9008 + }, + { + "epoch": 0.73, + "grad_norm": 1.1563379381969516, + "learning_rate": 3.5318813602346257e-06, + "loss": 0.5731, + "step": 9009 + }, + { + "epoch": 0.73, + "grad_norm": 0.8336786353501896, + "learning_rate": 3.529873930293546e-06, + "loss": 0.457, + "step": 9010 + }, + { + "epoch": 0.73, + "grad_norm": 0.9314020854618265, + "learning_rate": 3.5278669487264583e-06, + "loss": 0.5305, + "step": 9011 + }, + { + "epoch": 0.73, + "grad_norm": 0.9373727014930834, + "learning_rate": 3.525860415672456e-06, + "loss": 0.5118, + "step": 9012 + }, + { + "epoch": 0.73, + "grad_norm": 1.036601676300659, + "learning_rate": 3.523854331270582e-06, + "loss": 0.5551, + "step": 9013 + }, + { + "epoch": 0.73, + "grad_norm": 0.9470780237472647, + "learning_rate": 3.5218486956598573e-06, + "loss": 0.4952, + "step": 9014 + }, + { + "epoch": 0.73, + "grad_norm": 0.9371280542011889, + "learning_rate": 3.5198435089792726e-06, + "loss": 0.5261, + "step": 9015 + }, + { + "epoch": 0.73, + "grad_norm": 0.9707691354773395, + "learning_rate": 3.5178387713677886e-06, + "loss": 0.4942, + "step": 9016 + }, + { + "epoch": 0.73, + "grad_norm": 0.9204704483209045, + "learning_rate": 3.515834482964321e-06, + "loss": 0.5014, + "step": 9017 + }, + { + "epoch": 0.73, + "grad_norm": 0.8861801504688388, + "learning_rate": 3.5138306439077784e-06, + "loss": 0.4825, + "step": 9018 + }, + { + "epoch": 0.73, + "grad_norm": 0.8744753752055189, + "learning_rate": 3.5118272543370157e-06, + "loss": 0.4632, + "step": 9019 + }, + { + "epoch": 0.73, + "grad_norm": 0.9304866500057495, + "learning_rate": 3.5098243143908694e-06, + "loss": 0.5482, + "step": 9020 + }, + { + "epoch": 0.73, + "grad_norm": 0.9796429237511085, + "learning_rate": 3.507821824208142e-06, + "loss": 0.5174, + "step": 9021 + }, + { + "epoch": 0.73, + "grad_norm": 0.8162021792821621, + "learning_rate": 3.5058197839276064e-06, + "loss": 0.4494, + "step": 9022 + }, + { + "epoch": 0.73, + "grad_norm": 0.8470310632620339, + "learning_rate": 3.5038181936879932e-06, + "loss": 0.4983, + "step": 9023 + }, + { + "epoch": 0.73, + "grad_norm": 0.9326230422525369, + "learning_rate": 3.5018170536280237e-06, + "loss": 0.4876, + "step": 9024 + }, + { + "epoch": 0.73, + "grad_norm": 1.081302621542103, + "learning_rate": 3.4998163638863646e-06, + "loss": 0.5806, + "step": 9025 + }, + { + "epoch": 0.73, + "grad_norm": 0.8398561267240168, + "learning_rate": 3.4978161246016664e-06, + "loss": 0.5022, + "step": 9026 + }, + { + "epoch": 0.73, + "grad_norm": 1.008510871462587, + "learning_rate": 3.495816335912543e-06, + "loss": 0.5308, + "step": 9027 + }, + { + "epoch": 0.73, + "grad_norm": 0.813490241806617, + "learning_rate": 3.493816997957582e-06, + "loss": 0.3922, + "step": 9028 + }, + { + "epoch": 0.73, + "grad_norm": 0.8522965620275039, + "learning_rate": 3.4918181108753247e-06, + "loss": 0.447, + "step": 9029 + }, + { + "epoch": 0.73, + "grad_norm": 1.0595297523859928, + "learning_rate": 3.4898196748043065e-06, + "loss": 0.4964, + "step": 9030 + }, + { + "epoch": 0.73, + "grad_norm": 0.9004845069622794, + "learning_rate": 3.4878216898830074e-06, + "loss": 0.4881, + "step": 9031 + }, + { + "epoch": 0.73, + "grad_norm": 0.9295585565321477, + "learning_rate": 3.4858241562498884e-06, + "loss": 0.5394, + "step": 9032 + }, + { + "epoch": 0.73, + "grad_norm": 0.9078302483781503, + "learning_rate": 3.4838270740433776e-06, + "loss": 0.4755, + "step": 9033 + }, + { + "epoch": 0.73, + "grad_norm": 0.891665135254238, + "learning_rate": 3.4818304434018734e-06, + "loss": 0.4798, + "step": 9034 + }, + { + "epoch": 0.73, + "grad_norm": 0.8578424459037337, + "learning_rate": 3.4798342644637327e-06, + "loss": 0.5138, + "step": 9035 + }, + { + "epoch": 0.73, + "grad_norm": 0.8970354823693978, + "learning_rate": 3.4778385373672996e-06, + "loss": 0.4402, + "step": 9036 + }, + { + "epoch": 0.73, + "grad_norm": 0.8832316717638571, + "learning_rate": 3.4758432622508677e-06, + "loss": 0.4979, + "step": 9037 + }, + { + "epoch": 0.73, + "grad_norm": 0.8935845014449028, + "learning_rate": 3.4738484392527107e-06, + "loss": 0.4677, + "step": 9038 + }, + { + "epoch": 0.73, + "grad_norm": 0.9709035988811506, + "learning_rate": 3.471854068511068e-06, + "loss": 0.4925, + "step": 9039 + }, + { + "epoch": 0.73, + "grad_norm": 0.99935079521221, + "learning_rate": 3.4698601501641517e-06, + "loss": 0.4836, + "step": 9040 + }, + { + "epoch": 0.73, + "grad_norm": 0.9496127903701702, + "learning_rate": 3.4678666843501276e-06, + "loss": 0.4915, + "step": 9041 + }, + { + "epoch": 0.73, + "grad_norm": 0.8638931125885806, + "learning_rate": 3.465873671207155e-06, + "loss": 0.5133, + "step": 9042 + }, + { + "epoch": 0.73, + "grad_norm": 0.8278711857326407, + "learning_rate": 3.4638811108733383e-06, + "loss": 0.4489, + "step": 9043 + }, + { + "epoch": 0.74, + "grad_norm": 0.9039654110449227, + "learning_rate": 3.4618890034867626e-06, + "loss": 0.5066, + "step": 9044 + }, + { + "epoch": 0.74, + "grad_norm": 0.9173666067331488, + "learning_rate": 3.4598973491854804e-06, + "loss": 0.473, + "step": 9045 + }, + { + "epoch": 0.74, + "grad_norm": 1.0228975566013427, + "learning_rate": 3.4579061481075137e-06, + "loss": 0.5098, + "step": 9046 + }, + { + "epoch": 0.74, + "grad_norm": 0.8442960933660839, + "learning_rate": 3.455915400390841e-06, + "loss": 0.4608, + "step": 9047 + }, + { + "epoch": 0.74, + "grad_norm": 0.9276664713771727, + "learning_rate": 3.4539251061734337e-06, + "loss": 0.5207, + "step": 9048 + }, + { + "epoch": 0.74, + "grad_norm": 0.8647425352503317, + "learning_rate": 3.451935265593207e-06, + "loss": 0.5113, + "step": 9049 + }, + { + "epoch": 0.74, + "grad_norm": 0.9182509306051623, + "learning_rate": 3.449945878788058e-06, + "loss": 0.4752, + "step": 9050 + }, + { + "epoch": 0.74, + "grad_norm": 0.9667703242017811, + "learning_rate": 3.4479569458958494e-06, + "loss": 0.5247, + "step": 9051 + }, + { + "epoch": 0.74, + "grad_norm": 0.890490488057399, + "learning_rate": 3.4459684670544157e-06, + "loss": 0.4682, + "step": 9052 + }, + { + "epoch": 0.74, + "grad_norm": 0.8894062584761423, + "learning_rate": 3.4439804424015486e-06, + "loss": 0.4623, + "step": 9053 + }, + { + "epoch": 0.74, + "grad_norm": 1.1678266481326605, + "learning_rate": 3.4419928720750274e-06, + "loss": 0.5092, + "step": 9054 + }, + { + "epoch": 0.74, + "grad_norm": 0.9180142808067063, + "learning_rate": 3.44000575621258e-06, + "loss": 0.4445, + "step": 9055 + }, + { + "epoch": 0.74, + "grad_norm": 0.9747151736279898, + "learning_rate": 3.4380190949519155e-06, + "loss": 0.5326, + "step": 9056 + }, + { + "epoch": 0.74, + "grad_norm": 0.8864171290008381, + "learning_rate": 3.4360328884307058e-06, + "loss": 0.4892, + "step": 9057 + }, + { + "epoch": 0.74, + "grad_norm": 0.9903037454617845, + "learning_rate": 3.4340471367865992e-06, + "loss": 0.5742, + "step": 9058 + }, + { + "epoch": 0.74, + "grad_norm": 0.8513222506288766, + "learning_rate": 3.432061840157196e-06, + "loss": 0.5226, + "step": 9059 + }, + { + "epoch": 0.74, + "grad_norm": 1.0307240084812919, + "learning_rate": 3.4300769986800863e-06, + "loss": 0.5663, + "step": 9060 + }, + { + "epoch": 0.74, + "grad_norm": 0.9968384356366111, + "learning_rate": 3.4280926124928115e-06, + "loss": 0.5288, + "step": 9061 + }, + { + "epoch": 0.74, + "grad_norm": 0.9519071916218125, + "learning_rate": 3.4261086817328882e-06, + "loss": 0.4699, + "step": 9062 + }, + { + "epoch": 0.74, + "grad_norm": 0.8450654754196741, + "learning_rate": 3.424125206537803e-06, + "loss": 0.4618, + "step": 9063 + }, + { + "epoch": 0.74, + "grad_norm": 0.8698269817828128, + "learning_rate": 3.422142187045011e-06, + "loss": 0.4738, + "step": 9064 + }, + { + "epoch": 0.74, + "grad_norm": 0.8777962455889311, + "learning_rate": 3.4201596233919243e-06, + "loss": 0.455, + "step": 9065 + }, + { + "epoch": 0.74, + "grad_norm": 0.9974300937915439, + "learning_rate": 3.418177515715947e-06, + "loss": 0.6206, + "step": 9066 + }, + { + "epoch": 0.74, + "grad_norm": 0.9838505746206851, + "learning_rate": 3.416195864154426e-06, + "loss": 0.525, + "step": 9067 + }, + { + "epoch": 0.74, + "grad_norm": 1.0055224851762832, + "learning_rate": 3.414214668844691e-06, + "loss": 0.4833, + "step": 9068 + }, + { + "epoch": 0.74, + "grad_norm": 1.012178258142545, + "learning_rate": 3.4122339299240383e-06, + "loss": 0.5122, + "step": 9069 + }, + { + "epoch": 0.74, + "grad_norm": 0.9477903204487034, + "learning_rate": 3.410253647529731e-06, + "loss": 0.5254, + "step": 9070 + }, + { + "epoch": 0.74, + "grad_norm": 0.9954868910750313, + "learning_rate": 3.408273821799001e-06, + "loss": 0.4825, + "step": 9071 + }, + { + "epoch": 0.74, + "grad_norm": 0.9441065536533816, + "learning_rate": 3.4062944528690512e-06, + "loss": 0.4791, + "step": 9072 + }, + { + "epoch": 0.74, + "grad_norm": 0.9837226393405486, + "learning_rate": 3.4043155408770435e-06, + "loss": 0.5217, + "step": 9073 + }, + { + "epoch": 0.74, + "grad_norm": 0.883534359457053, + "learning_rate": 3.4023370859601192e-06, + "loss": 0.4569, + "step": 9074 + }, + { + "epoch": 0.74, + "grad_norm": 0.8829348070197951, + "learning_rate": 3.400359088255383e-06, + "loss": 0.4665, + "step": 9075 + }, + { + "epoch": 0.74, + "grad_norm": 0.9242326361724409, + "learning_rate": 3.3983815478999073e-06, + "loss": 0.4844, + "step": 9076 + }, + { + "epoch": 0.74, + "grad_norm": 0.8751730630958149, + "learning_rate": 3.396404465030735e-06, + "loss": 0.5159, + "step": 9077 + }, + { + "epoch": 0.74, + "grad_norm": 0.9524856623633947, + "learning_rate": 3.3944278397848797e-06, + "loss": 0.4986, + "step": 9078 + }, + { + "epoch": 0.74, + "grad_norm": 0.9643769255032821, + "learning_rate": 3.3924516722993115e-06, + "loss": 0.5314, + "step": 9079 + }, + { + "epoch": 0.74, + "grad_norm": 0.9979175564097665, + "learning_rate": 3.3904759627109828e-06, + "loss": 0.5332, + "step": 9080 + }, + { + "epoch": 0.74, + "grad_norm": 0.9073140095491039, + "learning_rate": 3.388500711156807e-06, + "loss": 0.4793, + "step": 9081 + }, + { + "epoch": 0.74, + "grad_norm": 0.8235618593206219, + "learning_rate": 3.3865259177736663e-06, + "loss": 0.4428, + "step": 9082 + }, + { + "epoch": 0.74, + "grad_norm": 0.945033382039681, + "learning_rate": 3.3845515826984143e-06, + "loss": 0.576, + "step": 9083 + }, + { + "epoch": 0.74, + "grad_norm": 0.8513977017652784, + "learning_rate": 3.3825777060678734e-06, + "loss": 0.4843, + "step": 9084 + }, + { + "epoch": 0.74, + "grad_norm": 0.938115084213264, + "learning_rate": 3.380604288018824e-06, + "loss": 0.481, + "step": 9085 + }, + { + "epoch": 0.74, + "grad_norm": 0.9257780529818783, + "learning_rate": 3.3786313286880257e-06, + "loss": 0.4521, + "step": 9086 + }, + { + "epoch": 0.74, + "grad_norm": 0.9387396351436141, + "learning_rate": 3.3766588282122037e-06, + "loss": 0.458, + "step": 9087 + }, + { + "epoch": 0.74, + "grad_norm": 0.9144270665316344, + "learning_rate": 3.3746867867280496e-06, + "loss": 0.5025, + "step": 9088 + }, + { + "epoch": 0.74, + "grad_norm": 0.8722399311563757, + "learning_rate": 3.3727152043722257e-06, + "loss": 0.4524, + "step": 9089 + }, + { + "epoch": 0.74, + "grad_norm": 0.955180665187194, + "learning_rate": 3.3707440812813584e-06, + "loss": 0.515, + "step": 9090 + }, + { + "epoch": 0.74, + "grad_norm": 1.0856676436543693, + "learning_rate": 3.3687734175920505e-06, + "loss": 0.5452, + "step": 9091 + }, + { + "epoch": 0.74, + "grad_norm": 0.9367234887166933, + "learning_rate": 3.366803213440859e-06, + "loss": 0.5332, + "step": 9092 + }, + { + "epoch": 0.74, + "grad_norm": 0.9966112933019008, + "learning_rate": 3.3648334689643214e-06, + "loss": 0.4963, + "step": 9093 + }, + { + "epoch": 0.74, + "grad_norm": 0.9522114454224588, + "learning_rate": 3.3628641842989384e-06, + "loss": 0.4583, + "step": 9094 + }, + { + "epoch": 0.74, + "grad_norm": 0.8942989570626828, + "learning_rate": 3.36089535958118e-06, + "loss": 0.5099, + "step": 9095 + }, + { + "epoch": 0.74, + "grad_norm": 0.924628658722338, + "learning_rate": 3.3589269949474856e-06, + "loss": 0.4925, + "step": 9096 + }, + { + "epoch": 0.74, + "grad_norm": 0.890733017360491, + "learning_rate": 3.356959090534262e-06, + "loss": 0.4837, + "step": 9097 + }, + { + "epoch": 0.74, + "grad_norm": 0.9854852437262702, + "learning_rate": 3.3549916464778787e-06, + "loss": 0.4994, + "step": 9098 + }, + { + "epoch": 0.74, + "grad_norm": 0.9036658514233938, + "learning_rate": 3.35302466291468e-06, + "loss": 0.5012, + "step": 9099 + }, + { + "epoch": 0.74, + "grad_norm": 0.9687654855435422, + "learning_rate": 3.3510581399809762e-06, + "loss": 0.4729, + "step": 9100 + }, + { + "epoch": 0.74, + "grad_norm": 1.0067087474395098, + "learning_rate": 3.3490920778130455e-06, + "loss": 0.5084, + "step": 9101 + }, + { + "epoch": 0.74, + "grad_norm": 0.9463734582922495, + "learning_rate": 3.3471264765471346e-06, + "loss": 0.5594, + "step": 9102 + }, + { + "epoch": 0.74, + "grad_norm": 1.0202413779029968, + "learning_rate": 3.3451613363194603e-06, + "loss": 0.5247, + "step": 9103 + }, + { + "epoch": 0.74, + "grad_norm": 0.8987810339759174, + "learning_rate": 3.3431966572662e-06, + "loss": 0.5141, + "step": 9104 + }, + { + "epoch": 0.74, + "grad_norm": 0.8590651835433493, + "learning_rate": 3.341232439523506e-06, + "loss": 0.5129, + "step": 9105 + }, + { + "epoch": 0.74, + "grad_norm": 0.9656965528739612, + "learning_rate": 3.339268683227499e-06, + "loss": 0.5474, + "step": 9106 + }, + { + "epoch": 0.74, + "grad_norm": 0.8168953824209993, + "learning_rate": 3.3373053885142636e-06, + "loss": 0.4938, + "step": 9107 + }, + { + "epoch": 0.74, + "grad_norm": 0.9425801989390152, + "learning_rate": 3.335342555519855e-06, + "loss": 0.5095, + "step": 9108 + }, + { + "epoch": 0.74, + "grad_norm": 1.0167412583769833, + "learning_rate": 3.3333801843802994e-06, + "loss": 0.5314, + "step": 9109 + }, + { + "epoch": 0.74, + "grad_norm": 0.8834281023977991, + "learning_rate": 3.331418275231576e-06, + "loss": 0.4723, + "step": 9110 + }, + { + "epoch": 0.74, + "grad_norm": 0.9367658540260484, + "learning_rate": 3.3294568282096586e-06, + "loss": 0.4902, + "step": 9111 + }, + { + "epoch": 0.74, + "grad_norm": 1.0125504424161789, + "learning_rate": 3.3274958434504625e-06, + "loss": 0.5003, + "step": 9112 + }, + { + "epoch": 0.74, + "grad_norm": 0.8158336140834846, + "learning_rate": 3.3255353210898866e-06, + "loss": 0.4647, + "step": 9113 + }, + { + "epoch": 0.74, + "grad_norm": 0.8216839414525001, + "learning_rate": 3.3235752612637917e-06, + "loss": 0.496, + "step": 9114 + }, + { + "epoch": 0.74, + "grad_norm": 0.98806789784039, + "learning_rate": 3.3216156641080134e-06, + "loss": 0.4611, + "step": 9115 + }, + { + "epoch": 0.74, + "grad_norm": 0.8943357113254146, + "learning_rate": 3.319656529758339e-06, + "loss": 0.5236, + "step": 9116 + }, + { + "epoch": 0.74, + "grad_norm": 0.9066027966449727, + "learning_rate": 3.317697858350548e-06, + "loss": 0.4432, + "step": 9117 + }, + { + "epoch": 0.74, + "grad_norm": 0.8431741448497101, + "learning_rate": 3.3157396500203655e-06, + "loss": 0.4948, + "step": 9118 + }, + { + "epoch": 0.74, + "grad_norm": 0.9503603522033455, + "learning_rate": 3.3137819049034957e-06, + "loss": 0.4786, + "step": 9119 + }, + { + "epoch": 0.74, + "grad_norm": 0.9381882710223372, + "learning_rate": 3.31182462313561e-06, + "loss": 0.5003, + "step": 9120 + }, + { + "epoch": 0.74, + "grad_norm": 0.8472589492270928, + "learning_rate": 3.309867804852348e-06, + "loss": 0.4728, + "step": 9121 + }, + { + "epoch": 0.74, + "grad_norm": 0.9965341501914862, + "learning_rate": 3.3079114501893063e-06, + "loss": 0.5548, + "step": 9122 + }, + { + "epoch": 0.74, + "grad_norm": 0.9186851430217279, + "learning_rate": 3.3059555592820726e-06, + "loss": 0.4724, + "step": 9123 + }, + { + "epoch": 0.74, + "grad_norm": 0.9803717640615625, + "learning_rate": 3.3040001322661772e-06, + "loss": 0.5643, + "step": 9124 + }, + { + "epoch": 0.74, + "grad_norm": 1.011552371194042, + "learning_rate": 3.3020451692771337e-06, + "loss": 0.537, + "step": 9125 + }, + { + "epoch": 0.74, + "grad_norm": 0.9319617645191561, + "learning_rate": 3.3000906704504176e-06, + "loss": 0.5305, + "step": 9126 + }, + { + "epoch": 0.74, + "grad_norm": 0.98808305950421, + "learning_rate": 3.2981366359214806e-06, + "loss": 0.5001, + "step": 9127 + }, + { + "epoch": 0.74, + "grad_norm": 1.028054694369749, + "learning_rate": 3.296183065825722e-06, + "loss": 0.5232, + "step": 9128 + }, + { + "epoch": 0.74, + "grad_norm": 0.9378023811252546, + "learning_rate": 3.294229960298537e-06, + "loss": 0.5227, + "step": 9129 + }, + { + "epoch": 0.74, + "grad_norm": 0.8934671402668811, + "learning_rate": 3.2922773194752653e-06, + "loss": 0.4626, + "step": 9130 + }, + { + "epoch": 0.74, + "grad_norm": 0.8791588094117161, + "learning_rate": 3.2903251434912265e-06, + "loss": 0.4498, + "step": 9131 + }, + { + "epoch": 0.74, + "grad_norm": 0.9675333837599392, + "learning_rate": 3.288373432481703e-06, + "loss": 0.5532, + "step": 9132 + }, + { + "epoch": 0.74, + "grad_norm": 0.9734440759086094, + "learning_rate": 3.28642218658195e-06, + "loss": 0.4815, + "step": 9133 + }, + { + "epoch": 0.74, + "grad_norm": 0.9726973591593845, + "learning_rate": 3.2844714059271788e-06, + "loss": 0.5354, + "step": 9134 + }, + { + "epoch": 0.74, + "grad_norm": 0.8416426012871814, + "learning_rate": 3.2825210906525885e-06, + "loss": 0.4334, + "step": 9135 + }, + { + "epoch": 0.74, + "grad_norm": 0.854242262963165, + "learning_rate": 3.2805712408933223e-06, + "loss": 0.428, + "step": 9136 + }, + { + "epoch": 0.74, + "grad_norm": 0.9023628780098923, + "learning_rate": 3.278621856784514e-06, + "loss": 0.4703, + "step": 9137 + }, + { + "epoch": 0.74, + "grad_norm": 0.8885723574629798, + "learning_rate": 3.2766729384612473e-06, + "loss": 0.4343, + "step": 9138 + }, + { + "epoch": 0.74, + "grad_norm": 1.0417131985652142, + "learning_rate": 3.2747244860585823e-06, + "loss": 0.5197, + "step": 9139 + }, + { + "epoch": 0.74, + "grad_norm": 0.8936134291252534, + "learning_rate": 3.272776499711545e-06, + "loss": 0.4929, + "step": 9140 + }, + { + "epoch": 0.74, + "grad_norm": 1.0327830703736356, + "learning_rate": 3.270828979555133e-06, + "loss": 0.5942, + "step": 9141 + }, + { + "epoch": 0.74, + "grad_norm": 0.950809911721351, + "learning_rate": 3.2688819257242963e-06, + "loss": 0.4976, + "step": 9142 + }, + { + "epoch": 0.74, + "grad_norm": 0.9363799676811368, + "learning_rate": 3.266935338353978e-06, + "loss": 0.572, + "step": 9143 + }, + { + "epoch": 0.74, + "grad_norm": 0.893436845428354, + "learning_rate": 3.2649892175790667e-06, + "loss": 0.4879, + "step": 9144 + }, + { + "epoch": 0.74, + "grad_norm": 0.9416979646344061, + "learning_rate": 3.2630435635344283e-06, + "loss": 0.4463, + "step": 9145 + }, + { + "epoch": 0.74, + "grad_norm": 0.9346917071558514, + "learning_rate": 3.261098376354894e-06, + "loss": 0.5248, + "step": 9146 + }, + { + "epoch": 0.74, + "grad_norm": 0.9589850809495569, + "learning_rate": 3.259153656175269e-06, + "loss": 0.4937, + "step": 9147 + }, + { + "epoch": 0.74, + "grad_norm": 0.8429765478620248, + "learning_rate": 3.2572094031303103e-06, + "loss": 0.4373, + "step": 9148 + }, + { + "epoch": 0.74, + "grad_norm": 0.8736249174868056, + "learning_rate": 3.255265617354766e-06, + "loss": 0.4693, + "step": 9149 + }, + { + "epoch": 0.74, + "grad_norm": 0.8879306103161134, + "learning_rate": 3.253322298983327e-06, + "loss": 0.4574, + "step": 9150 + }, + { + "epoch": 0.74, + "grad_norm": 0.9695827910172448, + "learning_rate": 3.25137944815067e-06, + "loss": 0.5195, + "step": 9151 + }, + { + "epoch": 0.74, + "grad_norm": 0.8645467946324216, + "learning_rate": 3.2494370649914296e-06, + "loss": 0.4675, + "step": 9152 + }, + { + "epoch": 0.74, + "grad_norm": 1.0658706154975952, + "learning_rate": 3.2474951496402175e-06, + "loss": 0.567, + "step": 9153 + }, + { + "epoch": 0.74, + "grad_norm": 0.8517465790920219, + "learning_rate": 3.245553702231595e-06, + "loss": 0.4684, + "step": 9154 + }, + { + "epoch": 0.74, + "grad_norm": 1.0026469376848923, + "learning_rate": 3.243612722900117e-06, + "loss": 0.4907, + "step": 9155 + }, + { + "epoch": 0.74, + "grad_norm": 0.9615501023379072, + "learning_rate": 3.2416722117802803e-06, + "loss": 0.5011, + "step": 9156 + }, + { + "epoch": 0.74, + "grad_norm": 0.8908905189634679, + "learning_rate": 3.2397321690065643e-06, + "loss": 0.4729, + "step": 9157 + }, + { + "epoch": 0.74, + "grad_norm": 0.9676487859660207, + "learning_rate": 3.2377925947134137e-06, + "loss": 0.4864, + "step": 9158 + }, + { + "epoch": 0.74, + "grad_norm": 0.9666007142870987, + "learning_rate": 3.235853489035241e-06, + "loss": 0.5508, + "step": 9159 + }, + { + "epoch": 0.74, + "grad_norm": 1.0605811965729126, + "learning_rate": 3.2339148521064146e-06, + "loss": 0.5257, + "step": 9160 + }, + { + "epoch": 0.74, + "grad_norm": 1.0201172769043783, + "learning_rate": 3.2319766840612954e-06, + "loss": 0.4734, + "step": 9161 + }, + { + "epoch": 0.74, + "grad_norm": 0.8410082186099735, + "learning_rate": 3.230038985034184e-06, + "loss": 0.4574, + "step": 9162 + }, + { + "epoch": 0.74, + "grad_norm": 1.0832135483157863, + "learning_rate": 3.2281017551593665e-06, + "loss": 0.4434, + "step": 9163 + }, + { + "epoch": 0.74, + "grad_norm": 1.0581813148791377, + "learning_rate": 3.2261649945710916e-06, + "loss": 0.5907, + "step": 9164 + }, + { + "epoch": 0.74, + "grad_norm": 0.908821683888088, + "learning_rate": 3.2242287034035756e-06, + "loss": 0.4716, + "step": 9165 + }, + { + "epoch": 0.74, + "grad_norm": 0.8791225901790715, + "learning_rate": 3.222292881790996e-06, + "loss": 0.4546, + "step": 9166 + }, + { + "epoch": 0.75, + "grad_norm": 0.9342783962101158, + "learning_rate": 3.2203575298675126e-06, + "loss": 0.5162, + "step": 9167 + }, + { + "epoch": 0.75, + "grad_norm": 0.9890614420190501, + "learning_rate": 3.2184226477672366e-06, + "loss": 0.5029, + "step": 9168 + }, + { + "epoch": 0.75, + "grad_norm": 0.9082676055397952, + "learning_rate": 3.2164882356242555e-06, + "loss": 0.4896, + "step": 9169 + }, + { + "epoch": 0.75, + "grad_norm": 0.9268421651069393, + "learning_rate": 3.2145542935726224e-06, + "loss": 0.4767, + "step": 9170 + }, + { + "epoch": 0.75, + "grad_norm": 0.8961329360018759, + "learning_rate": 3.212620821746362e-06, + "loss": 0.4751, + "step": 9171 + }, + { + "epoch": 0.75, + "grad_norm": 0.8984912218661776, + "learning_rate": 3.2106878202794513e-06, + "loss": 0.5043, + "step": 9172 + }, + { + "epoch": 0.75, + "grad_norm": 0.9263734336016696, + "learning_rate": 3.2087552893058594e-06, + "loss": 0.4616, + "step": 9173 + }, + { + "epoch": 0.75, + "grad_norm": 0.992989819188374, + "learning_rate": 3.206823228959498e-06, + "loss": 0.5415, + "step": 9174 + }, + { + "epoch": 0.75, + "grad_norm": 0.96993392568458, + "learning_rate": 3.2048916393742622e-06, + "loss": 0.5095, + "step": 9175 + }, + { + "epoch": 0.75, + "grad_norm": 0.9049184006651029, + "learning_rate": 3.2029605206840088e-06, + "loss": 0.4609, + "step": 9176 + }, + { + "epoch": 0.75, + "grad_norm": 0.8877311555149989, + "learning_rate": 3.201029873022565e-06, + "loss": 0.4444, + "step": 9177 + }, + { + "epoch": 0.75, + "grad_norm": 0.8661027546548056, + "learning_rate": 3.1990996965237143e-06, + "loss": 0.408, + "step": 9178 + }, + { + "epoch": 0.75, + "grad_norm": 0.9130267941869538, + "learning_rate": 3.1971699913212272e-06, + "loss": 0.4907, + "step": 9179 + }, + { + "epoch": 0.75, + "grad_norm": 0.9096717689460737, + "learning_rate": 3.1952407575488243e-06, + "loss": 0.5102, + "step": 9180 + }, + { + "epoch": 0.75, + "grad_norm": 0.8384692208379614, + "learning_rate": 3.1933119953402e-06, + "loss": 0.4654, + "step": 9181 + }, + { + "epoch": 0.75, + "grad_norm": 0.916298509249038, + "learning_rate": 3.1913837048290176e-06, + "loss": 0.4712, + "step": 9182 + }, + { + "epoch": 0.75, + "grad_norm": 1.0794714537379706, + "learning_rate": 3.189455886148908e-06, + "loss": 0.509, + "step": 9183 + }, + { + "epoch": 0.75, + "grad_norm": 0.9145557666072167, + "learning_rate": 3.1875285394334575e-06, + "loss": 0.5139, + "step": 9184 + }, + { + "epoch": 0.75, + "grad_norm": 0.9588017745471442, + "learning_rate": 3.1856016648162435e-06, + "loss": 0.511, + "step": 9185 + }, + { + "epoch": 0.75, + "grad_norm": 0.93977533850814, + "learning_rate": 3.1836752624307878e-06, + "loss": 0.4925, + "step": 9186 + }, + { + "epoch": 0.75, + "grad_norm": 0.9905180889851486, + "learning_rate": 3.1817493324105884e-06, + "loss": 0.5343, + "step": 9187 + }, + { + "epoch": 0.75, + "grad_norm": 0.9626382947667614, + "learning_rate": 3.179823874889113e-06, + "loss": 0.5192, + "step": 9188 + }, + { + "epoch": 0.75, + "grad_norm": 1.008409226761721, + "learning_rate": 3.1778988899997977e-06, + "loss": 0.576, + "step": 9189 + }, + { + "epoch": 0.75, + "grad_norm": 0.9254016781809032, + "learning_rate": 3.175974377876031e-06, + "loss": 0.5201, + "step": 9190 + }, + { + "epoch": 0.75, + "grad_norm": 0.8827104917802043, + "learning_rate": 3.1740503386511933e-06, + "loss": 0.5004, + "step": 9191 + }, + { + "epoch": 0.75, + "grad_norm": 0.9195045711783788, + "learning_rate": 3.17212677245861e-06, + "loss": 0.4723, + "step": 9192 + }, + { + "epoch": 0.75, + "grad_norm": 0.9178177294623786, + "learning_rate": 3.1702036794315837e-06, + "loss": 0.52, + "step": 9193 + }, + { + "epoch": 0.75, + "grad_norm": 0.930776420999054, + "learning_rate": 3.1682810597033853e-06, + "loss": 0.5447, + "step": 9194 + }, + { + "epoch": 0.75, + "grad_norm": 0.8603670066733564, + "learning_rate": 3.1663589134072537e-06, + "loss": 0.539, + "step": 9195 + }, + { + "epoch": 0.75, + "grad_norm": 0.9266579862289025, + "learning_rate": 3.16443724067638e-06, + "loss": 0.4666, + "step": 9196 + }, + { + "epoch": 0.75, + "grad_norm": 0.8913826793152476, + "learning_rate": 3.1625160416439503e-06, + "loss": 0.4906, + "step": 9197 + }, + { + "epoch": 0.75, + "grad_norm": 0.8448070499801077, + "learning_rate": 3.1605953164430904e-06, + "loss": 0.4222, + "step": 9198 + }, + { + "epoch": 0.75, + "grad_norm": 0.8777398534519062, + "learning_rate": 3.1586750652069077e-06, + "loss": 0.5065, + "step": 9199 + }, + { + "epoch": 0.75, + "grad_norm": 0.8780703673114545, + "learning_rate": 3.156755288068475e-06, + "loss": 0.494, + "step": 9200 + }, + { + "epoch": 0.75, + "grad_norm": 1.002260390936769, + "learning_rate": 3.1548359851608344e-06, + "loss": 0.5149, + "step": 9201 + }, + { + "epoch": 0.75, + "grad_norm": 0.9969350471910058, + "learning_rate": 3.1529171566169825e-06, + "loss": 0.5127, + "step": 9202 + }, + { + "epoch": 0.75, + "grad_norm": 0.8782140061192855, + "learning_rate": 3.1509988025699046e-06, + "loss": 0.4238, + "step": 9203 + }, + { + "epoch": 0.75, + "grad_norm": 0.9204540185363628, + "learning_rate": 3.14908092315253e-06, + "loss": 0.5102, + "step": 9204 + }, + { + "epoch": 0.75, + "grad_norm": 0.8708557332916683, + "learning_rate": 3.147163518497772e-06, + "loss": 0.4824, + "step": 9205 + }, + { + "epoch": 0.75, + "grad_norm": 0.9061520741038792, + "learning_rate": 3.145246588738503e-06, + "loss": 0.4841, + "step": 9206 + }, + { + "epoch": 0.75, + "grad_norm": 0.9419277444337851, + "learning_rate": 3.1433301340075694e-06, + "loss": 0.4981, + "step": 9207 + }, + { + "epoch": 0.75, + "grad_norm": 1.0182335668415685, + "learning_rate": 3.1414141544377686e-06, + "loss": 0.5721, + "step": 9208 + }, + { + "epoch": 0.75, + "grad_norm": 0.9456494049006637, + "learning_rate": 3.1394986501618897e-06, + "loss": 0.4902, + "step": 9209 + }, + { + "epoch": 0.75, + "grad_norm": 0.9234880089818398, + "learning_rate": 3.1375836213126653e-06, + "loss": 0.5534, + "step": 9210 + }, + { + "epoch": 0.75, + "grad_norm": 0.8922445520981266, + "learning_rate": 3.135669068022811e-06, + "loss": 0.4862, + "step": 9211 + }, + { + "epoch": 0.75, + "grad_norm": 1.0138485298431035, + "learning_rate": 3.1337549904249996e-06, + "loss": 0.5239, + "step": 9212 + }, + { + "epoch": 0.75, + "grad_norm": 0.8779972974608878, + "learning_rate": 3.1318413886518804e-06, + "loss": 0.4709, + "step": 9213 + }, + { + "epoch": 0.75, + "grad_norm": 0.9511817289467243, + "learning_rate": 3.129928262836055e-06, + "loss": 0.5514, + "step": 9214 + }, + { + "epoch": 0.75, + "grad_norm": 0.9160031996320639, + "learning_rate": 3.1280156131101136e-06, + "loss": 0.4835, + "step": 9215 + }, + { + "epoch": 0.75, + "grad_norm": 0.9136188010687992, + "learning_rate": 3.1261034396065924e-06, + "loss": 0.4356, + "step": 9216 + }, + { + "epoch": 0.75, + "grad_norm": 0.9672584722550528, + "learning_rate": 3.1241917424580047e-06, + "loss": 0.5193, + "step": 9217 + }, + { + "epoch": 0.75, + "grad_norm": 0.8269303323982298, + "learning_rate": 3.122280521796831e-06, + "loss": 0.4713, + "step": 9218 + }, + { + "epoch": 0.75, + "grad_norm": 0.9173435143365514, + "learning_rate": 3.1203697777555163e-06, + "loss": 0.5053, + "step": 9219 + }, + { + "epoch": 0.75, + "grad_norm": 0.8772695738063876, + "learning_rate": 3.1184595104664726e-06, + "loss": 0.5159, + "step": 9220 + }, + { + "epoch": 0.75, + "grad_norm": 0.8953996339227956, + "learning_rate": 3.1165497200620863e-06, + "loss": 0.4851, + "step": 9221 + }, + { + "epoch": 0.75, + "grad_norm": 0.9178968916849854, + "learning_rate": 3.114640406674694e-06, + "loss": 0.4623, + "step": 9222 + }, + { + "epoch": 0.75, + "grad_norm": 0.9705439786022239, + "learning_rate": 3.1127315704366144e-06, + "loss": 0.4735, + "step": 9223 + }, + { + "epoch": 0.75, + "grad_norm": 1.0537737416285637, + "learning_rate": 3.1108232114801283e-06, + "loss": 0.5305, + "step": 9224 + }, + { + "epoch": 0.75, + "grad_norm": 1.006997561164009, + "learning_rate": 3.108915329937483e-06, + "loss": 0.4649, + "step": 9225 + }, + { + "epoch": 0.75, + "grad_norm": 0.9484263275516899, + "learning_rate": 3.1070079259408934e-06, + "loss": 0.5857, + "step": 9226 + }, + { + "epoch": 0.75, + "grad_norm": 0.8931141053668399, + "learning_rate": 3.1051009996225434e-06, + "loss": 0.433, + "step": 9227 + }, + { + "epoch": 0.75, + "grad_norm": 0.9955212573195641, + "learning_rate": 3.1031945511145744e-06, + "loss": 0.4869, + "step": 9228 + }, + { + "epoch": 0.75, + "grad_norm": 0.9445210297345634, + "learning_rate": 3.101288580549107e-06, + "loss": 0.5201, + "step": 9229 + }, + { + "epoch": 0.75, + "grad_norm": 0.8845297197394515, + "learning_rate": 3.09938308805822e-06, + "loss": 0.4468, + "step": 9230 + }, + { + "epoch": 0.75, + "grad_norm": 0.8677616975176144, + "learning_rate": 3.0974780737739653e-06, + "loss": 0.4696, + "step": 9231 + }, + { + "epoch": 0.75, + "grad_norm": 0.8440699624266844, + "learning_rate": 3.095573537828357e-06, + "loss": 0.4719, + "step": 9232 + }, + { + "epoch": 0.75, + "grad_norm": 1.0118764043381994, + "learning_rate": 3.0936694803533817e-06, + "loss": 0.5053, + "step": 9233 + }, + { + "epoch": 0.75, + "grad_norm": 1.010118539516588, + "learning_rate": 3.091765901480983e-06, + "loss": 0.5169, + "step": 9234 + }, + { + "epoch": 0.75, + "grad_norm": 0.8893225432422138, + "learning_rate": 3.0898628013430787e-06, + "loss": 0.4511, + "step": 9235 + }, + { + "epoch": 0.75, + "grad_norm": 0.9298359284103193, + "learning_rate": 3.087960180071553e-06, + "loss": 0.5121, + "step": 9236 + }, + { + "epoch": 0.75, + "grad_norm": 0.899101273379794, + "learning_rate": 3.0860580377982563e-06, + "loss": 0.5377, + "step": 9237 + }, + { + "epoch": 0.75, + "grad_norm": 0.9002401268153558, + "learning_rate": 3.084156374655005e-06, + "loss": 0.4642, + "step": 9238 + }, + { + "epoch": 0.75, + "grad_norm": 0.9113145964585156, + "learning_rate": 3.0822551907735833e-06, + "loss": 0.5161, + "step": 9239 + }, + { + "epoch": 0.75, + "grad_norm": 0.8565877087319691, + "learning_rate": 3.080354486285743e-06, + "loss": 0.4561, + "step": 9240 + }, + { + "epoch": 0.75, + "grad_norm": 0.9296165703485347, + "learning_rate": 3.078454261323196e-06, + "loss": 0.4638, + "step": 9241 + }, + { + "epoch": 0.75, + "grad_norm": 0.9871738649945768, + "learning_rate": 3.076554516017629e-06, + "loss": 0.5422, + "step": 9242 + }, + { + "epoch": 0.75, + "grad_norm": 0.9845398816258983, + "learning_rate": 3.074655250500693e-06, + "loss": 0.5418, + "step": 9243 + }, + { + "epoch": 0.75, + "grad_norm": 0.8907187812517292, + "learning_rate": 3.0727564649040066e-06, + "loss": 0.4575, + "step": 9244 + }, + { + "epoch": 0.75, + "grad_norm": 0.9005922717320375, + "learning_rate": 3.0708581593591513e-06, + "loss": 0.4844, + "step": 9245 + }, + { + "epoch": 0.75, + "grad_norm": 0.8811211339067954, + "learning_rate": 3.068960333997684e-06, + "loss": 0.4851, + "step": 9246 + }, + { + "epoch": 0.75, + "grad_norm": 0.8297490979293182, + "learning_rate": 3.0670629889511128e-06, + "loss": 0.4663, + "step": 9247 + }, + { + "epoch": 0.75, + "grad_norm": 0.9965530095100212, + "learning_rate": 3.0651661243509277e-06, + "loss": 0.465, + "step": 9248 + }, + { + "epoch": 0.75, + "grad_norm": 0.9304897978087547, + "learning_rate": 3.063269740328579e-06, + "loss": 0.5054, + "step": 9249 + }, + { + "epoch": 0.75, + "grad_norm": 1.247859611749849, + "learning_rate": 3.0613738370154853e-06, + "loss": 0.5959, + "step": 9250 + }, + { + "epoch": 0.75, + "grad_norm": 0.8559408643557311, + "learning_rate": 3.059478414543029e-06, + "loss": 0.4542, + "step": 9251 + }, + { + "epoch": 0.75, + "grad_norm": 0.9487040433145538, + "learning_rate": 3.0575834730425658e-06, + "loss": 0.443, + "step": 9252 + }, + { + "epoch": 0.75, + "grad_norm": 1.0418904384512588, + "learning_rate": 3.0556890126454075e-06, + "loss": 0.5321, + "step": 9253 + }, + { + "epoch": 0.75, + "grad_norm": 0.9142524061866827, + "learning_rate": 3.0537950334828405e-06, + "loss": 0.5283, + "step": 9254 + }, + { + "epoch": 0.75, + "grad_norm": 0.9849848109141258, + "learning_rate": 3.051901535686116e-06, + "loss": 0.5214, + "step": 9255 + }, + { + "epoch": 0.75, + "grad_norm": 0.9474074774382758, + "learning_rate": 3.0500085193864525e-06, + "loss": 0.4811, + "step": 9256 + }, + { + "epoch": 0.75, + "grad_norm": 0.9814885554674336, + "learning_rate": 3.0481159847150343e-06, + "loss": 0.5196, + "step": 9257 + }, + { + "epoch": 0.75, + "grad_norm": 0.9491077327581923, + "learning_rate": 3.046223931803015e-06, + "loss": 0.4665, + "step": 9258 + }, + { + "epoch": 0.75, + "grad_norm": 0.9047139713494868, + "learning_rate": 3.044332360781502e-06, + "loss": 0.4961, + "step": 9259 + }, + { + "epoch": 0.75, + "grad_norm": 0.9322500438401069, + "learning_rate": 3.0424412717815943e-06, + "loss": 0.4989, + "step": 9260 + }, + { + "epoch": 0.75, + "grad_norm": 1.048631041096126, + "learning_rate": 3.040550664934332e-06, + "loss": 0.5397, + "step": 9261 + }, + { + "epoch": 0.75, + "grad_norm": 0.9549270032708144, + "learning_rate": 3.0386605403707347e-06, + "loss": 0.5324, + "step": 9262 + }, + { + "epoch": 0.75, + "grad_norm": 0.9334118783548825, + "learning_rate": 3.036770898221787e-06, + "loss": 0.4805, + "step": 9263 + }, + { + "epoch": 0.75, + "grad_norm": 0.8760160616514998, + "learning_rate": 3.0348817386184403e-06, + "loss": 0.523, + "step": 9264 + }, + { + "epoch": 0.75, + "grad_norm": 0.8602677591239977, + "learning_rate": 3.0329930616916114e-06, + "loss": 0.4556, + "step": 9265 + }, + { + "epoch": 0.75, + "grad_norm": 0.942749923235785, + "learning_rate": 3.0311048675721865e-06, + "loss": 0.4959, + "step": 9266 + }, + { + "epoch": 0.75, + "grad_norm": 0.967553145530537, + "learning_rate": 3.02921715639101e-06, + "loss": 0.5025, + "step": 9267 + }, + { + "epoch": 0.75, + "grad_norm": 0.875163075806297, + "learning_rate": 3.0273299282789004e-06, + "loss": 0.4791, + "step": 9268 + }, + { + "epoch": 0.75, + "grad_norm": 0.9610020745516507, + "learning_rate": 3.025443183366643e-06, + "loss": 0.5296, + "step": 9269 + }, + { + "epoch": 0.75, + "grad_norm": 1.0015074057092335, + "learning_rate": 3.023556921784987e-06, + "loss": 0.5247, + "step": 9270 + }, + { + "epoch": 0.75, + "grad_norm": 0.93960616711754, + "learning_rate": 3.021671143664647e-06, + "loss": 0.481, + "step": 9271 + }, + { + "epoch": 0.75, + "grad_norm": 0.9447586684544488, + "learning_rate": 3.019785849136311e-06, + "loss": 0.5358, + "step": 9272 + }, + { + "epoch": 0.75, + "grad_norm": 0.9784661202493571, + "learning_rate": 3.0179010383306208e-06, + "loss": 0.4865, + "step": 9273 + }, + { + "epoch": 0.75, + "grad_norm": 0.9071899625936168, + "learning_rate": 3.0160167113781945e-06, + "loss": 0.5029, + "step": 9274 + }, + { + "epoch": 0.75, + "grad_norm": 0.839349146751607, + "learning_rate": 3.014132868409617e-06, + "loss": 0.5041, + "step": 9275 + }, + { + "epoch": 0.75, + "grad_norm": 0.8927128357241134, + "learning_rate": 3.012249509555435e-06, + "loss": 0.4717, + "step": 9276 + }, + { + "epoch": 0.75, + "grad_norm": 0.9196859640341792, + "learning_rate": 3.0103666349461624e-06, + "loss": 0.5086, + "step": 9277 + }, + { + "epoch": 0.75, + "grad_norm": 1.0067864835070655, + "learning_rate": 3.008484244712286e-06, + "loss": 0.4523, + "step": 9278 + }, + { + "epoch": 0.75, + "grad_norm": 0.933143092486881, + "learning_rate": 3.0066023389842446e-06, + "loss": 0.482, + "step": 9279 + }, + { + "epoch": 0.75, + "grad_norm": 0.9386096596080008, + "learning_rate": 3.004720917892464e-06, + "loss": 0.4981, + "step": 9280 + }, + { + "epoch": 0.75, + "grad_norm": 0.9007385757794655, + "learning_rate": 3.0028399815673147e-06, + "loss": 0.4798, + "step": 9281 + }, + { + "epoch": 0.75, + "grad_norm": 1.080130380478513, + "learning_rate": 3.0009595301391494e-06, + "loss": 0.5987, + "step": 9282 + }, + { + "epoch": 0.75, + "grad_norm": 1.0126379378704609, + "learning_rate": 2.999079563738281e-06, + "loss": 0.494, + "step": 9283 + }, + { + "epoch": 0.75, + "grad_norm": 0.9617860043838014, + "learning_rate": 2.9972000824949908e-06, + "loss": 0.5165, + "step": 9284 + }, + { + "epoch": 0.75, + "grad_norm": 0.8559597205644912, + "learning_rate": 2.9953210865395176e-06, + "loss": 0.4558, + "step": 9285 + }, + { + "epoch": 0.75, + "grad_norm": 0.8985111470687281, + "learning_rate": 2.9934425760020857e-06, + "loss": 0.4772, + "step": 9286 + }, + { + "epoch": 0.75, + "grad_norm": 0.9475458320896335, + "learning_rate": 2.9915645510128666e-06, + "loss": 0.454, + "step": 9287 + }, + { + "epoch": 0.75, + "grad_norm": 0.935781669540675, + "learning_rate": 2.9896870117020073e-06, + "loss": 0.4811, + "step": 9288 + }, + { + "epoch": 0.75, + "grad_norm": 0.9664864293034208, + "learning_rate": 2.987809958199619e-06, + "loss": 0.577, + "step": 9289 + }, + { + "epoch": 0.76, + "grad_norm": 0.8836746825941467, + "learning_rate": 2.9859333906357845e-06, + "loss": 0.5378, + "step": 9290 + }, + { + "epoch": 0.76, + "grad_norm": 0.9582144787011051, + "learning_rate": 2.984057309140539e-06, + "loss": 0.478, + "step": 9291 + }, + { + "epoch": 0.76, + "grad_norm": 0.8878155885088148, + "learning_rate": 2.9821817138439036e-06, + "loss": 0.5188, + "step": 9292 + }, + { + "epoch": 0.76, + "grad_norm": 0.8990987481542069, + "learning_rate": 2.980306604875849e-06, + "loss": 0.4875, + "step": 9293 + }, + { + "epoch": 0.76, + "grad_norm": 0.927337620844019, + "learning_rate": 2.9784319823663188e-06, + "loss": 0.5161, + "step": 9294 + }, + { + "epoch": 0.76, + "grad_norm": 0.8955209176918927, + "learning_rate": 2.976557846445225e-06, + "loss": 0.5026, + "step": 9295 + }, + { + "epoch": 0.76, + "grad_norm": 0.9350116459240718, + "learning_rate": 2.9746841972424456e-06, + "loss": 0.5127, + "step": 9296 + }, + { + "epoch": 0.76, + "grad_norm": 0.9022083479959, + "learning_rate": 2.9728110348878135e-06, + "loss": 0.4831, + "step": 9297 + }, + { + "epoch": 0.76, + "grad_norm": 0.8905880164735378, + "learning_rate": 2.9709383595111506e-06, + "loss": 0.4788, + "step": 9298 + }, + { + "epoch": 0.76, + "grad_norm": 0.9840888834902731, + "learning_rate": 2.969066171242221e-06, + "loss": 0.5257, + "step": 9299 + }, + { + "epoch": 0.76, + "grad_norm": 0.8482427926265703, + "learning_rate": 2.967194470210769e-06, + "loss": 0.5342, + "step": 9300 + }, + { + "epoch": 0.76, + "grad_norm": 0.8943824965526529, + "learning_rate": 2.9653232565465017e-06, + "loss": 0.4111, + "step": 9301 + }, + { + "epoch": 0.76, + "grad_norm": 0.8255995194547043, + "learning_rate": 2.9634525303790973e-06, + "loss": 0.4847, + "step": 9302 + }, + { + "epoch": 0.76, + "grad_norm": 0.9574954648262838, + "learning_rate": 2.9615822918381844e-06, + "loss": 0.5338, + "step": 9303 + }, + { + "epoch": 0.76, + "grad_norm": 0.9660797147778306, + "learning_rate": 2.959712541053381e-06, + "loss": 0.4644, + "step": 9304 + }, + { + "epoch": 0.76, + "grad_norm": 0.9252816124540626, + "learning_rate": 2.9578432781542523e-06, + "loss": 0.5135, + "step": 9305 + }, + { + "epoch": 0.76, + "grad_norm": 0.9478822301015203, + "learning_rate": 2.955974503270337e-06, + "loss": 0.4625, + "step": 9306 + }, + { + "epoch": 0.76, + "grad_norm": 0.962311200737902, + "learning_rate": 2.954106216531141e-06, + "loss": 0.5013, + "step": 9307 + }, + { + "epoch": 0.76, + "grad_norm": 0.838888152056901, + "learning_rate": 2.952238418066137e-06, + "loss": 0.4996, + "step": 9308 + }, + { + "epoch": 0.76, + "grad_norm": 0.98241779392544, + "learning_rate": 2.9503711080047535e-06, + "loss": 0.4344, + "step": 9309 + }, + { + "epoch": 0.76, + "grad_norm": 1.0205651361476762, + "learning_rate": 2.9485042864764047e-06, + "loss": 0.5232, + "step": 9310 + }, + { + "epoch": 0.76, + "grad_norm": 0.9279477006987505, + "learning_rate": 2.9466379536104518e-06, + "loss": 0.4812, + "step": 9311 + }, + { + "epoch": 0.76, + "grad_norm": 0.8593933307119445, + "learning_rate": 2.9447721095362325e-06, + "loss": 0.4849, + "step": 9312 + }, + { + "epoch": 0.76, + "grad_norm": 0.9690665581476634, + "learning_rate": 2.942906754383048e-06, + "loss": 0.4679, + "step": 9313 + }, + { + "epoch": 0.76, + "grad_norm": 0.8951759511066802, + "learning_rate": 2.9410418882801682e-06, + "loss": 0.5178, + "step": 9314 + }, + { + "epoch": 0.76, + "grad_norm": 0.9530573012082669, + "learning_rate": 2.939177511356819e-06, + "loss": 0.5129, + "step": 9315 + }, + { + "epoch": 0.76, + "grad_norm": 1.0308139688157876, + "learning_rate": 2.9373136237422107e-06, + "loss": 0.5144, + "step": 9316 + }, + { + "epoch": 0.76, + "grad_norm": 0.9297141613759037, + "learning_rate": 2.9354502255655002e-06, + "loss": 0.5619, + "step": 9317 + }, + { + "epoch": 0.76, + "grad_norm": 0.8485557605262156, + "learning_rate": 2.9335873169558236e-06, + "loss": 0.4499, + "step": 9318 + }, + { + "epoch": 0.76, + "grad_norm": 0.9753919721602036, + "learning_rate": 2.9317248980422785e-06, + "loss": 0.5164, + "step": 9319 + }, + { + "epoch": 0.76, + "grad_norm": 0.9329553550167934, + "learning_rate": 2.9298629689539315e-06, + "loss": 0.4638, + "step": 9320 + }, + { + "epoch": 0.76, + "grad_norm": 0.9864133114751812, + "learning_rate": 2.9280015298198026e-06, + "loss": 0.4754, + "step": 9321 + }, + { + "epoch": 0.76, + "grad_norm": 0.9716573961646281, + "learning_rate": 2.9261405807689014e-06, + "loss": 0.5036, + "step": 9322 + }, + { + "epoch": 0.76, + "grad_norm": 0.9333844851764159, + "learning_rate": 2.9242801219301797e-06, + "loss": 0.4928, + "step": 9323 + }, + { + "epoch": 0.76, + "grad_norm": 0.8484772389666334, + "learning_rate": 2.9224201534325703e-06, + "loss": 0.4605, + "step": 9324 + }, + { + "epoch": 0.76, + "grad_norm": 0.887435635454266, + "learning_rate": 2.9205606754049667e-06, + "loss": 0.4935, + "step": 9325 + }, + { + "epoch": 0.76, + "grad_norm": 0.9015840885486097, + "learning_rate": 2.918701687976231e-06, + "loss": 0.4776, + "step": 9326 + }, + { + "epoch": 0.76, + "grad_norm": 0.9225799390724025, + "learning_rate": 2.9168431912751805e-06, + "loss": 0.489, + "step": 9327 + }, + { + "epoch": 0.76, + "grad_norm": 0.9216293988583227, + "learning_rate": 2.914985185430621e-06, + "loss": 0.4783, + "step": 9328 + }, + { + "epoch": 0.76, + "grad_norm": 0.9834546685153829, + "learning_rate": 2.9131276705713008e-06, + "loss": 0.5459, + "step": 9329 + }, + { + "epoch": 0.76, + "grad_norm": 0.9503774917454169, + "learning_rate": 2.9112706468259478e-06, + "loss": 0.5112, + "step": 9330 + }, + { + "epoch": 0.76, + "grad_norm": 0.8558042581003334, + "learning_rate": 2.90941411432325e-06, + "loss": 0.4642, + "step": 9331 + }, + { + "epoch": 0.76, + "grad_norm": 0.8959538771990759, + "learning_rate": 2.9075580731918684e-06, + "loss": 0.4843, + "step": 9332 + }, + { + "epoch": 0.76, + "grad_norm": 0.9834208135406659, + "learning_rate": 2.905702523560415e-06, + "loss": 0.4608, + "step": 9333 + }, + { + "epoch": 0.76, + "grad_norm": 0.926465065803759, + "learning_rate": 2.90384746555749e-06, + "loss": 0.4842, + "step": 9334 + }, + { + "epoch": 0.76, + "grad_norm": 0.8753341930421762, + "learning_rate": 2.9019928993116388e-06, + "loss": 0.4617, + "step": 9335 + }, + { + "epoch": 0.76, + "grad_norm": 0.87421173569085, + "learning_rate": 2.900138824951383e-06, + "loss": 0.5181, + "step": 9336 + }, + { + "epoch": 0.76, + "grad_norm": 0.9219804198644024, + "learning_rate": 2.89828524260521e-06, + "loss": 0.4836, + "step": 9337 + }, + { + "epoch": 0.76, + "grad_norm": 0.9697470723187763, + "learning_rate": 2.8964321524015725e-06, + "loss": 0.5239, + "step": 9338 + }, + { + "epoch": 0.76, + "grad_norm": 0.9569066941173024, + "learning_rate": 2.8945795544688814e-06, + "loss": 0.544, + "step": 9339 + }, + { + "epoch": 0.76, + "grad_norm": 0.982711465105853, + "learning_rate": 2.8927274489355296e-06, + "loss": 0.5016, + "step": 9340 + }, + { + "epoch": 0.76, + "grad_norm": 0.9280691158631974, + "learning_rate": 2.890875835929858e-06, + "loss": 0.5355, + "step": 9341 + }, + { + "epoch": 0.76, + "grad_norm": 0.9580433080173101, + "learning_rate": 2.8890247155801864e-06, + "loss": 0.5485, + "step": 9342 + }, + { + "epoch": 0.76, + "grad_norm": 0.9684247483206608, + "learning_rate": 2.8871740880147935e-06, + "loss": 0.5603, + "step": 9343 + }, + { + "epoch": 0.76, + "grad_norm": 0.8835115268579586, + "learning_rate": 2.8853239533619314e-06, + "loss": 0.5186, + "step": 9344 + }, + { + "epoch": 0.76, + "grad_norm": 1.2424654229776397, + "learning_rate": 2.883474311749802e-06, + "loss": 0.5008, + "step": 9345 + }, + { + "epoch": 0.76, + "grad_norm": 0.842275429217379, + "learning_rate": 2.8816251633065963e-06, + "loss": 0.4049, + "step": 9346 + }, + { + "epoch": 0.76, + "grad_norm": 0.9070179199220347, + "learning_rate": 2.87977650816045e-06, + "loss": 0.5289, + "step": 9347 + }, + { + "epoch": 0.76, + "grad_norm": 0.851064073144121, + "learning_rate": 2.877928346439476e-06, + "loss": 0.5164, + "step": 9348 + }, + { + "epoch": 0.76, + "grad_norm": 0.9271901856488475, + "learning_rate": 2.87608067827175e-06, + "loss": 0.4729, + "step": 9349 + }, + { + "epoch": 0.76, + "grad_norm": 0.9421025894776374, + "learning_rate": 2.8742335037853173e-06, + "loss": 0.4991, + "step": 9350 + }, + { + "epoch": 0.76, + "grad_norm": 0.9507336913105746, + "learning_rate": 2.8723868231081762e-06, + "loss": 0.4617, + "step": 9351 + }, + { + "epoch": 0.76, + "grad_norm": 0.8159297374452942, + "learning_rate": 2.870540636368312e-06, + "loss": 0.4482, + "step": 9352 + }, + { + "epoch": 0.76, + "grad_norm": 1.0030324921252392, + "learning_rate": 2.868694943693655e-06, + "loss": 0.4944, + "step": 9353 + }, + { + "epoch": 0.76, + "grad_norm": 0.9596171869334853, + "learning_rate": 2.8668497452121137e-06, + "loss": 0.5073, + "step": 9354 + }, + { + "epoch": 0.76, + "grad_norm": 0.9262832524325116, + "learning_rate": 2.8650050410515573e-06, + "loss": 0.4648, + "step": 9355 + }, + { + "epoch": 0.76, + "grad_norm": 0.9137971459810631, + "learning_rate": 2.8631608313398252e-06, + "loss": 0.4643, + "step": 9356 + }, + { + "epoch": 0.76, + "grad_norm": 1.0017381152896594, + "learning_rate": 2.8613171162047116e-06, + "loss": 0.5519, + "step": 9357 + }, + { + "epoch": 0.76, + "grad_norm": 1.0135694589387556, + "learning_rate": 2.8594738957739964e-06, + "loss": 0.4904, + "step": 9358 + }, + { + "epoch": 0.76, + "grad_norm": 0.9381392192800052, + "learning_rate": 2.8576311701754033e-06, + "loss": 0.4843, + "step": 9359 + }, + { + "epoch": 0.76, + "grad_norm": 0.9194632348387407, + "learning_rate": 2.8557889395366344e-06, + "loss": 0.4958, + "step": 9360 + }, + { + "epoch": 0.76, + "grad_norm": 0.9099550661414307, + "learning_rate": 2.8539472039853557e-06, + "loss": 0.4452, + "step": 9361 + }, + { + "epoch": 0.76, + "grad_norm": 0.9598811542751433, + "learning_rate": 2.8521059636492e-06, + "loss": 0.5969, + "step": 9362 + }, + { + "epoch": 0.76, + "grad_norm": 0.9889306022941791, + "learning_rate": 2.8502652186557546e-06, + "loss": 0.5026, + "step": 9363 + }, + { + "epoch": 0.76, + "grad_norm": 0.9599089363640353, + "learning_rate": 2.8484249691325936e-06, + "loss": 0.5357, + "step": 9364 + }, + { + "epoch": 0.76, + "grad_norm": 0.8071106061246606, + "learning_rate": 2.846585215207236e-06, + "loss": 0.4196, + "step": 9365 + }, + { + "epoch": 0.76, + "grad_norm": 0.8789960512738482, + "learning_rate": 2.844745957007178e-06, + "loss": 0.4865, + "step": 9366 + }, + { + "epoch": 0.76, + "grad_norm": 1.004915994614177, + "learning_rate": 2.8429071946598784e-06, + "loss": 0.4525, + "step": 9367 + }, + { + "epoch": 0.76, + "grad_norm": 0.949052188692021, + "learning_rate": 2.841068928292762e-06, + "loss": 0.515, + "step": 9368 + }, + { + "epoch": 0.76, + "grad_norm": 0.8423956562674407, + "learning_rate": 2.839231158033219e-06, + "loss": 0.3948, + "step": 9369 + }, + { + "epoch": 0.76, + "grad_norm": 0.8096642908837642, + "learning_rate": 2.837393884008608e-06, + "loss": 0.4039, + "step": 9370 + }, + { + "epoch": 0.76, + "grad_norm": 0.8576437588563828, + "learning_rate": 2.835557106346244e-06, + "loss": 0.4479, + "step": 9371 + }, + { + "epoch": 0.76, + "grad_norm": 0.9191350794963664, + "learning_rate": 2.8337208251734183e-06, + "loss": 0.4786, + "step": 9372 + }, + { + "epoch": 0.76, + "grad_norm": 0.9644820042800843, + "learning_rate": 2.8318850406173827e-06, + "loss": 0.4845, + "step": 9373 + }, + { + "epoch": 0.76, + "grad_norm": 0.9416056292197382, + "learning_rate": 2.830049752805356e-06, + "loss": 0.4415, + "step": 9374 + }, + { + "epoch": 0.76, + "grad_norm": 0.9614306850353689, + "learning_rate": 2.8282149618645215e-06, + "loss": 0.5065, + "step": 9375 + }, + { + "epoch": 0.76, + "grad_norm": 0.919458526211513, + "learning_rate": 2.826380667922032e-06, + "loss": 0.4998, + "step": 9376 + }, + { + "epoch": 0.76, + "grad_norm": 0.9254357372660033, + "learning_rate": 2.824546871104996e-06, + "loss": 0.4881, + "step": 9377 + }, + { + "epoch": 0.76, + "grad_norm": 0.940488112437101, + "learning_rate": 2.8227135715404975e-06, + "loss": 0.5278, + "step": 9378 + }, + { + "epoch": 0.76, + "grad_norm": 0.9064149146132878, + "learning_rate": 2.820880769355582e-06, + "loss": 0.4802, + "step": 9379 + }, + { + "epoch": 0.76, + "grad_norm": 0.9971665290177097, + "learning_rate": 2.819048464677261e-06, + "loss": 0.5214, + "step": 9380 + }, + { + "epoch": 0.76, + "grad_norm": 0.9594023669258108, + "learning_rate": 2.817216657632512e-06, + "loss": 0.5152, + "step": 9381 + }, + { + "epoch": 0.76, + "grad_norm": 0.9912338435984925, + "learning_rate": 2.8153853483482817e-06, + "loss": 0.4847, + "step": 9382 + }, + { + "epoch": 0.76, + "grad_norm": 0.918143957172683, + "learning_rate": 2.813554536951466e-06, + "loss": 0.5439, + "step": 9383 + }, + { + "epoch": 0.76, + "grad_norm": 0.9542177968776675, + "learning_rate": 2.8117242235689546e-06, + "loss": 0.5633, + "step": 9384 + }, + { + "epoch": 0.76, + "grad_norm": 0.9676961660664399, + "learning_rate": 2.8098944083275735e-06, + "loss": 0.4745, + "step": 9385 + }, + { + "epoch": 0.76, + "grad_norm": 0.8463354068720405, + "learning_rate": 2.8080650913541343e-06, + "loss": 0.4333, + "step": 9386 + }, + { + "epoch": 0.76, + "grad_norm": 0.9071559562682353, + "learning_rate": 2.8062362727754034e-06, + "loss": 0.4202, + "step": 9387 + }, + { + "epoch": 0.76, + "grad_norm": 1.0620416420487766, + "learning_rate": 2.804407952718119e-06, + "loss": 0.5192, + "step": 9388 + }, + { + "epoch": 0.76, + "grad_norm": 0.9700130342590119, + "learning_rate": 2.8025801313089808e-06, + "loss": 0.4732, + "step": 9389 + }, + { + "epoch": 0.76, + "grad_norm": 0.905026639613572, + "learning_rate": 2.8007528086746574e-06, + "loss": 0.4955, + "step": 9390 + }, + { + "epoch": 0.76, + "grad_norm": 0.8949781812267134, + "learning_rate": 2.798925984941776e-06, + "loss": 0.4971, + "step": 9391 + }, + { + "epoch": 0.76, + "grad_norm": 1.0893057417483687, + "learning_rate": 2.797099660236937e-06, + "loss": 0.5203, + "step": 9392 + }, + { + "epoch": 0.76, + "grad_norm": 0.9056671218015045, + "learning_rate": 2.7952738346867026e-06, + "loss": 0.5157, + "step": 9393 + }, + { + "epoch": 0.76, + "grad_norm": 0.9267444438266536, + "learning_rate": 2.7934485084176012e-06, + "loss": 0.4864, + "step": 9394 + }, + { + "epoch": 0.76, + "grad_norm": 0.9428738338535183, + "learning_rate": 2.791623681556125e-06, + "loss": 0.4714, + "step": 9395 + }, + { + "epoch": 0.76, + "grad_norm": 0.9593348069639677, + "learning_rate": 2.789799354228737e-06, + "loss": 0.4895, + "step": 9396 + }, + { + "epoch": 0.76, + "grad_norm": 1.0047888951482236, + "learning_rate": 2.7879755265618558e-06, + "loss": 0.4577, + "step": 9397 + }, + { + "epoch": 0.76, + "grad_norm": 1.0235280843992327, + "learning_rate": 2.786152198681874e-06, + "loss": 0.5219, + "step": 9398 + }, + { + "epoch": 0.76, + "grad_norm": 0.9472089839708605, + "learning_rate": 2.7843293707151455e-06, + "loss": 0.5328, + "step": 9399 + }, + { + "epoch": 0.76, + "grad_norm": 0.9448776068653041, + "learning_rate": 2.782507042787991e-06, + "loss": 0.4486, + "step": 9400 + }, + { + "epoch": 0.76, + "grad_norm": 1.0305616862250657, + "learning_rate": 2.7806852150266974e-06, + "loss": 0.5552, + "step": 9401 + }, + { + "epoch": 0.76, + "grad_norm": 0.8278570108296879, + "learning_rate": 2.778863887557517e-06, + "loss": 0.4195, + "step": 9402 + }, + { + "epoch": 0.76, + "grad_norm": 0.9276535441346776, + "learning_rate": 2.777043060506661e-06, + "loss": 0.4551, + "step": 9403 + }, + { + "epoch": 0.76, + "grad_norm": 0.9487354552538336, + "learning_rate": 2.7752227340003145e-06, + "loss": 0.5439, + "step": 9404 + }, + { + "epoch": 0.76, + "grad_norm": 0.9832271050557329, + "learning_rate": 2.773402908164625e-06, + "loss": 0.5789, + "step": 9405 + }, + { + "epoch": 0.76, + "grad_norm": 0.9550858429413105, + "learning_rate": 2.771583583125703e-06, + "loss": 0.4835, + "step": 9406 + }, + { + "epoch": 0.76, + "grad_norm": 1.0062011570686709, + "learning_rate": 2.7697647590096277e-06, + "loss": 0.5558, + "step": 9407 + }, + { + "epoch": 0.76, + "grad_norm": 0.868966115835517, + "learning_rate": 2.76794643594244e-06, + "loss": 0.4517, + "step": 9408 + }, + { + "epoch": 0.76, + "grad_norm": 0.9535790756363516, + "learning_rate": 2.766128614050154e-06, + "loss": 0.4804, + "step": 9409 + }, + { + "epoch": 0.76, + "grad_norm": 0.9308323891682322, + "learning_rate": 2.7643112934587346e-06, + "loss": 0.4971, + "step": 9410 + }, + { + "epoch": 0.76, + "grad_norm": 1.0176657701318508, + "learning_rate": 2.7624944742941253e-06, + "loss": 0.5166, + "step": 9411 + }, + { + "epoch": 0.76, + "grad_norm": 0.9501763096451348, + "learning_rate": 2.760678156682229e-06, + "loss": 0.4569, + "step": 9412 + }, + { + "epoch": 0.77, + "grad_norm": 0.9236459156836029, + "learning_rate": 2.7588623407489158e-06, + "loss": 0.5252, + "step": 9413 + }, + { + "epoch": 0.77, + "grad_norm": 0.8433794190856604, + "learning_rate": 2.7570470266200177e-06, + "loss": 0.4018, + "step": 9414 + }, + { + "epoch": 0.77, + "grad_norm": 0.915106529891791, + "learning_rate": 2.7552322144213405e-06, + "loss": 0.4964, + "step": 9415 + }, + { + "epoch": 0.77, + "grad_norm": 0.8850114828887584, + "learning_rate": 2.753417904278641e-06, + "loss": 0.5148, + "step": 9416 + }, + { + "epoch": 0.77, + "grad_norm": 0.9452199835968088, + "learning_rate": 2.751604096317655e-06, + "loss": 0.4584, + "step": 9417 + }, + { + "epoch": 0.77, + "grad_norm": 0.94196405618762, + "learning_rate": 2.749790790664074e-06, + "loss": 0.5041, + "step": 9418 + }, + { + "epoch": 0.77, + "grad_norm": 0.9554860308445395, + "learning_rate": 2.7479779874435607e-06, + "loss": 0.4885, + "step": 9419 + }, + { + "epoch": 0.77, + "grad_norm": 0.8251822970851799, + "learning_rate": 2.7461656867817397e-06, + "loss": 0.4526, + "step": 9420 + }, + { + "epoch": 0.77, + "grad_norm": 0.9104051781210106, + "learning_rate": 2.7443538888042065e-06, + "loss": 0.4844, + "step": 9421 + }, + { + "epoch": 0.77, + "grad_norm": 0.9173725950469258, + "learning_rate": 2.742542593636509e-06, + "loss": 0.4853, + "step": 9422 + }, + { + "epoch": 0.77, + "grad_norm": 0.9440247777740798, + "learning_rate": 2.7407318014041727e-06, + "loss": 0.4653, + "step": 9423 + }, + { + "epoch": 0.77, + "grad_norm": 0.9456146477999366, + "learning_rate": 2.738921512232684e-06, + "loss": 0.4939, + "step": 9424 + }, + { + "epoch": 0.77, + "grad_norm": 0.8871216035281972, + "learning_rate": 2.7371117262474945e-06, + "loss": 0.4852, + "step": 9425 + }, + { + "epoch": 0.77, + "grad_norm": 0.8784139534365634, + "learning_rate": 2.7353024435740194e-06, + "loss": 0.5012, + "step": 9426 + }, + { + "epoch": 0.77, + "grad_norm": 0.9562799444064749, + "learning_rate": 2.7334936643376443e-06, + "loss": 0.5088, + "step": 9427 + }, + { + "epoch": 0.77, + "grad_norm": 0.8914075216309687, + "learning_rate": 2.7316853886637075e-06, + "loss": 0.4959, + "step": 9428 + }, + { + "epoch": 0.77, + "grad_norm": 0.8622154551462666, + "learning_rate": 2.729877616677531e-06, + "loss": 0.5002, + "step": 9429 + }, + { + "epoch": 0.77, + "grad_norm": 0.8689274219881318, + "learning_rate": 2.7280703485043846e-06, + "loss": 0.445, + "step": 9430 + }, + { + "epoch": 0.77, + "grad_norm": 0.8775797672203981, + "learning_rate": 2.726263584269513e-06, + "loss": 0.477, + "step": 9431 + }, + { + "epoch": 0.77, + "grad_norm": 0.9325799673463466, + "learning_rate": 2.724457324098123e-06, + "loss": 0.5041, + "step": 9432 + }, + { + "epoch": 0.77, + "grad_norm": 0.9626934216789831, + "learning_rate": 2.7226515681153907e-06, + "loss": 0.5497, + "step": 9433 + }, + { + "epoch": 0.77, + "grad_norm": 0.9011603340435773, + "learning_rate": 2.720846316446443e-06, + "loss": 0.48, + "step": 9434 + }, + { + "epoch": 0.77, + "grad_norm": 1.0063613488595504, + "learning_rate": 2.7190415692163954e-06, + "loss": 0.4801, + "step": 9435 + }, + { + "epoch": 0.77, + "grad_norm": 0.9004373985017571, + "learning_rate": 2.717237326550306e-06, + "loss": 0.4982, + "step": 9436 + }, + { + "epoch": 0.77, + "grad_norm": 0.9216341366680187, + "learning_rate": 2.71543358857321e-06, + "loss": 0.5106, + "step": 9437 + }, + { + "epoch": 0.77, + "grad_norm": 1.0024610759720805, + "learning_rate": 2.713630355410104e-06, + "loss": 0.4884, + "step": 9438 + }, + { + "epoch": 0.77, + "grad_norm": 0.9858367768202332, + "learning_rate": 2.7118276271859555e-06, + "loss": 0.5171, + "step": 9439 + }, + { + "epoch": 0.77, + "grad_norm": 0.9180550622016094, + "learning_rate": 2.7100254040256813e-06, + "loss": 0.5102, + "step": 9440 + }, + { + "epoch": 0.77, + "grad_norm": 0.839648095978644, + "learning_rate": 2.7082236860541867e-06, + "loss": 0.4548, + "step": 9441 + }, + { + "epoch": 0.77, + "grad_norm": 0.965488004574052, + "learning_rate": 2.7064224733963197e-06, + "loss": 0.5141, + "step": 9442 + }, + { + "epoch": 0.77, + "grad_norm": 0.9617855033127427, + "learning_rate": 2.704621766176905e-06, + "loss": 0.5163, + "step": 9443 + }, + { + "epoch": 0.77, + "grad_norm": 0.9178157408943385, + "learning_rate": 2.702821564520732e-06, + "loss": 0.4896, + "step": 9444 + }, + { + "epoch": 0.77, + "grad_norm": 0.9110526638080855, + "learning_rate": 2.7010218685525545e-06, + "loss": 0.462, + "step": 9445 + }, + { + "epoch": 0.77, + "grad_norm": 0.9618892367926404, + "learning_rate": 2.699222678397082e-06, + "loss": 0.4735, + "step": 9446 + }, + { + "epoch": 0.77, + "grad_norm": 1.0025662808262406, + "learning_rate": 2.697423994179007e-06, + "loss": 0.4799, + "step": 9447 + }, + { + "epoch": 0.77, + "grad_norm": 0.9728581194952428, + "learning_rate": 2.69562581602297e-06, + "loss": 0.4956, + "step": 9448 + }, + { + "epoch": 0.77, + "grad_norm": 0.8845833875493481, + "learning_rate": 2.693828144053584e-06, + "loss": 0.4821, + "step": 9449 + }, + { + "epoch": 0.77, + "grad_norm": 0.9953067711145615, + "learning_rate": 2.6920309783954277e-06, + "loss": 0.5049, + "step": 9450 + }, + { + "epoch": 0.77, + "grad_norm": 0.9240944266957233, + "learning_rate": 2.690234319173045e-06, + "loss": 0.4273, + "step": 9451 + }, + { + "epoch": 0.77, + "grad_norm": 0.8056754390924662, + "learning_rate": 2.688438166510935e-06, + "loss": 0.4646, + "step": 9452 + }, + { + "epoch": 0.77, + "grad_norm": 0.953057999448235, + "learning_rate": 2.68664252053358e-06, + "loss": 0.5191, + "step": 9453 + }, + { + "epoch": 0.77, + "grad_norm": 0.8365867448944447, + "learning_rate": 2.6848473813654087e-06, + "loss": 0.4236, + "step": 9454 + }, + { + "epoch": 0.77, + "grad_norm": 0.889545304390988, + "learning_rate": 2.6830527491308257e-06, + "loss": 0.459, + "step": 9455 + }, + { + "epoch": 0.77, + "grad_norm": 0.9223982552953418, + "learning_rate": 2.681258623954196e-06, + "loss": 0.4824, + "step": 9456 + }, + { + "epoch": 0.77, + "grad_norm": 0.9608557480794483, + "learning_rate": 2.679465005959856e-06, + "loss": 0.5088, + "step": 9457 + }, + { + "epoch": 0.77, + "grad_norm": 1.0188180580969628, + "learning_rate": 2.6776718952720903e-06, + "loss": 0.458, + "step": 9458 + }, + { + "epoch": 0.77, + "grad_norm": 0.8616425434444939, + "learning_rate": 2.6758792920151745e-06, + "loss": 0.4269, + "step": 9459 + }, + { + "epoch": 0.77, + "grad_norm": 1.0145444858253545, + "learning_rate": 2.6740871963133243e-06, + "loss": 0.4662, + "step": 9460 + }, + { + "epoch": 0.77, + "grad_norm": 0.9901094833309612, + "learning_rate": 2.6722956082907334e-06, + "loss": 0.4941, + "step": 9461 + }, + { + "epoch": 0.77, + "grad_norm": 0.90990267972996, + "learning_rate": 2.670504528071557e-06, + "loss": 0.4996, + "step": 9462 + }, + { + "epoch": 0.77, + "grad_norm": 0.8671739375567898, + "learning_rate": 2.668713955779918e-06, + "loss": 0.4801, + "step": 9463 + }, + { + "epoch": 0.77, + "grad_norm": 0.8410079626584746, + "learning_rate": 2.6669238915398943e-06, + "loss": 0.4196, + "step": 9464 + }, + { + "epoch": 0.77, + "grad_norm": 0.9790064700550006, + "learning_rate": 2.6651343354755453e-06, + "loss": 0.5025, + "step": 9465 + }, + { + "epoch": 0.77, + "grad_norm": 0.989725553659738, + "learning_rate": 2.663345287710878e-06, + "loss": 0.4847, + "step": 9466 + }, + { + "epoch": 0.77, + "grad_norm": 1.0120797707377107, + "learning_rate": 2.6615567483698746e-06, + "loss": 0.5814, + "step": 9467 + }, + { + "epoch": 0.77, + "grad_norm": 1.0247258994347321, + "learning_rate": 2.65976871757648e-06, + "loss": 0.5506, + "step": 9468 + }, + { + "epoch": 0.77, + "grad_norm": 0.9353296234328144, + "learning_rate": 2.6579811954546054e-06, + "loss": 0.4999, + "step": 9469 + }, + { + "epoch": 0.77, + "grad_norm": 0.873564732962634, + "learning_rate": 2.6561941821281145e-06, + "loss": 0.4801, + "step": 9470 + }, + { + "epoch": 0.77, + "grad_norm": 0.9463813696150183, + "learning_rate": 2.6544076777208603e-06, + "loss": 0.4883, + "step": 9471 + }, + { + "epoch": 0.77, + "grad_norm": 0.9701094798832111, + "learning_rate": 2.6526216823566342e-06, + "loss": 0.4821, + "step": 9472 + }, + { + "epoch": 0.77, + "grad_norm": 0.9290121340326827, + "learning_rate": 2.65083619615921e-06, + "loss": 0.5389, + "step": 9473 + }, + { + "epoch": 0.77, + "grad_norm": 0.9520983634733708, + "learning_rate": 2.6490512192523175e-06, + "loss": 0.4464, + "step": 9474 + }, + { + "epoch": 0.77, + "grad_norm": 0.8680418166970947, + "learning_rate": 2.6472667517596584e-06, + "loss": 0.461, + "step": 9475 + }, + { + "epoch": 0.77, + "grad_norm": 0.8481399018890614, + "learning_rate": 2.6454827938048855e-06, + "loss": 0.4635, + "step": 9476 + }, + { + "epoch": 0.77, + "grad_norm": 0.8970798829121703, + "learning_rate": 2.643699345511638e-06, + "loss": 0.4517, + "step": 9477 + }, + { + "epoch": 0.77, + "grad_norm": 1.0208441448818784, + "learning_rate": 2.6419164070034974e-06, + "loss": 0.517, + "step": 9478 + }, + { + "epoch": 0.77, + "grad_norm": 0.8962370261002454, + "learning_rate": 2.6401339784040226e-06, + "loss": 0.5019, + "step": 9479 + }, + { + "epoch": 0.77, + "grad_norm": 0.8456090475029107, + "learning_rate": 2.6383520598367363e-06, + "loss": 0.4345, + "step": 9480 + }, + { + "epoch": 0.77, + "grad_norm": 0.8804027202694467, + "learning_rate": 2.6365706514251244e-06, + "loss": 0.4929, + "step": 9481 + }, + { + "epoch": 0.77, + "grad_norm": 0.9137295020872604, + "learning_rate": 2.6347897532926293e-06, + "loss": 0.5308, + "step": 9482 + }, + { + "epoch": 0.77, + "grad_norm": 0.9212246006767905, + "learning_rate": 2.6330093655626777e-06, + "loss": 0.5352, + "step": 9483 + }, + { + "epoch": 0.77, + "grad_norm": 1.042293489524842, + "learning_rate": 2.6312294883586385e-06, + "loss": 0.5123, + "step": 9484 + }, + { + "epoch": 0.77, + "grad_norm": 0.8841512558148169, + "learning_rate": 2.6294501218038603e-06, + "loss": 0.446, + "step": 9485 + }, + { + "epoch": 0.77, + "grad_norm": 0.8420400723533827, + "learning_rate": 2.627671266021652e-06, + "loss": 0.4708, + "step": 9486 + }, + { + "epoch": 0.77, + "grad_norm": 0.9083137612965833, + "learning_rate": 2.625892921135288e-06, + "loss": 0.4783, + "step": 9487 + }, + { + "epoch": 0.77, + "grad_norm": 0.9938593713070609, + "learning_rate": 2.6241150872679968e-06, + "loss": 0.5045, + "step": 9488 + }, + { + "epoch": 0.77, + "grad_norm": 0.9119577080481137, + "learning_rate": 2.6223377645429948e-06, + "loss": 0.4987, + "step": 9489 + }, + { + "epoch": 0.77, + "grad_norm": 1.0419405547070333, + "learning_rate": 2.6205609530834388e-06, + "loss": 0.4877, + "step": 9490 + }, + { + "epoch": 0.77, + "grad_norm": 0.9126973386049844, + "learning_rate": 2.6187846530124615e-06, + "loss": 0.5284, + "step": 9491 + }, + { + "epoch": 0.77, + "grad_norm": 0.9253444650389665, + "learning_rate": 2.6170088644531623e-06, + "loss": 0.5231, + "step": 9492 + }, + { + "epoch": 0.77, + "grad_norm": 0.8321810930190079, + "learning_rate": 2.6152335875286027e-06, + "loss": 0.4274, + "step": 9493 + }, + { + "epoch": 0.77, + "grad_norm": 0.9589537397942688, + "learning_rate": 2.6134588223617995e-06, + "loss": 0.5199, + "step": 9494 + }, + { + "epoch": 0.77, + "grad_norm": 0.9208045079162227, + "learning_rate": 2.6116845690757533e-06, + "loss": 0.4873, + "step": 9495 + }, + { + "epoch": 0.77, + "grad_norm": 0.9568307857077881, + "learning_rate": 2.6099108277934105e-06, + "loss": 0.5072, + "step": 9496 + }, + { + "epoch": 0.77, + "grad_norm": 1.0430924603891203, + "learning_rate": 2.6081375986376924e-06, + "loss": 0.5863, + "step": 9497 + }, + { + "epoch": 0.77, + "grad_norm": 0.8728274230320672, + "learning_rate": 2.6063648817314825e-06, + "loss": 0.4295, + "step": 9498 + }, + { + "epoch": 0.77, + "grad_norm": 0.9260757150277996, + "learning_rate": 2.6045926771976306e-06, + "loss": 0.4946, + "step": 9499 + }, + { + "epoch": 0.77, + "grad_norm": 0.9797267215825568, + "learning_rate": 2.6028209851589403e-06, + "loss": 0.5314, + "step": 9500 + }, + { + "epoch": 0.77, + "grad_norm": 0.8235502974705514, + "learning_rate": 2.6010498057382005e-06, + "loss": 0.4063, + "step": 9501 + }, + { + "epoch": 0.77, + "grad_norm": 0.9412399480520027, + "learning_rate": 2.599279139058143e-06, + "loss": 0.5289, + "step": 9502 + }, + { + "epoch": 0.77, + "grad_norm": 0.9456699294244987, + "learning_rate": 2.597508985241477e-06, + "loss": 0.4849, + "step": 9503 + }, + { + "epoch": 0.77, + "grad_norm": 0.8843131800728579, + "learning_rate": 2.5957393444108724e-06, + "loss": 0.462, + "step": 9504 + }, + { + "epoch": 0.77, + "grad_norm": 0.9220872425336669, + "learning_rate": 2.593970216688967e-06, + "loss": 0.5103, + "step": 9505 + }, + { + "epoch": 0.77, + "grad_norm": 0.9703291160659552, + "learning_rate": 2.592201602198351e-06, + "loss": 0.4847, + "step": 9506 + }, + { + "epoch": 0.77, + "grad_norm": 1.0437899523711882, + "learning_rate": 2.5904335010615976e-06, + "loss": 0.4661, + "step": 9507 + }, + { + "epoch": 0.77, + "grad_norm": 0.8931272353575075, + "learning_rate": 2.588665913401226e-06, + "loss": 0.4767, + "step": 9508 + }, + { + "epoch": 0.77, + "grad_norm": 0.8971499854589432, + "learning_rate": 2.5868988393397376e-06, + "loss": 0.4886, + "step": 9509 + }, + { + "epoch": 0.77, + "grad_norm": 0.9114204447284524, + "learning_rate": 2.5851322789995815e-06, + "loss": 0.4812, + "step": 9510 + }, + { + "epoch": 0.77, + "grad_norm": 0.9433208349417566, + "learning_rate": 2.5833662325031816e-06, + "loss": 0.4965, + "step": 9511 + }, + { + "epoch": 0.77, + "grad_norm": 0.9745841552592122, + "learning_rate": 2.5816006999729225e-06, + "loss": 0.5398, + "step": 9512 + }, + { + "epoch": 0.77, + "grad_norm": 1.1338912061174495, + "learning_rate": 2.5798356815311587e-06, + "loss": 0.4673, + "step": 9513 + }, + { + "epoch": 0.77, + "grad_norm": 0.9102039459425405, + "learning_rate": 2.5780711773001943e-06, + "loss": 0.4687, + "step": 9514 + }, + { + "epoch": 0.77, + "grad_norm": 0.8972357697454645, + "learning_rate": 2.5763071874023205e-06, + "loss": 0.442, + "step": 9515 + }, + { + "epoch": 0.77, + "grad_norm": 0.9881441739800304, + "learning_rate": 2.5745437119597704e-06, + "loss": 0.4863, + "step": 9516 + }, + { + "epoch": 0.77, + "grad_norm": 0.9675120535718381, + "learning_rate": 2.5727807510947545e-06, + "loss": 0.5134, + "step": 9517 + }, + { + "epoch": 0.77, + "grad_norm": 0.9015375721815215, + "learning_rate": 2.5710183049294445e-06, + "loss": 0.4743, + "step": 9518 + }, + { + "epoch": 0.77, + "grad_norm": 0.9734237840850912, + "learning_rate": 2.56925637358598e-06, + "loss": 0.5198, + "step": 9519 + }, + { + "epoch": 0.77, + "grad_norm": 0.9644298697485151, + "learning_rate": 2.5674949571864517e-06, + "loss": 0.4779, + "step": 9520 + }, + { + "epoch": 0.77, + "grad_norm": 0.9969693321534621, + "learning_rate": 2.5657340558529353e-06, + "loss": 0.4861, + "step": 9521 + }, + { + "epoch": 0.77, + "grad_norm": 0.9181947952380234, + "learning_rate": 2.5639736697074525e-06, + "loss": 0.4836, + "step": 9522 + }, + { + "epoch": 0.77, + "grad_norm": 0.9606810410171697, + "learning_rate": 2.5622137988719985e-06, + "loss": 0.5083, + "step": 9523 + }, + { + "epoch": 0.77, + "grad_norm": 0.940512663001103, + "learning_rate": 2.5604544434685307e-06, + "loss": 0.4637, + "step": 9524 + }, + { + "epoch": 0.77, + "grad_norm": 0.9785135388690958, + "learning_rate": 2.558695603618975e-06, + "loss": 0.4966, + "step": 9525 + }, + { + "epoch": 0.77, + "grad_norm": 0.9331808129454607, + "learning_rate": 2.5569372794452063e-06, + "loss": 0.5326, + "step": 9526 + }, + { + "epoch": 0.77, + "grad_norm": 0.9294999568518234, + "learning_rate": 2.555179471069089e-06, + "loss": 0.48, + "step": 9527 + }, + { + "epoch": 0.77, + "grad_norm": 0.824147125924437, + "learning_rate": 2.553422178612427e-06, + "loss": 0.4187, + "step": 9528 + }, + { + "epoch": 0.77, + "grad_norm": 0.8936141587086273, + "learning_rate": 2.5516654021970035e-06, + "loss": 0.5049, + "step": 9529 + }, + { + "epoch": 0.77, + "grad_norm": 0.9344279939268219, + "learning_rate": 2.549909141944561e-06, + "loss": 0.479, + "step": 9530 + }, + { + "epoch": 0.77, + "grad_norm": 0.9227147305713791, + "learning_rate": 2.5481533979768092e-06, + "loss": 0.4666, + "step": 9531 + }, + { + "epoch": 0.77, + "grad_norm": 0.9643823288376145, + "learning_rate": 2.546398170415412e-06, + "loss": 0.4633, + "step": 9532 + }, + { + "epoch": 0.77, + "grad_norm": 0.9135184290568981, + "learning_rate": 2.5446434593820156e-06, + "loss": 0.4838, + "step": 9533 + }, + { + "epoch": 0.77, + "grad_norm": 0.9140822658348559, + "learning_rate": 2.5428892649982117e-06, + "loss": 0.4997, + "step": 9534 + }, + { + "epoch": 0.77, + "grad_norm": 0.9113683203067207, + "learning_rate": 2.5411355873855683e-06, + "loss": 0.4603, + "step": 9535 + }, + { + "epoch": 0.78, + "grad_norm": 0.8322288199107464, + "learning_rate": 2.539382426665611e-06, + "loss": 0.4867, + "step": 9536 + }, + { + "epoch": 0.78, + "grad_norm": 0.9147448880750836, + "learning_rate": 2.537629782959835e-06, + "loss": 0.4898, + "step": 9537 + }, + { + "epoch": 0.78, + "grad_norm": 0.868170086743266, + "learning_rate": 2.5358776563896957e-06, + "loss": 0.5012, + "step": 9538 + }, + { + "epoch": 0.78, + "grad_norm": 0.9781015542954844, + "learning_rate": 2.5341260470766173e-06, + "loss": 0.4501, + "step": 9539 + }, + { + "epoch": 0.78, + "grad_norm": 0.933814696684243, + "learning_rate": 2.5323749551419775e-06, + "loss": 0.4684, + "step": 9540 + }, + { + "epoch": 0.78, + "grad_norm": 0.8989376433156853, + "learning_rate": 2.5306243807071305e-06, + "loss": 0.5363, + "step": 9541 + }, + { + "epoch": 0.78, + "grad_norm": 0.8773684970723838, + "learning_rate": 2.5288743238933887e-06, + "loss": 0.4537, + "step": 9542 + }, + { + "epoch": 0.78, + "grad_norm": 0.9498142865358258, + "learning_rate": 2.5271247848220294e-06, + "loss": 0.5679, + "step": 9543 + }, + { + "epoch": 0.78, + "grad_norm": 1.0550451747440102, + "learning_rate": 2.525375763614294e-06, + "loss": 0.5219, + "step": 9544 + }, + { + "epoch": 0.78, + "grad_norm": 0.9255261798407214, + "learning_rate": 2.5236272603913915e-06, + "loss": 0.5036, + "step": 9545 + }, + { + "epoch": 0.78, + "grad_norm": 0.9255632591648001, + "learning_rate": 2.5218792752744847e-06, + "loss": 0.5053, + "step": 9546 + }, + { + "epoch": 0.78, + "grad_norm": 0.9945535840087542, + "learning_rate": 2.5201318083847105e-06, + "loss": 0.5119, + "step": 9547 + }, + { + "epoch": 0.78, + "grad_norm": 0.9160402986044256, + "learning_rate": 2.518384859843168e-06, + "loss": 0.4775, + "step": 9548 + }, + { + "epoch": 0.78, + "grad_norm": 0.9130901844914137, + "learning_rate": 2.516638429770919e-06, + "loss": 0.4579, + "step": 9549 + }, + { + "epoch": 0.78, + "grad_norm": 0.8844580873802608, + "learning_rate": 2.514892518288988e-06, + "loss": 0.4504, + "step": 9550 + }, + { + "epoch": 0.78, + "grad_norm": 0.9584403151191385, + "learning_rate": 2.5131471255183705e-06, + "loss": 0.525, + "step": 9551 + }, + { + "epoch": 0.78, + "grad_norm": 0.9838552272295368, + "learning_rate": 2.511402251580013e-06, + "loss": 0.5415, + "step": 9552 + }, + { + "epoch": 0.78, + "grad_norm": 0.9275636019742892, + "learning_rate": 2.509657896594837e-06, + "loss": 0.4796, + "step": 9553 + }, + { + "epoch": 0.78, + "grad_norm": 0.9780448168288274, + "learning_rate": 2.507914060683725e-06, + "loss": 0.4797, + "step": 9554 + }, + { + "epoch": 0.78, + "grad_norm": 1.0122611059232476, + "learning_rate": 2.5061707439675222e-06, + "loss": 0.4896, + "step": 9555 + }, + { + "epoch": 0.78, + "grad_norm": 0.98051994871729, + "learning_rate": 2.5044279465670408e-06, + "loss": 0.5126, + "step": 9556 + }, + { + "epoch": 0.78, + "grad_norm": 0.9005729862836364, + "learning_rate": 2.502685668603053e-06, + "loss": 0.5542, + "step": 9557 + }, + { + "epoch": 0.78, + "grad_norm": 0.9593751701329121, + "learning_rate": 2.5009439101963027e-06, + "loss": 0.5163, + "step": 9558 + }, + { + "epoch": 0.78, + "grad_norm": 0.9336429381380371, + "learning_rate": 2.499202671467483e-06, + "loss": 0.553, + "step": 9559 + }, + { + "epoch": 0.78, + "grad_norm": 1.0034038590580017, + "learning_rate": 2.497461952537267e-06, + "loss": 0.5383, + "step": 9560 + }, + { + "epoch": 0.78, + "grad_norm": 0.8763757759226454, + "learning_rate": 2.4957217535262824e-06, + "loss": 0.4981, + "step": 9561 + }, + { + "epoch": 0.78, + "grad_norm": 0.8586560124179438, + "learning_rate": 2.4939820745551235e-06, + "loss": 0.4485, + "step": 9562 + }, + { + "epoch": 0.78, + "grad_norm": 0.9065661965736719, + "learning_rate": 2.4922429157443484e-06, + "loss": 0.4697, + "step": 9563 + }, + { + "epoch": 0.78, + "grad_norm": 0.8918877224490259, + "learning_rate": 2.490504277214484e-06, + "loss": 0.4276, + "step": 9564 + }, + { + "epoch": 0.78, + "grad_norm": 0.8620501304929654, + "learning_rate": 2.488766159086009e-06, + "loss": 0.4448, + "step": 9565 + }, + { + "epoch": 0.78, + "grad_norm": 0.9677343569180769, + "learning_rate": 2.4870285614793764e-06, + "loss": 0.5492, + "step": 9566 + }, + { + "epoch": 0.78, + "grad_norm": 0.962363169336885, + "learning_rate": 2.485291484515e-06, + "loss": 0.5051, + "step": 9567 + }, + { + "epoch": 0.78, + "grad_norm": 0.959581770332624, + "learning_rate": 2.4835549283132597e-06, + "loss": 0.5402, + "step": 9568 + }, + { + "epoch": 0.78, + "grad_norm": 0.8514647633831238, + "learning_rate": 2.4818188929944942e-06, + "loss": 0.4795, + "step": 9569 + }, + { + "epoch": 0.78, + "grad_norm": 0.9092107495200746, + "learning_rate": 2.4800833786790145e-06, + "loss": 0.4939, + "step": 9570 + }, + { + "epoch": 0.78, + "grad_norm": 0.9327251184011716, + "learning_rate": 2.478348385487084e-06, + "loss": 0.4533, + "step": 9571 + }, + { + "epoch": 0.78, + "grad_norm": 0.8735652553810023, + "learning_rate": 2.476613913538938e-06, + "loss": 0.4457, + "step": 9572 + }, + { + "epoch": 0.78, + "grad_norm": 1.0708948579788935, + "learning_rate": 2.474879962954775e-06, + "loss": 0.5, + "step": 9573 + }, + { + "epoch": 0.78, + "grad_norm": 0.9015351195751412, + "learning_rate": 2.4731465338547556e-06, + "loss": 0.5406, + "step": 9574 + }, + { + "epoch": 0.78, + "grad_norm": 0.9339400308308337, + "learning_rate": 2.471413626359005e-06, + "loss": 0.4519, + "step": 9575 + }, + { + "epoch": 0.78, + "grad_norm": 0.9899716519567556, + "learning_rate": 2.4696812405876147e-06, + "loss": 0.4821, + "step": 9576 + }, + { + "epoch": 0.78, + "grad_norm": 0.9336255754909419, + "learning_rate": 2.46794937666063e-06, + "loss": 0.5054, + "step": 9577 + }, + { + "epoch": 0.78, + "grad_norm": 0.9350326299793474, + "learning_rate": 2.466218034698078e-06, + "loss": 0.4756, + "step": 9578 + }, + { + "epoch": 0.78, + "grad_norm": 0.896737867789691, + "learning_rate": 2.4644872148199316e-06, + "loss": 0.4517, + "step": 9579 + }, + { + "epoch": 0.78, + "grad_norm": 0.9455232413075647, + "learning_rate": 2.4627569171461363e-06, + "loss": 0.5259, + "step": 9580 + }, + { + "epoch": 0.78, + "grad_norm": 0.8955076931033673, + "learning_rate": 2.4610271417966014e-06, + "loss": 0.4832, + "step": 9581 + }, + { + "epoch": 0.78, + "grad_norm": 0.866670299293872, + "learning_rate": 2.4592978888912013e-06, + "loss": 0.477, + "step": 9582 + }, + { + "epoch": 0.78, + "grad_norm": 0.8821913909176877, + "learning_rate": 2.457569158549763e-06, + "loss": 0.4958, + "step": 9583 + }, + { + "epoch": 0.78, + "grad_norm": 0.9863938976553523, + "learning_rate": 2.455840950892099e-06, + "loss": 0.5138, + "step": 9584 + }, + { + "epoch": 0.78, + "grad_norm": 0.9181851836678736, + "learning_rate": 2.4541132660379606e-06, + "loss": 0.5052, + "step": 9585 + }, + { + "epoch": 0.78, + "grad_norm": 0.87896082720906, + "learning_rate": 2.4523861041070806e-06, + "loss": 0.4395, + "step": 9586 + }, + { + "epoch": 0.78, + "grad_norm": 0.9452044022637043, + "learning_rate": 2.4506594652191485e-06, + "loss": 0.4585, + "step": 9587 + }, + { + "epoch": 0.78, + "grad_norm": 0.9581124622435845, + "learning_rate": 2.448933349493823e-06, + "loss": 0.4749, + "step": 9588 + }, + { + "epoch": 0.78, + "grad_norm": 0.9126209879683963, + "learning_rate": 2.4472077570507124e-06, + "loss": 0.5034, + "step": 9589 + }, + { + "epoch": 0.78, + "grad_norm": 0.8679783835342433, + "learning_rate": 2.4454826880094106e-06, + "loss": 0.5103, + "step": 9590 + }, + { + "epoch": 0.78, + "grad_norm": 0.8997725986140164, + "learning_rate": 2.443758142489454e-06, + "loss": 0.4383, + "step": 9591 + }, + { + "epoch": 0.78, + "grad_norm": 0.9282741810737473, + "learning_rate": 2.442034120610357e-06, + "loss": 0.4888, + "step": 9592 + }, + { + "epoch": 0.78, + "grad_norm": 0.9426438609355607, + "learning_rate": 2.4403106224915908e-06, + "loss": 0.4972, + "step": 9593 + }, + { + "epoch": 0.78, + "grad_norm": 0.9193151062029296, + "learning_rate": 2.438587648252596e-06, + "loss": 0.4282, + "step": 9594 + }, + { + "epoch": 0.78, + "grad_norm": 0.8577142049418723, + "learning_rate": 2.4368651980127644e-06, + "loss": 0.4865, + "step": 9595 + }, + { + "epoch": 0.78, + "grad_norm": 0.9277333955003407, + "learning_rate": 2.4351432718914727e-06, + "loss": 0.4682, + "step": 9596 + }, + { + "epoch": 0.78, + "grad_norm": 0.8571646694128863, + "learning_rate": 2.433421870008038e-06, + "loss": 0.4404, + "step": 9597 + }, + { + "epoch": 0.78, + "grad_norm": 0.9350419161187955, + "learning_rate": 2.431700992481757e-06, + "loss": 0.468, + "step": 9598 + }, + { + "epoch": 0.78, + "grad_norm": 0.7483768064636855, + "learning_rate": 2.4299806394318837e-06, + "loss": 0.3972, + "step": 9599 + }, + { + "epoch": 0.78, + "grad_norm": 1.0325912865114175, + "learning_rate": 2.428260810977641e-06, + "loss": 0.5122, + "step": 9600 + }, + { + "epoch": 0.78, + "grad_norm": 0.9058356524191399, + "learning_rate": 2.4265415072382016e-06, + "loss": 0.5053, + "step": 9601 + }, + { + "epoch": 0.78, + "grad_norm": 0.942026873185667, + "learning_rate": 2.424822728332724e-06, + "loss": 0.5308, + "step": 9602 + }, + { + "epoch": 0.78, + "grad_norm": 0.9747815459252755, + "learning_rate": 2.423104474380309e-06, + "loss": 0.4839, + "step": 9603 + }, + { + "epoch": 0.78, + "grad_norm": 0.8950242754724765, + "learning_rate": 2.421386745500034e-06, + "loss": 0.476, + "step": 9604 + }, + { + "epoch": 0.78, + "grad_norm": 0.8927261857002168, + "learning_rate": 2.419669541810934e-06, + "loss": 0.4962, + "step": 9605 + }, + { + "epoch": 0.78, + "grad_norm": 1.031794549609845, + "learning_rate": 2.417952863432015e-06, + "loss": 0.4638, + "step": 9606 + }, + { + "epoch": 0.78, + "grad_norm": 0.941660614300367, + "learning_rate": 2.4162367104822313e-06, + "loss": 0.4732, + "step": 9607 + }, + { + "epoch": 0.78, + "grad_norm": 0.9447255800757794, + "learning_rate": 2.414521083080523e-06, + "loss": 0.4594, + "step": 9608 + }, + { + "epoch": 0.78, + "grad_norm": 1.0667354565333285, + "learning_rate": 2.4128059813457716e-06, + "loss": 0.5123, + "step": 9609 + }, + { + "epoch": 0.78, + "grad_norm": 0.8808929346517816, + "learning_rate": 2.411091405396836e-06, + "loss": 0.4741, + "step": 9610 + }, + { + "epoch": 0.78, + "grad_norm": 0.9070711715732838, + "learning_rate": 2.409377355352536e-06, + "loss": 0.4799, + "step": 9611 + }, + { + "epoch": 0.78, + "grad_norm": 0.9931652854573912, + "learning_rate": 2.4076638313316537e-06, + "loss": 0.5847, + "step": 9612 + }, + { + "epoch": 0.78, + "grad_norm": 0.9773303592488506, + "learning_rate": 2.405950833452928e-06, + "loss": 0.438, + "step": 9613 + }, + { + "epoch": 0.78, + "grad_norm": 0.8959575285566649, + "learning_rate": 2.4042383618350795e-06, + "loss": 0.4453, + "step": 9614 + }, + { + "epoch": 0.78, + "grad_norm": 0.907181588357818, + "learning_rate": 2.402526416596772e-06, + "loss": 0.4528, + "step": 9615 + }, + { + "epoch": 0.78, + "grad_norm": 0.9749348938822829, + "learning_rate": 2.400814997856645e-06, + "loss": 0.515, + "step": 9616 + }, + { + "epoch": 0.78, + "grad_norm": 0.901830383250741, + "learning_rate": 2.399104105733299e-06, + "loss": 0.4434, + "step": 9617 + }, + { + "epoch": 0.78, + "grad_norm": 0.8893553496072407, + "learning_rate": 2.3973937403452983e-06, + "loss": 0.4025, + "step": 9618 + }, + { + "epoch": 0.78, + "grad_norm": 0.9038113038070881, + "learning_rate": 2.3956839018111634e-06, + "loss": 0.4542, + "step": 9619 + }, + { + "epoch": 0.78, + "grad_norm": 0.992165409961388, + "learning_rate": 2.393974590249394e-06, + "loss": 0.5901, + "step": 9620 + }, + { + "epoch": 0.78, + "grad_norm": 0.8849862750002848, + "learning_rate": 2.3922658057784355e-06, + "loss": 0.5163, + "step": 9621 + }, + { + "epoch": 0.78, + "grad_norm": 1.0192890227700766, + "learning_rate": 2.3905575485167098e-06, + "loss": 0.4513, + "step": 9622 + }, + { + "epoch": 0.78, + "grad_norm": 0.8242519230545053, + "learning_rate": 2.388849818582596e-06, + "loss": 0.4359, + "step": 9623 + }, + { + "epoch": 0.78, + "grad_norm": 0.91491375213291, + "learning_rate": 2.387142616094441e-06, + "loss": 0.4939, + "step": 9624 + }, + { + "epoch": 0.78, + "grad_norm": 0.9370320013929132, + "learning_rate": 2.385435941170544e-06, + "loss": 0.534, + "step": 9625 + }, + { + "epoch": 0.78, + "grad_norm": 1.0038811640559153, + "learning_rate": 2.3837297939291893e-06, + "loss": 0.5309, + "step": 9626 + }, + { + "epoch": 0.78, + "grad_norm": 0.8885292541660332, + "learning_rate": 2.382024174488601e-06, + "loss": 0.4796, + "step": 9627 + }, + { + "epoch": 0.78, + "grad_norm": 0.9018924053530425, + "learning_rate": 2.38031908296698e-06, + "loss": 0.4756, + "step": 9628 + }, + { + "epoch": 0.78, + "grad_norm": 0.9536886642846818, + "learning_rate": 2.378614519482487e-06, + "loss": 0.476, + "step": 9629 + }, + { + "epoch": 0.78, + "grad_norm": 0.8693603792931843, + "learning_rate": 2.376910484153252e-06, + "loss": 0.4338, + "step": 9630 + }, + { + "epoch": 0.78, + "grad_norm": 0.9319492597355499, + "learning_rate": 2.375206977097353e-06, + "loss": 0.4931, + "step": 9631 + }, + { + "epoch": 0.78, + "grad_norm": 0.8780828804269664, + "learning_rate": 2.373503998432852e-06, + "loss": 0.4885, + "step": 9632 + }, + { + "epoch": 0.78, + "grad_norm": 0.869729081783994, + "learning_rate": 2.3718015482777535e-06, + "loss": 0.4, + "step": 9633 + }, + { + "epoch": 0.78, + "grad_norm": 0.9881607907408684, + "learning_rate": 2.3700996267500486e-06, + "loss": 0.4615, + "step": 9634 + }, + { + "epoch": 0.78, + "grad_norm": 0.8627314779565832, + "learning_rate": 2.368398233967668e-06, + "loss": 0.4861, + "step": 9635 + }, + { + "epoch": 0.78, + "grad_norm": 0.9965660580842806, + "learning_rate": 2.3666973700485207e-06, + "loss": 0.4252, + "step": 9636 + }, + { + "epoch": 0.78, + "grad_norm": 0.8821204773558616, + "learning_rate": 2.3649970351104744e-06, + "loss": 0.4828, + "step": 9637 + }, + { + "epoch": 0.78, + "grad_norm": 0.8515620291830194, + "learning_rate": 2.363297229271365e-06, + "loss": 0.4645, + "step": 9638 + }, + { + "epoch": 0.78, + "grad_norm": 0.9134141990975033, + "learning_rate": 2.3615979526489773e-06, + "loss": 0.4702, + "step": 9639 + }, + { + "epoch": 0.78, + "grad_norm": 1.045250600324794, + "learning_rate": 2.3598992053610826e-06, + "loss": 0.514, + "step": 9640 + }, + { + "epoch": 0.78, + "grad_norm": 0.9719330232433728, + "learning_rate": 2.358200987525393e-06, + "loss": 0.4843, + "step": 9641 + }, + { + "epoch": 0.78, + "grad_norm": 0.9890798245689238, + "learning_rate": 2.356503299259597e-06, + "loss": 0.5332, + "step": 9642 + }, + { + "epoch": 0.78, + "grad_norm": 0.951865022725942, + "learning_rate": 2.354806140681343e-06, + "loss": 0.477, + "step": 9643 + }, + { + "epoch": 0.78, + "grad_norm": 0.9038312048330461, + "learning_rate": 2.353109511908245e-06, + "loss": 0.4452, + "step": 9644 + }, + { + "epoch": 0.78, + "grad_norm": 0.9811003847716361, + "learning_rate": 2.351413413057868e-06, + "loss": 0.5233, + "step": 9645 + }, + { + "epoch": 0.78, + "grad_norm": 0.9479546781437039, + "learning_rate": 2.349717844247764e-06, + "loss": 0.4685, + "step": 9646 + }, + { + "epoch": 0.78, + "grad_norm": 0.8049590726514992, + "learning_rate": 2.3480228055954246e-06, + "loss": 0.4421, + "step": 9647 + }, + { + "epoch": 0.78, + "grad_norm": 0.9733781462938889, + "learning_rate": 2.3463282972183176e-06, + "loss": 0.5053, + "step": 9648 + }, + { + "epoch": 0.78, + "grad_norm": 0.9600108414024033, + "learning_rate": 2.3446343192338705e-06, + "loss": 0.5029, + "step": 9649 + }, + { + "epoch": 0.78, + "grad_norm": 0.908930633454571, + "learning_rate": 2.342940871759477e-06, + "loss": 0.51, + "step": 9650 + }, + { + "epoch": 0.78, + "grad_norm": 0.8563848690253233, + "learning_rate": 2.3412479549124843e-06, + "loss": 0.4161, + "step": 9651 + }, + { + "epoch": 0.78, + "grad_norm": 0.9269137705402973, + "learning_rate": 2.339555568810221e-06, + "loss": 0.4778, + "step": 9652 + }, + { + "epoch": 0.78, + "grad_norm": 0.9345994871074489, + "learning_rate": 2.337863713569959e-06, + "loss": 0.4064, + "step": 9653 + }, + { + "epoch": 0.78, + "grad_norm": 0.8561141204612545, + "learning_rate": 2.336172389308945e-06, + "loss": 0.4461, + "step": 9654 + }, + { + "epoch": 0.78, + "grad_norm": 0.9263354207130737, + "learning_rate": 2.334481596144387e-06, + "loss": 0.4931, + "step": 9655 + }, + { + "epoch": 0.78, + "grad_norm": 0.9646139588123631, + "learning_rate": 2.3327913341934573e-06, + "loss": 0.4798, + "step": 9656 + }, + { + "epoch": 0.78, + "grad_norm": 0.9821076222973479, + "learning_rate": 2.3311016035732825e-06, + "loss": 0.5305, + "step": 9657 + }, + { + "epoch": 0.78, + "grad_norm": 0.8870591207772264, + "learning_rate": 2.329412404400969e-06, + "loss": 0.4557, + "step": 9658 + }, + { + "epoch": 0.79, + "grad_norm": 0.8341473641915862, + "learning_rate": 2.3277237367935702e-06, + "loss": 0.3986, + "step": 9659 + }, + { + "epoch": 0.79, + "grad_norm": 1.0667231560326589, + "learning_rate": 2.3260356008681107e-06, + "loss": 0.5143, + "step": 9660 + }, + { + "epoch": 0.79, + "grad_norm": 0.9747798801186214, + "learning_rate": 2.3243479967415773e-06, + "loss": 0.4896, + "step": 9661 + }, + { + "epoch": 0.79, + "grad_norm": 0.934749581531243, + "learning_rate": 2.322660924530922e-06, + "loss": 0.4454, + "step": 9662 + }, + { + "epoch": 0.79, + "grad_norm": 0.9290559957629269, + "learning_rate": 2.32097438435305e-06, + "loss": 0.4966, + "step": 9663 + }, + { + "epoch": 0.79, + "grad_norm": 1.0453163615005325, + "learning_rate": 2.319288376324846e-06, + "loss": 0.4847, + "step": 9664 + }, + { + "epoch": 0.79, + "grad_norm": 0.9662583465504331, + "learning_rate": 2.317602900563143e-06, + "loss": 0.5262, + "step": 9665 + }, + { + "epoch": 0.79, + "grad_norm": 1.0136411008586397, + "learning_rate": 2.3159179571847446e-06, + "loss": 0.5118, + "step": 9666 + }, + { + "epoch": 0.79, + "grad_norm": 0.9338870766327807, + "learning_rate": 2.314233546306416e-06, + "loss": 0.4899, + "step": 9667 + }, + { + "epoch": 0.79, + "grad_norm": 1.472138551273479, + "learning_rate": 2.3125496680448877e-06, + "loss": 0.5472, + "step": 9668 + }, + { + "epoch": 0.79, + "grad_norm": 0.809799494625484, + "learning_rate": 2.3108663225168436e-06, + "loss": 0.424, + "step": 9669 + }, + { + "epoch": 0.79, + "grad_norm": 0.9614148731201514, + "learning_rate": 2.3091835098389493e-06, + "loss": 0.4957, + "step": 9670 + }, + { + "epoch": 0.79, + "grad_norm": 1.0110403015033504, + "learning_rate": 2.307501230127812e-06, + "loss": 0.4923, + "step": 9671 + }, + { + "epoch": 0.79, + "grad_norm": 0.9439389881727236, + "learning_rate": 2.3058194835000167e-06, + "loss": 0.4792, + "step": 9672 + }, + { + "epoch": 0.79, + "grad_norm": 0.9739089259424125, + "learning_rate": 2.3041382700721073e-06, + "loss": 0.5028, + "step": 9673 + }, + { + "epoch": 0.79, + "grad_norm": 0.9288763116072336, + "learning_rate": 2.3024575899605906e-06, + "loss": 0.5383, + "step": 9674 + }, + { + "epoch": 0.79, + "grad_norm": 1.0166464866010603, + "learning_rate": 2.3007774432819308e-06, + "loss": 0.4986, + "step": 9675 + }, + { + "epoch": 0.79, + "grad_norm": 0.9510888787436993, + "learning_rate": 2.2990978301525702e-06, + "loss": 0.4687, + "step": 9676 + }, + { + "epoch": 0.79, + "grad_norm": 1.015865022970959, + "learning_rate": 2.297418750688897e-06, + "loss": 0.4786, + "step": 9677 + }, + { + "epoch": 0.79, + "grad_norm": 0.8650789735205615, + "learning_rate": 2.2957402050072717e-06, + "loss": 0.4507, + "step": 9678 + }, + { + "epoch": 0.79, + "grad_norm": 1.0254599114877216, + "learning_rate": 2.294062193224016e-06, + "loss": 0.4999, + "step": 9679 + }, + { + "epoch": 0.79, + "grad_norm": 0.9169938351661332, + "learning_rate": 2.292384715455419e-06, + "loss": 0.4526, + "step": 9680 + }, + { + "epoch": 0.79, + "grad_norm": 1.0437066091326381, + "learning_rate": 2.2907077718177183e-06, + "loss": 0.5488, + "step": 9681 + }, + { + "epoch": 0.79, + "grad_norm": 0.9602945871231334, + "learning_rate": 2.2890313624271363e-06, + "loss": 0.4523, + "step": 9682 + }, + { + "epoch": 0.79, + "grad_norm": 0.9967843968821193, + "learning_rate": 2.2873554873998393e-06, + "loss": 0.5272, + "step": 9683 + }, + { + "epoch": 0.79, + "grad_norm": 0.9266329545721437, + "learning_rate": 2.285680146851965e-06, + "loss": 0.4719, + "step": 9684 + }, + { + "epoch": 0.79, + "grad_norm": 0.9560564901228441, + "learning_rate": 2.2840053408996154e-06, + "loss": 0.5117, + "step": 9685 + }, + { + "epoch": 0.79, + "grad_norm": 0.9832147229170937, + "learning_rate": 2.28233106965885e-06, + "loss": 0.4882, + "step": 9686 + }, + { + "epoch": 0.79, + "grad_norm": 0.9488749898311409, + "learning_rate": 2.2806573332456973e-06, + "loss": 0.4401, + "step": 9687 + }, + { + "epoch": 0.79, + "grad_norm": 0.9169415250097036, + "learning_rate": 2.278984131776145e-06, + "loss": 0.4657, + "step": 9688 + }, + { + "epoch": 0.79, + "grad_norm": 0.8472737030846431, + "learning_rate": 2.2773114653661433e-06, + "loss": 0.4264, + "step": 9689 + }, + { + "epoch": 0.79, + "grad_norm": 0.9309930762749372, + "learning_rate": 2.2756393341316065e-06, + "loss": 0.4762, + "step": 9690 + }, + { + "epoch": 0.79, + "grad_norm": 0.9340556544258505, + "learning_rate": 2.2739677381884117e-06, + "loss": 0.4586, + "step": 9691 + }, + { + "epoch": 0.79, + "grad_norm": 0.8572938512075292, + "learning_rate": 2.272296677652399e-06, + "loss": 0.4299, + "step": 9692 + }, + { + "epoch": 0.79, + "grad_norm": 0.9464483530843911, + "learning_rate": 2.2706261526393734e-06, + "loss": 0.4628, + "step": 9693 + }, + { + "epoch": 0.79, + "grad_norm": 0.9382340497970053, + "learning_rate": 2.2689561632651024e-06, + "loss": 0.4493, + "step": 9694 + }, + { + "epoch": 0.79, + "grad_norm": 0.9729695950363514, + "learning_rate": 2.267286709645309e-06, + "loss": 0.4547, + "step": 9695 + }, + { + "epoch": 0.79, + "grad_norm": 0.9290966977858953, + "learning_rate": 2.2656177918956867e-06, + "loss": 0.499, + "step": 9696 + }, + { + "epoch": 0.79, + "grad_norm": 0.9104737506886303, + "learning_rate": 2.2639494101318914e-06, + "loss": 0.4894, + "step": 9697 + }, + { + "epoch": 0.79, + "grad_norm": 0.8565759329384154, + "learning_rate": 2.262281564469541e-06, + "loss": 0.472, + "step": 9698 + }, + { + "epoch": 0.79, + "grad_norm": 0.9881347553725216, + "learning_rate": 2.260614255024214e-06, + "loss": 0.5355, + "step": 9699 + }, + { + "epoch": 0.79, + "grad_norm": 0.8516162680336561, + "learning_rate": 2.2589474819114564e-06, + "loss": 0.4418, + "step": 9700 + }, + { + "epoch": 0.79, + "grad_norm": 1.0157475120079382, + "learning_rate": 2.2572812452467708e-06, + "loss": 0.5232, + "step": 9701 + }, + { + "epoch": 0.79, + "grad_norm": 0.9035265623515684, + "learning_rate": 2.255615545145626e-06, + "loss": 0.4444, + "step": 9702 + }, + { + "epoch": 0.79, + "grad_norm": 0.9245265400864687, + "learning_rate": 2.2539503817234553e-06, + "loss": 0.5143, + "step": 9703 + }, + { + "epoch": 0.79, + "grad_norm": 0.9118533686851575, + "learning_rate": 2.252285755095652e-06, + "loss": 0.5552, + "step": 9704 + }, + { + "epoch": 0.79, + "grad_norm": 0.8264811753969631, + "learning_rate": 2.2506216653775736e-06, + "loss": 0.5094, + "step": 9705 + }, + { + "epoch": 0.79, + "grad_norm": 0.8832554789712772, + "learning_rate": 2.2489581126845408e-06, + "loss": 0.4546, + "step": 9706 + }, + { + "epoch": 0.79, + "grad_norm": 0.8941894011676278, + "learning_rate": 2.2472950971318377e-06, + "loss": 0.4624, + "step": 9707 + }, + { + "epoch": 0.79, + "grad_norm": 0.9404419890377869, + "learning_rate": 2.2456326188347045e-06, + "loss": 0.5373, + "step": 9708 + }, + { + "epoch": 0.79, + "grad_norm": 0.8772512482369041, + "learning_rate": 2.2439706779083538e-06, + "loss": 0.431, + "step": 9709 + }, + { + "epoch": 0.79, + "grad_norm": 0.9547698813593285, + "learning_rate": 2.2423092744679553e-06, + "loss": 0.5533, + "step": 9710 + }, + { + "epoch": 0.79, + "grad_norm": 0.93032348439044, + "learning_rate": 2.240648408628643e-06, + "loss": 0.4784, + "step": 9711 + }, + { + "epoch": 0.79, + "grad_norm": 0.9153025168867757, + "learning_rate": 2.238988080505513e-06, + "loss": 0.4683, + "step": 9712 + }, + { + "epoch": 0.79, + "grad_norm": 0.9114927690819346, + "learning_rate": 2.2373282902136273e-06, + "loss": 0.4661, + "step": 9713 + }, + { + "epoch": 0.79, + "grad_norm": 0.8452718104433999, + "learning_rate": 2.2356690378680036e-06, + "loss": 0.4077, + "step": 9714 + }, + { + "epoch": 0.79, + "grad_norm": 0.8560275740625997, + "learning_rate": 2.2340103235836286e-06, + "loss": 0.5184, + "step": 9715 + }, + { + "epoch": 0.79, + "grad_norm": 1.0038013957013665, + "learning_rate": 2.2323521474754508e-06, + "loss": 0.5474, + "step": 9716 + }, + { + "epoch": 0.79, + "grad_norm": 0.946783776261924, + "learning_rate": 2.2306945096583775e-06, + "loss": 0.4912, + "step": 9717 + }, + { + "epoch": 0.79, + "grad_norm": 0.9489969255455457, + "learning_rate": 2.2290374102472846e-06, + "loss": 0.4942, + "step": 9718 + }, + { + "epoch": 0.79, + "grad_norm": 0.9521301748260809, + "learning_rate": 2.2273808493570082e-06, + "loss": 0.4411, + "step": 9719 + }, + { + "epoch": 0.79, + "grad_norm": 0.9577675331331204, + "learning_rate": 2.2257248271023424e-06, + "loss": 0.4104, + "step": 9720 + }, + { + "epoch": 0.79, + "grad_norm": 1.0214226660713241, + "learning_rate": 2.22406934359805e-06, + "loss": 0.4496, + "step": 9721 + }, + { + "epoch": 0.79, + "grad_norm": 0.8994988307598074, + "learning_rate": 2.2224143989588545e-06, + "loss": 0.4956, + "step": 9722 + }, + { + "epoch": 0.79, + "grad_norm": 0.8937085342620766, + "learning_rate": 2.2207599932994427e-06, + "loss": 0.4643, + "step": 9723 + }, + { + "epoch": 0.79, + "grad_norm": 0.9139353897263605, + "learning_rate": 2.2191061267344636e-06, + "loss": 0.4705, + "step": 9724 + }, + { + "epoch": 0.79, + "grad_norm": 0.8709930127962902, + "learning_rate": 2.217452799378531e-06, + "loss": 0.4819, + "step": 9725 + }, + { + "epoch": 0.79, + "grad_norm": 0.995164754817237, + "learning_rate": 2.215800011346211e-06, + "loss": 0.5095, + "step": 9726 + }, + { + "epoch": 0.79, + "grad_norm": 0.8863722636206753, + "learning_rate": 2.2141477627520504e-06, + "loss": 0.4836, + "step": 9727 + }, + { + "epoch": 0.79, + "grad_norm": 1.0011157201630954, + "learning_rate": 2.212496053710541e-06, + "loss": 0.5673, + "step": 9728 + }, + { + "epoch": 0.79, + "grad_norm": 0.9278185762841324, + "learning_rate": 2.2108448843361487e-06, + "loss": 0.4849, + "step": 9729 + }, + { + "epoch": 0.79, + "grad_norm": 0.9013886456943522, + "learning_rate": 2.209194254743295e-06, + "loss": 0.4775, + "step": 9730 + }, + { + "epoch": 0.79, + "grad_norm": 0.9261698216325288, + "learning_rate": 2.2075441650463734e-06, + "loss": 0.4504, + "step": 9731 + }, + { + "epoch": 0.79, + "grad_norm": 1.0073436550756663, + "learning_rate": 2.205894615359724e-06, + "loss": 0.5411, + "step": 9732 + }, + { + "epoch": 0.79, + "grad_norm": 1.1251748379561548, + "learning_rate": 2.2042456057976693e-06, + "loss": 0.5031, + "step": 9733 + }, + { + "epoch": 0.79, + "grad_norm": 1.012572403508521, + "learning_rate": 2.2025971364744758e-06, + "loss": 0.506, + "step": 9734 + }, + { + "epoch": 0.79, + "grad_norm": 0.93453836437706, + "learning_rate": 2.2009492075043847e-06, + "loss": 0.5359, + "step": 9735 + }, + { + "epoch": 0.79, + "grad_norm": 0.9697596467071894, + "learning_rate": 2.199301819001597e-06, + "loss": 0.5473, + "step": 9736 + }, + { + "epoch": 0.79, + "grad_norm": 1.071892038087734, + "learning_rate": 2.1976549710802754e-06, + "loss": 0.4695, + "step": 9737 + }, + { + "epoch": 0.79, + "grad_norm": 1.0109392674982929, + "learning_rate": 2.1960086638545385e-06, + "loss": 0.4593, + "step": 9738 + }, + { + "epoch": 0.79, + "grad_norm": 0.8912902858755715, + "learning_rate": 2.1943628974384858e-06, + "loss": 0.4292, + "step": 9739 + }, + { + "epoch": 0.79, + "grad_norm": 0.9169358523666661, + "learning_rate": 2.192717671946156e-06, + "loss": 0.5011, + "step": 9740 + }, + { + "epoch": 0.79, + "grad_norm": 0.9412617557898649, + "learning_rate": 2.191072987491567e-06, + "loss": 0.479, + "step": 9741 + }, + { + "epoch": 0.79, + "grad_norm": 0.9641902627726724, + "learning_rate": 2.1894288441886946e-06, + "loss": 0.484, + "step": 9742 + }, + { + "epoch": 0.79, + "grad_norm": 0.9835215942561565, + "learning_rate": 2.1877852421514767e-06, + "loss": 0.5121, + "step": 9743 + }, + { + "epoch": 0.79, + "grad_norm": 0.8855674381525747, + "learning_rate": 2.1861421814938076e-06, + "loss": 0.3979, + "step": 9744 + }, + { + "epoch": 0.79, + "grad_norm": 0.9306720357373683, + "learning_rate": 2.18449966232956e-06, + "loss": 0.487, + "step": 9745 + }, + { + "epoch": 0.79, + "grad_norm": 0.9644690013564537, + "learning_rate": 2.18285768477255e-06, + "loss": 0.4636, + "step": 9746 + }, + { + "epoch": 0.79, + "grad_norm": 0.9276625157351951, + "learning_rate": 2.1812162489365686e-06, + "loss": 0.5162, + "step": 9747 + }, + { + "epoch": 0.79, + "grad_norm": 0.9515009929127202, + "learning_rate": 2.179575354935366e-06, + "loss": 0.5239, + "step": 9748 + }, + { + "epoch": 0.79, + "grad_norm": 0.9622644087270956, + "learning_rate": 2.1779350028826584e-06, + "loss": 0.5131, + "step": 9749 + }, + { + "epoch": 0.79, + "grad_norm": 0.9126946605383349, + "learning_rate": 2.1762951928921105e-06, + "loss": 0.4957, + "step": 9750 + }, + { + "epoch": 0.79, + "grad_norm": 1.0443250368096582, + "learning_rate": 2.174655925077371e-06, + "loss": 0.5188, + "step": 9751 + }, + { + "epoch": 0.79, + "grad_norm": 0.8529784079548666, + "learning_rate": 2.1730171995520334e-06, + "loss": 0.452, + "step": 9752 + }, + { + "epoch": 0.79, + "grad_norm": 0.9125903642765387, + "learning_rate": 2.171379016429661e-06, + "loss": 0.4801, + "step": 9753 + }, + { + "epoch": 0.79, + "grad_norm": 1.2263550367912526, + "learning_rate": 2.1697413758237785e-06, + "loss": 0.5218, + "step": 9754 + }, + { + "epoch": 0.79, + "grad_norm": 0.8015976310048045, + "learning_rate": 2.1681042778478755e-06, + "loss": 0.4424, + "step": 9755 + }, + { + "epoch": 0.79, + "grad_norm": 1.0733405439258612, + "learning_rate": 2.166467722615394e-06, + "loss": 0.5043, + "step": 9756 + }, + { + "epoch": 0.79, + "grad_norm": 0.9307547619272879, + "learning_rate": 2.1648317102397565e-06, + "loss": 0.4392, + "step": 9757 + }, + { + "epoch": 0.79, + "grad_norm": 0.9033183363968029, + "learning_rate": 2.1631962408343264e-06, + "loss": 0.5032, + "step": 9758 + }, + { + "epoch": 0.79, + "grad_norm": 1.0066450643953964, + "learning_rate": 2.1615613145124514e-06, + "loss": 0.5493, + "step": 9759 + }, + { + "epoch": 0.79, + "grad_norm": 0.9444646156175104, + "learning_rate": 2.1599269313874217e-06, + "loss": 0.488, + "step": 9760 + }, + { + "epoch": 0.79, + "grad_norm": 1.014755699670725, + "learning_rate": 2.158293091572501e-06, + "loss": 0.5632, + "step": 9761 + }, + { + "epoch": 0.79, + "grad_norm": 1.022504963163021, + "learning_rate": 2.156659795180913e-06, + "loss": 0.5, + "step": 9762 + }, + { + "epoch": 0.79, + "grad_norm": 0.8841065130850687, + "learning_rate": 2.155027042325848e-06, + "loss": 0.5152, + "step": 9763 + }, + { + "epoch": 0.79, + "grad_norm": 0.8805351611241143, + "learning_rate": 2.1533948331204445e-06, + "loss": 0.4458, + "step": 9764 + }, + { + "epoch": 0.79, + "grad_norm": 1.1318116744956215, + "learning_rate": 2.151763167677825e-06, + "loss": 0.4929, + "step": 9765 + }, + { + "epoch": 0.79, + "grad_norm": 0.9358085439942705, + "learning_rate": 2.150132046111054e-06, + "loss": 0.5011, + "step": 9766 + }, + { + "epoch": 0.79, + "grad_norm": 0.9357248606524596, + "learning_rate": 2.1485014685331684e-06, + "loss": 0.5508, + "step": 9767 + }, + { + "epoch": 0.79, + "grad_norm": 0.9589413280149912, + "learning_rate": 2.1468714350571683e-06, + "loss": 0.46, + "step": 9768 + }, + { + "epoch": 0.79, + "grad_norm": 0.9224443985715425, + "learning_rate": 2.145241945796014e-06, + "loss": 0.4801, + "step": 9769 + }, + { + "epoch": 0.79, + "grad_norm": 0.8847139248412932, + "learning_rate": 2.14361300086262e-06, + "loss": 0.5088, + "step": 9770 + }, + { + "epoch": 0.79, + "grad_norm": 0.962732416595997, + "learning_rate": 2.141984600369882e-06, + "loss": 0.4688, + "step": 9771 + }, + { + "epoch": 0.79, + "grad_norm": 0.8677824980011563, + "learning_rate": 2.1403567444306384e-06, + "loss": 0.4751, + "step": 9772 + }, + { + "epoch": 0.79, + "grad_norm": 0.9071070373566746, + "learning_rate": 2.1387294331577e-06, + "loss": 0.4645, + "step": 9773 + }, + { + "epoch": 0.79, + "grad_norm": 0.9406593469392999, + "learning_rate": 2.1371026666638404e-06, + "loss": 0.4911, + "step": 9774 + }, + { + "epoch": 0.79, + "grad_norm": 0.9851660325459202, + "learning_rate": 2.1354764450617937e-06, + "loss": 0.4579, + "step": 9775 + }, + { + "epoch": 0.79, + "grad_norm": 0.9575769497626949, + "learning_rate": 2.1338507684642483e-06, + "loss": 0.4854, + "step": 9776 + }, + { + "epoch": 0.79, + "grad_norm": 0.9043458981275626, + "learning_rate": 2.1322256369838723e-06, + "loss": 0.4312, + "step": 9777 + }, + { + "epoch": 0.79, + "grad_norm": 0.9136899357618365, + "learning_rate": 2.1306010507332787e-06, + "loss": 0.4731, + "step": 9778 + }, + { + "epoch": 0.79, + "grad_norm": 0.9489654635266751, + "learning_rate": 2.128977009825052e-06, + "loss": 0.4946, + "step": 9779 + }, + { + "epoch": 0.79, + "grad_norm": 0.886178646999231, + "learning_rate": 2.1273535143717372e-06, + "loss": 0.4779, + "step": 9780 + }, + { + "epoch": 0.79, + "grad_norm": 0.9274155006249445, + "learning_rate": 2.125730564485844e-06, + "loss": 0.4499, + "step": 9781 + }, + { + "epoch": 0.8, + "grad_norm": 1.0206326818075697, + "learning_rate": 2.124108160279832e-06, + "loss": 0.5138, + "step": 9782 + }, + { + "epoch": 0.8, + "grad_norm": 0.9908500142071262, + "learning_rate": 2.1224863018661435e-06, + "loss": 0.5542, + "step": 9783 + }, + { + "epoch": 0.8, + "grad_norm": 0.9421564240045448, + "learning_rate": 2.1208649893571653e-06, + "loss": 0.472, + "step": 9784 + }, + { + "epoch": 0.8, + "grad_norm": 0.9529297223093769, + "learning_rate": 2.119244222865253e-06, + "loss": 0.5093, + "step": 9785 + }, + { + "epoch": 0.8, + "grad_norm": 0.983152381687885, + "learning_rate": 2.117624002502727e-06, + "loss": 0.5081, + "step": 9786 + }, + { + "epoch": 0.8, + "grad_norm": 0.9269249520941745, + "learning_rate": 2.1160043283818697e-06, + "loss": 0.4889, + "step": 9787 + }, + { + "epoch": 0.8, + "grad_norm": 0.9311810373511374, + "learning_rate": 2.114385200614912e-06, + "loss": 0.4725, + "step": 9788 + }, + { + "epoch": 0.8, + "grad_norm": 0.9330190902902091, + "learning_rate": 2.112766619314072e-06, + "loss": 0.5116, + "step": 9789 + }, + { + "epoch": 0.8, + "grad_norm": 0.9001553215481263, + "learning_rate": 2.111148584591506e-06, + "loss": 0.5007, + "step": 9790 + }, + { + "epoch": 0.8, + "grad_norm": 1.0008529581665582, + "learning_rate": 2.1095310965593463e-06, + "loss": 0.5086, + "step": 9791 + }, + { + "epoch": 0.8, + "grad_norm": 0.969189216671511, + "learning_rate": 2.107914155329682e-06, + "loss": 0.542, + "step": 9792 + }, + { + "epoch": 0.8, + "grad_norm": 0.9746252765484869, + "learning_rate": 2.1062977610145697e-06, + "loss": 0.4838, + "step": 9793 + }, + { + "epoch": 0.8, + "grad_norm": 0.9223795804782231, + "learning_rate": 2.1046819137260155e-06, + "loss": 0.4721, + "step": 9794 + }, + { + "epoch": 0.8, + "grad_norm": 0.9041889187855691, + "learning_rate": 2.103066613576007e-06, + "loss": 0.476, + "step": 9795 + }, + { + "epoch": 0.8, + "grad_norm": 0.9913638606757014, + "learning_rate": 2.1014518606764744e-06, + "loss": 0.4791, + "step": 9796 + }, + { + "epoch": 0.8, + "grad_norm": 0.97369286260393, + "learning_rate": 2.0998376551393218e-06, + "loss": 0.5438, + "step": 9797 + }, + { + "epoch": 0.8, + "grad_norm": 0.8437686252604286, + "learning_rate": 2.0982239970764127e-06, + "loss": 0.3833, + "step": 9798 + }, + { + "epoch": 0.8, + "grad_norm": 0.8703143734179483, + "learning_rate": 2.096610886599575e-06, + "loss": 0.4263, + "step": 9799 + }, + { + "epoch": 0.8, + "grad_norm": 1.035672950025825, + "learning_rate": 2.0949983238205863e-06, + "loss": 0.542, + "step": 9800 + }, + { + "epoch": 0.8, + "grad_norm": 0.9362809465383324, + "learning_rate": 2.0933863088512076e-06, + "loss": 0.4959, + "step": 9801 + }, + { + "epoch": 0.8, + "grad_norm": 0.9017408168518072, + "learning_rate": 2.0917748418031415e-06, + "loss": 0.4708, + "step": 9802 + }, + { + "epoch": 0.8, + "grad_norm": 1.0087137373104949, + "learning_rate": 2.0901639227880643e-06, + "loss": 0.4836, + "step": 9803 + }, + { + "epoch": 0.8, + "grad_norm": 0.8750531971104142, + "learning_rate": 2.0885535519176115e-06, + "loss": 0.4628, + "step": 9804 + }, + { + "epoch": 0.8, + "grad_norm": 0.9406350767966913, + "learning_rate": 2.0869437293033835e-06, + "loss": 0.4725, + "step": 9805 + }, + { + "epoch": 0.8, + "grad_norm": 0.9721435535796548, + "learning_rate": 2.08533445505693e-06, + "loss": 0.4899, + "step": 9806 + }, + { + "epoch": 0.8, + "grad_norm": 0.9351073095710531, + "learning_rate": 2.083725729289784e-06, + "loss": 0.462, + "step": 9807 + }, + { + "epoch": 0.8, + "grad_norm": 0.9166700362208122, + "learning_rate": 2.0821175521134208e-06, + "loss": 0.4798, + "step": 9808 + }, + { + "epoch": 0.8, + "grad_norm": 0.9199003461402707, + "learning_rate": 2.080509923639288e-06, + "loss": 0.4413, + "step": 9809 + }, + { + "epoch": 0.8, + "grad_norm": 0.9284407199731455, + "learning_rate": 2.078902843978792e-06, + "loss": 0.4966, + "step": 9810 + }, + { + "epoch": 0.8, + "grad_norm": 0.882987750650245, + "learning_rate": 2.0772963132433065e-06, + "loss": 0.4945, + "step": 9811 + }, + { + "epoch": 0.8, + "grad_norm": 0.9658350421424726, + "learning_rate": 2.0756903315441535e-06, + "loss": 0.5078, + "step": 9812 + }, + { + "epoch": 0.8, + "grad_norm": 0.9092385679023566, + "learning_rate": 2.0740848989926365e-06, + "loss": 0.4854, + "step": 9813 + }, + { + "epoch": 0.8, + "grad_norm": 1.0058460963863638, + "learning_rate": 2.0724800157000034e-06, + "loss": 0.5533, + "step": 9814 + }, + { + "epoch": 0.8, + "grad_norm": 0.8937420971510914, + "learning_rate": 2.0708756817774743e-06, + "loss": 0.401, + "step": 9815 + }, + { + "epoch": 0.8, + "grad_norm": 1.0443858846384912, + "learning_rate": 2.069271897336227e-06, + "loss": 0.5096, + "step": 9816 + }, + { + "epoch": 0.8, + "grad_norm": 1.0056854570600682, + "learning_rate": 2.0676686624874054e-06, + "loss": 0.5215, + "step": 9817 + }, + { + "epoch": 0.8, + "grad_norm": 0.9695529148626671, + "learning_rate": 2.066065977342103e-06, + "loss": 0.5354, + "step": 9818 + }, + { + "epoch": 0.8, + "grad_norm": 0.8368337032103081, + "learning_rate": 2.064463842011397e-06, + "loss": 0.4196, + "step": 9819 + }, + { + "epoch": 0.8, + "grad_norm": 0.8703363610587038, + "learning_rate": 2.0628622566063063e-06, + "loss": 0.481, + "step": 9820 + }, + { + "epoch": 0.8, + "grad_norm": 0.8656780500628107, + "learning_rate": 2.06126122123782e-06, + "loss": 0.431, + "step": 9821 + }, + { + "epoch": 0.8, + "grad_norm": 0.89463213892815, + "learning_rate": 2.0596607360168897e-06, + "loss": 0.5256, + "step": 9822 + }, + { + "epoch": 0.8, + "grad_norm": 0.8925044512010746, + "learning_rate": 2.058060801054429e-06, + "loss": 0.4504, + "step": 9823 + }, + { + "epoch": 0.8, + "grad_norm": 0.8702517722379945, + "learning_rate": 2.0564614164613064e-06, + "loss": 0.4574, + "step": 9824 + }, + { + "epoch": 0.8, + "grad_norm": 0.9533218505721511, + "learning_rate": 2.054862582348366e-06, + "loss": 0.5221, + "step": 9825 + }, + { + "epoch": 0.8, + "grad_norm": 0.9865782669574705, + "learning_rate": 2.0532642988263994e-06, + "loss": 0.5369, + "step": 9826 + }, + { + "epoch": 0.8, + "grad_norm": 1.086952839581523, + "learning_rate": 2.0516665660061675e-06, + "loss": 0.5524, + "step": 9827 + }, + { + "epoch": 0.8, + "grad_norm": 0.9065297806646161, + "learning_rate": 2.050069383998393e-06, + "loss": 0.4822, + "step": 9828 + }, + { + "epoch": 0.8, + "grad_norm": 0.9407575239281395, + "learning_rate": 2.0484727529137616e-06, + "loss": 0.4421, + "step": 9829 + }, + { + "epoch": 0.8, + "grad_norm": 1.0282959759015617, + "learning_rate": 2.0468766728629084e-06, + "loss": 0.5221, + "step": 9830 + }, + { + "epoch": 0.8, + "grad_norm": 0.9230027591994061, + "learning_rate": 2.045281143956455e-06, + "loss": 0.4707, + "step": 9831 + }, + { + "epoch": 0.8, + "grad_norm": 0.9706643303868061, + "learning_rate": 2.0436861663049577e-06, + "loss": 0.5287, + "step": 9832 + }, + { + "epoch": 0.8, + "grad_norm": 0.8624615421512666, + "learning_rate": 2.0420917400189532e-06, + "loss": 0.4672, + "step": 9833 + }, + { + "epoch": 0.8, + "grad_norm": 1.019489997515685, + "learning_rate": 2.0404978652089325e-06, + "loss": 0.5163, + "step": 9834 + }, + { + "epoch": 0.8, + "grad_norm": 0.8595574720850923, + "learning_rate": 2.0389045419853483e-06, + "loss": 0.4102, + "step": 9835 + }, + { + "epoch": 0.8, + "grad_norm": 1.0556470578827364, + "learning_rate": 2.037311770458619e-06, + "loss": 0.4998, + "step": 9836 + }, + { + "epoch": 0.8, + "grad_norm": 0.9005746593140518, + "learning_rate": 2.0357195507391237e-06, + "loss": 0.4978, + "step": 9837 + }, + { + "epoch": 0.8, + "grad_norm": 0.9848758102170952, + "learning_rate": 2.034127882937197e-06, + "loss": 0.5397, + "step": 9838 + }, + { + "epoch": 0.8, + "grad_norm": 1.0766503341546003, + "learning_rate": 2.032536767163141e-06, + "loss": 0.5174, + "step": 9839 + }, + { + "epoch": 0.8, + "grad_norm": 1.0545814409469874, + "learning_rate": 2.0309462035272207e-06, + "loss": 0.5166, + "step": 9840 + }, + { + "epoch": 0.8, + "grad_norm": 0.9315275679597564, + "learning_rate": 2.02935619213966e-06, + "loss": 0.4328, + "step": 9841 + }, + { + "epoch": 0.8, + "grad_norm": 0.9159068671624757, + "learning_rate": 2.0277667331106456e-06, + "loss": 0.4934, + "step": 9842 + }, + { + "epoch": 0.8, + "grad_norm": 0.9956840495068574, + "learning_rate": 2.026177826550326e-06, + "loss": 0.5241, + "step": 9843 + }, + { + "epoch": 0.8, + "grad_norm": 0.9763647677692302, + "learning_rate": 2.0245894725688097e-06, + "loss": 0.509, + "step": 9844 + }, + { + "epoch": 0.8, + "grad_norm": 0.8663904354062106, + "learning_rate": 2.023001671276168e-06, + "loss": 0.476, + "step": 9845 + }, + { + "epoch": 0.8, + "grad_norm": 0.9302822464015351, + "learning_rate": 2.021414422782435e-06, + "loss": 0.526, + "step": 9846 + }, + { + "epoch": 0.8, + "grad_norm": 0.9151958944350562, + "learning_rate": 2.019827727197605e-06, + "loss": 0.4823, + "step": 9847 + }, + { + "epoch": 0.8, + "grad_norm": 0.871270534613334, + "learning_rate": 2.018241584631636e-06, + "loss": 0.471, + "step": 9848 + }, + { + "epoch": 0.8, + "grad_norm": 0.9485613903404995, + "learning_rate": 2.0166559951944477e-06, + "loss": 0.5482, + "step": 9849 + }, + { + "epoch": 0.8, + "grad_norm": 1.0218535933745767, + "learning_rate": 2.015070958995915e-06, + "loss": 0.5702, + "step": 9850 + }, + { + "epoch": 0.8, + "grad_norm": 0.9870914624477832, + "learning_rate": 2.0134864761458815e-06, + "loss": 0.5596, + "step": 9851 + }, + { + "epoch": 0.8, + "grad_norm": 0.9066468027478812, + "learning_rate": 2.011902546754152e-06, + "loss": 0.4668, + "step": 9852 + }, + { + "epoch": 0.8, + "grad_norm": 0.929133236447092, + "learning_rate": 2.01031917093049e-06, + "loss": 0.5044, + "step": 9853 + }, + { + "epoch": 0.8, + "grad_norm": 0.9296472004568087, + "learning_rate": 2.0087363487846236e-06, + "loss": 0.464, + "step": 9854 + }, + { + "epoch": 0.8, + "grad_norm": 0.8409541167939985, + "learning_rate": 2.007154080426239e-06, + "loss": 0.4338, + "step": 9855 + }, + { + "epoch": 0.8, + "grad_norm": 0.8973893618611448, + "learning_rate": 2.0055723659649907e-06, + "loss": 0.4885, + "step": 9856 + }, + { + "epoch": 0.8, + "grad_norm": 0.8864057016507616, + "learning_rate": 2.0039912055104826e-06, + "loss": 0.412, + "step": 9857 + }, + { + "epoch": 0.8, + "grad_norm": 0.9103346867478691, + "learning_rate": 2.002410599172292e-06, + "loss": 0.4636, + "step": 9858 + }, + { + "epoch": 0.8, + "grad_norm": 0.9040653111981451, + "learning_rate": 2.0008305470599533e-06, + "loss": 0.3976, + "step": 9859 + }, + { + "epoch": 0.8, + "grad_norm": 0.9571850512409811, + "learning_rate": 1.999251049282962e-06, + "loss": 0.4924, + "step": 9860 + }, + { + "epoch": 0.8, + "grad_norm": 0.9831040082324778, + "learning_rate": 1.9976721059507766e-06, + "loss": 0.5063, + "step": 9861 + }, + { + "epoch": 0.8, + "grad_norm": 0.9674607752432008, + "learning_rate": 1.996093717172819e-06, + "loss": 0.4871, + "step": 9862 + }, + { + "epoch": 0.8, + "grad_norm": 0.8735604631321405, + "learning_rate": 1.994515883058464e-06, + "loss": 0.443, + "step": 9863 + }, + { + "epoch": 0.8, + "grad_norm": 0.8727484466187492, + "learning_rate": 1.9929386037170574e-06, + "loss": 0.4173, + "step": 9864 + }, + { + "epoch": 0.8, + "grad_norm": 0.893569677723104, + "learning_rate": 1.9913618792579037e-06, + "loss": 0.4256, + "step": 9865 + }, + { + "epoch": 0.8, + "grad_norm": 0.79604340010775, + "learning_rate": 1.9897857097902683e-06, + "loss": 0.4173, + "step": 9866 + }, + { + "epoch": 0.8, + "grad_norm": 0.9552205687203998, + "learning_rate": 1.9882100954233786e-06, + "loss": 0.449, + "step": 9867 + }, + { + "epoch": 0.8, + "grad_norm": 0.9992518118991707, + "learning_rate": 1.9866350362664243e-06, + "loss": 0.4373, + "step": 9868 + }, + { + "epoch": 0.8, + "grad_norm": 0.9556895391027411, + "learning_rate": 1.98506053242855e-06, + "loss": 0.529, + "step": 9869 + }, + { + "epoch": 0.8, + "grad_norm": 0.9899771953409056, + "learning_rate": 1.9834865840188767e-06, + "loss": 0.4856, + "step": 9870 + }, + { + "epoch": 0.8, + "grad_norm": 0.9073343077121624, + "learning_rate": 1.9819131911464682e-06, + "loss": 0.4587, + "step": 9871 + }, + { + "epoch": 0.8, + "grad_norm": 0.8637435483735759, + "learning_rate": 1.9803403539203657e-06, + "loss": 0.4847, + "step": 9872 + }, + { + "epoch": 0.8, + "grad_norm": 0.8414267955167837, + "learning_rate": 1.9787680724495617e-06, + "loss": 0.4817, + "step": 9873 + }, + { + "epoch": 0.8, + "grad_norm": 0.8750397831401274, + "learning_rate": 1.977196346843019e-06, + "loss": 0.4716, + "step": 9874 + }, + { + "epoch": 0.8, + "grad_norm": 0.986056259393715, + "learning_rate": 1.975625177209648e-06, + "loss": 0.5231, + "step": 9875 + }, + { + "epoch": 0.8, + "grad_norm": 0.9361734808532571, + "learning_rate": 1.9740545636583397e-06, + "loss": 0.5589, + "step": 9876 + }, + { + "epoch": 0.8, + "grad_norm": 0.8785889300977264, + "learning_rate": 1.9724845062979283e-06, + "loss": 0.4577, + "step": 9877 + }, + { + "epoch": 0.8, + "grad_norm": 0.9376677737805472, + "learning_rate": 1.9709150052372206e-06, + "loss": 0.4759, + "step": 9878 + }, + { + "epoch": 0.8, + "grad_norm": 0.9065893288202759, + "learning_rate": 1.96934606058498e-06, + "loss": 0.5017, + "step": 9879 + }, + { + "epoch": 0.8, + "grad_norm": 0.9532402399142695, + "learning_rate": 1.9677776724499354e-06, + "loss": 0.4666, + "step": 9880 + }, + { + "epoch": 0.8, + "grad_norm": 0.956789992815111, + "learning_rate": 1.9662098409407737e-06, + "loss": 0.489, + "step": 9881 + }, + { + "epoch": 0.8, + "grad_norm": 0.9697198209953076, + "learning_rate": 1.964642566166146e-06, + "loss": 0.4756, + "step": 9882 + }, + { + "epoch": 0.8, + "grad_norm": 0.9394385645475665, + "learning_rate": 1.963075848234659e-06, + "loss": 0.4685, + "step": 9883 + }, + { + "epoch": 0.8, + "grad_norm": 0.8343886173938521, + "learning_rate": 1.9615096872548865e-06, + "loss": 0.3947, + "step": 9884 + }, + { + "epoch": 0.8, + "grad_norm": 0.8882866642715643, + "learning_rate": 1.9599440833353624e-06, + "loss": 0.468, + "step": 9885 + }, + { + "epoch": 0.8, + "grad_norm": 0.9117902743086408, + "learning_rate": 1.9583790365845823e-06, + "loss": 0.5095, + "step": 9886 + }, + { + "epoch": 0.8, + "grad_norm": 0.919139555747959, + "learning_rate": 1.9568145471110024e-06, + "loss": 0.4837, + "step": 9887 + }, + { + "epoch": 0.8, + "grad_norm": 0.9313516499344854, + "learning_rate": 1.955250615023042e-06, + "loss": 0.4367, + "step": 9888 + }, + { + "epoch": 0.8, + "grad_norm": 0.9586301700560018, + "learning_rate": 1.953687240429073e-06, + "loss": 0.5063, + "step": 9889 + }, + { + "epoch": 0.8, + "grad_norm": 0.9736966037058312, + "learning_rate": 1.952124423437447e-06, + "loss": 0.5151, + "step": 9890 + }, + { + "epoch": 0.8, + "grad_norm": 0.8183733956732533, + "learning_rate": 1.9505621641564567e-06, + "loss": 0.4253, + "step": 9891 + }, + { + "epoch": 0.8, + "grad_norm": 0.9848703923333934, + "learning_rate": 1.9490004626943693e-06, + "loss": 0.5306, + "step": 9892 + }, + { + "epoch": 0.8, + "grad_norm": 0.9515937598579888, + "learning_rate": 1.9474393191594076e-06, + "loss": 0.5488, + "step": 9893 + }, + { + "epoch": 0.8, + "grad_norm": 0.9331171965430829, + "learning_rate": 1.9458787336597617e-06, + "loss": 0.5142, + "step": 9894 + }, + { + "epoch": 0.8, + "grad_norm": 0.9951701520003015, + "learning_rate": 1.9443187063035707e-06, + "loss": 0.4915, + "step": 9895 + }, + { + "epoch": 0.8, + "grad_norm": 1.0027301745746888, + "learning_rate": 1.9427592371989533e-06, + "loss": 0.4976, + "step": 9896 + }, + { + "epoch": 0.8, + "grad_norm": 1.0164921417024477, + "learning_rate": 1.94120032645397e-06, + "loss": 0.4775, + "step": 9897 + }, + { + "epoch": 0.8, + "grad_norm": 0.9411467082573, + "learning_rate": 1.939641974176658e-06, + "loss": 0.4446, + "step": 9898 + }, + { + "epoch": 0.8, + "grad_norm": 0.9271568784067858, + "learning_rate": 1.9380841804750063e-06, + "loss": 0.4688, + "step": 9899 + }, + { + "epoch": 0.8, + "grad_norm": 0.9318796970360436, + "learning_rate": 1.9365269454569724e-06, + "loss": 0.4144, + "step": 9900 + }, + { + "epoch": 0.8, + "grad_norm": 0.9201807476325126, + "learning_rate": 1.934970269230464e-06, + "loss": 0.4805, + "step": 9901 + }, + { + "epoch": 0.8, + "grad_norm": 0.9016938331848386, + "learning_rate": 1.9334141519033676e-06, + "loss": 0.4726, + "step": 9902 + }, + { + "epoch": 0.8, + "grad_norm": 0.944460903658482, + "learning_rate": 1.931858593583513e-06, + "loss": 0.4517, + "step": 9903 + }, + { + "epoch": 0.8, + "grad_norm": 0.9583119094345428, + "learning_rate": 1.9303035943787017e-06, + "loss": 0.4548, + "step": 9904 + }, + { + "epoch": 0.81, + "grad_norm": 0.9361054748728511, + "learning_rate": 1.928749154396693e-06, + "loss": 0.4618, + "step": 9905 + }, + { + "epoch": 0.81, + "grad_norm": 0.9730274138362711, + "learning_rate": 1.9271952737452116e-06, + "loss": 0.5136, + "step": 9906 + }, + { + "epoch": 0.81, + "grad_norm": 0.9661630484610683, + "learning_rate": 1.9256419525319316e-06, + "loss": 0.5029, + "step": 9907 + }, + { + "epoch": 0.81, + "grad_norm": 1.0001036297857349, + "learning_rate": 1.9240891908645075e-06, + "loss": 0.4867, + "step": 9908 + }, + { + "epoch": 0.81, + "grad_norm": 0.928526629453305, + "learning_rate": 1.9225369888505364e-06, + "loss": 0.4901, + "step": 9909 + }, + { + "epoch": 0.81, + "grad_norm": 0.9721509399919, + "learning_rate": 1.920985346597588e-06, + "loss": 0.469, + "step": 9910 + }, + { + "epoch": 0.81, + "grad_norm": 0.9657540152337845, + "learning_rate": 1.919434264213188e-06, + "loss": 0.4857, + "step": 9911 + }, + { + "epoch": 0.81, + "grad_norm": 0.8993769666472281, + "learning_rate": 1.917883741804829e-06, + "loss": 0.4626, + "step": 9912 + }, + { + "epoch": 0.81, + "grad_norm": 0.9028837295832993, + "learning_rate": 1.916333779479953e-06, + "loss": 0.4623, + "step": 9913 + }, + { + "epoch": 0.81, + "grad_norm": 0.9350439033146903, + "learning_rate": 1.914784377345982e-06, + "loss": 0.4892, + "step": 9914 + }, + { + "epoch": 0.81, + "grad_norm": 0.9658679451666188, + "learning_rate": 1.9132355355102772e-06, + "loss": 0.5008, + "step": 9915 + }, + { + "epoch": 0.81, + "grad_norm": 1.0958571373836872, + "learning_rate": 1.911687254080179e-06, + "loss": 0.5447, + "step": 9916 + }, + { + "epoch": 0.81, + "grad_norm": 0.9567367250311967, + "learning_rate": 1.910139533162978e-06, + "loss": 0.5503, + "step": 9917 + }, + { + "epoch": 0.81, + "grad_norm": 0.9283114470381654, + "learning_rate": 1.908592372865935e-06, + "loss": 0.4855, + "step": 9918 + }, + { + "epoch": 0.81, + "grad_norm": 0.902042369443858, + "learning_rate": 1.907045773296259e-06, + "loss": 0.4982, + "step": 9919 + }, + { + "epoch": 0.81, + "grad_norm": 0.904011334270541, + "learning_rate": 1.905499734561137e-06, + "loss": 0.4867, + "step": 9920 + }, + { + "epoch": 0.81, + "grad_norm": 0.8789908542582429, + "learning_rate": 1.9039542567677005e-06, + "loss": 0.4767, + "step": 9921 + }, + { + "epoch": 0.81, + "grad_norm": 0.9020209635226962, + "learning_rate": 1.9024093400230537e-06, + "loss": 0.4793, + "step": 9922 + }, + { + "epoch": 0.81, + "grad_norm": 0.9024267922140833, + "learning_rate": 1.9008649844342563e-06, + "loss": 0.4655, + "step": 9923 + }, + { + "epoch": 0.81, + "grad_norm": 0.8526771021123333, + "learning_rate": 1.8993211901083353e-06, + "loss": 0.4777, + "step": 9924 + }, + { + "epoch": 0.81, + "grad_norm": 0.9247623057643407, + "learning_rate": 1.8977779571522648e-06, + "loss": 0.4666, + "step": 9925 + }, + { + "epoch": 0.81, + "grad_norm": 0.904002456384038, + "learning_rate": 1.8962352856729994e-06, + "loss": 0.4571, + "step": 9926 + }, + { + "epoch": 0.81, + "grad_norm": 0.9967168916175249, + "learning_rate": 1.894693175777439e-06, + "loss": 0.4864, + "step": 9927 + }, + { + "epoch": 0.81, + "grad_norm": 0.9685133596114146, + "learning_rate": 1.8931516275724527e-06, + "loss": 0.4826, + "step": 9928 + }, + { + "epoch": 0.81, + "grad_norm": 0.9063216564545956, + "learning_rate": 1.8916106411648671e-06, + "loss": 0.4927, + "step": 9929 + }, + { + "epoch": 0.81, + "grad_norm": 0.9582577486773548, + "learning_rate": 1.8900702166614748e-06, + "loss": 0.5338, + "step": 9930 + }, + { + "epoch": 0.81, + "grad_norm": 0.9567418434865032, + "learning_rate": 1.888530354169017e-06, + "loss": 0.5355, + "step": 9931 + }, + { + "epoch": 0.81, + "grad_norm": 1.003573912510184, + "learning_rate": 1.886991053794217e-06, + "loss": 0.5383, + "step": 9932 + }, + { + "epoch": 0.81, + "grad_norm": 0.9694975334450654, + "learning_rate": 1.8854523156437378e-06, + "loss": 0.484, + "step": 9933 + }, + { + "epoch": 0.81, + "grad_norm": 1.098245385864126, + "learning_rate": 1.8839141398242145e-06, + "loss": 0.488, + "step": 9934 + }, + { + "epoch": 0.81, + "grad_norm": 0.8761128268936527, + "learning_rate": 1.8823765264422433e-06, + "loss": 0.4723, + "step": 9935 + }, + { + "epoch": 0.81, + "grad_norm": 0.9493800786184847, + "learning_rate": 1.8808394756043813e-06, + "loss": 0.4919, + "step": 9936 + }, + { + "epoch": 0.81, + "grad_norm": 1.3088263147613775, + "learning_rate": 1.8793029874171365e-06, + "loss": 0.5155, + "step": 9937 + }, + { + "epoch": 0.81, + "grad_norm": 0.9159160082187201, + "learning_rate": 1.877767061986997e-06, + "loss": 0.4776, + "step": 9938 + }, + { + "epoch": 0.81, + "grad_norm": 0.9965361309784785, + "learning_rate": 1.8762316994203933e-06, + "loss": 0.5508, + "step": 9939 + }, + { + "epoch": 0.81, + "grad_norm": 0.9365775609977088, + "learning_rate": 1.874696899823727e-06, + "loss": 0.4507, + "step": 9940 + }, + { + "epoch": 0.81, + "grad_norm": 0.9352189676628262, + "learning_rate": 1.8731626633033573e-06, + "loss": 0.4177, + "step": 9941 + }, + { + "epoch": 0.81, + "grad_norm": 1.001110662973336, + "learning_rate": 1.8716289899656104e-06, + "loss": 0.4951, + "step": 9942 + }, + { + "epoch": 0.81, + "grad_norm": 0.9350494135836109, + "learning_rate": 1.870095879916759e-06, + "loss": 0.4889, + "step": 9943 + }, + { + "epoch": 0.81, + "grad_norm": 0.897962644434671, + "learning_rate": 1.868563333263057e-06, + "loss": 0.4506, + "step": 9944 + }, + { + "epoch": 0.81, + "grad_norm": 0.9040067942876323, + "learning_rate": 1.8670313501107007e-06, + "loss": 0.4252, + "step": 9945 + }, + { + "epoch": 0.81, + "grad_norm": 0.9556567090230454, + "learning_rate": 1.8654999305658584e-06, + "loss": 0.4749, + "step": 9946 + }, + { + "epoch": 0.81, + "grad_norm": 1.0508522995130318, + "learning_rate": 1.863969074734655e-06, + "loss": 0.5285, + "step": 9947 + }, + { + "epoch": 0.81, + "grad_norm": 0.9940303655091299, + "learning_rate": 1.8624387827231815e-06, + "loss": 0.4676, + "step": 9948 + }, + { + "epoch": 0.81, + "grad_norm": 1.0131111844858838, + "learning_rate": 1.8609090546374764e-06, + "loss": 0.4565, + "step": 9949 + }, + { + "epoch": 0.81, + "grad_norm": 1.0139059658204395, + "learning_rate": 1.8593798905835602e-06, + "loss": 0.5247, + "step": 9950 + }, + { + "epoch": 0.81, + "grad_norm": 1.0027110042791425, + "learning_rate": 1.857851290667394e-06, + "loss": 0.5166, + "step": 9951 + }, + { + "epoch": 0.81, + "grad_norm": 0.9092026195676937, + "learning_rate": 1.8563232549949107e-06, + "loss": 0.4741, + "step": 9952 + }, + { + "epoch": 0.81, + "grad_norm": 0.9812192496856625, + "learning_rate": 1.8547957836720032e-06, + "loss": 0.5114, + "step": 9953 + }, + { + "epoch": 0.81, + "grad_norm": 0.9298750878339397, + "learning_rate": 1.853268876804526e-06, + "loss": 0.4892, + "step": 9954 + }, + { + "epoch": 0.81, + "grad_norm": 0.9908936671001775, + "learning_rate": 1.8517425344982831e-06, + "loss": 0.4935, + "step": 9955 + }, + { + "epoch": 0.81, + "grad_norm": 0.8566091038180487, + "learning_rate": 1.8502167568590611e-06, + "loss": 0.4603, + "step": 9956 + }, + { + "epoch": 0.81, + "grad_norm": 0.857918949746223, + "learning_rate": 1.8486915439925857e-06, + "loss": 0.4569, + "step": 9957 + }, + { + "epoch": 0.81, + "grad_norm": 0.9498102863681523, + "learning_rate": 1.8471668960045575e-06, + "loss": 0.4881, + "step": 9958 + }, + { + "epoch": 0.81, + "grad_norm": 1.020434986705423, + "learning_rate": 1.845642813000631e-06, + "loss": 0.5073, + "step": 9959 + }, + { + "epoch": 0.81, + "grad_norm": 0.8760589828007601, + "learning_rate": 1.8441192950864273e-06, + "loss": 0.4469, + "step": 9960 + }, + { + "epoch": 0.81, + "grad_norm": 0.9536048276057314, + "learning_rate": 1.8425963423675164e-06, + "loss": 0.4702, + "step": 9961 + }, + { + "epoch": 0.81, + "grad_norm": 0.9686770758998478, + "learning_rate": 1.8410739549494494e-06, + "loss": 0.5042, + "step": 9962 + }, + { + "epoch": 0.81, + "grad_norm": 0.9767156015980288, + "learning_rate": 1.8395521329377175e-06, + "loss": 0.5231, + "step": 9963 + }, + { + "epoch": 0.81, + "grad_norm": 0.8311337343768164, + "learning_rate": 1.8380308764377841e-06, + "loss": 0.4839, + "step": 9964 + }, + { + "epoch": 0.81, + "grad_norm": 0.8986677240710658, + "learning_rate": 1.8365101855550716e-06, + "loss": 0.564, + "step": 9965 + }, + { + "epoch": 0.81, + "grad_norm": 1.0756588962698206, + "learning_rate": 1.8349900603949644e-06, + "loss": 0.4841, + "step": 9966 + }, + { + "epoch": 0.81, + "grad_norm": 0.8578985450159758, + "learning_rate": 1.8334705010627996e-06, + "loss": 0.4791, + "step": 9967 + }, + { + "epoch": 0.81, + "grad_norm": 0.9483676107078279, + "learning_rate": 1.8319515076638893e-06, + "loss": 0.4844, + "step": 9968 + }, + { + "epoch": 0.81, + "grad_norm": 0.8803822299638585, + "learning_rate": 1.8304330803034932e-06, + "loss": 0.4359, + "step": 9969 + }, + { + "epoch": 0.81, + "grad_norm": 0.9990550895909772, + "learning_rate": 1.8289152190868376e-06, + "loss": 0.4617, + "step": 9970 + }, + { + "epoch": 0.81, + "grad_norm": 1.0814653391866287, + "learning_rate": 1.8273979241191087e-06, + "loss": 0.5006, + "step": 9971 + }, + { + "epoch": 0.81, + "grad_norm": 0.8372147293790334, + "learning_rate": 1.8258811955054578e-06, + "loss": 0.4535, + "step": 9972 + }, + { + "epoch": 0.81, + "grad_norm": 0.90090598405615, + "learning_rate": 1.8243650333509854e-06, + "loss": 0.4859, + "step": 9973 + }, + { + "epoch": 0.81, + "grad_norm": 0.8238375354825378, + "learning_rate": 1.8228494377607686e-06, + "loss": 0.3835, + "step": 9974 + }, + { + "epoch": 0.81, + "grad_norm": 0.9275498090759045, + "learning_rate": 1.82133440883983e-06, + "loss": 0.4434, + "step": 9975 + }, + { + "epoch": 0.81, + "grad_norm": 0.9679147524700789, + "learning_rate": 1.819819946693162e-06, + "loss": 0.4931, + "step": 9976 + }, + { + "epoch": 0.81, + "grad_norm": 0.9938967763690725, + "learning_rate": 1.8183060514257167e-06, + "loss": 0.506, + "step": 9977 + }, + { + "epoch": 0.81, + "grad_norm": 1.0073247438748036, + "learning_rate": 1.8167927231424077e-06, + "loss": 0.492, + "step": 9978 + }, + { + "epoch": 0.81, + "grad_norm": 0.9441319716629862, + "learning_rate": 1.8152799619480986e-06, + "loss": 0.4797, + "step": 9979 + }, + { + "epoch": 0.81, + "grad_norm": 0.8386503280748313, + "learning_rate": 1.813767767947634e-06, + "loss": 0.446, + "step": 9980 + }, + { + "epoch": 0.81, + "grad_norm": 0.9339502064681194, + "learning_rate": 1.8122561412457984e-06, + "loss": 0.4854, + "step": 9981 + }, + { + "epoch": 0.81, + "grad_norm": 0.9193184258857796, + "learning_rate": 1.8107450819473505e-06, + "loss": 0.4928, + "step": 9982 + }, + { + "epoch": 0.81, + "grad_norm": 0.8856714395591396, + "learning_rate": 1.8092345901570053e-06, + "loss": 0.4528, + "step": 9983 + }, + { + "epoch": 0.81, + "grad_norm": 0.9024058082729695, + "learning_rate": 1.8077246659794368e-06, + "loss": 0.4922, + "step": 9984 + }, + { + "epoch": 0.81, + "grad_norm": 0.9992991290855968, + "learning_rate": 1.8062153095192826e-06, + "loss": 0.4805, + "step": 9985 + }, + { + "epoch": 0.81, + "grad_norm": 0.9208122168752765, + "learning_rate": 1.8047065208811421e-06, + "loss": 0.5148, + "step": 9986 + }, + { + "epoch": 0.81, + "grad_norm": 0.8837422771578278, + "learning_rate": 1.8031983001695674e-06, + "loss": 0.4828, + "step": 9987 + }, + { + "epoch": 0.81, + "grad_norm": 0.9831378843029416, + "learning_rate": 1.8016906474890805e-06, + "loss": 0.5306, + "step": 9988 + }, + { + "epoch": 0.81, + "grad_norm": 0.9996761669814016, + "learning_rate": 1.80018356294416e-06, + "loss": 0.4487, + "step": 9989 + }, + { + "epoch": 0.81, + "grad_norm": 0.9537374004354636, + "learning_rate": 1.7986770466392445e-06, + "loss": 0.5528, + "step": 9990 + }, + { + "epoch": 0.81, + "grad_norm": 0.9596722342287717, + "learning_rate": 1.797171098678736e-06, + "loss": 0.486, + "step": 9991 + }, + { + "epoch": 0.81, + "grad_norm": 1.0236408628013114, + "learning_rate": 1.7956657191669969e-06, + "loss": 0.5186, + "step": 9992 + }, + { + "epoch": 0.81, + "grad_norm": 0.8524064003382771, + "learning_rate": 1.7941609082083434e-06, + "loss": 0.4421, + "step": 9993 + }, + { + "epoch": 0.81, + "grad_norm": 0.8894376608304295, + "learning_rate": 1.792656665907061e-06, + "loss": 0.4256, + "step": 9994 + }, + { + "epoch": 0.81, + "grad_norm": 0.8876867329730918, + "learning_rate": 1.7911529923673908e-06, + "loss": 0.4742, + "step": 9995 + }, + { + "epoch": 0.81, + "grad_norm": 1.0183024170797783, + "learning_rate": 1.7896498876935374e-06, + "loss": 0.5163, + "step": 9996 + }, + { + "epoch": 0.81, + "grad_norm": 0.9581560806374572, + "learning_rate": 1.7881473519896642e-06, + "loss": 0.4665, + "step": 9997 + }, + { + "epoch": 0.81, + "grad_norm": 0.8977743384520495, + "learning_rate": 1.7866453853598985e-06, + "loss": 0.497, + "step": 9998 + }, + { + "epoch": 0.81, + "grad_norm": 0.967090112841617, + "learning_rate": 1.7851439879083188e-06, + "loss": 0.4658, + "step": 9999 + }, + { + "epoch": 0.81, + "grad_norm": 0.9733260797622603, + "learning_rate": 1.7836431597389758e-06, + "loss": 0.5535, + "step": 10000 + }, + { + "epoch": 0.81, + "grad_norm": 0.9988413406779859, + "learning_rate": 1.7821429009558723e-06, + "loss": 0.5021, + "step": 10001 + }, + { + "epoch": 0.81, + "grad_norm": 0.9240963254362822, + "learning_rate": 1.7806432116629779e-06, + "loss": 0.4725, + "step": 10002 + }, + { + "epoch": 0.81, + "grad_norm": 0.9397626741846741, + "learning_rate": 1.7791440919642178e-06, + "loss": 0.5162, + "step": 10003 + }, + { + "epoch": 0.81, + "grad_norm": 1.0066418221971942, + "learning_rate": 1.7776455419634797e-06, + "loss": 0.4792, + "step": 10004 + }, + { + "epoch": 0.81, + "grad_norm": 1.007951156924576, + "learning_rate": 1.776147561764613e-06, + "loss": 0.4439, + "step": 10005 + }, + { + "epoch": 0.81, + "grad_norm": 0.900720562872108, + "learning_rate": 1.7746501514714277e-06, + "loss": 0.4627, + "step": 10006 + }, + { + "epoch": 0.81, + "grad_norm": 0.8383180580559433, + "learning_rate": 1.7731533111876887e-06, + "loss": 0.4472, + "step": 10007 + }, + { + "epoch": 0.81, + "grad_norm": 0.922154067229698, + "learning_rate": 1.7716570410171285e-06, + "loss": 0.4728, + "step": 10008 + }, + { + "epoch": 0.81, + "grad_norm": 0.9071711336797791, + "learning_rate": 1.7701613410634367e-06, + "loss": 0.4782, + "step": 10009 + }, + { + "epoch": 0.81, + "grad_norm": 0.9574378588931637, + "learning_rate": 1.7686662114302633e-06, + "loss": 0.4943, + "step": 10010 + }, + { + "epoch": 0.81, + "grad_norm": 0.974259591928751, + "learning_rate": 1.7671716522212212e-06, + "loss": 0.4604, + "step": 10011 + }, + { + "epoch": 0.81, + "grad_norm": 0.8729156997285382, + "learning_rate": 1.7656776635398832e-06, + "loss": 0.4835, + "step": 10012 + }, + { + "epoch": 0.81, + "grad_norm": 0.9662586084214412, + "learning_rate": 1.7641842454897772e-06, + "loss": 0.472, + "step": 10013 + }, + { + "epoch": 0.81, + "grad_norm": 0.8912794524180124, + "learning_rate": 1.7626913981743975e-06, + "loss": 0.4739, + "step": 10014 + }, + { + "epoch": 0.81, + "grad_norm": 0.9871749963458629, + "learning_rate": 1.761199121697197e-06, + "loss": 0.4227, + "step": 10015 + }, + { + "epoch": 0.81, + "grad_norm": 0.9448824397611246, + "learning_rate": 1.75970741616159e-06, + "loss": 0.5074, + "step": 10016 + }, + { + "epoch": 0.81, + "grad_norm": 0.9480742488796361, + "learning_rate": 1.7582162816709503e-06, + "loss": 0.451, + "step": 10017 + }, + { + "epoch": 0.81, + "grad_norm": 1.090097064319148, + "learning_rate": 1.7567257183286113e-06, + "loss": 0.5197, + "step": 10018 + }, + { + "epoch": 0.81, + "grad_norm": 0.9388593712796945, + "learning_rate": 1.7552357262378705e-06, + "loss": 0.4594, + "step": 10019 + }, + { + "epoch": 0.81, + "grad_norm": 0.9233057993892104, + "learning_rate": 1.7537463055019788e-06, + "loss": 0.4664, + "step": 10020 + }, + { + "epoch": 0.81, + "grad_norm": 0.9478652667485742, + "learning_rate": 1.7522574562241535e-06, + "loss": 0.4949, + "step": 10021 + }, + { + "epoch": 0.81, + "grad_norm": 0.880962159646419, + "learning_rate": 1.750769178507571e-06, + "loss": 0.5246, + "step": 10022 + }, + { + "epoch": 0.81, + "grad_norm": 1.0028321411856618, + "learning_rate": 1.7492814724553664e-06, + "loss": 0.4973, + "step": 10023 + }, + { + "epoch": 0.81, + "grad_norm": 0.9466321446941334, + "learning_rate": 1.7477943381706386e-06, + "loss": 0.4993, + "step": 10024 + }, + { + "epoch": 0.81, + "grad_norm": 0.9446176667725231, + "learning_rate": 1.7463077757564452e-06, + "loss": 0.4882, + "step": 10025 + }, + { + "epoch": 0.81, + "grad_norm": 0.8584036715987096, + "learning_rate": 1.7448217853158e-06, + "loss": 0.4834, + "step": 10026 + }, + { + "epoch": 0.81, + "grad_norm": 0.9167014228298731, + "learning_rate": 1.7433363669516823e-06, + "loss": 0.4908, + "step": 10027 + }, + { + "epoch": 0.82, + "grad_norm": 0.8828091074105395, + "learning_rate": 1.7418515207670306e-06, + "loss": 0.4757, + "step": 10028 + }, + { + "epoch": 0.82, + "grad_norm": 0.9526950744639585, + "learning_rate": 1.7403672468647436e-06, + "loss": 0.5107, + "step": 10029 + }, + { + "epoch": 0.82, + "grad_norm": 0.949876811338634, + "learning_rate": 1.7388835453476805e-06, + "loss": 0.5153, + "step": 10030 + }, + { + "epoch": 0.82, + "grad_norm": 0.8093822357593842, + "learning_rate": 1.737400416318663e-06, + "loss": 0.4581, + "step": 10031 + }, + { + "epoch": 0.82, + "grad_norm": 0.9160558834764048, + "learning_rate": 1.7359178598804637e-06, + "loss": 0.5068, + "step": 10032 + }, + { + "epoch": 0.82, + "grad_norm": 0.9326020932997082, + "learning_rate": 1.7344358761358283e-06, + "loss": 0.5048, + "step": 10033 + }, + { + "epoch": 0.82, + "grad_norm": 0.9103459982568537, + "learning_rate": 1.7329544651874542e-06, + "loss": 0.511, + "step": 10034 + }, + { + "epoch": 0.82, + "grad_norm": 1.0039486942702247, + "learning_rate": 1.7314736271380029e-06, + "loss": 0.4903, + "step": 10035 + }, + { + "epoch": 0.82, + "grad_norm": 0.9825062782601638, + "learning_rate": 1.7299933620900945e-06, + "loss": 0.5486, + "step": 10036 + }, + { + "epoch": 0.82, + "grad_norm": 0.9997925465861273, + "learning_rate": 1.7285136701463134e-06, + "loss": 0.4759, + "step": 10037 + }, + { + "epoch": 0.82, + "grad_norm": 0.9244469095884913, + "learning_rate": 1.7270345514091936e-06, + "loss": 0.4788, + "step": 10038 + }, + { + "epoch": 0.82, + "grad_norm": 1.033859739615693, + "learning_rate": 1.725556005981246e-06, + "loss": 0.5205, + "step": 10039 + }, + { + "epoch": 0.82, + "grad_norm": 1.0181678912579275, + "learning_rate": 1.7240780339649255e-06, + "loss": 0.5217, + "step": 10040 + }, + { + "epoch": 0.82, + "grad_norm": 0.9828476813834534, + "learning_rate": 1.7226006354626567e-06, + "loss": 0.5503, + "step": 10041 + }, + { + "epoch": 0.82, + "grad_norm": 0.9345978924819633, + "learning_rate": 1.7211238105768213e-06, + "loss": 0.5011, + "step": 10042 + }, + { + "epoch": 0.82, + "grad_norm": 0.9078361413963695, + "learning_rate": 1.719647559409765e-06, + "loss": 0.4669, + "step": 10043 + }, + { + "epoch": 0.82, + "grad_norm": 0.9774706241629969, + "learning_rate": 1.7181718820637839e-06, + "loss": 0.4891, + "step": 10044 + }, + { + "epoch": 0.82, + "grad_norm": 0.8467688352719807, + "learning_rate": 1.7166967786411493e-06, + "loss": 0.456, + "step": 10045 + }, + { + "epoch": 0.82, + "grad_norm": 0.9516124394986565, + "learning_rate": 1.7152222492440796e-06, + "loss": 0.444, + "step": 10046 + }, + { + "epoch": 0.82, + "grad_norm": 0.99074343373286, + "learning_rate": 1.7137482939747574e-06, + "loss": 0.5475, + "step": 10047 + }, + { + "epoch": 0.82, + "grad_norm": 0.9347063175331168, + "learning_rate": 1.71227491293533e-06, + "loss": 0.4965, + "step": 10048 + }, + { + "epoch": 0.82, + "grad_norm": 0.9351374387744484, + "learning_rate": 1.7108021062279023e-06, + "loss": 0.5165, + "step": 10049 + }, + { + "epoch": 0.82, + "grad_norm": 0.8743820993543441, + "learning_rate": 1.7093298739545305e-06, + "loss": 0.4837, + "step": 10050 + }, + { + "epoch": 0.82, + "grad_norm": 0.964425055640185, + "learning_rate": 1.7078582162172509e-06, + "loss": 0.4377, + "step": 10051 + }, + { + "epoch": 0.82, + "grad_norm": 0.993934678759803, + "learning_rate": 1.7063871331180382e-06, + "loss": 0.4785, + "step": 10052 + }, + { + "epoch": 0.82, + "grad_norm": 0.9542850256323199, + "learning_rate": 1.704916624758841e-06, + "loss": 0.5095, + "step": 10053 + }, + { + "epoch": 0.82, + "grad_norm": 1.003853157296499, + "learning_rate": 1.7034466912415638e-06, + "loss": 0.4687, + "step": 10054 + }, + { + "epoch": 0.82, + "grad_norm": 0.9517152267431251, + "learning_rate": 1.7019773326680745e-06, + "loss": 0.5356, + "step": 10055 + }, + { + "epoch": 0.82, + "grad_norm": 0.895878308211716, + "learning_rate": 1.7005085491401908e-06, + "loss": 0.4277, + "step": 10056 + }, + { + "epoch": 0.82, + "grad_norm": 0.9082358180531004, + "learning_rate": 1.6990403407597078e-06, + "loss": 0.4525, + "step": 10057 + }, + { + "epoch": 0.82, + "grad_norm": 0.9908955989458099, + "learning_rate": 1.6975727076283642e-06, + "loss": 0.5181, + "step": 10058 + }, + { + "epoch": 0.82, + "grad_norm": 0.8707024973610689, + "learning_rate": 1.6961056498478666e-06, + "loss": 0.4356, + "step": 10059 + }, + { + "epoch": 0.82, + "grad_norm": 0.9765181578225404, + "learning_rate": 1.6946391675198838e-06, + "loss": 0.5583, + "step": 10060 + }, + { + "epoch": 0.82, + "grad_norm": 0.958357904820011, + "learning_rate": 1.6931732607460405e-06, + "loss": 0.52, + "step": 10061 + }, + { + "epoch": 0.82, + "grad_norm": 1.0838303700246354, + "learning_rate": 1.6917079296279181e-06, + "loss": 0.5312, + "step": 10062 + }, + { + "epoch": 0.82, + "grad_norm": 0.7970114790011827, + "learning_rate": 1.690243174267071e-06, + "loss": 0.4548, + "step": 10063 + }, + { + "epoch": 0.82, + "grad_norm": 0.9075527996205083, + "learning_rate": 1.6887789947649991e-06, + "loss": 0.4227, + "step": 10064 + }, + { + "epoch": 0.82, + "grad_norm": 0.9414766364746932, + "learning_rate": 1.687315391223171e-06, + "loss": 0.4893, + "step": 10065 + }, + { + "epoch": 0.82, + "grad_norm": 0.9237635388169918, + "learning_rate": 1.6858523637430136e-06, + "loss": 0.4995, + "step": 10066 + }, + { + "epoch": 0.82, + "grad_norm": 0.9143611340266095, + "learning_rate": 1.6843899124259133e-06, + "loss": 0.4894, + "step": 10067 + }, + { + "epoch": 0.82, + "grad_norm": 0.9646461705809476, + "learning_rate": 1.6829280373732126e-06, + "loss": 0.4884, + "step": 10068 + }, + { + "epoch": 0.82, + "grad_norm": 0.9470992263257096, + "learning_rate": 1.681466738686227e-06, + "loss": 0.5024, + "step": 10069 + }, + { + "epoch": 0.82, + "grad_norm": 0.9202676930536429, + "learning_rate": 1.6800060164662146e-06, + "loss": 0.4791, + "step": 10070 + }, + { + "epoch": 0.82, + "grad_norm": 0.9064573173443682, + "learning_rate": 1.6785458708144053e-06, + "loss": 0.517, + "step": 10071 + }, + { + "epoch": 0.82, + "grad_norm": 0.9259280196769716, + "learning_rate": 1.677086301831986e-06, + "loss": 0.511, + "step": 10072 + }, + { + "epoch": 0.82, + "grad_norm": 0.9364034530429901, + "learning_rate": 1.675627309620107e-06, + "loss": 0.5445, + "step": 10073 + }, + { + "epoch": 0.82, + "grad_norm": 0.8440682150423323, + "learning_rate": 1.6741688942798663e-06, + "loss": 0.4847, + "step": 10074 + }, + { + "epoch": 0.82, + "grad_norm": 0.94140642798752, + "learning_rate": 1.6727110559123405e-06, + "loss": 0.4846, + "step": 10075 + }, + { + "epoch": 0.82, + "grad_norm": 0.9448124346191853, + "learning_rate": 1.6712537946185503e-06, + "loss": 0.4815, + "step": 10076 + }, + { + "epoch": 0.82, + "grad_norm": 0.9390309876950016, + "learning_rate": 1.6697971104994847e-06, + "loss": 0.4681, + "step": 10077 + }, + { + "epoch": 0.82, + "grad_norm": 1.5830720212508986, + "learning_rate": 1.6683410036560899e-06, + "loss": 0.4805, + "step": 10078 + }, + { + "epoch": 0.82, + "grad_norm": 0.9370005137394446, + "learning_rate": 1.666885474189276e-06, + "loss": 0.463, + "step": 10079 + }, + { + "epoch": 0.82, + "grad_norm": 0.8736685269958002, + "learning_rate": 1.6654305221999035e-06, + "loss": 0.465, + "step": 10080 + }, + { + "epoch": 0.82, + "grad_norm": 1.006200032644295, + "learning_rate": 1.663976147788806e-06, + "loss": 0.4743, + "step": 10081 + }, + { + "epoch": 0.82, + "grad_norm": 1.0545813247664437, + "learning_rate": 1.6625223510567667e-06, + "loss": 0.5324, + "step": 10082 + }, + { + "epoch": 0.82, + "grad_norm": 1.0119780993174157, + "learning_rate": 1.6610691321045325e-06, + "loss": 0.5058, + "step": 10083 + }, + { + "epoch": 0.82, + "grad_norm": 0.9609372264458037, + "learning_rate": 1.6596164910328106e-06, + "loss": 0.5189, + "step": 10084 + }, + { + "epoch": 0.82, + "grad_norm": 0.937127977991446, + "learning_rate": 1.6581644279422705e-06, + "loss": 0.4489, + "step": 10085 + }, + { + "epoch": 0.82, + "grad_norm": 0.9913881125396772, + "learning_rate": 1.6567129429335316e-06, + "loss": 0.4661, + "step": 10086 + }, + { + "epoch": 0.82, + "grad_norm": 0.9612288385089394, + "learning_rate": 1.6552620361071903e-06, + "loss": 0.432, + "step": 10087 + }, + { + "epoch": 0.82, + "grad_norm": 0.895201885032304, + "learning_rate": 1.6538117075637849e-06, + "loss": 0.4727, + "step": 10088 + }, + { + "epoch": 0.82, + "grad_norm": 0.8722790693420445, + "learning_rate": 1.6523619574038264e-06, + "loss": 0.4433, + "step": 10089 + }, + { + "epoch": 0.82, + "grad_norm": 0.8694586279318163, + "learning_rate": 1.6509127857277784e-06, + "loss": 0.4481, + "step": 10090 + }, + { + "epoch": 0.82, + "grad_norm": 0.9235188229616803, + "learning_rate": 1.6494641926360722e-06, + "loss": 0.4391, + "step": 10091 + }, + { + "epoch": 0.82, + "grad_norm": 1.035054049678654, + "learning_rate": 1.6480161782290849e-06, + "loss": 0.5072, + "step": 10092 + }, + { + "epoch": 0.82, + "grad_norm": 0.9909914510220467, + "learning_rate": 1.6465687426071741e-06, + "loss": 0.5084, + "step": 10093 + }, + { + "epoch": 0.82, + "grad_norm": 0.9044075400050785, + "learning_rate": 1.6451218858706374e-06, + "loss": 0.4573, + "step": 10094 + }, + { + "epoch": 0.82, + "grad_norm": 0.9300078914155345, + "learning_rate": 1.6436756081197426e-06, + "loss": 0.5399, + "step": 10095 + }, + { + "epoch": 0.82, + "grad_norm": 0.9039526767115124, + "learning_rate": 1.6422299094547156e-06, + "loss": 0.4677, + "step": 10096 + }, + { + "epoch": 0.82, + "grad_norm": 0.9784978778245498, + "learning_rate": 1.6407847899757468e-06, + "loss": 0.4883, + "step": 10097 + }, + { + "epoch": 0.82, + "grad_norm": 1.0152703015299072, + "learning_rate": 1.6393402497829713e-06, + "loss": 0.5826, + "step": 10098 + }, + { + "epoch": 0.82, + "grad_norm": 0.9145163251531081, + "learning_rate": 1.6378962889765048e-06, + "loss": 0.4215, + "step": 10099 + }, + { + "epoch": 0.82, + "grad_norm": 0.8961781171614732, + "learning_rate": 1.6364529076564072e-06, + "loss": 0.4545, + "step": 10100 + }, + { + "epoch": 0.82, + "grad_norm": 0.9460249075728607, + "learning_rate": 1.635010105922704e-06, + "loss": 0.5136, + "step": 10101 + }, + { + "epoch": 0.82, + "grad_norm": 0.9431549507296044, + "learning_rate": 1.633567883875381e-06, + "loss": 0.4878, + "step": 10102 + }, + { + "epoch": 0.82, + "grad_norm": 0.966938572007313, + "learning_rate": 1.6321262416143856e-06, + "loss": 0.555, + "step": 10103 + }, + { + "epoch": 0.82, + "grad_norm": 0.9395240377305119, + "learning_rate": 1.6306851792396138e-06, + "loss": 0.4301, + "step": 10104 + }, + { + "epoch": 0.82, + "grad_norm": 0.9624464563853393, + "learning_rate": 1.62924469685094e-06, + "loss": 0.52, + "step": 10105 + }, + { + "epoch": 0.82, + "grad_norm": 0.8621026113911863, + "learning_rate": 1.6278047945481823e-06, + "loss": 0.4814, + "step": 10106 + }, + { + "epoch": 0.82, + "grad_norm": 0.8405037196377936, + "learning_rate": 1.626365472431125e-06, + "loss": 0.4594, + "step": 10107 + }, + { + "epoch": 0.82, + "grad_norm": 0.9767441682840015, + "learning_rate": 1.6249267305995141e-06, + "loss": 0.496, + "step": 10108 + }, + { + "epoch": 0.82, + "grad_norm": 0.8078450200011851, + "learning_rate": 1.6234885691530543e-06, + "loss": 0.4337, + "step": 10109 + }, + { + "epoch": 0.82, + "grad_norm": 0.9512121505034593, + "learning_rate": 1.6220509881914015e-06, + "loss": 0.489, + "step": 10110 + }, + { + "epoch": 0.82, + "grad_norm": 0.9531062549342812, + "learning_rate": 1.620613987814189e-06, + "loss": 0.5192, + "step": 10111 + }, + { + "epoch": 0.82, + "grad_norm": 0.9742192433408755, + "learning_rate": 1.6191775681209932e-06, + "loss": 0.4612, + "step": 10112 + }, + { + "epoch": 0.82, + "grad_norm": 0.922332170634465, + "learning_rate": 1.6177417292113572e-06, + "loss": 0.4994, + "step": 10113 + }, + { + "epoch": 0.82, + "grad_norm": 0.8630218836631168, + "learning_rate": 1.6163064711847842e-06, + "loss": 0.4881, + "step": 10114 + }, + { + "epoch": 0.82, + "grad_norm": 1.0168371457320893, + "learning_rate": 1.6148717941407387e-06, + "loss": 0.4818, + "step": 10115 + }, + { + "epoch": 0.82, + "grad_norm": 1.0180019521693955, + "learning_rate": 1.613437698178636e-06, + "loss": 0.4638, + "step": 10116 + }, + { + "epoch": 0.82, + "grad_norm": 0.9034584910700775, + "learning_rate": 1.6120041833978662e-06, + "loss": 0.4181, + "step": 10117 + }, + { + "epoch": 0.82, + "grad_norm": 1.026478894740036, + "learning_rate": 1.6105712498977644e-06, + "loss": 0.5086, + "step": 10118 + }, + { + "epoch": 0.82, + "grad_norm": 0.9523384413831647, + "learning_rate": 1.6091388977776334e-06, + "loss": 0.4532, + "step": 10119 + }, + { + "epoch": 0.82, + "grad_norm": 0.8911241632838921, + "learning_rate": 1.607707127136734e-06, + "loss": 0.4532, + "step": 10120 + }, + { + "epoch": 0.82, + "grad_norm": 0.9356572273897498, + "learning_rate": 1.6062759380742898e-06, + "loss": 0.4329, + "step": 10121 + }, + { + "epoch": 0.82, + "grad_norm": 0.9142388310791838, + "learning_rate": 1.6048453306894719e-06, + "loss": 0.4623, + "step": 10122 + }, + { + "epoch": 0.82, + "grad_norm": 0.8648947239779669, + "learning_rate": 1.6034153050814315e-06, + "loss": 0.4531, + "step": 10123 + }, + { + "epoch": 0.82, + "grad_norm": 0.9380175538325769, + "learning_rate": 1.601985861349261e-06, + "loss": 0.5184, + "step": 10124 + }, + { + "epoch": 0.82, + "grad_norm": 0.9619274961680155, + "learning_rate": 1.60055699959202e-06, + "loss": 0.5017, + "step": 10125 + }, + { + "epoch": 0.82, + "grad_norm": 1.0598349113443915, + "learning_rate": 1.599128719908729e-06, + "loss": 0.5305, + "step": 10126 + }, + { + "epoch": 0.82, + "grad_norm": 1.0604631061633714, + "learning_rate": 1.5977010223983692e-06, + "loss": 0.5205, + "step": 10127 + }, + { + "epoch": 0.82, + "grad_norm": 0.932318399473858, + "learning_rate": 1.5962739071598709e-06, + "loss": 0.5136, + "step": 10128 + }, + { + "epoch": 0.82, + "grad_norm": 1.0945709533633685, + "learning_rate": 1.594847374292141e-06, + "loss": 0.4599, + "step": 10129 + }, + { + "epoch": 0.82, + "grad_norm": 0.9370398617218378, + "learning_rate": 1.5934214238940282e-06, + "loss": 0.4661, + "step": 10130 + }, + { + "epoch": 0.82, + "grad_norm": 0.9230957099531405, + "learning_rate": 1.5919960560643589e-06, + "loss": 0.4422, + "step": 10131 + }, + { + "epoch": 0.82, + "grad_norm": 0.9601615361009249, + "learning_rate": 1.5905712709019017e-06, + "loss": 0.5276, + "step": 10132 + }, + { + "epoch": 0.82, + "grad_norm": 0.8953985465198216, + "learning_rate": 1.589147068505398e-06, + "loss": 0.4196, + "step": 10133 + }, + { + "epoch": 0.82, + "grad_norm": 0.8861250406008324, + "learning_rate": 1.5877234489735405e-06, + "loss": 0.4784, + "step": 10134 + }, + { + "epoch": 0.82, + "grad_norm": 0.8392301724786144, + "learning_rate": 1.5863004124049897e-06, + "loss": 0.4503, + "step": 10135 + }, + { + "epoch": 0.82, + "grad_norm": 0.8732452868340083, + "learning_rate": 1.584877958898352e-06, + "loss": 0.476, + "step": 10136 + }, + { + "epoch": 0.82, + "grad_norm": 1.0485661019215877, + "learning_rate": 1.583456088552212e-06, + "loss": 0.4975, + "step": 10137 + }, + { + "epoch": 0.82, + "grad_norm": 0.9059777099372927, + "learning_rate": 1.5820348014650977e-06, + "loss": 0.5057, + "step": 10138 + }, + { + "epoch": 0.82, + "grad_norm": 0.975352945782583, + "learning_rate": 1.5806140977355056e-06, + "loss": 0.5405, + "step": 10139 + }, + { + "epoch": 0.82, + "grad_norm": 0.9568187340392101, + "learning_rate": 1.579193977461887e-06, + "loss": 0.4768, + "step": 10140 + }, + { + "epoch": 0.82, + "grad_norm": 0.8368006214403227, + "learning_rate": 1.5777744407426598e-06, + "loss": 0.4269, + "step": 10141 + }, + { + "epoch": 0.82, + "grad_norm": 0.9323506413597026, + "learning_rate": 1.5763554876761888e-06, + "loss": 0.4785, + "step": 10142 + }, + { + "epoch": 0.82, + "grad_norm": 1.0340089929500242, + "learning_rate": 1.5749371183608154e-06, + "loss": 0.4363, + "step": 10143 + }, + { + "epoch": 0.82, + "grad_norm": 0.8927867883701817, + "learning_rate": 1.573519332894824e-06, + "loss": 0.4133, + "step": 10144 + }, + { + "epoch": 0.82, + "grad_norm": 0.8767356770638337, + "learning_rate": 1.5721021313764684e-06, + "loss": 0.5238, + "step": 10145 + }, + { + "epoch": 0.82, + "grad_norm": 0.9740716993726325, + "learning_rate": 1.57068551390396e-06, + "loss": 0.5013, + "step": 10146 + }, + { + "epoch": 0.82, + "grad_norm": 0.8677354945105021, + "learning_rate": 1.5692694805754716e-06, + "loss": 0.4156, + "step": 10147 + }, + { + "epoch": 0.82, + "grad_norm": 1.031162555983234, + "learning_rate": 1.5678540314891243e-06, + "loss": 0.5158, + "step": 10148 + }, + { + "epoch": 0.82, + "grad_norm": 0.9945018198037907, + "learning_rate": 1.566439166743019e-06, + "loss": 0.5158, + "step": 10149 + }, + { + "epoch": 0.82, + "grad_norm": 1.0414349625809685, + "learning_rate": 1.5650248864351957e-06, + "loss": 0.5062, + "step": 10150 + }, + { + "epoch": 0.83, + "grad_norm": 0.9229593872417223, + "learning_rate": 1.5636111906636665e-06, + "loss": 0.4811, + "step": 10151 + }, + { + "epoch": 0.83, + "grad_norm": 0.9315096534678732, + "learning_rate": 1.5621980795263981e-06, + "loss": 0.4762, + "step": 10152 + }, + { + "epoch": 0.83, + "grad_norm": 0.9962588003282389, + "learning_rate": 1.560785553121319e-06, + "loss": 0.5632, + "step": 10153 + }, + { + "epoch": 0.83, + "grad_norm": 1.001513498736759, + "learning_rate": 1.5593736115463154e-06, + "loss": 0.4853, + "step": 10154 + }, + { + "epoch": 0.83, + "grad_norm": 0.9768955139754243, + "learning_rate": 1.5579622548992356e-06, + "loss": 0.4419, + "step": 10155 + }, + { + "epoch": 0.83, + "grad_norm": 0.8768967199113278, + "learning_rate": 1.5565514832778816e-06, + "loss": 0.4644, + "step": 10156 + }, + { + "epoch": 0.83, + "grad_norm": 0.8558221097469616, + "learning_rate": 1.5551412967800206e-06, + "loss": 0.4429, + "step": 10157 + }, + { + "epoch": 0.83, + "grad_norm": 0.881466729197389, + "learning_rate": 1.5537316955033766e-06, + "loss": 0.432, + "step": 10158 + }, + { + "epoch": 0.83, + "grad_norm": 1.051284596756194, + "learning_rate": 1.5523226795456349e-06, + "loss": 0.5281, + "step": 10159 + }, + { + "epoch": 0.83, + "grad_norm": 0.9155417122416348, + "learning_rate": 1.5509142490044382e-06, + "loss": 0.4846, + "step": 10160 + }, + { + "epoch": 0.83, + "grad_norm": 1.0411739639118625, + "learning_rate": 1.5495064039773921e-06, + "loss": 0.5852, + "step": 10161 + }, + { + "epoch": 0.83, + "grad_norm": 0.8887895632464202, + "learning_rate": 1.5480991445620541e-06, + "loss": 0.5016, + "step": 10162 + }, + { + "epoch": 0.83, + "grad_norm": 0.9360948530755474, + "learning_rate": 1.5466924708559483e-06, + "loss": 0.4885, + "step": 10163 + }, + { + "epoch": 0.83, + "grad_norm": 0.945720706242638, + "learning_rate": 1.5452863829565568e-06, + "loss": 0.5147, + "step": 10164 + }, + { + "epoch": 0.83, + "grad_norm": 0.9668172070220067, + "learning_rate": 1.5438808809613193e-06, + "loss": 0.5072, + "step": 10165 + }, + { + "epoch": 0.83, + "grad_norm": 0.8863857853165308, + "learning_rate": 1.5424759649676357e-06, + "loss": 0.5039, + "step": 10166 + }, + { + "epoch": 0.83, + "grad_norm": 0.9821899836139799, + "learning_rate": 1.5410716350728671e-06, + "loss": 0.5183, + "step": 10167 + }, + { + "epoch": 0.83, + "grad_norm": 0.869172883665788, + "learning_rate": 1.5396678913743324e-06, + "loss": 0.4712, + "step": 10168 + }, + { + "epoch": 0.83, + "grad_norm": 0.8392357457508518, + "learning_rate": 1.5382647339693068e-06, + "loss": 0.4401, + "step": 10169 + }, + { + "epoch": 0.83, + "grad_norm": 0.887219175964945, + "learning_rate": 1.5368621629550295e-06, + "loss": 0.4847, + "step": 10170 + }, + { + "epoch": 0.83, + "grad_norm": 0.8524345364458238, + "learning_rate": 1.535460178428697e-06, + "loss": 0.4431, + "step": 10171 + }, + { + "epoch": 0.83, + "grad_norm": 0.9630643844671861, + "learning_rate": 1.5340587804874662e-06, + "loss": 0.4392, + "step": 10172 + }, + { + "epoch": 0.83, + "grad_norm": 0.9904964289364382, + "learning_rate": 1.5326579692284537e-06, + "loss": 0.4567, + "step": 10173 + }, + { + "epoch": 0.83, + "grad_norm": 0.9282806035313729, + "learning_rate": 1.5312577447487342e-06, + "loss": 0.5784, + "step": 10174 + }, + { + "epoch": 0.83, + "grad_norm": 1.0558418299723358, + "learning_rate": 1.52985810714534e-06, + "loss": 0.4927, + "step": 10175 + }, + { + "epoch": 0.83, + "grad_norm": 1.1063856478240854, + "learning_rate": 1.5284590565152658e-06, + "loss": 0.4835, + "step": 10176 + }, + { + "epoch": 0.83, + "grad_norm": 0.9117316773353915, + "learning_rate": 1.527060592955464e-06, + "loss": 0.5542, + "step": 10177 + }, + { + "epoch": 0.83, + "grad_norm": 0.8713169752157789, + "learning_rate": 1.525662716562849e-06, + "loss": 0.4683, + "step": 10178 + }, + { + "epoch": 0.83, + "grad_norm": 0.9073438743697106, + "learning_rate": 1.5242654274342895e-06, + "loss": 0.4708, + "step": 10179 + }, + { + "epoch": 0.83, + "grad_norm": 0.8966897278534155, + "learning_rate": 1.5228687256666209e-06, + "loss": 0.4603, + "step": 10180 + }, + { + "epoch": 0.83, + "grad_norm": 0.9327987256824378, + "learning_rate": 1.521472611356628e-06, + "loss": 0.5207, + "step": 10181 + }, + { + "epoch": 0.83, + "grad_norm": 0.9177579625453101, + "learning_rate": 1.5200770846010626e-06, + "loss": 0.4408, + "step": 10182 + }, + { + "epoch": 0.83, + "grad_norm": 0.8501020411606455, + "learning_rate": 1.518682145496634e-06, + "loss": 0.4893, + "step": 10183 + }, + { + "epoch": 0.83, + "grad_norm": 0.9254580734831974, + "learning_rate": 1.51728779414001e-06, + "loss": 0.4761, + "step": 10184 + }, + { + "epoch": 0.83, + "grad_norm": 0.8867520160877831, + "learning_rate": 1.515894030627817e-06, + "loss": 0.4256, + "step": 10185 + }, + { + "epoch": 0.83, + "grad_norm": 0.8189185529115245, + "learning_rate": 1.5145008550566454e-06, + "loss": 0.4715, + "step": 10186 + }, + { + "epoch": 0.83, + "grad_norm": 1.0093508277724619, + "learning_rate": 1.5131082675230325e-06, + "loss": 0.5081, + "step": 10187 + }, + { + "epoch": 0.83, + "grad_norm": 0.9998210644300601, + "learning_rate": 1.5117162681234932e-06, + "loss": 0.4606, + "step": 10188 + }, + { + "epoch": 0.83, + "grad_norm": 0.9591548783203413, + "learning_rate": 1.510324856954486e-06, + "loss": 0.4548, + "step": 10189 + }, + { + "epoch": 0.83, + "grad_norm": 0.8416750074760201, + "learning_rate": 1.5089340341124348e-06, + "loss": 0.4433, + "step": 10190 + }, + { + "epoch": 0.83, + "grad_norm": 0.8237614999712479, + "learning_rate": 1.5075437996937248e-06, + "loss": 0.458, + "step": 10191 + }, + { + "epoch": 0.83, + "grad_norm": 0.8947058468112148, + "learning_rate": 1.5061541537946979e-06, + "loss": 0.4556, + "step": 10192 + }, + { + "epoch": 0.83, + "grad_norm": 0.8301616530169087, + "learning_rate": 1.50476509651165e-06, + "loss": 0.4465, + "step": 10193 + }, + { + "epoch": 0.83, + "grad_norm": 0.9188082792789959, + "learning_rate": 1.5033766279408502e-06, + "loss": 0.4811, + "step": 10194 + }, + { + "epoch": 0.83, + "grad_norm": 1.0092638616018894, + "learning_rate": 1.5019887481785112e-06, + "loss": 0.5283, + "step": 10195 + }, + { + "epoch": 0.83, + "grad_norm": 0.9427040337991623, + "learning_rate": 1.500601457320814e-06, + "loss": 0.4537, + "step": 10196 + }, + { + "epoch": 0.83, + "grad_norm": 0.9217148432993036, + "learning_rate": 1.499214755463898e-06, + "loss": 0.4822, + "step": 10197 + }, + { + "epoch": 0.83, + "grad_norm": 1.0911851164229525, + "learning_rate": 1.4978286427038602e-06, + "loss": 0.6013, + "step": 10198 + }, + { + "epoch": 0.83, + "grad_norm": 0.9671555249802585, + "learning_rate": 1.4964431191367524e-06, + "loss": 0.4105, + "step": 10199 + }, + { + "epoch": 0.83, + "grad_norm": 0.8743030992042353, + "learning_rate": 1.4950581848585977e-06, + "loss": 0.4241, + "step": 10200 + }, + { + "epoch": 0.83, + "grad_norm": 0.88143266868202, + "learning_rate": 1.493673839965365e-06, + "loss": 0.4976, + "step": 10201 + }, + { + "epoch": 0.83, + "grad_norm": 1.0492592623073465, + "learning_rate": 1.4922900845529898e-06, + "loss": 0.4855, + "step": 10202 + }, + { + "epoch": 0.83, + "grad_norm": 0.9898697168016961, + "learning_rate": 1.4909069187173652e-06, + "loss": 0.4903, + "step": 10203 + }, + { + "epoch": 0.83, + "grad_norm": 0.8645242451171236, + "learning_rate": 1.4895243425543459e-06, + "loss": 0.4411, + "step": 10204 + }, + { + "epoch": 0.83, + "grad_norm": 0.877275663461517, + "learning_rate": 1.4881423561597374e-06, + "loss": 0.4829, + "step": 10205 + }, + { + "epoch": 0.83, + "grad_norm": 0.8964463282505074, + "learning_rate": 1.4867609596293165e-06, + "loss": 0.4686, + "step": 10206 + }, + { + "epoch": 0.83, + "grad_norm": 1.0755508778310003, + "learning_rate": 1.4853801530588085e-06, + "loss": 0.5166, + "step": 10207 + }, + { + "epoch": 0.83, + "grad_norm": 0.9564994262029834, + "learning_rate": 1.4839999365439039e-06, + "loss": 0.4424, + "step": 10208 + }, + { + "epoch": 0.83, + "grad_norm": 0.9465572798859154, + "learning_rate": 1.4826203101802494e-06, + "loss": 0.4262, + "step": 10209 + }, + { + "epoch": 0.83, + "grad_norm": 0.8966023133247082, + "learning_rate": 1.4812412740634552e-06, + "loss": 0.4824, + "step": 10210 + }, + { + "epoch": 0.83, + "grad_norm": 0.8745563853905437, + "learning_rate": 1.4798628282890793e-06, + "loss": 0.4788, + "step": 10211 + }, + { + "epoch": 0.83, + "grad_norm": 1.0027440224193567, + "learning_rate": 1.4784849729526573e-06, + "loss": 0.5189, + "step": 10212 + }, + { + "epoch": 0.83, + "grad_norm": 0.8894999547435524, + "learning_rate": 1.4771077081496654e-06, + "loss": 0.4733, + "step": 10213 + }, + { + "epoch": 0.83, + "grad_norm": 0.8973434912002435, + "learning_rate": 1.4757310339755494e-06, + "loss": 0.4915, + "step": 10214 + }, + { + "epoch": 0.83, + "grad_norm": 0.9867159710071842, + "learning_rate": 1.4743549505257126e-06, + "loss": 0.4814, + "step": 10215 + }, + { + "epoch": 0.83, + "grad_norm": 0.9946006345156738, + "learning_rate": 1.472979457895517e-06, + "loss": 0.5211, + "step": 10216 + }, + { + "epoch": 0.83, + "grad_norm": 0.9379457171519671, + "learning_rate": 1.4716045561802772e-06, + "loss": 0.4925, + "step": 10217 + }, + { + "epoch": 0.83, + "grad_norm": 0.8979517459470215, + "learning_rate": 1.4702302454752815e-06, + "loss": 0.4941, + "step": 10218 + }, + { + "epoch": 0.83, + "grad_norm": 1.018496605305982, + "learning_rate": 1.4688565258757615e-06, + "loss": 0.4438, + "step": 10219 + }, + { + "epoch": 0.83, + "grad_norm": 0.9519939488958427, + "learning_rate": 1.4674833974769166e-06, + "loss": 0.4612, + "step": 10220 + }, + { + "epoch": 0.83, + "grad_norm": 0.8882005768328872, + "learning_rate": 1.466110860373905e-06, + "loss": 0.428, + "step": 10221 + }, + { + "epoch": 0.83, + "grad_norm": 0.9357707331145971, + "learning_rate": 1.464738914661843e-06, + "loss": 0.5477, + "step": 10222 + }, + { + "epoch": 0.83, + "grad_norm": 1.0508828769107081, + "learning_rate": 1.4633675604357988e-06, + "loss": 0.5031, + "step": 10223 + }, + { + "epoch": 0.83, + "grad_norm": 1.0270950208906446, + "learning_rate": 1.4619967977908157e-06, + "loss": 0.4976, + "step": 10224 + }, + { + "epoch": 0.83, + "grad_norm": 0.987460185788436, + "learning_rate": 1.4606266268218783e-06, + "loss": 0.5491, + "step": 10225 + }, + { + "epoch": 0.83, + "grad_norm": 0.9871122718704699, + "learning_rate": 1.4592570476239421e-06, + "loss": 0.4953, + "step": 10226 + }, + { + "epoch": 0.83, + "grad_norm": 0.9820611895502117, + "learning_rate": 1.4578880602919165e-06, + "loss": 0.5188, + "step": 10227 + }, + { + "epoch": 0.83, + "grad_norm": 0.9576539109395478, + "learning_rate": 1.4565196649206737e-06, + "loss": 0.5063, + "step": 10228 + }, + { + "epoch": 0.83, + "grad_norm": 0.8461603678292988, + "learning_rate": 1.4551518616050352e-06, + "loss": 0.4568, + "step": 10229 + }, + { + "epoch": 0.83, + "grad_norm": 0.9297742997758425, + "learning_rate": 1.453784650439798e-06, + "loss": 0.5646, + "step": 10230 + }, + { + "epoch": 0.83, + "grad_norm": 0.9075481777747914, + "learning_rate": 1.4524180315197023e-06, + "loss": 0.4459, + "step": 10231 + }, + { + "epoch": 0.83, + "grad_norm": 0.9125842697656487, + "learning_rate": 1.451052004939455e-06, + "loss": 0.4746, + "step": 10232 + }, + { + "epoch": 0.83, + "grad_norm": 0.862288669193609, + "learning_rate": 1.4496865707937201e-06, + "loss": 0.4309, + "step": 10233 + }, + { + "epoch": 0.83, + "grad_norm": 0.9072812931637467, + "learning_rate": 1.4483217291771257e-06, + "loss": 0.4439, + "step": 10234 + }, + { + "epoch": 0.83, + "grad_norm": 0.8165505502618535, + "learning_rate": 1.4469574801842445e-06, + "loss": 0.4725, + "step": 10235 + }, + { + "epoch": 0.83, + "grad_norm": 0.869627256256952, + "learning_rate": 1.445593823909628e-06, + "loss": 0.4678, + "step": 10236 + }, + { + "epoch": 0.83, + "grad_norm": 0.9151138992538269, + "learning_rate": 1.444230760447769e-06, + "loss": 0.4892, + "step": 10237 + }, + { + "epoch": 0.83, + "grad_norm": 1.2319204202974274, + "learning_rate": 1.44286828989313e-06, + "loss": 0.4873, + "step": 10238 + }, + { + "epoch": 0.83, + "grad_norm": 0.9497741178270789, + "learning_rate": 1.441506412340129e-06, + "loss": 0.482, + "step": 10239 + }, + { + "epoch": 0.83, + "grad_norm": 0.8981045686296287, + "learning_rate": 1.4401451278831435e-06, + "loss": 0.4408, + "step": 10240 + }, + { + "epoch": 0.83, + "grad_norm": 0.9088762480381548, + "learning_rate": 1.4387844366165038e-06, + "loss": 0.5165, + "step": 10241 + }, + { + "epoch": 0.83, + "grad_norm": 0.8337177904896431, + "learning_rate": 1.4374243386345132e-06, + "loss": 0.4659, + "step": 10242 + }, + { + "epoch": 0.83, + "grad_norm": 1.0151898625090054, + "learning_rate": 1.4360648340314188e-06, + "loss": 0.5337, + "step": 10243 + }, + { + "epoch": 0.83, + "grad_norm": 1.0033732105045368, + "learning_rate": 1.4347059229014359e-06, + "loss": 0.5321, + "step": 10244 + }, + { + "epoch": 0.83, + "grad_norm": 0.9061509243075366, + "learning_rate": 1.433347605338734e-06, + "loss": 0.4419, + "step": 10245 + }, + { + "epoch": 0.83, + "grad_norm": 1.0150086633032052, + "learning_rate": 1.4319898814374477e-06, + "loss": 0.4895, + "step": 10246 + }, + { + "epoch": 0.83, + "grad_norm": 0.9799337144189451, + "learning_rate": 1.4306327512916574e-06, + "loss": 0.5107, + "step": 10247 + }, + { + "epoch": 0.83, + "grad_norm": 0.9820232899095072, + "learning_rate": 1.429276214995421e-06, + "loss": 0.5189, + "step": 10248 + }, + { + "epoch": 0.83, + "grad_norm": 1.1088600675559581, + "learning_rate": 1.4279202726427387e-06, + "loss": 0.5836, + "step": 10249 + }, + { + "epoch": 0.83, + "grad_norm": 1.0022309856003913, + "learning_rate": 1.4265649243275782e-06, + "loss": 0.4901, + "step": 10250 + }, + { + "epoch": 0.83, + "grad_norm": 0.8821887053125698, + "learning_rate": 1.4252101701438636e-06, + "loss": 0.4594, + "step": 10251 + }, + { + "epoch": 0.83, + "grad_norm": 1.0464708818785635, + "learning_rate": 1.4238560101854815e-06, + "loss": 0.5232, + "step": 10252 + }, + { + "epoch": 0.83, + "grad_norm": 0.8163985577430741, + "learning_rate": 1.4225024445462654e-06, + "loss": 0.4529, + "step": 10253 + }, + { + "epoch": 0.83, + "grad_norm": 0.9243481894091906, + "learning_rate": 1.421149473320026e-06, + "loss": 0.4898, + "step": 10254 + }, + { + "epoch": 0.83, + "grad_norm": 0.8764138643875491, + "learning_rate": 1.4197970966005148e-06, + "loss": 0.4318, + "step": 10255 + }, + { + "epoch": 0.83, + "grad_norm": 0.92803108348694, + "learning_rate": 1.418445314481458e-06, + "loss": 0.5064, + "step": 10256 + }, + { + "epoch": 0.83, + "grad_norm": 0.8426680871851131, + "learning_rate": 1.4170941270565275e-06, + "loss": 0.4832, + "step": 10257 + }, + { + "epoch": 0.83, + "grad_norm": 0.9808605193966883, + "learning_rate": 1.4157435344193605e-06, + "loss": 0.4438, + "step": 10258 + }, + { + "epoch": 0.83, + "grad_norm": 0.9243613521277586, + "learning_rate": 1.4143935366635531e-06, + "loss": 0.4952, + "step": 10259 + }, + { + "epoch": 0.83, + "grad_norm": 0.9816508493061582, + "learning_rate": 1.4130441338826595e-06, + "loss": 0.454, + "step": 10260 + }, + { + "epoch": 0.83, + "grad_norm": 0.9007035804833416, + "learning_rate": 1.411695326170187e-06, + "loss": 0.4572, + "step": 10261 + }, + { + "epoch": 0.83, + "grad_norm": 0.9111091371833596, + "learning_rate": 1.4103471136196145e-06, + "loss": 0.4367, + "step": 10262 + }, + { + "epoch": 0.83, + "grad_norm": 0.8489740540487186, + "learning_rate": 1.4089994963243658e-06, + "loss": 0.3963, + "step": 10263 + }, + { + "epoch": 0.83, + "grad_norm": 0.9519934832289844, + "learning_rate": 1.407652474377832e-06, + "loss": 0.5052, + "step": 10264 + }, + { + "epoch": 0.83, + "grad_norm": 0.9604546986315811, + "learning_rate": 1.4063060478733604e-06, + "loss": 0.4527, + "step": 10265 + }, + { + "epoch": 0.83, + "grad_norm": 0.8780119658630006, + "learning_rate": 1.4049602169042598e-06, + "loss": 0.4459, + "step": 10266 + }, + { + "epoch": 0.83, + "grad_norm": 0.8712225686758214, + "learning_rate": 1.4036149815637866e-06, + "loss": 0.4109, + "step": 10267 + }, + { + "epoch": 0.83, + "grad_norm": 0.8832135456375555, + "learning_rate": 1.4022703419451755e-06, + "loss": 0.4775, + "step": 10268 + }, + { + "epoch": 0.83, + "grad_norm": 0.8817635322244807, + "learning_rate": 1.4009262981416016e-06, + "loss": 0.4529, + "step": 10269 + }, + { + "epoch": 0.83, + "grad_norm": 0.978087555119488, + "learning_rate": 1.3995828502462072e-06, + "loss": 0.4929, + "step": 10270 + }, + { + "epoch": 0.83, + "grad_norm": 0.9898534652973374, + "learning_rate": 1.3982399983520934e-06, + "loss": 0.4752, + "step": 10271 + }, + { + "epoch": 0.83, + "grad_norm": 0.8987085477583058, + "learning_rate": 1.39689774255232e-06, + "loss": 0.4729, + "step": 10272 + }, + { + "epoch": 0.83, + "grad_norm": 0.9777419186698381, + "learning_rate": 1.3955560829398974e-06, + "loss": 0.5074, + "step": 10273 + }, + { + "epoch": 0.84, + "grad_norm": 0.9044479183395163, + "learning_rate": 1.3942150196078108e-06, + "loss": 0.4735, + "step": 10274 + }, + { + "epoch": 0.84, + "grad_norm": 0.955084130118838, + "learning_rate": 1.3928745526489874e-06, + "loss": 0.4806, + "step": 10275 + }, + { + "epoch": 0.84, + "grad_norm": 1.0172347146907044, + "learning_rate": 1.3915346821563235e-06, + "loss": 0.5519, + "step": 10276 + }, + { + "epoch": 0.84, + "grad_norm": 1.084576363094847, + "learning_rate": 1.3901954082226698e-06, + "loss": 0.4468, + "step": 10277 + }, + { + "epoch": 0.84, + "grad_norm": 0.952157339824633, + "learning_rate": 1.3888567309408396e-06, + "loss": 0.5082, + "step": 10278 + }, + { + "epoch": 0.84, + "grad_norm": 0.8740904269122499, + "learning_rate": 1.3875186504035965e-06, + "loss": 0.5169, + "step": 10279 + }, + { + "epoch": 0.84, + "grad_norm": 0.9403765835118713, + "learning_rate": 1.386181166703675e-06, + "loss": 0.4676, + "step": 10280 + }, + { + "epoch": 0.84, + "grad_norm": 0.8786768842079513, + "learning_rate": 1.384844279933757e-06, + "loss": 0.4222, + "step": 10281 + }, + { + "epoch": 0.84, + "grad_norm": 0.9925467422216124, + "learning_rate": 1.3835079901864878e-06, + "loss": 0.5285, + "step": 10282 + }, + { + "epoch": 0.84, + "grad_norm": 0.9809311190484911, + "learning_rate": 1.3821722975544727e-06, + "loss": 0.5496, + "step": 10283 + }, + { + "epoch": 0.84, + "grad_norm": 0.9394321489302069, + "learning_rate": 1.3808372021302752e-06, + "loss": 0.4923, + "step": 10284 + }, + { + "epoch": 0.84, + "grad_norm": 0.9952986041724703, + "learning_rate": 1.37950270400641e-06, + "loss": 0.493, + "step": 10285 + }, + { + "epoch": 0.84, + "grad_norm": 0.934921142448034, + "learning_rate": 1.378168803275366e-06, + "loss": 0.4647, + "step": 10286 + }, + { + "epoch": 0.84, + "grad_norm": 0.9848679192787415, + "learning_rate": 1.376835500029573e-06, + "loss": 0.4982, + "step": 10287 + }, + { + "epoch": 0.84, + "grad_norm": 0.9792804880817738, + "learning_rate": 1.375502794361432e-06, + "loss": 0.5542, + "step": 10288 + }, + { + "epoch": 0.84, + "grad_norm": 0.9281471341149863, + "learning_rate": 1.3741706863632976e-06, + "loss": 0.5077, + "step": 10289 + }, + { + "epoch": 0.84, + "grad_norm": 0.9541313032337133, + "learning_rate": 1.372839176127485e-06, + "loss": 0.4716, + "step": 10290 + }, + { + "epoch": 0.84, + "grad_norm": 0.8903951337293582, + "learning_rate": 1.3715082637462607e-06, + "loss": 0.514, + "step": 10291 + }, + { + "epoch": 0.84, + "grad_norm": 0.926295498225597, + "learning_rate": 1.370177949311866e-06, + "loss": 0.4976, + "step": 10292 + }, + { + "epoch": 0.84, + "grad_norm": 0.8380071746620592, + "learning_rate": 1.368848232916481e-06, + "loss": 0.4658, + "step": 10293 + }, + { + "epoch": 0.84, + "grad_norm": 0.9757498423299027, + "learning_rate": 1.3675191146522593e-06, + "loss": 0.5021, + "step": 10294 + }, + { + "epoch": 0.84, + "grad_norm": 0.8970210924053779, + "learning_rate": 1.366190594611304e-06, + "loss": 0.4657, + "step": 10295 + }, + { + "epoch": 0.84, + "grad_norm": 0.8435500538474133, + "learning_rate": 1.3648626728856862e-06, + "loss": 0.4304, + "step": 10296 + }, + { + "epoch": 0.84, + "grad_norm": 0.8889349239109522, + "learning_rate": 1.3635353495674208e-06, + "loss": 0.5115, + "step": 10297 + }, + { + "epoch": 0.84, + "grad_norm": 0.8356644523131306, + "learning_rate": 1.3622086247484989e-06, + "loss": 0.4342, + "step": 10298 + }, + { + "epoch": 0.84, + "grad_norm": 0.8973942966517006, + "learning_rate": 1.3608824985208569e-06, + "loss": 0.4747, + "step": 10299 + }, + { + "epoch": 0.84, + "grad_norm": 0.9808056088548277, + "learning_rate": 1.3595569709763934e-06, + "loss": 0.4904, + "step": 10300 + }, + { + "epoch": 0.84, + "grad_norm": 0.9684079981307847, + "learning_rate": 1.3582320422069684e-06, + "loss": 0.4608, + "step": 10301 + }, + { + "epoch": 0.84, + "grad_norm": 0.9465905249500525, + "learning_rate": 1.3569077123043973e-06, + "loss": 0.4564, + "step": 10302 + }, + { + "epoch": 0.84, + "grad_norm": 0.9502539835566461, + "learning_rate": 1.3555839813604555e-06, + "loss": 0.5308, + "step": 10303 + }, + { + "epoch": 0.84, + "grad_norm": 0.8392779707666963, + "learning_rate": 1.3542608494668785e-06, + "loss": 0.4489, + "step": 10304 + }, + { + "epoch": 0.84, + "grad_norm": 0.9518759760255114, + "learning_rate": 1.3529383167153543e-06, + "loss": 0.5635, + "step": 10305 + }, + { + "epoch": 0.84, + "grad_norm": 0.9503279328972887, + "learning_rate": 1.3516163831975337e-06, + "loss": 0.446, + "step": 10306 + }, + { + "epoch": 0.84, + "grad_norm": 1.0188594497418193, + "learning_rate": 1.350295049005027e-06, + "loss": 0.5087, + "step": 10307 + }, + { + "epoch": 0.84, + "grad_norm": 0.9222776847545859, + "learning_rate": 1.348974314229401e-06, + "loss": 0.4797, + "step": 10308 + }, + { + "epoch": 0.84, + "grad_norm": 0.8973687429402823, + "learning_rate": 1.3476541789621822e-06, + "loss": 0.4761, + "step": 10309 + }, + { + "epoch": 0.84, + "grad_norm": 0.924539907796562, + "learning_rate": 1.3463346432948555e-06, + "loss": 0.4248, + "step": 10310 + }, + { + "epoch": 0.84, + "grad_norm": 0.913652622748875, + "learning_rate": 1.3450157073188608e-06, + "loss": 0.4537, + "step": 10311 + }, + { + "epoch": 0.84, + "grad_norm": 0.9662074046160266, + "learning_rate": 1.3436973711256006e-06, + "loss": 0.4851, + "step": 10312 + }, + { + "epoch": 0.84, + "grad_norm": 0.8981108949974423, + "learning_rate": 1.3423796348064343e-06, + "loss": 0.4598, + "step": 10313 + }, + { + "epoch": 0.84, + "grad_norm": 0.9353563433744401, + "learning_rate": 1.34106249845268e-06, + "loss": 0.5112, + "step": 10314 + }, + { + "epoch": 0.84, + "grad_norm": 0.9043064502707608, + "learning_rate": 1.339745962155613e-06, + "loss": 0.4663, + "step": 10315 + }, + { + "epoch": 0.84, + "grad_norm": 1.0139357510706588, + "learning_rate": 1.338430026006471e-06, + "loss": 0.4875, + "step": 10316 + }, + { + "epoch": 0.84, + "grad_norm": 0.891505665915497, + "learning_rate": 1.337114690096446e-06, + "loss": 0.4168, + "step": 10317 + }, + { + "epoch": 0.84, + "grad_norm": 1.3950374138336772, + "learning_rate": 1.3357999545166878e-06, + "loss": 0.5183, + "step": 10318 + }, + { + "epoch": 0.84, + "grad_norm": 0.9195164034412875, + "learning_rate": 1.3344858193583076e-06, + "loss": 0.4681, + "step": 10319 + }, + { + "epoch": 0.84, + "grad_norm": 0.9308858780333419, + "learning_rate": 1.333172284712373e-06, + "loss": 0.453, + "step": 10320 + }, + { + "epoch": 0.84, + "grad_norm": 1.0416797789084544, + "learning_rate": 1.3318593506699129e-06, + "loss": 0.5003, + "step": 10321 + }, + { + "epoch": 0.84, + "grad_norm": 1.041236641456775, + "learning_rate": 1.3305470173219104e-06, + "loss": 0.5134, + "step": 10322 + }, + { + "epoch": 0.84, + "grad_norm": 0.8013652571175578, + "learning_rate": 1.3292352847593115e-06, + "loss": 0.4064, + "step": 10323 + }, + { + "epoch": 0.84, + "grad_norm": 0.9611643454044503, + "learning_rate": 1.3279241530730147e-06, + "loss": 0.4909, + "step": 10324 + }, + { + "epoch": 0.84, + "grad_norm": 0.8897865375319298, + "learning_rate": 1.3266136223538827e-06, + "loss": 0.4491, + "step": 10325 + }, + { + "epoch": 0.84, + "grad_norm": 0.9901942315736499, + "learning_rate": 1.325303692692732e-06, + "loss": 0.4868, + "step": 10326 + }, + { + "epoch": 0.84, + "grad_norm": 0.9971852320774585, + "learning_rate": 1.323994364180342e-06, + "loss": 0.5157, + "step": 10327 + }, + { + "epoch": 0.84, + "grad_norm": 0.8725406497712802, + "learning_rate": 1.322685636907447e-06, + "loss": 0.4606, + "step": 10328 + }, + { + "epoch": 0.84, + "grad_norm": 1.0280428865888989, + "learning_rate": 1.321377510964742e-06, + "loss": 0.526, + "step": 10329 + }, + { + "epoch": 0.84, + "grad_norm": 0.9565540414208218, + "learning_rate": 1.3200699864428757e-06, + "loss": 0.5046, + "step": 10330 + }, + { + "epoch": 0.84, + "grad_norm": 1.0174627243449055, + "learning_rate": 1.31876306343246e-06, + "loss": 0.5204, + "step": 10331 + }, + { + "epoch": 0.84, + "grad_norm": 0.9857465322789754, + "learning_rate": 1.3174567420240647e-06, + "loss": 0.4811, + "step": 10332 + }, + { + "epoch": 0.84, + "grad_norm": 0.938586286054605, + "learning_rate": 1.3161510223082152e-06, + "loss": 0.4589, + "step": 10333 + }, + { + "epoch": 0.84, + "grad_norm": 1.0344397790703828, + "learning_rate": 1.314845904375397e-06, + "loss": 0.5439, + "step": 10334 + }, + { + "epoch": 0.84, + "grad_norm": 0.9056043167338509, + "learning_rate": 1.3135413883160564e-06, + "loss": 0.4655, + "step": 10335 + }, + { + "epoch": 0.84, + "grad_norm": 0.9769940959605952, + "learning_rate": 1.3122374742205878e-06, + "loss": 0.4396, + "step": 10336 + }, + { + "epoch": 0.84, + "grad_norm": 0.9450823830694117, + "learning_rate": 1.3109341621793614e-06, + "loss": 0.4665, + "step": 10337 + }, + { + "epoch": 0.84, + "grad_norm": 0.9852144192111556, + "learning_rate": 1.309631452282688e-06, + "loss": 0.5602, + "step": 10338 + }, + { + "epoch": 0.84, + "grad_norm": 0.9717639282246133, + "learning_rate": 1.3083293446208467e-06, + "loss": 0.452, + "step": 10339 + }, + { + "epoch": 0.84, + "grad_norm": 0.9232201932196542, + "learning_rate": 1.3070278392840718e-06, + "loss": 0.5083, + "step": 10340 + }, + { + "epoch": 0.84, + "grad_norm": 0.9227017022484458, + "learning_rate": 1.305726936362559e-06, + "loss": 0.4331, + "step": 10341 + }, + { + "epoch": 0.84, + "grad_norm": 1.0087015750858035, + "learning_rate": 1.3044266359464542e-06, + "loss": 0.5254, + "step": 10342 + }, + { + "epoch": 0.84, + "grad_norm": 0.9414831130487048, + "learning_rate": 1.3031269381258737e-06, + "loss": 0.4517, + "step": 10343 + }, + { + "epoch": 0.84, + "grad_norm": 0.9980339567237606, + "learning_rate": 1.3018278429908815e-06, + "loss": 0.5105, + "step": 10344 + }, + { + "epoch": 0.84, + "grad_norm": 0.8688966377834313, + "learning_rate": 1.3005293506315042e-06, + "loss": 0.4611, + "step": 10345 + }, + { + "epoch": 0.84, + "grad_norm": 0.945465953881035, + "learning_rate": 1.2992314611377255e-06, + "loss": 0.5041, + "step": 10346 + }, + { + "epoch": 0.84, + "grad_norm": 0.9456803456419439, + "learning_rate": 1.2979341745994922e-06, + "loss": 0.4881, + "step": 10347 + }, + { + "epoch": 0.84, + "grad_norm": 0.9546926099393971, + "learning_rate": 1.296637491106697e-06, + "loss": 0.4886, + "step": 10348 + }, + { + "epoch": 0.84, + "grad_norm": 1.0014373713572333, + "learning_rate": 1.295341410749208e-06, + "loss": 0.5031, + "step": 10349 + }, + { + "epoch": 0.84, + "grad_norm": 0.9578354382404265, + "learning_rate": 1.2940459336168366e-06, + "loss": 0.4757, + "step": 10350 + }, + { + "epoch": 0.84, + "grad_norm": 0.9225824045489499, + "learning_rate": 1.2927510597993598e-06, + "loss": 0.4755, + "step": 10351 + }, + { + "epoch": 0.84, + "grad_norm": 1.0158769338187628, + "learning_rate": 1.2914567893865103e-06, + "loss": 0.5099, + "step": 10352 + }, + { + "epoch": 0.84, + "grad_norm": 0.9259783971877905, + "learning_rate": 1.2901631224679844e-06, + "loss": 0.4483, + "step": 10353 + }, + { + "epoch": 0.84, + "grad_norm": 1.0430204010462292, + "learning_rate": 1.2888700591334225e-06, + "loss": 0.5591, + "step": 10354 + }, + { + "epoch": 0.84, + "grad_norm": 0.9358738496499883, + "learning_rate": 1.2875775994724448e-06, + "loss": 0.4656, + "step": 10355 + }, + { + "epoch": 0.84, + "grad_norm": 0.999405660521046, + "learning_rate": 1.2862857435746078e-06, + "loss": 0.5058, + "step": 10356 + }, + { + "epoch": 0.84, + "grad_norm": 1.0669907720051155, + "learning_rate": 1.284994491529441e-06, + "loss": 0.5028, + "step": 10357 + }, + { + "epoch": 0.84, + "grad_norm": 1.0632046162659259, + "learning_rate": 1.283703843426425e-06, + "loss": 0.4915, + "step": 10358 + }, + { + "epoch": 0.84, + "grad_norm": 0.9965790286536484, + "learning_rate": 1.2824137993550033e-06, + "loss": 0.5406, + "step": 10359 + }, + { + "epoch": 0.84, + "grad_norm": 0.9001904962554457, + "learning_rate": 1.2811243594045697e-06, + "loss": 0.5098, + "step": 10360 + }, + { + "epoch": 0.84, + "grad_norm": 1.0280818029750178, + "learning_rate": 1.2798355236644876e-06, + "loss": 0.5532, + "step": 10361 + }, + { + "epoch": 0.84, + "grad_norm": 0.931773872431827, + "learning_rate": 1.278547292224067e-06, + "loss": 0.4716, + "step": 10362 + }, + { + "epoch": 0.84, + "grad_norm": 0.8939526381099582, + "learning_rate": 1.2772596651725833e-06, + "loss": 0.4572, + "step": 10363 + }, + { + "epoch": 0.84, + "grad_norm": 0.9181306232702845, + "learning_rate": 1.275972642599268e-06, + "loss": 0.4845, + "step": 10364 + }, + { + "epoch": 0.84, + "grad_norm": 0.9712261562760381, + "learning_rate": 1.274686224593311e-06, + "loss": 0.4899, + "step": 10365 + }, + { + "epoch": 0.84, + "grad_norm": 1.011891784830918, + "learning_rate": 1.273400411243857e-06, + "loss": 0.4992, + "step": 10366 + }, + { + "epoch": 0.84, + "grad_norm": 0.9281749249363781, + "learning_rate": 1.2721152026400174e-06, + "loss": 0.4891, + "step": 10367 + }, + { + "epoch": 0.84, + "grad_norm": 1.0111059017453472, + "learning_rate": 1.2708305988708502e-06, + "loss": 0.4853, + "step": 10368 + }, + { + "epoch": 0.84, + "grad_norm": 1.0195681428784356, + "learning_rate": 1.2695466000253798e-06, + "loss": 0.512, + "step": 10369 + }, + { + "epoch": 0.84, + "grad_norm": 0.9951959453513336, + "learning_rate": 1.268263206192587e-06, + "loss": 0.5128, + "step": 10370 + }, + { + "epoch": 0.84, + "grad_norm": 1.0410562492605302, + "learning_rate": 1.2669804174614097e-06, + "loss": 0.5366, + "step": 10371 + }, + { + "epoch": 0.84, + "grad_norm": 0.8844639541632079, + "learning_rate": 1.2656982339207401e-06, + "loss": 0.4934, + "step": 10372 + }, + { + "epoch": 0.84, + "grad_norm": 0.9036479544451441, + "learning_rate": 1.2644166556594396e-06, + "loss": 0.5045, + "step": 10373 + }, + { + "epoch": 0.84, + "grad_norm": 1.0051379719537108, + "learning_rate": 1.2631356827663144e-06, + "loss": 0.5172, + "step": 10374 + }, + { + "epoch": 0.84, + "grad_norm": 0.9048632205778424, + "learning_rate": 1.2618553153301361e-06, + "loss": 0.421, + "step": 10375 + }, + { + "epoch": 0.84, + "grad_norm": 1.0617817877623916, + "learning_rate": 1.2605755534396347e-06, + "loss": 0.542, + "step": 10376 + }, + { + "epoch": 0.84, + "grad_norm": 0.8293522861775061, + "learning_rate": 1.259296397183497e-06, + "loss": 0.427, + "step": 10377 + }, + { + "epoch": 0.84, + "grad_norm": 1.0283502189905431, + "learning_rate": 1.2580178466503623e-06, + "loss": 0.5244, + "step": 10378 + }, + { + "epoch": 0.84, + "grad_norm": 0.8610204224832774, + "learning_rate": 1.2567399019288406e-06, + "loss": 0.5077, + "step": 10379 + }, + { + "epoch": 0.84, + "grad_norm": 0.9043700711845449, + "learning_rate": 1.2554625631074846e-06, + "loss": 0.4689, + "step": 10380 + }, + { + "epoch": 0.84, + "grad_norm": 0.9362311134421188, + "learning_rate": 1.2541858302748199e-06, + "loss": 0.4184, + "step": 10381 + }, + { + "epoch": 0.84, + "grad_norm": 0.9579535314078547, + "learning_rate": 1.2529097035193183e-06, + "loss": 0.5297, + "step": 10382 + }, + { + "epoch": 0.84, + "grad_norm": 1.0046460874542875, + "learning_rate": 1.2516341829294155e-06, + "loss": 0.5359, + "step": 10383 + }, + { + "epoch": 0.84, + "grad_norm": 0.859887174800114, + "learning_rate": 1.2503592685935039e-06, + "loss": 0.3874, + "step": 10384 + }, + { + "epoch": 0.84, + "grad_norm": 1.1239221563293567, + "learning_rate": 1.2490849605999355e-06, + "loss": 0.5216, + "step": 10385 + }, + { + "epoch": 0.84, + "grad_norm": 0.8692722620485404, + "learning_rate": 1.2478112590370139e-06, + "loss": 0.3922, + "step": 10386 + }, + { + "epoch": 0.84, + "grad_norm": 0.9322363876829223, + "learning_rate": 1.246538163993013e-06, + "loss": 0.4898, + "step": 10387 + }, + { + "epoch": 0.84, + "grad_norm": 0.944415903299271, + "learning_rate": 1.2452656755561509e-06, + "loss": 0.4815, + "step": 10388 + }, + { + "epoch": 0.84, + "grad_norm": 0.914130294902286, + "learning_rate": 1.2439937938146118e-06, + "loss": 0.4974, + "step": 10389 + }, + { + "epoch": 0.84, + "grad_norm": 0.8514860899931579, + "learning_rate": 1.2427225188565362e-06, + "loss": 0.4952, + "step": 10390 + }, + { + "epoch": 0.84, + "grad_norm": 0.9783298964863831, + "learning_rate": 1.2414518507700247e-06, + "loss": 0.5409, + "step": 10391 + }, + { + "epoch": 0.84, + "grad_norm": 0.9345235574956751, + "learning_rate": 1.2401817896431268e-06, + "loss": 0.5215, + "step": 10392 + }, + { + "epoch": 0.84, + "grad_norm": 0.9557235832363349, + "learning_rate": 1.2389123355638655e-06, + "loss": 0.4642, + "step": 10393 + }, + { + "epoch": 0.84, + "grad_norm": 0.9882934130814738, + "learning_rate": 1.237643488620206e-06, + "loss": 0.4993, + "step": 10394 + }, + { + "epoch": 0.84, + "grad_norm": 0.9229014262843889, + "learning_rate": 1.2363752489000802e-06, + "loss": 0.5369, + "step": 10395 + }, + { + "epoch": 0.84, + "grad_norm": 0.8677797825145028, + "learning_rate": 1.2351076164913767e-06, + "loss": 0.5179, + "step": 10396 + }, + { + "epoch": 0.85, + "grad_norm": 0.9079183428436072, + "learning_rate": 1.2338405914819428e-06, + "loss": 0.4528, + "step": 10397 + }, + { + "epoch": 0.85, + "grad_norm": 1.0616692072686915, + "learning_rate": 1.2325741739595753e-06, + "loss": 0.4966, + "step": 10398 + }, + { + "epoch": 0.85, + "grad_norm": 1.017928690459014, + "learning_rate": 1.2313083640120461e-06, + "loss": 0.4986, + "step": 10399 + }, + { + "epoch": 0.85, + "grad_norm": 1.0044277617256578, + "learning_rate": 1.2300431617270669e-06, + "loss": 0.5175, + "step": 10400 + }, + { + "epoch": 0.85, + "grad_norm": 0.9618426399337734, + "learning_rate": 1.228778567192318e-06, + "loss": 0.5235, + "step": 10401 + }, + { + "epoch": 0.85, + "grad_norm": 0.9374152649408706, + "learning_rate": 1.2275145804954347e-06, + "loss": 0.4962, + "step": 10402 + }, + { + "epoch": 0.85, + "grad_norm": 0.8818312190067937, + "learning_rate": 1.2262512017240113e-06, + "loss": 0.4294, + "step": 10403 + }, + { + "epoch": 0.85, + "grad_norm": 0.8987846617538389, + "learning_rate": 1.2249884309655935e-06, + "loss": 0.4726, + "step": 10404 + }, + { + "epoch": 0.85, + "grad_norm": 0.8476644702669166, + "learning_rate": 1.2237262683076979e-06, + "loss": 0.4813, + "step": 10405 + }, + { + "epoch": 0.85, + "grad_norm": 0.9203532221043269, + "learning_rate": 1.2224647138377854e-06, + "loss": 0.508, + "step": 10406 + }, + { + "epoch": 0.85, + "grad_norm": 0.9444794075657026, + "learning_rate": 1.221203767643282e-06, + "loss": 0.494, + "step": 10407 + }, + { + "epoch": 0.85, + "grad_norm": 0.9873844191944869, + "learning_rate": 1.219943429811571e-06, + "loss": 0.4555, + "step": 10408 + }, + { + "epoch": 0.85, + "grad_norm": 0.9150066558016045, + "learning_rate": 1.2186837004299957e-06, + "loss": 0.4924, + "step": 10409 + }, + { + "epoch": 0.85, + "grad_norm": 0.8881440228122434, + "learning_rate": 1.2174245795858454e-06, + "loss": 0.4637, + "step": 10410 + }, + { + "epoch": 0.85, + "grad_norm": 0.9006863395728119, + "learning_rate": 1.2161660673663855e-06, + "loss": 0.4692, + "step": 10411 + }, + { + "epoch": 0.85, + "grad_norm": 0.9597572812479213, + "learning_rate": 1.2149081638588246e-06, + "loss": 0.4673, + "step": 10412 + }, + { + "epoch": 0.85, + "grad_norm": 0.8682495435123212, + "learning_rate": 1.2136508691503357e-06, + "loss": 0.4435, + "step": 10413 + }, + { + "epoch": 0.85, + "grad_norm": 0.9486120778717507, + "learning_rate": 1.2123941833280472e-06, + "loss": 0.5288, + "step": 10414 + }, + { + "epoch": 0.85, + "grad_norm": 0.9196492823660393, + "learning_rate": 1.2111381064790506e-06, + "loss": 0.5112, + "step": 10415 + }, + { + "epoch": 0.85, + "grad_norm": 0.8973238884603776, + "learning_rate": 1.2098826386903829e-06, + "loss": 0.4586, + "step": 10416 + }, + { + "epoch": 0.85, + "grad_norm": 0.9190498993137616, + "learning_rate": 1.2086277800490554e-06, + "loss": 0.4639, + "step": 10417 + }, + { + "epoch": 0.85, + "grad_norm": 0.8949475889209494, + "learning_rate": 1.207373530642022e-06, + "loss": 0.4672, + "step": 10418 + }, + { + "epoch": 0.85, + "grad_norm": 0.9286970525038918, + "learning_rate": 1.2061198905562043e-06, + "loss": 0.489, + "step": 10419 + }, + { + "epoch": 0.85, + "grad_norm": 1.0214400088266034, + "learning_rate": 1.2048668598784785e-06, + "loss": 0.5502, + "step": 10420 + }, + { + "epoch": 0.85, + "grad_norm": 0.9371816622979336, + "learning_rate": 1.2036144386956805e-06, + "loss": 0.5025, + "step": 10421 + }, + { + "epoch": 0.85, + "grad_norm": 0.8994263459924541, + "learning_rate": 1.2023626270945943e-06, + "loss": 0.4472, + "step": 10422 + }, + { + "epoch": 0.85, + "grad_norm": 1.0525060037546006, + "learning_rate": 1.2011114251619792e-06, + "loss": 0.5032, + "step": 10423 + }, + { + "epoch": 0.85, + "grad_norm": 1.1654049372519806, + "learning_rate": 1.1998608329845362e-06, + "loss": 0.5165, + "step": 10424 + }, + { + "epoch": 0.85, + "grad_norm": 1.0218906819426945, + "learning_rate": 1.1986108506489314e-06, + "loss": 0.5377, + "step": 10425 + }, + { + "epoch": 0.85, + "grad_norm": 0.9387363153022205, + "learning_rate": 1.1973614782417874e-06, + "loss": 0.4903, + "step": 10426 + }, + { + "epoch": 0.85, + "grad_norm": 0.8818776810322095, + "learning_rate": 1.1961127158496866e-06, + "loss": 0.4606, + "step": 10427 + }, + { + "epoch": 0.85, + "grad_norm": 0.9745946305449849, + "learning_rate": 1.1948645635591627e-06, + "loss": 0.4892, + "step": 10428 + }, + { + "epoch": 0.85, + "grad_norm": 1.1204945414201928, + "learning_rate": 1.1936170214567177e-06, + "loss": 0.5149, + "step": 10429 + }, + { + "epoch": 0.85, + "grad_norm": 1.000795057220358, + "learning_rate": 1.1923700896288004e-06, + "loss": 0.5303, + "step": 10430 + }, + { + "epoch": 0.85, + "grad_norm": 0.9685116179114811, + "learning_rate": 1.1911237681618226e-06, + "loss": 0.4936, + "step": 10431 + }, + { + "epoch": 0.85, + "grad_norm": 0.9088048618312514, + "learning_rate": 1.1898780571421554e-06, + "loss": 0.4575, + "step": 10432 + }, + { + "epoch": 0.85, + "grad_norm": 0.8908139699595279, + "learning_rate": 1.1886329566561262e-06, + "loss": 0.4623, + "step": 10433 + }, + { + "epoch": 0.85, + "grad_norm": 0.9185747240565809, + "learning_rate": 1.1873884667900125e-06, + "loss": 0.4527, + "step": 10434 + }, + { + "epoch": 0.85, + "grad_norm": 1.0019140346220445, + "learning_rate": 1.186144587630066e-06, + "loss": 0.5332, + "step": 10435 + }, + { + "epoch": 0.85, + "grad_norm": 0.9822650328652477, + "learning_rate": 1.184901319262479e-06, + "loss": 0.5219, + "step": 10436 + }, + { + "epoch": 0.85, + "grad_norm": 0.9366252276807113, + "learning_rate": 1.1836586617734114e-06, + "loss": 0.4705, + "step": 10437 + }, + { + "epoch": 0.85, + "grad_norm": 0.9925375074413082, + "learning_rate": 1.1824166152489791e-06, + "loss": 0.493, + "step": 10438 + }, + { + "epoch": 0.85, + "grad_norm": 0.9522970269239825, + "learning_rate": 1.181175179775257e-06, + "loss": 0.4728, + "step": 10439 + }, + { + "epoch": 0.85, + "grad_norm": 0.9604610233217911, + "learning_rate": 1.179934355438267e-06, + "loss": 0.4928, + "step": 10440 + }, + { + "epoch": 0.85, + "grad_norm": 0.8529289245362368, + "learning_rate": 1.1786941423240072e-06, + "loss": 0.4278, + "step": 10441 + }, + { + "epoch": 0.85, + "grad_norm": 1.0000596664112409, + "learning_rate": 1.1774545405184178e-06, + "loss": 0.4782, + "step": 10442 + }, + { + "epoch": 0.85, + "grad_norm": 0.9101942517684707, + "learning_rate": 1.1762155501074024e-06, + "loss": 0.5051, + "step": 10443 + }, + { + "epoch": 0.85, + "grad_norm": 0.9253558360388284, + "learning_rate": 1.1749771711768233e-06, + "loss": 0.492, + "step": 10444 + }, + { + "epoch": 0.85, + "grad_norm": 0.919619505554477, + "learning_rate": 1.1737394038124994e-06, + "loss": 0.4892, + "step": 10445 + }, + { + "epoch": 0.85, + "grad_norm": 0.9461661472433754, + "learning_rate": 1.1725022481002024e-06, + "loss": 0.4437, + "step": 10446 + }, + { + "epoch": 0.85, + "grad_norm": 0.9388303265636923, + "learning_rate": 1.1712657041256737e-06, + "loss": 0.4766, + "step": 10447 + }, + { + "epoch": 0.85, + "grad_norm": 0.983735419601325, + "learning_rate": 1.170029771974599e-06, + "loss": 0.5595, + "step": 10448 + }, + { + "epoch": 0.85, + "grad_norm": 0.9821181424572664, + "learning_rate": 1.1687944517326289e-06, + "loss": 0.4663, + "step": 10449 + }, + { + "epoch": 0.85, + "grad_norm": 0.9464613846937909, + "learning_rate": 1.1675597434853692e-06, + "loss": 0.4726, + "step": 10450 + }, + { + "epoch": 0.85, + "grad_norm": 0.9347967181397874, + "learning_rate": 1.1663256473183858e-06, + "loss": 0.4676, + "step": 10451 + }, + { + "epoch": 0.85, + "grad_norm": 1.1063597249501547, + "learning_rate": 1.1650921633171985e-06, + "loss": 0.5331, + "step": 10452 + }, + { + "epoch": 0.85, + "grad_norm": 0.8829825883905474, + "learning_rate": 1.1638592915672908e-06, + "loss": 0.4304, + "step": 10453 + }, + { + "epoch": 0.85, + "grad_norm": 1.0064095451786466, + "learning_rate": 1.1626270321540945e-06, + "loss": 0.497, + "step": 10454 + }, + { + "epoch": 0.85, + "grad_norm": 0.9513480613132469, + "learning_rate": 1.1613953851630055e-06, + "loss": 0.483, + "step": 10455 + }, + { + "epoch": 0.85, + "grad_norm": 0.9381733547839735, + "learning_rate": 1.160164350679377e-06, + "loss": 0.4916, + "step": 10456 + }, + { + "epoch": 0.85, + "grad_norm": 0.9758864109716439, + "learning_rate": 1.158933928788518e-06, + "loss": 0.4989, + "step": 10457 + }, + { + "epoch": 0.85, + "grad_norm": 0.9612788161193682, + "learning_rate": 1.1577041195756954e-06, + "loss": 0.4806, + "step": 10458 + }, + { + "epoch": 0.85, + "grad_norm": 0.8620183622668911, + "learning_rate": 1.1564749231261364e-06, + "loss": 0.4454, + "step": 10459 + }, + { + "epoch": 0.85, + "grad_norm": 0.984063658349551, + "learning_rate": 1.155246339525019e-06, + "loss": 0.546, + "step": 10460 + }, + { + "epoch": 0.85, + "grad_norm": 0.9123286243679032, + "learning_rate": 1.1540183688574847e-06, + "loss": 0.4465, + "step": 10461 + }, + { + "epoch": 0.85, + "grad_norm": 0.9541839139858166, + "learning_rate": 1.1527910112086315e-06, + "loss": 0.4698, + "step": 10462 + }, + { + "epoch": 0.85, + "grad_norm": 1.0870922549213533, + "learning_rate": 1.151564266663514e-06, + "loss": 0.5383, + "step": 10463 + }, + { + "epoch": 0.85, + "grad_norm": 0.9692547053719215, + "learning_rate": 1.150338135307144e-06, + "loss": 0.4782, + "step": 10464 + }, + { + "epoch": 0.85, + "grad_norm": 0.8367552594366106, + "learning_rate": 1.1491126172244915e-06, + "loss": 0.4565, + "step": 10465 + }, + { + "epoch": 0.85, + "grad_norm": 0.9307612165354187, + "learning_rate": 1.147887712500486e-06, + "loss": 0.4579, + "step": 10466 + }, + { + "epoch": 0.85, + "grad_norm": 0.8715305493419948, + "learning_rate": 1.1466634212200079e-06, + "loss": 0.4211, + "step": 10467 + }, + { + "epoch": 0.85, + "grad_norm": 0.9695418907066585, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.5061, + "step": 10468 + }, + { + "epoch": 0.85, + "grad_norm": 0.9374443923373761, + "learning_rate": 1.1442166793289677e-06, + "loss": 0.566, + "step": 10469 + }, + { + "epoch": 0.85, + "grad_norm": 0.9450478119481537, + "learning_rate": 1.1429942288879626e-06, + "loss": 0.5034, + "step": 10470 + }, + { + "epoch": 0.85, + "grad_norm": 0.983148186080234, + "learning_rate": 1.1417723922296008e-06, + "loss": 0.492, + "step": 10471 + }, + { + "epoch": 0.85, + "grad_norm": 0.8598251224417236, + "learning_rate": 1.1405511694385584e-06, + "loss": 0.4438, + "step": 10472 + }, + { + "epoch": 0.85, + "grad_norm": 0.9462950488597034, + "learning_rate": 1.1393305605994587e-06, + "loss": 0.4756, + "step": 10473 + }, + { + "epoch": 0.85, + "grad_norm": 0.9482638073194297, + "learning_rate": 1.1381105657968916e-06, + "loss": 0.4996, + "step": 10474 + }, + { + "epoch": 0.85, + "grad_norm": 0.860704600578187, + "learning_rate": 1.1368911851154019e-06, + "loss": 0.5384, + "step": 10475 + }, + { + "epoch": 0.85, + "grad_norm": 0.9570375760818484, + "learning_rate": 1.1356724186394918e-06, + "loss": 0.4809, + "step": 10476 + }, + { + "epoch": 0.85, + "grad_norm": 0.9634767434215831, + "learning_rate": 1.1344542664536196e-06, + "loss": 0.5581, + "step": 10477 + }, + { + "epoch": 0.85, + "grad_norm": 0.9795070369522804, + "learning_rate": 1.1332367286422064e-06, + "loss": 0.5486, + "step": 10478 + }, + { + "epoch": 0.85, + "grad_norm": 0.8677691664078212, + "learning_rate": 1.1320198052896203e-06, + "loss": 0.474, + "step": 10479 + }, + { + "epoch": 0.85, + "grad_norm": 0.9434054008110407, + "learning_rate": 1.130803496480195e-06, + "loss": 0.524, + "step": 10480 + }, + { + "epoch": 0.85, + "grad_norm": 0.9553611331557014, + "learning_rate": 1.129587802298222e-06, + "loss": 0.4732, + "step": 10481 + }, + { + "epoch": 0.85, + "grad_norm": 0.8139743634257981, + "learning_rate": 1.128372722827945e-06, + "loss": 0.4421, + "step": 10482 + }, + { + "epoch": 0.85, + "grad_norm": 0.8881985728660753, + "learning_rate": 1.12715825815357e-06, + "loss": 0.4171, + "step": 10483 + }, + { + "epoch": 0.85, + "grad_norm": 0.9326228528390237, + "learning_rate": 1.1259444083592585e-06, + "loss": 0.4373, + "step": 10484 + }, + { + "epoch": 0.85, + "grad_norm": 0.958442471726399, + "learning_rate": 1.1247311735291255e-06, + "loss": 0.4941, + "step": 10485 + }, + { + "epoch": 0.85, + "grad_norm": 0.9091981400958642, + "learning_rate": 1.1235185537472537e-06, + "loss": 0.5158, + "step": 10486 + }, + { + "epoch": 0.85, + "grad_norm": 0.9715206218825831, + "learning_rate": 1.1223065490976692e-06, + "loss": 0.5159, + "step": 10487 + }, + { + "epoch": 0.85, + "grad_norm": 0.8629851659158863, + "learning_rate": 1.1210951596643682e-06, + "loss": 0.4602, + "step": 10488 + }, + { + "epoch": 0.85, + "grad_norm": 0.9510654460373925, + "learning_rate": 1.1198843855312958e-06, + "loss": 0.5051, + "step": 10489 + }, + { + "epoch": 0.85, + "grad_norm": 0.9736674492886307, + "learning_rate": 1.1186742267823614e-06, + "loss": 0.5066, + "step": 10490 + }, + { + "epoch": 0.85, + "grad_norm": 1.041070879920255, + "learning_rate": 1.1174646835014213e-06, + "loss": 0.5287, + "step": 10491 + }, + { + "epoch": 0.85, + "grad_norm": 0.9975479878026994, + "learning_rate": 1.1162557557723042e-06, + "loss": 0.4458, + "step": 10492 + }, + { + "epoch": 0.85, + "grad_norm": 0.9844734373597187, + "learning_rate": 1.1150474436787806e-06, + "loss": 0.5017, + "step": 10493 + }, + { + "epoch": 0.85, + "grad_norm": 0.9716899197564739, + "learning_rate": 1.113839747304588e-06, + "loss": 0.4888, + "step": 10494 + }, + { + "epoch": 0.85, + "grad_norm": 0.9370907891923271, + "learning_rate": 1.1126326667334196e-06, + "loss": 0.4597, + "step": 10495 + }, + { + "epoch": 0.85, + "grad_norm": 0.9241490676999198, + "learning_rate": 1.1114262020489264e-06, + "loss": 0.456, + "step": 10496 + }, + { + "epoch": 0.85, + "grad_norm": 0.8374254769318923, + "learning_rate": 1.1102203533347089e-06, + "loss": 0.4738, + "step": 10497 + }, + { + "epoch": 0.85, + "grad_norm": 0.8877474626590348, + "learning_rate": 1.1090151206743393e-06, + "loss": 0.4866, + "step": 10498 + }, + { + "epoch": 0.85, + "grad_norm": 0.9475470780816515, + "learning_rate": 1.1078105041513343e-06, + "loss": 0.4918, + "step": 10499 + }, + { + "epoch": 0.85, + "grad_norm": 0.9275589409212524, + "learning_rate": 1.1066065038491735e-06, + "loss": 0.4926, + "step": 10500 + }, + { + "epoch": 0.85, + "grad_norm": 0.8795569509194492, + "learning_rate": 1.1054031198512938e-06, + "loss": 0.4436, + "step": 10501 + }, + { + "epoch": 0.85, + "grad_norm": 0.8691998892476426, + "learning_rate": 1.1042003522410882e-06, + "loss": 0.4589, + "step": 10502 + }, + { + "epoch": 0.85, + "grad_norm": 0.9802431120077391, + "learning_rate": 1.102998201101908e-06, + "loss": 0.4924, + "step": 10503 + }, + { + "epoch": 0.85, + "grad_norm": 0.9531451083089679, + "learning_rate": 1.1017966665170632e-06, + "loss": 0.4823, + "step": 10504 + }, + { + "epoch": 0.85, + "grad_norm": 0.8918469153039225, + "learning_rate": 1.1005957485698115e-06, + "loss": 0.4249, + "step": 10505 + }, + { + "epoch": 0.85, + "grad_norm": 0.8277939680529098, + "learning_rate": 1.0993954473433854e-06, + "loss": 0.474, + "step": 10506 + }, + { + "epoch": 0.85, + "grad_norm": 0.9217263067232511, + "learning_rate": 1.0981957629209584e-06, + "loss": 0.4761, + "step": 10507 + }, + { + "epoch": 0.85, + "grad_norm": 0.8490029541683193, + "learning_rate": 1.096996695385668e-06, + "loss": 0.4925, + "step": 10508 + }, + { + "epoch": 0.85, + "grad_norm": 0.9751680603334909, + "learning_rate": 1.0957982448206105e-06, + "loss": 0.5118, + "step": 10509 + }, + { + "epoch": 0.85, + "grad_norm": 0.9325847643538916, + "learning_rate": 1.0946004113088381e-06, + "loss": 0.4822, + "step": 10510 + }, + { + "epoch": 0.85, + "grad_norm": 0.8462218962246554, + "learning_rate": 1.0934031949333546e-06, + "loss": 0.4943, + "step": 10511 + }, + { + "epoch": 0.85, + "grad_norm": 0.8824927665558341, + "learning_rate": 1.0922065957771332e-06, + "loss": 0.4399, + "step": 10512 + }, + { + "epoch": 0.85, + "grad_norm": 0.9460646657620165, + "learning_rate": 1.0910106139230913e-06, + "loss": 0.4845, + "step": 10513 + }, + { + "epoch": 0.85, + "grad_norm": 1.010466471062037, + "learning_rate": 1.0898152494541124e-06, + "loss": 0.5382, + "step": 10514 + }, + { + "epoch": 0.85, + "grad_norm": 1.0629758525690058, + "learning_rate": 1.0886205024530327e-06, + "loss": 0.4585, + "step": 10515 + }, + { + "epoch": 0.85, + "grad_norm": 0.9760959154365196, + "learning_rate": 1.0874263730026502e-06, + "loss": 0.4848, + "step": 10516 + }, + { + "epoch": 0.85, + "grad_norm": 0.9352196398984858, + "learning_rate": 1.0862328611857109e-06, + "loss": 0.4978, + "step": 10517 + }, + { + "epoch": 0.85, + "grad_norm": 0.9499481473322007, + "learning_rate": 1.085039967084931e-06, + "loss": 0.4877, + "step": 10518 + }, + { + "epoch": 0.85, + "grad_norm": 0.8820445238550193, + "learning_rate": 1.083847690782972e-06, + "loss": 0.4984, + "step": 10519 + }, + { + "epoch": 0.86, + "grad_norm": 0.8875572183370747, + "learning_rate": 1.0826560323624591e-06, + "loss": 0.4319, + "step": 10520 + }, + { + "epoch": 0.86, + "grad_norm": 0.9909807008910421, + "learning_rate": 1.081464991905975e-06, + "loss": 0.4924, + "step": 10521 + }, + { + "epoch": 0.86, + "grad_norm": 0.923524185885788, + "learning_rate": 1.080274569496057e-06, + "loss": 0.4533, + "step": 10522 + }, + { + "epoch": 0.86, + "grad_norm": 0.933531295118856, + "learning_rate": 1.079084765215196e-06, + "loss": 0.4548, + "step": 10523 + }, + { + "epoch": 0.86, + "grad_norm": 0.9891970105285252, + "learning_rate": 1.0778955791458513e-06, + "loss": 0.4882, + "step": 10524 + }, + { + "epoch": 0.86, + "grad_norm": 0.9041791644495576, + "learning_rate": 1.076707011370427e-06, + "loss": 0.4721, + "step": 10525 + }, + { + "epoch": 0.86, + "grad_norm": 0.8494593233155678, + "learning_rate": 1.075519061971293e-06, + "loss": 0.4562, + "step": 10526 + }, + { + "epoch": 0.86, + "grad_norm": 0.9632722345680252, + "learning_rate": 1.074331731030771e-06, + "loss": 0.5007, + "step": 10527 + }, + { + "epoch": 0.86, + "grad_norm": 0.9903562734018743, + "learning_rate": 1.0731450186311454e-06, + "loss": 0.5376, + "step": 10528 + }, + { + "epoch": 0.86, + "grad_norm": 0.8708142022778427, + "learning_rate": 1.0719589248546469e-06, + "loss": 0.4663, + "step": 10529 + }, + { + "epoch": 0.86, + "grad_norm": 0.9039712102106406, + "learning_rate": 1.070773449783481e-06, + "loss": 0.4545, + "step": 10530 + }, + { + "epoch": 0.86, + "grad_norm": 0.9787575342806338, + "learning_rate": 1.069588593499793e-06, + "loss": 0.4533, + "step": 10531 + }, + { + "epoch": 0.86, + "grad_norm": 0.906339996163714, + "learning_rate": 1.0684043560856928e-06, + "loss": 0.4621, + "step": 10532 + }, + { + "epoch": 0.86, + "grad_norm": 0.9995109838408539, + "learning_rate": 1.067220737623249e-06, + "loss": 0.4951, + "step": 10533 + }, + { + "epoch": 0.86, + "grad_norm": 0.9337158432010438, + "learning_rate": 1.0660377381944876e-06, + "loss": 0.4803, + "step": 10534 + }, + { + "epoch": 0.86, + "grad_norm": 0.8939043705730141, + "learning_rate": 1.0648553578813813e-06, + "loss": 0.5323, + "step": 10535 + }, + { + "epoch": 0.86, + "grad_norm": 0.9014274053264499, + "learning_rate": 1.0636735967658785e-06, + "loss": 0.4608, + "step": 10536 + }, + { + "epoch": 0.86, + "grad_norm": 0.9596380740959803, + "learning_rate": 1.0624924549298666e-06, + "loss": 0.5336, + "step": 10537 + }, + { + "epoch": 0.86, + "grad_norm": 0.9184750871004159, + "learning_rate": 1.061311932455199e-06, + "loss": 0.472, + "step": 10538 + }, + { + "epoch": 0.86, + "grad_norm": 0.9820976331082859, + "learning_rate": 1.0601320294236872e-06, + "loss": 0.4911, + "step": 10539 + }, + { + "epoch": 0.86, + "grad_norm": 0.9766999400801963, + "learning_rate": 1.0589527459170967e-06, + "loss": 0.5862, + "step": 10540 + }, + { + "epoch": 0.86, + "grad_norm": 0.9341904993019802, + "learning_rate": 1.0577740820171468e-06, + "loss": 0.4902, + "step": 10541 + }, + { + "epoch": 0.86, + "grad_norm": 0.8466239721311413, + "learning_rate": 1.0565960378055263e-06, + "loss": 0.4735, + "step": 10542 + }, + { + "epoch": 0.86, + "grad_norm": 0.9136573595418811, + "learning_rate": 1.0554186133638643e-06, + "loss": 0.4461, + "step": 10543 + }, + { + "epoch": 0.86, + "grad_norm": 0.8845945755514436, + "learning_rate": 1.0542418087737593e-06, + "loss": 0.4808, + "step": 10544 + }, + { + "epoch": 0.86, + "grad_norm": 0.9664680081992767, + "learning_rate": 1.0530656241167613e-06, + "loss": 0.4904, + "step": 10545 + }, + { + "epoch": 0.86, + "grad_norm": 0.9369665727003559, + "learning_rate": 1.051890059474382e-06, + "loss": 0.4683, + "step": 10546 + }, + { + "epoch": 0.86, + "grad_norm": 0.9122493649287006, + "learning_rate": 1.0507151149280804e-06, + "loss": 0.4651, + "step": 10547 + }, + { + "epoch": 0.86, + "grad_norm": 0.9006968892963719, + "learning_rate": 1.049540790559288e-06, + "loss": 0.5234, + "step": 10548 + }, + { + "epoch": 0.86, + "grad_norm": 0.894069435907734, + "learning_rate": 1.0483670864493777e-06, + "loss": 0.4758, + "step": 10549 + }, + { + "epoch": 0.86, + "grad_norm": 0.9225468427309412, + "learning_rate": 1.0471940026796878e-06, + "loss": 0.3915, + "step": 10550 + }, + { + "epoch": 0.86, + "grad_norm": 0.9415756739526433, + "learning_rate": 1.046021539331512e-06, + "loss": 0.5204, + "step": 10551 + }, + { + "epoch": 0.86, + "grad_norm": 0.9793688442119916, + "learning_rate": 1.0448496964861044e-06, + "loss": 0.443, + "step": 10552 + }, + { + "epoch": 0.86, + "grad_norm": 0.9143015605359969, + "learning_rate": 1.0436784742246652e-06, + "loss": 0.44, + "step": 10553 + }, + { + "epoch": 0.86, + "grad_norm": 0.9957353280803909, + "learning_rate": 1.0425078726283667e-06, + "loss": 0.5254, + "step": 10554 + }, + { + "epoch": 0.86, + "grad_norm": 1.095502882617657, + "learning_rate": 1.0413378917783267e-06, + "loss": 0.5872, + "step": 10555 + }, + { + "epoch": 0.86, + "grad_norm": 1.018574131899571, + "learning_rate": 1.0401685317556232e-06, + "loss": 0.5307, + "step": 10556 + }, + { + "epoch": 0.86, + "grad_norm": 1.0398118534675804, + "learning_rate": 1.0389997926412942e-06, + "loss": 0.5394, + "step": 10557 + }, + { + "epoch": 0.86, + "grad_norm": 0.9024083075890255, + "learning_rate": 1.037831674516332e-06, + "loss": 0.4886, + "step": 10558 + }, + { + "epoch": 0.86, + "grad_norm": 0.9127674671846514, + "learning_rate": 1.0366641774616826e-06, + "loss": 0.504, + "step": 10559 + }, + { + "epoch": 0.86, + "grad_norm": 0.8940640858916057, + "learning_rate": 1.0354973015582582e-06, + "loss": 0.4301, + "step": 10560 + }, + { + "epoch": 0.86, + "grad_norm": 0.9477172972392047, + "learning_rate": 1.0343310468869171e-06, + "loss": 0.4708, + "step": 10561 + }, + { + "epoch": 0.86, + "grad_norm": 0.9215397595796041, + "learning_rate": 1.033165413528483e-06, + "loss": 0.4648, + "step": 10562 + }, + { + "epoch": 0.86, + "grad_norm": 0.9870791421156618, + "learning_rate": 1.0320004015637319e-06, + "loss": 0.4537, + "step": 10563 + }, + { + "epoch": 0.86, + "grad_norm": 0.9182962662979826, + "learning_rate": 1.0308360110733994e-06, + "loss": 0.4726, + "step": 10564 + }, + { + "epoch": 0.86, + "grad_norm": 0.9498092146289201, + "learning_rate": 1.0296722421381733e-06, + "loss": 0.4866, + "step": 10565 + }, + { + "epoch": 0.86, + "grad_norm": 1.003002994594343, + "learning_rate": 1.0285090948387065e-06, + "loss": 0.4608, + "step": 10566 + }, + { + "epoch": 0.86, + "grad_norm": 0.8831435494424852, + "learning_rate": 1.0273465692556006e-06, + "loss": 0.4632, + "step": 10567 + }, + { + "epoch": 0.86, + "grad_norm": 0.9964518745942484, + "learning_rate": 1.0261846654694184e-06, + "loss": 0.4706, + "step": 10568 + }, + { + "epoch": 0.86, + "grad_norm": 0.8837425792652869, + "learning_rate": 1.0250233835606805e-06, + "loss": 0.4616, + "step": 10569 + }, + { + "epoch": 0.86, + "grad_norm": 0.8938867215972366, + "learning_rate": 1.0238627236098619e-06, + "loss": 0.4166, + "step": 10570 + }, + { + "epoch": 0.86, + "grad_norm": 0.8980332943128998, + "learning_rate": 1.0227026856973909e-06, + "loss": 0.5217, + "step": 10571 + }, + { + "epoch": 0.86, + "grad_norm": 0.9194033443879248, + "learning_rate": 1.0215432699036643e-06, + "loss": 0.4979, + "step": 10572 + }, + { + "epoch": 0.86, + "grad_norm": 1.0607340177473767, + "learning_rate": 1.0203844763090243e-06, + "loss": 0.4971, + "step": 10573 + }, + { + "epoch": 0.86, + "grad_norm": 0.7714088041106679, + "learning_rate": 1.0192263049937745e-06, + "loss": 0.4175, + "step": 10574 + }, + { + "epoch": 0.86, + "grad_norm": 0.8751825274589954, + "learning_rate": 1.0180687560381764e-06, + "loss": 0.4855, + "step": 10575 + }, + { + "epoch": 0.86, + "grad_norm": 0.9285605238440512, + "learning_rate": 1.0169118295224488e-06, + "loss": 0.4911, + "step": 10576 + }, + { + "epoch": 0.86, + "grad_norm": 0.8986266092966625, + "learning_rate": 1.0157555255267581e-06, + "loss": 0.4886, + "step": 10577 + }, + { + "epoch": 0.86, + "grad_norm": 1.0638855808898067, + "learning_rate": 1.0145998441312455e-06, + "loss": 0.5261, + "step": 10578 + }, + { + "epoch": 0.86, + "grad_norm": 0.9644109630050147, + "learning_rate": 1.0134447854159913e-06, + "loss": 0.4931, + "step": 10579 + }, + { + "epoch": 0.86, + "grad_norm": 0.9806403030365124, + "learning_rate": 1.0122903494610426e-06, + "loss": 0.4277, + "step": 10580 + }, + { + "epoch": 0.86, + "grad_norm": 0.9415593828325067, + "learning_rate": 1.011136536346401e-06, + "loss": 0.4543, + "step": 10581 + }, + { + "epoch": 0.86, + "grad_norm": 0.8746620984571902, + "learning_rate": 1.009983346152026e-06, + "loss": 0.4319, + "step": 10582 + }, + { + "epoch": 0.86, + "grad_norm": 0.9425651262758186, + "learning_rate": 1.0088307789578266e-06, + "loss": 0.4588, + "step": 10583 + }, + { + "epoch": 0.86, + "grad_norm": 1.001027698974047, + "learning_rate": 1.0076788348436827e-06, + "loss": 0.5128, + "step": 10584 + }, + { + "epoch": 0.86, + "grad_norm": 0.9521648979712655, + "learning_rate": 1.0065275138894182e-06, + "loss": 0.4982, + "step": 10585 + }, + { + "epoch": 0.86, + "grad_norm": 0.8767109127856868, + "learning_rate": 1.00537681617482e-06, + "loss": 0.4291, + "step": 10586 + }, + { + "epoch": 0.86, + "grad_norm": 0.9233767299328738, + "learning_rate": 1.0042267417796292e-06, + "loss": 0.4564, + "step": 10587 + }, + { + "epoch": 0.86, + "grad_norm": 0.8818885464968629, + "learning_rate": 1.0030772907835484e-06, + "loss": 0.4689, + "step": 10588 + }, + { + "epoch": 0.86, + "grad_norm": 0.8781990150711255, + "learning_rate": 1.0019284632662274e-06, + "loss": 0.4965, + "step": 10589 + }, + { + "epoch": 0.86, + "grad_norm": 0.9439405869978958, + "learning_rate": 1.000780259307287e-06, + "loss": 0.5321, + "step": 10590 + }, + { + "epoch": 0.86, + "grad_norm": 1.0273924686655207, + "learning_rate": 9.996326789862897e-07, + "loss": 0.5068, + "step": 10591 + }, + { + "epoch": 0.86, + "grad_norm": 0.9148259332012102, + "learning_rate": 9.984857223827637e-07, + "loss": 0.5091, + "step": 10592 + }, + { + "epoch": 0.86, + "grad_norm": 0.9510707852522677, + "learning_rate": 9.97339389576194e-07, + "loss": 0.4678, + "step": 10593 + }, + { + "epoch": 0.86, + "grad_norm": 0.9856242720669215, + "learning_rate": 9.961936806460194e-07, + "loss": 0.4869, + "step": 10594 + }, + { + "epoch": 0.86, + "grad_norm": 0.9662211813822499, + "learning_rate": 9.950485956716349e-07, + "loss": 0.478, + "step": 10595 + }, + { + "epoch": 0.86, + "grad_norm": 0.9995700843908029, + "learning_rate": 9.939041347323986e-07, + "loss": 0.4696, + "step": 10596 + }, + { + "epoch": 0.86, + "grad_norm": 0.9763678246865335, + "learning_rate": 9.927602979076146e-07, + "loss": 0.496, + "step": 10597 + }, + { + "epoch": 0.86, + "grad_norm": 0.9359566163419666, + "learning_rate": 9.91617085276554e-07, + "loss": 0.5372, + "step": 10598 + }, + { + "epoch": 0.86, + "grad_norm": 0.9833137658610944, + "learning_rate": 9.904744969184377e-07, + "loss": 0.5355, + "step": 10599 + }, + { + "epoch": 0.86, + "grad_norm": 0.8636732279612003, + "learning_rate": 9.89332532912447e-07, + "loss": 0.4137, + "step": 10600 + }, + { + "epoch": 0.86, + "grad_norm": 1.017749469597937, + "learning_rate": 9.881911933377197e-07, + "loss": 0.5332, + "step": 10601 + }, + { + "epoch": 0.86, + "grad_norm": 0.9590708469565017, + "learning_rate": 9.870504782733515e-07, + "loss": 0.4546, + "step": 10602 + }, + { + "epoch": 0.86, + "grad_norm": 0.875187921212929, + "learning_rate": 9.85910387798389e-07, + "loss": 0.4611, + "step": 10603 + }, + { + "epoch": 0.86, + "grad_norm": 0.9268857981738736, + "learning_rate": 9.8477092199184e-07, + "loss": 0.4842, + "step": 10604 + }, + { + "epoch": 0.86, + "grad_norm": 0.9375699374447519, + "learning_rate": 9.836320809326704e-07, + "loss": 0.524, + "step": 10605 + }, + { + "epoch": 0.86, + "grad_norm": 1.0500442909628693, + "learning_rate": 9.824938646998005e-07, + "loss": 0.5259, + "step": 10606 + }, + { + "epoch": 0.86, + "grad_norm": 0.9722789226492576, + "learning_rate": 9.813562733721072e-07, + "loss": 0.4629, + "step": 10607 + }, + { + "epoch": 0.86, + "grad_norm": 0.8146063568062981, + "learning_rate": 9.80219307028426e-07, + "loss": 0.4079, + "step": 10608 + }, + { + "epoch": 0.86, + "grad_norm": 1.0053294621074196, + "learning_rate": 9.790829657475443e-07, + "loss": 0.5076, + "step": 10609 + }, + { + "epoch": 0.86, + "grad_norm": 0.9424850866724473, + "learning_rate": 9.77947249608211e-07, + "loss": 0.4957, + "step": 10610 + }, + { + "epoch": 0.86, + "grad_norm": 0.9350508643965163, + "learning_rate": 9.768121586891322e-07, + "loss": 0.5018, + "step": 10611 + }, + { + "epoch": 0.86, + "grad_norm": 0.9163531937596598, + "learning_rate": 9.75677693068966e-07, + "loss": 0.4881, + "step": 10612 + }, + { + "epoch": 0.86, + "grad_norm": 1.0296132540056735, + "learning_rate": 9.745438528263319e-07, + "loss": 0.5175, + "step": 10613 + }, + { + "epoch": 0.86, + "grad_norm": 0.907944549759089, + "learning_rate": 9.734106380398022e-07, + "loss": 0.4734, + "step": 10614 + }, + { + "epoch": 0.86, + "grad_norm": 0.9589597429120368, + "learning_rate": 9.722780487879124e-07, + "loss": 0.4598, + "step": 10615 + }, + { + "epoch": 0.86, + "grad_norm": 0.9015626629553082, + "learning_rate": 9.711460851491427e-07, + "loss": 0.4868, + "step": 10616 + }, + { + "epoch": 0.86, + "grad_norm": 0.9564663604750182, + "learning_rate": 9.700147472019416e-07, + "loss": 0.5047, + "step": 10617 + }, + { + "epoch": 0.86, + "grad_norm": 0.9176161448635084, + "learning_rate": 9.688840350247085e-07, + "loss": 0.464, + "step": 10618 + }, + { + "epoch": 0.86, + "grad_norm": 0.9724339749225807, + "learning_rate": 9.67753948695801e-07, + "loss": 0.4263, + "step": 10619 + }, + { + "epoch": 0.86, + "grad_norm": 1.0027820480186806, + "learning_rate": 9.666244882935339e-07, + "loss": 0.5469, + "step": 10620 + }, + { + "epoch": 0.86, + "grad_norm": 1.0048726193719604, + "learning_rate": 9.65495653896179e-07, + "loss": 0.508, + "step": 10621 + }, + { + "epoch": 0.86, + "grad_norm": 1.0317181478099673, + "learning_rate": 9.643674455819597e-07, + "loss": 0.4928, + "step": 10622 + }, + { + "epoch": 0.86, + "grad_norm": 0.8361262764620936, + "learning_rate": 9.632398634290607e-07, + "loss": 0.4458, + "step": 10623 + }, + { + "epoch": 0.86, + "grad_norm": 1.0278494857565195, + "learning_rate": 9.621129075156256e-07, + "loss": 0.4697, + "step": 10624 + }, + { + "epoch": 0.86, + "grad_norm": 1.0175732831964581, + "learning_rate": 9.60986577919748e-07, + "loss": 0.5281, + "step": 10625 + }, + { + "epoch": 0.86, + "grad_norm": 0.8834953084316942, + "learning_rate": 9.598608747194826e-07, + "loss": 0.4439, + "step": 10626 + }, + { + "epoch": 0.86, + "grad_norm": 0.9985919876888131, + "learning_rate": 9.587357979928414e-07, + "loss": 0.5225, + "step": 10627 + }, + { + "epoch": 0.86, + "grad_norm": 0.9658488397988649, + "learning_rate": 9.576113478177905e-07, + "loss": 0.4927, + "step": 10628 + }, + { + "epoch": 0.86, + "grad_norm": 0.9535489105894337, + "learning_rate": 9.564875242722516e-07, + "loss": 0.472, + "step": 10629 + }, + { + "epoch": 0.86, + "grad_norm": 1.0003753547033694, + "learning_rate": 9.55364327434105e-07, + "loss": 0.4753, + "step": 10630 + }, + { + "epoch": 0.86, + "grad_norm": 0.8012825219021779, + "learning_rate": 9.54241757381188e-07, + "loss": 0.4108, + "step": 10631 + }, + { + "epoch": 0.86, + "grad_norm": 1.0029078064696209, + "learning_rate": 9.531198141912945e-07, + "loss": 0.5254, + "step": 10632 + }, + { + "epoch": 0.86, + "grad_norm": 1.0070371903454114, + "learning_rate": 9.519984979421725e-07, + "loss": 0.5345, + "step": 10633 + }, + { + "epoch": 0.86, + "grad_norm": 0.9225172268554366, + "learning_rate": 9.508778087115289e-07, + "loss": 0.4823, + "step": 10634 + }, + { + "epoch": 0.86, + "grad_norm": 0.9962315849223458, + "learning_rate": 9.497577465770292e-07, + "loss": 0.5062, + "step": 10635 + }, + { + "epoch": 0.86, + "grad_norm": 0.9698496145273163, + "learning_rate": 9.486383116162878e-07, + "loss": 0.4988, + "step": 10636 + }, + { + "epoch": 0.86, + "grad_norm": 0.9541529632180376, + "learning_rate": 9.475195039068818e-07, + "loss": 0.4303, + "step": 10637 + }, + { + "epoch": 0.86, + "grad_norm": 0.9132257518289799, + "learning_rate": 9.464013235263458e-07, + "loss": 0.46, + "step": 10638 + }, + { + "epoch": 0.86, + "grad_norm": 0.9288673980028853, + "learning_rate": 9.452837705521678e-07, + "loss": 0.4592, + "step": 10639 + }, + { + "epoch": 0.86, + "grad_norm": 0.8436898988686481, + "learning_rate": 9.441668450617924e-07, + "loss": 0.4092, + "step": 10640 + }, + { + "epoch": 0.86, + "grad_norm": 0.9952227583228156, + "learning_rate": 9.430505471326246e-07, + "loss": 0.503, + "step": 10641 + }, + { + "epoch": 0.86, + "grad_norm": 0.9311932065408992, + "learning_rate": 9.419348768420178e-07, + "loss": 0.4727, + "step": 10642 + }, + { + "epoch": 0.87, + "grad_norm": 0.9032988698297807, + "learning_rate": 9.408198342672903e-07, + "loss": 0.4627, + "step": 10643 + }, + { + "epoch": 0.87, + "grad_norm": 0.8884037877380864, + "learning_rate": 9.397054194857125e-07, + "loss": 0.4278, + "step": 10644 + }, + { + "epoch": 0.87, + "grad_norm": 0.923458523901903, + "learning_rate": 9.385916325745115e-07, + "loss": 0.4998, + "step": 10645 + }, + { + "epoch": 0.87, + "grad_norm": 0.8204070866581368, + "learning_rate": 9.374784736108744e-07, + "loss": 0.4271, + "step": 10646 + }, + { + "epoch": 0.87, + "grad_norm": 0.9939294169077609, + "learning_rate": 9.363659426719418e-07, + "loss": 0.5363, + "step": 10647 + }, + { + "epoch": 0.87, + "grad_norm": 0.869244868759126, + "learning_rate": 9.352540398348087e-07, + "loss": 0.4782, + "step": 10648 + }, + { + "epoch": 0.87, + "grad_norm": 1.0670999004444721, + "learning_rate": 9.3414276517653e-07, + "loss": 0.4762, + "step": 10649 + }, + { + "epoch": 0.87, + "grad_norm": 0.9443594709032453, + "learning_rate": 9.330321187741154e-07, + "loss": 0.511, + "step": 10650 + }, + { + "epoch": 0.87, + "grad_norm": 0.8889307971922469, + "learning_rate": 9.319221007045331e-07, + "loss": 0.5285, + "step": 10651 + }, + { + "epoch": 0.87, + "grad_norm": 0.8981113533125855, + "learning_rate": 9.308127110447063e-07, + "loss": 0.4873, + "step": 10652 + }, + { + "epoch": 0.87, + "grad_norm": 0.9017810176305928, + "learning_rate": 9.297039498715155e-07, + "loss": 0.4455, + "step": 10653 + }, + { + "epoch": 0.87, + "grad_norm": 1.0546082206019811, + "learning_rate": 9.285958172617926e-07, + "loss": 0.5087, + "step": 10654 + }, + { + "epoch": 0.87, + "grad_norm": 1.0022472235181799, + "learning_rate": 9.274883132923362e-07, + "loss": 0.4739, + "step": 10655 + }, + { + "epoch": 0.87, + "grad_norm": 0.9002821987327692, + "learning_rate": 9.263814380398917e-07, + "loss": 0.4679, + "step": 10656 + }, + { + "epoch": 0.87, + "grad_norm": 1.0302589475729587, + "learning_rate": 9.252751915811642e-07, + "loss": 0.5019, + "step": 10657 + }, + { + "epoch": 0.87, + "grad_norm": 0.9775376726614513, + "learning_rate": 9.241695739928169e-07, + "loss": 0.5532, + "step": 10658 + }, + { + "epoch": 0.87, + "grad_norm": 1.0537623712647077, + "learning_rate": 9.230645853514697e-07, + "loss": 0.4992, + "step": 10659 + }, + { + "epoch": 0.87, + "grad_norm": 0.8776796418795295, + "learning_rate": 9.219602257336913e-07, + "loss": 0.4862, + "step": 10660 + }, + { + "epoch": 0.87, + "grad_norm": 0.962781037418971, + "learning_rate": 9.208564952160215e-07, + "loss": 0.493, + "step": 10661 + }, + { + "epoch": 0.87, + "grad_norm": 1.0364614771872267, + "learning_rate": 9.197533938749414e-07, + "loss": 0.5501, + "step": 10662 + }, + { + "epoch": 0.87, + "grad_norm": 0.9556984026805189, + "learning_rate": 9.186509217868966e-07, + "loss": 0.4743, + "step": 10663 + }, + { + "epoch": 0.87, + "grad_norm": 0.9616683878097968, + "learning_rate": 9.175490790282882e-07, + "loss": 0.4714, + "step": 10664 + }, + { + "epoch": 0.87, + "grad_norm": 0.9021918453873632, + "learning_rate": 9.164478656754739e-07, + "loss": 0.4759, + "step": 10665 + }, + { + "epoch": 0.87, + "grad_norm": 0.9790575486335737, + "learning_rate": 9.153472818047627e-07, + "loss": 0.5063, + "step": 10666 + }, + { + "epoch": 0.87, + "grad_norm": 0.8468403295990357, + "learning_rate": 9.142473274924291e-07, + "loss": 0.4937, + "step": 10667 + }, + { + "epoch": 0.87, + "grad_norm": 0.9145902214127684, + "learning_rate": 9.131480028146955e-07, + "loss": 0.4602, + "step": 10668 + }, + { + "epoch": 0.87, + "grad_norm": 0.9168147252846369, + "learning_rate": 9.120493078477455e-07, + "loss": 0.4463, + "step": 10669 + }, + { + "epoch": 0.87, + "grad_norm": 0.8323131927350101, + "learning_rate": 9.109512426677169e-07, + "loss": 0.4231, + "step": 10670 + }, + { + "epoch": 0.87, + "grad_norm": 0.9473418109738616, + "learning_rate": 9.09853807350708e-07, + "loss": 0.5073, + "step": 10671 + }, + { + "epoch": 0.87, + "grad_norm": 0.9973434046450055, + "learning_rate": 9.08757001972762e-07, + "loss": 0.5461, + "step": 10672 + }, + { + "epoch": 0.87, + "grad_norm": 0.9805885913133771, + "learning_rate": 9.076608266098974e-07, + "loss": 0.4762, + "step": 10673 + }, + { + "epoch": 0.87, + "grad_norm": 0.9568985687028682, + "learning_rate": 9.065652813380699e-07, + "loss": 0.5519, + "step": 10674 + }, + { + "epoch": 0.87, + "grad_norm": 0.92717019860699, + "learning_rate": 9.054703662332021e-07, + "loss": 0.477, + "step": 10675 + }, + { + "epoch": 0.87, + "grad_norm": 0.8976396587311715, + "learning_rate": 9.04376081371171e-07, + "loss": 0.4784, + "step": 10676 + }, + { + "epoch": 0.87, + "grad_norm": 0.8976010389778204, + "learning_rate": 9.032824268278129e-07, + "loss": 0.411, + "step": 10677 + }, + { + "epoch": 0.87, + "grad_norm": 0.9962401365447401, + "learning_rate": 9.021894026789091e-07, + "loss": 0.5412, + "step": 10678 + }, + { + "epoch": 0.87, + "grad_norm": 0.9492064342678247, + "learning_rate": 9.010970090002135e-07, + "loss": 0.4523, + "step": 10679 + }, + { + "epoch": 0.87, + "grad_norm": 0.8457178835348091, + "learning_rate": 9.000052458674224e-07, + "loss": 0.4362, + "step": 10680 + }, + { + "epoch": 0.87, + "grad_norm": 1.1243935410473074, + "learning_rate": 8.989141133561974e-07, + "loss": 0.5494, + "step": 10681 + }, + { + "epoch": 0.87, + "grad_norm": 0.9755791044220888, + "learning_rate": 8.978236115421501e-07, + "loss": 0.5408, + "step": 10682 + }, + { + "epoch": 0.87, + "grad_norm": 0.9845268895265248, + "learning_rate": 8.967337405008558e-07, + "loss": 0.4561, + "step": 10683 + }, + { + "epoch": 0.87, + "grad_norm": 0.8520746587525816, + "learning_rate": 8.956445003078351e-07, + "loss": 0.4592, + "step": 10684 + }, + { + "epoch": 0.87, + "grad_norm": 0.9837026512736496, + "learning_rate": 8.945558910385776e-07, + "loss": 0.4943, + "step": 10685 + }, + { + "epoch": 0.87, + "grad_norm": 0.9434157332007918, + "learning_rate": 8.934679127685197e-07, + "loss": 0.4327, + "step": 10686 + }, + { + "epoch": 0.87, + "grad_norm": 1.0156228938382437, + "learning_rate": 8.923805655730577e-07, + "loss": 0.5323, + "step": 10687 + }, + { + "epoch": 0.87, + "grad_norm": 1.0581383360974685, + "learning_rate": 8.912938495275436e-07, + "loss": 0.4858, + "step": 10688 + }, + { + "epoch": 0.87, + "grad_norm": 0.989591810706298, + "learning_rate": 8.902077647072883e-07, + "loss": 0.4476, + "step": 10689 + }, + { + "epoch": 0.87, + "grad_norm": 0.9795339881955004, + "learning_rate": 8.891223111875513e-07, + "loss": 0.438, + "step": 10690 + }, + { + "epoch": 0.87, + "grad_norm": 0.8201781263068513, + "learning_rate": 8.880374890435595e-07, + "loss": 0.4644, + "step": 10691 + }, + { + "epoch": 0.87, + "grad_norm": 0.9322419967945786, + "learning_rate": 8.869532983504859e-07, + "loss": 0.4709, + "step": 10692 + }, + { + "epoch": 0.87, + "grad_norm": 0.8780674815289228, + "learning_rate": 8.858697391834658e-07, + "loss": 0.4736, + "step": 10693 + }, + { + "epoch": 0.87, + "grad_norm": 1.0073703685587576, + "learning_rate": 8.847868116175883e-07, + "loss": 0.5376, + "step": 10694 + }, + { + "epoch": 0.87, + "grad_norm": 0.8760429696909396, + "learning_rate": 8.837045157279023e-07, + "loss": 0.4444, + "step": 10695 + }, + { + "epoch": 0.87, + "grad_norm": 0.8900133031097341, + "learning_rate": 8.82622851589402e-07, + "loss": 0.516, + "step": 10696 + }, + { + "epoch": 0.87, + "grad_norm": 1.0236461508169916, + "learning_rate": 8.815418192770553e-07, + "loss": 0.5519, + "step": 10697 + }, + { + "epoch": 0.87, + "grad_norm": 0.9442346465952964, + "learning_rate": 8.804614188657712e-07, + "loss": 0.5174, + "step": 10698 + }, + { + "epoch": 0.87, + "grad_norm": 1.0651480720040911, + "learning_rate": 8.793816504304209e-07, + "loss": 0.4848, + "step": 10699 + }, + { + "epoch": 0.87, + "grad_norm": 0.9156691302175118, + "learning_rate": 8.783025140458334e-07, + "loss": 0.4594, + "step": 10700 + }, + { + "epoch": 0.87, + "grad_norm": 0.9191454740517091, + "learning_rate": 8.772240097867912e-07, + "loss": 0.4513, + "step": 10701 + }, + { + "epoch": 0.87, + "grad_norm": 0.842071309153491, + "learning_rate": 8.761461377280311e-07, + "loss": 0.4486, + "step": 10702 + }, + { + "epoch": 0.87, + "grad_norm": 1.001306254241895, + "learning_rate": 8.750688979442534e-07, + "loss": 0.5239, + "step": 10703 + }, + { + "epoch": 0.87, + "grad_norm": 0.8248599393774404, + "learning_rate": 8.739922905101051e-07, + "loss": 0.4204, + "step": 10704 + }, + { + "epoch": 0.87, + "grad_norm": 0.9491459913050245, + "learning_rate": 8.729163155001975e-07, + "loss": 0.5073, + "step": 10705 + }, + { + "epoch": 0.87, + "grad_norm": 1.2972173703532743, + "learning_rate": 8.71840972989092e-07, + "loss": 0.4949, + "step": 10706 + }, + { + "epoch": 0.87, + "grad_norm": 0.8906322183288898, + "learning_rate": 8.707662630513136e-07, + "loss": 0.4899, + "step": 10707 + }, + { + "epoch": 0.87, + "grad_norm": 0.976107320047754, + "learning_rate": 8.696921857613317e-07, + "loss": 0.521, + "step": 10708 + }, + { + "epoch": 0.87, + "grad_norm": 0.9772816546434508, + "learning_rate": 8.686187411935854e-07, + "loss": 0.4138, + "step": 10709 + }, + { + "epoch": 0.87, + "grad_norm": 0.9880903316140534, + "learning_rate": 8.675459294224597e-07, + "loss": 0.5624, + "step": 10710 + }, + { + "epoch": 0.87, + "grad_norm": 0.9692471090732485, + "learning_rate": 8.664737505223009e-07, + "loss": 0.4838, + "step": 10711 + }, + { + "epoch": 0.87, + "grad_norm": 0.9192191189203573, + "learning_rate": 8.654022045674093e-07, + "loss": 0.4744, + "step": 10712 + }, + { + "epoch": 0.87, + "grad_norm": 0.8948526661133622, + "learning_rate": 8.643312916320446e-07, + "loss": 0.4988, + "step": 10713 + }, + { + "epoch": 0.87, + "grad_norm": 0.9549774148056884, + "learning_rate": 8.632610117904139e-07, + "loss": 0.4531, + "step": 10714 + }, + { + "epoch": 0.87, + "grad_norm": 0.8601975926054979, + "learning_rate": 8.621913651166947e-07, + "loss": 0.4734, + "step": 10715 + }, + { + "epoch": 0.87, + "grad_norm": 0.9024200533857866, + "learning_rate": 8.611223516850076e-07, + "loss": 0.4966, + "step": 10716 + }, + { + "epoch": 0.87, + "grad_norm": 0.8210537135329794, + "learning_rate": 8.600539715694344e-07, + "loss": 0.4491, + "step": 10717 + }, + { + "epoch": 0.87, + "grad_norm": 1.0435012893610338, + "learning_rate": 8.58986224844014e-07, + "loss": 0.492, + "step": 10718 + }, + { + "epoch": 0.87, + "grad_norm": 0.905447549472864, + "learning_rate": 8.579191115827423e-07, + "loss": 0.4656, + "step": 10719 + }, + { + "epoch": 0.87, + "grad_norm": 0.9702528122983963, + "learning_rate": 8.568526318595638e-07, + "loss": 0.5027, + "step": 10720 + }, + { + "epoch": 0.87, + "grad_norm": 0.9028108539852475, + "learning_rate": 8.557867857483915e-07, + "loss": 0.4443, + "step": 10721 + }, + { + "epoch": 0.87, + "grad_norm": 0.9828446495002682, + "learning_rate": 8.547215733230818e-07, + "loss": 0.4912, + "step": 10722 + }, + { + "epoch": 0.87, + "grad_norm": 1.1314179435472174, + "learning_rate": 8.536569946574546e-07, + "loss": 0.5257, + "step": 10723 + }, + { + "epoch": 0.87, + "grad_norm": 1.0384549778518128, + "learning_rate": 8.525930498252855e-07, + "loss": 0.5085, + "step": 10724 + }, + { + "epoch": 0.87, + "grad_norm": 1.0259094490019722, + "learning_rate": 8.515297389003063e-07, + "loss": 0.4782, + "step": 10725 + }, + { + "epoch": 0.87, + "grad_norm": 0.8670612352353387, + "learning_rate": 8.504670619561983e-07, + "loss": 0.4417, + "step": 10726 + }, + { + "epoch": 0.87, + "grad_norm": 0.9064697350293002, + "learning_rate": 8.494050190666103e-07, + "loss": 0.4855, + "step": 10727 + }, + { + "epoch": 0.87, + "grad_norm": 0.9116173287893347, + "learning_rate": 8.483436103051357e-07, + "loss": 0.4691, + "step": 10728 + }, + { + "epoch": 0.87, + "grad_norm": 0.9671185448154255, + "learning_rate": 8.472828357453323e-07, + "loss": 0.5249, + "step": 10729 + }, + { + "epoch": 0.87, + "grad_norm": 0.9073917664980375, + "learning_rate": 8.4622269546071e-07, + "loss": 0.4435, + "step": 10730 + }, + { + "epoch": 0.87, + "grad_norm": 0.9893708961286303, + "learning_rate": 8.45163189524737e-07, + "loss": 0.5178, + "step": 10731 + }, + { + "epoch": 0.87, + "grad_norm": 0.8057742513191817, + "learning_rate": 8.441043180108299e-07, + "loss": 0.4719, + "step": 10732 + }, + { + "epoch": 0.87, + "grad_norm": 0.9424113091164239, + "learning_rate": 8.430460809923768e-07, + "loss": 0.4641, + "step": 10733 + }, + { + "epoch": 0.87, + "grad_norm": 1.0577210260964864, + "learning_rate": 8.419884785427068e-07, + "loss": 0.5043, + "step": 10734 + }, + { + "epoch": 0.87, + "grad_norm": 1.017229119006042, + "learning_rate": 8.409315107351112e-07, + "loss": 0.4864, + "step": 10735 + }, + { + "epoch": 0.87, + "grad_norm": 0.9160592998089141, + "learning_rate": 8.398751776428393e-07, + "loss": 0.483, + "step": 10736 + }, + { + "epoch": 0.87, + "grad_norm": 0.8600297093969423, + "learning_rate": 8.388194793390924e-07, + "loss": 0.4128, + "step": 10737 + }, + { + "epoch": 0.87, + "grad_norm": 0.8505559599580625, + "learning_rate": 8.377644158970277e-07, + "loss": 0.4854, + "step": 10738 + }, + { + "epoch": 0.87, + "grad_norm": 0.9108433349901007, + "learning_rate": 8.367099873897644e-07, + "loss": 0.541, + "step": 10739 + }, + { + "epoch": 0.87, + "grad_norm": 0.9776569227810032, + "learning_rate": 8.356561938903707e-07, + "loss": 0.5111, + "step": 10740 + }, + { + "epoch": 0.87, + "grad_norm": 0.9698205117012005, + "learning_rate": 8.346030354718727e-07, + "loss": 0.4707, + "step": 10741 + }, + { + "epoch": 0.87, + "grad_norm": 0.9094940212730316, + "learning_rate": 8.335505122072551e-07, + "loss": 0.4906, + "step": 10742 + }, + { + "epoch": 0.87, + "grad_norm": 0.9679125119758466, + "learning_rate": 8.324986241694566e-07, + "loss": 0.4892, + "step": 10743 + }, + { + "epoch": 0.87, + "grad_norm": 0.9341757886112941, + "learning_rate": 8.31447371431372e-07, + "loss": 0.4337, + "step": 10744 + }, + { + "epoch": 0.87, + "grad_norm": 0.8865816044503693, + "learning_rate": 8.303967540658531e-07, + "loss": 0.4954, + "step": 10745 + }, + { + "epoch": 0.87, + "grad_norm": 1.0152423724483588, + "learning_rate": 8.293467721457038e-07, + "loss": 0.5048, + "step": 10746 + }, + { + "epoch": 0.87, + "grad_norm": 1.0778633743848953, + "learning_rate": 8.282974257436904e-07, + "loss": 0.4798, + "step": 10747 + }, + { + "epoch": 0.87, + "grad_norm": 0.9664900899900147, + "learning_rate": 8.272487149325281e-07, + "loss": 0.504, + "step": 10748 + }, + { + "epoch": 0.87, + "grad_norm": 0.8752434555018468, + "learning_rate": 8.262006397848954e-07, + "loss": 0.4622, + "step": 10749 + }, + { + "epoch": 0.87, + "grad_norm": 0.9312621506287185, + "learning_rate": 8.251532003734197e-07, + "loss": 0.4799, + "step": 10750 + }, + { + "epoch": 0.87, + "grad_norm": 0.9890867364594251, + "learning_rate": 8.24106396770692e-07, + "loss": 0.4658, + "step": 10751 + }, + { + "epoch": 0.87, + "grad_norm": 0.7633149847762869, + "learning_rate": 8.230602290492485e-07, + "loss": 0.4521, + "step": 10752 + }, + { + "epoch": 0.87, + "grad_norm": 0.8720687155099099, + "learning_rate": 8.220146972815946e-07, + "loss": 0.4463, + "step": 10753 + }, + { + "epoch": 0.87, + "grad_norm": 0.9359147457315855, + "learning_rate": 8.209698015401791e-07, + "loss": 0.4866, + "step": 10754 + }, + { + "epoch": 0.87, + "grad_norm": 0.958715743537337, + "learning_rate": 8.19925541897415e-07, + "loss": 0.5369, + "step": 10755 + }, + { + "epoch": 0.87, + "grad_norm": 1.0407037298595796, + "learning_rate": 8.188819184256669e-07, + "loss": 0.5594, + "step": 10756 + }, + { + "epoch": 0.87, + "grad_norm": 0.9444534547445447, + "learning_rate": 8.178389311972612e-07, + "loss": 0.5005, + "step": 10757 + }, + { + "epoch": 0.87, + "grad_norm": 0.8801319988525427, + "learning_rate": 8.167965802844691e-07, + "loss": 0.4225, + "step": 10758 + }, + { + "epoch": 0.87, + "grad_norm": 0.9247134193102192, + "learning_rate": 8.157548657595327e-07, + "loss": 0.4827, + "step": 10759 + }, + { + "epoch": 0.87, + "grad_norm": 0.9280867544437789, + "learning_rate": 8.147137876946354e-07, + "loss": 0.427, + "step": 10760 + }, + { + "epoch": 0.87, + "grad_norm": 0.9742366224152857, + "learning_rate": 8.136733461619251e-07, + "loss": 0.4937, + "step": 10761 + }, + { + "epoch": 0.87, + "grad_norm": 0.9089698408617726, + "learning_rate": 8.12633541233504e-07, + "loss": 0.4541, + "step": 10762 + }, + { + "epoch": 0.87, + "grad_norm": 0.9281472717487014, + "learning_rate": 8.11594372981429e-07, + "loss": 0.4826, + "step": 10763 + }, + { + "epoch": 0.87, + "grad_norm": 0.8543302871144066, + "learning_rate": 8.105558414777137e-07, + "loss": 0.4622, + "step": 10764 + }, + { + "epoch": 0.87, + "grad_norm": 0.8930667784194314, + "learning_rate": 8.095179467943293e-07, + "loss": 0.4421, + "step": 10765 + }, + { + "epoch": 0.88, + "grad_norm": 1.0264206424491298, + "learning_rate": 8.084806890031982e-07, + "loss": 0.5471, + "step": 10766 + }, + { + "epoch": 0.88, + "grad_norm": 0.8440403105832182, + "learning_rate": 8.074440681762019e-07, + "loss": 0.4352, + "step": 10767 + }, + { + "epoch": 0.88, + "grad_norm": 0.9838278479205321, + "learning_rate": 8.064080843851785e-07, + "loss": 0.5155, + "step": 10768 + }, + { + "epoch": 0.88, + "grad_norm": 0.9654762427606757, + "learning_rate": 8.053727377019194e-07, + "loss": 0.5047, + "step": 10769 + }, + { + "epoch": 0.88, + "grad_norm": 0.9212823039571989, + "learning_rate": 8.043380281981739e-07, + "loss": 0.4555, + "step": 10770 + }, + { + "epoch": 0.88, + "grad_norm": 0.9962002914394492, + "learning_rate": 8.03303955945649e-07, + "loss": 0.4671, + "step": 10771 + }, + { + "epoch": 0.88, + "grad_norm": 0.9551262436521267, + "learning_rate": 8.022705210159997e-07, + "loss": 0.4842, + "step": 10772 + }, + { + "epoch": 0.88, + "grad_norm": 1.0214526838284614, + "learning_rate": 8.012377234808455e-07, + "loss": 0.4988, + "step": 10773 + }, + { + "epoch": 0.88, + "grad_norm": 0.8715570587673264, + "learning_rate": 8.002055634117578e-07, + "loss": 0.4734, + "step": 10774 + }, + { + "epoch": 0.88, + "grad_norm": 0.9266589001004784, + "learning_rate": 7.991740408802651e-07, + "loss": 0.5139, + "step": 10775 + }, + { + "epoch": 0.88, + "grad_norm": 1.0169422673348818, + "learning_rate": 7.98143155957849e-07, + "loss": 0.4969, + "step": 10776 + }, + { + "epoch": 0.88, + "grad_norm": 1.0076439713948204, + "learning_rate": 7.971129087159524e-07, + "loss": 0.5081, + "step": 10777 + }, + { + "epoch": 0.88, + "grad_norm": 0.9002053839249186, + "learning_rate": 7.960832992259671e-07, + "loss": 0.4021, + "step": 10778 + }, + { + "epoch": 0.88, + "grad_norm": 0.9091524190557226, + "learning_rate": 7.950543275592449e-07, + "loss": 0.4644, + "step": 10779 + }, + { + "epoch": 0.88, + "grad_norm": 1.0333079754352386, + "learning_rate": 7.94025993787092e-07, + "loss": 0.514, + "step": 10780 + }, + { + "epoch": 0.88, + "grad_norm": 0.9823506631557626, + "learning_rate": 7.929982979807738e-07, + "loss": 0.5355, + "step": 10781 + }, + { + "epoch": 0.88, + "grad_norm": 0.9000299533512557, + "learning_rate": 7.919712402115054e-07, + "loss": 0.422, + "step": 10782 + }, + { + "epoch": 0.88, + "grad_norm": 1.0293709792663799, + "learning_rate": 7.909448205504633e-07, + "loss": 0.536, + "step": 10783 + }, + { + "epoch": 0.88, + "grad_norm": 0.9110279158047219, + "learning_rate": 7.899190390687783e-07, + "loss": 0.4659, + "step": 10784 + }, + { + "epoch": 0.88, + "grad_norm": 0.9492518309308532, + "learning_rate": 7.888938958375325e-07, + "loss": 0.5341, + "step": 10785 + }, + { + "epoch": 0.88, + "grad_norm": 1.0609084182976722, + "learning_rate": 7.878693909277702e-07, + "loss": 0.5124, + "step": 10786 + }, + { + "epoch": 0.88, + "grad_norm": 1.0009821719336907, + "learning_rate": 7.868455244104878e-07, + "loss": 0.4542, + "step": 10787 + }, + { + "epoch": 0.88, + "grad_norm": 0.8843299754398897, + "learning_rate": 7.858222963566386e-07, + "loss": 0.4965, + "step": 10788 + }, + { + "epoch": 0.88, + "grad_norm": 0.8676271737361042, + "learning_rate": 7.847997068371305e-07, + "loss": 0.4224, + "step": 10789 + }, + { + "epoch": 0.88, + "grad_norm": 0.9568586304329556, + "learning_rate": 7.83777755922831e-07, + "loss": 0.4841, + "step": 10790 + }, + { + "epoch": 0.88, + "grad_norm": 0.9568289014292817, + "learning_rate": 7.827564436845569e-07, + "loss": 0.4305, + "step": 10791 + }, + { + "epoch": 0.88, + "grad_norm": 0.9007924177189064, + "learning_rate": 7.81735770193085e-07, + "loss": 0.4777, + "step": 10792 + }, + { + "epoch": 0.88, + "grad_norm": 1.0356572627261904, + "learning_rate": 7.807157355191475e-07, + "loss": 0.4975, + "step": 10793 + }, + { + "epoch": 0.88, + "grad_norm": 0.9153262416116543, + "learning_rate": 7.796963397334323e-07, + "loss": 0.4281, + "step": 10794 + }, + { + "epoch": 0.88, + "grad_norm": 0.9513672979130858, + "learning_rate": 7.786775829065829e-07, + "loss": 0.5165, + "step": 10795 + }, + { + "epoch": 0.88, + "grad_norm": 0.9632117362919128, + "learning_rate": 7.776594651091995e-07, + "loss": 0.487, + "step": 10796 + }, + { + "epoch": 0.88, + "grad_norm": 0.9401247195442137, + "learning_rate": 7.766419864118325e-07, + "loss": 0.4191, + "step": 10797 + }, + { + "epoch": 0.88, + "grad_norm": 0.9176285200237425, + "learning_rate": 7.756251468849951e-07, + "loss": 0.4436, + "step": 10798 + }, + { + "epoch": 0.88, + "grad_norm": 0.8611310925495618, + "learning_rate": 7.746089465991525e-07, + "loss": 0.4971, + "step": 10799 + }, + { + "epoch": 0.88, + "grad_norm": 0.9305324159779296, + "learning_rate": 7.735933856247269e-07, + "loss": 0.4685, + "step": 10800 + }, + { + "epoch": 0.88, + "grad_norm": 0.9107641464674445, + "learning_rate": 7.725784640320966e-07, + "loss": 0.4931, + "step": 10801 + }, + { + "epoch": 0.88, + "grad_norm": 0.9806018240071644, + "learning_rate": 7.715641818915953e-07, + "loss": 0.5458, + "step": 10802 + }, + { + "epoch": 0.88, + "grad_norm": 0.9257241322988893, + "learning_rate": 7.70550539273508e-07, + "loss": 0.4582, + "step": 10803 + }, + { + "epoch": 0.88, + "grad_norm": 0.8949923140018721, + "learning_rate": 7.695375362480839e-07, + "loss": 0.4443, + "step": 10804 + }, + { + "epoch": 0.88, + "grad_norm": 1.0077955943455112, + "learning_rate": 7.685251728855203e-07, + "loss": 0.4722, + "step": 10805 + }, + { + "epoch": 0.88, + "grad_norm": 0.8865636886945156, + "learning_rate": 7.675134492559733e-07, + "loss": 0.5064, + "step": 10806 + }, + { + "epoch": 0.88, + "grad_norm": 1.0190987369655813, + "learning_rate": 7.665023654295556e-07, + "loss": 0.4824, + "step": 10807 + }, + { + "epoch": 0.88, + "grad_norm": 0.9215282547212966, + "learning_rate": 7.654919214763357e-07, + "loss": 0.4839, + "step": 10808 + }, + { + "epoch": 0.88, + "grad_norm": 0.9555787188807393, + "learning_rate": 7.644821174663308e-07, + "loss": 0.4842, + "step": 10809 + }, + { + "epoch": 0.88, + "grad_norm": 0.9429919388180836, + "learning_rate": 7.634729534695273e-07, + "loss": 0.4577, + "step": 10810 + }, + { + "epoch": 0.88, + "grad_norm": 0.8458292575931327, + "learning_rate": 7.624644295558525e-07, + "loss": 0.4069, + "step": 10811 + }, + { + "epoch": 0.88, + "grad_norm": 0.8661683221124166, + "learning_rate": 7.614565457952005e-07, + "loss": 0.4705, + "step": 10812 + }, + { + "epoch": 0.88, + "grad_norm": 0.9333855764767786, + "learning_rate": 7.604493022574144e-07, + "loss": 0.5074, + "step": 10813 + }, + { + "epoch": 0.88, + "grad_norm": 0.9956363320104379, + "learning_rate": 7.594426990122972e-07, + "loss": 0.5214, + "step": 10814 + }, + { + "epoch": 0.88, + "grad_norm": 0.9448774180808857, + "learning_rate": 7.58436736129603e-07, + "loss": 0.4514, + "step": 10815 + }, + { + "epoch": 0.88, + "grad_norm": 0.9361065086180625, + "learning_rate": 7.574314136790472e-07, + "loss": 0.4398, + "step": 10816 + }, + { + "epoch": 0.88, + "grad_norm": 0.9122194932866072, + "learning_rate": 7.564267317302965e-07, + "loss": 0.4273, + "step": 10817 + }, + { + "epoch": 0.88, + "grad_norm": 0.9753113897717414, + "learning_rate": 7.554226903529726e-07, + "loss": 0.5101, + "step": 10818 + }, + { + "epoch": 0.88, + "grad_norm": 0.914937109857151, + "learning_rate": 7.544192896166569e-07, + "loss": 0.4923, + "step": 10819 + }, + { + "epoch": 0.88, + "grad_norm": 0.9201451087784769, + "learning_rate": 7.534165295908857e-07, + "loss": 0.4767, + "step": 10820 + }, + { + "epoch": 0.88, + "grad_norm": 0.9650755957588284, + "learning_rate": 7.524144103451436e-07, + "loss": 0.4629, + "step": 10821 + }, + { + "epoch": 0.88, + "grad_norm": 0.9972999225253916, + "learning_rate": 7.514129319488839e-07, + "loss": 0.5472, + "step": 10822 + }, + { + "epoch": 0.88, + "grad_norm": 0.9633231367308103, + "learning_rate": 7.504120944715021e-07, + "loss": 0.4474, + "step": 10823 + }, + { + "epoch": 0.88, + "grad_norm": 0.9708221061585173, + "learning_rate": 7.494118979823584e-07, + "loss": 0.4915, + "step": 10824 + }, + { + "epoch": 0.88, + "grad_norm": 0.9672049816339459, + "learning_rate": 7.48412342550765e-07, + "loss": 0.5398, + "step": 10825 + }, + { + "epoch": 0.88, + "grad_norm": 0.8233298296033982, + "learning_rate": 7.47413428245991e-07, + "loss": 0.4419, + "step": 10826 + }, + { + "epoch": 0.88, + "grad_norm": 0.9569979497855227, + "learning_rate": 7.464151551372567e-07, + "loss": 0.4963, + "step": 10827 + }, + { + "epoch": 0.88, + "grad_norm": 0.9372064091819019, + "learning_rate": 7.454175232937478e-07, + "loss": 0.557, + "step": 10828 + }, + { + "epoch": 0.88, + "grad_norm": 0.9857819829803073, + "learning_rate": 7.444205327845932e-07, + "loss": 0.5188, + "step": 10829 + }, + { + "epoch": 0.88, + "grad_norm": 0.8485091146600624, + "learning_rate": 7.43424183678887e-07, + "loss": 0.445, + "step": 10830 + }, + { + "epoch": 0.88, + "grad_norm": 0.9093988479334427, + "learning_rate": 7.424284760456734e-07, + "loss": 0.4602, + "step": 10831 + }, + { + "epoch": 0.88, + "grad_norm": 0.8697372110048283, + "learning_rate": 7.414334099539577e-07, + "loss": 0.3928, + "step": 10832 + }, + { + "epoch": 0.88, + "grad_norm": 0.8500637707605192, + "learning_rate": 7.404389854726901e-07, + "loss": 0.4605, + "step": 10833 + }, + { + "epoch": 0.88, + "grad_norm": 0.9450728224522065, + "learning_rate": 7.39445202670791e-07, + "loss": 0.495, + "step": 10834 + }, + { + "epoch": 0.88, + "grad_norm": 0.9922786195641765, + "learning_rate": 7.384520616171232e-07, + "loss": 0.5178, + "step": 10835 + }, + { + "epoch": 0.88, + "grad_norm": 0.9618426009645406, + "learning_rate": 7.374595623805137e-07, + "loss": 0.4966, + "step": 10836 + }, + { + "epoch": 0.88, + "grad_norm": 0.9462859467040637, + "learning_rate": 7.364677050297398e-07, + "loss": 0.4739, + "step": 10837 + }, + { + "epoch": 0.88, + "grad_norm": 1.0227678882461722, + "learning_rate": 7.354764896335398e-07, + "loss": 0.5576, + "step": 10838 + }, + { + "epoch": 0.88, + "grad_norm": 0.9980851593236497, + "learning_rate": 7.344859162605966e-07, + "loss": 0.4946, + "step": 10839 + }, + { + "epoch": 0.88, + "grad_norm": 0.9237865868634112, + "learning_rate": 7.334959849795653e-07, + "loss": 0.4839, + "step": 10840 + }, + { + "epoch": 0.88, + "grad_norm": 0.9178993647703343, + "learning_rate": 7.32506695859041e-07, + "loss": 0.4641, + "step": 10841 + }, + { + "epoch": 0.88, + "grad_norm": 0.9519479260351561, + "learning_rate": 7.315180489675822e-07, + "loss": 0.5189, + "step": 10842 + }, + { + "epoch": 0.88, + "grad_norm": 0.960758292084623, + "learning_rate": 7.305300443737018e-07, + "loss": 0.5433, + "step": 10843 + }, + { + "epoch": 0.88, + "grad_norm": 0.9801777325896154, + "learning_rate": 7.295426821458684e-07, + "loss": 0.5057, + "step": 10844 + }, + { + "epoch": 0.88, + "grad_norm": 0.9286446248352245, + "learning_rate": 7.285559623525018e-07, + "loss": 0.4899, + "step": 10845 + }, + { + "epoch": 0.88, + "grad_norm": 0.9399120756588593, + "learning_rate": 7.275698850619861e-07, + "loss": 0.5041, + "step": 10846 + }, + { + "epoch": 0.88, + "grad_norm": 0.8796425834099239, + "learning_rate": 7.265844503426512e-07, + "loss": 0.4751, + "step": 10847 + }, + { + "epoch": 0.88, + "grad_norm": 0.9478441410449189, + "learning_rate": 7.255996582627878e-07, + "loss": 0.4731, + "step": 10848 + }, + { + "epoch": 0.88, + "grad_norm": 0.9602936406666883, + "learning_rate": 7.246155088906426e-07, + "loss": 0.4869, + "step": 10849 + }, + { + "epoch": 0.88, + "grad_norm": 0.8380944081829118, + "learning_rate": 7.236320022944166e-07, + "loss": 0.4285, + "step": 10850 + }, + { + "epoch": 0.88, + "grad_norm": 1.0144426336924712, + "learning_rate": 7.226491385422618e-07, + "loss": 0.5351, + "step": 10851 + }, + { + "epoch": 0.88, + "grad_norm": 0.9866428772036293, + "learning_rate": 7.21666917702295e-07, + "loss": 0.4802, + "step": 10852 + }, + { + "epoch": 0.88, + "grad_norm": 1.0833877967513377, + "learning_rate": 7.206853398425806e-07, + "loss": 0.4986, + "step": 10853 + }, + { + "epoch": 0.88, + "grad_norm": 0.9699105979255526, + "learning_rate": 7.197044050311408e-07, + "loss": 0.495, + "step": 10854 + }, + { + "epoch": 0.88, + "grad_norm": 0.8461947804009808, + "learning_rate": 7.187241133359535e-07, + "loss": 0.4701, + "step": 10855 + }, + { + "epoch": 0.88, + "grad_norm": 1.0087068121306357, + "learning_rate": 7.177444648249554e-07, + "loss": 0.5119, + "step": 10856 + }, + { + "epoch": 0.88, + "grad_norm": 0.927842032367001, + "learning_rate": 7.167654595660279e-07, + "loss": 0.4756, + "step": 10857 + }, + { + "epoch": 0.88, + "grad_norm": 1.0954507707339836, + "learning_rate": 7.157870976270243e-07, + "loss": 0.5273, + "step": 10858 + }, + { + "epoch": 0.88, + "grad_norm": 0.9649063436888304, + "learning_rate": 7.148093790757371e-07, + "loss": 0.5375, + "step": 10859 + }, + { + "epoch": 0.88, + "grad_norm": 0.8600627356479995, + "learning_rate": 7.138323039799256e-07, + "loss": 0.4304, + "step": 10860 + }, + { + "epoch": 0.88, + "grad_norm": 1.0326935851609784, + "learning_rate": 7.128558724072976e-07, + "loss": 0.489, + "step": 10861 + }, + { + "epoch": 0.88, + "grad_norm": 0.8968902380842844, + "learning_rate": 7.118800844255214e-07, + "loss": 0.5305, + "step": 10862 + }, + { + "epoch": 0.88, + "grad_norm": 0.9482496308476169, + "learning_rate": 7.109049401022139e-07, + "loss": 0.5105, + "step": 10863 + }, + { + "epoch": 0.88, + "grad_norm": 1.0149047865729435, + "learning_rate": 7.099304395049566e-07, + "loss": 0.554, + "step": 10864 + }, + { + "epoch": 0.88, + "grad_norm": 0.9135791307678812, + "learning_rate": 7.089565827012801e-07, + "loss": 0.5188, + "step": 10865 + }, + { + "epoch": 0.88, + "grad_norm": 0.8845742531981535, + "learning_rate": 7.079833697586702e-07, + "loss": 0.4493, + "step": 10866 + }, + { + "epoch": 0.88, + "grad_norm": 1.0187836100942078, + "learning_rate": 7.070108007445708e-07, + "loss": 0.5208, + "step": 10867 + }, + { + "epoch": 0.88, + "grad_norm": 0.9641153712470332, + "learning_rate": 7.060388757263815e-07, + "loss": 0.4721, + "step": 10868 + }, + { + "epoch": 0.88, + "grad_norm": 0.9580963683240067, + "learning_rate": 7.050675947714514e-07, + "loss": 0.525, + "step": 10869 + }, + { + "epoch": 0.88, + "grad_norm": 0.9812100492397047, + "learning_rate": 7.040969579470947e-07, + "loss": 0.4649, + "step": 10870 + }, + { + "epoch": 0.88, + "grad_norm": 0.9725844565798529, + "learning_rate": 7.03126965320573e-07, + "loss": 0.5216, + "step": 10871 + }, + { + "epoch": 0.88, + "grad_norm": 1.0468445203398065, + "learning_rate": 7.02157616959106e-07, + "loss": 0.4856, + "step": 10872 + }, + { + "epoch": 0.88, + "grad_norm": 0.938268266463854, + "learning_rate": 7.011889129298688e-07, + "loss": 0.4724, + "step": 10873 + }, + { + "epoch": 0.88, + "grad_norm": 0.9344094084559463, + "learning_rate": 7.002208532999933e-07, + "loss": 0.4824, + "step": 10874 + }, + { + "epoch": 0.88, + "grad_norm": 1.0943727000052077, + "learning_rate": 6.992534381365612e-07, + "loss": 0.4795, + "step": 10875 + }, + { + "epoch": 0.88, + "grad_norm": 0.9361214667194621, + "learning_rate": 6.98286667506618e-07, + "loss": 0.46, + "step": 10876 + }, + { + "epoch": 0.88, + "grad_norm": 0.9742787944574746, + "learning_rate": 6.973205414771567e-07, + "loss": 0.4995, + "step": 10877 + }, + { + "epoch": 0.88, + "grad_norm": 0.9823861996132334, + "learning_rate": 6.963550601151326e-07, + "loss": 0.4549, + "step": 10878 + }, + { + "epoch": 0.88, + "grad_norm": 0.8710597836786871, + "learning_rate": 6.953902234874488e-07, + "loss": 0.4224, + "step": 10879 + }, + { + "epoch": 0.88, + "grad_norm": 0.9640184246732854, + "learning_rate": 6.944260316609696e-07, + "loss": 0.4772, + "step": 10880 + }, + { + "epoch": 0.88, + "grad_norm": 0.9443593319000072, + "learning_rate": 6.934624847025117e-07, + "loss": 0.4872, + "step": 10881 + }, + { + "epoch": 0.88, + "grad_norm": 0.9102355113413119, + "learning_rate": 6.924995826788516e-07, + "loss": 0.4886, + "step": 10882 + }, + { + "epoch": 0.88, + "grad_norm": 0.8545741021936641, + "learning_rate": 6.915373256567104e-07, + "loss": 0.4124, + "step": 10883 + }, + { + "epoch": 0.88, + "grad_norm": 0.8849429457331734, + "learning_rate": 6.905757137027791e-07, + "loss": 0.5111, + "step": 10884 + }, + { + "epoch": 0.88, + "grad_norm": 0.9274095690131027, + "learning_rate": 6.896147468836923e-07, + "loss": 0.4492, + "step": 10885 + }, + { + "epoch": 0.88, + "grad_norm": 0.9868491348465521, + "learning_rate": 6.886544252660455e-07, + "loss": 0.4611, + "step": 10886 + }, + { + "epoch": 0.88, + "grad_norm": 0.9705796300856186, + "learning_rate": 6.876947489163877e-07, + "loss": 0.5528, + "step": 10887 + }, + { + "epoch": 0.88, + "grad_norm": 0.8459196623914732, + "learning_rate": 6.867357179012257e-07, + "loss": 0.4765, + "step": 10888 + }, + { + "epoch": 0.88, + "grad_norm": 0.9718706349770099, + "learning_rate": 6.857773322870132e-07, + "loss": 0.5085, + "step": 10889 + }, + { + "epoch": 0.89, + "grad_norm": 1.0231757989355228, + "learning_rate": 6.848195921401745e-07, + "loss": 0.5608, + "step": 10890 + }, + { + "epoch": 0.89, + "grad_norm": 0.9926785324714231, + "learning_rate": 6.838624975270724e-07, + "loss": 0.556, + "step": 10891 + }, + { + "epoch": 0.89, + "grad_norm": 0.8648754199032377, + "learning_rate": 6.829060485140371e-07, + "loss": 0.4641, + "step": 10892 + }, + { + "epoch": 0.89, + "grad_norm": 0.9565565653738822, + "learning_rate": 6.819502451673477e-07, + "loss": 0.4483, + "step": 10893 + }, + { + "epoch": 0.89, + "grad_norm": 0.9210008914980209, + "learning_rate": 6.809950875532434e-07, + "loss": 0.4526, + "step": 10894 + }, + { + "epoch": 0.89, + "grad_norm": 0.9462179584061005, + "learning_rate": 6.800405757379103e-07, + "loss": 0.458, + "step": 10895 + }, + { + "epoch": 0.89, + "grad_norm": 0.9380696791158867, + "learning_rate": 6.790867097875009e-07, + "loss": 0.4847, + "step": 10896 + }, + { + "epoch": 0.89, + "grad_norm": 0.8742693957607353, + "learning_rate": 6.781334897681136e-07, + "loss": 0.4233, + "step": 10897 + }, + { + "epoch": 0.89, + "grad_norm": 0.9531969331088594, + "learning_rate": 6.771809157458076e-07, + "loss": 0.4273, + "step": 10898 + }, + { + "epoch": 0.89, + "grad_norm": 0.9797729723207618, + "learning_rate": 6.762289877865946e-07, + "loss": 0.5301, + "step": 10899 + }, + { + "epoch": 0.89, + "grad_norm": 0.9302885601403638, + "learning_rate": 6.752777059564431e-07, + "loss": 0.498, + "step": 10900 + }, + { + "epoch": 0.89, + "grad_norm": 0.9756389310003124, + "learning_rate": 6.743270703212734e-07, + "loss": 0.5063, + "step": 10901 + }, + { + "epoch": 0.89, + "grad_norm": 0.8954492108953634, + "learning_rate": 6.733770809469686e-07, + "loss": 0.4318, + "step": 10902 + }, + { + "epoch": 0.89, + "grad_norm": 0.9195584154690314, + "learning_rate": 6.724277378993582e-07, + "loss": 0.463, + "step": 10903 + }, + { + "epoch": 0.89, + "grad_norm": 0.8998700767219938, + "learning_rate": 6.714790412442318e-07, + "loss": 0.4945, + "step": 10904 + }, + { + "epoch": 0.89, + "grad_norm": 0.9843924532722907, + "learning_rate": 6.705309910473334e-07, + "loss": 0.5438, + "step": 10905 + }, + { + "epoch": 0.89, + "grad_norm": 0.8902657261878095, + "learning_rate": 6.695835873743639e-07, + "loss": 0.4437, + "step": 10906 + }, + { + "epoch": 0.89, + "grad_norm": 0.9797169219451506, + "learning_rate": 6.686368302909729e-07, + "loss": 0.5289, + "step": 10907 + }, + { + "epoch": 0.89, + "grad_norm": 1.074011462740834, + "learning_rate": 6.676907198627758e-07, + "loss": 0.5319, + "step": 10908 + }, + { + "epoch": 0.89, + "grad_norm": 0.8932783969377371, + "learning_rate": 6.667452561553312e-07, + "loss": 0.4721, + "step": 10909 + }, + { + "epoch": 0.89, + "grad_norm": 1.007817520165382, + "learning_rate": 6.658004392341633e-07, + "loss": 0.4764, + "step": 10910 + }, + { + "epoch": 0.89, + "grad_norm": 0.9753415517502435, + "learning_rate": 6.648562691647443e-07, + "loss": 0.5611, + "step": 10911 + }, + { + "epoch": 0.89, + "grad_norm": 0.8876119878062856, + "learning_rate": 6.639127460125061e-07, + "loss": 0.4456, + "step": 10912 + }, + { + "epoch": 0.89, + "grad_norm": 0.8825191330874997, + "learning_rate": 6.629698698428333e-07, + "loss": 0.4839, + "step": 10913 + }, + { + "epoch": 0.89, + "grad_norm": 0.8801074389167012, + "learning_rate": 6.620276407210691e-07, + "loss": 0.435, + "step": 10914 + }, + { + "epoch": 0.89, + "grad_norm": 0.907815847163721, + "learning_rate": 6.610860587125046e-07, + "loss": 0.4488, + "step": 10915 + }, + { + "epoch": 0.89, + "grad_norm": 0.8857465115695343, + "learning_rate": 6.60145123882392e-07, + "loss": 0.4601, + "step": 10916 + }, + { + "epoch": 0.89, + "grad_norm": 1.0385760901105907, + "learning_rate": 6.59204836295938e-07, + "loss": 0.5081, + "step": 10917 + }, + { + "epoch": 0.89, + "grad_norm": 0.8568978773404449, + "learning_rate": 6.582651960183039e-07, + "loss": 0.4356, + "step": 10918 + }, + { + "epoch": 0.89, + "grad_norm": 0.988066516943962, + "learning_rate": 6.573262031146055e-07, + "loss": 0.4722, + "step": 10919 + }, + { + "epoch": 0.89, + "grad_norm": 0.8581986976490972, + "learning_rate": 6.563878576499161e-07, + "loss": 0.4413, + "step": 10920 + }, + { + "epoch": 0.89, + "grad_norm": 0.7929721822827478, + "learning_rate": 6.554501596892582e-07, + "loss": 0.4118, + "step": 10921 + }, + { + "epoch": 0.89, + "grad_norm": 0.9392852715664914, + "learning_rate": 6.545131092976165e-07, + "loss": 0.5108, + "step": 10922 + }, + { + "epoch": 0.89, + "grad_norm": 0.8630871527253957, + "learning_rate": 6.535767065399268e-07, + "loss": 0.4296, + "step": 10923 + }, + { + "epoch": 0.89, + "grad_norm": 0.9950951675346889, + "learning_rate": 6.526409514810805e-07, + "loss": 0.4728, + "step": 10924 + }, + { + "epoch": 0.89, + "grad_norm": 0.9887197876289022, + "learning_rate": 6.517058441859248e-07, + "loss": 0.4998, + "step": 10925 + }, + { + "epoch": 0.89, + "grad_norm": 0.9341212052113047, + "learning_rate": 6.507713847192643e-07, + "loss": 0.488, + "step": 10926 + }, + { + "epoch": 0.89, + "grad_norm": 0.9132481462189752, + "learning_rate": 6.498375731458529e-07, + "loss": 0.4746, + "step": 10927 + }, + { + "epoch": 0.89, + "grad_norm": 0.9152190176417002, + "learning_rate": 6.489044095304031e-07, + "loss": 0.4698, + "step": 10928 + }, + { + "epoch": 0.89, + "grad_norm": 0.90130991738369, + "learning_rate": 6.479718939375846e-07, + "loss": 0.4477, + "step": 10929 + }, + { + "epoch": 0.89, + "grad_norm": 0.91942422370572, + "learning_rate": 6.470400264320176e-07, + "loss": 0.5178, + "step": 10930 + }, + { + "epoch": 0.89, + "grad_norm": 1.0685961246932574, + "learning_rate": 6.461088070782806e-07, + "loss": 0.5362, + "step": 10931 + }, + { + "epoch": 0.89, + "grad_norm": 0.8916758914035164, + "learning_rate": 6.451782359409076e-07, + "loss": 0.4689, + "step": 10932 + }, + { + "epoch": 0.89, + "grad_norm": 0.872337058372946, + "learning_rate": 6.442483130843857e-07, + "loss": 0.4741, + "step": 10933 + }, + { + "epoch": 0.89, + "grad_norm": 0.9024681459121816, + "learning_rate": 6.433190385731558e-07, + "loss": 0.4499, + "step": 10934 + }, + { + "epoch": 0.89, + "grad_norm": 0.9169060857895663, + "learning_rate": 6.423904124716174e-07, + "loss": 0.4825, + "step": 10935 + }, + { + "epoch": 0.89, + "grad_norm": 0.9640055684093934, + "learning_rate": 6.414624348441223e-07, + "loss": 0.4722, + "step": 10936 + }, + { + "epoch": 0.89, + "grad_norm": 0.9129689547723554, + "learning_rate": 6.405351057549803e-07, + "loss": 0.4385, + "step": 10937 + }, + { + "epoch": 0.89, + "grad_norm": 1.0296726654633954, + "learning_rate": 6.396084252684532e-07, + "loss": 0.4515, + "step": 10938 + }, + { + "epoch": 0.89, + "grad_norm": 0.9326947083487568, + "learning_rate": 6.386823934487619e-07, + "loss": 0.5332, + "step": 10939 + }, + { + "epoch": 0.89, + "grad_norm": 0.9960249084282743, + "learning_rate": 6.377570103600749e-07, + "loss": 0.4267, + "step": 10940 + }, + { + "epoch": 0.89, + "grad_norm": 0.8830118434965389, + "learning_rate": 6.368322760665235e-07, + "loss": 0.467, + "step": 10941 + }, + { + "epoch": 0.89, + "grad_norm": 0.9758683488076908, + "learning_rate": 6.359081906321896e-07, + "loss": 0.4982, + "step": 10942 + }, + { + "epoch": 0.89, + "grad_norm": 0.9488295398679685, + "learning_rate": 6.349847541211119e-07, + "loss": 0.4827, + "step": 10943 + }, + { + "epoch": 0.89, + "grad_norm": 0.9750251787938423, + "learning_rate": 6.340619665972847e-07, + "loss": 0.46, + "step": 10944 + }, + { + "epoch": 0.89, + "grad_norm": 1.0234616225302677, + "learning_rate": 6.33139828124657e-07, + "loss": 0.4685, + "step": 10945 + }, + { + "epoch": 0.89, + "grad_norm": 1.0455466925977317, + "learning_rate": 6.322183387671299e-07, + "loss": 0.4855, + "step": 10946 + }, + { + "epoch": 0.89, + "grad_norm": 0.9754433315723913, + "learning_rate": 6.312974985885612e-07, + "loss": 0.4757, + "step": 10947 + }, + { + "epoch": 0.89, + "grad_norm": 0.9357797846488201, + "learning_rate": 6.303773076527663e-07, + "loss": 0.5077, + "step": 10948 + }, + { + "epoch": 0.89, + "grad_norm": 0.9014709161998172, + "learning_rate": 6.294577660235146e-07, + "loss": 0.4161, + "step": 10949 + }, + { + "epoch": 0.89, + "grad_norm": 0.968122195587275, + "learning_rate": 6.28538873764526e-07, + "loss": 0.413, + "step": 10950 + }, + { + "epoch": 0.89, + "grad_norm": 0.9321219084135877, + "learning_rate": 6.276206309394839e-07, + "loss": 0.4747, + "step": 10951 + }, + { + "epoch": 0.89, + "grad_norm": 0.9857430350810453, + "learning_rate": 6.267030376120154e-07, + "loss": 0.4877, + "step": 10952 + }, + { + "epoch": 0.89, + "grad_norm": 0.9257006654534886, + "learning_rate": 6.25786093845715e-07, + "loss": 0.5431, + "step": 10953 + }, + { + "epoch": 0.89, + "grad_norm": 0.7813395537425469, + "learning_rate": 6.248697997041219e-07, + "loss": 0.3824, + "step": 10954 + }, + { + "epoch": 0.89, + "grad_norm": 0.9357085513534893, + "learning_rate": 6.239541552507367e-07, + "loss": 0.5074, + "step": 10955 + }, + { + "epoch": 0.89, + "grad_norm": 0.9781819541718835, + "learning_rate": 6.230391605490105e-07, + "loss": 0.5044, + "step": 10956 + }, + { + "epoch": 0.89, + "grad_norm": 1.0309509193357862, + "learning_rate": 6.22124815662356e-07, + "loss": 0.5121, + "step": 10957 + }, + { + "epoch": 0.89, + "grad_norm": 1.0824573399034947, + "learning_rate": 6.212111206541305e-07, + "loss": 0.5119, + "step": 10958 + }, + { + "epoch": 0.89, + "grad_norm": 1.0262200241800767, + "learning_rate": 6.202980755876575e-07, + "loss": 0.4679, + "step": 10959 + }, + { + "epoch": 0.89, + "grad_norm": 0.9675571655609572, + "learning_rate": 6.193856805262078e-07, + "loss": 0.483, + "step": 10960 + }, + { + "epoch": 0.89, + "grad_norm": 0.9213920082584263, + "learning_rate": 6.184739355330083e-07, + "loss": 0.5297, + "step": 10961 + }, + { + "epoch": 0.89, + "grad_norm": 1.052124394282512, + "learning_rate": 6.175628406712452e-07, + "loss": 0.4136, + "step": 10962 + }, + { + "epoch": 0.89, + "grad_norm": 0.9134209043859236, + "learning_rate": 6.166523960040549e-07, + "loss": 0.4796, + "step": 10963 + }, + { + "epoch": 0.89, + "grad_norm": 1.0368144330888827, + "learning_rate": 6.157426015945289e-07, + "loss": 0.5225, + "step": 10964 + }, + { + "epoch": 0.89, + "grad_norm": 1.0664674535151522, + "learning_rate": 6.148334575057191e-07, + "loss": 0.5076, + "step": 10965 + }, + { + "epoch": 0.89, + "grad_norm": 1.0175371255955596, + "learning_rate": 6.139249638006251e-07, + "loss": 0.5053, + "step": 10966 + }, + { + "epoch": 0.89, + "grad_norm": 1.0233060979369943, + "learning_rate": 6.130171205422053e-07, + "loss": 0.4673, + "step": 10967 + }, + { + "epoch": 0.89, + "grad_norm": 0.9631441370141315, + "learning_rate": 6.121099277933728e-07, + "loss": 0.5094, + "step": 10968 + }, + { + "epoch": 0.89, + "grad_norm": 1.0057396520217246, + "learning_rate": 6.112033856169974e-07, + "loss": 0.5465, + "step": 10969 + }, + { + "epoch": 0.89, + "grad_norm": 0.9561951029569171, + "learning_rate": 6.102974940758954e-07, + "loss": 0.4974, + "step": 10970 + }, + { + "epoch": 0.89, + "grad_norm": 0.9866723996930895, + "learning_rate": 6.093922532328522e-07, + "loss": 0.542, + "step": 10971 + }, + { + "epoch": 0.89, + "grad_norm": 1.052357825958954, + "learning_rate": 6.084876631505943e-07, + "loss": 0.5748, + "step": 10972 + }, + { + "epoch": 0.89, + "grad_norm": 0.8457013897677658, + "learning_rate": 6.075837238918114e-07, + "loss": 0.4256, + "step": 10973 + }, + { + "epoch": 0.89, + "grad_norm": 0.9843655807602537, + "learning_rate": 6.066804355191458e-07, + "loss": 0.5103, + "step": 10974 + }, + { + "epoch": 0.89, + "grad_norm": 0.9114613086221101, + "learning_rate": 6.057777980951951e-07, + "loss": 0.3927, + "step": 10975 + }, + { + "epoch": 0.89, + "grad_norm": 1.0110177998447731, + "learning_rate": 6.048758116825071e-07, + "loss": 0.5294, + "step": 10976 + }, + { + "epoch": 0.89, + "grad_norm": 0.9426252423474122, + "learning_rate": 6.039744763435951e-07, + "loss": 0.4731, + "step": 10977 + }, + { + "epoch": 0.89, + "grad_norm": 0.9087938185398369, + "learning_rate": 6.030737921409169e-07, + "loss": 0.4269, + "step": 10978 + }, + { + "epoch": 0.89, + "grad_norm": 0.8901526154211044, + "learning_rate": 6.021737591368892e-07, + "loss": 0.483, + "step": 10979 + }, + { + "epoch": 0.89, + "grad_norm": 0.929054298926748, + "learning_rate": 6.012743773938845e-07, + "loss": 0.4684, + "step": 10980 + }, + { + "epoch": 0.89, + "grad_norm": 0.9998305319817405, + "learning_rate": 6.003756469742294e-07, + "loss": 0.5043, + "step": 10981 + }, + { + "epoch": 0.89, + "grad_norm": 0.9654263645064304, + "learning_rate": 5.994775679402021e-07, + "loss": 0.4799, + "step": 10982 + }, + { + "epoch": 0.89, + "grad_norm": 0.8812144427476013, + "learning_rate": 5.985801403540436e-07, + "loss": 0.4791, + "step": 10983 + }, + { + "epoch": 0.89, + "grad_norm": 1.0297036512654214, + "learning_rate": 5.976833642779422e-07, + "loss": 0.4931, + "step": 10984 + }, + { + "epoch": 0.89, + "grad_norm": 1.0959168985236405, + "learning_rate": 5.967872397740427e-07, + "loss": 0.5191, + "step": 10985 + }, + { + "epoch": 0.89, + "grad_norm": 0.9548852649181496, + "learning_rate": 5.958917669044472e-07, + "loss": 0.4726, + "step": 10986 + }, + { + "epoch": 0.89, + "grad_norm": 0.8860636695272686, + "learning_rate": 5.949969457312122e-07, + "loss": 0.4133, + "step": 10987 + }, + { + "epoch": 0.89, + "grad_norm": 0.9551069857313534, + "learning_rate": 5.941027763163432e-07, + "loss": 0.4779, + "step": 10988 + }, + { + "epoch": 0.89, + "grad_norm": 1.9587762764673151, + "learning_rate": 5.93209258721812e-07, + "loss": 0.4449, + "step": 10989 + }, + { + "epoch": 0.89, + "grad_norm": 0.9764154005326386, + "learning_rate": 5.923163930095344e-07, + "loss": 0.5057, + "step": 10990 + }, + { + "epoch": 0.89, + "grad_norm": 0.8694608537022765, + "learning_rate": 5.914241792413855e-07, + "loss": 0.433, + "step": 10991 + }, + { + "epoch": 0.89, + "grad_norm": 0.8938395567692832, + "learning_rate": 5.905326174791959e-07, + "loss": 0.4547, + "step": 10992 + }, + { + "epoch": 0.89, + "grad_norm": 0.8700693290069454, + "learning_rate": 5.896417077847505e-07, + "loss": 0.443, + "step": 10993 + }, + { + "epoch": 0.89, + "grad_norm": 0.9365351744115846, + "learning_rate": 5.887514502197855e-07, + "loss": 0.5051, + "step": 10994 + }, + { + "epoch": 0.89, + "grad_norm": 0.953988434394032, + "learning_rate": 5.878618448460005e-07, + "loss": 0.4766, + "step": 10995 + }, + { + "epoch": 0.89, + "grad_norm": 1.0255533396332113, + "learning_rate": 5.869728917250394e-07, + "loss": 0.4966, + "step": 10996 + }, + { + "epoch": 0.89, + "grad_norm": 0.9351106480000976, + "learning_rate": 5.860845909185076e-07, + "loss": 0.4979, + "step": 10997 + }, + { + "epoch": 0.89, + "grad_norm": 0.9267295875852679, + "learning_rate": 5.851969424879633e-07, + "loss": 0.5132, + "step": 10998 + }, + { + "epoch": 0.89, + "grad_norm": 1.044249424301423, + "learning_rate": 5.843099464949198e-07, + "loss": 0.5185, + "step": 10999 + }, + { + "epoch": 0.89, + "grad_norm": 1.064022787085525, + "learning_rate": 5.834236030008455e-07, + "loss": 0.5138, + "step": 11000 + }, + { + "epoch": 0.89, + "grad_norm": 0.8423978228749698, + "learning_rate": 5.825379120671649e-07, + "loss": 0.4502, + "step": 11001 + }, + { + "epoch": 0.89, + "grad_norm": 0.8854906110383404, + "learning_rate": 5.816528737552496e-07, + "loss": 0.4603, + "step": 11002 + }, + { + "epoch": 0.89, + "grad_norm": 0.9662435120986029, + "learning_rate": 5.807684881264397e-07, + "loss": 0.4929, + "step": 11003 + }, + { + "epoch": 0.89, + "grad_norm": 0.8918757606853558, + "learning_rate": 5.798847552420184e-07, + "loss": 0.4759, + "step": 11004 + }, + { + "epoch": 0.89, + "grad_norm": 0.9781143034468275, + "learning_rate": 5.790016751632266e-07, + "loss": 0.529, + "step": 11005 + }, + { + "epoch": 0.89, + "grad_norm": 0.9408427539253995, + "learning_rate": 5.781192479512621e-07, + "loss": 0.4402, + "step": 11006 + }, + { + "epoch": 0.89, + "grad_norm": 0.9169600366711169, + "learning_rate": 5.772374736672793e-07, + "loss": 0.4747, + "step": 11007 + }, + { + "epoch": 0.89, + "grad_norm": 0.9420777741526181, + "learning_rate": 5.763563523723769e-07, + "loss": 0.4748, + "step": 11008 + }, + { + "epoch": 0.89, + "grad_norm": 0.9565403624864097, + "learning_rate": 5.754758841276243e-07, + "loss": 0.4469, + "step": 11009 + }, + { + "epoch": 0.89, + "grad_norm": 0.991885868237849, + "learning_rate": 5.745960689940322e-07, + "loss": 0.4439, + "step": 11010 + }, + { + "epoch": 0.89, + "grad_norm": 0.9151544363228036, + "learning_rate": 5.73716907032572e-07, + "loss": 0.4787, + "step": 11011 + }, + { + "epoch": 0.89, + "grad_norm": 0.8132533514402628, + "learning_rate": 5.728383983041696e-07, + "loss": 0.5255, + "step": 11012 + }, + { + "epoch": 0.9, + "grad_norm": 0.9310339443213517, + "learning_rate": 5.719605428697051e-07, + "loss": 0.5024, + "step": 11013 + }, + { + "epoch": 0.9, + "grad_norm": 0.8574936208717879, + "learning_rate": 5.710833407900096e-07, + "loss": 0.433, + "step": 11014 + }, + { + "epoch": 0.9, + "grad_norm": 0.8741036577067072, + "learning_rate": 5.70206792125878e-07, + "loss": 0.4794, + "step": 11015 + }, + { + "epoch": 0.9, + "grad_norm": 0.967796820842037, + "learning_rate": 5.693308969380495e-07, + "loss": 0.479, + "step": 11016 + }, + { + "epoch": 0.9, + "grad_norm": 0.9932310538013736, + "learning_rate": 5.684556552872256e-07, + "loss": 0.4961, + "step": 11017 + }, + { + "epoch": 0.9, + "grad_norm": 0.9871267665159187, + "learning_rate": 5.675810672340587e-07, + "loss": 0.4598, + "step": 11018 + }, + { + "epoch": 0.9, + "grad_norm": 0.986056563416071, + "learning_rate": 5.667071328391593e-07, + "loss": 0.532, + "step": 11019 + }, + { + "epoch": 0.9, + "grad_norm": 0.9476270127764385, + "learning_rate": 5.658338521630846e-07, + "loss": 0.4759, + "step": 11020 + }, + { + "epoch": 0.9, + "grad_norm": 0.9657676173085136, + "learning_rate": 5.649612252663583e-07, + "loss": 0.5025, + "step": 11021 + }, + { + "epoch": 0.9, + "grad_norm": 0.8858286395635273, + "learning_rate": 5.640892522094499e-07, + "loss": 0.434, + "step": 11022 + }, + { + "epoch": 0.9, + "grad_norm": 0.9552774366100201, + "learning_rate": 5.632179330527865e-07, + "loss": 0.4871, + "step": 11023 + }, + { + "epoch": 0.9, + "grad_norm": 0.9143189633790539, + "learning_rate": 5.623472678567498e-07, + "loss": 0.4927, + "step": 11024 + }, + { + "epoch": 0.9, + "grad_norm": 0.8714387741551404, + "learning_rate": 5.614772566816773e-07, + "loss": 0.4832, + "step": 11025 + }, + { + "epoch": 0.9, + "grad_norm": 0.8858930867826196, + "learning_rate": 5.606078995878562e-07, + "loss": 0.4106, + "step": 11026 + }, + { + "epoch": 0.9, + "grad_norm": 0.9568935096034147, + "learning_rate": 5.597391966355381e-07, + "loss": 0.4394, + "step": 11027 + }, + { + "epoch": 0.9, + "grad_norm": 1.0728162873381937, + "learning_rate": 5.588711478849185e-07, + "loss": 0.5342, + "step": 11028 + }, + { + "epoch": 0.9, + "grad_norm": 0.9180312152603842, + "learning_rate": 5.580037533961546e-07, + "loss": 0.4655, + "step": 11029 + }, + { + "epoch": 0.9, + "grad_norm": 0.9517370360129557, + "learning_rate": 5.571370132293552e-07, + "loss": 0.4426, + "step": 11030 + }, + { + "epoch": 0.9, + "grad_norm": 0.9945553716176935, + "learning_rate": 5.562709274445866e-07, + "loss": 0.5017, + "step": 11031 + }, + { + "epoch": 0.9, + "grad_norm": 0.8730934042078565, + "learning_rate": 5.554054961018628e-07, + "loss": 0.4803, + "step": 11032 + }, + { + "epoch": 0.9, + "grad_norm": 0.9385185317818835, + "learning_rate": 5.54540719261164e-07, + "loss": 0.441, + "step": 11033 + }, + { + "epoch": 0.9, + "grad_norm": 0.9249336081996793, + "learning_rate": 5.536765969824132e-07, + "loss": 0.4769, + "step": 11034 + }, + { + "epoch": 0.9, + "grad_norm": 0.9050719472637894, + "learning_rate": 5.528131293254957e-07, + "loss": 0.4702, + "step": 11035 + }, + { + "epoch": 0.9, + "grad_norm": 0.9136625775583729, + "learning_rate": 5.519503163502493e-07, + "loss": 0.4389, + "step": 11036 + }, + { + "epoch": 0.9, + "grad_norm": 0.9086878888383371, + "learning_rate": 5.510881581164662e-07, + "loss": 0.3929, + "step": 11037 + }, + { + "epoch": 0.9, + "grad_norm": 0.92276744581274, + "learning_rate": 5.502266546838897e-07, + "loss": 0.5428, + "step": 11038 + }, + { + "epoch": 0.9, + "grad_norm": 0.8805940036916493, + "learning_rate": 5.493658061122276e-07, + "loss": 0.4426, + "step": 11039 + }, + { + "epoch": 0.9, + "grad_norm": 0.9771763240276137, + "learning_rate": 5.48505612461131e-07, + "loss": 0.4668, + "step": 11040 + }, + { + "epoch": 0.9, + "grad_norm": 0.8734235333940492, + "learning_rate": 5.476460737902111e-07, + "loss": 0.4037, + "step": 11041 + }, + { + "epoch": 0.9, + "grad_norm": 1.0466353923440146, + "learning_rate": 5.467871901590349e-07, + "loss": 0.5033, + "step": 11042 + }, + { + "epoch": 0.9, + "grad_norm": 0.9785685435203199, + "learning_rate": 5.459289616271224e-07, + "loss": 0.4902, + "step": 11043 + }, + { + "epoch": 0.9, + "grad_norm": 0.9503035291633423, + "learning_rate": 5.450713882539449e-07, + "loss": 0.4766, + "step": 11044 + }, + { + "epoch": 0.9, + "grad_norm": 1.0429553331877732, + "learning_rate": 5.44214470098936e-07, + "loss": 0.5244, + "step": 11045 + }, + { + "epoch": 0.9, + "grad_norm": 0.97021397045449, + "learning_rate": 5.43358207221476e-07, + "loss": 0.5031, + "step": 11046 + }, + { + "epoch": 0.9, + "grad_norm": 0.9445804788065731, + "learning_rate": 5.425025996809042e-07, + "loss": 0.4727, + "step": 11047 + }, + { + "epoch": 0.9, + "grad_norm": 0.9205907205100843, + "learning_rate": 5.416476475365129e-07, + "loss": 0.4863, + "step": 11048 + }, + { + "epoch": 0.9, + "grad_norm": 1.3195746670830866, + "learning_rate": 5.407933508475515e-07, + "loss": 0.5233, + "step": 11049 + }, + { + "epoch": 0.9, + "grad_norm": 0.9794249235949587, + "learning_rate": 5.399397096732184e-07, + "loss": 0.5045, + "step": 11050 + }, + { + "epoch": 0.9, + "grad_norm": 0.894756142856474, + "learning_rate": 5.39086724072675e-07, + "loss": 0.3943, + "step": 11051 + }, + { + "epoch": 0.9, + "grad_norm": 0.9204342660541922, + "learning_rate": 5.382343941050272e-07, + "loss": 0.471, + "step": 11052 + }, + { + "epoch": 0.9, + "grad_norm": 0.9181966409748372, + "learning_rate": 5.373827198293446e-07, + "loss": 0.4943, + "step": 11053 + }, + { + "epoch": 0.9, + "grad_norm": 0.9282760705343676, + "learning_rate": 5.365317013046456e-07, + "loss": 0.4279, + "step": 11054 + }, + { + "epoch": 0.9, + "grad_norm": 0.9893124938884263, + "learning_rate": 5.356813385899074e-07, + "loss": 0.5161, + "step": 11055 + }, + { + "epoch": 0.9, + "grad_norm": 0.9524158665869225, + "learning_rate": 5.348316317440549e-07, + "loss": 0.4846, + "step": 11056 + }, + { + "epoch": 0.9, + "grad_norm": 0.9733809276947557, + "learning_rate": 5.339825808259779e-07, + "loss": 0.4934, + "step": 11057 + }, + { + "epoch": 0.9, + "grad_norm": 0.9904644382388924, + "learning_rate": 5.331341858945094e-07, + "loss": 0.5697, + "step": 11058 + }, + { + "epoch": 0.9, + "grad_norm": 0.9217495342550637, + "learning_rate": 5.322864470084455e-07, + "loss": 0.4578, + "step": 11059 + }, + { + "epoch": 0.9, + "grad_norm": 0.864877204773377, + "learning_rate": 5.314393642265314e-07, + "loss": 0.4253, + "step": 11060 + }, + { + "epoch": 0.9, + "grad_norm": 0.981346430166337, + "learning_rate": 5.305929376074725e-07, + "loss": 0.5363, + "step": 11061 + }, + { + "epoch": 0.9, + "grad_norm": 0.972993618315643, + "learning_rate": 5.29747167209923e-07, + "loss": 0.4663, + "step": 11062 + }, + { + "epoch": 0.9, + "grad_norm": 0.8667091636509169, + "learning_rate": 5.28902053092496e-07, + "loss": 0.4178, + "step": 11063 + }, + { + "epoch": 0.9, + "grad_norm": 0.8573274759690804, + "learning_rate": 5.280575953137545e-07, + "loss": 0.4511, + "step": 11064 + }, + { + "epoch": 0.9, + "grad_norm": 0.9976398282977128, + "learning_rate": 5.272137939322208e-07, + "loss": 0.487, + "step": 11065 + }, + { + "epoch": 0.9, + "grad_norm": 0.9493121201956117, + "learning_rate": 5.26370649006368e-07, + "loss": 0.4588, + "step": 11066 + }, + { + "epoch": 0.9, + "grad_norm": 0.8258826117106118, + "learning_rate": 5.255281605946261e-07, + "loss": 0.4348, + "step": 11067 + }, + { + "epoch": 0.9, + "grad_norm": 0.9915494613066878, + "learning_rate": 5.246863287553794e-07, + "loss": 0.4855, + "step": 11068 + }, + { + "epoch": 0.9, + "grad_norm": 0.9091035541846134, + "learning_rate": 5.238451535469658e-07, + "loss": 0.4527, + "step": 11069 + }, + { + "epoch": 0.9, + "grad_norm": 0.8570386649899411, + "learning_rate": 5.230046350276774e-07, + "loss": 0.4344, + "step": 11070 + }, + { + "epoch": 0.9, + "grad_norm": 0.9304912751627228, + "learning_rate": 5.221647732557611e-07, + "loss": 0.5141, + "step": 11071 + }, + { + "epoch": 0.9, + "grad_norm": 1.0122734919257559, + "learning_rate": 5.213255682894192e-07, + "loss": 0.483, + "step": 11072 + }, + { + "epoch": 0.9, + "grad_norm": 0.9107296385416488, + "learning_rate": 5.204870201868084e-07, + "loss": 0.4708, + "step": 11073 + }, + { + "epoch": 0.9, + "grad_norm": 0.9662778251197135, + "learning_rate": 5.19649129006039e-07, + "loss": 0.5313, + "step": 11074 + }, + { + "epoch": 0.9, + "grad_norm": 0.9539025751341297, + "learning_rate": 5.188118948051768e-07, + "loss": 0.4911, + "step": 11075 + }, + { + "epoch": 0.9, + "grad_norm": 0.9027565677142386, + "learning_rate": 5.179753176422386e-07, + "loss": 0.4842, + "step": 11076 + }, + { + "epoch": 0.9, + "grad_norm": 1.100122923406752, + "learning_rate": 5.171393975752015e-07, + "loss": 0.5235, + "step": 11077 + }, + { + "epoch": 0.9, + "grad_norm": 0.9943585472020804, + "learning_rate": 5.163041346619913e-07, + "loss": 0.4866, + "step": 11078 + }, + { + "epoch": 0.9, + "grad_norm": 0.9150676597442815, + "learning_rate": 5.154695289604938e-07, + "loss": 0.4607, + "step": 11079 + }, + { + "epoch": 0.9, + "grad_norm": 0.8694083085530301, + "learning_rate": 5.146355805285452e-07, + "loss": 0.465, + "step": 11080 + }, + { + "epoch": 0.9, + "grad_norm": 1.134853604728181, + "learning_rate": 5.138022894239369e-07, + "loss": 0.5574, + "step": 11081 + }, + { + "epoch": 0.9, + "grad_norm": 0.8853817334786239, + "learning_rate": 5.129696557044173e-07, + "loss": 0.4293, + "step": 11082 + }, + { + "epoch": 0.9, + "grad_norm": 0.98048161606491, + "learning_rate": 5.121376794276834e-07, + "loss": 0.5062, + "step": 11083 + }, + { + "epoch": 0.9, + "grad_norm": 0.9456961383840721, + "learning_rate": 5.113063606513935e-07, + "loss": 0.4381, + "step": 11084 + }, + { + "epoch": 0.9, + "grad_norm": 0.9519112022167273, + "learning_rate": 5.104756994331561e-07, + "loss": 0.4312, + "step": 11085 + }, + { + "epoch": 0.9, + "grad_norm": 0.948613829358276, + "learning_rate": 5.096456958305351e-07, + "loss": 0.4807, + "step": 11086 + }, + { + "epoch": 0.9, + "grad_norm": 0.9351289986800121, + "learning_rate": 5.088163499010502e-07, + "loss": 0.5065, + "step": 11087 + }, + { + "epoch": 0.9, + "grad_norm": 0.8956220061900297, + "learning_rate": 5.07987661702174e-07, + "loss": 0.4606, + "step": 11088 + }, + { + "epoch": 0.9, + "grad_norm": 0.9545684634715169, + "learning_rate": 5.071596312913329e-07, + "loss": 0.4764, + "step": 11089 + }, + { + "epoch": 0.9, + "grad_norm": 1.012292041949086, + "learning_rate": 5.06332258725909e-07, + "loss": 0.4834, + "step": 11090 + }, + { + "epoch": 0.9, + "grad_norm": 0.9546015528272861, + "learning_rate": 5.055055440632384e-07, + "loss": 0.4927, + "step": 11091 + }, + { + "epoch": 0.9, + "grad_norm": 0.8922578203787093, + "learning_rate": 5.046794873606131e-07, + "loss": 0.464, + "step": 11092 + }, + { + "epoch": 0.9, + "grad_norm": 1.002581031774825, + "learning_rate": 5.038540886752752e-07, + "loss": 0.5291, + "step": 11093 + }, + { + "epoch": 0.9, + "grad_norm": 0.956594306713076, + "learning_rate": 5.030293480644289e-07, + "loss": 0.5018, + "step": 11094 + }, + { + "epoch": 0.9, + "grad_norm": 1.0593410366903622, + "learning_rate": 5.022052655852228e-07, + "loss": 0.554, + "step": 11095 + }, + { + "epoch": 0.9, + "grad_norm": 0.9620155031743906, + "learning_rate": 5.013818412947669e-07, + "loss": 0.4683, + "step": 11096 + }, + { + "epoch": 0.9, + "grad_norm": 0.9369348023729992, + "learning_rate": 5.005590752501244e-07, + "loss": 0.4979, + "step": 11097 + }, + { + "epoch": 0.9, + "grad_norm": 0.9417310972907301, + "learning_rate": 4.997369675083131e-07, + "loss": 0.4617, + "step": 11098 + }, + { + "epoch": 0.9, + "grad_norm": 0.934368976889229, + "learning_rate": 4.989155181263017e-07, + "loss": 0.487, + "step": 11099 + }, + { + "epoch": 0.9, + "grad_norm": 0.9313517760063982, + "learning_rate": 4.980947271610192e-07, + "loss": 0.4887, + "step": 11100 + }, + { + "epoch": 0.9, + "grad_norm": 0.9351666968503773, + "learning_rate": 4.972745946693414e-07, + "loss": 0.4626, + "step": 11101 + }, + { + "epoch": 0.9, + "grad_norm": 0.9262804697487227, + "learning_rate": 4.964551207081081e-07, + "loss": 0.4518, + "step": 11102 + }, + { + "epoch": 0.9, + "grad_norm": 0.9592488579633498, + "learning_rate": 4.95636305334104e-07, + "loss": 0.5235, + "step": 11103 + }, + { + "epoch": 0.9, + "grad_norm": 0.99004950050435, + "learning_rate": 4.948181486040737e-07, + "loss": 0.4683, + "step": 11104 + }, + { + "epoch": 0.9, + "grad_norm": 0.9846886660796365, + "learning_rate": 4.940006505747142e-07, + "loss": 0.513, + "step": 11105 + }, + { + "epoch": 0.9, + "grad_norm": 0.9454931036011615, + "learning_rate": 4.931838113026798e-07, + "loss": 0.541, + "step": 11106 + }, + { + "epoch": 0.9, + "grad_norm": 0.920001078733727, + "learning_rate": 4.923676308445713e-07, + "loss": 0.5109, + "step": 11107 + }, + { + "epoch": 0.9, + "grad_norm": 0.9192704813594433, + "learning_rate": 4.915521092569553e-07, + "loss": 0.4743, + "step": 11108 + }, + { + "epoch": 0.9, + "grad_norm": 0.955057808863621, + "learning_rate": 4.907372465963434e-07, + "loss": 0.4339, + "step": 11109 + }, + { + "epoch": 0.9, + "grad_norm": 0.9254482314436216, + "learning_rate": 4.899230429192059e-07, + "loss": 0.466, + "step": 11110 + }, + { + "epoch": 0.9, + "grad_norm": 0.8747271004206972, + "learning_rate": 4.891094982819656e-07, + "loss": 0.4546, + "step": 11111 + }, + { + "epoch": 0.9, + "grad_norm": 0.9775214852283891, + "learning_rate": 4.882966127410016e-07, + "loss": 0.4646, + "step": 11112 + }, + { + "epoch": 0.9, + "grad_norm": 1.0915192222629713, + "learning_rate": 4.874843863526435e-07, + "loss": 0.5919, + "step": 11113 + }, + { + "epoch": 0.9, + "grad_norm": 0.990407126815721, + "learning_rate": 4.866728191731829e-07, + "loss": 0.4546, + "step": 11114 + }, + { + "epoch": 0.9, + "grad_norm": 0.9709797991935482, + "learning_rate": 4.858619112588559e-07, + "loss": 0.4967, + "step": 11115 + }, + { + "epoch": 0.9, + "grad_norm": 0.9241489813877177, + "learning_rate": 4.850516626658585e-07, + "loss": 0.4624, + "step": 11116 + }, + { + "epoch": 0.9, + "grad_norm": 0.9445547169009616, + "learning_rate": 4.842420734503428e-07, + "loss": 0.48, + "step": 11117 + }, + { + "epoch": 0.9, + "grad_norm": 0.9599734450815728, + "learning_rate": 4.834331436684114e-07, + "loss": 0.4661, + "step": 11118 + }, + { + "epoch": 0.9, + "grad_norm": 0.9320491380802918, + "learning_rate": 4.826248733761185e-07, + "loss": 0.5014, + "step": 11119 + }, + { + "epoch": 0.9, + "grad_norm": 0.964224195814961, + "learning_rate": 4.818172626294837e-07, + "loss": 0.4735, + "step": 11120 + }, + { + "epoch": 0.9, + "grad_norm": 0.9579047810051492, + "learning_rate": 4.810103114844688e-07, + "loss": 0.4381, + "step": 11121 + }, + { + "epoch": 0.9, + "grad_norm": 0.8961668308564504, + "learning_rate": 4.802040199969959e-07, + "loss": 0.462, + "step": 11122 + }, + { + "epoch": 0.9, + "grad_norm": 0.9865262747521256, + "learning_rate": 4.793983882229402e-07, + "loss": 0.4953, + "step": 11123 + }, + { + "epoch": 0.9, + "grad_norm": 1.106608268670239, + "learning_rate": 4.785934162181305e-07, + "loss": 0.572, + "step": 11124 + }, + { + "epoch": 0.9, + "grad_norm": 0.928105720357525, + "learning_rate": 4.777891040383531e-07, + "loss": 0.4759, + "step": 11125 + }, + { + "epoch": 0.9, + "grad_norm": 0.9034973838841738, + "learning_rate": 4.769854517393447e-07, + "loss": 0.4502, + "step": 11126 + }, + { + "epoch": 0.9, + "grad_norm": 0.9056139419125202, + "learning_rate": 4.761824593767961e-07, + "loss": 0.4717, + "step": 11127 + }, + { + "epoch": 0.9, + "grad_norm": 0.91672933848264, + "learning_rate": 4.753801270063574e-07, + "loss": 0.4453, + "step": 11128 + }, + { + "epoch": 0.9, + "grad_norm": 0.9868633642457375, + "learning_rate": 4.7457845468362627e-07, + "loss": 0.522, + "step": 11129 + }, + { + "epoch": 0.9, + "grad_norm": 0.9925761450136723, + "learning_rate": 4.7377744246415837e-07, + "loss": 0.5019, + "step": 11130 + }, + { + "epoch": 0.9, + "grad_norm": 0.8302684218548028, + "learning_rate": 4.7297709040346474e-07, + "loss": 0.466, + "step": 11131 + }, + { + "epoch": 0.9, + "grad_norm": 0.8809889972200473, + "learning_rate": 4.7217739855700995e-07, + "loss": 0.4608, + "step": 11132 + }, + { + "epoch": 0.9, + "grad_norm": 0.949125194133086, + "learning_rate": 4.7137836698020747e-07, + "loss": 0.5073, + "step": 11133 + }, + { + "epoch": 0.9, + "grad_norm": 0.9828922076326778, + "learning_rate": 4.7057999572843516e-07, + "loss": 0.5245, + "step": 11134 + }, + { + "epoch": 0.9, + "grad_norm": 0.9807408745494204, + "learning_rate": 4.6978228485701437e-07, + "loss": 0.4997, + "step": 11135 + }, + { + "epoch": 0.91, + "grad_norm": 0.8783668016785334, + "learning_rate": 4.689852344212287e-07, + "loss": 0.416, + "step": 11136 + }, + { + "epoch": 0.91, + "grad_norm": 0.8545384196573856, + "learning_rate": 4.681888444763116e-07, + "loss": 0.4716, + "step": 11137 + }, + { + "epoch": 0.91, + "grad_norm": 1.006594980432949, + "learning_rate": 4.673931150774547e-07, + "loss": 0.5269, + "step": 11138 + }, + { + "epoch": 0.91, + "grad_norm": 0.9954424066558223, + "learning_rate": 4.6659804627979697e-07, + "loss": 0.5066, + "step": 11139 + }, + { + "epoch": 0.91, + "grad_norm": 0.9063227086751301, + "learning_rate": 4.658036381384412e-07, + "loss": 0.4942, + "step": 11140 + }, + { + "epoch": 0.91, + "grad_norm": 0.871552804947988, + "learning_rate": 4.650098907084355e-07, + "loss": 0.4441, + "step": 11141 + }, + { + "epoch": 0.91, + "grad_norm": 1.0236407870819795, + "learning_rate": 4.6421680404478587e-07, + "loss": 0.5124, + "step": 11142 + }, + { + "epoch": 0.91, + "grad_norm": 0.824417457432014, + "learning_rate": 4.634243782024539e-07, + "loss": 0.4079, + "step": 11143 + }, + { + "epoch": 0.91, + "grad_norm": 1.133583472750711, + "learning_rate": 4.6263261323635455e-07, + "loss": 0.5321, + "step": 11144 + }, + { + "epoch": 0.91, + "grad_norm": 0.8843697218228761, + "learning_rate": 4.6184150920135395e-07, + "loss": 0.4356, + "step": 11145 + }, + { + "epoch": 0.91, + "grad_norm": 0.9000063294594467, + "learning_rate": 4.610510661522782e-07, + "loss": 0.4835, + "step": 11146 + }, + { + "epoch": 0.91, + "grad_norm": 0.9434582772507041, + "learning_rate": 4.602612841439014e-07, + "loss": 0.5203, + "step": 11147 + }, + { + "epoch": 0.91, + "grad_norm": 0.9692831264058703, + "learning_rate": 4.594721632309551e-07, + "loss": 0.4553, + "step": 11148 + }, + { + "epoch": 0.91, + "grad_norm": 0.9482907368711121, + "learning_rate": 4.5868370346812685e-07, + "loss": 0.4821, + "step": 11149 + }, + { + "epoch": 0.91, + "grad_norm": 0.848294913758441, + "learning_rate": 4.5789590491005507e-07, + "loss": 0.433, + "step": 11150 + }, + { + "epoch": 0.91, + "grad_norm": 0.9580137905855243, + "learning_rate": 4.571087676113306e-07, + "loss": 0.4262, + "step": 11151 + }, + { + "epoch": 0.91, + "grad_norm": 0.9885469522737916, + "learning_rate": 4.5632229162650755e-07, + "loss": 0.4879, + "step": 11152 + }, + { + "epoch": 0.91, + "grad_norm": 0.9540913556936466, + "learning_rate": 4.555364770100823e-07, + "loss": 0.4256, + "step": 11153 + }, + { + "epoch": 0.91, + "grad_norm": 0.901228943473374, + "learning_rate": 4.5475132381651356e-07, + "loss": 0.5005, + "step": 11154 + }, + { + "epoch": 0.91, + "grad_norm": 0.8633625027678152, + "learning_rate": 4.5396683210021107e-07, + "loss": 0.4495, + "step": 11155 + }, + { + "epoch": 0.91, + "grad_norm": 0.975255504720242, + "learning_rate": 4.531830019155425e-07, + "loss": 0.5066, + "step": 11156 + }, + { + "epoch": 0.91, + "grad_norm": 0.9191865185057262, + "learning_rate": 4.5239983331682e-07, + "loss": 0.4824, + "step": 11157 + }, + { + "epoch": 0.91, + "grad_norm": 0.9105743658096604, + "learning_rate": 4.516173263583234e-07, + "loss": 0.475, + "step": 11158 + }, + { + "epoch": 0.91, + "grad_norm": 1.0033483384909352, + "learning_rate": 4.50835481094275e-07, + "loss": 0.518, + "step": 11159 + }, + { + "epoch": 0.91, + "grad_norm": 0.9174121202867608, + "learning_rate": 4.50054297578858e-07, + "loss": 0.5079, + "step": 11160 + }, + { + "epoch": 0.91, + "grad_norm": 0.9138216995553686, + "learning_rate": 4.492737758662069e-07, + "loss": 0.4926, + "step": 11161 + }, + { + "epoch": 0.91, + "grad_norm": 0.8698376707960717, + "learning_rate": 4.484939160104129e-07, + "loss": 0.478, + "step": 11162 + }, + { + "epoch": 0.91, + "grad_norm": 1.0407956612094187, + "learning_rate": 4.4771471806551614e-07, + "loss": 0.5134, + "step": 11163 + }, + { + "epoch": 0.91, + "grad_norm": 0.9911654660963026, + "learning_rate": 4.469361820855189e-07, + "loss": 0.4979, + "step": 11164 + }, + { + "epoch": 0.91, + "grad_norm": 0.8965315448269694, + "learning_rate": 4.4615830812437035e-07, + "loss": 0.5161, + "step": 11165 + }, + { + "epoch": 0.91, + "grad_norm": 0.9476143911761191, + "learning_rate": 4.4538109623597617e-07, + "loss": 0.4384, + "step": 11166 + }, + { + "epoch": 0.91, + "grad_norm": 0.9106293415533426, + "learning_rate": 4.446045464741966e-07, + "loss": 0.4845, + "step": 11167 + }, + { + "epoch": 0.91, + "grad_norm": 0.9679582344937153, + "learning_rate": 4.4382865889284756e-07, + "loss": 0.5291, + "step": 11168 + }, + { + "epoch": 0.91, + "grad_norm": 1.003241904827064, + "learning_rate": 4.4305343354569483e-07, + "loss": 0.5008, + "step": 11169 + }, + { + "epoch": 0.91, + "grad_norm": 0.9200873262321915, + "learning_rate": 4.4227887048646335e-07, + "loss": 0.4728, + "step": 11170 + }, + { + "epoch": 0.91, + "grad_norm": 0.8862449874934346, + "learning_rate": 4.4150496976882783e-07, + "loss": 0.4437, + "step": 11171 + }, + { + "epoch": 0.91, + "grad_norm": 0.8813859946917624, + "learning_rate": 4.407317314464199e-07, + "loss": 0.4773, + "step": 11172 + }, + { + "epoch": 0.91, + "grad_norm": 0.9116257945503873, + "learning_rate": 4.399591555728233e-07, + "loss": 0.457, + "step": 11173 + }, + { + "epoch": 0.91, + "grad_norm": 0.8952447845796265, + "learning_rate": 4.391872422015786e-07, + "loss": 0.4996, + "step": 11174 + }, + { + "epoch": 0.91, + "grad_norm": 1.0264398330218911, + "learning_rate": 4.384159913861752e-07, + "loss": 0.5262, + "step": 11175 + }, + { + "epoch": 0.91, + "grad_norm": 0.8986090002767066, + "learning_rate": 4.3764540318006587e-07, + "loss": 0.5164, + "step": 11176 + }, + { + "epoch": 0.91, + "grad_norm": 0.8649641214293282, + "learning_rate": 4.368754776366457e-07, + "loss": 0.4174, + "step": 11177 + }, + { + "epoch": 0.91, + "grad_norm": 0.880916340853826, + "learning_rate": 4.3610621480927315e-07, + "loss": 0.4489, + "step": 11178 + }, + { + "epoch": 0.91, + "grad_norm": 0.9090094229803269, + "learning_rate": 4.3533761475125666e-07, + "loss": 0.4797, + "step": 11179 + }, + { + "epoch": 0.91, + "grad_norm": 0.9788562410334294, + "learning_rate": 4.3456967751586143e-07, + "loss": 0.4673, + "step": 11180 + }, + { + "epoch": 0.91, + "grad_norm": 1.0350007495095783, + "learning_rate": 4.338024031562993e-07, + "loss": 0.4968, + "step": 11181 + }, + { + "epoch": 0.91, + "grad_norm": 0.9862582443544196, + "learning_rate": 4.3303579172574884e-07, + "loss": 0.5074, + "step": 11182 + }, + { + "epoch": 0.91, + "grad_norm": 0.9358507276894988, + "learning_rate": 4.3226984327733093e-07, + "loss": 0.4914, + "step": 11183 + }, + { + "epoch": 0.91, + "grad_norm": 0.9753101507137873, + "learning_rate": 4.3150455786412526e-07, + "loss": 0.4711, + "step": 11184 + }, + { + "epoch": 0.91, + "grad_norm": 0.944450308660901, + "learning_rate": 4.3073993553916726e-07, + "loss": 0.4919, + "step": 11185 + }, + { + "epoch": 0.91, + "grad_norm": 0.8749586270650181, + "learning_rate": 4.2997597635544563e-07, + "loss": 0.4242, + "step": 11186 + }, + { + "epoch": 0.91, + "grad_norm": 0.9996020749611297, + "learning_rate": 4.292126803658969e-07, + "loss": 0.4821, + "step": 11187 + }, + { + "epoch": 0.91, + "grad_norm": 0.863509502257761, + "learning_rate": 4.2845004762342325e-07, + "loss": 0.459, + "step": 11188 + }, + { + "epoch": 0.91, + "grad_norm": 0.8681959186479662, + "learning_rate": 4.27688078180869e-07, + "loss": 0.4579, + "step": 11189 + }, + { + "epoch": 0.91, + "grad_norm": 0.8916377298074757, + "learning_rate": 4.269267720910419e-07, + "loss": 0.4552, + "step": 11190 + }, + { + "epoch": 0.91, + "grad_norm": 0.913625486076061, + "learning_rate": 4.2616612940669657e-07, + "loss": 0.4839, + "step": 11191 + }, + { + "epoch": 0.91, + "grad_norm": 0.9608375155236945, + "learning_rate": 4.254061501805484e-07, + "loss": 0.5321, + "step": 11192 + }, + { + "epoch": 0.91, + "grad_norm": 0.9390229983486676, + "learning_rate": 4.246468344652599e-07, + "loss": 0.5257, + "step": 11193 + }, + { + "epoch": 0.91, + "grad_norm": 1.028975729366445, + "learning_rate": 4.238881823134533e-07, + "loss": 0.4979, + "step": 11194 + }, + { + "epoch": 0.91, + "grad_norm": 0.873398927746758, + "learning_rate": 4.2313019377770104e-07, + "loss": 0.435, + "step": 11195 + }, + { + "epoch": 0.91, + "grad_norm": 0.9497373872488999, + "learning_rate": 4.223728689105322e-07, + "loss": 0.5039, + "step": 11196 + }, + { + "epoch": 0.91, + "grad_norm": 0.9519321022576744, + "learning_rate": 4.216162077644281e-07, + "loss": 0.5677, + "step": 11197 + }, + { + "epoch": 0.91, + "grad_norm": 0.8908282880627073, + "learning_rate": 4.208602103918258e-07, + "loss": 0.4711, + "step": 11198 + }, + { + "epoch": 0.91, + "grad_norm": 1.0498824109635534, + "learning_rate": 4.2010487684511105e-07, + "loss": 0.5055, + "step": 11199 + }, + { + "epoch": 0.91, + "grad_norm": 0.9252361422177733, + "learning_rate": 4.1935020717663423e-07, + "loss": 0.4992, + "step": 11200 + }, + { + "epoch": 0.91, + "grad_norm": 0.9402917514441262, + "learning_rate": 4.1859620143868793e-07, + "loss": 0.4984, + "step": 11201 + }, + { + "epoch": 0.91, + "grad_norm": 1.0012578522166926, + "learning_rate": 4.178428596835271e-07, + "loss": 0.4871, + "step": 11202 + }, + { + "epoch": 0.91, + "grad_norm": 0.872894891355588, + "learning_rate": 4.170901819633566e-07, + "loss": 0.4665, + "step": 11203 + }, + { + "epoch": 0.91, + "grad_norm": 0.977140130196723, + "learning_rate": 4.1633816833033804e-07, + "loss": 0.497, + "step": 11204 + }, + { + "epoch": 0.91, + "grad_norm": 1.0056578141335417, + "learning_rate": 4.155868188365808e-07, + "loss": 0.4943, + "step": 11205 + }, + { + "epoch": 0.91, + "grad_norm": 0.9423785526526659, + "learning_rate": 4.1483613353415775e-07, + "loss": 0.4813, + "step": 11206 + }, + { + "epoch": 0.91, + "grad_norm": 1.0023662451744835, + "learning_rate": 4.1408611247508723e-07, + "loss": 0.491, + "step": 11207 + }, + { + "epoch": 0.91, + "grad_norm": 1.0159130710754287, + "learning_rate": 4.133367557113477e-07, + "loss": 0.5373, + "step": 11208 + }, + { + "epoch": 0.91, + "grad_norm": 0.8312726422881775, + "learning_rate": 4.1258806329486644e-07, + "loss": 0.4175, + "step": 11209 + }, + { + "epoch": 0.91, + "grad_norm": 0.8091738993603446, + "learning_rate": 4.118400352775287e-07, + "loss": 0.4436, + "step": 11210 + }, + { + "epoch": 0.91, + "grad_norm": 0.8804197075085122, + "learning_rate": 4.1109267171117184e-07, + "loss": 0.468, + "step": 11211 + }, + { + "epoch": 0.91, + "grad_norm": 0.9020380820861756, + "learning_rate": 4.103459726475889e-07, + "loss": 0.4497, + "step": 11212 + }, + { + "epoch": 0.91, + "grad_norm": 0.9761918417426805, + "learning_rate": 4.095999381385229e-07, + "loss": 0.5338, + "step": 11213 + }, + { + "epoch": 0.91, + "grad_norm": 0.8782319385368993, + "learning_rate": 4.088545682356748e-07, + "loss": 0.433, + "step": 11214 + }, + { + "epoch": 0.91, + "grad_norm": 0.950696236898299, + "learning_rate": 4.0810986299069656e-07, + "loss": 0.436, + "step": 11215 + }, + { + "epoch": 0.91, + "grad_norm": 0.9189322841765247, + "learning_rate": 4.0736582245519795e-07, + "loss": 0.4815, + "step": 11216 + }, + { + "epoch": 0.91, + "grad_norm": 0.884327380500117, + "learning_rate": 4.066224466807389e-07, + "loss": 0.4702, + "step": 11217 + }, + { + "epoch": 0.91, + "grad_norm": 0.9787955704244248, + "learning_rate": 4.0587973571883596e-07, + "loss": 0.5555, + "step": 11218 + }, + { + "epoch": 0.91, + "grad_norm": 0.9858389648304431, + "learning_rate": 4.051376896209558e-07, + "loss": 0.5023, + "step": 11219 + }, + { + "epoch": 0.91, + "grad_norm": 0.9994956181583511, + "learning_rate": 4.0439630843852383e-07, + "loss": 0.5066, + "step": 11220 + }, + { + "epoch": 0.91, + "grad_norm": 1.0228356795696085, + "learning_rate": 4.0365559222291684e-07, + "loss": 0.4589, + "step": 11221 + }, + { + "epoch": 0.91, + "grad_norm": 0.9459753139302353, + "learning_rate": 4.029155410254637e-07, + "loss": 0.4816, + "step": 11222 + }, + { + "epoch": 0.91, + "grad_norm": 1.091750919560779, + "learning_rate": 4.021761548974523e-07, + "loss": 0.5559, + "step": 11223 + }, + { + "epoch": 0.91, + "grad_norm": 0.9724075012713982, + "learning_rate": 4.014374338901206e-07, + "loss": 0.4748, + "step": 11224 + }, + { + "epoch": 0.91, + "grad_norm": 1.006268228578613, + "learning_rate": 4.0069937805466084e-07, + "loss": 0.4083, + "step": 11225 + }, + { + "epoch": 0.91, + "grad_norm": 0.9153690149268117, + "learning_rate": 3.999619874422178e-07, + "loss": 0.4788, + "step": 11226 + }, + { + "epoch": 0.91, + "grad_norm": 0.9338564214866565, + "learning_rate": 3.99225262103895e-07, + "loss": 0.4906, + "step": 11227 + }, + { + "epoch": 0.91, + "grad_norm": 1.0277728033158247, + "learning_rate": 3.9848920209074606e-07, + "loss": 0.5594, + "step": 11228 + }, + { + "epoch": 0.91, + "grad_norm": 0.9886771525159102, + "learning_rate": 3.977538074537779e-07, + "loss": 0.494, + "step": 11229 + }, + { + "epoch": 0.91, + "grad_norm": 0.9988823203761128, + "learning_rate": 3.9701907824395426e-07, + "loss": 0.4573, + "step": 11230 + }, + { + "epoch": 0.91, + "grad_norm": 0.9668732903658148, + "learning_rate": 3.9628501451219106e-07, + "loss": 0.5663, + "step": 11231 + }, + { + "epoch": 0.91, + "grad_norm": 1.0333664090017052, + "learning_rate": 3.9555161630935756e-07, + "loss": 0.5688, + "step": 11232 + }, + { + "epoch": 0.91, + "grad_norm": 0.870329084161462, + "learning_rate": 3.9481888368627764e-07, + "loss": 0.4793, + "step": 11233 + }, + { + "epoch": 0.91, + "grad_norm": 0.926284708098634, + "learning_rate": 3.940868166937295e-07, + "loss": 0.4408, + "step": 11234 + }, + { + "epoch": 0.91, + "grad_norm": 0.9604043534785126, + "learning_rate": 3.933554153824437e-07, + "loss": 0.5178, + "step": 11235 + }, + { + "epoch": 0.91, + "grad_norm": 0.9889579675663253, + "learning_rate": 3.9262467980310747e-07, + "loss": 0.4689, + "step": 11236 + }, + { + "epoch": 0.91, + "grad_norm": 0.899283890650655, + "learning_rate": 3.918946100063603e-07, + "loss": 0.4952, + "step": 11237 + }, + { + "epoch": 0.91, + "grad_norm": 1.0361637174713196, + "learning_rate": 3.9116520604279285e-07, + "loss": 0.4996, + "step": 11238 + }, + { + "epoch": 0.91, + "grad_norm": 0.9260930406702779, + "learning_rate": 3.904364679629535e-07, + "loss": 0.442, + "step": 11239 + }, + { + "epoch": 0.91, + "grad_norm": 0.9405497553842672, + "learning_rate": 3.897083958173431e-07, + "loss": 0.4688, + "step": 11240 + }, + { + "epoch": 0.91, + "grad_norm": 1.039178970007914, + "learning_rate": 3.889809896564167e-07, + "loss": 0.5402, + "step": 11241 + }, + { + "epoch": 0.91, + "grad_norm": 0.8742429461856939, + "learning_rate": 3.88254249530583e-07, + "loss": 0.5077, + "step": 11242 + }, + { + "epoch": 0.91, + "grad_norm": 0.9717294505105695, + "learning_rate": 3.8752817549020494e-07, + "loss": 0.4859, + "step": 11243 + }, + { + "epoch": 0.91, + "grad_norm": 0.9651825344259987, + "learning_rate": 3.868027675855968e-07, + "loss": 0.5584, + "step": 11244 + }, + { + "epoch": 0.91, + "grad_norm": 0.949190123858509, + "learning_rate": 3.8607802586703045e-07, + "loss": 0.479, + "step": 11245 + }, + { + "epoch": 0.91, + "grad_norm": 1.0633748862786825, + "learning_rate": 3.853539503847292e-07, + "loss": 0.5105, + "step": 11246 + }, + { + "epoch": 0.91, + "grad_norm": 0.907406015353476, + "learning_rate": 3.8463054118887064e-07, + "loss": 0.4661, + "step": 11247 + }, + { + "epoch": 0.91, + "grad_norm": 0.9235434459247656, + "learning_rate": 3.8390779832958804e-07, + "loss": 0.4984, + "step": 11248 + }, + { + "epoch": 0.91, + "grad_norm": 0.871393155044762, + "learning_rate": 3.831857218569646e-07, + "loss": 0.4764, + "step": 11249 + }, + { + "epoch": 0.91, + "grad_norm": 1.002044778848067, + "learning_rate": 3.824643118210403e-07, + "loss": 0.4984, + "step": 11250 + }, + { + "epoch": 0.91, + "grad_norm": 0.8642745302575283, + "learning_rate": 3.817435682718096e-07, + "loss": 0.4246, + "step": 11251 + }, + { + "epoch": 0.91, + "grad_norm": 0.8914467494927035, + "learning_rate": 3.810234912592181e-07, + "loss": 0.4409, + "step": 11252 + }, + { + "epoch": 0.91, + "grad_norm": 0.916483430239664, + "learning_rate": 3.803040808331659e-07, + "loss": 0.481, + "step": 11253 + }, + { + "epoch": 0.91, + "grad_norm": 0.898092244042881, + "learning_rate": 3.7958533704350763e-07, + "loss": 0.4732, + "step": 11254 + }, + { + "epoch": 0.91, + "grad_norm": 0.9126761753612629, + "learning_rate": 3.788672599400534e-07, + "loss": 0.4555, + "step": 11255 + }, + { + "epoch": 0.91, + "grad_norm": 0.8453746843953241, + "learning_rate": 3.7814984957256327e-07, + "loss": 0.4511, + "step": 11256 + }, + { + "epoch": 0.91, + "grad_norm": 0.9432585840238175, + "learning_rate": 3.7743310599075543e-07, + "loss": 0.5324, + "step": 11257 + }, + { + "epoch": 0.91, + "grad_norm": 0.8631479672202498, + "learning_rate": 3.7671702924429677e-07, + "loss": 0.4059, + "step": 11258 + }, + { + "epoch": 0.92, + "grad_norm": 1.003445796403605, + "learning_rate": 3.7600161938281196e-07, + "loss": 0.4981, + "step": 11259 + }, + { + "epoch": 0.92, + "grad_norm": 0.8847485019144521, + "learning_rate": 3.7528687645587924e-07, + "loss": 0.4643, + "step": 11260 + }, + { + "epoch": 0.92, + "grad_norm": 0.9521515676316561, + "learning_rate": 3.745728005130289e-07, + "loss": 0.4825, + "step": 11261 + }, + { + "epoch": 0.92, + "grad_norm": 0.9943019297322592, + "learning_rate": 3.7385939160374476e-07, + "loss": 0.4789, + "step": 11262 + }, + { + "epoch": 0.92, + "grad_norm": 1.0443854704039524, + "learning_rate": 3.731466497774683e-07, + "loss": 0.5328, + "step": 11263 + }, + { + "epoch": 0.92, + "grad_norm": 0.9224820193567518, + "learning_rate": 3.7243457508358784e-07, + "loss": 0.4358, + "step": 11264 + }, + { + "epoch": 0.92, + "grad_norm": 0.9718348345724628, + "learning_rate": 3.717231675714539e-07, + "loss": 0.5314, + "step": 11265 + }, + { + "epoch": 0.92, + "grad_norm": 0.9328422043792436, + "learning_rate": 3.710124272903626e-07, + "loss": 0.5085, + "step": 11266 + }, + { + "epoch": 0.92, + "grad_norm": 0.8573445232214344, + "learning_rate": 3.7030235428956895e-07, + "loss": 0.4688, + "step": 11267 + }, + { + "epoch": 0.92, + "grad_norm": 0.8719481532054565, + "learning_rate": 3.6959294861828145e-07, + "loss": 0.464, + "step": 11268 + }, + { + "epoch": 0.92, + "grad_norm": 1.022972115265359, + "learning_rate": 3.688842103256607e-07, + "loss": 0.5125, + "step": 11269 + }, + { + "epoch": 0.92, + "grad_norm": 0.907652029442627, + "learning_rate": 3.681761394608197e-07, + "loss": 0.4468, + "step": 11270 + }, + { + "epoch": 0.92, + "grad_norm": 0.9742138500191758, + "learning_rate": 3.674687360728313e-07, + "loss": 0.4637, + "step": 11271 + }, + { + "epoch": 0.92, + "grad_norm": 1.0138898185358978, + "learning_rate": 3.667620002107142e-07, + "loss": 0.4617, + "step": 11272 + }, + { + "epoch": 0.92, + "grad_norm": 0.9212464426233573, + "learning_rate": 3.660559319234447e-07, + "loss": 0.4477, + "step": 11273 + }, + { + "epoch": 0.92, + "grad_norm": 0.9109639141362548, + "learning_rate": 3.653505312599548e-07, + "loss": 0.4575, + "step": 11274 + }, + { + "epoch": 0.92, + "grad_norm": 0.9821477248340027, + "learning_rate": 3.646457982691287e-07, + "loss": 0.5487, + "step": 11275 + }, + { + "epoch": 0.92, + "grad_norm": 0.9427187757868164, + "learning_rate": 3.639417329997996e-07, + "loss": 0.4833, + "step": 11276 + }, + { + "epoch": 0.92, + "grad_norm": 0.9306861527841653, + "learning_rate": 3.632383355007629e-07, + "loss": 0.4816, + "step": 11277 + }, + { + "epoch": 0.92, + "grad_norm": 0.9649502106841656, + "learning_rate": 3.6253560582076075e-07, + "loss": 0.4746, + "step": 11278 + }, + { + "epoch": 0.92, + "grad_norm": 0.9670849975896374, + "learning_rate": 3.6183354400849304e-07, + "loss": 0.5206, + "step": 11279 + }, + { + "epoch": 0.92, + "grad_norm": 1.0014901452286138, + "learning_rate": 3.6113215011261194e-07, + "loss": 0.5014, + "step": 11280 + }, + { + "epoch": 0.92, + "grad_norm": 0.9168158272006056, + "learning_rate": 3.604314241817242e-07, + "loss": 0.4216, + "step": 11281 + }, + { + "epoch": 0.92, + "grad_norm": 1.011810912246533, + "learning_rate": 3.5973136626438644e-07, + "loss": 0.4856, + "step": 11282 + }, + { + "epoch": 0.92, + "grad_norm": 1.070538404968705, + "learning_rate": 3.5903197640911546e-07, + "loss": 0.5257, + "step": 11283 + }, + { + "epoch": 0.92, + "grad_norm": 0.9009972646160386, + "learning_rate": 3.5833325466437697e-07, + "loss": 0.4777, + "step": 11284 + }, + { + "epoch": 0.92, + "grad_norm": 0.8799512675526913, + "learning_rate": 3.576352010785911e-07, + "loss": 0.4593, + "step": 11285 + }, + { + "epoch": 0.92, + "grad_norm": 0.9844801336103844, + "learning_rate": 3.5693781570013243e-07, + "loss": 0.4648, + "step": 11286 + }, + { + "epoch": 0.92, + "grad_norm": 0.9068441044013625, + "learning_rate": 3.5624109857733234e-07, + "loss": 0.4397, + "step": 11287 + }, + { + "epoch": 0.92, + "grad_norm": 1.049042656285058, + "learning_rate": 3.555450497584667e-07, + "loss": 0.4826, + "step": 11288 + }, + { + "epoch": 0.92, + "grad_norm": 0.9362852634910507, + "learning_rate": 3.548496692917769e-07, + "loss": 0.4907, + "step": 11289 + }, + { + "epoch": 0.92, + "grad_norm": 0.9580446529346488, + "learning_rate": 3.541549572254488e-07, + "loss": 0.4849, + "step": 11290 + }, + { + "epoch": 0.92, + "grad_norm": 0.9345954545496868, + "learning_rate": 3.5346091360762615e-07, + "loss": 0.513, + "step": 11291 + }, + { + "epoch": 0.92, + "grad_norm": 0.9650252641059298, + "learning_rate": 3.52767538486406e-07, + "loss": 0.4729, + "step": 11292 + }, + { + "epoch": 0.92, + "grad_norm": 0.9106790175650638, + "learning_rate": 3.520748319098399e-07, + "loss": 0.4588, + "step": 11293 + }, + { + "epoch": 0.92, + "grad_norm": 0.8666227556153863, + "learning_rate": 3.513827939259273e-07, + "loss": 0.446, + "step": 11294 + }, + { + "epoch": 0.92, + "grad_norm": 0.8980952689189213, + "learning_rate": 3.5069142458263093e-07, + "loss": 0.4815, + "step": 11295 + }, + { + "epoch": 0.92, + "grad_norm": 1.0014655996836292, + "learning_rate": 3.500007239278591e-07, + "loss": 0.506, + "step": 11296 + }, + { + "epoch": 0.92, + "grad_norm": 0.985359380024748, + "learning_rate": 3.49310692009478e-07, + "loss": 0.5175, + "step": 11297 + }, + { + "epoch": 0.92, + "grad_norm": 0.9953536043633036, + "learning_rate": 3.4862132887530485e-07, + "loss": 0.5532, + "step": 11298 + }, + { + "epoch": 0.92, + "grad_norm": 0.8713096817640018, + "learning_rate": 3.4793263457311487e-07, + "loss": 0.4345, + "step": 11299 + }, + { + "epoch": 0.92, + "grad_norm": 0.9010164983973311, + "learning_rate": 3.4724460915062874e-07, + "loss": 0.4683, + "step": 11300 + }, + { + "epoch": 0.92, + "grad_norm": 0.8829612221593353, + "learning_rate": 3.4655725265553276e-07, + "loss": 0.4497, + "step": 11301 + }, + { + "epoch": 0.92, + "grad_norm": 0.9468093798201783, + "learning_rate": 3.458705651354544e-07, + "loss": 0.4913, + "step": 11302 + }, + { + "epoch": 0.92, + "grad_norm": 0.8907019359980703, + "learning_rate": 3.451845466379833e-07, + "loss": 0.4794, + "step": 11303 + }, + { + "epoch": 0.92, + "grad_norm": 0.9557898281434373, + "learning_rate": 3.4449919721065815e-07, + "loss": 0.4513, + "step": 11304 + }, + { + "epoch": 0.92, + "grad_norm": 0.8546099421797825, + "learning_rate": 3.4381451690097653e-07, + "loss": 0.4572, + "step": 11305 + }, + { + "epoch": 0.92, + "grad_norm": 0.8776209132717654, + "learning_rate": 3.4313050575638164e-07, + "loss": 0.4674, + "step": 11306 + }, + { + "epoch": 0.92, + "grad_norm": 1.0620298487621678, + "learning_rate": 3.4244716382427876e-07, + "loss": 0.4877, + "step": 11307 + }, + { + "epoch": 0.92, + "grad_norm": 0.9600773262046887, + "learning_rate": 3.417644911520202e-07, + "loss": 0.4858, + "step": 11308 + }, + { + "epoch": 0.92, + "grad_norm": 1.0172245807626483, + "learning_rate": 3.410824877869157e-07, + "loss": 0.5105, + "step": 11309 + }, + { + "epoch": 0.92, + "grad_norm": 1.0641696939999097, + "learning_rate": 3.404011537762275e-07, + "loss": 0.5477, + "step": 11310 + }, + { + "epoch": 0.92, + "grad_norm": 0.9414882028257976, + "learning_rate": 3.3972048916717127e-07, + "loss": 0.4423, + "step": 11311 + }, + { + "epoch": 0.92, + "grad_norm": 0.9345099908540642, + "learning_rate": 3.3904049400691585e-07, + "loss": 0.5102, + "step": 11312 + }, + { + "epoch": 0.92, + "grad_norm": 1.0032701922871918, + "learning_rate": 3.3836116834258583e-07, + "loss": 0.491, + "step": 11313 + }, + { + "epoch": 0.92, + "grad_norm": 1.2780296973011878, + "learning_rate": 3.376825122212568e-07, + "loss": 0.5556, + "step": 11314 + }, + { + "epoch": 0.92, + "grad_norm": 1.0067969180363423, + "learning_rate": 3.37004525689959e-07, + "loss": 0.4622, + "step": 11315 + }, + { + "epoch": 0.92, + "grad_norm": 0.9048930187275969, + "learning_rate": 3.3632720879567594e-07, + "loss": 0.5202, + "step": 11316 + }, + { + "epoch": 0.92, + "grad_norm": 0.9190783750855306, + "learning_rate": 3.356505615853478e-07, + "loss": 0.4788, + "step": 11317 + }, + { + "epoch": 0.92, + "grad_norm": 0.8700597041027842, + "learning_rate": 3.349745841058605e-07, + "loss": 0.4796, + "step": 11318 + }, + { + "epoch": 0.92, + "grad_norm": 0.9765173966674195, + "learning_rate": 3.3429927640406425e-07, + "loss": 0.4858, + "step": 11319 + }, + { + "epoch": 0.92, + "grad_norm": 1.0247910175502006, + "learning_rate": 3.336246385267528e-07, + "loss": 0.4862, + "step": 11320 + }, + { + "epoch": 0.92, + "grad_norm": 0.8672758790970835, + "learning_rate": 3.3295067052068086e-07, + "loss": 0.4239, + "step": 11321 + }, + { + "epoch": 0.92, + "grad_norm": 0.9803320123619911, + "learning_rate": 3.322773724325523e-07, + "loss": 0.4698, + "step": 11322 + }, + { + "epoch": 0.92, + "grad_norm": 0.8993180142347001, + "learning_rate": 3.3160474430902756e-07, + "loss": 0.4318, + "step": 11323 + }, + { + "epoch": 0.92, + "grad_norm": 1.022883190886677, + "learning_rate": 3.30932786196716e-07, + "loss": 0.5367, + "step": 11324 + }, + { + "epoch": 0.92, + "grad_norm": 0.8970949373639794, + "learning_rate": 3.302614981421881e-07, + "loss": 0.4721, + "step": 11325 + }, + { + "epoch": 0.92, + "grad_norm": 0.9270048427254719, + "learning_rate": 3.2959088019196005e-07, + "loss": 0.4475, + "step": 11326 + }, + { + "epoch": 0.92, + "grad_norm": 1.0993593442790337, + "learning_rate": 3.2892093239250686e-07, + "loss": 0.5155, + "step": 11327 + }, + { + "epoch": 0.92, + "grad_norm": 0.9603091910994954, + "learning_rate": 3.282516547902548e-07, + "loss": 0.4498, + "step": 11328 + }, + { + "epoch": 0.92, + "grad_norm": 0.9606031793318405, + "learning_rate": 3.2758304743158554e-07, + "loss": 0.4966, + "step": 11329 + }, + { + "epoch": 0.92, + "grad_norm": 0.990904867623026, + "learning_rate": 3.2691511036282875e-07, + "loss": 0.4707, + "step": 11330 + }, + { + "epoch": 0.92, + "grad_norm": 0.9704452825773399, + "learning_rate": 3.262478436302774e-07, + "loss": 0.4255, + "step": 11331 + }, + { + "epoch": 0.92, + "grad_norm": 0.9581276075878125, + "learning_rate": 3.255812472801689e-07, + "loss": 0.5424, + "step": 11332 + }, + { + "epoch": 0.92, + "grad_norm": 0.9665561902448118, + "learning_rate": 3.2491532135869865e-07, + "loss": 0.5122, + "step": 11333 + }, + { + "epoch": 0.92, + "grad_norm": 0.8347893223320174, + "learning_rate": 3.24250065912014e-07, + "loss": 0.4394, + "step": 11334 + }, + { + "epoch": 0.92, + "grad_norm": 0.8701345936737639, + "learning_rate": 3.235854809862193e-07, + "loss": 0.4746, + "step": 11335 + }, + { + "epoch": 0.92, + "grad_norm": 0.982254247110148, + "learning_rate": 3.2292156662736554e-07, + "loss": 0.4859, + "step": 11336 + }, + { + "epoch": 0.92, + "grad_norm": 1.4109891620559623, + "learning_rate": 3.2225832288146577e-07, + "loss": 0.4721, + "step": 11337 + }, + { + "epoch": 0.92, + "grad_norm": 0.8886097420198746, + "learning_rate": 3.2159574979447996e-07, + "loss": 0.4585, + "step": 11338 + }, + { + "epoch": 0.92, + "grad_norm": 1.0076090882264177, + "learning_rate": 3.209338474123225e-07, + "loss": 0.4674, + "step": 11339 + }, + { + "epoch": 0.92, + "grad_norm": 0.8590134875014034, + "learning_rate": 3.2027261578086443e-07, + "loss": 0.4567, + "step": 11340 + }, + { + "epoch": 0.92, + "grad_norm": 1.041771758544706, + "learning_rate": 3.1961205494593027e-07, + "loss": 0.467, + "step": 11341 + }, + { + "epoch": 0.92, + "grad_norm": 0.9674539778793504, + "learning_rate": 3.1895216495329116e-07, + "loss": 0.4917, + "step": 11342 + }, + { + "epoch": 0.92, + "grad_norm": 0.9700115735131218, + "learning_rate": 3.1829294584868166e-07, + "loss": 0.4876, + "step": 11343 + }, + { + "epoch": 0.92, + "grad_norm": 0.9355995499943076, + "learning_rate": 3.1763439767778293e-07, + "loss": 0.4811, + "step": 11344 + }, + { + "epoch": 0.92, + "grad_norm": 0.9956711944594656, + "learning_rate": 3.1697652048623185e-07, + "loss": 0.5596, + "step": 11345 + }, + { + "epoch": 0.92, + "grad_norm": 0.8720294164515175, + "learning_rate": 3.163193143196197e-07, + "loss": 0.3914, + "step": 11346 + }, + { + "epoch": 0.92, + "grad_norm": 0.9699170744496788, + "learning_rate": 3.156627792234901e-07, + "loss": 0.4809, + "step": 11347 + }, + { + "epoch": 0.92, + "grad_norm": 0.922892098617445, + "learning_rate": 3.150069152433377e-07, + "loss": 0.4439, + "step": 11348 + }, + { + "epoch": 0.92, + "grad_norm": 0.9107802722920333, + "learning_rate": 3.143517224246184e-07, + "loss": 0.4472, + "step": 11349 + }, + { + "epoch": 0.92, + "grad_norm": 0.8922634635118099, + "learning_rate": 3.1369720081273147e-07, + "loss": 0.4981, + "step": 11350 + }, + { + "epoch": 0.92, + "grad_norm": 0.9435902052682902, + "learning_rate": 3.1304335045303724e-07, + "loss": 0.4766, + "step": 11351 + }, + { + "epoch": 0.92, + "grad_norm": 0.8859616279603214, + "learning_rate": 3.1239017139084725e-07, + "loss": 0.4254, + "step": 11352 + }, + { + "epoch": 0.92, + "grad_norm": 0.8330376400127653, + "learning_rate": 3.1173766367142534e-07, + "loss": 0.4531, + "step": 11353 + }, + { + "epoch": 0.92, + "grad_norm": 0.936319051976937, + "learning_rate": 3.110858273399886e-07, + "loss": 0.4438, + "step": 11354 + }, + { + "epoch": 0.92, + "grad_norm": 0.9737792784501018, + "learning_rate": 3.1043466244171204e-07, + "loss": 0.5205, + "step": 11355 + }, + { + "epoch": 0.92, + "grad_norm": 0.9971314327878192, + "learning_rate": 3.097841690217174e-07, + "loss": 0.5359, + "step": 11356 + }, + { + "epoch": 0.92, + "grad_norm": 0.938727495682044, + "learning_rate": 3.0913434712508406e-07, + "loss": 0.4843, + "step": 11357 + }, + { + "epoch": 0.92, + "grad_norm": 0.9856444957341504, + "learning_rate": 3.0848519679684606e-07, + "loss": 0.5354, + "step": 11358 + }, + { + "epoch": 0.92, + "grad_norm": 0.8538657256871016, + "learning_rate": 3.078367180819863e-07, + "loss": 0.4759, + "step": 11359 + }, + { + "epoch": 0.92, + "grad_norm": 0.956572623856092, + "learning_rate": 3.0718891102544556e-07, + "loss": 0.5073, + "step": 11360 + }, + { + "epoch": 0.92, + "grad_norm": 0.8913295893657242, + "learning_rate": 3.0654177567211675e-07, + "loss": 0.4138, + "step": 11361 + }, + { + "epoch": 0.92, + "grad_norm": 0.9338857544751735, + "learning_rate": 3.0589531206684397e-07, + "loss": 0.4947, + "step": 11362 + }, + { + "epoch": 0.92, + "grad_norm": 0.9580188168889444, + "learning_rate": 3.05249520254427e-07, + "loss": 0.4949, + "step": 11363 + }, + { + "epoch": 0.92, + "grad_norm": 0.9029820242373844, + "learning_rate": 3.046044002796189e-07, + "loss": 0.4515, + "step": 11364 + }, + { + "epoch": 0.92, + "grad_norm": 1.0361265416089567, + "learning_rate": 3.039599521871273e-07, + "loss": 0.5229, + "step": 11365 + }, + { + "epoch": 0.92, + "grad_norm": 0.9361873322359499, + "learning_rate": 3.0331617602160965e-07, + "loss": 0.509, + "step": 11366 + }, + { + "epoch": 0.92, + "grad_norm": 0.9951052754963993, + "learning_rate": 3.026730718276805e-07, + "loss": 0.5192, + "step": 11367 + }, + { + "epoch": 0.92, + "grad_norm": 0.9165161306116084, + "learning_rate": 3.020306396499062e-07, + "loss": 0.4799, + "step": 11368 + }, + { + "epoch": 0.92, + "grad_norm": 0.9247733833045126, + "learning_rate": 3.0138887953280573e-07, + "loss": 0.4666, + "step": 11369 + }, + { + "epoch": 0.92, + "grad_norm": 1.0017284450473003, + "learning_rate": 3.0074779152085345e-07, + "loss": 0.4935, + "step": 11370 + }, + { + "epoch": 0.92, + "grad_norm": 0.8499168034190486, + "learning_rate": 3.00107375658476e-07, + "loss": 0.4418, + "step": 11371 + }, + { + "epoch": 0.92, + "grad_norm": 0.9375159709601683, + "learning_rate": 2.9946763199005356e-07, + "loss": 0.5188, + "step": 11372 + }, + { + "epoch": 0.92, + "grad_norm": 0.9849719113347157, + "learning_rate": 2.988285605599206e-07, + "loss": 0.4752, + "step": 11373 + }, + { + "epoch": 0.92, + "grad_norm": 0.9466116530237808, + "learning_rate": 2.981901614123617e-07, + "loss": 0.513, + "step": 11374 + }, + { + "epoch": 0.92, + "grad_norm": 0.9249846932225795, + "learning_rate": 2.9755243459162144e-07, + "loss": 0.475, + "step": 11375 + }, + { + "epoch": 0.92, + "grad_norm": 1.1001174601861894, + "learning_rate": 2.9691538014189005e-07, + "loss": 0.5434, + "step": 11376 + }, + { + "epoch": 0.92, + "grad_norm": 0.8908442139851055, + "learning_rate": 2.9627899810731666e-07, + "loss": 0.4922, + "step": 11377 + }, + { + "epoch": 0.92, + "grad_norm": 1.006276155178277, + "learning_rate": 2.956432885320004e-07, + "loss": 0.5509, + "step": 11378 + }, + { + "epoch": 0.92, + "grad_norm": 0.958713801071645, + "learning_rate": 2.9500825145999723e-07, + "loss": 0.4641, + "step": 11379 + }, + { + "epoch": 0.92, + "grad_norm": 0.925297448457726, + "learning_rate": 2.943738869353141e-07, + "loss": 0.4948, + "step": 11380 + }, + { + "epoch": 0.92, + "grad_norm": 0.9610141487552996, + "learning_rate": 2.9374019500191255e-07, + "loss": 0.5046, + "step": 11381 + }, + { + "epoch": 0.93, + "grad_norm": 1.0200439829001273, + "learning_rate": 2.9310717570370516e-07, + "loss": 0.5139, + "step": 11382 + }, + { + "epoch": 0.93, + "grad_norm": 0.8904018710302055, + "learning_rate": 2.9247482908456027e-07, + "loss": 0.4644, + "step": 11383 + }, + { + "epoch": 0.93, + "grad_norm": 0.9344880041590723, + "learning_rate": 2.918431551882994e-07, + "loss": 0.465, + "step": 11384 + }, + { + "epoch": 0.93, + "grad_norm": 0.9531718916462352, + "learning_rate": 2.9121215405869653e-07, + "loss": 0.5138, + "step": 11385 + }, + { + "epoch": 0.93, + "grad_norm": 0.8398539963659245, + "learning_rate": 2.905818257394799e-07, + "loss": 0.4241, + "step": 11386 + }, + { + "epoch": 0.93, + "grad_norm": 0.8422635777860722, + "learning_rate": 2.899521702743313e-07, + "loss": 0.4459, + "step": 11387 + }, + { + "epoch": 0.93, + "grad_norm": 0.9634311386263213, + "learning_rate": 2.8932318770688364e-07, + "loss": 0.5256, + "step": 11388 + }, + { + "epoch": 0.93, + "grad_norm": 0.9472397347375905, + "learning_rate": 2.886948780807253e-07, + "loss": 0.4538, + "step": 11389 + }, + { + "epoch": 0.93, + "grad_norm": 0.9592197845636875, + "learning_rate": 2.8806724143939814e-07, + "loss": 0.5103, + "step": 11390 + }, + { + "epoch": 0.93, + "grad_norm": 1.018974190459149, + "learning_rate": 2.874402778263974e-07, + "loss": 0.476, + "step": 11391 + }, + { + "epoch": 0.93, + "grad_norm": 1.0259783487545626, + "learning_rate": 2.868139872851694e-07, + "loss": 0.5273, + "step": 11392 + }, + { + "epoch": 0.93, + "grad_norm": 0.9251938911299553, + "learning_rate": 2.8618836985911837e-07, + "loss": 0.4512, + "step": 11393 + }, + { + "epoch": 0.93, + "grad_norm": 0.9463515132362895, + "learning_rate": 2.8556342559159513e-07, + "loss": 0.519, + "step": 11394 + }, + { + "epoch": 0.93, + "grad_norm": 0.9191038400094846, + "learning_rate": 2.849391545259106e-07, + "loss": 0.4614, + "step": 11395 + }, + { + "epoch": 0.93, + "grad_norm": 1.0263031963280673, + "learning_rate": 2.843155567053246e-07, + "loss": 0.5306, + "step": 11396 + }, + { + "epoch": 0.93, + "grad_norm": 0.8546711846548672, + "learning_rate": 2.8369263217305374e-07, + "loss": 0.442, + "step": 11397 + }, + { + "epoch": 0.93, + "grad_norm": 0.9869733608981098, + "learning_rate": 2.830703809722646e-07, + "loss": 0.45, + "step": 11398 + }, + { + "epoch": 0.93, + "grad_norm": 0.8344249591406251, + "learning_rate": 2.8244880314607924e-07, + "loss": 0.3998, + "step": 11399 + }, + { + "epoch": 0.93, + "grad_norm": 1.027015109040339, + "learning_rate": 2.8182789873757334e-07, + "loss": 0.4416, + "step": 11400 + }, + { + "epoch": 0.93, + "grad_norm": 0.9066407958880655, + "learning_rate": 2.812076677897735e-07, + "loss": 0.4828, + "step": 11401 + }, + { + "epoch": 0.93, + "grad_norm": 0.9570317674310846, + "learning_rate": 2.8058811034566094e-07, + "loss": 0.5149, + "step": 11402 + }, + { + "epoch": 0.93, + "grad_norm": 0.9276312174699367, + "learning_rate": 2.7996922644817126e-07, + "loss": 0.5011, + "step": 11403 + }, + { + "epoch": 0.93, + "grad_norm": 0.9758733729187016, + "learning_rate": 2.7935101614019354e-07, + "loss": 0.5011, + "step": 11404 + }, + { + "epoch": 0.93, + "grad_norm": 1.0575231778547123, + "learning_rate": 2.7873347946456684e-07, + "loss": 0.4679, + "step": 11405 + }, + { + "epoch": 0.93, + "grad_norm": 0.9270474171243843, + "learning_rate": 2.7811661646408915e-07, + "loss": 0.4469, + "step": 11406 + }, + { + "epoch": 0.93, + "grad_norm": 0.9410766556350038, + "learning_rate": 2.7750042718150514e-07, + "loss": 0.4999, + "step": 11407 + }, + { + "epoch": 0.93, + "grad_norm": 0.8715469726023198, + "learning_rate": 2.768849116595185e-07, + "loss": 0.434, + "step": 11408 + }, + { + "epoch": 0.93, + "grad_norm": 0.9514148647505117, + "learning_rate": 2.762700699407828e-07, + "loss": 0.4589, + "step": 11409 + }, + { + "epoch": 0.93, + "grad_norm": 0.9448864220490569, + "learning_rate": 2.7565590206790613e-07, + "loss": 0.4982, + "step": 11410 + }, + { + "epoch": 0.93, + "grad_norm": 0.9840067038967264, + "learning_rate": 2.7504240808344906e-07, + "loss": 0.497, + "step": 11411 + }, + { + "epoch": 0.93, + "grad_norm": 0.9206231070024818, + "learning_rate": 2.744295880299297e-07, + "loss": 0.451, + "step": 11412 + }, + { + "epoch": 0.93, + "grad_norm": 0.995856159062488, + "learning_rate": 2.7381744194980963e-07, + "loss": 0.5632, + "step": 11413 + }, + { + "epoch": 0.93, + "grad_norm": 0.9625196276014706, + "learning_rate": 2.732059698855172e-07, + "loss": 0.5134, + "step": 11414 + }, + { + "epoch": 0.93, + "grad_norm": 0.948921370108036, + "learning_rate": 2.7259517187942174e-07, + "loss": 0.4777, + "step": 11415 + }, + { + "epoch": 0.93, + "grad_norm": 0.9561701392693762, + "learning_rate": 2.7198504797385286e-07, + "loss": 0.4564, + "step": 11416 + }, + { + "epoch": 0.93, + "grad_norm": 1.013002742298461, + "learning_rate": 2.7137559821109104e-07, + "loss": 0.5106, + "step": 11417 + }, + { + "epoch": 0.93, + "grad_norm": 0.9886559129055978, + "learning_rate": 2.7076682263337264e-07, + "loss": 0.5182, + "step": 11418 + }, + { + "epoch": 0.93, + "grad_norm": 0.9297366408530876, + "learning_rate": 2.701587212828816e-07, + "loss": 0.4637, + "step": 11419 + }, + { + "epoch": 0.93, + "grad_norm": 0.8628281199103401, + "learning_rate": 2.6955129420176193e-07, + "loss": 0.462, + "step": 11420 + }, + { + "epoch": 0.93, + "grad_norm": 0.8915919614811436, + "learning_rate": 2.689445414321057e-07, + "loss": 0.4719, + "step": 11421 + }, + { + "epoch": 0.93, + "grad_norm": 0.9077066766545857, + "learning_rate": 2.6833846301596246e-07, + "loss": 0.4627, + "step": 11422 + }, + { + "epoch": 0.93, + "grad_norm": 0.9649476009742295, + "learning_rate": 2.6773305899533084e-07, + "loss": 0.5364, + "step": 11423 + }, + { + "epoch": 0.93, + "grad_norm": 0.9958264945406542, + "learning_rate": 2.6712832941216735e-07, + "loss": 0.494, + "step": 11424 + }, + { + "epoch": 0.93, + "grad_norm": 0.9259154281760127, + "learning_rate": 2.6652427430837513e-07, + "loss": 0.4736, + "step": 11425 + }, + { + "epoch": 0.93, + "grad_norm": 0.9181811509348105, + "learning_rate": 2.659208937258195e-07, + "loss": 0.4899, + "step": 11426 + }, + { + "epoch": 0.93, + "grad_norm": 0.9557135320962066, + "learning_rate": 2.653181877063105e-07, + "loss": 0.4539, + "step": 11427 + }, + { + "epoch": 0.93, + "grad_norm": 0.9424434836374508, + "learning_rate": 2.6471615629161564e-07, + "loss": 0.4835, + "step": 11428 + }, + { + "epoch": 0.93, + "grad_norm": 1.006653551445918, + "learning_rate": 2.641147995234572e-07, + "loss": 0.451, + "step": 11429 + }, + { + "epoch": 0.93, + "grad_norm": 0.972800610895364, + "learning_rate": 2.6351411744350853e-07, + "loss": 0.4451, + "step": 11430 + }, + { + "epoch": 0.93, + "grad_norm": 0.9801622688087769, + "learning_rate": 2.6291411009339184e-07, + "loss": 0.4533, + "step": 11431 + }, + { + "epoch": 0.93, + "grad_norm": 0.9436460199587007, + "learning_rate": 2.623147775146939e-07, + "loss": 0.5323, + "step": 11432 + }, + { + "epoch": 0.93, + "grad_norm": 0.9656819963627655, + "learning_rate": 2.617161197489426e-07, + "loss": 0.4455, + "step": 11433 + }, + { + "epoch": 0.93, + "grad_norm": 0.8882390107869854, + "learning_rate": 2.61118136837627e-07, + "loss": 0.4535, + "step": 11434 + }, + { + "epoch": 0.93, + "grad_norm": 0.9408173864700979, + "learning_rate": 2.605208288221861e-07, + "loss": 0.4936, + "step": 11435 + }, + { + "epoch": 0.93, + "grad_norm": 0.9464672305317218, + "learning_rate": 2.599241957440135e-07, + "loss": 0.5004, + "step": 11436 + }, + { + "epoch": 0.93, + "grad_norm": 0.9689635678352339, + "learning_rate": 2.593282376444539e-07, + "loss": 0.5032, + "step": 11437 + }, + { + "epoch": 0.93, + "grad_norm": 0.9571939695204263, + "learning_rate": 2.587329545648076e-07, + "loss": 0.4825, + "step": 11438 + }, + { + "epoch": 0.93, + "grad_norm": 1.0129788746581043, + "learning_rate": 2.581383465463272e-07, + "loss": 0.4993, + "step": 11439 + }, + { + "epoch": 0.93, + "grad_norm": 0.9450247269430384, + "learning_rate": 2.5754441363021854e-07, + "loss": 0.4955, + "step": 11440 + }, + { + "epoch": 0.93, + "grad_norm": 0.9431474803716476, + "learning_rate": 2.5695115585763985e-07, + "loss": 0.4658, + "step": 11441 + }, + { + "epoch": 0.93, + "grad_norm": 0.9351517255823307, + "learning_rate": 2.5635857326970494e-07, + "loss": 0.4728, + "step": 11442 + }, + { + "epoch": 0.93, + "grad_norm": 1.0523809194784852, + "learning_rate": 2.5576666590747647e-07, + "loss": 0.6033, + "step": 11443 + }, + { + "epoch": 0.93, + "grad_norm": 0.9171374814882678, + "learning_rate": 2.5517543381197715e-07, + "loss": 0.4813, + "step": 11444 + }, + { + "epoch": 0.93, + "grad_norm": 0.9422827978562496, + "learning_rate": 2.5458487702417544e-07, + "loss": 0.4849, + "step": 11445 + }, + { + "epoch": 0.93, + "grad_norm": 0.9136321288515592, + "learning_rate": 2.539949955849985e-07, + "loss": 0.4322, + "step": 11446 + }, + { + "epoch": 0.93, + "grad_norm": 0.9397003802292838, + "learning_rate": 2.5340578953532256e-07, + "loss": 0.4441, + "step": 11447 + }, + { + "epoch": 0.93, + "grad_norm": 1.0308380015724954, + "learning_rate": 2.5281725891598166e-07, + "loss": 0.5122, + "step": 11448 + }, + { + "epoch": 0.93, + "grad_norm": 0.8825546477677132, + "learning_rate": 2.522294037677564e-07, + "loss": 0.4516, + "step": 11449 + }, + { + "epoch": 0.93, + "grad_norm": 0.8923664008410982, + "learning_rate": 2.516422241313898e-07, + "loss": 0.4327, + "step": 11450 + }, + { + "epoch": 0.93, + "grad_norm": 0.9709155618170434, + "learning_rate": 2.5105572004756827e-07, + "loss": 0.4749, + "step": 11451 + }, + { + "epoch": 0.93, + "grad_norm": 0.9445659754416634, + "learning_rate": 2.504698915569392e-07, + "loss": 0.4851, + "step": 11452 + }, + { + "epoch": 0.93, + "grad_norm": 1.0358932911976195, + "learning_rate": 2.49884738700098e-07, + "loss": 0.4879, + "step": 11453 + }, + { + "epoch": 0.93, + "grad_norm": 0.9421156716446982, + "learning_rate": 2.493002615175977e-07, + "loss": 0.5585, + "step": 11454 + }, + { + "epoch": 0.93, + "grad_norm": 0.9830053855246939, + "learning_rate": 2.487164600499381e-07, + "loss": 0.5162, + "step": 11455 + }, + { + "epoch": 0.93, + "grad_norm": 0.852493323525767, + "learning_rate": 2.481333343375802e-07, + "loss": 0.466, + "step": 11456 + }, + { + "epoch": 0.93, + "grad_norm": 0.9097985826103769, + "learning_rate": 2.475508844209318e-07, + "loss": 0.5315, + "step": 11457 + }, + { + "epoch": 0.93, + "grad_norm": 0.977593827825391, + "learning_rate": 2.469691103403571e-07, + "loss": 0.5219, + "step": 11458 + }, + { + "epoch": 0.93, + "grad_norm": 0.8948840283251344, + "learning_rate": 2.463880121361717e-07, + "loss": 0.4955, + "step": 11459 + }, + { + "epoch": 0.93, + "grad_norm": 0.9137935694279657, + "learning_rate": 2.4580758984864675e-07, + "loss": 0.4488, + "step": 11460 + }, + { + "epoch": 0.93, + "grad_norm": 0.9623904350660415, + "learning_rate": 2.452278435180011e-07, + "loss": 0.4811, + "step": 11461 + }, + { + "epoch": 0.93, + "grad_norm": 0.9768541913562684, + "learning_rate": 2.44648773184416e-07, + "loss": 0.5311, + "step": 11462 + }, + { + "epoch": 0.93, + "grad_norm": 0.9811267247772674, + "learning_rate": 2.440703788880172e-07, + "loss": 0.5502, + "step": 11463 + }, + { + "epoch": 0.93, + "grad_norm": 0.9739306300978986, + "learning_rate": 2.4349266066888697e-07, + "loss": 0.4836, + "step": 11464 + }, + { + "epoch": 0.93, + "grad_norm": 0.885085228057854, + "learning_rate": 2.4291561856706224e-07, + "loss": 0.4172, + "step": 11465 + }, + { + "epoch": 0.93, + "grad_norm": 1.0019125523263028, + "learning_rate": 2.42339252622531e-07, + "loss": 0.4975, + "step": 11466 + }, + { + "epoch": 0.93, + "grad_norm": 1.0379549781503792, + "learning_rate": 2.417635628752324e-07, + "loss": 0.5312, + "step": 11467 + }, + { + "epoch": 0.93, + "grad_norm": 0.9748225775151601, + "learning_rate": 2.411885493650656e-07, + "loss": 0.5303, + "step": 11468 + }, + { + "epoch": 0.93, + "grad_norm": 0.9013719123124777, + "learning_rate": 2.4061421213187553e-07, + "loss": 0.4759, + "step": 11469 + }, + { + "epoch": 0.93, + "grad_norm": 0.9363216769145684, + "learning_rate": 2.4004055121546354e-07, + "loss": 0.4897, + "step": 11470 + }, + { + "epoch": 0.93, + "grad_norm": 0.9190009329597287, + "learning_rate": 2.3946756665558457e-07, + "loss": 0.5444, + "step": 11471 + }, + { + "epoch": 0.93, + "grad_norm": 0.9129701848377056, + "learning_rate": 2.3889525849194573e-07, + "loss": 0.4854, + "step": 11472 + }, + { + "epoch": 0.93, + "grad_norm": 0.9861834302459811, + "learning_rate": 2.383236267642064e-07, + "loss": 0.5198, + "step": 11473 + }, + { + "epoch": 0.93, + "grad_norm": 0.9487541856662814, + "learning_rate": 2.3775267151198268e-07, + "loss": 0.4662, + "step": 11474 + }, + { + "epoch": 0.93, + "grad_norm": 0.9313717143200725, + "learning_rate": 2.3718239277483957e-07, + "loss": 0.4983, + "step": 11475 + }, + { + "epoch": 0.93, + "grad_norm": 0.9914572028788192, + "learning_rate": 2.3661279059229547e-07, + "loss": 0.5113, + "step": 11476 + }, + { + "epoch": 0.93, + "grad_norm": 0.9384206526654694, + "learning_rate": 2.3604386500382658e-07, + "loss": 0.4799, + "step": 11477 + }, + { + "epoch": 0.93, + "grad_norm": 0.91476640221212, + "learning_rate": 2.3547561604885693e-07, + "loss": 0.4808, + "step": 11478 + }, + { + "epoch": 0.93, + "grad_norm": 0.959476425363997, + "learning_rate": 2.34908043766765e-07, + "loss": 0.5028, + "step": 11479 + }, + { + "epoch": 0.93, + "grad_norm": 0.8586439006338363, + "learning_rate": 2.3434114819688492e-07, + "loss": 0.4727, + "step": 11480 + }, + { + "epoch": 0.93, + "grad_norm": 0.9686850870111136, + "learning_rate": 2.3377492937850077e-07, + "loss": 0.4618, + "step": 11481 + }, + { + "epoch": 0.93, + "grad_norm": 0.9513867379474965, + "learning_rate": 2.332093873508512e-07, + "loss": 0.4792, + "step": 11482 + }, + { + "epoch": 0.93, + "grad_norm": 0.9789415893310047, + "learning_rate": 2.3264452215312817e-07, + "loss": 0.4585, + "step": 11483 + }, + { + "epoch": 0.93, + "grad_norm": 0.8857061877304372, + "learning_rate": 2.3208033382447703e-07, + "loss": 0.4137, + "step": 11484 + }, + { + "epoch": 0.93, + "grad_norm": 1.1075940052741962, + "learning_rate": 2.315168224039932e-07, + "loss": 0.524, + "step": 11485 + }, + { + "epoch": 0.93, + "grad_norm": 1.0648445676642262, + "learning_rate": 2.3095398793072988e-07, + "loss": 0.4903, + "step": 11486 + }, + { + "epoch": 0.93, + "grad_norm": 0.9380650984956824, + "learning_rate": 2.3039183044368918e-07, + "loss": 0.4808, + "step": 11487 + }, + { + "epoch": 0.93, + "grad_norm": 0.9195255391855791, + "learning_rate": 2.2983034998182997e-07, + "loss": 0.5148, + "step": 11488 + }, + { + "epoch": 0.93, + "grad_norm": 0.9767623140075157, + "learning_rate": 2.292695465840611e-07, + "loss": 0.509, + "step": 11489 + }, + { + "epoch": 0.93, + "grad_norm": 0.9511340960467927, + "learning_rate": 2.2870942028924592e-07, + "loss": 0.4871, + "step": 11490 + }, + { + "epoch": 0.93, + "grad_norm": 0.9257448355190214, + "learning_rate": 2.2814997113620008e-07, + "loss": 0.4549, + "step": 11491 + }, + { + "epoch": 0.93, + "grad_norm": 0.9765459552883505, + "learning_rate": 2.2759119916369475e-07, + "loss": 0.4934, + "step": 11492 + }, + { + "epoch": 0.93, + "grad_norm": 0.9485530275142726, + "learning_rate": 2.2703310441045012e-07, + "loss": 0.5296, + "step": 11493 + }, + { + "epoch": 0.93, + "grad_norm": 0.9788948524867847, + "learning_rate": 2.264756869151441e-07, + "loss": 0.4941, + "step": 11494 + }, + { + "epoch": 0.93, + "grad_norm": 0.976896256270598, + "learning_rate": 2.2591894671640246e-07, + "loss": 0.5197, + "step": 11495 + }, + { + "epoch": 0.93, + "grad_norm": 0.9282784520344509, + "learning_rate": 2.253628838528088e-07, + "loss": 0.462, + "step": 11496 + }, + { + "epoch": 0.93, + "grad_norm": 0.978036939703617, + "learning_rate": 2.2480749836289672e-07, + "loss": 0.5081, + "step": 11497 + }, + { + "epoch": 0.93, + "grad_norm": 0.9645700321451685, + "learning_rate": 2.2425279028515658e-07, + "loss": 0.4979, + "step": 11498 + }, + { + "epoch": 0.93, + "grad_norm": 0.904580208924153, + "learning_rate": 2.2369875965802424e-07, + "loss": 0.5012, + "step": 11499 + }, + { + "epoch": 0.93, + "grad_norm": 1.0060872487309571, + "learning_rate": 2.231454065198979e-07, + "loss": 0.518, + "step": 11500 + }, + { + "epoch": 0.93, + "grad_norm": 1.0372439651101641, + "learning_rate": 2.225927309091225e-07, + "loss": 0.545, + "step": 11501 + }, + { + "epoch": 0.93, + "grad_norm": 0.7751415828819416, + "learning_rate": 2.220407328639973e-07, + "loss": 0.4071, + "step": 11502 + }, + { + "epoch": 0.93, + "grad_norm": 0.9450030394120906, + "learning_rate": 2.2148941242277732e-07, + "loss": 0.45, + "step": 11503 + }, + { + "epoch": 0.93, + "grad_norm": 0.9381623403458342, + "learning_rate": 2.2093876962366755e-07, + "loss": 0.4971, + "step": 11504 + }, + { + "epoch": 0.94, + "grad_norm": 1.0936081725087992, + "learning_rate": 2.2038880450482635e-07, + "loss": 0.5285, + "step": 11505 + }, + { + "epoch": 0.94, + "grad_norm": 0.9464753191272031, + "learning_rate": 2.1983951710436768e-07, + "loss": 0.5045, + "step": 11506 + }, + { + "epoch": 0.94, + "grad_norm": 0.9439441443903467, + "learning_rate": 2.1929090746035442e-07, + "loss": 0.4977, + "step": 11507 + }, + { + "epoch": 0.94, + "grad_norm": 0.9693561399893567, + "learning_rate": 2.1874297561080616e-07, + "loss": 0.4464, + "step": 11508 + }, + { + "epoch": 0.94, + "grad_norm": 0.9700127600386009, + "learning_rate": 2.1819572159369362e-07, + "loss": 0.5189, + "step": 11509 + }, + { + "epoch": 0.94, + "grad_norm": 0.9969850062131683, + "learning_rate": 2.1764914544694203e-07, + "loss": 0.5442, + "step": 11510 + }, + { + "epoch": 0.94, + "grad_norm": 0.9328818708633978, + "learning_rate": 2.1710324720842556e-07, + "loss": 0.5314, + "step": 11511 + }, + { + "epoch": 0.94, + "grad_norm": 0.9966265518299623, + "learning_rate": 2.1655802691597837e-07, + "loss": 0.4564, + "step": 11512 + }, + { + "epoch": 0.94, + "grad_norm": 1.1030087220932174, + "learning_rate": 2.1601348460738136e-07, + "loss": 0.5491, + "step": 11513 + }, + { + "epoch": 0.94, + "grad_norm": 1.0014104948023606, + "learning_rate": 2.1546962032037211e-07, + "loss": 0.4876, + "step": 11514 + }, + { + "epoch": 0.94, + "grad_norm": 1.0001591152367717, + "learning_rate": 2.1492643409263826e-07, + "loss": 0.469, + "step": 11515 + }, + { + "epoch": 0.94, + "grad_norm": 0.9063598650995034, + "learning_rate": 2.1438392596182522e-07, + "loss": 0.5024, + "step": 11516 + }, + { + "epoch": 0.94, + "grad_norm": 0.969634253237918, + "learning_rate": 2.1384209596552297e-07, + "loss": 0.5086, + "step": 11517 + }, + { + "epoch": 0.94, + "grad_norm": 0.9050830053343426, + "learning_rate": 2.1330094414128588e-07, + "loss": 0.4827, + "step": 11518 + }, + { + "epoch": 0.94, + "grad_norm": 0.9616622729265908, + "learning_rate": 2.1276047052661176e-07, + "loss": 0.4942, + "step": 11519 + }, + { + "epoch": 0.94, + "grad_norm": 0.9292719969220156, + "learning_rate": 2.1222067515895618e-07, + "loss": 0.5031, + "step": 11520 + }, + { + "epoch": 0.94, + "grad_norm": 0.8583985298289181, + "learning_rate": 2.1168155807572476e-07, + "loss": 0.4731, + "step": 11521 + }, + { + "epoch": 0.94, + "grad_norm": 1.0368097064485176, + "learning_rate": 2.11143119314281e-07, + "loss": 0.4492, + "step": 11522 + }, + { + "epoch": 0.94, + "grad_norm": 0.9604901260255988, + "learning_rate": 2.1060535891193502e-07, + "loss": 0.5228, + "step": 11523 + }, + { + "epoch": 0.94, + "grad_norm": 0.9001466531715809, + "learning_rate": 2.1006827690595478e-07, + "loss": 0.4525, + "step": 11524 + }, + { + "epoch": 0.94, + "grad_norm": 0.9317278582374193, + "learning_rate": 2.095318733335594e-07, + "loss": 0.4114, + "step": 11525 + }, + { + "epoch": 0.94, + "grad_norm": 0.933645590445262, + "learning_rate": 2.089961482319214e-07, + "loss": 0.4291, + "step": 11526 + }, + { + "epoch": 0.94, + "grad_norm": 0.9756302117349039, + "learning_rate": 2.0846110163816547e-07, + "loss": 0.49, + "step": 11527 + }, + { + "epoch": 0.94, + "grad_norm": 1.0351340753968052, + "learning_rate": 2.0792673358936978e-07, + "loss": 0.5204, + "step": 11528 + }, + { + "epoch": 0.94, + "grad_norm": 0.893173614049143, + "learning_rate": 2.0739304412256578e-07, + "loss": 0.4897, + "step": 11529 + }, + { + "epoch": 0.94, + "grad_norm": 0.93363362407905, + "learning_rate": 2.0686003327473837e-07, + "loss": 0.4362, + "step": 11530 + }, + { + "epoch": 0.94, + "grad_norm": 0.94204371654142, + "learning_rate": 2.0632770108282462e-07, + "loss": 0.4555, + "step": 11531 + }, + { + "epoch": 0.94, + "grad_norm": 0.9595420828187516, + "learning_rate": 2.0579604758371286e-07, + "loss": 0.496, + "step": 11532 + }, + { + "epoch": 0.94, + "grad_norm": 0.9189135525666059, + "learning_rate": 2.05265072814248e-07, + "loss": 0.4318, + "step": 11533 + }, + { + "epoch": 0.94, + "grad_norm": 0.8494707099918426, + "learning_rate": 2.047347768112262e-07, + "loss": 0.4793, + "step": 11534 + }, + { + "epoch": 0.94, + "grad_norm": 1.0353601474519885, + "learning_rate": 2.0420515961139475e-07, + "loss": 0.4841, + "step": 11535 + }, + { + "epoch": 0.94, + "grad_norm": 0.8312928039244486, + "learning_rate": 2.0367622125145868e-07, + "loss": 0.5352, + "step": 11536 + }, + { + "epoch": 0.94, + "grad_norm": 0.9498648164663277, + "learning_rate": 2.0314796176806984e-07, + "loss": 0.4937, + "step": 11537 + }, + { + "epoch": 0.94, + "grad_norm": 0.9609294146325073, + "learning_rate": 2.0262038119783778e-07, + "loss": 0.5138, + "step": 11538 + }, + { + "epoch": 0.94, + "grad_norm": 0.9108235617292304, + "learning_rate": 2.0209347957732328e-07, + "loss": 0.5079, + "step": 11539 + }, + { + "epoch": 0.94, + "grad_norm": 0.8957003143894978, + "learning_rate": 2.0156725694303936e-07, + "loss": 0.4517, + "step": 11540 + }, + { + "epoch": 0.94, + "grad_norm": 0.9433740502699994, + "learning_rate": 2.010417133314535e-07, + "loss": 0.4947, + "step": 11541 + }, + { + "epoch": 0.94, + "grad_norm": 1.0002718530207013, + "learning_rate": 2.0051684877898547e-07, + "loss": 0.4633, + "step": 11542 + }, + { + "epoch": 0.94, + "grad_norm": 0.9221570895181908, + "learning_rate": 1.999926633220084e-07, + "loss": 0.4623, + "step": 11543 + }, + { + "epoch": 0.94, + "grad_norm": 0.9640475017965786, + "learning_rate": 1.9946915699684653e-07, + "loss": 0.4682, + "step": 11544 + }, + { + "epoch": 0.94, + "grad_norm": 0.9504884884382505, + "learning_rate": 1.9894632983977868e-07, + "loss": 0.4907, + "step": 11545 + }, + { + "epoch": 0.94, + "grad_norm": 0.9108308409096453, + "learning_rate": 1.9842418188703694e-07, + "loss": 0.4612, + "step": 11546 + }, + { + "epoch": 0.94, + "grad_norm": 0.967929844407366, + "learning_rate": 1.9790271317480458e-07, + "loss": 0.5103, + "step": 11547 + }, + { + "epoch": 0.94, + "grad_norm": 0.9606433023604526, + "learning_rate": 1.973819237392205e-07, + "loss": 0.5287, + "step": 11548 + }, + { + "epoch": 0.94, + "grad_norm": 0.8816364193756345, + "learning_rate": 1.968618136163747e-07, + "loss": 0.4454, + "step": 11549 + }, + { + "epoch": 0.94, + "grad_norm": 0.977224015587263, + "learning_rate": 1.9634238284230945e-07, + "loss": 0.4818, + "step": 11550 + }, + { + "epoch": 0.94, + "grad_norm": 1.1635547112143767, + "learning_rate": 1.9582363145302152e-07, + "loss": 0.4756, + "step": 11551 + }, + { + "epoch": 0.94, + "grad_norm": 1.029802951948784, + "learning_rate": 1.9530555948445883e-07, + "loss": 0.5323, + "step": 11552 + }, + { + "epoch": 0.94, + "grad_norm": 1.0420832717806676, + "learning_rate": 1.947881669725249e-07, + "loss": 0.4768, + "step": 11553 + }, + { + "epoch": 0.94, + "grad_norm": 0.8901052369947996, + "learning_rate": 1.9427145395307322e-07, + "loss": 0.4326, + "step": 11554 + }, + { + "epoch": 0.94, + "grad_norm": 1.0468425019933045, + "learning_rate": 1.9375542046191297e-07, + "loss": 0.5095, + "step": 11555 + }, + { + "epoch": 0.94, + "grad_norm": 0.8997505720146292, + "learning_rate": 1.9324006653480332e-07, + "loss": 0.4905, + "step": 11556 + }, + { + "epoch": 0.94, + "grad_norm": 0.9338220987399636, + "learning_rate": 1.927253922074579e-07, + "loss": 0.4623, + "step": 11557 + }, + { + "epoch": 0.94, + "grad_norm": 0.8516812243067089, + "learning_rate": 1.9221139751554373e-07, + "loss": 0.4504, + "step": 11558 + }, + { + "epoch": 0.94, + "grad_norm": 0.9289614121327456, + "learning_rate": 1.9169808249468125e-07, + "loss": 0.5018, + "step": 11559 + }, + { + "epoch": 0.94, + "grad_norm": 0.9779634493064184, + "learning_rate": 1.9118544718044084e-07, + "loss": 0.5542, + "step": 11560 + }, + { + "epoch": 0.94, + "grad_norm": 0.8994294029334284, + "learning_rate": 1.906734916083497e-07, + "loss": 0.4226, + "step": 11561 + }, + { + "epoch": 0.94, + "grad_norm": 0.9682669746822895, + "learning_rate": 1.9016221581388272e-07, + "loss": 0.5273, + "step": 11562 + }, + { + "epoch": 0.94, + "grad_norm": 0.9761134799360173, + "learning_rate": 1.8965161983247494e-07, + "loss": 0.5418, + "step": 11563 + }, + { + "epoch": 0.94, + "grad_norm": 1.0555396355770037, + "learning_rate": 1.891417036995069e-07, + "loss": 0.5418, + "step": 11564 + }, + { + "epoch": 0.94, + "grad_norm": 0.9106038553461743, + "learning_rate": 1.8863246745031704e-07, + "loss": 0.5275, + "step": 11565 + }, + { + "epoch": 0.94, + "grad_norm": 0.9721292591901912, + "learning_rate": 1.881239111201949e-07, + "loss": 0.4808, + "step": 11566 + }, + { + "epoch": 0.94, + "grad_norm": 1.2443193603745144, + "learning_rate": 1.876160347443823e-07, + "loss": 0.4499, + "step": 11567 + }, + { + "epoch": 0.94, + "grad_norm": 0.9113977420502493, + "learning_rate": 1.8710883835807437e-07, + "loss": 0.4509, + "step": 11568 + }, + { + "epoch": 0.94, + "grad_norm": 0.9157774007045484, + "learning_rate": 1.866023219964208e-07, + "loss": 0.4312, + "step": 11569 + }, + { + "epoch": 0.94, + "grad_norm": 0.9780163504554356, + "learning_rate": 1.8609648569452132e-07, + "loss": 0.4702, + "step": 11570 + }, + { + "epoch": 0.94, + "grad_norm": 0.9718901158746666, + "learning_rate": 1.8559132948743007e-07, + "loss": 0.4922, + "step": 11571 + }, + { + "epoch": 0.94, + "grad_norm": 1.0308700763315264, + "learning_rate": 1.8508685341015465e-07, + "loss": 0.4986, + "step": 11572 + }, + { + "epoch": 0.94, + "grad_norm": 0.8754438065720972, + "learning_rate": 1.845830574976548e-07, + "loss": 0.4633, + "step": 11573 + }, + { + "epoch": 0.94, + "grad_norm": 0.9917172090708526, + "learning_rate": 1.8407994178484155e-07, + "loss": 0.4981, + "step": 11574 + }, + { + "epoch": 0.94, + "grad_norm": 0.897217849065791, + "learning_rate": 1.8357750630658367e-07, + "loss": 0.4333, + "step": 11575 + }, + { + "epoch": 0.94, + "grad_norm": 0.8707656123011874, + "learning_rate": 1.830757510976966e-07, + "loss": 0.4689, + "step": 11576 + }, + { + "epoch": 0.94, + "grad_norm": 0.9677192816664028, + "learning_rate": 1.8257467619295143e-07, + "loss": 0.5139, + "step": 11577 + }, + { + "epoch": 0.94, + "grad_norm": 1.0527355591460235, + "learning_rate": 1.8207428162707374e-07, + "loss": 0.497, + "step": 11578 + }, + { + "epoch": 0.94, + "grad_norm": 0.9315945499644502, + "learning_rate": 1.8157456743474133e-07, + "loss": 0.5082, + "step": 11579 + }, + { + "epoch": 0.94, + "grad_norm": 0.8556914362013863, + "learning_rate": 1.8107553365057983e-07, + "loss": 0.4383, + "step": 11580 + }, + { + "epoch": 0.94, + "grad_norm": 0.9714958630042003, + "learning_rate": 1.8057718030917714e-07, + "loss": 0.4837, + "step": 11581 + }, + { + "epoch": 0.94, + "grad_norm": 0.7953942637319843, + "learning_rate": 1.8007950744506454e-07, + "loss": 0.4541, + "step": 11582 + }, + { + "epoch": 0.94, + "grad_norm": 1.0624559488720686, + "learning_rate": 1.7958251509273106e-07, + "loss": 0.5234, + "step": 11583 + }, + { + "epoch": 0.94, + "grad_norm": 1.0426144132894177, + "learning_rate": 1.790862032866203e-07, + "loss": 0.4869, + "step": 11584 + }, + { + "epoch": 0.94, + "grad_norm": 0.8442765483827533, + "learning_rate": 1.7859057206112361e-07, + "loss": 0.3807, + "step": 11585 + }, + { + "epoch": 0.94, + "grad_norm": 1.1444035809631279, + "learning_rate": 1.78095621450588e-07, + "loss": 0.5447, + "step": 11586 + }, + { + "epoch": 0.94, + "grad_norm": 0.9206041439662114, + "learning_rate": 1.776013514893149e-07, + "loss": 0.4671, + "step": 11587 + }, + { + "epoch": 0.94, + "grad_norm": 1.0077606054336101, + "learning_rate": 1.7710776221155578e-07, + "loss": 0.4941, + "step": 11588 + }, + { + "epoch": 0.94, + "grad_norm": 0.8720283736829387, + "learning_rate": 1.7661485365151553e-07, + "loss": 0.4178, + "step": 11589 + }, + { + "epoch": 0.94, + "grad_norm": 1.0005935424660755, + "learning_rate": 1.761226258433524e-07, + "loss": 0.5458, + "step": 11590 + }, + { + "epoch": 0.94, + "grad_norm": 0.9880090308093082, + "learning_rate": 1.756310788211779e-07, + "loss": 0.4767, + "step": 11591 + }, + { + "epoch": 0.94, + "grad_norm": 0.9555884247413868, + "learning_rate": 1.751402126190549e-07, + "loss": 0.5261, + "step": 11592 + }, + { + "epoch": 0.94, + "grad_norm": 0.9378522243429254, + "learning_rate": 1.7465002727100055e-07, + "loss": 0.4732, + "step": 11593 + }, + { + "epoch": 0.94, + "grad_norm": 1.0058916977963035, + "learning_rate": 1.741605228109844e-07, + "loss": 0.5062, + "step": 11594 + }, + { + "epoch": 0.94, + "grad_norm": 0.9147081497526133, + "learning_rate": 1.7367169927292925e-07, + "loss": 0.4483, + "step": 11595 + }, + { + "epoch": 0.94, + "grad_norm": 0.8592595462485942, + "learning_rate": 1.7318355669070807e-07, + "loss": 0.4526, + "step": 11596 + }, + { + "epoch": 0.94, + "grad_norm": 0.831385833810018, + "learning_rate": 1.7269609509815156e-07, + "loss": 0.43, + "step": 11597 + }, + { + "epoch": 0.94, + "grad_norm": 0.924732035163592, + "learning_rate": 1.7220931452903712e-07, + "loss": 0.4925, + "step": 11598 + }, + { + "epoch": 0.94, + "grad_norm": 0.9140520056972847, + "learning_rate": 1.7172321501710109e-07, + "loss": 0.4782, + "step": 11599 + }, + { + "epoch": 0.94, + "grad_norm": 0.975889917886747, + "learning_rate": 1.712377965960288e-07, + "loss": 0.5251, + "step": 11600 + }, + { + "epoch": 0.94, + "grad_norm": 0.9777399777316796, + "learning_rate": 1.7075305929945775e-07, + "loss": 0.4418, + "step": 11601 + }, + { + "epoch": 0.94, + "grad_norm": 0.9104619502187835, + "learning_rate": 1.7026900316098217e-07, + "loss": 0.4639, + "step": 11602 + }, + { + "epoch": 0.94, + "grad_norm": 1.0389219870998976, + "learning_rate": 1.6978562821414635e-07, + "loss": 0.5252, + "step": 11603 + }, + { + "epoch": 0.94, + "grad_norm": 0.8893812909710748, + "learning_rate": 1.6930293449244573e-07, + "loss": 0.4562, + "step": 11604 + }, + { + "epoch": 0.94, + "grad_norm": 1.0139611356319098, + "learning_rate": 1.6882092202933242e-07, + "loss": 0.5476, + "step": 11605 + }, + { + "epoch": 0.94, + "grad_norm": 0.9920808416502068, + "learning_rate": 1.683395908582097e-07, + "loss": 0.5021, + "step": 11606 + }, + { + "epoch": 0.94, + "grad_norm": 0.9401684008955127, + "learning_rate": 1.6785894101243205e-07, + "loss": 0.4315, + "step": 11607 + }, + { + "epoch": 0.94, + "grad_norm": 1.0326745169755602, + "learning_rate": 1.6737897252530832e-07, + "loss": 0.4319, + "step": 11608 + }, + { + "epoch": 0.94, + "grad_norm": 1.4288513899528752, + "learning_rate": 1.6689968543010082e-07, + "loss": 0.4886, + "step": 11609 + }, + { + "epoch": 0.94, + "grad_norm": 0.8585485958823579, + "learning_rate": 1.6642107976002186e-07, + "loss": 0.4083, + "step": 11610 + }, + { + "epoch": 0.94, + "grad_norm": 0.9069449809793708, + "learning_rate": 1.659431555482416e-07, + "loss": 0.4786, + "step": 11611 + }, + { + "epoch": 0.94, + "grad_norm": 1.1631691820206738, + "learning_rate": 1.6546591282787683e-07, + "loss": 0.5776, + "step": 11612 + }, + { + "epoch": 0.94, + "grad_norm": 1.0616439313560415, + "learning_rate": 1.6498935163200114e-07, + "loss": 0.5342, + "step": 11613 + }, + { + "epoch": 0.94, + "grad_norm": 0.8923630631274349, + "learning_rate": 1.6451347199364032e-07, + "loss": 0.517, + "step": 11614 + }, + { + "epoch": 0.94, + "grad_norm": 0.9362423351584785, + "learning_rate": 1.6403827394577244e-07, + "loss": 0.4897, + "step": 11615 + }, + { + "epoch": 0.94, + "grad_norm": 0.9151508211734752, + "learning_rate": 1.6356375752132558e-07, + "loss": 0.4951, + "step": 11616 + }, + { + "epoch": 0.94, + "grad_norm": 0.8389818007257338, + "learning_rate": 1.630899227531868e-07, + "loss": 0.4103, + "step": 11617 + }, + { + "epoch": 0.94, + "grad_norm": 0.9323848955910646, + "learning_rate": 1.6261676967419094e-07, + "loss": 0.468, + "step": 11618 + }, + { + "epoch": 0.94, + "grad_norm": 0.8786132211991765, + "learning_rate": 1.621442983171262e-07, + "loss": 0.4523, + "step": 11619 + }, + { + "epoch": 0.94, + "grad_norm": 0.8676920931406709, + "learning_rate": 1.616725087147364e-07, + "loss": 0.49, + "step": 11620 + }, + { + "epoch": 0.94, + "grad_norm": 0.9795059798461183, + "learning_rate": 1.6120140089971536e-07, + "loss": 0.5262, + "step": 11621 + }, + { + "epoch": 0.94, + "grad_norm": 1.0434011389872648, + "learning_rate": 1.6073097490470924e-07, + "loss": 0.5017, + "step": 11622 + }, + { + "epoch": 0.94, + "grad_norm": 0.9551902553910484, + "learning_rate": 1.602612307623208e-07, + "loss": 0.4933, + "step": 11623 + }, + { + "epoch": 0.94, + "grad_norm": 0.9575849488802687, + "learning_rate": 1.5979216850509848e-07, + "loss": 0.4581, + "step": 11624 + }, + { + "epoch": 0.94, + "grad_norm": 0.903798519803686, + "learning_rate": 1.5932378816555405e-07, + "loss": 0.4304, + "step": 11625 + }, + { + "epoch": 0.94, + "grad_norm": 0.903238053594976, + "learning_rate": 1.5885608977614043e-07, + "loss": 0.4708, + "step": 11626 + }, + { + "epoch": 0.94, + "grad_norm": 1.0195498094816038, + "learning_rate": 1.5838907336927055e-07, + "loss": 0.493, + "step": 11627 + }, + { + "epoch": 0.95, + "grad_norm": 0.948691606472617, + "learning_rate": 1.5792273897730858e-07, + "loss": 0.5152, + "step": 11628 + }, + { + "epoch": 0.95, + "grad_norm": 0.9576141491811475, + "learning_rate": 1.5745708663257199e-07, + "loss": 0.4619, + "step": 11629 + }, + { + "epoch": 0.95, + "grad_norm": 0.9671330738151036, + "learning_rate": 1.5699211636732714e-07, + "loss": 0.4614, + "step": 11630 + }, + { + "epoch": 0.95, + "grad_norm": 0.9491379972972037, + "learning_rate": 1.5652782821379942e-07, + "loss": 0.4687, + "step": 11631 + }, + { + "epoch": 0.95, + "grad_norm": 0.8516708278279836, + "learning_rate": 1.5606422220416196e-07, + "loss": 0.4502, + "step": 11632 + }, + { + "epoch": 0.95, + "grad_norm": 0.922946174629565, + "learning_rate": 1.5560129837054127e-07, + "loss": 0.4468, + "step": 11633 + }, + { + "epoch": 0.95, + "grad_norm": 0.893755085344053, + "learning_rate": 1.551390567450195e-07, + "loss": 0.4817, + "step": 11634 + }, + { + "epoch": 0.95, + "grad_norm": 1.0082700354791259, + "learning_rate": 1.5467749735962878e-07, + "loss": 0.4937, + "step": 11635 + }, + { + "epoch": 0.95, + "grad_norm": 0.9353409160951671, + "learning_rate": 1.5421662024635353e-07, + "loss": 0.4265, + "step": 11636 + }, + { + "epoch": 0.95, + "grad_norm": 0.9418824879588475, + "learning_rate": 1.5375642543713488e-07, + "loss": 0.4844, + "step": 11637 + }, + { + "epoch": 0.95, + "grad_norm": 0.9828534306732699, + "learning_rate": 1.5329691296386174e-07, + "loss": 0.4977, + "step": 11638 + }, + { + "epoch": 0.95, + "grad_norm": 1.044063249037734, + "learning_rate": 1.5283808285837754e-07, + "loss": 0.5204, + "step": 11639 + }, + { + "epoch": 0.95, + "grad_norm": 1.027630071859994, + "learning_rate": 1.5237993515248017e-07, + "loss": 0.4924, + "step": 11640 + }, + { + "epoch": 0.95, + "grad_norm": 0.9350323418549664, + "learning_rate": 1.519224698779198e-07, + "loss": 0.5401, + "step": 11641 + }, + { + "epoch": 0.95, + "grad_norm": 0.9073285796594803, + "learning_rate": 1.5146568706639552e-07, + "loss": 0.4147, + "step": 11642 + }, + { + "epoch": 0.95, + "grad_norm": 0.9303432068459472, + "learning_rate": 1.5100958674956424e-07, + "loss": 0.4919, + "step": 11643 + }, + { + "epoch": 0.95, + "grad_norm": 0.8080326063767643, + "learning_rate": 1.5055416895903284e-07, + "loss": 0.413, + "step": 11644 + }, + { + "epoch": 0.95, + "grad_norm": 0.8638585497713773, + "learning_rate": 1.500994337263606e-07, + "loss": 0.4604, + "step": 11645 + }, + { + "epoch": 0.95, + "grad_norm": 1.0138512478569417, + "learning_rate": 1.4964538108306004e-07, + "loss": 0.4739, + "step": 11646 + }, + { + "epoch": 0.95, + "grad_norm": 1.0237844054054825, + "learning_rate": 1.4919201106059932e-07, + "loss": 0.5367, + "step": 11647 + }, + { + "epoch": 0.95, + "grad_norm": 0.9469968312258004, + "learning_rate": 1.4873932369039223e-07, + "loss": 0.4471, + "step": 11648 + }, + { + "epoch": 0.95, + "grad_norm": 0.9348968855520209, + "learning_rate": 1.4828731900381366e-07, + "loss": 0.4464, + "step": 11649 + }, + { + "epoch": 0.95, + "grad_norm": 0.9454783504849241, + "learning_rate": 1.478359970321852e-07, + "loss": 0.4805, + "step": 11650 + }, + { + "epoch": 0.95, + "grad_norm": 0.9696086869071582, + "learning_rate": 1.47385357806783e-07, + "loss": 0.4803, + "step": 11651 + }, + { + "epoch": 0.95, + "grad_norm": 0.9258787941079295, + "learning_rate": 1.4693540135883533e-07, + "loss": 0.4343, + "step": 11652 + }, + { + "epoch": 0.95, + "grad_norm": 0.859426434883943, + "learning_rate": 1.4648612771952618e-07, + "loss": 0.4454, + "step": 11653 + }, + { + "epoch": 0.95, + "grad_norm": 0.9625867462394057, + "learning_rate": 1.4603753691998735e-07, + "loss": 0.4783, + "step": 11654 + }, + { + "epoch": 0.95, + "grad_norm": 0.9656030188852532, + "learning_rate": 1.4558962899130724e-07, + "loss": 0.5484, + "step": 11655 + }, + { + "epoch": 0.95, + "grad_norm": 0.9501134464292218, + "learning_rate": 1.4514240396452438e-07, + "loss": 0.4818, + "step": 11656 + }, + { + "epoch": 0.95, + "grad_norm": 1.0097149440429334, + "learning_rate": 1.4469586187063289e-07, + "loss": 0.524, + "step": 11657 + }, + { + "epoch": 0.95, + "grad_norm": 1.0225104022893932, + "learning_rate": 1.4425000274057577e-07, + "loss": 0.5142, + "step": 11658 + }, + { + "epoch": 0.95, + "grad_norm": 0.8977241993375531, + "learning_rate": 1.4380482660525164e-07, + "loss": 0.4279, + "step": 11659 + }, + { + "epoch": 0.95, + "grad_norm": 0.9465529505683918, + "learning_rate": 1.4336033349550916e-07, + "loss": 0.4816, + "step": 11660 + }, + { + "epoch": 0.95, + "grad_norm": 0.9588676788041502, + "learning_rate": 1.429165234421548e-07, + "loss": 0.4907, + "step": 11661 + }, + { + "epoch": 0.95, + "grad_norm": 0.9247383698200462, + "learning_rate": 1.424733964759406e-07, + "loss": 0.4896, + "step": 11662 + }, + { + "epoch": 0.95, + "grad_norm": 0.9732645453172724, + "learning_rate": 1.420309526275776e-07, + "loss": 0.4748, + "step": 11663 + }, + { + "epoch": 0.95, + "grad_norm": 0.8902318164715665, + "learning_rate": 1.4158919192772458e-07, + "loss": 0.4787, + "step": 11664 + }, + { + "epoch": 0.95, + "grad_norm": 0.9836209649754226, + "learning_rate": 1.4114811440699706e-07, + "loss": 0.4465, + "step": 11665 + }, + { + "epoch": 0.95, + "grad_norm": 0.9865694640029874, + "learning_rate": 1.4070772009595944e-07, + "loss": 0.446, + "step": 11666 + }, + { + "epoch": 0.95, + "grad_norm": 1.1353592274720519, + "learning_rate": 1.4026800902513293e-07, + "loss": 0.5372, + "step": 11667 + }, + { + "epoch": 0.95, + "grad_norm": 0.9894621895910385, + "learning_rate": 1.3982898122498755e-07, + "loss": 0.4594, + "step": 11668 + }, + { + "epoch": 0.95, + "grad_norm": 0.8808762541092449, + "learning_rate": 1.3939063672594677e-07, + "loss": 0.415, + "step": 11669 + }, + { + "epoch": 0.95, + "grad_norm": 0.9543222001061445, + "learning_rate": 1.389529755583885e-07, + "loss": 0.5108, + "step": 11670 + }, + { + "epoch": 0.95, + "grad_norm": 0.9031544153763041, + "learning_rate": 1.3851599775264403e-07, + "loss": 0.4892, + "step": 11671 + }, + { + "epoch": 0.95, + "grad_norm": 1.051975022845536, + "learning_rate": 1.3807970333899133e-07, + "loss": 0.5505, + "step": 11672 + }, + { + "epoch": 0.95, + "grad_norm": 0.9494252597569944, + "learning_rate": 1.3764409234766962e-07, + "loss": 0.4922, + "step": 11673 + }, + { + "epoch": 0.95, + "grad_norm": 0.9844827083958534, + "learning_rate": 1.3720916480886359e-07, + "loss": 0.5427, + "step": 11674 + }, + { + "epoch": 0.95, + "grad_norm": 1.0213755760482455, + "learning_rate": 1.367749207527147e-07, + "loss": 0.4659, + "step": 11675 + }, + { + "epoch": 0.95, + "grad_norm": 1.1219645031442136, + "learning_rate": 1.3634136020931444e-07, + "loss": 0.5685, + "step": 11676 + }, + { + "epoch": 0.95, + "grad_norm": 0.9198137919526792, + "learning_rate": 1.3590848320870874e-07, + "loss": 0.4975, + "step": 11677 + }, + { + "epoch": 0.95, + "grad_norm": 0.9814407636972431, + "learning_rate": 1.35476289780897e-07, + "loss": 0.4589, + "step": 11678 + }, + { + "epoch": 0.95, + "grad_norm": 0.9265380091276773, + "learning_rate": 1.3504477995582744e-07, + "loss": 0.4739, + "step": 11679 + }, + { + "epoch": 0.95, + "grad_norm": 0.9235688603751667, + "learning_rate": 1.3461395376340502e-07, + "loss": 0.5288, + "step": 11680 + }, + { + "epoch": 0.95, + "grad_norm": 0.9593894678972065, + "learning_rate": 1.3418381123348477e-07, + "loss": 0.4638, + "step": 11681 + }, + { + "epoch": 0.95, + "grad_norm": 0.9690623318955943, + "learning_rate": 1.337543523958751e-07, + "loss": 0.5005, + "step": 11682 + }, + { + "epoch": 0.95, + "grad_norm": 0.846608089060443, + "learning_rate": 1.333255772803377e-07, + "loss": 0.408, + "step": 11683 + }, + { + "epoch": 0.95, + "grad_norm": 0.969082169503564, + "learning_rate": 1.3289748591658546e-07, + "loss": 0.4923, + "step": 11684 + }, + { + "epoch": 0.95, + "grad_norm": 0.8407132879000558, + "learning_rate": 1.3247007833428694e-07, + "loss": 0.4794, + "step": 11685 + }, + { + "epoch": 0.95, + "grad_norm": 1.0111315851398217, + "learning_rate": 1.320433545630584e-07, + "loss": 0.532, + "step": 11686 + }, + { + "epoch": 0.95, + "grad_norm": 1.0221013452148755, + "learning_rate": 1.3161731463247284e-07, + "loss": 0.4515, + "step": 11687 + }, + { + "epoch": 0.95, + "grad_norm": 0.8748986501368735, + "learning_rate": 1.3119195857205337e-07, + "loss": 0.4978, + "step": 11688 + }, + { + "epoch": 0.95, + "grad_norm": 0.9497318881951801, + "learning_rate": 1.3076728641127857e-07, + "loss": 0.478, + "step": 11689 + }, + { + "epoch": 0.95, + "grad_norm": 0.915779655254726, + "learning_rate": 1.3034329817957603e-07, + "loss": 0.424, + "step": 11690 + }, + { + "epoch": 0.95, + "grad_norm": 0.9553825667674619, + "learning_rate": 1.2991999390632892e-07, + "loss": 0.4347, + "step": 11691 + }, + { + "epoch": 0.95, + "grad_norm": 0.9041932919429256, + "learning_rate": 1.2949737362087156e-07, + "loss": 0.4739, + "step": 11692 + }, + { + "epoch": 0.95, + "grad_norm": 0.8968792184451714, + "learning_rate": 1.2907543735249163e-07, + "loss": 0.4658, + "step": 11693 + }, + { + "epoch": 0.95, + "grad_norm": 1.0653933580624912, + "learning_rate": 1.286541851304268e-07, + "loss": 0.5547, + "step": 11694 + }, + { + "epoch": 0.95, + "grad_norm": 0.9609039869721445, + "learning_rate": 1.282336169838727e-07, + "loss": 0.4894, + "step": 11695 + }, + { + "epoch": 0.95, + "grad_norm": 1.020385805816101, + "learning_rate": 1.278137329419715e-07, + "loss": 0.5107, + "step": 11696 + }, + { + "epoch": 0.95, + "grad_norm": 0.8729444977310155, + "learning_rate": 1.2739453303382222e-07, + "loss": 0.4691, + "step": 11697 + }, + { + "epoch": 0.95, + "grad_norm": 0.9736436538183029, + "learning_rate": 1.2697601728847596e-07, + "loss": 0.4704, + "step": 11698 + }, + { + "epoch": 0.95, + "grad_norm": 1.009484260023586, + "learning_rate": 1.2655818573493295e-07, + "loss": 0.4842, + "step": 11699 + }, + { + "epoch": 0.95, + "grad_norm": 0.9396092829306968, + "learning_rate": 1.261410384021511e-07, + "loss": 0.5107, + "step": 11700 + }, + { + "epoch": 0.95, + "grad_norm": 0.872628805875551, + "learning_rate": 1.2572457531903614e-07, + "loss": 0.4342, + "step": 11701 + }, + { + "epoch": 0.95, + "grad_norm": 0.9414232683288218, + "learning_rate": 1.2530879651444949e-07, + "loss": 0.4543, + "step": 11702 + }, + { + "epoch": 0.95, + "grad_norm": 0.864248669810779, + "learning_rate": 1.2489370201720473e-07, + "loss": 0.4684, + "step": 11703 + }, + { + "epoch": 0.95, + "grad_norm": 0.9694194096349622, + "learning_rate": 1.2447929185606778e-07, + "loss": 0.4468, + "step": 11704 + }, + { + "epoch": 0.95, + "grad_norm": 1.0015075266884892, + "learning_rate": 1.2406556605975673e-07, + "loss": 0.5852, + "step": 11705 + }, + { + "epoch": 0.95, + "grad_norm": 0.9396979059242895, + "learning_rate": 1.2365252465694088e-07, + "loss": 0.4968, + "step": 11706 + }, + { + "epoch": 0.95, + "grad_norm": 0.9359777283532162, + "learning_rate": 1.2324016767624515e-07, + "loss": 0.4768, + "step": 11707 + }, + { + "epoch": 0.95, + "grad_norm": 1.0363612056953866, + "learning_rate": 1.228284951462444e-07, + "loss": 0.5069, + "step": 11708 + }, + { + "epoch": 0.95, + "grad_norm": 0.9450225369152239, + "learning_rate": 1.2241750709546918e-07, + "loss": 0.4555, + "step": 11709 + }, + { + "epoch": 0.95, + "grad_norm": 0.945398340483411, + "learning_rate": 1.2200720355239893e-07, + "loss": 0.4209, + "step": 11710 + }, + { + "epoch": 0.95, + "grad_norm": 0.9641827946918541, + "learning_rate": 1.2159758454546643e-07, + "loss": 0.465, + "step": 11711 + }, + { + "epoch": 0.95, + "grad_norm": 1.0174498372544294, + "learning_rate": 1.2118865010306124e-07, + "loss": 0.5257, + "step": 11712 + }, + { + "epoch": 0.95, + "grad_norm": 0.8686355378031086, + "learning_rate": 1.2078040025351844e-07, + "loss": 0.4278, + "step": 11713 + }, + { + "epoch": 0.95, + "grad_norm": 1.1133168236049928, + "learning_rate": 1.2037283502513208e-07, + "loss": 0.4626, + "step": 11714 + }, + { + "epoch": 0.95, + "grad_norm": 0.9425915566814234, + "learning_rate": 1.1996595444614511e-07, + "loss": 0.4682, + "step": 11715 + }, + { + "epoch": 0.95, + "grad_norm": 0.8035223312922553, + "learning_rate": 1.1955975854475388e-07, + "loss": 0.479, + "step": 11716 + }, + { + "epoch": 0.95, + "grad_norm": 0.8922305833970967, + "learning_rate": 1.1915424734910585e-07, + "loss": 0.4497, + "step": 11717 + }, + { + "epoch": 0.95, + "grad_norm": 0.9559911600741078, + "learning_rate": 1.1874942088730635e-07, + "loss": 0.4816, + "step": 11718 + }, + { + "epoch": 0.95, + "grad_norm": 0.9758579168957457, + "learning_rate": 1.1834527918740624e-07, + "loss": 0.5539, + "step": 11719 + }, + { + "epoch": 0.95, + "grad_norm": 1.0506259051311835, + "learning_rate": 1.1794182227741314e-07, + "loss": 0.4521, + "step": 11720 + }, + { + "epoch": 0.95, + "grad_norm": 0.9705461383571655, + "learning_rate": 1.1753905018528688e-07, + "loss": 0.5648, + "step": 11721 + }, + { + "epoch": 0.95, + "grad_norm": 1.0157017913842958, + "learning_rate": 1.171369629389385e-07, + "loss": 0.5057, + "step": 11722 + }, + { + "epoch": 0.95, + "grad_norm": 0.9483830944456155, + "learning_rate": 1.1673556056623237e-07, + "loss": 0.4671, + "step": 11723 + }, + { + "epoch": 0.95, + "grad_norm": 0.9072532108846191, + "learning_rate": 1.1633484309498511e-07, + "loss": 0.4809, + "step": 11724 + }, + { + "epoch": 0.95, + "grad_norm": 0.9568152372299135, + "learning_rate": 1.1593481055296673e-07, + "loss": 0.4797, + "step": 11725 + }, + { + "epoch": 0.95, + "grad_norm": 0.8821466617527249, + "learning_rate": 1.1553546296789952e-07, + "loss": 0.4821, + "step": 11726 + }, + { + "epoch": 0.95, + "grad_norm": 0.9175803795045503, + "learning_rate": 1.1513680036745578e-07, + "loss": 0.4646, + "step": 11727 + }, + { + "epoch": 0.95, + "grad_norm": 1.0322661343458974, + "learning_rate": 1.1473882277926562e-07, + "loss": 0.4446, + "step": 11728 + }, + { + "epoch": 0.95, + "grad_norm": 1.0408277460676767, + "learning_rate": 1.1434153023090589e-07, + "loss": 0.5255, + "step": 11729 + }, + { + "epoch": 0.95, + "grad_norm": 1.027227376134949, + "learning_rate": 1.1394492274991009e-07, + "loss": 0.5529, + "step": 11730 + }, + { + "epoch": 0.95, + "grad_norm": 0.9732458654486648, + "learning_rate": 1.1354900036376181e-07, + "loss": 0.5195, + "step": 11731 + }, + { + "epoch": 0.95, + "grad_norm": 1.0535313265351196, + "learning_rate": 1.13153763099898e-07, + "loss": 0.5504, + "step": 11732 + }, + { + "epoch": 0.95, + "grad_norm": 0.9198569397489443, + "learning_rate": 1.1275921098570896e-07, + "loss": 0.4748, + "step": 11733 + }, + { + "epoch": 0.95, + "grad_norm": 0.917712254839116, + "learning_rate": 1.1236534404853727e-07, + "loss": 0.4632, + "step": 11734 + }, + { + "epoch": 0.95, + "grad_norm": 0.856909983032463, + "learning_rate": 1.1197216231567664e-07, + "loss": 0.4514, + "step": 11735 + }, + { + "epoch": 0.95, + "grad_norm": 0.9469785652163093, + "learning_rate": 1.1157966581437419e-07, + "loss": 0.4618, + "step": 11736 + }, + { + "epoch": 0.95, + "grad_norm": 0.9599933656874255, + "learning_rate": 1.1118785457183034e-07, + "loss": 0.4708, + "step": 11737 + }, + { + "epoch": 0.95, + "grad_norm": 1.094933152970052, + "learning_rate": 1.1079672861519675e-07, + "loss": 0.56, + "step": 11738 + }, + { + "epoch": 0.95, + "grad_norm": 1.073555695579005, + "learning_rate": 1.1040628797157727e-07, + "loss": 0.4836, + "step": 11739 + }, + { + "epoch": 0.95, + "grad_norm": 0.9336131282070538, + "learning_rate": 1.1001653266803136e-07, + "loss": 0.4616, + "step": 11740 + }, + { + "epoch": 0.95, + "grad_norm": 0.9517994592150936, + "learning_rate": 1.0962746273156633e-07, + "loss": 0.5279, + "step": 11741 + }, + { + "epoch": 0.95, + "grad_norm": 0.9092708185894586, + "learning_rate": 1.0923907818914614e-07, + "loss": 0.4783, + "step": 11742 + }, + { + "epoch": 0.95, + "grad_norm": 0.9137401074747392, + "learning_rate": 1.0885137906768373e-07, + "loss": 0.4915, + "step": 11743 + }, + { + "epoch": 0.95, + "grad_norm": 0.886062036636107, + "learning_rate": 1.084643653940487e-07, + "loss": 0.41, + "step": 11744 + }, + { + "epoch": 0.95, + "grad_norm": 1.0168251352297248, + "learning_rate": 1.0807803719505849e-07, + "loss": 0.4606, + "step": 11745 + }, + { + "epoch": 0.95, + "grad_norm": 0.8764053557786847, + "learning_rate": 1.0769239449748614e-07, + "loss": 0.4161, + "step": 11746 + }, + { + "epoch": 0.95, + "grad_norm": 0.9543498410492781, + "learning_rate": 1.0730743732805581e-07, + "loss": 0.4976, + "step": 11747 + }, + { + "epoch": 0.95, + "grad_norm": 0.8988669209801309, + "learning_rate": 1.0692316571344619e-07, + "loss": 0.4746, + "step": 11748 + }, + { + "epoch": 0.95, + "grad_norm": 0.9705097304261154, + "learning_rate": 1.0653957968028594e-07, + "loss": 0.531, + "step": 11749 + }, + { + "epoch": 0.95, + "grad_norm": 0.8087770163850684, + "learning_rate": 1.0615667925515716e-07, + "loss": 0.4503, + "step": 11750 + }, + { + "epoch": 0.96, + "grad_norm": 0.8585770296241498, + "learning_rate": 1.0577446446459416e-07, + "loss": 0.394, + "step": 11751 + }, + { + "epoch": 0.96, + "grad_norm": 0.8991131732136636, + "learning_rate": 1.0539293533508577e-07, + "loss": 0.4604, + "step": 11752 + }, + { + "epoch": 0.96, + "grad_norm": 1.043978687288304, + "learning_rate": 1.0501209189306972e-07, + "loss": 0.5689, + "step": 11753 + }, + { + "epoch": 0.96, + "grad_norm": 1.012373658397403, + "learning_rate": 1.0463193416493933e-07, + "loss": 0.5616, + "step": 11754 + }, + { + "epoch": 0.96, + "grad_norm": 0.9632718766958349, + "learning_rate": 1.04252462177038e-07, + "loss": 0.4913, + "step": 11755 + }, + { + "epoch": 0.96, + "grad_norm": 0.952867772462053, + "learning_rate": 1.0387367595566355e-07, + "loss": 0.4671, + "step": 11756 + }, + { + "epoch": 0.96, + "grad_norm": 0.9312195502524956, + "learning_rate": 1.0349557552706613e-07, + "loss": 0.5035, + "step": 11757 + }, + { + "epoch": 0.96, + "grad_norm": 0.9671971360550594, + "learning_rate": 1.0311816091744698e-07, + "loss": 0.4186, + "step": 11758 + }, + { + "epoch": 0.96, + "grad_norm": 0.9494946903358517, + "learning_rate": 1.0274143215296073e-07, + "loss": 0.4798, + "step": 11759 + }, + { + "epoch": 0.96, + "grad_norm": 0.8579853360947254, + "learning_rate": 1.0236538925971429e-07, + "loss": 0.4214, + "step": 11760 + }, + { + "epoch": 0.96, + "grad_norm": 0.9821881441558292, + "learning_rate": 1.019900322637668e-07, + "loss": 0.4483, + "step": 11761 + }, + { + "epoch": 0.96, + "grad_norm": 0.919312383287609, + "learning_rate": 1.016153611911308e-07, + "loss": 0.5016, + "step": 11762 + }, + { + "epoch": 0.96, + "grad_norm": 0.9690368869195982, + "learning_rate": 1.0124137606777107e-07, + "loss": 0.4951, + "step": 11763 + }, + { + "epoch": 0.96, + "grad_norm": 0.9219625016430492, + "learning_rate": 1.0086807691960243e-07, + "loss": 0.4489, + "step": 11764 + }, + { + "epoch": 0.96, + "grad_norm": 0.9490039649323291, + "learning_rate": 1.0049546377249642e-07, + "loss": 0.5141, + "step": 11765 + }, + { + "epoch": 0.96, + "grad_norm": 1.0328827891759012, + "learning_rate": 1.0012353665227458e-07, + "loss": 0.4956, + "step": 11766 + }, + { + "epoch": 0.96, + "grad_norm": 0.8300609810503681, + "learning_rate": 9.975229558470967e-08, + "loss": 0.4694, + "step": 11767 + }, + { + "epoch": 0.96, + "grad_norm": 0.9583215834991651, + "learning_rate": 9.938174059552885e-08, + "loss": 0.4489, + "step": 11768 + }, + { + "epoch": 0.96, + "grad_norm": 1.1407787654736952, + "learning_rate": 9.901187171041271e-08, + "loss": 0.5123, + "step": 11769 + }, + { + "epoch": 0.96, + "grad_norm": 0.9188810440416877, + "learning_rate": 9.864268895499074e-08, + "loss": 0.475, + "step": 11770 + }, + { + "epoch": 0.96, + "grad_norm": 0.9768542214641207, + "learning_rate": 9.827419235484803e-08, + "loss": 0.5083, + "step": 11771 + }, + { + "epoch": 0.96, + "grad_norm": 0.971834748912786, + "learning_rate": 9.790638193552082e-08, + "loss": 0.4618, + "step": 11772 + }, + { + "epoch": 0.96, + "grad_norm": 0.9809466241068123, + "learning_rate": 9.753925772249873e-08, + "loss": 0.5157, + "step": 11773 + }, + { + "epoch": 0.96, + "grad_norm": 1.046921175293461, + "learning_rate": 9.71728197412225e-08, + "loss": 0.5045, + "step": 11774 + }, + { + "epoch": 0.96, + "grad_norm": 0.9619930134152861, + "learning_rate": 9.680706801708517e-08, + "loss": 0.4513, + "step": 11775 + }, + { + "epoch": 0.96, + "grad_norm": 1.0247706813665163, + "learning_rate": 9.644200257543534e-08, + "loss": 0.5578, + "step": 11776 + }, + { + "epoch": 0.96, + "grad_norm": 0.9856133551445193, + "learning_rate": 9.607762344156946e-08, + "loss": 0.5058, + "step": 11777 + }, + { + "epoch": 0.96, + "grad_norm": 0.9145922251023627, + "learning_rate": 9.571393064073953e-08, + "loss": 0.4999, + "step": 11778 + }, + { + "epoch": 0.96, + "grad_norm": 0.9405462474920527, + "learning_rate": 9.535092419814873e-08, + "loss": 0.4495, + "step": 11779 + }, + { + "epoch": 0.96, + "grad_norm": 1.0159098070267714, + "learning_rate": 9.498860413895472e-08, + "loss": 0.5144, + "step": 11780 + }, + { + "epoch": 0.96, + "grad_norm": 0.8960909471248009, + "learning_rate": 9.462697048826408e-08, + "loss": 0.4778, + "step": 11781 + }, + { + "epoch": 0.96, + "grad_norm": 0.9080420987831723, + "learning_rate": 9.426602327113788e-08, + "loss": 0.4559, + "step": 11782 + }, + { + "epoch": 0.96, + "grad_norm": 0.9544893878383572, + "learning_rate": 9.390576251258943e-08, + "loss": 0.4975, + "step": 11783 + }, + { + "epoch": 0.96, + "grad_norm": 0.9637330708653101, + "learning_rate": 9.354618823758654e-08, + "loss": 0.4813, + "step": 11784 + }, + { + "epoch": 0.96, + "grad_norm": 0.9727251888984069, + "learning_rate": 9.318730047104484e-08, + "loss": 0.4482, + "step": 11785 + }, + { + "epoch": 0.96, + "grad_norm": 1.0301592271271238, + "learning_rate": 9.282909923783557e-08, + "loss": 0.4473, + "step": 11786 + }, + { + "epoch": 0.96, + "grad_norm": 0.9666339201017452, + "learning_rate": 9.247158456278327e-08, + "loss": 0.4611, + "step": 11787 + }, + { + "epoch": 0.96, + "grad_norm": 0.9457148871860079, + "learning_rate": 9.211475647066148e-08, + "loss": 0.4772, + "step": 11788 + }, + { + "epoch": 0.96, + "grad_norm": 0.9110564036623252, + "learning_rate": 9.175861498619821e-08, + "loss": 0.4615, + "step": 11789 + }, + { + "epoch": 0.96, + "grad_norm": 0.9682975620629547, + "learning_rate": 9.140316013407479e-08, + "loss": 0.4591, + "step": 11790 + }, + { + "epoch": 0.96, + "grad_norm": 1.0789231927916287, + "learning_rate": 9.104839193892379e-08, + "loss": 0.4509, + "step": 11791 + }, + { + "epoch": 0.96, + "grad_norm": 1.07751499957963, + "learning_rate": 9.069431042532995e-08, + "loss": 0.5149, + "step": 11792 + }, + { + "epoch": 0.96, + "grad_norm": 0.9618200013702595, + "learning_rate": 9.034091561783032e-08, + "loss": 0.541, + "step": 11793 + }, + { + "epoch": 0.96, + "grad_norm": 1.033258536988785, + "learning_rate": 8.99882075409153e-08, + "loss": 0.5121, + "step": 11794 + }, + { + "epoch": 0.96, + "grad_norm": 0.7940648088676413, + "learning_rate": 8.963618621902759e-08, + "loss": 0.4257, + "step": 11795 + }, + { + "epoch": 0.96, + "grad_norm": 0.9676953589118892, + "learning_rate": 8.928485167656208e-08, + "loss": 0.482, + "step": 11796 + }, + { + "epoch": 0.96, + "grad_norm": 0.9316837620156098, + "learning_rate": 8.89342039378649e-08, + "loss": 0.4366, + "step": 11797 + }, + { + "epoch": 0.96, + "grad_norm": 0.9919385490875576, + "learning_rate": 8.858424302723767e-08, + "loss": 0.4911, + "step": 11798 + }, + { + "epoch": 0.96, + "grad_norm": 0.8620054709348294, + "learning_rate": 8.823496896892991e-08, + "loss": 0.4312, + "step": 11799 + }, + { + "epoch": 0.96, + "grad_norm": 0.9563793794175025, + "learning_rate": 8.78863817871467e-08, + "loss": 0.4966, + "step": 11800 + }, + { + "epoch": 0.96, + "grad_norm": 0.9019147172069846, + "learning_rate": 8.753848150604538e-08, + "loss": 0.457, + "step": 11801 + }, + { + "epoch": 0.96, + "grad_norm": 1.007078630021055, + "learning_rate": 8.719126814973556e-08, + "loss": 0.491, + "step": 11802 + }, + { + "epoch": 0.96, + "grad_norm": 0.9815358823107124, + "learning_rate": 8.684474174227797e-08, + "loss": 0.5185, + "step": 11803 + }, + { + "epoch": 0.96, + "grad_norm": 0.9684981077551171, + "learning_rate": 8.649890230768676e-08, + "loss": 0.4662, + "step": 11804 + }, + { + "epoch": 0.96, + "grad_norm": 0.8355804604535023, + "learning_rate": 8.615374986992831e-08, + "loss": 0.4454, + "step": 11805 + }, + { + "epoch": 0.96, + "grad_norm": 0.9829781903898468, + "learning_rate": 8.580928445292124e-08, + "loss": 0.5163, + "step": 11806 + }, + { + "epoch": 0.96, + "grad_norm": 0.9820450700426611, + "learning_rate": 8.54655060805365e-08, + "loss": 0.5061, + "step": 11807 + }, + { + "epoch": 0.96, + "grad_norm": 0.9170387812859044, + "learning_rate": 8.512241477659944e-08, + "loss": 0.4526, + "step": 11808 + }, + { + "epoch": 0.96, + "grad_norm": 0.8588684291904626, + "learning_rate": 8.478001056488327e-08, + "loss": 0.4278, + "step": 11809 + }, + { + "epoch": 0.96, + "grad_norm": 1.004476225320024, + "learning_rate": 8.443829346911792e-08, + "loss": 0.4339, + "step": 11810 + }, + { + "epoch": 0.96, + "grad_norm": 0.9955030591553591, + "learning_rate": 8.409726351298441e-08, + "loss": 0.4798, + "step": 11811 + }, + { + "epoch": 0.96, + "grad_norm": 1.0319475619112393, + "learning_rate": 8.375692072011388e-08, + "loss": 0.5098, + "step": 11812 + }, + { + "epoch": 0.96, + "grad_norm": 0.9512626411668296, + "learning_rate": 8.341726511409409e-08, + "loss": 0.4742, + "step": 11813 + }, + { + "epoch": 0.96, + "grad_norm": 0.8647860949043975, + "learning_rate": 8.307829671846179e-08, + "loss": 0.4773, + "step": 11814 + }, + { + "epoch": 0.96, + "grad_norm": 0.966937294863239, + "learning_rate": 8.274001555670597e-08, + "loss": 0.5045, + "step": 11815 + }, + { + "epoch": 0.96, + "grad_norm": 1.0511562530909382, + "learning_rate": 8.24024216522723e-08, + "loss": 0.5244, + "step": 11816 + }, + { + "epoch": 0.96, + "grad_norm": 1.0529033296786536, + "learning_rate": 8.20655150285521e-08, + "loss": 0.4635, + "step": 11817 + }, + { + "epoch": 0.96, + "grad_norm": 0.9419210827460939, + "learning_rate": 8.172929570889553e-08, + "loss": 0.4529, + "step": 11818 + }, + { + "epoch": 0.96, + "grad_norm": 1.057298308091194, + "learning_rate": 8.139376371660179e-08, + "loss": 0.5259, + "step": 11819 + }, + { + "epoch": 0.96, + "grad_norm": 0.9911938418309101, + "learning_rate": 8.105891907492224e-08, + "loss": 0.5373, + "step": 11820 + }, + { + "epoch": 0.96, + "grad_norm": 0.9799078980237341, + "learning_rate": 8.072476180705946e-08, + "loss": 0.5207, + "step": 11821 + }, + { + "epoch": 0.96, + "grad_norm": 0.9169456394142715, + "learning_rate": 8.03912919361749e-08, + "loss": 0.4771, + "step": 11822 + }, + { + "epoch": 0.96, + "grad_norm": 0.9701500291611556, + "learning_rate": 8.005850948537453e-08, + "loss": 0.4812, + "step": 11823 + }, + { + "epoch": 0.96, + "grad_norm": 0.9376443184836865, + "learning_rate": 7.972641447771989e-08, + "loss": 0.4849, + "step": 11824 + }, + { + "epoch": 0.96, + "grad_norm": 0.8836556164802101, + "learning_rate": 7.939500693622481e-08, + "loss": 0.4832, + "step": 11825 + }, + { + "epoch": 0.96, + "grad_norm": 1.0238583224039954, + "learning_rate": 7.906428688385759e-08, + "loss": 0.5001, + "step": 11826 + }, + { + "epoch": 0.96, + "grad_norm": 0.9368949132542735, + "learning_rate": 7.873425434353432e-08, + "loss": 0.4871, + "step": 11827 + }, + { + "epoch": 0.96, + "grad_norm": 0.9951882378946979, + "learning_rate": 7.840490933812783e-08, + "loss": 0.4845, + "step": 11828 + }, + { + "epoch": 0.96, + "grad_norm": 0.8502504258314992, + "learning_rate": 7.807625189046098e-08, + "loss": 0.4095, + "step": 11829 + }, + { + "epoch": 0.96, + "grad_norm": 0.9569555972363435, + "learning_rate": 7.774828202330776e-08, + "loss": 0.4657, + "step": 11830 + }, + { + "epoch": 0.96, + "grad_norm": 0.9069133566490915, + "learning_rate": 7.742099975939888e-08, + "loss": 0.4634, + "step": 11831 + }, + { + "epoch": 0.96, + "grad_norm": 0.9656614654654957, + "learning_rate": 7.709440512141286e-08, + "loss": 0.5422, + "step": 11832 + }, + { + "epoch": 0.96, + "grad_norm": 0.9992615048473477, + "learning_rate": 7.676849813198272e-08, + "loss": 0.5122, + "step": 11833 + }, + { + "epoch": 0.96, + "grad_norm": 0.928341283376421, + "learning_rate": 7.644327881369485e-08, + "loss": 0.4389, + "step": 11834 + }, + { + "epoch": 0.96, + "grad_norm": 1.0356863433018246, + "learning_rate": 7.611874718908452e-08, + "loss": 0.5778, + "step": 11835 + }, + { + "epoch": 0.96, + "grad_norm": 0.9279200145760254, + "learning_rate": 7.579490328064265e-08, + "loss": 0.4886, + "step": 11836 + }, + { + "epoch": 0.96, + "grad_norm": 1.0535959759120828, + "learning_rate": 7.547174711081128e-08, + "loss": 0.4395, + "step": 11837 + }, + { + "epoch": 0.96, + "grad_norm": 0.9987245906603993, + "learning_rate": 7.514927870198475e-08, + "loss": 0.5428, + "step": 11838 + }, + { + "epoch": 0.96, + "grad_norm": 0.9292302350613081, + "learning_rate": 7.482749807650958e-08, + "loss": 0.4683, + "step": 11839 + }, + { + "epoch": 0.96, + "grad_norm": 0.999598281395877, + "learning_rate": 7.450640525668573e-08, + "loss": 0.5147, + "step": 11840 + }, + { + "epoch": 0.96, + "grad_norm": 0.9016518616157955, + "learning_rate": 7.41860002647643e-08, + "loss": 0.4539, + "step": 11841 + }, + { + "epoch": 0.96, + "grad_norm": 0.9069015870968691, + "learning_rate": 7.386628312294863e-08, + "loss": 0.3802, + "step": 11842 + }, + { + "epoch": 0.96, + "grad_norm": 0.9050331418584707, + "learning_rate": 7.354725385339546e-08, + "loss": 0.4511, + "step": 11843 + }, + { + "epoch": 0.96, + "grad_norm": 1.083380607724216, + "learning_rate": 7.322891247821151e-08, + "loss": 0.5082, + "step": 11844 + }, + { + "epoch": 0.96, + "grad_norm": 0.8287870334091962, + "learning_rate": 7.291125901946027e-08, + "loss": 0.4287, + "step": 11845 + }, + { + "epoch": 0.96, + "grad_norm": 0.9519750070137744, + "learning_rate": 7.259429349915303e-08, + "loss": 0.5054, + "step": 11846 + }, + { + "epoch": 0.96, + "grad_norm": 0.9186236888951284, + "learning_rate": 7.227801593925555e-08, + "loss": 0.4396, + "step": 11847 + }, + { + "epoch": 0.96, + "grad_norm": 1.0156722316213873, + "learning_rate": 7.196242636168582e-08, + "loss": 0.5054, + "step": 11848 + }, + { + "epoch": 0.96, + "grad_norm": 0.9044042392774271, + "learning_rate": 7.164752478831305e-08, + "loss": 0.442, + "step": 11849 + }, + { + "epoch": 0.96, + "grad_norm": 1.071712246415441, + "learning_rate": 7.133331124096087e-08, + "loss": 0.5517, + "step": 11850 + }, + { + "epoch": 0.96, + "grad_norm": 0.9086881899095651, + "learning_rate": 7.101978574140411e-08, + "loss": 0.5043, + "step": 11851 + }, + { + "epoch": 0.96, + "grad_norm": 0.9903068859691339, + "learning_rate": 7.070694831136871e-08, + "loss": 0.5096, + "step": 11852 + }, + { + "epoch": 0.96, + "grad_norm": 0.7962250008333884, + "learning_rate": 7.039479897253509e-08, + "loss": 0.4403, + "step": 11853 + }, + { + "epoch": 0.96, + "grad_norm": 0.9390602910887271, + "learning_rate": 7.008333774653376e-08, + "loss": 0.4721, + "step": 11854 + }, + { + "epoch": 0.96, + "grad_norm": 0.9426156451013604, + "learning_rate": 6.977256465494853e-08, + "loss": 0.4614, + "step": 11855 + }, + { + "epoch": 0.96, + "grad_norm": 0.9799328098160812, + "learning_rate": 6.946247971931774e-08, + "loss": 0.474, + "step": 11856 + }, + { + "epoch": 0.96, + "grad_norm": 0.9250217103653412, + "learning_rate": 6.915308296112755e-08, + "loss": 0.4106, + "step": 11857 + }, + { + "epoch": 0.96, + "grad_norm": 0.9025833591773156, + "learning_rate": 6.88443744018208e-08, + "loss": 0.4388, + "step": 11858 + }, + { + "epoch": 0.96, + "grad_norm": 0.9340544233362317, + "learning_rate": 6.853635406279035e-08, + "loss": 0.4985, + "step": 11859 + }, + { + "epoch": 0.96, + "grad_norm": 0.8905377200903977, + "learning_rate": 6.822902196538028e-08, + "loss": 0.4445, + "step": 11860 + }, + { + "epoch": 0.96, + "grad_norm": 0.9160671303363982, + "learning_rate": 6.792237813089131e-08, + "loss": 0.4647, + "step": 11861 + }, + { + "epoch": 0.96, + "grad_norm": 1.0165528326770312, + "learning_rate": 6.761642258056977e-08, + "loss": 0.4915, + "step": 11862 + }, + { + "epoch": 0.96, + "grad_norm": 0.9790731286567422, + "learning_rate": 6.731115533562094e-08, + "loss": 0.4872, + "step": 11863 + }, + { + "epoch": 0.96, + "grad_norm": 0.940150772056775, + "learning_rate": 6.7006576417199e-08, + "loss": 0.5599, + "step": 11864 + }, + { + "epoch": 0.96, + "grad_norm": 0.8891355445636352, + "learning_rate": 6.670268584641148e-08, + "loss": 0.5077, + "step": 11865 + }, + { + "epoch": 0.96, + "grad_norm": 1.0427807053065226, + "learning_rate": 6.639948364431492e-08, + "loss": 0.4946, + "step": 11866 + }, + { + "epoch": 0.96, + "grad_norm": 0.9213346316427382, + "learning_rate": 6.60969698319247e-08, + "loss": 0.4955, + "step": 11867 + }, + { + "epoch": 0.96, + "grad_norm": 0.9091893947814887, + "learning_rate": 6.579514443020296e-08, + "loss": 0.4964, + "step": 11868 + }, + { + "epoch": 0.96, + "grad_norm": 1.0501205485179521, + "learning_rate": 6.549400746006629e-08, + "loss": 0.4752, + "step": 11869 + }, + { + "epoch": 0.96, + "grad_norm": 0.883953055819093, + "learning_rate": 6.519355894238245e-08, + "loss": 0.4733, + "step": 11870 + }, + { + "epoch": 0.96, + "grad_norm": 0.9817174202768156, + "learning_rate": 6.489379889797254e-08, + "loss": 0.4572, + "step": 11871 + }, + { + "epoch": 0.96, + "grad_norm": 0.9823410939883479, + "learning_rate": 6.459472734760997e-08, + "loss": 0.521, + "step": 11872 + }, + { + "epoch": 0.96, + "grad_norm": 1.0098939609336977, + "learning_rate": 6.429634431202036e-08, + "loss": 0.5193, + "step": 11873 + }, + { + "epoch": 0.97, + "grad_norm": 1.0662717994985875, + "learning_rate": 6.399864981188164e-08, + "loss": 0.5038, + "step": 11874 + }, + { + "epoch": 0.97, + "grad_norm": 0.9839336837767094, + "learning_rate": 6.370164386782285e-08, + "loss": 0.4969, + "step": 11875 + }, + { + "epoch": 0.97, + "grad_norm": 0.9035783762848933, + "learning_rate": 6.340532650042641e-08, + "loss": 0.4771, + "step": 11876 + }, + { + "epoch": 0.97, + "grad_norm": 0.969875008374207, + "learning_rate": 6.310969773022701e-08, + "loss": 0.5079, + "step": 11877 + }, + { + "epoch": 0.97, + "grad_norm": 0.9436460184683367, + "learning_rate": 6.281475757771161e-08, + "loss": 0.4733, + "step": 11878 + }, + { + "epoch": 0.97, + "grad_norm": 0.972534101604993, + "learning_rate": 6.252050606332049e-08, + "loss": 0.501, + "step": 11879 + }, + { + "epoch": 0.97, + "grad_norm": 0.9578199633119227, + "learning_rate": 6.222694320744182e-08, + "loss": 0.4724, + "step": 11880 + }, + { + "epoch": 0.97, + "grad_norm": 0.8301656923384986, + "learning_rate": 6.193406903042265e-08, + "loss": 0.475, + "step": 11881 + }, + { + "epoch": 0.97, + "grad_norm": 0.9039558845681417, + "learning_rate": 6.164188355255673e-08, + "loss": 0.4935, + "step": 11882 + }, + { + "epoch": 0.97, + "grad_norm": 0.9081915049960225, + "learning_rate": 6.135038679409344e-08, + "loss": 0.5057, + "step": 11883 + }, + { + "epoch": 0.97, + "grad_norm": 1.0382548299120893, + "learning_rate": 6.105957877523216e-08, + "loss": 0.4951, + "step": 11884 + }, + { + "epoch": 0.97, + "grad_norm": 0.905766580597293, + "learning_rate": 6.076945951612678e-08, + "loss": 0.4418, + "step": 11885 + }, + { + "epoch": 0.97, + "grad_norm": 0.8931595039530028, + "learning_rate": 6.048002903688121e-08, + "loss": 0.4915, + "step": 11886 + }, + { + "epoch": 0.97, + "grad_norm": 1.0160363862096509, + "learning_rate": 6.019128735755386e-08, + "loss": 0.5485, + "step": 11887 + }, + { + "epoch": 0.97, + "grad_norm": 0.8877215619065384, + "learning_rate": 5.990323449815316e-08, + "loss": 0.4455, + "step": 11888 + }, + { + "epoch": 0.97, + "grad_norm": 0.9249415508767169, + "learning_rate": 5.961587047864204e-08, + "loss": 0.4407, + "step": 11889 + }, + { + "epoch": 0.97, + "grad_norm": 0.8467704348053258, + "learning_rate": 5.932919531893344e-08, + "loss": 0.4518, + "step": 11890 + }, + { + "epoch": 0.97, + "grad_norm": 0.928828790733546, + "learning_rate": 5.9043209038894825e-08, + "loss": 0.4528, + "step": 11891 + }, + { + "epoch": 0.97, + "grad_norm": 0.9258296897330218, + "learning_rate": 5.8757911658343657e-08, + "loss": 0.4908, + "step": 11892 + }, + { + "epoch": 0.97, + "grad_norm": 0.993635545931022, + "learning_rate": 5.847330319705191e-08, + "loss": 0.5191, + "step": 11893 + }, + { + "epoch": 0.97, + "grad_norm": 1.0503873247954665, + "learning_rate": 5.818938367474159e-08, + "loss": 0.4728, + "step": 11894 + }, + { + "epoch": 0.97, + "grad_norm": 0.909336100675651, + "learning_rate": 5.790615311108805e-08, + "loss": 0.4634, + "step": 11895 + }, + { + "epoch": 0.97, + "grad_norm": 1.0197521172435708, + "learning_rate": 5.7623611525721155e-08, + "loss": 0.5245, + "step": 11896 + }, + { + "epoch": 0.97, + "grad_norm": 0.8289684625646265, + "learning_rate": 5.7341758938217474e-08, + "loss": 0.404, + "step": 11897 + }, + { + "epoch": 0.97, + "grad_norm": 1.0007797587248377, + "learning_rate": 5.706059536811137e-08, + "loss": 0.4432, + "step": 11898 + }, + { + "epoch": 0.97, + "grad_norm": 0.8384808151151713, + "learning_rate": 5.6780120834887264e-08, + "loss": 0.3979, + "step": 11899 + }, + { + "epoch": 0.97, + "grad_norm": 0.8634048374650863, + "learning_rate": 5.650033535798072e-08, + "loss": 0.4548, + "step": 11900 + }, + { + "epoch": 0.97, + "grad_norm": 0.9543880848177454, + "learning_rate": 5.6221238956780664e-08, + "loss": 0.4442, + "step": 11901 + }, + { + "epoch": 0.97, + "grad_norm": 0.9048198696439131, + "learning_rate": 5.5942831650628303e-08, + "loss": 0.4611, + "step": 11902 + }, + { + "epoch": 0.97, + "grad_norm": 0.9538814286699375, + "learning_rate": 5.566511345881931e-08, + "loss": 0.5113, + "step": 11903 + }, + { + "epoch": 0.97, + "grad_norm": 0.9043391175027943, + "learning_rate": 5.5388084400594954e-08, + "loss": 0.4949, + "step": 11904 + }, + { + "epoch": 0.97, + "grad_norm": 0.9293652419355056, + "learning_rate": 5.511174449515655e-08, + "loss": 0.46, + "step": 11905 + }, + { + "epoch": 0.97, + "grad_norm": 1.0324711413947731, + "learning_rate": 5.483609376165322e-08, + "loss": 0.5127, + "step": 11906 + }, + { + "epoch": 0.97, + "grad_norm": 0.8941503163390319, + "learning_rate": 5.456113221918746e-08, + "loss": 0.4751, + "step": 11907 + }, + { + "epoch": 0.97, + "grad_norm": 0.919163316555529, + "learning_rate": 5.428685988681292e-08, + "loss": 0.4502, + "step": 11908 + }, + { + "epoch": 0.97, + "grad_norm": 0.9342751986032585, + "learning_rate": 5.401327678353774e-08, + "loss": 0.4755, + "step": 11909 + }, + { + "epoch": 0.97, + "grad_norm": 0.9684954869778296, + "learning_rate": 5.3740382928320065e-08, + "loss": 0.4781, + "step": 11910 + }, + { + "epoch": 0.97, + "grad_norm": 1.0860550064505223, + "learning_rate": 5.346817834007145e-08, + "loss": 0.4832, + "step": 11911 + }, + { + "epoch": 0.97, + "grad_norm": 0.915553037036275, + "learning_rate": 5.3196663037655695e-08, + "loss": 0.5023, + "step": 11912 + }, + { + "epoch": 0.97, + "grad_norm": 0.9504761418845167, + "learning_rate": 5.292583703988885e-08, + "loss": 0.4172, + "step": 11913 + }, + { + "epoch": 0.97, + "grad_norm": 0.9750402476226417, + "learning_rate": 5.265570036553813e-08, + "loss": 0.5118, + "step": 11914 + }, + { + "epoch": 0.97, + "grad_norm": 0.950881631306124, + "learning_rate": 5.238625303332412e-08, + "loss": 0.4694, + "step": 11915 + }, + { + "epoch": 0.97, + "grad_norm": 0.9596144934035122, + "learning_rate": 5.2117495061918544e-08, + "loss": 0.4668, + "step": 11916 + }, + { + "epoch": 0.97, + "grad_norm": 0.9621390065847045, + "learning_rate": 5.184942646994762e-08, + "loss": 0.5706, + "step": 11917 + }, + { + "epoch": 0.97, + "grad_norm": 0.9387416367581379, + "learning_rate": 5.158204727598759e-08, + "loss": 0.4659, + "step": 11918 + }, + { + "epoch": 0.97, + "grad_norm": 1.0693341512877415, + "learning_rate": 5.131535749856698e-08, + "loss": 0.4836, + "step": 11919 + }, + { + "epoch": 0.97, + "grad_norm": 1.0015756147477828, + "learning_rate": 5.104935715616766e-08, + "loss": 0.5097, + "step": 11920 + }, + { + "epoch": 0.97, + "grad_norm": 1.272574570392914, + "learning_rate": 5.0784046267223775e-08, + "loss": 0.4799, + "step": 11921 + }, + { + "epoch": 0.97, + "grad_norm": 0.8719092888254251, + "learning_rate": 5.0519424850119516e-08, + "loss": 0.4908, + "step": 11922 + }, + { + "epoch": 0.97, + "grad_norm": 0.9723855937950857, + "learning_rate": 5.025549292319465e-08, + "loss": 0.4476, + "step": 11923 + }, + { + "epoch": 0.97, + "grad_norm": 0.8891720838974949, + "learning_rate": 4.999225050473788e-08, + "loss": 0.4679, + "step": 11924 + }, + { + "epoch": 0.97, + "grad_norm": 1.00799732731766, + "learning_rate": 4.972969761299351e-08, + "loss": 0.4652, + "step": 11925 + }, + { + "epoch": 0.97, + "grad_norm": 0.9705146089469759, + "learning_rate": 4.9467834266154756e-08, + "loss": 0.4753, + "step": 11926 + }, + { + "epoch": 0.97, + "grad_norm": 0.8757512034841, + "learning_rate": 4.920666048236933e-08, + "loss": 0.438, + "step": 11927 + }, + { + "epoch": 0.97, + "grad_norm": 0.9998789701294412, + "learning_rate": 4.894617627973497e-08, + "loss": 0.4694, + "step": 11928 + }, + { + "epoch": 0.97, + "grad_norm": 0.9252960647533439, + "learning_rate": 4.8686381676305015e-08, + "loss": 0.4889, + "step": 11929 + }, + { + "epoch": 0.97, + "grad_norm": 0.9785224715228302, + "learning_rate": 4.8427276690081735e-08, + "loss": 0.4981, + "step": 11930 + }, + { + "epoch": 0.97, + "grad_norm": 0.9302323682272238, + "learning_rate": 4.8168861339020766e-08, + "loss": 0.4768, + "step": 11931 + }, + { + "epoch": 0.97, + "grad_norm": 0.9128889123665305, + "learning_rate": 4.791113564103111e-08, + "loss": 0.4583, + "step": 11932 + }, + { + "epoch": 0.97, + "grad_norm": 0.9159577759229364, + "learning_rate": 4.7654099613971825e-08, + "loss": 0.4738, + "step": 11933 + }, + { + "epoch": 0.97, + "grad_norm": 0.9667416898072759, + "learning_rate": 4.739775327565532e-08, + "loss": 0.5195, + "step": 11934 + }, + { + "epoch": 0.97, + "grad_norm": 0.9583053889816117, + "learning_rate": 4.714209664384739e-08, + "loss": 0.4987, + "step": 11935 + }, + { + "epoch": 0.97, + "grad_norm": 1.0593700920519042, + "learning_rate": 4.688712973626386e-08, + "loss": 0.4761, + "step": 11936 + }, + { + "epoch": 0.97, + "grad_norm": 0.9518152352507855, + "learning_rate": 4.663285257057393e-08, + "loss": 0.4734, + "step": 11937 + }, + { + "epoch": 0.97, + "grad_norm": 0.9369693183511989, + "learning_rate": 4.637926516439795e-08, + "loss": 0.4472, + "step": 11938 + }, + { + "epoch": 0.97, + "grad_norm": 0.9575936036237775, + "learning_rate": 4.612636753531075e-08, + "loss": 0.4948, + "step": 11939 + }, + { + "epoch": 0.97, + "grad_norm": 1.0008413774296308, + "learning_rate": 4.58741597008372e-08, + "loss": 0.4879, + "step": 11940 + }, + { + "epoch": 0.97, + "grad_norm": 0.9777885044908675, + "learning_rate": 4.5622641678454424e-08, + "loss": 0.4558, + "step": 11941 + }, + { + "epoch": 0.97, + "grad_norm": 0.9945116465130329, + "learning_rate": 4.537181348559405e-08, + "loss": 0.4254, + "step": 11942 + }, + { + "epoch": 0.97, + "grad_norm": 0.9461087646435123, + "learning_rate": 4.512167513963661e-08, + "loss": 0.5021, + "step": 11943 + }, + { + "epoch": 0.97, + "grad_norm": 0.9954162897161014, + "learning_rate": 4.487222665791713e-08, + "loss": 0.5154, + "step": 11944 + }, + { + "epoch": 0.97, + "grad_norm": 0.9237004610309977, + "learning_rate": 4.4623468057722886e-08, + "loss": 0.466, + "step": 11945 + }, + { + "epoch": 0.97, + "grad_norm": 0.9637871569395646, + "learning_rate": 4.437539935629009e-08, + "loss": 0.4733, + "step": 11946 + }, + { + "epoch": 0.97, + "grad_norm": 0.8388679921392534, + "learning_rate": 4.412802057081278e-08, + "loss": 0.4363, + "step": 11947 + }, + { + "epoch": 0.97, + "grad_norm": 0.9179034699969338, + "learning_rate": 4.388133171843278e-08, + "loss": 0.5191, + "step": 11948 + }, + { + "epoch": 0.97, + "grad_norm": 0.8819156108312363, + "learning_rate": 4.3635332816245324e-08, + "loss": 0.4188, + "step": 11949 + }, + { + "epoch": 0.97, + "grad_norm": 0.9320418097418713, + "learning_rate": 4.339002388129787e-08, + "loss": 0.4578, + "step": 11950 + }, + { + "epoch": 0.97, + "grad_norm": 0.972402913307539, + "learning_rate": 4.3145404930591275e-08, + "loss": 0.4979, + "step": 11951 + }, + { + "epoch": 0.97, + "grad_norm": 0.9727729975849237, + "learning_rate": 4.2901475981074195e-08, + "loss": 0.5532, + "step": 11952 + }, + { + "epoch": 0.97, + "grad_norm": 1.035867758045349, + "learning_rate": 4.2658237049655325e-08, + "loss": 0.4867, + "step": 11953 + }, + { + "epoch": 0.97, + "grad_norm": 1.0089929989437811, + "learning_rate": 4.241568815318675e-08, + "loss": 0.4247, + "step": 11954 + }, + { + "epoch": 0.97, + "grad_norm": 1.0064757121513352, + "learning_rate": 4.2173829308479466e-08, + "loss": 0.4637, + "step": 11955 + }, + { + "epoch": 0.97, + "grad_norm": 0.9584510660983068, + "learning_rate": 4.193266053229339e-08, + "loss": 0.486, + "step": 11956 + }, + { + "epoch": 0.97, + "grad_norm": 1.0334196207407, + "learning_rate": 4.1692181841340716e-08, + "loss": 0.4957, + "step": 11957 + }, + { + "epoch": 0.97, + "grad_norm": 0.971222793955482, + "learning_rate": 4.1452393252285894e-08, + "loss": 0.4645, + "step": 11958 + }, + { + "epoch": 0.97, + "grad_norm": 1.0089128046903748, + "learning_rate": 4.1213294781748956e-08, + "loss": 0.4739, + "step": 11959 + }, + { + "epoch": 0.97, + "grad_norm": 0.916419587613008, + "learning_rate": 4.097488644629555e-08, + "loss": 0.4691, + "step": 11960 + }, + { + "epoch": 0.97, + "grad_norm": 0.9609627617532643, + "learning_rate": 4.0737168262450224e-08, + "loss": 0.4733, + "step": 11961 + }, + { + "epoch": 0.97, + "grad_norm": 0.9136837067317344, + "learning_rate": 4.050014024668425e-08, + "loss": 0.4675, + "step": 11962 + }, + { + "epoch": 0.97, + "grad_norm": 0.8788214931690304, + "learning_rate": 4.02638024154256e-08, + "loss": 0.4911, + "step": 11963 + }, + { + "epoch": 0.97, + "grad_norm": 0.9283939728597378, + "learning_rate": 4.002815478505007e-08, + "loss": 0.4347, + "step": 11964 + }, + { + "epoch": 0.97, + "grad_norm": 0.9048435075657899, + "learning_rate": 3.9793197371889026e-08, + "loss": 0.4434, + "step": 11965 + }, + { + "epoch": 0.97, + "grad_norm": 1.0281323566018774, + "learning_rate": 3.955893019222501e-08, + "loss": 0.4806, + "step": 11966 + }, + { + "epoch": 0.97, + "grad_norm": 0.9526276177157981, + "learning_rate": 3.93253532622917e-08, + "loss": 0.4953, + "step": 11967 + }, + { + "epoch": 0.97, + "grad_norm": 0.9364679716036685, + "learning_rate": 3.909246659827726e-08, + "loss": 0.4233, + "step": 11968 + }, + { + "epoch": 0.97, + "grad_norm": 0.9841615329388482, + "learning_rate": 3.8860270216319885e-08, + "loss": 0.5168, + "step": 11969 + }, + { + "epoch": 0.97, + "grad_norm": 1.047709700000648, + "learning_rate": 3.862876413250893e-08, + "loss": 0.5589, + "step": 11970 + }, + { + "epoch": 0.97, + "grad_norm": 0.863210692248217, + "learning_rate": 3.839794836288935e-08, + "loss": 0.4264, + "step": 11971 + }, + { + "epoch": 0.97, + "grad_norm": 0.8951007919327821, + "learning_rate": 3.816782292345611e-08, + "loss": 0.442, + "step": 11972 + }, + { + "epoch": 0.97, + "grad_norm": 1.0667982781868577, + "learning_rate": 3.7938387830156464e-08, + "loss": 0.4819, + "step": 11973 + }, + { + "epoch": 0.97, + "grad_norm": 0.8919483517785504, + "learning_rate": 3.7709643098891024e-08, + "loss": 0.4757, + "step": 11974 + }, + { + "epoch": 0.97, + "grad_norm": 1.0189103671289825, + "learning_rate": 3.748158874550934e-08, + "loss": 0.5292, + "step": 11975 + }, + { + "epoch": 0.97, + "grad_norm": 0.8422196036000987, + "learning_rate": 3.725422478581764e-08, + "loss": 0.468, + "step": 11976 + }, + { + "epoch": 0.97, + "grad_norm": 0.9799552575419327, + "learning_rate": 3.702755123557111e-08, + "loss": 0.4893, + "step": 11977 + }, + { + "epoch": 0.97, + "grad_norm": 1.0308217161205298, + "learning_rate": 3.6801568110478304e-08, + "loss": 0.5068, + "step": 11978 + }, + { + "epoch": 0.97, + "grad_norm": 0.8779731833204971, + "learning_rate": 3.6576275426200014e-08, + "loss": 0.4277, + "step": 11979 + }, + { + "epoch": 0.97, + "grad_norm": 0.9053805546833162, + "learning_rate": 3.635167319834709e-08, + "loss": 0.465, + "step": 11980 + }, + { + "epoch": 0.97, + "grad_norm": 0.8666755276899345, + "learning_rate": 3.612776144248597e-08, + "loss": 0.4476, + "step": 11981 + }, + { + "epoch": 0.97, + "grad_norm": 0.9849607754708499, + "learning_rate": 3.590454017413314e-08, + "loss": 0.5167, + "step": 11982 + }, + { + "epoch": 0.97, + "grad_norm": 0.857166337628939, + "learning_rate": 3.568200940875732e-08, + "loss": 0.442, + "step": 11983 + }, + { + "epoch": 0.97, + "grad_norm": 0.9616493391315466, + "learning_rate": 3.546016916178063e-08, + "loss": 0.5472, + "step": 11984 + }, + { + "epoch": 0.97, + "grad_norm": 0.9561335160933164, + "learning_rate": 3.523901944857522e-08, + "loss": 0.4597, + "step": 11985 + }, + { + "epoch": 0.97, + "grad_norm": 1.0152458863216052, + "learning_rate": 3.5018560284466595e-08, + "loss": 0.5377, + "step": 11986 + }, + { + "epoch": 0.97, + "grad_norm": 0.9116349271646171, + "learning_rate": 3.4798791684733655e-08, + "loss": 0.4812, + "step": 11987 + }, + { + "epoch": 0.97, + "grad_norm": 0.9578005502526648, + "learning_rate": 3.457971366460422e-08, + "loss": 0.4809, + "step": 11988 + }, + { + "epoch": 0.97, + "grad_norm": 0.9950944460120628, + "learning_rate": 3.436132623926169e-08, + "loss": 0.5159, + "step": 11989 + }, + { + "epoch": 0.97, + "grad_norm": 1.0323696694058275, + "learning_rate": 3.414362942384064e-08, + "loss": 0.473, + "step": 11990 + }, + { + "epoch": 0.97, + "grad_norm": 0.8910655305456949, + "learning_rate": 3.392662323342566e-08, + "loss": 0.5181, + "step": 11991 + }, + { + "epoch": 0.97, + "grad_norm": 1.0142482538193889, + "learning_rate": 3.371030768305583e-08, + "loss": 0.4883, + "step": 11992 + }, + { + "epoch": 0.97, + "grad_norm": 0.8941388462250626, + "learning_rate": 3.349468278772139e-08, + "loss": 0.4608, + "step": 11993 + }, + { + "epoch": 0.97, + "grad_norm": 0.9806960085628176, + "learning_rate": 3.3279748562364824e-08, + "loss": 0.4906, + "step": 11994 + }, + { + "epoch": 0.97, + "grad_norm": 0.9284249097360684, + "learning_rate": 3.3065505021881995e-08, + "loss": 0.5173, + "step": 11995 + }, + { + "epoch": 0.97, + "grad_norm": 0.962827598357366, + "learning_rate": 3.2851952181118805e-08, + "loss": 0.4712, + "step": 11996 + }, + { + "epoch": 0.98, + "grad_norm": 0.9785884863012381, + "learning_rate": 3.2639090054874534e-08, + "loss": 0.4511, + "step": 11997 + }, + { + "epoch": 0.98, + "grad_norm": 0.9825408225097995, + "learning_rate": 3.242691865790071e-08, + "loss": 0.5059, + "step": 11998 + }, + { + "epoch": 0.98, + "grad_norm": 1.0175010878886457, + "learning_rate": 3.22154380048989e-08, + "loss": 0.551, + "step": 11999 + }, + { + "epoch": 0.98, + "grad_norm": 0.8098232963770842, + "learning_rate": 3.200464811052628e-08, + "loss": 0.4347, + "step": 12000 + }, + { + "epoch": 0.98, + "grad_norm": 0.943884107957791, + "learning_rate": 3.1794548989391163e-08, + "loss": 0.4533, + "step": 12001 + }, + { + "epoch": 0.98, + "grad_norm": 0.9370564155146761, + "learning_rate": 3.158514065605078e-08, + "loss": 0.4661, + "step": 12002 + }, + { + "epoch": 0.98, + "grad_norm": 0.8380704807977243, + "learning_rate": 3.1376423125019093e-08, + "loss": 0.4335, + "step": 12003 + }, + { + "epoch": 0.98, + "grad_norm": 0.9139190205140328, + "learning_rate": 3.116839641075786e-08, + "loss": 0.4962, + "step": 12004 + }, + { + "epoch": 0.98, + "grad_norm": 1.018881269398722, + "learning_rate": 3.0961060527685546e-08, + "loss": 0.5046, + "step": 12005 + }, + { + "epoch": 0.98, + "grad_norm": 0.9367199554553238, + "learning_rate": 3.0754415490168446e-08, + "loss": 0.4828, + "step": 12006 + }, + { + "epoch": 0.98, + "grad_norm": 0.8418975025483183, + "learning_rate": 3.054846131252731e-08, + "loss": 0.4576, + "step": 12007 + }, + { + "epoch": 0.98, + "grad_norm": 0.9456790269529668, + "learning_rate": 3.034319800903629e-08, + "loss": 0.4752, + "step": 12008 + }, + { + "epoch": 0.98, + "grad_norm": 0.9483389987156057, + "learning_rate": 3.013862559391734e-08, + "loss": 0.4741, + "step": 12009 + }, + { + "epoch": 0.98, + "grad_norm": 0.9600136688183358, + "learning_rate": 2.993474408134911e-08, + "loss": 0.4849, + "step": 12010 + }, + { + "epoch": 0.98, + "grad_norm": 0.9480643663449927, + "learning_rate": 2.9731553485459197e-08, + "loss": 0.4928, + "step": 12011 + }, + { + "epoch": 0.98, + "grad_norm": 0.8911982744195088, + "learning_rate": 2.9529053820329667e-08, + "loss": 0.4711, + "step": 12012 + }, + { + "epoch": 0.98, + "grad_norm": 0.97406248259707, + "learning_rate": 2.932724509999263e-08, + "loss": 0.4866, + "step": 12013 + }, + { + "epoch": 0.98, + "grad_norm": 0.9693131231472446, + "learning_rate": 2.9126127338432454e-08, + "loss": 0.5288, + "step": 12014 + }, + { + "epoch": 0.98, + "grad_norm": 0.8760148975933071, + "learning_rate": 2.8925700549589096e-08, + "loss": 0.4298, + "step": 12015 + }, + { + "epoch": 0.98, + "grad_norm": 0.9383849892653244, + "learning_rate": 2.8725964747350342e-08, + "loss": 0.4584, + "step": 12016 + }, + { + "epoch": 0.98, + "grad_norm": 0.9725559701311824, + "learning_rate": 2.852691994555623e-08, + "loss": 0.4955, + "step": 12017 + }, + { + "epoch": 0.98, + "grad_norm": 1.0068941623928347, + "learning_rate": 2.8328566158002392e-08, + "loss": 0.4362, + "step": 12018 + }, + { + "epoch": 0.98, + "grad_norm": 0.9606479146924113, + "learning_rate": 2.8130903398434496e-08, + "loss": 0.4623, + "step": 12019 + }, + { + "epoch": 0.98, + "grad_norm": 0.9535023751808042, + "learning_rate": 2.7933931680550476e-08, + "loss": 0.5133, + "step": 12020 + }, + { + "epoch": 0.98, + "grad_norm": 0.8856677096306654, + "learning_rate": 2.7737651017998303e-08, + "loss": 0.4567, + "step": 12021 + }, + { + "epoch": 0.98, + "grad_norm": 0.8725884384204944, + "learning_rate": 2.754206142438265e-08, + "loss": 0.4795, + "step": 12022 + }, + { + "epoch": 0.98, + "grad_norm": 1.0567535458362625, + "learning_rate": 2.734716291325712e-08, + "loss": 0.5445, + "step": 12023 + }, + { + "epoch": 0.98, + "grad_norm": 0.9246990984926732, + "learning_rate": 2.7152955498126465e-08, + "loss": 0.4927, + "step": 12024 + }, + { + "epoch": 0.98, + "grad_norm": 0.9147568261561264, + "learning_rate": 2.695943919244992e-08, + "loss": 0.4427, + "step": 12025 + }, + { + "epoch": 0.98, + "grad_norm": 0.8809789552613674, + "learning_rate": 2.676661400963898e-08, + "loss": 0.496, + "step": 12026 + }, + { + "epoch": 0.98, + "grad_norm": 0.9264478746652443, + "learning_rate": 2.6574479963054068e-08, + "loss": 0.4888, + "step": 12027 + }, + { + "epoch": 0.98, + "grad_norm": 1.0732755625344113, + "learning_rate": 2.6383037066013417e-08, + "loss": 0.4947, + "step": 12028 + }, + { + "epoch": 0.98, + "grad_norm": 0.9045542732560331, + "learning_rate": 2.6192285331779754e-08, + "loss": 0.4218, + "step": 12029 + }, + { + "epoch": 0.98, + "grad_norm": 0.926681609482207, + "learning_rate": 2.6002224773574725e-08, + "loss": 0.4675, + "step": 12030 + }, + { + "epoch": 0.98, + "grad_norm": 0.9571897718833516, + "learning_rate": 2.5812855404568903e-08, + "loss": 0.5279, + "step": 12031 + }, + { + "epoch": 0.98, + "grad_norm": 0.8379559114592033, + "learning_rate": 2.5624177237884017e-08, + "loss": 0.4349, + "step": 12032 + }, + { + "epoch": 0.98, + "grad_norm": 0.9385323324140011, + "learning_rate": 2.5436190286597384e-08, + "loss": 0.5041, + "step": 12033 + }, + { + "epoch": 0.98, + "grad_norm": 0.9581379073190894, + "learning_rate": 2.524889456373525e-08, + "loss": 0.5283, + "step": 12034 + }, + { + "epoch": 0.98, + "grad_norm": 0.9226299394723871, + "learning_rate": 2.506229008227723e-08, + "loss": 0.4763, + "step": 12035 + }, + { + "epoch": 0.98, + "grad_norm": 1.0014369490594117, + "learning_rate": 2.4876376855154095e-08, + "loss": 0.5233, + "step": 12036 + }, + { + "epoch": 0.98, + "grad_norm": 0.9786997303191647, + "learning_rate": 2.469115489525109e-08, + "loss": 0.4833, + "step": 12037 + }, + { + "epoch": 0.98, + "grad_norm": 1.0225673016820687, + "learning_rate": 2.4506624215402396e-08, + "loss": 0.4294, + "step": 12038 + }, + { + "epoch": 0.98, + "grad_norm": 1.0178203672748507, + "learning_rate": 2.4322784828395562e-08, + "loss": 0.5122, + "step": 12039 + }, + { + "epoch": 0.98, + "grad_norm": 0.960735625727089, + "learning_rate": 2.4139636746972617e-08, + "loss": 0.4957, + "step": 12040 + }, + { + "epoch": 0.98, + "grad_norm": 0.9003834052306827, + "learning_rate": 2.395717998382341e-08, + "loss": 0.4717, + "step": 12041 + }, + { + "epoch": 0.98, + "grad_norm": 1.0236375174607777, + "learning_rate": 2.377541455159338e-08, + "loss": 0.4855, + "step": 12042 + }, + { + "epoch": 0.98, + "grad_norm": 0.9208499654286599, + "learning_rate": 2.3594340462878007e-08, + "loss": 0.4828, + "step": 12043 + }, + { + "epoch": 0.98, + "grad_norm": 0.9747632253006461, + "learning_rate": 2.3413957730226144e-08, + "loss": 0.5228, + "step": 12044 + }, + { + "epoch": 0.98, + "grad_norm": 0.8512407956116504, + "learning_rate": 2.3234266366137794e-08, + "loss": 0.486, + "step": 12045 + }, + { + "epoch": 0.98, + "grad_norm": 0.9528617655477253, + "learning_rate": 2.305526638306521e-08, + "loss": 0.4579, + "step": 12046 + }, + { + "epoch": 0.98, + "grad_norm": 0.9290253967263423, + "learning_rate": 2.2876957793412923e-08, + "loss": 0.4823, + "step": 12047 + }, + { + "epoch": 0.98, + "grad_norm": 0.7917551407534277, + "learning_rate": 2.2699340609537713e-08, + "loss": 0.3943, + "step": 12048 + }, + { + "epoch": 0.98, + "grad_norm": 0.969076220647739, + "learning_rate": 2.2522414843748618e-08, + "loss": 0.4756, + "step": 12049 + }, + { + "epoch": 0.98, + "grad_norm": 0.9774628839866263, + "learning_rate": 2.2346180508305836e-08, + "loss": 0.4676, + "step": 12050 + }, + { + "epoch": 0.98, + "grad_norm": 0.9611488238609713, + "learning_rate": 2.217063761542293e-08, + "loss": 0.4263, + "step": 12051 + }, + { + "epoch": 0.98, + "grad_norm": 1.0969373743056006, + "learning_rate": 2.1995786177264612e-08, + "loss": 0.4796, + "step": 12052 + }, + { + "epoch": 0.98, + "grad_norm": 0.9629598594205779, + "learning_rate": 2.1821626205947854e-08, + "loss": 0.5476, + "step": 12053 + }, + { + "epoch": 0.98, + "grad_norm": 0.9234136061403723, + "learning_rate": 2.1648157713540786e-08, + "loss": 0.4845, + "step": 12054 + }, + { + "epoch": 0.98, + "grad_norm": 1.0456502401405645, + "learning_rate": 2.147538071206712e-08, + "loss": 0.4983, + "step": 12055 + }, + { + "epoch": 0.98, + "grad_norm": 0.9620817812734163, + "learning_rate": 2.130329521349728e-08, + "loss": 0.5425, + "step": 12056 + }, + { + "epoch": 0.98, + "grad_norm": 0.9936463735647684, + "learning_rate": 2.113190122975839e-08, + "loss": 0.4874, + "step": 12057 + }, + { + "epoch": 0.98, + "grad_norm": 0.9191229093038062, + "learning_rate": 2.096119877272873e-08, + "loss": 0.4658, + "step": 12058 + }, + { + "epoch": 0.98, + "grad_norm": 1.0217672518961693, + "learning_rate": 2.0791187854234396e-08, + "loss": 0.5153, + "step": 12059 + }, + { + "epoch": 0.98, + "grad_norm": 1.0039471938425795, + "learning_rate": 2.0621868486060402e-08, + "loss": 0.5135, + "step": 12060 + }, + { + "epoch": 0.98, + "grad_norm": 0.9101474014009246, + "learning_rate": 2.045324067993959e-08, + "loss": 0.4847, + "step": 12061 + }, + { + "epoch": 0.98, + "grad_norm": 0.9451141667951678, + "learning_rate": 2.0285304447557052e-08, + "loss": 0.4758, + "step": 12062 + }, + { + "epoch": 0.98, + "grad_norm": 1.1071361661435415, + "learning_rate": 2.011805980055015e-08, + "loss": 0.5314, + "step": 12063 + }, + { + "epoch": 0.98, + "grad_norm": 0.9478821974145435, + "learning_rate": 1.9951506750510718e-08, + "loss": 0.5055, + "step": 12064 + }, + { + "epoch": 0.98, + "grad_norm": 0.9473836420133486, + "learning_rate": 1.9785645308978417e-08, + "loss": 0.4738, + "step": 12065 + }, + { + "epoch": 0.98, + "grad_norm": 1.047067744675001, + "learning_rate": 1.962047548744961e-08, + "loss": 0.4963, + "step": 12066 + }, + { + "epoch": 0.98, + "grad_norm": 0.9487194859396089, + "learning_rate": 1.9455997297368467e-08, + "loss": 0.4543, + "step": 12067 + }, + { + "epoch": 0.98, + "grad_norm": 0.9020631622345036, + "learning_rate": 1.9292210750134766e-08, + "loss": 0.4766, + "step": 12068 + }, + { + "epoch": 0.98, + "grad_norm": 0.9240780684461876, + "learning_rate": 1.9129115857097203e-08, + "loss": 0.4414, + "step": 12069 + }, + { + "epoch": 0.98, + "grad_norm": 1.0183690019040301, + "learning_rate": 1.896671262955896e-08, + "loss": 0.4414, + "step": 12070 + }, + { + "epoch": 0.98, + "grad_norm": 0.9719576260291167, + "learning_rate": 1.8805001078774364e-08, + "loss": 0.4942, + "step": 12071 + }, + { + "epoch": 0.98, + "grad_norm": 0.8229267596156057, + "learning_rate": 1.8643981215951125e-08, + "loss": 0.4737, + "step": 12072 + }, + { + "epoch": 0.98, + "grad_norm": 0.926850956701825, + "learning_rate": 1.8483653052244754e-08, + "loss": 0.4606, + "step": 12073 + }, + { + "epoch": 0.98, + "grad_norm": 0.9709028631688023, + "learning_rate": 1.832401659876859e-08, + "loss": 0.4355, + "step": 12074 + }, + { + "epoch": 0.98, + "grad_norm": 1.0114948920634896, + "learning_rate": 1.8165071866583785e-08, + "loss": 0.4659, + "step": 12075 + }, + { + "epoch": 0.98, + "grad_norm": 0.996860587805961, + "learning_rate": 1.8006818866705968e-08, + "loss": 0.5256, + "step": 12076 + }, + { + "epoch": 0.98, + "grad_norm": 0.8907455895141424, + "learning_rate": 1.7849257610101923e-08, + "loss": 0.431, + "step": 12077 + }, + { + "epoch": 0.98, + "grad_norm": 0.9866829023109346, + "learning_rate": 1.7692388107689584e-08, + "loss": 0.4946, + "step": 12078 + }, + { + "epoch": 0.98, + "grad_norm": 0.9840949466758064, + "learning_rate": 1.7536210370341366e-08, + "loss": 0.4973, + "step": 12079 + }, + { + "epoch": 0.98, + "grad_norm": 0.934940510323555, + "learning_rate": 1.7380724408878613e-08, + "loss": 0.4776, + "step": 12080 + }, + { + "epoch": 0.98, + "grad_norm": 0.9316063795807219, + "learning_rate": 1.7225930234077147e-08, + "loss": 0.448, + "step": 12081 + }, + { + "epoch": 0.98, + "grad_norm": 0.9043191059114086, + "learning_rate": 1.7071827856663947e-08, + "loss": 0.4002, + "step": 12082 + }, + { + "epoch": 0.98, + "grad_norm": 1.0040959642431642, + "learning_rate": 1.6918417287318245e-08, + "loss": 0.4702, + "step": 12083 + }, + { + "epoch": 0.98, + "grad_norm": 1.0118369320866807, + "learning_rate": 1.6765698536671538e-08, + "loss": 0.5233, + "step": 12084 + }, + { + "epoch": 0.98, + "grad_norm": 0.9343418206195916, + "learning_rate": 1.661367161530647e-08, + "loss": 0.4925, + "step": 12085 + }, + { + "epoch": 0.98, + "grad_norm": 0.9724808816874202, + "learning_rate": 1.646233653375795e-08, + "loss": 0.5342, + "step": 12086 + }, + { + "epoch": 0.98, + "grad_norm": 0.9234714441172094, + "learning_rate": 1.6311693302515364e-08, + "loss": 0.4269, + "step": 12087 + }, + { + "epoch": 0.98, + "grad_norm": 0.9908340940280683, + "learning_rate": 1.6161741932017026e-08, + "loss": 0.5122, + "step": 12088 + }, + { + "epoch": 0.98, + "grad_norm": 1.0136407090289303, + "learning_rate": 1.601248243265352e-08, + "loss": 0.555, + "step": 12089 + }, + { + "epoch": 0.98, + "grad_norm": 0.8792657315259043, + "learning_rate": 1.586391481476879e-08, + "loss": 0.497, + "step": 12090 + }, + { + "epoch": 0.98, + "grad_norm": 0.9361415339097172, + "learning_rate": 1.5716039088660152e-08, + "loss": 0.4718, + "step": 12091 + }, + { + "epoch": 0.98, + "grad_norm": 1.051550955427514, + "learning_rate": 1.5568855264572745e-08, + "loss": 0.4684, + "step": 12092 + }, + { + "epoch": 0.98, + "grad_norm": 1.0455735739038166, + "learning_rate": 1.5422363352708414e-08, + "loss": 0.4771, + "step": 12093 + }, + { + "epoch": 0.98, + "grad_norm": 0.9894632078153739, + "learning_rate": 1.5276563363217923e-08, + "loss": 0.5236, + "step": 12094 + }, + { + "epoch": 0.98, + "grad_norm": 0.9306248195163555, + "learning_rate": 1.51314553062043e-08, + "loss": 0.5081, + "step": 12095 + }, + { + "epoch": 0.98, + "grad_norm": 1.030265257408779, + "learning_rate": 1.498703919172506e-08, + "loss": 0.5067, + "step": 12096 + }, + { + "epoch": 0.98, + "grad_norm": 0.9951251961523201, + "learning_rate": 1.4843315029786642e-08, + "loss": 0.4647, + "step": 12097 + }, + { + "epoch": 0.98, + "grad_norm": 0.9202627425313655, + "learning_rate": 1.4700282830351077e-08, + "loss": 0.4568, + "step": 12098 + }, + { + "epoch": 0.98, + "grad_norm": 1.0646728167615809, + "learning_rate": 1.4557942603327103e-08, + "loss": 0.5333, + "step": 12099 + }, + { + "epoch": 0.98, + "grad_norm": 0.9165375441309248, + "learning_rate": 1.4416294358582383e-08, + "loss": 0.474, + "step": 12100 + }, + { + "epoch": 0.98, + "grad_norm": 0.9989449749096093, + "learning_rate": 1.4275338105930181e-08, + "loss": 0.5541, + "step": 12101 + }, + { + "epoch": 0.98, + "grad_norm": 0.8842241995287015, + "learning_rate": 1.4135073855139348e-08, + "loss": 0.4333, + "step": 12102 + }, + { + "epoch": 0.98, + "grad_norm": 1.0020455888538515, + "learning_rate": 1.3995501615930996e-08, + "loss": 0.5113, + "step": 12103 + }, + { + "epoch": 0.98, + "grad_norm": 0.9392848855909458, + "learning_rate": 1.3856621397977388e-08, + "loss": 0.4491, + "step": 12104 + }, + { + "epoch": 0.98, + "grad_norm": 0.8645236971420108, + "learning_rate": 1.3718433210901938e-08, + "loss": 0.4684, + "step": 12105 + }, + { + "epoch": 0.98, + "grad_norm": 0.9476125835258605, + "learning_rate": 1.358093706428032e-08, + "loss": 0.4455, + "step": 12106 + }, + { + "epoch": 0.98, + "grad_norm": 0.9120247324159362, + "learning_rate": 1.3444132967642687e-08, + "loss": 0.4567, + "step": 12107 + }, + { + "epoch": 0.98, + "grad_norm": 0.9439904162223841, + "learning_rate": 1.3308020930468123e-08, + "loss": 0.4762, + "step": 12108 + }, + { + "epoch": 0.98, + "grad_norm": 0.8476734435812402, + "learning_rate": 1.3172600962190196e-08, + "loss": 0.3775, + "step": 12109 + }, + { + "epoch": 0.98, + "grad_norm": 1.0085984962827712, + "learning_rate": 1.3037873072192509e-08, + "loss": 0.5417, + "step": 12110 + }, + { + "epoch": 0.98, + "grad_norm": 0.9458556391105006, + "learning_rate": 1.2903837269810926e-08, + "loss": 0.4422, + "step": 12111 + }, + { + "epoch": 0.98, + "grad_norm": 0.927148332515927, + "learning_rate": 1.2770493564335795e-08, + "loss": 0.4492, + "step": 12112 + }, + { + "epoch": 0.98, + "grad_norm": 0.8648138309308088, + "learning_rate": 1.2637841965006392e-08, + "loss": 0.4853, + "step": 12113 + }, + { + "epoch": 0.98, + "grad_norm": 0.9085857938849952, + "learning_rate": 1.2505882481016473e-08, + "loss": 0.4589, + "step": 12114 + }, + { + "epoch": 0.98, + "grad_norm": 0.9850194006674354, + "learning_rate": 1.2374615121508726e-08, + "loss": 0.5077, + "step": 12115 + }, + { + "epoch": 0.98, + "grad_norm": 0.8683619860038134, + "learning_rate": 1.2244039895582538e-08, + "loss": 0.4737, + "step": 12116 + }, + { + "epoch": 0.98, + "grad_norm": 0.986180421412581, + "learning_rate": 1.2114156812284006e-08, + "loss": 0.4238, + "step": 12117 + }, + { + "epoch": 0.98, + "grad_norm": 0.8630904059814098, + "learning_rate": 1.1984965880615929e-08, + "loss": 0.4534, + "step": 12118 + }, + { + "epoch": 0.98, + "grad_norm": 0.9567877247689517, + "learning_rate": 1.1856467109530034e-08, + "loss": 0.4496, + "step": 12119 + }, + { + "epoch": 0.99, + "grad_norm": 0.9244238624328863, + "learning_rate": 1.1728660507931423e-08, + "loss": 0.4996, + "step": 12120 + }, + { + "epoch": 0.99, + "grad_norm": 0.9355603986596689, + "learning_rate": 1.1601546084677451e-08, + "loss": 0.4538, + "step": 12121 + }, + { + "epoch": 0.99, + "grad_norm": 1.0998083959992173, + "learning_rate": 1.147512384857663e-08, + "loss": 0.5296, + "step": 12122 + }, + { + "epoch": 0.99, + "grad_norm": 1.0029466284947572, + "learning_rate": 1.134939380838973e-08, + "loss": 0.5109, + "step": 12123 + }, + { + "epoch": 0.99, + "grad_norm": 1.018903049451067, + "learning_rate": 1.1224355972829782e-08, + "loss": 0.53, + "step": 12124 + }, + { + "epoch": 0.99, + "grad_norm": 0.9814802302583996, + "learning_rate": 1.1100010350562073e-08, + "loss": 0.4953, + "step": 12125 + }, + { + "epoch": 0.99, + "grad_norm": 0.9378454731447133, + "learning_rate": 1.0976356950203049e-08, + "loss": 0.4791, + "step": 12126 + }, + { + "epoch": 0.99, + "grad_norm": 0.8459287341705666, + "learning_rate": 1.0853395780322518e-08, + "loss": 0.4792, + "step": 12127 + }, + { + "epoch": 0.99, + "grad_norm": 0.9874219066355951, + "learning_rate": 1.0731126849441442e-08, + "loss": 0.5082, + "step": 12128 + }, + { + "epoch": 0.99, + "grad_norm": 0.9790531177343795, + "learning_rate": 1.0609550166033045e-08, + "loss": 0.4642, + "step": 12129 + }, + { + "epoch": 0.99, + "grad_norm": 0.8633723815748132, + "learning_rate": 1.0488665738521697e-08, + "loss": 0.4535, + "step": 12130 + }, + { + "epoch": 0.99, + "grad_norm": 0.9307003256715729, + "learning_rate": 1.0368473575285143e-08, + "loss": 0.491, + "step": 12131 + }, + { + "epoch": 0.99, + "grad_norm": 0.9499840617739655, + "learning_rate": 1.0248973684653385e-08, + "loss": 0.4717, + "step": 12132 + }, + { + "epoch": 0.99, + "grad_norm": 0.9656425326858633, + "learning_rate": 1.0130166074906467e-08, + "loss": 0.4833, + "step": 12133 + }, + { + "epoch": 0.99, + "grad_norm": 0.969248297391733, + "learning_rate": 1.0012050754277802e-08, + "loss": 0.4377, + "step": 12134 + }, + { + "epoch": 0.99, + "grad_norm": 1.0964434122566502, + "learning_rate": 9.894627730953066e-09, + "loss": 0.4803, + "step": 12135 + }, + { + "epoch": 0.99, + "grad_norm": 0.9802224309989128, + "learning_rate": 9.777897013069082e-09, + "loss": 0.5238, + "step": 12136 + }, + { + "epoch": 0.99, + "grad_norm": 0.8730273662548644, + "learning_rate": 9.661858608716045e-09, + "loss": 0.4808, + "step": 12137 + }, + { + "epoch": 0.99, + "grad_norm": 0.9342879322938532, + "learning_rate": 9.546512525934193e-09, + "loss": 0.4506, + "step": 12138 + }, + { + "epoch": 0.99, + "grad_norm": 0.8701452487731054, + "learning_rate": 9.43185877271824e-09, + "loss": 0.4778, + "step": 12139 + }, + { + "epoch": 0.99, + "grad_norm": 1.0066801314510307, + "learning_rate": 9.317897357011829e-09, + "loss": 0.4871, + "step": 12140 + }, + { + "epoch": 0.99, + "grad_norm": 0.9434565414798493, + "learning_rate": 9.204628286714202e-09, + "loss": 0.5085, + "step": 12141 + }, + { + "epoch": 0.99, + "grad_norm": 0.960720693115298, + "learning_rate": 9.092051569674632e-09, + "loss": 0.5068, + "step": 12142 + }, + { + "epoch": 0.99, + "grad_norm": 1.0266753458833535, + "learning_rate": 8.980167213692437e-09, + "loss": 0.4912, + "step": 12143 + }, + { + "epoch": 0.99, + "grad_norm": 0.8768551645097515, + "learning_rate": 8.868975226523634e-09, + "loss": 0.4614, + "step": 12144 + }, + { + "epoch": 0.99, + "grad_norm": 0.9258059202902077, + "learning_rate": 8.758475615872065e-09, + "loss": 0.4769, + "step": 12145 + }, + { + "epoch": 0.99, + "grad_norm": 0.8688950692394761, + "learning_rate": 8.648668389397153e-09, + "loss": 0.4356, + "step": 12146 + }, + { + "epoch": 0.99, + "grad_norm": 0.9116665627644456, + "learning_rate": 8.539553554706148e-09, + "loss": 0.479, + "step": 12147 + }, + { + "epoch": 0.99, + "grad_norm": 0.9170045332553114, + "learning_rate": 8.431131119361891e-09, + "loss": 0.4745, + "step": 12148 + }, + { + "epoch": 0.99, + "grad_norm": 0.9571233551136623, + "learning_rate": 8.323401090877258e-09, + "loss": 0.4227, + "step": 12149 + }, + { + "epoch": 0.99, + "grad_norm": 0.9635271757833536, + "learning_rate": 8.216363476718503e-09, + "loss": 0.4495, + "step": 12150 + }, + { + "epoch": 0.99, + "grad_norm": 0.972196287676129, + "learning_rate": 8.110018284304132e-09, + "loss": 0.5253, + "step": 12151 + }, + { + "epoch": 0.99, + "grad_norm": 0.9668650890442178, + "learning_rate": 8.004365521001589e-09, + "loss": 0.468, + "step": 12152 + }, + { + "epoch": 0.99, + "grad_norm": 0.9220578685108054, + "learning_rate": 7.899405194133902e-09, + "loss": 0.4399, + "step": 12153 + }, + { + "epoch": 0.99, + "grad_norm": 1.109967136629721, + "learning_rate": 7.795137310974143e-09, + "loss": 0.5623, + "step": 12154 + }, + { + "epoch": 0.99, + "grad_norm": 0.9759862425878394, + "learning_rate": 7.691561878748755e-09, + "loss": 0.4836, + "step": 12155 + }, + { + "epoch": 0.99, + "grad_norm": 0.9096238591337812, + "learning_rate": 7.588678904635328e-09, + "loss": 0.4849, + "step": 12156 + }, + { + "epoch": 0.99, + "grad_norm": 1.0074088319601273, + "learning_rate": 7.486488395762604e-09, + "loss": 0.556, + "step": 12157 + }, + { + "epoch": 0.99, + "grad_norm": 0.9484713726196886, + "learning_rate": 7.384990359212696e-09, + "loss": 0.4814, + "step": 12158 + }, + { + "epoch": 0.99, + "grad_norm": 0.9423528640935501, + "learning_rate": 7.284184802019978e-09, + "loss": 0.4615, + "step": 12159 + }, + { + "epoch": 0.99, + "grad_norm": 1.0101292494710938, + "learning_rate": 7.1840717311688625e-09, + "loss": 0.5216, + "step": 12160 + }, + { + "epoch": 0.99, + "grad_norm": 0.9788949263707301, + "learning_rate": 7.084651153599353e-09, + "loss": 0.4846, + "step": 12161 + }, + { + "epoch": 0.99, + "grad_norm": 0.8823697123892938, + "learning_rate": 6.985923076199275e-09, + "loss": 0.4516, + "step": 12162 + }, + { + "epoch": 0.99, + "grad_norm": 0.9332411928489063, + "learning_rate": 6.88788750580982e-09, + "loss": 0.4917, + "step": 12163 + }, + { + "epoch": 0.99, + "grad_norm": 0.8939196205465503, + "learning_rate": 6.790544449227776e-09, + "loss": 0.5001, + "step": 12164 + }, + { + "epoch": 0.99, + "grad_norm": 0.915649796606784, + "learning_rate": 6.693893913195526e-09, + "loss": 0.4448, + "step": 12165 + }, + { + "epoch": 0.99, + "grad_norm": 0.8985794749672081, + "learning_rate": 6.597935904413267e-09, + "loss": 0.4557, + "step": 12166 + }, + { + "epoch": 0.99, + "grad_norm": 0.8898060428434278, + "learning_rate": 6.502670429529012e-09, + "loss": 0.4438, + "step": 12167 + }, + { + "epoch": 0.99, + "grad_norm": 0.911361729983376, + "learning_rate": 6.40809749514637e-09, + "loss": 0.4731, + "step": 12168 + }, + { + "epoch": 0.99, + "grad_norm": 0.9853357071007716, + "learning_rate": 6.314217107817877e-09, + "loss": 0.5042, + "step": 12169 + }, + { + "epoch": 0.99, + "grad_norm": 0.9595718090401706, + "learning_rate": 6.221029274049439e-09, + "loss": 0.4843, + "step": 12170 + }, + { + "epoch": 0.99, + "grad_norm": 0.9264152524313263, + "learning_rate": 6.1285340003003346e-09, + "loss": 0.4648, + "step": 12171 + }, + { + "epoch": 0.99, + "grad_norm": 1.0823810828783662, + "learning_rate": 6.03673129297877e-09, + "loss": 0.5243, + "step": 12172 + }, + { + "epoch": 0.99, + "grad_norm": 1.0166043314385933, + "learning_rate": 5.945621158446324e-09, + "loss": 0.4803, + "step": 12173 + }, + { + "epoch": 0.99, + "grad_norm": 1.0495585663202873, + "learning_rate": 5.855203603017945e-09, + "loss": 0.4966, + "step": 12174 + }, + { + "epoch": 0.99, + "grad_norm": 0.9321136127621876, + "learning_rate": 5.765478632959731e-09, + "loss": 0.4823, + "step": 12175 + }, + { + "epoch": 0.99, + "grad_norm": 0.89851925044116, + "learning_rate": 5.676446254488932e-09, + "loss": 0.4719, + "step": 12176 + }, + { + "epoch": 0.99, + "grad_norm": 1.0085946206564493, + "learning_rate": 5.588106473775057e-09, + "loss": 0.5553, + "step": 12177 + }, + { + "epoch": 0.99, + "grad_norm": 0.8649896612244649, + "learning_rate": 5.500459296939875e-09, + "loss": 0.4842, + "step": 12178 + }, + { + "epoch": 0.99, + "grad_norm": 0.9366458571588505, + "learning_rate": 5.413504730058527e-09, + "loss": 0.4386, + "step": 12179 + }, + { + "epoch": 0.99, + "grad_norm": 0.8616063953962477, + "learning_rate": 5.327242779156194e-09, + "loss": 0.485, + "step": 12180 + }, + { + "epoch": 0.99, + "grad_norm": 0.947643360116589, + "learning_rate": 5.2416734502103165e-09, + "loss": 0.4673, + "step": 12181 + }, + { + "epoch": 0.99, + "grad_norm": 1.5494313108108435, + "learning_rate": 5.156796749150595e-09, + "loss": 0.458, + "step": 12182 + }, + { + "epoch": 0.99, + "grad_norm": 0.9505065847530575, + "learning_rate": 5.0726126818601e-09, + "loss": 0.456, + "step": 12183 + }, + { + "epoch": 0.99, + "grad_norm": 0.9026091368438374, + "learning_rate": 4.989121254171947e-09, + "loss": 0.4619, + "step": 12184 + }, + { + "epoch": 0.99, + "grad_norm": 0.9139503054391478, + "learning_rate": 4.9063224718726154e-09, + "loss": 0.4758, + "step": 12185 + }, + { + "epoch": 0.99, + "grad_norm": 1.0371267800898043, + "learning_rate": 4.824216340698629e-09, + "loss": 0.5083, + "step": 12186 + }, + { + "epoch": 0.99, + "grad_norm": 0.9140285052535965, + "learning_rate": 4.74280286634099e-09, + "loss": 0.4906, + "step": 12187 + }, + { + "epoch": 0.99, + "grad_norm": 1.0079408181761793, + "learning_rate": 4.662082054441852e-09, + "loss": 0.5653, + "step": 12188 + }, + { + "epoch": 0.99, + "grad_norm": 0.9830808916635849, + "learning_rate": 4.582053910594519e-09, + "loss": 0.4774, + "step": 12189 + }, + { + "epoch": 0.99, + "grad_norm": 0.9077023704770761, + "learning_rate": 4.502718440344556e-09, + "loss": 0.4558, + "step": 12190 + }, + { + "epoch": 0.99, + "grad_norm": 0.7935871680265658, + "learning_rate": 4.4240756491897854e-09, + "loss": 0.4481, + "step": 12191 + }, + { + "epoch": 0.99, + "grad_norm": 0.8963596426210041, + "learning_rate": 4.346125542581403e-09, + "loss": 0.4413, + "step": 12192 + }, + { + "epoch": 0.99, + "grad_norm": 0.9176413396869422, + "learning_rate": 4.268868125919534e-09, + "loss": 0.4847, + "step": 12193 + }, + { + "epoch": 0.99, + "grad_norm": 0.8930937176633225, + "learning_rate": 4.192303404559894e-09, + "loss": 0.4818, + "step": 12194 + }, + { + "epoch": 0.99, + "grad_norm": 0.9774095396399091, + "learning_rate": 4.1164313838060184e-09, + "loss": 0.4618, + "step": 12195 + }, + { + "epoch": 0.99, + "grad_norm": 0.9304736319360605, + "learning_rate": 4.041252068918145e-09, + "loss": 0.469, + "step": 12196 + }, + { + "epoch": 0.99, + "grad_norm": 1.0330209884154204, + "learning_rate": 3.966765465105438e-09, + "loss": 0.4876, + "step": 12197 + }, + { + "epoch": 0.99, + "grad_norm": 0.8581916788578621, + "learning_rate": 3.892971577528215e-09, + "loss": 0.4076, + "step": 12198 + }, + { + "epoch": 0.99, + "grad_norm": 1.018750336762057, + "learning_rate": 3.819870411302385e-09, + "loss": 0.5133, + "step": 12199 + }, + { + "epoch": 0.99, + "grad_norm": 0.9301758200353073, + "learning_rate": 3.747461971492783e-09, + "loss": 0.4803, + "step": 12200 + }, + { + "epoch": 0.99, + "grad_norm": 0.9524198887721386, + "learning_rate": 3.6757462631176186e-09, + "loss": 0.5102, + "step": 12201 + }, + { + "epoch": 0.99, + "grad_norm": 0.9998380514117441, + "learning_rate": 3.6047232911462506e-09, + "loss": 0.4627, + "step": 12202 + }, + { + "epoch": 0.99, + "grad_norm": 0.9303895789749116, + "learning_rate": 3.5343930605002964e-09, + "loss": 0.4983, + "step": 12203 + }, + { + "epoch": 0.99, + "grad_norm": 0.9814676213506741, + "learning_rate": 3.4647555760547456e-09, + "loss": 0.5019, + "step": 12204 + }, + { + "epoch": 0.99, + "grad_norm": 0.8899742237254751, + "learning_rate": 3.3958108426346283e-09, + "loss": 0.4679, + "step": 12205 + }, + { + "epoch": 0.99, + "grad_norm": 0.9854215413848316, + "learning_rate": 3.327558865017233e-09, + "loss": 0.4697, + "step": 12206 + }, + { + "epoch": 0.99, + "grad_norm": 0.9949671671252079, + "learning_rate": 3.259999647933221e-09, + "loss": 0.4586, + "step": 12207 + }, + { + "epoch": 0.99, + "grad_norm": 0.9767718153567564, + "learning_rate": 3.193133196064402e-09, + "loss": 0.5054, + "step": 12208 + }, + { + "epoch": 0.99, + "grad_norm": 1.0785275131904468, + "learning_rate": 3.126959514043737e-09, + "loss": 0.5084, + "step": 12209 + }, + { + "epoch": 0.99, + "grad_norm": 0.8674149722017763, + "learning_rate": 3.0614786064586676e-09, + "loss": 0.4421, + "step": 12210 + }, + { + "epoch": 0.99, + "grad_norm": 0.9315561827978559, + "learning_rate": 2.996690477844455e-09, + "loss": 0.4992, + "step": 12211 + }, + { + "epoch": 0.99, + "grad_norm": 0.9269373342663652, + "learning_rate": 2.9325951326930614e-09, + "loss": 0.5106, + "step": 12212 + }, + { + "epoch": 0.99, + "grad_norm": 1.0034662424592324, + "learning_rate": 2.8691925754453785e-09, + "loss": 0.5164, + "step": 12213 + }, + { + "epoch": 0.99, + "grad_norm": 0.9529358562010914, + "learning_rate": 2.8064828104956697e-09, + "loss": 0.4973, + "step": 12214 + }, + { + "epoch": 0.99, + "grad_norm": 1.0100316056822873, + "learning_rate": 2.7444658421882375e-09, + "loss": 0.516, + "step": 12215 + }, + { + "epoch": 0.99, + "grad_norm": 0.947131493385448, + "learning_rate": 2.6831416748229755e-09, + "loss": 0.4978, + "step": 12216 + }, + { + "epoch": 0.99, + "grad_norm": 1.013465954899412, + "learning_rate": 2.622510312647597e-09, + "loss": 0.4745, + "step": 12217 + }, + { + "epoch": 0.99, + "grad_norm": 1.0572863965813828, + "learning_rate": 2.5625717598642962e-09, + "loss": 0.5839, + "step": 12218 + }, + { + "epoch": 0.99, + "grad_norm": 0.8450663723277408, + "learning_rate": 2.5033260206275277e-09, + "loss": 0.4414, + "step": 12219 + }, + { + "epoch": 0.99, + "grad_norm": 0.9349043179917564, + "learning_rate": 2.4447730990428964e-09, + "loss": 0.4565, + "step": 12220 + }, + { + "epoch": 0.99, + "grad_norm": 0.8921858303213847, + "learning_rate": 2.386912999167157e-09, + "loss": 0.4308, + "step": 12221 + }, + { + "epoch": 0.99, + "grad_norm": 1.1106808138466342, + "learning_rate": 2.329745725010435e-09, + "loss": 0.5236, + "step": 12222 + }, + { + "epoch": 0.99, + "grad_norm": 0.9107858977787934, + "learning_rate": 2.273271280534006e-09, + "loss": 0.4344, + "step": 12223 + }, + { + "epoch": 0.99, + "grad_norm": 1.0104185102825205, + "learning_rate": 2.217489669652517e-09, + "loss": 0.5316, + "step": 12224 + }, + { + "epoch": 0.99, + "grad_norm": 0.9485027710249507, + "learning_rate": 2.1624008962306543e-09, + "loss": 0.4754, + "step": 12225 + }, + { + "epoch": 0.99, + "grad_norm": 0.886480688086442, + "learning_rate": 2.108004964086474e-09, + "loss": 0.4888, + "step": 12226 + }, + { + "epoch": 0.99, + "grad_norm": 0.937058241545652, + "learning_rate": 2.0543018769902946e-09, + "loss": 0.4611, + "step": 12227 + }, + { + "epoch": 0.99, + "grad_norm": 0.9598546147109381, + "learning_rate": 2.0012916386613625e-09, + "loss": 0.4744, + "step": 12228 + }, + { + "epoch": 0.99, + "grad_norm": 0.974531542912316, + "learning_rate": 1.9489742527756263e-09, + "loss": 0.5349, + "step": 12229 + }, + { + "epoch": 0.99, + "grad_norm": 0.9519705243013669, + "learning_rate": 1.8973497229568537e-09, + "loss": 0.4761, + "step": 12230 + }, + { + "epoch": 0.99, + "grad_norm": 0.9238988769473946, + "learning_rate": 1.8464180527844044e-09, + "loss": 0.4833, + "step": 12231 + }, + { + "epoch": 0.99, + "grad_norm": 1.0111873319625468, + "learning_rate": 1.7961792457865668e-09, + "loss": 0.5243, + "step": 12232 + }, + { + "epoch": 0.99, + "grad_norm": 0.967548228009322, + "learning_rate": 1.7466333054450001e-09, + "loss": 0.5089, + "step": 12233 + }, + { + "epoch": 0.99, + "grad_norm": 0.918909362153841, + "learning_rate": 1.6977802351936246e-09, + "loss": 0.4543, + "step": 12234 + }, + { + "epoch": 0.99, + "grad_norm": 0.7785482161984433, + "learning_rate": 1.6496200384163996e-09, + "loss": 0.3861, + "step": 12235 + }, + { + "epoch": 0.99, + "grad_norm": 0.9384527289121802, + "learning_rate": 1.6021527184528761e-09, + "loss": 0.4751, + "step": 12236 + }, + { + "epoch": 0.99, + "grad_norm": 0.9505856087988792, + "learning_rate": 1.555378278591535e-09, + "loss": 0.4957, + "step": 12237 + }, + { + "epoch": 0.99, + "grad_norm": 0.9896863520416174, + "learning_rate": 1.5092967220742272e-09, + "loss": 0.4905, + "step": 12238 + }, + { + "epoch": 0.99, + "grad_norm": 1.0768453855263935, + "learning_rate": 1.4639080520939541e-09, + "loss": 0.5181, + "step": 12239 + }, + { + "epoch": 0.99, + "grad_norm": 1.0133044427198161, + "learning_rate": 1.4192122717959777e-09, + "loss": 0.5047, + "step": 12240 + }, + { + "epoch": 0.99, + "grad_norm": 0.9404999214974434, + "learning_rate": 1.3752093842778204e-09, + "loss": 0.4799, + "step": 12241 + }, + { + "epoch": 0.99, + "grad_norm": 1.0552931054831358, + "learning_rate": 1.3318993925881541e-09, + "loss": 0.5162, + "step": 12242 + }, + { + "epoch": 1.0, + "grad_norm": 1.0552380665620096, + "learning_rate": 1.2892822997301324e-09, + "loss": 0.5035, + "step": 12243 + }, + { + "epoch": 1.0, + "grad_norm": 0.9391633731274371, + "learning_rate": 1.2473581086558383e-09, + "loss": 0.4921, + "step": 12244 + }, + { + "epoch": 1.0, + "grad_norm": 1.0052164661643428, + "learning_rate": 1.2061268222707257e-09, + "loss": 0.4663, + "step": 12245 + }, + { + "epoch": 1.0, + "grad_norm": 0.9722915792156718, + "learning_rate": 1.165588443431398e-09, + "loss": 0.4943, + "step": 12246 + }, + { + "epoch": 1.0, + "grad_norm": 0.9102927236377634, + "learning_rate": 1.12574297494783e-09, + "loss": 0.4471, + "step": 12247 + }, + { + "epoch": 1.0, + "grad_norm": 0.9398310361694413, + "learning_rate": 1.0865904195822563e-09, + "loss": 0.4707, + "step": 12248 + }, + { + "epoch": 1.0, + "grad_norm": 0.808454573445147, + "learning_rate": 1.048130780046952e-09, + "loss": 0.398, + "step": 12249 + }, + { + "epoch": 1.0, + "grad_norm": 0.9746782424320476, + "learning_rate": 1.0103640590064524e-09, + "loss": 0.5442, + "step": 12250 + }, + { + "epoch": 1.0, + "grad_norm": 0.9649576296411937, + "learning_rate": 9.73290259078663e-10, + "loss": 0.5274, + "step": 12251 + }, + { + "epoch": 1.0, + "grad_norm": 0.8107375447971296, + "learning_rate": 9.369093828326403e-10, + "loss": 0.431, + "step": 12252 + }, + { + "epoch": 1.0, + "grad_norm": 1.0699241121892324, + "learning_rate": 9.012214327897006e-10, + "loss": 0.5015, + "step": 12253 + }, + { + "epoch": 1.0, + "grad_norm": 0.905378821625332, + "learning_rate": 8.662264114234209e-10, + "loss": 0.4383, + "step": 12254 + }, + { + "epoch": 1.0, + "grad_norm": 1.0107933303670742, + "learning_rate": 8.319243211585281e-10, + "loss": 0.4994, + "step": 12255 + }, + { + "epoch": 1.0, + "grad_norm": 0.8628225975921563, + "learning_rate": 7.983151643708997e-10, + "loss": 0.4456, + "step": 12256 + }, + { + "epoch": 1.0, + "grad_norm": 1.0056042550071769, + "learning_rate": 7.653989433920039e-10, + "loss": 0.5566, + "step": 12257 + }, + { + "epoch": 1.0, + "grad_norm": 1.0487775395495176, + "learning_rate": 7.331756605011286e-10, + "loss": 0.511, + "step": 12258 + }, + { + "epoch": 1.0, + "grad_norm": 0.9034474611671817, + "learning_rate": 7.016453179320426e-10, + "loss": 0.4756, + "step": 12259 + }, + { + "epoch": 1.0, + "grad_norm": 0.860531555843401, + "learning_rate": 6.708079178685545e-10, + "loss": 0.4613, + "step": 12260 + }, + { + "epoch": 1.0, + "grad_norm": 0.894604087497659, + "learning_rate": 6.40663462450064e-10, + "loss": 0.4818, + "step": 12261 + }, + { + "epoch": 1.0, + "grad_norm": 0.956638263621099, + "learning_rate": 6.112119537637906e-10, + "loss": 0.4995, + "step": 12262 + }, + { + "epoch": 1.0, + "grad_norm": 0.950141944738411, + "learning_rate": 5.82453393850324e-10, + "loss": 0.4644, + "step": 12263 + }, + { + "epoch": 1.0, + "grad_norm": 0.9197142556995642, + "learning_rate": 5.54387784703625e-10, + "loss": 0.4667, + "step": 12264 + }, + { + "epoch": 1.0, + "grad_norm": 0.9409690029323907, + "learning_rate": 5.270151282688041e-10, + "loss": 0.4442, + "step": 12265 + }, + { + "epoch": 1.0, + "grad_norm": 0.9224085642149458, + "learning_rate": 5.003354264421223e-10, + "loss": 0.4995, + "step": 12266 + }, + { + "epoch": 1.0, + "grad_norm": 0.9039475339596145, + "learning_rate": 4.743486810732111e-10, + "loss": 0.4782, + "step": 12267 + }, + { + "epoch": 1.0, + "grad_norm": 0.9500554852385881, + "learning_rate": 4.490548939617423e-10, + "loss": 0.5374, + "step": 12268 + }, + { + "epoch": 1.0, + "grad_norm": 1.021632409812602, + "learning_rate": 4.2445406686075776e-10, + "loss": 0.5037, + "step": 12269 + }, + { + "epoch": 1.0, + "grad_norm": 0.9019301685248371, + "learning_rate": 4.005462014766703e-10, + "loss": 0.4847, + "step": 12270 + }, + { + "epoch": 1.0, + "grad_norm": 1.0448459387319498, + "learning_rate": 3.7733129946371237e-10, + "loss": 0.5018, + "step": 12271 + }, + { + "epoch": 1.0, + "grad_norm": 0.9567017288240108, + "learning_rate": 3.548093624328175e-10, + "loss": 0.5002, + "step": 12272 + }, + { + "epoch": 1.0, + "grad_norm": 0.9393370125389402, + "learning_rate": 3.32980391943849e-10, + "loss": 0.5022, + "step": 12273 + }, + { + "epoch": 1.0, + "grad_norm": 1.0039457861354868, + "learning_rate": 3.118443895100409e-10, + "loss": 0.4833, + "step": 12274 + }, + { + "epoch": 1.0, + "grad_norm": 0.876708231374574, + "learning_rate": 2.914013565957774e-10, + "loss": 0.4747, + "step": 12275 + }, + { + "epoch": 1.0, + "grad_norm": 1.0483753373478573, + "learning_rate": 2.716512946165928e-10, + "loss": 0.5458, + "step": 12276 + }, + { + "epoch": 1.0, + "grad_norm": 5.854264871286635, + "learning_rate": 2.525942049436125e-10, + "loss": 0.4194, + "step": 12277 + }, + { + "epoch": 1.0, + "grad_norm": 0.9330203416041252, + "learning_rate": 2.3423008889467134e-10, + "loss": 0.4675, + "step": 12278 + }, + { + "epoch": 1.0, + "grad_norm": 0.928285603265314, + "learning_rate": 2.165589477443053e-10, + "loss": 0.4496, + "step": 12279 + }, + { + "epoch": 1.0, + "grad_norm": 0.896315861070505, + "learning_rate": 1.9958078271709037e-10, + "loss": 0.5164, + "step": 12280 + }, + { + "epoch": 1.0, + "grad_norm": 0.9970984789464427, + "learning_rate": 1.8329559498875272e-10, + "loss": 0.4903, + "step": 12281 + }, + { + "epoch": 1.0, + "grad_norm": 0.9862019097831741, + "learning_rate": 1.6770338568838918e-10, + "loss": 0.4294, + "step": 12282 + }, + { + "epoch": 1.0, + "grad_norm": 0.8625424437572193, + "learning_rate": 1.5280415589624676e-10, + "loss": 0.4841, + "step": 12283 + }, + { + "epoch": 1.0, + "grad_norm": 2.6996509177567276, + "learning_rate": 1.3859790664483287e-10, + "loss": 0.5431, + "step": 12284 + }, + { + "epoch": 1.0, + "grad_norm": 0.983653006981926, + "learning_rate": 1.250846389189153e-10, + "loss": 0.4933, + "step": 12285 + }, + { + "epoch": 1.0, + "grad_norm": 0.9691698910333484, + "learning_rate": 1.1226435365441212e-10, + "loss": 0.478, + "step": 12286 + }, + { + "epoch": 1.0, + "grad_norm": 0.8999462139403965, + "learning_rate": 1.0013705174061195e-10, + "loss": 0.4665, + "step": 12287 + }, + { + "epoch": 1.0, + "grad_norm": 0.9701615848783587, + "learning_rate": 8.870273401684338e-11, + "loss": 0.469, + "step": 12288 + }, + { + "epoch": 1.0, + "grad_norm": 0.9788648850371204, + "learning_rate": 7.796140127691587e-11, + "loss": 0.4688, + "step": 12289 + }, + { + "epoch": 1.0, + "grad_norm": 0.9041305602197512, + "learning_rate": 6.791305426356865e-11, + "loss": 0.4562, + "step": 12290 + }, + { + "epoch": 1.0, + "grad_norm": 0.9908625759964073, + "learning_rate": 5.855769367402176e-11, + "loss": 0.5208, + "step": 12291 + }, + { + "epoch": 1.0, + "grad_norm": 0.9195748995284373, + "learning_rate": 4.9895320156645445e-11, + "loss": 0.407, + "step": 12292 + }, + { + "epoch": 1.0, + "grad_norm": 1.0408210431502338, + "learning_rate": 4.192593431096015e-11, + "loss": 0.5351, + "step": 12293 + }, + { + "epoch": 1.0, + "grad_norm": 0.9092443947587706, + "learning_rate": 3.4649536690967154e-11, + "loss": 0.4844, + "step": 12294 + }, + { + "epoch": 1.0, + "grad_norm": 0.9725729468080113, + "learning_rate": 2.8066127798487274e-11, + "loss": 0.5036, + "step": 12295 + }, + { + "epoch": 1.0, + "grad_norm": 0.9842735837618908, + "learning_rate": 2.2175708092042615e-11, + "loss": 0.5061, + "step": 12296 + }, + { + "epoch": 1.0, + "grad_norm": 0.9175628272236795, + "learning_rate": 1.6978277979085023e-11, + "loss": 0.4502, + "step": 12297 + }, + { + "epoch": 1.0, + "grad_norm": 1.0374820658516282, + "learning_rate": 1.2473837819326762e-11, + "loss": 0.4847, + "step": 12298 + }, + { + "epoch": 1.0, + "grad_norm": 0.8909161564068082, + "learning_rate": 8.662387924740501e-12, + "loss": 0.4672, + "step": 12299 + }, + { + "epoch": 1.0, + "grad_norm": 0.9415518726489468, + "learning_rate": 5.54392855955932e-12, + "loss": 0.4625, + "step": 12300 + }, + { + "epoch": 1.0, + "grad_norm": 0.9500390955938162, + "learning_rate": 3.118459941386931e-12, + "loss": 0.4868, + "step": 12301 + }, + { + "epoch": 1.0, + "grad_norm": 0.922814982524896, + "learning_rate": 1.3859822356465657e-12, + "loss": 0.5076, + "step": 12302 + }, + { + "epoch": 1.0, + "grad_norm": 1.0370158785381784, + "learning_rate": 3.4649556446275656e-13, + "loss": 0.5309, + "step": 12303 + }, + { + "epoch": 1.0, + "grad_norm": 0.844434207744529, + "learning_rate": 0.0, + "loss": 0.4437, + "step": 12304 + }, + { + "epoch": 1.0, + "step": 12304, + "total_flos": 1.5084962346172416e+16, + "train_loss": 0.5391508782323648, + "train_runtime": 261842.5175, + "train_samples_per_second": 6.015, + "train_steps_per_second": 0.047 + } + ], + "logging_steps": 1.0, + "max_steps": 12304, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1846, + "total_flos": 1.5084962346172416e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}