{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.96879875195008, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0998439937597504, "grad_norm": 0.4140388733942774, "learning_rate": 0.0, "loss": 1.9149, "step": 1 }, { "epoch": 0.1996879875195008, "grad_norm": 4.125320422125423, "learning_rate": 0.0001, "loss": 2.0842, "step": 2 }, { "epoch": 0.2995319812792512, "grad_norm": 4.114561571796027, "learning_rate": 0.0001, "loss": 2.0845, "step": 3 }, { "epoch": 0.3993759750390016, "grad_norm": 7.814135082802521, "learning_rate": 9.94949494949495e-05, "loss": 2.2529, "step": 4 }, { "epoch": 0.49921996879875197, "grad_norm": 4.959578007928057, "learning_rate": 9.8989898989899e-05, "loss": 2.1802, "step": 5 }, { "epoch": 0.5990639625585024, "grad_norm": 4.020980884960991, "learning_rate": 9.848484848484849e-05, "loss": 2.0996, "step": 6 }, { "epoch": 0.6989079563182528, "grad_norm": 2.913148956691379, "learning_rate": 9.797979797979798e-05, "loss": 2.0739, "step": 7 }, { "epoch": 0.7987519500780031, "grad_norm": 1.5904378694728827, "learning_rate": 9.747474747474747e-05, "loss": 2.0438, "step": 8 }, { "epoch": 0.8985959438377535, "grad_norm": 1.1120971271683253, "learning_rate": 9.696969696969698e-05, "loss": 2.023, "step": 9 }, { "epoch": 0.9984399375975039, "grad_norm": 1.4857926676190432, "learning_rate": 9.646464646464647e-05, "loss": 2.0256, "step": 10 }, { "epoch": 1.0982839313572543, "grad_norm": 1.7514837650567026, "learning_rate": 9.595959595959596e-05, "loss": 2.0297, "step": 11 }, { "epoch": 1.1981279251170047, "grad_norm": 1.3684063083659384, "learning_rate": 9.545454545454546e-05, "loss": 1.9984, "step": 12 }, { "epoch": 1.2979719188767551, "grad_norm": 0.8391319984906789, "learning_rate": 9.494949494949495e-05, "loss": 1.9938, "step": 13 }, { "epoch": 1.3978159126365055, "grad_norm": 0.7680152250335479, "learning_rate": 9.444444444444444e-05, "loss": 1.9984, "step": 14 }, { "epoch": 1.497659906396256, "grad_norm": 1.1427900590537006, "learning_rate": 9.393939393939395e-05, "loss": 2.0059, "step": 15 }, { "epoch": 1.5975039001560063, "grad_norm": 1.2852588832884364, "learning_rate": 9.343434343434344e-05, "loss": 2.0057, "step": 16 }, { "epoch": 1.6973478939157567, "grad_norm": 0.8509981577726656, "learning_rate": 9.292929292929293e-05, "loss": 1.97, "step": 17 }, { "epoch": 1.797191887675507, "grad_norm": 0.4374249257660765, "learning_rate": 9.242424242424242e-05, "loss": 1.9852, "step": 18 }, { "epoch": 1.8970358814352575, "grad_norm": 1.006945747433108, "learning_rate": 9.191919191919192e-05, "loss": 1.9736, "step": 19 }, { "epoch": 1.9968798751950079, "grad_norm": 1.1714326150671521, "learning_rate": 9.141414141414141e-05, "loss": 1.9866, "step": 20 }, { "epoch": 2.0967238689547583, "grad_norm": 0.6697915843016325, "learning_rate": 9.090909090909092e-05, "loss": 1.9669, "step": 21 }, { "epoch": 2.1965678627145087, "grad_norm": 0.43542954442572934, "learning_rate": 9.040404040404041e-05, "loss": 1.9596, "step": 22 }, { "epoch": 2.296411856474259, "grad_norm": 0.8895989581186896, "learning_rate": 8.98989898989899e-05, "loss": 1.9777, "step": 23 }, { "epoch": 2.3962558502340094, "grad_norm": 0.748473401890919, "learning_rate": 8.93939393939394e-05, "loss": 1.9828, "step": 24 }, { "epoch": 2.49609984399376, "grad_norm": 0.4762840239068188, "learning_rate": 8.888888888888889e-05, "loss": 1.9863, "step": 25 }, { "epoch": 2.5959438377535102, "grad_norm": 0.4634914120924797, "learning_rate": 8.83838383838384e-05, "loss": 1.9728, "step": 26 }, { "epoch": 2.6957878315132606, "grad_norm": 0.576721007312459, "learning_rate": 8.787878787878789e-05, "loss": 1.9813, "step": 27 }, { "epoch": 2.795631825273011, "grad_norm": 0.4717088615288276, "learning_rate": 8.737373737373738e-05, "loss": 1.9709, "step": 28 }, { "epoch": 2.8954758190327614, "grad_norm": 0.5243076095653101, "learning_rate": 8.686868686868688e-05, "loss": 1.9889, "step": 29 }, { "epoch": 2.995319812792512, "grad_norm": 0.35563844256479116, "learning_rate": 8.636363636363637e-05, "loss": 1.969, "step": 30 }, { "epoch": 3.095163806552262, "grad_norm": 0.5040313506272054, "learning_rate": 8.585858585858586e-05, "loss": 1.9701, "step": 31 }, { "epoch": 3.1950078003120126, "grad_norm": 0.5293887294443628, "learning_rate": 8.535353535353535e-05, "loss": 1.9774, "step": 32 }, { "epoch": 3.294851794071763, "grad_norm": 0.33336016676106733, "learning_rate": 8.484848484848486e-05, "loss": 1.9702, "step": 33 }, { "epoch": 3.3946957878315134, "grad_norm": 0.5156182664373749, "learning_rate": 8.434343434343435e-05, "loss": 1.9552, "step": 34 }, { "epoch": 3.4945397815912638, "grad_norm": 0.410792592829029, "learning_rate": 8.383838383838384e-05, "loss": 1.9642, "step": 35 }, { "epoch": 3.594383775351014, "grad_norm": 0.40267682408922495, "learning_rate": 8.333333333333334e-05, "loss": 1.9688, "step": 36 }, { "epoch": 3.6942277691107646, "grad_norm": 0.3869359148412346, "learning_rate": 8.282828282828283e-05, "loss": 1.9733, "step": 37 }, { "epoch": 3.794071762870515, "grad_norm": 0.37728712869432585, "learning_rate": 8.232323232323233e-05, "loss": 1.9648, "step": 38 }, { "epoch": 3.8939157566302653, "grad_norm": 0.3922418131207954, "learning_rate": 8.181818181818183e-05, "loss": 1.9689, "step": 39 }, { "epoch": 3.9937597503900157, "grad_norm": 0.26353046722639645, "learning_rate": 8.131313131313132e-05, "loss": 1.9727, "step": 40 }, { "epoch": 4.093603744149766, "grad_norm": 0.3911091474488452, "learning_rate": 8.080808080808081e-05, "loss": 1.9631, "step": 41 }, { "epoch": 4.1934477379095165, "grad_norm": 0.33402240826623614, "learning_rate": 8.03030303030303e-05, "loss": 1.9665, "step": 42 }, { "epoch": 4.2932917316692665, "grad_norm": 0.34654808232868395, "learning_rate": 7.97979797979798e-05, "loss": 1.9646, "step": 43 }, { "epoch": 4.393135725429017, "grad_norm": 0.3031078864703629, "learning_rate": 7.92929292929293e-05, "loss": 1.9693, "step": 44 }, { "epoch": 4.492979719188767, "grad_norm": 0.35342072957234116, "learning_rate": 7.878787878787879e-05, "loss": 1.9688, "step": 45 }, { "epoch": 4.592823712948518, "grad_norm": 0.3918161921811716, "learning_rate": 7.828282828282829e-05, "loss": 1.9609, "step": 46 }, { "epoch": 4.692667706708268, "grad_norm": 0.24995683506017796, "learning_rate": 7.777777777777778e-05, "loss": 1.9515, "step": 47 }, { "epoch": 4.792511700468019, "grad_norm": 0.3308078104166398, "learning_rate": 7.727272727272727e-05, "loss": 1.9607, "step": 48 }, { "epoch": 4.892355694227769, "grad_norm": 0.3130926472973521, "learning_rate": 7.676767676767676e-05, "loss": 1.9699, "step": 49 }, { "epoch": 4.99219968798752, "grad_norm": 0.30892356920484393, "learning_rate": 7.626262626262627e-05, "loss": 1.9645, "step": 50 }, { "epoch": 5.09204368174727, "grad_norm": 0.2804202715276883, "learning_rate": 7.575757575757576e-05, "loss": 1.9569, "step": 51 }, { "epoch": 5.1918876755070205, "grad_norm": 0.2789049399636327, "learning_rate": 7.525252525252525e-05, "loss": 1.9585, "step": 52 }, { "epoch": 5.29173166926677, "grad_norm": 0.2906929505804403, "learning_rate": 7.474747474747475e-05, "loss": 1.9565, "step": 53 }, { "epoch": 5.391575663026521, "grad_norm": 0.2033727950080347, "learning_rate": 7.424242424242424e-05, "loss": 1.9755, "step": 54 }, { "epoch": 5.491419656786271, "grad_norm": 0.31364461369416163, "learning_rate": 7.373737373737373e-05, "loss": 1.9647, "step": 55 }, { "epoch": 5.591263650546022, "grad_norm": 0.2531087381531638, "learning_rate": 7.323232323232324e-05, "loss": 1.9578, "step": 56 }, { "epoch": 5.691107644305772, "grad_norm": 0.23764498764830225, "learning_rate": 7.272727272727273e-05, "loss": 1.9617, "step": 57 }, { "epoch": 5.790951638065523, "grad_norm": 0.24888591334854687, "learning_rate": 7.222222222222222e-05, "loss": 1.963, "step": 58 }, { "epoch": 5.890795631825273, "grad_norm": 0.2647075657405339, "learning_rate": 7.171717171717171e-05, "loss": 1.9685, "step": 59 }, { "epoch": 5.990639625585024, "grad_norm": 0.27820470985704615, "learning_rate": 7.121212121212121e-05, "loss": 1.9654, "step": 60 }, { "epoch": 6.090483619344774, "grad_norm": 0.20068946885468097, "learning_rate": 7.07070707070707e-05, "loss": 1.9667, "step": 61 }, { "epoch": 6.190327613104524, "grad_norm": 0.25026234630326394, "learning_rate": 7.020202020202021e-05, "loss": 1.9542, "step": 62 }, { "epoch": 6.290171606864274, "grad_norm": 0.22856925269883635, "learning_rate": 6.96969696969697e-05, "loss": 1.9573, "step": 63 }, { "epoch": 6.390015600624025, "grad_norm": 0.2392183076591563, "learning_rate": 6.91919191919192e-05, "loss": 1.9647, "step": 64 }, { "epoch": 6.489859594383775, "grad_norm": 0.20384525102843132, "learning_rate": 6.86868686868687e-05, "loss": 1.9628, "step": 65 }, { "epoch": 6.589703588143526, "grad_norm": 0.23941897200051984, "learning_rate": 6.818181818181818e-05, "loss": 1.9667, "step": 66 }, { "epoch": 6.689547581903276, "grad_norm": 0.20375278551444306, "learning_rate": 6.767676767676769e-05, "loss": 1.9572, "step": 67 }, { "epoch": 6.789391575663027, "grad_norm": 0.20727005267599333, "learning_rate": 6.717171717171718e-05, "loss": 1.9581, "step": 68 }, { "epoch": 6.889235569422777, "grad_norm": 0.22300809533132504, "learning_rate": 6.666666666666667e-05, "loss": 1.9693, "step": 69 }, { "epoch": 6.9890795631825275, "grad_norm": 0.21742318730398613, "learning_rate": 6.616161616161617e-05, "loss": 1.9656, "step": 70 }, { "epoch": 7.0889235569422775, "grad_norm": 0.20343822391223, "learning_rate": 6.565656565656566e-05, "loss": 1.9656, "step": 71 }, { "epoch": 7.188767550702028, "grad_norm": 0.2364200066637671, "learning_rate": 6.515151515151516e-05, "loss": 1.95, "step": 72 }, { "epoch": 7.288611544461778, "grad_norm": 0.18261048615751524, "learning_rate": 6.464646464646466e-05, "loss": 1.9538, "step": 73 }, { "epoch": 7.388455538221529, "grad_norm": 0.24533487813163474, "learning_rate": 6.414141414141415e-05, "loss": 1.952, "step": 74 }, { "epoch": 7.488299531981279, "grad_norm": 0.2539612930496735, "learning_rate": 6.363636363636364e-05, "loss": 1.9598, "step": 75 }, { "epoch": 7.58814352574103, "grad_norm": 0.2991457603613546, "learning_rate": 6.313131313131313e-05, "loss": 1.9733, "step": 76 }, { "epoch": 7.68798751950078, "grad_norm": 0.2209105824741669, "learning_rate": 6.262626262626264e-05, "loss": 1.9531, "step": 77 }, { "epoch": 7.787831513260531, "grad_norm": 0.31698310918964695, "learning_rate": 6.212121212121213e-05, "loss": 1.964, "step": 78 }, { "epoch": 7.887675507020281, "grad_norm": 0.17584276182725114, "learning_rate": 6.161616161616162e-05, "loss": 1.9644, "step": 79 }, { "epoch": 7.9875195007800315, "grad_norm": 0.29996919622824225, "learning_rate": 6.111111111111112e-05, "loss": 1.9555, "step": 80 }, { "epoch": 8.087363494539781, "grad_norm": 0.2406502202676367, "learning_rate": 6.060606060606061e-05, "loss": 1.9554, "step": 81 }, { "epoch": 8.187207488299531, "grad_norm": 0.32705142732170855, "learning_rate": 6.01010101010101e-05, "loss": 1.9517, "step": 82 }, { "epoch": 8.287051482059283, "grad_norm": 0.27249925952338305, "learning_rate": 5.959595959595959e-05, "loss": 1.9474, "step": 83 }, { "epoch": 8.386895475819033, "grad_norm": 0.29448831027669287, "learning_rate": 5.90909090909091e-05, "loss": 1.9459, "step": 84 }, { "epoch": 8.486739469578783, "grad_norm": 0.29998154037028857, "learning_rate": 5.858585858585859e-05, "loss": 1.9606, "step": 85 }, { "epoch": 8.586583463338533, "grad_norm": 0.23153724936859055, "learning_rate": 5.808080808080808e-05, "loss": 1.9598, "step": 86 }, { "epoch": 8.686427457098285, "grad_norm": 0.22081595887056477, "learning_rate": 5.757575757575758e-05, "loss": 1.9586, "step": 87 }, { "epoch": 8.786271450858035, "grad_norm": 0.19177670537863922, "learning_rate": 5.707070707070707e-05, "loss": 1.9715, "step": 88 }, { "epoch": 8.886115444617785, "grad_norm": 0.25725928107907137, "learning_rate": 5.6565656565656563e-05, "loss": 1.9602, "step": 89 }, { "epoch": 8.985959438377535, "grad_norm": 0.26044371305524344, "learning_rate": 5.606060606060606e-05, "loss": 1.9607, "step": 90 }, { "epoch": 9.085803432137286, "grad_norm": 0.23728151561491595, "learning_rate": 5.555555555555556e-05, "loss": 1.9588, "step": 91 }, { "epoch": 9.185647425897036, "grad_norm": 0.20354348868729488, "learning_rate": 5.5050505050505056e-05, "loss": 1.9492, "step": 92 }, { "epoch": 9.285491419656786, "grad_norm": 0.18672087839741056, "learning_rate": 5.4545454545454546e-05, "loss": 1.9457, "step": 93 }, { "epoch": 9.385335413416536, "grad_norm": 0.1939858201242329, "learning_rate": 5.4040404040404044e-05, "loss": 1.9453, "step": 94 }, { "epoch": 9.485179407176288, "grad_norm": 0.19172060706771135, "learning_rate": 5.353535353535354e-05, "loss": 1.958, "step": 95 }, { "epoch": 9.585023400936038, "grad_norm": 0.1837920882880991, "learning_rate": 5.303030303030303e-05, "loss": 1.9577, "step": 96 }, { "epoch": 9.684867394695788, "grad_norm": 0.2162949878555464, "learning_rate": 5.2525252525252536e-05, "loss": 1.9622, "step": 97 }, { "epoch": 9.784711388455538, "grad_norm": 0.19325381586186333, "learning_rate": 5.2020202020202026e-05, "loss": 1.9433, "step": 98 }, { "epoch": 9.88455538221529, "grad_norm": 0.2018142831658023, "learning_rate": 5.151515151515152e-05, "loss": 1.9605, "step": 99 }, { "epoch": 9.98439937597504, "grad_norm": 0.176671565601027, "learning_rate": 5.101010101010101e-05, "loss": 1.9578, "step": 100 }, { "epoch": 10.08424336973479, "grad_norm": 0.2117788085352089, "learning_rate": 5.050505050505051e-05, "loss": 1.9478, "step": 101 }, { "epoch": 10.18408736349454, "grad_norm": 0.1816135304249716, "learning_rate": 5e-05, "loss": 1.9423, "step": 102 }, { "epoch": 10.283931357254291, "grad_norm": 0.2680310363226074, "learning_rate": 4.94949494949495e-05, "loss": 1.9519, "step": 103 }, { "epoch": 10.383775351014041, "grad_norm": 0.17934299698555412, "learning_rate": 4.898989898989899e-05, "loss": 1.9625, "step": 104 }, { "epoch": 10.48361934477379, "grad_norm": 0.19786074542682824, "learning_rate": 4.848484848484849e-05, "loss": 1.95, "step": 105 }, { "epoch": 10.58346333853354, "grad_norm": 0.17490489580858018, "learning_rate": 4.797979797979798e-05, "loss": 1.9513, "step": 106 }, { "epoch": 10.683307332293293, "grad_norm": 0.224513887757472, "learning_rate": 4.7474747474747476e-05, "loss": 1.9499, "step": 107 }, { "epoch": 10.783151326053042, "grad_norm": 0.16993980203530532, "learning_rate": 4.696969696969697e-05, "loss": 1.944, "step": 108 }, { "epoch": 10.882995319812792, "grad_norm": 0.18436224376063975, "learning_rate": 4.6464646464646464e-05, "loss": 1.9494, "step": 109 }, { "epoch": 10.982839313572542, "grad_norm": 0.1858801208244774, "learning_rate": 4.595959595959596e-05, "loss": 1.9504, "step": 110 }, { "epoch": 11.082683307332294, "grad_norm": 0.21397140437157122, "learning_rate": 4.545454545454546e-05, "loss": 1.9467, "step": 111 }, { "epoch": 11.182527301092044, "grad_norm": 0.16934479906947233, "learning_rate": 4.494949494949495e-05, "loss": 1.9475, "step": 112 }, { "epoch": 11.282371294851794, "grad_norm": 0.17710883760980722, "learning_rate": 4.4444444444444447e-05, "loss": 1.9418, "step": 113 }, { "epoch": 11.382215288611544, "grad_norm": 0.2278025006688675, "learning_rate": 4.3939393939393944e-05, "loss": 1.9456, "step": 114 }, { "epoch": 11.482059282371296, "grad_norm": 0.18727166458531316, "learning_rate": 4.343434343434344e-05, "loss": 1.9408, "step": 115 }, { "epoch": 11.581903276131046, "grad_norm": 0.17348080665741175, "learning_rate": 4.292929292929293e-05, "loss": 1.9367, "step": 116 }, { "epoch": 11.681747269890796, "grad_norm": 0.21559975863343248, "learning_rate": 4.242424242424243e-05, "loss": 1.9509, "step": 117 }, { "epoch": 11.781591263650546, "grad_norm": 0.20515384184563593, "learning_rate": 4.191919191919192e-05, "loss": 1.9503, "step": 118 }, { "epoch": 11.881435257410295, "grad_norm": 0.17579996751101729, "learning_rate": 4.141414141414142e-05, "loss": 1.9443, "step": 119 }, { "epoch": 11.981279251170047, "grad_norm": 0.1870399234707776, "learning_rate": 4.0909090909090915e-05, "loss": 1.9507, "step": 120 }, { "epoch": 12.081123244929797, "grad_norm": 0.2323975590399996, "learning_rate": 4.0404040404040405e-05, "loss": 1.9486, "step": 121 }, { "epoch": 12.180967238689547, "grad_norm": 0.17332911391024705, "learning_rate": 3.98989898989899e-05, "loss": 1.9441, "step": 122 }, { "epoch": 12.280811232449299, "grad_norm": 0.23886491083540215, "learning_rate": 3.939393939393939e-05, "loss": 1.9489, "step": 123 }, { "epoch": 12.380655226209049, "grad_norm": 0.192192583869745, "learning_rate": 3.888888888888889e-05, "loss": 1.936, "step": 124 }, { "epoch": 12.480499219968799, "grad_norm": 0.24070020033146947, "learning_rate": 3.838383838383838e-05, "loss": 1.9363, "step": 125 }, { "epoch": 12.580343213728549, "grad_norm": 0.17061145664967614, "learning_rate": 3.787878787878788e-05, "loss": 1.947, "step": 126 }, { "epoch": 12.680187207488299, "grad_norm": 0.20420044689274344, "learning_rate": 3.7373737373737376e-05, "loss": 1.9462, "step": 127 }, { "epoch": 12.78003120124805, "grad_norm": 0.16640664781155742, "learning_rate": 3.686868686868687e-05, "loss": 1.9404, "step": 128 }, { "epoch": 12.8798751950078, "grad_norm": 0.17534875646136103, "learning_rate": 3.6363636363636364e-05, "loss": 1.9441, "step": 129 }, { "epoch": 12.97971918876755, "grad_norm": 0.1881647742956635, "learning_rate": 3.5858585858585855e-05, "loss": 1.9452, "step": 130 }, { "epoch": 13.0795631825273, "grad_norm": 0.21130090774448568, "learning_rate": 3.535353535353535e-05, "loss": 1.938, "step": 131 }, { "epoch": 13.179407176287052, "grad_norm": 0.19012207225624486, "learning_rate": 3.484848484848485e-05, "loss": 1.93, "step": 132 }, { "epoch": 13.279251170046802, "grad_norm": 0.19535583015453165, "learning_rate": 3.434343434343435e-05, "loss": 1.9423, "step": 133 }, { "epoch": 13.379095163806552, "grad_norm": 0.1972934873185412, "learning_rate": 3.3838383838383844e-05, "loss": 1.9449, "step": 134 }, { "epoch": 13.478939157566302, "grad_norm": 0.21172258190614646, "learning_rate": 3.3333333333333335e-05, "loss": 1.9423, "step": 135 }, { "epoch": 13.578783151326054, "grad_norm": 0.20243808248600392, "learning_rate": 3.282828282828283e-05, "loss": 1.9454, "step": 136 }, { "epoch": 13.678627145085803, "grad_norm": 0.29468220957824104, "learning_rate": 3.232323232323233e-05, "loss": 1.9329, "step": 137 }, { "epoch": 13.778471138845553, "grad_norm": 0.1852836649334086, "learning_rate": 3.181818181818182e-05, "loss": 1.9397, "step": 138 }, { "epoch": 13.878315132605305, "grad_norm": 0.17635021846243693, "learning_rate": 3.131313131313132e-05, "loss": 1.9414, "step": 139 }, { "epoch": 13.978159126365055, "grad_norm": 0.1837620268343685, "learning_rate": 3.080808080808081e-05, "loss": 1.9265, "step": 140 }, { "epoch": 14.078003120124805, "grad_norm": 0.1851416429157977, "learning_rate": 3.0303030303030306e-05, "loss": 1.938, "step": 141 }, { "epoch": 14.177847113884555, "grad_norm": 0.18177436704033564, "learning_rate": 2.9797979797979796e-05, "loss": 1.9338, "step": 142 }, { "epoch": 14.277691107644305, "grad_norm": 0.20249599488147646, "learning_rate": 2.9292929292929294e-05, "loss": 1.943, "step": 143 }, { "epoch": 14.377535101404057, "grad_norm": 0.1914943764672633, "learning_rate": 2.878787878787879e-05, "loss": 1.9381, "step": 144 }, { "epoch": 14.477379095163807, "grad_norm": 0.18144339446500468, "learning_rate": 2.8282828282828282e-05, "loss": 1.9493, "step": 145 }, { "epoch": 14.577223088923557, "grad_norm": 0.22871591479507436, "learning_rate": 2.777777777777778e-05, "loss": 1.9394, "step": 146 }, { "epoch": 14.677067082683307, "grad_norm": 0.2409736531836878, "learning_rate": 2.7272727272727273e-05, "loss": 1.9363, "step": 147 }, { "epoch": 14.776911076443058, "grad_norm": 0.21702411701682794, "learning_rate": 2.676767676767677e-05, "loss": 1.9324, "step": 148 }, { "epoch": 14.876755070202808, "grad_norm": 0.186963824720383, "learning_rate": 2.6262626262626268e-05, "loss": 1.9254, "step": 149 }, { "epoch": 14.976599063962558, "grad_norm": 0.20551876684974787, "learning_rate": 2.575757575757576e-05, "loss": 1.9385, "step": 150 }, { "epoch": 15.076443057722308, "grad_norm": 0.17794734645273458, "learning_rate": 2.5252525252525256e-05, "loss": 1.935, "step": 151 }, { "epoch": 15.17628705148206, "grad_norm": 0.19787955354426204, "learning_rate": 2.474747474747475e-05, "loss": 1.9286, "step": 152 }, { "epoch": 15.27613104524181, "grad_norm": 0.21663975391838738, "learning_rate": 2.4242424242424244e-05, "loss": 1.9274, "step": 153 }, { "epoch": 15.37597503900156, "grad_norm": 0.19056508068402894, "learning_rate": 2.3737373737373738e-05, "loss": 1.9328, "step": 154 }, { "epoch": 15.47581903276131, "grad_norm": 0.20643529046597323, "learning_rate": 2.3232323232323232e-05, "loss": 1.9374, "step": 155 }, { "epoch": 15.575663026521061, "grad_norm": 0.17428582721990332, "learning_rate": 2.272727272727273e-05, "loss": 1.9435, "step": 156 }, { "epoch": 15.675507020280811, "grad_norm": 0.17915807350384474, "learning_rate": 2.2222222222222223e-05, "loss": 1.9342, "step": 157 }, { "epoch": 15.775351014040561, "grad_norm": 0.17934386940217817, "learning_rate": 2.171717171717172e-05, "loss": 1.9252, "step": 158 }, { "epoch": 15.875195007800311, "grad_norm": 0.16971494417624172, "learning_rate": 2.1212121212121215e-05, "loss": 1.9333, "step": 159 }, { "epoch": 15.975039001560063, "grad_norm": 0.1710725442382166, "learning_rate": 2.070707070707071e-05, "loss": 1.9397, "step": 160 }, { "epoch": 16.07488299531981, "grad_norm": 0.16048331708079347, "learning_rate": 2.0202020202020203e-05, "loss": 1.9354, "step": 161 }, { "epoch": 16.174726989079563, "grad_norm": 0.2209212482793433, "learning_rate": 1.9696969696969697e-05, "loss": 1.9572, "step": 162 }, { "epoch": 16.274570982839315, "grad_norm": 0.17292517371584637, "learning_rate": 1.919191919191919e-05, "loss": 1.9384, "step": 163 }, { "epoch": 16.374414976599063, "grad_norm": 0.1756696399704993, "learning_rate": 1.8686868686868688e-05, "loss": 1.9287, "step": 164 }, { "epoch": 16.474258970358814, "grad_norm": 0.193814973934712, "learning_rate": 1.8181818181818182e-05, "loss": 1.9285, "step": 165 }, { "epoch": 16.574102964118566, "grad_norm": 0.21108116449806094, "learning_rate": 1.7676767676767676e-05, "loss": 1.9249, "step": 166 }, { "epoch": 16.673946957878314, "grad_norm": 0.164152325154632, "learning_rate": 1.7171717171717173e-05, "loss": 1.9335, "step": 167 }, { "epoch": 16.773790951638066, "grad_norm": 0.1934976757474289, "learning_rate": 1.6666666666666667e-05, "loss": 1.9344, "step": 168 }, { "epoch": 16.873634945397814, "grad_norm": 0.17861559674997443, "learning_rate": 1.6161616161616165e-05, "loss": 1.9315, "step": 169 }, { "epoch": 16.973478939157566, "grad_norm": 0.16812713496720977, "learning_rate": 1.565656565656566e-05, "loss": 1.9278, "step": 170 }, { "epoch": 17.073322932917318, "grad_norm": 0.19243202935397094, "learning_rate": 1.5151515151515153e-05, "loss": 1.939, "step": 171 }, { "epoch": 17.173166926677066, "grad_norm": 0.16546322416204856, "learning_rate": 1.4646464646464647e-05, "loss": 1.9295, "step": 172 }, { "epoch": 17.273010920436818, "grad_norm": 0.19615095413628908, "learning_rate": 1.4141414141414141e-05, "loss": 1.9357, "step": 173 }, { "epoch": 17.37285491419657, "grad_norm": 0.16562858231287156, "learning_rate": 1.3636363636363637e-05, "loss": 1.9372, "step": 174 }, { "epoch": 17.472698907956318, "grad_norm": 0.1755423564949021, "learning_rate": 1.3131313131313134e-05, "loss": 1.9208, "step": 175 }, { "epoch": 17.57254290171607, "grad_norm": 0.16572591523274388, "learning_rate": 1.2626262626262628e-05, "loss": 1.9196, "step": 176 }, { "epoch": 17.672386895475817, "grad_norm": 0.16066050812369387, "learning_rate": 1.2121212121212122e-05, "loss": 1.9379, "step": 177 }, { "epoch": 17.77223088923557, "grad_norm": 0.18230307180057742, "learning_rate": 1.1616161616161616e-05, "loss": 1.9344, "step": 178 }, { "epoch": 17.87207488299532, "grad_norm": 0.16147840026521357, "learning_rate": 1.1111111111111112e-05, "loss": 1.9249, "step": 179 }, { "epoch": 17.97191887675507, "grad_norm": 0.17234298543336798, "learning_rate": 1.0606060606060607e-05, "loss": 1.9341, "step": 180 }, { "epoch": 18.07176287051482, "grad_norm": 0.16952419332241464, "learning_rate": 1.0101010101010101e-05, "loss": 1.9382, "step": 181 }, { "epoch": 18.171606864274573, "grad_norm": 0.17503197241676455, "learning_rate": 9.595959595959595e-06, "loss": 1.9277, "step": 182 }, { "epoch": 18.27145085803432, "grad_norm": 0.16018657280969506, "learning_rate": 9.090909090909091e-06, "loss": 1.9259, "step": 183 }, { "epoch": 18.371294851794072, "grad_norm": 0.16577134954028483, "learning_rate": 8.585858585858587e-06, "loss": 1.9391, "step": 184 }, { "epoch": 18.47113884555382, "grad_norm": 0.1758462044127833, "learning_rate": 8.080808080808082e-06, "loss": 1.9316, "step": 185 }, { "epoch": 18.570982839313572, "grad_norm": 0.16928715932805172, "learning_rate": 7.5757575757575764e-06, "loss": 1.9218, "step": 186 }, { "epoch": 18.670826833073324, "grad_norm": 0.16185874983512785, "learning_rate": 7.0707070707070704e-06, "loss": 1.9244, "step": 187 }, { "epoch": 18.770670826833072, "grad_norm": 0.16445906712178507, "learning_rate": 6.565656565656567e-06, "loss": 1.9425, "step": 188 }, { "epoch": 18.870514820592824, "grad_norm": 0.16313460189322437, "learning_rate": 6.060606060606061e-06, "loss": 1.9336, "step": 189 }, { "epoch": 18.970358814352576, "grad_norm": 0.15990081630753986, "learning_rate": 5.555555555555556e-06, "loss": 1.9178, "step": 190 }, { "epoch": 19.070202808112324, "grad_norm": 0.16547636636850527, "learning_rate": 5.050505050505051e-06, "loss": 1.9281, "step": 191 }, { "epoch": 19.170046801872076, "grad_norm": 0.1625270231867559, "learning_rate": 4.5454545454545455e-06, "loss": 1.9348, "step": 192 }, { "epoch": 19.269890795631824, "grad_norm": 0.16385675767663568, "learning_rate": 4.040404040404041e-06, "loss": 1.9305, "step": 193 }, { "epoch": 19.369734789391575, "grad_norm": 0.16718542619114216, "learning_rate": 3.5353535353535352e-06, "loss": 1.9376, "step": 194 }, { "epoch": 19.469578783151327, "grad_norm": 0.16595125072244407, "learning_rate": 3.0303030303030305e-06, "loss": 1.9264, "step": 195 }, { "epoch": 19.569422776911075, "grad_norm": 0.16912445317015737, "learning_rate": 2.5252525252525253e-06, "loss": 1.9252, "step": 196 }, { "epoch": 19.669266770670827, "grad_norm": 0.15257787442711698, "learning_rate": 2.0202020202020206e-06, "loss": 1.9312, "step": 197 }, { "epoch": 19.76911076443058, "grad_norm": 0.17270934725449602, "learning_rate": 1.5151515151515152e-06, "loss": 1.9241, "step": 198 }, { "epoch": 19.868954758190327, "grad_norm": 0.16771403116909167, "learning_rate": 1.0101010101010103e-06, "loss": 1.9342, "step": 199 }, { "epoch": 19.96879875195008, "grad_norm": 0.17132458008674775, "learning_rate": 5.050505050505052e-07, "loss": 1.9347, "step": 200 } ], "logging_steps": 1.0, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }