{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 19.96879875195008,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0998439937597504,
      "grad_norm": 0.4140388733942774,
      "learning_rate": 0.0,
      "loss": 1.9149,
      "step": 1
    },
    {
      "epoch": 0.1996879875195008,
      "grad_norm": 4.125320422125423,
      "learning_rate": 0.0001,
      "loss": 2.0842,
      "step": 2
    },
    {
      "epoch": 0.2995319812792512,
      "grad_norm": 4.114561571796027,
      "learning_rate": 0.0001,
      "loss": 2.0845,
      "step": 3
    },
    {
      "epoch": 0.3993759750390016,
      "grad_norm": 7.814135082802521,
      "learning_rate": 9.94949494949495e-05,
      "loss": 2.2529,
      "step": 4
    },
    {
      "epoch": 0.49921996879875197,
      "grad_norm": 4.959578007928057,
      "learning_rate": 9.8989898989899e-05,
      "loss": 2.1802,
      "step": 5
    },
    {
      "epoch": 0.5990639625585024,
      "grad_norm": 4.020980884960991,
      "learning_rate": 9.848484848484849e-05,
      "loss": 2.0996,
      "step": 6
    },
    {
      "epoch": 0.6989079563182528,
      "grad_norm": 2.913148956691379,
      "learning_rate": 9.797979797979798e-05,
      "loss": 2.0739,
      "step": 7
    },
    {
      "epoch": 0.7987519500780031,
      "grad_norm": 1.5904378694728827,
      "learning_rate": 9.747474747474747e-05,
      "loss": 2.0438,
      "step": 8
    },
    {
      "epoch": 0.8985959438377535,
      "grad_norm": 1.1120971271683253,
      "learning_rate": 9.696969696969698e-05,
      "loss": 2.023,
      "step": 9
    },
    {
      "epoch": 0.9984399375975039,
      "grad_norm": 1.4857926676190432,
      "learning_rate": 9.646464646464647e-05,
      "loss": 2.0256,
      "step": 10
    },
    {
      "epoch": 1.0982839313572543,
      "grad_norm": 1.7514837650567026,
      "learning_rate": 9.595959595959596e-05,
      "loss": 2.0297,
      "step": 11
    },
    {
      "epoch": 1.1981279251170047,
      "grad_norm": 1.3684063083659384,
      "learning_rate": 9.545454545454546e-05,
      "loss": 1.9984,
      "step": 12
    },
    {
      "epoch": 1.2979719188767551,
      "grad_norm": 0.8391319984906789,
      "learning_rate": 9.494949494949495e-05,
      "loss": 1.9938,
      "step": 13
    },
    {
      "epoch": 1.3978159126365055,
      "grad_norm": 0.7680152250335479,
      "learning_rate": 9.444444444444444e-05,
      "loss": 1.9984,
      "step": 14
    },
    {
      "epoch": 1.497659906396256,
      "grad_norm": 1.1427900590537006,
      "learning_rate": 9.393939393939395e-05,
      "loss": 2.0059,
      "step": 15
    },
    {
      "epoch": 1.5975039001560063,
      "grad_norm": 1.2852588832884364,
      "learning_rate": 9.343434343434344e-05,
      "loss": 2.0057,
      "step": 16
    },
    {
      "epoch": 1.6973478939157567,
      "grad_norm": 0.8509981577726656,
      "learning_rate": 9.292929292929293e-05,
      "loss": 1.97,
      "step": 17
    },
    {
      "epoch": 1.797191887675507,
      "grad_norm": 0.4374249257660765,
      "learning_rate": 9.242424242424242e-05,
      "loss": 1.9852,
      "step": 18
    },
    {
      "epoch": 1.8970358814352575,
      "grad_norm": 1.006945747433108,
      "learning_rate": 9.191919191919192e-05,
      "loss": 1.9736,
      "step": 19
    },
    {
      "epoch": 1.9968798751950079,
      "grad_norm": 1.1714326150671521,
      "learning_rate": 9.141414141414141e-05,
      "loss": 1.9866,
      "step": 20
    },
    {
      "epoch": 2.0967238689547583,
      "grad_norm": 0.6697915843016325,
      "learning_rate": 9.090909090909092e-05,
      "loss": 1.9669,
      "step": 21
    },
    {
      "epoch": 2.1965678627145087,
      "grad_norm": 0.43542954442572934,
      "learning_rate": 9.040404040404041e-05,
      "loss": 1.9596,
      "step": 22
    },
    {
      "epoch": 2.296411856474259,
      "grad_norm": 0.8895989581186896,
      "learning_rate": 8.98989898989899e-05,
      "loss": 1.9777,
      "step": 23
    },
    {
      "epoch": 2.3962558502340094,
      "grad_norm": 0.748473401890919,
      "learning_rate": 8.93939393939394e-05,
      "loss": 1.9828,
      "step": 24
    },
    {
      "epoch": 2.49609984399376,
      "grad_norm": 0.4762840239068188,
      "learning_rate": 8.888888888888889e-05,
      "loss": 1.9863,
      "step": 25
    },
    {
      "epoch": 2.5959438377535102,
      "grad_norm": 0.4634914120924797,
      "learning_rate": 8.83838383838384e-05,
      "loss": 1.9728,
      "step": 26
    },
    {
      "epoch": 2.6957878315132606,
      "grad_norm": 0.576721007312459,
      "learning_rate": 8.787878787878789e-05,
      "loss": 1.9813,
      "step": 27
    },
    {
      "epoch": 2.795631825273011,
      "grad_norm": 0.4717088615288276,
      "learning_rate": 8.737373737373738e-05,
      "loss": 1.9709,
      "step": 28
    },
    {
      "epoch": 2.8954758190327614,
      "grad_norm": 0.5243076095653101,
      "learning_rate": 8.686868686868688e-05,
      "loss": 1.9889,
      "step": 29
    },
    {
      "epoch": 2.995319812792512,
      "grad_norm": 0.35563844256479116,
      "learning_rate": 8.636363636363637e-05,
      "loss": 1.969,
      "step": 30
    },
    {
      "epoch": 3.095163806552262,
      "grad_norm": 0.5040313506272054,
      "learning_rate": 8.585858585858586e-05,
      "loss": 1.9701,
      "step": 31
    },
    {
      "epoch": 3.1950078003120126,
      "grad_norm": 0.5293887294443628,
      "learning_rate": 8.535353535353535e-05,
      "loss": 1.9774,
      "step": 32
    },
    {
      "epoch": 3.294851794071763,
      "grad_norm": 0.33336016676106733,
      "learning_rate": 8.484848484848486e-05,
      "loss": 1.9702,
      "step": 33
    },
    {
      "epoch": 3.3946957878315134,
      "grad_norm": 0.5156182664373749,
      "learning_rate": 8.434343434343435e-05,
      "loss": 1.9552,
      "step": 34
    },
    {
      "epoch": 3.4945397815912638,
      "grad_norm": 0.410792592829029,
      "learning_rate": 8.383838383838384e-05,
      "loss": 1.9642,
      "step": 35
    },
    {
      "epoch": 3.594383775351014,
      "grad_norm": 0.40267682408922495,
      "learning_rate": 8.333333333333334e-05,
      "loss": 1.9688,
      "step": 36
    },
    {
      "epoch": 3.6942277691107646,
      "grad_norm": 0.3869359148412346,
      "learning_rate": 8.282828282828283e-05,
      "loss": 1.9733,
      "step": 37
    },
    {
      "epoch": 3.794071762870515,
      "grad_norm": 0.37728712869432585,
      "learning_rate": 8.232323232323233e-05,
      "loss": 1.9648,
      "step": 38
    },
    {
      "epoch": 3.8939157566302653,
      "grad_norm": 0.3922418131207954,
      "learning_rate": 8.181818181818183e-05,
      "loss": 1.9689,
      "step": 39
    },
    {
      "epoch": 3.9937597503900157,
      "grad_norm": 0.26353046722639645,
      "learning_rate": 8.131313131313132e-05,
      "loss": 1.9727,
      "step": 40
    },
    {
      "epoch": 4.093603744149766,
      "grad_norm": 0.3911091474488452,
      "learning_rate": 8.080808080808081e-05,
      "loss": 1.9631,
      "step": 41
    },
    {
      "epoch": 4.1934477379095165,
      "grad_norm": 0.33402240826623614,
      "learning_rate": 8.03030303030303e-05,
      "loss": 1.9665,
      "step": 42
    },
    {
      "epoch": 4.2932917316692665,
      "grad_norm": 0.34654808232868395,
      "learning_rate": 7.97979797979798e-05,
      "loss": 1.9646,
      "step": 43
    },
    {
      "epoch": 4.393135725429017,
      "grad_norm": 0.3031078864703629,
      "learning_rate": 7.92929292929293e-05,
      "loss": 1.9693,
      "step": 44
    },
    {
      "epoch": 4.492979719188767,
      "grad_norm": 0.35342072957234116,
      "learning_rate": 7.878787878787879e-05,
      "loss": 1.9688,
      "step": 45
    },
    {
      "epoch": 4.592823712948518,
      "grad_norm": 0.3918161921811716,
      "learning_rate": 7.828282828282829e-05,
      "loss": 1.9609,
      "step": 46
    },
    {
      "epoch": 4.692667706708268,
      "grad_norm": 0.24995683506017796,
      "learning_rate": 7.777777777777778e-05,
      "loss": 1.9515,
      "step": 47
    },
    {
      "epoch": 4.792511700468019,
      "grad_norm": 0.3308078104166398,
      "learning_rate": 7.727272727272727e-05,
      "loss": 1.9607,
      "step": 48
    },
    {
      "epoch": 4.892355694227769,
      "grad_norm": 0.3130926472973521,
      "learning_rate": 7.676767676767676e-05,
      "loss": 1.9699,
      "step": 49
    },
    {
      "epoch": 4.99219968798752,
      "grad_norm": 0.30892356920484393,
      "learning_rate": 7.626262626262627e-05,
      "loss": 1.9645,
      "step": 50
    },
    {
      "epoch": 5.09204368174727,
      "grad_norm": 0.2804202715276883,
      "learning_rate": 7.575757575757576e-05,
      "loss": 1.9569,
      "step": 51
    },
    {
      "epoch": 5.1918876755070205,
      "grad_norm": 0.2789049399636327,
      "learning_rate": 7.525252525252525e-05,
      "loss": 1.9585,
      "step": 52
    },
    {
      "epoch": 5.29173166926677,
      "grad_norm": 0.2906929505804403,
      "learning_rate": 7.474747474747475e-05,
      "loss": 1.9565,
      "step": 53
    },
    {
      "epoch": 5.391575663026521,
      "grad_norm": 0.2033727950080347,
      "learning_rate": 7.424242424242424e-05,
      "loss": 1.9755,
      "step": 54
    },
    {
      "epoch": 5.491419656786271,
      "grad_norm": 0.31364461369416163,
      "learning_rate": 7.373737373737373e-05,
      "loss": 1.9647,
      "step": 55
    },
    {
      "epoch": 5.591263650546022,
      "grad_norm": 0.2531087381531638,
      "learning_rate": 7.323232323232324e-05,
      "loss": 1.9578,
      "step": 56
    },
    {
      "epoch": 5.691107644305772,
      "grad_norm": 0.23764498764830225,
      "learning_rate": 7.272727272727273e-05,
      "loss": 1.9617,
      "step": 57
    },
    {
      "epoch": 5.790951638065523,
      "grad_norm": 0.24888591334854687,
      "learning_rate": 7.222222222222222e-05,
      "loss": 1.963,
      "step": 58
    },
    {
      "epoch": 5.890795631825273,
      "grad_norm": 0.2647075657405339,
      "learning_rate": 7.171717171717171e-05,
      "loss": 1.9685,
      "step": 59
    },
    {
      "epoch": 5.990639625585024,
      "grad_norm": 0.27820470985704615,
      "learning_rate": 7.121212121212121e-05,
      "loss": 1.9654,
      "step": 60
    },
    {
      "epoch": 6.090483619344774,
      "grad_norm": 0.20068946885468097,
      "learning_rate": 7.07070707070707e-05,
      "loss": 1.9667,
      "step": 61
    },
    {
      "epoch": 6.190327613104524,
      "grad_norm": 0.25026234630326394,
      "learning_rate": 7.020202020202021e-05,
      "loss": 1.9542,
      "step": 62
    },
    {
      "epoch": 6.290171606864274,
      "grad_norm": 0.22856925269883635,
      "learning_rate": 6.96969696969697e-05,
      "loss": 1.9573,
      "step": 63
    },
    {
      "epoch": 6.390015600624025,
      "grad_norm": 0.2392183076591563,
      "learning_rate": 6.91919191919192e-05,
      "loss": 1.9647,
      "step": 64
    },
    {
      "epoch": 6.489859594383775,
      "grad_norm": 0.20384525102843132,
      "learning_rate": 6.86868686868687e-05,
      "loss": 1.9628,
      "step": 65
    },
    {
      "epoch": 6.589703588143526,
      "grad_norm": 0.23941897200051984,
      "learning_rate": 6.818181818181818e-05,
      "loss": 1.9667,
      "step": 66
    },
    {
      "epoch": 6.689547581903276,
      "grad_norm": 0.20375278551444306,
      "learning_rate": 6.767676767676769e-05,
      "loss": 1.9572,
      "step": 67
    },
    {
      "epoch": 6.789391575663027,
      "grad_norm": 0.20727005267599333,
      "learning_rate": 6.717171717171718e-05,
      "loss": 1.9581,
      "step": 68
    },
    {
      "epoch": 6.889235569422777,
      "grad_norm": 0.22300809533132504,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.9693,
      "step": 69
    },
    {
      "epoch": 6.9890795631825275,
      "grad_norm": 0.21742318730398613,
      "learning_rate": 6.616161616161617e-05,
      "loss": 1.9656,
      "step": 70
    },
    {
      "epoch": 7.0889235569422775,
      "grad_norm": 0.20343822391223,
      "learning_rate": 6.565656565656566e-05,
      "loss": 1.9656,
      "step": 71
    },
    {
      "epoch": 7.188767550702028,
      "grad_norm": 0.2364200066637671,
      "learning_rate": 6.515151515151516e-05,
      "loss": 1.95,
      "step": 72
    },
    {
      "epoch": 7.288611544461778,
      "grad_norm": 0.18261048615751524,
      "learning_rate": 6.464646464646466e-05,
      "loss": 1.9538,
      "step": 73
    },
    {
      "epoch": 7.388455538221529,
      "grad_norm": 0.24533487813163474,
      "learning_rate": 6.414141414141415e-05,
      "loss": 1.952,
      "step": 74
    },
    {
      "epoch": 7.488299531981279,
      "grad_norm": 0.2539612930496735,
      "learning_rate": 6.363636363636364e-05,
      "loss": 1.9598,
      "step": 75
    },
    {
      "epoch": 7.58814352574103,
      "grad_norm": 0.2991457603613546,
      "learning_rate": 6.313131313131313e-05,
      "loss": 1.9733,
      "step": 76
    },
    {
      "epoch": 7.68798751950078,
      "grad_norm": 0.2209105824741669,
      "learning_rate": 6.262626262626264e-05,
      "loss": 1.9531,
      "step": 77
    },
    {
      "epoch": 7.787831513260531,
      "grad_norm": 0.31698310918964695,
      "learning_rate": 6.212121212121213e-05,
      "loss": 1.964,
      "step": 78
    },
    {
      "epoch": 7.887675507020281,
      "grad_norm": 0.17584276182725114,
      "learning_rate": 6.161616161616162e-05,
      "loss": 1.9644,
      "step": 79
    },
    {
      "epoch": 7.9875195007800315,
      "grad_norm": 0.29996919622824225,
      "learning_rate": 6.111111111111112e-05,
      "loss": 1.9555,
      "step": 80
    },
    {
      "epoch": 8.087363494539781,
      "grad_norm": 0.2406502202676367,
      "learning_rate": 6.060606060606061e-05,
      "loss": 1.9554,
      "step": 81
    },
    {
      "epoch": 8.187207488299531,
      "grad_norm": 0.32705142732170855,
      "learning_rate": 6.01010101010101e-05,
      "loss": 1.9517,
      "step": 82
    },
    {
      "epoch": 8.287051482059283,
      "grad_norm": 0.27249925952338305,
      "learning_rate": 5.959595959595959e-05,
      "loss": 1.9474,
      "step": 83
    },
    {
      "epoch": 8.386895475819033,
      "grad_norm": 0.29448831027669287,
      "learning_rate": 5.90909090909091e-05,
      "loss": 1.9459,
      "step": 84
    },
    {
      "epoch": 8.486739469578783,
      "grad_norm": 0.29998154037028857,
      "learning_rate": 5.858585858585859e-05,
      "loss": 1.9606,
      "step": 85
    },
    {
      "epoch": 8.586583463338533,
      "grad_norm": 0.23153724936859055,
      "learning_rate": 5.808080808080808e-05,
      "loss": 1.9598,
      "step": 86
    },
    {
      "epoch": 8.686427457098285,
      "grad_norm": 0.22081595887056477,
      "learning_rate": 5.757575757575758e-05,
      "loss": 1.9586,
      "step": 87
    },
    {
      "epoch": 8.786271450858035,
      "grad_norm": 0.19177670537863922,
      "learning_rate": 5.707070707070707e-05,
      "loss": 1.9715,
      "step": 88
    },
    {
      "epoch": 8.886115444617785,
      "grad_norm": 0.25725928107907137,
      "learning_rate": 5.6565656565656563e-05,
      "loss": 1.9602,
      "step": 89
    },
    {
      "epoch": 8.985959438377535,
      "grad_norm": 0.26044371305524344,
      "learning_rate": 5.606060606060606e-05,
      "loss": 1.9607,
      "step": 90
    },
    {
      "epoch": 9.085803432137286,
      "grad_norm": 0.23728151561491595,
      "learning_rate": 5.555555555555556e-05,
      "loss": 1.9588,
      "step": 91
    },
    {
      "epoch": 9.185647425897036,
      "grad_norm": 0.20354348868729488,
      "learning_rate": 5.5050505050505056e-05,
      "loss": 1.9492,
      "step": 92
    },
    {
      "epoch": 9.285491419656786,
      "grad_norm": 0.18672087839741056,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 1.9457,
      "step": 93
    },
    {
      "epoch": 9.385335413416536,
      "grad_norm": 0.1939858201242329,
      "learning_rate": 5.4040404040404044e-05,
      "loss": 1.9453,
      "step": 94
    },
    {
      "epoch": 9.485179407176288,
      "grad_norm": 0.19172060706771135,
      "learning_rate": 5.353535353535354e-05,
      "loss": 1.958,
      "step": 95
    },
    {
      "epoch": 9.585023400936038,
      "grad_norm": 0.1837920882880991,
      "learning_rate": 5.303030303030303e-05,
      "loss": 1.9577,
      "step": 96
    },
    {
      "epoch": 9.684867394695788,
      "grad_norm": 0.2162949878555464,
      "learning_rate": 5.2525252525252536e-05,
      "loss": 1.9622,
      "step": 97
    },
    {
      "epoch": 9.784711388455538,
      "grad_norm": 0.19325381586186333,
      "learning_rate": 5.2020202020202026e-05,
      "loss": 1.9433,
      "step": 98
    },
    {
      "epoch": 9.88455538221529,
      "grad_norm": 0.2018142831658023,
      "learning_rate": 5.151515151515152e-05,
      "loss": 1.9605,
      "step": 99
    },
    {
      "epoch": 9.98439937597504,
      "grad_norm": 0.176671565601027,
      "learning_rate": 5.101010101010101e-05,
      "loss": 1.9578,
      "step": 100
    },
    {
      "epoch": 10.08424336973479,
      "grad_norm": 0.2117788085352089,
      "learning_rate": 5.050505050505051e-05,
      "loss": 1.9478,
      "step": 101
    },
    {
      "epoch": 10.18408736349454,
      "grad_norm": 0.1816135304249716,
      "learning_rate": 5e-05,
      "loss": 1.9423,
      "step": 102
    },
    {
      "epoch": 10.283931357254291,
      "grad_norm": 0.2680310363226074,
      "learning_rate": 4.94949494949495e-05,
      "loss": 1.9519,
      "step": 103
    },
    {
      "epoch": 10.383775351014041,
      "grad_norm": 0.17934299698555412,
      "learning_rate": 4.898989898989899e-05,
      "loss": 1.9625,
      "step": 104
    },
    {
      "epoch": 10.48361934477379,
      "grad_norm": 0.19786074542682824,
      "learning_rate": 4.848484848484849e-05,
      "loss": 1.95,
      "step": 105
    },
    {
      "epoch": 10.58346333853354,
      "grad_norm": 0.17490489580858018,
      "learning_rate": 4.797979797979798e-05,
      "loss": 1.9513,
      "step": 106
    },
    {
      "epoch": 10.683307332293293,
      "grad_norm": 0.224513887757472,
      "learning_rate": 4.7474747474747476e-05,
      "loss": 1.9499,
      "step": 107
    },
    {
      "epoch": 10.783151326053042,
      "grad_norm": 0.16993980203530532,
      "learning_rate": 4.696969696969697e-05,
      "loss": 1.944,
      "step": 108
    },
    {
      "epoch": 10.882995319812792,
      "grad_norm": 0.18436224376063975,
      "learning_rate": 4.6464646464646464e-05,
      "loss": 1.9494,
      "step": 109
    },
    {
      "epoch": 10.982839313572542,
      "grad_norm": 0.1858801208244774,
      "learning_rate": 4.595959595959596e-05,
      "loss": 1.9504,
      "step": 110
    },
    {
      "epoch": 11.082683307332294,
      "grad_norm": 0.21397140437157122,
      "learning_rate": 4.545454545454546e-05,
      "loss": 1.9467,
      "step": 111
    },
    {
      "epoch": 11.182527301092044,
      "grad_norm": 0.16934479906947233,
      "learning_rate": 4.494949494949495e-05,
      "loss": 1.9475,
      "step": 112
    },
    {
      "epoch": 11.282371294851794,
      "grad_norm": 0.17710883760980722,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.9418,
      "step": 113
    },
    {
      "epoch": 11.382215288611544,
      "grad_norm": 0.2278025006688675,
      "learning_rate": 4.3939393939393944e-05,
      "loss": 1.9456,
      "step": 114
    },
    {
      "epoch": 11.482059282371296,
      "grad_norm": 0.18727166458531316,
      "learning_rate": 4.343434343434344e-05,
      "loss": 1.9408,
      "step": 115
    },
    {
      "epoch": 11.581903276131046,
      "grad_norm": 0.17348080665741175,
      "learning_rate": 4.292929292929293e-05,
      "loss": 1.9367,
      "step": 116
    },
    {
      "epoch": 11.681747269890796,
      "grad_norm": 0.21559975863343248,
      "learning_rate": 4.242424242424243e-05,
      "loss": 1.9509,
      "step": 117
    },
    {
      "epoch": 11.781591263650546,
      "grad_norm": 0.20515384184563593,
      "learning_rate": 4.191919191919192e-05,
      "loss": 1.9503,
      "step": 118
    },
    {
      "epoch": 11.881435257410295,
      "grad_norm": 0.17579996751101729,
      "learning_rate": 4.141414141414142e-05,
      "loss": 1.9443,
      "step": 119
    },
    {
      "epoch": 11.981279251170047,
      "grad_norm": 0.1870399234707776,
      "learning_rate": 4.0909090909090915e-05,
      "loss": 1.9507,
      "step": 120
    },
    {
      "epoch": 12.081123244929797,
      "grad_norm": 0.2323975590399996,
      "learning_rate": 4.0404040404040405e-05,
      "loss": 1.9486,
      "step": 121
    },
    {
      "epoch": 12.180967238689547,
      "grad_norm": 0.17332911391024705,
      "learning_rate": 3.98989898989899e-05,
      "loss": 1.9441,
      "step": 122
    },
    {
      "epoch": 12.280811232449299,
      "grad_norm": 0.23886491083540215,
      "learning_rate": 3.939393939393939e-05,
      "loss": 1.9489,
      "step": 123
    },
    {
      "epoch": 12.380655226209049,
      "grad_norm": 0.192192583869745,
      "learning_rate": 3.888888888888889e-05,
      "loss": 1.936,
      "step": 124
    },
    {
      "epoch": 12.480499219968799,
      "grad_norm": 0.24070020033146947,
      "learning_rate": 3.838383838383838e-05,
      "loss": 1.9363,
      "step": 125
    },
    {
      "epoch": 12.580343213728549,
      "grad_norm": 0.17061145664967614,
      "learning_rate": 3.787878787878788e-05,
      "loss": 1.947,
      "step": 126
    },
    {
      "epoch": 12.680187207488299,
      "grad_norm": 0.20420044689274344,
      "learning_rate": 3.7373737373737376e-05,
      "loss": 1.9462,
      "step": 127
    },
    {
      "epoch": 12.78003120124805,
      "grad_norm": 0.16640664781155742,
      "learning_rate": 3.686868686868687e-05,
      "loss": 1.9404,
      "step": 128
    },
    {
      "epoch": 12.8798751950078,
      "grad_norm": 0.17534875646136103,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 1.9441,
      "step": 129
    },
    {
      "epoch": 12.97971918876755,
      "grad_norm": 0.1881647742956635,
      "learning_rate": 3.5858585858585855e-05,
      "loss": 1.9452,
      "step": 130
    },
    {
      "epoch": 13.0795631825273,
      "grad_norm": 0.21130090774448568,
      "learning_rate": 3.535353535353535e-05,
      "loss": 1.938,
      "step": 131
    },
    {
      "epoch": 13.179407176287052,
      "grad_norm": 0.19012207225624486,
      "learning_rate": 3.484848484848485e-05,
      "loss": 1.93,
      "step": 132
    },
    {
      "epoch": 13.279251170046802,
      "grad_norm": 0.19535583015453165,
      "learning_rate": 3.434343434343435e-05,
      "loss": 1.9423,
      "step": 133
    },
    {
      "epoch": 13.379095163806552,
      "grad_norm": 0.1972934873185412,
      "learning_rate": 3.3838383838383844e-05,
      "loss": 1.9449,
      "step": 134
    },
    {
      "epoch": 13.478939157566302,
      "grad_norm": 0.21172258190614646,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.9423,
      "step": 135
    },
    {
      "epoch": 13.578783151326054,
      "grad_norm": 0.20243808248600392,
      "learning_rate": 3.282828282828283e-05,
      "loss": 1.9454,
      "step": 136
    },
    {
      "epoch": 13.678627145085803,
      "grad_norm": 0.29468220957824104,
      "learning_rate": 3.232323232323233e-05,
      "loss": 1.9329,
      "step": 137
    },
    {
      "epoch": 13.778471138845553,
      "grad_norm": 0.1852836649334086,
      "learning_rate": 3.181818181818182e-05,
      "loss": 1.9397,
      "step": 138
    },
    {
      "epoch": 13.878315132605305,
      "grad_norm": 0.17635021846243693,
      "learning_rate": 3.131313131313132e-05,
      "loss": 1.9414,
      "step": 139
    },
    {
      "epoch": 13.978159126365055,
      "grad_norm": 0.1837620268343685,
      "learning_rate": 3.080808080808081e-05,
      "loss": 1.9265,
      "step": 140
    },
    {
      "epoch": 14.078003120124805,
      "grad_norm": 0.1851416429157977,
      "learning_rate": 3.0303030303030306e-05,
      "loss": 1.938,
      "step": 141
    },
    {
      "epoch": 14.177847113884555,
      "grad_norm": 0.18177436704033564,
      "learning_rate": 2.9797979797979796e-05,
      "loss": 1.9338,
      "step": 142
    },
    {
      "epoch": 14.277691107644305,
      "grad_norm": 0.20249599488147646,
      "learning_rate": 2.9292929292929294e-05,
      "loss": 1.943,
      "step": 143
    },
    {
      "epoch": 14.377535101404057,
      "grad_norm": 0.1914943764672633,
      "learning_rate": 2.878787878787879e-05,
      "loss": 1.9381,
      "step": 144
    },
    {
      "epoch": 14.477379095163807,
      "grad_norm": 0.18144339446500468,
      "learning_rate": 2.8282828282828282e-05,
      "loss": 1.9493,
      "step": 145
    },
    {
      "epoch": 14.577223088923557,
      "grad_norm": 0.22871591479507436,
      "learning_rate": 2.777777777777778e-05,
      "loss": 1.9394,
      "step": 146
    },
    {
      "epoch": 14.677067082683307,
      "grad_norm": 0.2409736531836878,
      "learning_rate": 2.7272727272727273e-05,
      "loss": 1.9363,
      "step": 147
    },
    {
      "epoch": 14.776911076443058,
      "grad_norm": 0.21702411701682794,
      "learning_rate": 2.676767676767677e-05,
      "loss": 1.9324,
      "step": 148
    },
    {
      "epoch": 14.876755070202808,
      "grad_norm": 0.186963824720383,
      "learning_rate": 2.6262626262626268e-05,
      "loss": 1.9254,
      "step": 149
    },
    {
      "epoch": 14.976599063962558,
      "grad_norm": 0.20551876684974787,
      "learning_rate": 2.575757575757576e-05,
      "loss": 1.9385,
      "step": 150
    },
    {
      "epoch": 15.076443057722308,
      "grad_norm": 0.17794734645273458,
      "learning_rate": 2.5252525252525256e-05,
      "loss": 1.935,
      "step": 151
    },
    {
      "epoch": 15.17628705148206,
      "grad_norm": 0.19787955354426204,
      "learning_rate": 2.474747474747475e-05,
      "loss": 1.9286,
      "step": 152
    },
    {
      "epoch": 15.27613104524181,
      "grad_norm": 0.21663975391838738,
      "learning_rate": 2.4242424242424244e-05,
      "loss": 1.9274,
      "step": 153
    },
    {
      "epoch": 15.37597503900156,
      "grad_norm": 0.19056508068402894,
      "learning_rate": 2.3737373737373738e-05,
      "loss": 1.9328,
      "step": 154
    },
    {
      "epoch": 15.47581903276131,
      "grad_norm": 0.20643529046597323,
      "learning_rate": 2.3232323232323232e-05,
      "loss": 1.9374,
      "step": 155
    },
    {
      "epoch": 15.575663026521061,
      "grad_norm": 0.17428582721990332,
      "learning_rate": 2.272727272727273e-05,
      "loss": 1.9435,
      "step": 156
    },
    {
      "epoch": 15.675507020280811,
      "grad_norm": 0.17915807350384474,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 1.9342,
      "step": 157
    },
    {
      "epoch": 15.775351014040561,
      "grad_norm": 0.17934386940217817,
      "learning_rate": 2.171717171717172e-05,
      "loss": 1.9252,
      "step": 158
    },
    {
      "epoch": 15.875195007800311,
      "grad_norm": 0.16971494417624172,
      "learning_rate": 2.1212121212121215e-05,
      "loss": 1.9333,
      "step": 159
    },
    {
      "epoch": 15.975039001560063,
      "grad_norm": 0.1710725442382166,
      "learning_rate": 2.070707070707071e-05,
      "loss": 1.9397,
      "step": 160
    },
    {
      "epoch": 16.07488299531981,
      "grad_norm": 0.16048331708079347,
      "learning_rate": 2.0202020202020203e-05,
      "loss": 1.9354,
      "step": 161
    },
    {
      "epoch": 16.174726989079563,
      "grad_norm": 0.2209212482793433,
      "learning_rate": 1.9696969696969697e-05,
      "loss": 1.9572,
      "step": 162
    },
    {
      "epoch": 16.274570982839315,
      "grad_norm": 0.17292517371584637,
      "learning_rate": 1.919191919191919e-05,
      "loss": 1.9384,
      "step": 163
    },
    {
      "epoch": 16.374414976599063,
      "grad_norm": 0.1756696399704993,
      "learning_rate": 1.8686868686868688e-05,
      "loss": 1.9287,
      "step": 164
    },
    {
      "epoch": 16.474258970358814,
      "grad_norm": 0.193814973934712,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 1.9285,
      "step": 165
    },
    {
      "epoch": 16.574102964118566,
      "grad_norm": 0.21108116449806094,
      "learning_rate": 1.7676767676767676e-05,
      "loss": 1.9249,
      "step": 166
    },
    {
      "epoch": 16.673946957878314,
      "grad_norm": 0.164152325154632,
      "learning_rate": 1.7171717171717173e-05,
      "loss": 1.9335,
      "step": 167
    },
    {
      "epoch": 16.773790951638066,
      "grad_norm": 0.1934976757474289,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.9344,
      "step": 168
    },
    {
      "epoch": 16.873634945397814,
      "grad_norm": 0.17861559674997443,
      "learning_rate": 1.6161616161616165e-05,
      "loss": 1.9315,
      "step": 169
    },
    {
      "epoch": 16.973478939157566,
      "grad_norm": 0.16812713496720977,
      "learning_rate": 1.565656565656566e-05,
      "loss": 1.9278,
      "step": 170
    },
    {
      "epoch": 17.073322932917318,
      "grad_norm": 0.19243202935397094,
      "learning_rate": 1.5151515151515153e-05,
      "loss": 1.939,
      "step": 171
    },
    {
      "epoch": 17.173166926677066,
      "grad_norm": 0.16546322416204856,
      "learning_rate": 1.4646464646464647e-05,
      "loss": 1.9295,
      "step": 172
    },
    {
      "epoch": 17.273010920436818,
      "grad_norm": 0.19615095413628908,
      "learning_rate": 1.4141414141414141e-05,
      "loss": 1.9357,
      "step": 173
    },
    {
      "epoch": 17.37285491419657,
      "grad_norm": 0.16562858231287156,
      "learning_rate": 1.3636363636363637e-05,
      "loss": 1.9372,
      "step": 174
    },
    {
      "epoch": 17.472698907956318,
      "grad_norm": 0.1755423564949021,
      "learning_rate": 1.3131313131313134e-05,
      "loss": 1.9208,
      "step": 175
    },
    {
      "epoch": 17.57254290171607,
      "grad_norm": 0.16572591523274388,
      "learning_rate": 1.2626262626262628e-05,
      "loss": 1.9196,
      "step": 176
    },
    {
      "epoch": 17.672386895475817,
      "grad_norm": 0.16066050812369387,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 1.9379,
      "step": 177
    },
    {
      "epoch": 17.77223088923557,
      "grad_norm": 0.18230307180057742,
      "learning_rate": 1.1616161616161616e-05,
      "loss": 1.9344,
      "step": 178
    },
    {
      "epoch": 17.87207488299532,
      "grad_norm": 0.16147840026521357,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 1.9249,
      "step": 179
    },
    {
      "epoch": 17.97191887675507,
      "grad_norm": 0.17234298543336798,
      "learning_rate": 1.0606060606060607e-05,
      "loss": 1.9341,
      "step": 180
    },
    {
      "epoch": 18.07176287051482,
      "grad_norm": 0.16952419332241464,
      "learning_rate": 1.0101010101010101e-05,
      "loss": 1.9382,
      "step": 181
    },
    {
      "epoch": 18.171606864274573,
      "grad_norm": 0.17503197241676455,
      "learning_rate": 9.595959595959595e-06,
      "loss": 1.9277,
      "step": 182
    },
    {
      "epoch": 18.27145085803432,
      "grad_norm": 0.16018657280969506,
      "learning_rate": 9.090909090909091e-06,
      "loss": 1.9259,
      "step": 183
    },
    {
      "epoch": 18.371294851794072,
      "grad_norm": 0.16577134954028483,
      "learning_rate": 8.585858585858587e-06,
      "loss": 1.9391,
      "step": 184
    },
    {
      "epoch": 18.47113884555382,
      "grad_norm": 0.1758462044127833,
      "learning_rate": 8.080808080808082e-06,
      "loss": 1.9316,
      "step": 185
    },
    {
      "epoch": 18.570982839313572,
      "grad_norm": 0.16928715932805172,
      "learning_rate": 7.5757575757575764e-06,
      "loss": 1.9218,
      "step": 186
    },
    {
      "epoch": 18.670826833073324,
      "grad_norm": 0.16185874983512785,
      "learning_rate": 7.0707070707070704e-06,
      "loss": 1.9244,
      "step": 187
    },
    {
      "epoch": 18.770670826833072,
      "grad_norm": 0.16445906712178507,
      "learning_rate": 6.565656565656567e-06,
      "loss": 1.9425,
      "step": 188
    },
    {
      "epoch": 18.870514820592824,
      "grad_norm": 0.16313460189322437,
      "learning_rate": 6.060606060606061e-06,
      "loss": 1.9336,
      "step": 189
    },
    {
      "epoch": 18.970358814352576,
      "grad_norm": 0.15990081630753986,
      "learning_rate": 5.555555555555556e-06,
      "loss": 1.9178,
      "step": 190
    },
    {
      "epoch": 19.070202808112324,
      "grad_norm": 0.16547636636850527,
      "learning_rate": 5.050505050505051e-06,
      "loss": 1.9281,
      "step": 191
    },
    {
      "epoch": 19.170046801872076,
      "grad_norm": 0.1625270231867559,
      "learning_rate": 4.5454545454545455e-06,
      "loss": 1.9348,
      "step": 192
    },
    {
      "epoch": 19.269890795631824,
      "grad_norm": 0.16385675767663568,
      "learning_rate": 4.040404040404041e-06,
      "loss": 1.9305,
      "step": 193
    },
    {
      "epoch": 19.369734789391575,
      "grad_norm": 0.16718542619114216,
      "learning_rate": 3.5353535353535352e-06,
      "loss": 1.9376,
      "step": 194
    },
    {
      "epoch": 19.469578783151327,
      "grad_norm": 0.16595125072244407,
      "learning_rate": 3.0303030303030305e-06,
      "loss": 1.9264,
      "step": 195
    },
    {
      "epoch": 19.569422776911075,
      "grad_norm": 0.16912445317015737,
      "learning_rate": 2.5252525252525253e-06,
      "loss": 1.9252,
      "step": 196
    },
    {
      "epoch": 19.669266770670827,
      "grad_norm": 0.15257787442711698,
      "learning_rate": 2.0202020202020206e-06,
      "loss": 1.9312,
      "step": 197
    },
    {
      "epoch": 19.76911076443058,
      "grad_norm": 0.17270934725449602,
      "learning_rate": 1.5151515151515152e-06,
      "loss": 1.9241,
      "step": 198
    },
    {
      "epoch": 19.868954758190327,
      "grad_norm": 0.16771403116909167,
      "learning_rate": 1.0101010101010103e-06,
      "loss": 1.9342,
      "step": 199
    },
    {
      "epoch": 19.96879875195008,
      "grad_norm": 0.17132458008674775,
      "learning_rate": 5.050505050505052e-07,
      "loss": 1.9347,
      "step": 200
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}