|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 1184, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0033783783783783786, |
|
"grad_norm": 134.4763946533203, |
|
"learning_rate": 0.0, |
|
"loss": 4.6817, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006756756756756757, |
|
"grad_norm": 125.12445068359375, |
|
"learning_rate": 5.017166594399687e-06, |
|
"loss": 4.5642, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.010135135135135136, |
|
"grad_norm": 133.296142578125, |
|
"learning_rate": 7.952020911994375e-06, |
|
"loss": 4.787, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"grad_norm": 72.2229995727539, |
|
"learning_rate": 1.0034333188799373e-05, |
|
"loss": 3.1422, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.016891891891891893, |
|
"grad_norm": 83.57042694091797, |
|
"learning_rate": 1.164950007226698e-05, |
|
"loss": 1.9827, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02027027027027027, |
|
"grad_norm": 35.78062057495117, |
|
"learning_rate": 1.2969187506394062e-05, |
|
"loss": 1.0212, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02364864864864865, |
|
"grad_norm": 48.31964111328125, |
|
"learning_rate": 1.4084967333570947e-05, |
|
"loss": 1.1823, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02702702702702703, |
|
"grad_norm": 10.594072341918945, |
|
"learning_rate": 1.505149978319906e-05, |
|
"loss": 0.7368, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.030405405405405407, |
|
"grad_norm": 7.897088050842285, |
|
"learning_rate": 1.590404182398875e-05, |
|
"loss": 0.6276, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.033783783783783786, |
|
"grad_norm": 3.8260443210601807, |
|
"learning_rate": 1.666666666666667e-05, |
|
"loss": 0.5934, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.037162162162162164, |
|
"grad_norm": 11.113093376159668, |
|
"learning_rate": 1.7356544752637084e-05, |
|
"loss": 0.6987, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04054054054054054, |
|
"grad_norm": 3.2817893028259277, |
|
"learning_rate": 1.7986354100793748e-05, |
|
"loss": 0.5722, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04391891891891892, |
|
"grad_norm": 1.4319814443588257, |
|
"learning_rate": 1.8565722538447282e-05, |
|
"loss": 0.5199, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0472972972972973, |
|
"grad_norm": 1.1898471117019653, |
|
"learning_rate": 1.9102133927970633e-05, |
|
"loss": 0.5063, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05067567567567568, |
|
"grad_norm": 2.1912143230438232, |
|
"learning_rate": 1.9601520984261358e-05, |
|
"loss": 0.5368, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"grad_norm": 1.0015403032302856, |
|
"learning_rate": 2.0068666377598747e-05, |
|
"loss": 0.4682, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.057432432432432436, |
|
"grad_norm": 1.114931583404541, |
|
"learning_rate": 2.0507482022971233e-05, |
|
"loss": 0.4997, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.060810810810810814, |
|
"grad_norm": 1.0931981801986694, |
|
"learning_rate": 2.0921208418388435e-05, |
|
"loss": 0.4659, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06418918918918919, |
|
"grad_norm": 0.8113773465156555, |
|
"learning_rate": 2.1312560015880482e-05, |
|
"loss": 0.4697, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06756756756756757, |
|
"grad_norm": 1.0267772674560547, |
|
"learning_rate": 2.1683833261066357e-05, |
|
"loss": 0.4851, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07094594594594594, |
|
"grad_norm": 0.8238614201545715, |
|
"learning_rate": 2.2036988245565324e-05, |
|
"loss": 0.4517, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07432432432432433, |
|
"grad_norm": 0.9366945624351501, |
|
"learning_rate": 2.2373711347036773e-05, |
|
"loss": 0.4982, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0777027027027027, |
|
"grad_norm": 0.8443475365638733, |
|
"learning_rate": 2.269546393362655e-05, |
|
"loss": 0.4128, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08108108108108109, |
|
"grad_norm": 0.8733354806900024, |
|
"learning_rate": 2.3003520695193437e-05, |
|
"loss": 0.4281, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08445945945945946, |
|
"grad_norm": 0.7766995429992676, |
|
"learning_rate": 2.329900014453396e-05, |
|
"loss": 0.4407, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08783783783783784, |
|
"grad_norm": 0.8224064111709595, |
|
"learning_rate": 2.3582889132846968e-05, |
|
"loss": 0.4155, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.09121621621621621, |
|
"grad_norm": 0.7931010127067566, |
|
"learning_rate": 2.3856062735983123e-05, |
|
"loss": 0.4191, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0945945945945946, |
|
"grad_norm": 0.6803894639015198, |
|
"learning_rate": 2.4119300522370322e-05, |
|
"loss": 0.4055, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.09797297297297297, |
|
"grad_norm": 0.748294472694397, |
|
"learning_rate": 2.4373299964982603e-05, |
|
"loss": 0.4443, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.10135135135135136, |
|
"grad_norm": 0.706881582736969, |
|
"learning_rate": 2.4618687578661044e-05, |
|
"loss": 0.4093, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10472972972972973, |
|
"grad_norm": 0.7517805695533752, |
|
"learning_rate": 2.4856028230571212e-05, |
|
"loss": 0.4288, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.10810810810810811, |
|
"grad_norm": 0.6549198031425476, |
|
"learning_rate": 2.5085832971998436e-05, |
|
"loss": 0.4222, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.11148648648648649, |
|
"grad_norm": 0.9364942908287048, |
|
"learning_rate": 2.530856566463146e-05, |
|
"loss": 0.416, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.11486486486486487, |
|
"grad_norm": 0.624920129776001, |
|
"learning_rate": 2.552464861737092e-05, |
|
"loss": 0.3735, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.11824324324324324, |
|
"grad_norm": 0.6083908081054688, |
|
"learning_rate": 2.5734467405837933e-05, |
|
"loss": 0.3315, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.12162162162162163, |
|
"grad_norm": 0.6106983423233032, |
|
"learning_rate": 2.5938375012788124e-05, |
|
"loss": 0.3652, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 0.7572540044784546, |
|
"learning_rate": 2.6136695401116585e-05, |
|
"loss": 0.452, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.12837837837837837, |
|
"grad_norm": 0.6654285788536072, |
|
"learning_rate": 2.6329726610280168e-05, |
|
"loss": 0.3328, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.13175675675675674, |
|
"grad_norm": 0.676705002784729, |
|
"learning_rate": 2.651774345044166e-05, |
|
"loss": 0.3987, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 0.6630686521530151, |
|
"learning_rate": 2.6700999855466042e-05, |
|
"loss": 0.3794, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13851351351351351, |
|
"grad_norm": 0.8196331858634949, |
|
"learning_rate": 2.687973094532893e-05, |
|
"loss": 0.4218, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.14189189189189189, |
|
"grad_norm": 0.6006856560707092, |
|
"learning_rate": 2.7054154839965013e-05, |
|
"loss": 0.3788, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.14527027027027026, |
|
"grad_norm": 0.6180748343467712, |
|
"learning_rate": 2.722447425965978e-05, |
|
"loss": 0.3946, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.14864864864864866, |
|
"grad_norm": 0.7010207772254944, |
|
"learning_rate": 2.739087794143646e-05, |
|
"loss": 0.389, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.15202702702702703, |
|
"grad_norm": 0.681907594203949, |
|
"learning_rate": 2.755354189625573e-05, |
|
"loss": 0.399, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1554054054054054, |
|
"grad_norm": 0.6506620645523071, |
|
"learning_rate": 2.771263052802624e-05, |
|
"loss": 0.3924, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.15878378378378377, |
|
"grad_norm": 0.6619815230369568, |
|
"learning_rate": 2.7868297632261957e-05, |
|
"loss": 0.4111, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.16216216216216217, |
|
"grad_norm": 0.6953612565994263, |
|
"learning_rate": 2.8020687289593123e-05, |
|
"loss": 0.3958, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.16554054054054054, |
|
"grad_norm": 0.761862576007843, |
|
"learning_rate": 2.8169934667141895e-05, |
|
"loss": 0.3844, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.16891891891891891, |
|
"grad_norm": 0.7193901538848877, |
|
"learning_rate": 2.8316166738933646e-05, |
|
"loss": 0.3847, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.17229729729729729, |
|
"grad_norm": 0.6792317032814026, |
|
"learning_rate": 2.845950293496561e-05, |
|
"loss": 0.4043, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.17567567567567569, |
|
"grad_norm": 0.6307753920555115, |
|
"learning_rate": 2.8600055727246657e-05, |
|
"loss": 0.3806, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.17905405405405406, |
|
"grad_norm": 0.7366315722465515, |
|
"learning_rate": 2.8737931160013153e-05, |
|
"loss": 0.416, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.18243243243243243, |
|
"grad_norm": 0.603863537311554, |
|
"learning_rate": 2.8873229330382812e-05, |
|
"loss": 0.3548, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1858108108108108, |
|
"grad_norm": 0.7424588799476624, |
|
"learning_rate": 2.9006044824904066e-05, |
|
"loss": 0.3982, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1891891891891892, |
|
"grad_norm": 0.5944585204124451, |
|
"learning_rate": 2.913646711677001e-05, |
|
"loss": 0.3536, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.19256756756756757, |
|
"grad_norm": 0.6465046405792236, |
|
"learning_rate": 2.926458092787486e-05, |
|
"loss": 0.3594, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.19594594594594594, |
|
"grad_norm": 0.7178723812103271, |
|
"learning_rate": 2.939046655938229e-05, |
|
"loss": 0.3725, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.19932432432432431, |
|
"grad_norm": 0.7240431308746338, |
|
"learning_rate": 2.951420019403574e-05, |
|
"loss": 0.3896, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.20270270270270271, |
|
"grad_norm": 0.7555009126663208, |
|
"learning_rate": 2.963585417306073e-05, |
|
"loss": 0.3633, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20608108108108109, |
|
"grad_norm": 0.6279475092887878, |
|
"learning_rate": 2.9755497250179453e-05, |
|
"loss": 0.3447, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.20945945945945946, |
|
"grad_norm": 0.6943121552467346, |
|
"learning_rate": 2.98731948249709e-05, |
|
"loss": 0.3851, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.21283783783783783, |
|
"grad_norm": 0.6565203070640564, |
|
"learning_rate": 2.9989009157559694e-05, |
|
"loss": 0.3896, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"grad_norm": 4.912997722625732, |
|
"learning_rate": 3.010299956639812e-05, |
|
"loss": 0.4528, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.2195945945945946, |
|
"grad_norm": 0.7776851058006287, |
|
"learning_rate": 3.021522261071426e-05, |
|
"loss": 0.3151, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.22297297297297297, |
|
"grad_norm": 0.5963460206985474, |
|
"learning_rate": 3.0325732259031143e-05, |
|
"loss": 0.3501, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.22635135135135134, |
|
"grad_norm": 0.8160498142242432, |
|
"learning_rate": 3.043458004501377e-05, |
|
"loss": 0.3383, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.22972972972972974, |
|
"grad_norm": 0.6507856249809265, |
|
"learning_rate": 3.054181521177061e-05, |
|
"loss": 0.3559, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.23310810810810811, |
|
"grad_norm": 0.6282461881637573, |
|
"learning_rate": 3.064748484562093e-05, |
|
"loss": 0.3951, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.23648648648648649, |
|
"grad_norm": 0.6464638710021973, |
|
"learning_rate": 3.0751634000237615e-05, |
|
"loss": 0.3759, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23986486486486486, |
|
"grad_norm": 0.621934175491333, |
|
"learning_rate": 3.085430581198459e-05, |
|
"loss": 0.409, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.24324324324324326, |
|
"grad_norm": 0.6526879668235779, |
|
"learning_rate": 3.095554160718781e-05, |
|
"loss": 0.3938, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.24662162162162163, |
|
"grad_norm": 0.6536738276481628, |
|
"learning_rate": 3.10553810020076e-05, |
|
"loss": 0.3657, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6381665468215942, |
|
"learning_rate": 3.115386199551628e-05, |
|
"loss": 0.3743, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2533783783783784, |
|
"grad_norm": 0.6818379759788513, |
|
"learning_rate": 3.1251021056528336e-05, |
|
"loss": 0.3188, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.25675675675675674, |
|
"grad_norm": 0.5829269886016846, |
|
"learning_rate": 3.134689320467986e-05, |
|
"loss": 0.3622, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.26013513513513514, |
|
"grad_norm": 0.6988232731819153, |
|
"learning_rate": 3.144151208620804e-05, |
|
"loss": 0.3768, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2635135135135135, |
|
"grad_norm": 0.5981537699699402, |
|
"learning_rate": 3.1534910044841344e-05, |
|
"loss": 0.3655, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2668918918918919, |
|
"grad_norm": 0.5821400284767151, |
|
"learning_rate": 3.1627118188174024e-05, |
|
"loss": 0.358, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 0.620126485824585, |
|
"learning_rate": 3.171816644986573e-05, |
|
"loss": 0.3579, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.27364864864864863, |
|
"grad_norm": 0.6327139139175415, |
|
"learning_rate": 3.18080836479775e-05, |
|
"loss": 0.3916, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.27702702702702703, |
|
"grad_norm": 0.5900242328643799, |
|
"learning_rate": 3.1896897539728616e-05, |
|
"loss": 0.3768, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.28040540540540543, |
|
"grad_norm": 0.5920627117156982, |
|
"learning_rate": 3.198463487293457e-05, |
|
"loss": 0.3709, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.28378378378378377, |
|
"grad_norm": 0.5604241490364075, |
|
"learning_rate": 3.207132143436469e-05, |
|
"loss": 0.3871, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.28716216216216217, |
|
"grad_norm": 0.6093663573265076, |
|
"learning_rate": 3.215698209523821e-05, |
|
"loss": 0.3632, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2905405405405405, |
|
"grad_norm": 0.568435549736023, |
|
"learning_rate": 3.224164085405946e-05, |
|
"loss": 0.3788, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.2939189189189189, |
|
"grad_norm": 0.5885617136955261, |
|
"learning_rate": 3.232532087697698e-05, |
|
"loss": 0.3472, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2972972972972973, |
|
"grad_norm": 0.5474864840507507, |
|
"learning_rate": 3.240804453583615e-05, |
|
"loss": 0.3815, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.30067567567567566, |
|
"grad_norm": 0.6116411089897156, |
|
"learning_rate": 3.248983344408188e-05, |
|
"loss": 0.3374, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.30405405405405406, |
|
"grad_norm": 0.5523970127105713, |
|
"learning_rate": 3.2570708490655414e-05, |
|
"loss": 0.3503, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.30743243243243246, |
|
"grad_norm": 0.6035043597221375, |
|
"learning_rate": 3.265068987201822e-05, |
|
"loss": 0.3699, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3108108108108108, |
|
"grad_norm": 0.4965035617351532, |
|
"learning_rate": 3.2729797122425925e-05, |
|
"loss": 0.3702, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.3141891891891892, |
|
"grad_norm": 0.5582573413848877, |
|
"learning_rate": 3.280804914256559e-05, |
|
"loss": 0.333, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.31756756756756754, |
|
"grad_norm": 0.5313113331794739, |
|
"learning_rate": 3.288546422666164e-05, |
|
"loss": 0.329, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.32094594594594594, |
|
"grad_norm": 0.6049978733062744, |
|
"learning_rate": 3.2962060088147464e-05, |
|
"loss": 0.3931, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"grad_norm": 0.5387487411499023, |
|
"learning_rate": 3.3037853883992805e-05, |
|
"loss": 0.3432, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3277027027027027, |
|
"grad_norm": 0.588671088218689, |
|
"learning_rate": 3.3112862237770756e-05, |
|
"loss": 0.3508, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3310810810810811, |
|
"grad_norm": 0.536973774433136, |
|
"learning_rate": 3.3187101261541584e-05, |
|
"loss": 0.372, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3344594594594595, |
|
"grad_norm": 0.676276445388794, |
|
"learning_rate": 3.326058657662584e-05, |
|
"loss": 0.3832, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.33783783783783783, |
|
"grad_norm": 0.46296200156211853, |
|
"learning_rate": 3.333333333333334e-05, |
|
"loss": 0.3553, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.34121621621621623, |
|
"grad_norm": 0.674472451210022, |
|
"learning_rate": 3.340535622971072e-05, |
|
"loss": 0.3448, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.34459459459459457, |
|
"grad_norm": 0.5100315809249878, |
|
"learning_rate": 3.3476669529365295e-05, |
|
"loss": 0.3367, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.34797297297297297, |
|
"grad_norm": 0.5637315511703491, |
|
"learning_rate": 3.3547287078419544e-05, |
|
"loss": 0.3786, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.35135135135135137, |
|
"grad_norm": 0.5659502744674683, |
|
"learning_rate": 3.361722232164634e-05, |
|
"loss": 0.352, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.3547297297297297, |
|
"grad_norm": 0.5390239357948303, |
|
"learning_rate": 3.3686488317832306e-05, |
|
"loss": 0.3473, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3581081081081081, |
|
"grad_norm": 0.5552096366882324, |
|
"learning_rate": 3.375509775441284e-05, |
|
"loss": 0.2945, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3614864864864865, |
|
"grad_norm": 0.5016259551048279, |
|
"learning_rate": 3.382306296142016e-05, |
|
"loss": 0.313, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.36486486486486486, |
|
"grad_norm": 0.5755091905593872, |
|
"learning_rate": 3.38903959247825e-05, |
|
"loss": 0.3367, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.36824324324324326, |
|
"grad_norm": 0.5590441823005676, |
|
"learning_rate": 3.395710829901039e-05, |
|
"loss": 0.3639, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3716216216216216, |
|
"grad_norm": 0.5077652335166931, |
|
"learning_rate": 3.402321141930376e-05, |
|
"loss": 0.3523, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.5614023804664612, |
|
"learning_rate": 3.4088716313110955e-05, |
|
"loss": 0.3418, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.3783783783783784, |
|
"grad_norm": 0.5325702428817749, |
|
"learning_rate": 3.415363371116969e-05, |
|
"loss": 0.3208, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.38175675675675674, |
|
"grad_norm": 0.5361766219139099, |
|
"learning_rate": 3.4217974058057e-05, |
|
"loss": 0.3412, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.38513513513513514, |
|
"grad_norm": 0.5173611044883728, |
|
"learning_rate": 3.428174752227455e-05, |
|
"loss": 0.3615, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3885135135135135, |
|
"grad_norm": 0.5852875709533691, |
|
"learning_rate": 3.434496400589353e-05, |
|
"loss": 0.3171, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3918918918918919, |
|
"grad_norm": 0.48835834860801697, |
|
"learning_rate": 3.440763315378198e-05, |
|
"loss": 0.315, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.3952702702702703, |
|
"grad_norm": 0.5958307385444641, |
|
"learning_rate": 3.446976436243603e-05, |
|
"loss": 0.3117, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.39864864864864863, |
|
"grad_norm": 0.5645899772644043, |
|
"learning_rate": 3.4531366788435425e-05, |
|
"loss": 0.3192, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.40202702702702703, |
|
"grad_norm": 0.5764517784118652, |
|
"learning_rate": 3.459244935654219e-05, |
|
"loss": 0.3353, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 0.5075375437736511, |
|
"learning_rate": 3.465302076746041e-05, |
|
"loss": 0.3572, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.40878378378378377, |
|
"grad_norm": 0.48014962673187256, |
|
"learning_rate": 3.471308950527417e-05, |
|
"loss": 0.3217, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.41216216216216217, |
|
"grad_norm": 0.533892035484314, |
|
"learning_rate": 3.477266384457914e-05, |
|
"loss": 0.3214, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.4155405405405405, |
|
"grad_norm": 0.4845181405544281, |
|
"learning_rate": 3.48317518573233e-05, |
|
"loss": 0.3332, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4189189189189189, |
|
"grad_norm": 0.5133293271064758, |
|
"learning_rate": 3.489036141937059e-05, |
|
"loss": 0.354, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.4222972972972973, |
|
"grad_norm": 0.5352755188941956, |
|
"learning_rate": 3.494850021680094e-05, |
|
"loss": 0.353, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.42567567567567566, |
|
"grad_norm": 0.5208103060722351, |
|
"learning_rate": 3.500617575195938e-05, |
|
"loss": 0.3065, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.42905405405405406, |
|
"grad_norm": 0.5982815623283386, |
|
"learning_rate": 3.5063395349265945e-05, |
|
"loss": 0.3329, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 0.5241547226905823, |
|
"learning_rate": 3.5120166160797804e-05, |
|
"loss": 0.3476, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.4358108108108108, |
|
"grad_norm": 0.5543828010559082, |
|
"learning_rate": 3.517649517165415e-05, |
|
"loss": 0.3517, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4391891891891892, |
|
"grad_norm": 0.5167660117149353, |
|
"learning_rate": 3.523238920511395e-05, |
|
"loss": 0.3417, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.44256756756756754, |
|
"grad_norm": 0.5201629400253296, |
|
"learning_rate": 3.528785492759607e-05, |
|
"loss": 0.3543, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.44594594594594594, |
|
"grad_norm": 0.5214123129844666, |
|
"learning_rate": 3.5342898853430836e-05, |
|
"loss": 0.3552, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.44932432432432434, |
|
"grad_norm": 0.526899516582489, |
|
"learning_rate": 3.539752734945143e-05, |
|
"loss": 0.3469, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4527027027027027, |
|
"grad_norm": 0.55535888671875, |
|
"learning_rate": 3.5451746639413466e-05, |
|
"loss": 0.294, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.4560810810810811, |
|
"grad_norm": 0.46593329310417175, |
|
"learning_rate": 3.550556280825011e-05, |
|
"loss": 0.2557, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4594594594594595, |
|
"grad_norm": 0.5066341757774353, |
|
"learning_rate": 3.55589818061703e-05, |
|
"loss": 0.2885, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.46283783783783783, |
|
"grad_norm": 0.4943198263645172, |
|
"learning_rate": 3.561200945260678e-05, |
|
"loss": 0.3436, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.46621621621621623, |
|
"grad_norm": 0.5013923048973083, |
|
"learning_rate": 3.5664651440020616e-05, |
|
"loss": 0.2865, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.46959459459459457, |
|
"grad_norm": 0.5165452361106873, |
|
"learning_rate": 3.571691333756825e-05, |
|
"loss": 0.3394, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.47297297297297297, |
|
"grad_norm": 0.5002908706665039, |
|
"learning_rate": 3.5768800594637304e-05, |
|
"loss": 0.3278, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.47635135135135137, |
|
"grad_norm": 0.5236304402351379, |
|
"learning_rate": 3.582031854425634e-05, |
|
"loss": 0.3343, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.4797297297297297, |
|
"grad_norm": 0.49403268098831177, |
|
"learning_rate": 3.587147240638428e-05, |
|
"loss": 0.3079, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.4831081081081081, |
|
"grad_norm": 0.4847567677497864, |
|
"learning_rate": 3.5922267291084366e-05, |
|
"loss": 0.3305, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"grad_norm": 0.5461480617523193, |
|
"learning_rate": 3.5972708201587496e-05, |
|
"loss": 0.3242, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.48986486486486486, |
|
"grad_norm": 0.5380986332893372, |
|
"learning_rate": 3.6022800037249585e-05, |
|
"loss": 0.3341, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.49324324324324326, |
|
"grad_norm": 0.49580562114715576, |
|
"learning_rate": 3.607254759640729e-05, |
|
"loss": 0.31, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.4966216216216216, |
|
"grad_norm": 0.5374506115913391, |
|
"learning_rate": 3.612195557913627e-05, |
|
"loss": 0.3048, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.535591721534729, |
|
"learning_rate": 3.6171028589915954e-05, |
|
"loss": 0.3266, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5033783783783784, |
|
"grad_norm": 0.5777239799499512, |
|
"learning_rate": 3.6219771140204575e-05, |
|
"loss": 0.3509, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5067567567567568, |
|
"grad_norm": 0.5570788383483887, |
|
"learning_rate": 3.626818765092802e-05, |
|
"loss": 0.3348, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5101351351351351, |
|
"grad_norm": 0.5780688524246216, |
|
"learning_rate": 3.6316282454886157e-05, |
|
"loss": 0.3218, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.5135135135135135, |
|
"grad_norm": 0.5464823842048645, |
|
"learning_rate": 3.636405979907955e-05, |
|
"loss": 0.347, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5168918918918919, |
|
"grad_norm": 0.46229949593544006, |
|
"learning_rate": 3.6411523846959985e-05, |
|
"loss": 0.2618, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5202702702702703, |
|
"grad_norm": 0.5127717852592468, |
|
"learning_rate": 3.645867868060772e-05, |
|
"loss": 0.3364, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5236486486486487, |
|
"grad_norm": 0.4915751814842224, |
|
"learning_rate": 3.6505528302838193e-05, |
|
"loss": 0.327, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.527027027027027, |
|
"grad_norm": 0.4658984839916229, |
|
"learning_rate": 3.6552076639241027e-05, |
|
"loss": 0.3008, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5304054054054054, |
|
"grad_norm": 0.4983694553375244, |
|
"learning_rate": 3.65983275401539e-05, |
|
"loss": 0.3153, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.5337837837837838, |
|
"grad_norm": 0.49560025334358215, |
|
"learning_rate": 3.664428478257371e-05, |
|
"loss": 0.3409, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5371621621621622, |
|
"grad_norm": 0.5617703199386597, |
|
"learning_rate": 3.668995207200753e-05, |
|
"loss": 0.3295, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 0.5226999521255493, |
|
"learning_rate": 3.673533304426541e-05, |
|
"loss": 0.3522, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.543918918918919, |
|
"grad_norm": 0.49685564637184143, |
|
"learning_rate": 3.67804312671975e-05, |
|
"loss": 0.3322, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5472972972972973, |
|
"grad_norm": 0.543743371963501, |
|
"learning_rate": 3.682525024237719e-05, |
|
"loss": 0.3283, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5506756756756757, |
|
"grad_norm": 0.5247477889060974, |
|
"learning_rate": 3.6869793406732636e-05, |
|
"loss": 0.3104, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5540540540540541, |
|
"grad_norm": 0.5228151679039001, |
|
"learning_rate": 3.69140641341283e-05, |
|
"loss": 0.3292, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5574324324324325, |
|
"grad_norm": 0.48610207438468933, |
|
"learning_rate": 3.695806573689844e-05, |
|
"loss": 0.3183, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5608108108108109, |
|
"grad_norm": 0.4703191816806793, |
|
"learning_rate": 3.700180146733426e-05, |
|
"loss": 0.2935, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5641891891891891, |
|
"grad_norm": 0.5501482486724854, |
|
"learning_rate": 3.704527451912639e-05, |
|
"loss": 0.3249, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5675675675675675, |
|
"grad_norm": 0.45025497674942017, |
|
"learning_rate": 3.708848802876438e-05, |
|
"loss": 0.3128, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.5709459459459459, |
|
"grad_norm": 0.5103681087493896, |
|
"learning_rate": 3.7131445076894564e-05, |
|
"loss": 0.2882, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5743243243243243, |
|
"grad_norm": 0.4952690601348877, |
|
"learning_rate": 3.717414868963791e-05, |
|
"loss": 0.3268, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5777027027027027, |
|
"grad_norm": 0.4887889325618744, |
|
"learning_rate": 3.721660183986924e-05, |
|
"loss": 0.3161, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.581081081081081, |
|
"grad_norm": 0.5142275094985962, |
|
"learning_rate": 3.725880744845915e-05, |
|
"loss": 0.2942, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5844594594594594, |
|
"grad_norm": 0.518671452999115, |
|
"learning_rate": 3.730076838547993e-05, |
|
"loss": 0.3232, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5878378378378378, |
|
"grad_norm": 0.5324185490608215, |
|
"learning_rate": 3.734248747137666e-05, |
|
"loss": 0.3115, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5912162162162162, |
|
"grad_norm": 0.5307193398475647, |
|
"learning_rate": 3.738396747810492e-05, |
|
"loss": 0.3245, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5945945945945946, |
|
"grad_norm": 0.47318169474601746, |
|
"learning_rate": 3.7425211130235834e-05, |
|
"loss": 0.3152, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.597972972972973, |
|
"grad_norm": 0.5233657956123352, |
|
"learning_rate": 3.7466221106030115e-05, |
|
"loss": 0.3115, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.6013513513513513, |
|
"grad_norm": 0.4699764847755432, |
|
"learning_rate": 3.750700003848157e-05, |
|
"loss": 0.2948, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.6047297297297297, |
|
"grad_norm": 0.5511758327484131, |
|
"learning_rate": 3.7547550516331555e-05, |
|
"loss": 0.337, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.6081081081081081, |
|
"grad_norm": 0.5334018468856812, |
|
"learning_rate": 3.75878750850551e-05, |
|
"loss": 0.3244, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6114864864864865, |
|
"grad_norm": 0.5040500164031982, |
|
"learning_rate": 3.7627976247819744e-05, |
|
"loss": 0.3173, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.6148648648648649, |
|
"grad_norm": 0.5094459652900696, |
|
"learning_rate": 3.766785646641792e-05, |
|
"loss": 0.3087, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6182432432432432, |
|
"grad_norm": 0.4679316282272339, |
|
"learning_rate": 3.770751816217383e-05, |
|
"loss": 0.3261, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6216216216216216, |
|
"grad_norm": 0.5099210143089294, |
|
"learning_rate": 3.7746963716825615e-05, |
|
"loss": 0.3239, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.4242151379585266, |
|
"learning_rate": 3.778619547338356e-05, |
|
"loss": 0.3082, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6283783783783784, |
|
"grad_norm": 0.4796642065048218, |
|
"learning_rate": 3.782521573696528e-05, |
|
"loss": 0.2753, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6317567567567568, |
|
"grad_norm": 0.43798592686653137, |
|
"learning_rate": 3.786402677560832e-05, |
|
"loss": 0.3033, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6351351351351351, |
|
"grad_norm": 0.4947024881839752, |
|
"learning_rate": 3.790263082106134e-05, |
|
"loss": 0.3171, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.6385135135135135, |
|
"grad_norm": 0.40923503041267395, |
|
"learning_rate": 3.794103006955407e-05, |
|
"loss": 0.2927, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6418918918918919, |
|
"grad_norm": 0.6707382202148438, |
|
"learning_rate": 3.797922668254715e-05, |
|
"loss": 0.2885, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6452702702702703, |
|
"grad_norm": 0.4996836185455322, |
|
"learning_rate": 3.801722278746213e-05, |
|
"loss": 0.3379, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.6486486486486487, |
|
"grad_norm": 0.4873179495334625, |
|
"learning_rate": 3.8055020478392495e-05, |
|
"loss": 0.3091, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.652027027027027, |
|
"grad_norm": 0.4989306330680847, |
|
"learning_rate": 3.809262181679623e-05, |
|
"loss": 0.3758, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6554054054054054, |
|
"grad_norm": 0.5524582862854004, |
|
"learning_rate": 3.813002883217044e-05, |
|
"loss": 0.3044, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.6587837837837838, |
|
"grad_norm": 0.5253859162330627, |
|
"learning_rate": 3.816724352270863e-05, |
|
"loss": 0.3556, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6621621621621622, |
|
"grad_norm": 0.5060839653015137, |
|
"learning_rate": 3.8204267855941266e-05, |
|
"loss": 0.3352, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.6655405405405406, |
|
"grad_norm": 0.5375229716300964, |
|
"learning_rate": 3.824110376935989e-05, |
|
"loss": 0.3328, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.668918918918919, |
|
"grad_norm": 0.4941028654575348, |
|
"learning_rate": 3.827775317102552e-05, |
|
"loss": 0.296, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.6722972972972973, |
|
"grad_norm": 0.5323918461799622, |
|
"learning_rate": 3.831421794016178e-05, |
|
"loss": 0.3049, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 0.5118021965026855, |
|
"learning_rate": 3.835049992773302e-05, |
|
"loss": 0.3133, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6790540540540541, |
|
"grad_norm": 0.46183663606643677, |
|
"learning_rate": 3.838660095700815e-05, |
|
"loss": 0.3185, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.6824324324324325, |
|
"grad_norm": 0.47574761509895325, |
|
"learning_rate": 3.84225228241104e-05, |
|
"loss": 0.302, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.6858108108108109, |
|
"grad_norm": 0.5129667520523071, |
|
"learning_rate": 3.8458267298553554e-05, |
|
"loss": 0.3407, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6891891891891891, |
|
"grad_norm": 1.532895803451538, |
|
"learning_rate": 3.8493836123764984e-05, |
|
"loss": 0.2858, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6925675675675675, |
|
"grad_norm": 0.5879459381103516, |
|
"learning_rate": 3.852923101759591e-05, |
|
"loss": 0.3638, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6959459459459459, |
|
"grad_norm": 0.589292585849762, |
|
"learning_rate": 3.856445367281923e-05, |
|
"loss": 0.3077, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.6993243243243243, |
|
"grad_norm": 0.44842028617858887, |
|
"learning_rate": 3.859950575761529e-05, |
|
"loss": 0.2791, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.7027027027027027, |
|
"grad_norm": 0.6291372179985046, |
|
"learning_rate": 3.8634388916046025e-05, |
|
"loss": 0.3404, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.706081081081081, |
|
"grad_norm": 0.4508068561553955, |
|
"learning_rate": 3.866910476851757e-05, |
|
"loss": 0.2859, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.7094594594594594, |
|
"grad_norm": 0.7987334132194519, |
|
"learning_rate": 3.870365491223199e-05, |
|
"loss": 0.3224, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7128378378378378, |
|
"grad_norm": 0.5030388236045837, |
|
"learning_rate": 3.8738040921628215e-05, |
|
"loss": 0.3225, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.7162162162162162, |
|
"grad_norm": 0.5433597564697266, |
|
"learning_rate": 3.877226434881253e-05, |
|
"loss": 0.3294, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.7195945945945946, |
|
"grad_norm": 0.48354920744895935, |
|
"learning_rate": 3.880632672397897e-05, |
|
"loss": 0.3035, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.722972972972973, |
|
"grad_norm": 0.5904508829116821, |
|
"learning_rate": 3.884022955581985e-05, |
|
"loss": 0.3261, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.7263513513513513, |
|
"grad_norm": 0.5152267813682556, |
|
"learning_rate": 3.887397433192676e-05, |
|
"loss": 0.2926, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7297297297297297, |
|
"grad_norm": 0.46898239850997925, |
|
"learning_rate": 3.890756251918219e-05, |
|
"loss": 0.2808, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.7331081081081081, |
|
"grad_norm": 0.4848116338253021, |
|
"learning_rate": 3.894099556414216e-05, |
|
"loss": 0.2995, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.7364864864864865, |
|
"grad_norm": 0.48101919889450073, |
|
"learning_rate": 3.897427489341009e-05, |
|
"loss": 0.3054, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.7398648648648649, |
|
"grad_norm": 0.49954646825790405, |
|
"learning_rate": 3.900740191400198e-05, |
|
"loss": 0.3239, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.7432432432432432, |
|
"grad_norm": 0.46193623542785645, |
|
"learning_rate": 3.904037801370344e-05, |
|
"loss": 0.3215, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7466216216216216, |
|
"grad_norm": 0.4544968605041504, |
|
"learning_rate": 3.9073204561418514e-05, |
|
"loss": 0.2829, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.4560447335243225, |
|
"learning_rate": 3.9105882907510644e-05, |
|
"loss": 0.3052, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.7533783783783784, |
|
"grad_norm": 0.4681329131126404, |
|
"learning_rate": 3.913841438413601e-05, |
|
"loss": 0.3259, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.7567567567567568, |
|
"grad_norm": 0.4784477949142456, |
|
"learning_rate": 3.917080030556938e-05, |
|
"loss": 0.3252, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.7601351351351351, |
|
"grad_norm": 0.4931364953517914, |
|
"learning_rate": 3.9203041968522716e-05, |
|
"loss": 0.3252, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7635135135135135, |
|
"grad_norm": 0.4844168424606323, |
|
"learning_rate": 3.923514065245669e-05, |
|
"loss": 0.3185, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7668918918918919, |
|
"grad_norm": 0.5114946961402893, |
|
"learning_rate": 3.926709761988538e-05, |
|
"loss": 0.3136, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7702702702702703, |
|
"grad_norm": 0.4893558621406555, |
|
"learning_rate": 3.929891411667424e-05, |
|
"loss": 0.2977, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.7736486486486487, |
|
"grad_norm": 0.4515063762664795, |
|
"learning_rate": 3.933059137233147e-05, |
|
"loss": 0.3031, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.777027027027027, |
|
"grad_norm": 0.48304086923599243, |
|
"learning_rate": 3.9362130600293214e-05, |
|
"loss": 0.3292, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7804054054054054, |
|
"grad_norm": 0.4545653760433197, |
|
"learning_rate": 3.9393532998202405e-05, |
|
"loss": 0.3014, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.7837837837837838, |
|
"grad_norm": 0.4236096739768982, |
|
"learning_rate": 3.942479974818166e-05, |
|
"loss": 0.2804, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.7871621621621622, |
|
"grad_norm": 0.4648077189922333, |
|
"learning_rate": 3.945593201710032e-05, |
|
"loss": 0.3243, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.7905405405405406, |
|
"grad_norm": 0.47247567772865295, |
|
"learning_rate": 3.9486930956835724e-05, |
|
"loss": 0.2907, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.793918918918919, |
|
"grad_norm": 0.4848794639110565, |
|
"learning_rate": 3.951779770452894e-05, |
|
"loss": 0.3265, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7972972972972973, |
|
"grad_norm": 0.5114079713821411, |
|
"learning_rate": 3.954853338283512e-05, |
|
"loss": 0.3194, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.8006756756756757, |
|
"grad_norm": 0.4802190959453583, |
|
"learning_rate": 3.9579139100168404e-05, |
|
"loss": 0.2857, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.8040540540540541, |
|
"grad_norm": 0.522281289100647, |
|
"learning_rate": 3.960961595094187e-05, |
|
"loss": 0.2843, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.8074324324324325, |
|
"grad_norm": 0.4458360970020294, |
|
"learning_rate": 3.96399650158023e-05, |
|
"loss": 0.2755, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 0.53693687915802, |
|
"learning_rate": 3.96701873618601e-05, |
|
"loss": 0.3015, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8141891891891891, |
|
"grad_norm": 0.46697714924812317, |
|
"learning_rate": 3.970028404291448e-05, |
|
"loss": 0.2911, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.8175675675675675, |
|
"grad_norm": 0.5392897725105286, |
|
"learning_rate": 3.9730256099673865e-05, |
|
"loss": 0.3093, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.8209459459459459, |
|
"grad_norm": 0.5555176734924316, |
|
"learning_rate": 3.976010455997187e-05, |
|
"loss": 0.3078, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.8243243243243243, |
|
"grad_norm": 0.42975664138793945, |
|
"learning_rate": 3.978983043897883e-05, |
|
"loss": 0.2723, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.8277027027027027, |
|
"grad_norm": 0.6645532250404358, |
|
"learning_rate": 3.981943473940888e-05, |
|
"loss": 0.3161, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.831081081081081, |
|
"grad_norm": 0.47302716970443726, |
|
"learning_rate": 3.984891845172299e-05, |
|
"loss": 0.3002, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.8344594594594594, |
|
"grad_norm": 0.5395675301551819, |
|
"learning_rate": 3.987828255432777e-05, |
|
"loss": 0.3614, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.8378378378378378, |
|
"grad_norm": 0.5418782234191895, |
|
"learning_rate": 3.9907528013770276e-05, |
|
"loss": 0.3226, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.8412162162162162, |
|
"grad_norm": 0.4323344826698303, |
|
"learning_rate": 3.993665578492894e-05, |
|
"loss": 0.289, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.8445945945945946, |
|
"grad_norm": 0.5098227858543396, |
|
"learning_rate": 3.9965666811200624e-05, |
|
"loss": 0.3346, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.847972972972973, |
|
"grad_norm": 0.4577200412750244, |
|
"learning_rate": 3.999456202468397e-05, |
|
"loss": 0.3526, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.8513513513513513, |
|
"grad_norm": 0.48974189162254333, |
|
"learning_rate": 4.002334234635907e-05, |
|
"loss": 0.3086, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.8547297297297297, |
|
"grad_norm": 0.47080233693122864, |
|
"learning_rate": 4.005200868626364e-05, |
|
"loss": 0.3578, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.8581081081081081, |
|
"grad_norm": 0.5034478902816772, |
|
"learning_rate": 4.008056194366564e-05, |
|
"loss": 0.3166, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.8614864864864865, |
|
"grad_norm": 0.44603192806243896, |
|
"learning_rate": 4.010900300723259e-05, |
|
"loss": 0.2969, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 0.5267269611358643, |
|
"learning_rate": 4.013733275519749e-05, |
|
"loss": 0.3341, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.8682432432432432, |
|
"grad_norm": 0.424925297498703, |
|
"learning_rate": 4.016555205552158e-05, |
|
"loss": 0.2942, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.8716216216216216, |
|
"grad_norm": 0.5034767389297485, |
|
"learning_rate": 4.0193661766053834e-05, |
|
"loss": 0.2668, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.875, |
|
"grad_norm": 0.5288417935371399, |
|
"learning_rate": 4.022166273468753e-05, |
|
"loss": 0.3424, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.8783783783783784, |
|
"grad_norm": 0.4726288616657257, |
|
"learning_rate": 4.024955579951363e-05, |
|
"loss": 0.2906, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8817567567567568, |
|
"grad_norm": 0.5311090350151062, |
|
"learning_rate": 4.027734178897136e-05, |
|
"loss": 0.3307, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.8851351351351351, |
|
"grad_norm": 0.427002489566803, |
|
"learning_rate": 4.030502152199576e-05, |
|
"loss": 0.2569, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.8885135135135135, |
|
"grad_norm": 0.543899416923523, |
|
"learning_rate": 4.033259580816264e-05, |
|
"loss": 0.2925, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.8918918918918919, |
|
"grad_norm": 0.4932996928691864, |
|
"learning_rate": 4.036006544783052e-05, |
|
"loss": 0.3058, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.8952702702702703, |
|
"grad_norm": 0.5138298273086548, |
|
"learning_rate": 4.0387431232280135e-05, |
|
"loss": 0.2952, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8986486486486487, |
|
"grad_norm": 0.4801797568798065, |
|
"learning_rate": 4.041469394385112e-05, |
|
"loss": 0.2873, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.902027027027027, |
|
"grad_norm": 0.453140527009964, |
|
"learning_rate": 4.0441854356076257e-05, |
|
"loss": 0.2747, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.9054054054054054, |
|
"grad_norm": 0.4941580593585968, |
|
"learning_rate": 4.046891323381315e-05, |
|
"loss": 0.3157, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.9087837837837838, |
|
"grad_norm": 0.6151431798934937, |
|
"learning_rate": 4.049587133337347e-05, |
|
"loss": 0.3139, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.9121621621621622, |
|
"grad_norm": 0.4674379229545593, |
|
"learning_rate": 4.0522729402649793e-05, |
|
"loss": 0.3122, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9155405405405406, |
|
"grad_norm": 0.4719599187374115, |
|
"learning_rate": 4.0549488181240096e-05, |
|
"loss": 0.3101, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.918918918918919, |
|
"grad_norm": 0.483374685049057, |
|
"learning_rate": 4.057614840056998e-05, |
|
"loss": 0.3087, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.9222972972972973, |
|
"grad_norm": 0.48100754618644714, |
|
"learning_rate": 4.06027107840126e-05, |
|
"loss": 0.3044, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9256756756756757, |
|
"grad_norm": 0.44335874915122986, |
|
"learning_rate": 4.0629176047006474e-05, |
|
"loss": 0.2929, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.9290540540540541, |
|
"grad_norm": 0.4475744962692261, |
|
"learning_rate": 4.065554489717105e-05, |
|
"loss": 0.321, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9324324324324325, |
|
"grad_norm": 0.43009960651397705, |
|
"learning_rate": 4.068181803442029e-05, |
|
"loss": 0.3026, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.9358108108108109, |
|
"grad_norm": 0.48129919171333313, |
|
"learning_rate": 4.0707996151074147e-05, |
|
"loss": 0.2876, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.9391891891891891, |
|
"grad_norm": 0.423658549785614, |
|
"learning_rate": 4.073407993196794e-05, |
|
"loss": 0.2968, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.9425675675675675, |
|
"grad_norm": 0.485857218503952, |
|
"learning_rate": 4.076007005455996e-05, |
|
"loss": 0.3144, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 0.4936545193195343, |
|
"learning_rate": 4.0785967189036986e-05, |
|
"loss": 0.3103, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9493243243243243, |
|
"grad_norm": 0.47265681624412537, |
|
"learning_rate": 4.0811771998418e-05, |
|
"loss": 0.3136, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.9527027027027027, |
|
"grad_norm": 0.4872439503669739, |
|
"learning_rate": 4.083748513865602e-05, |
|
"loss": 0.335, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.956081081081081, |
|
"grad_norm": 0.4380621910095215, |
|
"learning_rate": 4.086310725873818e-05, |
|
"loss": 0.3036, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.9594594594594594, |
|
"grad_norm": 0.4759541451931, |
|
"learning_rate": 4.0888639000783966e-05, |
|
"loss": 0.2827, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.9628378378378378, |
|
"grad_norm": 0.995196521282196, |
|
"learning_rate": 4.0914081000141844e-05, |
|
"loss": 0.3162, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9662162162162162, |
|
"grad_norm": 0.5686355829238892, |
|
"learning_rate": 4.0939433885484055e-05, |
|
"loss": 0.2992, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.9695945945945946, |
|
"grad_norm": 0.43086209893226624, |
|
"learning_rate": 4.0964698278899874e-05, |
|
"loss": 0.278, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.972972972972973, |
|
"grad_norm": 0.5099641680717468, |
|
"learning_rate": 4.0989874795987185e-05, |
|
"loss": 0.3023, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.9763513513513513, |
|
"grad_norm": 0.39634063839912415, |
|
"learning_rate": 4.1014964045942465e-05, |
|
"loss": 0.2806, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.9797297297297297, |
|
"grad_norm": 0.4685455858707428, |
|
"learning_rate": 4.103996663164927e-05, |
|
"loss": 0.3078, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9831081081081081, |
|
"grad_norm": 0.4501771330833435, |
|
"learning_rate": 4.106488314976513e-05, |
|
"loss": 0.3048, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.9864864864864865, |
|
"grad_norm": 0.4352227747440338, |
|
"learning_rate": 4.108971419080698e-05, |
|
"loss": 0.305, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.9898648648648649, |
|
"grad_norm": 0.4946969151496887, |
|
"learning_rate": 4.111446033923516e-05, |
|
"loss": 0.3027, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.9932432432432432, |
|
"grad_norm": 0.463313490152359, |
|
"learning_rate": 4.113912217353596e-05, |
|
"loss": 0.3148, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.9966216216216216, |
|
"grad_norm": 0.42501333355903625, |
|
"learning_rate": 4.116370026630272e-05, |
|
"loss": 0.303, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4174898862838745, |
|
"learning_rate": 4.118819518431564e-05, |
|
"loss": 0.2768, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.0033783783783783, |
|
"grad_norm": 0.5591414570808411, |
|
"learning_rate": 4.121260748862021e-05, |
|
"loss": 0.2319, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.0067567567567568, |
|
"grad_norm": 0.4628024101257324, |
|
"learning_rate": 4.123693773460426e-05, |
|
"loss": 0.2332, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.010135135135135, |
|
"grad_norm": 0.9493624567985535, |
|
"learning_rate": 4.126118647207383e-05, |
|
"loss": 0.2113, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.0135135135135136, |
|
"grad_norm": 0.6151244044303894, |
|
"learning_rate": 4.1285354245327715e-05, |
|
"loss": 0.2428, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0168918918918919, |
|
"grad_norm": 0.4934402406215668, |
|
"learning_rate": 4.1309441593230726e-05, |
|
"loss": 0.2394, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.0202702702702702, |
|
"grad_norm": 0.5464348793029785, |
|
"learning_rate": 4.133344904928585e-05, |
|
"loss": 0.2584, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.0236486486486487, |
|
"grad_norm": 0.754717230796814, |
|
"learning_rate": 4.1357377141705084e-05, |
|
"loss": 0.2191, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.027027027027027, |
|
"grad_norm": 0.48623141646385193, |
|
"learning_rate": 4.1381226393479236e-05, |
|
"loss": 0.2185, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.0304054054054055, |
|
"grad_norm": 32.5340461730957, |
|
"learning_rate": 4.1404997322446435e-05, |
|
"loss": 0.2328, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0337837837837838, |
|
"grad_norm": 0.7727927565574646, |
|
"learning_rate": 4.142869044135967e-05, |
|
"loss": 0.2197, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.037162162162162, |
|
"grad_norm": 0.47851279377937317, |
|
"learning_rate": 4.145230625795311e-05, |
|
"loss": 0.2093, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.0405405405405406, |
|
"grad_norm": 0.684687077999115, |
|
"learning_rate": 4.14758452750074e-05, |
|
"loss": 0.2145, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.0439189189189189, |
|
"grad_norm": 0.4834594428539276, |
|
"learning_rate": 4.149930799041392e-05, |
|
"loss": 0.222, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.0472972972972974, |
|
"grad_norm": 0.5479368567466736, |
|
"learning_rate": 4.152269489723788e-05, |
|
"loss": 0.2245, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0506756756756757, |
|
"grad_norm": 0.48793625831604004, |
|
"learning_rate": 4.1546006483780626e-05, |
|
"loss": 0.2428, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.054054054054054, |
|
"grad_norm": 0.5712085962295532, |
|
"learning_rate": 4.156924323364072e-05, |
|
"loss": 0.2311, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.0574324324324325, |
|
"grad_norm": 0.46200135350227356, |
|
"learning_rate": 4.1592405625774144e-05, |
|
"loss": 0.2315, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.0608108108108107, |
|
"grad_norm": 0.5134051442146301, |
|
"learning_rate": 4.161549413455358e-05, |
|
"loss": 0.2205, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.0641891891891893, |
|
"grad_norm": 0.4155457019805908, |
|
"learning_rate": 4.163850922982668e-05, |
|
"loss": 0.1969, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0675675675675675, |
|
"grad_norm": 0.47662997245788574, |
|
"learning_rate": 4.16614513769734e-05, |
|
"loss": 0.2183, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.070945945945946, |
|
"grad_norm": 0.4776044189929962, |
|
"learning_rate": 4.1684321036962526e-05, |
|
"loss": 0.2581, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.0743243243243243, |
|
"grad_norm": 0.5491393804550171, |
|
"learning_rate": 4.170711866640721e-05, |
|
"loss": 0.2454, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.0777027027027026, |
|
"grad_norm": 0.4745365381240845, |
|
"learning_rate": 4.1729844717619684e-05, |
|
"loss": 0.2303, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 0.49116480350494385, |
|
"learning_rate": 4.17524996386651e-05, |
|
"loss": 0.2205, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.0844594594594594, |
|
"grad_norm": 0.47651857137680054, |
|
"learning_rate": 4.177508387341454e-05, |
|
"loss": 0.2538, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.087837837837838, |
|
"grad_norm": 24.205638885498047, |
|
"learning_rate": 4.179759786159719e-05, |
|
"loss": 0.5427, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.0912162162162162, |
|
"grad_norm": 0.7813121676445007, |
|
"learning_rate": 4.182004203885172e-05, |
|
"loss": 0.2445, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.0945945945945945, |
|
"grad_norm": 52.29485321044922, |
|
"learning_rate": 4.184241683677687e-05, |
|
"loss": 0.2577, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.097972972972973, |
|
"grad_norm": 0.8282439112663269, |
|
"learning_rate": 4.1864722682981245e-05, |
|
"loss": 0.2421, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.1013513513513513, |
|
"grad_norm": 0.4729321300983429, |
|
"learning_rate": 4.188696000113232e-05, |
|
"loss": 0.2122, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.1047297297297298, |
|
"grad_norm": 0.6732134819030762, |
|
"learning_rate": 4.190912921100477e-05, |
|
"loss": 0.2295, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.1081081081081081, |
|
"grad_norm": 0.567088782787323, |
|
"learning_rate": 4.1931230728527994e-05, |
|
"loss": 0.2352, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.1114864864864864, |
|
"grad_norm": 8.066313743591309, |
|
"learning_rate": 4.195326496583291e-05, |
|
"loss": 0.7099, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.114864864864865, |
|
"grad_norm": 1.0606003999710083, |
|
"learning_rate": 4.1975232331298125e-05, |
|
"loss": 0.2105, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1182432432432432, |
|
"grad_norm": 0.4454852342605591, |
|
"learning_rate": 4.1997133229595316e-05, |
|
"loss": 0.2063, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.1216216216216217, |
|
"grad_norm": 0.873603880405426, |
|
"learning_rate": 4.201896806173394e-05, |
|
"loss": 0.2447, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.125, |
|
"grad_norm": 0.6670548319816589, |
|
"learning_rate": 4.2040737225105335e-05, |
|
"loss": 0.2298, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.1283783783783783, |
|
"grad_norm": 0.5834859609603882, |
|
"learning_rate": 4.206244111352608e-05, |
|
"loss": 0.1799, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.1317567567567568, |
|
"grad_norm": 0.8503464460372925, |
|
"learning_rate": 4.2084080117280756e-05, |
|
"loss": 0.2254, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.135135135135135, |
|
"grad_norm": 0.5728862285614014, |
|
"learning_rate": 4.210565462316407e-05, |
|
"loss": 0.2342, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.1385135135135136, |
|
"grad_norm": 0.7374505400657654, |
|
"learning_rate": 4.2127165014522315e-05, |
|
"loss": 0.2384, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.1418918918918919, |
|
"grad_norm": 0.5631051659584045, |
|
"learning_rate": 4.214861167129425e-05, |
|
"loss": 0.2213, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.1452702702702702, |
|
"grad_norm": 26.428640365600586, |
|
"learning_rate": 4.2169994970051365e-05, |
|
"loss": 0.8302, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.1486486486486487, |
|
"grad_norm": 1.1665905714035034, |
|
"learning_rate": 4.219131528403759e-05, |
|
"loss": 0.2568, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.152027027027027, |
|
"grad_norm": 0.6159250140190125, |
|
"learning_rate": 4.22125729832083e-05, |
|
"loss": 0.2574, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.1554054054054055, |
|
"grad_norm": 182.47439575195312, |
|
"learning_rate": 4.2233768434268914e-05, |
|
"loss": 0.8118, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.1587837837837838, |
|
"grad_norm": 1.685766339302063, |
|
"learning_rate": 4.225490200071284e-05, |
|
"loss": 0.2347, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.1621621621621623, |
|
"grad_norm": 0.9619600176811218, |
|
"learning_rate": 4.227597404285883e-05, |
|
"loss": 0.2453, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.1655405405405406, |
|
"grad_norm": 0.49926599860191345, |
|
"learning_rate": 4.229698491788791e-05, |
|
"loss": 0.2313, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1689189189189189, |
|
"grad_norm": 0.9699143767356873, |
|
"learning_rate": 4.231793497987961e-05, |
|
"loss": 0.256, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.1722972972972974, |
|
"grad_norm": 0.7428460121154785, |
|
"learning_rate": 4.2338824579847904e-05, |
|
"loss": 0.2655, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.1756756756756757, |
|
"grad_norm": 0.6480849385261536, |
|
"learning_rate": 4.235965406577636e-05, |
|
"loss": 0.2385, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.179054054054054, |
|
"grad_norm": 0.6347863078117371, |
|
"learning_rate": 4.2380423782653e-05, |
|
"loss": 0.2196, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.1824324324324325, |
|
"grad_norm": 0.6471324563026428, |
|
"learning_rate": 4.240113407250459e-05, |
|
"loss": 0.2337, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.1858108108108107, |
|
"grad_norm": 0.5807657241821289, |
|
"learning_rate": 4.24217852744304e-05, |
|
"loss": 0.2438, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.1891891891891893, |
|
"grad_norm": 0.5200501084327698, |
|
"learning_rate": 4.244237772463552e-05, |
|
"loss": 0.2458, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.1925675675675675, |
|
"grad_norm": 0.5445655584335327, |
|
"learning_rate": 4.246291175646371e-05, |
|
"loss": 0.2391, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.195945945945946, |
|
"grad_norm": 0.6283414363861084, |
|
"learning_rate": 4.24833877004298e-05, |
|
"loss": 0.2299, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.1993243243243243, |
|
"grad_norm": 0.5581656694412231, |
|
"learning_rate": 4.250380588425157e-05, |
|
"loss": 0.2272, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.2027027027027026, |
|
"grad_norm": 0.5711120367050171, |
|
"learning_rate": 4.2524166632881255e-05, |
|
"loss": 0.2737, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.2060810810810811, |
|
"grad_norm": 64.4117660522461, |
|
"learning_rate": 4.254447026853656e-05, |
|
"loss": 0.2648, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.2094594594594594, |
|
"grad_norm": 0.84456467628479, |
|
"learning_rate": 4.2564717110731244e-05, |
|
"loss": 0.2643, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.2128378378378377, |
|
"grad_norm": 1.7171592712402344, |
|
"learning_rate": 4.258490747630532e-05, |
|
"loss": 0.6041, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.2162162162162162, |
|
"grad_norm": 0.6733882427215576, |
|
"learning_rate": 4.260504167945479e-05, |
|
"loss": 0.2226, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.2195945945945945, |
|
"grad_norm": 0.5079653859138489, |
|
"learning_rate": 4.2625120031760965e-05, |
|
"loss": 0.2205, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.222972972972973, |
|
"grad_norm": 0.5646266341209412, |
|
"learning_rate": 4.264514284221944e-05, |
|
"loss": 0.2272, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.2263513513513513, |
|
"grad_norm": 0.5336301922798157, |
|
"learning_rate": 4.266511041726854e-05, |
|
"loss": 0.2536, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.2297297297297298, |
|
"grad_norm": 0.5218775272369385, |
|
"learning_rate": 4.26850230608176e-05, |
|
"loss": 0.2582, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.2331081081081081, |
|
"grad_norm": 0.47051167488098145, |
|
"learning_rate": 4.2704881074274584e-05, |
|
"loss": 0.2288, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.2364864864864864, |
|
"grad_norm": 0.4820377826690674, |
|
"learning_rate": 4.272468475657351e-05, |
|
"loss": 0.2396, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.239864864864865, |
|
"grad_norm": 1219.8048095703125, |
|
"learning_rate": 4.2744434404201497e-05, |
|
"loss": 1.9213, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.2432432432432432, |
|
"grad_norm": 1.3034559488296509, |
|
"learning_rate": 4.27641303112253e-05, |
|
"loss": 0.2528, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.2466216216216217, |
|
"grad_norm": 0.6443231105804443, |
|
"learning_rate": 4.278377276931767e-05, |
|
"loss": 0.2235, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.4247282147407532, |
|
"learning_rate": 4.2803362067783256e-05, |
|
"loss": 0.2111, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2533783783783785, |
|
"grad_norm": 0.5400434136390686, |
|
"learning_rate": 4.2822898493584104e-05, |
|
"loss": 0.2256, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.2567567567567568, |
|
"grad_norm": 0.43781599402427673, |
|
"learning_rate": 4.284238233136496e-05, |
|
"loss": 0.2295, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.260135135135135, |
|
"grad_norm": 0.5443878173828125, |
|
"learning_rate": 4.286181386347813e-05, |
|
"loss": 0.2394, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.2635135135135136, |
|
"grad_norm": 0.4616173207759857, |
|
"learning_rate": 4.288119337000801e-05, |
|
"loss": 0.2108, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.2668918918918919, |
|
"grad_norm": 0.4537034332752228, |
|
"learning_rate": 4.2900521128795315e-05, |
|
"loss": 0.2192, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2702702702702702, |
|
"grad_norm": 0.5639699697494507, |
|
"learning_rate": 4.291979741546102e-05, |
|
"loss": 0.2403, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.2736486486486487, |
|
"grad_norm": 0.4133036434650421, |
|
"learning_rate": 4.293902250342989e-05, |
|
"loss": 0.2086, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.277027027027027, |
|
"grad_norm": 0.45922860503196716, |
|
"learning_rate": 4.295819666395376e-05, |
|
"loss": 0.236, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.2804054054054055, |
|
"grad_norm": 0.4423050284385681, |
|
"learning_rate": 4.297732016613454e-05, |
|
"loss": 0.2271, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.2837837837837838, |
|
"grad_norm": 0.4588952362537384, |
|
"learning_rate": 4.299639327694684e-05, |
|
"loss": 0.2356, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.2871621621621623, |
|
"grad_norm": 26.979991912841797, |
|
"learning_rate": 4.3015416261260325e-05, |
|
"loss": 0.2247, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.2905405405405406, |
|
"grad_norm": 0.6023054718971252, |
|
"learning_rate": 4.303438938186182e-05, |
|
"loss": 0.2477, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.2939189189189189, |
|
"grad_norm": 0.48854556679725647, |
|
"learning_rate": 4.305331289947705e-05, |
|
"loss": 0.221, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.2972972972972974, |
|
"grad_norm": 0.5875428915023804, |
|
"learning_rate": 4.3072187072792184e-05, |
|
"loss": 0.2443, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.3006756756756757, |
|
"grad_norm": 0.42068448662757874, |
|
"learning_rate": 4.309101215847502e-05, |
|
"loss": 0.2043, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.304054054054054, |
|
"grad_norm": 0.492660254240036, |
|
"learning_rate": 4.3109788411195924e-05, |
|
"loss": 0.246, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.3074324324324325, |
|
"grad_norm": 0.4522532820701599, |
|
"learning_rate": 4.312851608364853e-05, |
|
"loss": 0.2305, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.3108108108108107, |
|
"grad_norm": 0.4410349130630493, |
|
"learning_rate": 4.314719542657013e-05, |
|
"loss": 0.2146, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.3141891891891893, |
|
"grad_norm": 38.59849548339844, |
|
"learning_rate": 4.3165826688761796e-05, |
|
"loss": 1.1445, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.3175675675675675, |
|
"grad_norm": 0.6608863472938538, |
|
"learning_rate": 4.318441011710833e-05, |
|
"loss": 0.2591, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.320945945945946, |
|
"grad_norm": 0.7261571884155273, |
|
"learning_rate": 4.3202945956597786e-05, |
|
"loss": 0.2409, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.3243243243243243, |
|
"grad_norm": 13.75738525390625, |
|
"learning_rate": 4.3221434450340956e-05, |
|
"loss": 1.022, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.3277027027027026, |
|
"grad_norm": 1.4299050569534302, |
|
"learning_rate": 4.323987583959045e-05, |
|
"loss": 0.2287, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.3310810810810811, |
|
"grad_norm": 0.6660119295120239, |
|
"learning_rate": 4.325827036375957e-05, |
|
"loss": 0.2148, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.3344594594594594, |
|
"grad_norm": 3.13312029838562, |
|
"learning_rate": 4.327661826044101e-05, |
|
"loss": 0.6968, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.3378378378378377, |
|
"grad_norm": 29.60978126525879, |
|
"learning_rate": 4.329491976542521e-05, |
|
"loss": 0.6915, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.3412162162162162, |
|
"grad_norm": 0.8736880421638489, |
|
"learning_rate": 4.331317511271859e-05, |
|
"loss": 0.2388, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.3445945945945945, |
|
"grad_norm": 0.5873726606369019, |
|
"learning_rate": 4.333138453456147e-05, |
|
"loss": 0.2655, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.347972972972973, |
|
"grad_norm": 2.68512225151062, |
|
"learning_rate": 4.334954826144581e-05, |
|
"loss": 0.6129, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 0.8919135928153992, |
|
"learning_rate": 4.336766652213271e-05, |
|
"loss": 0.2492, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3547297297297298, |
|
"grad_norm": 2.113346576690674, |
|
"learning_rate": 4.338573954366971e-05, |
|
"loss": 0.2578, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.3581081081081081, |
|
"grad_norm": 2.666167736053467, |
|
"learning_rate": 4.340376755140784e-05, |
|
"loss": 0.6334, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.3614864864864864, |
|
"grad_norm": 0.8454524278640747, |
|
"learning_rate": 4.342175076901849e-05, |
|
"loss": 0.2426, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.364864864864865, |
|
"grad_norm": 0.7469472885131836, |
|
"learning_rate": 4.343968941851009e-05, |
|
"loss": 0.2715, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.3682432432432432, |
|
"grad_norm": 0.6965801119804382, |
|
"learning_rate": 4.345758372024448e-05, |
|
"loss": 0.2462, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.3716216216216215, |
|
"grad_norm": 1.5828707218170166, |
|
"learning_rate": 4.347543389295324e-05, |
|
"loss": 0.5967, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.375, |
|
"grad_norm": 0.5627582669258118, |
|
"learning_rate": 4.3493240153753666e-05, |
|
"loss": 0.226, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.3783783783783785, |
|
"grad_norm": 0.5101889967918396, |
|
"learning_rate": 4.3511002718164666e-05, |
|
"loss": 0.2266, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.3817567567567568, |
|
"grad_norm": 0.6242619156837463, |
|
"learning_rate": 4.352872180012237e-05, |
|
"loss": 0.2028, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.385135135135135, |
|
"grad_norm": 0.5768154263496399, |
|
"learning_rate": 4.35463976119956e-05, |
|
"loss": 0.2291, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3885135135135136, |
|
"grad_norm": 0.5170788764953613, |
|
"learning_rate": 4.356403036460115e-05, |
|
"loss": 0.2124, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.3918918918918919, |
|
"grad_norm": 0.530846357345581, |
|
"learning_rate": 4.3581620267218916e-05, |
|
"loss": 0.2013, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.3952702702702702, |
|
"grad_norm": 0.5346778631210327, |
|
"learning_rate": 4.359916752760669e-05, |
|
"loss": 0.2439, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.3986486486486487, |
|
"grad_norm": 0.5345392227172852, |
|
"learning_rate": 4.361667235201499e-05, |
|
"loss": 0.2321, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.402027027027027, |
|
"grad_norm": 0.5348331928253174, |
|
"learning_rate": 4.363413494520154e-05, |
|
"loss": 0.2418, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.4054054054054055, |
|
"grad_norm": 0.9726234078407288, |
|
"learning_rate": 4.365155551044572e-05, |
|
"loss": 0.213, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.4087837837837838, |
|
"grad_norm": 2.116424083709717, |
|
"learning_rate": 4.366893424956263e-05, |
|
"loss": 0.5716, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.4121621621621623, |
|
"grad_norm": 0.6004624962806702, |
|
"learning_rate": 4.368627136291726e-05, |
|
"loss": 0.2404, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.4155405405405406, |
|
"grad_norm": 0.44696101546287537, |
|
"learning_rate": 4.370356704943825e-05, |
|
"loss": 0.2349, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.4189189189189189, |
|
"grad_norm": 0.4704833924770355, |
|
"learning_rate": 4.372082150663168e-05, |
|
"loss": 0.2365, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4222972972972974, |
|
"grad_norm": 0.48295876383781433, |
|
"learning_rate": 4.3738034930594475e-05, |
|
"loss": 0.2256, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.4256756756756757, |
|
"grad_norm": 0.4726882576942444, |
|
"learning_rate": 4.3755207516027904e-05, |
|
"loss": 0.2283, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.429054054054054, |
|
"grad_norm": 0.5251411199569702, |
|
"learning_rate": 4.377233945625071e-05, |
|
"loss": 0.2374, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.4324324324324325, |
|
"grad_norm": 94.52925109863281, |
|
"learning_rate": 4.378943094321221e-05, |
|
"loss": 0.4887, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.4358108108108107, |
|
"grad_norm": 0.5848643183708191, |
|
"learning_rate": 4.3806482167505196e-05, |
|
"loss": 0.2373, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.4391891891891893, |
|
"grad_norm": 0.870020866394043, |
|
"learning_rate": 4.382349331837866e-05, |
|
"loss": 0.5312, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.4425675675675675, |
|
"grad_norm": 0.58022540807724, |
|
"learning_rate": 4.3840464583750404e-05, |
|
"loss": 0.2347, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.445945945945946, |
|
"grad_norm": 0.4780952036380768, |
|
"learning_rate": 4.385739615021954e-05, |
|
"loss": 0.2392, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.4493243243243243, |
|
"grad_norm": 0.4691276252269745, |
|
"learning_rate": 4.387428820307874e-05, |
|
"loss": 0.2346, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.4527027027027026, |
|
"grad_norm": 0.4635365903377533, |
|
"learning_rate": 4.3891140926326446e-05, |
|
"loss": 0.2242, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4560810810810811, |
|
"grad_norm": 0.7928866147994995, |
|
"learning_rate": 4.390795450267886e-05, |
|
"loss": 0.2267, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.4594594594594594, |
|
"grad_norm": 0.5844922661781311, |
|
"learning_rate": 4.3924729113581876e-05, |
|
"loss": 0.237, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.4628378378378377, |
|
"grad_norm": 0.4539477229118347, |
|
"learning_rate": 4.394146493922276e-05, |
|
"loss": 0.2624, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.4662162162162162, |
|
"grad_norm": 0.4881402254104614, |
|
"learning_rate": 4.395816215854185e-05, |
|
"loss": 0.2076, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.4695945945945945, |
|
"grad_norm": 0.42912018299102783, |
|
"learning_rate": 4.397482094924396e-05, |
|
"loss": 0.224, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.472972972972973, |
|
"grad_norm": 8.578511238098145, |
|
"learning_rate": 4.399144148780977e-05, |
|
"loss": 0.5361, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.4763513513513513, |
|
"grad_norm": 0.4810888469219208, |
|
"learning_rate": 4.400802394950703e-05, |
|
"loss": 0.2274, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.4797297297297298, |
|
"grad_norm": 0.5107089281082153, |
|
"learning_rate": 4.402456850840166e-05, |
|
"loss": 0.2295, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.4831081081081081, |
|
"grad_norm": 0.4564654529094696, |
|
"learning_rate": 4.4041075337368695e-05, |
|
"loss": 0.2447, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.4864864864864864, |
|
"grad_norm": 1.4533860683441162, |
|
"learning_rate": 4.405754460810312e-05, |
|
"loss": 0.2522, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.489864864864865, |
|
"grad_norm": 2.172696828842163, |
|
"learning_rate": 4.407397649113065e-05, |
|
"loss": 0.8104, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.4932432432432432, |
|
"grad_norm": 0.4864687919616699, |
|
"learning_rate": 4.40903711558182e-05, |
|
"loss": 0.2657, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.4966216216216215, |
|
"grad_norm": 0.4211529791355133, |
|
"learning_rate": 4.41067287703845e-05, |
|
"loss": 0.246, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.5068204998970032, |
|
"learning_rate": 4.412304950191033e-05, |
|
"loss": 0.219, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.5033783783783785, |
|
"grad_norm": 0.52825927734375, |
|
"learning_rate": 4.413933351634886e-05, |
|
"loss": 0.2293, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.5067567567567568, |
|
"grad_norm": 0.45864129066467285, |
|
"learning_rate": 4.4155580978535707e-05, |
|
"loss": 0.218, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.510135135135135, |
|
"grad_norm": 1.0356566905975342, |
|
"learning_rate": 4.417179205219895e-05, |
|
"loss": 0.2278, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.5135135135135136, |
|
"grad_norm": 0.4032718241214752, |
|
"learning_rate": 4.418796689996907e-05, |
|
"loss": 0.2276, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.5168918918918919, |
|
"grad_norm": 0.4162694811820984, |
|
"learning_rate": 4.420410568338872e-05, |
|
"loss": 0.23, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.5202702702702702, |
|
"grad_norm": 0.4550395607948303, |
|
"learning_rate": 4.42202085629224e-05, |
|
"loss": 0.2445, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5236486486486487, |
|
"grad_norm": 0.5222985148429871, |
|
"learning_rate": 4.423627569796601e-05, |
|
"loss": 0.2405, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.527027027027027, |
|
"grad_norm": 0.4746388792991638, |
|
"learning_rate": 4.425230724685638e-05, |
|
"loss": 0.2299, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.5304054054054053, |
|
"grad_norm": 0.47175803780555725, |
|
"learning_rate": 4.4268303366880536e-05, |
|
"loss": 0.234, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.5337837837837838, |
|
"grad_norm": 0.46304166316986084, |
|
"learning_rate": 4.428426421428507e-05, |
|
"loss": 0.2165, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.5371621621621623, |
|
"grad_norm": 0.8317951560020447, |
|
"learning_rate": 4.430018994428521e-05, |
|
"loss": 0.2648, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5405405405405406, |
|
"grad_norm": 0.4764087498188019, |
|
"learning_rate": 4.431608071107392e-05, |
|
"loss": 0.2526, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.5439189189189189, |
|
"grad_norm": 0.39065808057785034, |
|
"learning_rate": 4.433193666783084e-05, |
|
"loss": 0.2428, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.5472972972972974, |
|
"grad_norm": 0.40975654125213623, |
|
"learning_rate": 4.4347757966731156e-05, |
|
"loss": 0.2437, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.5506756756756757, |
|
"grad_norm": 0.4318806231021881, |
|
"learning_rate": 4.436354475895436e-05, |
|
"loss": 0.233, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.554054054054054, |
|
"grad_norm": 0.4574219882488251, |
|
"learning_rate": 4.437929719469291e-05, |
|
"loss": 0.2441, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5574324324324325, |
|
"grad_norm": 0.39155369997024536, |
|
"learning_rate": 4.4395015423160807e-05, |
|
"loss": 0.2233, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.560810810810811, |
|
"grad_norm": 0.4589376747608185, |
|
"learning_rate": 4.4410699592602094e-05, |
|
"loss": 0.225, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.564189189189189, |
|
"grad_norm": 0.8952370882034302, |
|
"learning_rate": 4.442634985029922e-05, |
|
"loss": 0.4926, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.5675675675675675, |
|
"grad_norm": 0.43023157119750977, |
|
"learning_rate": 4.444196634258136e-05, |
|
"loss": 0.2333, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.570945945945946, |
|
"grad_norm": 1.1993354558944702, |
|
"learning_rate": 4.4457549214832566e-05, |
|
"loss": 0.2457, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.5743243243243243, |
|
"grad_norm": 0.3993515074253082, |
|
"learning_rate": 4.44730986115e-05, |
|
"loss": 0.2457, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.5777027027027026, |
|
"grad_norm": 0.4565802812576294, |
|
"learning_rate": 4.448861467610187e-05, |
|
"loss": 0.2335, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.5810810810810811, |
|
"grad_norm": 0.4085776209831238, |
|
"learning_rate": 4.4504097551235406e-05, |
|
"loss": 0.237, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.5844594594594594, |
|
"grad_norm": 0.4212399125099182, |
|
"learning_rate": 4.4519547378584725e-05, |
|
"loss": 0.24, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.5878378378378377, |
|
"grad_norm": 0.408805251121521, |
|
"learning_rate": 4.453496429892863e-05, |
|
"loss": 0.2252, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.5912162162162162, |
|
"grad_norm": 0.42911630868911743, |
|
"learning_rate": 4.455034845214827e-05, |
|
"loss": 0.2099, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.5945945945945947, |
|
"grad_norm": 0.47564277052879333, |
|
"learning_rate": 4.4565699977234796e-05, |
|
"loss": 0.2135, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.597972972972973, |
|
"grad_norm": 0.3962784707546234, |
|
"learning_rate": 4.458101901229686e-05, |
|
"loss": 0.2346, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.6013513513513513, |
|
"grad_norm": 0.3911672532558441, |
|
"learning_rate": 4.459630569456809e-05, |
|
"loss": 0.2198, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.6047297297297298, |
|
"grad_norm": 1093.1121826171875, |
|
"learning_rate": 4.461156016041444e-05, |
|
"loss": 0.2989, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.6081081081081081, |
|
"grad_norm": 3.732886552810669, |
|
"learning_rate": 4.462678254534156e-05, |
|
"loss": 0.5155, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.6114864864864864, |
|
"grad_norm": 0.5420750975608826, |
|
"learning_rate": 4.464197298400191e-05, |
|
"loss": 0.2645, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.614864864864865, |
|
"grad_norm": 4.157422065734863, |
|
"learning_rate": 4.4657131610201994e-05, |
|
"loss": 0.5192, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.6182432432432432, |
|
"grad_norm": 0.4917280972003937, |
|
"learning_rate": 4.467225855690939e-05, |
|
"loss": 0.2187, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 0.4480583071708679, |
|
"learning_rate": 4.468735395625979e-05, |
|
"loss": 0.2214, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.625, |
|
"grad_norm": 0.4467374384403229, |
|
"learning_rate": 4.470241793956387e-05, |
|
"loss": 0.2414, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.6283783783783785, |
|
"grad_norm": 0.48159006237983704, |
|
"learning_rate": 4.471745063731416e-05, |
|
"loss": 0.2386, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.6317567567567568, |
|
"grad_norm": 0.3858594000339508, |
|
"learning_rate": 4.473245217919187e-05, |
|
"loss": 0.1981, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.635135135135135, |
|
"grad_norm": 0.4672364592552185, |
|
"learning_rate": 4.474742269407355e-05, |
|
"loss": 0.2212, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.6385135135135136, |
|
"grad_norm": 0.5253187417984009, |
|
"learning_rate": 4.476236231003773e-05, |
|
"loss": 0.2753, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6418918918918919, |
|
"grad_norm": 0.488766610622406, |
|
"learning_rate": 4.477727115437156e-05, |
|
"loss": 0.2571, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.6452702702702702, |
|
"grad_norm": 0.4526199996471405, |
|
"learning_rate": 4.479214935357724e-05, |
|
"loss": 0.2341, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.6486486486486487, |
|
"grad_norm": 0.4283704459667206, |
|
"learning_rate": 4.480699703337852e-05, |
|
"loss": 0.2533, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.652027027027027, |
|
"grad_norm": 0.45583993196487427, |
|
"learning_rate": 4.4821814318727016e-05, |
|
"loss": 0.2394, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.6554054054054053, |
|
"grad_norm": 0.3872677683830261, |
|
"learning_rate": 4.483660133380856e-05, |
|
"loss": 0.2005, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.6587837837837838, |
|
"grad_norm": 0.5054983496665955, |
|
"learning_rate": 4.485135820204948e-05, |
|
"loss": 0.231, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.6621621621621623, |
|
"grad_norm": 0.40404146909713745, |
|
"learning_rate": 4.486608504612267e-05, |
|
"loss": 0.2529, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.6655405405405406, |
|
"grad_norm": 0.4361649453639984, |
|
"learning_rate": 4.488078198795383e-05, |
|
"loss": 0.2485, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.6689189189189189, |
|
"grad_norm": 0.4425160586833954, |
|
"learning_rate": 4.489544914872745e-05, |
|
"loss": 0.2375, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.6722972972972974, |
|
"grad_norm": 0.4194391965866089, |
|
"learning_rate": 4.4910086648892815e-05, |
|
"loss": 0.2461, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.6756756756756757, |
|
"grad_norm": 0.42155027389526367, |
|
"learning_rate": 4.4924694608169965e-05, |
|
"loss": 0.2301, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.679054054054054, |
|
"grad_norm": 0.412643164396286, |
|
"learning_rate": 4.4939273145555536e-05, |
|
"loss": 0.235, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.6824324324324325, |
|
"grad_norm": 0.3827148973941803, |
|
"learning_rate": 4.495382237932863e-05, |
|
"loss": 0.23, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.685810810810811, |
|
"grad_norm": 0.4173565208911896, |
|
"learning_rate": 4.4968342427056505e-05, |
|
"loss": 0.2374, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.689189189189189, |
|
"grad_norm": 1055.430908203125, |
|
"learning_rate": 4.498283340560031e-05, |
|
"loss": 0.3647, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6925675675675675, |
|
"grad_norm": 0.5496036410331726, |
|
"learning_rate": 4.499729543112076e-05, |
|
"loss": 0.2686, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.695945945945946, |
|
"grad_norm": 0.437640905380249, |
|
"learning_rate": 4.501172861908366e-05, |
|
"loss": 0.2521, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.6993243243243243, |
|
"grad_norm": 0.4216068685054779, |
|
"learning_rate": 4.502613308426546e-05, |
|
"loss": 0.2447, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.7027027027027026, |
|
"grad_norm": 0.4138765037059784, |
|
"learning_rate": 4.504050894075876e-05, |
|
"loss": 0.2379, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.7060810810810811, |
|
"grad_norm": 0.8677256107330322, |
|
"learning_rate": 4.5054856301977696e-05, |
|
"loss": 0.2399, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.7094594594594594, |
|
"grad_norm": 0.4616876542568207, |
|
"learning_rate": 4.506917528066332e-05, |
|
"loss": 0.229, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.7128378378378377, |
|
"grad_norm": 0.6452389359474182, |
|
"learning_rate": 4.508346598888894e-05, |
|
"loss": 0.2237, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.7162162162162162, |
|
"grad_norm": 0.4205520749092102, |
|
"learning_rate": 4.509772853806532e-05, |
|
"loss": 0.2175, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.7195945945945947, |
|
"grad_norm": 0.456882506608963, |
|
"learning_rate": 4.511196303894598e-05, |
|
"loss": 0.2444, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.722972972972973, |
|
"grad_norm": 0.43970227241516113, |
|
"learning_rate": 4.512616960163227e-05, |
|
"loss": 0.2357, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.7263513513513513, |
|
"grad_norm": 0.4025394320487976, |
|
"learning_rate": 4.5140348335578547e-05, |
|
"loss": 0.2375, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.7297297297297298, |
|
"grad_norm": 0.39442214369773865, |
|
"learning_rate": 4.515449934959718e-05, |
|
"loss": 0.2444, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.7331081081081081, |
|
"grad_norm": 0.44028565287590027, |
|
"learning_rate": 4.516862275186361e-05, |
|
"loss": 0.2324, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.7364864864864864, |
|
"grad_norm": 0.4466850161552429, |
|
"learning_rate": 4.518271864992127e-05, |
|
"loss": 0.2359, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.739864864864865, |
|
"grad_norm": 0.4365015923976898, |
|
"learning_rate": 4.519678715068652e-05, |
|
"loss": 0.2708, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.7432432432432432, |
|
"grad_norm": 0.4398203194141388, |
|
"learning_rate": 4.521082836045353e-05, |
|
"loss": 0.2452, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.7466216216216215, |
|
"grad_norm": 0.44056662917137146, |
|
"learning_rate": 4.5224842384899045e-05, |
|
"loss": 0.2344, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.43141037225723267, |
|
"learning_rate": 4.523882932908722e-05, |
|
"loss": 0.2511, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.7533783783783785, |
|
"grad_norm": 0.3922167122364044, |
|
"learning_rate": 4.52527892974743e-05, |
|
"loss": 0.2162, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.7567567567567568, |
|
"grad_norm": 0.40995466709136963, |
|
"learning_rate": 4.526672239391333e-05, |
|
"loss": 0.2281, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.760135135135135, |
|
"grad_norm": 0.4008403718471527, |
|
"learning_rate": 4.528062872165875e-05, |
|
"loss": 0.2257, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.7635135135135136, |
|
"grad_norm": 0.5048766136169434, |
|
"learning_rate": 4.529450838337104e-05, |
|
"loss": 0.2304, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.7668918918918919, |
|
"grad_norm": 0.40119826793670654, |
|
"learning_rate": 4.530836148112124e-05, |
|
"loss": 0.2341, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.7702702702702702, |
|
"grad_norm": 0.41344285011291504, |
|
"learning_rate": 4.532218811639545e-05, |
|
"loss": 0.272, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.7736486486486487, |
|
"grad_norm": 0.38902515172958374, |
|
"learning_rate": 4.5335988390099284e-05, |
|
"loss": 0.2195, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.777027027027027, |
|
"grad_norm": 0.5754374861717224, |
|
"learning_rate": 4.534976240256232e-05, |
|
"loss": 0.2335, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.7804054054054053, |
|
"grad_norm": 31.7464656829834, |
|
"learning_rate": 4.536351025354245e-05, |
|
"loss": 0.4532, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.7837837837837838, |
|
"grad_norm": 0.46500369906425476, |
|
"learning_rate": 4.537723204223021e-05, |
|
"loss": 0.2541, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.7871621621621623, |
|
"grad_norm": 0.41807517409324646, |
|
"learning_rate": 4.53909278672531e-05, |
|
"loss": 0.2287, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.7905405405405406, |
|
"grad_norm": 0.40613386034965515, |
|
"learning_rate": 4.5404597826679824e-05, |
|
"loss": 0.2196, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.7939189189189189, |
|
"grad_norm": 0.4200097322463989, |
|
"learning_rate": 4.541824201802449e-05, |
|
"loss": 0.2393, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.7972972972972974, |
|
"grad_norm": 0.48739388585090637, |
|
"learning_rate": 4.543186053825081e-05, |
|
"loss": 0.2469, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.8006756756756757, |
|
"grad_norm": 0.5199280381202698, |
|
"learning_rate": 4.544545348377621e-05, |
|
"loss": 0.2652, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.804054054054054, |
|
"grad_norm": 0.44481709599494934, |
|
"learning_rate": 4.5459020950475946e-05, |
|
"loss": 0.2386, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.8074324324324325, |
|
"grad_norm": 1.6978758573532104, |
|
"learning_rate": 4.5472563033687145e-05, |
|
"loss": 0.5287, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.810810810810811, |
|
"grad_norm": 0.3990303575992584, |
|
"learning_rate": 4.548607982821284e-05, |
|
"loss": 0.2102, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.814189189189189, |
|
"grad_norm": 0.4208585321903229, |
|
"learning_rate": 4.5499571428325935e-05, |
|
"loss": 0.2058, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.8175675675675675, |
|
"grad_norm": 0.41925048828125, |
|
"learning_rate": 4.5513037927773155e-05, |
|
"loss": 0.2349, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.820945945945946, |
|
"grad_norm": 0.43288394808769226, |
|
"learning_rate": 4.5526479419778986e-05, |
|
"loss": 0.2519, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.8243243243243243, |
|
"grad_norm": 0.4215989112854004, |
|
"learning_rate": 4.553989599704948e-05, |
|
"loss": 0.2518, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.8277027027027026, |
|
"grad_norm": 0.4134393334388733, |
|
"learning_rate": 4.555328775177616e-05, |
|
"loss": 0.2341, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.8310810810810811, |
|
"grad_norm": 0.4243868887424469, |
|
"learning_rate": 4.5566654775639785e-05, |
|
"loss": 0.2553, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.8344594594594594, |
|
"grad_norm": 0.40011635422706604, |
|
"learning_rate": 4.5579997159814117e-05, |
|
"loss": 0.2447, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.8378378378378377, |
|
"grad_norm": 0.38058069348335266, |
|
"learning_rate": 4.5593314994969665e-05, |
|
"loss": 0.2307, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.8412162162162162, |
|
"grad_norm": 0.47068503499031067, |
|
"learning_rate": 4.560660837127738e-05, |
|
"loss": 0.1971, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.8445945945945947, |
|
"grad_norm": 0.4493515193462372, |
|
"learning_rate": 4.561987737841229e-05, |
|
"loss": 0.2552, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.847972972972973, |
|
"grad_norm": 0.43196865916252136, |
|
"learning_rate": 4.563312210555719e-05, |
|
"loss": 0.2373, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.8513513513513513, |
|
"grad_norm": 0.4201470911502838, |
|
"learning_rate": 4.564634264140616e-05, |
|
"loss": 0.2304, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.8547297297297298, |
|
"grad_norm": 0.3680441975593567, |
|
"learning_rate": 4.56595390741682e-05, |
|
"loss": 0.2263, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.8581081081081081, |
|
"grad_norm": 0.3840002119541168, |
|
"learning_rate": 4.567271149157073e-05, |
|
"loss": 0.218, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8614864864864864, |
|
"grad_norm": 0.39841052889823914, |
|
"learning_rate": 4.5685859980863086e-05, |
|
"loss": 0.2497, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.864864864864865, |
|
"grad_norm": 0.3956766128540039, |
|
"learning_rate": 4.569898462881999e-05, |
|
"loss": 0.2514, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.8682432432432432, |
|
"grad_norm": 0.4630524218082428, |
|
"learning_rate": 4.571208552174497e-05, |
|
"loss": 0.2744, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.8716216216216215, |
|
"grad_norm": 0.4056280851364136, |
|
"learning_rate": 4.572516274547383e-05, |
|
"loss": 0.2712, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.4127073585987091, |
|
"learning_rate": 4.573821638537794e-05, |
|
"loss": 0.2319, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.8783783783783785, |
|
"grad_norm": 0.38253894448280334, |
|
"learning_rate": 4.575124652636763e-05, |
|
"loss": 0.2302, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.8817567567567568, |
|
"grad_norm": 0.405472993850708, |
|
"learning_rate": 4.5764253252895486e-05, |
|
"loss": 0.2413, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.885135135135135, |
|
"grad_norm": 0.41931191086769104, |
|
"learning_rate": 4.577723664895965e-05, |
|
"loss": 0.2477, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.8885135135135136, |
|
"grad_norm": 0.3967287540435791, |
|
"learning_rate": 4.579019679810706e-05, |
|
"loss": 0.2525, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 0.3867218792438507, |
|
"learning_rate": 4.5803133783436676e-05, |
|
"loss": 0.2226, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8952702702702702, |
|
"grad_norm": 0.4048525094985962, |
|
"learning_rate": 4.581604768760269e-05, |
|
"loss": 0.2434, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.8986486486486487, |
|
"grad_norm": 0.38650572299957275, |
|
"learning_rate": 4.582893859281769e-05, |
|
"loss": 0.2287, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.902027027027027, |
|
"grad_norm": 0.3860650062561035, |
|
"learning_rate": 4.584180658085578e-05, |
|
"loss": 0.2285, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.9054054054054053, |
|
"grad_norm": 0.3902740478515625, |
|
"learning_rate": 4.585465173305571e-05, |
|
"loss": 0.2174, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.9087837837837838, |
|
"grad_norm": 0.4053342342376709, |
|
"learning_rate": 4.5867474130323984e-05, |
|
"loss": 0.2482, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.9121621621621623, |
|
"grad_norm": 0.35971924662590027, |
|
"learning_rate": 4.588027385313786e-05, |
|
"loss": 0.1858, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.9155405405405406, |
|
"grad_norm": 0.4442739486694336, |
|
"learning_rate": 4.5893050981548446e-05, |
|
"loss": 0.2847, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.9189189189189189, |
|
"grad_norm": 0.4430129826068878, |
|
"learning_rate": 4.5905805595183656e-05, |
|
"loss": 0.2265, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.9222972972972974, |
|
"grad_norm": 0.39476659893989563, |
|
"learning_rate": 4.591853777325119e-05, |
|
"loss": 0.258, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.9256756756756757, |
|
"grad_norm": 0.4573532044887543, |
|
"learning_rate": 4.593124759454153e-05, |
|
"loss": 0.2398, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.929054054054054, |
|
"grad_norm": 0.3871706426143646, |
|
"learning_rate": 4.5943935137430806e-05, |
|
"loss": 0.2292, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.9324324324324325, |
|
"grad_norm": 0.38287678360939026, |
|
"learning_rate": 4.595660047988374e-05, |
|
"loss": 0.2313, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.935810810810811, |
|
"grad_norm": 0.41778552532196045, |
|
"learning_rate": 4.59692436994565e-05, |
|
"loss": 0.2501, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.939189189189189, |
|
"grad_norm": 0.3646586537361145, |
|
"learning_rate": 4.5981864873299563e-05, |
|
"loss": 0.2099, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.9425675675675675, |
|
"grad_norm": 0.3954225778579712, |
|
"learning_rate": 4.599446407816052e-05, |
|
"loss": 0.2272, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.945945945945946, |
|
"grad_norm": 0.39867308735847473, |
|
"learning_rate": 4.6007041390386874e-05, |
|
"loss": 0.2352, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.9493243243243243, |
|
"grad_norm": 0.3616117238998413, |
|
"learning_rate": 4.601959688592886e-05, |
|
"loss": 0.2207, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.9527027027027026, |
|
"grad_norm": 0.42957910895347595, |
|
"learning_rate": 4.603213064034216e-05, |
|
"loss": 0.2437, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.9560810810810811, |
|
"grad_norm": 0.3899705708026886, |
|
"learning_rate": 4.604464272879061e-05, |
|
"loss": 0.2436, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.9594594594594594, |
|
"grad_norm": 0.3997238278388977, |
|
"learning_rate": 4.605713322604896e-05, |
|
"loss": 0.2065, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9628378378378377, |
|
"grad_norm": 0.4001472592353821, |
|
"learning_rate": 4.606960220650551e-05, |
|
"loss": 0.2435, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.9662162162162162, |
|
"grad_norm": 0.4067547023296356, |
|
"learning_rate": 4.608204974416481e-05, |
|
"loss": 0.2456, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.9695945945945947, |
|
"grad_norm": 0.3919241428375244, |
|
"learning_rate": 4.6094475912650234e-05, |
|
"loss": 0.2214, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.972972972972973, |
|
"grad_norm": 0.3752463757991791, |
|
"learning_rate": 4.610688078520666e-05, |
|
"loss": 0.2148, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.9763513513513513, |
|
"grad_norm": 0.42536041140556335, |
|
"learning_rate": 4.611926443470301e-05, |
|
"loss": 0.2219, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.9797297297297298, |
|
"grad_norm": 0.3676905930042267, |
|
"learning_rate": 4.6131626933634844e-05, |
|
"loss": 0.2153, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.9831081081081081, |
|
"grad_norm": 0.3774767816066742, |
|
"learning_rate": 4.6143968354126914e-05, |
|
"loss": 0.2459, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.9864864864864864, |
|
"grad_norm": 0.38765907287597656, |
|
"learning_rate": 4.6156288767935646e-05, |
|
"loss": 0.2339, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.989864864864865, |
|
"grad_norm": 0.3474208116531372, |
|
"learning_rate": 4.61685882464517e-05, |
|
"loss": 0.2109, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.9932432432432432, |
|
"grad_norm": 0.39578577876091003, |
|
"learning_rate": 4.61808668607024e-05, |
|
"loss": 0.2232, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9966216216216215, |
|
"grad_norm": 0.3756243884563446, |
|
"learning_rate": 4.619312468135426e-05, |
|
"loss": 0.2286, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.40366870164871216, |
|
"learning_rate": 4.620536177871533e-05, |
|
"loss": 0.1979, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 2.0033783783783785, |
|
"grad_norm": 0.5792023539543152, |
|
"learning_rate": 4.621757822273772e-05, |
|
"loss": 0.1664, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 2.0067567567567566, |
|
"grad_norm": 0.48093676567077637, |
|
"learning_rate": 4.62297740830199e-05, |
|
"loss": 0.1489, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.010135135135135, |
|
"grad_norm": 0.7113845348358154, |
|
"learning_rate": 4.6241949428809165e-05, |
|
"loss": 0.1666, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.0135135135135136, |
|
"grad_norm": 0.4888218343257904, |
|
"learning_rate": 4.625410432900395e-05, |
|
"loss": 0.1667, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 2.016891891891892, |
|
"grad_norm": 0.43272024393081665, |
|
"learning_rate": 4.626623885215616e-05, |
|
"loss": 0.1611, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.02027027027027, |
|
"grad_norm": 0.4639625549316406, |
|
"learning_rate": 4.627835306647352e-05, |
|
"loss": 0.183, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 2.0236486486486487, |
|
"grad_norm": 0.4807380735874176, |
|
"learning_rate": 4.629044703982186e-05, |
|
"loss": 0.1592, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 2.027027027027027, |
|
"grad_norm": 0.4514772891998291, |
|
"learning_rate": 4.63025208397274e-05, |
|
"loss": 0.1459, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.0304054054054053, |
|
"grad_norm": 0.4227406978607178, |
|
"learning_rate": 4.6314574533379e-05, |
|
"loss": 0.1632, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 2.0337837837837838, |
|
"grad_norm": 0.42751410603523254, |
|
"learning_rate": 4.632660818763041e-05, |
|
"loss": 0.1699, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 2.0371621621621623, |
|
"grad_norm": 0.45877569913864136, |
|
"learning_rate": 4.633862186900253e-05, |
|
"loss": 0.1725, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.0405405405405403, |
|
"grad_norm": 0.41468122601509094, |
|
"learning_rate": 4.6350615643685535e-05, |
|
"loss": 0.172, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 2.043918918918919, |
|
"grad_norm": 0.42624783515930176, |
|
"learning_rate": 4.6362589577541154e-05, |
|
"loss": 0.1591, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.0472972972972974, |
|
"grad_norm": 0.3854513466358185, |
|
"learning_rate": 4.637454373610477e-05, |
|
"loss": 0.1486, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.050675675675676, |
|
"grad_norm": 0.40797704458236694, |
|
"learning_rate": 4.638647818458763e-05, |
|
"loss": 0.1521, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 2.054054054054054, |
|
"grad_norm": 0.4326710104942322, |
|
"learning_rate": 4.639839298787892e-05, |
|
"loss": 0.1441, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 2.0574324324324325, |
|
"grad_norm": 0.39950937032699585, |
|
"learning_rate": 4.641028821054793e-05, |
|
"loss": 0.165, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.060810810810811, |
|
"grad_norm": 0.38399842381477356, |
|
"learning_rate": 4.6422163916846124e-05, |
|
"loss": 0.1464, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.064189189189189, |
|
"grad_norm": 0.40971097350120544, |
|
"learning_rate": 4.643402017070924e-05, |
|
"loss": 0.1604, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 2.0675675675675675, |
|
"grad_norm": 0.4519285261631012, |
|
"learning_rate": 4.644585703575936e-05, |
|
"loss": 0.1739, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.070945945945946, |
|
"grad_norm": 0.40968844294548035, |
|
"learning_rate": 4.645767457530692e-05, |
|
"loss": 0.1592, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 2.074324324324324, |
|
"grad_norm": 0.4271821975708008, |
|
"learning_rate": 4.64694728523528e-05, |
|
"loss": 0.1591, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 2.0777027027027026, |
|
"grad_norm": 0.44706177711486816, |
|
"learning_rate": 4.648125192959028e-05, |
|
"loss": 0.1745, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.081081081081081, |
|
"grad_norm": 0.3895583152770996, |
|
"learning_rate": 4.649301186940709e-05, |
|
"loss": 0.1541, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 2.0844594594594597, |
|
"grad_norm": 0.5112977623939514, |
|
"learning_rate": 4.650475273388737e-05, |
|
"loss": 0.1851, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 2.0878378378378377, |
|
"grad_norm": 0.4201546609401703, |
|
"learning_rate": 4.651647458481359e-05, |
|
"loss": 0.1502, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.0912162162162162, |
|
"grad_norm": 0.4547381103038788, |
|
"learning_rate": 4.652817748366864e-05, |
|
"loss": 0.1533, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 2.0945945945945947, |
|
"grad_norm": 0.48063594102859497, |
|
"learning_rate": 4.653986149163757e-05, |
|
"loss": 0.1704, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.097972972972973, |
|
"grad_norm": 0.3941943943500519, |
|
"learning_rate": 4.655152666960967e-05, |
|
"loss": 0.1338, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.1013513513513513, |
|
"grad_norm": 0.43712350726127625, |
|
"learning_rate": 4.6563173078180315e-05, |
|
"loss": 0.1573, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 2.10472972972973, |
|
"grad_norm": 0.43442657589912415, |
|
"learning_rate": 4.657480077765283e-05, |
|
"loss": 0.1672, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 2.108108108108108, |
|
"grad_norm": 0.45270854234695435, |
|
"learning_rate": 4.6586409828040405e-05, |
|
"loss": 0.1851, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.1114864864864864, |
|
"grad_norm": 0.5774021744728088, |
|
"learning_rate": 4.659800028906792e-05, |
|
"loss": 0.1451, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.114864864864865, |
|
"grad_norm": 0.4964922368526459, |
|
"learning_rate": 4.660957222017383e-05, |
|
"loss": 0.1656, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 2.1182432432432434, |
|
"grad_norm": 0.417864054441452, |
|
"learning_rate": 4.662112568051194e-05, |
|
"loss": 0.1672, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.1216216216216215, |
|
"grad_norm": 0.5328717231750488, |
|
"learning_rate": 4.663266072895327e-05, |
|
"loss": 0.1761, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 2.125, |
|
"grad_norm": 0.4234524369239807, |
|
"learning_rate": 4.664417742408782e-05, |
|
"loss": 0.1679, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 2.1283783783783785, |
|
"grad_norm": 0.5051260590553284, |
|
"learning_rate": 4.665567582422637e-05, |
|
"loss": 0.1763, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.1317567567567566, |
|
"grad_norm": 0.44952192902565, |
|
"learning_rate": 4.666715598740224e-05, |
|
"loss": 0.1535, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 2.135135135135135, |
|
"grad_norm": 0.44377925992012024, |
|
"learning_rate": 4.667861797137309e-05, |
|
"loss": 0.1456, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 2.1385135135135136, |
|
"grad_norm": 0.3786265254020691, |
|
"learning_rate": 4.669006183362258e-05, |
|
"loss": 0.1478, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.141891891891892, |
|
"grad_norm": 0.4277481436729431, |
|
"learning_rate": 4.670148763136221e-05, |
|
"loss": 0.157, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.14527027027027, |
|
"grad_norm": 0.43035444617271423, |
|
"learning_rate": 4.671289542153293e-05, |
|
"loss": 0.1552, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.1486486486486487, |
|
"grad_norm": 0.45837166905403137, |
|
"learning_rate": 4.672428526080691e-05, |
|
"loss": 0.1848, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.152027027027027, |
|
"grad_norm": 0.41413986682891846, |
|
"learning_rate": 4.673565720558918e-05, |
|
"loss": 0.1803, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 2.1554054054054053, |
|
"grad_norm": 0.4246644377708435, |
|
"learning_rate": 4.6747011312019374e-05, |
|
"loss": 0.177, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 2.1587837837837838, |
|
"grad_norm": 0.4299694001674652, |
|
"learning_rate": 4.6758347635973334e-05, |
|
"loss": 0.164, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 0.4426731765270233, |
|
"learning_rate": 4.676966623306479e-05, |
|
"loss": 0.164, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.1655405405405403, |
|
"grad_norm": 0.4473170340061188, |
|
"learning_rate": 4.678096715864696e-05, |
|
"loss": 0.1642, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 2.168918918918919, |
|
"grad_norm": 0.44445449113845825, |
|
"learning_rate": 4.679225046781422e-05, |
|
"loss": 0.155, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.1722972972972974, |
|
"grad_norm": 0.42584556341171265, |
|
"learning_rate": 4.68035162154037e-05, |
|
"loss": 0.171, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 2.175675675675676, |
|
"grad_norm": 0.39437368512153625, |
|
"learning_rate": 4.681476445599687e-05, |
|
"loss": 0.1692, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 2.179054054054054, |
|
"grad_norm": 0.40476444363594055, |
|
"learning_rate": 4.6825995243921137e-05, |
|
"loss": 0.162, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.1824324324324325, |
|
"grad_norm": 0.42337456345558167, |
|
"learning_rate": 4.683720863325141e-05, |
|
"loss": 0.184, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 2.185810810810811, |
|
"grad_norm": 0.4256432056427002, |
|
"learning_rate": 4.684840467781168e-05, |
|
"loss": 0.175, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 2.189189189189189, |
|
"grad_norm": 0.40138664841651917, |
|
"learning_rate": 4.685958343117656e-05, |
|
"loss": 0.1577, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.1925675675675675, |
|
"grad_norm": 0.4015848636627197, |
|
"learning_rate": 4.6870744946672826e-05, |
|
"loss": 0.1521, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 2.195945945945946, |
|
"grad_norm": 0.4208281934261322, |
|
"learning_rate": 4.688188927738093e-05, |
|
"loss": 0.1685, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.199324324324324, |
|
"grad_norm": 0.4102751910686493, |
|
"learning_rate": 4.689301647613653e-05, |
|
"loss": 0.1837, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.2027027027027026, |
|
"grad_norm": 0.43685656785964966, |
|
"learning_rate": 4.6904126595532014e-05, |
|
"loss": 0.174, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 2.206081081081081, |
|
"grad_norm": 0.392456978559494, |
|
"learning_rate": 4.69152196879179e-05, |
|
"loss": 0.1741, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 2.2094594594594597, |
|
"grad_norm": 0.4086831212043762, |
|
"learning_rate": 4.692629580540446e-05, |
|
"loss": 0.1559, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.2128378378378377, |
|
"grad_norm": 0.38833364844322205, |
|
"learning_rate": 4.693735499986305e-05, |
|
"loss": 0.1618, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.2162162162162162, |
|
"grad_norm": 0.36838966608047485, |
|
"learning_rate": 4.694839732292767e-05, |
|
"loss": 0.1478, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 2.2195945945945947, |
|
"grad_norm": 0.40487056970596313, |
|
"learning_rate": 4.6959422825996345e-05, |
|
"loss": 0.1738, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.222972972972973, |
|
"grad_norm": 0.3815852999687195, |
|
"learning_rate": 4.69704315602326e-05, |
|
"loss": 0.1665, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 2.2263513513513513, |
|
"grad_norm": 0.400643914937973, |
|
"learning_rate": 4.698142357656684e-05, |
|
"loss": 0.1771, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 2.22972972972973, |
|
"grad_norm": 0.3691990077495575, |
|
"learning_rate": 4.6992398925697814e-05, |
|
"loss": 0.1591, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.233108108108108, |
|
"grad_norm": 0.4398171305656433, |
|
"learning_rate": 4.7003357658094e-05, |
|
"loss": 0.2062, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 2.2364864864864864, |
|
"grad_norm": 0.3774857521057129, |
|
"learning_rate": 4.7014299823995005e-05, |
|
"loss": 0.1502, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 2.239864864864865, |
|
"grad_norm": 0.4315558075904846, |
|
"learning_rate": 4.702522547341289e-05, |
|
"loss": 0.1636, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.2432432432432434, |
|
"grad_norm": 0.4040674865245819, |
|
"learning_rate": 4.703613465613363e-05, |
|
"loss": 0.1625, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 2.2466216216216215, |
|
"grad_norm": 0.46972906589508057, |
|
"learning_rate": 4.704702742171841e-05, |
|
"loss": 0.1833, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.41066527366638184, |
|
"learning_rate": 4.7057903819505024e-05, |
|
"loss": 0.1707, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.2533783783783785, |
|
"grad_norm": 0.3562461733818054, |
|
"learning_rate": 4.7068763898609154e-05, |
|
"loss": 0.1508, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 2.2567567567567566, |
|
"grad_norm": 0.4074662923812866, |
|
"learning_rate": 4.707960770792576e-05, |
|
"loss": 0.1786, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 2.260135135135135, |
|
"grad_norm": 0.3978296220302582, |
|
"learning_rate": 4.709043529613039e-05, |
|
"loss": 0.1797, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.2635135135135136, |
|
"grad_norm": 0.42612069845199585, |
|
"learning_rate": 4.710124671168044e-05, |
|
"loss": 0.1874, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.266891891891892, |
|
"grad_norm": 0.4137380123138428, |
|
"learning_rate": 4.711204200281654e-05, |
|
"loss": 0.1897, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 2.27027027027027, |
|
"grad_norm": 0.37814861536026, |
|
"learning_rate": 4.712282121756376e-05, |
|
"loss": 0.1617, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 2.2736486486486487, |
|
"grad_norm": 0.42515134811401367, |
|
"learning_rate": 4.713358440373295e-05, |
|
"loss": 0.1652, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 2.277027027027027, |
|
"grad_norm": 0.42217129468917847, |
|
"learning_rate": 4.7144331608922e-05, |
|
"loss": 0.1872, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 2.2804054054054053, |
|
"grad_norm": 0.3996788263320923, |
|
"learning_rate": 4.715506288051709e-05, |
|
"loss": 0.1603, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.2837837837837838, |
|
"grad_norm": 0.4035404622554779, |
|
"learning_rate": 4.7165778265693935e-05, |
|
"loss": 0.1689, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 2.2871621621621623, |
|
"grad_norm": 0.39791160821914673, |
|
"learning_rate": 4.7176477811419076e-05, |
|
"loss": 0.1804, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 2.2905405405405403, |
|
"grad_norm": 0.4175347685813904, |
|
"learning_rate": 4.718716156445106e-05, |
|
"loss": 0.1811, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 2.293918918918919, |
|
"grad_norm": 0.39992383122444153, |
|
"learning_rate": 4.7197829571341704e-05, |
|
"loss": 0.1805, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 2.2972972972972974, |
|
"grad_norm": 0.40433621406555176, |
|
"learning_rate": 4.720848187843727e-05, |
|
"loss": 0.1726, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.3006756756756754, |
|
"grad_norm": 0.38958102464675903, |
|
"learning_rate": 4.721911853187975e-05, |
|
"loss": 0.1663, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 2.304054054054054, |
|
"grad_norm": 0.40906044840812683, |
|
"learning_rate": 4.722973957760799e-05, |
|
"loss": 0.1637, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 2.3074324324324325, |
|
"grad_norm": 0.4296117424964905, |
|
"learning_rate": 4.724034506135888e-05, |
|
"loss": 0.178, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 2.310810810810811, |
|
"grad_norm": 0.40840208530426025, |
|
"learning_rate": 4.725093502866861e-05, |
|
"loss": 0.1725, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 2.314189189189189, |
|
"grad_norm": 0.3669198751449585, |
|
"learning_rate": 4.7261509524873764e-05, |
|
"loss": 0.161, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.3175675675675675, |
|
"grad_norm": 0.40982183814048767, |
|
"learning_rate": 4.727206859511253e-05, |
|
"loss": 0.1757, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 2.320945945945946, |
|
"grad_norm": 0.4490487575531006, |
|
"learning_rate": 4.7282612284325846e-05, |
|
"loss": 0.1975, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 2.3243243243243246, |
|
"grad_norm": 0.4303133487701416, |
|
"learning_rate": 4.729314063725853e-05, |
|
"loss": 0.1753, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 2.3277027027027026, |
|
"grad_norm": 0.5020397901535034, |
|
"learning_rate": 4.730365369846044e-05, |
|
"loss": 0.1959, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 2.331081081081081, |
|
"grad_norm": 0.4409542679786682, |
|
"learning_rate": 4.7314151512287594e-05, |
|
"loss": 0.1779, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.3344594594594597, |
|
"grad_norm": 0.41255807876586914, |
|
"learning_rate": 4.732463412290331e-05, |
|
"loss": 0.1647, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 2.3378378378378377, |
|
"grad_norm": 0.41047075390815735, |
|
"learning_rate": 4.73351015742793e-05, |
|
"loss": 0.1699, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 2.3412162162162162, |
|
"grad_norm": 0.4248691201210022, |
|
"learning_rate": 4.7345553910196785e-05, |
|
"loss": 0.1785, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 2.3445945945945947, |
|
"grad_norm": 0.41576990485191345, |
|
"learning_rate": 4.735599117424759e-05, |
|
"loss": 0.1806, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 2.347972972972973, |
|
"grad_norm": 0.399854838848114, |
|
"learning_rate": 4.7366413409835235e-05, |
|
"loss": 0.1689, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.3513513513513513, |
|
"grad_norm": 0.39361926913261414, |
|
"learning_rate": 4.737682066017604e-05, |
|
"loss": 0.1757, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 2.35472972972973, |
|
"grad_norm": 0.36152175068855286, |
|
"learning_rate": 4.7387212968300166e-05, |
|
"loss": 0.1421, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 2.358108108108108, |
|
"grad_norm": 0.4112738370895386, |
|
"learning_rate": 4.7397590377052686e-05, |
|
"loss": 0.156, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 2.3614864864864864, |
|
"grad_norm": 1.494449496269226, |
|
"learning_rate": 4.74079529290947e-05, |
|
"loss": 0.433, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 2.364864864864865, |
|
"grad_norm": 0.44618678092956543, |
|
"learning_rate": 4.741830066690428e-05, |
|
"loss": 0.1738, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.3682432432432434, |
|
"grad_norm": 0.4271227717399597, |
|
"learning_rate": 4.742863363277765e-05, |
|
"loss": 0.1859, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 2.3716216216216215, |
|
"grad_norm": 0.38577789068222046, |
|
"learning_rate": 4.743895186883009e-05, |
|
"loss": 0.1699, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 2.375, |
|
"grad_norm": 0.45399177074432373, |
|
"learning_rate": 4.7449255416997075e-05, |
|
"loss": 0.2042, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 2.3783783783783785, |
|
"grad_norm": 0.4084223806858063, |
|
"learning_rate": 4.7459544319035206e-05, |
|
"loss": 0.1626, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 2.3817567567567566, |
|
"grad_norm": 0.4444795846939087, |
|
"learning_rate": 4.746981861652332e-05, |
|
"loss": 0.1524, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.385135135135135, |
|
"grad_norm": 0.41611728072166443, |
|
"learning_rate": 4.74800783508634e-05, |
|
"loss": 0.1965, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 2.3885135135135136, |
|
"grad_norm": 0.4463081657886505, |
|
"learning_rate": 4.7490323563281665e-05, |
|
"loss": 0.1703, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 2.391891891891892, |
|
"grad_norm": 0.4274112582206726, |
|
"learning_rate": 4.750055429482949e-05, |
|
"loss": 0.1468, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 2.39527027027027, |
|
"grad_norm": 0.5068261623382568, |
|
"learning_rate": 4.751077058638445e-05, |
|
"loss": 0.1959, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 2.3986486486486487, |
|
"grad_norm": 0.46266356110572815, |
|
"learning_rate": 4.752097247865126e-05, |
|
"loss": 0.1876, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.402027027027027, |
|
"grad_norm": 0.4017108082771301, |
|
"learning_rate": 4.753116001216277e-05, |
|
"loss": 0.1567, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 2.4054054054054053, |
|
"grad_norm": 0.4414288103580475, |
|
"learning_rate": 4.7541333227280944e-05, |
|
"loss": 0.171, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 2.4087837837837838, |
|
"grad_norm": 0.47343650460243225, |
|
"learning_rate": 4.755149216419776e-05, |
|
"loss": 0.1973, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 2.4121621621621623, |
|
"grad_norm": 0.4446316361427307, |
|
"learning_rate": 4.756163686293624e-05, |
|
"loss": 0.1804, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 2.4155405405405403, |
|
"grad_norm": 0.4418063163757324, |
|
"learning_rate": 4.7571767363351344e-05, |
|
"loss": 0.1913, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.418918918918919, |
|
"grad_norm": 0.4041652977466583, |
|
"learning_rate": 4.758188370513093e-05, |
|
"loss": 0.1852, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 2.4222972972972974, |
|
"grad_norm": 0.41937291622161865, |
|
"learning_rate": 4.759198592779667e-05, |
|
"loss": 0.1925, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 2.4256756756756754, |
|
"grad_norm": 0.4121197462081909, |
|
"learning_rate": 4.760207407070501e-05, |
|
"loss": 0.1821, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 2.429054054054054, |
|
"grad_norm": 0.3996151387691498, |
|
"learning_rate": 4.761214817304805e-05, |
|
"loss": 0.1837, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 0.38412001729011536, |
|
"learning_rate": 4.762220827385448e-05, |
|
"loss": 0.1685, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.435810810810811, |
|
"grad_norm": 1.5490686893463135, |
|
"learning_rate": 4.763225441199049e-05, |
|
"loss": 0.432, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 2.439189189189189, |
|
"grad_norm": 0.4263227880001068, |
|
"learning_rate": 4.7642286626160654e-05, |
|
"loss": 0.1847, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 2.4425675675675675, |
|
"grad_norm": 0.3754761815071106, |
|
"learning_rate": 4.765230495490885e-05, |
|
"loss": 0.1786, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 2.445945945945946, |
|
"grad_norm": 0.43871861696243286, |
|
"learning_rate": 4.7662309436619115e-05, |
|
"loss": 0.189, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 2.4493243243243246, |
|
"grad_norm": 0.4133760333061218, |
|
"learning_rate": 4.7672300109516563e-05, |
|
"loss": 0.1794, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.4527027027027026, |
|
"grad_norm": 0.38480374217033386, |
|
"learning_rate": 4.768227701166823e-05, |
|
"loss": 0.1666, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 2.456081081081081, |
|
"grad_norm": 0.4138960540294647, |
|
"learning_rate": 4.7692240180983964e-05, |
|
"loss": 0.1695, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 2.4594594594594597, |
|
"grad_norm": 0.3594178557395935, |
|
"learning_rate": 4.770218965521729e-05, |
|
"loss": 0.1519, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 2.4628378378378377, |
|
"grad_norm": 0.4005281627178192, |
|
"learning_rate": 4.7712125471966245e-05, |
|
"loss": 0.1696, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 2.4662162162162162, |
|
"grad_norm": 0.3780256509780884, |
|
"learning_rate": 4.7722047668674267e-05, |
|
"loss": 0.174, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.4695945945945947, |
|
"grad_norm": 0.4074977934360504, |
|
"learning_rate": 4.7731956282631004e-05, |
|
"loss": 0.1696, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 2.472972972972973, |
|
"grad_norm": 0.4063095152378082, |
|
"learning_rate": 4.77418513509732e-05, |
|
"loss": 0.1731, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 2.4763513513513513, |
|
"grad_norm": 0.40110018849372864, |
|
"learning_rate": 4.775173291068547e-05, |
|
"loss": 0.1787, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 2.47972972972973, |
|
"grad_norm": 0.3782990574836731, |
|
"learning_rate": 4.776160099860117e-05, |
|
"loss": 0.1753, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 2.483108108108108, |
|
"grad_norm": 0.3815288543701172, |
|
"learning_rate": 4.777145565140325e-05, |
|
"loss": 0.1766, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.4864864864864864, |
|
"grad_norm": 0.3779781758785248, |
|
"learning_rate": 4.7781296905624986e-05, |
|
"loss": 0.1857, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 2.489864864864865, |
|
"grad_norm": 0.33802899718284607, |
|
"learning_rate": 4.779112479765086e-05, |
|
"loss": 0.1515, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 2.4932432432432434, |
|
"grad_norm": 0.4050140082836151, |
|
"learning_rate": 4.780093936371736e-05, |
|
"loss": 0.1908, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.4966216216216215, |
|
"grad_norm": 0.4146076738834381, |
|
"learning_rate": 4.781074063991376e-05, |
|
"loss": 0.1812, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.34979405999183655, |
|
"learning_rate": 4.782052866218294e-05, |
|
"loss": 0.1505, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.5033783783783785, |
|
"grad_norm": 0.3686615526676178, |
|
"learning_rate": 4.783030346632214e-05, |
|
"loss": 0.1656, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 2.506756756756757, |
|
"grad_norm": 0.38285037875175476, |
|
"learning_rate": 4.7840065087983786e-05, |
|
"loss": 0.181, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 2.510135135135135, |
|
"grad_norm": 0.39826491475105286, |
|
"learning_rate": 4.784981356267626e-05, |
|
"loss": 0.201, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 2.5135135135135136, |
|
"grad_norm": 0.40031111240386963, |
|
"learning_rate": 4.785954892576465e-05, |
|
"loss": 0.1676, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 2.516891891891892, |
|
"grad_norm": 0.412266343832016, |
|
"learning_rate": 4.7869271212471554e-05, |
|
"loss": 0.1807, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.52027027027027, |
|
"grad_norm": 0.38458314538002014, |
|
"learning_rate": 4.7878980457877814e-05, |
|
"loss": 0.1639, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 2.5236486486486487, |
|
"grad_norm": 0.45593783259391785, |
|
"learning_rate": 4.7888676696923315e-05, |
|
"loss": 0.1844, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 2.527027027027027, |
|
"grad_norm": 0.36613929271698, |
|
"learning_rate": 4.7898359964407695e-05, |
|
"loss": 0.1536, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 2.5304054054054053, |
|
"grad_norm": 0.4267789125442505, |
|
"learning_rate": 4.790803029499111e-05, |
|
"loss": 0.1706, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 2.5337837837837838, |
|
"grad_norm": 1.5388500690460205, |
|
"learning_rate": 4.7917687723195004e-05, |
|
"loss": 0.4571, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.5371621621621623, |
|
"grad_norm": 0.42774277925491333, |
|
"learning_rate": 4.792733228340281e-05, |
|
"loss": 0.1907, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 2.5405405405405403, |
|
"grad_norm": 0.38267603516578674, |
|
"learning_rate": 4.793696400986071e-05, |
|
"loss": 0.163, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 2.543918918918919, |
|
"grad_norm": 0.4370862543582916, |
|
"learning_rate": 4.7946582936678344e-05, |
|
"loss": 0.2019, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 2.5472972972972974, |
|
"grad_norm": 0.3963480293750763, |
|
"learning_rate": 4.795618909782957e-05, |
|
"loss": 0.158, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 2.5506756756756754, |
|
"grad_norm": 0.4094291031360626, |
|
"learning_rate": 4.796578252715314e-05, |
|
"loss": 0.1769, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.554054054054054, |
|
"grad_norm": 0.41850391030311584, |
|
"learning_rate": 4.797536325835345e-05, |
|
"loss": 0.17, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 2.5574324324324325, |
|
"grad_norm": 0.4879460036754608, |
|
"learning_rate": 4.7984931325001216e-05, |
|
"loss": 0.2163, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 2.560810810810811, |
|
"grad_norm": 0.4456654489040375, |
|
"learning_rate": 4.799448676053423e-05, |
|
"loss": 0.2086, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 2.564189189189189, |
|
"grad_norm": 0.41925883293151855, |
|
"learning_rate": 4.800402959825802e-05, |
|
"loss": 0.1884, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 2.5675675675675675, |
|
"grad_norm": 0.4378669857978821, |
|
"learning_rate": 4.801355987134653e-05, |
|
"loss": 0.2024, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.570945945945946, |
|
"grad_norm": 0.5047132968902588, |
|
"learning_rate": 4.802307761284289e-05, |
|
"loss": 0.1961, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 2.5743243243243246, |
|
"grad_norm": 0.41319629549980164, |
|
"learning_rate": 4.8032582855660014e-05, |
|
"loss": 0.1957, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 2.5777027027027026, |
|
"grad_norm": 0.46079912781715393, |
|
"learning_rate": 4.8042075632581346e-05, |
|
"loss": 0.1931, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 2.581081081081081, |
|
"grad_norm": 0.431325763463974, |
|
"learning_rate": 4.80515559762615e-05, |
|
"loss": 0.1788, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 2.5844594594594597, |
|
"grad_norm": 0.40309104323387146, |
|
"learning_rate": 4.8061023919226964e-05, |
|
"loss": 0.1817, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.5878378378378377, |
|
"grad_norm": 0.434621661901474, |
|
"learning_rate": 4.807047949387674e-05, |
|
"loss": 0.1831, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 2.5912162162162162, |
|
"grad_norm": 0.35631585121154785, |
|
"learning_rate": 4.807992273248302e-05, |
|
"loss": 0.1683, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 2.5945945945945947, |
|
"grad_norm": 0.41151365637779236, |
|
"learning_rate": 4.808935366719187e-05, |
|
"loss": 0.1904, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 2.597972972972973, |
|
"grad_norm": 0.4089500904083252, |
|
"learning_rate": 4.8098772330023855e-05, |
|
"loss": 0.1742, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 2.6013513513513513, |
|
"grad_norm": 0.38123077154159546, |
|
"learning_rate": 4.81081787528747e-05, |
|
"loss": 0.1727, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.60472972972973, |
|
"grad_norm": 0.37730783224105835, |
|
"learning_rate": 4.811757296751595e-05, |
|
"loss": 0.1904, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 2.608108108108108, |
|
"grad_norm": 0.38781440258026123, |
|
"learning_rate": 4.812695500559561e-05, |
|
"loss": 0.1931, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 2.6114864864864864, |
|
"grad_norm": 0.4114435315132141, |
|
"learning_rate": 4.8136324898638756e-05, |
|
"loss": 0.2105, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 2.614864864864865, |
|
"grad_norm": 0.39372387528419495, |
|
"learning_rate": 4.8145682678048214e-05, |
|
"loss": 0.1831, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 2.618243243243243, |
|
"grad_norm": 0.40174078941345215, |
|
"learning_rate": 4.815502837510518e-05, |
|
"loss": 0.1975, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.6216216216216215, |
|
"grad_norm": 0.4061259627342224, |
|
"learning_rate": 4.816436202096981e-05, |
|
"loss": 0.1744, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 2.625, |
|
"grad_norm": 0.3997017741203308, |
|
"learning_rate": 4.81736836466819e-05, |
|
"loss": 0.1782, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 2.6283783783783785, |
|
"grad_norm": 0.4183482229709625, |
|
"learning_rate": 4.8182993283161485e-05, |
|
"loss": 0.1941, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 2.631756756756757, |
|
"grad_norm": 0.3742780387401581, |
|
"learning_rate": 4.819229096120941e-05, |
|
"loss": 0.1681, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 2.635135135135135, |
|
"grad_norm": 0.3982739746570587, |
|
"learning_rate": 4.820157671150801e-05, |
|
"loss": 0.1941, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.6385135135135136, |
|
"grad_norm": 0.3757505714893341, |
|
"learning_rate": 4.821085056462168e-05, |
|
"loss": 0.1744, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 2.641891891891892, |
|
"grad_norm": 0.39277997612953186, |
|
"learning_rate": 4.822011255099747e-05, |
|
"loss": 0.1803, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 2.64527027027027, |
|
"grad_norm": 0.35470277070999146, |
|
"learning_rate": 4.8229362700965726e-05, |
|
"loss": 0.1651, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 2.6486486486486487, |
|
"grad_norm": 0.381610631942749, |
|
"learning_rate": 4.8238601044740645e-05, |
|
"loss": 0.183, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 2.652027027027027, |
|
"grad_norm": 0.37985488772392273, |
|
"learning_rate": 4.824782761242088e-05, |
|
"loss": 0.1637, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.6554054054054053, |
|
"grad_norm": 0.4003821909427643, |
|
"learning_rate": 4.8257042433990135e-05, |
|
"loss": 0.18, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.6587837837837838, |
|
"grad_norm": 0.37772974371910095, |
|
"learning_rate": 4.826624553931775e-05, |
|
"loss": 0.1886, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 2.6621621621621623, |
|
"grad_norm": 0.3801021873950958, |
|
"learning_rate": 4.827543695815926e-05, |
|
"loss": 0.1718, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 2.6655405405405403, |
|
"grad_norm": 0.3759152591228485, |
|
"learning_rate": 4.8284616720157006e-05, |
|
"loss": 0.1698, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 2.668918918918919, |
|
"grad_norm": 0.38654980063438416, |
|
"learning_rate": 4.82937848548407e-05, |
|
"loss": 0.1885, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.6722972972972974, |
|
"grad_norm": 0.4393473267555237, |
|
"learning_rate": 4.8302941391627947e-05, |
|
"loss": 0.1891, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 2.6756756756756754, |
|
"grad_norm": 0.3831874132156372, |
|
"learning_rate": 4.83120863598249e-05, |
|
"loss": 0.1888, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 2.679054054054054, |
|
"grad_norm": 0.38641834259033203, |
|
"learning_rate": 4.832121978862673e-05, |
|
"loss": 0.1842, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 2.6824324324324325, |
|
"grad_norm": 0.39984583854675293, |
|
"learning_rate": 4.8330341707118276e-05, |
|
"loss": 0.1681, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 2.685810810810811, |
|
"grad_norm": 0.40252962708473206, |
|
"learning_rate": 4.833945214427451e-05, |
|
"loss": 0.1756, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.689189189189189, |
|
"grad_norm": 0.40013495087623596, |
|
"learning_rate": 4.834855112896116e-05, |
|
"loss": 0.2146, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 2.6925675675675675, |
|
"grad_norm": 0.385640412569046, |
|
"learning_rate": 4.835763868993521e-05, |
|
"loss": 0.1851, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 2.695945945945946, |
|
"grad_norm": 0.3977515697479248, |
|
"learning_rate": 4.8366714855845496e-05, |
|
"loss": 0.1809, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 2.6993243243243246, |
|
"grad_norm": 0.40790390968322754, |
|
"learning_rate": 4.837577965523319e-05, |
|
"loss": 0.1887, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 0.3771408498287201, |
|
"learning_rate": 4.8384833116532396e-05, |
|
"loss": 0.1732, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.706081081081081, |
|
"grad_norm": 0.42990434169769287, |
|
"learning_rate": 4.8393875268070636e-05, |
|
"loss": 0.1952, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 2.7094594594594597, |
|
"grad_norm": 0.38725292682647705, |
|
"learning_rate": 4.84029061380694e-05, |
|
"loss": 0.1903, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 2.7128378378378377, |
|
"grad_norm": 0.4196310341358185, |
|
"learning_rate": 4.841192575464469e-05, |
|
"loss": 0.2035, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 2.7162162162162162, |
|
"grad_norm": 0.4179129898548126, |
|
"learning_rate": 4.842093414580753e-05, |
|
"loss": 0.2002, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 2.7195945945945947, |
|
"grad_norm": 0.44000861048698425, |
|
"learning_rate": 4.842993133946448e-05, |
|
"loss": 0.2299, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.722972972972973, |
|
"grad_norm": 0.4462367296218872, |
|
"learning_rate": 4.843891736341818e-05, |
|
"loss": 0.1931, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 2.7263513513513513, |
|
"grad_norm": 0.42056065797805786, |
|
"learning_rate": 4.8447892245367846e-05, |
|
"loss": 0.2012, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 2.72972972972973, |
|
"grad_norm": 0.48042038083076477, |
|
"learning_rate": 4.845685601290977e-05, |
|
"loss": 0.1988, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 2.733108108108108, |
|
"grad_norm": 0.39594507217407227, |
|
"learning_rate": 4.846580869353787e-05, |
|
"loss": 0.1966, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 2.7364864864864864, |
|
"grad_norm": 0.4334581792354584, |
|
"learning_rate": 4.847475031464416e-05, |
|
"loss": 0.1861, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.739864864864865, |
|
"grad_norm": 0.40071550011634827, |
|
"learning_rate": 4.8483680903519274e-05, |
|
"loss": 0.2009, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 2.743243243243243, |
|
"grad_norm": 0.40345826745033264, |
|
"learning_rate": 4.8492600487352926e-05, |
|
"loss": 0.1692, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 2.7466216216216215, |
|
"grad_norm": 0.41826799511909485, |
|
"learning_rate": 4.850150909323447e-05, |
|
"loss": 0.1904, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.3715426027774811, |
|
"learning_rate": 4.8510406748153355e-05, |
|
"loss": 0.1782, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 2.7533783783783785, |
|
"grad_norm": 0.39606815576553345, |
|
"learning_rate": 4.8519293478999614e-05, |
|
"loss": 0.1824, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.756756756756757, |
|
"grad_norm": 0.3894452452659607, |
|
"learning_rate": 4.8528169312564355e-05, |
|
"loss": 0.1799, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 2.760135135135135, |
|
"grad_norm": 0.38379618525505066, |
|
"learning_rate": 4.8537034275540264e-05, |
|
"loss": 0.1792, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 2.7635135135135136, |
|
"grad_norm": 0.4054020643234253, |
|
"learning_rate": 4.854588839452205e-05, |
|
"loss": 0.1842, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 2.766891891891892, |
|
"grad_norm": 0.424482524394989, |
|
"learning_rate": 4.855473169600698e-05, |
|
"loss": 0.1781, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 2.77027027027027, |
|
"grad_norm": 0.40162649750709534, |
|
"learning_rate": 4.856356420639528e-05, |
|
"loss": 0.2129, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.7736486486486487, |
|
"grad_norm": 0.4196532666683197, |
|
"learning_rate": 4.857238595199068e-05, |
|
"loss": 0.1844, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 2.777027027027027, |
|
"grad_norm": 0.39450812339782715, |
|
"learning_rate": 4.858119695900084e-05, |
|
"loss": 0.1917, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 2.7804054054054053, |
|
"grad_norm": 0.37791892886161804, |
|
"learning_rate": 4.858999725353783e-05, |
|
"loss": 0.1562, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 2.7837837837837838, |
|
"grad_norm": 0.4025024473667145, |
|
"learning_rate": 4.8598786861618605e-05, |
|
"loss": 0.1963, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 2.7871621621621623, |
|
"grad_norm": 0.382782518863678, |
|
"learning_rate": 4.860756580916542e-05, |
|
"loss": 0.1811, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.7905405405405403, |
|
"grad_norm": 0.40827038884162903, |
|
"learning_rate": 4.861633412200637e-05, |
|
"loss": 0.1914, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 2.793918918918919, |
|
"grad_norm": 0.35224664211273193, |
|
"learning_rate": 4.862509182587578e-05, |
|
"loss": 0.1587, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 2.7972972972972974, |
|
"grad_norm": 0.40827634930610657, |
|
"learning_rate": 4.863383894641467e-05, |
|
"loss": 0.1648, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 2.8006756756756754, |
|
"grad_norm": 0.3934107720851898, |
|
"learning_rate": 4.864257550917123e-05, |
|
"loss": 0.1867, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 2.804054054054054, |
|
"grad_norm": 0.4319005012512207, |
|
"learning_rate": 4.865130153960124e-05, |
|
"loss": 0.1789, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.8074324324324325, |
|
"grad_norm": 0.4071970582008362, |
|
"learning_rate": 4.8660017063068526e-05, |
|
"loss": 0.1718, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 2.810810810810811, |
|
"grad_norm": 0.4246816635131836, |
|
"learning_rate": 4.8668722104845403e-05, |
|
"loss": 0.1911, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 2.814189189189189, |
|
"grad_norm": 0.41370970010757446, |
|
"learning_rate": 4.8677416690113134e-05, |
|
"loss": 0.1968, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 2.8175675675675675, |
|
"grad_norm": 0.3547184467315674, |
|
"learning_rate": 4.868610084396232e-05, |
|
"loss": 0.1612, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 2.820945945945946, |
|
"grad_norm": 0.4511527121067047, |
|
"learning_rate": 4.869477459139337e-05, |
|
"loss": 0.1608, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.8243243243243246, |
|
"grad_norm": 0.394466757774353, |
|
"learning_rate": 4.870343795731694e-05, |
|
"loss": 0.1846, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 2.8277027027027026, |
|
"grad_norm": 0.3896351158618927, |
|
"learning_rate": 4.8712090966554334e-05, |
|
"loss": 0.1954, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 2.831081081081081, |
|
"grad_norm": 0.4012662172317505, |
|
"learning_rate": 4.872073364383795e-05, |
|
"loss": 0.1875, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 2.8344594594594597, |
|
"grad_norm": 0.39143961668014526, |
|
"learning_rate": 4.8729366013811674e-05, |
|
"loss": 0.2066, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 2.8378378378378377, |
|
"grad_norm": 0.4024941325187683, |
|
"learning_rate": 4.8737988101031366e-05, |
|
"loss": 0.1836, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.8412162162162162, |
|
"grad_norm": 0.3429911732673645, |
|
"learning_rate": 4.874659992996521e-05, |
|
"loss": 0.1687, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 2.8445945945945947, |
|
"grad_norm": 0.3772415816783905, |
|
"learning_rate": 4.875520152499416e-05, |
|
"loss": 0.2006, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 2.847972972972973, |
|
"grad_norm": 0.41397014260292053, |
|
"learning_rate": 4.876379291041238e-05, |
|
"loss": 0.2175, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 2.8513513513513513, |
|
"grad_norm": 0.41761255264282227, |
|
"learning_rate": 4.8772374110427594e-05, |
|
"loss": 0.1779, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 2.85472972972973, |
|
"grad_norm": 0.4116886854171753, |
|
"learning_rate": 4.878094514916154e-05, |
|
"loss": 0.2, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.858108108108108, |
|
"grad_norm": 0.3655768930912018, |
|
"learning_rate": 4.8789506050650396e-05, |
|
"loss": 0.1818, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 2.8614864864864864, |
|
"grad_norm": 0.43548357486724854, |
|
"learning_rate": 4.879805683884512e-05, |
|
"loss": 0.2029, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 2.864864864864865, |
|
"grad_norm": 0.3908335566520691, |
|
"learning_rate": 4.8806597537611906e-05, |
|
"loss": 0.1936, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 2.868243243243243, |
|
"grad_norm": 0.3648586869239807, |
|
"learning_rate": 4.881512817073255e-05, |
|
"loss": 0.1765, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 2.8716216216216215, |
|
"grad_norm": 0.39892107248306274, |
|
"learning_rate": 4.882364876190489e-05, |
|
"loss": 0.1594, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.875, |
|
"grad_norm": 0.3832027316093445, |
|
"learning_rate": 4.8832159334743136e-05, |
|
"loss": 0.1773, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 2.8783783783783785, |
|
"grad_norm": 0.4372403621673584, |
|
"learning_rate": 4.884065991277833e-05, |
|
"loss": 0.2057, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 2.881756756756757, |
|
"grad_norm": 0.36279168725013733, |
|
"learning_rate": 4.8849150519458726e-05, |
|
"loss": 0.1726, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 2.885135135135135, |
|
"grad_norm": 0.37340691685676575, |
|
"learning_rate": 4.885763117815009e-05, |
|
"loss": 0.178, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 2.8885135135135136, |
|
"grad_norm": 0.3998451828956604, |
|
"learning_rate": 4.886610191213622e-05, |
|
"loss": 0.1816, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.891891891891892, |
|
"grad_norm": 0.3779732584953308, |
|
"learning_rate": 4.887456274461922e-05, |
|
"loss": 0.2068, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 2.89527027027027, |
|
"grad_norm": 0.4435446262359619, |
|
"learning_rate": 4.8883013698719973e-05, |
|
"loss": 0.215, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 2.8986486486486487, |
|
"grad_norm": 0.37998583912849426, |
|
"learning_rate": 4.889145479747843e-05, |
|
"loss": 0.181, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 2.902027027027027, |
|
"grad_norm": 0.4032045900821686, |
|
"learning_rate": 4.889988606385404e-05, |
|
"loss": 0.1878, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 2.9054054054054053, |
|
"grad_norm": 0.4383464753627777, |
|
"learning_rate": 4.8908307520726135e-05, |
|
"loss": 0.1948, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.9087837837837838, |
|
"grad_norm": 0.38737547397613525, |
|
"learning_rate": 4.891671919089425e-05, |
|
"loss": 0.2007, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 2.9121621621621623, |
|
"grad_norm": 0.39179664850234985, |
|
"learning_rate": 4.892512109707855e-05, |
|
"loss": 0.1865, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 2.9155405405405403, |
|
"grad_norm": 0.3944191038608551, |
|
"learning_rate": 4.893351326192016e-05, |
|
"loss": 0.1831, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 2.918918918918919, |
|
"grad_norm": 0.3212383985519409, |
|
"learning_rate": 4.894189570798156e-05, |
|
"loss": 0.1494, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 2.9222972972972974, |
|
"grad_norm": 0.3861388564109802, |
|
"learning_rate": 4.895026845774691e-05, |
|
"loss": 0.2045, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.9256756756756754, |
|
"grad_norm": 0.37900376319885254, |
|
"learning_rate": 4.895863153362244e-05, |
|
"loss": 0.1818, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 2.929054054054054, |
|
"grad_norm": 0.4495367109775543, |
|
"learning_rate": 4.896698495793684e-05, |
|
"loss": 0.1936, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 2.9324324324324325, |
|
"grad_norm": 0.40020546317100525, |
|
"learning_rate": 4.897532875294154e-05, |
|
"loss": 0.1964, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 2.935810810810811, |
|
"grad_norm": 0.3625187277793884, |
|
"learning_rate": 4.8983662940811115e-05, |
|
"loss": 0.1737, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 2.939189189189189, |
|
"grad_norm": 0.3938811719417572, |
|
"learning_rate": 4.899198754364365e-05, |
|
"loss": 0.1488, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.9425675675675675, |
|
"grad_norm": 0.36418697237968445, |
|
"learning_rate": 4.900030258346106e-05, |
|
"loss": 0.1822, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 2.945945945945946, |
|
"grad_norm": 0.3703506588935852, |
|
"learning_rate": 4.900860808220946e-05, |
|
"loss": 0.1846, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 2.9493243243243246, |
|
"grad_norm": 0.4332616925239563, |
|
"learning_rate": 4.90169040617595e-05, |
|
"loss": 0.2105, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 2.9527027027027026, |
|
"grad_norm": 0.3770284354686737, |
|
"learning_rate": 4.9025190543906715e-05, |
|
"loss": 0.1815, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 2.956081081081081, |
|
"grad_norm": 0.3703818619251251, |
|
"learning_rate": 4.903346755037189e-05, |
|
"loss": 0.1713, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.9594594594594597, |
|
"grad_norm": 0.37928834557533264, |
|
"learning_rate": 4.904173510280135e-05, |
|
"loss": 0.1915, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 2.9628378378378377, |
|
"grad_norm": 0.4267534911632538, |
|
"learning_rate": 4.904999322276735e-05, |
|
"loss": 0.2157, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 2.9662162162162162, |
|
"grad_norm": 0.3936428725719452, |
|
"learning_rate": 4.9058241931768385e-05, |
|
"loss": 0.1947, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 2.9695945945945947, |
|
"grad_norm": 0.3503859043121338, |
|
"learning_rate": 4.9066481251229535e-05, |
|
"loss": 0.1836, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 1.8326294422149658, |
|
"learning_rate": 4.907471120250281e-05, |
|
"loss": 0.4757, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.9763513513513513, |
|
"grad_norm": 0.4133761525154114, |
|
"learning_rate": 4.9082931806867474e-05, |
|
"loss": 0.2003, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 2.97972972972973, |
|
"grad_norm": 0.3848228454589844, |
|
"learning_rate": 4.909114308553033e-05, |
|
"loss": 0.19, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 2.983108108108108, |
|
"grad_norm": 0.38853901624679565, |
|
"learning_rate": 4.909934505962615e-05, |
|
"loss": 0.2071, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 2.9864864864864864, |
|
"grad_norm": 0.35326841473579407, |
|
"learning_rate": 4.9107537750217886e-05, |
|
"loss": 0.1742, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 2.989864864864865, |
|
"grad_norm": 0.33690720796585083, |
|
"learning_rate": 4.9115721178297093e-05, |
|
"loss": 0.1734, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.993243243243243, |
|
"grad_norm": 0.35074329376220703, |
|
"learning_rate": 4.9123895364784184e-05, |
|
"loss": 0.1623, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 2.9966216216216215, |
|
"grad_norm": 0.35647299885749817, |
|
"learning_rate": 4.913206033052877e-05, |
|
"loss": 0.1647, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.4068041145801544, |
|
"learning_rate": 4.914021609631002e-05, |
|
"loss": 0.1831, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 3.0033783783783785, |
|
"grad_norm": 0.4528217017650604, |
|
"learning_rate": 4.91483626828369e-05, |
|
"loss": 0.128, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 3.0067567567567566, |
|
"grad_norm": 0.3695124089717865, |
|
"learning_rate": 4.915650011074855e-05, |
|
"loss": 0.1077, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.010135135135135, |
|
"grad_norm": 0.570489227771759, |
|
"learning_rate": 4.916462840061458e-05, |
|
"loss": 0.11, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 3.0135135135135136, |
|
"grad_norm": 0.43204137682914734, |
|
"learning_rate": 4.917274757293539e-05, |
|
"loss": 0.1032, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 3.016891891891892, |
|
"grad_norm": 0.5314778089523315, |
|
"learning_rate": 4.918085764814244e-05, |
|
"loss": 0.1304, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 3.02027027027027, |
|
"grad_norm": 0.3921876847743988, |
|
"learning_rate": 4.9188958646598624e-05, |
|
"loss": 0.0949, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 3.0236486486486487, |
|
"grad_norm": 0.35549548268318176, |
|
"learning_rate": 4.919705058859854e-05, |
|
"loss": 0.0989, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.027027027027027, |
|
"grad_norm": 0.4394858181476593, |
|
"learning_rate": 4.920513349436875e-05, |
|
"loss": 0.1152, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 3.0304054054054053, |
|
"grad_norm": 0.4386696219444275, |
|
"learning_rate": 4.92132073840682e-05, |
|
"loss": 0.1163, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 3.0337837837837838, |
|
"grad_norm": 0.38169318437576294, |
|
"learning_rate": 4.922127227778841e-05, |
|
"loss": 0.1078, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 3.0371621621621623, |
|
"grad_norm": 0.410858690738678, |
|
"learning_rate": 4.9229328195553815e-05, |
|
"loss": 0.1165, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 3.0405405405405403, |
|
"grad_norm": 0.4674147069454193, |
|
"learning_rate": 4.923737515732209e-05, |
|
"loss": 0.12, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.043918918918919, |
|
"grad_norm": 0.3922157883644104, |
|
"learning_rate": 4.924541318298438e-05, |
|
"loss": 0.1131, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 3.0472972972972974, |
|
"grad_norm": 0.3732883632183075, |
|
"learning_rate": 4.92534422923657e-05, |
|
"loss": 0.1025, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 3.050675675675676, |
|
"grad_norm": 0.4513222277164459, |
|
"learning_rate": 4.9261462505225106e-05, |
|
"loss": 0.1207, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 3.054054054054054, |
|
"grad_norm": 0.39665260910987854, |
|
"learning_rate": 4.926947384125606e-05, |
|
"loss": 0.1115, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 3.0574324324324325, |
|
"grad_norm": 0.3784550130367279, |
|
"learning_rate": 4.927747632008672e-05, |
|
"loss": 0.1151, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.060810810810811, |
|
"grad_norm": 0.44802817702293396, |
|
"learning_rate": 4.9285469961280226e-05, |
|
"loss": 0.113, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 3.064189189189189, |
|
"grad_norm": 0.37687918543815613, |
|
"learning_rate": 4.9293454784334924e-05, |
|
"loss": 0.1157, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 3.0675675675675675, |
|
"grad_norm": 0.3582609295845032, |
|
"learning_rate": 4.9301430808684754e-05, |
|
"loss": 0.1031, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 3.070945945945946, |
|
"grad_norm": 0.3885122239589691, |
|
"learning_rate": 4.930939805369946e-05, |
|
"loss": 0.1043, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 3.074324324324324, |
|
"grad_norm": 0.5625414252281189, |
|
"learning_rate": 4.93173565386849e-05, |
|
"loss": 0.1109, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.0777027027027026, |
|
"grad_norm": 0.3783611059188843, |
|
"learning_rate": 4.932530628288331e-05, |
|
"loss": 0.1047, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 3.081081081081081, |
|
"grad_norm": 0.48341429233551025, |
|
"learning_rate": 4.933324730547361e-05, |
|
"loss": 0.0961, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 3.0844594594594597, |
|
"grad_norm": 0.399813711643219, |
|
"learning_rate": 4.934117962557165e-05, |
|
"loss": 0.1178, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 3.0878378378378377, |
|
"grad_norm": 0.40703117847442627, |
|
"learning_rate": 4.9349103262230524e-05, |
|
"loss": 0.1179, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 3.0912162162162162, |
|
"grad_norm": 0.39595040678977966, |
|
"learning_rate": 4.935701823444081e-05, |
|
"loss": 0.1161, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 3.0945945945945947, |
|
"grad_norm": 0.3974362313747406, |
|
"learning_rate": 4.9364924561130845e-05, |
|
"loss": 0.1167, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 3.097972972972973, |
|
"grad_norm": 0.426070898771286, |
|
"learning_rate": 4.937282226116702e-05, |
|
"loss": 0.1305, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 3.1013513513513513, |
|
"grad_norm": 0.38482606410980225, |
|
"learning_rate": 4.938071135335405e-05, |
|
"loss": 0.1018, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 3.10472972972973, |
|
"grad_norm": 0.4263748228549957, |
|
"learning_rate": 4.938859185643519e-05, |
|
"loss": 0.1156, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 3.108108108108108, |
|
"grad_norm": 0.39638155698776245, |
|
"learning_rate": 4.939646378909259e-05, |
|
"loss": 0.1157, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.1114864864864864, |
|
"grad_norm": 0.40607044100761414, |
|
"learning_rate": 4.940432716994748e-05, |
|
"loss": 0.1197, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 3.114864864864865, |
|
"grad_norm": 0.4135359823703766, |
|
"learning_rate": 4.9412182017560496e-05, |
|
"loss": 0.1202, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 3.1182432432432434, |
|
"grad_norm": 0.38252270221710205, |
|
"learning_rate": 4.942002835043187e-05, |
|
"loss": 0.1126, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 3.1216216216216215, |
|
"grad_norm": 0.3727289140224457, |
|
"learning_rate": 4.942786618700178e-05, |
|
"loss": 0.1111, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 3.125, |
|
"grad_norm": 0.380953848361969, |
|
"learning_rate": 4.9435695545650545e-05, |
|
"loss": 0.1151, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.1283783783783785, |
|
"grad_norm": 0.37086960673332214, |
|
"learning_rate": 4.944351644469891e-05, |
|
"loss": 0.1106, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 3.1317567567567566, |
|
"grad_norm": 0.4138876497745514, |
|
"learning_rate": 4.945132890240829e-05, |
|
"loss": 0.1239, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 3.135135135135135, |
|
"grad_norm": 0.4137141704559326, |
|
"learning_rate": 4.945913293698104e-05, |
|
"loss": 0.1252, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 3.1385135135135136, |
|
"grad_norm": 0.3904976546764374, |
|
"learning_rate": 4.9466928566560696e-05, |
|
"loss": 0.107, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 3.141891891891892, |
|
"grad_norm": 0.3888641595840454, |
|
"learning_rate": 4.9474715809232256e-05, |
|
"loss": 0.113, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.14527027027027, |
|
"grad_norm": 0.39176711440086365, |
|
"learning_rate": 4.948249468302239e-05, |
|
"loss": 0.1135, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 3.1486486486486487, |
|
"grad_norm": 0.3809608817100525, |
|
"learning_rate": 4.9490265205899697e-05, |
|
"loss": 0.1051, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 3.152027027027027, |
|
"grad_norm": 0.4019430875778198, |
|
"learning_rate": 4.9498027395775006e-05, |
|
"loss": 0.121, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 3.1554054054054053, |
|
"grad_norm": 0.3715710937976837, |
|
"learning_rate": 4.950578127050156e-05, |
|
"loss": 0.1225, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 3.1587837837837838, |
|
"grad_norm": 0.4521862864494324, |
|
"learning_rate": 4.95135268478753e-05, |
|
"loss": 0.1294, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 3.1621621621621623, |
|
"grad_norm": 0.4279133379459381, |
|
"learning_rate": 4.952126414563509e-05, |
|
"loss": 0.1229, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 3.1655405405405403, |
|
"grad_norm": 0.4043607711791992, |
|
"learning_rate": 4.952899318146297e-05, |
|
"loss": 0.1117, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 3.168918918918919, |
|
"grad_norm": 0.34410539269447327, |
|
"learning_rate": 4.9536713972984414e-05, |
|
"loss": 0.0972, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 3.1722972972972974, |
|
"grad_norm": 0.43061432242393494, |
|
"learning_rate": 4.954442653776852e-05, |
|
"loss": 0.1333, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 3.175675675675676, |
|
"grad_norm": 0.36851924657821655, |
|
"learning_rate": 4.955213089332832e-05, |
|
"loss": 0.1114, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.179054054054054, |
|
"grad_norm": 0.37471747398376465, |
|
"learning_rate": 4.955982705712095e-05, |
|
"loss": 0.1169, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 3.1824324324324325, |
|
"grad_norm": 0.35380908846855164, |
|
"learning_rate": 4.956751504654796e-05, |
|
"loss": 0.1005, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 3.185810810810811, |
|
"grad_norm": 0.38561439514160156, |
|
"learning_rate": 4.957519487895548e-05, |
|
"loss": 0.1187, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 3.189189189189189, |
|
"grad_norm": 0.4120042324066162, |
|
"learning_rate": 4.9582866571634485e-05, |
|
"loss": 0.1273, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 3.1925675675675675, |
|
"grad_norm": 0.39863529801368713, |
|
"learning_rate": 4.959053014182106e-05, |
|
"loss": 0.1125, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 3.195945945945946, |
|
"grad_norm": 0.38955143094062805, |
|
"learning_rate": 4.959818560669655e-05, |
|
"loss": 0.1175, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 3.199324324324324, |
|
"grad_norm": 0.3573088049888611, |
|
"learning_rate": 4.96058329833879e-05, |
|
"loss": 0.1131, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 3.2027027027027026, |
|
"grad_norm": 0.4304436147212982, |
|
"learning_rate": 4.961347228896777e-05, |
|
"loss": 0.1262, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 3.206081081081081, |
|
"grad_norm": 0.39087799191474915, |
|
"learning_rate": 4.962110354045488e-05, |
|
"loss": 0.1191, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 3.2094594594594597, |
|
"grad_norm": 0.3944397568702698, |
|
"learning_rate": 4.962872675481414e-05, |
|
"loss": 0.1182, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.2128378378378377, |
|
"grad_norm": 0.40278294682502747, |
|
"learning_rate": 4.9636341948956906e-05, |
|
"loss": 0.1211, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 3.2162162162162162, |
|
"grad_norm": 0.41054508090019226, |
|
"learning_rate": 4.964394913974124e-05, |
|
"loss": 0.1195, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 3.2195945945945947, |
|
"grad_norm": 0.4175059199333191, |
|
"learning_rate": 4.965154834397211e-05, |
|
"loss": 0.1353, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 3.222972972972973, |
|
"grad_norm": 0.3865712583065033, |
|
"learning_rate": 4.965913957840159e-05, |
|
"loss": 0.1111, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 3.2263513513513513, |
|
"grad_norm": 0.39877578616142273, |
|
"learning_rate": 4.966672285972911e-05, |
|
"loss": 0.1256, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 3.22972972972973, |
|
"grad_norm": 0.34512677788734436, |
|
"learning_rate": 4.967429820460167e-05, |
|
"loss": 0.1078, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 3.233108108108108, |
|
"grad_norm": 0.4256219267845154, |
|
"learning_rate": 4.9681865629614064e-05, |
|
"loss": 0.1313, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 3.2364864864864864, |
|
"grad_norm": 0.46931833028793335, |
|
"learning_rate": 4.9689425151309074e-05, |
|
"loss": 0.1396, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 3.239864864864865, |
|
"grad_norm": 0.40905606746673584, |
|
"learning_rate": 4.969697678617773e-05, |
|
"loss": 0.1242, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 3.2432432432432434, |
|
"grad_norm": 0.36054709553718567, |
|
"learning_rate": 4.970452055065948e-05, |
|
"loss": 0.1149, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.2466216216216215, |
|
"grad_norm": 0.3850068747997284, |
|
"learning_rate": 4.9712056461142423e-05, |
|
"loss": 0.1074, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.42875200510025024, |
|
"learning_rate": 4.971958453396355e-05, |
|
"loss": 0.1152, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 3.2533783783783785, |
|
"grad_norm": 0.413173109292984, |
|
"learning_rate": 4.972710478540891e-05, |
|
"loss": 0.1273, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 3.2567567567567566, |
|
"grad_norm": 0.3743407428264618, |
|
"learning_rate": 4.973461723171385e-05, |
|
"loss": 0.1149, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 3.260135135135135, |
|
"grad_norm": 0.44052329659461975, |
|
"learning_rate": 4.9742121889063213e-05, |
|
"loss": 0.1301, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 3.2635135135135136, |
|
"grad_norm": 0.38358744978904724, |
|
"learning_rate": 4.974961877359156e-05, |
|
"loss": 0.1133, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 3.266891891891892, |
|
"grad_norm": 0.423977792263031, |
|
"learning_rate": 4.975710790138336e-05, |
|
"loss": 0.1346, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 3.27027027027027, |
|
"grad_norm": 1.5824426412582397, |
|
"learning_rate": 4.976458928847323e-05, |
|
"loss": 0.3884, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 3.2736486486486487, |
|
"grad_norm": 0.41981515288352966, |
|
"learning_rate": 4.977206295084609e-05, |
|
"loss": 0.1164, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 3.277027027027027, |
|
"grad_norm": 0.42901647090911865, |
|
"learning_rate": 4.9779528904437424e-05, |
|
"loss": 0.1269, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.2804054054054053, |
|
"grad_norm": 0.3595876693725586, |
|
"learning_rate": 4.978698716513342e-05, |
|
"loss": 0.1121, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 3.2837837837837838, |
|
"grad_norm": 0.39047256112098694, |
|
"learning_rate": 4.9794437748771244e-05, |
|
"loss": 0.1219, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 3.2871621621621623, |
|
"grad_norm": 0.41640281677246094, |
|
"learning_rate": 4.9801880671139204e-05, |
|
"loss": 0.1303, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 3.2905405405405403, |
|
"grad_norm": 0.371383398771286, |
|
"learning_rate": 4.980931594797693e-05, |
|
"loss": 0.112, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 3.293918918918919, |
|
"grad_norm": 0.4297455847263336, |
|
"learning_rate": 4.981674359497562e-05, |
|
"loss": 0.1326, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 3.2972972972972974, |
|
"grad_norm": 0.39308154582977295, |
|
"learning_rate": 4.98241636277782e-05, |
|
"loss": 0.1244, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 3.3006756756756754, |
|
"grad_norm": 0.3956491947174072, |
|
"learning_rate": 4.983157606197955e-05, |
|
"loss": 0.1203, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 3.304054054054054, |
|
"grad_norm": 0.41044941544532776, |
|
"learning_rate": 4.98389809131267e-05, |
|
"loss": 0.1247, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 3.3074324324324325, |
|
"grad_norm": 0.4159061312675476, |
|
"learning_rate": 4.984637819671897e-05, |
|
"loss": 0.1321, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 3.310810810810811, |
|
"grad_norm": 0.3757896423339844, |
|
"learning_rate": 4.985376792820825e-05, |
|
"loss": 0.1157, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.314189189189189, |
|
"grad_norm": 0.3729749321937561, |
|
"learning_rate": 4.986115012299915e-05, |
|
"loss": 0.1103, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 3.3175675675675675, |
|
"grad_norm": 0.3994838297367096, |
|
"learning_rate": 4.986852479644916e-05, |
|
"loss": 0.1284, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 3.320945945945946, |
|
"grad_norm": 0.3987760841846466, |
|
"learning_rate": 4.987589196386893e-05, |
|
"loss": 0.1126, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 3.3243243243243246, |
|
"grad_norm": 0.3847789168357849, |
|
"learning_rate": 4.988325164052236e-05, |
|
"loss": 0.1199, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 3.3277027027027026, |
|
"grad_norm": 0.41571831703186035, |
|
"learning_rate": 4.9890603841626866e-05, |
|
"loss": 0.1295, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 3.331081081081081, |
|
"grad_norm": 0.3801005184650421, |
|
"learning_rate": 4.989794858235352e-05, |
|
"loss": 0.1276, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 3.3344594594594597, |
|
"grad_norm": 0.4302000403404236, |
|
"learning_rate": 4.990528587782729e-05, |
|
"loss": 0.1355, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 3.3378378378378377, |
|
"grad_norm": 0.3750097155570984, |
|
"learning_rate": 4.9912615743127146e-05, |
|
"loss": 0.1159, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 3.3412162162162162, |
|
"grad_norm": 0.43669402599334717, |
|
"learning_rate": 4.991993819328633e-05, |
|
"loss": 0.1349, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 3.3445945945945947, |
|
"grad_norm": 0.40606677532196045, |
|
"learning_rate": 4.9927253243292505e-05, |
|
"loss": 0.123, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.347972972972973, |
|
"grad_norm": 0.40521538257598877, |
|
"learning_rate": 4.993456090808793e-05, |
|
"loss": 0.1212, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 3.3513513513513513, |
|
"grad_norm": 0.38929492235183716, |
|
"learning_rate": 4.994186120256965e-05, |
|
"loss": 0.1204, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 3.35472972972973, |
|
"grad_norm": 0.3639843761920929, |
|
"learning_rate": 4.9949154141589696e-05, |
|
"loss": 0.1175, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 3.358108108108108, |
|
"grad_norm": 0.3805122971534729, |
|
"learning_rate": 4.995643973995523e-05, |
|
"loss": 0.1198, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 3.3614864864864864, |
|
"grad_norm": 0.395796000957489, |
|
"learning_rate": 4.9963718012428765e-05, |
|
"loss": 0.1348, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 3.364864864864865, |
|
"grad_norm": 0.3744940459728241, |
|
"learning_rate": 4.9970988973728314e-05, |
|
"loss": 0.1104, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 3.3682432432432434, |
|
"grad_norm": 1.3542141914367676, |
|
"learning_rate": 4.99782526385276e-05, |
|
"loss": 0.3864, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 3.3716216216216215, |
|
"grad_norm": 0.42771685123443604, |
|
"learning_rate": 4.998550902145619e-05, |
|
"loss": 0.1383, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 3.375, |
|
"grad_norm": 0.3889855444431305, |
|
"learning_rate": 4.999275813709971e-05, |
|
"loss": 0.1318, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 3.3783783783783785, |
|
"grad_norm": 0.37391164898872375, |
|
"learning_rate": 5e-05, |
|
"loss": 0.1285, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.3817567567567566, |
|
"grad_norm": 0.3580280542373657, |
|
"learning_rate": 4.9996356488619556e-05, |
|
"loss": 0.1125, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 3.385135135135135, |
|
"grad_norm": 0.3707447052001953, |
|
"learning_rate": 4.9985427016598435e-05, |
|
"loss": 0.1221, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 3.3885135135135136, |
|
"grad_norm": 0.3772697150707245, |
|
"learning_rate": 4.996721476998771e-05, |
|
"loss": 0.1215, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 3.391891891891892, |
|
"grad_norm": 0.40340831875801086, |
|
"learning_rate": 4.9941725057840504e-05, |
|
"loss": 0.1233, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 3.39527027027027, |
|
"grad_norm": 0.3812180161476135, |
|
"learning_rate": 4.9908965310664374e-05, |
|
"loss": 0.1149, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 3.3986486486486487, |
|
"grad_norm": 0.4367975890636444, |
|
"learning_rate": 4.986894507825522e-05, |
|
"loss": 0.1358, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 3.402027027027027, |
|
"grad_norm": 0.4075244963169098, |
|
"learning_rate": 4.9821676026913475e-05, |
|
"loss": 0.132, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 3.4054054054054053, |
|
"grad_norm": 0.392610639333725, |
|
"learning_rate": 4.9767171936043175e-05, |
|
"loss": 0.1169, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 3.4087837837837838, |
|
"grad_norm": 0.3933659791946411, |
|
"learning_rate": 4.970544869413522e-05, |
|
"loss": 0.1197, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 3.4121621621621623, |
|
"grad_norm": 0.3939339816570282, |
|
"learning_rate": 4.963652429413563e-05, |
|
"loss": 0.1315, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.4155405405405403, |
|
"grad_norm": 0.37384697794914246, |
|
"learning_rate": 4.9560418828200494e-05, |
|
"loss": 0.1187, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 3.418918918918919, |
|
"grad_norm": 0.3945925831794739, |
|
"learning_rate": 4.9477154481838875e-05, |
|
"loss": 0.1353, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 3.4222972972972974, |
|
"grad_norm": 0.42826682329177856, |
|
"learning_rate": 4.9386755527445475e-05, |
|
"loss": 0.1459, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 3.4256756756756754, |
|
"grad_norm": 0.4206187427043915, |
|
"learning_rate": 4.928924831722504e-05, |
|
"loss": 0.1292, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 3.429054054054054, |
|
"grad_norm": 0.38580143451690674, |
|
"learning_rate": 4.9184661275510446e-05, |
|
"loss": 0.1153, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 3.4324324324324325, |
|
"grad_norm": 0.393794983625412, |
|
"learning_rate": 4.907302489047662e-05, |
|
"loss": 0.1179, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 3.435810810810811, |
|
"grad_norm": 0.44305288791656494, |
|
"learning_rate": 4.895437170525303e-05, |
|
"loss": 0.1302, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 3.439189189189189, |
|
"grad_norm": 0.4056559205055237, |
|
"learning_rate": 4.882873630843699e-05, |
|
"loss": 0.1302, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 3.4425675675675675, |
|
"grad_norm": 0.40912213921546936, |
|
"learning_rate": 4.869615532401074e-05, |
|
"loss": 0.1327, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 3.445945945945946, |
|
"grad_norm": 0.38370949029922485, |
|
"learning_rate": 4.855666740066522e-05, |
|
"loss": 0.129, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.4493243243243246, |
|
"grad_norm": 0.41348859667778015, |
|
"learning_rate": 4.841031320053351e-05, |
|
"loss": 0.1243, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 3.4527027027027026, |
|
"grad_norm": 0.40817150473594666, |
|
"learning_rate": 4.825713538733748e-05, |
|
"loss": 0.1321, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 3.456081081081081, |
|
"grad_norm": 31.2849063873291, |
|
"learning_rate": 4.80971786139509e-05, |
|
"loss": 0.186, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 3.4594594594594597, |
|
"grad_norm": 8.919435501098633, |
|
"learning_rate": 4.793048950938256e-05, |
|
"loss": 0.6012, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 3.4628378378378377, |
|
"grad_norm": 1.2597789764404297, |
|
"learning_rate": 4.7757116665183614e-05, |
|
"loss": 0.373, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 3.4662162162162162, |
|
"grad_norm": 144.5867156982422, |
|
"learning_rate": 4.757711062128251e-05, |
|
"loss": 0.6334, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 3.4695945945945947, |
|
"grad_norm": 42.916805267333984, |
|
"learning_rate": 4.739052385125216e-05, |
|
"loss": 0.6214, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 3.472972972972973, |
|
"grad_norm": 0.7827467322349548, |
|
"learning_rate": 4.7197410747013376e-05, |
|
"loss": 0.308, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 3.4763513513513513, |
|
"grad_norm": 0.7671857476234436, |
|
"learning_rate": 4.6997827602979024e-05, |
|
"loss": 0.3552, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 3.47972972972973, |
|
"grad_norm": 3.2267792224884033, |
|
"learning_rate": 4.67918325996437e-05, |
|
"loss": 0.3405, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.483108108108108, |
|
"grad_norm": 13.3894624710083, |
|
"learning_rate": 4.6579485786623475e-05, |
|
"loss": 0.4361, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 3.4864864864864864, |
|
"grad_norm": 2.525721311569214, |
|
"learning_rate": 4.636084906515085e-05, |
|
"loss": 0.1359, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 3.489864864864865, |
|
"grad_norm": 52.08081817626953, |
|
"learning_rate": 4.6135986170029947e-05, |
|
"loss": 0.5506, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 3.4932432432432434, |
|
"grad_norm": 1.6132444143295288, |
|
"learning_rate": 4.5904962651057134e-05, |
|
"loss": 0.1839, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 3.4966216216216215, |
|
"grad_norm": 1.116671085357666, |
|
"learning_rate": 4.566784585391263e-05, |
|
"loss": 0.1534, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.6348862051963806, |
|
"learning_rate": 4.542470490052853e-05, |
|
"loss": 0.1429, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 3.5033783783783785, |
|
"grad_norm": 0.6647199392318726, |
|
"learning_rate": 4.517561066893909e-05, |
|
"loss": 0.1351, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 3.506756756756757, |
|
"grad_norm": 0.4823889136314392, |
|
"learning_rate": 4.492063577261908e-05, |
|
"loss": 0.1312, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 3.510135135135135, |
|
"grad_norm": 0.48596346378326416, |
|
"learning_rate": 4.4659854539316174e-05, |
|
"loss": 0.1418, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 3.5135135135135136, |
|
"grad_norm": 0.7251044511795044, |
|
"learning_rate": 4.439334298938374e-05, |
|
"loss": 0.1465, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.516891891891892, |
|
"grad_norm": 0.4920099973678589, |
|
"learning_rate": 4.4121178813620046e-05, |
|
"loss": 0.1481, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 3.52027027027027, |
|
"grad_norm": 0.42203307151794434, |
|
"learning_rate": 4.384344135062071e-05, |
|
"loss": 0.1219, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 3.5236486486486487, |
|
"grad_norm": 0.47877076268196106, |
|
"learning_rate": 4.3560211563650635e-05, |
|
"loss": 0.1282, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 3.527027027027027, |
|
"grad_norm": 0.4383416175842285, |
|
"learning_rate": 4.327157201704241e-05, |
|
"loss": 0.133, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 3.5304054054054053, |
|
"grad_norm": 0.49161508679389954, |
|
"learning_rate": 4.297760685212801e-05, |
|
"loss": 0.1268, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 3.5337837837837838, |
|
"grad_norm": 0.44632676243782043, |
|
"learning_rate": 4.2678401762710726e-05, |
|
"loss": 0.1304, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 3.5371621621621623, |
|
"grad_norm": 0.48019739985466003, |
|
"learning_rate": 4.237404397008455e-05, |
|
"loss": 0.134, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 3.5405405405405403, |
|
"grad_norm": 0.3879307508468628, |
|
"learning_rate": 4.206462219760831e-05, |
|
"loss": 0.1106, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 3.543918918918919, |
|
"grad_norm": 8.76919937133789, |
|
"learning_rate": 4.175022664484191e-05, |
|
"loss": 0.1268, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 3.5472972972972974, |
|
"grad_norm": 0.5539897680282593, |
|
"learning_rate": 4.143094896125227e-05, |
|
"loss": 0.134, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.5506756756756754, |
|
"grad_norm": 0.41708147525787354, |
|
"learning_rate": 4.1106882219496535e-05, |
|
"loss": 0.1185, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 3.554054054054054, |
|
"grad_norm": 0.7360028028488159, |
|
"learning_rate": 4.077812088829051e-05, |
|
"loss": 0.1251, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 3.5574324324324325, |
|
"grad_norm": 0.49543508887290955, |
|
"learning_rate": 4.044476080487005e-05, |
|
"loss": 0.1405, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 3.560810810810811, |
|
"grad_norm": 0.40071457624435425, |
|
"learning_rate": 4.0106899147053564e-05, |
|
"loss": 0.1162, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 3.564189189189189, |
|
"grad_norm": 0.6165151596069336, |
|
"learning_rate": 3.976463440491367e-05, |
|
"loss": 0.123, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 3.5675675675675675, |
|
"grad_norm": 0.4160919487476349, |
|
"learning_rate": 3.941806635206639e-05, |
|
"loss": 0.1276, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 3.570945945945946, |
|
"grad_norm": 0.4072069525718689, |
|
"learning_rate": 3.9067296016586105e-05, |
|
"loss": 0.1336, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 3.5743243243243246, |
|
"grad_norm": 0.4126531183719635, |
|
"learning_rate": 3.871242565155485e-05, |
|
"loss": 0.1267, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 3.5777027027027026, |
|
"grad_norm": 2.380211353302002, |
|
"learning_rate": 3.835355870525455e-05, |
|
"loss": 0.4067, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 3.581081081081081, |
|
"grad_norm": 0.3900314271450043, |
|
"learning_rate": 3.799079979101075e-05, |
|
"loss": 0.1231, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.5844594594594597, |
|
"grad_norm": 0.4255238473415375, |
|
"learning_rate": 3.7624254656696864e-05, |
|
"loss": 0.1405, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 3.5878378378378377, |
|
"grad_norm": 0.36614516377449036, |
|
"learning_rate": 3.7254030153907494e-05, |
|
"loss": 0.118, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 3.5912162162162162, |
|
"grad_norm": 0.40764889121055603, |
|
"learning_rate": 3.688023420681019e-05, |
|
"loss": 0.127, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 3.5945945945945947, |
|
"grad_norm": 0.35974109172821045, |
|
"learning_rate": 3.650297578068448e-05, |
|
"loss": 0.1238, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 3.597972972972973, |
|
"grad_norm": 0.4197885990142822, |
|
"learning_rate": 3.6122364850157326e-05, |
|
"loss": 0.1342, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 3.6013513513513513, |
|
"grad_norm": 0.3706846237182617, |
|
"learning_rate": 3.573851236714447e-05, |
|
"loss": 0.1108, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 3.60472972972973, |
|
"grad_norm": 0.3801131844520569, |
|
"learning_rate": 3.535153022850684e-05, |
|
"loss": 0.1279, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 3.608108108108108, |
|
"grad_norm": 0.35572677850723267, |
|
"learning_rate": 3.496153124343142e-05, |
|
"loss": 0.1263, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 3.6114864864864864, |
|
"grad_norm": 0.3624805510044098, |
|
"learning_rate": 3.4568629100546333e-05, |
|
"loss": 0.1185, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 3.614864864864865, |
|
"grad_norm": 0.36778637766838074, |
|
"learning_rate": 3.417293833477938e-05, |
|
"loss": 0.1222, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.618243243243243, |
|
"grad_norm": 0.34212857484817505, |
|
"learning_rate": 3.377457429397001e-05, |
|
"loss": 0.1196, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 3.6216216216216215, |
|
"grad_norm": 0.3891725540161133, |
|
"learning_rate": 3.337365310524423e-05, |
|
"loss": 0.1291, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 3.625, |
|
"grad_norm": 0.399291068315506, |
|
"learning_rate": 3.2970291641162396e-05, |
|
"loss": 0.1267, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 3.6283783783783785, |
|
"grad_norm": 0.346253901720047, |
|
"learning_rate": 3.25646074856497e-05, |
|
"loss": 0.1207, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 3.631756756756757, |
|
"grad_norm": 0.7178367376327515, |
|
"learning_rate": 3.2156718899719216e-05, |
|
"loss": 0.372, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 3.635135135135135, |
|
"grad_norm": 0.41096293926239014, |
|
"learning_rate": 3.174674478699772e-05, |
|
"loss": 0.1273, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 3.6385135135135136, |
|
"grad_norm": 0.3635697066783905, |
|
"learning_rate": 3.133480465906398e-05, |
|
"loss": 0.1256, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 3.641891891891892, |
|
"grad_norm": 0.3569808006286621, |
|
"learning_rate": 3.092101860061e-05, |
|
"loss": 0.1334, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 3.64527027027027, |
|
"grad_norm": 0.339813768863678, |
|
"learning_rate": 3.0505507234435122e-05, |
|
"loss": 0.111, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 3.6486486486486487, |
|
"grad_norm": 0.4040168225765228, |
|
"learning_rate": 3.0088391686283214e-05, |
|
"loss": 0.1348, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.652027027027027, |
|
"grad_norm": 0.37235626578330994, |
|
"learning_rate": 2.966979354953336e-05, |
|
"loss": 0.1262, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 3.6554054054054053, |
|
"grad_norm": 0.3623943030834198, |
|
"learning_rate": 2.92498348497541e-05, |
|
"loss": 0.1185, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 3.6587837837837838, |
|
"grad_norm": 0.36152735352516174, |
|
"learning_rate": 2.882863800913182e-05, |
|
"loss": 0.1248, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 3.6621621621621623, |
|
"grad_norm": 0.38995814323425293, |
|
"learning_rate": 2.8406325810783425e-05, |
|
"loss": 0.1282, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 3.6655405405405403, |
|
"grad_norm": 0.35114696621894836, |
|
"learning_rate": 2.798302136296379e-05, |
|
"loss": 0.1098, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 3.668918918918919, |
|
"grad_norm": 0.35527390241622925, |
|
"learning_rate": 2.7558848063178506e-05, |
|
"loss": 0.1129, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 3.6722972972972974, |
|
"grad_norm": 0.3661573827266693, |
|
"learning_rate": 2.713392956221225e-05, |
|
"loss": 0.1187, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 3.6756756756756754, |
|
"grad_norm": 0.3659035265445709, |
|
"learning_rate": 2.6708389728083372e-05, |
|
"loss": 0.1311, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 3.679054054054054, |
|
"grad_norm": 0.36093148589134216, |
|
"learning_rate": 2.6282352609935028e-05, |
|
"loss": 0.1294, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 3.6824324324324325, |
|
"grad_norm": 0.36720040440559387, |
|
"learning_rate": 2.5855942401873734e-05, |
|
"loss": 0.128, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.685810810810811, |
|
"grad_norm": 0.40525415539741516, |
|
"learning_rate": 2.5429283406765436e-05, |
|
"loss": 0.1373, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 3.689189189189189, |
|
"grad_norm": 0.3309617042541504, |
|
"learning_rate": 2.50025e-05, |
|
"loss": 0.115, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 3.6925675675675675, |
|
"grad_norm": 0.3174631893634796, |
|
"learning_rate": 2.4575716593234574e-05, |
|
"loss": 0.105, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 3.695945945945946, |
|
"grad_norm": 0.36353635787963867, |
|
"learning_rate": 2.414905759812627e-05, |
|
"loss": 0.1215, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 3.6993243243243246, |
|
"grad_norm": 0.3367186188697815, |
|
"learning_rate": 2.372264739006497e-05, |
|
"loss": 0.1121, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 3.7027027027027026, |
|
"grad_norm": 0.3455405831336975, |
|
"learning_rate": 2.329661027191664e-05, |
|
"loss": 0.1142, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 3.706081081081081, |
|
"grad_norm": 0.3431136906147003, |
|
"learning_rate": 2.287107043778775e-05, |
|
"loss": 0.1132, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 3.7094594594594597, |
|
"grad_norm": 0.34689489006996155, |
|
"learning_rate": 2.2446151936821504e-05, |
|
"loss": 0.1129, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 3.7128378378378377, |
|
"grad_norm": 0.3749043941497803, |
|
"learning_rate": 2.2021978637036214e-05, |
|
"loss": 0.1268, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 3.7162162162162162, |
|
"grad_norm": 0.34978923201560974, |
|
"learning_rate": 2.1598674189216575e-05, |
|
"loss": 0.1285, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.7195945945945947, |
|
"grad_norm": 0.3356480300426483, |
|
"learning_rate": 2.117636199086818e-05, |
|
"loss": 0.116, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 3.722972972972973, |
|
"grad_norm": 0.3172987699508667, |
|
"learning_rate": 2.0755165150245906e-05, |
|
"loss": 0.1137, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 3.7263513513513513, |
|
"grad_norm": 0.34791630506515503, |
|
"learning_rate": 2.0335206450466658e-05, |
|
"loss": 0.1153, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 3.72972972972973, |
|
"grad_norm": 0.3284004330635071, |
|
"learning_rate": 1.9916608313716786e-05, |
|
"loss": 0.1131, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 3.733108108108108, |
|
"grad_norm": 0.3191678822040558, |
|
"learning_rate": 1.9499492765564878e-05, |
|
"loss": 0.1123, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 3.7364864864864864, |
|
"grad_norm": 0.33098334074020386, |
|
"learning_rate": 1.9083981399389997e-05, |
|
"loss": 0.1156, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 3.739864864864865, |
|
"grad_norm": 0.3604113459587097, |
|
"learning_rate": 1.8670195340936026e-05, |
|
"loss": 0.1164, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 3.743243243243243, |
|
"grad_norm": 0.33711400628089905, |
|
"learning_rate": 1.825825521300229e-05, |
|
"loss": 0.1077, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 3.7466216216216215, |
|
"grad_norm": 0.3241155743598938, |
|
"learning_rate": 1.7848281100280788e-05, |
|
"loss": 0.1053, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.33422669768333435, |
|
"learning_rate": 1.7440392514350303e-05, |
|
"loss": 0.1226, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.7533783783783785, |
|
"grad_norm": 0.30073583126068115, |
|
"learning_rate": 1.7034708358837607e-05, |
|
"loss": 0.0983, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 3.756756756756757, |
|
"grad_norm": 0.37151581048965454, |
|
"learning_rate": 1.6631346894755773e-05, |
|
"loss": 0.1274, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 3.760135135135135, |
|
"grad_norm": 0.3346865773200989, |
|
"learning_rate": 1.6230425706029996e-05, |
|
"loss": 0.1203, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 3.7635135135135136, |
|
"grad_norm": 0.4401615560054779, |
|
"learning_rate": 1.5832061665220622e-05, |
|
"loss": 0.1122, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 3.766891891891892, |
|
"grad_norm": 0.3216763436794281, |
|
"learning_rate": 1.5436370899453666e-05, |
|
"loss": 0.1139, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 3.77027027027027, |
|
"grad_norm": 0.34183549880981445, |
|
"learning_rate": 1.504346875656858e-05, |
|
"loss": 0.1195, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 3.7736486486486487, |
|
"grad_norm": 0.3108338415622711, |
|
"learning_rate": 1.4653469771493166e-05, |
|
"loss": 0.1018, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 3.777027027027027, |
|
"grad_norm": 0.3019558787345886, |
|
"learning_rate": 1.426648763285553e-05, |
|
"loss": 0.1044, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 3.7804054054054053, |
|
"grad_norm": 0.30197712779045105, |
|
"learning_rate": 1.3882635149842685e-05, |
|
"loss": 0.0976, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 3.7837837837837838, |
|
"grad_norm": 15.628427505493164, |
|
"learning_rate": 1.3502024219315524e-05, |
|
"loss": 0.1159, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.7871621621621623, |
|
"grad_norm": 0.32293814420700073, |
|
"learning_rate": 1.3124765793189808e-05, |
|
"loss": 0.108, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 3.7905405405405403, |
|
"grad_norm": 0.31840014457702637, |
|
"learning_rate": 1.2750969846092514e-05, |
|
"loss": 0.1011, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 3.793918918918919, |
|
"grad_norm": 0.33972540497779846, |
|
"learning_rate": 1.2380745343303146e-05, |
|
"loss": 0.115, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 3.7972972972972974, |
|
"grad_norm": 0.3430902063846588, |
|
"learning_rate": 1.2014200208989255e-05, |
|
"loss": 0.1136, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 3.8006756756756754, |
|
"grad_norm": 0.34559592604637146, |
|
"learning_rate": 1.165144129474546e-05, |
|
"loss": 0.1172, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 3.804054054054054, |
|
"grad_norm": 0.33134159445762634, |
|
"learning_rate": 1.1292574348445151e-05, |
|
"loss": 0.1107, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 3.8074324324324325, |
|
"grad_norm": 0.2891589403152466, |
|
"learning_rate": 1.0937703983413898e-05, |
|
"loss": 0.0962, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 3.810810810810811, |
|
"grad_norm": 0.34017154574394226, |
|
"learning_rate": 1.058693364793361e-05, |
|
"loss": 0.1168, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 3.814189189189189, |
|
"grad_norm": 0.3454993963241577, |
|
"learning_rate": 1.0240365595086335e-05, |
|
"loss": 0.1167, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 3.8175675675675675, |
|
"grad_norm": 0.33566930890083313, |
|
"learning_rate": 9.898100852946441e-06, |
|
"loss": 0.1209, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.820945945945946, |
|
"grad_norm": 0.31290584802627563, |
|
"learning_rate": 9.560239195129949e-06, |
|
"loss": 0.1062, |
|
"step": 1131 |
|
}, |
|
{ |
|
"epoch": 3.8243243243243246, |
|
"grad_norm": 0.33595094084739685, |
|
"learning_rate": 9.226879111709494e-06, |
|
"loss": 0.1091, |
|
"step": 1132 |
|
}, |
|
{ |
|
"epoch": 3.8277027027027026, |
|
"grad_norm": 0.3021032512187958, |
|
"learning_rate": 8.898117780503471e-06, |
|
"loss": 0.1029, |
|
"step": 1133 |
|
}, |
|
{ |
|
"epoch": 3.831081081081081, |
|
"grad_norm": 0.3187675476074219, |
|
"learning_rate": 8.574051038747738e-06, |
|
"loss": 0.1121, |
|
"step": 1134 |
|
}, |
|
{ |
|
"epoch": 3.8344594594594597, |
|
"grad_norm": 0.2892165780067444, |
|
"learning_rate": 8.254773355158088e-06, |
|
"loss": 0.1007, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 3.8378378378378377, |
|
"grad_norm": 0.33364662528038025, |
|
"learning_rate": 7.94037780239169e-06, |
|
"loss": 0.1146, |
|
"step": 1136 |
|
}, |
|
{ |
|
"epoch": 3.8412162162162162, |
|
"grad_norm": 0.32407206296920776, |
|
"learning_rate": 7.630956029915455e-06, |
|
"loss": 0.1016, |
|
"step": 1137 |
|
}, |
|
{ |
|
"epoch": 3.8445945945945947, |
|
"grad_norm": 0.29210153222084045, |
|
"learning_rate": 7.32659823728928e-06, |
|
"loss": 0.0989, |
|
"step": 1138 |
|
}, |
|
{ |
|
"epoch": 3.847972972972973, |
|
"grad_norm": 0.3027746081352234, |
|
"learning_rate": 7.027393147871994e-06, |
|
"loss": 0.1031, |
|
"step": 1139 |
|
}, |
|
{ |
|
"epoch": 3.8513513513513513, |
|
"grad_norm": 0.34020087122917175, |
|
"learning_rate": 6.733427982957593e-06, |
|
"loss": 0.1105, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.85472972972973, |
|
"grad_norm": 0.3309917747974396, |
|
"learning_rate": 6.444788436349374e-06, |
|
"loss": 0.1155, |
|
"step": 1141 |
|
}, |
|
{ |
|
"epoch": 3.858108108108108, |
|
"grad_norm": 0.3191674053668976, |
|
"learning_rate": 6.16155864937929e-06, |
|
"loss": 0.1113, |
|
"step": 1142 |
|
}, |
|
{ |
|
"epoch": 3.8614864864864864, |
|
"grad_norm": 0.3304139971733093, |
|
"learning_rate": 5.883821186379955e-06, |
|
"loss": 0.1113, |
|
"step": 1143 |
|
}, |
|
{ |
|
"epoch": 3.864864864864865, |
|
"grad_norm": 0.3479062616825104, |
|
"learning_rate": 5.6116570106162695e-06, |
|
"loss": 0.1173, |
|
"step": 1144 |
|
}, |
|
{ |
|
"epoch": 3.868243243243243, |
|
"grad_norm": 0.3519810140132904, |
|
"learning_rate": 5.345145460683825e-06, |
|
"loss": 0.1076, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 3.8716216216216215, |
|
"grad_norm": 0.4121153652667999, |
|
"learning_rate": 5.0843642273809276e-06, |
|
"loss": 0.1073, |
|
"step": 1146 |
|
}, |
|
{ |
|
"epoch": 3.875, |
|
"grad_norm": 0.3174130618572235, |
|
"learning_rate": 4.82938933106091e-06, |
|
"loss": 0.1016, |
|
"step": 1147 |
|
}, |
|
{ |
|
"epoch": 3.8783783783783785, |
|
"grad_norm": 0.2952731251716614, |
|
"learning_rate": 4.5802950994714764e-06, |
|
"loss": 0.1038, |
|
"step": 1148 |
|
}, |
|
{ |
|
"epoch": 3.881756756756757, |
|
"grad_norm": 0.304749071598053, |
|
"learning_rate": 4.337154146087377e-06, |
|
"loss": 0.1083, |
|
"step": 1149 |
|
}, |
|
{ |
|
"epoch": 3.885135135135135, |
|
"grad_norm": 0.3059677481651306, |
|
"learning_rate": 4.100037348942866e-06, |
|
"loss": 0.099, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.8885135135135136, |
|
"grad_norm": 0.3029370605945587, |
|
"learning_rate": 3.869013829970057e-06, |
|
"loss": 0.1017, |
|
"step": 1151 |
|
}, |
|
{ |
|
"epoch": 3.891891891891892, |
|
"grad_norm": 0.28835681080818176, |
|
"learning_rate": 3.644150934849158e-06, |
|
"loss": 0.1, |
|
"step": 1152 |
|
}, |
|
{ |
|
"epoch": 3.89527027027027, |
|
"grad_norm": 0.2965335249900818, |
|
"learning_rate": 3.425514213376533e-06, |
|
"loss": 0.1023, |
|
"step": 1153 |
|
}, |
|
{ |
|
"epoch": 3.8986486486486487, |
|
"grad_norm": 0.2934282422065735, |
|
"learning_rate": 3.213167400356303e-06, |
|
"loss": 0.0937, |
|
"step": 1154 |
|
}, |
|
{ |
|
"epoch": 3.902027027027027, |
|
"grad_norm": 0.32294800877571106, |
|
"learning_rate": 3.007172397020972e-06, |
|
"loss": 0.1108, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 3.9054054054054053, |
|
"grad_norm": 0.2884935438632965, |
|
"learning_rate": 2.8075892529866257e-06, |
|
"loss": 0.1025, |
|
"step": 1156 |
|
}, |
|
{ |
|
"epoch": 3.9087837837837838, |
|
"grad_norm": 0.31093308329582214, |
|
"learning_rate": 2.6144761487478416e-06, |
|
"loss": 0.107, |
|
"step": 1157 |
|
}, |
|
{ |
|
"epoch": 3.9121621621621623, |
|
"grad_norm": 0.3277430236339569, |
|
"learning_rate": 2.4278893787174935e-06, |
|
"loss": 0.1216, |
|
"step": 1158 |
|
}, |
|
{ |
|
"epoch": 3.9155405405405403, |
|
"grad_norm": 0.467430979013443, |
|
"learning_rate": 2.2478833348163886e-06, |
|
"loss": 0.3528, |
|
"step": 1159 |
|
}, |
|
{ |
|
"epoch": 3.918918918918919, |
|
"grad_norm": 0.2848818302154541, |
|
"learning_rate": 2.074510490617438e-06, |
|
"loss": 0.0981, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.9222972972972974, |
|
"grad_norm": 0.2947328984737396, |
|
"learning_rate": 1.9078213860491097e-06, |
|
"loss": 0.1003, |
|
"step": 1161 |
|
}, |
|
{ |
|
"epoch": 3.9256756756756754, |
|
"grad_norm": 0.28310471773147583, |
|
"learning_rate": 1.7478646126625187e-06, |
|
"loss": 0.096, |
|
"step": 1162 |
|
}, |
|
{ |
|
"epoch": 3.929054054054054, |
|
"grad_norm": 0.31671369075775146, |
|
"learning_rate": 1.5946867994665007e-06, |
|
"loss": 0.1062, |
|
"step": 1163 |
|
}, |
|
{ |
|
"epoch": 3.9324324324324325, |
|
"grad_norm": 0.2964227795600891, |
|
"learning_rate": 1.4483325993347872e-06, |
|
"loss": 0.102, |
|
"step": 1164 |
|
}, |
|
{ |
|
"epoch": 3.935810810810811, |
|
"grad_norm": 0.309048593044281, |
|
"learning_rate": 1.308844675989258e-06, |
|
"loss": 0.1049, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 3.939189189189189, |
|
"grad_norm": 0.31580665707588196, |
|
"learning_rate": 1.1762636915630122e-06, |
|
"loss": 0.1019, |
|
"step": 1166 |
|
}, |
|
{ |
|
"epoch": 3.9425675675675675, |
|
"grad_norm": 0.27929919958114624, |
|
"learning_rate": 1.0506282947469768e-06, |
|
"loss": 0.0962, |
|
"step": 1167 |
|
}, |
|
{ |
|
"epoch": 3.945945945945946, |
|
"grad_norm": 0.44457200169563293, |
|
"learning_rate": 9.319751095233885e-07, |
|
"loss": 0.3502, |
|
"step": 1168 |
|
}, |
|
{ |
|
"epoch": 3.9493243243243246, |
|
"grad_norm": 0.26814886927604675, |
|
"learning_rate": 8.203387244895631e-07, |
|
"loss": 0.0968, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 3.9527027027027026, |
|
"grad_norm": 0.31180819869041443, |
|
"learning_rate": 7.157516827749585e-07, |
|
"loss": 0.1097, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 3.956081081081081, |
|
"grad_norm": 0.30869176983833313, |
|
"learning_rate": 6.182444725545342e-07, |
|
"loss": 0.1067, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 3.9594594594594597, |
|
"grad_norm": 0.29772305488586426, |
|
"learning_rate": 5.278455181611357e-07, |
|
"loss": 0.1061, |
|
"step": 1172 |
|
}, |
|
{ |
|
"epoch": 3.9628378378378377, |
|
"grad_norm": 0.3115202784538269, |
|
"learning_rate": 4.445811717995063e-07, |
|
"loss": 0.1081, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 3.9662162162162162, |
|
"grad_norm": 0.31144705414772034, |
|
"learning_rate": 3.684757058643714e-07, |
|
"loss": 0.097, |
|
"step": 1174 |
|
}, |
|
{ |
|
"epoch": 3.9695945945945947, |
|
"grad_norm": 0.3056892156600952, |
|
"learning_rate": 2.995513058647855e-07, |
|
"loss": 0.1069, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 3.972972972972973, |
|
"grad_norm": 0.327035129070282, |
|
"learning_rate": 2.3782806395682474e-07, |
|
"loss": 0.1168, |
|
"step": 1176 |
|
}, |
|
{ |
|
"epoch": 3.9763513513513513, |
|
"grad_norm": 0.31039026379585266, |
|
"learning_rate": 1.8332397308652485e-07, |
|
"loss": 0.105, |
|
"step": 1177 |
|
}, |
|
{ |
|
"epoch": 3.97972972972973, |
|
"grad_norm": 0.2924991548061371, |
|
"learning_rate": 1.3605492174477425e-07, |
|
"loss": 0.0984, |
|
"step": 1178 |
|
}, |
|
{ |
|
"epoch": 3.983108108108108, |
|
"grad_norm": 0.2952897250652313, |
|
"learning_rate": 9.603468933562955e-08, |
|
"loss": 0.1039, |
|
"step": 1179 |
|
}, |
|
{ |
|
"epoch": 3.9864864864864864, |
|
"grad_norm": 0.28377214074134827, |
|
"learning_rate": 6.32749421594948e-08, |
|
"loss": 0.098, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 3.989864864864865, |
|
"grad_norm": 0.3225543200969696, |
|
"learning_rate": 3.778523001229054e-08, |
|
"loss": 0.1196, |
|
"step": 1181 |
|
}, |
|
{ |
|
"epoch": 3.993243243243243, |
|
"grad_norm": 0.3456767201423645, |
|
"learning_rate": 1.957298340156484e-08, |
|
"loss": 0.1071, |
|
"step": 1182 |
|
}, |
|
{ |
|
"epoch": 3.9966216216216215, |
|
"grad_norm": 0.30064505338668823, |
|
"learning_rate": 8.64351138044836e-09, |
|
"loss": 0.1043, |
|
"step": 1183 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.30672967433929443, |
|
"learning_rate": 5e-09, |
|
"loss": 0.0813, |
|
"step": 1184 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 1184, |
|
"total_flos": 1.048251868267099e+19, |
|
"train_loss": 0.24899632067771982, |
|
"train_runtime": 10196.9918, |
|
"train_samples_per_second": 2.78, |
|
"train_steps_per_second": 0.116 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 1184, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.048251868267099e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|