{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.001801801801802, |
|
"eval_steps": 500, |
|
"global_step": 833, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010810810810810811, |
|
"grad_norm": 12.305854329298121, |
|
"learning_rate": 2.9999652701989443e-05, |
|
"loss": 2.4911, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.021621621621621623, |
|
"grad_norm": 4.240144865945876, |
|
"learning_rate": 2.9998610824039904e-05, |
|
"loss": 2.0652, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.032432432432432434, |
|
"grad_norm": 3.2725279352000864, |
|
"learning_rate": 2.9996874414396984e-05, |
|
"loss": 1.9611, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.043243243243243246, |
|
"grad_norm": 3.303814919197607, |
|
"learning_rate": 2.9994443553467584e-05, |
|
"loss": 2.0248, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"grad_norm": 3.003679341904051, |
|
"learning_rate": 2.9991318353816112e-05, |
|
"loss": 1.9903, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06486486486486487, |
|
"grad_norm": 2.986093823263424, |
|
"learning_rate": 2.9987498960159325e-05, |
|
"loss": 1.8813, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07567567567567568, |
|
"grad_norm": 2.790555184817465, |
|
"learning_rate": 2.99829855493596e-05, |
|
"loss": 1.976, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.08648648648648649, |
|
"grad_norm": 2.986932414712697, |
|
"learning_rate": 2.997777833041674e-05, |
|
"loss": 1.9653, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0972972972972973, |
|
"grad_norm": 2.656324556829213, |
|
"learning_rate": 2.9971877544458325e-05, |
|
"loss": 1.8551, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.10810810810810811, |
|
"grad_norm": 3.2168328469993335, |
|
"learning_rate": 2.996528346472851e-05, |
|
"loss": 1.9812, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11891891891891893, |
|
"grad_norm": 2.696843177932571, |
|
"learning_rate": 2.9957996396575407e-05, |
|
"loss": 1.9728, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.12972972972972974, |
|
"grad_norm": 3.2884895488967603, |
|
"learning_rate": 2.995001667743691e-05, |
|
"loss": 2.0118, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.14054054054054055, |
|
"grad_norm": 2.7402908143169555, |
|
"learning_rate": 2.9941344676825106e-05, |
|
"loss": 1.9478, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.15135135135135136, |
|
"grad_norm": 2.375732605463657, |
|
"learning_rate": 2.993198079630913e-05, |
|
"loss": 1.9092, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.16216216216216217, |
|
"grad_norm": 2.6770114768040143, |
|
"learning_rate": 2.9921925469496594e-05, |
|
"loss": 2.0062, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.17297297297297298, |
|
"grad_norm": 2.7654577179498943, |
|
"learning_rate": 2.9911179162013495e-05, |
|
"loss": 1.9634, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1837837837837838, |
|
"grad_norm": 2.9383093484984966, |
|
"learning_rate": 2.9899742371482663e-05, |
|
"loss": 1.9553, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1945945945945946, |
|
"grad_norm": 2.5901057077516443, |
|
"learning_rate": 2.988761562750071e-05, |
|
"loss": 1.9399, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.20540540540540542, |
|
"grad_norm": 2.381483900812707, |
|
"learning_rate": 2.9874799491613513e-05, |
|
"loss": 1.9434, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"grad_norm": 2.4229562035711263, |
|
"learning_rate": 2.9861294557290205e-05, |
|
"loss": 1.9481, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22702702702702704, |
|
"grad_norm": 2.5010028477455393, |
|
"learning_rate": 2.9847101449895692e-05, |
|
"loss": 1.9495, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.23783783783783785, |
|
"grad_norm": 2.3138487877840244, |
|
"learning_rate": 2.9832220826661707e-05, |
|
"loss": 1.9259, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.24864864864864866, |
|
"grad_norm": 2.5123033559714245, |
|
"learning_rate": 2.981665337665636e-05, |
|
"loss": 1.9731, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2594594594594595, |
|
"grad_norm": 2.6439754352664644, |
|
"learning_rate": 2.9800399820752236e-05, |
|
"loss": 1.9819, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 2.3833028996710746, |
|
"learning_rate": 2.9783460911593024e-05, |
|
"loss": 1.8926, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2810810810810811, |
|
"grad_norm": 2.4316596585025687, |
|
"learning_rate": 2.9765837433558652e-05, |
|
"loss": 1.9008, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2918918918918919, |
|
"grad_norm": 2.492002538880973, |
|
"learning_rate": 2.9747530202728965e-05, |
|
"loss": 1.9837, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3027027027027027, |
|
"grad_norm": 2.50223915481049, |
|
"learning_rate": 2.9728540066845944e-05, |
|
"loss": 1.9134, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.31351351351351353, |
|
"grad_norm": 2.79878301554176, |
|
"learning_rate": 2.9708867905274444e-05, |
|
"loss": 1.9226, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"grad_norm": 2.2798098795033477, |
|
"learning_rate": 2.9688514628961473e-05, |
|
"loss": 1.9029, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.33513513513513515, |
|
"grad_norm": 2.2032821621350536, |
|
"learning_rate": 2.966748118039402e-05, |
|
"loss": 1.917, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.34594594594594597, |
|
"grad_norm": 2.434797694487723, |
|
"learning_rate": 2.9645768533555387e-05, |
|
"loss": 1.9226, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3567567567567568, |
|
"grad_norm": 2.2306447173589947, |
|
"learning_rate": 2.9623377693880123e-05, |
|
"loss": 1.9273, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3675675675675676, |
|
"grad_norm": 2.4678527947600046, |
|
"learning_rate": 2.9600309698207435e-05, |
|
"loss": 1.8761, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3783783783783784, |
|
"grad_norm": 2.172069501103823, |
|
"learning_rate": 2.957656561473319e-05, |
|
"loss": 1.9297, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3891891891891892, |
|
"grad_norm": 2.009177875410658, |
|
"learning_rate": 2.955214654296045e-05, |
|
"loss": 1.8419, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.8950203407246216, |
|
"learning_rate": 2.952705361364855e-05, |
|
"loss": 1.9594, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.41081081081081083, |
|
"grad_norm": 2.5653163311951106, |
|
"learning_rate": 2.950128798876075e-05, |
|
"loss": 1.8869, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.42162162162162165, |
|
"grad_norm": 2.732827590601958, |
|
"learning_rate": 2.947485086141042e-05, |
|
"loss": 1.8974, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 2.144285276139079, |
|
"learning_rate": 2.9447743455805793e-05, |
|
"loss": 1.9196, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.44324324324324327, |
|
"grad_norm": 2.4409981534782914, |
|
"learning_rate": 2.9419967027193267e-05, |
|
"loss": 1.9428, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4540540540540541, |
|
"grad_norm": 2.646738039623452, |
|
"learning_rate": 2.9391522861799298e-05, |
|
"loss": 1.9737, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4648648648648649, |
|
"grad_norm": 2.3721501035510233, |
|
"learning_rate": 2.9362412276770833e-05, |
|
"loss": 1.9554, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.4756756756756757, |
|
"grad_norm": 2.423435489971938, |
|
"learning_rate": 2.93326366201143e-05, |
|
"loss": 1.9787, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"grad_norm": 2.207859877546887, |
|
"learning_rate": 2.9302197270633207e-05, |
|
"loss": 1.9259, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4972972972972973, |
|
"grad_norm": 2.4856913311288733, |
|
"learning_rate": 2.9271095637864295e-05, |
|
"loss": 1.9433, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.5081081081081081, |
|
"grad_norm": 2.440879046484796, |
|
"learning_rate": 2.9239333162012256e-05, |
|
"loss": 1.8939, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.518918918918919, |
|
"grad_norm": 2.45798549198192, |
|
"learning_rate": 2.9206911313883037e-05, |
|
"loss": 1.9845, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5297297297297298, |
|
"grad_norm": 2.3336768971980706, |
|
"learning_rate": 2.9173831594815768e-05, |
|
"loss": 1.916, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 2.39094175177689, |
|
"learning_rate": 2.9140095536613182e-05, |
|
"loss": 1.8494, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5513513513513514, |
|
"grad_norm": 2.429522504872439, |
|
"learning_rate": 2.9105704701470744e-05, |
|
"loss": 1.9189, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5621621621621622, |
|
"grad_norm": 2.7779327685196957, |
|
"learning_rate": 2.907066068190426e-05, |
|
"loss": 2.027, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.572972972972973, |
|
"grad_norm": 2.4829217011923106, |
|
"learning_rate": 2.903496510067618e-05, |
|
"loss": 1.9414, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5837837837837838, |
|
"grad_norm": 2.307375658362859, |
|
"learning_rate": 2.899861961072041e-05, |
|
"loss": 1.9, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5945945945945946, |
|
"grad_norm": 2.5317691911892855, |
|
"learning_rate": 2.896162589506579e-05, |
|
"loss": 1.9359, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6054054054054054, |
|
"grad_norm": 2.4918163002128138, |
|
"learning_rate": 2.8923985666758178e-05, |
|
"loss": 1.8599, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6162162162162163, |
|
"grad_norm": 2.076258038368402, |
|
"learning_rate": 2.888570066878109e-05, |
|
"loss": 1.9127, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6270270270270271, |
|
"grad_norm": 2.185928367527268, |
|
"learning_rate": 2.884677267397502e-05, |
|
"loss": 1.8128, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.6378378378378379, |
|
"grad_norm": 2.866856917703555, |
|
"learning_rate": 2.88072034849553e-05, |
|
"loss": 1.9267, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.6486486486486487, |
|
"grad_norm": 2.403239863727176, |
|
"learning_rate": 2.8766994934028697e-05, |
|
"loss": 1.9034, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6594594594594595, |
|
"grad_norm": 2.156072962852351, |
|
"learning_rate": 2.8726148883108505e-05, |
|
"loss": 1.9516, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6702702702702703, |
|
"grad_norm": 2.511207967261495, |
|
"learning_rate": 2.868466722362836e-05, |
|
"loss": 1.8811, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6810810810810811, |
|
"grad_norm": 2.3602190597776467, |
|
"learning_rate": 2.8642551876454625e-05, |
|
"loss": 1.9503, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6918918918918919, |
|
"grad_norm": 2.482488878028461, |
|
"learning_rate": 2.8599804791797483e-05, |
|
"loss": 1.8807, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7027027027027027, |
|
"grad_norm": 2.587845075067827, |
|
"learning_rate": 2.8556427949120587e-05, |
|
"loss": 1.9359, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7135135135135136, |
|
"grad_norm": 2.209476666719861, |
|
"learning_rate": 2.851242335704943e-05, |
|
"loss": 1.898, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.7243243243243244, |
|
"grad_norm": 2.419802615785546, |
|
"learning_rate": 2.8467793053278318e-05, |
|
"loss": 1.8444, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.7351351351351352, |
|
"grad_norm": 2.226045413120279, |
|
"learning_rate": 2.842253910447601e-05, |
|
"loss": 1.8982, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.745945945945946, |
|
"grad_norm": 2.44484302970035, |
|
"learning_rate": 2.837666360619002e-05, |
|
"loss": 1.9596, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.7567567567567568, |
|
"grad_norm": 2.43331348143749, |
|
"learning_rate": 2.8330168682749594e-05, |
|
"loss": 1.9313, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7675675675675676, |
|
"grad_norm": 2.0567115700480247, |
|
"learning_rate": 2.8283056487167313e-05, |
|
"loss": 1.9314, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7783783783783784, |
|
"grad_norm": 2.3852531892957223, |
|
"learning_rate": 2.8235329201039424e-05, |
|
"loss": 1.8631, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.7891891891891892, |
|
"grad_norm": 2.136466132414422, |
|
"learning_rate": 2.8186989034444794e-05, |
|
"loss": 1.859, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.592701276826683, |
|
"learning_rate": 2.8138038225842577e-05, |
|
"loss": 1.96, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 2.0512819357757, |
|
"learning_rate": 2.808847904196857e-05, |
|
"loss": 1.8646, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8216216216216217, |
|
"grad_norm": 2.1496899989713767, |
|
"learning_rate": 2.8038313777730237e-05, |
|
"loss": 1.8924, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.8324324324324325, |
|
"grad_norm": 2.150493531375372, |
|
"learning_rate": 2.798754475610044e-05, |
|
"loss": 1.7991, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.8432432432432433, |
|
"grad_norm": 2.094639871928105, |
|
"learning_rate": 2.7936174328009864e-05, |
|
"loss": 1.9364, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.8540540540540541, |
|
"grad_norm": 2.3155422463173623, |
|
"learning_rate": 2.7884204872238182e-05, |
|
"loss": 1.8647, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 2.2606435766654234, |
|
"learning_rate": 2.7831638795303873e-05, |
|
"loss": 1.8224, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8756756756756757, |
|
"grad_norm": 2.2579963330786774, |
|
"learning_rate": 2.7778478531352795e-05, |
|
"loss": 1.8282, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.8864864864864865, |
|
"grad_norm": 1.9817080023633638, |
|
"learning_rate": 2.7724726542045463e-05, |
|
"loss": 1.8818, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.8972972972972973, |
|
"grad_norm": 2.24441068506298, |
|
"learning_rate": 2.7670385316443084e-05, |
|
"loss": 1.9305, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.9081081081081082, |
|
"grad_norm": 2.541165407563793, |
|
"learning_rate": 2.7615457370892257e-05, |
|
"loss": 1.8736, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.918918918918919, |
|
"grad_norm": 1.9510813262964137, |
|
"learning_rate": 2.7559945248908468e-05, |
|
"loss": 1.8999, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.9297297297297298, |
|
"grad_norm": 2.4719694412540245, |
|
"learning_rate": 2.7503851521058333e-05, |
|
"loss": 1.8846, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.9405405405405406, |
|
"grad_norm": 2.3098919918683096, |
|
"learning_rate": 2.744717878484053e-05, |
|
"loss": 1.8999, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.9513513513513514, |
|
"grad_norm": 2.465948183249405, |
|
"learning_rate": 2.7389929664565523e-05, |
|
"loss": 1.8028, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.9621621621621622, |
|
"grad_norm": 2.204108562575812, |
|
"learning_rate": 2.733210681123406e-05, |
|
"loss": 1.9526, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.972972972972973, |
|
"grad_norm": 2.3661976037384815, |
|
"learning_rate": 2.7273712902414396e-05, |
|
"loss": 1.8472, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9837837837837838, |
|
"grad_norm": 2.1596420136763137, |
|
"learning_rate": 2.7214750642118315e-05, |
|
"loss": 1.849, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9945945945945946, |
|
"grad_norm": 2.1642232408896453, |
|
"learning_rate": 2.715522276067591e-05, |
|
"loss": 1.8476, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.0054054054054054, |
|
"grad_norm": 2.102003078375359, |
|
"learning_rate": 2.709513201460915e-05, |
|
"loss": 1.6092, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.0162162162162163, |
|
"grad_norm": 2.580181780728641, |
|
"learning_rate": 2.7034481186504253e-05, |
|
"loss": 1.3409, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.027027027027027, |
|
"grad_norm": 1.987111465217205, |
|
"learning_rate": 2.6973273084882802e-05, |
|
"loss": 1.3026, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.037837837837838, |
|
"grad_norm": 1.835737689431998, |
|
"learning_rate": 2.691151054407172e-05, |
|
"loss": 1.2992, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.0486486486486486, |
|
"grad_norm": 2.075134750486952, |
|
"learning_rate": 2.684919642407202e-05, |
|
"loss": 1.2751, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.0594594594594595, |
|
"grad_norm": 2.154847674569881, |
|
"learning_rate": 2.6786333610426353e-05, |
|
"loss": 1.2951, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.0702702702702702, |
|
"grad_norm": 2.201259061666919, |
|
"learning_rate": 2.67229250140854e-05, |
|
"loss": 1.2813, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 2.099199218269766, |
|
"learning_rate": 2.6658973571273077e-05, |
|
"loss": 1.2422, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0918918918918918, |
|
"grad_norm": 2.1682257578187185, |
|
"learning_rate": 2.6594482243350558e-05, |
|
"loss": 1.2958, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.1027027027027028, |
|
"grad_norm": 1.9450085810744047, |
|
"learning_rate": 2.6529454016679175e-05, |
|
"loss": 1.2175, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.1135135135135135, |
|
"grad_norm": 1.9627021282761483, |
|
"learning_rate": 2.6463891902482087e-05, |
|
"loss": 1.2143, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.1243243243243244, |
|
"grad_norm": 2.0335479254484716, |
|
"learning_rate": 2.639779893670487e-05, |
|
"loss": 1.2425, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.135135135135135, |
|
"grad_norm": 2.004046748177982, |
|
"learning_rate": 2.6331178179874934e-05, |
|
"loss": 1.2834, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.145945945945946, |
|
"grad_norm": 2.1051127397939333, |
|
"learning_rate": 2.6264032716959778e-05, |
|
"loss": 1.2787, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.1567567567567567, |
|
"grad_norm": 1.941259889065534, |
|
"learning_rate": 2.6196365657224166e-05, |
|
"loss": 1.2456, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.1675675675675676, |
|
"grad_norm": 2.4331557433084643, |
|
"learning_rate": 2.612818013408613e-05, |
|
"loss": 1.2398, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.1783783783783783, |
|
"grad_norm": 2.125511278222413, |
|
"learning_rate": 2.6059479304971867e-05, |
|
"loss": 1.2717, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.1891891891891893, |
|
"grad_norm": 2.254173095275699, |
|
"learning_rate": 2.5990266351169554e-05, |
|
"loss": 1.2694, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.9756822137444563, |
|
"learning_rate": 2.5920544477682012e-05, |
|
"loss": 1.2747, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.2108108108108109, |
|
"grad_norm": 2.1110444161525077, |
|
"learning_rate": 2.5850316913078298e-05, |
|
"loss": 1.295, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.2216216216216216, |
|
"grad_norm": 2.1222020915135995, |
|
"learning_rate": 2.5779586909344206e-05, |
|
"loss": 1.3109, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.2324324324324325, |
|
"grad_norm": 1.8417665825926723, |
|
"learning_rate": 2.570835774173169e-05, |
|
"loss": 1.3056, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.2432432432432432, |
|
"grad_norm": 2.0280022835044016, |
|
"learning_rate": 2.563663270860717e-05, |
|
"loss": 1.32, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.2540540540540541, |
|
"grad_norm": 2.025208781667826, |
|
"learning_rate": 2.5564415131298824e-05, |
|
"loss": 1.2705, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.2648648648648648, |
|
"grad_norm": 1.9120753635362695, |
|
"learning_rate": 2.5491708353942773e-05, |
|
"loss": 1.2645, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.2756756756756757, |
|
"grad_norm": 1.9983727272207963, |
|
"learning_rate": 2.5418515743328232e-05, |
|
"loss": 1.2795, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.2864864864864864, |
|
"grad_norm": 2.0065772123507735, |
|
"learning_rate": 2.534484068874162e-05, |
|
"loss": 1.3017, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.2972972972972974, |
|
"grad_norm": 2.0474786433390335, |
|
"learning_rate": 2.5270686601809577e-05, |
|
"loss": 1.25, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.308108108108108, |
|
"grad_norm": 1.9384897745280192, |
|
"learning_rate": 2.5196056916341016e-05, |
|
"loss": 1.2294, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.318918918918919, |
|
"grad_norm": 1.9918888965378558, |
|
"learning_rate": 2.512095508816812e-05, |
|
"loss": 1.2941, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.3297297297297297, |
|
"grad_norm": 1.9846908820959408, |
|
"learning_rate": 2.5045384594986285e-05, |
|
"loss": 1.2538, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.3405405405405406, |
|
"grad_norm": 2.080792691569016, |
|
"learning_rate": 2.4969348936193102e-05, |
|
"loss": 1.2543, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 2.0125722435554514, |
|
"learning_rate": 2.4892851632726306e-05, |
|
"loss": 1.2757, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.3621621621621622, |
|
"grad_norm": 1.9764284092403845, |
|
"learning_rate": 2.481589622690075e-05, |
|
"loss": 1.2625, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.372972972972973, |
|
"grad_norm": 2.027142138762262, |
|
"learning_rate": 2.4738486282244333e-05, |
|
"loss": 1.2831, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.3837837837837839, |
|
"grad_norm": 1.9798502988269455, |
|
"learning_rate": 2.4660625383333028e-05, |
|
"loss": 1.2673, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.3945945945945946, |
|
"grad_norm": 2.047403010408032, |
|
"learning_rate": 2.4582317135624886e-05, |
|
"loss": 1.2698, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.4054054054054055, |
|
"grad_norm": 2.012366168035498, |
|
"learning_rate": 2.450356516529304e-05, |
|
"loss": 1.3192, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.4162162162162162, |
|
"grad_norm": 2.05238416440222, |
|
"learning_rate": 2.4424373119057852e-05, |
|
"loss": 1.2696, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.427027027027027, |
|
"grad_norm": 2.2479201479489923, |
|
"learning_rate": 2.4344744664018e-05, |
|
"loss": 1.3024, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.4378378378378378, |
|
"grad_norm": 2.1894458789904583, |
|
"learning_rate": 2.4264683487480687e-05, |
|
"loss": 1.3099, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.4486486486486487, |
|
"grad_norm": 2.037077094665106, |
|
"learning_rate": 2.4184193296790887e-05, |
|
"loss": 1.2514, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.4594594594594594, |
|
"grad_norm": 2.0273525133084087, |
|
"learning_rate": 2.410327781915969e-05, |
|
"loss": 1.2798, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.4702702702702704, |
|
"grad_norm": 2.1689507381607713, |
|
"learning_rate": 2.402194080149167e-05, |
|
"loss": 1.3066, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.481081081081081, |
|
"grad_norm": 1.9503795898614715, |
|
"learning_rate": 2.394018601021143e-05, |
|
"loss": 1.2582, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.491891891891892, |
|
"grad_norm": 2.2761809415998706, |
|
"learning_rate": 2.385801723108914e-05, |
|
"loss": 1.2981, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.5027027027027027, |
|
"grad_norm": 2.0356459303417296, |
|
"learning_rate": 2.3775438269065277e-05, |
|
"loss": 1.2505, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.5135135135135136, |
|
"grad_norm": 1.9297970402246538, |
|
"learning_rate": 2.3692452948074395e-05, |
|
"loss": 1.2546, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.5243243243243243, |
|
"grad_norm": 2.0123397178971714, |
|
"learning_rate": 2.360906511086809e-05, |
|
"loss": 1.2571, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.535135135135135, |
|
"grad_norm": 2.3184894064368033, |
|
"learning_rate": 2.352527861883702e-05, |
|
"loss": 1.2625, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.545945945945946, |
|
"grad_norm": 1.936734476547218, |
|
"learning_rate": 2.3441097351832113e-05, |
|
"loss": 1.3054, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.5567567567567568, |
|
"grad_norm": 2.030353788212261, |
|
"learning_rate": 2.3356525207984916e-05, |
|
"loss": 1.2755, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.5675675675675675, |
|
"grad_norm": 2.039094018045801, |
|
"learning_rate": 2.3271566103527063e-05, |
|
"loss": 1.2686, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.5783783783783782, |
|
"grad_norm": 1.9514327297087253, |
|
"learning_rate": 2.318622397260896e-05, |
|
"loss": 1.2683, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.5891891891891892, |
|
"grad_norm": 1.8736269848724938, |
|
"learning_rate": 2.3100502767117566e-05, |
|
"loss": 1.2255, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.9037413275565993, |
|
"learning_rate": 2.301440645649344e-05, |
|
"loss": 1.2669, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.6108108108108108, |
|
"grad_norm": 2.172424343159148, |
|
"learning_rate": 2.2927939027546895e-05, |
|
"loss": 1.2601, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 2.1709165438362152, |
|
"learning_rate": 2.284110448427341e-05, |
|
"loss": 1.2992, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.6324324324324324, |
|
"grad_norm": 1.9517268177516773, |
|
"learning_rate": 2.2753906847668197e-05, |
|
"loss": 1.2602, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.6432432432432433, |
|
"grad_norm": 2.142716687001655, |
|
"learning_rate": 2.266635015554002e-05, |
|
"loss": 1.2387, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.654054054054054, |
|
"grad_norm": 1.892430237031948, |
|
"learning_rate": 2.2578438462324214e-05, |
|
"loss": 1.2796, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.6648648648648647, |
|
"grad_norm": 1.9831414889207626, |
|
"learning_rate": 2.2490175838894928e-05, |
|
"loss": 1.2693, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.6756756756756757, |
|
"grad_norm": 1.968788091532238, |
|
"learning_rate": 2.2401566372376635e-05, |
|
"loss": 1.2826, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.6864864864864866, |
|
"grad_norm": 2.028829960671371, |
|
"learning_rate": 2.231261416595486e-05, |
|
"loss": 1.2412, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.6972972972972973, |
|
"grad_norm": 2.0761836559826126, |
|
"learning_rate": 2.222332333868618e-05, |
|
"loss": 1.2907, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.708108108108108, |
|
"grad_norm": 2.0505303769583714, |
|
"learning_rate": 2.2133698025307487e-05, |
|
"loss": 1.2164, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.718918918918919, |
|
"grad_norm": 1.9892599771865245, |
|
"learning_rate": 2.2043742376044507e-05, |
|
"loss": 1.3029, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.7297297297297298, |
|
"grad_norm": 2.056680185472525, |
|
"learning_rate": 2.195346055641966e-05, |
|
"loss": 1.2532, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.7405405405405405, |
|
"grad_norm": 2.020031485543879, |
|
"learning_rate": 2.186285674705911e-05, |
|
"loss": 1.2752, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.7513513513513512, |
|
"grad_norm": 1.976348646730665, |
|
"learning_rate": 2.1771935143499233e-05, |
|
"loss": 1.281, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.7621621621621621, |
|
"grad_norm": 1.9671362390239506, |
|
"learning_rate": 2.1680699955992295e-05, |
|
"loss": 1.2567, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.772972972972973, |
|
"grad_norm": 1.9862314788875317, |
|
"learning_rate": 2.1589155409311514e-05, |
|
"loss": 1.2722, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.7837837837837838, |
|
"grad_norm": 1.8794017228975768, |
|
"learning_rate": 2.1497305742555416e-05, |
|
"loss": 1.2267, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.7945945945945945, |
|
"grad_norm": 1.8537400724418822, |
|
"learning_rate": 2.140515520895154e-05, |
|
"loss": 1.2856, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.8054054054054054, |
|
"grad_norm": 2.2140372302917424, |
|
"learning_rate": 2.131270807565948e-05, |
|
"loss": 1.2668, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.8162162162162163, |
|
"grad_norm": 2.1224010879465514, |
|
"learning_rate": 2.1219968623573292e-05, |
|
"loss": 1.3403, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.827027027027027, |
|
"grad_norm": 2.150717671959958, |
|
"learning_rate": 2.1126941147123285e-05, |
|
"loss": 1.3294, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.8378378378378377, |
|
"grad_norm": 1.9412785818269171, |
|
"learning_rate": 2.1033629954077123e-05, |
|
"loss": 1.298, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.8486486486486486, |
|
"grad_norm": 1.9061258724957593, |
|
"learning_rate": 2.0940039365340363e-05, |
|
"loss": 1.2984, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.8594594594594596, |
|
"grad_norm": 1.91000429693783, |
|
"learning_rate": 2.0846173714756372e-05, |
|
"loss": 1.2541, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.8702702702702703, |
|
"grad_norm": 2.0253367045491957, |
|
"learning_rate": 2.0752037348905656e-05, |
|
"loss": 1.3045, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.881081081081081, |
|
"grad_norm": 2.099866673823967, |
|
"learning_rate": 2.0657634626904544e-05, |
|
"loss": 1.2841, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 2.053590437534557, |
|
"learning_rate": 2.056296992020339e-05, |
|
"loss": 1.2732, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.9027027027027028, |
|
"grad_norm": 2.037664067587095, |
|
"learning_rate": 2.046804761238409e-05, |
|
"loss": 1.2661, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.9135135135135135, |
|
"grad_norm": 2.1032576901560875, |
|
"learning_rate": 2.037287209895713e-05, |
|
"loss": 1.2815, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.9243243243243242, |
|
"grad_norm": 2.0365409183820393, |
|
"learning_rate": 2.0277447787158057e-05, |
|
"loss": 1.281, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.9351351351351351, |
|
"grad_norm": 1.9303661726886707, |
|
"learning_rate": 2.0181779095743335e-05, |
|
"loss": 1.3122, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.945945945945946, |
|
"grad_norm": 2.0989107332254164, |
|
"learning_rate": 2.008587045478581e-05, |
|
"loss": 1.2766, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.9567567567567568, |
|
"grad_norm": 2.2610013717867607, |
|
"learning_rate": 1.9989726305469497e-05, |
|
"loss": 1.2744, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.9675675675675675, |
|
"grad_norm": 1.9759438397315554, |
|
"learning_rate": 1.989335109988397e-05, |
|
"loss": 1.2821, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.9783783783783784, |
|
"grad_norm": 2.0630705322166185, |
|
"learning_rate": 1.9796749300818185e-05, |
|
"loss": 1.2964, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.9891891891891893, |
|
"grad_norm": 2.062855518861247, |
|
"learning_rate": 1.9699925381553824e-05, |
|
"loss": 1.3101, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 2.004529311768417, |
|
"learning_rate": 1.960288382565816e-05, |
|
"loss": 1.2436, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.0108108108108107, |
|
"grad_norm": 1.8731442249653363, |
|
"learning_rate": 1.9505629126776435e-05, |
|
"loss": 0.7428, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.0216216216216214, |
|
"grad_norm": 2.7743566111060356, |
|
"learning_rate": 1.9408165788423776e-05, |
|
"loss": 0.6521, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.0324324324324325, |
|
"grad_norm": 2.4263480281287326, |
|
"learning_rate": 1.9310498323776642e-05, |
|
"loss": 0.6719, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.0432432432432432, |
|
"grad_norm": 1.7971900573296662, |
|
"learning_rate": 1.9212631255463864e-05, |
|
"loss": 0.6507, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.054054054054054, |
|
"grad_norm": 1.9536502652307774, |
|
"learning_rate": 1.911456911535719e-05, |
|
"loss": 0.6713, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.064864864864865, |
|
"grad_norm": 1.6734389443783486, |
|
"learning_rate": 1.9016316444361443e-05, |
|
"loss": 0.6513, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.075675675675676, |
|
"grad_norm": 2.0889794055853446, |
|
"learning_rate": 1.8917877792204238e-05, |
|
"loss": 0.6391, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.0864864864864865, |
|
"grad_norm": 1.952453907013592, |
|
"learning_rate": 1.881925771722533e-05, |
|
"loss": 0.6278, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.097297297297297, |
|
"grad_norm": 1.9394182987242035, |
|
"learning_rate": 1.872046078616549e-05, |
|
"loss": 0.6268, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.108108108108108, |
|
"grad_norm": 1.7760412655871423, |
|
"learning_rate": 1.862149157395506e-05, |
|
"loss": 0.6217, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.118918918918919, |
|
"grad_norm": 1.697555380465673, |
|
"learning_rate": 1.852235466350212e-05, |
|
"loss": 0.6496, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.1297297297297297, |
|
"grad_norm": 1.9579376565670537, |
|
"learning_rate": 1.8423054645480228e-05, |
|
"loss": 0.6388, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.1405405405405404, |
|
"grad_norm": 2.1761144428281565, |
|
"learning_rate": 1.8323596118115882e-05, |
|
"loss": 0.6293, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.1513513513513516, |
|
"grad_norm": 1.7805116699629946, |
|
"learning_rate": 1.8223983686975576e-05, |
|
"loss": 0.6321, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 2.0913504208478773, |
|
"learning_rate": 1.8124221964752535e-05, |
|
"loss": 0.6312, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.172972972972973, |
|
"grad_norm": 1.7500638235123256, |
|
"learning_rate": 1.80243155710531e-05, |
|
"loss": 0.6217, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.1837837837837837, |
|
"grad_norm": 1.8271358171780696, |
|
"learning_rate": 1.7924269132182855e-05, |
|
"loss": 0.6711, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.1945945945945944, |
|
"grad_norm": 1.9546793256337727, |
|
"learning_rate": 1.782408728093235e-05, |
|
"loss": 0.6392, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.2054054054054055, |
|
"grad_norm": 1.7882856434212824, |
|
"learning_rate": 1.7723774656362602e-05, |
|
"loss": 0.6395, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.2162162162162162, |
|
"grad_norm": 1.7898241991442447, |
|
"learning_rate": 1.762333590359028e-05, |
|
"loss": 0.6521, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.227027027027027, |
|
"grad_norm": 1.6654897277159277, |
|
"learning_rate": 1.752277567357258e-05, |
|
"loss": 0.646, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.237837837837838, |
|
"grad_norm": 1.8175887991140565, |
|
"learning_rate": 1.7422098622891873e-05, |
|
"loss": 0.613, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.2486486486486488, |
|
"grad_norm": 2.208881509880086, |
|
"learning_rate": 1.7321309413540087e-05, |
|
"loss": 0.6375, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.2594594594594595, |
|
"grad_norm": 1.8520498287505407, |
|
"learning_rate": 1.722041271270281e-05, |
|
"loss": 0.6613, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.27027027027027, |
|
"grad_norm": 2.0512133133202988, |
|
"learning_rate": 1.7119413192543165e-05, |
|
"loss": 0.6292, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.281081081081081, |
|
"grad_norm": 1.773677981324912, |
|
"learning_rate": 1.701831552998548e-05, |
|
"loss": 0.6399, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.291891891891892, |
|
"grad_norm": 1.8469844530131443, |
|
"learning_rate": 1.6917124406498697e-05, |
|
"loss": 0.6622, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.3027027027027027, |
|
"grad_norm": 2.0663988043195265, |
|
"learning_rate": 1.68158445078796e-05, |
|
"loss": 0.6428, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.3135135135135134, |
|
"grad_norm": 1.7259116146942584, |
|
"learning_rate": 1.671448052403583e-05, |
|
"loss": 0.6528, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.3243243243243246, |
|
"grad_norm": 1.9046842309367298, |
|
"learning_rate": 1.6613037148768702e-05, |
|
"loss": 0.6619, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.3351351351351353, |
|
"grad_norm": 1.8564656298207483, |
|
"learning_rate": 1.6511519079555887e-05, |
|
"loss": 0.6665, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.345945945945946, |
|
"grad_norm": 1.8614483941828683, |
|
"learning_rate": 1.640993101733383e-05, |
|
"loss": 0.6494, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.3567567567567567, |
|
"grad_norm": 1.8411704889596792, |
|
"learning_rate": 1.6308277666280133e-05, |
|
"loss": 0.6286, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.3675675675675674, |
|
"grad_norm": 1.810238547613111, |
|
"learning_rate": 1.6206563733595666e-05, |
|
"loss": 0.6544, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.3783783783783785, |
|
"grad_norm": 1.803680767305892, |
|
"learning_rate": 1.610479392928663e-05, |
|
"loss": 0.6449, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.389189189189189, |
|
"grad_norm": 1.7701921715064215, |
|
"learning_rate": 1.600297296594643e-05, |
|
"loss": 0.6604, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.6556840871349028, |
|
"learning_rate": 1.5901105558537472e-05, |
|
"loss": 0.6775, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.410810810810811, |
|
"grad_norm": 1.734110836750224, |
|
"learning_rate": 1.579919642417281e-05, |
|
"loss": 0.6482, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.4216216216216218, |
|
"grad_norm": 2.000655482106033, |
|
"learning_rate": 1.569725028189772e-05, |
|
"loss": 0.648, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 1.7669642313010707, |
|
"learning_rate": 1.5595271852471204e-05, |
|
"loss": 0.6548, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.443243243243243, |
|
"grad_norm": 1.7114555298061667, |
|
"learning_rate": 1.5493265858147335e-05, |
|
"loss": 0.6291, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 2.454054054054054, |
|
"grad_norm": 1.982145197600375, |
|
"learning_rate": 1.5391237022456636e-05, |
|
"loss": 0.6648, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 2.464864864864865, |
|
"grad_norm": 2.057868578472174, |
|
"learning_rate": 1.5289190069987332e-05, |
|
"loss": 0.652, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 2.4756756756756757, |
|
"grad_norm": 1.6966492734095149, |
|
"learning_rate": 1.5187129726166565e-05, |
|
"loss": 0.6524, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 2.4864864864864864, |
|
"grad_norm": 1.7126379411947121, |
|
"learning_rate": 1.5085060717041585e-05, |
|
"loss": 0.6691, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.4972972972972975, |
|
"grad_norm": 2.027436021235288, |
|
"learning_rate": 1.4982987769060898e-05, |
|
"loss": 0.6551, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 2.5081081081081082, |
|
"grad_norm": 1.856345058930321, |
|
"learning_rate": 1.4880915608855402e-05, |
|
"loss": 0.6596, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 2.518918918918919, |
|
"grad_norm": 1.8317958549352111, |
|
"learning_rate": 1.477884896301953e-05, |
|
"loss": 0.6283, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 2.5297297297297296, |
|
"grad_norm": 1.7233177960794333, |
|
"learning_rate": 1.467679255789234e-05, |
|
"loss": 0.659, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 2.5405405405405403, |
|
"grad_norm": 1.70587685226235, |
|
"learning_rate": 1.4574751119338703e-05, |
|
"loss": 0.6375, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.5513513513513515, |
|
"grad_norm": 1.8655585555708385, |
|
"learning_rate": 1.4472729372530432e-05, |
|
"loss": 0.6242, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 2.562162162162162, |
|
"grad_norm": 1.8001992070479, |
|
"learning_rate": 1.4370732041727495e-05, |
|
"loss": 0.643, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 2.572972972972973, |
|
"grad_norm": 1.848601528720487, |
|
"learning_rate": 1.426876385005922e-05, |
|
"loss": 0.653, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 2.583783783783784, |
|
"grad_norm": 1.7058353790178475, |
|
"learning_rate": 1.4166829519305628e-05, |
|
"loss": 0.6189, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 2.5945945945945947, |
|
"grad_norm": 1.7284279167710883, |
|
"learning_rate": 1.406493376967876e-05, |
|
"loss": 0.6447, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.6054054054054054, |
|
"grad_norm": 1.9360862264128509, |
|
"learning_rate": 1.396308131960409e-05, |
|
"loss": 0.6396, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 2.616216216216216, |
|
"grad_norm": 1.6785593995242742, |
|
"learning_rate": 1.386127688550206e-05, |
|
"loss": 0.6305, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 2.627027027027027, |
|
"grad_norm": 1.7901370985866867, |
|
"learning_rate": 1.3759525181569663e-05, |
|
"loss": 0.6379, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 2.637837837837838, |
|
"grad_norm": 1.7497083646634908, |
|
"learning_rate": 1.3657830919562151e-05, |
|
"loss": 0.6252, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 2.6486486486486487, |
|
"grad_norm": 1.8186530224800674, |
|
"learning_rate": 1.3556198808574828e-05, |
|
"loss": 0.6751, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.6594594594594594, |
|
"grad_norm": 1.7289128215137377, |
|
"learning_rate": 1.3454633554825029e-05, |
|
"loss": 0.6467, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 2.6702702702702705, |
|
"grad_norm": 1.7639526554500256, |
|
"learning_rate": 1.335313986143416e-05, |
|
"loss": 0.6166, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 2.6810810810810812, |
|
"grad_norm": 1.7186641679091503, |
|
"learning_rate": 1.3251722428209933e-05, |
|
"loss": 0.6845, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 2.691891891891892, |
|
"grad_norm": 1.6965540327042066, |
|
"learning_rate": 1.3150385951428714e-05, |
|
"loss": 0.6487, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 1.8567952974204969, |
|
"learning_rate": 1.3049135123618073e-05, |
|
"loss": 0.6457, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.7135135135135133, |
|
"grad_norm": 1.6295081249921362, |
|
"learning_rate": 1.2947974633339499e-05, |
|
"loss": 0.6445, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 2.7243243243243245, |
|
"grad_norm": 1.7231568913626358, |
|
"learning_rate": 1.2846909164971244e-05, |
|
"loss": 0.6434, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 2.735135135135135, |
|
"grad_norm": 1.8455193518645283, |
|
"learning_rate": 1.2745943398491462e-05, |
|
"loss": 0.65, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 2.745945945945946, |
|
"grad_norm": 1.827106966674069, |
|
"learning_rate": 1.2645082009261468e-05, |
|
"loss": 0.6628, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 2.756756756756757, |
|
"grad_norm": 1.7633688689382077, |
|
"learning_rate": 1.254432966780924e-05, |
|
"loss": 0.6491, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.7675675675675677, |
|
"grad_norm": 1.7284101689900349, |
|
"learning_rate": 1.2443691039613128e-05, |
|
"loss": 0.6258, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 2.7783783783783784, |
|
"grad_norm": 1.87274147264611, |
|
"learning_rate": 1.2343170784885859e-05, |
|
"loss": 0.6476, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 2.789189189189189, |
|
"grad_norm": 1.6928834129022787, |
|
"learning_rate": 1.2242773558358701e-05, |
|
"loss": 0.638, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.7838050014292766, |
|
"learning_rate": 1.2142504009065914e-05, |
|
"loss": 0.6402, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 2.810810810810811, |
|
"grad_norm": 1.7473000738048472, |
|
"learning_rate": 1.2042366780129507e-05, |
|
"loss": 0.615, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.8216216216216217, |
|
"grad_norm": 1.8638020286903605, |
|
"learning_rate": 1.1942366508544195e-05, |
|
"loss": 0.6425, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 2.8324324324324324, |
|
"grad_norm": 1.874577580434278, |
|
"learning_rate": 1.1842507824962694e-05, |
|
"loss": 0.6504, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 2.8432432432432435, |
|
"grad_norm": 1.755404683062205, |
|
"learning_rate": 1.1742795353481291e-05, |
|
"loss": 0.6541, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 2.854054054054054, |
|
"grad_norm": 1.8072551565675472, |
|
"learning_rate": 1.1643233711425716e-05, |
|
"loss": 0.6683, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 2.864864864864865, |
|
"grad_norm": 1.6667276846485022, |
|
"learning_rate": 1.1543827509137329e-05, |
|
"loss": 0.6486, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.8756756756756756, |
|
"grad_norm": 1.688942735670296, |
|
"learning_rate": 1.144458134975964e-05, |
|
"loss": 0.6652, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 2.8864864864864863, |
|
"grad_norm": 1.753840610632178, |
|
"learning_rate": 1.1345499829025136e-05, |
|
"loss": 0.6634, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 2.8972972972972975, |
|
"grad_norm": 1.9607814129942556, |
|
"learning_rate": 1.1246587535042492e-05, |
|
"loss": 0.6426, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 2.908108108108108, |
|
"grad_norm": 1.813335779829059, |
|
"learning_rate": 1.1147849048084105e-05, |
|
"loss": 0.6315, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 2.918918918918919, |
|
"grad_norm": 1.7402704693393687, |
|
"learning_rate": 1.1049288940373972e-05, |
|
"loss": 0.6228, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.92972972972973, |
|
"grad_norm": 1.7680419333729458, |
|
"learning_rate": 1.0950911775876014e-05, |
|
"loss": 0.6, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 2.9405405405405407, |
|
"grad_norm": 1.843223644664368, |
|
"learning_rate": 1.0852722110082693e-05, |
|
"loss": 0.6476, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 2.9513513513513514, |
|
"grad_norm": 1.7069834663839671, |
|
"learning_rate": 1.0754724489804098e-05, |
|
"loss": 0.6593, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 2.962162162162162, |
|
"grad_norm": 1.734691145206824, |
|
"learning_rate": 1.0656923452957354e-05, |
|
"loss": 0.6252, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 1.6530132045637638, |
|
"learning_rate": 1.0559323528356542e-05, |
|
"loss": 0.6218, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.983783783783784, |
|
"grad_norm": 1.7766789557037792, |
|
"learning_rate": 1.0461929235502952e-05, |
|
"loss": 0.6494, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 2.9945945945945946, |
|
"grad_norm": 1.777411804411515, |
|
"learning_rate": 1.036474508437579e-05, |
|
"loss": 0.6409, |
|
"step": 831 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 1385, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 833, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 151014030311424.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |