{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 1590,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018867924528301886,
      "grad_norm": 2.9446594971470272,
      "learning_rate": 6.289308176100629e-06,
      "loss": 2.3241,
      "step": 10
    },
    {
      "epoch": 0.03773584905660377,
      "grad_norm": 3.2052443029455557,
      "learning_rate": 1.2578616352201259e-05,
      "loss": 2.2496,
      "step": 20
    },
    {
      "epoch": 0.05660377358490566,
      "grad_norm": 2.593311686421377,
      "learning_rate": 1.8867924528301888e-05,
      "loss": 2.2836,
      "step": 30
    },
    {
      "epoch": 0.07547169811320754,
      "grad_norm": 2.8033026461409043,
      "learning_rate": 2.5157232704402517e-05,
      "loss": 2.2567,
      "step": 40
    },
    {
      "epoch": 0.09433962264150944,
      "grad_norm": 3.0647958985618127,
      "learning_rate": 3.144654088050314e-05,
      "loss": 2.2749,
      "step": 50
    },
    {
      "epoch": 0.11320754716981132,
      "grad_norm": 2.7224942436893023,
      "learning_rate": 3.7735849056603776e-05,
      "loss": 2.3334,
      "step": 60
    },
    {
      "epoch": 0.1320754716981132,
      "grad_norm": 2.507797193629058,
      "learning_rate": 4.402515723270441e-05,
      "loss": 2.3378,
      "step": 70
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 2.791876220429587,
      "learning_rate": 5.0314465408805034e-05,
      "loss": 2.3462,
      "step": 80
    },
    {
      "epoch": 0.16981132075471697,
      "grad_norm": 2.8934882752144877,
      "learning_rate": 5.660377358490566e-05,
      "loss": 2.3832,
      "step": 90
    },
    {
      "epoch": 0.18867924528301888,
      "grad_norm": 2.74130339104755,
      "learning_rate": 6.289308176100629e-05,
      "loss": 2.3658,
      "step": 100
    },
    {
      "epoch": 0.20754716981132076,
      "grad_norm": 2.464301918797891,
      "learning_rate": 6.918238993710691e-05,
      "loss": 2.3894,
      "step": 110
    },
    {
      "epoch": 0.22641509433962265,
      "grad_norm": 3.5787748721128176,
      "learning_rate": 7.547169811320755e-05,
      "loss": 2.4635,
      "step": 120
    },
    {
      "epoch": 0.24528301886792453,
      "grad_norm": 3.8614586522023586,
      "learning_rate": 8.176100628930818e-05,
      "loss": 2.4099,
      "step": 130
    },
    {
      "epoch": 0.2641509433962264,
      "grad_norm": 2.624798812422503,
      "learning_rate": 8.805031446540882e-05,
      "loss": 2.4141,
      "step": 140
    },
    {
      "epoch": 0.2830188679245283,
      "grad_norm": 3.4083833226002174,
      "learning_rate": 9.433962264150944e-05,
      "loss": 2.4505,
      "step": 150
    },
    {
      "epoch": 0.3018867924528302,
      "grad_norm": 2.4164498878680254,
      "learning_rate": 9.999987950741765e-05,
      "loss": 2.4853,
      "step": 160
    },
    {
      "epoch": 0.32075471698113206,
      "grad_norm": 4.2037868049637,
      "learning_rate": 9.9985421100216e-05,
      "loss": 2.529,
      "step": 170
    },
    {
      "epoch": 0.33962264150943394,
      "grad_norm": 10.425711730519438,
      "learning_rate": 9.99468721610658e-05,
      "loss": 2.5123,
      "step": 180
    },
    {
      "epoch": 0.3584905660377358,
      "grad_norm": 6.075608387062913,
      "learning_rate": 9.988425126867315e-05,
      "loss": 2.5137,
      "step": 190
    },
    {
      "epoch": 0.37735849056603776,
      "grad_norm": 5.979582059920921,
      "learning_rate": 9.979758860325019e-05,
      "loss": 2.4818,
      "step": 200
    },
    {
      "epoch": 0.39622641509433965,
      "grad_norm": 300.5526680134449,
      "learning_rate": 9.968692593196944e-05,
      "loss": 2.5084,
      "step": 210
    },
    {
      "epoch": 0.41509433962264153,
      "grad_norm": 2.3641585810185437,
      "learning_rate": 9.955231658883432e-05,
      "loss": 2.4667,
      "step": 220
    },
    {
      "epoch": 0.4339622641509434,
      "grad_norm": 2.399558237267707,
      "learning_rate": 9.93938254489746e-05,
      "loss": 2.4815,
      "step": 230
    },
    {
      "epoch": 0.4528301886792453,
      "grad_norm": 2.291187744959764,
      "learning_rate": 9.921152889737984e-05,
      "loss": 2.465,
      "step": 240
    },
    {
      "epoch": 0.4716981132075472,
      "grad_norm": 2.2425372020480685,
      "learning_rate": 9.900551479208552e-05,
      "loss": 2.4827,
      "step": 250
    },
    {
      "epoch": 0.49056603773584906,
      "grad_norm": 2.106996905280666,
      "learning_rate": 9.877588242182975e-05,
      "loss": 2.5077,
      "step": 260
    },
    {
      "epoch": 0.5094339622641509,
      "grad_norm": 2.56597906125238,
      "learning_rate": 9.852274245820096e-05,
      "loss": 2.5812,
      "step": 270
    },
    {
      "epoch": 0.5283018867924528,
      "grad_norm": 2.1161401839810323,
      "learning_rate": 9.824621690229965e-05,
      "loss": 2.5047,
      "step": 280
    },
    {
      "epoch": 0.5471698113207547,
      "grad_norm": 2.9746454428316467,
      "learning_rate": 9.79464390259397e-05,
      "loss": 2.4985,
      "step": 290
    },
    {
      "epoch": 0.5660377358490566,
      "grad_norm": 2.1237673830934156,
      "learning_rate": 9.762355330741796e-05,
      "loss": 2.4943,
      "step": 300
    },
    {
      "epoch": 0.5849056603773585,
      "grad_norm": 1.8440846284987655,
      "learning_rate": 9.727771536188275e-05,
      "loss": 2.4536,
      "step": 310
    },
    {
      "epoch": 0.6037735849056604,
      "grad_norm": 3.3815527986620526,
      "learning_rate": 9.690909186633492e-05,
      "loss": 2.4837,
      "step": 320
    },
    {
      "epoch": 0.6226415094339622,
      "grad_norm": 2.7797010587604953,
      "learning_rate": 9.651786047929773e-05,
      "loss": 2.5074,
      "step": 330
    },
    {
      "epoch": 0.6415094339622641,
      "grad_norm": 2.0947283835947794,
      "learning_rate": 9.610420975519408e-05,
      "loss": 2.441,
      "step": 340
    },
    {
      "epoch": 0.660377358490566,
      "grad_norm": 1.9288902952601223,
      "learning_rate": 9.566833905347245e-05,
      "loss": 2.4885,
      "step": 350
    },
    {
      "epoch": 0.6792452830188679,
      "grad_norm": 2.004635564736395,
      "learning_rate": 9.521045844252552e-05,
      "loss": 2.4342,
      "step": 360
    },
    {
      "epoch": 0.6981132075471698,
      "grad_norm": 1.6511867070394874,
      "learning_rate": 9.473078859844728e-05,
      "loss": 2.4425,
      "step": 370
    },
    {
      "epoch": 0.7169811320754716,
      "grad_norm": 1.4598720970043289,
      "learning_rate": 9.422956069867807e-05,
      "loss": 2.4567,
      "step": 380
    },
    {
      "epoch": 0.7358490566037735,
      "grad_norm": 1.5295808219144331,
      "learning_rate": 9.370701631058829e-05,
      "loss": 2.4636,
      "step": 390
    },
    {
      "epoch": 0.7547169811320755,
      "grad_norm": 1.606602994374719,
      "learning_rate": 9.316340727505468e-05,
      "loss": 2.4707,
      "step": 400
    },
    {
      "epoch": 0.7735849056603774,
      "grad_norm": 1.5773231811089237,
      "learning_rate": 9.259899558508543e-05,
      "loss": 2.4242,
      "step": 410
    },
    {
      "epoch": 0.7924528301886793,
      "grad_norm": 1.5694593702673683,
      "learning_rate": 9.201405325955221e-05,
      "loss": 2.4754,
      "step": 420
    },
    {
      "epoch": 0.8113207547169812,
      "grad_norm": 1.4513304920200845,
      "learning_rate": 9.14088622120905e-05,
      "loss": 2.4735,
      "step": 430
    },
    {
      "epoch": 0.8301886792452831,
      "grad_norm": 2.2119679560211436,
      "learning_rate": 9.078371411523084e-05,
      "loss": 2.4511,
      "step": 440
    },
    {
      "epoch": 0.8490566037735849,
      "grad_norm": 1.4837853314532448,
      "learning_rate": 9.013891025982704e-05,
      "loss": 2.4627,
      "step": 450
    },
    {
      "epoch": 0.8679245283018868,
      "grad_norm": 1.548323059472257,
      "learning_rate": 8.947476140984856e-05,
      "loss": 2.4804,
      "step": 460
    },
    {
      "epoch": 0.8867924528301887,
      "grad_norm": 1.7369189464037587,
      "learning_rate": 8.879158765260767e-05,
      "loss": 2.4872,
      "step": 470
    },
    {
      "epoch": 0.9056603773584906,
      "grad_norm": 1.4222000085980089,
      "learning_rate": 8.808971824449275e-05,
      "loss": 2.4847,
      "step": 480
    },
    {
      "epoch": 0.9245283018867925,
      "grad_norm": 1.39169720237414,
      "learning_rate": 8.736949145228295e-05,
      "loss": 2.4873,
      "step": 490
    },
    {
      "epoch": 0.9433962264150944,
      "grad_norm": 1.5495461414725966,
      "learning_rate": 8.66312543901201e-05,
      "loss": 2.4738,
      "step": 500
    },
    {
      "epoch": 0.9622641509433962,
      "grad_norm": 1.5689856394055257,
      "learning_rate": 8.587536285221656e-05,
      "loss": 2.4211,
      "step": 510
    },
    {
      "epoch": 0.9811320754716981,
      "grad_norm": 1.559462761559426,
      "learning_rate": 8.510218114137992e-05,
      "loss": 2.4183,
      "step": 520
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.38445361325361,
      "learning_rate": 8.43120818934367e-05,
      "loss": 2.459,
      "step": 530
    },
    {
      "epoch": 1.0188679245283019,
      "grad_norm": 1.8042327175721304,
      "learning_rate": 8.350544589764016e-05,
      "loss": 1.8838,
      "step": 540
    },
    {
      "epoch": 1.0377358490566038,
      "grad_norm": 1.8176496290402602,
      "learning_rate": 8.268266191314848e-05,
      "loss": 1.8624,
      "step": 550
    },
    {
      "epoch": 1.0566037735849056,
      "grad_norm": 1.8868344352432986,
      "learning_rate": 8.184412648166183e-05,
      "loss": 1.8182,
      "step": 560
    },
    {
      "epoch": 1.0754716981132075,
      "grad_norm": 1.7299260995769612,
      "learning_rate": 8.099024373630854e-05,
      "loss": 1.8391,
      "step": 570
    },
    {
      "epoch": 1.0943396226415094,
      "grad_norm": 1.9113984544679725,
      "learning_rate": 8.01214252068728e-05,
      "loss": 1.8545,
      "step": 580
    },
    {
      "epoch": 1.1132075471698113,
      "grad_norm": 1.794174287705714,
      "learning_rate": 7.923808962145734e-05,
      "loss": 1.8367,
      "step": 590
    },
    {
      "epoch": 1.1320754716981132,
      "grad_norm": 1.5751797225379325,
      "learning_rate": 7.83406627046769e-05,
      "loss": 1.8149,
      "step": 600
    },
    {
      "epoch": 1.150943396226415,
      "grad_norm": 1.9105350922209694,
      "learning_rate": 7.742957697247984e-05,
      "loss": 1.8061,
      "step": 610
    },
    {
      "epoch": 1.169811320754717,
      "grad_norm": 1.7630498555967447,
      "learning_rate": 7.650527152369647e-05,
      "loss": 1.8411,
      "step": 620
    },
    {
      "epoch": 1.1886792452830188,
      "grad_norm": 1.5261816105997068,
      "learning_rate": 7.556819182841497e-05,
      "loss": 1.8264,
      "step": 630
    },
    {
      "epoch": 1.2075471698113207,
      "grad_norm": 1.9369411893196908,
      "learning_rate": 7.461878951328653e-05,
      "loss": 1.8954,
      "step": 640
    },
    {
      "epoch": 1.2264150943396226,
      "grad_norm": 1.7688000917923798,
      "learning_rate": 7.365752214386321e-05,
      "loss": 1.8346,
      "step": 650
    },
    {
      "epoch": 1.2452830188679245,
      "grad_norm": 1.6569058541238642,
      "learning_rate": 7.268485300407393e-05,
      "loss": 1.8805,
      "step": 660
    },
    {
      "epoch": 1.2641509433962264,
      "grad_norm": 1.6708545601020437,
      "learning_rate": 7.17012508729441e-05,
      "loss": 1.7728,
      "step": 670
    },
    {
      "epoch": 1.2830188679245282,
      "grad_norm": 1.652310201967167,
      "learning_rate": 7.070718979866702e-05,
      "loss": 1.8718,
      "step": 680
    },
    {
      "epoch": 1.3018867924528301,
      "grad_norm": 1.9899020380799617,
      "learning_rate": 6.970314887013584e-05,
      "loss": 1.8535,
      "step": 690
    },
    {
      "epoch": 1.320754716981132,
      "grad_norm": 1.643783798160392,
      "learning_rate": 6.868961198604611e-05,
      "loss": 1.8344,
      "step": 700
    },
    {
      "epoch": 1.3396226415094339,
      "grad_norm": 1.8435538882684133,
      "learning_rate": 6.766706762168022e-05,
      "loss": 1.8759,
      "step": 710
    },
    {
      "epoch": 1.3584905660377358,
      "grad_norm": 1.6989197917459231,
      "learning_rate": 6.663600859348616e-05,
      "loss": 1.7973,
      "step": 720
    },
    {
      "epoch": 1.3773584905660377,
      "grad_norm": 1.6640164364452317,
      "learning_rate": 6.55969318215641e-05,
      "loss": 1.8101,
      "step": 730
    },
    {
      "epoch": 1.3962264150943398,
      "grad_norm": 1.663705205393152,
      "learning_rate": 6.455033809017512e-05,
      "loss": 1.8574,
      "step": 740
    },
    {
      "epoch": 1.4150943396226414,
      "grad_norm": 1.524574911562225,
      "learning_rate": 6.34967318063877e-05,
      "loss": 1.8194,
      "step": 750
    },
    {
      "epoch": 1.4339622641509435,
      "grad_norm": 1.638744038935454,
      "learning_rate": 6.24366207569781e-05,
      "loss": 1.8557,
      "step": 760
    },
    {
      "epoch": 1.4528301886792452,
      "grad_norm": 1.5905792259719815,
      "learning_rate": 6.137051586370194e-05,
      "loss": 1.8403,
      "step": 770
    },
    {
      "epoch": 1.4716981132075473,
      "grad_norm": 1.4115389229640394,
      "learning_rate": 6.029893093705492e-05,
      "loss": 1.86,
      "step": 780
    },
    {
      "epoch": 1.490566037735849,
      "grad_norm": 1.5664716217022607,
      "learning_rate": 5.9222382428641174e-05,
      "loss": 1.8223,
      "step": 790
    },
    {
      "epoch": 1.509433962264151,
      "grad_norm": 1.3426007079954652,
      "learning_rate": 5.814138918226887e-05,
      "loss": 1.7957,
      "step": 800
    },
    {
      "epoch": 1.5283018867924527,
      "grad_norm": 1.4496928054044773,
      "learning_rate": 5.7056472183892806e-05,
      "loss": 1.8542,
      "step": 810
    },
    {
      "epoch": 1.5471698113207548,
      "grad_norm": 1.7249530177698127,
      "learning_rate": 5.5968154310524614e-05,
      "loss": 1.8043,
      "step": 820
    },
    {
      "epoch": 1.5660377358490565,
      "grad_norm": 1.4451712049547103,
      "learning_rate": 5.487696007823161e-05,
      "loss": 1.7981,
      "step": 830
    },
    {
      "epoch": 1.5849056603773586,
      "grad_norm": 1.5035729769726907,
      "learning_rate": 5.378341538934566e-05,
      "loss": 1.8313,
      "step": 840
    },
    {
      "epoch": 1.6037735849056602,
      "grad_norm": 1.3823097737594126,
      "learning_rate": 5.268804727900391e-05,
      "loss": 1.8476,
      "step": 850
    },
    {
      "epoch": 1.6226415094339623,
      "grad_norm": 1.41439773210909,
      "learning_rate": 5.159138366114358e-05,
      "loss": 1.7863,
      "step": 860
    },
    {
      "epoch": 1.641509433962264,
      "grad_norm": 1.513162165314957,
      "learning_rate": 5.049395307407329e-05,
      "loss": 1.8363,
      "step": 870
    },
    {
      "epoch": 1.6603773584905661,
      "grad_norm": 1.5375457880909025,
      "learning_rate": 4.9396284425743326e-05,
      "loss": 1.8004,
      "step": 880
    },
    {
      "epoch": 1.6792452830188678,
      "grad_norm": 1.5695919072614308,
      "learning_rate": 4.829890673883792e-05,
      "loss": 1.818,
      "step": 890
    },
    {
      "epoch": 1.6981132075471699,
      "grad_norm": 1.3666688643802247,
      "learning_rate": 4.7202348895812035e-05,
      "loss": 1.7885,
      "step": 900
    },
    {
      "epoch": 1.7169811320754715,
      "grad_norm": 1.6027481528500458,
      "learning_rate": 4.610713938399601e-05,
      "loss": 1.7906,
      "step": 910
    },
    {
      "epoch": 1.7358490566037736,
      "grad_norm": 1.3930291385793376,
      "learning_rate": 4.5013806040890294e-05,
      "loss": 1.7858,
      "step": 920
    },
    {
      "epoch": 1.7547169811320755,
      "grad_norm": 1.4293209085375194,
      "learning_rate": 4.392287579977374e-05,
      "loss": 1.7796,
      "step": 930
    },
    {
      "epoch": 1.7735849056603774,
      "grad_norm": 1.5151788900532224,
      "learning_rate": 4.2834874435747305e-05,
      "loss": 1.7666,
      "step": 940
    },
    {
      "epoch": 1.7924528301886793,
      "grad_norm": 1.5253274784864974,
      "learning_rate": 4.1750326312336254e-05,
      "loss": 1.7516,
      "step": 950
    },
    {
      "epoch": 1.8113207547169812,
      "grad_norm": 1.3957421524480444,
      "learning_rate": 4.066975412877255e-05,
      "loss": 1.7904,
      "step": 960
    },
    {
      "epoch": 1.830188679245283,
      "grad_norm": 1.399046653332325,
      "learning_rate": 3.959367866807926e-05,
      "loss": 1.7605,
      "step": 970
    },
    {
      "epoch": 1.849056603773585,
      "grad_norm": 1.48580398039922,
      "learning_rate": 3.852261854607866e-05,
      "loss": 1.8169,
      "step": 980
    },
    {
      "epoch": 1.8679245283018868,
      "grad_norm": 1.4703556780094864,
      "learning_rate": 3.7457089961444636e-05,
      "loss": 1.7652,
      "step": 990
    },
    {
      "epoch": 1.8867924528301887,
      "grad_norm": 1.4196287584590106,
      "learning_rate": 3.6397606446920294e-05,
      "loss": 1.75,
      "step": 1000
    },
    {
      "epoch": 1.8867924528301887,
      "eval_loss": 2.2884254455566406,
      "eval_runtime": 165.0682,
      "eval_samples_per_second": 11.413,
      "eval_steps_per_second": 2.853,
      "step": 1000
    },
    {
      "epoch": 1.9056603773584906,
      "grad_norm": 1.442346199206303,
      "learning_rate": 3.534467862182008e-05,
      "loss": 1.7847,
      "step": 1010
    },
    {
      "epoch": 1.9245283018867925,
      "grad_norm": 1.3835916856247392,
      "learning_rate": 3.4298813945936295e-05,
      "loss": 1.7737,
      "step": 1020
    },
    {
      "epoch": 1.9433962264150944,
      "grad_norm": 1.3821884730018883,
      "learning_rate": 3.3260516474968285e-05,
      "loss": 1.7281,
      "step": 1030
    },
    {
      "epoch": 1.9622641509433962,
      "grad_norm": 1.3924722724907153,
      "learning_rate": 3.223028661759211e-05,
      "loss": 1.7924,
      "step": 1040
    },
    {
      "epoch": 1.9811320754716981,
      "grad_norm": 1.3388702147690976,
      "learning_rate": 3.12086208942881e-05,
      "loss": 1.7397,
      "step": 1050
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.4015243388990968,
      "learning_rate": 3.019601169804216e-05,
      "loss": 1.6932,
      "step": 1060
    },
    {
      "epoch": 2.018867924528302,
      "grad_norm": 1.7480746986263314,
      "learning_rate": 2.919294705703647e-05,
      "loss": 0.6881,
      "step": 1070
    },
    {
      "epoch": 2.0377358490566038,
      "grad_norm": 1.7026666847000977,
      "learning_rate": 2.819991039944363e-05,
      "loss": 0.6078,
      "step": 1080
    },
    {
      "epoch": 2.056603773584906,
      "grad_norm": 1.7917514233908862,
      "learning_rate": 2.7217380320437978e-05,
      "loss": 0.6092,
      "step": 1090
    },
    {
      "epoch": 2.0754716981132075,
      "grad_norm": 1.6723597171494868,
      "learning_rate": 2.624583035153609e-05,
      "loss": 0.585,
      "step": 1100
    },
    {
      "epoch": 2.0943396226415096,
      "grad_norm": 1.63904815463906,
      "learning_rate": 2.5285728732377613e-05,
      "loss": 0.577,
      "step": 1110
    },
    {
      "epoch": 2.1132075471698113,
      "grad_norm": 1.6791437732786112,
      "learning_rate": 2.4337538185056762e-05,
      "loss": 0.551,
      "step": 1120
    },
    {
      "epoch": 2.1320754716981134,
      "grad_norm": 1.6076545037137666,
      "learning_rate": 2.3401715691112746e-05,
      "loss": 0.556,
      "step": 1130
    },
    {
      "epoch": 2.150943396226415,
      "grad_norm": 1.726665027733004,
      "learning_rate": 2.247871227128709e-05,
      "loss": 0.5711,
      "step": 1140
    },
    {
      "epoch": 2.169811320754717,
      "grad_norm": 1.6490156416373818,
      "learning_rate": 2.1568972768153556e-05,
      "loss": 0.5601,
      "step": 1150
    },
    {
      "epoch": 2.188679245283019,
      "grad_norm": 1.7210537816210676,
      "learning_rate": 2.067293563172581e-05,
      "loss": 0.5609,
      "step": 1160
    },
    {
      "epoch": 2.207547169811321,
      "grad_norm": 1.6521402147978896,
      "learning_rate": 1.9791032708145963e-05,
      "loss": 0.5417,
      "step": 1170
    },
    {
      "epoch": 2.2264150943396226,
      "grad_norm": 1.7020323862071838,
      "learning_rate": 1.8923689031555697e-05,
      "loss": 0.5635,
      "step": 1180
    },
    {
      "epoch": 2.2452830188679247,
      "grad_norm": 1.5791599921066155,
      "learning_rate": 1.807132261925073e-05,
      "loss": 0.5371,
      "step": 1190
    },
    {
      "epoch": 2.2641509433962264,
      "grad_norm": 1.6370275383685373,
      "learning_rate": 1.7234344270216713e-05,
      "loss": 0.5459,
      "step": 1200
    },
    {
      "epoch": 2.2830188679245285,
      "grad_norm": 1.649807184686461,
      "learning_rate": 1.6413157367144354e-05,
      "loss": 0.5608,
      "step": 1210
    },
    {
      "epoch": 2.30188679245283,
      "grad_norm": 1.7662002841569535,
      "learning_rate": 1.5608157682018505e-05,
      "loss": 0.5613,
      "step": 1220
    },
    {
      "epoch": 2.3207547169811322,
      "grad_norm": 1.641520954901167,
      "learning_rate": 1.4819733185375534e-05,
      "loss": 0.537,
      "step": 1230
    },
    {
      "epoch": 2.339622641509434,
      "grad_norm": 1.6680780951150302,
      "learning_rate": 1.4048263859320344e-05,
      "loss": 0.5425,
      "step": 1240
    },
    {
      "epoch": 2.358490566037736,
      "grad_norm": 1.5858289559337815,
      "learning_rate": 1.3294121514393637e-05,
      "loss": 0.5289,
      "step": 1250
    },
    {
      "epoch": 2.3773584905660377,
      "grad_norm": 1.609281814988441,
      "learning_rate": 1.2557669610377399e-05,
      "loss": 0.5155,
      "step": 1260
    },
    {
      "epoch": 2.3962264150943398,
      "grad_norm": 1.6108061713809745,
      "learning_rate": 1.1839263081124946e-05,
      "loss": 0.5214,
      "step": 1270
    },
    {
      "epoch": 2.4150943396226414,
      "grad_norm": 1.5364583247125485,
      "learning_rate": 1.113924816350026e-05,
      "loss": 0.5326,
      "step": 1280
    },
    {
      "epoch": 2.4339622641509435,
      "grad_norm": 1.523827370861251,
      "learning_rate": 1.04579622305086e-05,
      "loss": 0.5218,
      "step": 1290
    },
    {
      "epoch": 2.452830188679245,
      "grad_norm": 1.6969638639614046,
      "learning_rate": 9.795733628699333e-06,
      "loss": 0.5341,
      "step": 1300
    },
    {
      "epoch": 2.4716981132075473,
      "grad_norm": 1.502222163556516,
      "learning_rate": 9.152881519918787e-06,
      "loss": 0.5102,
      "step": 1310
    },
    {
      "epoch": 2.490566037735849,
      "grad_norm": 1.6251186914379474,
      "learning_rate": 8.529715727489912e-06,
      "loss": 0.5113,
      "step": 1320
    },
    {
      "epoch": 2.509433962264151,
      "grad_norm": 1.641634385361185,
      "learning_rate": 7.926536586892591e-06,
      "loss": 0.51,
      "step": 1330
    },
    {
      "epoch": 2.5283018867924527,
      "grad_norm": 1.564996479749529,
      "learning_rate": 7.3436348010165025e-06,
      "loss": 0.5075,
      "step": 1340
    },
    {
      "epoch": 2.547169811320755,
      "grad_norm": 1.5204914266086813,
      "learning_rate": 6.781291300056647e-06,
      "loss": 0.5111,
      "step": 1350
    },
    {
      "epoch": 2.5660377358490565,
      "grad_norm": 1.5204438359613908,
      "learning_rate": 6.239777106118605e-06,
      "loss": 0.501,
      "step": 1360
    },
    {
      "epoch": 2.5849056603773586,
      "grad_norm": 1.6153170323469739,
      "learning_rate": 5.719353202599209e-06,
      "loss": 0.5065,
      "step": 1370
    },
    {
      "epoch": 2.6037735849056602,
      "grad_norm": 1.532440501266883,
      "learning_rate": 5.220270408405198e-06,
      "loss": 0.5268,
      "step": 1380
    },
    {
      "epoch": 2.6226415094339623,
      "grad_norm": 1.5295028060682831,
      "learning_rate": 4.7427692570708445e-06,
      "loss": 0.5225,
      "step": 1390
    },
    {
      "epoch": 2.641509433962264,
      "grad_norm": 1.5576876729006885,
      "learning_rate": 4.287079880832478e-06,
      "loss": 0.5094,
      "step": 1400
    },
    {
      "epoch": 2.660377358490566,
      "grad_norm": 1.535240889295645,
      "learning_rate": 3.853421899715992e-06,
      "loss": 0.4991,
      "step": 1410
    },
    {
      "epoch": 2.6792452830188678,
      "grad_norm": 1.5668838039374533,
      "learning_rate": 3.44200431569075e-06,
      "loss": 0.5011,
      "step": 1420
    },
    {
      "epoch": 2.69811320754717,
      "grad_norm": 1.6597779325377704,
      "learning_rate": 3.053025411940802e-06,
      "loss": 0.4954,
      "step": 1430
    },
    {
      "epoch": 2.7169811320754715,
      "grad_norm": 1.5562079580978392,
      "learning_rate": 2.6866726573021026e-06,
      "loss": 0.5054,
      "step": 1440
    },
    {
      "epoch": 2.7358490566037736,
      "grad_norm": 1.5996686204830912,
      "learning_rate": 2.3431226159116637e-06,
      "loss": 0.5154,
      "step": 1450
    },
    {
      "epoch": 2.7547169811320753,
      "grad_norm": 1.6603987931741782,
      "learning_rate": 2.022540862112282e-06,
      "loss": 0.5029,
      "step": 1460
    },
    {
      "epoch": 2.7735849056603774,
      "grad_norm": 1.4442160081367916,
      "learning_rate": 1.725081900653791e-06,
      "loss": 0.5147,
      "step": 1470
    },
    {
      "epoch": 2.7924528301886795,
      "grad_norm": 1.5601472307077258,
      "learning_rate": 1.4508890922293018e-06,
      "loss": 0.4882,
      "step": 1480
    },
    {
      "epoch": 2.811320754716981,
      "grad_norm": 1.6882814081660615,
      "learning_rate": 1.2000945843823551e-06,
      "loss": 0.4909,
      "step": 1490
    },
    {
      "epoch": 2.830188679245283,
      "grad_norm": 1.5897926116142052,
      "learning_rate": 9.728192478182574e-07,
      "loss": 0.485,
      "step": 1500
    },
    {
      "epoch": 2.849056603773585,
      "grad_norm": 1.480162495765326,
      "learning_rate": 7.691726181503267e-07,
      "loss": 0.4985,
      "step": 1510
    },
    {
      "epoch": 2.867924528301887,
      "grad_norm": 1.5161543246256077,
      "learning_rate": 5.892528431090393e-07,
      "loss": 0.4816,
      "step": 1520
    },
    {
      "epoch": 2.8867924528301887,
      "grad_norm": 1.5434464499844907,
      "learning_rate": 4.331466352396396e-07,
      "loss": 0.4955,
      "step": 1530
    },
    {
      "epoch": 2.9056603773584904,
      "grad_norm": 1.5292680330833108,
      "learning_rate": 3.009292301109412e-07,
      "loss": 0.5018,
      "step": 1540
    },
    {
      "epoch": 2.9245283018867925,
      "grad_norm": 1.501995031518757,
      "learning_rate": 1.9266435005540483e-07,
      "loss": 0.5011,
      "step": 1550
    },
    {
      "epoch": 2.9433962264150946,
      "grad_norm": 1.5344813758662075,
      "learning_rate": 1.0840417345814313e-07,
      "loss": 0.5141,
      "step": 1560
    },
    {
      "epoch": 2.9622641509433962,
      "grad_norm": 1.5204098865333115,
      "learning_rate": 4.818930960945878e-08,
      "loss": 0.4904,
      "step": 1570
    },
    {
      "epoch": 2.981132075471698,
      "grad_norm": 1.5256874098586901,
      "learning_rate": 1.2048779133150279e-08,
      "loss": 0.4746,
      "step": 1580
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.4804382321073322,
      "learning_rate": 0.0,
      "loss": 0.5039,
      "step": 1590
    },
    {
      "epoch": 3.0,
      "step": 1590,
      "total_flos": 83202240675840.0,
      "train_loss": 1.594443890733539,
      "train_runtime": 15602.0514,
      "train_samples_per_second": 3.26,
      "train_steps_per_second": 0.102
    }
  ],
  "logging_steps": 10,
  "max_steps": 1590,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 83202240675840.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}