{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.732620320855615,
  "eval_steps": 12,
  "global_step": 138,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0213903743315508,
      "grad_norm": 23.385425567626953,
      "learning_rate": 2e-05,
      "loss": 2.2375,
      "step": 1
    },
    {
      "epoch": 0.0213903743315508,
      "eval_loss": 2.295367479324341,
      "eval_runtime": 16.9622,
      "eval_samples_per_second": 17.686,
      "eval_steps_per_second": 8.843,
      "step": 1
    },
    {
      "epoch": 0.0427807486631016,
      "grad_norm": 31.645936965942383,
      "learning_rate": 4e-05,
      "loss": 2.3841,
      "step": 2
    },
    {
      "epoch": 0.06417112299465241,
      "grad_norm": 34.55805206298828,
      "learning_rate": 6e-05,
      "loss": 2.6477,
      "step": 3
    },
    {
      "epoch": 0.0855614973262032,
      "grad_norm": 32.25712966918945,
      "learning_rate": 8e-05,
      "loss": 1.9082,
      "step": 4
    },
    {
      "epoch": 0.10695187165775401,
      "grad_norm": 25.384239196777344,
      "learning_rate": 0.0001,
      "loss": 1.5055,
      "step": 5
    },
    {
      "epoch": 0.12834224598930483,
      "grad_norm": 22.698801040649414,
      "learning_rate": 0.00012,
      "loss": 1.237,
      "step": 6
    },
    {
      "epoch": 0.1497326203208556,
      "grad_norm": 25.194936752319336,
      "learning_rate": 0.00014,
      "loss": 1.1552,
      "step": 7
    },
    {
      "epoch": 0.1711229946524064,
      "grad_norm": 17.266931533813477,
      "learning_rate": 0.00016,
      "loss": 0.9119,
      "step": 8
    },
    {
      "epoch": 0.1925133689839572,
      "grad_norm": 15.346185684204102,
      "learning_rate": 0.00018,
      "loss": 0.834,
      "step": 9
    },
    {
      "epoch": 0.21390374331550802,
      "grad_norm": 26.989032745361328,
      "learning_rate": 0.0002,
      "loss": 0.7533,
      "step": 10
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 11.042716979980469,
      "learning_rate": 0.00019998370105646414,
      "loss": 0.6231,
      "step": 11
    },
    {
      "epoch": 0.25668449197860965,
      "grad_norm": 8.389060020446777,
      "learning_rate": 0.0001999348095389677,
      "loss": 0.4795,
      "step": 12
    },
    {
      "epoch": 0.25668449197860965,
      "eval_loss": 0.612250566482544,
      "eval_runtime": 17.2275,
      "eval_samples_per_second": 17.414,
      "eval_steps_per_second": 8.707,
      "step": 12
    },
    {
      "epoch": 0.27807486631016043,
      "grad_norm": 32.6131477355957,
      "learning_rate": 0.00019985334138511237,
      "loss": 0.6842,
      "step": 13
    },
    {
      "epoch": 0.2994652406417112,
      "grad_norm": 35.244876861572266,
      "learning_rate": 0.000199739323151795,
      "loss": 1.0486,
      "step": 14
    },
    {
      "epoch": 0.32085561497326204,
      "grad_norm": 14.032343864440918,
      "learning_rate": 0.00019959279200655044,
      "loss": 0.9261,
      "step": 15
    },
    {
      "epoch": 0.3422459893048128,
      "grad_norm": 14.313758850097656,
      "learning_rate": 0.00019941379571543596,
      "loss": 0.811,
      "step": 16
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 14.679015159606934,
      "learning_rate": 0.00019920239262746043,
      "loss": 0.7831,
      "step": 17
    },
    {
      "epoch": 0.3850267379679144,
      "grad_norm": 14.768975257873535,
      "learning_rate": 0.00019895865165556377,
      "loss": 0.5663,
      "step": 18
    },
    {
      "epoch": 0.40641711229946526,
      "grad_norm": 16.57910919189453,
      "learning_rate": 0.00019868265225415265,
      "loss": 0.7238,
      "step": 19
    },
    {
      "epoch": 0.42780748663101603,
      "grad_norm": 11.922738075256348,
      "learning_rate": 0.00019837448439320027,
      "loss": 0.6445,
      "step": 20
    },
    {
      "epoch": 0.44919786096256686,
      "grad_norm": 17.637170791625977,
      "learning_rate": 0.00019803424852891802,
      "loss": 1.2175,
      "step": 21
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 9.833541870117188,
      "learning_rate": 0.00019766205557100868,
      "loss": 0.8675,
      "step": 22
    },
    {
      "epoch": 0.4919786096256685,
      "grad_norm": 11.809141159057617,
      "learning_rate": 0.00019725802684651233,
      "loss": 1.1414,
      "step": 23
    },
    {
      "epoch": 0.5133689839572193,
      "grad_norm": 18.123613357543945,
      "learning_rate": 0.00019682229406025635,
      "loss": 0.9939,
      "step": 24
    },
    {
      "epoch": 0.5133689839572193,
      "eval_loss": 0.6862085461616516,
      "eval_runtime": 17.2196,
      "eval_samples_per_second": 17.422,
      "eval_steps_per_second": 8.711,
      "step": 24
    },
    {
      "epoch": 0.5347593582887701,
      "grad_norm": 11.453226089477539,
      "learning_rate": 0.0001963549992519223,
      "loss": 0.8239,
      "step": 25
    },
    {
      "epoch": 0.5561497326203209,
      "grad_norm": 7.978896617889404,
      "learning_rate": 0.00019585629474974415,
      "loss": 0.5823,
      "step": 26
    },
    {
      "epoch": 0.5775401069518716,
      "grad_norm": 7.980404376983643,
      "learning_rate": 0.0001953263431208523,
      "loss": 0.6231,
      "step": 27
    },
    {
      "epoch": 0.5989304812834224,
      "grad_norm": 12.718688011169434,
      "learning_rate": 0.00019476531711828027,
      "loss": 0.6814,
      "step": 28
    },
    {
      "epoch": 0.6203208556149733,
      "grad_norm": 11.378684997558594,
      "learning_rate": 0.00019417339962465082,
      "loss": 0.8471,
      "step": 29
    },
    {
      "epoch": 0.6417112299465241,
      "grad_norm": 10.157166481018066,
      "learning_rate": 0.0001935507835925601,
      "loss": 0.6058,
      "step": 30
    },
    {
      "epoch": 0.6631016042780749,
      "grad_norm": 10.623138427734375,
      "learning_rate": 0.00019289767198167916,
      "loss": 0.81,
      "step": 31
    },
    {
      "epoch": 0.6844919786096256,
      "grad_norm": 16.875181198120117,
      "learning_rate": 0.00019221427769259333,
      "loss": 0.8037,
      "step": 32
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 11.729046821594238,
      "learning_rate": 0.0001915008234974012,
      "loss": 0.5548,
      "step": 33
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 10.128629684448242,
      "learning_rate": 0.00019075754196709572,
      "loss": 0.6979,
      "step": 34
    },
    {
      "epoch": 0.7486631016042781,
      "grad_norm": 10.893034934997559,
      "learning_rate": 0.0001899846753957507,
      "loss": 0.6173,
      "step": 35
    },
    {
      "epoch": 0.7700534759358288,
      "grad_norm": 11.063718795776367,
      "learning_rate": 0.00018918247572153823,
      "loss": 0.6885,
      "step": 36
    },
    {
      "epoch": 0.7700534759358288,
      "eval_loss": 0.6894535422325134,
      "eval_runtime": 16.9719,
      "eval_samples_per_second": 17.676,
      "eval_steps_per_second": 8.838,
      "step": 36
    },
    {
      "epoch": 0.7914438502673797,
      "grad_norm": 13.273902893066406,
      "learning_rate": 0.0001883512044446023,
      "loss": 0.8057,
      "step": 37
    },
    {
      "epoch": 0.8128342245989305,
      "grad_norm": 17.475479125976562,
      "learning_rate": 0.00018749113254181498,
      "loss": 0.8596,
      "step": 38
    },
    {
      "epoch": 0.8342245989304813,
      "grad_norm": 13.216853141784668,
      "learning_rate": 0.00018660254037844388,
      "loss": 0.7885,
      "step": 39
    },
    {
      "epoch": 0.8556149732620321,
      "grad_norm": 11.209785461425781,
      "learning_rate": 0.00018568571761675893,
      "loss": 0.7028,
      "step": 40
    },
    {
      "epoch": 0.8770053475935828,
      "grad_norm": 8.61646556854248,
      "learning_rate": 0.00018474096312160864,
      "loss": 0.5775,
      "step": 41
    },
    {
      "epoch": 0.8983957219251337,
      "grad_norm": 10.86780071258545,
      "learning_rate": 0.00018376858486299647,
      "loss": 0.8106,
      "step": 42
    },
    {
      "epoch": 0.9197860962566845,
      "grad_norm": 6.3408427238464355,
      "learning_rate": 0.00018276889981568906,
      "loss": 0.4089,
      "step": 43
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 19.630786895751953,
      "learning_rate": 0.00018174223385588917,
      "loss": 1.1041,
      "step": 44
    },
    {
      "epoch": 0.9625668449197861,
      "grad_norm": 10.352812767028809,
      "learning_rate": 0.00018068892165500704,
      "loss": 0.749,
      "step": 45
    },
    {
      "epoch": 0.983957219251337,
      "grad_norm": 8.671784400939941,
      "learning_rate": 0.00017960930657056438,
      "loss": 0.6975,
      "step": 46
    },
    {
      "epoch": 1.0053475935828877,
      "grad_norm": 10.529078483581543,
      "learning_rate": 0.00017850374053426723,
      "loss": 0.5922,
      "step": 47
    },
    {
      "epoch": 1.0267379679144386,
      "grad_norm": 9.373391151428223,
      "learning_rate": 0.00017737258393728364,
      "loss": 0.729,
      "step": 48
    },
    {
      "epoch": 1.0267379679144386,
      "eval_loss": 0.6926581859588623,
      "eval_runtime": 17.8687,
      "eval_samples_per_second": 16.789,
      "eval_steps_per_second": 8.395,
      "step": 48
    },
    {
      "epoch": 1.0481283422459893,
      "grad_norm": 18.76338005065918,
      "learning_rate": 0.00017621620551276366,
      "loss": 0.9321,
      "step": 49
    },
    {
      "epoch": 1.0695187165775402,
      "grad_norm": 7.443489074707031,
      "learning_rate": 0.00017503498221564025,
      "loss": 0.5063,
      "step": 50
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 8.333600044250488,
      "learning_rate": 0.00017382929909974987,
      "loss": 0.4377,
      "step": 51
    },
    {
      "epoch": 1.1122994652406417,
      "grad_norm": 14.056548118591309,
      "learning_rate": 0.0001725995491923131,
      "loss": 1.2233,
      "step": 52
    },
    {
      "epoch": 1.0213903743315509,
      "grad_norm": 13.999143600463867,
      "learning_rate": 0.00017134613336581599,
      "loss": 0.3174,
      "step": 53
    },
    {
      "epoch": 1.0427807486631016,
      "grad_norm": 10.154765129089355,
      "learning_rate": 0.00017006946020733425,
      "loss": 0.162,
      "step": 54
    },
    {
      "epoch": 1.0641711229946524,
      "grad_norm": 4.229214191436768,
      "learning_rate": 0.00016876994588534234,
      "loss": 0.1357,
      "step": 55
    },
    {
      "epoch": 1.085561497326203,
      "grad_norm": 4.834641456604004,
      "learning_rate": 0.0001674480140140514,
      "loss": 0.1686,
      "step": 56
    },
    {
      "epoch": 1.106951871657754,
      "grad_norm": 8.32540225982666,
      "learning_rate": 0.00016610409551532005,
      "loss": 0.2455,
      "step": 57
    },
    {
      "epoch": 1.1283422459893049,
      "grad_norm": 5.407528400421143,
      "learning_rate": 0.00016473862847818277,
      "loss": 0.1116,
      "step": 58
    },
    {
      "epoch": 1.1497326203208555,
      "grad_norm": 5.010147571563721,
      "learning_rate": 0.0001633520580160424,
      "loss": 0.1156,
      "step": 59
    },
    {
      "epoch": 1.1711229946524064,
      "grad_norm": 6.022050380706787,
      "learning_rate": 0.0001619448361215723,
      "loss": 0.225,
      "step": 60
    },
    {
      "epoch": 1.1711229946524064,
      "eval_loss": 0.7360826134681702,
      "eval_runtime": 17.9845,
      "eval_samples_per_second": 16.681,
      "eval_steps_per_second": 8.34,
      "step": 60
    },
    {
      "epoch": 1.192513368983957,
      "grad_norm": 5.873435974121094,
      "learning_rate": 0.00016051742151937655,
      "loss": 0.0572,
      "step": 61
    },
    {
      "epoch": 1.213903743315508,
      "grad_norm": 10.556258201599121,
      "learning_rate": 0.0001590702795164551,
      "loss": 0.1261,
      "step": 62
    },
    {
      "epoch": 1.2352941176470589,
      "grad_norm": 17.438634872436523,
      "learning_rate": 0.00015760388185052398,
      "loss": 0.382,
      "step": 63
    },
    {
      "epoch": 1.2566844919786098,
      "grad_norm": 8.371882438659668,
      "learning_rate": 0.00015611870653623825,
      "loss": 0.228,
      "step": 64
    },
    {
      "epoch": 1.2780748663101604,
      "grad_norm": 9.435052871704102,
      "learning_rate": 0.0001546152377093697,
      "loss": 0.1978,
      "step": 65
    },
    {
      "epoch": 1.299465240641711,
      "grad_norm": 8.652084350585938,
      "learning_rate": 0.0001530939654689887,
      "loss": 0.1587,
      "step": 66
    },
    {
      "epoch": 1.320855614973262,
      "grad_norm": 8.71427059173584,
      "learning_rate": 0.00015155538571770218,
      "loss": 0.0637,
      "step": 67
    },
    {
      "epoch": 1.3422459893048129,
      "grad_norm": 5.761900901794434,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.2027,
      "step": 68
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 8.064814567565918,
      "learning_rate": 0.00014842831533876195,
      "loss": 0.1595,
      "step": 69
    },
    {
      "epoch": 1.3850267379679144,
      "grad_norm": 7.962220668792725,
      "learning_rate": 0.00014684084406997903,
      "loss": 0.3658,
      "step": 70
    },
    {
      "epoch": 1.4064171122994653,
      "grad_norm": 4.9641242027282715,
      "learning_rate": 0.00014523810367574272,
      "loss": 0.1368,
      "step": 71
    },
    {
      "epoch": 1.427807486631016,
      "grad_norm": 7.027231216430664,
      "learning_rate": 0.00014362061661555675,
      "loss": 0.1688,
      "step": 72
    },
    {
      "epoch": 1.427807486631016,
      "eval_loss": 0.8078603148460388,
      "eval_runtime": 17.7522,
      "eval_samples_per_second": 16.899,
      "eval_steps_per_second": 8.45,
      "step": 72
    },
    {
      "epoch": 1.4491978609625669,
      "grad_norm": 3.6481807231903076,
      "learning_rate": 0.00014198891015602646,
      "loss": 0.0844,
      "step": 73
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 7.879701614379883,
      "learning_rate": 0.00014034351619898088,
      "loss": 0.3332,
      "step": 74
    },
    {
      "epoch": 1.4919786096256684,
      "grad_norm": 5.169126987457275,
      "learning_rate": 0.00013868497110808395,
      "loss": 0.1301,
      "step": 75
    },
    {
      "epoch": 1.5133689839572193,
      "grad_norm": 5.538336753845215,
      "learning_rate": 0.00013701381553399145,
      "loss": 0.145,
      "step": 76
    },
    {
      "epoch": 1.53475935828877,
      "grad_norm": 25.778339385986328,
      "learning_rate": 0.00013533059423811026,
      "loss": 0.2704,
      "step": 77
    },
    {
      "epoch": 1.5561497326203209,
      "grad_norm": 5.043937683105469,
      "learning_rate": 0.0001336358559150175,
      "loss": 0.1458,
      "step": 78
    },
    {
      "epoch": 1.5775401069518717,
      "grad_norm": 7.204008102416992,
      "learning_rate": 0.000131930153013598,
      "loss": 0.1012,
      "step": 79
    },
    {
      "epoch": 1.5989304812834224,
      "grad_norm": 8.367019653320312,
      "learning_rate": 0.00013021404155695725,
      "loss": 0.1889,
      "step": 80
    },
    {
      "epoch": 1.6203208556149733,
      "grad_norm": 5.196023464202881,
      "learning_rate": 0.00012848808096117,
      "loss": 0.0881,
      "step": 81
    },
    {
      "epoch": 1.641711229946524,
      "grad_norm": 8.067776679992676,
      "learning_rate": 0.00012675283385292212,
      "loss": 0.2948,
      "step": 82
    },
    {
      "epoch": 1.6631016042780749,
      "grad_norm": 8.772676467895508,
      "learning_rate": 0.0001250088658861063,
      "loss": 0.154,
      "step": 83
    },
    {
      "epoch": 1.6844919786096257,
      "grad_norm": 16.28201675415039,
      "learning_rate": 0.00012325674555743106,
      "loss": 0.3085,
      "step": 84
    },
    {
      "epoch": 1.6844919786096257,
      "eval_loss": 0.780529797077179,
      "eval_runtime": 17.4078,
      "eval_samples_per_second": 17.234,
      "eval_steps_per_second": 8.617,
      "step": 84
    },
    {
      "epoch": 1.7058823529411766,
      "grad_norm": 8.888002395629883,
      "learning_rate": 0.00012149704402110243,
      "loss": 0.2488,
      "step": 85
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 6.574310779571533,
      "learning_rate": 0.00011973033490264001,
      "loss": 0.1698,
      "step": 86
    },
    {
      "epoch": 1.748663101604278,
      "grad_norm": 10.671408653259277,
      "learning_rate": 0.00011795719411188718,
      "loss": 0.1778,
      "step": 87
    },
    {
      "epoch": 1.7700534759358288,
      "grad_norm": 6.570982933044434,
      "learning_rate": 0.0001161781996552765,
      "loss": 0.115,
      "step": 88
    },
    {
      "epoch": 1.7914438502673797,
      "grad_norm": 11.171520233154297,
      "learning_rate": 0.0001143939314474119,
      "loss": 0.2032,
      "step": 89
    },
    {
      "epoch": 1.8128342245989306,
      "grad_norm": 4.231898307800293,
      "learning_rate": 0.00011260497112202895,
      "loss": 0.0596,
      "step": 90
    },
    {
      "epoch": 1.8342245989304813,
      "grad_norm": 5.200542449951172,
      "learning_rate": 0.00011081190184239419,
      "loss": 0.1294,
      "step": 91
    },
    {
      "epoch": 1.855614973262032,
      "grad_norm": 6.871387958526611,
      "learning_rate": 0.00010901530811120655,
      "loss": 0.283,
      "step": 92
    },
    {
      "epoch": 1.8770053475935828,
      "grad_norm": 4.964606285095215,
      "learning_rate": 0.00010721577558006164,
      "loss": 0.2673,
      "step": 93
    },
    {
      "epoch": 1.8983957219251337,
      "grad_norm": 8.798148155212402,
      "learning_rate": 0.00010541389085854176,
      "loss": 0.2055,
      "step": 94
    },
    {
      "epoch": 1.9197860962566846,
      "grad_norm": 5.59722375869751,
      "learning_rate": 0.00010361024132299364,
      "loss": 0.209,
      "step": 95
    },
    {
      "epoch": 1.9411764705882353,
      "grad_norm": 8.256745338439941,
      "learning_rate": 0.00010180541492505604,
      "loss": 0.1079,
      "step": 96
    },
    {
      "epoch": 1.9411764705882353,
      "eval_loss": 0.7684900760650635,
      "eval_runtime": 17.746,
      "eval_samples_per_second": 16.905,
      "eval_steps_per_second": 8.453,
      "step": 96
    },
    {
      "epoch": 1.962566844919786,
      "grad_norm": 4.481995582580566,
      "learning_rate": 0.0001,
      "loss": 0.1824,
      "step": 97
    },
    {
      "epoch": 1.9839572192513368,
      "grad_norm": 3.852792263031006,
      "learning_rate": 9.819458507494394e-05,
      "loss": 0.124,
      "step": 98
    },
    {
      "epoch": 2.0053475935828877,
      "grad_norm": 29.360998153686523,
      "learning_rate": 9.638975867700638e-05,
      "loss": 0.1844,
      "step": 99
    },
    {
      "epoch": 2.0267379679144386,
      "grad_norm": 4.80811071395874,
      "learning_rate": 9.458610914145826e-05,
      "loss": 0.1347,
      "step": 100
    },
    {
      "epoch": 2.048128342245989,
      "grad_norm": 17.53403091430664,
      "learning_rate": 9.27842244199384e-05,
      "loss": 0.3378,
      "step": 101
    },
    {
      "epoch": 2.06951871657754,
      "grad_norm": 7.34214973449707,
      "learning_rate": 9.098469188879349e-05,
      "loss": 0.2045,
      "step": 102
    },
    {
      "epoch": 2.090909090909091,
      "grad_norm": 4.968944072723389,
      "learning_rate": 8.918809815760585e-05,
      "loss": 0.2284,
      "step": 103
    },
    {
      "epoch": 2.0053475935828877,
      "grad_norm": 4.202042579650879,
      "learning_rate": 8.739502887797107e-05,
      "loss": 0.1655,
      "step": 104
    },
    {
      "epoch": 2.0267379679144386,
      "grad_norm": 3.561790704727173,
      "learning_rate": 8.560606855258808e-05,
      "loss": 0.0442,
      "step": 105
    },
    {
      "epoch": 2.0481283422459895,
      "grad_norm": 3.8292624950408936,
      "learning_rate": 8.382180034472353e-05,
      "loss": 0.0821,
      "step": 106
    },
    {
      "epoch": 2.06951871657754,
      "grad_norm": 2.1650640964508057,
      "learning_rate": 8.204280588811283e-05,
      "loss": 0.0384,
      "step": 107
    },
    {
      "epoch": 2.090909090909091,
      "grad_norm": 1.6922334432601929,
      "learning_rate": 8.026966509736001e-05,
      "loss": 0.0342,
      "step": 108
    },
    {
      "epoch": 2.090909090909091,
      "eval_loss": 0.7716657519340515,
      "eval_runtime": 17.9156,
      "eval_samples_per_second": 16.745,
      "eval_steps_per_second": 8.373,
      "step": 108
    },
    {
      "epoch": 2.1122994652406417,
      "grad_norm": 1.0654356479644775,
      "learning_rate": 7.85029559788976e-05,
      "loss": 0.0184,
      "step": 109
    },
    {
      "epoch": 2.1336898395721926,
      "grad_norm": 3.1057019233703613,
      "learning_rate": 7.674325444256899e-05,
      "loss": 0.0417,
      "step": 110
    },
    {
      "epoch": 2.1550802139037435,
      "grad_norm": 0.19042205810546875,
      "learning_rate": 7.499113411389371e-05,
      "loss": 0.0026,
      "step": 111
    },
    {
      "epoch": 2.176470588235294,
      "grad_norm": 1.5116851329803467,
      "learning_rate": 7.324716614707793e-05,
      "loss": 0.0089,
      "step": 112
    },
    {
      "epoch": 2.197860962566845,
      "grad_norm": 2.5151679515838623,
      "learning_rate": 7.151191903883001e-05,
      "loss": 0.0357,
      "step": 113
    },
    {
      "epoch": 2.2192513368983957,
      "grad_norm": 2.838503837585449,
      "learning_rate": 6.978595844304271e-05,
      "loss": 0.0366,
      "step": 114
    },
    {
      "epoch": 2.2406417112299466,
      "grad_norm": 3.835000514984131,
      "learning_rate": 6.806984698640202e-05,
      "loss": 0.1412,
      "step": 115
    },
    {
      "epoch": 2.2620320855614975,
      "grad_norm": 3.4443538188934326,
      "learning_rate": 6.636414408498249e-05,
      "loss": 0.0707,
      "step": 116
    },
    {
      "epoch": 2.283422459893048,
      "grad_norm": 2.701524496078491,
      "learning_rate": 6.466940576188977e-05,
      "loss": 0.0497,
      "step": 117
    },
    {
      "epoch": 2.304812834224599,
      "grad_norm": 2.612593412399292,
      "learning_rate": 6.298618446600856e-05,
      "loss": 0.052,
      "step": 118
    },
    {
      "epoch": 2.3262032085561497,
      "grad_norm": 4.986962795257568,
      "learning_rate": 6.13150288919161e-05,
      "loss": 0.1255,
      "step": 119
    },
    {
      "epoch": 2.3475935828877006,
      "grad_norm": 1.8598374128341675,
      "learning_rate": 5.965648380101916e-05,
      "loss": 0.0309,
      "step": 120
    },
    {
      "epoch": 2.3475935828877006,
      "eval_loss": 0.785007119178772,
      "eval_runtime": 17.7923,
      "eval_samples_per_second": 16.861,
      "eval_steps_per_second": 8.431,
      "step": 120
    },
    {
      "epoch": 2.3689839572192515,
      "grad_norm": 1.5813214778900146,
      "learning_rate": 5.801108984397354e-05,
      "loss": 0.0201,
      "step": 121
    },
    {
      "epoch": 2.3903743315508024,
      "grad_norm": 0.13843385875225067,
      "learning_rate": 5.6379383384443255e-05,
      "loss": 0.0018,
      "step": 122
    },
    {
      "epoch": 2.411764705882353,
      "grad_norm": 4.4155707359313965,
      "learning_rate": 5.476189632425732e-05,
      "loss": 0.0326,
      "step": 123
    },
    {
      "epoch": 2.4331550802139037,
      "grad_norm": 3.5101325511932373,
      "learning_rate": 5.3159155930021e-05,
      "loss": 0.0259,
      "step": 124
    },
    {
      "epoch": 2.4545454545454546,
      "grad_norm": 5.201532363891602,
      "learning_rate": 5.1571684661238075e-05,
      "loss": 0.0761,
      "step": 125
    },
    {
      "epoch": 2.4759358288770055,
      "grad_norm": 2.48543119430542,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.0587,
      "step": 126
    },
    {
      "epoch": 2.497326203208556,
      "grad_norm": 7.39755916595459,
      "learning_rate": 4.844461428229782e-05,
      "loss": 0.0391,
      "step": 127
    },
    {
      "epoch": 2.518716577540107,
      "grad_norm": 4.151485443115234,
      "learning_rate": 4.6906034531011346e-05,
      "loss": 0.0982,
      "step": 128
    },
    {
      "epoch": 2.5401069518716577,
      "grad_norm": 4.144845485687256,
      "learning_rate": 4.53847622906303e-05,
      "loss": 0.0707,
      "step": 129
    },
    {
      "epoch": 2.5614973262032086,
      "grad_norm": 7.3682732582092285,
      "learning_rate": 4.388129346376178e-05,
      "loss": 0.0455,
      "step": 130
    },
    {
      "epoch": 2.5828877005347595,
      "grad_norm": 4.947929382324219,
      "learning_rate": 4.239611814947605e-05,
      "loss": 0.033,
      "step": 131
    },
    {
      "epoch": 2.6042780748663104,
      "grad_norm": 3.0208606719970703,
      "learning_rate": 4.092972048354491e-05,
      "loss": 0.0373,
      "step": 132
    },
    {
      "epoch": 2.6042780748663104,
      "eval_loss": 0.776565432548523,
      "eval_runtime": 17.3019,
      "eval_samples_per_second": 17.339,
      "eval_steps_per_second": 8.67,
      "step": 132
    },
    {
      "epoch": 2.625668449197861,
      "grad_norm": 7.514610290527344,
      "learning_rate": 3.948257848062351e-05,
      "loss": 0.0323,
      "step": 133
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 1.8352607488632202,
      "learning_rate": 3.80551638784277e-05,
      "loss": 0.043,
      "step": 134
    },
    {
      "epoch": 2.6684491978609626,
      "grad_norm": 3.525506019592285,
      "learning_rate": 3.664794198395764e-05,
      "loss": 0.0643,
      "step": 135
    },
    {
      "epoch": 2.6898395721925135,
      "grad_norm": 5.074891567230225,
      "learning_rate": 3.5261371521817244e-05,
      "loss": 0.0658,
      "step": 136
    },
    {
      "epoch": 2.711229946524064,
      "grad_norm": 3.6220922470092773,
      "learning_rate": 3.3895904484679984e-05,
      "loss": 0.1535,
      "step": 137
    },
    {
      "epoch": 2.732620320855615,
      "grad_norm": 3.9044840335845947,
      "learning_rate": 3.2551985985948616e-05,
      "loss": 0.0572,
      "step": 138
    }
  ],
  "logging_steps": 1,
  "max_steps": 184,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 46,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3759516646572032e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}