learned-league-4a/last-checkpoint/trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.732620320855615,
"eval_steps": 12,
"global_step": 138,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0213903743315508,
"grad_norm": 23.385425567626953,
"learning_rate": 2e-05,
"loss": 2.2375,
"step": 1
},
{
"epoch": 0.0213903743315508,
"eval_loss": 2.295367479324341,
"eval_runtime": 16.9622,
"eval_samples_per_second": 17.686,
"eval_steps_per_second": 8.843,
"step": 1
},
{
"epoch": 0.0427807486631016,
"grad_norm": 31.645936965942383,
"learning_rate": 4e-05,
"loss": 2.3841,
"step": 2
},
{
"epoch": 0.06417112299465241,
"grad_norm": 34.55805206298828,
"learning_rate": 6e-05,
"loss": 2.6477,
"step": 3
},
{
"epoch": 0.0855614973262032,
"grad_norm": 32.25712966918945,
"learning_rate": 8e-05,
"loss": 1.9082,
"step": 4
},
{
"epoch": 0.10695187165775401,
"grad_norm": 25.384239196777344,
"learning_rate": 0.0001,
"loss": 1.5055,
"step": 5
},
{
"epoch": 0.12834224598930483,
"grad_norm": 22.698801040649414,
"learning_rate": 0.00012,
"loss": 1.237,
"step": 6
},
{
"epoch": 0.1497326203208556,
"grad_norm": 25.194936752319336,
"learning_rate": 0.00014,
"loss": 1.1552,
"step": 7
},
{
"epoch": 0.1711229946524064,
"grad_norm": 17.266931533813477,
"learning_rate": 0.00016,
"loss": 0.9119,
"step": 8
},
{
"epoch": 0.1925133689839572,
"grad_norm": 15.346185684204102,
"learning_rate": 0.00018,
"loss": 0.834,
"step": 9
},
{
"epoch": 0.21390374331550802,
"grad_norm": 26.989032745361328,
"learning_rate": 0.0002,
"loss": 0.7533,
"step": 10
},
{
"epoch": 0.23529411764705882,
"grad_norm": 11.042716979980469,
"learning_rate": 0.00019998370105646414,
"loss": 0.6231,
"step": 11
},
{
"epoch": 0.25668449197860965,
"grad_norm": 8.389060020446777,
"learning_rate": 0.0001999348095389677,
"loss": 0.4795,
"step": 12
},
{
"epoch": 0.25668449197860965,
"eval_loss": 0.612250566482544,
"eval_runtime": 17.2275,
"eval_samples_per_second": 17.414,
"eval_steps_per_second": 8.707,
"step": 12
},
{
"epoch": 0.27807486631016043,
"grad_norm": 32.6131477355957,
"learning_rate": 0.00019985334138511237,
"loss": 0.6842,
"step": 13
},
{
"epoch": 0.2994652406417112,
"grad_norm": 35.244876861572266,
"learning_rate": 0.000199739323151795,
"loss": 1.0486,
"step": 14
},
{
"epoch": 0.32085561497326204,
"grad_norm": 14.032343864440918,
"learning_rate": 0.00019959279200655044,
"loss": 0.9261,
"step": 15
},
{
"epoch": 0.3422459893048128,
"grad_norm": 14.313758850097656,
"learning_rate": 0.00019941379571543596,
"loss": 0.811,
"step": 16
},
{
"epoch": 0.36363636363636365,
"grad_norm": 14.679015159606934,
"learning_rate": 0.00019920239262746043,
"loss": 0.7831,
"step": 17
},
{
"epoch": 0.3850267379679144,
"grad_norm": 14.768975257873535,
"learning_rate": 0.00019895865165556377,
"loss": 0.5663,
"step": 18
},
{
"epoch": 0.40641711229946526,
"grad_norm": 16.57910919189453,
"learning_rate": 0.00019868265225415265,
"loss": 0.7238,
"step": 19
},
{
"epoch": 0.42780748663101603,
"grad_norm": 11.922738075256348,
"learning_rate": 0.00019837448439320027,
"loss": 0.6445,
"step": 20
},
{
"epoch": 0.44919786096256686,
"grad_norm": 17.637170791625977,
"learning_rate": 0.00019803424852891802,
"loss": 1.2175,
"step": 21
},
{
"epoch": 0.47058823529411764,
"grad_norm": 9.833541870117188,
"learning_rate": 0.00019766205557100868,
"loss": 0.8675,
"step": 22
},
{
"epoch": 0.4919786096256685,
"grad_norm": 11.809141159057617,
"learning_rate": 0.00019725802684651233,
"loss": 1.1414,
"step": 23
},
{
"epoch": 0.5133689839572193,
"grad_norm": 18.123613357543945,
"learning_rate": 0.00019682229406025635,
"loss": 0.9939,
"step": 24
},
{
"epoch": 0.5133689839572193,
"eval_loss": 0.6862085461616516,
"eval_runtime": 17.2196,
"eval_samples_per_second": 17.422,
"eval_steps_per_second": 8.711,
"step": 24
},
{
"epoch": 0.5347593582887701,
"grad_norm": 11.453226089477539,
"learning_rate": 0.0001963549992519223,
"loss": 0.8239,
"step": 25
},
{
"epoch": 0.5561497326203209,
"grad_norm": 7.978896617889404,
"learning_rate": 0.00019585629474974415,
"loss": 0.5823,
"step": 26
},
{
"epoch": 0.5775401069518716,
"grad_norm": 7.980404376983643,
"learning_rate": 0.0001953263431208523,
"loss": 0.6231,
"step": 27
},
{
"epoch": 0.5989304812834224,
"grad_norm": 12.718688011169434,
"learning_rate": 0.00019476531711828027,
"loss": 0.6814,
"step": 28
},
{
"epoch": 0.6203208556149733,
"grad_norm": 11.378684997558594,
"learning_rate": 0.00019417339962465082,
"loss": 0.8471,
"step": 29
},
{
"epoch": 0.6417112299465241,
"grad_norm": 10.157166481018066,
"learning_rate": 0.0001935507835925601,
"loss": 0.6058,
"step": 30
},
{
"epoch": 0.6631016042780749,
"grad_norm": 10.623138427734375,
"learning_rate": 0.00019289767198167916,
"loss": 0.81,
"step": 31
},
{
"epoch": 0.6844919786096256,
"grad_norm": 16.875181198120117,
"learning_rate": 0.00019221427769259333,
"loss": 0.8037,
"step": 32
},
{
"epoch": 0.7058823529411765,
"grad_norm": 11.729046821594238,
"learning_rate": 0.0001915008234974012,
"loss": 0.5548,
"step": 33
},
{
"epoch": 0.7272727272727273,
"grad_norm": 10.128629684448242,
"learning_rate": 0.00019075754196709572,
"loss": 0.6979,
"step": 34
},
{
"epoch": 0.7486631016042781,
"grad_norm": 10.893034934997559,
"learning_rate": 0.0001899846753957507,
"loss": 0.6173,
"step": 35
},
{
"epoch": 0.7700534759358288,
"grad_norm": 11.063718795776367,
"learning_rate": 0.00018918247572153823,
"loss": 0.6885,
"step": 36
},
{
"epoch": 0.7700534759358288,
"eval_loss": 0.6894535422325134,
"eval_runtime": 16.9719,
"eval_samples_per_second": 17.676,
"eval_steps_per_second": 8.838,
"step": 36
},
{
"epoch": 0.7914438502673797,
"grad_norm": 13.273902893066406,
"learning_rate": 0.0001883512044446023,
"loss": 0.8057,
"step": 37
},
{
"epoch": 0.8128342245989305,
"grad_norm": 17.475479125976562,
"learning_rate": 0.00018749113254181498,
"loss": 0.8596,
"step": 38
},
{
"epoch": 0.8342245989304813,
"grad_norm": 13.216853141784668,
"learning_rate": 0.00018660254037844388,
"loss": 0.7885,
"step": 39
},
{
"epoch": 0.8556149732620321,
"grad_norm": 11.209785461425781,
"learning_rate": 0.00018568571761675893,
"loss": 0.7028,
"step": 40
},
{
"epoch": 0.8770053475935828,
"grad_norm": 8.61646556854248,
"learning_rate": 0.00018474096312160864,
"loss": 0.5775,
"step": 41
},
{
"epoch": 0.8983957219251337,
"grad_norm": 10.86780071258545,
"learning_rate": 0.00018376858486299647,
"loss": 0.8106,
"step": 42
},
{
"epoch": 0.9197860962566845,
"grad_norm": 6.3408427238464355,
"learning_rate": 0.00018276889981568906,
"loss": 0.4089,
"step": 43
},
{
"epoch": 0.9411764705882353,
"grad_norm": 19.630786895751953,
"learning_rate": 0.00018174223385588917,
"loss": 1.1041,
"step": 44
},
{
"epoch": 0.9625668449197861,
"grad_norm": 10.352812767028809,
"learning_rate": 0.00018068892165500704,
"loss": 0.749,
"step": 45
},
{
"epoch": 0.983957219251337,
"grad_norm": 8.671784400939941,
"learning_rate": 0.00017960930657056438,
"loss": 0.6975,
"step": 46
},
{
"epoch": 1.0053475935828877,
"grad_norm": 10.529078483581543,
"learning_rate": 0.00017850374053426723,
"loss": 0.5922,
"step": 47
},
{
"epoch": 1.0267379679144386,
"grad_norm": 9.373391151428223,
"learning_rate": 0.00017737258393728364,
"loss": 0.729,
"step": 48
},
{
"epoch": 1.0267379679144386,
"eval_loss": 0.6926581859588623,
"eval_runtime": 17.8687,
"eval_samples_per_second": 16.789,
"eval_steps_per_second": 8.395,
"step": 48
},
{
"epoch": 1.0481283422459893,
"grad_norm": 18.76338005065918,
"learning_rate": 0.00017621620551276366,
"loss": 0.9321,
"step": 49
},
{
"epoch": 1.0695187165775402,
"grad_norm": 7.443489074707031,
"learning_rate": 0.00017503498221564025,
"loss": 0.5063,
"step": 50
},
{
"epoch": 1.0909090909090908,
"grad_norm": 8.333600044250488,
"learning_rate": 0.00017382929909974987,
"loss": 0.4377,
"step": 51
},
{
"epoch": 1.1122994652406417,
"grad_norm": 14.056548118591309,
"learning_rate": 0.0001725995491923131,
"loss": 1.2233,
"step": 52
},
{
"epoch": 1.0213903743315509,
"grad_norm": 13.999143600463867,
"learning_rate": 0.00017134613336581599,
"loss": 0.3174,
"step": 53
},
{
"epoch": 1.0427807486631016,
"grad_norm": 10.154765129089355,
"learning_rate": 0.00017006946020733425,
"loss": 0.162,
"step": 54
},
{
"epoch": 1.0641711229946524,
"grad_norm": 4.229214191436768,
"learning_rate": 0.00016876994588534234,
"loss": 0.1357,
"step": 55
},
{
"epoch": 1.085561497326203,
"grad_norm": 4.834641456604004,
"learning_rate": 0.0001674480140140514,
"loss": 0.1686,
"step": 56
},
{
"epoch": 1.106951871657754,
"grad_norm": 8.32540225982666,
"learning_rate": 0.00016610409551532005,
"loss": 0.2455,
"step": 57
},
{
"epoch": 1.1283422459893049,
"grad_norm": 5.407528400421143,
"learning_rate": 0.00016473862847818277,
"loss": 0.1116,
"step": 58
},
{
"epoch": 1.1497326203208555,
"grad_norm": 5.010147571563721,
"learning_rate": 0.0001633520580160424,
"loss": 0.1156,
"step": 59
},
{
"epoch": 1.1711229946524064,
"grad_norm": 6.022050380706787,
"learning_rate": 0.0001619448361215723,
"loss": 0.225,
"step": 60
},
{
"epoch": 1.1711229946524064,
"eval_loss": 0.7360826134681702,
"eval_runtime": 17.9845,
"eval_samples_per_second": 16.681,
"eval_steps_per_second": 8.34,
"step": 60
},
{
"epoch": 1.192513368983957,
"grad_norm": 5.873435974121094,
"learning_rate": 0.00016051742151937655,
"loss": 0.0572,
"step": 61
},
{
"epoch": 1.213903743315508,
"grad_norm": 10.556258201599121,
"learning_rate": 0.0001590702795164551,
"loss": 0.1261,
"step": 62
},
{
"epoch": 1.2352941176470589,
"grad_norm": 17.438634872436523,
"learning_rate": 0.00015760388185052398,
"loss": 0.382,
"step": 63
},
{
"epoch": 1.2566844919786098,
"grad_norm": 8.371882438659668,
"learning_rate": 0.00015611870653623825,
"loss": 0.228,
"step": 64
},
{
"epoch": 1.2780748663101604,
"grad_norm": 9.435052871704102,
"learning_rate": 0.0001546152377093697,
"loss": 0.1978,
"step": 65
},
{
"epoch": 1.299465240641711,
"grad_norm": 8.652084350585938,
"learning_rate": 0.0001530939654689887,
"loss": 0.1587,
"step": 66
},
{
"epoch": 1.320855614973262,
"grad_norm": 8.71427059173584,
"learning_rate": 0.00015155538571770218,
"loss": 0.0637,
"step": 67
},
{
"epoch": 1.3422459893048129,
"grad_norm": 5.761900901794434,
"learning_rate": 0.00015000000000000001,
"loss": 0.2027,
"step": 68
},
{
"epoch": 1.3636363636363638,
"grad_norm": 8.064814567565918,
"learning_rate": 0.00014842831533876195,
"loss": 0.1595,
"step": 69
},
{
"epoch": 1.3850267379679144,
"grad_norm": 7.962220668792725,
"learning_rate": 0.00014684084406997903,
"loss": 0.3658,
"step": 70
},
{
"epoch": 1.4064171122994653,
"grad_norm": 4.9641242027282715,
"learning_rate": 0.00014523810367574272,
"loss": 0.1368,
"step": 71
},
{
"epoch": 1.427807486631016,
"grad_norm": 7.027231216430664,
"learning_rate": 0.00014362061661555675,
"loss": 0.1688,
"step": 72
},
{
"epoch": 1.427807486631016,
"eval_loss": 0.8078603148460388,
"eval_runtime": 17.7522,
"eval_samples_per_second": 16.899,
"eval_steps_per_second": 8.45,
"step": 72
},
{
"epoch": 1.4491978609625669,
"grad_norm": 3.6481807231903076,
"learning_rate": 0.00014198891015602646,
"loss": 0.0844,
"step": 73
},
{
"epoch": 1.4705882352941178,
"grad_norm": 7.879701614379883,
"learning_rate": 0.00014034351619898088,
"loss": 0.3332,
"step": 74
},
{
"epoch": 1.4919786096256684,
"grad_norm": 5.169126987457275,
"learning_rate": 0.00013868497110808395,
"loss": 0.1301,
"step": 75
},
{
"epoch": 1.5133689839572193,
"grad_norm": 5.538336753845215,
"learning_rate": 0.00013701381553399145,
"loss": 0.145,
"step": 76
},
{
"epoch": 1.53475935828877,
"grad_norm": 25.778339385986328,
"learning_rate": 0.00013533059423811026,
"loss": 0.2704,
"step": 77
},
{
"epoch": 1.5561497326203209,
"grad_norm": 5.043937683105469,
"learning_rate": 0.0001336358559150175,
"loss": 0.1458,
"step": 78
},
{
"epoch": 1.5775401069518717,
"grad_norm": 7.204008102416992,
"learning_rate": 0.000131930153013598,
"loss": 0.1012,
"step": 79
},
{
"epoch": 1.5989304812834224,
"grad_norm": 8.367019653320312,
"learning_rate": 0.00013021404155695725,
"loss": 0.1889,
"step": 80
},
{
"epoch": 1.6203208556149733,
"grad_norm": 5.196023464202881,
"learning_rate": 0.00012848808096117,
"loss": 0.0881,
"step": 81
},
{
"epoch": 1.641711229946524,
"grad_norm": 8.067776679992676,
"learning_rate": 0.00012675283385292212,
"loss": 0.2948,
"step": 82
},
{
"epoch": 1.6631016042780749,
"grad_norm": 8.772676467895508,
"learning_rate": 0.0001250088658861063,
"loss": 0.154,
"step": 83
},
{
"epoch": 1.6844919786096257,
"grad_norm": 16.28201675415039,
"learning_rate": 0.00012325674555743106,
"loss": 0.3085,
"step": 84
},
{
"epoch": 1.6844919786096257,
"eval_loss": 0.780529797077179,
"eval_runtime": 17.4078,
"eval_samples_per_second": 17.234,
"eval_steps_per_second": 8.617,
"step": 84
},
{
"epoch": 1.7058823529411766,
"grad_norm": 8.888002395629883,
"learning_rate": 0.00012149704402110243,
"loss": 0.2488,
"step": 85
},
{
"epoch": 1.7272727272727273,
"grad_norm": 6.574310779571533,
"learning_rate": 0.00011973033490264001,
"loss": 0.1698,
"step": 86
},
{
"epoch": 1.748663101604278,
"grad_norm": 10.671408653259277,
"learning_rate": 0.00011795719411188718,
"loss": 0.1778,
"step": 87
},
{
"epoch": 1.7700534759358288,
"grad_norm": 6.570982933044434,
"learning_rate": 0.0001161781996552765,
"loss": 0.115,
"step": 88
},
{
"epoch": 1.7914438502673797,
"grad_norm": 11.171520233154297,
"learning_rate": 0.0001143939314474119,
"loss": 0.2032,
"step": 89
},
{
"epoch": 1.8128342245989306,
"grad_norm": 4.231898307800293,
"learning_rate": 0.00011260497112202895,
"loss": 0.0596,
"step": 90
},
{
"epoch": 1.8342245989304813,
"grad_norm": 5.200542449951172,
"learning_rate": 0.00011081190184239419,
"loss": 0.1294,
"step": 91
},
{
"epoch": 1.855614973262032,
"grad_norm": 6.871387958526611,
"learning_rate": 0.00010901530811120655,
"loss": 0.283,
"step": 92
},
{
"epoch": 1.8770053475935828,
"grad_norm": 4.964606285095215,
"learning_rate": 0.00010721577558006164,
"loss": 0.2673,
"step": 93
},
{
"epoch": 1.8983957219251337,
"grad_norm": 8.798148155212402,
"learning_rate": 0.00010541389085854176,
"loss": 0.2055,
"step": 94
},
{
"epoch": 1.9197860962566846,
"grad_norm": 5.59722375869751,
"learning_rate": 0.00010361024132299364,
"loss": 0.209,
"step": 95
},
{
"epoch": 1.9411764705882353,
"grad_norm": 8.256745338439941,
"learning_rate": 0.00010180541492505604,
"loss": 0.1079,
"step": 96
},
{
"epoch": 1.9411764705882353,
"eval_loss": 0.7684900760650635,
"eval_runtime": 17.746,
"eval_samples_per_second": 16.905,
"eval_steps_per_second": 8.453,
"step": 96
},
{
"epoch": 1.962566844919786,
"grad_norm": 4.481995582580566,
"learning_rate": 0.0001,
"loss": 0.1824,
"step": 97
},
{
"epoch": 1.9839572192513368,
"grad_norm": 3.852792263031006,
"learning_rate": 9.819458507494394e-05,
"loss": 0.124,
"step": 98
},
{
"epoch": 2.0053475935828877,
"grad_norm": 29.360998153686523,
"learning_rate": 9.638975867700638e-05,
"loss": 0.1844,
"step": 99
},
{
"epoch": 2.0267379679144386,
"grad_norm": 4.80811071395874,
"learning_rate": 9.458610914145826e-05,
"loss": 0.1347,
"step": 100
},
{
"epoch": 2.048128342245989,
"grad_norm": 17.53403091430664,
"learning_rate": 9.27842244199384e-05,
"loss": 0.3378,
"step": 101
},
{
"epoch": 2.06951871657754,
"grad_norm": 7.34214973449707,
"learning_rate": 9.098469188879349e-05,
"loss": 0.2045,
"step": 102
},
{
"epoch": 2.090909090909091,
"grad_norm": 4.968944072723389,
"learning_rate": 8.918809815760585e-05,
"loss": 0.2284,
"step": 103
},
{
"epoch": 2.0053475935828877,
"grad_norm": 4.202042579650879,
"learning_rate": 8.739502887797107e-05,
"loss": 0.1655,
"step": 104
},
{
"epoch": 2.0267379679144386,
"grad_norm": 3.561790704727173,
"learning_rate": 8.560606855258808e-05,
"loss": 0.0442,
"step": 105
},
{
"epoch": 2.0481283422459895,
"grad_norm": 3.8292624950408936,
"learning_rate": 8.382180034472353e-05,
"loss": 0.0821,
"step": 106
},
{
"epoch": 2.06951871657754,
"grad_norm": 2.1650640964508057,
"learning_rate": 8.204280588811283e-05,
"loss": 0.0384,
"step": 107
},
{
"epoch": 2.090909090909091,
"grad_norm": 1.6922334432601929,
"learning_rate": 8.026966509736001e-05,
"loss": 0.0342,
"step": 108
},
{
"epoch": 2.090909090909091,
"eval_loss": 0.7716657519340515,
"eval_runtime": 17.9156,
"eval_samples_per_second": 16.745,
"eval_steps_per_second": 8.373,
"step": 108
},
{
"epoch": 2.1122994652406417,
"grad_norm": 1.0654356479644775,
"learning_rate": 7.85029559788976e-05,
"loss": 0.0184,
"step": 109
},
{
"epoch": 2.1336898395721926,
"grad_norm": 3.1057019233703613,
"learning_rate": 7.674325444256899e-05,
"loss": 0.0417,
"step": 110
},
{
"epoch": 2.1550802139037435,
"grad_norm": 0.19042205810546875,
"learning_rate": 7.499113411389371e-05,
"loss": 0.0026,
"step": 111
},
{
"epoch": 2.176470588235294,
"grad_norm": 1.5116851329803467,
"learning_rate": 7.324716614707793e-05,
"loss": 0.0089,
"step": 112
},
{
"epoch": 2.197860962566845,
"grad_norm": 2.5151679515838623,
"learning_rate": 7.151191903883001e-05,
"loss": 0.0357,
"step": 113
},
{
"epoch": 2.2192513368983957,
"grad_norm": 2.838503837585449,
"learning_rate": 6.978595844304271e-05,
"loss": 0.0366,
"step": 114
},
{
"epoch": 2.2406417112299466,
"grad_norm": 3.835000514984131,
"learning_rate": 6.806984698640202e-05,
"loss": 0.1412,
"step": 115
},
{
"epoch": 2.2620320855614975,
"grad_norm": 3.4443538188934326,
"learning_rate": 6.636414408498249e-05,
"loss": 0.0707,
"step": 116
},
{
"epoch": 2.283422459893048,
"grad_norm": 2.701524496078491,
"learning_rate": 6.466940576188977e-05,
"loss": 0.0497,
"step": 117
},
{
"epoch": 2.304812834224599,
"grad_norm": 2.612593412399292,
"learning_rate": 6.298618446600856e-05,
"loss": 0.052,
"step": 118
},
{
"epoch": 2.3262032085561497,
"grad_norm": 4.986962795257568,
"learning_rate": 6.13150288919161e-05,
"loss": 0.1255,
"step": 119
},
{
"epoch": 2.3475935828877006,
"grad_norm": 1.8598374128341675,
"learning_rate": 5.965648380101916e-05,
"loss": 0.0309,
"step": 120
},
{
"epoch": 2.3475935828877006,
"eval_loss": 0.785007119178772,
"eval_runtime": 17.7923,
"eval_samples_per_second": 16.861,
"eval_steps_per_second": 8.431,
"step": 120
},
{
"epoch": 2.3689839572192515,
"grad_norm": 1.5813214778900146,
"learning_rate": 5.801108984397354e-05,
"loss": 0.0201,
"step": 121
},
{
"epoch": 2.3903743315508024,
"grad_norm": 0.13843385875225067,
"learning_rate": 5.6379383384443255e-05,
"loss": 0.0018,
"step": 122
},
{
"epoch": 2.411764705882353,
"grad_norm": 4.4155707359313965,
"learning_rate": 5.476189632425732e-05,
"loss": 0.0326,
"step": 123
},
{
"epoch": 2.4331550802139037,
"grad_norm": 3.5101325511932373,
"learning_rate": 5.3159155930021e-05,
"loss": 0.0259,
"step": 124
},
{
"epoch": 2.4545454545454546,
"grad_norm": 5.201532363891602,
"learning_rate": 5.1571684661238075e-05,
"loss": 0.0761,
"step": 125
},
{
"epoch": 2.4759358288770055,
"grad_norm": 2.48543119430542,
"learning_rate": 5.000000000000002e-05,
"loss": 0.0587,
"step": 126
},
{
"epoch": 2.497326203208556,
"grad_norm": 7.39755916595459,
"learning_rate": 4.844461428229782e-05,
"loss": 0.0391,
"step": 127
},
{
"epoch": 2.518716577540107,
"grad_norm": 4.151485443115234,
"learning_rate": 4.6906034531011346e-05,
"loss": 0.0982,
"step": 128
},
{
"epoch": 2.5401069518716577,
"grad_norm": 4.144845485687256,
"learning_rate": 4.53847622906303e-05,
"loss": 0.0707,
"step": 129
},
{
"epoch": 2.5614973262032086,
"grad_norm": 7.3682732582092285,
"learning_rate": 4.388129346376178e-05,
"loss": 0.0455,
"step": 130
},
{
"epoch": 2.5828877005347595,
"grad_norm": 4.947929382324219,
"learning_rate": 4.239611814947605e-05,
"loss": 0.033,
"step": 131
},
{
"epoch": 2.6042780748663104,
"grad_norm": 3.0208606719970703,
"learning_rate": 4.092972048354491e-05,
"loss": 0.0373,
"step": 132
},
{
"epoch": 2.6042780748663104,
"eval_loss": 0.776565432548523,
"eval_runtime": 17.3019,
"eval_samples_per_second": 17.339,
"eval_steps_per_second": 8.67,
"step": 132
},
{
"epoch": 2.625668449197861,
"grad_norm": 7.514610290527344,
"learning_rate": 3.948257848062351e-05,
"loss": 0.0323,
"step": 133
},
{
"epoch": 2.6470588235294117,
"grad_norm": 1.8352607488632202,
"learning_rate": 3.80551638784277e-05,
"loss": 0.043,
"step": 134
},
{
"epoch": 2.6684491978609626,
"grad_norm": 3.525506019592285,
"learning_rate": 3.664794198395764e-05,
"loss": 0.0643,
"step": 135
},
{
"epoch": 2.6898395721925135,
"grad_norm": 5.074891567230225,
"learning_rate": 3.5261371521817244e-05,
"loss": 0.0658,
"step": 136
},
{
"epoch": 2.711229946524064,
"grad_norm": 3.6220922470092773,
"learning_rate": 3.3895904484679984e-05,
"loss": 0.1535,
"step": 137
},
{
"epoch": 2.732620320855615,
"grad_norm": 3.9044840335845947,
"learning_rate": 3.2551985985948616e-05,
"loss": 0.0572,
"step": 138
}
],
"logging_steps": 1,
"max_steps": 184,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 46,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3759516646572032e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}