{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.937062937062937,
"eval_steps": 18,
"global_step": 355,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013986013986013986,
"grad_norm": 6.746792793273926,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.8294,
"step": 1
},
{
"epoch": 0.013986013986013986,
"eval_loss": 0.8744672536849976,
"eval_runtime": 36.967,
"eval_samples_per_second": 17.367,
"eval_steps_per_second": 2.191,
"step": 1
},
{
"epoch": 0.027972027972027972,
"grad_norm": 6.9825944900512695,
"learning_rate": 6.666666666666667e-07,
"loss": 0.8694,
"step": 2
},
{
"epoch": 0.04195804195804196,
"grad_norm": 7.01480770111084,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.861,
"step": 3
},
{
"epoch": 0.055944055944055944,
"grad_norm": 7.156968593597412,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.9027,
"step": 4
},
{
"epoch": 0.06993006993006994,
"grad_norm": 6.0878005027771,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.8577,
"step": 5
},
{
"epoch": 0.08391608391608392,
"grad_norm": 5.853216648101807,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.8168,
"step": 6
},
{
"epoch": 0.0979020979020979,
"grad_norm": 4.9973978996276855,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.788,
"step": 7
},
{
"epoch": 0.11188811188811189,
"grad_norm": 4.611128330230713,
"learning_rate": 2.666666666666667e-06,
"loss": 0.7959,
"step": 8
},
{
"epoch": 0.1258741258741259,
"grad_norm": 3.1312103271484375,
"learning_rate": 3e-06,
"loss": 0.7374,
"step": 9
},
{
"epoch": 0.13986013986013987,
"grad_norm": 2.9217381477355957,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7329,
"step": 10
},
{
"epoch": 0.15384615384615385,
"grad_norm": 2.5225424766540527,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.6905,
"step": 11
},
{
"epoch": 0.16783216783216784,
"grad_norm": 2.8658440113067627,
"learning_rate": 4.000000000000001e-06,
"loss": 0.702,
"step": 12
},
{
"epoch": 0.18181818181818182,
"grad_norm": 2.6459388732910156,
"learning_rate": 4.333333333333334e-06,
"loss": 0.6659,
"step": 13
},
{
"epoch": 0.1958041958041958,
"grad_norm": 2.4082329273223877,
"learning_rate": 4.666666666666667e-06,
"loss": 0.6732,
"step": 14
},
{
"epoch": 0.2097902097902098,
"grad_norm": 1.8969792127609253,
"learning_rate": 5e-06,
"loss": 0.626,
"step": 15
},
{
"epoch": 0.22377622377622378,
"grad_norm": 1.705984354019165,
"learning_rate": 5.333333333333334e-06,
"loss": 0.6357,
"step": 16
},
{
"epoch": 0.23776223776223776,
"grad_norm": 1.5265748500823975,
"learning_rate": 5.666666666666667e-06,
"loss": 0.6409,
"step": 17
},
{
"epoch": 0.2517482517482518,
"grad_norm": 1.3590223789215088,
"learning_rate": 6e-06,
"loss": 0.6128,
"step": 18
},
{
"epoch": 0.2517482517482518,
"eval_loss": 0.6171885132789612,
"eval_runtime": 35.4252,
"eval_samples_per_second": 18.123,
"eval_steps_per_second": 2.287,
"step": 18
},
{
"epoch": 0.26573426573426573,
"grad_norm": 1.3791933059692383,
"learning_rate": 6.333333333333333e-06,
"loss": 0.6181,
"step": 19
},
{
"epoch": 0.27972027972027974,
"grad_norm": 1.398863434791565,
"learning_rate": 6.666666666666667e-06,
"loss": 0.593,
"step": 20
},
{
"epoch": 0.2937062937062937,
"grad_norm": 1.1556097269058228,
"learning_rate": 7e-06,
"loss": 0.6274,
"step": 21
},
{
"epoch": 0.3076923076923077,
"grad_norm": 1.094146728515625,
"learning_rate": 7.333333333333333e-06,
"loss": 0.6113,
"step": 22
},
{
"epoch": 0.32167832167832167,
"grad_norm": 1.2191824913024902,
"learning_rate": 7.666666666666667e-06,
"loss": 0.6111,
"step": 23
},
{
"epoch": 0.3356643356643357,
"grad_norm": 0.9371815323829651,
"learning_rate": 8.000000000000001e-06,
"loss": 0.5895,
"step": 24
},
{
"epoch": 0.34965034965034963,
"grad_norm": 0.8173602223396301,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6083,
"step": 25
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.0984693765640259,
"learning_rate": 8.666666666666668e-06,
"loss": 0.6072,
"step": 26
},
{
"epoch": 0.3776223776223776,
"grad_norm": 1.0279648303985596,
"learning_rate": 9e-06,
"loss": 0.6001,
"step": 27
},
{
"epoch": 0.3916083916083916,
"grad_norm": 0.9129611253738403,
"learning_rate": 9.333333333333334e-06,
"loss": 0.5644,
"step": 28
},
{
"epoch": 0.40559440559440557,
"grad_norm": 0.832744300365448,
"learning_rate": 9.666666666666667e-06,
"loss": 0.5716,
"step": 29
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.8230701684951782,
"learning_rate": 1e-05,
"loss": 0.59,
"step": 30
},
{
"epoch": 0.43356643356643354,
"grad_norm": 0.8343638181686401,
"learning_rate": 9.999766401714795e-06,
"loss": 0.5876,
"step": 31
},
{
"epoch": 0.44755244755244755,
"grad_norm": 0.7421298623085022,
"learning_rate": 9.999065628686439e-06,
"loss": 0.5959,
"step": 32
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.7471378445625305,
"learning_rate": 9.997897746394684e-06,
"loss": 0.5804,
"step": 33
},
{
"epoch": 0.4755244755244755,
"grad_norm": 0.8300222754478455,
"learning_rate": 9.996262863965651e-06,
"loss": 0.5726,
"step": 34
},
{
"epoch": 0.48951048951048953,
"grad_norm": 0.7753379940986633,
"learning_rate": 9.994161134161635e-06,
"loss": 0.6034,
"step": 35
},
{
"epoch": 0.5034965034965035,
"grad_norm": 0.8331146240234375,
"learning_rate": 9.991592753366822e-06,
"loss": 0.5953,
"step": 36
},
{
"epoch": 0.5034965034965035,
"eval_loss": 0.5805296897888184,
"eval_runtime": 35.0435,
"eval_samples_per_second": 18.32,
"eval_steps_per_second": 2.311,
"step": 36
},
{
"epoch": 0.5174825174825175,
"grad_norm": 0.7212592959403992,
"learning_rate": 9.988557961568956e-06,
"loss": 0.5639,
"step": 37
},
{
"epoch": 0.5314685314685315,
"grad_norm": 0.796295166015625,
"learning_rate": 9.985057042336898e-06,
"loss": 0.5771,
"step": 38
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.8607219457626343,
"learning_rate": 9.981090322794145e-06,
"loss": 0.5763,
"step": 39
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.861869215965271,
"learning_rate": 9.976658173588244e-06,
"loss": 0.5729,
"step": 40
},
{
"epoch": 0.5734265734265734,
"grad_norm": 0.7538414597511292,
"learning_rate": 9.97176100885618e-06,
"loss": 0.571,
"step": 41
},
{
"epoch": 0.5874125874125874,
"grad_norm": 0.7197255492210388,
"learning_rate": 9.966399286185666e-06,
"loss": 0.5421,
"step": 42
},
{
"epoch": 0.6013986013986014,
"grad_norm": 0.7522373199462891,
"learning_rate": 9.960573506572391e-06,
"loss": 0.5603,
"step": 43
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.8054993152618408,
"learning_rate": 9.954284214373204e-06,
"loss": 0.5723,
"step": 44
},
{
"epoch": 0.6293706293706294,
"grad_norm": 0.639057457447052,
"learning_rate": 9.947531997255256e-06,
"loss": 0.5483,
"step": 45
},
{
"epoch": 0.6433566433566433,
"grad_norm": 0.6742891073226929,
"learning_rate": 9.940317486141084e-06,
"loss": 0.5845,
"step": 46
},
{
"epoch": 0.6573426573426573,
"grad_norm": 0.6605424880981445,
"learning_rate": 9.932641355149655e-06,
"loss": 0.5639,
"step": 47
},
{
"epoch": 0.6713286713286714,
"grad_norm": 0.7080878019332886,
"learning_rate": 9.924504321533387e-06,
"loss": 0.5851,
"step": 48
},
{
"epoch": 0.6853146853146853,
"grad_norm": 0.6235523223876953,
"learning_rate": 9.915907145611117e-06,
"loss": 0.574,
"step": 49
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.6567375063896179,
"learning_rate": 9.906850630697068e-06,
"loss": 0.5705,
"step": 50
},
{
"epoch": 0.7132867132867133,
"grad_norm": 0.6011090278625488,
"learning_rate": 9.89733562302578e-06,
"loss": 0.574,
"step": 51
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.6043576002120972,
"learning_rate": 9.887363011673046e-06,
"loss": 0.5849,
"step": 52
},
{
"epoch": 0.7412587412587412,
"grad_norm": 0.7147118449211121,
"learning_rate": 9.876933728472826e-06,
"loss": 0.5584,
"step": 53
},
{
"epoch": 0.7552447552447552,
"grad_norm": 0.6480064392089844,
"learning_rate": 9.866048747930194e-06,
"loss": 0.5494,
"step": 54
},
{
"epoch": 0.7552447552447552,
"eval_loss": 0.5708758234977722,
"eval_runtime": 34.9921,
"eval_samples_per_second": 18.347,
"eval_steps_per_second": 2.315,
"step": 54
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.6563164591789246,
"learning_rate": 9.854709087130261e-06,
"loss": 0.5491,
"step": 55
},
{
"epoch": 0.7832167832167832,
"grad_norm": 0.6024691462516785,
"learning_rate": 9.842915805643156e-06,
"loss": 0.5589,
"step": 56
},
{
"epoch": 0.7972027972027972,
"grad_norm": 0.6186073422431946,
"learning_rate": 9.830670005425012e-06,
"loss": 0.5567,
"step": 57
},
{
"epoch": 0.8111888111888111,
"grad_norm": 0.6993715763092041,
"learning_rate": 9.817972830715003e-06,
"loss": 0.5534,
"step": 58
},
{
"epoch": 0.8251748251748252,
"grad_norm": 0.6327122449874878,
"learning_rate": 9.804825467928423e-06,
"loss": 0.5709,
"step": 59
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.6156756281852722,
"learning_rate": 9.791229145545832e-06,
"loss": 0.5445,
"step": 60
},
{
"epoch": 0.8531468531468531,
"grad_norm": 0.7704036235809326,
"learning_rate": 9.777185133998268e-06,
"loss": 0.5743,
"step": 61
},
{
"epoch": 0.8671328671328671,
"grad_norm": 0.5839553475379944,
"learning_rate": 9.76269474554854e-06,
"loss": 0.5536,
"step": 62
},
{
"epoch": 0.8811188811188811,
"grad_norm": 0.6872385144233704,
"learning_rate": 9.747759334168602e-06,
"loss": 0.5627,
"step": 63
},
{
"epoch": 0.8951048951048951,
"grad_norm": 0.663074791431427,
"learning_rate": 9.73238029541305e-06,
"loss": 0.5643,
"step": 64
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.7018933296203613,
"learning_rate": 9.716559066288716e-06,
"loss": 0.5729,
"step": 65
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.7574678659439087,
"learning_rate": 9.7002971251204e-06,
"loss": 0.5813,
"step": 66
},
{
"epoch": 0.9370629370629371,
"grad_norm": 0.6293357014656067,
"learning_rate": 9.683595991412725e-06,
"loss": 0.5819,
"step": 67
},
{
"epoch": 0.951048951048951,
"grad_norm": 0.6524381041526794,
"learning_rate": 9.666457225708175e-06,
"loss": 0.5856,
"step": 68
},
{
"epoch": 0.965034965034965,
"grad_norm": 0.8389201164245605,
"learning_rate": 9.648882429441258e-06,
"loss": 0.5587,
"step": 69
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.6339119672775269,
"learning_rate": 9.630873244788884e-06,
"loss": 0.5655,
"step": 70
},
{
"epoch": 0.993006993006993,
"grad_norm": 0.6689181923866272,
"learning_rate": 9.612431354516912e-06,
"loss": 0.574,
"step": 71
},
{
"epoch": 1.0,
"grad_norm": 0.7970519661903381,
"learning_rate": 9.593558481822923e-06,
"loss": 0.5541,
"step": 72
},
{
"epoch": 1.0,
"eval_loss": 0.5664608478546143,
"eval_runtime": 34.9634,
"eval_samples_per_second": 18.362,
"eval_steps_per_second": 2.317,
"step": 72
},
{
"epoch": 1.013986013986014,
"grad_norm": 0.6805382370948792,
"learning_rate": 9.574256390175192e-06,
"loss": 0.5175,
"step": 73
},
{
"epoch": 1.027972027972028,
"grad_norm": 0.6378044486045837,
"learning_rate": 9.554526883147926e-06,
"loss": 0.5323,
"step": 74
},
{
"epoch": 1.0419580419580419,
"grad_norm": 0.6296578645706177,
"learning_rate": 9.534371804252727e-06,
"loss": 0.5197,
"step": 75
},
{
"epoch": 1.055944055944056,
"grad_norm": 0.6116400361061096,
"learning_rate": 9.513793036766345e-06,
"loss": 0.504,
"step": 76
},
{
"epoch": 1.06993006993007,
"grad_norm": 0.6288114190101624,
"learning_rate": 9.492792503554695e-06,
"loss": 0.5314,
"step": 77
},
{
"epoch": 1.083916083916084,
"grad_norm": 0.6576322913169861,
"learning_rate": 9.4713721668932e-06,
"loss": 0.5437,
"step": 78
},
{
"epoch": 1.097902097902098,
"grad_norm": 0.5930177569389343,
"learning_rate": 9.44953402828342e-06,
"loss": 0.5213,
"step": 79
},
{
"epoch": 1.1118881118881119,
"grad_norm": 0.7437406778335571,
"learning_rate": 9.427280128266049e-06,
"loss": 0.5441,
"step": 80
},
{
"epoch": 1.1258741258741258,
"grad_norm": 0.7347025275230408,
"learning_rate": 9.404612546230244e-06,
"loss": 0.5078,
"step": 81
},
{
"epoch": 1.1398601398601398,
"grad_norm": 0.6133800148963928,
"learning_rate": 9.381533400219319e-06,
"loss": 0.5129,
"step": 82
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.8068645000457764,
"learning_rate": 9.358044846732848e-06,
"loss": 0.5252,
"step": 83
},
{
"epoch": 1.167832167832168,
"grad_norm": 0.7470645904541016,
"learning_rate": 9.334149080525154e-06,
"loss": 0.5251,
"step": 84
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.6085983514785767,
"learning_rate": 9.309848334400247e-06,
"loss": 0.5119,
"step": 85
},
{
"epoch": 1.1958041958041958,
"grad_norm": 0.6427562236785889,
"learning_rate": 9.285144879003173e-06,
"loss": 0.5327,
"step": 86
},
{
"epoch": 1.2097902097902098,
"grad_norm": 0.5992908477783203,
"learning_rate": 9.26004102260786e-06,
"loss": 0.5174,
"step": 87
},
{
"epoch": 1.2237762237762237,
"grad_norm": 0.6650605201721191,
"learning_rate": 9.23453911090143e-06,
"loss": 0.541,
"step": 88
},
{
"epoch": 1.2377622377622377,
"grad_norm": 0.6733765602111816,
"learning_rate": 9.208641526765024e-06,
"loss": 0.4968,
"step": 89
},
{
"epoch": 1.2517482517482517,
"grad_norm": 0.5896586775779724,
"learning_rate": 9.182350690051134e-06,
"loss": 0.5111,
"step": 90
},
{
"epoch": 1.2517482517482517,
"eval_loss": 0.5681217312812805,
"eval_runtime": 34.9547,
"eval_samples_per_second": 18.367,
"eval_steps_per_second": 2.317,
"step": 90
},
{
"epoch": 1.2657342657342658,
"grad_norm": 0.5879291892051697,
"learning_rate": 9.155669057357515e-06,
"loss": 0.5124,
"step": 91
},
{
"epoch": 1.2797202797202798,
"grad_norm": 0.6704349517822266,
"learning_rate": 9.12859912179762e-06,
"loss": 0.5264,
"step": 92
},
{
"epoch": 1.2937062937062938,
"grad_norm": 0.7005125284194946,
"learning_rate": 9.101143412767665e-06,
"loss": 0.5426,
"step": 93
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.5738447904586792,
"learning_rate": 9.073304495710267e-06,
"loss": 0.5057,
"step": 94
},
{
"epoch": 1.3216783216783217,
"grad_norm": 0.6039765477180481,
"learning_rate": 9.045084971874738e-06,
"loss": 0.5106,
"step": 95
},
{
"epoch": 1.3356643356643356,
"grad_norm": 0.6626608967781067,
"learning_rate": 9.016487478074032e-06,
"loss": 0.5231,
"step": 96
},
{
"epoch": 1.3496503496503496,
"grad_norm": 0.607319176197052,
"learning_rate": 8.987514686438353e-06,
"loss": 0.5373,
"step": 97
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.6294829249382019,
"learning_rate": 8.95816930416548e-06,
"loss": 0.5478,
"step": 98
},
{
"epoch": 1.3776223776223775,
"grad_norm": 0.5931101441383362,
"learning_rate": 8.928454073267801e-06,
"loss": 0.5183,
"step": 99
},
{
"epoch": 1.3916083916083917,
"grad_norm": 0.5525672435760498,
"learning_rate": 8.898371770316113e-06,
"loss": 0.5049,
"step": 100
},
{
"epoch": 1.4055944055944056,
"grad_norm": 0.5554185509681702,
"learning_rate": 8.867925206180166e-06,
"loss": 0.5329,
"step": 101
},
{
"epoch": 1.4195804195804196,
"grad_norm": 0.6104192137718201,
"learning_rate": 8.837117225766033e-06,
"loss": 0.5421,
"step": 102
},
{
"epoch": 1.4335664335664335,
"grad_norm": 0.5591093897819519,
"learning_rate": 8.805950707750268e-06,
"loss": 0.5434,
"step": 103
},
{
"epoch": 1.4475524475524475,
"grad_norm": 0.5589428544044495,
"learning_rate": 8.774428564310939e-06,
"loss": 0.5159,
"step": 104
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.580699622631073,
"learning_rate": 8.742553740855507e-06,
"loss": 0.5143,
"step": 105
},
{
"epoch": 1.4755244755244754,
"grad_norm": 0.6007757186889648,
"learning_rate": 8.710329215745612e-06,
"loss": 0.5066,
"step": 106
},
{
"epoch": 1.4895104895104896,
"grad_norm": 0.6713395118713379,
"learning_rate": 8.677758000018777e-06,
"loss": 0.5318,
"step": 107
},
{
"epoch": 1.5034965034965035,
"grad_norm": 0.5536379814147949,
"learning_rate": 8.644843137107058e-06,
"loss": 0.5159,
"step": 108
},
{
"epoch": 1.5034965034965035,
"eval_loss": 0.5661691427230835,
"eval_runtime": 35.3668,
"eval_samples_per_second": 18.153,
"eval_steps_per_second": 2.29,
"step": 108
},
{
"epoch": 1.5174825174825175,
"grad_norm": 0.645210325717926,
"learning_rate": 8.61158770255267e-06,
"loss": 0.5312,
"step": 109
},
{
"epoch": 1.5314685314685315,
"grad_norm": 0.601094126701355,
"learning_rate": 8.577994803720605e-06,
"loss": 0.5394,
"step": 110
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.5418203473091125,
"learning_rate": 8.544067579508292e-06,
"loss": 0.5264,
"step": 111
},
{
"epoch": 1.5594405594405596,
"grad_norm": 0.5513077974319458,
"learning_rate": 8.509809200052286e-06,
"loss": 0.5269,
"step": 112
},
{
"epoch": 1.5734265734265733,
"grad_norm": 0.6063372492790222,
"learning_rate": 8.475222866432065e-06,
"loss": 0.5199,
"step": 113
},
{
"epoch": 1.5874125874125875,
"grad_norm": 0.5637122988700867,
"learning_rate": 8.440311810370921e-06,
"loss": 0.5342,
"step": 114
},
{
"epoch": 1.6013986013986012,
"grad_norm": 0.5762498378753662,
"learning_rate": 8.405079293933986e-06,
"loss": 0.5419,
"step": 115
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.557772159576416,
"learning_rate": 8.36952860922343e-06,
"loss": 0.5217,
"step": 116
},
{
"epoch": 1.6293706293706294,
"grad_norm": 0.6382875442504883,
"learning_rate": 8.333663078070845e-06,
"loss": 0.5366,
"step": 117
},
{
"epoch": 1.6433566433566433,
"grad_norm": 0.5209150910377502,
"learning_rate": 8.297486051726864e-06,
"loss": 0.5087,
"step": 118
},
{
"epoch": 1.6573426573426573,
"grad_norm": 0.5415475964546204,
"learning_rate": 8.26100091054801e-06,
"loss": 0.5026,
"step": 119
},
{
"epoch": 1.6713286713286712,
"grad_norm": 0.6667906641960144,
"learning_rate": 8.224211063680854e-06,
"loss": 0.5224,
"step": 120
},
{
"epoch": 1.6853146853146854,
"grad_norm": 0.573965311050415,
"learning_rate": 8.18711994874345e-06,
"loss": 0.538,
"step": 121
},
{
"epoch": 1.6993006993006992,
"grad_norm": 0.6206014156341553,
"learning_rate": 8.149731031504136e-06,
"loss": 0.5161,
"step": 122
},
{
"epoch": 1.7132867132867133,
"grad_norm": 0.6324427127838135,
"learning_rate": 8.112047805557693e-06,
"loss": 0.5407,
"step": 123
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.5460613965988159,
"learning_rate": 8.074073791998907e-06,
"loss": 0.5238,
"step": 124
},
{
"epoch": 1.7412587412587412,
"grad_norm": 0.5684161186218262,
"learning_rate": 8.035812539093557e-06,
"loss": 0.5166,
"step": 125
},
{
"epoch": 1.7552447552447552,
"grad_norm": 0.6114190816879272,
"learning_rate": 7.997267621946871e-06,
"loss": 0.5212,
"step": 126
},
{
"epoch": 1.7552447552447552,
"eval_loss": 0.5644441843032837,
"eval_runtime": 34.8941,
"eval_samples_per_second": 18.399,
"eval_steps_per_second": 2.321,
"step": 126
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.5791452527046204,
"learning_rate": 7.958442642169469e-06,
"loss": 0.5219,
"step": 127
},
{
"epoch": 1.7832167832167833,
"grad_norm": 0.5814895033836365,
"learning_rate": 7.919341227540828e-06,
"loss": 0.5492,
"step": 128
},
{
"epoch": 1.797202797202797,
"grad_norm": 0.5562170147895813,
"learning_rate": 7.879967031670313e-06,
"loss": 0.5065,
"step": 129
},
{
"epoch": 1.8111888111888113,
"grad_norm": 0.5666476488113403,
"learning_rate": 7.84032373365578e-06,
"loss": 0.508,
"step": 130
},
{
"epoch": 1.8251748251748252,
"grad_norm": 0.6123917102813721,
"learning_rate": 7.800415037739802e-06,
"loss": 0.5245,
"step": 131
},
{
"epoch": 1.8391608391608392,
"grad_norm": 0.6137180924415588,
"learning_rate": 7.760244672963548e-06,
"loss": 0.5281,
"step": 132
},
{
"epoch": 1.8531468531468531,
"grad_norm": 0.5444206595420837,
"learning_rate": 7.719816392818354e-06,
"loss": 0.496,
"step": 133
},
{
"epoch": 1.867132867132867,
"grad_norm": 0.5935954451560974,
"learning_rate": 7.679133974894984e-06,
"loss": 0.5164,
"step": 134
},
{
"epoch": 1.8811188811188813,
"grad_norm": 0.568263828754425,
"learning_rate": 7.638201220530664e-06,
"loss": 0.509,
"step": 135
},
{
"epoch": 1.895104895104895,
"grad_norm": 0.641503095626831,
"learning_rate": 7.597021954453887e-06,
"loss": 0.5389,
"step": 136
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.5866712927818298,
"learning_rate": 7.555600024427028e-06,
"loss": 0.5163,
"step": 137
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.559259831905365,
"learning_rate": 7.513939300886816e-06,
"loss": 0.5074,
"step": 138
},
{
"epoch": 1.937062937062937,
"grad_norm": 0.5635555386543274,
"learning_rate": 7.472043676582685e-06,
"loss": 0.5184,
"step": 139
},
{
"epoch": 1.951048951048951,
"grad_norm": 0.6236100196838379,
"learning_rate": 7.42991706621303e-06,
"loss": 0.5162,
"step": 140
},
{
"epoch": 1.965034965034965,
"grad_norm": 0.60297691822052,
"learning_rate": 7.387563406059433e-06,
"loss": 0.5123,
"step": 141
},
{
"epoch": 1.9790209790209792,
"grad_norm": 0.5734803080558777,
"learning_rate": 7.344986653618844e-06,
"loss": 0.5281,
"step": 142
},
{
"epoch": 1.993006993006993,
"grad_norm": 0.561177134513855,
"learning_rate": 7.302190787233808e-06,
"loss": 0.5256,
"step": 143
},
{
"epoch": 2.0,
"grad_norm": 0.6918484568595886,
"learning_rate": 7.259179805720726e-06,
"loss": 0.4956,
"step": 144
},
{
"epoch": 2.0,
"eval_loss": 0.5634886622428894,
"eval_runtime": 34.1505,
"eval_samples_per_second": 18.799,
"eval_steps_per_second": 2.372,
"step": 144
},
{
"epoch": 2.013986013986014,
"grad_norm": 0.6467083096504211,
"learning_rate": 7.215957727996208e-06,
"loss": 0.4757,
"step": 145
},
{
"epoch": 2.027972027972028,
"grad_norm": 0.628153920173645,
"learning_rate": 7.17252859270155e-06,
"loss": 0.4701,
"step": 146
},
{
"epoch": 2.041958041958042,
"grad_norm": 0.6287585496902466,
"learning_rate": 7.128896457825364e-06,
"loss": 0.4334,
"step": 147
},
{
"epoch": 2.055944055944056,
"grad_norm": 0.5704949498176575,
"learning_rate": 7.085065400324407e-06,
"loss": 0.4723,
"step": 148
},
{
"epoch": 2.06993006993007,
"grad_norm": 0.6293634176254272,
"learning_rate": 7.041039515742626e-06,
"loss": 0.4875,
"step": 149
},
{
"epoch": 2.0839160839160837,
"grad_norm": 0.7220337390899658,
"learning_rate": 6.9968229178284775e-06,
"loss": 0.4809,
"step": 150
},
{
"epoch": 2.097902097902098,
"grad_norm": 0.5713090896606445,
"learning_rate": 6.952419738150546e-06,
"loss": 0.4998,
"step": 151
},
{
"epoch": 2.111888111888112,
"grad_norm": 0.6713567972183228,
"learning_rate": 6.9078341257114765e-06,
"loss": 0.4837,
"step": 152
},
{
"epoch": 2.125874125874126,
"grad_norm": 0.6542858481407166,
"learning_rate": 6.863070246560319e-06,
"loss": 0.4798,
"step": 153
},
{
"epoch": 2.13986013986014,
"grad_norm": 0.5555688738822937,
"learning_rate": 6.818132283403236e-06,
"loss": 0.4593,
"step": 154
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.5947204232215881,
"learning_rate": 6.773024435212678e-06,
"loss": 0.4831,
"step": 155
},
{
"epoch": 2.167832167832168,
"grad_norm": 0.6230157613754272,
"learning_rate": 6.7277509168350445e-06,
"loss": 0.4634,
"step": 156
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.5586286783218384,
"learning_rate": 6.6823159585968355e-06,
"loss": 0.4803,
"step": 157
},
{
"epoch": 2.195804195804196,
"grad_norm": 0.5558333396911621,
"learning_rate": 6.636723805909384e-06,
"loss": 0.4734,
"step": 158
},
{
"epoch": 2.20979020979021,
"grad_norm": 0.5960513949394226,
"learning_rate": 6.590978718872166e-06,
"loss": 0.4746,
"step": 159
},
{
"epoch": 2.2237762237762237,
"grad_norm": 0.5779184103012085,
"learning_rate": 6.545084971874738e-06,
"loss": 0.4499,
"step": 160
},
{
"epoch": 2.237762237762238,
"grad_norm": 0.5827864408493042,
"learning_rate": 6.499046853197338e-06,
"loss": 0.4826,
"step": 161
},
{
"epoch": 2.2517482517482517,
"grad_norm": 0.6769295930862427,
"learning_rate": 6.452868664610197e-06,
"loss": 0.4797,
"step": 162
},
{
"epoch": 2.2517482517482517,
"eval_loss": 0.5764052271842957,
"eval_runtime": 34.051,
"eval_samples_per_second": 18.854,
"eval_steps_per_second": 2.379,
"step": 162
},
{
"epoch": 2.265734265734266,
"grad_norm": 0.5850751996040344,
"learning_rate": 6.406554720971583e-06,
"loss": 0.4829,
"step": 163
},
{
"epoch": 2.2797202797202796,
"grad_norm": 0.5925103425979614,
"learning_rate": 6.3601093498246215e-06,
"loss": 0.4936,
"step": 164
},
{
"epoch": 2.2937062937062938,
"grad_norm": 0.5747277140617371,
"learning_rate": 6.313536890992935e-06,
"loss": 0.4686,
"step": 165
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.6141413450241089,
"learning_rate": 6.266841696175132e-06,
"loss": 0.4659,
"step": 166
},
{
"epoch": 2.3216783216783217,
"grad_norm": 0.5214844942092896,
"learning_rate": 6.220028128538188e-06,
"loss": 0.4714,
"step": 167
},
{
"epoch": 2.335664335664336,
"grad_norm": 0.6260507106781006,
"learning_rate": 6.173100562309751e-06,
"loss": 0.4731,
"step": 168
},
{
"epoch": 2.3496503496503496,
"grad_norm": 0.6246528625488281,
"learning_rate": 6.1260633823694224e-06,
"loss": 0.4575,
"step": 169
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.5592030882835388,
"learning_rate": 6.078920983839032e-06,
"loss": 0.4293,
"step": 170
},
{
"epoch": 2.3776223776223775,
"grad_norm": 0.5436908602714539,
"learning_rate": 6.031677771671962e-06,
"loss": 0.4821,
"step": 171
},
{
"epoch": 2.3916083916083917,
"grad_norm": 0.5873638987541199,
"learning_rate": 5.984338160241552e-06,
"loss": 0.4755,
"step": 172
},
{
"epoch": 2.4055944055944054,
"grad_norm": 0.6056978106498718,
"learning_rate": 5.936906572928625e-06,
"loss": 0.479,
"step": 173
},
{
"epoch": 2.4195804195804196,
"grad_norm": 0.5452414751052856,
"learning_rate": 5.889387441708162e-06,
"loss": 0.4545,
"step": 174
},
{
"epoch": 2.4335664335664333,
"grad_norm": 0.5708940625190735,
"learning_rate": 5.841785206735192e-06,
"loss": 0.4706,
"step": 175
},
{
"epoch": 2.4475524475524475,
"grad_norm": 0.5819888114929199,
"learning_rate": 5.794104315929904e-06,
"loss": 0.4608,
"step": 176
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.5468575358390808,
"learning_rate": 5.746349224562021e-06,
"loss": 0.4696,
"step": 177
},
{
"epoch": 2.4755244755244754,
"grad_norm": 0.6171605587005615,
"learning_rate": 5.698524394834531e-06,
"loss": 0.4809,
"step": 178
},
{
"epoch": 2.4895104895104896,
"grad_norm": 0.6046556234359741,
"learning_rate": 5.650634295466717e-06,
"loss": 0.4727,
"step": 179
},
{
"epoch": 2.5034965034965033,
"grad_norm": 0.5517058968544006,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.4728,
"step": 180
},
{
"epoch": 2.5034965034965033,
"eval_loss": 0.5757314562797546,
"eval_runtime": 34.5495,
"eval_samples_per_second": 18.582,
"eval_steps_per_second": 2.344,
"step": 180
},
{
"epoch": 2.5174825174825175,
"grad_norm": 0.5916588306427002,
"learning_rate": 5.554676192762891e-06,
"loss": 0.4738,
"step": 181
},
{
"epoch": 2.5314685314685317,
"grad_norm": 0.596782386302948,
"learning_rate": 5.506617155686177e-06,
"loss": 0.4725,
"step": 182
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.5784814357757568,
"learning_rate": 5.458510780649932e-06,
"loss": 0.4743,
"step": 183
},
{
"epoch": 2.5594405594405596,
"grad_norm": 0.5162186622619629,
"learning_rate": 5.4103615626808426e-06,
"loss": 0.4501,
"step": 184
},
{
"epoch": 2.5734265734265733,
"grad_norm": 0.5629183053970337,
"learning_rate": 5.362174000808813e-06,
"loss": 0.4631,
"step": 185
},
{
"epoch": 2.5874125874125875,
"grad_norm": 0.5455092191696167,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.4839,
"step": 186
},
{
"epoch": 2.6013986013986012,
"grad_norm": 0.6234388947486877,
"learning_rate": 5.265701858968944e-06,
"loss": 0.4729,
"step": 187
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.5270193815231323,
"learning_rate": 5.217426293291869e-06,
"loss": 0.4767,
"step": 188
},
{
"epoch": 2.629370629370629,
"grad_norm": 0.5291939973831177,
"learning_rate": 5.169130411451083e-06,
"loss": 0.4659,
"step": 189
},
{
"epoch": 2.6433566433566433,
"grad_norm": 0.5210967063903809,
"learning_rate": 5.120818726180662e-06,
"loss": 0.4532,
"step": 190
},
{
"epoch": 2.6573426573426575,
"grad_norm": 0.5697853565216064,
"learning_rate": 5.072495751691338e-06,
"loss": 0.4669,
"step": 191
},
{
"epoch": 2.6713286713286712,
"grad_norm": 0.4967118203639984,
"learning_rate": 5.024166003248703e-06,
"loss": 0.4777,
"step": 192
},
{
"epoch": 2.6853146853146854,
"grad_norm": 0.5514243245124817,
"learning_rate": 4.9758339967512995e-06,
"loss": 0.4689,
"step": 193
},
{
"epoch": 2.699300699300699,
"grad_norm": 0.5476483702659607,
"learning_rate": 4.927504248308663e-06,
"loss": 0.4898,
"step": 194
},
{
"epoch": 2.7132867132867133,
"grad_norm": 0.5073778033256531,
"learning_rate": 4.87918127381934e-06,
"loss": 0.4462,
"step": 195
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.5061259865760803,
"learning_rate": 4.830869588548918e-06,
"loss": 0.4811,
"step": 196
},
{
"epoch": 2.7412587412587412,
"grad_norm": 0.532632052898407,
"learning_rate": 4.782573706708133e-06,
"loss": 0.4514,
"step": 197
},
{
"epoch": 2.755244755244755,
"grad_norm": 0.5079967379570007,
"learning_rate": 4.734298141031057e-06,
"loss": 0.4706,
"step": 198
},
{
"epoch": 2.755244755244755,
"eval_loss": 0.5748186111450195,
"eval_runtime": 34.6547,
"eval_samples_per_second": 18.526,
"eval_steps_per_second": 2.337,
"step": 198
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.5450592637062073,
"learning_rate": 4.686047402353433e-06,
"loss": 0.4717,
"step": 199
},
{
"epoch": 2.7832167832167833,
"grad_norm": 0.4929758906364441,
"learning_rate": 4.637825999191189e-06,
"loss": 0.469,
"step": 200
},
{
"epoch": 2.797202797202797,
"grad_norm": 0.514842689037323,
"learning_rate": 4.589638437319157e-06,
"loss": 0.4848,
"step": 201
},
{
"epoch": 2.8111888111888113,
"grad_norm": 0.5259736776351929,
"learning_rate": 4.541489219350069e-06,
"loss": 0.4676,
"step": 202
},
{
"epoch": 2.825174825174825,
"grad_norm": 0.571843683719635,
"learning_rate": 4.493382844313826e-06,
"loss": 0.482,
"step": 203
},
{
"epoch": 2.839160839160839,
"grad_norm": 0.49216270446777344,
"learning_rate": 4.445323807237112e-06,
"loss": 0.479,
"step": 204
},
{
"epoch": 2.8531468531468533,
"grad_norm": 0.5383098721504211,
"learning_rate": 4.397316598723385e-06,
"loss": 0.4517,
"step": 205
},
{
"epoch": 2.867132867132867,
"grad_norm": 0.5011985898017883,
"learning_rate": 4.349365704533285e-06,
"loss": 0.4678,
"step": 206
},
{
"epoch": 2.8811188811188813,
"grad_norm": 0.5291906595230103,
"learning_rate": 4.301475605165471e-06,
"loss": 0.4717,
"step": 207
},
{
"epoch": 2.895104895104895,
"grad_norm": 0.5500873923301697,
"learning_rate": 4.25365077543798e-06,
"loss": 0.4572,
"step": 208
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.5690264105796814,
"learning_rate": 4.205895684070099e-06,
"loss": 0.4675,
"step": 209
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.4746716022491455,
"learning_rate": 4.158214793264808e-06,
"loss": 0.4579,
"step": 210
},
{
"epoch": 2.937062937062937,
"grad_norm": 0.5113067626953125,
"learning_rate": 4.1106125582918385e-06,
"loss": 0.5104,
"step": 211
},
{
"epoch": 2.951048951048951,
"grad_norm": 0.5272907018661499,
"learning_rate": 4.063093427071376e-06,
"loss": 0.4532,
"step": 212
},
{
"epoch": 2.965034965034965,
"grad_norm": 0.5059399008750916,
"learning_rate": 4.01566183975845e-06,
"loss": 0.4555,
"step": 213
},
{
"epoch": 2.979020979020979,
"grad_norm": 0.4909096658229828,
"learning_rate": 3.968322228328041e-06,
"loss": 0.4785,
"step": 214
},
{
"epoch": 2.993006993006993,
"grad_norm": 0.5192479491233826,
"learning_rate": 3.92107901616097e-06,
"loss": 0.4477,
"step": 215
},
{
"epoch": 3.0,
"grad_norm": 0.7363195419311523,
"learning_rate": 3.873936617630578e-06,
"loss": 0.4927,
"step": 216
},
{
"epoch": 3.0,
"eval_loss": 0.5740084052085876,
"eval_runtime": 34.8551,
"eval_samples_per_second": 18.419,
"eval_steps_per_second": 2.324,
"step": 216
},
{
"epoch": 3.013986013986014,
"grad_norm": 0.5987377762794495,
"learning_rate": 3.82689943769025e-06,
"loss": 0.4246,
"step": 217
},
{
"epoch": 3.027972027972028,
"grad_norm": 0.589948832988739,
"learning_rate": 3.779971871461813e-06,
"loss": 0.4367,
"step": 218
},
{
"epoch": 3.041958041958042,
"grad_norm": 0.5003005862236023,
"learning_rate": 3.7331583038248688e-06,
"loss": 0.4346,
"step": 219
},
{
"epoch": 3.055944055944056,
"grad_norm": 0.528349757194519,
"learning_rate": 3.6864631090070656e-06,
"loss": 0.3993,
"step": 220
},
{
"epoch": 3.06993006993007,
"grad_norm": 0.5285301208496094,
"learning_rate": 3.639890650175379e-06,
"loss": 0.419,
"step": 221
},
{
"epoch": 3.0839160839160837,
"grad_norm": 0.5721102356910706,
"learning_rate": 3.593445279028418e-06,
"loss": 0.4328,
"step": 222
},
{
"epoch": 3.097902097902098,
"grad_norm": 0.5271673202514648,
"learning_rate": 3.5471313353898056e-06,
"loss": 0.4252,
"step": 223
},
{
"epoch": 3.111888111888112,
"grad_norm": 0.5354319214820862,
"learning_rate": 3.5009531468026646e-06,
"loss": 0.4367,
"step": 224
},
{
"epoch": 3.125874125874126,
"grad_norm": 0.5849824547767639,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.4263,
"step": 225
},
{
"epoch": 3.13986013986014,
"grad_norm": 0.6300305128097534,
"learning_rate": 3.409021281127835e-06,
"loss": 0.4331,
"step": 226
},
{
"epoch": 3.1538461538461537,
"grad_norm": 0.5985769033432007,
"learning_rate": 3.3632761940906167e-06,
"loss": 0.4316,
"step": 227
},
{
"epoch": 3.167832167832168,
"grad_norm": 0.5028027296066284,
"learning_rate": 3.3176840414031653e-06,
"loss": 0.4243,
"step": 228
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.5299258232116699,
"learning_rate": 3.2722490831649568e-06,
"loss": 0.4166,
"step": 229
},
{
"epoch": 3.195804195804196,
"grad_norm": 0.5425248742103577,
"learning_rate": 3.226975564787322e-06,
"loss": 0.4389,
"step": 230
},
{
"epoch": 3.20979020979021,
"grad_norm": 0.5929123759269714,
"learning_rate": 3.181867716596765e-06,
"loss": 0.4288,
"step": 231
},
{
"epoch": 3.2237762237762237,
"grad_norm": 0.5462735891342163,
"learning_rate": 3.1369297534396823e-06,
"loss": 0.4434,
"step": 232
},
{
"epoch": 3.237762237762238,
"grad_norm": 0.4862322211265564,
"learning_rate": 3.092165874288525e-06,
"loss": 0.4133,
"step": 233
},
{
"epoch": 3.2517482517482517,
"grad_norm": 0.48885804414749146,
"learning_rate": 3.0475802618494564e-06,
"loss": 0.4426,
"step": 234
},
{
"epoch": 3.2517482517482517,
"eval_loss": 0.5924859046936035,
"eval_runtime": 34.7085,
"eval_samples_per_second": 18.497,
"eval_steps_per_second": 2.334,
"step": 234
},
{
"epoch": 3.265734265734266,
"grad_norm": 0.4652189314365387,
"learning_rate": 3.0031770821715233e-06,
"loss": 0.4189,
"step": 235
},
{
"epoch": 3.2797202797202796,
"grad_norm": 0.5389134883880615,
"learning_rate": 2.9589604842573762e-06,
"loss": 0.4226,
"step": 236
},
{
"epoch": 3.2937062937062938,
"grad_norm": 0.507276177406311,
"learning_rate": 2.914934599675594e-06,
"loss": 0.4084,
"step": 237
},
{
"epoch": 3.3076923076923075,
"grad_norm": 0.4876704216003418,
"learning_rate": 2.871103542174637e-06,
"loss": 0.4256,
"step": 238
},
{
"epoch": 3.3216783216783217,
"grad_norm": 0.48441073298454285,
"learning_rate": 2.827471407298451e-06,
"loss": 0.4297,
"step": 239
},
{
"epoch": 3.335664335664336,
"grad_norm": 0.4634881317615509,
"learning_rate": 2.7840422720037943e-06,
"loss": 0.4227,
"step": 240
},
{
"epoch": 3.3496503496503496,
"grad_norm": 0.49520549178123474,
"learning_rate": 2.7408201942792755e-06,
"loss": 0.414,
"step": 241
},
{
"epoch": 3.3636363636363638,
"grad_norm": 0.4892767369747162,
"learning_rate": 2.697809212766195e-06,
"loss": 0.4326,
"step": 242
},
{
"epoch": 3.3776223776223775,
"grad_norm": 0.4968920052051544,
"learning_rate": 2.655013346381158e-06,
"loss": 0.4327,
"step": 243
},
{
"epoch": 3.3916083916083917,
"grad_norm": 0.4823973476886749,
"learning_rate": 2.612436593940568e-06,
"loss": 0.4329,
"step": 244
},
{
"epoch": 3.4055944055944054,
"grad_norm": 0.4838135540485382,
"learning_rate": 2.57008293378697e-06,
"loss": 0.4206,
"step": 245
},
{
"epoch": 3.4195804195804196,
"grad_norm": 0.47422581911087036,
"learning_rate": 2.5279563234173177e-06,
"loss": 0.4336,
"step": 246
},
{
"epoch": 3.4335664335664333,
"grad_norm": 0.4846055209636688,
"learning_rate": 2.4860606991131857e-06,
"loss": 0.4184,
"step": 247
},
{
"epoch": 3.4475524475524475,
"grad_norm": 0.5305242538452148,
"learning_rate": 2.444399975572974e-06,
"loss": 0.4394,
"step": 248
},
{
"epoch": 3.4615384615384617,
"grad_norm": 0.487332820892334,
"learning_rate": 2.402978045546114e-06,
"loss": 0.4033,
"step": 249
},
{
"epoch": 3.4755244755244754,
"grad_norm": 0.4706343114376068,
"learning_rate": 2.3617987794693358e-06,
"loss": 0.4408,
"step": 250
},
{
"epoch": 3.4895104895104896,
"grad_norm": 0.503103494644165,
"learning_rate": 2.320866025105016e-06,
"loss": 0.4166,
"step": 251
},
{
"epoch": 3.5034965034965033,
"grad_norm": 0.5077600479125977,
"learning_rate": 2.2801836071816476e-06,
"loss": 0.4423,
"step": 252
},
{
"epoch": 3.5034965034965033,
"eval_loss": 0.5952551364898682,
"eval_runtime": 33.5546,
"eval_samples_per_second": 19.133,
"eval_steps_per_second": 2.414,
"step": 252
},
{
"epoch": 3.5174825174825175,
"grad_norm": 0.48870253562927246,
"learning_rate": 2.2397553270364546e-06,
"loss": 0.4241,
"step": 253
},
{
"epoch": 3.5314685314685317,
"grad_norm": 0.4966093897819519,
"learning_rate": 2.1995849622602017e-06,
"loss": 0.4396,
"step": 254
},
{
"epoch": 3.5454545454545454,
"grad_norm": 0.4564977288246155,
"learning_rate": 2.159676266344222e-06,
"loss": 0.4223,
"step": 255
},
{
"epoch": 3.5594405594405596,
"grad_norm": 0.46915507316589355,
"learning_rate": 2.120032968329687e-06,
"loss": 0.4283,
"step": 256
},
{
"epoch": 3.5734265734265733,
"grad_norm": 0.49805694818496704,
"learning_rate": 2.0806587724591725e-06,
"loss": 0.4382,
"step": 257
},
{
"epoch": 3.5874125874125875,
"grad_norm": 0.48657479882240295,
"learning_rate": 2.0415573578305343e-06,
"loss": 0.4378,
"step": 258
},
{
"epoch": 3.6013986013986012,
"grad_norm": 0.46977299451828003,
"learning_rate": 2.0027323780531312e-06,
"loss": 0.4224,
"step": 259
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.49343907833099365,
"learning_rate": 1.9641874609064443e-06,
"loss": 0.4088,
"step": 260
},
{
"epoch": 3.629370629370629,
"grad_norm": 0.4801478385925293,
"learning_rate": 1.9259262080010938e-06,
"loss": 0.419,
"step": 261
},
{
"epoch": 3.6433566433566433,
"grad_norm": 0.4632829427719116,
"learning_rate": 1.887952194442309e-06,
"loss": 0.4185,
"step": 262
},
{
"epoch": 3.6573426573426575,
"grad_norm": 0.4722610414028168,
"learning_rate": 1.8502689684958664e-06,
"loss": 0.4223,
"step": 263
},
{
"epoch": 3.6713286713286712,
"grad_norm": 0.46521317958831787,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.4311,
"step": 264
},
{
"epoch": 3.6853146853146854,
"grad_norm": 0.49360647797584534,
"learning_rate": 1.7757889363191484e-06,
"loss": 0.4336,
"step": 265
},
{
"epoch": 3.699300699300699,
"grad_norm": 0.46490150690078735,
"learning_rate": 1.738999089451991e-06,
"loss": 0.41,
"step": 266
},
{
"epoch": 3.7132867132867133,
"grad_norm": 0.47419989109039307,
"learning_rate": 1.7025139482731385e-06,
"loss": 0.4489,
"step": 267
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.4471936821937561,
"learning_rate": 1.6663369219291558e-06,
"loss": 0.4075,
"step": 268
},
{
"epoch": 3.7412587412587412,
"grad_norm": 0.4871998727321625,
"learning_rate": 1.6304713907765713e-06,
"loss": 0.4138,
"step": 269
},
{
"epoch": 3.755244755244755,
"grad_norm": 0.4558921754360199,
"learning_rate": 1.5949207060660138e-06,
"loss": 0.4209,
"step": 270
},
{
"epoch": 3.755244755244755,
"eval_loss": 0.5941651463508606,
"eval_runtime": 34.8033,
"eval_samples_per_second": 18.447,
"eval_steps_per_second": 2.327,
"step": 270
},
{
"epoch": 3.769230769230769,
"grad_norm": 0.43444135785102844,
"learning_rate": 1.55968818962908e-06,
"loss": 0.4186,
"step": 271
},
{
"epoch": 3.7832167832167833,
"grad_norm": 0.47602659463882446,
"learning_rate": 1.5247771335679372e-06,
"loss": 0.4138,
"step": 272
},
{
"epoch": 3.797202797202797,
"grad_norm": 0.4794568121433258,
"learning_rate": 1.4901907999477167e-06,
"loss": 0.4512,
"step": 273
},
{
"epoch": 3.8111888111888113,
"grad_norm": 0.47370994091033936,
"learning_rate": 1.4559324204917102e-06,
"loss": 0.4446,
"step": 274
},
{
"epoch": 3.825174825174825,
"grad_norm": 0.4493069052696228,
"learning_rate": 1.4220051962793952e-06,
"loss": 0.4316,
"step": 275
},
{
"epoch": 3.839160839160839,
"grad_norm": 0.4439810812473297,
"learning_rate": 1.3884122974473307e-06,
"loss": 0.4276,
"step": 276
},
{
"epoch": 3.8531468531468533,
"grad_norm": 0.44139519333839417,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.427,
"step": 277
},
{
"epoch": 3.867132867132867,
"grad_norm": 0.45054903626441956,
"learning_rate": 1.3222419999812248e-06,
"loss": 0.4356,
"step": 278
},
{
"epoch": 3.8811188811188813,
"grad_norm": 0.44140151143074036,
"learning_rate": 1.2896707842543898e-06,
"loss": 0.4287,
"step": 279
},
{
"epoch": 3.895104895104895,
"grad_norm": 0.4277818202972412,
"learning_rate": 1.257446259144494e-06,
"loss": 0.4298,
"step": 280
},
{
"epoch": 3.909090909090909,
"grad_norm": 0.4403057098388672,
"learning_rate": 1.225571435689062e-06,
"loss": 0.4185,
"step": 281
},
{
"epoch": 3.9230769230769234,
"grad_norm": 0.4724678099155426,
"learning_rate": 1.1940492922497337e-06,
"loss": 0.4465,
"step": 282
},
{
"epoch": 3.937062937062937,
"grad_norm": 0.47128820419311523,
"learning_rate": 1.1628827742339688e-06,
"loss": 0.4126,
"step": 283
},
{
"epoch": 3.951048951048951,
"grad_norm": 0.4331970512866974,
"learning_rate": 1.1320747938198356e-06,
"loss": 0.4105,
"step": 284
},
{
"epoch": 3.965034965034965,
"grad_norm": 0.4537077844142914,
"learning_rate": 1.1016282296838887e-06,
"loss": 0.4257,
"step": 285
},
{
"epoch": 3.979020979020979,
"grad_norm": 0.46981024742126465,
"learning_rate": 1.0715459267321998e-06,
"loss": 0.4336,
"step": 286
},
{
"epoch": 3.993006993006993,
"grad_norm": 0.4497096538543701,
"learning_rate": 1.0418306958345214e-06,
"loss": 0.4326,
"step": 287
},
{
"epoch": 4.0,
"grad_norm": 0.6176419258117676,
"learning_rate": 1.0124853135616475e-06,
"loss": 0.4261,
"step": 288
},
{
"epoch": 4.0,
"eval_loss": 0.594137966632843,
"eval_runtime": 35.3287,
"eval_samples_per_second": 18.172,
"eval_steps_per_second": 2.293,
"step": 288
},
{
"epoch": 4.013986013986014,
"grad_norm": 0.48881927132606506,
"learning_rate": 9.835125219259694e-07,
"loss": 0.4126,
"step": 289
},
{
"epoch": 4.027972027972028,
"grad_norm": 0.47744905948638916,
"learning_rate": 9.549150281252633e-07,
"loss": 0.3887,
"step": 290
},
{
"epoch": 4.041958041958042,
"grad_norm": 0.4749980568885803,
"learning_rate": 9.266955042897357e-07,
"loss": 0.4085,
"step": 291
},
{
"epoch": 4.055944055944056,
"grad_norm": 0.4653206169605255,
"learning_rate": 8.988565872323362e-07,
"loss": 0.3949,
"step": 292
},
{
"epoch": 4.06993006993007,
"grad_norm": 0.44160446524620056,
"learning_rate": 8.714008782023797e-07,
"loss": 0.4049,
"step": 293
},
{
"epoch": 4.083916083916084,
"grad_norm": 0.43797171115875244,
"learning_rate": 8.443309426424862e-07,
"loss": 0.4038,
"step": 294
},
{
"epoch": 4.0979020979020975,
"grad_norm": 0.4569723904132843,
"learning_rate": 8.176493099488664e-07,
"loss": 0.3956,
"step": 295
},
{
"epoch": 4.111888111888112,
"grad_norm": 0.47445249557495117,
"learning_rate": 7.913584732349788e-07,
"loss": 0.4107,
"step": 296
},
{
"epoch": 4.125874125874126,
"grad_norm": 0.46384716033935547,
"learning_rate": 7.654608890985709e-07,
"loss": 0.3895,
"step": 297
},
{
"epoch": 4.13986013986014,
"grad_norm": 0.47651711106300354,
"learning_rate": 7.399589773921412e-07,
"loss": 0.3859,
"step": 298
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.4623275697231293,
"learning_rate": 7.148551209968279e-07,
"loss": 0.394,
"step": 299
},
{
"epoch": 4.1678321678321675,
"grad_norm": 0.4649985432624817,
"learning_rate": 6.901516655997536e-07,
"loss": 0.4108,
"step": 300
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.4691464304924011,
"learning_rate": 6.658509194748463e-07,
"loss": 0.3626,
"step": 301
},
{
"epoch": 4.195804195804196,
"grad_norm": 0.48455217480659485,
"learning_rate": 6.419551532671542e-07,
"loss": 0.4172,
"step": 302
},
{
"epoch": 4.20979020979021,
"grad_norm": 0.482030987739563,
"learning_rate": 6.184665997806832e-07,
"loss": 0.4038,
"step": 303
},
{
"epoch": 4.223776223776224,
"grad_norm": 0.4398139715194702,
"learning_rate": 5.953874537697573e-07,
"loss": 0.4033,
"step": 304
},
{
"epoch": 4.2377622377622375,
"grad_norm": 0.46925652027130127,
"learning_rate": 5.727198717339511e-07,
"loss": 0.4091,
"step": 305
},
{
"epoch": 4.251748251748252,
"grad_norm": 0.46952134370803833,
"learning_rate": 5.504659717165812e-07,
"loss": 0.4111,
"step": 306
},
{
"epoch": 4.251748251748252,
"eval_loss": 0.6070981025695801,
"eval_runtime": 35.5097,
"eval_samples_per_second": 18.08,
"eval_steps_per_second": 2.281,
"step": 306
},
{
"epoch": 4.265734265734266,
"grad_norm": 0.45535174012184143,
"learning_rate": 5.286278331068018e-07,
"loss": 0.4128,
"step": 307
},
{
"epoch": 4.27972027972028,
"grad_norm": 0.4438033998012543,
"learning_rate": 5.072074964453055e-07,
"loss": 0.4052,
"step": 308
},
{
"epoch": 4.293706293706293,
"grad_norm": 0.4887377917766571,
"learning_rate": 4.862069632336558e-07,
"loss": 0.3894,
"step": 309
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.4616340100765228,
"learning_rate": 4.6562819574727304e-07,
"loss": 0.4242,
"step": 310
},
{
"epoch": 4.321678321678322,
"grad_norm": 0.44037091732025146,
"learning_rate": 4.454731168520754e-07,
"loss": 0.4052,
"step": 311
},
{
"epoch": 4.335664335664336,
"grad_norm": 0.4455097019672394,
"learning_rate": 4.257436098248091e-07,
"loss": 0.3882,
"step": 312
},
{
"epoch": 4.34965034965035,
"grad_norm": 0.47457605600357056,
"learning_rate": 4.064415181770787e-07,
"loss": 0.4102,
"step": 313
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.4474296271800995,
"learning_rate": 3.875686454830885e-07,
"loss": 0.3866,
"step": 314
},
{
"epoch": 4.3776223776223775,
"grad_norm": 0.44111815094947815,
"learning_rate": 3.691267552111183e-07,
"loss": 0.4091,
"step": 315
},
{
"epoch": 4.391608391608392,
"grad_norm": 0.46066638827323914,
"learning_rate": 3.511175705587433e-07,
"loss": 0.422,
"step": 316
},
{
"epoch": 4.405594405594406,
"grad_norm": 0.4345090389251709,
"learning_rate": 3.3354277429182626e-07,
"loss": 0.3882,
"step": 317
},
{
"epoch": 4.41958041958042,
"grad_norm": 0.462768018245697,
"learning_rate": 3.164040085872755e-07,
"loss": 0.4125,
"step": 318
},
{
"epoch": 4.433566433566433,
"grad_norm": 0.4575034976005554,
"learning_rate": 2.997028748796016e-07,
"loss": 0.4138,
"step": 319
},
{
"epoch": 4.4475524475524475,
"grad_norm": 0.43728622794151306,
"learning_rate": 2.834409337112842e-07,
"loss": 0.4133,
"step": 320
},
{
"epoch": 4.461538461538462,
"grad_norm": 0.4533195495605469,
"learning_rate": 2.676197045869511e-07,
"loss": 0.4067,
"step": 321
},
{
"epoch": 4.475524475524476,
"grad_norm": 0.44842609763145447,
"learning_rate": 2.522406658313997e-07,
"loss": 0.4042,
"step": 322
},
{
"epoch": 4.489510489510489,
"grad_norm": 0.4315699636936188,
"learning_rate": 2.3730525445146146e-07,
"loss": 0.3969,
"step": 323
},
{
"epoch": 4.503496503496503,
"grad_norm": 0.43630900979042053,
"learning_rate": 2.2281486600173207e-07,
"loss": 0.3907,
"step": 324
},
{
"epoch": 4.503496503496503,
"eval_loss": 0.6088654398918152,
"eval_runtime": 35.0812,
"eval_samples_per_second": 18.3,
"eval_steps_per_second": 2.309,
"step": 324
},
{
"epoch": 4.5174825174825175,
"grad_norm": 0.43661531805992126,
"learning_rate": 2.0877085445416889e-07,
"loss": 0.4079,
"step": 325
},
{
"epoch": 4.531468531468532,
"grad_norm": 0.43984201550483704,
"learning_rate": 1.9517453207157865e-07,
"loss": 0.4071,
"step": 326
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.43304693698883057,
"learning_rate": 1.8202716928499842e-07,
"loss": 0.4,
"step": 327
},
{
"epoch": 4.559440559440559,
"grad_norm": 0.44190627336502075,
"learning_rate": 1.6932999457498823e-07,
"loss": 0.3936,
"step": 328
},
{
"epoch": 4.573426573426573,
"grad_norm": 0.46403783559799194,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.4142,
"step": 329
},
{
"epoch": 4.5874125874125875,
"grad_norm": 0.448397159576416,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.411,
"step": 330
},
{
"epoch": 4.601398601398602,
"grad_norm": 0.4263162910938263,
"learning_rate": 1.3395125206980774e-07,
"loss": 0.3991,
"step": 331
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.4367568790912628,
"learning_rate": 1.230662715271741e-07,
"loss": 0.4144,
"step": 332
},
{
"epoch": 4.629370629370629,
"grad_norm": 0.4405047297477722,
"learning_rate": 1.1263698832695513e-07,
"loss": 0.3935,
"step": 333
},
{
"epoch": 4.643356643356643,
"grad_norm": 0.4359452426433563,
"learning_rate": 1.0266437697422026e-07,
"loss": 0.3913,
"step": 334
},
{
"epoch": 4.6573426573426575,
"grad_norm": 0.44500768184661865,
"learning_rate": 9.314936930293283e-08,
"loss": 0.4102,
"step": 335
},
{
"epoch": 4.671328671328672,
"grad_norm": 0.46006131172180176,
"learning_rate": 8.40928543888836e-08,
"loss": 0.4138,
"step": 336
},
{
"epoch": 4.685314685314685,
"grad_norm": 0.44435447454452515,
"learning_rate": 7.549567846661388e-08,
"loss": 0.4185,
"step": 337
},
{
"epoch": 4.699300699300699,
"grad_norm": 0.43049922585487366,
"learning_rate": 6.735864485034493e-08,
"loss": 0.3946,
"step": 338
},
{
"epoch": 4.713286713286713,
"grad_norm": 0.4270278513431549,
"learning_rate": 5.968251385891744e-08,
"loss": 0.3969,
"step": 339
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.4480164647102356,
"learning_rate": 5.246800274474439e-08,
"loss": 0.4005,
"step": 340
},
{
"epoch": 4.741258741258742,
"grad_norm": 0.4490266740322113,
"learning_rate": 4.571578562679757e-08,
"loss": 0.3884,
"step": 341
},
{
"epoch": 4.755244755244755,
"grad_norm": 0.4623181223869324,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.4169,
"step": 342
},
{
"epoch": 4.755244755244755,
"eval_loss": 0.6084015965461731,
"eval_runtime": 34.879,
"eval_samples_per_second": 18.406,
"eval_steps_per_second": 2.322,
"step": 342
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.4283956289291382,
"learning_rate": 3.360071381433516e-08,
"loss": 0.3969,
"step": 343
},
{
"epoch": 4.783216783216783,
"grad_norm": 0.4356008470058441,
"learning_rate": 2.823899114382078e-08,
"loss": 0.4027,
"step": 344
},
{
"epoch": 4.7972027972027975,
"grad_norm": 0.44547533988952637,
"learning_rate": 2.3341826411756863e-08,
"loss": 0.3987,
"step": 345
},
{
"epoch": 4.811188811188811,
"grad_norm": 0.4299108386039734,
"learning_rate": 1.8909677205856682e-08,
"loss": 0.4017,
"step": 346
},
{
"epoch": 4.825174825174825,
"grad_norm": 0.4200840890407562,
"learning_rate": 1.494295766310161e-08,
"loss": 0.3885,
"step": 347
},
{
"epoch": 4.839160839160839,
"grad_norm": 0.43688181042671204,
"learning_rate": 1.1442038431044856e-08,
"loss": 0.4119,
"step": 348
},
{
"epoch": 4.853146853146853,
"grad_norm": 0.4302099943161011,
"learning_rate": 8.407246633178601e-09,
"loss": 0.3843,
"step": 349
},
{
"epoch": 4.867132867132867,
"grad_norm": 0.45412999391555786,
"learning_rate": 5.838865838366792e-09,
"loss": 0.4009,
"step": 350
},
{
"epoch": 4.881118881118881,
"grad_norm": 0.43274399638175964,
"learning_rate": 3.737136034349109e-09,
"loss": 0.3951,
"step": 351
},
{
"epoch": 4.895104895104895,
"grad_norm": 0.4244266450405121,
"learning_rate": 2.102253605316684e-09,
"loss": 0.4059,
"step": 352
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.4323265552520752,
"learning_rate": 9.343713135623323e-10,
"loss": 0.3963,
"step": 353
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.4487632215023041,
"learning_rate": 2.335982852064156e-10,
"loss": 0.3937,
"step": 354
},
{
"epoch": 4.937062937062937,
"grad_norm": 0.4363052546977997,
"learning_rate": 0.0,
"loss": 0.405,
"step": 355
}
],
"logging_steps": 1,
"max_steps": 355,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 36,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.28345287429718e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}