{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995095635115253,
"eval_steps": 500,
"global_step": 1019,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000980872976949485,
"grad_norm": 23.71615728078218,
"learning_rate": 9.803921568627452e-08,
"loss": 1.3172,
"step": 1
},
{
"epoch": 0.004904364884747425,
"grad_norm": 21.50955102466688,
"learning_rate": 4.901960784313725e-07,
"loss": 1.3156,
"step": 5
},
{
"epoch": 0.00980872976949485,
"grad_norm": 8.347835856988212,
"learning_rate": 9.80392156862745e-07,
"loss": 1.2118,
"step": 10
},
{
"epoch": 0.014713094654242276,
"grad_norm": 9.70215349989816,
"learning_rate": 1.4705882352941177e-06,
"loss": 1.0495,
"step": 15
},
{
"epoch": 0.0196174595389897,
"grad_norm": 2.935709350105428,
"learning_rate": 1.96078431372549e-06,
"loss": 0.9169,
"step": 20
},
{
"epoch": 0.024521824423737126,
"grad_norm": 2.3710459957915373,
"learning_rate": 2.450980392156863e-06,
"loss": 0.8718,
"step": 25
},
{
"epoch": 0.029426189308484552,
"grad_norm": 2.217748118460408,
"learning_rate": 2.9411764705882355e-06,
"loss": 0.8404,
"step": 30
},
{
"epoch": 0.03433055419323198,
"grad_norm": 2.2456090605656223,
"learning_rate": 3.431372549019608e-06,
"loss": 0.8207,
"step": 35
},
{
"epoch": 0.0392349190779794,
"grad_norm": 2.1968777765698135,
"learning_rate": 3.92156862745098e-06,
"loss": 0.8027,
"step": 40
},
{
"epoch": 0.04413928396272683,
"grad_norm": 2.3343295421758956,
"learning_rate": 4.411764705882353e-06,
"loss": 0.7888,
"step": 45
},
{
"epoch": 0.04904364884747425,
"grad_norm": 2.3845961870372956,
"learning_rate": 4.901960784313726e-06,
"loss": 0.7703,
"step": 50
},
{
"epoch": 0.053948013732221675,
"grad_norm": 2.3216256302247933,
"learning_rate": 5.392156862745098e-06,
"loss": 0.7541,
"step": 55
},
{
"epoch": 0.058852378616969105,
"grad_norm": 2.378678579603692,
"learning_rate": 5.882352941176471e-06,
"loss": 0.749,
"step": 60
},
{
"epoch": 0.06375674350171653,
"grad_norm": 2.355836921671654,
"learning_rate": 6.372549019607843e-06,
"loss": 0.7258,
"step": 65
},
{
"epoch": 0.06866110838646396,
"grad_norm": 2.480621419103395,
"learning_rate": 6.862745098039216e-06,
"loss": 0.7132,
"step": 70
},
{
"epoch": 0.07356547327121138,
"grad_norm": 2.196577695253739,
"learning_rate": 7.352941176470589e-06,
"loss": 0.7168,
"step": 75
},
{
"epoch": 0.0784698381559588,
"grad_norm": 2.418840940827789,
"learning_rate": 7.84313725490196e-06,
"loss": 0.7051,
"step": 80
},
{
"epoch": 0.08337420304070622,
"grad_norm": 2.4161115457147577,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6993,
"step": 85
},
{
"epoch": 0.08827856792545366,
"grad_norm": 2.3049037332530804,
"learning_rate": 8.823529411764707e-06,
"loss": 0.6948,
"step": 90
},
{
"epoch": 0.09318293281020108,
"grad_norm": 2.345395326875072,
"learning_rate": 9.31372549019608e-06,
"loss": 0.6859,
"step": 95
},
{
"epoch": 0.0980872976949485,
"grad_norm": 2.299729975358926,
"learning_rate": 9.803921568627451e-06,
"loss": 0.6836,
"step": 100
},
{
"epoch": 0.10299166257969593,
"grad_norm": 2.2610515025654117,
"learning_rate": 9.999735917410952e-06,
"loss": 0.6794,
"step": 105
},
{
"epoch": 0.10789602746444335,
"grad_norm": 2.450010148482251,
"learning_rate": 9.998122180387662e-06,
"loss": 0.6765,
"step": 110
},
{
"epoch": 0.11280039234919079,
"grad_norm": 2.221120129726642,
"learning_rate": 9.995041891820093e-06,
"loss": 0.6746,
"step": 115
},
{
"epoch": 0.11770475723393821,
"grad_norm": 2.1057039809456377,
"learning_rate": 9.990495955528073e-06,
"loss": 0.6644,
"step": 120
},
{
"epoch": 0.12260912211868563,
"grad_norm": 2.090893956936151,
"learning_rate": 9.984485705382538e-06,
"loss": 0.6695,
"step": 125
},
{
"epoch": 0.12751348700343307,
"grad_norm": 2.0602352081070543,
"learning_rate": 9.977012904914133e-06,
"loss": 0.6519,
"step": 130
},
{
"epoch": 0.13241785188818048,
"grad_norm": 2.0932998186794927,
"learning_rate": 9.968079746795759e-06,
"loss": 0.6657,
"step": 135
},
{
"epoch": 0.1373222167729279,
"grad_norm": 2.136782363642342,
"learning_rate": 9.957688852199201e-06,
"loss": 0.6557,
"step": 140
},
{
"epoch": 0.14222658165767532,
"grad_norm": 2.1261422382559623,
"learning_rate": 9.945843270026021e-06,
"loss": 0.6495,
"step": 145
},
{
"epoch": 0.14713094654242276,
"grad_norm": 2.067127907721412,
"learning_rate": 9.932546476012942e-06,
"loss": 0.6411,
"step": 150
},
{
"epoch": 0.1520353114271702,
"grad_norm": 2.040847654490592,
"learning_rate": 9.91780237171201e-06,
"loss": 0.6416,
"step": 155
},
{
"epoch": 0.1569396763119176,
"grad_norm": 2.0912635989059787,
"learning_rate": 9.901615283345782e-06,
"loss": 0.6503,
"step": 160
},
{
"epoch": 0.16184404119666504,
"grad_norm": 2.8372493265750873,
"learning_rate": 9.883989960537934e-06,
"loss": 0.6424,
"step": 165
},
{
"epoch": 0.16674840608141245,
"grad_norm": 2.0493928993359307,
"learning_rate": 9.86493157491962e-06,
"loss": 0.6387,
"step": 170
},
{
"epoch": 0.17165277096615988,
"grad_norm": 2.083065623023665,
"learning_rate": 9.84444571861201e-06,
"loss": 0.6362,
"step": 175
},
{
"epoch": 0.17655713585090732,
"grad_norm": 2.066269679181421,
"learning_rate": 9.822538402585451e-06,
"loss": 0.6277,
"step": 180
},
{
"epoch": 0.18146150073565473,
"grad_norm": 2.1282623287152167,
"learning_rate": 9.799216054895715e-06,
"loss": 0.6274,
"step": 185
},
{
"epoch": 0.18636586562040217,
"grad_norm": 2.0335813176241095,
"learning_rate": 9.774485518797892e-06,
"loss": 0.6155,
"step": 190
},
{
"epoch": 0.19127023050514957,
"grad_norm": 1.9384285409252677,
"learning_rate": 9.748354050738416e-06,
"loss": 0.638,
"step": 195
},
{
"epoch": 0.196174595389897,
"grad_norm": 2.006376224222685,
"learning_rate": 9.720829318225897e-06,
"loss": 0.613,
"step": 200
},
{
"epoch": 0.20107896027464445,
"grad_norm": 2.088420694162267,
"learning_rate": 9.691919397581304e-06,
"loss": 0.6139,
"step": 205
},
{
"epoch": 0.20598332515939186,
"grad_norm": 2.0409644262977515,
"learning_rate": 9.66163277156821e-06,
"loss": 0.6068,
"step": 210
},
{
"epoch": 0.2108876900441393,
"grad_norm": 2.0023467594132973,
"learning_rate": 9.629978326903778e-06,
"loss": 0.6084,
"step": 215
},
{
"epoch": 0.2157920549288867,
"grad_norm": 1.904954310523434,
"learning_rate": 9.596965351651204e-06,
"loss": 0.6045,
"step": 220
},
{
"epoch": 0.22069641981363414,
"grad_norm": 1.964486390414263,
"learning_rate": 9.562603532494432e-06,
"loss": 0.6197,
"step": 225
},
{
"epoch": 0.22560078469838157,
"grad_norm": 2.0628213777647577,
"learning_rate": 9.526902951895857e-06,
"loss": 0.5853,
"step": 230
},
{
"epoch": 0.23050514958312898,
"grad_norm": 2.067943464789154,
"learning_rate": 9.48987408513794e-06,
"loss": 0.5892,
"step": 235
},
{
"epoch": 0.23540951446787642,
"grad_norm": 2.1163968024177757,
"learning_rate": 9.451527797249538e-06,
"loss": 0.5866,
"step": 240
},
{
"epoch": 0.24031387935262383,
"grad_norm": 2.218142662563601,
"learning_rate": 9.411875339817886e-06,
"loss": 0.5923,
"step": 245
},
{
"epoch": 0.24521824423737126,
"grad_norm": 2.0789211498714057,
"learning_rate": 9.370928347687149e-06,
"loss": 0.6067,
"step": 250
},
{
"epoch": 0.2501226091221187,
"grad_norm": 2.00243401967041,
"learning_rate": 9.328698835544516e-06,
"loss": 0.5733,
"step": 255
},
{
"epoch": 0.25502697400686614,
"grad_norm": 1.9232684596963454,
"learning_rate": 9.285199194394854e-06,
"loss": 0.6039,
"step": 260
},
{
"epoch": 0.2599313388916135,
"grad_norm": 2.4559722910125523,
"learning_rate": 9.240442187924922e-06,
"loss": 0.5837,
"step": 265
},
{
"epoch": 0.26483570377636095,
"grad_norm": 2.0642822474573235,
"learning_rate": 9.19444094875825e-06,
"loss": 0.5816,
"step": 270
},
{
"epoch": 0.2697400686611084,
"grad_norm": 1.8431714309717544,
"learning_rate": 9.147208974601762e-06,
"loss": 0.5891,
"step": 275
},
{
"epoch": 0.2746444335458558,
"grad_norm": 2.455382996504095,
"learning_rate": 9.098760124285255e-06,
"loss": 0.5739,
"step": 280
},
{
"epoch": 0.27954879843060326,
"grad_norm": 2.105877839231797,
"learning_rate": 9.049108613694958e-06,
"loss": 0.5664,
"step": 285
},
{
"epoch": 0.28445316331535064,
"grad_norm": 2.1572999228459637,
"learning_rate": 8.998269011602283e-06,
"loss": 0.5654,
"step": 290
},
{
"epoch": 0.2893575282000981,
"grad_norm": 2.2354758276064257,
"learning_rate": 8.94625623538905e-06,
"loss": 0.5718,
"step": 295
},
{
"epoch": 0.2942618930848455,
"grad_norm": 1.9951110551471705,
"learning_rate": 8.893085546670426e-06,
"loss": 0.5647,
"step": 300
},
{
"epoch": 0.29916625796959295,
"grad_norm": 1.940858410217326,
"learning_rate": 8.838772546816857e-06,
"loss": 0.5503,
"step": 305
},
{
"epoch": 0.3040706228543404,
"grad_norm": 2.1124206964514567,
"learning_rate": 8.783333172376292e-06,
"loss": 0.5625,
"step": 310
},
{
"epoch": 0.30897498773908777,
"grad_norm": 1.9926807872955052,
"learning_rate": 8.726783690398091e-06,
"loss": 0.5406,
"step": 315
},
{
"epoch": 0.3138793526238352,
"grad_norm": 2.3776434685854664,
"learning_rate": 8.669140693659928e-06,
"loss": 0.5412,
"step": 320
},
{
"epoch": 0.31878371750858264,
"grad_norm": 2.0505624375679194,
"learning_rate": 8.610421095799129e-06,
"loss": 0.5465,
"step": 325
},
{
"epoch": 0.3236880823933301,
"grad_norm": 1.9938642512875002,
"learning_rate": 8.550642126349873e-06,
"loss": 0.5448,
"step": 330
},
{
"epoch": 0.3285924472780775,
"grad_norm": 1.8794344216432206,
"learning_rate": 8.489821325687682e-06,
"loss": 0.5309,
"step": 335
},
{
"epoch": 0.3334968121628249,
"grad_norm": 1.9586224504819914,
"learning_rate": 8.427976539882725e-06,
"loss": 0.5256,
"step": 340
},
{
"epoch": 0.33840117704757233,
"grad_norm": 1.9633684416354464,
"learning_rate": 8.365125915463406e-06,
"loss": 0.528,
"step": 345
},
{
"epoch": 0.34330554193231977,
"grad_norm": 1.9574848872158568,
"learning_rate": 8.301287894091812e-06,
"loss": 0.5345,
"step": 350
},
{
"epoch": 0.3482099068170672,
"grad_norm": 2.046001716842364,
"learning_rate": 8.236481207152539e-06,
"loss": 0.5392,
"step": 355
},
{
"epoch": 0.35311427170181464,
"grad_norm": 2.0110023874224257,
"learning_rate": 8.170724870256526e-06,
"loss": 0.5171,
"step": 360
},
{
"epoch": 0.358018636586562,
"grad_norm": 1.8982030843350457,
"learning_rate": 8.104038177661484e-06,
"loss": 0.5245,
"step": 365
},
{
"epoch": 0.36292300147130946,
"grad_norm": 1.9231079403397293,
"learning_rate": 8.036440696610566e-06,
"loss": 0.52,
"step": 370
},
{
"epoch": 0.3678273663560569,
"grad_norm": 1.9562337288746108,
"learning_rate": 7.967952261590936e-06,
"loss": 0.5087,
"step": 375
},
{
"epoch": 0.37273173124080433,
"grad_norm": 1.9474638682438907,
"learning_rate": 7.898592968513919e-06,
"loss": 0.5085,
"step": 380
},
{
"epoch": 0.37763609612555177,
"grad_norm": 1.9123482797519735,
"learning_rate": 7.828383168818457e-06,
"loss": 0.5131,
"step": 385
},
{
"epoch": 0.38254046101029915,
"grad_norm": 2.057943038506519,
"learning_rate": 7.757343463499577e-06,
"loss": 0.4981,
"step": 390
},
{
"epoch": 0.3874448258950466,
"grad_norm": 1.9838558136826598,
"learning_rate": 7.685494697063627e-06,
"loss": 0.5158,
"step": 395
},
{
"epoch": 0.392349190779794,
"grad_norm": 1.9089161018582137,
"learning_rate": 7.612857951412085e-06,
"loss": 0.5115,
"step": 400
},
{
"epoch": 0.39725355566454146,
"grad_norm": 2.053508845540271,
"learning_rate": 7.5394545396556864e-06,
"loss": 0.4983,
"step": 405
},
{
"epoch": 0.4021579205492889,
"grad_norm": 2.0381945487707225,
"learning_rate": 7.465305999860728e-06,
"loss": 0.4864,
"step": 410
},
{
"epoch": 0.4070622854340363,
"grad_norm": 2.060283387944364,
"learning_rate": 7.390434088729348e-06,
"loss": 0.4858,
"step": 415
},
{
"epoch": 0.4119666503187837,
"grad_norm": 2.108098690183231,
"learning_rate": 7.314860775215674e-06,
"loss": 0.4894,
"step": 420
},
{
"epoch": 0.41687101520353115,
"grad_norm": 1.9523420077195515,
"learning_rate": 7.2386082340796715e-06,
"loss": 0.5032,
"step": 425
},
{
"epoch": 0.4217753800882786,
"grad_norm": 1.9725770065321593,
"learning_rate": 7.1616988393806245e-06,
"loss": 0.4917,
"step": 430
},
{
"epoch": 0.426679744973026,
"grad_norm": 1.989450857443718,
"learning_rate": 7.0841551579121144e-06,
"loss": 0.488,
"step": 435
},
{
"epoch": 0.4315841098577734,
"grad_norm": 1.874422665252578,
"learning_rate": 7.005999942580478e-06,
"loss": 0.4871,
"step": 440
},
{
"epoch": 0.43648847474252084,
"grad_norm": 1.9628105831400913,
"learning_rate": 6.927256125728624e-06,
"loss": 0.4774,
"step": 445
},
{
"epoch": 0.4413928396272683,
"grad_norm": 2.052494969259568,
"learning_rate": 6.8479468124072146e-06,
"loss": 0.4846,
"step": 450
},
{
"epoch": 0.4462972045120157,
"grad_norm": 2.196992014411905,
"learning_rate": 6.768095273595176e-06,
"loss": 0.4761,
"step": 455
},
{
"epoch": 0.45120156939676315,
"grad_norm": 2.140187037948035,
"learning_rate": 6.6877249393715115e-06,
"loss": 0.4716,
"step": 460
},
{
"epoch": 0.4561059342815105,
"grad_norm": 2.165318448275331,
"learning_rate": 6.60685939204044e-06,
"loss": 0.462,
"step": 465
},
{
"epoch": 0.46101029916625796,
"grad_norm": 2.0062687355632485,
"learning_rate": 6.525522359211858e-06,
"loss": 0.4592,
"step": 470
},
{
"epoch": 0.4659146640510054,
"grad_norm": 1.9449243429974221,
"learning_rate": 6.443737706839175e-06,
"loss": 0.4662,
"step": 475
},
{
"epoch": 0.47081902893575284,
"grad_norm": 2.0062916535890816,
"learning_rate": 6.36152943221656e-06,
"loss": 0.4618,
"step": 480
},
{
"epoch": 0.4757233938205002,
"grad_norm": 1.9536933195743733,
"learning_rate": 6.278921656937631e-06,
"loss": 0.4586,
"step": 485
},
{
"epoch": 0.48062775870524765,
"grad_norm": 2.1253405020175706,
"learning_rate": 6.195938619817694e-06,
"loss": 0.4643,
"step": 490
},
{
"epoch": 0.4855321235899951,
"grad_norm": 2.0546536638691695,
"learning_rate": 6.112604669781572e-06,
"loss": 0.4553,
"step": 495
},
{
"epoch": 0.4904364884747425,
"grad_norm": 1.9563960276975647,
"learning_rate": 6.0289442587191405e-06,
"loss": 0.4537,
"step": 500
},
{
"epoch": 0.49534085335948996,
"grad_norm": 1.9189930888442277,
"learning_rate": 5.944981934310627e-06,
"loss": 0.4555,
"step": 505
},
{
"epoch": 0.5002452182442374,
"grad_norm": 1.8664708836063784,
"learning_rate": 5.860742332823831e-06,
"loss": 0.4515,
"step": 510
},
{
"epoch": 0.5051495831289848,
"grad_norm": 1.9544083422711673,
"learning_rate": 5.776250171885329e-06,
"loss": 0.447,
"step": 515
},
{
"epoch": 0.5100539480137323,
"grad_norm": 2.0179051671028385,
"learning_rate": 5.691530243227824e-06,
"loss": 0.4386,
"step": 520
},
{
"epoch": 0.5149583128984796,
"grad_norm": 2.0846338532033752,
"learning_rate": 5.6066074054157385e-06,
"loss": 0.4355,
"step": 525
},
{
"epoch": 0.519862677783227,
"grad_norm": 2.0263314582631153,
"learning_rate": 5.521506576551196e-06,
"loss": 0.4401,
"step": 530
},
{
"epoch": 0.5247670426679745,
"grad_norm": 1.8778135562708458,
"learning_rate": 5.436252726962553e-06,
"loss": 0.4341,
"step": 535
},
{
"epoch": 0.5296714075527219,
"grad_norm": 1.9757916591258975,
"learning_rate": 5.350870871877577e-06,
"loss": 0.4364,
"step": 540
},
{
"epoch": 0.5345757724374693,
"grad_norm": 1.9531961318347626,
"learning_rate": 5.265386064083481e-06,
"loss": 0.4323,
"step": 545
},
{
"epoch": 0.5394801373222168,
"grad_norm": 2.0725678291543628,
"learning_rate": 5.179823386575908e-06,
"loss": 0.4364,
"step": 550
},
{
"epoch": 0.5443845022069642,
"grad_norm": 1.860156556204824,
"learning_rate": 5.09420794519907e-06,
"loss": 0.4329,
"step": 555
},
{
"epoch": 0.5492888670917117,
"grad_norm": 1.8698424821946518,
"learning_rate": 5.008564861279188e-06,
"loss": 0.4143,
"step": 560
},
{
"epoch": 0.5541932319764591,
"grad_norm": 1.9655955550507962,
"learning_rate": 4.922919264253368e-06,
"loss": 0.4248,
"step": 565
},
{
"epoch": 0.5590975968612065,
"grad_norm": 1.9066406300773608,
"learning_rate": 4.837296284296113e-06,
"loss": 0.4186,
"step": 570
},
{
"epoch": 0.5640019617459539,
"grad_norm": 1.80977031179867,
"learning_rate": 4.75172104494561e-06,
"loss": 0.4156,
"step": 575
},
{
"epoch": 0.5689063266307013,
"grad_norm": 1.991595721949942,
"learning_rate": 4.666218655731981e-06,
"loss": 0.4156,
"step": 580
},
{
"epoch": 0.5738106915154487,
"grad_norm": 1.910239431707766,
"learning_rate": 4.580814204809618e-06,
"loss": 0.3942,
"step": 585
},
{
"epoch": 0.5787150564001962,
"grad_norm": 1.9231788230679294,
"learning_rate": 4.495532751595813e-06,
"loss": 0.4131,
"step": 590
},
{
"epoch": 0.5836194212849436,
"grad_norm": 1.903193102248486,
"learning_rate": 4.410399319417806e-06,
"loss": 0.4128,
"step": 595
},
{
"epoch": 0.588523786169691,
"grad_norm": 1.8468443833129051,
"learning_rate": 4.325438888170429e-06,
"loss": 0.4007,
"step": 600
},
{
"epoch": 0.5934281510544385,
"grad_norm": 1.8391801455451426,
"learning_rate": 4.2406763869864965e-06,
"loss": 0.4127,
"step": 605
},
{
"epoch": 0.5983325159391859,
"grad_norm": 1.9158169069206314,
"learning_rate": 4.156136686922083e-06,
"loss": 0.4102,
"step": 610
},
{
"epoch": 0.6032368808239333,
"grad_norm": 1.8336931901990852,
"learning_rate": 4.071844593658841e-06,
"loss": 0.3978,
"step": 615
},
{
"epoch": 0.6081412457086808,
"grad_norm": 1.9026371408899738,
"learning_rate": 3.987824840225512e-06,
"loss": 0.4009,
"step": 620
},
{
"epoch": 0.6130456105934281,
"grad_norm": 1.9012934657121805,
"learning_rate": 3.904102079740753e-06,
"loss": 0.3923,
"step": 625
},
{
"epoch": 0.6179499754781755,
"grad_norm": 1.8972920333300498,
"learning_rate": 3.820700878179389e-06,
"loss": 0.3894,
"step": 630
},
{
"epoch": 0.622854340362923,
"grad_norm": 1.8312865024256686,
"learning_rate": 3.73764570716427e-06,
"loss": 0.3822,
"step": 635
},
{
"epoch": 0.6277587052476704,
"grad_norm": 1.9183680244376244,
"learning_rate": 3.654960936785783e-06,
"loss": 0.3926,
"step": 640
},
{
"epoch": 0.6326630701324178,
"grad_norm": 1.828330870696398,
"learning_rate": 3.572670828451177e-06,
"loss": 0.3924,
"step": 645
},
{
"epoch": 0.6375674350171653,
"grad_norm": 1.8312336733506578,
"learning_rate": 3.4907995277657624e-06,
"loss": 0.3984,
"step": 650
},
{
"epoch": 0.6424717999019127,
"grad_norm": 2.0339474627749436,
"learning_rate": 3.4093710574480926e-06,
"loss": 0.3737,
"step": 655
},
{
"epoch": 0.6473761647866602,
"grad_norm": 1.8512202300281744,
"learning_rate": 3.3284093102812144e-06,
"loss": 0.3896,
"step": 660
},
{
"epoch": 0.6522805296714076,
"grad_norm": 1.935099436122936,
"learning_rate": 3.2479380421020336e-06,
"loss": 0.3744,
"step": 665
},
{
"epoch": 0.657184894556155,
"grad_norm": 1.877516062960988,
"learning_rate": 3.167980864830855e-06,
"loss": 0.3872,
"step": 670
},
{
"epoch": 0.6620892594409024,
"grad_norm": 1.8575937601231947,
"learning_rate": 3.0885612395431765e-06,
"loss": 0.3811,
"step": 675
},
{
"epoch": 0.6669936243256498,
"grad_norm": 1.799905567165699,
"learning_rate": 3.009702469585713e-06,
"loss": 0.3793,
"step": 680
},
{
"epoch": 0.6718979892103972,
"grad_norm": 1.9383699331479365,
"learning_rate": 2.93142769373873e-06,
"loss": 0.3712,
"step": 685
},
{
"epoch": 0.6768023540951447,
"grad_norm": 1.9008755304822846,
"learning_rate": 2.853759879426644e-06,
"loss": 0.3738,
"step": 690
},
{
"epoch": 0.6817067189798921,
"grad_norm": 1.8531666768609951,
"learning_rate": 2.7767218159789067e-06,
"loss": 0.3619,
"step": 695
},
{
"epoch": 0.6866110838646395,
"grad_norm": 1.9787710274083234,
"learning_rate": 2.7003361079431547e-06,
"loss": 0.3733,
"step": 700
},
{
"epoch": 0.691515448749387,
"grad_norm": 1.797813113073451,
"learning_rate": 2.624625168452568e-06,
"loss": 0.3762,
"step": 705
},
{
"epoch": 0.6964198136341344,
"grad_norm": 1.9305643695542356,
"learning_rate": 2.5496112126493995e-06,
"loss": 0.3712,
"step": 710
},
{
"epoch": 0.7013241785188818,
"grad_norm": 1.7398004529962572,
"learning_rate": 2.4753162511665936e-06,
"loss": 0.366,
"step": 715
},
{
"epoch": 0.7062285434036293,
"grad_norm": 2.0252060555010902,
"learning_rate": 2.401762083669419e-06,
"loss": 0.3626,
"step": 720
},
{
"epoch": 0.7111329082883766,
"grad_norm": 1.7500094335967311,
"learning_rate": 2.3289702924589914e-06,
"loss": 0.3624,
"step": 725
},
{
"epoch": 0.716037273173124,
"grad_norm": 1.752557189300548,
"learning_rate": 2.256962236139598e-06,
"loss": 0.3677,
"step": 730
},
{
"epoch": 0.7209416380578715,
"grad_norm": 1.784599164157859,
"learning_rate": 2.18575904335163e-06,
"loss": 0.3647,
"step": 735
},
{
"epoch": 0.7258460029426189,
"grad_norm": 1.9166229348625452,
"learning_rate": 2.115381606572018e-06,
"loss": 0.3614,
"step": 740
},
{
"epoch": 0.7307503678273664,
"grad_norm": 1.7664227370032382,
"learning_rate": 2.0458505759839433e-06,
"loss": 0.3539,
"step": 745
},
{
"epoch": 0.7356547327121138,
"grad_norm": 1.7823664294805341,
"learning_rate": 1.9771863534176544e-06,
"loss": 0.3649,
"step": 750
},
{
"epoch": 0.7405590975968612,
"grad_norm": 1.798651877731334,
"learning_rate": 1.90940908636415e-06,
"loss": 0.3584,
"step": 755
},
{
"epoch": 0.7454634624816087,
"grad_norm": 1.7897055029417046,
"learning_rate": 1.8425386620634961e-06,
"loss": 0.3575,
"step": 760
},
{
"epoch": 0.7503678273663561,
"grad_norm": 1.8567965533511697,
"learning_rate": 1.7765947016694902e-06,
"loss": 0.3597,
"step": 765
},
{
"epoch": 0.7552721922511035,
"grad_norm": 1.7069946121225148,
"learning_rate": 1.711596554492428e-06,
"loss": 0.3569,
"step": 770
},
{
"epoch": 0.7601765571358509,
"grad_norm": 1.7171226071674441,
"learning_rate": 1.64756329232161e-06,
"loss": 0.3508,
"step": 775
},
{
"epoch": 0.7650809220205983,
"grad_norm": 1.851171218498648,
"learning_rate": 1.5845137038292851e-06,
"loss": 0.3505,
"step": 780
},
{
"epoch": 0.7699852869053457,
"grad_norm": 1.7875922878269628,
"learning_rate": 1.5224662890576781e-06,
"loss": 0.3404,
"step": 785
},
{
"epoch": 0.7748896517900932,
"grad_norm": 1.6638468790620988,
"learning_rate": 1.4614392539906892e-06,
"loss": 0.3522,
"step": 790
},
{
"epoch": 0.7797940166748406,
"grad_norm": 1.7409632860061264,
"learning_rate": 1.4014505052118893e-06,
"loss": 0.353,
"step": 795
},
{
"epoch": 0.784698381559588,
"grad_norm": 1.815807330350682,
"learning_rate": 1.3425176446503618e-06,
"loss": 0.3414,
"step": 800
},
{
"epoch": 0.7896027464443355,
"grad_norm": 1.8126274287911948,
"learning_rate": 1.2846579644159291e-06,
"loss": 0.3425,
"step": 805
},
{
"epoch": 0.7945071113290829,
"grad_norm": 1.7603201856240744,
"learning_rate": 1.2278884417253033e-06,
"loss": 0.3453,
"step": 810
},
{
"epoch": 0.7994114762138304,
"grad_norm": 1.7250692746456593,
"learning_rate": 1.172225733920616e-06,
"loss": 0.3456,
"step": 815
},
{
"epoch": 0.8043158410985778,
"grad_norm": 1.6574970310295125,
"learning_rate": 1.1176861735818107e-06,
"loss": 0.3357,
"step": 820
},
{
"epoch": 0.8092202059833251,
"grad_norm": 1.7019799673778844,
"learning_rate": 1.0642857637343346e-06,
"loss": 0.3406,
"step": 825
},
{
"epoch": 0.8141245708680726,
"grad_norm": 1.6768204828704758,
"learning_rate": 1.0120401731535213e-06,
"loss": 0.353,
"step": 830
},
{
"epoch": 0.81902893575282,
"grad_norm": 1.6058567769661107,
"learning_rate": 9.609647317670468e-07,
"loss": 0.3413,
"step": 835
},
{
"epoch": 0.8239333006375674,
"grad_norm": 1.722906282315617,
"learning_rate": 9.110744261568206e-07,
"loss": 0.3329,
"step": 840
},
{
"epoch": 0.8288376655223149,
"grad_norm": 1.722830977466264,
"learning_rate": 8.623838951616076e-07,
"loss": 0.3339,
"step": 845
},
{
"epoch": 0.8337420304070623,
"grad_norm": 1.7043628706248817,
"learning_rate": 8.149074255816996e-07,
"loss": 0.3327,
"step": 850
},
{
"epoch": 0.8386463952918097,
"grad_norm": 1.6620351875279176,
"learning_rate": 7.68658947986874e-07,
"loss": 0.3409,
"step": 855
},
{
"epoch": 0.8435507601765572,
"grad_norm": 1.6249652725084447,
"learning_rate": 7.236520326288721e-07,
"loss": 0.3345,
"step": 860
},
{
"epoch": 0.8484551250613046,
"grad_norm": 1.6797312028896152,
"learning_rate": 6.79899885459619e-07,
"loss": 0.3371,
"step": 865
},
{
"epoch": 0.853359489946052,
"grad_norm": 1.6133846444360203,
"learning_rate": 6.374153442563192e-07,
"loss": 0.3291,
"step": 870
},
{
"epoch": 0.8582638548307994,
"grad_norm": 1.639961696466558,
"learning_rate": 5.962108748545942e-07,
"loss": 0.3405,
"step": 875
},
{
"epoch": 0.8631682197155468,
"grad_norm": 1.7763694537319363,
"learning_rate": 5.562985674907467e-07,
"loss": 0.3377,
"step": 880
},
{
"epoch": 0.8680725846002942,
"grad_norm": 1.7034248324336427,
"learning_rate": 5.176901332542378e-07,
"loss": 0.3406,
"step": 885
},
{
"epoch": 0.8729769494850417,
"grad_norm": 1.6095220015390743,
"learning_rate": 4.803969006514175e-07,
"loss": 0.33,
"step": 890
},
{
"epoch": 0.8778813143697891,
"grad_norm": 1.7111560907547738,
"learning_rate": 4.444298122815055e-07,
"loss": 0.335,
"step": 895
},
{
"epoch": 0.8827856792545365,
"grad_norm": 1.6246588106178113,
"learning_rate": 4.0979942162580387e-07,
"loss": 0.3289,
"step": 900
},
{
"epoch": 0.887690044139284,
"grad_norm": 1.6674759393718703,
"learning_rate": 3.76515889951099e-07,
"loss": 0.3287,
"step": 905
},
{
"epoch": 0.8925944090240314,
"grad_norm": 1.6046692843902606,
"learning_rate": 3.445889833281296e-07,
"loss": 0.3324,
"step": 910
},
{
"epoch": 0.8974987739087789,
"grad_norm": 1.7079729479541892,
"learning_rate": 3.140280697660247e-07,
"loss": 0.3258,
"step": 915
},
{
"epoch": 0.9024031387935263,
"grad_norm": 1.6511127606613865,
"learning_rate": 2.8484211646353677e-07,
"loss": 0.3266,
"step": 920
},
{
"epoch": 0.9073075036782736,
"grad_norm": 1.6108380967394342,
"learning_rate": 2.570396871778796e-07,
"loss": 0.3285,
"step": 925
},
{
"epoch": 0.912211868563021,
"grad_norm": 1.7501453734514094,
"learning_rate": 2.3062893971195211e-07,
"loss": 0.3299,
"step": 930
},
{
"epoch": 0.9171162334477685,
"grad_norm": 1.7120576016328979,
"learning_rate": 2.0561762352066638e-07,
"loss": 0.3261,
"step": 935
},
{
"epoch": 0.9220205983325159,
"grad_norm": 1.6683684954483,
"learning_rate": 1.8201307743709927e-07,
"loss": 0.328,
"step": 940
},
{
"epoch": 0.9269249632172634,
"grad_norm": 1.6981337204644897,
"learning_rate": 1.5982222751913079e-07,
"loss": 0.331,
"step": 945
},
{
"epoch": 0.9318293281020108,
"grad_norm": 1.6609453351002048,
"learning_rate": 1.390515850171953e-07,
"loss": 0.3234,
"step": 950
},
{
"epoch": 0.9367336929867582,
"grad_norm": 1.6636906278300567,
"learning_rate": 1.1970724446374592e-07,
"loss": 0.3336,
"step": 955
},
{
"epoch": 0.9416380578715057,
"grad_norm": 1.5711270056253417,
"learning_rate": 1.0179488188499675e-07,
"loss": 0.3299,
"step": 960
},
{
"epoch": 0.9465424227562531,
"grad_norm": 1.6157567675118045,
"learning_rate": 8.531975313545715e-08,
"loss": 0.3398,
"step": 965
},
{
"epoch": 0.9514467876410004,
"grad_norm": 1.6523867015186364,
"learning_rate": 7.028669235575714e-08,
"loss": 0.3294,
"step": 970
},
{
"epoch": 0.9563511525257479,
"grad_norm": 1.740288361114131,
"learning_rate": 5.670011055421365e-08,
"loss": 0.3335,
"step": 975
},
{
"epoch": 0.9612555174104953,
"grad_norm": 1.621637272095815,
"learning_rate": 4.4563994312546435e-08,
"loss": 0.3295,
"step": 980
},
{
"epoch": 0.9661598822952427,
"grad_norm": 1.657618038769045,
"learning_rate": 3.3881904616137054e-08,
"loss": 0.3266,
"step": 985
},
{
"epoch": 0.9710642471799902,
"grad_norm": 1.6751008606908966,
"learning_rate": 2.4656975809160267e-08,
"loss": 0.3315,
"step": 990
},
{
"epoch": 0.9759686120647376,
"grad_norm": 1.6937145693196844,
"learning_rate": 1.689191467490303e-08,
"loss": 0.3313,
"step": 995
},
{
"epoch": 0.980872976949485,
"grad_norm": 1.6407710130512811,
"learning_rate": 1.058899964154092e-08,
"loss": 0.3278,
"step": 1000
},
{
"epoch": 0.9857773418342325,
"grad_norm": 1.6749685115327173,
"learning_rate": 5.750080113598455e-09,
"loss": 0.3246,
"step": 1005
},
{
"epoch": 0.9906817067189799,
"grad_norm": 1.5703580183454553,
"learning_rate": 2.376575929297076e-09,
"loss": 0.3257,
"step": 1010
},
{
"epoch": 0.9955860716037274,
"grad_norm": 1.560579334168069,
"learning_rate": 4.694769439445024e-10,
"loss": 0.3258,
"step": 1015
},
{
"epoch": 0.9995095635115253,
"eval_loss": 0.33406102657318115,
"eval_runtime": 96.9695,
"eval_samples_per_second": 3.114,
"eval_steps_per_second": 0.784,
"step": 1019
},
{
"epoch": 0.9995095635115253,
"step": 1019,
"total_flos": 213305524224000.0,
"train_loss": 0.4876700218573169,
"train_runtime": 22910.3839,
"train_samples_per_second": 1.424,
"train_steps_per_second": 0.044
}
],
"logging_steps": 5,
"max_steps": 1019,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 213305524224000.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}